Example #1
0
 def test_none(self):
     """Assert that a processor built from only the ``"no"`` rule group does not
     produce the quote/whitespace-resolved forms of the test strings."""
     regularizer = TextRegularizerProcessorParams(
         rulesets=[],
         rulegroups=["no"],
     ).create(None, PipelineMode.TARGETS)
     # Pairs of (raw target, resolved form that must NOT be returned).
     cases = [
         ("“Resolve quotes”", "''Resolve quotes''"),
         ("  “Resolve   spaces  ”   ", "''Resolve spaces ''"),
     ]
     for raw, unexpected in cases:
         processed = regularizer(Sample(targets=raw))
         self.assertNotEqual(processed.targets, unexpected)
Example #2
0
def to_unbatched_samples(inputs, targets, outputs, meta) -> Iterable[Sample]:
    """Yield one ``Sample`` per batch entry of the given batched structures.

    The batch size handed to ``unbatched`` is the leading dimension of the
    first flattened value of ``meta`` followed by ``outputs``. Structures that
    are ``None`` are not passed to ``unbatched`` and the matching ``Sample``
    field stays ``None``.
    """
    flat = tf.nest.flatten(meta) + tf.nest.flatten(outputs)
    batch_size = None if len(flat) == 0 else flat[0].shape[0]

    def split(batched):
        # ``None`` is kept as is, everything else is split into per-sample entries.
        return None if batched is None else unbatched(batched, batch_size)

    inputs = split(inputs)
    targets = split(targets)
    outputs = split(outputs)
    meta = split(meta)

    # The number of samples is the length of the first truthy unbatched structure.
    count = len(inputs or targets or outputs or meta)

    for idx in range(count):
        yield Sample(
            inputs=inputs[idx] if inputs else None,
            targets=targets[idx] if targets else None,
            outputs=outputs[idx] if outputs else None,
            meta=meta[idx] if meta else None,
        )
def join(samples: List[Sample]):
    """Merge several samples into a single ``Sample``.

    Inputs and targets are taken from the first sample; outputs and meta are
    collected into lists with one entry per sample (in the given order).
    """
    head = samples[0]
    all_outputs = [member.outputs for member in samples]
    all_meta = [member.meta for member in samples]
    return Sample(
        inputs=head.inputs,
        targets=head.targets,
        outputs=all_outputs,
        meta=all_meta,
    )
Example #4
0
 def to_sample(self) -> Sample:
     """Build a ``Sample`` with the data path as input and ``gt`` as target.

     The data path and the data bins are additionally stored in the meta dict.
     """
     meta = {"data_path": self.data_path, "data_bins": self.data_bins}
     return Sample(inputs=self.data_path, targets=self.gt, meta=meta)
Example #5
0
    def apply(self, sample: Sample) -> Sample:
        """Divide ``inputs["img"]`` by 255 and optionally re-center it.

        When ``params.center`` is truthy the scaled image is additionally
        mapped through ``(x - 0.5) * 2``. The result is written into a copy of
        the inputs (``inputs.copy()``), which is attached to a new sample.
        """
        scaled_inputs = sample.inputs.copy()

        img = scaled_inputs["img"] / 255
        if self.params.center:
            img = (img - 0.5) * 2
        scaled_inputs["img"] = img

        return sample.new_inputs(scaled_inputs)
 def load_sample(fn) -> Sample:
     """Load a grayscale image and, if available, its integer ground truth.

     The ground truth is read from ``fn + ".txt"`` and wrapped in a one-element
     array; if that file does not exist the target value is ``None``. The file
     name is stored in the meta dict under ``"fn"``.
     """
     img = cv2.imread(fn, flags=cv2.IMREAD_GRAYSCALE)
     gt_path = fn + ".txt"
     gt = None
     if os.path.exists(gt_path):
         with open(gt_path) as gt_file:
             gt = np.asarray([int(gt_file.read())])
     return Sample(inputs={"img": img}, targets={"gt": gt}, meta={"fn": fn})
Example #7
0
 def to_sample(d: dict) -> Sample:
     """Convert a raw record into a ``Sample``.

     Both sentences are decoded as UTF-8, the label is wrapped in a one-element
     array and the record index is stored as ``int`` in the meta dict.
     """
     sentence1 = d["sentence1"].decode("utf-8")
     sentence2 = d["sentence2"].decode("utf-8")
     label = np.asarray([d["label"]])
     inputs = {
         Keys.InputSentence1: sentence1,
         Keys.InputSentence2: sentence2,
     }
     return Sample(inputs=inputs, targets={Keys.Target: label}, meta={"index": int(d["idx"])})
Example #8
0
 def apply(self, sample: Sample) -> Sample:
     """Transpose the input image and encode the targets with the codec.

     Every element of ``sample.targets`` is mapped to its index in
     ``data_params.codec``. The image and the encoded targets are stored
     together with one-element length arrays (``img.shape[0]`` and the number
     of encoded elements).
     """
     img = sample.inputs.transpose()
     codec = self.data_params.codec
     encoded = [codec.index(char) for char in sample.targets]

     new_inputs = {
         Keys.Image: img,
         Keys.ImageLength: np.array([img.shape[0]]),
     }
     new_targets = {
         Keys.Targets: np.array(encoded),
         Keys.TargetsLength: np.array([len(encoded)]),
     }
     return sample.new_inputs(new_inputs).new_targets(new_targets)
Example #9
0
    def vote(self, sample: Sample) -> Sample:
        """Majority vote over the ``"class"`` entry of every model output.

        ``sample.outputs`` holds one output per model. On a tie the class that
        was seen first wins.
        """
        tally = {}
        for model_output in sample.outputs:
            label = model_output["class"]
            tally[label] = tally.get(label, 0) + 1

        # max() over the keys returns the first key with the highest count.
        winner = max(tally, key=tally.get)
        return sample.new_outputs({"class": winner})
Example #10
0
        def extract_meta(sample: Sample) -> Sample:
            """Merge JSON-encoded meta from ``sample.inputs["meta"]`` into the sample's meta.

            If the inputs contain a ``"meta"`` entry it is parsed as JSON and its
            keys are added to (or override) the existing meta. A list/array value
            must hold exactly one element, which is then used as the JSON string.
            The meta dict of the incoming sample is left untouched; the merged
            meta is attached via ``sample.new_meta``.
            """
            # Copy first: updating ``sample.meta`` directly would modify the
            # caller's sample in place.
            meta = dict(sample.meta or {})
            if "meta" in sample.inputs:
                input_meta = sample.inputs["meta"]
                if isinstance(input_meta, (list, np.ndarray)):
                    assert len(input_meta) == 1, "This must be one, just be sure"
                    input_meta = input_meta[0]

                meta.update(**json.loads(input_meta))
            return sample.new_meta(meta)
Example #11
0
    def test_standalone_pipeline(self):
        """Run the pre-processing pipeline of a bare ``DataBaseParams`` subclass.

        The pipeline is created with 8 processes and applied to a list that
        holds the same default ``Sample`` 100 times; every result is printed.
        """
        from tfaip.imports import DataBaseParams

        class TestDataParams(DataBaseParams):
            @staticmethod
            def cls():
                raise NotImplementedError

        data_params = TestDataParams()
        samples = [Sample()] * 100
        pipeline_params = DataPipelineParams(num_processes=8)
        pipeline = data_params.pre_proc.create(pipeline_params, data_params)
        for idx, processed in enumerate(pipeline.apply(samples)):
            print(idx, processed)
Example #12
0
    def make_sample(self, file_id: str):
        """Assemble the ``Sample`` belonging to ``file_id`` from the parsed files.

        Inputs and targets are looked up per input/target key. The meta dict
        holds the id and, for every parsed-files key, a ``<key>_filename``
        entry. If there is exactly one input (or target), it is passed on
        directly instead of a one-entry dict.
        """
        parsed = self.parsed_files
        inputs = {key: parsed[key][file_id] for key in self._input_keys}
        targets = {key: parsed[key][file_id] for key in self._target_keys}
        meta = {"id": file_id}
        for key, files in parsed.items():
            meta[key + "_filename"] = files[file_id]

        sample = Sample(inputs=inputs, targets=targets, meta=meta)

        # Unwrap single-entry dicts.
        if len(sample.inputs) == 1:
            sample = sample.new_inputs(next(iter(sample.inputs.values())))
        if len(sample.targets) == 1:
            sample = sample.new_targets(next(iter(sample.targets.values())))

        return sample
    def _unwrap_batch(self, inputs, targets, outputs, meta) -> Iterable[Sample]:
        """Split batched dicts into one ``Sample`` per batch entry.

        The batch size is the leading dimension of the first value in
        ``inputs``. ``outputs`` is iterated as a sequence of dicts, each of
        which is split separately. The meta of every sample is decoded from the
        UTF-8 JSON found in the first element of its ``"meta"`` entry.

        Raises:
            ValueError: if ``inputs`` has no values.
        """
        try:
            first_input = next(iter(inputs.values()))
        except StopIteration as e:
            raise ValueError(f"Empty inputs {inputs}") from e
        batch_size = first_input.shape[0]

        def entry(batched, idx):
            # Select entry ``idx`` from every value of a batched dict.
            return {k: v[idx] for k, v in batched.items()}

        for idx in range(batch_size):
            sample_outputs = [entry(output, idx) for output in outputs]
            sample_inputs = entry(inputs, idx)
            sample_targets = entry(targets, idx)
            sample_meta = entry(meta, idx)
            meta_json = sample_meta["meta"][0].decode("utf-8")
            parsed_meta = json.loads(meta_json, cls=TFAIPJsonDecoder)

            yield Sample(
                inputs=sample_inputs,
                outputs=sample_outputs,
                targets=sample_targets,
                meta=parsed_meta,
            )
Example #14
0
def generator(params: PredictionGeneratorParams, data: DataBase,
              scenario: ScenarioBase, queue: Queue):
    """Predict on ``params.generator`` and push every prediction to ``queue``.

    Meant to be run in a separate thread: the predictor of the scenario is
    created, the model is loaded from ``params.model + "/serve"`` and each
    prediction is re-packed into a fresh ``Sample`` before it is queued.
    A final ``None`` is queued after the last prediction.
    """
    logger.info(
        f"Loading generator model from {params.model} in separate thread")
    predictor_cls = scenario.predictor_cls()
    predictor = predictor_cls(params.predictor_params, data)
    predictor.set_model(params.model + "/serve")

    for prediction in predictor.predict(params.generator):
        repacked = Sample(
            targets=prediction.targets,
            inputs=prediction.inputs,
            outputs=prediction.outputs,
            meta=prediction.meta,
        )
        queue.put(repacked)

    # ``None`` is queued once all predictions have been written.
    queue.put(None)
    logger.info("Generator thread ended.")
Example #15
0
    def generate(self) -> Iterable[Sample]:
        """Return one ``Sample`` per image file with its class key as target.

        All files of all classes are flattened into one list. The file name and
        class name are also stored as meta data (optional, but handy for
        debugging). The list is shuffled in training and evaluation mode.
        """
        flat_samples = []
        for classname, filenames in self.params.image_files.items():
            flat_samples.extend(
                Sample(
                    inputs=path,
                    targets=classname,
                    meta={"filename": path, "classname": classname},
                )
                for path in filenames
            )

        # Per the original author's note: shuffling has no effect on the
        # accuracy in evaluation, but random examples will be displayed.
        needs_shuffle = self.mode in {PipelineMode.TRAINING, PipelineMode.EVALUATION}
        if needs_shuffle:
            shuffle(flat_samples)

        return flat_samples
    def apply(self, sample: Sample) -> Sample:
        """Tokenize both input sentences and build word ids, mask and type ids.

        The token sequence is ``cls_token``, the tokens of sentence 1,
        ``sep_token``, the tokens of sentence 2, ``sep_token``. Type ids are 0
        for the CLS token and the first segment (including its separator) and 1
        for the second segment (including its separator). The mask is all ones.
        """
        tokenizer = self.tokenizer
        sentence1 = sample.inputs[Keys.InputSentence1]
        sentence2 = sample.inputs[Keys.InputSentence2]

        segment1 = list(tokenizer.tokenize(sentence1)) + [tokenizer.sep_token]
        segment2 = list(tokenizer.tokenize(sentence2)) + [tokenizer.sep_token]
        tokens = [tokenizer.cls_token] + segment1 + segment2
        type_ids = [0] * (1 + len(segment1)) + [1] * len(segment2)

        word_ids = tokenizer.convert_tokens_to_ids(tokens)
        new_inputs = {
            Keys.InputWordIds: np.asarray(word_ids),
            Keys.InputMask: np.full(fill_value=1, shape=[len(word_ids)], dtype=np.int32),
            Keys.InputTypeIds: np.asarray(type_ids, dtype=np.int32),
        }
        return sample.new_inputs(new_inputs)
def to_samples(samples):
    """Convert parallel image/ground-truth sequences into a list of ``Sample``.

    ``samples`` is unpacked into ``zip`` to pair images with ground truths.
    Each ground truth is reshaped to ``(1,)`` and the running index is stored
    in the meta dict.
    """

    def build(index, img, gt):
        return Sample(inputs={"img": img}, targets={"gt": gt.reshape((1,))}, meta={"index": index})

    return [build(index, img, gt) for index, (img, gt) in enumerate(zip(*samples))]
Example #18
0
 def generate(self) -> Iterable[Sample]:
     """Return a list of 1000 samples; sample ``i`` has ``[i]`` as data and as target."""

     def make(value):
         return Sample(inputs={"data": [value]}, targets={"targets": [value]})

     return [make(value) for value in range(1000)]
Example #19
0
 def assert_str(p_, in_s, out_s):
     """Apply ``p_`` to a sample with target ``in_s`` and assert the resulting
     target equals ``out_s``."""
     results = list(p_.apply_on_samples([Sample(targets=in_s)]))
     computed = results[0].targets
     self.assertEqual(out_s, computed, f"Wrong output for string {in_s}.")
Example #20
0
    def apply(self, sample: Sample) -> Sample:
        """Replace the file paths in the sample by their loaded contents.

        ``sample.inputs`` is read as a grayscale image, ``sample.targets`` is
        opened as a text file whose stripped content becomes the new target.
        """
        image = cv2.imread(sample.inputs, flags=cv2.IMREAD_GRAYSCALE)
        with open(sample.targets) as gt_file:
            text = gt_file.read().strip()

        with_image = sample.new_inputs(image)
        return with_image.new_targets(text)
Example #21
0
 def apply(self, sample: Sample) -> Sample:
     """Resize the input image to the size configured in the data params.

     Returns a new sample whose inputs are the resized image.
     """
     # cv2.resize expects the target size as (width, height); passing it in
     # this order yields an array of shape (image_height, image_width, ...).
     target_size = (self.data_params.image_width, self.data_params.image_height)
     return sample.new_inputs(cv2.resize(sample.inputs, target_size))
Example #22
0
 def apply(self, sample: Sample) -> Sample:
     """Return a new sample whose inputs are the old inputs plus ``params.v``."""
     shifted = sample.inputs + self.params.v
     return sample.new_inputs(shifted)
Example #23
0
 def to_samples(data) -> Iterable[Sample]:
     """Yield one ``Sample`` per record: ``tokens`` become the text input and
     ``ner_tags`` the tag-id target."""
     for record in data:
         inputs = {"text": record["tokens"]}
         targets = {"tag_ids": record["ner_tags"]}
         yield Sample(inputs=inputs, targets=targets)
Example #24
0
 def generate(self) -> Iterable[Sample]:
     """Lazily map every number in ``params.numbers_to_generate`` to a ``Sample``
     whose input and target are separate one-element arrays of that number."""

     def as_sample(number):
         return Sample(inputs=np.array([number]), targets=np.array([number]))

     return map(as_sample, self.params.numbers_to_generate)
Example #25
0
 def apply(self, sample: Sample) -> Sample:
     """Wrap inputs and targets into one-element arrays stored under key ``"n"``."""
     wrapped_inputs = {"n": np.asarray([sample.inputs])}
     with_inputs = sample.new_inputs(wrapped_inputs)
     wrapped_targets = {"n": np.asarray([sample.targets])}
     return with_inputs.new_targets(wrapped_targets)
Example #26
0
 def generate(self) -> Iterable[Sample]:
     """Return a lazy generator with one input-only ``Sample`` per image file."""
     image_files = self.params.image_files
     return (Sample(inputs=path) for path in image_files)
Example #27
0
 def generate(self) -> Iterable[Sample]:
     """Return a lazy generator with one ``Sample`` per image file.

     The target is the image path with all extensions replaced by ``.gt.txt``
     (via ``split_all_ext``); the image path is also stored as meta.
     """

     def for_image(fn):
         gt_file = split_all_ext(fn)[0] + ".gt.txt"
         return Sample(inputs=fn, targets=gt_file, meta={"filename": fn})

     return (for_image(fn) for fn in self.params.image_files)
Example #28
0
 def sample_to_sop(self, sop_sample: Sample) -> Sample:
     """Convert a text sample into a sentence-order-prediction (SOP) sample.

     Note: SOP data sources differ from MLM ones, since two sentences are needed.

     The raw text is removed from ``sop_sample.inputs`` and turned into two
     encoded segments, which are passed through ``mask_enc_sentence``. The
     segments are concatenated with a leading CLS id and SEP ids after each
     segment; when ``switch_sentences()`` is truthy the segments are swapped
     and ``tgt_sop`` is ``[0]``, otherwise ``[1]``. Inputs and targets of
     ``sop_sample`` are replaced in place and the same object is returned.
     """
     sentences = sop_sample.inputs["text"]
     del sop_sample.inputs["text"]
     params = self.data_params

     if params.segment_train:
         words = sentences.split(" ")
         word_count = len(words)
         # A random split needs at least 10 words; shorter texts get an empty first part.
         split_at = random.randint(4, word_count - 5) if word_count >= 10 else 0
         first_text = " ".join(words[:split_at])
         second_text = " ".join(words[split_at:])

         # Original note: "maximal text sequence length is 40" -- presumably the
         # usual value of max_token_text_part; verify against the params.
         first_enc_sentence = self.tokenizer.encode(first_text)
         if len(first_enc_sentence) > params.max_token_text_part:
             # Keep the tail of the first part ...
             first_enc_sentence = first_enc_sentence[len(first_enc_sentence) - params.max_token_text_part:]
         sec_enc_sentence = self.tokenizer.encode(second_text)
         if len(sec_enc_sentence) > params.max_token_text_part:
             # ... and the head of the second part.
             sec_enc_sentence = sec_enc_sentence[:params.max_token_text_part]
     else:
         first_enc_sentence, sec_enc_sentence = self.build_two_sentence_segments(sentences)

     first_masked, first_mask_flags = self.mask_enc_sentence(first_enc_sentence)
     sec_masked, sec_mask_flags = self.mask_enc_sentence(sec_enc_sentence)

     # Decide on the segment order; the target is 0 for swapped and 1 for original order.
     if self.switch_sentences():
         lead_masked, lead_flags, lead_plain = sec_masked, sec_mask_flags, sec_enc_sentence
         trail_masked, trail_flags, trail_plain = first_masked, first_mask_flags, first_enc_sentence
         tar_sop = [0]
     else:
         lead_masked, lead_flags, lead_plain = first_masked, first_mask_flags, first_enc_sentence
         trail_masked, trail_flags, trail_plain = sec_masked, sec_mask_flags, sec_enc_sentence
         tar_sop = [1]

     # Add CLS-Tag and SEP-Tag: the ids directly after the token vocabulary are used.
     cls_id = params.tok_vocab_size
     sep_id = params.tok_vocab_size + 1
     text_index_list = [cls_id] + lead_masked + [sep_id] + trail_masked + [sep_id]
     masked_index_list = [0] + lead_flags + [0] + trail_flags + [0]
     tar_mlm = [cls_id] + lead_plain + [sep_id] + trail_plain + [sep_id]

     sop_sample.inputs = {
         "text": np.asarray(text_index_list),
         "seq_length": np.asarray([len(text_index_list)]),
     }
     sop_sample.targets = {
         "tgt_mlm": np.asarray(tar_mlm),
         "mask_mlm": np.asarray(masked_index_list),
         "tgt_sop": np.asarray(tar_sop),
     }
     if self._wwa:
         word_length_vector, segment_ids = self.build_whole_word_attention_inputs(tar_mlm)
         sop_sample.inputs["word_length_vector"] = np.asarray(word_length_vector)
         sop_sample.inputs["segment_ids"] = np.asarray(segment_ids)
     return sop_sample
Example #29
0
def to_samples(samples):
    """Convert parallel image/ground-truth sequences into a list of ``Sample``.

    ``samples`` is unpacked into ``zip`` to pair images with ground truths.
    Images are converted to float arrays and each ground truth is reshaped to
    ``(1,)``.
    """

    def build(img, gt):
        inputs = {"img": np.array(img).astype("float")}
        targets = {"gt": gt.reshape((1,))}
        return Sample(inputs=inputs, targets=targets)

    return [build(img, gt) for img, gt in zip(*samples)]
Example #30
0
    def _pad_batched_samples(self, samples: List[Sample]) -> Sample:
        """Batches and pads the content of samples

        Depending on the pipeline mode, the inputs, the targets or both are
        extracted from every sample together with the JSON-serialized meta.
        Each extracted structure is checked against the layer specs of the
        data, the entries are padded along all axes whose spec dimension is
        ``None`` and then stacked along a new leading axis.

        Returns a single ``Sample`` holding the batched structures (only the
        fields that are relevant for the current mode are set).
        """
        data = self.data

        def pack_meta(meta):
            # Serialize the meta to JSON and wrap it in a one-element array under key "meta".
            return {"meta": np.asarray([json.dumps(meta, cls=TFAIPJsonEncoder)])}

        # Select, per mode, the expected structure (signature), how to extract the
        # matching parts from a sample, and how to rebuild a Sample from the batched parts.
        if self.mode == PipelineMode.PREDICTION:
            output_signature = (data.dataset_input_layer_specs(), data.dataset_meta_layer_specs())
            extract = lambda s: (s.inputs, pack_meta(s.meta))
            to_sample = lambda i, m: Sample(inputs=i, meta=m)
        elif self.mode == PipelineMode.TARGETS:
            output_signature = (data.dataset_target_layer_specs(), data.dataset_meta_layer_specs())
            extract = lambda s: (s.targets, pack_meta(s.meta))
            to_sample = lambda t, m: Sample(targets=t, meta=m)
        else:
            output_signature = (
                data.dataset_input_layer_specs(),
                data.dataset_target_layer_specs(),
                data.dataset_meta_layer_specs(),
            )
            extract = lambda s: (s.inputs, s.targets, pack_meta(s.meta))
            to_sample = lambda i, t, m: Sample(inputs=i, targets=t, meta=m)

        # Flatten every sample after verifying that it has the same nesting as the signature.
        flat_samples = []
        for sample in samples:
            sample = extract(sample)
            tf.nest.assert_same_structure(sample, output_signature)
            flat_samples.append(tf.nest.flatten(sample))

        def default(dtype):
            # Fallback padding value if the data defines none: False, "" or 0 depending on the dtype.
            if dtype == tf.bool:
                return False
            return "" if dtype == tf.string else 0

        def pad(struct):
            # ``struct`` is a pair of (per-sample values of one flat entry, its spec).
            struct, signature = struct
            # The padding value is looked up by the name of the spec.
            padding_value = data.padding_values().get(signature.name, default(signature.dtype))
            if signature.dtype == "string":
                # String entries are stacked as they are, without any padding.
                return np.stack(struct, axis=0)

            # assert shapes
            for i, axis_dim in enumerate(signature.shape):
                if axis_dim is None:
                    continue
                for s in struct:
                    assert s.shape[i] == axis_dim, f"Shape mismatch. Sample shape {s.shape[i]} but must be {axis_dim}"

            # pad all None axis
            for i, axis_dim in enumerate(signature.shape):
                if axis_dim is not None:
                    continue

                # Largest extent of this axis over all samples of the batch.
                max_dim = max(s.shape[i] for s in struct)

                def pad_shape_for_sample(s):
                    # Pad only at the end of axis ``i`` (up to ``max_dim``); all other axes stay unchanged.
                    # ``i`` and ``max_dim`` are read from the enclosing loop iteration.
                    shape = []
                    for i_ax, ax in enumerate(s.shape):
                        if i_ax == i:
                            shape.append((0, max_dim - ax))
                        else:
                            shape.append((0, 0))
                    return shape

                struct = [np.pad(s, pad_shape_for_sample(s), constant_values=padding_value) for s in struct]

            # Stack the (now equally extended) entries along a new leading axis.
            struct = np.stack(struct, axis=0)
            return struct

        flat_signature = tf.nest.flatten(output_signature)
        # Transpose from "per sample, per entry" to "per entry, per sample".
        batched_samples = zip(*flat_samples)
        batched = list(map(pad, zip(batched_samples, flat_signature)))
        # Restore the nesting of the signature and build the batched Sample.
        batched = tf.nest.pack_sequence_as(output_signature, batched)
        return to_sample(*batched)