def test_none(self):
    n = TextRegularizerProcessorParams(
        rulesets=[],
        rulegroups=["no"],
    ).create(None, PipelineMode.TARGETS)
    self.assertNotEqual(n(Sample(targets="“Resolve quotes”")).targets, "''Resolve quotes''")
    self.assertNotEqual(n(Sample(targets=" “Resolve spaces ” ")).targets, "''Resolve spaces ''")
def to_unbatched_samples(inputs, targets, outputs, meta) -> Iterable[Sample]:
    flatted_values = tf.nest.flatten(meta) + tf.nest.flatten(outputs)
    batch_size = flatted_values[0].shape[0] if len(flatted_values) > 0 else None
    if inputs is not None:
        inputs = unbatched(inputs, batch_size)
    if targets is not None:
        targets = unbatched(targets, batch_size)
    if outputs is not None:
        outputs = unbatched(outputs, batch_size)
    if meta is not None:
        meta = unbatched(meta, batch_size)
    batch_size = len(inputs or targets or outputs or meta)
    for i in range(batch_size):
        yield Sample(
            inputs=inputs[i] if inputs else None,
            targets=targets[i] if targets else None,
            outputs=outputs[i] if outputs else None,
            meta=meta[i] if meta else None,
        )
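# A hedged usage sketch for to_unbatched_samples, assuming the `unbatched`
# helper splits each nested structure along its leading batch axis into a
# list of per-sample structures; the keys and shapes below are invented.
import numpy as np

outputs = {"logits": np.zeros((2, 10))}
meta = {"id": np.asarray([[0], [1]])}
for s in to_unbatched_samples(None, None, outputs, meta):
    print(s.outputs["logits"].shape)  # (10,) per sample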
def join(samples: List[Sample]):
    return Sample(
        inputs=samples[0].inputs,
        targets=samples[0].targets,
        outputs=[s.outputs for s in samples],
        meta=[s.meta for s in samples],
    )
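# Illustration of join: it keeps the first sample's inputs and targets and
# collects every sample's outputs and meta into lists, e.g. when merging the
# predictions of several models on the same input. The values are invented.
a = Sample(inputs="x", targets="y", outputs={"class": 0}, meta={"model": 0})
b = Sample(inputs="x", targets="y", outputs={"class": 1}, meta={"model": 1})
joined = join([a, b])
assert joined.outputs == [{"class": 0}, {"class": 1}]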
def to_sample(self) -> Sample:
    return Sample(
        inputs=self.data_path,
        targets=self.gt,
        meta={"data_path": self.data_path, "data_bins": self.data_bins},
    )
def apply(self, sample: Sample) -> Sample:
    inputs = sample.inputs.copy()
    # scale pixel values from [0, 255] to [0, 1]
    inputs["img"] = inputs["img"] / 255
    if self.params.center:
        # optionally shift to [-1, 1]
        inputs["img"] = (inputs["img"] - 0.5) * 2
    return sample.new_inputs(inputs)
def load_sample(fn) -> Sample:
    img = cv2.imread(fn, flags=cv2.IMREAD_GRAYSCALE)
    gt_path = fn + ".txt"
    if os.path.exists(gt_path):
        with open(gt_path) as f:
            gt = np.asarray([int(f.read())])
    else:
        gt = None
    return Sample(inputs={"img": img}, targets={"gt": gt}, meta={"fn": fn})
def to_sample(d: dict) -> Sample:
    return Sample(
        inputs={
            Keys.InputSentence1: d["sentence1"].decode("utf-8"),
            Keys.InputSentence2: d["sentence2"].decode("utf-8"),
        },
        targets={Keys.Target: np.asarray([d["label"]])},
        meta={"index": int(d["idx"])},
    )
def apply(self, sample: Sample) -> Sample:
    img = sample.inputs.transpose()
    encoded = [self.data_params.codec.index(c) for c in sample.targets]
    return sample.new_inputs({
        Keys.Image: img,
        Keys.ImageLength: np.array([img.shape[0]]),
    }).new_targets({
        Keys.Targets: np.array(encoded),
        Keys.TargetsLength: np.array([len(encoded)]),
    })
def vote(self, sample: Sample) -> Sample:
    # sample.outputs is a list with the output of each model;
    # perform a simple majority vote over the predicted classes
    counts = {}
    for output in sample.outputs:
        p = output["class"]
        counts[p] = counts.get(p, 0) + 1
    voted = max(counts.items(), key=lambda kv: kv[1])[0]
    return sample.new_outputs({"class": voted})
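# Usage sketch for the majority vote; `voter` stands in for an instance of
# the surrounding voter class, and the class labels are invented.
s = Sample(outputs=[{"class": 1}, {"class": 2}, {"class": 1}])
voted = voter.vote(s)
assert voted.outputs == {"class": 1}  # two of three models agree on class 1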
def extract_meta(sample: Sample) -> Sample:
    meta = sample.meta or {}
    if "meta" in sample.inputs:
        input_meta = sample.inputs["meta"]
        if isinstance(input_meta, (list, np.ndarray)):
            assert len(input_meta) == 1, "Expected exactly one meta entry per sample"
            input_meta = input_meta[0]
        meta.update(**json.loads(input_meta))
    return sample.new_meta(meta)
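# Sketch of the expected input layout for extract_meta, assuming the meta was
# packed as a one-element list holding a JSON string (as pack_meta further
# below does); the keys are invented.
s = Sample(inputs={"meta": ['{"id": 7}']}, meta={"split": "train"})
s = extract_meta(s)
assert s.meta == {"split": "train", "id": 7}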
def test_standalone_pipeline(self):
    from tfaip.imports import DataBaseParams

    class TestDataParams(DataBaseParams):
        @staticmethod
        def cls():
            raise NotImplementedError

    data_params = TestDataParams()
    samples = [Sample()] * 100
    pipeline = data_params.pre_proc.create(DataPipelineParams(num_processes=8), data_params)
    for i, d in enumerate(pipeline.apply(samples)):
        print(i, d)
def make_sample(self, file_id: str):
    sample = Sample(
        inputs={k: self.parsed_files[k][file_id] for k in self._input_keys},
        targets={k: self.parsed_files[k][file_id] for k in self._target_keys},
        meta={
            "id": file_id,
            **{k + "_filename": v[file_id] for k, v in self.parsed_files.items()},
        },
    )
    # unwrap single-entry dicts to their plain value
    if len(sample.inputs) == 1:
        sample = sample.new_inputs(list(sample.inputs.values())[0])
    if len(sample.targets) == 1:
        sample = sample.new_targets(list(sample.targets.values())[0])
    return sample
def _unwrap_batch(self, inputs, targets, outputs, meta) -> Iterable[Sample]:
    try:
        batch_size = next(iter(inputs.values())).shape[0]
    except StopIteration as e:
        raise ValueError(f"Empty inputs {inputs}") from e
    for i in range(batch_size):
        un_batched_outputs = [{k: v[i] for k, v in output.items()} for output in outputs]
        un_batched_inputs = {k: v[i] for k, v in inputs.items()}
        un_batched_targets = {k: v[i] for k, v in targets.items()}
        un_batched_meta = {k: v[i] for k, v in meta.items()}
        parsed_meta = json.loads(un_batched_meta["meta"][0].decode("utf-8"), cls=TFAIPJsonDecoder)
        sample = Sample(
            inputs=un_batched_inputs,
            outputs=un_batched_outputs,
            targets=un_batched_targets,
            meta=parsed_meta,
        )
        yield sample
def generator(params: PredictionGeneratorParams, data: DataBase, scenario: ScenarioBase, queue: Queue):
    # This function is called in a separate thread:
    # load the predictor (and thus the model), predict on the predictor's
    # generator params, and write the results to the output queue.
    logger.info(f"Loading generator model from {params.model} in separate thread")
    predictor = scenario.predictor_cls()(params.predictor_params, data)
    predictor.set_model(params.model + "/serve")
    for s in predictor.predict(params.generator):
        queue.put(Sample(targets=s.targets, inputs=s.inputs, outputs=s.outputs, meta=s.meta))
    queue.put(None)  # sentinel signalling the end of the stream
    logger.info("Generator thread ended.")
def generate(self) -> Iterable[Sample]:
    # Generate the samples.
    # First flatten all, since shuffling is performed during training (on each epoch anew).
    # Also shuffle in evaluation (no effect on the accuracy), so that random examples are displayed.
    flat_samples = []
    for k, filenames in self.params.image_files.items():
        for fn in filenames:
            # Pass inputs and targets; metadata is optional but can be useful for debugging
            flat_samples.append(Sample(inputs=fn, targets=k, meta={"filename": fn, "classname": k}))
    if self.mode in {PipelineMode.TRAINING, PipelineMode.EVALUATION}:
        shuffle(flat_samples)
    return flat_samples
def apply(self, sample: Sample) -> Sample:
    def encode_sentences(sentence1, sentence2):
        tokens1 = list(self.tokenizer.tokenize(sentence1)) + [self.tokenizer.sep_token]
        tokens2 = list(self.tokenizer.tokenize(sentence2)) + [self.tokenizer.sep_token]
        # type id 0 for the CLS token and the first sentence, 1 for the second
        return [self.tokenizer.cls_token] + tokens1 + tokens2, [0] + [0] * len(tokens1) + [1] * len(tokens2)

    word_ids, type_ids = encode_sentences(sample.inputs[Keys.InputSentence1], sample.inputs[Keys.InputSentence2])
    word_ids = self.tokenizer.convert_tokens_to_ids(word_ids)
    return sample.new_inputs({
        Keys.InputWordIds: np.asarray(word_ids),
        Keys.InputMask: np.full(fill_value=1, shape=[len(word_ids)], dtype=np.int32),
        Keys.InputTypeIds: np.asarray(type_ids, dtype=np.int32),
    })
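# Resulting layout for a BERT-style tokenizer (a sketch; the actual subword
# tokens depend on the vocabulary):
#   tokens:   [CLS] hello [SEP] world [SEP]
#   type ids:   0     0     0     1     1
#   mask:       1     1     1     1     1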
def to_samples(samples):
    return [
        Sample(inputs={"img": img}, targets={"gt": gt.reshape((1,))}, meta={"index": i})
        for i, (img, gt) in enumerate(zip(*samples))
    ]
def generate(self) -> Iterable[Sample]:
    return [Sample(inputs={"data": [i]}, targets={"targets": [i]}) for i in range(1000)]
def assert_str(p_, in_s, out_s):
    computed = list(p_.apply_on_samples([Sample(targets=in_s)]))[0].targets
    self.assertEqual(out_s, computed, f"Wrong output for string {in_s}.")
def apply(self, sample: Sample) -> Sample:
    img = cv2.imread(sample.inputs, flags=cv2.IMREAD_GRAYSCALE)
    with open(sample.targets) as f:
        txt = f.read().strip()
    return sample.new_inputs(img).new_targets(txt)
def apply(self, sample: Sample) -> Sample:
    # cv2.resize expects dsize as (width, height)
    return sample.new_inputs(
        cv2.resize(sample.inputs, (self.data_params.image_width, self.data_params.image_height))
    )
def apply(self, sample: Sample) -> Sample:
    return sample.new_inputs(sample.inputs + self.params.v)
def to_samples(data) -> Iterable[Sample]:
    for sample_ in data:
        sample = Sample(inputs={"text": sample_["tokens"]}, targets={"tag_ids": sample_["ner_tags"]})
        yield sample
def generate(self) -> Iterable[Sample]:
    return map(lambda s: Sample(inputs=np.array([s]), targets=np.array([s])), self.params.numbers_to_generate)
def apply(self, sample: Sample) -> Sample:
    return sample.new_inputs({"n": np.asarray([sample.inputs])}).new_targets({"n": np.asarray([sample.targets])})
def generate(self) -> Iterable[Sample]:
    return (Sample(inputs=fn) for fn in self.params.image_files)
def generate(self) -> Iterable[Sample]:
    return (
        Sample(inputs=fn, targets=split_all_ext(fn)[0] + ".gt.txt", meta={"filename": fn})
        for fn in self.params.image_files
    )
def sample_to_sop(self, sop_sample: Sample) -> Sample:
    """Note: SOP data sources differ from MLM, since two sentences are needed."""
    sentences = sop_sample.inputs["text"]
    del sop_sample.inputs["text"]
    if self.data_params.segment_train:
        inputlist = sentences.split(" ")
        nowords = len(inputlist)
        # split only if the sentence has at least 10 words
        if nowords >= 10:
            splitindex = random.randint(4, nowords - 5)
        else:
            splitindex = 0
        textpartone = inputlist[:splitindex]
        textparttwo = inputlist[splitindex:]
        textpartone = " ".join(textpartone)
        textparttwo = " ".join(textparttwo)
        # truncate each part to at most max_token_text_part tokens:
        # keep the end of part one and the start of part two
        first_enc_sentence = self.tokenizer.encode(textpartone)
        if len(first_enc_sentence) > self.data_params.max_token_text_part:
            first_enc_sentence = first_enc_sentence[len(first_enc_sentence) - self.data_params.max_token_text_part:]
        sec_enc_sentence = self.tokenizer.encode(textparttwo)
        if len(sec_enc_sentence) > self.data_params.max_token_text_part:
            sec_enc_sentence = sec_enc_sentence[:self.data_params.max_token_text_part]
    else:
        first_enc_sentence, sec_enc_sentence = self.build_two_sentence_segments(sentences)
    first_mask_enc_sentence, first_masked_index_list = self.mask_enc_sentence(first_enc_sentence)
    sec_mask_enc_sentence, sec_masked_index_list = self.mask_enc_sentence(sec_enc_sentence)
    # add CLS tag (tok_vocab_size) and SEP tags (tok_vocab_size + 1)
    if self.switch_sentences():
        text_index_list = (
            [self.data_params.tok_vocab_size]
            + sec_mask_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
            + first_mask_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
        )
        masked_index_list = [0] + sec_masked_index_list + [0] + first_masked_index_list + [0]
        tar_mlm = (
            [self.data_params.tok_vocab_size]
            + sec_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
            + first_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
        )
        tar_sop = [0]
    else:
        text_index_list = (
            [self.data_params.tok_vocab_size]
            + first_mask_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
            + sec_mask_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
        )
        masked_index_list = [0] + first_masked_index_list + [0] + sec_masked_index_list + [0]
        tar_mlm = (
            [self.data_params.tok_vocab_size]
            + first_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
            + sec_enc_sentence
            + [self.data_params.tok_vocab_size + 1]
        )
        tar_sop = [1]
    sop_sample.inputs = {
        "text": np.asarray(text_index_list),
        "seq_length": np.asarray([len(text_index_list)]),
    }
    sop_sample.targets = {
        "tgt_mlm": np.asarray(tar_mlm),
        "mask_mlm": np.asarray(masked_index_list),
        "tgt_sop": np.asarray(tar_sop),
    }
    if self._wwa:
        word_length_vector, segment_ids = self.build_whole_word_attention_inputs(tar_mlm)
        sop_sample.inputs["word_length_vector"] = np.asarray(word_length_vector)
        sop_sample.inputs["segment_ids"] = np.asarray(segment_ids)
    return sop_sample
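# Resulting index layout (a sketch; V = tok_vocab_size serves as the CLS id
# and V + 1 as the SEP id):
#   text:    [V] part1 ... [V+1] part2 ... [V+1]
#   tgt_sop: [1] for the original sentence order, [0] if the parts were switched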
def to_samples(samples):
    return [
        Sample(inputs={"img": np.array(img).astype("float")}, targets={"gt": gt.reshape((1,))})
        for img, gt in zip(*samples)
    ]
def _pad_batched_samples(self, samples: List[Sample]) -> Sample:
    """Batches and pads the content of samples"""
    data = self.data

    def pack_meta(meta):
        return {"meta": np.asarray([json.dumps(meta, cls=TFAIPJsonEncoder)])}

    if self.mode == PipelineMode.PREDICTION:
        output_signature = (data.dataset_input_layer_specs(), data.dataset_meta_layer_specs())
        extract = lambda s: (s.inputs, pack_meta(s.meta))
        to_sample = lambda i, m: Sample(inputs=i, meta=m)
    elif self.mode == PipelineMode.TARGETS:
        output_signature = (data.dataset_target_layer_specs(), data.dataset_meta_layer_specs())
        extract = lambda s: (s.targets, pack_meta(s.meta))
        to_sample = lambda t, m: Sample(targets=t, meta=m)
    else:
        output_signature = (
            data.dataset_input_layer_specs(),
            data.dataset_target_layer_specs(),
            data.dataset_meta_layer_specs(),
        )
        extract = lambda s: (s.inputs, s.targets, pack_meta(s.meta))
        to_sample = lambda i, t, m: Sample(inputs=i, targets=t, meta=m)

    flat_samples = []
    for sample in samples:
        sample = extract(sample)
        tf.nest.assert_same_structure(sample, output_signature)
        flat_samples.append(tf.nest.flatten(sample))

    def default(dtype):
        if dtype == tf.bool:
            return False
        return "" if dtype == tf.string else 0

    def pad(struct):
        struct, signature = struct
        padding_value = data.padding_values().get(signature.name, default(signature.dtype))
        if signature.dtype == "string":
            return np.stack(struct, axis=0)
        # assert that all fixed axes match the signature
        for i, axis_dim in enumerate(signature.shape):
            if axis_dim is None:
                continue
            for s in struct:
                assert s.shape[i] == axis_dim, f"Shape mismatch. Sample shape {s.shape[i]} but must be {axis_dim}"
        # pad all dynamic (None) axes to the maximum size in the batch
        for i, axis_dim in enumerate(signature.shape):
            if axis_dim is not None:
                continue
            max_dim = max(s.shape[i] for s in struct)

            def pad_shape_for_sample(s):
                shape = []
                for i_ax, ax in enumerate(s.shape):
                    if i_ax == i:
                        shape.append((0, max_dim - ax))
                    else:
                        shape.append((0, 0))
                return shape

            struct = [np.pad(s, pad_shape_for_sample(s), constant_values=padding_value) for s in struct]
        struct = np.stack(struct, axis=0)
        return struct

    flat_signature = tf.nest.flatten(output_signature)
    batched_samples = zip(*flat_samples)
    batched = list(map(pad, zip(batched_samples, flat_signature)))
    batched = tf.nest.pack_sequence_as(output_signature, batched)
    return to_sample(*batched)
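# Minimal numpy-only sketch of the core padding step for a spec of shape
# (None, 2): the dynamic axis is padded to the batch maximum, then the
# samples are stacked. The shapes and padding value 0 are invented.
import numpy as np

a, b = np.ones((3, 2)), np.ones((5, 2))
max_dim = max(s.shape[0] for s in (a, b))
padded = [np.pad(s, ((0, max_dim - s.shape[0]), (0, 0)), constant_values=0) for s in (a, b)]
batch = np.stack(padded, axis=0)  # shape (2, 5, 2)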