def to_dict(self, tokenizer: RobertaTokenizerFast):
    """Serialize this example to a plain dict.

    Decodes ``self.input_ids`` back to text with the given tokenizer,
    stores it on ``self.text``, and returns a dict holding the stripped
    text plus every key/value pair from ``self.attrib``.
    """
    self.text = tokenizer.decode(self.input_ids)
    # .strip() removes the leading space added by the RobertaTokenizer
    result = {'text': self.text.strip()}
    result.update(self.attrib)
    return result
def self_test(tokenizer: RobertaTokenizerFast):
    """End-to-end smoke test of the data-preparation pipeline.

    Writes a small annotated XML example to a scratch directory, runs
    ``Preparator`` on it, prints the tokenized output, and asserts that
    tokens and label codes match the expected values. The scratch
    directory is always removed, even on failure.

    Fixes over the previous version:
    - scratch dirs are created with ``parents=True, exist_ok=True`` so a
      rerun after a crashed run does not raise ``FileExistsError``;
    - cleanup uses the ``path`` variable (no duplicated hard-coded path)
      and the cleanup message reports the directory actually removed
      (it previously claimed "/tmp/test_corpus");
    - the success message is printed only after all assertions pass.
    """
    # example = '<sd-panel> of an adult <sd-tag type="gene">Prox1</sd-tag>-<sd-tag type="gene">Cre</sd-tag><sd-tag type="gene">ER</sd-tag>T2;<sd-tag type="gene">Ilk</sd-tag>+/+ <sd-tag type="organism">mouse</sd-tag> (referred to as "Adult Control")</sd-panel>'
    example = "<xml>Here <sd-panel>it is<sd-tag role='reporter'> </sd-tag>: <i>nested <sd-tag role='reporter'>in</sd-tag> <sd-tag category='entity' type='gene' role='intervention'>Creb-1</sd-tag> with some <sd-tag type='protein' role='assayed'>tail</sd-tag></i>. End </sd-panel>."
    example += ' 1 2 3 4 5 6 7 8 9 0' + '</xml>'  # to test truncation
    path = Path('/tmp/test_dataprep')
    # exist_ok: a previous run that crashed before cleanup must not break reruns
    path.mkdir(parents=True, exist_ok=True)
    source_path = path / 'source'
    source_path.mkdir(exist_ok=True)
    dest_dir_path = path / 'dataset'
    source_file_path = source_path / 'example.xml'
    source_file_path.write_text(example)
    max_length = 20  # in tokens!
    expected_tokens = [
        '<s>', 'Here', 'Ġit', 'Ġis', 'Ġ:', 'Ġnested', 'Ġin', 'ĠCre', 'b',
        '-', '1', 'Ġwith', 'Ġsome', 'Ġtail', '.', 'ĠEnd', 'Ġ.', 'Ġ1', 'Ġ2',
        '</s>'
    ]
    expected_label_codes = {
        'entity_types': [
            'O', 'O', 'O', 'O', 'O', 'O', 'O',
            'B-GENEPROD', 'I-GENEPROD', 'I-GENEPROD', 'I-GENEPROD',
            'O', 'O', 'B-GENEPROD', 'O', 'O', 'O', 'O', 'O', 'O'
        ],
        'geneprod_roles': [
            'O', 'O', 'O', 'O', 'O', 'O', 'O',
            'B-CONTROLLED_VAR', 'I-CONTROLLED_VAR', 'I-CONTROLLED_VAR', 'I-CONTROLLED_VAR',
            'O', 'O', 'B-MEASURED_VAR', 'O', 'O', 'O', 'O', 'O', 'O'
        ],
        'boring': [
            'O', 'O', 'O', 'O', 'O', 'O', 'B-BORING',
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ],
        'panel_start': [
            'O', 'O', 'B-PANEL_START', 'O', 'O', 'O', 'O',
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ]
    }
    try:
        data_prep = Preparator(
            source_path, dest_dir_path, tokenizer,
            [sd.ENTITY_TYPES, sd.GENEPROD_ROLES, sd.PANELIZATION],
            max_length=max_length)
        labeled_examples = data_prep.run()
        print("\nXML examples:")
        print(example)
        print("\nLabel codes: ")
        tokenized = labeled_examples[0]['tokenized']
        label_ids_by_task = labeled_examples[0]['label_ids']
        for i in range(len(tokenized.input_ids)):
            token = tokenized.tokens()[i]
            input_id = tokenized.input_ids[i]
            decoded = tokenizer.decode(input_id)
            label_ids = "\t".join([
                labels[i] for labels in label_ids_by_task.values()
            ])
            print(f"{token}\t{decoded}\t{label_ids}")
        labeled_example_label_ids = labeled_examples[0]['label_ids']
        assert labeled_examples[0]['tokenized'].tokens(
        ) == expected_tokens, labeled_examples[0]['tokenized'].tokens()
        assert labeled_example_label_ids[
            'entity_types'] == expected_label_codes[
            'entity_types'], labeled_example_label_ids['entity_types']
        assert labeled_example_label_ids[
            'geneprod_roles'] == expected_label_codes[
            'geneprod_roles'], labeled_example_label_ids['geneprod_roles']
        assert labeled_example_label_ids[
            'panel_start'] == expected_label_codes[
            'panel_start'], labeled_example_label_ids['panel_start']
        assert data_prep.verify()
        filepath = dest_dir_path / "data.jsonl"
        print(f"\nContent of saved file ({filepath}):")
        with filepath.open() as f:
            for line in f:
                j = json.loads(line)
                print(json.dumps(j))
    finally:
        # always remove the scratch dir, even when an assertion above fails
        shutil.rmtree(path)
        print(f"cleaned up and removed {path}")
    # reached only when every assertion passed
    print("Looks like it is working!")