Example #1
0
 def to_dict(self, tokenizer: RobertaTokenizerFast):
     """Return this example as a plain dict.

     Decodes ``self.input_ids`` back to text (cached on ``self.text``)
     and merges in all entries of ``self.attrib``.
     """
     self.text = tokenizer.decode(self.input_ids)
     # strip() removes the leading space the RobertaTokenizer inserts
     result = {'text': self.text.strip()}
     result.update(self.attrib)
     return result
Example #2
0
def self_test(tokenizer: RobertaTokenizerFast):
    """Smoke-test the data-preparation pipeline end to end.

    Writes a small annotated XML example to a temporary directory, runs
    :class:`Preparator` over it, prints the tokenized output, and asserts
    that the produced tokens and per-task label codes match the expected
    values. The temporary directory is removed even when an assertion fails.
    """
    example = "<xml>Here <sd-panel>it is<sd-tag role='reporter'> </sd-tag>: <i>nested <sd-tag role='reporter'>in</sd-tag> <sd-tag category='entity' type='gene' role='intervention'>Creb-1</sd-tag> with some <sd-tag type='protein' role='assayed'>tail</sd-tag></i>. End </sd-panel>."
    example += ' 1 2 3 4 5 6 7 8 9 0' + '</xml>'  # to test truncation
    path = Path('/tmp/test_dataprep')
    path.mkdir()
    source_path = path / 'source'
    source_path.mkdir()
    dest_dir_path = path / 'dataset'
    source_file_path = source_path / 'example.xml'
    source_file_path.write_text(example)
    max_length = 20  # in tokens!
    expected_tokens = [
        '<s>', 'Here', 'Ġit', 'Ġis', 'Ġ:', 'Ġnested', 'Ġin', 'ĠCre', 'b', '-',
        '1', 'Ġwith', 'Ġsome', 'Ġtail', '.', 'ĠEnd', 'Ġ.', 'Ġ1', 'Ġ2', '</s>'
    ]
    # NOTE(review): 'boring' is not among the tasks handed to Preparator
    # below and is never asserted — kept for reference only; confirm whether
    # it should be tested.
    expected_label_codes = {
        'entity_types': [
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GENEPROD', 'I-GENEPROD',
            'I-GENEPROD', 'I-GENEPROD', 'O', 'O', 'B-GENEPROD', 'O', 'O', 'O',
            'O', 'O', 'O'
        ],
        'geneprod_roles': [
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CONTROLLED_VAR',
            'I-CONTROLLED_VAR', 'I-CONTROLLED_VAR', 'I-CONTROLLED_VAR', 'O',
            'O', 'B-MEASURED_VAR', 'O', 'O', 'O', 'O', 'O', 'O'
        ],
        'boring': [
            'O', 'O', 'O', 'O', 'O', 'O', 'B-BORING', 'O', 'O', 'O', 'O', 'O',
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ],
        'panel_start': [
            'O', 'O', 'B-PANEL_START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ]
    }
    try:
        data_prep = Preparator(
            source_path,
            dest_dir_path,
            tokenizer, [sd.ENTITY_TYPES, sd.GENEPROD_ROLES, sd.PANELIZATION],
            max_length=max_length)
        labeled_examples = data_prep.run()
        print("\nXML examples:")
        print(example)
        print("\nLabel codes: ")
        tokenized = labeled_examples[0]['tokenized']
        # Print one line per token: token, decoded form, tab-joined labels.
        for i, (token, input_id) in enumerate(
                zip(tokenized.tokens(), tokenized.input_ids)):
            decoded = tokenizer.decode(input_id)
            label_ids = "\t".join([
                labels[i]
                for labels in labeled_examples[0]['label_ids'].values()
            ])
            print(f"{token}\t{decoded}\t{label_ids}")
        labeled_example_label_ids = labeled_examples[0]['label_ids']
        assert tokenized.tokens() == expected_tokens, tokenized.tokens()
        # One loop replaces the three copy-pasted assert stanzas; 'boring'
        # is deliberately excluded (see NOTE above).
        for task in ('entity_types', 'geneprod_roles', 'panel_start'):
            assert labeled_example_label_ids[task] == expected_label_codes[
                task], labeled_example_label_ids[task]
        assert data_prep.verify()
        filepath = dest_dir_path / "data.jsonl"
        print(f"\nContent of saved file ({filepath}):")
        with filepath.open() as f:
            for line in f:
                j = json.loads(line)
                print(json.dumps(j))
    finally:
        # Bug fix: remove the directory actually created above and report
        # the correct path (message previously said '/tmp/test_corpus').
        shutil.rmtree(path)
        print(f"cleaned up and removed {path}")
    print("Looks like it is working!")