Code example #1
File: data.py Project: uyaseen/bionlp-ost-2019
def create_data_splits(base_path, span_path, data_path):
    train_tokens_span = read_pickle(
        join_path(span_path, 'train-token_span.pkl'))
    dev_tokens_span = read_pickle(join_path(span_path, 'dev-token_span.pkl'))
    split_2_tr = {}
    split_2_dev = {}
    split_3_tr = {}
    split_3_dev = {}
    count = 0
    docs_per_split = len(train_tokens_span) // 2  # half of the training documents go to each split
    for doc_name, values in train_tokens_span.items():
        count += 1
        if count <= docs_per_split:
            split_3_tr[doc_name] = values
            split_2_dev[doc_name] = values
        else:
            split_2_tr[doc_name] = values
            split_3_dev[doc_name] = values
    split_3_tr = {**split_3_tr, **dev_tokens_span}
    split_2_tr = {**split_2_tr, **dev_tokens_span}

    def write_data_for_splits(train_split, dev_split, write_path):
        create_directory(write_path)
        write_pickle(train_split, join_path(write_path,
                                            'train-token_span.pkl'))
        write_pickle(dev_split, join_path(write_path, 'dev-token_span.pkl'))
        parse_from_list(txt_files=list(train_split.keys()),
                        w_path=join_path(write_path, 'train.txt'),
                        doc_token_span_w_path=join_path(
                            write_path, 'train-token_span.pkl'),
                        train_data_path=join_path(data_path, 'train/'),
                        dev_data_path=join_path(data_path, 'dev/'))
        parse_from_list(txt_files=list(dev_split.keys()),
                        w_path=join_path(write_path, 'dev.txt'),
                        doc_token_span_w_path=join_path(
                            write_path, 'dev-token_span.pkl'),
                        train_data_path=join_path(data_path, 'train/'),
                        dev_data_path=join_path(data_path, 'dev/'))

    # create token-span & bio files
    write_data_for_splits(split_2_tr,
                          split_2_dev,
                          write_path=join_path(base_path, 'data-strategy=2/'))
    write_data_for_splits(split_3_tr,
                          split_3_dev,
                          write_path=join_path(base_path, 'data-strategy=3/'))
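
A minimal usage sketch for create_data_splits, assuming the function is importable from data.py; every path below is an illustrative placeholder, not taken from the project. span_path must already contain train-token_span.pkl and dev-token_span.pkl, and data_path must contain the original train/ and dev/ sub-directories.

from data import create_data_splits  # assumed import path

# Hypothetical paths; adjust to the actual corpus layout.
create_data_splits(base_path='resources/',
                   span_path='resources/spans/',
                   data_path='resources/corpus/')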
Code example #2
File: data.py Project: uyaseen/bionlp-ost-2019
def parse_test(data_path, w_path, doc_w_path=None, doc_token_span_w_path=None):
    if doc_token_span_w_path and not file_exists(doc_token_span_w_path):
        print('{} not found, computing doc-level-span information dictionary'.
              format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        # keep a copy of the token spans to avoid re-computing them during training
        write_pickle(documents_spans, doc_token_span_w_path)
        print('{} created'.format(doc_token_span_w_path))
    else:
        documents_spans = read_pickle(doc_token_span_w_path)
    txt_files = get_files(data_path, ext='txt')
    documents_tokens = []
    documents_pos = []
    documents_ortho = []
    documents_fname = []
    for txt_path in txt_files:
        document_tokens = []
        document_pos = []
        document_ortho = []
        document_fname = []
        f_name = get_filename(txt_path)
        sentences = documents_spans[f_name]
        for sentence in sentences:
            sentence_tokens = []
            sentence_pos = []
            sentence_ortho = []
            sentence_fname = []
            for word_dictio in sentence:
                sentence_tokens.append(word_dictio['word'])
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_fname.append(document_fname)
    write_bio_test(w_path,
                   documents_tokens,
                   documents_pos,
                   documents_ortho,
                   documents_fname,
                   sentence_level=True)
    if doc_w_path:
        write_bio_test(doc_w_path,
                       documents_tokens,
                       documents_pos,
                       documents_ortho,
                       documents_fname,
                       sentence_level=False)
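
A hedged sketch of how parse_test might be called, again assuming an importable data module and illustrative paths. The function writes a sentence-level BIO file to w_path and, if doc_w_path is given, a document-level variant as well.

from data import parse_test  # assumed import path

parse_test(data_path='resources/corpus/test/',
           w_path='resources/bio/test.txt',
           doc_w_path='resources/bio/test-doc.txt',
           doc_token_span_w_path='resources/spans/test-token_span.pkl')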
Code example #3
File: model.py Project: uyaseen/bionlp-ost-2019
def reload_mappings(self):
    """
    Load mappings from disk.
    """
    print('reload mappings ....')
    mappings = read_pickle(self.mappings_path)
    self.id_to_word = mappings['id_to_word']
    self.id_to_char = mappings['id_to_char']
    self.id_to_tag = mappings['id_to_tag']
    self.id_to_pos = mappings['id_to_pos']
    self.id_to_ortho = mappings['id_to_ortho']
    self.id_to_segment = mappings['id_to_segment']
    self.id_to_word_1 = mappings['id_to_word_1']
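
reload_mappings expects mappings.pkl to hold a dict with exactly the seven id_to_* keys read above. A toy sketch of that structure follows; every value is invented, only the key names come from the code, and the write_pickle helper and model instance are assumed to exist as in the other examples.

# Toy mappings dict; entries are illustrative placeholders.
mappings = {
    'id_to_word':    {0: '<UNK>', 1: 'protein'},
    'id_to_char':    {0: '<UNK>', 1: 'p'},
    'id_to_tag':     {0: 'O', 1: 'I-Protein'},
    'id_to_pos':     {0: 'NN', 1: 'JJ'},
    'id_to_ortho':   {0: 'x', 1: 'X'},
    'id_to_segment': {0: 'O', 1: 'I-SEGMENT'},
    'id_to_word_1':  {0: '<UNK>'},
}
write_pickle(mappings, model.mappings_path)  # project helper, as used in the other examples
model.reload_mappings()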
Code example #4
File: model.py Project: uyaseen/bionlp-ost-2019
def __init__(self, parameters=None, model_path=None, to_save=True):
    """
    Initialize the model. We either provide the parameters and a path where
    we store the models, or the location of a trained model.
    """
    self.parameters_path = os.path.join(model_path, 'parameters.pkl')
    self.mappings_path = os.path.join(model_path, 'mappings.pkl')
    self.model_path = model_path
    if parameters:
        self.parameters = parameters
        # Create directory for the model if it does not exist
        if not os.path.exists(model_path):
            print('current directory: {}'.format(os.getcwd()))
            print('models_path: {}'.format(model_path))
            os.makedirs(model_path)
        if to_save:
            # Save the parameters to disk
            write_pickle(parameters, self.parameters_path)
    else:
        self.parameters = read_pickle(self.parameters_path)
        self.reload_mappings()
    self.components = {}
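
The constructor supports two modes, sketched below with an illustrative model_path. The parameter dict shows only the keys consumed in evaluate.py (code example #5); the real training configuration presumably has more.

from model import Model  # assumed import path

# Mode 1: fresh model; creates the directory and writes parameters.pkl.
params = {'lower': True, 'zeros': True, 'crf': True, 'tag_scheme': 'iobes'}
model = Model(parameters=params, model_path='models/run-1/')

# Mode 2: reload a trained model; parameters.pkl and mappings.pkl are read back.
model = Model(model_path='models/run-1/')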
Code example #5
File: evaluate.py Project: uyaseen/bionlp-ost-2019
def extract_tagger_predictions(model_path,
                               span_path,
                               output_path=None,
                               f_eval=None,
                               parameters=None,
                               return_raw_predictions=False):
    assert file_exists(span_path)
    documents = read_pickle(span_path)
    if not f_eval:
        model = Model(model_path=model_path)
        parameters = model.parameters
        if 'language_model' not in parameters:
            parameters['language_model'] = False
        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]
        pos_to_id, ortho_to_id, segment_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [model.id_to_pos, model.id_to_ortho, model.id_to_segment]]
        word_to_id_1 = {v: k for k, v in model.id_to_word_1.items()}
        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
        id_to_tag = model.id_to_tag
    else:
        # load mappings
        mappings = read_pickle(join_path(model_path, 'mappings.pkl'))
        id_to_word = mappings['id_to_word']
        id_to_char = mappings['id_to_char']
        id_to_tag = mappings['id_to_tag']
        id_to_pos = mappings['id_to_pos']
        id_to_ortho = mappings['id_to_ortho']
        id_to_segment = mappings['id_to_segment']
        id_to_word_1 = mappings['id_to_word_1']
        # reverse mappings
        word_to_id, char_to_id, tag_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [id_to_word, id_to_char, id_to_tag]]
        pos_to_id, ortho_to_id, segment_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [id_to_pos, id_to_ortho, id_to_segment]]
        word_to_id_1 = {v: k for k, v in id_to_word_1.items()}
    predictions = {}
    docs_count = 0
    for doc_name, sentences in documents.items():
        for sentence in sentences:
            words = [span['word'] for span in sentence]
            start = [span['start'] for span in sentence]
            end = [span['end'] for span in sentence]
            pos = [span['pos'] for span in sentence]
            ortho = [get_ortho_feature(w) for w in words]
            doc_names = [doc_name] * len(words)
            input_dict = {
                'words': words,
                'pos': pos,
                'ortho': ortho,
                'doc_names': doc_names
            }
            sentence_cl = ' '.join(words)
            if parameters['lower']:
                sentence_cl = sentence_cl.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                sentence_cl = zero_digits(sentence_cl)
            words = sentence_cl.split(' ')
            assert len(words) == len(start) == len(end)
            # Prepare input
            sentence = prepare_sentence(input_dict,
                                        word_to_id,
                                        char_to_id,
                                        pos_to_id,
                                        ortho_to_id,
                                        segment_to_id,
                                        word_to_id_1,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, add_label=False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            if not return_raw_predictions:
                y_preds = resolve_inconsistencies(y_preds)
                entities = extract_entities(words, y_preds, start, end)
                if doc_name not in predictions:
                    predictions[doc_name] = []
                if len(entities) > 0:
                    predictions[doc_name] += entities
            else:
                if doc_name not in predictions:
                    predictions[doc_name] = {}
                    predictions[doc_name]['words'] = []
                    predictions[doc_name]['tags'] = []
                    predictions[doc_name]['start'] = []
                    predictions[doc_name]['end'] = []
                predictions[doc_name]['words'].append(words)
                predictions[doc_name]['tags'].append(y_preds)
                predictions[doc_name]['start'].append(start)
                predictions[doc_name]['end'].append(end)
        docs_count += 1
        if docs_count % 100 == 0:
            print('{} documents processed'.format(docs_count))

    if return_raw_predictions:
        return predictions
    else:
        write_predictions(output_path, predictions)
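
A hedged call sketch, assuming the function is importable from evaluate.py; the paths are placeholders. With the default return_raw_predictions=False the extracted entities are handed to write_predictions(output_path, ...); with return_raw_predictions=True a dict keyed by document name (holding 'words', 'tags', 'start' and 'end' lists per sentence) is returned instead.

from evaluate import extract_tagger_predictions  # assumed import path

extract_tagger_predictions(model_path='models/run-1/',
                           span_path='resources/spans/test-token_span.pkl',
                           output_path='predictions/')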
Code example #6
File: data.py Project: uyaseen/bionlp-ost-2019
def parse_from_list(txt_files,
                    w_path,
                    doc_token_span_w_path,
                    train_data_path,
                    dev_data_path,
                    ann_file_ext='ann',
                    append_i_tag=True):
    assert doc_token_span_w_path is not None
    documents_spans = read_pickle(doc_token_span_w_path)
    documents_tokens = []
    documents_tags = []
    documents_pos = []
    documents_ortho = []
    documents_segment = []
    documents_fname = []
    # 'txt_path' is a misnomer: it is a file name without the extension, not a path
    for txt_path in txt_files:
        document_tokens = []
        document_tags = []
        document_pos = []
        document_ortho = []
        document_segment = []
        document_fname = []
        att_path = join_path(train_data_path,
                             '{}.{}'.format(txt_path, ann_file_ext))
        if not file_exists(att_path):
            att_path = join_path(dev_data_path,
                                 '{}.{}'.format(txt_path, ann_file_ext))
        entities_dict = parse_annotation_file(att_path)
        f_name = txt_path
        sentences = documents_spans[f_name]
        for sentence in sentences:
            sentence_tokens = []
            sentence_tags = []
            sentence_pos = []
            sentence_ortho = []
            sentence_segment = []
            sentence_fname = []
            for word_dictio in sentence:
                _, tag = is_token_an_entity(word_dictio, entities_dict)
                if append_i_tag:
                    if tag != 'O':
                        tag = 'I-{}'.format(tag)
                segment = 'O' if tag == 'O' else 'I-SEGMENT'
                sentence_tokens.append(word_dictio['word'])
                sentence_tags.append(tag)
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_segment.append(segment)
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_tags.append(sentence_tags)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_segment.append(sentence_segment)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_tags.append(document_tags)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_segment.append(document_segment)
        documents_fname.append(document_fname)
    write_bio(w_path, documents_tokens, documents_tags, documents_pos,
              documents_ortho, documents_segment, documents_fname)
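
parse_from_list differs from parse (code example #7) in that it receives an explicit list of base file names rather than scanning a directory, and it looks for each .ann annotation file first under train_data_path and then under dev_data_path. This is what lets create_data_splits (code example #1) mix official train and dev documents freely when building the strategy-2 and strategy-3 splits.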
Code example #7
File: data.py Project: uyaseen/bionlp-ost-2019
def parse(data_path,
          w_path,
          doc_token_span_w_path=None,
          ann_file_ext='ann',
          append_i_tag=True):
    create_directory(get_parent_directory(w_path))
    if not file_exists(doc_token_span_w_path):
        print('{} not found, computing doc-level-span information dictionary'.
              format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        # keep a copy of the token spans to avoid re-computing them during training
        write_pickle(documents_spans, doc_token_span_w_path)
        print('{} created'.format(doc_token_span_w_path))
    else:
        documents_spans = read_pickle(doc_token_span_w_path)
    txt_files = get_files(data_path, ext='txt')
    documents_tokens = []
    documents_tags = []
    documents_pos = []
    documents_ortho = []
    documents_segment = []
    documents_fname = []
    for txt_path in txt_files:
        document_tokens = []
        document_tags = []
        document_pos = []
        document_ortho = []
        document_segment = []
        document_fname = []
        att_path = join_path(
            data_path, '{}.{}'.format(get_filename(txt_path), ann_file_ext))
        entities_dict = parse_annotation_file(att_path)
        f_name = get_filename(txt_path)
        sentences = documents_spans[f_name]
        for sentence in sentences:
            sentence_tokens = []
            sentence_tags = []
            sentence_pos = []
            sentence_ortho = []
            sentence_segment = []
            sentence_fname = []
            for word_dictio in sentence:
                _, tag = is_token_an_entity(word_dictio, entities_dict)
                if append_i_tag:
                    if tag != 'O':
                        tag = 'I-{}'.format(tag)
                segment = 'O' if tag == 'O' else 'I-SEGMENT'
                sentence_tokens.append(word_dictio['word'])
                sentence_tags.append(tag)
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_segment.append(segment)
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_tags.append(sentence_tags)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_segment.append(sentence_segment)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_tags.append(document_tags)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_segment.append(document_segment)
        documents_fname.append(document_fname)
    write_bio(w_path, documents_tokens, documents_tags, documents_pos,
              documents_ortho, documents_segment, documents_fname)
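
A minimal sketch of a direct call to parse, assuming an importable data module and illustrative paths; data_path is expected to contain matching .txt and .ann files for a single split.

from data import parse  # assumed import path

parse(data_path='resources/corpus/train/',
      w_path='resources/bio/train.txt',
      doc_token_span_w_path='resources/spans/train-token_span.pkl')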