Example #1
def dump_results(doc_name, entities, opt):
    entity_id = 1
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for entity in entities:
        anno_entity = bioc.BioCAnnotation()
        passage.add_annotation(anno_entity)
        anno_entity.id = str(entity_id)
        entity_id += 1
        anno_entity.infons['type'] = entity.type
        anno_entity_location = bioc.BioCLocation(
            entity.spans[0][0], entity.spans[0][1] - entity.spans[0][0])
        anno_entity.add_location(anno_entity_location)
        anno_entity.text = entity.name
        if len(entity.norm_ids) > 0:
            anno_entity.infons['UMLS code'] = entity.norm_ids[0]
            anno_entity.infons['UMLS term'] = entity.norm_names[0]
        else:
            anno_entity.infons['UMLS code'] = 'N/A'
            anno_entity.infons['UMLS term'] = 'N/A'

    with codecs.open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w',
                     'UTF-8') as fp:
        bioc.dump(collection, fp)
Example #2
    def dump(self, data_rows, filename=None):
        filename = filename or tempfile.mkstemp(prefix="PredictedFile",
                                                suffix=".xml")[1]
        # Read file and do preliminary pre processing to form rows of records
        dic = {}
        for d in data_rows:
            i_relation = I_GENE2 + 1
            # Only bother with true relations
            if not d[i_relation]:
                continue

            # construct dictionary
            docid = d[I_DOC_ID]
            if docid not in dic:
                dic[docid] = {"relations": []}

            dic[docid]["relations"].append((d[I_GENE1], d[I_GENE2]))

        biocrelation_wrapper = BiocRelation()
        with open(filename, 'w') as fp:
            collection = BioCCollection()
            for docid in dic.keys():
                collection.add_document(
                    biocrelation_wrapper.get_bioc_relations(
                        docid, dic[docid]["relations"]))

            self.logger.info("Writing input to %s", filename)
            bioc.dump(collection, fp)
Example #3
def BioC_Converter(Inputfile, Outputfile, Originalfile):

    tiabs = {}
    with open(Originalfile, 'r', encoding='utf8') as file_Originalfile:
        collection = bioc.load(file_Originalfile)
        document_count = 0
        for document in collection.documents:
            passage_count = 0
            for passage in document.passages:
                if document_count not in tiabs:
                    tiabs[document_count] = {}
                tiabs[document_count][passage_count] = passage.text
                passage_count = passage_count + 1
            document_count = document_count + 1

    with open(Outputfile, 'w', encoding='utf8') as file_Outputfile:
        with open(Inputfile, 'r', encoding='utf8') as file_Inputfile:
            collection = bioc.load(file_Inputfile)
            document_count = 0
            for document in collection.documents:
                passage_count = 0
                for passage in document.passages:
                    passage.text = tiabs[document_count][passage_count]
                    for annotation in passage.annotations:
                        start = annotation.locations[0].offset
                        last = start + annotation.locations[0].length
                        annotation.text = tiabs[document_count][passage_count][
                            start:last]
                    passage_count = passage_count + 1
                document_count = document_count + 1
            bioc.dump(collection, file_Outputfile, pretty_print=False)
Example #4
    def dump(self, dataframe, handle):
        # Read file and do preliminary pre processing to form rows of records
        dic = {}

        for i, d in dataframe.iterrows():
            if not d["isValid"]: continue

            i_relation = I_GENE2 + 1
            # Only bother with true relations
            if not d[i_relation]:
                continue

            # construct dictionary
            docid = d["docid"]
            if docid not in dic:
                dic[docid] = {"relations": []}

            dic[docid]["relations"].append(
                (d["participant1"], d["participant2"]))

        biocrelation_wrapper = BiocRelation()

        collection = BioCCollection()
        for docid in dic.keys():
            collection.add_document(
                biocrelation_wrapper.get_bioc_relations(
                    docid, dic[docid]["relations"]))

        self.logger.info("Writing input to handle")
        bioc.dump(collection, handle)
Example #5
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):

    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf8') as fout:
            collection = bioc.load(fin)
            for document in collection.documents:
                for passage in document.passages:
                    tag_result = bioTag(passage.text,
                                        biotag_dic,
                                        nn_model,
                                        onlyLongest=para_set['onlyLongest'],
                                        abbrRecog=para_set['abbrRecog'],
                                        Threshold=para_set['ML_Threshold'])
                    mention_num = 0
                    for ele in tag_result:
                        bioc_note = bioc.BioCAnnotation()
                        bioc_note.id = str(mention_num)
                        mention_num += 1
                        bioc_note.infons['identifier'] = ele[2]
                        bioc_note.infons['type'] = "Phenotype"
                        bioc_note.infons['score'] = ele[3]
                        start = int(ele[0])
                        last = int(ele[1])
                        loc = bioc.BioCLocation(offset=str(start),
                                                length=str(last - start))
                        bioc_note.locations.append(loc)
                        bioc_note.text = passage.text[start:last]
                        passage.annotations.append(bioc_note)
            bioc.dump(collection, fout, pretty_print=True)
Example #6
def csv2collections(dest_top, *sources):
    if not dest_top.exists():
        dest_top.mkdir(parents=True, exist_ok=True)
    total = collections.defaultdict(bioc.BioCCollection)
    for src in sources:
        all_df = pd.read_csv(
            src,
            header=None,
            names=['id', 'report'],
        )
        all_df = all_df.dropna()
        for i, row in tqdm.tqdm(all_df.iterrows(), total=len(all_df)):
            id = row['id']

            text = row['report']
            if text[0] == '"' and text[-1] == '"':
                text = text[1:-1]
            doc = get_one_document(text)
            doc.id = id

            col_id = hashlib.md5(id.encode()).hexdigest()[-2:]
            total[col_id].add_document(doc)

    for k, c in tqdm.tqdm(total.items(), total=len(total)):
        # print(len(c.documents))
        with open(dest_top / f'{k}.xml', 'w', encoding='utf8') as fp:
            bioc.dump(c, fp)
Example #7
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/patterns/neg_patterns.txt')
    argv = get_absolute_path(argv, '--uncertainty-patterns',
                             'negbio/patterns/uncertainty_patterns.txt')

    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = read_cuis(argv['--cuis'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
Example #8
def split(source,
          *,
          prefix: str,
          num_doc: int,
          additional_suffix: str = '.xml',
          suffix_length: int = 2):
    path_format = prefix + '{:0' + str(
        suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    newc = bioc.BioCCollection()
    newc.infons = collection.infons
    newc.source = collection.source
    newc.version = collection.version
    newc.standalone = collection.standalone

    i = 0
    for doc in tqdm.tqdm(collection.documents):
        newc.add_document(doc)
        if len(newc.documents) == num_doc:
            dst = path_format.format(i)
            with open(dst, 'w', encoding='utf8') as fp:
                bioc.dump(newc, fp)
            del newc.documents[:]
            i += 1
    if newc.documents:
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(newc, fp)
Example #9
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)

    collection = text2collection(argv['SOURCE'],
                                 split_document=argv['--split-document'])
    with open(os.path.expanduser(argv['--out']), 'w') as fp:
        bioc.dump(collection, fp)
Example #10
def test_dump():
    collection = _get_collection()
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)
    assert_everything(collection)
Example #11
def test_dump():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp, BioCFileType.BIOC_JSON)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    assert_everything(collection)
Example #12
    def test_dump(self):
        with open(self.src) as fp:
            collection = bioc.load(fp)
        tmp = tempfile.NamedTemporaryFile()
        with open(tmp.name, 'w') as fp:
            bioc.dump(collection, fp)
        with open(tmp.name) as fp:
            collection = bioc.load(fp)
        self.__test_collection(collection)
Example #13
def create_collections():
    filenames = []
    top_dir = tempfile.mkdtemp()
    for i in range(10):
        c = text_to_bioc(['No pneumothorax.'], 'c/d/p')
        filename = os.path.join(top_dir, '{}.xml'.format(i))
        with open(filename, 'w') as fp:
            bioc.dump(c, fp)
        filenames.append(filename)
    return filenames
Example #14
def get_figure_text(src, dest, bioc_dir):
    df = pd.read_csv(src, dtype=str)
    objs = df_to_obj(df)

    # add text
    objs = add_text(objs, bioc_dir)

    collection = bioc.BioCCollection()
    for obj in objs:
        collection.documents.extend(obj.to_bioc_document())
    with open(dest, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
Example #15
def text_to_collection_file(output, *file):
    """
    Convert text FILEs to the BioC output file

    Args:
        output: Specify the output file name.
        file: Specify the input text files
    """
    logging.basicConfig(level=logging.INFO)
    collection = text2collection(*file)
    with open(output, 'w') as fp:
        bioc.dump(collection, fp)
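
As a point of comparison, here is a minimal, self-contained sketch of the same conversion built directly on the bioc package; the one-passage-per-file layout is an assumption, since text2collection (not shown here) may organize the text differently.

import bioc

def texts_to_bioc_file(output, *files):
    # Wrap each plain-text file in a single-passage BioC document,
    # then serialize the whole collection with bioc.dump.
    collection = bioc.BioCCollection()
    for path in files:
        with open(path, encoding='utf8') as fp:
            text = fp.read()
        document = bioc.BioCDocument()
        document.id = path
        passage = bioc.BioCPassage()
        passage.offset = 0
        passage.text = text
        document.add_passage(passage)
        collection.add_document(document)
    with open(output, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)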
Example #16
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--mention_phrases_dir',
                             'negbio/chexpert/phrases/mention')
    argv = get_absolute_path(argv, '--unmention_phrases_dir',
                             'negbio/chexpert/phrases/unmention')
    argv = get_absolute_path(
        argv, '--pre-negation-uncertainty-patterns',
        'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
    argv = get_absolute_path(
        argv, '--post-negation-uncertainty-patterns',
        'negbio/chexpert/patterns/post_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/chexpert/patterns/negation.txt')

    # chexpert
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
                                Path(argv['--unmention_phrases_dir']),
                                verbose=argv['--verbose'])
    neg_detector = ModifiedDetector(
        argv['--pre-negation-uncertainty-patterns'], argv['--neg-patterns'],
        argv['--post-negation-uncertainty-patterns'])
    aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection,
             loader,
             ssplitter,
             extractor,
             parser,
             ptb2dep,
             neg_detector,
             aggregator,
             verbose=argv['--verbose'])

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
Example #17
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]

        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)
        labelSequence = []

        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1

            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]

                    whitespacetoAdd = token['start'] - anno_entity.locations[
                        0].end
                    for _ in range(whitespacetoAdd):
                        anno_entity.text += " "
                    anno_entity.text += token['text']
                    anno_entity.locations[0].length = token[
                        'end'] - anno_entity.locations[0].offset

    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
Example #18
def split_file(source,
               *,
               prefix: str,
               num_doc: int,
               additional_suffix: str = '.xml',
               suffix_length: int = 2):
    path_format = prefix + '{:0' + str(
        suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    for i, subc in tqdm.tqdm(enumerate(itersplit(collection, num_doc))):
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(subc, fp)
Example #19
def save_predictions(ids, relevant, confidence, output):
    collection = bioc.BioCCollection()
    collection.source = 'PubMed'
    now = datetime.datetime.now()
    collection.date = '{}{:02d}{:02d}'.format(now.year, now.month, now.day)
    collection.key = 'collection.key'
    for i, id in enumerate(ids):
        document = bioc.BioCDocument()
        document.id = id
        document.infons['relevant'] = 'no' if relevant[i] == 0 else 'yes'
        if relevant[i] == 1:
            document.infons['confidence'] = '{:.2f}'.format(confidence[i][0])
        else:
            document.infons['confidence'] = '{:.2f}'.format(
                1 - confidence[i][0])
        collection.add_document(document)

    with open(output, 'w') as fp:
        bioc.dump(collection, fp, pretty_print=True)
Example #20
def scan_document(*_, **kwargs):
    """
    Scan each document in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            fn:
                fn should expect the following arguments in this given order:
                    sequence1
                    sequence2
                    ...
                    non_sequence1
                    non_sequence2
                    ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', [])

    if not os.path.exists(directory):
        os.makedirs(directory)

    def catch(document, non_sequences):
        try:
            return fn(document, *non_sequences)
        except:
            logging.exception('Cannot process %s', document.id)

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with io.open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)
        collection.documents = [
            catch(doc, non_sequences) for doc in collection.documents
        ]
        with io.open(dstname, 'w', encoding='utf8') as fp:
            bioc.dump(collection, fp)
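
A hedged usage sketch for the function above; clean_document, the file names, and the length limit are hypothetical. The keyword arguments follow the function itself: fn receives each document followed by the non_sequences entries and must return the (possibly modified) document.

def clean_document(document, max_length):
    # Hypothetical per-document transform: truncate overly long passages.
    for passage in document.passages:
        if passage.text and len(passage.text) > max_length:
            passage.text = passage.text[:max_length]
    return document

scan_document(source=['corpus/a.xml', 'corpus/b.xml'],
              directory='out',
              suffix='.clean.xml',
              fn=clean_document,
              non_sequences=[5000],
              verbose=False)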
Example #21
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)

    splitter = ssplit.NltkSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = parse.Bllip(model_dir=argv['--bllip-model'])
    ptb2dep = ptb2ud.Ptb2DepConverter(universal=True)
    lemmatizer = ptb2ud.Lemmatizer()
    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'], argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = dner_mm.read_cuis(argv['--cuis'])

    collection = text2bioc.text2collection(argv['SOURCE'], split_document=argv['--split-document'])
    pipeline(collection, mm, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis)

    with open(os.path.expanduser(argv['--out']), 'w') as fp:
        bioc.dump(collection, fp)
Example #22
def test_split_file(tmp_path):
    total_doc = 230
    n = 7
    c = get_collection(total_doc)

    source = tmp_path / 'foo.xml'
    with open(source, 'w') as fp:
        bioc.dump(c, fp)

    split.split_file(source, prefix=str(tmp_path), num_doc=n)
    for i in range(int(total_doc / n)):
        source = str(tmp_path) + '{:02x}.xml'.format(i)
        with open(source) as fp:
            subc = bioc.load(fp)
            assert len(subc.documents) == n

    last_n = int(math.ceil(total_doc / n))
    if last_n > int(total_doc / n):
        source = str(tmp_path) + '{:02x}.xml'.format(last_n - 1)
        with open(source) as fp:
            subc = bioc.load(fp)
            assert len(subc.documents) == total_doc % n
Example #23
def scan_collection(*_, **kwargs):
    """
        Scan the collection in each BioC source file, apply fn, and print to directory.

        Args:
            kwargs:
                source(list): a list of source pathnames
                directory(str): output directory
                fn:
                    fn should expect the following arguments in this given order:
                        sequence1
                        sequence2
                        ...
                        non_sequence1
                        non_sequence2
                        ...
                verbose(boolean):
        """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', None)

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with open(pathname) as fp:
            collection = bioc.load(fp)
            try:
                args = [collection] + non_sequences
                fn(*args)
            except:
                logging.exception('Cannot process %s', collection.source)
        with open(dstname, 'w') as fp:
            bioc.dump(collection, fp)
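
A corresponding sketch for this collection-level variant; tag_source and the arguments are hypothetical. Note that the return value of fn is discarded, so it must modify the collection in place, that non_sequences must be a list because it is concatenated with [collection], and that the output directory must already exist, since this variant does not create it.

def tag_source(collection, source_name):
    # Hypothetical in-place transform: record the source on the collection
    # and on every document before the collection is written back out.
    collection.source = source_name
    for document in collection.documents:
        document.infons['source'] = source_name

scan_collection(source=['corpus/a.xml', 'corpus/b.xml'],
                directory='out',
                suffix='.tagged.xml',
                fn=tag_source,
                non_sequences=['PubMed'],
                verbose=False)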
Example #24
def evaluate_via_bioc(test_docs,
                      crf,
                      extractor,
                      prediction_dir,
                      made_base_dir=None):
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1

    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:
        #print('Working on document : {}'.format(test_doc.filename))

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[
                    token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)
            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()

                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n',
                                                              '\\n').replace(
                                                                  '\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)

                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
Example #25
def test(data, opt, predict_dir):
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file

    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                         2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:

            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Example #26
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    logging.info("loading ... vocab")
    relation_vocab = pickle.load(
        open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb'))

    logging.info("loading ... result")
    results = pickle.load(open(os.path.join(opt.output, 'results.pkl'), "rb"))

    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:

            if doc_name == result['doc_name']:

                former = doc_entity[(
                    doc_entity['id'] == result['former_id'])].iloc[0]
                latter = doc_entity[(
                    doc_entity['id'] == result['latter_id'])].iloc[0]

                relation_type = relation_vocab.lookup_id2str(result['type'])
                if relation_type == '<unk>':
                    continue
                elif my_utils.relationConstraint1(relation_type,
                                                  former['type'],
                                                  latter['type']) == False:
                    continue
                else:
                    bioc_relation = bioc.BioCRelation()
                    passage.add_relation(bioc_relation)
                    bioc_relation.id = str(relation_id)
                    relation_id += 1
                    bioc_relation.infons['type'] = relation_type

                    node1 = bioc.BioCNode(former['id'], 'annotation 1')
                    bioc_relation.add_node(node1)
                    node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                    bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Example #27
def predict(opt, data):

    seq_model = SeqModel(data)
    if opt.test_in_cpu:
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location={cuda_src:cuda_dst}))


    seq_wordseq = WordSequence(data, False, True, True, True)
    if opt.test_in_cpu:
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    classify_model = ClassifyModel(data)
    if opt.test_in_cpu:
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location={cuda_src:cuda_dst}))

    classify_wordseq = WordSequence(data, True, False, True, False)
    if opt.test_in_cpu:
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input,f)) and f[0]!='.']


    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):

        start = time.time()
        fileName = join(opt.input,input_files[idx])
        doc_name = input_files[idx]

        doc_token = processOneFile(fileName)

        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                                                   data.feature_alphabets, data.label_alphabet,
                                                                   data.number_normalized,
                                                                   data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)


        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text


        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(classify_wordseq, classify_model, test_X, data, test_other, data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)


        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end-start))



    logging.info("test finished")
Example #28
"""
Convert text FILEs to the BioC output file

Usage:
    negbio_text2bioc [options] --output=<file> <file> ...

Options:
    --output=<file>     Specify the output file name.
    --verbose           Print more information about progress.
"""

import bioc

from negbio.cli_utils import parse_args
from negbio.ext.text2bioc import text2collection

if __name__ == '__main__':
    argv = parse_args(__doc__)
    collection = text2collection(*argv['<file>'])
    with open(argv['--output'], 'w') as fp:
        bioc.dump(collection, fp)