def dump_results(doc_name, entities, opt):
    entity_id = 1
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for entity in entities:
        anno_entity = bioc.BioCAnnotation()
        passage.add_annotation(anno_entity)
        anno_entity.id = str(entity_id)
        entity_id += 1
        anno_entity.infons['type'] = entity.type
        anno_entity_location = bioc.BioCLocation(
            entity.spans[0][0], entity.spans[0][1] - entity.spans[0][0])
        anno_entity.add_location(anno_entity_location)
        anno_entity.text = entity.name
        if len(entity.norm_ids) > 0:
            anno_entity.infons['UMLS code'] = entity.norm_ids[0]
            anno_entity.infons['UMLS term'] = entity.norm_names[0]
        else:
            anno_entity.infons['UMLS code'] = 'N/A'
            anno_entity.infons['UMLS term'] = 'N/A'

    with codecs.open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w', 'UTF-8') as fp:
        bioc.dump(collection, fp)
def dump(self, data_rows, filename=None):
    filename = filename or tempfile.mkstemp(prefix="PredictedFile",
                                            suffix=".xml")[1]
    # Read file and do preliminary pre processing to form rows of records
    dic = {}
    for d in data_rows:
        i_relation = I_GENE2 + 1
        # Only bother with true relations
        if not d[i_relation]:
            continue
        # construct dictionary
        docid = d[I_DOC_ID]
        if docid not in dic:
            dic[docid] = {"relations": []}
        dic[docid]["relations"].append((d[I_GENE1], d[I_GENE2]))

    biocrelation_wrapper = BiocRelation()
    with open(filename, 'w') as fp:
        collection = BioCCollection()
        for docid in dic.keys():
            collection.add_document(
                biocrelation_wrapper.get_bioc_relations(
                    docid, dic[docid]["relations"]))
        self.logger.info("Writing input to %s", filename)
        bioc.dump(collection, fp)
def BioC_Converter(Inputfile, Outputfile, Originalfile):
    # Collect the original passage texts, keyed by document and passage index.
    tiabs = {}
    with open(Originalfile, 'r', encoding='utf8') as file_Originalfile:
        collection = bioc.load(file_Originalfile)
        document_count = 0
        for document in collection.documents:
            passage_count = 0
            for passage in document.passages:
                if document_count not in tiabs:
                    tiabs[document_count] = {}
                tiabs[document_count][passage_count] = passage.text
                passage_count = passage_count + 1
            document_count = document_count + 1

    # Restore the original texts and re-slice each annotation from its location.
    # The `with` blocks close the files, so no explicit close() calls are needed.
    with open(Outputfile, 'w', encoding='utf8') as file_Outputfile:
        with open(Inputfile, 'r', encoding='utf8') as file_Inputfile:
            collection = bioc.load(file_Inputfile)
            document_count = 0
            for document in collection.documents:
                passage_count = 0
                for passage in document.passages:
                    passage.text = tiabs[document_count][passage_count]
                    for annotation in passage.annotations:
                        start = annotation.locations[0].offset
                        last = start + annotation.locations[0].length
                        annotation.text = tiabs[document_count][passage_count][start:last]
                    passage_count = passage_count + 1
                document_count = document_count + 1
            bioc.dump(collection, file_Outputfile, pretty_print=False)
def dump(self, dataframe, handle):
    # Read the dataframe and do preliminary pre-processing to form rows of records
    dic = {}
    for i, d in dataframe.iterrows():
        if not d["isValid"]:
            continue
        i_relation = I_GENE2 + 1
        # Only bother with true relations
        if not d[i_relation]:
            continue
        # construct dictionary
        docid = d["docid"]
        if docid not in dic:
            dic[docid] = {"relations": []}
        dic[docid]["relations"].append(
            (d["participant1"], d["participant2"]))

    biocrelation_wrapper = BiocRelation()
    collection = BioCCollection()
    for docid in dic.keys():
        collection.add_document(
            biocrelation_wrapper.get_bioc_relations(
                docid, dic[docid]["relations"]))
    self.logger.info("Writing input to handle")
    bioc.dump(collection, handle)
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf8') as fout:
            collection = bioc.load(fin)
            for document in collection.documents:
                for passage in document.passages:
                    tag_result = bioTag(passage.text,
                                        biotag_dic,
                                        nn_model,
                                        onlyLongest=para_set['onlyLongest'],
                                        abbrRecog=para_set['abbrRecog'],
                                        Threshold=para_set['ML_Threshold'])
                    mention_num = 0
                    for ele in tag_result:
                        bioc_note = bioc.BioCAnnotation()
                        bioc_note.id = str(mention_num)
                        mention_num += 1
                        bioc_note.infons['identifier'] = ele[2]
                        bioc_note.infons['type'] = "Phenotype"
                        bioc_note.infons['score'] = ele[3]
                        start = int(ele[0])
                        last = int(ele[1])
                        loc = bioc.BioCLocation(offset=str(start),
                                                length=str(last - start))
                        bioc_note.locations.append(loc)
                        bioc_note.text = passage.text[start:last]
                        passage.annotations.append(bioc_note)
            bioc.dump(collection, fout, pretty_print=True)
def csv2collections(dest_top, *sources):
    if not dest_top.exists():
        dest_top.mkdir(parents=True, exist_ok=True)

    total = collections.defaultdict(bioc.BioCCollection)
    for src in sources:
        all_df = pd.read_csv(src, header=None, names=['id', 'report'])
        all_df = all_df.dropna()
        for i, row in tqdm.tqdm(all_df.iterrows(), total=len(all_df)):
            id = row['id']
            text = row['report']
            # Strip surrounding quotes left over from the CSV export
            if text[0] == '"' and text[-1] == '"':
                text = text[1:-1]
            doc = get_one_document(text)
            doc.id = id
            # Bucket documents into collections by the last two hex digits of the id hash
            col_id = hashlib.md5(id.encode()).hexdigest()[-2:]
            total[col_id].add_document(doc)

    for k, c in tqdm.tqdm(total.items(), total=len(total)):
        # print(len(c.documents))
        with open(dest_top / f'{k}.xml', 'w', encoding='utf8') as fp:
            bioc.dump(c, fp)
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/patterns/neg_patterns.txt')
    argv = get_absolute_path(argv, '--uncertainty-patterns',
                             'negbio/patterns/uncertainty_patterns.txt')

    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = read_cuis(argv['--cuis'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
def split(source, *, prefix: str, num_doc: int,
          additional_suffix: str = '.xml', suffix_length: int = 2):
    path_format = prefix + '{:0' + str(suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    # Copy the collection-level metadata into the new (sub)collection.
    newc = bioc.BioCCollection()
    newc.infons = collection.infons
    newc.source = collection.source
    newc.version = collection.version
    newc.standalone = collection.standalone

    i = 0
    for doc in tqdm.tqdm(collection.documents):
        newc.add_document(doc)
        if len(newc.documents) == num_doc:
            dst = path_format.format(i)
            with open(dst, 'w', encoding='utf8') as fp:
                bioc.dump(newc, fp)
            del newc.documents[:]
            i += 1

    # Write out any remaining documents that did not fill a full chunk.
    if newc.documents:
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(newc, fp)
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    collection = text2collection(argv['SOURCE'],
                                 split_document=argv['--split-document'])
    with open(os.path.expanduser(argv['--out']), 'w') as fp:
        bioc.dump(collection, fp)
def test_dump():
    collection = _get_collection()
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)
    assert_everything(collection)
def test_dump():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp, BioCFileType.BIOC_JSON)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    assert_everything(collection)
def test_dump(self):
    with open(self.src) as fp:
        collection = bioc.load(fp)
    tmp = tempfile.NamedTemporaryFile()
    with open(tmp.name, 'w') as fp:
        bioc.dump(collection, fp)
    with open(tmp.name) as fp:
        collection = bioc.load(fp)
    self.__test_collection(collection)
def create_collections():
    filenames = []
    top_dir = tempfile.mkdtemp()
    for i in range(10):
        c = text_to_bioc(['No pneumothorax.'], 'c/d/p')
        filename = os.path.join(top_dir, '{}.xml'.format(i))
        with open(filename, 'w') as fp:
            bioc.dump(c, fp)
        filenames.append(filename)
    return filenames
def get_figure_text(src, dest, bioc_dir):
    df = pd.read_csv(src, dtype=str)
    objs = df_to_obj(df)
    # add text
    objs = add_text(objs, bioc_dir)

    collection = bioc.BioCCollection()
    for obj in objs:
        collection.documents.extend(obj.to_bioc_document())

    with open(dest, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
def text_to_collection_file(output, *file):
    """
    Convert text FILEs to the BioC output file

    Args:
        output: Specify the output file name.
        file: Specify the input text files
    """
    logging.basicConfig(level=logging.INFO)
    collection = text2collection(*file)
    with open(output, 'w') as fp:
        bioc.dump(collection, fp)
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--mention_phrases_dir',
                             'negbio/chexpert/phrases/mention')
    argv = get_absolute_path(argv, '--unmention_phrases_dir',
                             'negbio/chexpert/phrases/unmention')
    argv = get_absolute_path(argv, '--pre-negation-uncertainty-patterns',
                             'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--post-negation-uncertainty-patterns',
                             'negbio/chexpert/patterns/post_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/chexpert/patterns/negation.txt')

    # chexpert
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
                                Path(argv['--unmention_phrases_dir']),
                                verbose=argv['--verbose'])
    neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
                                    argv['--neg-patterns'],
                                    argv['--post-negation-uncertainty-patterns'])
    aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep,
             neg_detector, aggregator, verbose=argv['--verbose'])

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]
        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(file_name, idx)

        labelSequence = []
        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                # start of a new entity
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1
            elif label[0] == 'M' or label[0] == 'E':
                # continuation of the previous entity: extend its text and length
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]
                    whitespacetoAdd = token['start'] - anno_entity.locations[0].end
                    for _ in range(whitespacetoAdd):
                        anno_entity.text += " "
                    anno_entity.text += token['text']
                    anno_entity.locations[0].length = token['end'] - anno_entity.locations[0].offset

    # Use a context manager so the output file is closed after dumping.
    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
def split_file(source, *, prefix: str, num_doc: int,
               additional_suffix: str = '.xml', suffix_length: int = 2):
    path_format = prefix + '{:0' + str(suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    for i, subc in tqdm.tqdm(enumerate(itersplit(collection, num_doc))):
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(subc, fp)
def save_predictions(ids, relevant, confidence, output):
    collection = bioc.BioCCollection()
    collection.source = 'PubMed'
    now = datetime.datetime.now()
    collection.date = '{}{:02d}{:02d}'.format(now.year, now.month, now.day)
    collection.key = 'collection.key'

    for i, id in enumerate(ids):
        document = bioc.BioCDocument()
        document.id = id
        document.infons['relevant'] = 'no' if relevant[i] == 0 else 'yes'
        if relevant[i] == 1:
            document.infons['confidence'] = '{:.2f}'.format(confidence[i][0])
        else:
            document.infons['confidence'] = '{:.2f}'.format(1 - confidence[i][0])
        collection.add_document(document)

    # Use a context manager so the output file handle is closed after dumping.
    with open(output, 'w') as fp:
        bioc.dump(collection, fp, pretty_print=True)
def scan_document(*_, **kwargs):
    """
    Scan each document in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            fn: fn should expect the following arguments in this given order:
                sequence1 sequence2 ... non_sequence1 non_sequence2 ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', [])

    if not os.path.exists(directory):
        os.makedirs(directory)

    def catch(document, non_sequences):
        try:
            return fn(document, *non_sequences)
        except:
            logging.exception('Cannot process %s', document.id)

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with io.open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)
        collection.documents = [
            catch(doc, non_sequences) for doc in collection.documents
        ]
        with io.open(dstname, 'w', encoding='utf8') as fp:
            bioc.dump(collection, fp)
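A minimal usage sketch for the scan_document helper above, assuming hypothetical input file names and a no-op fn; only the keyword arguments the function itself pops are used.

# Hypothetical usage of scan_document; `keep_unchanged` and the file names are
# made up for illustration only.
def keep_unchanged(document):
    # fn receives one BioCDocument (plus any non_sequences) and must return it
    return document

scan_document(source=['reports/0.xml', 'reports/1.xml'],
              directory='output',
              suffix='.scanned.xml',
              fn=keep_unchanged,
              non_sequences=[])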
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)

    splitter = ssplit.NltkSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = parse.Bllip(model_dir=argv['--bllip-model'])
    ptb2dep = ptb2ud.Ptb2DepConverter(universal=True)
    lemmatizer = ptb2ud.Lemmatizer()
    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = dner_mm.read_cuis(argv['--cuis'])

    collection = text2bioc.text2collection(argv['SOURCE'],
                                           split_document=argv['--split-document'])
    pipeline(collection, mm, splitter, parser, ptb2dep, lemmatizer,
             neg_detector, cuis)

    with open(os.path.expanduser(argv['--out']), 'w') as fp:
        bioc.dump(collection, fp)
def test_split_file(tmp_path):
    total_doc = 230
    n = 7
    c = get_collection(total_doc)

    source = tmp_path / 'foo.xml'
    with open(source, 'w') as fp:
        bioc.dump(c, fp)

    split.split_file(source, prefix=str(tmp_path), num_doc=n)

    for i in range(int(total_doc / n)):
        source = str(tmp_path) + '{:02x}.xml'.format(i)
        with open(source) as fp:
            subc = bioc.load(fp)
        assert len(subc.documents) == n

    last_n = int(math.ceil(total_doc / n))
    if last_n > int(total_doc / n):
        source = str(tmp_path) + '{:02x}.xml'.format(last_n - 1)
        with open(source) as fp:
            subc = bioc.load(fp)
        assert len(subc.documents) == total_doc % n
def scan_collection(*_, **kwargs):
    """
    Scan each collection in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            fn: fn should expect the following arguments in this given order:
                sequence1 sequence2 ... non_sequence1 non_sequence2 ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    # Default to an empty list so the concatenation below still works when no
    # non-sequence arguments are supplied (mirrors scan_document above).
    non_sequences = kwargs.pop('non_sequences', [])

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with open(pathname) as fp:
            collection = bioc.load(fp)
        try:
            args = [collection] + non_sequences
            fn(*args)
        except:
            logging.exception('Cannot process %s', collection.source)
        with open(dstname, 'w') as fp:
            bioc.dump(collection, fp)
def evaluate_via_bioc(test_docs, crf, extractor, prediction_dir,
                      made_base_dir=None):
    print('Total documents for evaluation : {}'.format(len(test_docs)))
    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1
    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:
        # print('Working on document : {}'.format(test_doc.filename))
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)
        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)

            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens
                    # which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and \
                            sentence_pred[token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()
                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n', '\\n').replace('\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset, end_offset - start_offset)
                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
def test(data, opt, predict_dir):
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file
    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()
    data.fix_alphabet()

    seq_model = SeqModel(data)
    seq_model.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'model.pkl')))

    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.HP_char_hidden_dim + \
                         data.feature_emb_dims[data.feature_name2id['[Cap]']] + \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(torch.load(os.path.join(opt.re_dir, 'model.pkl')))

    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']] + \
                         2 * data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'], e['text'],
                              e['sent_idx'], e['tf_start'], e['tf_end'])
                entities.append(entity)
        else:
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)
            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)
            decode_results = ner_evaluateWhenTest(data, ner_wordrep, ner_hiddenlist, seq_model)
            entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    logging.info("loading ... vocab")
    relation_vocab = pickle.load(
        open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb'))

    logging.info("loading ... result")
    results = pickle.load(open(os.path.join(opt.output, 'results.pkl'), "rb"))

    for i in tqdm(range(len(test_relation))):
        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:
            if doc_name == result['doc_name']:
                former = doc_entity[(doc_entity['id'] == result['former_id'])].iloc[0]
                latter = doc_entity[(doc_entity['id'] == result['latter_id'])].iloc[0]

                relation_type = relation_vocab.lookup_id2str(result['type'])
                if relation_type == '<unk>':
                    continue
                elif my_utils.relationConstraint1(relation_type, former['type'],
                                                  latter['type']) == False:
                    continue
                else:
                    bioc_relation = bioc.BioCRelation()
                    passage.add_relation(bioc_relation)
                    bioc_relation.id = str(relation_id)
                    relation_id += 1
                    bioc_relation.infons['type'] = relation_type
                    node1 = bioc.BioCNode(former['id'], 'annotation 1')
                    bioc_relation.add_node(node1)
                    node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                    bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
def predict(opt, data):
    seq_model = SeqModel(data)
    if opt.test_in_cpu:
        seq_model.load_state_dict(
            torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_model.load_state_dict(
            torch.load(os.path.join(opt.output, 'ner_model.pkl'),
                       map_location={cuda_src: cuda_dst}))

    seq_wordseq = WordSequence(data, False, True, True, True)
    if opt.test_in_cpu:
        seq_wordseq.load_state_dict(
            torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_wordseq.load_state_dict(
            torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'),
                       map_location={cuda_src: cuda_dst}))

    classify_model = ClassifyModel(data)
    if opt.test_in_cpu:
        classify_model.load_state_dict(
            torch.load(os.path.join(opt.output, 're_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_model.load_state_dict(
            torch.load(os.path.join(opt.output, 're_model.pkl'),
                       map_location={cuda_src: cuda_dst}))

    classify_wordseq = WordSequence(data, True, False, True, False)
    if opt.test_in_cpu:
        classify_wordseq.load_state_dict(
            torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_wordseq.load_state_dict(
            torch.load(os.path.join(opt.output, 're_wordseq.pkl'),
                       map_location={cuda_src: cuda_dst}))

    input_files = [f for f in listdir(opt.input)
                   if isfile(join(opt.input, f)) and f[0] != '.']

    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):
        start = time.time()
        fileName = join(opt.input, input_files[idx])
        doc_name = input_files[idx]

        doc_token = processOneFile(fileName)
        doc = generateDataForOneFile(doc_token)
        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                           data.feature_alphabets, data.label_alphabet,
                                           data.number_normalized, data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)
        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end - start))

    logging.info("test finished")
""" Convert text FILEs to the BioC output file Usage: negbio_text2bioc [options] --output=<file> <file> ... Options: --output=<file> Specify the output file name. --verbose Print more information about progress. """ import bioc from negbio.cli_utils import parse_args from negbio.ext.text2bioc import text2collection if __name__ == '__main__': argv = parse_args(__doc__) collection = text2collection(*argv['<file>']) with open(argv['--output'], 'w') as fp: bioc.dump(collection, fp)