def dump_results(doc_name, entities, opt):
    """Serialize recognized entities for one document to a BioC XML file.

    Args:
        doc_name: document identifier, also used as the output file stem.
        entities: iterable of entity objects exposing .type, .name,
            .spans, .norm_ids and .norm_names.
        opt: options object; opt.predict is the output directory.
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    document.id = doc_name
    collection.add_document(document)

    passage = bioc.BioCPassage()
    passage.offset = 0
    document.add_passage(passage)

    # Annotation ids are 1-based and assigned in iteration order.
    for idx, entity in enumerate(entities, start=1):
        annotation = bioc.BioCAnnotation()
        annotation.id = str(idx)
        annotation.infons['type'] = entity.type
        annotation.text = entity.name
        span_start, span_end = entity.spans[0]
        annotation.add_location(bioc.BioCLocation(span_start, span_end - span_start))
        # Record only the top-ranked UMLS normalization, if one exists.
        if entity.norm_ids:
            annotation.infons['UMLS code'] = entity.norm_ids[0]
            annotation.infons['UMLS term'] = entity.norm_names[0]
        else:
            annotation.infons['UMLS code'] = 'N/A'
            annotation.infons['UMLS term'] = 'N/A'
        passage.add_annotation(annotation)

    out_path = os.path.join(opt.predict, doc_name + ".bioc.xml")
    with codecs.open(out_path, 'w', 'UTF-8') as fp:
        bioc.dump(collection, fp)
def text2document(id, text, split_document=True):
    """
    Args:
        id (str): BioCDocument id
        text (str): text
        split_document(bool): True if splits the passages according to
            the section titles.

    Returns:
        BioCDocument
    """
    document = bioc.BioCDocument()
    document.id = id
    normalized = printable(text).replace('\r\n', '\n')

    if not split_document:
        # Whole text as a single passage at offset 0.
        passage = bioc.BioCPassage()
        passage.offset = 0
        passage.text = normalized
        document.add_passage(passage)
        return document

    # When a passage carries no title of its own, inherit the previous one.
    last_section = None
    for start, end, section in split_passage(normalized):
        passage = bioc.BioCPassage()
        passage.offset = start
        passage.text = normalized[start:end]
        if section is None:
            section = last_section
        passage.infons['title'] = section
        document.add_passage(passage)
        last_section = section
    return document
def get_one_document(text):
    """Wrap *text* in a single-passage BioCDocument (passage offset 0)."""
    passage = bioc.BioCPassage()
    passage.text = text
    passage.offset = 0

    document = bioc.BioCDocument()
    document.add_passage(passage)
    return document
def convertKindredCorpusToBioCCollection(corpus):
    """Convert a kindred.Corpus into an equivalent bioc.BioCCollection.

    Each kindred Document becomes a single-passage BioCDocument; its
    entities become BioCAnnotations and its relations BioCRelations
    attached to that passage.

    Args:
        corpus (kindred.Corpus): corpus to convert.

    Returns:
        bioc.BioCCollection: the converted collection.
    """
    assert isinstance(corpus,kindred.Corpus)
    collection = bioc.BioCCollection()
    for kdoc in corpus.documents:
        assert isinstance(kdoc,kindred.Document)

        biocDoc = bioc.BioCDocument()
        collection.add_document(biocDoc)

        if 'id' in kdoc.metadata:
            biocDoc.id = kdoc.metadata['id']
        biocDoc.infons = kdoc.metadata

        # The entire document text goes into one passage at offset 0.
        passage = bioc.BioCPassage()
        passage.text = kdoc.text
        passage.offset = 0
        biocDoc.add_passage(passage)

        seenEntityIDs = set()
        # Maps kindred's internal entity id -> the id used in the BioC output,
        # needed later to wire up relation nodes.
        kindredID2BiocID = {}
        for e in kdoc.entities:
            assert isinstance(e,kindred.Entity)

            a = bioc.BioCAnnotation()
            a.text = e.text
            a.infons = {'type':e.entityType}
            a.infons.update(e.metadata)

            # Prefer the source-document id when present; fall back to the
            # internal kindred entity id.
            if e.sourceEntityID is None:
                a.id = str(e.entityID)
            else:
                a.id = e.sourceEntityID

            assert not a.id in seenEntityIDs, "Multiple entities with the same ID (%s) found" % a.id
            seenEntityIDs.add(a.id)
            kindredID2BiocID[e.entityID] = a.id

            # kindred positions are (start, end); BioC wants (offset, length).
            for start,end in e.position:
                l = bioc.BioCLocation(offset=start, length=(end-start))
                a.locations.append(l)

            passage.annotations.append(a)

        for r in kdoc.relations:
            assert isinstance(r,kindred.Relation)

            biocR = bioc.BioCRelation()
            biocR.infons = {'type':r.relationType}

            entitiesInRelation = r.entities
            argNames = r.argNames
            # Synthesize arg0, arg1, ... when the relation has no arg names.
            if argNames is None:
                argNames = [ "arg%d" % i for i,_ in enumerate(entitiesInRelation) ]

            for argName,entity in zip(argNames,entitiesInRelation):
                node = bioc.BioCNode(role=argName, refid=kindredID2BiocID[entity.entityID])
                biocR.nodes.append(node)

            passage.relations.append(biocR)

    return collection
def save_as_collection(list_of_pymedext_documents: List[Document]):
    """save a list of pymedext document as a bioc collection .

    It will return a bioc collection object.

    :param list_of_pymedext_documents: a list of Document
    :returns: a bioc collection object
    """
    this_bioc_collection = bioc.BioCCollection()
    for this_pymedext_doc in list_of_pymedext_documents:
        this_bioc_doc = bioc.BioCDocument()
        for annot in this_pymedext_doc.annotations:
            # Fixed: removed leftover debug print() calls on annot.source,
            # annot.ngram and annot.value.
            if annot.type == "raw_text":
                # First raw_text annotation seen defines the collection source.
                if this_bioc_collection.source == '':
                    this_bioc_collection.source = annot.source
            if annot.source == "BioCPassage":
                this_passage = bioc.BioCPassage()
                this_passage.text = annot.ngram
                this_passage.offset = annot.span[0]
                this_bioc_doc.add_passage(this_passage)
                # passageAttributes to add
            elif annot.source == "BioCAnnotation":
                this_annotation = bioc.BioCAnnotation()
                this_annotation.infons = annot.attributes
                this_annotation.id = annot.attributes["id"]
                this_annotation.text = annot.ngram
                thisLocation = bioc.BioCLocation(
                    annot.span[0], annot.span[1] - annot.span[0])
                this_annotation.add_location(thisLocation)
                # NOTE(review): assumes a BioCPassage annotation always
                # precedes its BioCAnnotations — confirm; otherwise this
                # raises IndexError on an empty passages list.
                this_bioc_doc.passages[-1].add_annotation(this_annotation)
        this_bioc_collection.add_document(this_bioc_doc)
    return this_bioc_collection
def writeMarcXMLRecordToBiocFile(record, biocWriter):
    """Write one English-language MARC record as a BioC document.

    Records whose MARC 008 language code is not 'eng' are skipped.
    Title and (optional) 520$a abstract become passages.
    """
    metadata = record['008'].value()
    # Characters 35-37 of field 008 hold the language code.
    language = metadata[35:38]
    if language != 'eng':
        return

    recordid = record['001'].value()

    textSources = [record.title()]
    abstract = None
    # Field 520 subfield 'a' carries the summary/abstract when present.
    if '520' in record and 'a' in record['520']:
        abstract = record['520']['a']
    textSources.append(abstract)

    biocDoc = bioc.BioCDocument()
    biocDoc.id = recordid

    offset = 0
    for textSource in textSources:
        # Skip missing title/abstract entries (None values).
        if not isinstance(textSource, six.string_types):
            continue
        trimmed = trimSentenceLengths(textSource)
        passage = bioc.BioCPassage()
        passage.text = trimmed
        passage.offset = offset
        offset += len(trimmed)
        biocDoc.add_passage(passage)

    biocWriter.writedocument(biocDoc)
def pubmedxml2bioc(source):
    """Yield one BioCDocument per citation found in a MEDLINE XML file."""
    for pmDoc in processMedlineFile(source):
        biocDoc = bioc.BioCDocument()
        biocDoc.id = pmDoc["pmid"]

        # Citation-level metadata is carried as document infons.
        biocDoc.infons.update({
            'title': " ".join(pmDoc["title"]),
            'pmid': pmDoc["pmid"],
            'year': pmDoc["pubYear"],
            'month': pmDoc["pubMonth"],
            'day': pmDoc["pubDay"],
            'journal': pmDoc["journal"],
            'journalISO': pmDoc["journalISO"],
            'authors': ", ".join(pmDoc["authors"]),
            'chemicals': pmDoc['chemicals'],
            'meshHeadings': pmDoc['meshHeadings'],
        })

        # Passages are laid out sequentially: title chunks, then abstract.
        offset = 0
        for section in ("title", "abstract"):
            for rawText in pmDoc[section]:
                trimmed = trimSentenceLengths(rawText)
                passage = bioc.BioCPassage()
                passage.infons['section'] = section
                passage.text = trimmed
                passage.offset = offset
                offset += len(trimmed)
                biocDoc.add_passage(passage)

        yield biocDoc
def split_document(document, pattern=None):
    """
    Split one report into sections. Section splitting is a deterministic
    consequence of section titles.

    Args:
        document(BioCDocument): one document that contains one passage.
        pattern: the regular expression patterns for section titles.

    Returns:
        BioCDocument: a new BioCDocument instance
    """
    if pattern is None:
        pattern = SECTION_TITLES

    new_document = bioc.BioCDocument()
    new_document.id = document.id
    new_document.infons = document.infons

    # Operates on the single passage of the input document.
    text = document.passages[0].text
    offset = document.passages[0].offset

    def create_passage(start, end, title=None):
        # Build a passage for text[start:end]; when `title` is given the
        # passage is a section heading (trailing ':' removed, type title_1).
        passage = bioc.BioCPassage()
        passage.offset = start + offset
        passage.text = text[start:end]
        if title is not None:
            passage.infons['title'] = title[:-1].strip(
            ) if title[-1] == ':' else title.strip()
            passage.infons['type'] = 'title_1'
        strip(passage)
        return passage

    start = 0
    for matcher in pattern.finditer(text):
        logging.debug('Match: %s', matcher.group())
        # add last
        end = matcher.start()
        if end != start:
            passage = create_passage(start, end)
            if not is_empty(passage):
                new_document.add_passage(passage)
            start = end
        # add title
        end = matcher.end()
        passage = create_passage(start, end, text[start:end])
        if not is_empty(passage):
            new_document.add_passage(passage)
        start = end

    # add last piece
    end = len(text)
    if start < end:
        passage = create_passage(start, end)
        if not is_empty(passage):
            new_document.add_passage(passage)

    return new_document
def document(this, json_doc):
    """Deserialize a JSON dict into a bioc.BioCDocument.

    Nested passages and relations are rebuilt via this.passage() and
    this.relation() respectively.
    """
    doc = bioc.BioCDocument()
    doc.id = json_doc['id']
    doc.infons = json_doc['infons']

    passages = []
    for passage_json in json_doc['passages']:
        passages.append(this.passage(passage_json))
    doc.passages = passages

    relations = []
    for relation_json in json_doc['relations']:
        relations.append(this.relation(relation_json))
    doc.relations = relations

    return doc
def pubtator2bioc(pubdoc: PubTator):
    """Convert a PubTator document into a document-level BioCDocument.

    Annotations and relations are converted via pubtator2bioc_ann /
    pubtator2bioc_rel; relations get ids R0, R1, ...
    """
    doc = bioc.BioCDocument()
    doc.id = pubdoc.pmid
    # Title and abstract joined with one newline, matching PubTator offsets.
    doc.text = '%s\n%s' % (pubdoc.title, pubdoc.abstract)

    for ann in pubdoc.annotations:
        doc.add_annotation(pubtator2bioc_ann(ann))

    for idx, rel in enumerate(pubdoc.relations):
        converted = pubtator2bioc_rel(rel)
        converted.id = 'R%s' % idx
        doc.add_relation(converted)

    return doc
def text_to_bioc(list_of_text, type, **kwargs):
    """Build a BioC object hierarchy from a list of text strings.

    Args:
        list_of_text: sequence of strings (sentences or passages).
        type: layout selector:
            'p/s'     -> one passage containing one sentence per string
            'd/p/s'   -> document / passage / sentences
            'c/d/p/s' -> collection / document / passage / sentences
            'd/p'     -> document containing one passage per string
            'c/d/p'   -> collection / document / passages
        **kwargs: unused; accepted for backward compatibility.

    Returns:
        The BioC object corresponding to the requested top level.

    Raises:
        KeyError: if *type* is not one of the recognized layouts.
    """
    if type == 'p/s':
        # Offsets assume exactly one separator character between strings.
        offset = 0
        passage = bioc.BioCPassage()
        passage.offset = offset
        for s in list_of_text:
            sentence = bioc.BioCSentence()
            sentence.offset = offset
            sentence.text = s
            offset += len(s) + 1
            passage.add_sentence(sentence)
        return passage
    elif type == 'd/p/s':
        document = bioc.BioCDocument()
        document.add_passage(text_to_bioc(list_of_text, 'p/s'))
        return document
    elif type == 'c/d/p/s':
        c = bioc.BioCCollection()
        c.add_document(text_to_bioc(list_of_text, 'd/p/s'))
        return c
    elif type == 'd/p':
        document = bioc.BioCDocument()
        offset = 0
        for s in list_of_text:
            passage = bioc.BioCPassage()
            passage.offset = offset
            offset += len(s) + 1
            passage.text = s
            document.add_passage(passage)
        return document
    elif type == 'c/d/p':
        c = bioc.BioCCollection()
        c.add_document(text_to_bioc(list_of_text, 'd/p'))
        return c
    else:
        # Fixed: fail with the offending value instead of a bare KeyError.
        raise KeyError('unknown BioC layout type: %r' % (type,))
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    """Convert NCRF++ sequence-labeling output into a BioC XML file.

    Args:
        doc_token: DataFrame of document tokens with 'sent_idx', 'start',
            'end' and 'text' columns.
        predict_results: per-sentence label sequences from NCRF++
            (BIOES-style tags: B-X / M-X / E-X / S-X / O).
        file_name: document id; output is '<file_name>.bioc.xml'.
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1
    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]
        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)
        labelSequence = []
        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)
            if label[0] == 'S' or label[0] == 'B':
                # Start of a new entity span.
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1
            elif label[0] == 'M' or label[0] == 'E':
                # Continuation tag: extend the most recent annotation, but
                # only when the label sequence is consistent so far.
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]
                    # Re-insert the whitespace between the previous token end
                    # and this token start (idiomatic string multiply instead
                    # of a character-append loop).
                    whitespacetoAdd = token['start'] - anno_entity.locations[0].end
                    anno_entity.text += " " * whitespacetoAdd + token['text']
                    anno_entity.locations[0].length = token['end'] - anno_entity.locations[0].offset
    # Fixed: use a context manager so the file is closed even if dump() raises.
    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
def to_bioc_document(self) -> bioc.BioCDocument:
    """Represent this figure as a BioCDocument.

    The caption and every text block become passages, tagged through the
    'type' infon ('caption' / 'text'). Note: the infons of self.caption
    and of the passages in self.text are modified in place.
    """
    doc = bioc.BioCDocument()
    doc.infons['url'] = self.url
    doc.infons['figure id'] = self.id
    doc.infons['files'] = json.dumps(self.files)

    self.caption.infons['type'] = 'caption'
    doc.add_passage(self.caption)

    for text_passage in self.text:
        text_passage.infons['type'] = 'text'
        doc.add_passage(text_passage)

    return doc
def save_predictions(ids, relevant, confidence, output):
    """Write relevance predictions to *output* as a BioC collection.

    Args:
        ids: document identifiers.
        relevant: per-document labels (0 = not relevant, 1 = relevant).
        confidence: per-document scores; confidence[i][0] is the
            probability of the 'relevant' class.
        output: path of the BioC XML file to write.
    """
    collection = bioc.BioCCollection()
    collection.source = 'PubMed'
    now = datetime.datetime.now()
    collection.date = '{}{:02d}{:02d}'.format(now.year, now.month, now.day)
    collection.key = 'collection.key'
    for i, id in enumerate(ids):
        document = bioc.BioCDocument()
        document.id = id
        document.infons['relevant'] = 'no' if relevant[i] == 0 else 'yes'
        # Always report the confidence of the *predicted* class.
        if relevant[i] == 1:
            document.infons['confidence'] = '{:.2f}'.format(confidence[i][0])
        else:
            document.infons['confidence'] = '{:.2f}'.format(
                1 - confidence[i][0])
        collection.add_document(document)
    # Fixed: the original passed open(output, 'w') directly to bioc.dump,
    # leaking the file handle; the context manager guarantees it is closed.
    with open(output, 'w') as fp:
        bioc.dump(collection, fp, pretty_print=True)
def test_convert_to_vec(self, bioc_doc_input):
    """convert_to_vec should flatten all sentence texts in document order."""
    # Arrange
    sut = BiocSentences()
    doc = bioc.BioCDocument()
    expected_vec = []
    # Mirror the nested input lists into a BioC document while also
    # collecting the flat list of sentence texts we expect back.
    for passage_texts in bioc_doc_input:
        bioc_passage = bioc.BioCPassage()
        doc.add_passage(bioc_passage)
        for text in passage_texts:
            sentence = BioCSentence()
            sentence.text = text
            bioc_passage.add_sentence(sentence)
            expected_vec.append(text)

    # Act
    actual = sut.convert_to_vec(doc)

    # Assert
    self.assertEqual(expected_vec, actual)
def pmcxml2bioc(pmcxmlFilename, biocFilename):
    """Convert a PMC XML file into a BioC XML file, one document at a time.

    Args:
        pmcxmlFilename: path of the input PMC XML file.
        biocFilename: path of the BioC XML file to write.

    Raises:
        RuntimeError: if the PMC XML cannot be parsed.
    """
    try:
        with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
            for pmcDoc in processPMCFile(pmcxmlFilename):
                biocDoc = bioc.BioCDocument()
                biocDoc.id = pmcDoc["pmid"]
                # Article-level metadata is carried as document infons.
                biocDoc.infons['title'] = " ".join(
                    pmcDoc["textSources"]["title"])
                biocDoc.infons['pmid'] = pmcDoc["pmid"]
                biocDoc.infons['pmcid'] = pmcDoc["pmcid"]
                biocDoc.infons['doi'] = pmcDoc["doi"]
                biocDoc.infons['year'] = pmcDoc["pubYear"]
                biocDoc.infons['month'] = pmcDoc["pubMonth"]
                biocDoc.infons['day'] = pmcDoc["pubDay"]
                biocDoc.infons['journal'] = pmcDoc["journal"]
                biocDoc.infons['journalISO'] = pmcDoc["journalISO"]

                offset = 0
                for groupName, textSourceGroup in pmcDoc["textSources"].items():
                    subsection = None
                    for textSource in textSourceGroup:
                        textSource = trimSentenceLengths(textSource)
                        passage = bioc.BioCPassage()

                        # A chunk that is just a (possibly numbered) known
                        # heading switches the current subsection.
                        subsectionCheck = textSource.lower().strip(
                            '01234567890. ')
                        if subsectionCheck in allowedSubsections:
                            subsection = subsectionCheck

                        passage.infons['section'] = groupName
                        passage.infons['subsection'] = subsection
                        passage.text = textSource
                        passage.offset = offset
                        offset += len(textSource)
                        biocDoc.add_passage(passage)

                writer.write_document(biocDoc)
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" %
                           pmcxmlFilename)
def text2document(id, text):
    """
    Convert text to a BioCDocument instance

    Args:
        id (str): BioCDocument id
        text (str): text

    Returns:
        BioCDocument: a BioCDocument instance
    """
    document = bioc.BioCDocument()
    document.id = id

    # Normalize to printable characters and Unix newlines.
    cleaned = printable(text).replace('\r\n', '\n')

    passage = bioc.BioCPassage()
    passage.offset = 0
    passage.text = cleaned
    document.add_passage(passage)

    return document
def pubmedxml2bioc(
    source: Union[str, TextIO],
    tag_handlers: Dict[str, TagHandlerFunction] = {},
    trim_sentences=True,
) -> Iterable[bioc.BioCDocument]:
    """
    Convert a MEDLINE XML file into BioC documents, one per citation.

    Args:
        source: path to the MEDLINE xml file
        tag_handlers: custom overrides for handling specific XML tags.
        trim_sentences: if True, trim passage text to a maximum sentence
            length before emitting it.

    Yields:
        One BioCDocument per citation, with citation metadata in the
        document infons and title/abstract text as passages.
    """
    # NOTE(review): mutable default {} for tag_handlers is shared across
    # calls; harmless while it is only read — confirm before changing.
    for pm_doc in process_medline_file(source, tag_handlers=tag_handlers):
        bioc_doc = bioc.BioCDocument()
        bioc_doc.id = pm_doc["pmid"]
        # Citation-level metadata becomes document infons.
        bioc_doc.infons["title"] = " ".join(pm_doc["title"])
        bioc_doc.infons["pmid"] = pm_doc["pmid"]
        bioc_doc.infons["pmcid"] = pm_doc["pmcid"]
        bioc_doc.infons["doi"] = pm_doc["doi"]
        bioc_doc.infons["year"] = pm_doc["pubYear"]
        bioc_doc.infons["month"] = pm_doc["pubMonth"]
        bioc_doc.infons["day"] = pm_doc["pubDay"]
        bioc_doc.infons["journal"] = pm_doc["journal"]
        bioc_doc.infons["journalISO"] = pm_doc["journalISO"]
        bioc_doc.infons["authors"] = ", ".join(pm_doc["authors"])
        bioc_doc.infons["chemicals"] = pm_doc["chemicals"]
        bioc_doc.infons["meshHeadings"] = pm_doc["meshHeadings"]
        bioc_doc.infons["supplementaryMesh"] = pm_doc["supplementaryMesh"]
        bioc_doc.infons["publicationTypes"] = pm_doc["publicationTypes"]

        # Passages are laid out sequentially: title chunks, then abstract.
        offset = 0
        for section in ["title", "abstract"]:
            for text_source in pm_doc[section]:
                if trim_sentences:
                    text_source = trim_sentence_lengths(text_source)
                passage = bioc.BioCPassage()
                passage.infons["section"] = section
                passage.text = text_source
                passage.offset = offset
                offset += len(text_source)
                bioc_doc.add_passage(passage)
        yield bioc_doc
def create_bioc_document_from_document_json(document):
    """Build a BioCDocument from a PubAnnotation-style JSON dict.

    Only denotations and relations produced by the current user
    (userId == 0) are converted; everything else is skipped.
    """
    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']

    passage = bioc.BioCPassage()
    passage.text = document['text']
    passage.offset = 0

    # Remember who produced each denotation so relations can later be
    # restricted to current-user endpoints.
    annotation_user_map = {}
    for denotation in document['denotations']:
        annotation_user_map[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue
        begin = denotation['span']['begin']
        end = denotation['span']['end']

        annotation = bioc.BioCAnnotation()
        annotation.id = denotation['id']
        location = bioc.BioCLocation(0, 0)
        location.offset = begin
        location.length = end - begin
        annotation.locations.append(location)
        annotation.text = document['text'][begin:end]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)

    for relation in document['relations']:
        # Keep only relations whose both endpoints belong to user 0.
        if annotation_user_map[relation['subj']] != 0:
            continue
        if annotation_user_map[relation['obj']] != 0:
            continue

        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']

        start_node = bioc.BioCNode('', '')
        start_node.refid = relation['subj']
        b_relation.add_node(start_node)

        end_node = bioc.BioCNode('', '')
        end_node.refid = relation['obj']
        b_relation.add_node(end_node)

        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)

    b_document.add_passage(passage)
    return b_document
def uimaxmi2bioc(xmiFilename, biocFilename):
    """Convert a UIMA XMI file into a single-document BioC XML file."""
    root = etree.parse(xmiFilename).getroot()

    # The title lives on the DKPro DocumentMetaData node; the text itself
    # is the 'sofaString' attribute of the CAS Sofa node.
    metadataNode = root.find('{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData')
    documentTitle = metadataNode.attrib['documentTitle']
    contentNode = root.find('{http:///uima/cas.ecore}Sofa')
    content = contentNode.attrib['sofaString']

    with bioc.iterwrite(biocFilename) as writer:
        biocDoc = bioc.BioCDocument()
        biocDoc.id = None
        biocDoc.infons['title'] = documentTitle

        passage = bioc.BioCPassage()
        passage.infons['section'] = 'article'
        passage.text = content
        passage.offset = 0
        biocDoc.add_passage(passage)

        writer.writedocument(biocDoc)
def to_bioc_xml(self, outdir):
    """Write this document's annotations and relations to a BioC XML file.

    Output goes to '<outdir>/<file_name>.bioc.xml' using the legacy
    BioCWriter API: one document with one passage that carries all
    annotations and relations.

    Args:
        outdir: directory in which the output file is created.
    """
    outpath = os.path.join(outdir, self.file_name + '.bioc.xml')
    writer = bioc.BioCWriter()
    writer.collection = bioc.BioCCollection()
    collection = writer.collection
    document = bioc.BioCDocument()
    document.id = self.file_name
    passage = bioc.BioCPassage()
    # NOTE(review): offset is assigned as the string '0' — the legacy
    # writer appears to serialize it as text; confirm before changing to int.
    passage.offset = '0'
    document.add_passage(passage)
    collection.add_document(document)
    # Add annotations that already have bioc annotations
    for anno in self.get_annotations():
        passage.add_annotation(anno.bioc_anno)
    for relat in self.get_relations():
        # Create new BioCRelation
        relation = bioc.bioc_relation.BioCRelation()
        relation.id = relat.id
        relation.put_infon('type', relat.type)
        # Reference the nodes that contain the two related annotations.
        node1 = bioc.bioc_node.BioCNode()
        node1.role = 'annotation 1'
        node1.refid = relat.annotation_1.id
        relation.add_node(node1)
        node2 = bioc.bioc_node.BioCNode()
        node2.role = 'annotation 2'
        node2.refid = relat.annotation_2.id
        relation.add_node(node2)
        passage.add_relation(relation)
    writer.write(outpath)
def evaluate_via_bioc(test_docs, crf, extractor, prediction_dir, made_base_dir=None):
    """Run CRF predictions over *test_docs*, write one BioC XML per document
    into *prediction_dir*, and (optionally) score them against reference
    annotations under *made_base_dir*.

    Args:
        test_docs: documents exposing .filename, .text and
            .tokenized_doc.sentences (lists of (start, end) token offsets).
        crf: trained sequence model with a .predict() method.
        extractor: feature extractor with a .sent2features() method.
        prediction_dir: output directory; pre-existing files are removed.
        made_base_dir: optional MADE corpus root containing 'annotations'
            and 'corpus' subdirectories; enables scoring when given.
    """
    print('Total documents for evaluation : {}'.format(len(test_docs)))
    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)
    # Clear out any previous prediction files.
    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1
    print('Existing files removed : {}'.format(existing_files_removed))
    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:
        #print('Working on document : {}'.format(test_doc.filename))
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)
        next_annotation_id = 1
        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)
            if len(sentence_tokens) == 0:
                continue
            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]
            # Diagnostic output for length mismatches between tokens and tags.
            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))
            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens
                    # which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]
                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()
                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n', '\\n').replace(
                        '\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)
                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)
                # advance the token no matter what happened above
                token_idx += 1
        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))
        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)
        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
        prediction_documents_written += 1
    print('Total prediction documents written : {}'.format(
        prediction_documents_written))
    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
def test(data, opt, predict_dir):
    """Evaluate the NER + relation-extraction pipeline on the test set.

    Loads the trained NER (sequence) and RE (classification) models from
    opt.ner_dir / opt.re_dir, predicts entities (or reuses gold entities
    when opt.use_gold_ner is set) and relations for every test document,
    and writes one BioC XML file per document into *predict_dir*.

    Args:
        data: global data/alphabet container; mutated in place (load,
            MAX_SENTENCE_LENGTH, raw_texts, raw_Ids).
        opt: options object (ner_dir, re_dir, hidden_num, data_file,
            use_gold_ner, ...).
        predict_dir: output directory for the BioC result files.
    """
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)
    # evaluate on test data and output results in bioc format, one doc one file
    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()
    data.fix_alphabet()
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    # Rebuild the stack of NER hidden layers; only the first layer's input
    # size depends on the word/char/feature embedding dimensions.
    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim
        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)
    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))
    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    # Same layout for the relation-extraction hidden layers; the first
    # layer additionally takes the two position embeddings.
    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim
        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)
    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))
    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]
        if opt.use_gold_ner:
            # Use the gold-standard entities instead of NER predictions.
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)
            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)
            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)
            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)
        # Assemble one BioC document per input document.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0
        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text
        # Relation extraction runs on the predicted (or gold) entities.
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])
        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)
        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
def _load_state_dict(module, path, opt):
    """Load a checkpoint into *module*, remapping the device per *opt*.

    When opt.test_in_cpu is set, tensors are mapped to the CPU; otherwise
    tensors saved on GPU opt.old_gpu are remapped onto GPU opt.gpu.
    """
    if opt.test_in_cpu:
        module.load_state_dict(torch.load(path, map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        module.load_state_dict(
            torch.load(path, map_location={cuda_src: cuda_dst}))


def predict(opt, data):
    """Run the full NER + relation-extraction pipeline over a directory.

    Loads the trained NER and RE models from opt.output, processes every
    non-hidden file in opt.input, and writes one BioC XML file per input
    document into opt.predict.

    Args:
        opt: options object (output, input, predict, gpu, old_gpu,
            test_in_cpu, ...).
        data: global data/alphabet container.
    """
    # Refactored: the four identical cpu/gpu map_location loading stanzas
    # are factored into _load_state_dict().
    seq_model = SeqModel(data)
    _load_state_dict(seq_model, os.path.join(opt.output, 'ner_model.pkl'), opt)

    seq_wordseq = WordSequence(data, False, True, True, True)
    _load_state_dict(seq_wordseq, os.path.join(opt.output, 'ner_wordseq.pkl'), opt)

    classify_model = ClassifyModel(data)
    _load_state_dict(classify_model, os.path.join(opt.output, 're_model.pkl'), opt)

    classify_wordseq = WordSequence(data, True, False, True, False)
    _load_state_dict(classify_wordseq, os.path.join(opt.output, 're_wordseq.pkl'), opt)

    # Skip hidden files (leading '.') in the input directory.
    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input, f)) and f[0] != '.']

    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):
        start = time.time()

        fileName = join(opt.input, input_files[idx])
        doc_name = input_files[idx]
        doc_token = processOneFile(fileName)

        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                           data.feature_alphabets, data.label_alphabet,
                                           data.number_normalized, data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)

        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        # Assemble the BioC output for this document.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        # Relation extraction over the predicted entities.
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end-start))

    logging.info("test finished")
def pmcxml2bioc(
    source: Union[str, TextIO],
    tag_handlers: Dict[str, TagHandlerFunction] = {},
    trim_sentences: bool = False,
    all_xml_path_infon: bool = False,
    mark_citations: bool = True,
) -> Iterator[Iterable[bioc.BioCDocument]]:
    """
    Convert a PMC XML file into its Bioc equivalent

    Args:
        source: The text or file handle containing the PMC XML
        tag_handlers: custom overrides for handling specific XML tags.
        trim_sentences: Trim text content to a maximum sentence length.
        all_xml_path_infon: Add a xml_path infon element to every passages
            to describe where in the XML heirarchy this text is from (Will
            always add to table/figure elements even without flag)
        mark_citations: Add 0-length bioc annotations for in-text citations

    Raises:
        RuntimeError: On any parsing errors

    Returns:
        An iterator over the newly generated Bioc documents
    """
    # NOTE(review): mutable default {} for tag_handlers is shared across
    # calls; harmless while it is only read — confirm before changing.
    try:
        for pmc_doc in process_pmc_file(source, tag_handlers=tag_handlers):
            bioc_doc = bioc.BioCDocument()
            bioc_doc.id = pmc_doc["pmid"]
            # Article-level metadata becomes document infons.
            bioc_doc.infons["title"] = " ".join(
                [p.text for p in pmc_doc["textSources"]["title"]])
            bioc_doc.infons["pmid"] = pmc_doc["pmid"]
            bioc_doc.infons["pmcid"] = pmc_doc["pmcid"]
            bioc_doc.infons["doi"] = pmc_doc["doi"]
            bioc_doc.infons["year"] = pmc_doc["pubYear"]
            bioc_doc.infons["month"] = pmc_doc["pubMonth"]
            bioc_doc.infons["day"] = pmc_doc["pubDay"]
            bioc_doc.infons["journal"] = pmc_doc["journal"]
            bioc_doc.infons["journalISO"] = pmc_doc["journalISO"]

            offset = 0
            for group_name, text_source_group in pmc_doc["textSources"].items():
                subsection = None
                for chunk in text_source_group:
                    # Replace inline citation markers with zero-length
                    # annotations anchored at their text positions.
                    text_source, annotations = strip_annotation_markers(
                        chunk.text, pmc_doc['annotations'])
                    if trim_sentences:
                        text_source = trim_sentence_lengths(text_source)

                    passage = bioc.BioCPassage()

                    # A chunk that is just a (possibly numbered) known
                    # heading switches the current subsection.
                    subsection_check = text_source.lower().strip(
                        "01234567890. ")
                    if subsection_check in allowed_subsections:
                        subsection = subsection_check
                    passage.infons["section"] = group_name
                    passage.infons["subsection"] = subsection

                    # Always record the XML path for table/figure content;
                    # for other passages only when all_xml_path_infon is set.
                    if chunk.xml_path:
                        if all_xml_path_infon or set(
                                chunk.xml_path.split('/')) & {
                                    'thead',
                                    'tbody',
                                    'fig',
                                }:
                            passage.infons["xml_path"] = chunk.xml_path

                    passage.text = text_source
                    passage.offset = offset

                    # Citation annotations are skipped when sentences are
                    # trimmed, since their offsets would no longer line up.
                    if not trim_sentences and mark_citations:
                        for annotation in annotations:
                            # Shift from passage-local to document offsets.
                            for location in annotation.locations:
                                location.offset += offset
                            passage.add_annotation(annotation)

                    offset += len(text_source)
                    bioc_doc.add_passage(passage)

            yield bioc_doc
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" % source)
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    """Dump gold entities plus predicted relations to BioC XML files.

    Reads the relation vocabulary and pickled prediction results from
    opt.pretrain / opt.output, then writes one '<doc_name>.bioc.xml' per
    test document into *result_dumpdir*, containing the document's gold
    entities and all predicted relations that pass the type constraint.

    Args:
        test_token, test_entity, test_relation, test_name: parallel lists
            of per-document data from preprocessing.
        result_dumpdir: output directory for the BioC files.
    """
    logging.info("loading ... vocab")
    # Fixed: close the pickle files (the originals leaked the handles).
    with open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb') as fp:
        relation_vocab = pickle.load(fp)
    logging.info("loading ... result")
    with open(os.path.join(opt.output, 'results.pkl'), 'rb') as fp:
        results = pickle.load(fp)

    for i in tqdm(range(len(test_relation))):
        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        # Gold entities are written through unchanged.
        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:
            if doc_name != result['doc_name']:
                continue
            former = doc_entity[(
                doc_entity['id'] == result['former_id'])].iloc[0]
            latter = doc_entity[(
                doc_entity['id'] == result['latter_id'])].iloc[0]
            relation_type = relation_vocab.lookup_id2str(result['type'])
            if relation_type == '<unk>':
                continue
            # Fixed: '== False' comparison replaced with 'not'.
            elif not my_utils.relationConstraint1(relation_type,
                                                  former['type'],
                                                  latter['type']):
                continue
            else:
                bioc_relation = bioc.BioCRelation()
                passage.add_relation(bioc_relation)
                bioc_relation.id = str(relation_id)
                relation_id += 1
                bioc_relation.infons['type'] = relation_type
                node1 = bioc.BioCNode(former['id'], 'annotation 1')
                bioc_relation.add_node(node1)
                node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)