def test_entityrecognizer_fusion_OFF():
    """With fusion detection disabled, 'EGFR-ERBB2' is split into two separate gene entities."""
    corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(5, 10)], 'T2'),
    ]
def test_entityrecognizer_twoSentences():
    """Entities are found across sentence boundaries with document-level offsets."""
    corpus = kindred.Corpus('EGFR is one gene. ERBB2 is another gene.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(18, 23)], 'T2'),
    ]
def test_entityrecognizer_fusion_3():
    """A gene hyphenated with a non-gene term is not merged into a fusion; both parts annotate separately."""
    corpus = kindred.Corpus('EGFR-lymphoma is not anything.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), detectFusionGenes=True).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('cancer', 'DOID:0060058', 'lymphoma', [(5, 13)], 'T2'),
    ]
def test_entityrecognizer_fusion_1():
    """Fusion detection merges gene-gene pairs into single 'combo|...' entities."""
    corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene, but FGFR3-TACC3 is.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), detectFusionGenes=True).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'combo|HGNC:3236|HGNC:2064', 'EGFR-ERBB2', [(0, 10)], 'T1'),
        ('gene', 'combo|HGNC:3690|HGNC:11524', 'FGFR3-TACC3', [(42, 53)], 'T2'),
    ]
def test_entityrecognizer_basic():
    """A single known gene term is annotated and linked to its sentence tokens."""
    corpus = kindred.Corpus('EGFR is a gene associated with lung cancer')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'HGNC:3236', 'EGFR')
    assert found.position == [(0, 4)]
    assert found.sourceEntityID == 'T1'

    # The entity should be anchored to token 0 of the single sentence.
    assert len(document.sentences) == 1
    assert document.sentences[0].entityAnnotations == [(found, [0])]
def test_entityrecognizer_merge_brackets_OFF():
    """Without term merging, a long form and its bracketed acronym stay as two entities."""
    corpus = kindred.Corpus('This paper studies non-small cell lung carcinoma (NSCLC).')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('cancer', 'DOID:3908', 'non-small cell lung carcinoma', [(19, 48)], 'T1'),
        ('cancer', 'DOID:3908', 'NSCLC', [(50, 55)], 'T2'),
    ]
def test_entityrecognizer_acronyms_OFF():
    """Without acronym disambiguation, the bracketed acronym keeps its (wrong) gene mapping."""
    corpus = kindred.Corpus('The Never Ending Umbrella (NEU) is a true classic.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('movie', 'IMDB:9999', 'Never Ending Umbrella', [(4, 25)], 'T1'),
        ('gene', 'HGNC:2064', 'NEU', [(27, 30)], 'T2'),
    ]
def test_entityrecognizer_merge_negativecase():
    """Term merging must not fuse adjacent terms that map to different IDs."""
    corpus = kindred.Corpus('EGFR ERBB2 is not anything.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), mergeTerms=True).annotate(corpus)

    document = corpus.documents[0]
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(5, 10)], 'T2'),
    ]
def test_entityrecognizer_removepathways_off():
    """With removePathways=False, a gene followed by 'signalling' is still annotated."""
    corpus = kindred.Corpus('EGFR signalling is involved in lung cancer')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), removePathways=False).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'HGNC:3236', 'EGFR')
    assert found.position == [(0, 4)]
    assert found.sourceEntityID == 'T1'

    assert len(document.sentences) == 1
    assert document.sentences[0].entityAnnotations == [(found, [0])]
def test_entityrecognizer_merge_idintersections():
    """Merging groups adjacent ambiguous terms by the intersection of their candidate IDs."""
    corpus = kindred.Corpus('We studied the genes known as GLP-1R GLP1R GLP1 GLP-1.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), mergeTerms=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.sentences) == 1
    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:4324', 'GLP-1R GLP1R', [(30, 42)], 'T1'),
        ('gene', 'HGNC:4191', 'GLP1 GLP-1', [(43, 53)], 'T2'),
    ]
def test_entityrecognizer_merge_triple_brackets():
    """Merging collapses 'HER2 neu (ERBB2)' — two synonyms plus a bracketed one — into one entity."""
    corpus = kindred.Corpus('HER2 neu (ERBB2) is a gene.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), mergeTerms=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.sentences) == 1
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID) == ('gene', 'HGNC:2064')
    assert found.text == 'HER2 neu (ERBB2)'
    assert found.position == [(0, 16)]
    assert found.sourceEntityID == 'T1'
def findFusions(biocFile, genesFile, wordlistPickle, outFile):
    """Scan a BioC XML corpus for two-gene fusion mentions and write them to a TSV file.

    biocFile: path to a BioC XML corpus loadable via kindred.iterLoad.
    genesFile: TSV with columns HUGO id, gene name, synonyms, Entrez id.
    wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer.
    outFile: output TSV path; each row is pmid, mention text, two gene IDs, two gene names.
    """
    print("%s : start" % now())
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)
    # Map HUGO gene ID -> primary gene name for output.
    hugo2Name = {}
    with open(genesFile) as f:
        for line in f:
            # synoyms and entrez_gene_id are parsed but unused here.
            hugo_gene_id, gene_name, synoyms, entrez_gene_id = line.strip('\n').split('\t')
            hugo2Name[hugo_gene_id] = gene_name
    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup, detectFusionGenes=True,
                                   detectMicroRNA=True, acronymDetectionForAmbiguity=True,
                                   mergeTerms=True, detectVariants=True)
    with open(outFile, 'w') as outF:
        # Corpora are processed one at a time to keep memory bounded; corpusno is unused.
        for corpusno, corpus in enumerate(kindred.iterLoad('biocxml', biocFile)):
            parser.parse(corpus)
            ner.annotate(corpus)
            for doc in corpus.documents:
                pmid = ''
                if 'pmid' in doc.metadata:
                    pmid = doc.metadata['pmid']
                #fusions = [ e for e in doc.entities if e.entityType == 'Gene' ]
                for e in doc.entities:
                    # Fusion mentions carry external IDs of the form 'combo|<id1>|<id2>'.
                    if e.entityType == 'gene' and e.externalID.startswith('combo|'):
                        gene_ids = e.externalID.split('|')[1:]
                        # Only keep clean two-gene fusions.
                        if len(gene_ids) != 2:
                            continue
                        # '&' marks ambiguous/compound IDs — skip those.
                        if any('&' in gene_id for gene_id in gene_ids):
                            continue
                        for gene_id in gene_ids:
                            assert gene_id in hugo2Name, 'Unable to find HUGO gene name for ID: %s (text=%s)' % (gene_id, e.text)
                        gene_names = [hugo2Name[gene_id] for gene_id in gene_ids]
                        assert len(gene_names) == 2
                        outData = [pmid, e.text] + gene_ids + gene_names
                        outF.write("\t".join(outData) + "\n")
def test_entityrecognizer_removepathways_4():
    """With removePathways=True, a gene followed by 'cascade' is dropped entirely."""
    corpus = kindred.Corpus('EGFR cascade is involved in lung cancer')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), removePathways=True).annotate(corpus)
    assert corpus.documents[0].entities == []
def test_entityrecognizer_variant_stopwords():
    """A substitution listed in variantStopwords is suppressed by the variant detector."""
    corpus = kindred.Corpus('The V600E variant is well studied.')
    kindred.Parser().parse(corpus)
    recognizer = kindred.EntityRecognizer({}, detectVariants=True, variantStopwords=['V600E'])
    recognizer.annotate(corpus)
    assert corpus.documents[0].entities == []
def test_entityrecognizer_microRNA_mirOFF():
    """With microRNA detection off, a 'mir-' mention yields no entities."""
    corpus = kindred.Corpus('mir-83 is a gene associated with lung cancer')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup()).annotate(corpus)
    assert corpus.documents[0].entities == []
def test_entityrecognizer_fusion_4():
    """A gene hyphenated with an unknown word yields only the gene itself, no fusion."""
    corpus = kindred.Corpus('EGFR-banana is not anything.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), detectFusionGenes=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'HGNC:3236', 'EGFR')
    assert found.position == [(0, 4)]
def test_entityrecognizer_merge_nobrackets():
    """Merging joins adjacent synonyms of the same gene even without brackets."""
    corpus = kindred.Corpus('HER2 neu is a gene.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), mergeTerms=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'HGNC:2064', 'HER2 neu')
    assert found.position == [(0, 8)]
def test_entityrecognizer_merge_brackets_left():
    """Merging also handles the acronym appearing in brackets BEFORE the long form."""
    corpus = kindred.Corpus('This paper studies (NSCLC) non-small cell lung carcinoma.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), mergeTerms=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID) == ('cancer', 'DOID:3908')
    assert found.text == '(NSCLC) non-small cell lung carcinoma'
    assert found.position == [(19, 56)]
def test_entityrecognizer_acronyms_acronymHasCorrectID_hyphen():
    """Acronym disambiguation keeps the acronym entity when its ID matches the hyphenated long form."""
    corpus = kindred.Corpus('Diffuse large b-cell lymphoma (DLBCL) is a challenging research topic.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), acronymDetectionForAmbiguity=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('cancer', 'DOID:0050745', 'DLBCL')
    assert found.position == [(31, 36)]
def test_entityrecognizer_acronyms_bothHaveIDs_plural():
    """When both the plural long form and plural acronym have IDs, only the long form survives."""
    corpus = kindred.Corpus('The Never Ending Umbrellas (NEUs) are true classics.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), acronymDetectionForAmbiguity=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID) == ('movie', 'IMDB:9999')
    assert found.text == 'Never Ending Umbrellas'
    assert found.position == [(4, 26)]
def test_entityrecognizer_microRNA_mir1():
    """With detectMicroRNA=True, a 'mir-NN' mention becomes a gene entity with a 'mirna|' ID."""
    corpus = kindred.Corpus('mir-83 is a gene associated with lung cancer')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), detectMicroRNA=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'mirna|mir-83', 'mir-83')
    assert found.position == [(0, 6)]
def test_entityrecognizer_polymorphism():
    """detectPolymorphisms picks up dbSNP rs-identifiers as variant entities."""
    corpus = kindred.Corpus('The rs12345 variant is well studied.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer({}, detectPolymorphisms=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('variant', 'dbsnp|rs12345', 'rs12345')
    assert found.position == [(4, 11)]
    assert found.sourceEntityID == 'T1'
def test_entityrecognizer_variant_2():
    """detectVariants normalizes three-letter substitutions (p.Val600Glu) to one-letter form."""
    corpus = kindred.Corpus('The BRAF p.Val600Glu variant is well studied.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer({}, detectVariants=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID) == ('variant', 'substitution|V600E')
    assert found.text == 'Val600Glu'
    assert found.position == [(11, 20)]
    assert found.sourceEntityID == 'T1'
def test_entityrecognizer_variant_1():
    """detectVariants picks up a one-letter substitution (V600E) as a variant entity."""
    lookup = {}
    text = 'The V600E variant is well studied.'
    corpus = kindred.Corpus(text)
    parser = kindred.Parser()
    parser.parse(corpus)
    ner = kindred.EntityRecognizer(lookup, detectVariants=True)
    ner.annotate(corpus)
    doc = corpus.documents[0]
    assert len(doc.entities) == 1
    entity = doc.entities[0]
    # Fixed for consistency: the sibling variant tests (variant_2, variant_stopwords,
    # polymorphism) all expect entityType 'variant'; 'omicevent' was inconsistent.
    # Also removed a leftover debug print(doc.entities).
    assert entity.entityType == 'variant'
    assert entity.externalID == 'substitution|V600E'
    assert entity.text == 'V600E'
    assert entity.position == [(4, 9)]
    assert entity.sourceEntityID == 'T1'
def test_entityrecognizer_fusion_2():
    """'HER2-neu' is two synonyms of the SAME gene, so fusion detection keeps one gene entity."""
    corpus = kindred.Corpus('HER2-neu is a gene.')
    kindred.Parser().parse(corpus)
    kindred.EntityRecognizer(makeTestLookup(), detectFusionGenes=True).annotate(corpus)

    document = corpus.documents[0]
    assert len(document.entities) == 1
    found = document.entities[0]
    assert (found.entityType, found.externalID, found.text) == ('gene', 'HGNC:2064', 'HER2-neu')
    assert found.position == [(0, 8)]
    assert found.sourceEntityID == 'T1'
documents = [d for d in documents if d['doi'] == testMode] assert len(documents) == 1 text = documents[0]['title'] + "\n" + documents[0]['abstract'] print("Title: %s" % documents[0]['title']) print("Abstract: %s" % documents[0]['abstract']) corpus = kindred.Corpus(text=text) corpus.documents[0].metadata = {"title": documents[0]['title']} parser = kindred.Parser(model='en_core_sci_sm') parser.parse(corpus) print("Annotating corpus...") sys.stdout.flush() corpus.removeEntities() ner = kindred.EntityRecognizer(termLookup, mergeTerms=True) ner.annotate(corpus) if testMode: doc = corpus.documents[0] for e in doc.entities: print(" %s" % str(e)) #print(doc.entities) #assert False corpusMap = {} for kindred_doc in corpus.documents: corpusMap[kindred_doc.text] = kindred_doc for d in documents:
def cancermine(sentenceFile, modelFilenames, filterTerms, wordlistPickle, genes, cancerTypes, outData):
    """Run relation-extraction models over pre-filtered sentences and append TSV rows of relations.

    sentenceFile: JSON list of sentence records (each with a 'sentence' key plus metadata).
    modelFilenames: list of pickled kindred model paths; all are applied to the corpus.
    filterTerms: path to a file of lowercase filter terms (NOTE: the parameter is rebound
        below to the loaded list — same name, different meaning).
    wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer.
    genes / cancerTypes: TSV term files used to build ID -> preferred-term maps.
    outData: output TSV path (NOTE: also shadowed by a local list variable below).
    """
    print("%s : start" % now())
    models = {}
    assert isinstance(modelFilenames, list)
    for modelFilename in modelFilenames:
        with open(modelFilename, 'rb') as f:
            models[modelFilename] = pickle.load(f)
    # ID -> preferred term (genes and cancer types share one map);
    # HUGO ID -> Entrez ID, defaulting to 'NA' when unknown.
    IDToTerm = {}
    Hugo2Entrez = defaultdict(lambda: 'NA')
    with codecs.open(genes, 'r', 'utf-8') as f:
        for line in f:
            gene_hugo_id, singleterm, _, gene_entrez_id = line.strip().split('\t')
            IDToTerm[gene_hugo_id] = singleterm
            Hugo2Entrez[gene_hugo_id] = gene_entrez_id
    with codecs.open(cancerTypes, 'r', 'utf-8') as f:
        for line in f:
            cancerid, singleterm, _ = line.strip().split('\t')
            IDToTerm[cancerid] = singleterm
    # Rebinds the parameter: filterTerms is now the list of lowercase terms.
    with codecs.open(filterTerms, 'r', 'utf-8') as f:
        filterTerms = [line.strip().lower() for line in f]
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)
    # Truncate the output file
    with codecs.open(outData, 'w', 'utf-8') as outF:
        pass
    timers = Counter()
    print("%s : loading..." % now())
    with open(sentenceFile) as f:
        sentenceData = json.load(f)
    # One kindred Document per input sentence; everything except the text becomes metadata.
    corpus = kindred.Corpus()
    for sentence in sentenceData:
        metadata = dict(sentence)
        del metadata["sentence"]
        doc = kindred.Document(sentence["sentence"], metadata=metadata)
        corpus.addDocument(doc)
    print("%s : loaded..." % now())
    startTime = time.time()
    parser = kindred.Parser()
    parser.parse(corpus)
    timers['parser'] += time.time() - startTime
    print("%s : parsed" % now())
    startTime = time.time()
    ner = kindred.EntityRecognizer(lookup=termLookup, detectFusionGenes=False,
                                   detectMicroRNA=False, acronymDetectionForAmbiguity=True,
                                   mergeTerms=True, removePathways=True)
    ner.annotate(corpus)
    timers['ner'] += time.time() - startTime
    print("%s : ner" % now())
    # Append mode: the file was truncated above, so this run starts clean.
    with codecs.open(outData, 'a', 'utf-8') as outF:
        startTime = time.time()
        for modelname, model in models.items():
            model.predict(corpus)
        timers['predicted'] += time.time() - startTime
        print("%s : predicted" % now())
        startTime = time.time()
        for doc in corpus.documents:
            if len(doc.relations) == 0:
                continue
            # Map each entity back to its containing sentence (assumed unique per entity).
            entity_to_sentence = {}
            for sentence in doc.sentences:
                for entity, tokenIndices in sentence.entityAnnotations:
                    assert not entity in entity_to_sentence
                    entity_to_sentence[entity] = sentence
            for relation in doc.relations:
                sentence = entity_to_sentence[relation.entities[0]]
                sentenceTextLower = sentence.text.lower()
                # Keep only relations from sentences mentioning at least one filter term.
                hasFilterTerm = any(filterTerm in sentenceTextLower for filterTerm in filterTerms)
                if not hasFilterTerm:
                    continue
                #words = [ t.word for t in sentence.tokens ]
                #text = " ".join(words)
                sentenceStart = sentence.tokens[0].startPos
                relType = relation.relationType
                # Per-entity columns: externalID, [Entrez ID for genes], text,
                # normalized term, start/end offsets relative to the sentence.
                entityData = []
                for entity in relation.entities:
                    entityData.append(entity.externalID)
                    if entity.entityType == 'gene':
                        entityData.append(Hugo2Entrez[entity.externalID])
                    entityData.append(entity.text)
                    if entity.externalID.startswith('combo'):
                        # Fusion: normalize each component ID ('&' encodes ';').
                        externalIDsplit = entity.externalID.split('|')
                        normalizedTerms = [getNormalizedTerm("", st.replace('&', ';'), IDToTerm)
                                           for st in externalIDsplit[1:]]
                        normalizedTerm = "|".join(normalizedTerms)
                    elif entity.externalID.startswith('mirna|'):
                        normalizedTerm = normalizeMIRName(entity.externalID)
                    else:
                        normalizedTerm = getNormalizedTerm(entity.text, entity.externalID, IDToTerm)
                    entityData.append(normalizedTerm)
                    assert len(entity.position) == 1, "Expecting entities that are contigious and have only one start and end position within the text"
                    startPos, endPos = entity.position[0]
                    entityData.append(startPos - sentenceStart)
                    entityData.append(endPos - sentenceStart)
                # Only output rows that have a PMID; shadows the outData parameter.
                if doc.metadata["pmid"]:
                    m = doc.metadata
                    if not 'subsection' in m:
                        m['subsection'] = None
                    prob = relation.probability
                    outData = [m['pmid'], m['title'], m["journal"], m["year"], m["month"],
                               m["day"], m['section'], m['subsection'], relType, prob] + entityData + [sentence.text]
                    if applyFinalFilter(outData):
                        outLine = "\t".join(map(str, outData))
                        outF.write(outLine + "\n")
        timers['output'] += time.time() - startTime
        print("%s : output" % now())
    sys.stdout.flush()
    print("%s : done" % now())
    # Per-stage timing summary.
    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
for wordlist in args.wordlists.split(','): assert os.path.isfile(wordlist), 'Unable to access file: %s' % wordlist entityType = os.path.splitext(os.path.basename(wordlist))[0] wordlistDict[entityType] = wordlist print(" %s - %s" % (entityType, wordlist)) assert len( wordlistDict ) == 2, "This annotation tool currently only handles two entity relations of different types" wordlistLookup = kindred.EntityRecognizer.loadWordlists(wordlistDict, idColumn=0, termsColumn=0) print("Annotating entities in corpus with wordlists") entityRecognizer = kindred.EntityRecognizer(wordlistLookup) entityRecognizer.annotate(sentenceCorpus) print("Finding all candidate relations") acceptedEntityTypes = wordlistDict candidateBuilder = kindred.CandidateBuilder( entityCount=len(wordlistDict), acceptedEntityTypes=[tuple(sorted(wordlistDict.keys()))]) candidateRelations = candidateBuilder.build(sentenceCorpus) print( "Time to through some of the candidate relations and annotate some...") annotatedCorpus, unannotatedCorpus = kindred.manuallyAnnotate( sentenceCorpus, candidateRelations) print(
def parseAndFindEntities(biocFile, filterTermsFile, wordlistPickle, variantStopwordsFile, outSentencesFilename):
    """Find sentences mentioning a cancer, a gene AND a variant, and dump them to JSON.

    biocFile: BioC XML corpus path (iterated via kindred.iterLoad).
    filterTermsFile: one lowercase filter term per line; sentences must contain one.
    wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer.
    variantStopwordsFile: one variant term per line to suppress during variant detection.
    outSentencesFilename: output JSON path (list of metadata dicts with a 'sentence' key).
    """
    print("%s : start" % now())
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)
    with open(filterTermsFile, 'r') as f:
        filterTerms = [line.strip().lower() for line in f]
    with open(variantStopwordsFile) as f:
        variantStopwords = [line.strip() for line in f]
    timers = Counter()
    outSentences = []
    # Track the current document ID so the duplicate-sentence set resets per document.
    currentID = None
    duplicateCheck = set()
    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup, detectFusionGenes=True,
                                   detectMicroRNA=True, acronymDetectionForAmbiguity=True,
                                   mergeTerms=True, detectVariants=True,
                                   variantStopwords=variantStopwords)
    # corpusno is unused; enumerate kept for parity with the original.
    for corpusno, corpus in enumerate(kindred.iterLoad('biocxml', biocFile)):
        startTime = time.time()
        # Pre-filter documents before the (expensive) parse step.
        corpus = filterCorpus(corpus, filterTerms)
        timers['filter'] += time.time() - startTime
        startTime = time.time()
        parser.parse(corpus)
        timers['parser'] += time.time() - startTime
        print("%s : parsed" % now())
        startTime = time.time()
        ner.annotate(corpus)
        timers['ner'] += time.time() - startTime
        print("%s : ner" % now())
        startTime = time.time()
        for doc in corpus.documents:
            # Reset the duplicate check set for each new PMID
            if doc.metadata['id'] != currentID:
                currentID = doc.metadata['id']
                duplicateCheck = set()
            for sentence in doc.sentences:
                sentenceTextLower = sentence.text.lower()
                containsFilterTerm = any(ft in sentenceTextLower for ft in filterTerms)
                if not containsFilterTerm:
                    continue
                entityTypesInSentence = set([entity.entityType
                                             for entity, tokenIndices in sentence.entityAnnotations])
                foundCancer = 'cancer' in entityTypesInSentence
                foundGene = 'gene' in entityTypesInSentence
                foundVariant = 'variant' in entityTypesInSentence
                # Keep only sentences with all three entity types present.
                if foundCancer and foundGene and foundVariant:
                    sentenceText = sentence.text.strip(string.whitespace + ',')
                    if not sentenceText in duplicateCheck:
                        tmpData = dict(doc.metadata)
                        tmpData['sentence'] = sentenceText
                        outSentences.append(tmpData)
                        duplicateCheck.add(sentenceText)
        timers['entitiesAdded'] += time.time() - startTime
        print("%s : entities added" % now())
        sys.stdout.flush()
    with open(outSentencesFilename, 'w') as f:
        json.dump(outSentences, f, indent=2)
    print("%s : done" % now())
    # Per-stage timing summary.
    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
    print("%s\t%f" % ("parseAndFindEntities total", sum(timers.values())))
def parseAndFindEntities(biocFile,wordlistPickle,outSentencesFilename):
	"""Find sentences about tumor antigens (keyword + gene/protein, antigen topic) and dump them to JSON.

	NOTE(review): this redefines parseAndFindEntities with a different signature; if both
	versions live in one module, this definition silently shadows the earlier one.

	biocFile: BioC XML corpus path (iterated via kindred.iterLoadDataFromBioc).
	wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer.
	outSentencesFilename: output JSON path (list of metadata dicts with a 'sentence' key).
	"""
	print("%s : start" % now())
	with open(wordlistPickle,'rb') as f:
		termLookup = pickle.load(f)
	#with open(filterTermsFile,'r') as f:
	#	filterTerms = [ line.strip().lower() for line in f ]
	# Strict terms match on their own; the weak term additionally requires a cancer entity.
	strictFilterTerms = ['tumor antigen','tumour antigen','tumor-antigen','tumour-antigen']
	weakFilterTerms = ['antigen']
	timers = Counter()
	outSentences = []
	# Track the current document ID so the duplicate-sentence set resets per document.
	currentID = None
	duplicateCheck = set()
	print("%s : processing..." % now())
	parser = kindred.Parser()
	ner = kindred.EntityRecognizer(lookup=termLookup,detectFusionGenes=True,detectMicroRNA=False,acronymDetectionForAmbiguity=True,mergeTerms=True)
	# corpusno is unused; enumerate kept for parity with the original.
	for corpusno,corpus in enumerate(kindred.iterLoadDataFromBioc(biocFile)):
		startTime = time.time()
		# Pre-filter documents on the weak term before the (expensive) parse step.
		corpus = filterCorpus(corpus,weakFilterTerms)
		timers['filter'] += time.time() - startTime
		startTime = time.time()
		parser.parse(corpus)
		timers['parser'] += time.time() - startTime
		print("%s : parsed" % now())
		startTime = time.time()
		ner.annotate(corpus)
		timers['ner'] += time.time() - startTime
		print("%s : ner" % now())
		startTime = time.time()
		for doc in corpus.documents:
			# Reset the duplicate check set for each new PMID
			if doc.metadata['id'] != currentID:
				currentID = doc.metadata['id']
				duplicateCheck = set()
			for sentence in doc.sentences:
				sentenceTextLower = sentence.text.lower()
				#print(sentence.text)
				#print(sentence.entitiesWithLocations)
				entityTypesInSentence = set([ entity.entityType for entity,tokenIndices in sentence.entityAnnotations ])
				gotKeyword = 'keyword' in entityTypesInSentence
				gotGene = 'gene' in entityTypesInSentence
				gotProtein = 'protein' in entityTypesInSentence
				gotCancer = 'cancer' in entityTypesInSentence
				containsStrictTerm = any( ft in sentenceTextLower for ft in strictFilterTerms )
				containsWeakTerm = any( ft in sentenceTextLower for ft in weakFilterTerms )
				# Topic match: a strict term, or the weak term alongside a cancer mention.
				topicMatch = containsStrictTerm or (containsWeakTerm and gotCancer)
				if topicMatch and gotKeyword and (gotGene or gotProtein):
					sentenceText = sentence.text.strip(string.whitespace + ',')
					if not sentenceText in duplicateCheck:
						tmpData = dict(doc.metadata)
						tmpData['sentence'] = sentenceText
						outSentences.append(tmpData)
						duplicateCheck.add(sentenceText)
		timers['entitiesAdded'] += time.time() - startTime
		print("%s : entities added" % now())
		sys.stdout.flush()
	with open(outSentencesFilename,'w') as f:
		json.dump(outSentences,f,indent=2)
	print("%s : done" % now())
	# Per-stage timing summary.
	for section,sectiontime in timers.items():
		print("%s\t%f" % (section,sectiontime))