# NOTE: this excerpt assumes module-level imports of logging and of the
# pubMunch helper modules it calls (pubNlp, geneFinder, varFinder).
def findVarDisGeneDrug(pmid, text):
    """
    >>> startup({})
    >>> list(findVarDisGeneDrug(0, "The R71G BRCA1 is a breast cancer founder mutation not treatable with Herceptin"))
    """
    textLow = text.lower()
    # very basic filter: skip documents that lack some basic keywords
    if " variant " not in textLow and " mutation" not in textLow and \
       " substitution" not in textLow and " mutant " not in textLow:
        return

    for section, sentStart, sentEnd, sentText in pubNlp.sectionSentences(text):
        genes = list(geneFinder.findGeneNames(sentText))
        if len(genes) == 0:
            continue
        #print "genes", genes, sentText

        conds = list(pubNlp.findDiseases(sentText))
        drugs = list(pubNlp.findDrugs(sentText))
        # remove diseases and drugs that are also genes
        drugs = pubNlp.rangeRemoveOverlaps(drugs, genes)
        conds = pubNlp.rangeRemoveOverlaps(conds, genes)
        # check if we still have a disease and a drug left
        if len(conds) == 0 or len(drugs) == 0:
            continue
        logging.debug("drugs: %s" % drugs)
        logging.debug("diseases: %s" % conds)

        geneSnips = pubNlp.rangeTexts(sentText, genes)
        condSnips = pubNlp.rangeTexts(sentText, conds)
        drugSnips = pubNlp.rangeTexts(sentText, drugs)

        # look for variant descriptions, but not inside gene names
        genePosSet = pubNlp.rangeToPosSet(genes)
        variants = varFinder.findVariantDescriptions(sentText, exclPos=genePosSet)

        # the last field of the gene rows is the Entrez ID
        entrezIds = [r[-1] for r in genes]

        # we need a protein variant, not DNA
        if "prot" not in variants:
            continue

        for variant, mentions in variants["prot"]:
            logging.debug("grounding variant %s %s" % (variant, mentions))
            groundedMuts, ungroundVar, beds = \
                varFinder.groundVariant(pmid, sentText, variant, mentions, [], entrezIds)
            for mutInfo in groundedMuts:
                coords = [(m.start, m.end) for m in mentions]
                varSnips = pubNlp.rangeTexts(sentText, coords)
                row = [section, "|".join(geneSnips), "|".join(condSnips),
                       "|".join(drugSnips), "|".join(varSnips), sentText]
                yield row
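# Illustrative sketch only, not the actual pubNlp implementation: the range
# helpers used above operate on tuples whose first two fields are (start, end)
# character offsets into the sentence. Minimal reimplementations, inferred from
# the calls above, could look like this (the "_sketch" names are hypothetical):

def _sketchRangeRemoveOverlaps(ranges, filterRanges):
    " drop every range that overlaps any range in filterRanges "
    res = []
    for r in ranges:
        if not any(r[0] < f[1] and f[0] < r[1] for f in filterRanges):
            res.append(r)
    return res

def _sketchRangeTexts(text, ranges):
    " return the substrings of text covered by the ranges "
    return [text[r[0]:r[1]] for r in ranges]

def _sketchRangeToPosSet(ranges):
    " return the set of all character positions covered by the ranges "
    posSet = set()
    for r in ranges:
        posSet.update(range(r[0], r[1]))
    return posSet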
def annotateFile(article, file):
    " go over the words of the text and check if they are in the dictionaries "
    # NOTE: relies on the module-level globals reqStrings, lexes and toLower,
    # set up elsewhere in the module, and on the fastFind() keyword matcher.
    text = file.content
    if len(text) > 100000:
        return

    if reqStrings is not None:
        # very basic filter: the document must contain at least one required string
        found = False
        #sentLower = sent.lower()
        textLower = text.lower()  # currently unused; the check below is case-sensitive
        for rs in reqStrings:
            #if sentLower.find(rs)!=-1:
            if text.find(rs) != -1:
                found = True
                break
        if not found:
            return

    for section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        #if len(sent)<20:
            #logging.debug("Sentence too short: %d characters" % len(text))
            #continue
        #if len(sent)>2000:
            #logging.debug("Sentence too long: %d characters" % len(text))
            #continue
        found = True
        posList = []
        allMatches = []
        # every lexicon has to match at least once, but not more than ten times
        for lexName, lex in lexes.iteritems():
            matches = []
            lexMatches = fastFind(sent, lex, toLower=toLower)
            if len(lexMatches) == 0 or len(lexMatches) > 10:
                found = False
                break
            for start, end, word in lexMatches:
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))

        if found:
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent
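# Hypothetical usage sketch, not part of the original module: drive the
# dictionary annotator over one (article, file) record and write its rows as
# tab-separated lines. The field layout of each yielded row (sentStart,
# sentEnd, matches, positions, sentence) comes from the function above; the
# outFh argument is assumed to be an open, writable text file handle.
def _sketchWriteDictAnnotations(article, file, outFh):
    for sentStart, sentEnd, matches, positions, sent in annotateFile(article, file):
        fields = [str(sentStart), str(sentEnd), matches, positions, sent]
        outFh.write("\t".join(fields) + "\n")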
def annotateFile(self, article, file):
    # method of an annotator class: self.rowCount tracks how many rows have
    # been written so far; supplementary files are skipped entirely
    text = file.content
    pmid = article.pmid
    if file.fileType == "supp":
        return

    for row in pubNlp.sectionSentences(text, file.fileType):
        section, sentStart, sentEnd, text = row
        tokens = text.split()
        if len(tokens) < 6:
            logging.debug("Sentence too short: %d tokens" % len(tokens))
            continue
        if len(tokens) > 40:
            logging.debug("Sentence too long: %d tokens" % len(tokens))
            continue
        if len(text) < 20:
            logging.debug("Sentence too short: %d characters" % len(text))
            continue
        if len(text) > 1000:
            logging.debug("Sentence too long: %d characters" % len(text))
            continue
        if text.count('"') > 20 or text.count(",") > 20:
            logging.debug("Too many strange characters")
            continue

        genes = list(geneFinder.findGeneNames(text))
        if len(genes) < 2:
            continue
        if len(genes) > 20:
            logging.debug("Too many genes, %d" % len(genes))
            continue

        geneDescs = ["%d-%d/%s/%s/%s" % (start, end, text[start:end], name, gid) \
            for start, end, name, gid in genes]
        geneDesc = "|".join(geneDescs)

        # one output row per sentence; the coordinates are the sentence offsets
        row = [pmid, section, sentStart, sentEnd, text, geneDesc]
        yield row

        self.rowCount += 1
        if self.rowCount % 200 == 0:
            yield []  # tell the caller to start a new output file
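# Hypothetical consumer sketch, based only on the contract visible above: the
# annotator yields normal rows and, every 200 rows, an empty list as a signal
# to rotate the output file. "annot" and "makeOutFile" are illustrative names,
# not part of the original code; makeOutFile(n) is assumed to return an open,
# writable file handle for chunk number n.
def _sketchRunGenePairAnnotator(annot, article, file, makeOutFile):
    chunkId = 0
    outFh = makeOutFile(chunkId)
    for row in annot.annotateFile(article, file):
        if len(row) == 0:
            # empty row = request to start a new output file
            chunkId += 1
            outFh.close()
            outFh = makeOutFile(chunkId)
            continue
        outFh.write("\t".join([str(f) for f in row]) + "\n")
    outFh.close()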
def findDisGeneVariant(text):
    """
    >>> geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    >>> varFinder.loadDb(loadSequences=False)
    >>> list(findDisGeneVariant("Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."))
    [(0, 74, 'probablyAbstract', '64-73:Herceptin=Trastuzumab', '0-8:Diabetes=Diabetes Mellitus', '24-29:PITX2=symbol', 'V233T', 'Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin.')]
    >>> #list(findDisGeneVariant("We undertook a quantitative review of the literature to estimate the effectiveness of desferrioxamine and deferiprone in decreasing hepatic iron concentrations (HIC) in thalassemia major."))
    >>> list(findDisGeneVariant("his mutation, we cotransfected C3H10T cells with expression vectors encoding SMO-WT or SMO-D473H "))
    """
    # NOTE: rangeRemoveOverlaps and rangeDescs (and the commented-out
    # rangeTexts) are called without a module prefix here; they are assumed to
    # be imported from pubNlp.
    docGenes = list(geneFinder.findGeneNames(text))
    docEntrezIds = set([r[-1] for r in docGenes])

    for section, start, end, sentence in pubNlp.sectionSentences(text):
        conds = list(pubNlp.findDiseases(sentence))
        drugs = list(pubNlp.findDrugs(sentence))
        genes = list(geneFinder.findGeneNames(sentence))
        #print conds, drugs, genes, section, sentence

        # remove drugs and conds that are also genes
        drugs = rangeRemoveOverlaps(drugs, genes)
        conds = rangeRemoveOverlaps(conds, genes)

        #geneSnips = rangeTexts(sentence, genes, useSym=True)
        #condSnips = rangeTexts(sentence, conds)
        #drugSnips = rangeTexts(sentence, drugs)

        mutDescs = []
        mutDict = varFinder.findVariantDescriptions(sentence)
        if "prot" in mutDict:
            for varDesc, mentions in mutDict["prot"]:
                # only keep substitutions
                if varDesc.mutType != "sub":
                    continue
                logging.debug("grounding variant: %s %s" % (varDesc, mentions))
                groundedMuts, ungroundVar, beds = \
                    varFinder.groundVariant(None, sentence, varDesc, mentions, [], docEntrezIds)
                for mutInfo in groundedMuts:
                    # varDesc.start is 0-based, so add 1 for the usual protein notation
                    shortDesc = varDesc.origSeq + str(varDesc.start + 1) + varDesc.mutSeq
                    mutDescs.append(shortDesc + "=%s:%s" % (mutInfo.geneSymbol, mutInfo.hgvsProt))

        #mutMatches = list(mutRe.finditer(sentence))
        #mutDescs = [(m.group(1),m.group(2), m.group(3)) for m in mutMatches]
        #mutDescSet = set(mutDescs)
        #blackListMuts = mutDescSet.intersection(blackListStr)
        #if len(mutMatches)==0:
            #logging.debug("No mutation found, skipping")
            #continue
        #if len(blackListMuts)!=0:
            #logging.debug("At least one blacklisted mutation found, skipping")
            #continue
        #if len(drugs)==0:
            #logging.debug("No drugs found, skipping")
            #continue
        #if len(genes)==0:
            #logging.debug("No genes found, skipping")
            #continue

        mutDesc = "|".join(mutDescs)
        drugDesc = rangeDescs(sentence, drugs)
        condDesc = rangeDescs(sentence, conds)
        geneDesc = rangeDescs(sentence, genes, useSym=True)

        ret = (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)
        yield ret
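# Illustrative sketch of rangeDescs, not the real pubNlp code: the doctest
# above shows descriptions like '64-73:Herceptin=Trastuzumab', i.e.
# "start-end:textSnippet=identifier". Assuming each range row carries
# (start, end, identifier, ...) fields and that multiple descriptions are
# joined with "|" (the separator used elsewhere in this module), a minimal
# version could look like this; the useSym option of the real helper is not
# modelled here.
def _sketchRangeDescs(text, ranges):
    descs = []
    for r in ranges:
        start, end, ident = r[0], r[1], r[2]
        descs.append("%d-%d:%s=%s" % (start, end, text[start:end], ident))
    return "|".join(descs)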