def annotateFile(article, file):
    """ go over words of text and check if they are in dict

    Runs fastFind over file.content with the module-level lexicon `lex` and
    returns a list of (start, end, id, type, word) tuples, where word is
    text[start:end].

    - files with more than 200 matches are skipped: an info message is logged
      and an empty list is returned
    - matches whose word is an author acronym or an acronym defined in the
      text are dropped
    - type is "acc" if the lexicon id starts with "*" (the leading stars are
      removed from the returned id), otherwise "name"
    """
    text = file.content
    rows = fastFind(text, lex, wordRegex=r"[\w'_-]+")

    if len(rows) > 200:
        logging.info(
            "%d gene matches in file (>200), skipping article %s/file %s, fileType %s",
            len(rows), article.externalId, file.fileId, file.fileType)
        return []

    # the acronym tables are only needed for files that are actually
    # annotated, so they are built after the bail-out above
    autAcros = getAuthorAcronyms(article.authors)
    acros = getAcronyms(text)

    newRows = []
    for start, end, matchId in rows:
        word = text[start:end]

        if word in autAcros:
            logging.debug("Ignoring %s: looks like author acronym for author %s",
                          word, autAcros[word])
            continue

        if word in acros:
            logging.debug("Ignoring %s: looks like an acronym defined in the text for %s",
                          word, acros[word])
            continue

        # ids prefixed with "*" are reported with type "acc", all others as "name"
        if matchId.startswith("*"):
            matchId = matchId.lstrip("*")
            matchType = "acc"
        else:
            matchType = "name"

        newRows.append((start, end, matchId, matchType, word))

    return newRows
def annotateFile(article, file):
    """ go over words of text and check if they are in dict

    Runs fastFind (with toLower=True) over file.content with the module-level
    lexicon `lex` and returns a list of (start, end, id, type, word) tuples,
    where word is text[start:end] taken from the original text.

    - files with more than 200 matches are skipped: an info message is logged
      and an empty list is returned
    - matches whose word is an author acronym or an acronym defined in the
      text are dropped
    - type is "acc" if the lexicon id starts with "*" (the leading stars are
      removed from the returned id), otherwise "name"
    """
    text = file.content
    rows = fastFind(text, lex, wordRegex=r"[\w'_-]+", toLower=True)

    if len(rows) > 200:
        logging.info(
            "%d gene matches in file (>200), skipping article %s/file %s, fileType %s",
            len(rows), article.externalId, file.fileId, file.fileType)
        return []

    # build the acronym tables only once we know the file is not skipped
    autAcros = getAuthorAcronyms(article.authors)
    acros = getAcronyms(text)

    newRows = []
    for start, end, matchId in rows:
        word = text[start:end]

        if word in autAcros:
            logging.debug("Ignoring %s: looks like author acronym for author %s",
                          word, autAcros[word])
            continue

        if word in acros:
            logging.debug("Ignoring %s: looks like an acronym defined in the text for %s",
                          word, acros[word])
            continue

        # a leading "*" on the lexicon id selects type "acc" instead of "name"
        if matchId.startswith("*"):
            matchId = matchId.lstrip("*")
            matchType = "acc"
        else:
            matchType = "name"

        newRows.append((start, end, matchId, matchType, word))

    return newRows
def findGeneNames(text):
    """
    look for gene names and symbols. Some symbols need flanking trigger words. If these are not
    present, they are returned as "symbolMaybe"
    Will always return the gene name matches before the symbol matches.

    Yields tuples (start, end, matchType, geneId) where matchType is one of
    'geneName', 'symbol' or 'symbolMaybe'.

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None

    # gene names are searched in the lowercased text; the text is lowercased
    # here, so fastFind is called with toLower=False
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex, toLower=False):
        yield (start, end, 'geneName', geneId)

    # symbols are searched in the original text, together with up to
    # wordDist=2 flanking words on each side
    flankFindIter = fastFind.fastFindFlankWords(text, geneSymLex, wordDist=2,
                                                wordRe=fastFind.SYMRE, toLower=False)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous, check the flanking words
        if geneId.startswith("?"):
            geneId = geneId.strip("?")
            leftLower = [w.lower() for w in leftWords]
            rightLower = [w.lower() for w in rightWords]
            # a trigger word on either side is enough to accept the symbol
            if symLeftReqWords.intersection(leftLower) or \
                    symRightReqWords.intersection(rightLower):
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
            continue

        # otherwise pass them through,
        # but ignore genes that are immediately flanked by "pathway"
        flankWords = getFlankWords(start, end, textLower)
        if "pathway" in flankWords:
            logging.debug("ignored %s, flank words are %s", text[start:end], flankWords)
            continue
        yield (start, end, 'symbol', geneId)
def searchDiseases(text):
    """ search the lowercased text with the module-level lexicon `lex`

    Returns a list of (start, end, matchedText, snippet) tuples. matchedText is
    the matched slice of the lowercased text and snippet is what
    pubAlg.getSnippet returns for that match on the lowercased text.
    """
    lowered = text.lower()
    return [
        (start, end, lowered[start:end], pubAlg.getSnippet(lowered, start, end))
        for start, end, _lexId in fastFind.fastFind(lowered, lex)
    ]
def annotateFile(self, article, file):
    """ go over words of text and check if they are in dict

    Returns the list of fastFind matches of self.lex in file.content
    (searched with toLower=True). If there are more than self.MAXCOUNT
    matches, an info message is logged and None is returned instead.
    """
    matches = list(fastFind.fastFind(file.content, self.lex, toLower=True))
    if len(matches) <= self.MAXCOUNT:
        return matches

    logging.info("more than %d annotations, skipping %s" % (self.MAXCOUNT, article.externalId))
    return None
def findDiseases(text):
    """ find diseases in string and return as (start, end, diseaseName)

    This is a generator. The disease lexicon is loaded from
    <pubConf.staticDataDir>/diseases/diseases.marshal.gz the first time it is
    needed and cached in the module-level global disLex.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
def findGeneNames(text):
    """
    look for gene names and symbols. Some symbols need flanking trigger words. If these are not
    present, they are returned as "symbolMaybe"
    Will always return the gene name matches before the symbol matches.

    Yields tuples (start, end, matchType, geneId) where matchType is one of
    'geneName', 'symbol' or 'symbolMaybe'.

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None

    # gene names are searched in a lowercased copy of the text
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex):
        yield (start, end, 'geneName', geneId)

    # symbols are searched in the original text, together with up to
    # wordDist=2 flanking words on each side
    flankFindIter = fastFind.fastFindFlankWords(text, geneSymLex, wordDist=2,
                                                wordRe=fastFind.SYMRE)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous, check the flanking words
        if geneId.startswith("?"):
            geneId = geneId.strip("?")
            leftLower = [w.lower() for w in leftWords]
            rightLower = [w.lower() for w in rightWords]
            # a trigger word on either side is enough to accept the symbol
            if symLeftReqWords.intersection(leftLower) or \
                    symRightReqWords.intersection(rightLower):
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
            continue

        # otherwise pass them through,
        # but ignore genes that are immediately flanked by "pathway"
        flankWords = getFlankWords(start, end, textLower)
        if "pathway" in flankWords:
            logging.debug("ignored %s, flank words are %s", text[start:end], flankWords)
            continue
        yield (start, end, 'symbol', geneId)
def findDiseases(text):
    """ find diseases in string and return as (start, end, diseaseName)

    Generator. On first use the disease lexicon is loaded from
    <pubConf.staticDataDir>/diseases/diseases.marshal.gz and kept in the
    module-level global disLex, so later calls reuse it.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
def findDrugs(text):
    """ find drugs in string and return as (start, end, drugbankName)

    Generator. On first use the drug lexicon is loaded from
    <pubConf.staticDataDir>/drugs/drugbank.marshal.gz and kept in the
    module-level global drugLex. Matches whose lowercased name is in
    drugBlacklist are not returned.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    if drugLex is None:
        drugPath = join(pubConf.staticDataDir, "drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
def findCells(text):
    """ find cell types

    Generator, yields (start, end, cellTypeName) for matches in the lowercased
    text. The lexicon is loaded on first use from the static file
    cellTypes/cellTypes.marshal and kept in the module-level global cellLex.
    Matches whose lowercased name is in cellBlackList are not returned.

    >>> list(findCells("Oligodendrocytes and neural progenitors."))
    [(0, 16, 'oligodendrocyte')]
    """
    global cellLex
    if cellLex is None:
        # resolve the lexicon path only when the lexicon actually has to be loaded
        dictFname = pubConf.getStaticFile("cellTypes", "cellTypes.marshal")
        cellLex = fastFind.loadLex(dictFname)

    for (start, end, name) in fastFind.fastFind(text.lower(), cellLex):
        if name.lower() in cellBlackList:
            continue
        yield start, end, name
def findDrugs(text):
    """ find drugs in string and return as (start, end, drugbankName)

    Generator. On first use the drug lexicon is loaded from the static file
    drugs/drugbank.marshal.gz (located via pubConf.getStaticFile) and kept in
    the module-level global drugLex. Matches whose lowercased name is in
    drugBlacklist are not returned.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    if drugLex is None:
        drugPath = pubConf.getStaticFile("drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
def annotateFile(article, file):
    """ go over words of text and check if they are in dict

    Generator. Splits file.content into sentences with pubNlp.sectionSentences
    and yields one row per sentence in which every lexicon in the module-level
    `lexes` has between 1 and 10 matches:

        (sentStart, sentEnd, matchDesc, positions, sent)

    matchDesc is "lexName=word,word,..." for each lexicon, joined with "|";
    positions is "start-end" for every match, joined with "|".

    Nothing is yielded if the text is longer than 100000 characters or if
    reqStrings is set and none of its strings occurs in the text.
    """
    text = file.content
    if len(text) > 100000:
        return

    # NOTE(review): this check is case-sensitive on the original text; the
    # previous code also built an unused lowercased copy - confirm whether a
    # case-insensitive match was intended
    if reqStrings is not None and not any(rs in text for rs in reqStrings):
        return

    for _section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        posList = []
        allMatches = []
        for lexName, lex in lexes.iteritems():
            lexMatches = fastFind(sent, lex, toLower=toLower)
            # every lexicon must match, and not more than 10 times,
            # otherwise the whole sentence is dropped
            if len(lexMatches) == 0 or len(lexMatches) > 10:
                break

            matches = []
            for start, end, word in lexMatches:
                # "=" and "," are separators in the output field
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        else:
            # only reached if no lexicon rejected the sentence
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent
def annotateFile(article, file):
    """ go over words of text and check if they are in dict

    Generator. The text of the file is split into sentences with
    pubNlp.sectionSentences; a sentence is reported only if each lexicon in
    the module-level `lexes` matches it at least once and at most 10 times.
    Each reported row is:

        (sentStart, sentEnd, matchDesc, positions, sent)

    matchDesc is "lexName=word,word,..." for each lexicon, joined with "|";
    positions is "start-end" for every match, joined with "|".

    Nothing is yielded if the text is longer than 100000 characters or if
    reqStrings is set and none of its strings occurs in the text.
    """
    text = file.content
    if len(text) > 100000:
        return

    # NOTE(review): the required strings are searched case-sensitively in the
    # original text; an unused lowercased copy was removed here - confirm
    # whether a case-insensitive search was intended
    if reqStrings is not None and not any(rs in text for rs in reqStrings):
        return

    for _section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        posList = []
        allMatches = []
        for lexName, lex in lexes.iteritems():
            lexMatches = fastFind(sent, lex, toLower=toLower)
            # a lexicon with no matches or more than 10 matches rejects the sentence
            if len(lexMatches) == 0 or len(lexMatches) > 10:
                break

            matches = []
            for start, end, word in lexMatches:
                # keep the "=" and "," separators of the output field unambiguous
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        else:
            # the loop finished without a rejecting lexicon
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent