def annotateFile(article, file):
    """Rank genes found in a file's text and return one summary row per gene.

    Each row is [firstStart, firstEnd, pmid, geneScore, rank, geneId, sym,
    uniprotIds, supportStr, snippets]. Returns None when more than 200 genes
    are found (the document is considered too noisy to annotate).
    """
    text = file.content
    geneScores, geneSupport = geneFinder.rankGenes(text, pmid=article.pmid)
    rank = 1
    rows = []
    for geneId, geneScore in geneScores:
        supportStrs = []
        # Initialize BOTH to None: the original left firstEnd unbound, which
        # raised NameError when a gene had no support entries at all.
        firstStart, firstEnd = None, None
        snippets = []
        for markerType, recogId, startEndList in geneSupport[geneId]:
            supportStrs.append(markerType + "/" + recogId)
            # only the first (start, end) of each support entry is used
            start, end = startEndList[0]
            if firstStart is None:
                firstStart, firstEnd = start, end
            snippets.append(text[start:end])
        supportStr = "|".join(supportStrs)
        sym = geneFinder.entrezToSym.get(geneId, "<NOSYM?>")
        uniprotIds = ",".join(geneFinder.entrezToUp.get(geneId, ["<NOUNIPROT>"]))
        pmid = article.pmid
        row = [firstStart, firstEnd, pmid, geneScore, rank, geneId, sym,
               uniprotIds, supportStr, "|".join(snippets)]
        rows.append(row)
        rank += 1
        if len(rows) > 200:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("too many genes found in document, skipping all")
            return None
    return rows
def annotateFile(article, file):
    """Rank genes found in a file's text and return one row per support match.

    Each row is [start, end, geneId, sym, geneScore, rank, uniprotIds,
    supportStr]. Returns None when more than 200 rows accumulate (the
    document is considered too noisy to annotate).
    """
    text = file.content
    geneScores, geneSupport = geneFinder.rankGenes(text, pmid=article.pmid)
    rank = 1
    rows = []
    for geneId, geneScore in geneScores:
        # NOTE: removed unused leftovers from an earlier variant of this
        # function (supportStrs, snippets, firstStart, start/end initializers)
        sym = geneFinder.entrezToSym.get(geneId, "<NOSYM?>")
        uniprotIds = ",".join(
            geneFinder.entrezToUp.get(geneId, ["<NOUNIPROT>"]))
        for markerType, recogId, startEndList in geneSupport[geneId]:
            supportStr = markerType + "/" + recogId
            # one output row per (start, end) occurrence of this marker
            for start, end in startEndList:
                rows.append([start, end, geneId, sym, geneScore, rank,
                             uniprotIds, supportStr])
        rank += 1
        if len(rows) > 200:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("too many genes found in document, skipping all")
            return None
    return rows
def annotateFile(article, file):
    """Find phosphorylation sites in a file's text.

    Resolves the top-ranked gene to uniprot IDs, then scans each sentence for
    phospho-words and site patterns. Returns a list of rows
    [sentStart, sentEnd, pmid, uniprotIds, aa, sitePos, phosWords], or None
    (implicit) when no gene is found in the text.
    """
    text = file.content
    # split on sentence-ending punctuation followed by a space
    tokenizer = nltk.tokenize.RegexpTokenizer(r'[.:?!] ', gaps=True)
    resultRows = []  # was assigned twice in the original; once is enough

    # search for genes in text; only the single top-ranked gene is used
    genes, geneSupp = geneFinder.rankGenes(text, pmid=article.pmid)
    if len(genes) == 0:
        logging.warning("No gene found")
        return

    # resolve the top gene's entrez ID to uniprot IDs
    uniprotIds = []
    topGenes = [genes[0][0]]
    for entrezId in topGenes:
        upIds = geneFinder.entrezToUp.get(entrezId, None)
        if upIds is None:
            logging.warning("cannot map %s to uniprot" % str(entrezId))
            continue
        uniprotIds.extend(upIds)
    if len(uniprotIds) == 0:
        uniprotIds = ["UNKNOWN"]

    # now find sites in text, sentence by sentence
    for sentStart, sentEnd in tokenizer.span_tokenize(text):
        phrase = text[sentStart:sentEnd]
        # collect all phospho-trigger words in this sentence
        phosWords = [m.group(1) for m in phosphoWordRe.finditer(phrase)]
        for siteRe in siteRes:
            for match in siteRe.finditer(phrase):
                aa = match.group("aa")
                sitePos = match.group("pos")
                row = [sentStart, sentEnd, article.pmid,
                       ",".join(uniprotIds), aa, sitePos,
                       ",".join(phosWords)]
                resultRows.append(row)
    return resultRows
def annotateFile(article, file):
    """Scan a file's text for phosphorylation-site mentions.

    Maps the highest-ranked gene to uniprot IDs, then walks the text sentence
    by sentence collecting phospho-words and amino-acid/position matches.
    Each result row is [sentStart, sentEnd, pmid, uniprotIds, aa, sitePos,
    phosWords]; returns None (implicit) when no gene is found.
    """
    text = file.content
    # sentences are delimited by '.', ':', '?' or '!' followed by a space
    tokenizer = nltk.tokenize.RegexpTokenizer(r'[.:?!] ', gaps=True)
    resultRows = []  # original assigned this twice; one binding suffices

    # rank genes in the text; bail out when none are found
    genes, geneSupp = geneFinder.rankGenes(text, pmid=article.pmid)
    if not genes:
        logging.warning("No gene found")
        return

    # resolve only the top-ranked gene to its uniprot IDs
    uniprotIds = []
    topGenes = [genes[0][0]]
    for entrezId in topGenes:
        upIds = geneFinder.entrezToUp.get(entrezId, None)
        if upIds is None:
            logging.warning("cannot map %s to uniprot" % str(entrezId))
            continue
        uniprotIds.extend(upIds)
    if not uniprotIds:
        uniprotIds = ["UNKNOWN"]

    # scan every sentence for site patterns
    for sentStart, sentEnd in tokenizer.span_tokenize(text):
        phrase = text[sentStart:sentEnd]
        phosWords = [m.group(1) for m in phosphoWordRe.finditer(phrase)]
        for siteRe in siteRes:
            for match in siteRe.finditer(phrase):
                aa = match.group("aa")
                sitePos = match.group("pos")
                row = [sentStart, sentEnd, article.pmid,
                       ",".join(uniprotIds), aa, sitePos,
                       ",".join(phosWords)]
                resultRows.append(row)
    return resultRows