def annotateFile(self, article, file):
    """Annotate the identifiers found in a file, resolving them to genes.

    The text in ``file.content`` is scanned with
    ``geneFinder.findIdentifiers``. Every annotation is a six-element list
    ``[start, end, type, id, sourceType, sourceId]``:

    * Markers of type band, genbank, refseq, ensembl and snp are never
      resolved; they are reported as-is with the last two columns empty.
      snp ids get an "rs" prefix.
    * Any other marker is looked up with ``geneFinder.markerToGenes``. When
      the lookup returns a mapping with fewer than MAXGENEPERID entries, one
      "gene" row is emitted per entry, carrying that entry's gene symbol and
      keeping the original marker type and id in the last two columns.
      An empty mapping therefore produces no row at all.
    * When the lookup returns None or too many genes, the marker is
      reported unresolved, like the never-resolved types.

    ``article`` is not used by this method.

    Returns the list of rows, or None (after logging) when more than
    MAXCOUNT rows were collected for the file.
    """
    # marker types that are reported verbatim and never mapped to genes
    unresolvedTypes = ("band", "genbank", "refseq", "ensembl", "snp")

    rows = []
    for start, end, markerType, markerId in geneFinder.findIdentifiers(file.content):
        if markerType in unresolvedTypes:
            if markerType == "snp":
                markerId = "rs" + markerId
            rows.append([start, end, markerType, markerId, "", ""])
        else:
            # try to resolve most other types to genes
            genes = geneFinder.markerToGenes(markerType, markerId)
            if genes is not None and len(genes) < MAXGENEPERID:
                # One row per resolved gene. The previous code re-read the
                # first dict entry on every pass, so it repeated a single
                # gene symbol instead of listing each gene (and indexing
                # items() fails on Python 3).
                for geneId, geneSym in genes.items():
                    rows.append([start, end, "gene", geneSym, markerType, markerId])
            else:
                rows.append([start, end, markerType, markerId, "", ""])

        # rows only ever grows, so bailing out here gives the same result
        # as checking once after the loop, without scanning the rest
        if len(rows) > MAXCOUNT:
            logging.info("%d annotations, too mant, skipping file %s",
                         MAXCOUNT, file.externalId)
            return None
    return rows
def map(self, article, file, text, resultDict):
    """Record in resultDict which file each identifier in text occurs in.

    Every identifier returned by ``geneFinder.findIdentifiers(text)`` is
    used as a key of ``resultDict``; the value is a set to which
    ``file.fileId`` is added. Only the fourth element of each match (the
    identifier itself) is used; ``article`` is not used. Nothing is
    returned, resultDict is modified in place.
    """
    # collect all matches first, resultDict is only touched afterwards
    found = list(geneFinder.findIdentifiers(text))
    for _start, _end, _markerType, identifier in found:
        if identifier not in resultDict:
            resultDict[identifier] = set()
        resultDict[identifier].add(file.fileId)