def parseLines(input, entrez, relex, authors):
    """
    Yield one record per sentence that could describe a gene interaction.

    Walks the (pmid, sentence) pairs produced by parseSentences(input). Each
    sentence is decoded as UTF-8 and transliterated with unidecode(), since
    not every downstream program can handle Unicode characters. When decoding
    fails, a warning is logged and the sentence is used unchanged.

    Sentences are dropped when:
      * isReference() flags them as a reference, using the authors database;
      * fewer than two gene names were extracted; or
      * no relations were found (so no interaction is possible).

    Genes are located with geneFinder.findGenes() and their metainfo is
    pulled out with extractGenes(), both on the transliterated text. The
    reference check and findRelations() operate on the raw sentence.

    Yields tuples of
    (pmid, transliterated sentence, geneIds, geneNames, rawNames, relations).
    """
    for pmid, sentence in parseSentences(input):
        try:
            asciiText = unidecode(sentence.decode('utf-8'))
        except UnicodeDecodeError:
            logging.warning(
                "Can't process Unicode for {}: {}".format(pmid, sentence)
            )
            # Fall back to the undecoded sentence instead of dropping it.
            asciiText = sentence

        if not isReference(sentence, authors):
            geneHits, _ = geneFinder.findGenes(asciiText)
            geneIds, geneNames, rawNames = extractGenes(
                geneHits, entrez, asciiText
            )
            relations = findRelations(sentence, relex)

            # An interaction needs at least two genes and one relation.
            if len(geneNames) >= 2 and len(relations) != 0:
                yield (pmid, asciiText, geneIds, geneNames, rawNames,
                       relations)
def findGenes(pmid, text):
    """
    Look up the genes mentioned in text via geneFinder.findGenes().

    Note the argument order: this wrapper takes (pmid, text) but forwards
    them to geneFinder.findGenes() as (text, pmid).

    Returns a pair. The first element is a dict of
    entrezGene id -> mType -> (markerId, list of start, end).
    The second element is geneFinder's gene position set, passed through
    unchanged (its exact layout is not visible here; see geneFinder).
    """
    found = geneFinder.findGenes(text, pmid)
    # Unpack explicitly so a result of the wrong arity fails here.
    geneHits, positions = found
    return geneHits, positions