def parseLines(input, entrez, relex, authors):
	"""
	Generator over the (pmid, sentence) pairs produced by parseSentences(input).

	For every sentence:
	  * try to UTF-8 decode it and transliterate it with unidecode(); if the
	    decode fails, log a warning and fall back to the raw sentence,
	  * skip it when isReference(sentence, authors) is true,
	  * locate genes with geneFinder.findGenes() and turn them into ids/names
	    with extractGenes(),
	  * collect relations with findRelations().

	Only sentences with at least two gene names and at least one relation are
	yielded, as the tuple
	(pmid, decodedSentence, geneIds, geneNames, rawNames, relations).
	"""
	for pmid, sentence in parseSentences(input):
		try:
			asciiSentence = unidecode(sentence.decode('utf-8'))
		except UnicodeDecodeError:
			logging.warning("Can't process Unicode for {}: {}".format(pmid, sentence))
			# Best effort: keep going with the undecoded text.
			asciiSentence = sentence

		if not isReference(sentence, authors):
			geneHits, _ = geneFinder.findGenes(asciiSentence)
			geneIds, geneNames, rawNames = extractGenes(geneHits, entrez, asciiSentence)
			# NOTE(review): relations (like the reference check above) are computed
			# on the raw sentence, not the transliterated one - confirm intended.
			relations = findRelations(sentence, relex)
			# An interaction needs two or more genes plus at least one relation.
			if len(geneNames) >= 2 and len(relations) > 0:
				yield pmid, asciiSentence, geneIds, geneNames, rawNames, relations
Exemple #2
0
def parseLines(input, entrez, relex, authors):
	"""
	Yield parse candidates from the sentences returned by parseSentences(input).

	Each sentence is UTF-8 decoded and passed through unidecode(); when the
	decode raises UnicodeDecodeError a warning is logged and the raw sentence
	is used instead. Sentences flagged by isReference(sentence, authors) are
	dropped. Genes are found with geneFinder.findGenes() and resolved with
	extractGenes(); relations come from findRelations().

	A result is yielded only when there are two or more gene names and one or
	more relations. Each result is the tuple
	(pmid, decodedSentence, geneIds, geneNames, rawNames, relations).
	"""
	for pmid, sentence in parseSentences(input):
		try:
			cleaned = unidecode(sentence.decode('utf-8'))
		except UnicodeDecodeError:
			logging.warning("Can't process Unicode for {}: {}".format(pmid, sentence))
			cleaned = sentence  # fall back to the text as received

		# The reference filter looks at the sentence as received, not `cleaned`.
		if isReference(sentence, authors):
			continue

		support, _ = geneFinder.findGenes(cleaned)
		geneIds, geneNames, rawNames = extractGenes(support, entrez, cleaned)
		# NOTE(review): findRelations also gets the raw sentence - confirm intended.
		relations = findRelations(sentence, relex)

		# No interaction is possible without >= 2 genes and >= 1 relation.
		enoughEvidence = len(geneNames) >= 2 and len(relations) > 0
		if not enoughEvidence:
			continue

		yield (pmid, cleaned, geneIds, geneNames, rawNames, relations)
Exemple #3
0
def findGenes(pmid, text):
    """Run geneFinder.findGenes(text, pmid) and return its two results.

    Returns a (genes, genePosSet) tuple. Per the original author's note,
    genes is a dict of entrezGene id -> mType -> (markerId, list of start, end).
    Note the argument order is swapped relative to geneFinder.findGenes.
    """
    # Unpack into exactly two values so the result is always a 2-tuple.
    foundGenes, positions = geneFinder.findGenes(text, pmid)
    return foundGenes, positions
Exemple #4
0
def findGenes(pmid, text):
    """Thin wrapper that delegates to geneFinder.findGenes(text, pmid).

    The two values it produces are returned as a (genes, genePosSet) tuple.
    According to the original note, genes maps
    entrezGene id -> mType -> (markerId, list of start, end).
    """
    # geneFinder takes the text first and the pmid second.
    geneMap, posSet = geneFinder.findGenes(text, pmid)
    return (geneMap, posSet)