Example #1
import logging

def annotateFile(article, file):
    " build one summary row per gene found in the text, ranked by geneFinder score "
    # relies on a module-level geneFinder exposing rankGenes(), entrezToSym and entrezToUp
    text = file.content
    geneScores, geneSupport = geneFinder.rankGenes(text, pmid=article.pmid)
    rank = 1
    rows = []
    for geneId, geneScore in geneScores:
        supportStrs = []
        firstStart, firstEnd = None, None
        snippets = []
        for markerType, recogId, startEndList in geneSupport[geneId]:
            supportStrs.append(markerType + "/" + recogId)
            start, end = startEndList[0]
            if firstStart is None:
                firstStart, firstEnd = start, end
            snippets.append(text[start:end])
        supportStr = "|".join(supportStrs)
        sym = geneFinder.entrezToSym.get(geneId, "<NOSYM?>")
        uniprotIds = ",".join(geneFinder.entrezToUp.get(geneId, ["<NOUNIPROT>"]))
        pmid = article.pmid
        row = [firstStart, firstEnd, pmid, geneScore, rank, geneId, sym, uniprotIds,
               supportStr, "|".join(snippets)]
        rows.append(row)
        rank += 1
    if len(rows) > 200:
        logging.warning("too many genes found in document, skipping all")
        return None
    return rows
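For reference, here is one way annotateFile could be exercised outside its pipeline. It is a minimal sketch assuming the interface the code implies: rankGenes() returns a list of (geneId, score) pairs plus a dict mapping each geneId to (markerType, recogId, [(start, end), ...]) support tuples. FakeGeneFinder, Record and all sample values are illustrative stand-ins, not part of the original module.

class FakeGeneFinder:
    # hypothetical stub; the real geneFinder module is not shown here
    entrezToSym = {207: "AKT1"}
    entrezToUp = {207: ["P31749"]}

    def rankGenes(self, text, pmid=None):
        # pretend one gene was found, supported by a symbol match at text[0:4]
        return [(207, 12.5)], {207: [("symbol", "AKT1", [(0, 4)])]}

class Record(object):
    def __init__(self, **kw):
        self.__dict__.update(kw)

geneFinder = FakeGeneFinder()
print(annotateFile(Record(pmid="12345"),
                   Record(content="AKT1 is phosphorylated at Ser-473.")))
# -> [[0, 4, '12345', 12.5, 1, 207, 'AKT1', 'P31749', 'symbol/AKT1', 'AKT1']]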
Example #2
import logging

def annotateFile(article, file):
    " build one row per gene match position in the text "
    # relies on the same module-level geneFinder as Example #1
    text = file.content
    geneScores, geneSupport = geneFinder.rankGenes(text, pmid=article.pmid)
    rank = 1
    rows = []
    for geneId, geneScore in geneScores:
        sym = geneFinder.entrezToSym.get(geneId, "<NOSYM?>")
        uniprotIds = ",".join(
            geneFinder.entrezToUp.get(geneId, ["<NOUNIPROT>"]))

        for markerType, recogId, startEndList in geneSupport[geneId]:
            supportStr = markerType + "/" + recogId
            for start, end in startEndList:
                row = [start, end, geneId, sym, geneScore, rank, uniprotIds,
                       supportStr]
                rows.append(row)
        rank += 1
    if len(rows) > 200:
        logging.warn("too many genes found in document, skipping all")
        return None
    return rows
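With the same FakeGeneFinder and Record stubs sketched after Example #1 (both hypothetical), this variant would emit one row per match position rather than one summary row per gene:

geneFinder = FakeGeneFinder()   # hypothetical stub from the sketch above
print(annotateFile(Record(pmid="12345"),
                   Record(content="AKT1 is phosphorylated at Ser-473.")))
# -> [[0, 4, 207, 'AKT1', 12.5, 1, 'P31749', 'symbol/AKT1']]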
Example #3
import logging
import nltk

def annotateFile(article, file):
    " find genes in the text, map the top gene to UniProt and report phospho-sites per sentence "
    resultRows = []
    text = file.content

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[.:?!] ', gaps=True)
    #tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # search for genes in text
    genes, geneSupp = geneFinder.rankGenes(text, pmid=article.pmid)
    if len(genes) == 0:
        logging.warn("No gene found")
        return

    # resolve to uniprot IDs
    uniprotIds = []
    topGenes = [genes[0][0]]
    for entrezId in topGenes:
        upIds = geneFinder.entrezToUp.get(entrezId, None)
        if upIds is None:
            logging.warning("cannot map %s to uniprot" % str(entrezId))
            continue
        uniprotIds.extend(upIds)

    if len(uniprotIds) == 0:
        uniprotIds = ["UNKNOWN"]
    #if len(uniprotIds)!=1:
    #    logging.warning("more than one uniprot ID found, skipping text")
    #    return

    # now find sites in text
    for sentStart, sentEnd in tokenizer.span_tokenize(text):
        phrase = text[sentStart:sentEnd]
        phosWords = []
        for match in phosphoWordRe.finditer(phrase):
            phosWords.append(match.group(1))

        for siteRe in siteRes:
            for match in siteRe.finditer(phrase):
                aa = match.group("aa")
                sitePos = match.group("pos")
                row = [
                    sentStart, sentEnd, article.pmid, ",".join(uniprotIds), aa,
                    sitePos, ",".join(phosWords)
                ]
                resultRows.append(row)

    #if len(resultRows)<200:
    return resultRows
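phosphoWordRe and siteRes are module-level patterns that this snippet does not define. Below is a minimal sketch of what they might look like, matching only the group names the code relies on ("aa" and "pos"); the exact patterns are assumptions, not the originals. Note that with gaps=True the RegexpTokenizer treats r'[.:?!] ' as the separator, so span_tokenize() yields the (start, end) offsets of the stretches of text between sentence-ending punctuation.

import re

# illustrative guesses only; the real patterns are not part of this snippet
phosphoWordRe = re.compile(r"(phosphorylat\w+)", re.IGNORECASE)
siteRes = [re.compile(r"(?P<aa>Ser|Thr|Tyr)-?(?P<pos>[0-9]+)")]

phrase = "AKT1 is phosphorylated at Ser-473 by mTORC2."
print([m.group(1) for m in phosphoWordRe.finditer(phrase)])  # ['phosphorylated']
for siteRe in siteRes:
    for m in siteRe.finditer(phrase):
        print(m.group("aa"), m.group("pos"))                 # Ser 473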