Example 1
def findVarDisGeneDrug(pmid, text):
    """
    >>> startup({})
    >>> list(findVarDisGeneDrug(0, "The R71G BRCA1 is a breast cancer founder mutation not treatable with Herceptin"))
    """
    textLow = text.lower()
    # very basic filter, remove documents without some basic keywords
    if " variant " not in textLow and " mutation" not in textLow and " substitution" not in textLow and \
            " mutant " not in textLow:
        return

    for section, sentStart, sentEnd, sentText in pubNlp.sectionSentences(text):
        genes = list(geneFinder.findGeneNames(sentText))
        if len(genes) == 0:
            continue
        #print "genes", genes, sentText

        conds = list(pubNlp.findDiseases(sentText))
        drugs = list(pubNlp.findDrugs(sentText))
        # remove diseases and drugs that are also genes
        drugs = pubNlp.rangeRemoveOverlaps(drugs, genes)
        conds = pubNlp.rangeRemoveOverlaps(conds, genes)
        # check if we still have a disease and drug left
        if len(conds) == 0 or len(drugs) == 0:
            continue
        print "drugs", drugs
        print "diseases", conds

        geneSnips = pubNlp.rangeTexts(sentText, genes)
        condSnips = pubNlp.rangeTexts(sentText, conds)
        drugSnips = pubNlp.rangeTexts(sentText, drugs)

        genePosSet = pubNlp.rangeToPosSet(genes)
        variants = varFinder.findVariantDescriptions(sentText,
                                                     exclPos=genePosSet)

        # the last field of the genes rows is the entrez ID
        entrezIds = [r[-1] for r in genes]

        # we need a protein variant, not DNA
        if "prot" not in variants:
            continue

        for variant, mentions in variants["prot"]:
            print "grounding variant", variant, mentions
            groundedMuts, ungroundVar, beds = \
                varFinder.groundVariant(pmid, sentText, variant, mentions, [], entrezIds)

            for mutInfo in groundedMuts:
                coords = [(m.start, m.end) for m in mentions]
                varSnips = pubNlp.rangeTexts(sentText, coords)
                row = [
                    section, "|".join(geneSnips), "|".join(condSnips),
                    "|".join(drugSnips), "|".join(varSnips), sentText
                ]
                yield row
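
The doctest above already shows the calling convention: a one-time, module-level startup() call, then one generator call per document. A minimal driver sketch along those lines (the tab-separated output format is an assumption, not part of the plugin interface) could look like this:

if __name__ == "__main__":
    startup({})
    sample = ("The R71G BRCA1 is a breast cancer founder mutation "
              "not treatable with Herceptin")
    for row in findVarDisGeneDrug(0, sample):
        # row = [section, geneSnips, condSnips, drugSnips, varSnips, sentText]
        print "\t".join(row)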
Example 2
def findVarDisGeneDrug(pmid, text):
    """
    >>> startup({})
    >>> list(findVarDisGeneDrug(0, "The R71G BRCA1 is a breast cancer founder mutation not treatable with Herceptin"))
    """
    textLow = text.lower()
    # very basic filter, remove documents without some basic keywords
    if " variant " not in textLow and " mutation" not in textLow and " substitution" not in textLow and \
            " mutant " not in textLow:
        return

    for section, sentStart, sentEnd, sentText in pubNlp.sectionSentences(text):
        genes = list(geneFinder.findGeneNames(sentText))
        if len(genes)==0:
            continue
        #print "genes", genes, sentText

        conds = list(pubNlp.findDiseases(sentText))
        drugs = list(pubNlp.findDrugs(sentText))
        # remove diseases and drugs that are also genes
        drugs = pubNlp.rangeRemoveOverlaps(drugs, genes)
        conds = pubNlp.rangeRemoveOverlaps(conds, genes)
        # check if we still have a disease and drug left
        if len(conds)==0 or len(drugs)==0:
            continue
        print "drugs", drugs
        print "diseases", conds

        geneSnips = pubNlp.rangeTexts(sentText, genes)
        condSnips = pubNlp.rangeTexts(sentText, conds)
        drugSnips = pubNlp.rangeTexts(sentText, drugs)

        genePosSet = pubNlp.rangeToPosSet(genes)
        variants  = varFinder.findVariantDescriptions(sentText, exclPos=genePosSet)

        # the last field of the genes rows is the entrez ID
        entrezIds = [r[-1] for r in genes]

        # we need a protein variant, not DNA
        if "prot" not in variants:
            continue

        for variant, mentions in variants["prot"]:
            print "grounding variant", variant, mentions
            groundedMuts, ungroundVar, beds = \
                varFinder.groundVariant(pmid, sentText, variant, mentions, [], entrezIds)

            for mutInfo in groundedMuts:
                coords = [(m.start, m.end) for m in mentions]
                varSnips = pubNlp.rangeTexts(sentText, coords)
                row = [section, "|".join(geneSnips), "|".join(condSnips), "|".join(drugSnips), "|".join(varSnips), sentText]
                yield row
Example 3
def annotateFile(article, file):
    " go over words of text and check if they are in dict "
    text = file.content
    if len(text)>100000:
        return

    if reqStrings!=None:
        found = False
        #sentLower = sent.lower()
        textLower = text.lower()
        for rs in reqStrings:
            if text.find(rs)!=-1:
            #if sentLower.find(rs)!=-1:
                found = True
                break
        if not found:
            return

    for section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        #if len(sent)<20:
            #logging.debug("Sentence too short: %d characters" % len(text))
            #continue
        #if len(sent)>2000:
            #logging.debug("Sentence too long: %d characters" % len(text))
            #continue

        found = True
        posList = []
        allMatches = []
        for lexName, lex in lexes.iteritems():
            matches = []
            lexMatches = fastFind(sent, lex, toLower=toLower)
            if len(lexMatches)==0 or len(lexMatches)>10:
                found = False
                break
            for start, end, word in lexMatches:
                matches.append(word.replace("="," ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        if found:
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent
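
annotateFile only depends on two things from its environment: a global lexes dict mapping a lexicon name to a term dictionary, and fastFind(sent, lex, toLower=...) returning (start, end, word) tuples. The stand-in below is only an illustration of that contract under those assumptions; it is a naive substring scan, not the optimized matcher used by the codebase.

def naiveFind(sent, lex, toLower=False):
    " hypothetical stand-in for fastFind: return (start, end, term) per hit "
    haystack = sent.lower() if toLower else sent
    hits = []
    for term in lex:
        needle = term.lower() if toLower else term
        pos = haystack.find(needle)
        while pos != -1:
            hits.append((pos, pos + len(needle), term))
            pos = haystack.find(needle, pos + 1)
    return hits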
Example 4
def annotateFile(article, file):
    " go over words of text and check if they are in dict "
    text = file.content
    if len(text) > 100000:
        return

    if reqStrings != None:
        found = False
        # sentLower = sent.lower()
        textLower = text.lower()
        for rs in reqStrings:
            if text.find(rs) != -1:
                # if sentLower.find(rs)!=-1:
                found = True
                break
        if not found:
            return

    for section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        # if len(sent)<20:
        # logging.debug("Sentence too short: %d characters" % len(text))
        # continue
        # if len(sent)>2000:
        # logging.debug("Sentence too long: %d characters" % len(text))
        # continue

        found = True
        posList = []
        allMatches = []
        for lexName, lex in lexes.iteritems():
            matches = []
            lexMatches = fastFind(sent, lex, toLower=toLower)
            if len(lexMatches) == 0 or len(lexMatches) > 10:
                found = False
                break
            for start, end, word in lexMatches:
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        if found:
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent
Example 5
    def annotateFile(self, article, file):
        text = file.content
        pmid = article.pmid
        if file.fileType == "supp":
            return
        for row in pubNlp.sectionSentences(text, file.fileType):
            section, sentStart, sentEnd, text = row
            tokens = text.split()
            if len(tokens) < 6:
                logging.debug("Sentence too short: %d tokens" % len(tokens))
                continue
            if len(tokens) > 40:
                logging.debug("Sentence too long: %d tokens" % len(tokens))
                continue
            if len(text) < 20:
                logging.debug("Sentence too short: %d characters" % len(text))
                continue
            if len(text) > 1000:
                logging.debug("Sentence too long: %d characters" % len(text))
                continue
            if text.count('"') > 20 or text.count(",") > 20:
                logging.debug("Too many strange characters")
                continue

            genes = list(geneFinder.findGeneNames(text))
            if len(genes) < 2:
                continue
            if len(genes) > 20:
                logging.debug("Too many genes, %d" % len(genes))
                continue
            geneDescs = ["%d-%d/%s/%s/%s" % (start,end,text[start:end],name,gid) \
                for start,end,name,gid in genes]
            geneDesc = "|".join(geneDescs)

            # sentStart/sentEnd are the sentence coordinates; the bare start/end
            # names above would only leak out of the list comprehension in Python 2
            row = [pmid, section, sentStart, sentEnd, text, geneDesc]
            yield row
            self.rowCount += 1
            if self.rowCount % 200 == 0:
                yield []  # tell caller to start a new output file
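
The empty list yielded every 200 rows is only a signal to the caller. A minimal consumer sketch that rotates output files on that signal (the file naming scheme and the runAnnotator wrapper are illustrative assumptions, not the real harness) could be:

def runAnnotator(annot, article, fileObj, outPrefix):
    " write tab-separated rows, starting a new file whenever the plugin asks "
    fileIdx = 0
    outFh = open("%s.%03d.tab" % (outPrefix, fileIdx), "w")
    for row in annot.annotateFile(article, fileObj):
        if len(row) == 0:
            # plugin requested a new output file
            outFh.close()
            fileIdx += 1
            outFh = open("%s.%03d.tab" % (outPrefix, fileIdx), "w")
            continue
        outFh.write("\t".join([str(f) for f in row]) + "\n")
    outFh.close()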
Example 6
    def annotateFile(self, article, file):
        text = file.content
        pmid = article.pmid
        if file.fileType=="supp":
            return
        for row in pubNlp.sectionSentences(text, file.fileType):
            section, sentStart, sentEnd, text = row
            tokens = text.split()
            if len(tokens)<6:
                logging.debug("Sentence too short: %d tokens" % len(tokens))
                continue
            if len(tokens)>40:
                logging.debug("Sentence too long: %d tokens" % len(tokens))
                continue
            if len(text)<20:
                logging.debug("Sentence too short: %d characters" % len(text))
                continue
            if len(text)>1000:
                logging.debug("Sentence too long: %d characters" % len(text))
                continue
            if text.count('"') > 20 or text.count(",")>20:
                logging.debug("Too many strange characters")
                continue

            genes = list(geneFinder.findGeneNames(text))
            if len(genes) < 2:
                continue
            if len(genes) > 20:
                logging.debug("Too many genes, %d" % len(genes))
                continue
            geneDescs = ["%d-%d/%s/%s/%s" % (start,end,text[start:end],name,gid) \
                for start,end,name,gid in genes]
            geneDesc = "|".join(geneDescs)

            # sentStart/sentEnd are the sentence coordinates; the bare start/end
            # names above would only leak out of the list comprehension in Python 2
            row = [pmid, section, sentStart, sentEnd, text, geneDesc]
            yield row
            self.rowCount += 1
            if self.rowCount % 200 == 0:
                yield [] # tell caller to start a new output file
Example 7
def findDisGeneVariant(text):
    """
    >>> geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    >>> varFinder.loadDb(loadSequences=False)
    >>> list(findDisGeneVariant("Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."))
    [(0, 74, 'probablyAbstract', '64-73:Herceptin=Trastuzumab', '0-8:Diabetes=Diabetes Mellitus', '24-29:PITX2=symbol', 'V233T', 'Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin.')]
    >>> #list(findDisGeneVariant("We undertook a quantitative review of the literature to estimate the effectiveness of desferrioxamine and deferiprone in decreasing hepatic iron concentrations (HIC) in thalassemia major."))
    >>> list(findDisGeneVariant("his mutation, we cotransfected C3H10T cells with expression vectors encoding SMO-WT or SMO-D473H "))
    """
    docGenes = list(geneFinder.findGeneNames(text))
    docEntrezIds = set([r[-1] for r in docGenes])

    for section, start, end, sentence in pubNlp.sectionSentences(text):
        conds = list(pubNlp.findDiseases(sentence))
        drugs = list(pubNlp.findDrugs(sentence))
        genes = list(geneFinder.findGeneNames(sentence))
        #print conds, drugs, genes, section, sentence
        # remove drugs and conds that are also genes
        drugs = rangeRemoveOverlaps(drugs, genes)
        conds = rangeRemoveOverlaps(conds, genes)

        #geneSnips = rangeTexts(sentence, genes, useSym=True)
        #condSnips = rangeTexts(sentence, conds)
        #drugSnips = rangeTexts(sentence, drugs)
#
        mutDescs = []
        mutDict = varFinder.findVariantDescriptions(sentence)
        if "prot" in mutDict:
            for varDesc, mentions in mutDict["prot"]:
                if varDesc.mutType!="sub":
                    continue
                logging.debug("grounding variant: %s %s"% (varDesc, mentions))
                groundedMuts, ungroundVar, beds = \
                    varFinder.groundVariant(None, sentence, varDesc, mentions, [], docEntrezIds)

                for mutInfo in groundedMuts:
                    shortDesc = varDesc.origSeq+str(varDesc.start+1)+varDesc.mutSeq # 0-based!!
                    mutDescs.append(shortDesc+"=%s:%s"%(mutInfo.geneSymbol,mutInfo.hgvsProt))
            
        #mutMatches =  list(mutRe.finditer(sentence))
        #mutDescs = [(m.group(1),m.group(2), m.group(3)) for m in mutMatches]
        #mutDescSet = set(mutDescs)
        #blackListMuts = mutDescSet.intersection(blackListStr)
        #if len(mutMatches)==0:
            #logging.debug("No mutation found, skipping")
            #continue
        #if len(blackListMuts)!=0:
            #logging.debug("At least one blacklisted mutation found, skipping")
            #continue
        #if len(drugs)==0:
            #logging.debug("No drugs found, skipping")
            #continue
        #if len(genes)==0:
            #logging.debug("No genes found, skipping")
            #continue
    
        mutDesc = "|".join(mutDescs)
        drugDesc = rangeDescs(sentence, drugs)
        condDesc = rangeDescs(sentence, conds)
        geneDesc = rangeDescs(sentence, genes, useSym=True)

        ret = (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)
        yield ret
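
As in the doctest, the only setup needed before calling findDisGeneVariant is loading the gene and variant data. A minimal driver sketch following the doctest; the printed format here is arbitrary, not part of the function's contract:

if __name__ == "__main__":
    geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    varFinder.loadDb(loadSequences=False)
    sample = "Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."
    for start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sent in \
            findDisGeneVariant(sample):
        print "%d-%d %s drugs=%s conds=%s genes=%s muts=%s" % \
            (start, end, section, drugDesc, condDesc, geneDesc, mutDesc)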
Example 8
def findDisGeneVariant(text):
    """
    >>> geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    >>> varFinder.loadDb(loadSequences=False)
    >>> list(findDisGeneVariant("Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."))
    [(0, 74, 'probablyAbstract', '64-73:Herceptin=Trastuzumab', '0-8:Diabetes=Diabetes Mellitus', '24-29:PITX2=symbol', 'V233T', 'Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin.')]
    >>> #list(findDisGeneVariant("We undertook a quantitative review of the literature to estimate the effectiveness of desferrioxamine and deferiprone in decreasing hepatic iron concentrations (HIC) in thalassemia major."))
    >>> list(findDisGeneVariant("his mutation, we cotransfected C3H10T cells with expression vectors encoding SMO-WT or SMO-D473H "))
    """
    docGenes = list(geneFinder.findGeneNames(text))
    docEntrezIds = set([r[-1] for r in docGenes])

    for section, start, end, sentence in pubNlp.sectionSentences(text):
        conds = list(pubNlp.findDiseases(sentence))
        drugs = list(pubNlp.findDrugs(sentence))
        genes = list(geneFinder.findGeneNames(sentence))
        #print conds, drugs, genes, section, sentence
        # remove drugs and conds that are also genes
        drugs = rangeRemoveOverlaps(drugs, genes)
        conds = rangeRemoveOverlaps(conds, genes)

        #geneSnips = rangeTexts(sentence, genes, useSym=True)
        #condSnips = rangeTexts(sentence, conds)
        #drugSnips = rangeTexts(sentence, drugs)
        #
        mutDescs = []
        mutDict = varFinder.findVariantDescriptions(sentence)
        if "prot" in mutDict:
            for varDesc, mentions in mutDict["prot"]:
                if varDesc.mutType != "sub":
                    continue
                logging.debug("grounding variant: %s %s" % (varDesc, mentions))
                groundedMuts, ungroundVar, beds = \
                    varFinder.groundVariant(None, sentence, varDesc, mentions, [], docEntrezIds)

                for mutInfo in groundedMuts:
                    shortDesc = varDesc.origSeq + str(
                        varDesc.start + 1) + varDesc.mutSeq  # 0-based!!
                    mutDescs.append(shortDesc + "=%s:%s" %
                                    (mutInfo.geneSymbol, mutInfo.hgvsProt))

        #mutMatches =  list(mutRe.finditer(sentence))
        #mutDescs = [(m.group(1),m.group(2), m.group(3)) for m in mutMatches]
        #mutDescSet = set(mutDescs)
        #blackListMuts = mutDescSet.intersection(blackListStr)
        #if len(mutMatches)==0:
        #logging.debug("No mutation found, skipping")
        #continue
        #if len(blackListMuts)!=0:
        #logging.debug("At least one blacklisted mutation found, skipping")
        #continue
        #if len(drugs)==0:
        #logging.debug("No drugs found, skipping")
        #continue
        #if len(genes)==0:
        #logging.debug("No genes found, skipping")
        #continue

        mutDesc = "|".join(mutDescs)
        drugDesc = rangeDescs(sentence, drugs)
        condDesc = rangeDescs(sentence, conds)
        geneDesc = rangeDescs(sentence, genes, useSym=True)

        ret = (start, end, section, drugDesc, condDesc, geneDesc, mutDesc,
               sentence)
        yield ret