Ejemplo n.º 1
0
def annotateFile(article, file):
    """Annotate lexicon matches found in the text of one file.

    Runs ``fastFind`` over ``file.content`` with the module-level ``lex`` and
    drops every match whose text is a key of the author-acronym mapping
    (built from ``article.authors``) or of the acronym mapping extracted from
    the text itself.

    Returns a list of ``(start, end, id, type, word)`` tuples. ``type`` is
    ``"acc"`` when the lexicon id starts with ``*`` (the leading asterisks are
    removed from the id) and ``"name"`` otherwise. ``word`` is
    ``text[start:end]``.

    Returns an empty list when the file has more than 200 raw matches.
    """
    text = file.content
    rows = fastFind(text, lex, wordRegex=r"[\w'_-]+")
    autAcros = getAuthorAcronyms(article.authors)
    acros = getAcronyms(text)

    # too many matches: the whole file is skipped
    if len(rows) > 200:
        # arguments are handed to logging so the message is only built
        # when the record is actually emitted
        logging.info(
            "%d gene matches in file (>200), skipping article %s/file %s, fileType %s",
            len(rows), article.externalId, file.fileId, file.fileType)
        return []

    newRows = []
    for start, end, matchId in rows:
        word = text[start:end]
        if word in autAcros:
            logging.debug("Ignoring %s: looks like author acronym for author %s",
                          word, autAcros[word])
            continue

        if word in acros:
            logging.debug("Ignoring %s: looks like an acronym defined in the text for %s",
                          word, acros[word])
            continue

        # ids prefixed with "*" are reported with type "acc", all others as "name"
        matchType = "name"
        if matchId.startswith("*"):
            matchId = matchId.lstrip("*")
            matchType = "acc"
        newRows.append((start, end, matchId, matchType, word))
    return newRows
Ejemplo n.º 2
0
def annotateFile(article, file):
    """Annotate lexicon matches found in the text of one file.

    Runs ``fastFind`` (called with ``toLower=True``) over ``file.content``
    with the module-level ``lex`` and drops every match whose text is a key of
    the author-acronym mapping (built from ``article.authors``) or of the
    acronym mapping extracted from the text itself.

    Returns a list of ``(start, end, id, type, word)`` tuples. ``type`` is
    ``"acc"`` when the lexicon id starts with ``*`` (the leading asterisks are
    removed from the id) and ``"name"`` otherwise. ``word`` is
    ``text[start:end]``, i.e. taken from the original, non-lowercased text.

    Returns an empty list when the file has more than 200 raw matches.
    """
    text = file.content
    rows = fastFind(text, lex, wordRegex=r"[\w'_-]+", toLower=True)
    autAcros = getAuthorAcronyms(article.authors)
    acros = getAcronyms(text)

    # too many matches: the whole file is skipped
    if len(rows) > 200:
        # arguments are handed to logging so the message is only built
        # when the record is actually emitted
        logging.info(
            "%d gene matches in file (>200), skipping article %s/file %s, fileType %s",
            len(rows), article.externalId, file.fileId, file.fileType)
        return []

    newRows = []
    for start, end, matchId in rows:
        word = text[start:end]
        if word in autAcros:
            logging.debug(
                "Ignoring %s: looks like author acronym for author %s",
                word, autAcros[word])
            continue

        if word in acros:
            logging.debug(
                "Ignoring %s: looks like an acronym defined in the text for %s",
                word, acros[word])
            continue

        # ids prefixed with "*" are reported with type "acc", all others as "name"
        matchType = "name"
        if matchId.startswith("*"):
            matchId = matchId.lstrip("*")
            matchType = "acc"
        newRows.append((start, end, matchId, matchType, word))
    return newRows
Ejemplo n.º 3
0
def findGeneNames(text):
    """
    Look for gene names and symbols. Some symbols need flanking trigger words. If these
    are not present, they are returned as "symbolMaybe".

    Yields (start, end, matchType, geneId) tuples, where matchType is one of
    'geneName', 'symbol' or 'symbolMaybe'. Gene names are searched in the
    lowercased text, symbols in the original text.

    Will always return the gene name matches before the symbol matches.

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower,
                                                geneNameLex,
                                                toLower=False):
        yield (start, end, 'geneName', geneId)

    flankFindIter = fastFind.fastFindFlankWords(text,
                                                geneSymLex,
                                                wordDist=2,
                                                wordRe=fastFind.SYMRE,
                                                toLower=False)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous, check the flanking words
        if geneId.startswith("?"):
            leftWords = [w.lower() for w in leftWords]
            rightWords = [w.lower() for w in rightWords]
            geneId = geneId.strip("?")
            # a single required word on either side confirms the symbol
            hasTrigger = (not symLeftReqWords.isdisjoint(leftWords)
                          or not symRightReqWords.isdisjoint(rightWords))
            if hasTrigger:
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
        # otherwise pass them through
        else:
            # ignore genes that are immediately flanked by "pathway"
            flankWords = getFlankWords(start, end, textLower)
            if "pathway" in flankWords:
                logging.debug("ignored %s, flank words are %s",
                              text[start:end], flankWords)
                continue
            yield (start, end, 'symbol', geneId)
Ejemplo n.º 4
0
def searchDiseases(text):
    """Find lexicon matches in the lowercased text.

    Returns a list of (start, end, matchedText, snippet) tuples, one per
    match reported by fastFind for the module-level ``lex``. matchedText is
    the matched slice of the lowercased text and snippet is whatever
    pubAlg.getSnippet returns for that span. The lexicon id of the match is
    not part of the result.
    """
    lowered = text.lower()

    def describe(begin, stop):
        # snippet first, then the matched slice
        context = pubAlg.getSnippet(lowered, begin, stop)
        return (begin, stop, lowered[begin:stop], context)

    hits = list(fastFind.fastFind(lowered, lex))
    return [describe(begin, stop) for begin, stop, _lexId in hits]
Ejemplo n.º 5
0
 def annotateFile(self, article, file):
     """Return all matches of self.lex in the content of the file.

     The matches are whatever fastFind.fastFind yields for file.content
     (called with toLower=True), collected into a list. If there are more
     than self.MAXCOUNT of them, an info message naming
     article.externalId is logged and None is returned instead.
     """
     hits = list(fastFind.fastFind(file.content, self.lex, toLower=True))
     if len(hits) <= self.MAXCOUNT:
         return hits

     logging.info("more than %d annotations, skipping %s" % (self.MAXCOUNT, article.externalId))
     return None
Ejemplo n.º 6
0
def findDiseases(text):
    """ find diseases in string and yield them as (start, end, diseaseName)

    The disease lexicon is loaded from the static data directory the first
    time it is needed and kept in the module-level disLex afterwards. As this
    is a generator, the loading happens when the result is first iterated.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
Ejemplo n.º 7
0
 def annotateFile(self, article, file):
     """Return all matches of self.lex in the content of the file.

     The matches are whatever fastFind.fastFind yields for file.content
     (called with toLower=True), collected into a list. If there are more
     than self.MAXCOUNT of them, an info message naming
     article.externalId is logged and None is returned instead.
     """
     hits = list(fastFind.fastFind(file.content, self.lex, toLower=True))
     if len(hits) <= self.MAXCOUNT:
         return hits

     logging.info("more than %d annotations, skipping %s" % (self.MAXCOUNT, article.externalId))
     return None
         
Ejemplo n.º 8
0
def findGeneNames(text):
    """
    Look for gene names and symbols. Some symbols need flanking trigger words. If these
    are not present, they are returned as "symbolMaybe".

    Yields (start, end, matchType, geneId) tuples, where matchType is one of
    'geneName', 'symbol' or 'symbolMaybe'. Gene names are searched in the
    lowercased text, symbols in the original text.

    Will always return the gene name matches before the symbol matches.

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex):
        yield (start, end, 'geneName', geneId)

    flankFindIter = fastFind.fastFindFlankWords(text, geneSymLex, wordDist=2, wordRe=fastFind.SYMRE)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous, check the flanking words
        if geneId.startswith("?"):
            leftWords = [w.lower() for w in leftWords]
            rightWords = [w.lower() for w in rightWords]
            geneId = geneId.strip("?")
            # a single required word on either side confirms the symbol
            hasTrigger = (not symLeftReqWords.isdisjoint(leftWords)
                          or not symRightReqWords.isdisjoint(rightWords))
            if hasTrigger:
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
        # otherwise pass them through
        else:
            # ignore genes that are immediately flanked by "pathway"
            flankWords = getFlankWords(start, end, textLower)
            if "pathway" in flankWords:
                logging.debug("ignored %s, flank words are %s", text[start:end], flankWords)
                continue
            yield (start, end, 'symbol', geneId)
Ejemplo n.º 9
0
def findDiseases(text):
    """ find diseases in string and yield them as (start, end, diseaseName)

    The disease lexicon is loaded from the static data directory the first
    time it is needed and kept in the module-level disLex afterwards. As this
    is a generator, the loading happens when the result is first iterated.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases",
                       "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
Ejemplo n.º 10
0
def findDrugs(text):
    """ find drugs in string and yield them as (start, end, drugbankName)

    The drug lexicon is loaded from the static data directory the first time
    it is needed and kept in the module-level drugLex afterwards. As this is
    a generator, the loading happens when the result is first iterated.
    Matches whose lowercased name is in drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    if drugLex is None:
        drugPath = join(pubConf.staticDataDir, "drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() not in drugBlacklist:
            yield start, end, name
Ejemplo n.º 11
0
def findCells(text):
    """ yield cell types found in the lowercased text as (start, end, name)

    The lexicon is read once from the "cellTypes" static file and then kept
    in the module-level cellLex. Matches whose lowercased name is in
    cellBlackList are left out.

    >>> list(findCells("Oligodendrocytes and neural progenitors."))
    [(0, 16, 'oligodendrocyte')]
    """
    global cellLex
    lexPath = pubConf.getStaticFile("cellTypes", "cellTypes.marshal")
    if cellLex is None:
        cellLex = fastFind.loadLex(lexPath)

    loweredText = text.lower()
    for hit in fastFind.fastFind(loweredText, cellLex):
        (start, end, name) = hit
        if name.lower() not in cellBlackList:
            yield start, end, name
Ejemplo n.º 12
0
def findDrugs(text):
    """ find drugs in string and yield them as (start, end, drugbankName)

    The drug lexicon is loaded from the "drugs" static file the first time it
    is needed and kept in the module-level drugLex afterwards. As this is a
    generator, the loading happens when the result is first iterated.
    Matches whose lowercased name is in drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    if drugLex is None:
        drugPath = pubConf.getStaticFile("drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() not in drugBlacklist:
            yield start, end, name
Ejemplo n.º 13
0
def annotateFile(article, file):
    """Yield the sentences of a file in which every lexicon has matches.

    Nothing is yielded when the text of the file is longer than 100000
    characters, or when the module-level reqStrings is set and none of its
    strings occurs in the text.

    Every sentence returned by pubNlp.sectionSentences is searched with
    fastFind, once per lexicon in the module-level lexes. A sentence is only
    reported if each lexicon has between 1 and 10 matches in it.

    Yields tuples (sentStart, sentEnd, matchDesc, posDesc, sent):
      matchDesc -- one "lexName=value,value" entry per lexicon, joined by "|".
                   The values are the third element of each fastFind match,
                   with "=" and "," replaced by spaces.
      posDesc   -- "start-end" of every match as reported by fastFind for the
                   sentence, joined by "|".

    The article argument is not used.
    """
    text = file.content
    if len(text) > 100000:
        return

    # NOTE(review): the required strings are matched case-sensitively against
    # the original text; confirm whether a lowercase match was intended.
    if reqStrings is not None and not any(rs in text for rs in reqStrings):
        return

    for _section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        posList = []
        allMatches = []
        # items() instead of iteritems() so the loop runs on Python 2 and 3
        for lexName, lex in lexes.items():
            lexMatches = fastFind(sent, lex, toLower=toLower)
            # one lexicon without matches, or with too many, rejects the sentence
            if not 0 < len(lexMatches) <= 10:
                break
            matches = []
            for start, end, word in lexMatches:
                # "=" and "," are the separators of the output format
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        else:
            # only reached when no lexicon rejected the sentence
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent
Ejemplo n.º 14
0
def annotateFile(article, file):
    """Yield the sentences of a file in which every lexicon has matches.

    Nothing is yielded when the text of the file is longer than 100000
    characters, or when the module-level reqStrings is set and none of its
    strings occurs in the text.

    Every sentence returned by pubNlp.sectionSentences is searched with
    fastFind, once per lexicon in the module-level lexes. A sentence is only
    reported if each lexicon has between 1 and 10 matches in it.

    Yields tuples (sentStart, sentEnd, matchDesc, posDesc, sent):
      matchDesc -- one "lexName=value,value" entry per lexicon, joined by "|".
                   The values are the third element of each fastFind match,
                   with "=" and "," replaced by spaces.
      posDesc   -- "start-end" of every match as reported by fastFind for the
                   sentence, joined by "|".

    The article argument is not used.
    """
    text = file.content
    if len(text) > 100000:
        return

    # NOTE(review): the required strings are matched case-sensitively against
    # the original text; confirm whether a lowercase match was intended.
    if reqStrings is not None and not any(rs in text for rs in reqStrings):
        return

    for _section, sentStart, sentEnd, sent in pubNlp.sectionSentences(text, file.fileType, mustHaveVerb=False):
        posList = []
        allMatches = []
        # items() instead of iteritems() so the loop runs on Python 2 and 3
        for lexName, lex in lexes.items():
            lexMatches = fastFind(sent, lex, toLower=toLower)
            # one lexicon without matches, or with too many, rejects the sentence
            if not 0 < len(lexMatches) <= 10:
                break
            matches = []
            for start, end, word in lexMatches:
                # "=" and "," are the separators of the output format
                matches.append(word.replace("=", " ").replace(",", " "))
                posList.append("%d-%d" % (start, end))
            allMatches.append("%s=%s" % (lexName, ",".join(matches)))
        else:
            # only reached when no lexicon rejected the sentence
            yield sentStart, sentEnd, "|".join(allMatches), "|".join(posList), sent