Beispiel #1
0
def startup(paramDict, result):
    """ Load the symbol lexicon and the blacklist into module globals.

    paramDict and result are not read by this function.

    Sets the globals:
      lex       -- return value of fastFind.loadLex(symFname)
      blackList -- set of the first 1000 lines of the file pubConf.bncFname
    """
    global lex, blackList
    lex = fastFind.loadLex(symFname)
    # context manager: close the word list right after reading instead of
    # leaving the handle open until garbage collection
    with open(pubConf.bncFname) as bncFile:
        blackList = set(bncFile.read().splitlines()[:1000])
Beispiel #2
0
def startup(paramDict, result):
    """ Load the lexicon that belongs to paramDict["wordFname"] and the blacklist.

    The lexicon file is expected in the same directory as the word file; its
    name is the word file's name cut at the first dot plus ".marshal.gz".
    result is not read by this function.

    Sets the globals:
      lex       -- return value of fastFind.loadLex for that path
      blackList -- set of the first 10000 lines of the file pubConf.bncFname
    """
    global lex, blackList
    # local import: only join/dirname are known to be imported at module level
    from os.path import basename

    wordFname = paramDict["wordFname"]
    # Cut the extension off the file name only. Splitting the whole path on
    # "." duplicated the directory for relative paths ("d/w.txt" -> "d/d/w")
    # and truncated the path at a dot inside a directory name.
    lexBase = basename(wordFname).split(".")[0]
    lex = fastFind.loadLex(join(dirname(wordFname), lexBase + ".marshal.gz"))

    # context manager: close the word list right after reading
    with open(pubConf.bncFname) as bncFile:
        blackList = set(bncFile.read().splitlines()[:10000])
def startup(paramDict, result):
    """ Load the symbol lexicon and the blacklist into module globals.

    Neither paramDict nor result is read here.

    Globals written:
      lex       -- what fastFind.loadLex returns for symFname
      blackList -- the first 1000 lines of pubConf.bncFname, as a set
    """
    global lex, blackList
    lex = fastFind.loadLex(symFname)
    # "with" guarantees the file is closed as soon as it has been read
    with open(pubConf.bncFname) as wordFile:
        blackList = set(wordFile.read().splitlines()[:1000])
Beispiel #4
0
 def startup(self, paramDict):
     """ Load the keyword dictionary into self.lex.

     The file name is paramDict["dict"] when that key is present, otherwise
     the hard-coded default path below is used.
     """
     defaultFname = "/hive/data/inside/pubs/geneDisease/diseaseDictionary/malacards/dictionary.marshal.gz"
     lexPath = paramDict["dict"] if "dict" in paramDict else defaultFname
     logging.info("Reading %s" % lexPath)
     self.lex = fastFind.loadLex(lexPath)
Beispiel #5
0
def startup(paramDict, result):
    """ Load the lexicon stored next to paramDict["wordFname"] and the blacklist.

    The lexicon path is built from the directory of the word file and the word
    file's name up to its first dot, followed by ".marshal.gz".
    result is not read by this function.

    Globals written:
      lex       -- what fastFind.loadLex returns for that path
      blackList -- the first 10000 lines of pubConf.bncFname, as a set
    """
    global lex, blackList
    # local import: only join/dirname are known to be imported at module level
    from os.path import basename

    wordFname = paramDict["wordFname"]
    # Strip the extension from the bare file name. The previous whole-path
    # split doubled the directory for relative paths and broke on directory
    # names that contain a dot.
    stem = basename(wordFname).split(".")[0]
    lex = fastFind.loadLex(
        join(dirname(wordFname), stem + ".marshal.gz"))

    # "with" closes the word list as soon as it has been read
    with open(pubConf.bncFname) as wordFile:
        blackList = set(wordFile.read().splitlines()[:10000])
Beispiel #6
0
def findDiseases(text):
    """ find diseases in string and yield them as (start, end, diseaseName)

    The lexicon is read from <pubConf.staticDataDir>/diseases/diseases.marshal.gz
    the first time the generator runs while the global disLex is still None;
    afterwards the cached disLex is reused.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    # identity test: None is a singleton, "==" would invoke the lexicon
    # object's own equality logic
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
Beispiel #7
0
def findDiseases(text):
    """ find diseases in string and yield them as (start, end, diseaseName)

    On first use (global disLex is None) the lexicon is loaded from
    <pubConf.staticDataDir>/diseases/diseases.marshal.gz and kept in disLex.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    # compare against the None singleton by identity, not by equality
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases",
                       "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)

    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
Beispiel #8
0
def findDrugs(text):
    """ find drugs in string and yield them as (start, end, drugbankName)

    The lexicon is read from <pubConf.staticDataDir>/drugs/drugbank.marshal.gz
    the first time the generator runs while the global drugLex is still None.
    Matches whose lower-cased name is in drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    # identity test: None is a singleton, "==" would invoke the lexicon
    # object's own equality logic
    if drugLex is None:
        drugPath = join(pubConf.staticDataDir, "drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
Beispiel #9
0
def findDrugs(text):
    """ find drugs in string and yield them as (start, end, drugbankName)

    On first use (global drugLex is None) the lexicon is loaded from the path
    returned by pubConf.getStaticFile("drugs", "drugbank.marshal.gz") and kept
    in drugLex. Matches whose lower-cased name is in drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    # compare against the None singleton by identity, not by equality
    if drugLex is None:
        drugPath = pubConf.getStaticFile("drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)

    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
Beispiel #10
0
def findCells(text):
    """ find cell types in string and yield them as (start, end, name)

    The text is lower-cased before matching. On first use (global cellLex is
    None) the lexicon is loaded from the path returned by
    pubConf.getStaticFile("cellTypes", "cellTypes.marshal") and kept in
    cellLex. Matches whose lower-cased name is in cellBlackList are skipped.

    >>> list(findCells("Oligodendrocytes and neural progenitors."))
    [(0, 16, 'oligodendrocyte')]
    """
    global cellLex
    if cellLex is None:
        # resolve the file path only when the lexicon really has to be
        # loaded, not on every call
        dictFname = pubConf.getStaticFile("cellTypes", "cellTypes.marshal")
        cellLex = fastFind.loadLex(dictFname)

    for (start, end, name) in fastFind.fastFind(text.lower(), cellLex):
        if name.lower() in cellBlackList:
            continue
        yield start, end, name
Beispiel #11
0
def startup(paramDict):
    """ Load lexicons and optional settings from paramDict into module globals.

    paramDict["fnames"]     -- comma-separated lexicon files; each is loaded
                               with loadLex and stored in lexes under its
                               file name up to the first dot
    paramDict["toLower"]    -- optional, integer string; sets global toLower
    paramDict["reqStrings"] -- optional, comma-separated; sets global reqStrings
    """
    global lexes, toLower, reqStrings

    lexPaths = paramDict["fnames"].split(",")
    for lexPath in lexPaths:
        key = basename(lexPath).split(".")[0]
        lexes[key] = loadLex(lexPath)

    if "toLower" in paramDict:
        toLower = int(paramDict["toLower"]) != 0
        logging.info("toLower is %s" % toLower)

    if "reqStrings" in paramDict:
        reqStrings = paramDict["reqStrings"].split(",")
Beispiel #12
0
def startup(paramDict):
    """ Read the lexicon files and optional flags named in paramDict.

    Every file in the comma-separated paramDict["fnames"] is loaded with
    loadLex and stored in the global lexes, keyed by the file's base name up
    to its first dot. If present, "toLower" (an integer string) and
    "reqStrings" (comma-separated) set the globals of the same name.
    """
    global lexes, toLower, reqStrings

    for path in paramDict["fnames"].split(","):
        shortName = basename(path).split(".")[0]
        lexes[shortName] = loadLex(path)

    if "toLower" in paramDict:
        flag = int(paramDict["toLower"])
        toLower = bool(flag)
        logging.info("toLower is %s" % toLower)

    if "reqStrings" in paramDict:
        rawStrings = paramDict["reqStrings"]
        reqStrings = rawStrings.split(",")
Beispiel #13
0
 def startup(self, paramDict):
     """ Load data/speciesDict.marshal.gz from this module's directory into self.lex.

     paramDict is not read by this method.
     """
     moduleDir = dirname(__file__)
     lexPath = join(moduleDir, "data/speciesDict.marshal.gz")
     logging.info("Reading %s" % lexPath)
     self.lex = fastFind.loadLex(lexPath)
def startup(paramDict):
    """ Load the file named by the module-level dataFname into the global lex.

    paramDict is not read by this function.
    """
    global lex
    loaded = loadLex(dataFname)
    lex = loaded
Beispiel #15
0
def startup(paramDict):
    """ Fill the global lex with what loadLex returns for dataFname.

    The paramDict argument is ignored.
    """
    global lex
    lexData = loadLex(dataFname)
    lex = lexData
Beispiel #16
0
def initData(markerTypes=None, exclMarkerTypes=None, addOptional=False):
    """ compile regexes and read filter files.

    MarkerTypes is the collection of markers to prepare, some can be excluded
    with exclMarkerTypes. When markerTypes is None, all types returned by
    compileREs plus "geneName", "symbol", "symbolMaybe" and "dnaSeq" are used.
    addOptional is passed through to compileREs.

    In many applications, looking for dna sequences might not be desirable, as it requires
    a BLAT server which takes a lot of memory, in this case, you can switch off blatting by specifying
    exclMarkerTypes=["dnaSeq"]

    Note: a markerTypes collection supplied by the caller is modified in place
    when exclMarkerTypes is given, and remove() raises if an excluded type is
    not part of it.

    Results are stored in the module globals searchTypes, filterDict and
    markerDictList and, depending on the requested types, geneNameLex,
    bandToEntrezSyms, geneSymLex, symLeftReqWords and symRightReqWords.
    """
    global searchTypes, filterDict, markerDictList
    global geneNameLex, bandToEntrezSyms, geneSymLex
    global symLeftReqWords, symRightReqWords

    # setup list of marker types as specified
    reDict = compileREs(addOptional)
    if markerTypes is None:
        markerTypes = set(reDict.keys())
        markerTypes.update(("geneName", "symbol", "symbolMaybe", "dnaSeq"))

    if exclMarkerTypes is not None:
        for m in exclMarkerTypes:
            markerTypes.remove(m)

    searchTypes = markerTypes

    kwDictList = []
    # "symbol" and "symbolMaybe" use the same data files: load them only once
    symbolDataLoaded = False
    for markerType in markerTypes:
        # nothing is loaded here for dna sequences
        if markerType == "dnaSeq":
            continue

        # special case for long gene names
        if markerType == "geneName":
            fname = join(GENEDATADIR, "geneNames.marshal.gz")
            logging.info("Loading %s" % fname)
            geneNameLex = fastFind.loadLex(fname)
            continue

        # special case for bands
        if markerType == "band":
            fname = join(GENEDATADIR, "bandGenes.tab")
            logging.info("Loading %s" % fname)
            bandToEntrezSyms = parseBands(fname)
            # no "continue": execution falls through to the regex lookup below

        # special case for gene symbols
        if markerType in ("symbol", "symbolMaybe"):
            if not symbolDataLoaded:
                fname = join(GENEDATADIR, "symbols.marshal.gz")
                logging.info("Loading %s" % fname)
                geneSymLex = fastFind.loadLex(fname)

                symLeftReqWords = readBestWords(
                    join(GENEDATADIR, "left.tab"), 500)
                symRightReqWords = readBestWords(
                    join(GENEDATADIR, "right.tab"), 500)
                symbolDataLoaded = True
            continue

        markerRe = reDict[markerType]
        kwDictList.append((markerType, markerRe))
        if markerType in requiresFilter:
            filterFname = os.path.join(DICTDIR, markerType + "Accs.txt.gz")
            logging.info("Opening %s" % filterFname)
            if not isfile(filterFname):
                # a missing filter file is not fatal: store None for this type
                # (logging.warn is a deprecated alias of logging.warning)
                logging.warning("Cannot filter %s accessions, %s not found" %
                                (markerType, filterFname))
                filterDict[markerType] = None
                continue
            filterSet = pubKeyVal.openDb(filterFname)
            filterDict[markerType] = filterSet

    markerDictList = kwDictList
    logging.debug("Loaded marker dict for these types: %s" %
                  [x for x, y in markerDictList])
Beispiel #17
0
def initData(markerTypes=None, exclMarkerTypes=None, addOptional=False):
    """ compile regexes and read filter files.

    MarkerTypes is the collection of markers to prepare, some can be excluded
    with exclMarkerTypes. When markerTypes is None, all types returned by
    compileREs plus "geneName", "symbol", "symbolMaybe" and "dnaSeq" are used.
    addOptional is passed through to compileREs.

    In many applications, looking for dna sequences might not be desirable, as it requires
    a BLAT server which takes a lot of memory, in this case, you can switch off blatting by specifying
    exclMarkerTypes=["dnaSeq"]

    Note: a markerTypes collection supplied by the caller is modified in place
    when exclMarkerTypes is given, and remove() raises if an excluded type is
    not part of it.

    Results are stored in the module globals searchTypes, filterDict and
    markerDictList and, depending on the requested types, geneNameLex,
    bandToEntrezSyms, geneSymLex, symLeftReqWords and symRightReqWords.
    """
    global searchTypes, filterDict, markerDictList
    global geneNameLex, bandToEntrezSyms, geneSymLex
    global symLeftReqWords, symRightReqWords

    # setup list of marker types as specified
    reDict = compileREs(addOptional)
    if markerTypes is None:
        markerTypes = set(reDict.keys())
        markerTypes.update(("geneName", "symbol", "symbolMaybe", "dnaSeq"))

    if exclMarkerTypes is not None:
        for m in exclMarkerTypes:
            markerTypes.remove(m)

    searchTypes = markerTypes

    kwDictList = []
    # "symbol" and "symbolMaybe" use the same data files: load them only once
    symbolDataLoaded = False
    for markerType in markerTypes:
        # nothing is loaded here for dna sequences
        if markerType == "dnaSeq":
            continue

        # special case for long gene names
        if markerType == "geneName":
            fname = join(GENEDATADIR, "geneNames.marshal.gz")
            logging.info("Loading %s" % fname)
            geneNameLex = fastFind.loadLex(fname)
            continue

        # special case for bands
        if markerType == "band":
            fname = join(GENEDATADIR, "bandToEntrez.marshal.gz")
            logging.info("Loading %s" % fname)
            # close the gzip handle explicitly instead of leaking it
            bandFile = gzip.open(fname)
            try:
                bandToEntrezSyms = marshal.loads(bandFile.read())
            finally:
                bandFile.close()
            # no "continue": execution falls through to the regex lookup below

        # special case for gene symbols
        if markerType in ("symbol", "symbolMaybe"):
            if not symbolDataLoaded:
                fname = join(GENEDATADIR, "symbols.marshal.gz")
                logging.info("Loading %s" % fname)
                geneSymLex = fastFind.loadLex(fname)

                symLeftReqWords = readBestWords(
                    join(GENEDATADIR, "left.tab"), 500)
                symRightReqWords = readBestWords(
                    join(GENEDATADIR, "right.tab"), 500)
                symbolDataLoaded = True
            continue

        markerRe = reDict[markerType]
        kwDictList.append((markerType, markerRe))
        if markerType in requiresFilter:
            filterFname = os.path.join(DICTDIR, markerType+"Accs.txt.gz")
            logging.info("Opening %s" % filterFname)
            filterSet = pubKeyVal.openDb(filterFname)
            filterDict[markerType] = filterSet

    markerDictList = kwDictList
    logging.debug("Loaded marker dict for these types: %s" % [x for x,y in markerDictList])
Beispiel #18
0
 def startup(self, paramDict):
     """ Read the species dictionary stored under data/ next to this file.

     The loaded lexicon is kept in self.lex; paramDict is not used.
     """
     here = dirname(__file__)
     speciesPath = join(here, "data/speciesDict.marshal.gz")
     logging.info("Reading %s" % speciesPath)
     self.lex = fastFind.loadLex(speciesPath)