def startup(paramDict, result):
    """ parse sym file into lex

    Sets two module globals:
    - lex: whatever fastFind.loadLex returns for the module-level symFname
    - blackList: set of the first 1000 lines of the file pubConf.bncFname

    Neither paramDict nor result is read by this function.
    """
    global lex, blackList
    lex = fastFind.loadLex(symFname)
    # 'with' closes the file right away instead of leaving the handle open
    # until the garbage collector gets to it
    with open(pubConf.bncFname) as bncFile:
        blackList = set(bncFile.read().splitlines()[:1000])
def startup(paramDict, result):
    """ parse sym file into lex

    Reads paramDict["wordFname"] and loads the matching ".marshal.gz" file
    into the module global lex. Also fills the module global blackList with
    the first 10000 lines of the file pubConf.bncFname.

    result is not read by this function.
    Raises KeyError if paramDict has no "wordFname" entry.
    """
    global lex, blackList
    wordFname = paramDict["wordFname"]
    # NOTE(review): split(".") is applied to the whole path, not only to the
    # file name, so a "." inside a directory name cuts the path there -
    # confirm that callers never pass such paths
    lexFname = wordFname.split(".")[0] + ".marshal.gz"
    lex = fastFind.loadLex(join(dirname(wordFname), lexFname))
    # 'with' closes the file right away instead of leaving the handle open
    # until the garbage collector gets to it
    with open(pubConf.bncFname) as bncFile:
        blackList = set(bncFile.read().splitlines()[:10000])
def startup(self, paramDict):
    """ Load the keyword dictionary into self.lex.

    The file name is taken from paramDict["dict"] when that key is present,
    otherwise the hard-coded malacards dictionary path below is used. The
    chosen file name is logged at info level before loading.
    """
    fallbackFname = "/hive/data/inside/pubs/geneDisease/diseaseDictionary/malacards/dictionary.marshal.gz"
    dictFname = paramDict["dict"] if "dict" in paramDict else fallbackFname
    logging.info("Reading %s" % dictFname)
    self.lex = fastFind.loadLex(dictFname)
def startup(paramDict, result):
    """ parse sym file into lex

    Reads paramDict["wordFname"] and loads the matching ".marshal.gz" file
    into the module global lex. Also fills the module global blackList with
    the first 10000 lines of the file pubConf.bncFname.

    result is not read by this function.
    Raises KeyError if paramDict has no "wordFname" entry.
    """
    global lex, blackList
    wordFname = paramDict["wordFname"]
    # NOTE(review): split(".") is applied to the whole path, not only to the
    # file name, so a "." inside a directory name cuts the path there -
    # confirm that callers never pass such paths
    lexFname = wordFname.split(".")[0] + ".marshal.gz"
    lex = fastFind.loadLex(join(dirname(wordFname), lexFname))
    # 'with' closes the file right away instead of leaving the handle open
    # until the garbage collector gets to it
    with open(pubConf.bncFname) as bncFile:
        blackList = set(bncFile.read().splitlines()[:10000])
def findDiseases(text):
    """ find diseases in string and return as (start, end, diseaseName)

    This is a generator. The lexicon is loaded from
    <pubConf.staticDataDir>/diseases/diseases.marshal.gz the first time the
    generator runs while the module global disLex is still None, and is
    reused from disLex afterwards. Matching is delegated to
    fastFind.fastFind with toLower=True.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    # None is a singleton: test identity rather than equality
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)
    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
def findDiseases(text):
    """ find diseases in string and return as (start, end, diseaseName)

    This is a generator. The lexicon is loaded from
    <pubConf.staticDataDir>/diseases/diseases.marshal.gz the first time the
    generator runs while the module global disLex is still None, and is
    reused from disLex afterwards. Matching is delegated to
    fastFind.fastFind with toLower=True.

    >>> list(findDiseases("AlzhEImer's Disease"))
    [(0, 19, 'Alzheimer Disease')]
    """
    global disLex
    # None is a singleton: test identity rather than equality
    if disLex is None:
        disPath = join(pubConf.staticDataDir, "diseases", "diseases.marshal.gz")
        disLex = fastFind.loadLex(disPath)
    for (start, end, name) in fastFind.fastFind(text, disLex, toLower=True):
        yield start, end, name
def findDrugs(text):
    """ find drugs in string and return as (start, end, drugbankName)

    This is a generator. The lexicon is loaded from
    <pubConf.staticDataDir>/drugs/drugbank.marshal.gz the first time the
    generator runs while the module global drugLex is still None, and is
    reused from drugLex afterwards. Matches whose lowercased name is in the
    module-level drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    # None is a singleton: test identity rather than equality
    if drugLex is None:
        drugPath = join(pubConf.staticDataDir, "drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)
    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
def findDrugs(text):
    """ find drugs in string and return as (start, end, drugbankName)

    This is a generator. The lexicon path is obtained from
    pubConf.getStaticFile("drugs", "drugbank.marshal.gz") and loaded the
    first time the generator runs while the module global drugLex is still
    None; afterwards drugLex is reused. Matches whose lowercased name is in
    the module-level drugBlacklist are skipped.

    >>> list(findDrugs("Acetaminophen, Penicillin V and Herceptin."))
    [(0, 13, 'Acetaminophen'), (15, 27, 'Penicillin V'), (32, 41, 'Trastuzumab')]
    """
    global drugLex
    # None is a singleton: test identity rather than equality
    if drugLex is None:
        drugPath = pubConf.getStaticFile("drugs", "drugbank.marshal.gz")
        drugLex = fastFind.loadLex(drugPath)
    for (start, end, name) in fastFind.fastFind(text, drugLex, toLower=True):
        if name.lower() in drugBlacklist:
            continue
        yield start, end, name
def findCells(text):
    """ Yield (start, end, name) for every cell type found in text.

    The text is lowercased before it is handed to fastFind.fastFind. The
    lexicon is loaded into the module global cellLex when that is still
    None. Hits whose lowercased name is in cellBlackList are not yielded.

    >>> list(findCells("Oligodendrocytes and neural progenitors."))
    [(0, 16, 'oligodendrocyte')]
    """
    global cellLex
    lexPath = pubConf.getStaticFile("cellTypes", "cellTypes.marshal")
    if cellLex is None:
        cellLex = fastFind.loadLex(lexPath)
    loweredText = text.lower()
    for (begin, stop, cellName) in fastFind.fastFind(loweredText, cellLex):
        if cellName.lower() not in cellBlackList:
            yield begin, stop, cellName
def startup(paramDict):
    """ Load the lexicon files and optional settings named in paramDict.

    paramDict["fnames"] is a comma-separated list of files; each one is
    loaded with loadLex and stored in the module global lexes under the part
    of its base name before the first ".".
    If present, paramDict["toLower"] (an integer string) sets the global
    toLower and paramDict["reqStrings"] (comma-separated) sets the global
    reqStrings.
    """
    global lexes, toLower, reqStrings

    for lexPath in paramDict["fnames"].split(","):
        stem = basename(lexPath).split(".")[0]
        lexes[stem] = loadLex(lexPath)

    if "toLower" in paramDict:
        toLower = bool(int(paramDict["toLower"]))
        logging.info("toLower is %s" % toLower)

    if "reqStrings" in paramDict:
        reqStrings = paramDict["reqStrings"].split(",")
def startup(paramDict):
    """ Load the lexicon files and optional settings named in paramDict.

    paramDict["fnames"] is a comma-separated list of files; each one is
    loaded with loadLex and stored in the module global lexes under the part
    of its base name before the first ".".
    If present, paramDict["toLower"] (an integer string) sets the global
    toLower and paramDict["reqStrings"] (comma-separated) sets the global
    reqStrings.
    """
    global lexes, toLower, reqStrings

    for lexPath in paramDict["fnames"].split(","):
        stem = basename(lexPath).split(".")[0]
        lexes[stem] = loadLex(lexPath)

    if "toLower" in paramDict:
        toLower = bool(int(paramDict["toLower"]))
        logging.info("toLower is %s" % toLower)

    if "reqStrings" in paramDict:
        reqStrings = paramDict["reqStrings"].split(",")
def startup(self, paramDict):
    """ Load data/speciesDict.marshal.gz, located relative to the directory
    of this module, into self.lex. The file name is logged at info level.

    paramDict is not read by this method.
    """
    moduleDir = dirname(__file__)
    dictFname = join(moduleDir, "data/speciesDict.marshal.gz")
    logging.info("Reading %s" % dictFname)
    self.lex = fastFind.loadLex(dictFname)
def startup(paramDict):
    """ Load the file dataFname with loadLex and keep the result in the
    module global lex (the original note calls this the HUGO file).

    paramDict is not read by this function.
    """
    global lex
    loadedLex = loadLex(dataFname)
    lex = loadedLex
def initData(markerTypes=None, exclMarkerTypes=None, addOptional=False):
    """ compile regexes and read filter files.

    MarkerTypes is the list of markers to prepare, some can be excluded with
    exclMarkerTypes. With markerTypes=None, all types returned by compileREs
    plus "geneName", "symbol", "symbolMaybe" and "dnaSeq" are prepared.

    In many applications, looking for dna sequences might not be desirable,
    as it requires a BLAT server which takes a lot of memory, in this case,
    you can switch off blatting by specifying exclMarkerTypes=["dnaSeq"]

    addOptional is passed through to compileREs.

    Nothing is returned. The results are stored in module globals:
    searchTypes, markerDictList, filterDict and, depending on the marker
    types, geneNameLex, bandToEntrezSyms, geneSymLex, symLeftReqWords and
    symRightReqWords.
    """
    global searchTypes, filterDict, markerDictList
    global geneNameLex, bandToEntrezSyms, geneSymLex
    global symLeftReqWords, symRightReqWords

    # setup list of marker types as specified
    reDict = compileREs(addOptional)
    if markerTypes is None:
        markerTypes = set(reDict.keys())
        markerTypes.add("geneName")
        markerTypes.add("symbol")
        markerTypes.add("symbolMaybe")
        markerTypes.add("dnaSeq")

    if exclMarkerTypes is not None:
        for m in exclMarkerTypes:
            # NOTE: when the caller passed markerTypes, this removes from the
            # caller's own collection
            markerTypes.remove(m)

    searchTypes = markerTypes

    kwDictList = []
    # "symbol" and "symbolMaybe" share the same data files: load them once
    symbolDataLoaded = False
    for markerType in markerTypes:
        if markerType == "dnaSeq":
            continue

        # special case for long gene names
        if markerType == "geneName":
            fname = join(GENEDATADIR, "geneNames.marshal.gz")
            logging.info("Loading %s" % fname)
            geneNameLex = fastFind.loadLex(fname)
            continue

        # special case for bands: no 'continue' here, the band type still
        # gets its regex from reDict below
        if markerType == "band":
            fname = join(GENEDATADIR, "bandGenes.tab")
            logging.info("Loading %s" % fname)
            bandToEntrezSyms = parseBands(fname)

        # special case for gene symbols
        if markerType in ("symbol", "symbolMaybe"):
            if not symbolDataLoaded:
                fname = join(GENEDATADIR, "symbols.marshal.gz")
                logging.info("Loading %s" % fname)
                geneSymLex = fastFind.loadLex(fname)
                symLeftReqWords = readBestWords(join(GENEDATADIR, "left.tab"), 500)
                symRightReqWords = readBestWords(join(GENEDATADIR, "right.tab"), 500)
                symbolDataLoaded = True
            continue

        markerRe = reDict[markerType]
        kwDictList.append((markerType, markerRe))

        if markerType in requiresFilter:
            filterFname = os.path.join(DICTDIR, markerType + "Accs.txt.gz")
            logging.info("Opening %s" % filterFname)
            if not isfile(filterFname):
                # logging.warn is a deprecated alias that was removed in
                # Python 3.13; logging.warning works everywhere
                logging.warning("Cannot filter %s accessions, %s not found" %
                                (markerType, filterFname))
                filterDict[markerType] = None
                continue
            filterSet = pubKeyVal.openDb(filterFname)
            filterDict[markerType] = filterSet

    markerDictList = kwDictList
    logging.debug("Loaded marker dict for these types: %s" % [x for x, y in markerDictList])
def initData(markerTypes=None, exclMarkerTypes=None, addOptional=False):
    """ compile regexes and read filter files.

    MarkerTypes is the list of markers to prepare, some can be excluded with
    exclMarkerTypes. With markerTypes=None, all types returned by compileREs
    plus "geneName", "symbol", "symbolMaybe" and "dnaSeq" are prepared.

    In many applications, looking for dna sequences might not be desirable,
    as it requires a BLAT server which takes a lot of memory, in this case,
    you can switch off blatting by specifying exclMarkerTypes=["dnaSeq"]

    addOptional is passed through to compileREs.

    Nothing is returned. The results are stored in module globals:
    searchTypes, markerDictList, filterDict and, depending on the marker
    types, geneNameLex, bandToEntrezSyms, geneSymLex, symLeftReqWords and
    symRightReqWords.
    """
    global searchTypes, filterDict, markerDictList
    global geneNameLex, bandToEntrezSyms, geneSymLex
    global symLeftReqWords, symRightReqWords

    # setup list of marker types as specified
    reDict = compileREs(addOptional)
    if markerTypes is None:
        markerTypes = set(reDict.keys())
        markerTypes.add("geneName")
        markerTypes.add("symbol")
        markerTypes.add("symbolMaybe")
        markerTypes.add("dnaSeq")

    if exclMarkerTypes is not None:
        for m in exclMarkerTypes:
            # NOTE: when the caller passed markerTypes, this removes from the
            # caller's own collection
            markerTypes.remove(m)

    searchTypes = markerTypes

    kwDictList = []
    # "symbol" and "symbolMaybe" share the same data files: load them once
    symbolDataLoaded = False
    for markerType in markerTypes:
        if markerType == "dnaSeq":
            continue

        # special case for long gene names
        if markerType == "geneName":
            fname = join(GENEDATADIR, "geneNames.marshal.gz")
            logging.info("Loading %s" % fname)
            geneNameLex = fastFind.loadLex(fname)
            continue

        # special case for bands: no 'continue' here, the band type still
        # gets its regex from reDict below
        if markerType == "band":
            fname = join(GENEDATADIR, "bandToEntrez.marshal.gz")
            logging.info("Loading %s" % fname)
            # close the gzip handle explicitly instead of leaking it
            bandFile = gzip.open(fname)
            try:
                bandToEntrezSyms = marshal.loads(bandFile.read())
            finally:
                bandFile.close()

        # special case for gene symbols
        if markerType in ("symbol", "symbolMaybe"):
            if not symbolDataLoaded:
                fname = join(GENEDATADIR, "symbols.marshal.gz")
                logging.info("Loading %s" % fname)
                geneSymLex = fastFind.loadLex(fname)
                symLeftReqWords = readBestWords(join(GENEDATADIR, "left.tab"), 500)
                symRightReqWords = readBestWords(join(GENEDATADIR, "right.tab"), 500)
                symbolDataLoaded = True
            continue

        markerRe = reDict[markerType]
        kwDictList.append((markerType, markerRe))

        if markerType in requiresFilter:
            filterFname = os.path.join(DICTDIR, markerType + "Accs.txt.gz")
            logging.info("Opening %s" % filterFname)
            filterSet = pubKeyVal.openDb(filterFname)
            filterDict[markerType] = filterSet

    markerDictList = kwDictList
    logging.debug("Loaded marker dict for these types: %s" % [x for x, y in markerDictList])