def main():
    ''' Load the lookup data, then stream every sentence yielded by
    parseLines() into the output files.

    general gist: loads necessary information, then iterates parseLines(),
    which yields valid sentences to be parsed, valid meaning we are going to
    throw them in TEES or another NLP program.

    For every yielded record the formatted metadata is written to
    "<timestamp>-genes.txt" and echoed to stdout, and the sentence text is
    written to "<timestamp>-sentence.txt".  Both files are closed when the
    loop finishes or raises.
    '''
    args = parseArgs()
    logging.info("Parsed arguments")

    geneFinder.initData(exclMarkerTypes=["dnaSeq"])  # setup for findGenes() later
    logging.info("Set up environment for findGenes()")

    logging.info("Load pickled/marshaled relex, authors, entrez symbols")
    # NOTE(review): pickle/marshal execute/trust whatever is in the file; only
    # feed them data files produced by this project.
    # Two consecutive loads from the same stream: presumably relex and authors
    # were dumped back to back into one file -- confirm against the writer.
    relex = pickle.load(args.extractionData)
    authors = pickle.load(args.extractionData)
    entrez = marshal.load(args.entrezData)['entrez2sym']

    logging.info("Open output files")
    # Context managers guarantee both files are flushed and closed even if
    # parseLines() or formatMeta() raises midway.
    with open(args.timestamp + '-sentence.txt', 'w') as sentenceFile, \
            open(args.timestamp + '-genes.txt', 'w') as geneFile:
        for pmid, sentence, geneIds, geneNames, rawNames, relations in parseLines(
                args.inputFiles, entrez, relex, authors):
            # Format once and reuse for both the file and stdout.
            meta = formatMeta(pmid, geneIds, geneNames, rawNames, relations)
            geneFile.write(meta)
            # Single-argument form prints identically under the Python 2
            # print statement and the print function.
            print(meta)
            sentenceFile.write(sentence + '\n')
            # Keep the genes file current while the run is in progress.
            geneFile.flush()

    # `nonamed` is not defined in this function; presumably a module-level
    # collection filled while parsing -- confirm.
    # NOTE(review): `> 1` means a single unnamed entry is not reported.
    if len(nonamed) > 1:
        logging.warning("Not in entrez2sym: {}".format(nonamed))
def main():
    ''' Load the lookup data, then stream every sentence yielded by
    parseLines() into the output files.

    general gist: loads necessary information, then iterates parseLines(),
    which yields valid sentences to be parsed, valid meaning we are going to
    throw them in TEES or another NLP program.

    For every yielded record the formatted metadata is written to
    "<timestamp>-genes.txt" and echoed to stdout, and the sentence text is
    written to "<timestamp>-sentence.txt".  Both files are closed when the
    loop finishes or raises.
    '''
    args = parseArgs()
    logging.info("Parsed arguments")

    geneFinder.initData(exclMarkerTypes=["dnaSeq"])  # setup for findGenes() later
    logging.info("Set up environment for findGenes()")

    logging.info("Load pickled/marshaled relex, authors, entrez symbols")
    # NOTE(review): pickle/marshal execute/trust whatever is in the file; only
    # feed them data files produced by this project.
    # Two consecutive loads from the same stream: presumably relex and authors
    # were dumped back to back into one file -- confirm against the writer.
    relex = pickle.load(args.extractionData)
    authors = pickle.load(args.extractionData)
    entrez = marshal.load(args.entrezData)['entrez2sym']

    logging.info("Open output files")
    # Context managers guarantee both files are flushed and closed even if
    # parseLines() or formatMeta() raises midway.
    with open(args.timestamp + '-sentence.txt', 'w') as sentenceFile, \
            open(args.timestamp + '-genes.txt', 'w') as geneFile:
        for pmid, sentence, geneIds, geneNames, rawNames, relations in parseLines(
                args.inputFiles, entrez, relex, authors):
            # Format once and reuse for both the file and stdout.
            meta = formatMeta(pmid, geneIds, geneNames, rawNames, relations)
            geneFile.write(meta)
            # Single-argument form prints identically under the Python 2
            # print statement and the print function.
            print(meta)
            sentenceFile.write(sentence + '\n')
            # Keep the genes file current while the run is in progress.
            geneFile.flush()

    # `nonamed` is not defined in this function; presumably a module-level
    # collection filled while parsing -- confirm.
    # NOTE(review): `> 1` means a single unnamed entry is not reported.
    if len(nonamed) > 1:
        logging.warning("Not in entrez2sym: {}".format(nonamed))
def startup(paramDict):
    """ Initialise geneFinder with the "dnaSeq" marker type excluded.

    paramDict is accepted but not read by this function.
    NOTE(review): the earlier docstring described this as parsing the HUGO
    file into a dict; presumably that happens inside initData() -- confirm.
    """
    skippedMarkerTypes = ["dnaSeq"]
    geneFinder.initData(exclMarkerTypes=skippedMarkerTypes)
def startup(self, paramDict):
    """ Called once upon startup on each cluster node.

    Initialises geneFinder with the "dnaSeq" marker type excluded and resets
    self.rowCount to zero.  paramDict is accepted but not read here.
    """
    skippedMarkerTypes = ["dnaSeq"]
    geneFinder.initData(exclMarkerTypes=skippedMarkerTypes)

    # start counting rows from scratch
    self.rowCount = 0
def startup(paramDict):
    """ Load the varFinder database, then initialise geneFinder with the
    "dnaSeq" marker type excluded.

    paramDict is accepted but not read by this function.
    """
    varFinder.loadDb()

    skippedMarkerTypes = ["dnaSeq"]
    geneFinder.initData(exclMarkerTypes=skippedMarkerTypes)
def startup(paramDict):
    """ Initialise geneFinder with the "dnaSeq" and "band" marker types
    excluded, then load the varFinder database.

    paramDict is accepted but not read by this function.
    """
    skippedMarkerTypes = ["dnaSeq", "band"]
    geneFinder.initData(exclMarkerTypes=skippedMarkerTypes)

    # A loadDb(loadSequences=False) variant was left commented out in the
    # original; the default call is the one in effect.
    varFinder.loadDb()
def startup(self, paramDict, resultDict):
    """ Read the run parameters and initialise geneFinder for the
    requested search types.

    Stores paramDict["maxCount"] on self.maxCount and the result of
    getSearchTypes(paramDict) on self.searchTypes, then passes those search
    types to geneFinder.initData() with addOptional=True.  resultDict is
    accepted but not read here.
    """
    self.maxCount = paramDict["maxCount"]

    # The value is not used below; the lookup is retained so a missing
    # "keywords" entry still raises KeyError exactly as before.
    _keywordsFilename = paramDict["keywords"]

    self.searchTypes = getSearchTypes(paramDict)
    geneFinder.initData(
        self.searchTypes,
        addOptional=True,
    )
def startup(self, paramDict):
    """ Initialise geneFinder with addOptional=True.

    paramDict is accepted but not read here.
    NOTE(review): the earlier docstring said "parse dictionary of keywords";
    nothing in this body does that directly -- confirm whether initData() does.
    """
    # Per-instance search types (getSearchTypes(paramDict)) were disabled in
    # the original; initData() runs with its default types.
    initOptions = {"addOptional": True}
    geneFinder.initData(**initOptions)
def startup(paramDict):
    """ Initialise geneFinder with the "dnaSeq" marker type excluded.

    paramDict is accepted but not read by this function.
    """
    # Declared global but never assigned in this body.
    global seqCache

    # don't use seqs for gene finding
    skippedMarkerTypes = ["dnaSeq"]
    geneFinder.initData(exclMarkerTypes=skippedMarkerTypes)