def mapDnaToGenes(self, seqs, docId, dbList):
    """ map DNA sequences to gene symbols.

    Returns a dict db -> seq -> set of gene symbols.

    >>> d = DnaMapper()

    # simple case
    >>> seqs = ["GCAAGCTCCCGGGAATTCAGCTC"]
    >>> d.mapDnaToGenes(seqs, "1234", ["hg19"])
    {'hg19': {'GCAAGCTCCCGGGAATTCAGCTC': set(['PITX2'])}}

    # harder case
    >>> seqs = ["ACTGGGAGAAGGGTGGTCAG", "TGTGTCCCTGAGCCAGTGAC"]
    >>> d.mapDnaToGenes(seqs, "1234", ["hg19"])
    {'hg19': {'ACTGGGAGAAGGGTGGTCAG': set(['CLN6']), 'TGTGTCCCTGAGCCAGTGAC': set(['CLN6'])}}
    """
    # give every sequence a unique id of the form "<docId>|<index>"
    idSeqPairs = []
    for seqIdx, seq in enumerate(seqs):
        idSeqPairs.append((docId + "|" + str(seqIdx), seq))
    idToSeq = dict(idSeqPairs)

    # blat/chain the sequences into one BED file per db
    tmpBedDir = pubGeneric.makeTempDir(prefix="geneFinderBeds")
    dbBedNames = self.mapDnaToBed(idSeqPairs, docId, dbList, tmpBedDir)

    # for each db, map every annotation back to its input sequence and
    # collect the gene symbols found at its locus
    resultsByDb = {}
    for db, bedName in dbBedNames.iteritems():
        annotToGenes = pubMap.findLoci(bedName, dbList)
        genesBySeq = {}
        for annotId, genes in annotToGenes.iteritems():
            # annotId looks like "<seqId>:<range>"
            seqId, seqRange = annotId.split(":")
            logging.debug("Found match for %s (%s) for genes %s" % (seqId, seqRange, genes))
            origSeq = idToSeq[seqId]
            if origSeq not in genesBySeq:
                genesBySeq[origSeq] = set()
            genesBySeq[origSeq].update(genes)
        resultsByDb[db] = genesBySeq

    # keep the temp dir around for inspection when debugging
    if not pubConf.debug:
        shutil.rmtree(tmpBedDir)
    return resultsByDb
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """ build one big index over all not-yet-converted *.index.gz files in
    inDir, split it into chunks and submit one conversion job per chunk.

    inDir: directory with the *.index.gz input files (url<TAB>offset lines).
    outDir: output directory; also receives <updateId>_index.tab and the
            per-update id/log files.
    minId: first numeric article id to assign.
    runner: job runner handed through to submitConvertJobs().
    chunkSize: passed to pubStore.splitTabFileOnChunkId().

    Bugfix: doneUrls was checked but never populated, so duplicate URLs
    were never actually skipped; the gzip input handles are now also
    closed after each file.
    """
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)
    #maxCommon.delOnExit(tmpDir)
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)

    # get all .index.gz files, remove the already done files
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set([basename(x) for x in inFnames])
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if len(todoFnames) == 0:
        logging.info("All input files already converted")
        return

    indexFilename = join(outDir, "%d_index.tab" % updateId)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "tsvFile", "url", "offset"]
    indexFile.write("\t".join(headers))
    indexFile.write("\n")

    # read the input indexes and create one big index file
    # with columns: articleId, tsvname, url, offset
    numId = minId
    doneUrls = set()
    for fname in todoFnames:
        baseName = basename(fname)
        # the data file next to an x.index.gz file is called x.gz
        tsvName = baseName.replace(".index.gz", ".gz")
        ifh = gzip.open(fname)
        for line in ifh:
            url, offset = line.rstrip("\n").split("\t")
            assert (offset.isdigit())
            if "\t" in url or "\n" in url:
                logging.info("tab or NL in url %s, skipping" % url)
                continue
            if url in doneUrls:
                logging.info("Already did %s" % url)
                continue
            # bugfix: without this add(), the duplicate check above never fired
            doneUrls.add(url)
            row = [str(numId), tsvName, url, offset]
            indexFile.write("\t".join(row))
            indexFile.write("\n")
            numId += 1
        ifh.close()  # bugfix: handle was previously left open
    indexFile.close()

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, tmpDir, chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")

    # submit one conversion job per chunk, then collect the results
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname, tmpDir)
    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """ collect all not-yet-converted *.index.gz files from inDir into one
    master index file, chunk it, and submit one conversion job per chunk.

    Writes <updateId>_index.tab (articleId/tsvFile/url/offset) to outDir,
    then concatenates the per-job id and log files and records the update
    in the updates table.

    Bugfix: the doneUrls set was consulted but never filled, so the
    "Already did" branch was dead code and duplicate URLs slipped through;
    each gzip input handle is now closed as well.
    """
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)
    #maxCommon.delOnExit(tmpDir)
    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)

    # find all .index.gz input files that were not processed in a prior run
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set([basename(x) for x in inFnames])
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if len(todoFnames) == 0:
        logging.info("All input files already converted")
        return

    # open the master index and write its header
    indexFilename = join(outDir, "%d_index.tab" % updateId)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "tsvFile", "url", "offset"]
    indexFile.write("\t".join(headers))
    indexFile.write("\n")

    numId = minId
    doneUrls = set()
    for fname in todoFnames:
        baseName = basename(fname)
        # derive the data file name: x.index.gz -> x.gz
        dataName = baseName.replace(".index.gz", ".gz")
        inFh = gzip.open(fname)
        for line in inFh:
            url, offset = line.rstrip("\n").split("\t")
            assert (offset.isdigit())
            if "\t" in url or "\n" in url:
                logging.info("tab or NL in url %s, skipping" % url)
                continue
            if url in doneUrls:
                logging.info("Already did %s" % url)
                continue
            doneUrls.add(url)  # bugfix: previously missing, dedup never worked
            row = [str(numId), dataName, url, offset]
            indexFile.write("\t".join(row))
            indexFile.write("\n")
            numId += 1
        inFh.close()  # bugfix: close the gzip handle
    indexFile.close()

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, tmpDir, chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")

    # submit one conversion job per chunk and merge the results back
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname, tmpDir)
    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
def mapDnaToBed(self, seqs, docId, dbList, outDir):
    """ seqs is a list of (seqId, seq).

    Blats the sequences against every db in dbList and chains the psls
    into one BED file per db ("chained.<db>.bed") in outDir.
    Returns the value of the last pubMap.chainPslToBed() call (see NOTE).
    """
    pslDir = pubGeneric.makeTempDir(prefix="geneFinderPsls")
    # create tuples (seqId, seq)
    dbPslFnames = self.blatClient.blatSeqs(dbList, seqs, pslDir)
    for db, fname in dbPslFnames.iteritems():
        oneBed = join(outDir, "chained.%s.bed" % db)
        pslFname = join(pslDir, db+".psl")
        # NOTE(review): dbBedNames is overwritten on every iteration, so only
        # the last db's result is returned; if chainPslToBed returns a dict of
        # all dbs this is fine, otherwise results are silently dropped --
        # confirm against pubMap.chainPslToBed. Also: if dbPslFnames is empty,
        # the return below raises NameError (dbBedNames never assigned).
        dbBedNames = pubMap.chainPslToBed(pslFname, oneBed, pipeSep=True, onlyFields=12)
    # keep the psl dir for inspection when debugging
    if not pubConf.debug:
        shutil.rmtree(pslDir)
    return dbBedNames
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside with gzip and create
    an .index.gz for them.

    zipFname: input zip archive; tsvName: name of the tsv member inside it;
    outFname: gzipped index output, one "url<TAB>byteOffset" line per input
    line (the url is the first tab-separated field, the offset is into the
    uncompressed tsv).

    Bugfix: the input file handle was never closed.
    """
    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)
    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))

    # index lines
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    # the file iterator buffers reads and so does not work with tell()!!
    # hence the explicit readline() loop instead of "for line in ifh"
    while True:
        line = ifh.readline()
        if line == "":
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    ifh.close()  # bugfix: was previously leaked
    ofh.close()

    # re-compress with gzip, next to the input zip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert (len(tmpFnames) == 1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName + ".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside with gzip and create
    an .index.gz for them.

    The index maps the first tab-separated field of each line (the url) to
    the line's byte offset in the uncompressed tsv; the recompressed tsv is
    written next to the input zip as <tsvName>.gz.

    Bugfix: close the input file handle, which was previously leaked.
    """
    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)
    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))

    # index lines; readline() is used because the file iterator's internal
    # read-ahead buffer makes tell() report wrong positions
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    #for line in ifh:
    while True:
        line = ifh.readline()
        if line == "":
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    ifh.close()  # bugfix: input handle was never closed
    ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert (len(tmpFnames) == 1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName + ".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml, write everything to outfile in
    ascii format.

    zipDir: directory with update zip files; a sibling "../disk" directory
    holds files whose index row has an empty zipFilename.
    inIndexFile: tsv with at least articleId, zipFilename, filename columns.
    inIdFile: list of already-converted ids, parsed by parseDoneIds().
    outFile: pubStore output; a sibling <base>.ids.tab with
    articleId/doi/pmid rows is also written.

    Idiom fix: None comparisons now use "is None" instead of "== None".
    """
    diskDir = abspath(join(zipDir, "..", "disk"))
    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()
    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        # u'\xbf' is an inverted question mark: marker of an encoding problem
        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        # empty zipFilename means the file lives in the disk directory
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString is None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        # NOTE(review): this unconditionally overwrites the origFile value set
        # above, using "/" instead of ":" as separator -- confirm which form
        # downstream consumers expect
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output; fileId = articleId*1000 + 1 (first file of article)
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")
        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml, write everything to outfile in
    ascii format.

    zipDir: directory with update zip files; a sibling "../disk" directory
    holds files whose index row has an empty zipFilename.
    inIndexFile: tsv with at least articleId, zipFilename, filename columns.
    inIdFile: already-converted ids, parsed by parseDoneIds().
    outFile: pubStore output; a sibling <base>.ids.tab with
    articleId/doi/pmid rows is written alongside it.
    """
    diskDir = abspath(join(zipDir, "..", "disk"))
    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()
    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        # u'\xbf' is an inverted question mark, i.e. an encoding problem
        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        # an empty zipFilename means the file lives in the disk directory
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound+=1
            logging.error("Could not open pdf or xml file")
            continue

        articleId=int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        # NOTE(review): this unconditionally overwrites the origFile value set
        # above, and uses "/" instead of ":" as separator -- confirm which
        # form is intended
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output; fileId = articleId*1000 + 1 (first file of article)
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")
        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()