Example #1
    def mapDnaToGenes(self, seqs, docId, dbList):
        """
        returns a dict db -> {seq -> set of gene symbols}

        >>> d = DnaMapper()

        # simple case
        >>> seqs = ["GCAAGCTCCCGGGAATTCAGCTC"]
        >>> d.mapDnaToGenes(seqs, "1234", ["hg19"])
        {'hg19': {'GCAAGCTCCCGGGAATTCAGCTC': set(['PITX2'])}}

        # harder case
        >>> seqs = ["ACTGGGAGAAGGGTGGTCAG", "TGTGTCCCTGAGCCAGTGAC"]
        >>> d.mapDnaToGenes(seqs, "1234", ["hg19"])
        {'hg19': {'ACTGGGAGAAGGGTGGTCAG': set(['CLN6']), 'TGTGTCCCTGAGCCAGTGAC': set(['CLN6'])}}
        """
        seqs = [(docId+"|"+str(i), seq) for i, seq in enumerate(seqs)]
        seqIdToSeq = dict(seqs)
        bedDir = pubGeneric.makeTempDir(prefix="geneFinderBeds")
        dbBedNames = self.mapDnaToBed(seqs, docId, dbList, bedDir)
        dbAnnotGenes = {}
        for db, bedName in dbBedNames.iteritems():
            annotToGenes = pubMap.findLoci(bedName, dbList)
            seqIdToGenes = {}
            for annotId, genes in annotToGenes.iteritems():
                seqId, seqRange = annotId.split(":")
                logging.debug("Found match for %s (%s) for genes %s" % (seqId, seqRange, genes))
                seq = seqIdToSeq[seqId]
                seqIdToGenes.setdefault(seq, set()).update(genes)
            dbAnnotGenes[db] = seqIdToGenes

        if not pubConf.debug:
            shutil.rmtree(bedDir)

        return dbAnnotGenes
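The return value is keyed first by assembly, then by sequence, as the doctests above show. A short, purely illustrative sketch of walking that nested structure (only DnaMapper and mapDnaToGenes come from the example above; everything else is hypothetical):

d = DnaMapper()
seqs = ["GCAAGCTCCCGGGAATTCAGCTC"]
dbSeqGenes = d.mapDnaToGenes(seqs, "1234", ["hg19"])
# illustrative only: print one line per (db, sequence) pair
for db, seqToGenes in dbSeqGenes.iteritems():
    for seq, genes in seqToGenes.iteritems():
        print "%s: %s -> %s" % (db, seq, ",".join(sorted(genes)))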
Example #2
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)
    #maxCommon.delOnExit(tmpDir)

    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    # get all .index.gz files and remove those that are already done
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set([basename(x) for x in inFnames])
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if len(todoFnames) == 0:
        logging.info("All input files already converted")
        return

    indexFilename = join(outDir, "%d_index.tab" % updateId)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "tsvFile", "url", "offset"]
    indexFile.write("\t".join(headers))
    indexFile.write("\n")

    # read the per-file indexes and merge them into one big index file
    # with columns articleId, tsvFile, url, offset
    numId = minId
    doneUrls = set()
    for fname in todoFnames:
        baseName = basename(fname)
        for line in gzip.open(fname):
            url, offset = line.rstrip("\n").split("\t")
            assert offset.isdigit()
            if "\t" in url or "\n" in url:
                logging.info("tab or NL in url %s, skipping" % url)
                continue
            if url in doneUrls:
                logging.info("Already did %s" % url)
                continue
            baseName = baseName.replace(".index.gz", ".gz")
            row = [str(numId), baseName, url, offset]
            indexFile.write("\t".join(row))
            indexFile.write("\n")
            doneUrls.add(url)  # record the URL, otherwise the duplicate check above never fires
            numId += 1
    indexFile.close()

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename,
                                              tmpDir,
                                              chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")
    # submit one conversion job per chunk
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname,
                      tmpDir)
    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
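The index file written above is a plain tab-separated table with the header articleId, tsvFile, url, offset. Elsewhere in these examples it is read back with maxCommon.iterTsvRows; as a self-contained sketch that assumes only that header layout, the rows could also be parsed like this (iterIndexRows is a hypothetical helper, not part of pubMunch):

def iterIndexRows(indexFilename):
    """ yield one dict per data row of an <updateId>_index.tab file
    (hypothetical helper; pubMunch itself uses maxCommon.iterTsvRows) """
    fh = open(indexFilename)
    headers = fh.readline().rstrip("\n").split("\t")
    for line in fh:
        fields = line.rstrip("\n").split("\t")
        yield dict(zip(headers, fields))
    fh.close()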
Example #3
    def mapDnaToBed(self, seqs, docId, dbList, outDir):
        """ seqs is a list of (seqId, seq)
        """
        pslDir = pubGeneric.makeTempDir(prefix="geneFinderPsls")
        # blat all (seqId, seq) tuples against each db in dbList; returns a dict db -> psl filename
        dbPslFnames = self.blatClient.blatSeqs(dbList, seqs, pslDir)
        for db, fname in dbPslFnames.iteritems():
            oneBed = join(outDir, "chained.%s.bed" % db)
            pslFname = join(pslDir, db+".psl")
            dbBedNames = pubMap.chainPslToBed(pslFname, oneBed, pipeSep=True, onlyFields=12)

        if not pubConf.debug:
            shutil.rmtree(pslDir)

        return dbBedNames
Example #4
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside
    with gzip and create an .index.gz for them"""

    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):

    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))
    # index lines
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    # the file line iterator cannot be mixed with tell(), hence the explicit readline() loop
    #for line in ifh:
    while True:
        line = ifh.readline()
        if line=="":
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert len(tmpFnames) == 1
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName+".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
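The offsets written to the .index.gz are byte positions in the uncompressed TSV (they come from ifh.tell() before recompression). Because the data file ends up as plain gzip rather than bgzip, looking up a line by offset still means decompressing up to that point; a minimal sketch, with fetchLineAtOffset being a hypothetical helper:

import gzip

def fetchLineAtOffset(gzFname, offset):
    """ return the line starting at the given uncompressed byte offset
    (hypothetical helper, not part of pubMunch) """
    fh = gzip.open(gzFname, "rb")
    # GzipFile.seek() decompresses forward to the requested position,
    # so this is correct but not constant-time
    fh.seek(offset)
    line = fh.readline()
    fh.close()
    return line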
Example #5
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString is None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
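The .ids.tab file written here is what lets already converted DOIs be skipped on the next run via parseDoneIds. That function is not shown in these examples; a hypothetical version, consistent with the header written above (#articleId, doi, pmid) and with the way doneIds is compared against articleData["doi"], could look like this:

def parseDoneIds(idFname):
    """ return the set of DOIs already listed in an .ids.tab file """
    # illustrative sketch -- the real parseDoneIds is not part of this listing
    doneIds = set()
    for line in open(idFname):
        if line.startswith("#") or line.strip() == "":
            continue
        fields = line.rstrip("\n").split("\t")
        doneIds.add(fields[1])  # second column holds the DOI
    return doneIds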