コード例 #1
0
 def readIds(fname):
     if os.path.isfile(fname):
         logging.debug("Reading assigned authorIds from %s" % fname)
         return maxTables.TableParser(fname).column("authorUniqueName")
     else:
         logging.warn("Cannot find %s" % fname)
         return set()
コード例 #2
0
def filter(compBlastFile, genomeWeights, helperTaxons, bestTaxons, geneWeights,
           genomeFeatureFile, geneFeatureFile, bestGenomeFile, bestGeneFile,
           maxDist):
    """Combined genome/gene processing.

    Reads blocks of converted BLAST records (one block per pmcId) from
    compBlastFile, predicts the best genome per document, keeps only hits on
    that genome, chains/disambiguates them and writes genome features, gene
    features, best genes and the best genome to the four output files.

    NOTE(review): this function shadows the builtin ``filter``; renaming it
    would break existing callers, so the name is kept.
    """
    # create field name converter
    fieldParser = maxTables.TableParser(
        fileObj=compBlastFile,
        fileType="blastConvert")
    # split file on first field (=pmcId)
    blockReader = maxTables.BlockReader(compBlastFile, 0)

    bestGenomePredictor = mostMatchesWeightedPredictor(genomeWeights,
                                                       helperTaxons,
                                                       bestTaxons)

    docCount = 0
    for (pmcId, block) in blockReader.readNext():
        docCount += 1
        if docCount % 1000 == 0:
            # use the module-level logger consistently (was logging.info)
            logger.info("Processed %d documents", docCount)
        pmcId = int(pmcId)
        recs = fieldParser.parseBlock(block)
        # keep only best hits
        bestHits = removeNonOptHits(pmcId, recs)
        if len(bestHits) == 0:  # everything matched univec
            logger.debug("docId %s: everything seems to match univec", pmcId)
            continue

        bestGenomeInfo = bestGenomePredictor.getGenomeId(pmcId, bestHits)
        if bestGenomeInfo is None:  # identity check, not == None
            logger.debug("no best genome found in best hits, trying all hits")
            # fall back to ALL hits, not just the best ones
            # TODO(review): original note — shouldn't this be bestHits
            # instead of recs?  Kept as-is to preserve behavior.
            bestGenomeInfo = bestGenomePredictor.getGenomeId(pmcId, recs)
            bestHits = recs
            if bestGenomeInfo is None:
                logger.debug("no best genome found, giving up")
                continue

        # keep only hits from best genome and separate them into genes/genomes
        genomeHits, geneHits = filterSplitHits(bestHits,
                                               bestGenomeInfo.bestGenome)

        # chain and disambiguate chains
        genomeFeatures = disambiguateChains(
            chainHitsGenomes(genomeHits, maxDist=maxDist))
        rawGeneFeatures = disambiguateChains(chainHitsGenes(geneHits))

        # disambiguate genes
        bestGenes, geneFeatures = filterGeneFeatures(rawGeneFeatures,
                                                     geneWeights)

        # write to output files, only if we have some supporting evidence
        if len(genomeFeatures) > 0 or len(bestGenes) > 0:
            writeTsvFeatures(genomeFeatureFile, pmcId, genomeFeatures)
            writeTsvFeatures(geneFeatureFile, pmcId, geneFeatures)
            writeTuples(bestGeneFile, pmcId, bestGenes)
            writeTuples(bestGenomeFile, pmcId, [bestGenomeInfo.bestGenome])
コード例 #3
0
def filterProtein(compBlastFile, proteinTableFilename):
    """Write the best protein (PDB) hit per sequence to a TSV table.

    Reads blocks of converted BLAST records (one block per pmcId) from
    compBlastFile, keeps only the optimal hits per document and writes one
    row per best hit to proteinTableFilename.
    """
    # create field name converter
    fieldParser = maxTables.TableParser(
        fileObj=compBlastFile,
        fileType="blastConvert")
    # split file on first field (=pmcId)
    blockReader = maxTables.BlockReader(compBlastFile, 0)

    # context manager guarantees the output file is flushed and closed,
    # even if parsing raises (was left open in the original)
    with open(proteinTableFilename, "w") as protTable:
        headers = [
            "#documentId", "sequenceId", "best match in PDB", "BLAST score\n"
        ]
        protTable.write("\t".join(headers))

        for (pmcId, block) in blockReader.readNext():
            pmcId = int(pmcId)
            recs = fieldParser.parseBlock(block)
            bestHits = removeNonOptHits(pmcId, recs)
            for bh in bestHits:
                data = [str(bh.pmcId), str(bh.seqId), bh.chrom, str(bh.score)]
                protTable.write("\t".join(data) + "\n")
コード例 #4
0
ファイル: t2gConvert.py プロジェクト: bergmanlab/contemplate
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename,
                      fileFormat):
    """Collect all hit files from blastDirs and subdirs, map genome names to
    taxon ids using genomeToTaxFile, write converted rows to tempFilename and
    sort/uniq them into outFilename.

    fileFormat can be "blast", "blat" or "bwa".
    Output table format: (pmcId, genomeId, seqId, chrom, start, end, score,
    percentId).
    """

    outfh = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map
    if genomeToTaxFile:
        orgToNum = {}
        logger.info("Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info(
            "Expected input fields are: (GenomeName, other field,..., other field, genomeId)"
        )

        # close the map file deterministically (was an unclosed open() iterator)
        with open(genomeToTaxFile) as mapFh:
            for l in mapFh:
                if l.startswith("#"):
                    continue
                fs = l.strip().split("\t")
                genome = fs[0]
                num = int(fs[-1])
                genome = genome.lower().replace(" ", "_")
                orgToNum[genome] = num
    else:
        logger.info("No genome map specified, results are not genome-based")
        orgToNum = None

    dropFileCount = 0
    lineCount = 0
    finishedTaxons = set()
    lastDir = None

    # map file format -> input file extension; raise instead of assert(False)
    # so the check survives `python -O`
    extByFormat = {"blast": ".blast", "blat": ".psl", "bwa": ".sam"}
    if fileFormat not in extByFormat:
        raise ValueError("unknown fileFormat %r" % fileFormat)
    ext = extByFormat[fileFormat]

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" %
                    (ext, blastDir))
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        # convert blast files
        for fname in files:
            # convert organism to taxid, skip if not possible
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname).lower()

            if orgToNum is not None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # try to find any organism from genome list in filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info(
                                "Found orgName %s in filename %s, using organism id %s"
                                % (dbOrg, fname, str(orgNum)))
                            found = True
                            break

                    if not found:
                        # logger.warn is deprecated; warning() is the supported name
                        logger.warning(
                            "warning: could not resolve filename %s to taxid, dropping this file (recognized organism %s)"
                            % (fname, org))
                        dropFileCount += 1
                        continue
            else:
                orgNum = -1

            # check if not already processed AND in different directory (blast creates several indices per directory), skip if yes
            #if orgNum in finishedTaxons and dirname!=lastDir:
            #print("warning: already processed this taxon id %d, skipping input file %s)" % (orgNum, fname))
            #continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            if orgNum != -1:
                logger.info("Reading %s, genomeID %d" % (fname, orgNum))
            else:
                logger.info("Reading %s, not linked to any genome id" %
                            (fname))

            if fileFormat == "bwa":
                tp = maxTables.TableParser(fileType="sam")

            # context manager closes each input file (was leaked per file)
            with open(fname, "r") as f:
                for l in f:
                    lineCount += 1
                    # parse one alignment line into (srcId, trgId, perc,
                    # trgStart, trgEnd, score), depending on fileFormat
                    fs = l.strip().split("\t")
                    if fileFormat == "blast":
                        # example
                        # 11495631        chr1    100.00  23      0       0       1       23      25500772        25500750        2e-05   46.1
                        # NOTE(review): `length` is bound twice in this unpack;
                        # the second (query end) wins.  `length` is not written
                        # to the output, so this is kept unchanged.
                        srcId, trgId, perc, length, dummy, dummy, dummy, length, trgStart, trgEnd, eVal, score = fs
                    elif fileFormat == "blat":
                        # psl-format from http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                        # (matches, misMatches, repMatches, nCount, qNumInsert,
                        # qBaseInsert, tNumInsert, tBaseInsert, strand, qName,
                        # qSize, qStart, qEnd, tName, tSize, tStart, tEnd,
                        # blockCount, blockSizes, qStarts, tStarts)
                        # 23      0       0       0       0       0       0       0       +       11075971|1      2299    2248    2271    scaffold_281    111378  17336   17359   1       23,     2248,   17336,
                        matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts = fs

                        score = matches
                        # percent identity over aligned (non-N) bases
                        perc = "%2.2f" % ((float(matches) + float(repMatches)) /
                                          (float(matches) + float(misMatches) +
                                           float(repMatches)) * 100.0)

                        trgId = tName
                        srcId = qName
                        trgStart = tStart
                        trgEnd = tEnd

                    elif fileFormat == "bwa":
                        if fs[0].startswith("@"):
                            continue  # SAM header line
                        if len(fs) == 11:
                            fs.append("")  # pad missing optional field
                        row = tp.parseTuple(fs)
                        # renamed from `tuple`, which shadowed the builtin
                        bed = maxTables.samToBed(row)
                        if bed is None:
                            continue  # unmapped read

                        chrom, start, end, name, score, strand = bed
                        srcId = name
                        trgId = chrom
                        perc = "0"
                        length = start - end
                        trgStart, trgEnd = start, end

                    else:
                        # unreachable: fileFormat validated above
                        raise ValueError("unknown fileFormat %r" % fileFormat)

                    # normalize coordinates so start <= end
                    trgEnd = int(trgEnd)
                    trgStart = int(trgStart)
                    if trgEnd < trgStart:
                        trgStart, trgEnd = trgEnd, trgStart

                    # srcId looks like "PMC123456.txt|<seqNo>"; split off the
                    # sequence number and strip the PMC prefix/extensions
                    fs = srcId.split("|")
                    srcId = fs[0]
                    srcSeq = fs[1]
                    pmcId = srcId.replace("PMC", "")
                    pmcId = pmcId.replace(".txt", "")
                    pmcId = pmcId.replace(".pdf", "")
                    data = [
                        pmcId,
                        str(orgNum), srcSeq, trgId,
                        str(trgStart),
                        str(trgEnd),
                        str(score),
                        str(perc)
                    ]
                    outfh.write("\t".join(data) + "\n")

    outfh.close()

    logger.info(
        "BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)"
    )
    logger.info(
        "blastConvert : blast files dropped because of unresolvable species name %d, filesDropped=%d"
        % (dropFileCount, dropFileCount))
    logger.info("blastConvert : processed %d blast matches, blastMatches=%d" %
                (lineCount, lineCount))
    logger.info("Now sorting the file with the UNIX sort command")

    # NOTE(review): filenames are interpolated into a shell string; filenames
    # containing shell metacharacters would break or be interpreted by the
    # shell.  Consider subprocess.run with shell=False if callers pass
    # untrusted paths.
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)

    if ret == 0:
        logger.info("Sorting finished, no error")
    else:
        logger.info("Error occured while sorting")