Code example #1
def toWellMappedContigs(inFastaFile,
                        inTaxonomyWFile,
                        outFastaFile,
                        outFastaMisAssembledFile,
                        outTaxonomyFile,
                        weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain only well-assembled contigs (misassembled contigs are filtered out).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs whose weight is at least this value will be taken
        @return: statistics
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0

    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(
                str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) +
                '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()

    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2), round(totalBp / 1000000, 2), okCount,
        totalCount, round((okBp / totalBp) * 100, 2),
        round((okCount / totalCount) * 100, 2), round(avgSumBp / okBp, 3))
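A minimal usage sketch for the function above; the file names and the threshold are hypothetical placeholders, and the csv / fas helper modules used throughout these examples are assumed to be importable:

stats = toWellMappedContigs(
    'contigs.fna',           # input contigs
    'contigs.taxw',          # seqId \t weight \t taxonId (as in the docstring)
    'contigs_ok.fna',        # well assembled contigs
    'contigs_mis.fna',       # misassembled contigs
    'contigs_ok.tax',        # seqId \t taxonId of the kept contigs
    weightThreshold=0.95)    # hypothetical: keep contigs with >= 95% read support
print(stats)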
Code example #2
def removeEntries(mg):
    """
        Removes sequences from the marker gene files at a given taxonomic level (e.g. species, genus, family).
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'
    srcFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg +
        '_bact+arch_dnaV.tax')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
        + mg + '_bact+arch_dnaV.tax')
    out = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set()
    removeSetIds = set()
    removed = 0
    for s in removeSet:
        if s != '':
            removeSetInt.add(int(s))
    names = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    taxonPaths = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for name, taxonPath in zip(names, taxonPaths):
        lineSetInt = set()
        for s in taxonPath.split(';'):
            if s != '':
                lineSetInt.add(int(s))
        if len(removeSetInt.intersection(
                lineSetInt)) > 0:  # the intersection is not empty
            removed += 1
            removeSetIds.add(name)
        else:
            out.writeText(str(name + '\t' + taxonPath + '\n'))
    out.close()

    print mg, 'removedEntries', removed

    srcFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg +
        '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
        + mg + '_bact+arch_dnaV.noalign.fna')
    out = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
        else:
            out.writeText(
                str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))

    out.close()

    print mg, 'removedSeq', removed
Code example #3
def _getLabelsCreateFasta():
    """
        Processes the original Mercier dataset with 59 strains. Takes only contigs that were mapped to the
        reference genomes; outputs a fasta file and a mapping file.
    :rtype : None
    """
    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'  #contigs_1000.txt
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'  #contigs_1000_blast_labels.txt
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath,
                                       0,
                                       1,
                                       sep='\t',
                                       comment='#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'  #genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath,
                                      0,
                                      2,
                                      sep=';',
                                      comment='#')

    # to store mapped sequences
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'  #contigsMappedBlast1000.fna
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    # to stored taxonomic mapping of mapped sequences
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'  #contigsMappedBlast1000Labels.txt
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            outFasta.writeText(
                str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))

    outFasta.close()
    print 'fasta created'

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            nameLabel = seqIdToNameLabels[seqId][0]
            ncbid = nameLabelToNcbid[nameLabel][0]
            outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))

    outLabels.close()
    print 'labels created'
Code example #4
def toContigsLabels(inMapFile, outMapFile):
    """
        Creates the label of contigs from the label of reads.

        @param inMapFile: maps contigId to a list of read taxonIds
        @param outMapFile: maps contigId to weight and the most prevalent taxonId
    """
    out = csv.OutFileBuffer(outMapFile)

    for line in csv.getColumnAsList(inMapFile, sep='\n'):
        contigId, taxonIds = str(line).split('\t')
        taxonIdsList = map(int, str(taxonIds).split(','))
        idToCount = {}
        totalCount = 0.0
        for taxonId in taxonIdsList:
            totalCount += 1
            if taxonId in idToCount:
                idToCount[taxonId] += 1
            else:
                idToCount[taxonId] = 1
        pairList = []
        for taxonId, count in idToCount.iteritems():
            pairList.append((taxonId, count))
        pairList.sort(key=lambda x: x[1], reverse=True)
        weight = round(float(pairList[0][1]) / totalCount, 3)
        out.writeText(
            str(contigId) + '\t' + str(weight) + '\t' + str(pairList[0][0]) +
            '\n')

    out.close()
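The weight written above is simply the fraction of the contig's reads that vote for the most frequent taxonId. A self-contained sketch of that majority-vote step, using collections.Counter instead of the manual count dictionary (the read labels are made up):

from collections import Counter

readTaxonIds = [562, 562, 561, 562, 1280]  # hypothetical read taxonIds of one contig
taxonId, count = Counter(readTaxonIds).most_common(1)[0]
weight = round(float(count) / len(readTaxonIds), 3)
print('%s\t%s\t%s' % ('contig1', weight, taxonId))  # contig1	0.6	562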
Code example #5
def scafToContigOutput(scaffContigMapFile, scaffPPSOutFile, contigPPSOutFile):
    """
        Takes scaffold-contigs mapping and scaffold placement (.out file), outputs contigs placement (.out file)

        @param scaffContigMapFile: tab separated scaffold-contigs mapping (scaffoldName \t contigName)
        @param scaffPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
    """
    # init output
    out = csv.OutFileBuffer(contigPPSOutFile)

    # read scaffold predictions
    scaffNameToTaxonId = csv.predToDict(scaffPPSOutFile)

    # read mapping: scaffName -> contigNameList
    scaffNameToContigNameList = csv.getMapping(scaffContigMapFile, 0, 1, sep='\t')

    # store contigs' predictions (according to scaffolds' predictions)
    for scaffName, contigNameList in scaffNameToContigNameList.iteritems():
        taxonId = scaffNameToTaxonId.get(scaffName, None)
        if taxonId is None:
            taxonId = 1
        for contigName in contigNameList:
            out.writeText(contigName + '\t' + str(taxonId) + '\n')
    out.close()
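A hypothetical invocation; the mapping file is expected to contain lines such as "scaff1\tc1", and every contig of a scaffold inherits the scaffold's predicted taxon id (falling back to the root, taxon id 1, for scaffolds without a prediction):

scafToContigOutput('scaff_contig.map', 'scaffolds.PPS.out', 'contigs.PPS.out')  # placeholder paths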
Code example #6
def removeLines(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' +
                      mg + '_bact+arch_dnaV.tax')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
        + mg + '_bact+arch_dnaV.tax')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(
        csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    names = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    taxonPaths = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for name, taxonPath in zip(names, taxonPaths):
        if re.sub(pattern, r'\1', name) not in removeSet:
            out.writeText(str(name + '\t' + taxonPath + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeLines', removed
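The pattern r'.*ncbid:([0-9]+)$' reduces an entry name to its trailing ncbi taxon id; a self-contained check with a made-up entry name:

import re

pattern = r'.*ncbid:([0-9]+)$'
name = 'AB123456-rpsA|ncbid:1280'      # hypothetical marker gene entry name
print(re.sub(pattern, r'\1', name))    # -> 1280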
Code example #7
def findPlasmids(outPlasmidFilePath):
    """
        Reads sequence descriptions from GenBank files (stdin), outputs sequence ids (record.id) if the corresponding
        description contains "plasmid".
        Note that plasmids can also occur within the sequences!
    """
    # append to a file if it already exists
    if os.path.isfile(outPlasmidFilePath):
        outFileMode = 'a'
    else:
        outFileMode = 'w'
    outBuffer = csv.OutFileBuffer(outPlasmidFilePath,
                                  bufferText=False,
                                  fileOpenMode=outFileMode)

    recordCount = 0
    plasmidCount = 0

    for record in SeqIO.parse(sys.stdin, "genbank"):
        recordCount += 1

        if string.find(record.description, 'plasmid') != -1:
            outBuffer.writeText(str(str(record.id) + '\n'))
            plasmidCount += 1

    outBuffer.close()
    print 'file, records, plasmids:', outPlasmidFilePath, recordCount, plasmidCount
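Since the records are read from stdin, the function is meant to be driven from the shell; a hypothetical wrapper script (its name and the paths are placeholders) could look like this:

# find_plasmids.py (hypothetical wrapper); run e.g. as:
#   zcat genomes/*.gbff.gz | python find_plasmids.py
# repeated runs append to the output file thanks to the 'a' open mode above
if __name__ == '__main__':
    findPlasmids('plasmids.txt')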
Code example #8
def removeSequences(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' +
                      mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
        + mg + '_bact+arch_dnaV.noalign.fna')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(
        csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if re.sub(pattern, r'\1', str(seqId)) not in removeSet:
            out.writeText(
                str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeSequences', removed
Code example #9
def generateCladesForGeneralModel(refSeqDir, taxonomyDatabaseFile, rank,
                                  minTotalCount, minBpPerSpeciesCount,
                                  generalModelMaxClades, taxonIdListFile):
    """
        Generates the list of clades (file) to model for the general model

        @param refSeqDir: directory with reference data as needed for PPS
        @param taxonomyDatabaseFile: taxonomy file in the sqlite3 format
        @param rank: the clades will be considered at this rank
        @param minTotalCount: (see config)
        @param minBpPerSpeciesCount: (see config)
        @param generalModelMaxClades: maximum length of the list of the clades.
        @param taxonIdListFile: file to which the ncbi taxon ids will be stored
        @return: the number of the ncbi taxon ids stored in the file
    """
    cladeBpPairList = refToClades(refSeqDir, taxonomyDatabaseFile, rank)
    rs = ref_seq.RefSequences(refSeqDir, taxonomyDatabaseFile)

    cladeList = []
    count = 0
    for clade, bp in cladeBpPairList:
        if rs.isRefSufficient(int(clade), minTotalCount, minBpPerSpeciesCount):
            cladeList.append(int(clade))
            count += 1
        if count >= generalModelMaxClades:
            break

    out = csv.OutFileBuffer(taxonIdListFile)
    for clade in cladeList:
        out.writeText(str(clade) + '\n')
    out.close()
    rs.close()
    return len(cladeList)
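A hypothetical invocation; the paths are placeholders, and the threshold values stand in for the config options mentioned in the docstring:

cladeCount = generateCladesForGeneralModel(
    'reference_NCBI',             # placeholder reference directory
    'ncbitax_sqlite.db',          # placeholder taxonomy database
    'species',
    minTotalCount=3,              # hypothetical threshold
    minBpPerSpeciesCount=300000,  # hypothetical threshold
    generalModelMaxClades=100,
    taxonIdListFile='general_model_clades.txt')
print(cladeCount)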
Code example #10
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    outBuff = csv.OutFileBuffer(outFile)
    namesList = csv.getColumnAsList(inFile,
                                    entryModifyFunction=None,
                                    colNum=0,
                                    sep='\t',
                                    comment='#')
    valCol = 1
    ncbidsList = csv.getColumnAsList(inFile,
                                     entryModifyFunction=None,
                                     colNum=valCol,
                                     sep='\t',
                                     comment='#')

    while True:  # this is not efficient!
        valCol += 1
        tmpList = csv.getColumnAsList(inFile,
                                      entryModifyFunction=None,
                                      colNum=valCol,
                                      sep='\t',
                                      comment='#')
        if len(tmpList) == len(namesList):
            ncbidsList = tmpList
        else:
            break

    header = str('#PPS file transformed to PP format, input file: ' +
                 str(inFile) + '\n#ID' + '\t' + 'root')
    for rank in taxonomicRanks:
        header += str('\t' + rank)
    outBuff.writeText(str(header + '\n'))

    for i in range(len(namesList)):
        name = namesList[i]
        ncbid = ncbidsList[i]
        taxPathDict = taxonomy.getPathToRoot(int(ncbid))
        buff = str(name)
        if taxPathDict is None:
            buff += str('\t')
        else:
            buff += str('\t' + 'root')

        for rank in taxonomicRanks:
            if (taxPathDict is not None) and (rank in taxPathDict) and (
                    not taxPathDict[rank].isCopy()):
                buff += str('\t' + taxPathDict[rank].name)
            else:
                buff += '\t'
        outBuff.writeText(str(buff + '\n'))
    outBuff.close()
    taxonomy.close()
Code example #11
def genomesToMask():
    rank = 'genus'  #which rank will be masked
    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    #outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt' #output file
    #outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt' #output file
    #fileName='/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt' #list of all genome ncbids
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)

    genomeNcbids = csv.getColumnAsList(fileName,
                                       entryModifyFunction=None,
                                       colNum=0,
                                       sep=None,
                                       comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    maskNcbids = []
    #print len(genomeNcbids), genomeNcbids
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print 'root reached!'
                break
        maskNcbids.append(int(ncbid))

    #print len(Set(maskNcbids)), maskNcbids

    maskSet = set(maskNcbids)
    for i in maskSet:
        out2.writeText(str(str(i) + '\n'))

    resultList = []
    for ncbid in maskSet:
        childList = collectChildren(taxonomy, ncbid)
        for i in childList:
            out.writeText(str(str(i) + '\n'))
        print ncbid, childList

    #print taxonomy.childrenNcbids(818) #997888,818

    out.close()
    out2.close()
    taxonomy.close()
Code example #12
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig,
                       community, outMappingFileName):
    """
        Gets mapping from contigIds to lists of taxonIds of individual reads of the contigs.

        @param inFastaFileName:
        @param readsF:
        @param readsR:
        @param readOnContig:
        @param community:
        @param outMappingFileName:
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)

    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig,
                                        1,
                                        0,
                                        sep='\t',
                                        comment='r')

    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print 's1'
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print 's2'

    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print(
            'toContigsLabels: different number of reads in the reads files, exit'
        )
        return

    for i in range(len(readFTaxonIdList))[1:]:
        if readFTaxonIdList[i] != readRTaxonIdList[i]:
            print(
                'toContigsLabels: at index %s different taxon ids %s and %s' %
                (i, readFTaxonIdList[i], readRTaxonIdList[i]))
        if readFTaxonIdList[i] is None or readRTaxonIdList[i] is None:
            print('toContigsLabels: at index %s, one is None %s or %s' %
                  (i, readFTaxonIdList[i], readRTaxonIdList[i]))
    print 's3'
    #
    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            readList = contigIdToReadList[contigId]
            taxonIdList = []
            for readId in readList:
                taxonIdList.append(readFTaxonIdList[int(readId)])
            out.writeText(
                str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print 's4'
Code example #13
File: cami.py, project: CAMI-challenge/evaluation
def concatenate(directory, outputFile):
    out = csv.OutFileBuffer(outputFile)
    for f in os.listdir(directory):
        path = os.path.join(directory, f)
        name = f.split('.')[0]
        seqIdToSeq = fasta.fastaFileToDict(path)
        out.writeText('>' + str(name) + '\n')
        for seqId, seq in seqIdToSeq.iteritems():
            out.writeText(str(seq) + 200*'N' + '\n')
    out.close()
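Each input fasta file becomes a single pseudo-sequence named after the file; the run of 200 'N' characters after every original sequence acts as a spacer so that downstream k-mers or alignments do not span two concatenated sequences. A hypothetical call:

concatenate('genomes_dir', 'concatenated.fna')  # placeholder paths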
Code example #14
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile,
                     notAllowedTaxonIdList):
    outFasta = csv.OutFileBuffer(outFastaFile)
    outTax = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)
    notAllowedTaxonIdSet = set(notAllowedTaxonIdList)
    taxonIdToFilteredSeq = {}
    for taxonId in notAllowedTaxonIdSet:
        taxonIdToFilteredSeq[taxonId] = 0
    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        taxonId = int(seqIdToTaxonId[seqId])
        if taxonId not in notAllowedTaxonIdSet:
            outFasta.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
            outTax.writeText(str(seqId) + '\t' + str(taxonId) + '\n')
        else:
            taxonIdToFilteredSeq[taxonId] += 1
    outFasta.close()
    outTax.close()
    print("filtered taxonId -> seqCount: " + str(taxonIdToFilteredSeq))
Code example #15
def filterOutReads():
    inFasta = ''
    outFasta = ''
    out = csv.OutFileBuffer(outFasta)
    notAllowedSet = set(['BA000019.2'])  # Nostoc sp. PCC 7120
    for seqId, seq in fas.fastaFileToDict(inFasta).iteritems():
        id = re.sub(r'([^_]+)_.*', r'\1', seqId)
        if id not in notAllowedSet:
            out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    out.close()
Code example #16
def ppOut2PPSout():
    inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    out = csv.OutFileBuffer(outFile)

    csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))

    out.close()
Code example #17
def toLongSeq(inFastaFileName, outFastaFileName, minLength=1000):
    """
        Creates a fasta file that contains sequences that are at least minLength long.

        @param inFastaFileName:
        @param outFastaFileName:
        @param minLength:
    """
    out = csv.OutFileBuffer(outFastaFileName)
    for seqId, seq in fas.fastaFileToDictWholeNames(
            inFastaFileName).iteritems():
        if len(seq) >= minLength:
            out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    out.close()
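A hypothetical call that keeps only sequences of at least 1 kb:

toLongSeq('assembly.fna', 'assembly_min1000.fna', minLength=1000)  # placeholder paths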
Code example #18
def samToMap(samFile, accToNcbiFile, outMapFile):
    """

        @param samFile: sam file from an assembler
        @param accToNcbiFile: mapping: accessions -> ncbi taxon ids
        @param outMapFile: output file or directory
    """
    accToNcbi = csv.getMapping(accToNcbiFile, 0, 1, sep='\t')
    contigToAcc = parseSam(samFile)
    out = csv.OutFileBuffer(outMapFile)
    for contigId, acc in contigToAcc.iteritems():
        taxonId = accToNcbi.get(acc, None)
        if taxonId is None:
            print("No mapping for %s %s" % (contigId, acc))
        else:
            out.writeText(contigId + '\t' + taxonId[0] + '\n')
    out.close()
Code example #19
def refToClades(refDir, taxonomyFile, rank='species', outFile=None):
    """
        Returns (and optionally stores) a list of all clades at the given rank, sorted according to the abundance
        of the individual clades, where abundance is measured by the size of the available reference data.

        @param refDir: directory containing reference data (as needed for PPS)
        @param taxonomyFile: ncbi taxonomy in the sqlite3 format
        @param rank: consider clades at this rank
        @param outFile: tab sep file, first column taxon id, second column number of bp (can be None)
        @return: list of tuples (clade, bp)
    """
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyFile)
    cladeNcbiToBp = {}
    for fileName in os.listdir(refDir):
        size = os.path.getsize(os.path.join(refDir, fileName))
        ncbid = int(fileName.rsplit('.', 2)[0])
        current = ncbid
        while (current is not None) and (taxonomy.getRank(current) != rank):
            current = taxonomy.getParentNcbid(int(current))
        if current is not None:
            if current in cladeNcbiToBp:
                cladeNcbiToBp[current] += size
            else:
                cladeNcbiToBp[current] = size
        else:
            print(
                'There is no ncbi taxon id defined at rank %s for ncbi taxon id %s'
                % (rank, ncbid))
    taxonomy.close()

    tuples = []
    for ncbid, size in cladeNcbiToBp.iteritems():
        tuples.append((ncbid, size))
    tuples.sort(key=lambda x: x[1], reverse=True)

    if outFile is not None:
        out = csv.OutFileBuffer(outFile)
        for t in tuples:
            out.writeText(str(t[0]) + '\t' + str(t[1]) + '\n')
        out.close()

    return tuples
Code example #20
def getFirstLabelAtAllowedRank():
    rank = 'species'  # !!!!!!!

    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)

    taxonomy = tax.TaxonomyNcbi(
        '/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db'
    )

    for seqId in seqIdToLabel:
        ncbid = int(seqIdToLabel[seqId][0])
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(str(seqId + '\t' + str(ncbid) + '\n'))

    taxonomy.close()
    outPred.close()
Code example #21
File: cami.py, project: CAMI-challenge/evaluation
def outToCami(ppspOutFile):
    """
        Creates a cami output file, in format:

        #CAMI Format for Binning
        @Task:Binning
        @Version:1.0
        @ContestantID:CONTESTANTID
        @SampleID:SAMPLEID
        @Referencebased:T
        @Assemblybased:T
        @ReplicateInfo:T

        @@SEQUENCEID	TAXID	BINID

        read1201	123	123
        read1202	123	123
        read1203	131564	131564
        read1204	562	562.1
        read1205	562	562.2

    """
    out = csv.OutFileBuffer(ppspOutFile + '.cami')
    out.writeText("""#CAMI Format for Binning
@Task:Binning
@Version:1.0
@ContestantID:CONTESTANTID
@SampleID:SAMPLEID
@Referencebased:T
@Assemblybased:T
@ReplicateInfo:T

@@SEQUENCEID	TAXID	BINID

""")
    for line in open(ppspOutFile):
        name, taxonId = line.strip('\n').split('\t', 2)
        out.writeText("%s\t%s\t%s\n" % (name, taxonId, taxonId))
    out.close()
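A hypothetical call; for a PPS output file 'assignments.PPS.out' this writes 'assignments.PPS.out.cami' with the header shown in the docstring, using the taxon id also as the bin id:

outToCami('assignments.PPS.out')  # placeholder path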
Code example #22
def sortReads(inReadsFile,
              outReadsFile,
              headerToNum=lambda x: int(x.split('_', 2)[1].strip('nr'))):
    i = 0
    seqName = None
    tupleList = []
    for line in csv.getColumnAsList(inReadsFile, sep='\n'):
        if i % 2 == 0:
            seqName = line
        else:
            seq = line
            assert seqName is not None
            tupleList.append(
                (seqName, zlib.compress(seq), headerToNum(seqName)))
            seqName = None
        i += 1
    tupleList.sort(key=lambda x: x[2])

    out = csv.OutFileBuffer(outReadsFile)
    for t in tupleList:
        out.writeText(str(t[0]) + '\n' + str(zlib.decompress(t[1])) + '\n')
    out.close()
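The default headerToNum key assumes headers of the form '>prefix_nr<N>_...' and sorts the reads by that number; a self-contained check with a made-up header:

headerToNum = lambda x: int(x.split('_', 2)[1].strip('nr'))
print(headerToNum('>read_nr42_1'))  # -> 42 (hypothetical header format)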
Code example #23
def createEvalMetaFile(outputDir):

    precisionRecallFile = os.path.join(outputDir, 'precision_recall.csv')
    precisionRecallCorrectionFile = os.path.join(
        outputDir, 'precision_recall_correction.csv')
    confusionMatrixDir = os.path.join(outputDir, 'confusion_matrix')
    consistencyFile = os.path.join(outputDir, 'consistency.txt')

    metaOut = csv.OutFileBuffer(os.path.join(outputDir, 'biobox.yaml'))
    # creates a metafile describing the results

    if os.path.isfile(precisionRecallFile):
        metaOut.writeText('''name: Precision and recall
    type: csv
    value:  %s\n\n''' % precisionRecallFile)

    if os.path.isfile(precisionRecallCorrectionFile):
        metaOut.writeText('''name: Precision and recall with correction
    type: csv
    value: %s\n\n''' % precisionRecallCorrectionFile)

    if os.path.isfile(consistencyFile):
        metaOut.writeText('''name: Consistency
    type: txt
    value: %s\n\n''' % consistencyFile)

    if os.path.isdir(confusionMatrixDir):

        for f in os.listdir(confusionMatrixDir):

            filePath = os.path.join(confusionMatrixDir, f)
            rank = filePath.rsplit('.', 2)[1].split('_')[0]
            metaOut.writeText('''name: Confusion table for %s
    description: Where rows correspond to the true assignments and columns correspond to the assignments by a binning method.
    type: csv
    value: %s\n\n''' % (rank, filePath))

    metaOut.close()
Code example #24
def getSeeds(inSortedFasta, outSeedsFasta):
    """
        @param inSortedFasta: DNA sequences sorted according to the sequence length in the descending order
        @param outSeedsFasta: a fasta file that contains all seeds
    """
    out = csv.OutFileBuffer(outSeedsFasta)
    seedList = []
    seqList = fasta.getSequencesToList(
        inSortedFasta)  # list of (sequenceName, sequence)

    for seqId, seq in seqList:
        seq = string.upper(seq)

        newSeed = True
        for seedSeq in seedList:

            if len(seedSeq) < len(seq):
                continue

            # if bool(re.search(seq, seedSeq, re.I)) or bool(re.search(str(Seq(seq).reverse_complement()), seedSeq, re.I)):
            if seq in seedSeq or str(Seq(seq).reverse_complement()) in seedSeq:
                newSeed = False
                break

        if newSeed:
            # print 'new seed:', seqId
            seedList.append(seq)
            out.writeText(str('>' + seqId + '\n' + seq + '\n'))
        # else:
        #    print 'no seed:', seqId

    out.close()

    print 'total', len(seqList)
    print 'seed count', len(seedList)
    print 'duplicate', (len(seqList) - len(seedList))
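A sequence is considered redundant above if it, or its reverse complement, is a substring of an already accepted (longer) seed; a self-contained illustration of that test using Biopython's Seq (toy sequences):

from Bio.Seq import Seq

seed = 'ACGGTTACGT'        # an already accepted seed (toy data)
candidate = 'ACGTAACC'     # its reverse complement 'GGTTACGT' occurs in the seed
isDuplicate = (candidate in seed) or (str(Seq(candidate).reverse_complement()) in seed)
print(isDuplicate)         # -> True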
Code example #25
def getProfile(readsFFastaFile, communityFile, contigMFastaFile,
               contigLFastaFile, taxonomyMFile, taxonomyDbFile,
               outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile:
        @param communityFile:
        @param contigMFastaFile:
        @param contigLFastaFile:
        @param taxonomyMFile:
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(
            readsFFastaFile, communityFile,
            readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((
            taxonId,
            round(100 * (readCount / float(readTotalCount)), 1),
            round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)),
                  1),
            round(taxonIdToAvgCov.get(taxonId, 0), 2),
            round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank(
                'species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(
            taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText(
        '#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
        + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' +
                  str(round(totalBp / 1000000.0, 2)) + ', ' +
                  str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
Code example #26
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i',
        '--input-data-dir',
        action='store',
        nargs=1,
        required=True,
        help=
        """Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o',
                        '--output-dir',
                        action='store',
                        nargs=1,
                        required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir',
                        dest='outDir')

    parser.add_argument(
        '-s',
        '--source-type',
        required=True,
        nargs=1,
        choices=["s", "a"],
        help=
        'To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t',
        '--taxonomy-file',
        nargs=1,
        type=file,
        required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(
                set(map(int,
                        str(args.filterOut[0]).split(','))))
    except:
        print(
            'Taxon ids that are to be filtered out are in the wrong format! Comma separated integers are needed!'
        )
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for dir in [inDir, outDir]:
        assert os.path.isdir(dir), 'Path: "' + dir + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(
            os.path.join(os.path.normpath(inDir),
                         r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit(
            '.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(
            os.path.join(os.path.dirname(mapFilePath),
                         (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' +
                    str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(
                str('%s: The mapping file and the corresponding fasta file have different number of entries: '
                    + '"%s" "%s" these files will be skipped!') %
                (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print(
                '%s: The mapping file contained duplicates! unique: %s non-unique: %s'
                % (base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' %
                      (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print(
            'Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s'
            % (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print(
                'WARN: stored %s non Bacterial and non Archaeal sequences'
                % noBacArch)

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
Code example #27
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that corresponds to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(
                    d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(
                        taxonId) + '|' + dName + '|' + seqId + '|label:' + str(
                            taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(
            os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile +
            ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(
            predictCmd,
            shell=True,
            bufsize=-1,
            cwd=ppsInstallDir,
            stdout=logOut,
            stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(
            set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split(
                '|',
                2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
Code example #28
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contains all sequences
        mapped to this taxonId. If a seqId appears more than once it is ignored, since
        accession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath,
                                          0,
                                          1,
                                          sep='\t',
                                          comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(
                    os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(
                str('>' + seqId + '\n' + seq + '\n'))
            storedSeqCount += 1

            if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    # close the output buffers only after all input files have been processed,
    # since a buffer may receive sequences from several input files
    for buff in taxonIdToOutBuffer.values():
        buff.close()

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'
Code example #29
    def generateConfusionMatrix(self, rank, prefixOutputPath):
        """
            Generates confusion matrix at given rank.
            The object must have been initialized considering this rank.

            @param prefixOutputPath: prefix of the output file path
        """
        if self._initFailed:
            return
        rankId = self._taxonomy.getRankId(rank)
        if rankId not in self._allowedRankIdsSet:
            print("Can't consider rank: " + rank)
            return
        if not os.path.isdir(os.path.dirname(prefixOutputPath)):
            print(
                "Output prefix is wrong, the corresponding directory doesn't exist: "
                + os.path.dirname(prefixOutputPath))
            return

        # entries of the confusion matrix
        tableCountMap = {}  # (taxonId_ref, taxonId_pred) -> count
        tableBpMap = {}  # (taxonId_ref, taxonId_pred) -> bp

        # predictions (and reference) at the given rank
        seqNameToPred = self._rankIdToPredMap[rankId]
        seqNameToRef = self._rankIdToRefMap[rankId]
        predTaxonIdSet = set()
        refTaxonIdSet = set()

        # fill in entries of the confusion matrix
        for seqId, bp in self._seqNameToBp.iteritems():
            predId = seqNameToPred.get(seqId, None)
            refId = seqNameToRef.get(seqId, None)
            if predId is not None:
                predTaxonIdSet.add(
                    predId)  # stores it only if it's predicted at this rank
            if refId is not None:
                refTaxonIdSet.add(refId)
            key = (refId, predId)
            if key not in tableCountMap:
                tableCountMap[key] = 1
                tableBpMap[key] = bp
            else:
                tableCountMap[key] += 1
                tableBpMap[key] += bp

        # get taxonIds contained in prediction and reference prediction, common for both, unique for pred. and ref.
        commonTaxonIdSet = predTaxonIdSet.intersection(refTaxonIdSet)
        uniquePredIdSet = predTaxonIdSet.difference(commonTaxonIdSet)
        uniqueRefIdSet = refTaxonIdSet.difference(commonTaxonIdSet)

        # get taxonIds contained in predictions and reference predictions as lists of scientific names
        commonNames, commonMap = self._taxonomy.getSortedScientificNames(
            commonTaxonIdSet)
        uniquePredNames, uniquePredMap = self._taxonomy.getSortedScientificNames(
            uniquePredIdSet)
        uniqueRefNames, uniqueRefMap = self._taxonomy.getSortedScientificNames(
            uniqueRefIdSet)

        # headers
        predHeader = commonNames + uniquePredNames + ['unassigned']  # predictions
        refHeader = commonNames + uniqueRefNames + ['unassigned']  # reference
        predHeaderTaxonIds = []
        refHeaderTaxonIds = []
        for name in commonNames:
            id = commonMap[name]
            predHeaderTaxonIds.append(id)
            refHeaderTaxonIds.append(id)
        for name in uniquePredNames:
            predHeaderTaxonIds.append(uniquePredMap[name])
        for name in uniqueRefNames:
            refHeaderTaxonIds.append(uniqueRefMap[name])
        predHeaderTaxonIds.append(None)  # predicted as unassigned
        refHeaderTaxonIds.append(None)  # unassigned in reference

        # count matches
        matchCount = 0
        matchBp = 0
        for taxonId in commonTaxonIdSet:
            count = tableCountMap.get((taxonId, taxonId), None)
            if count is not None:
                bp = tableBpMap.get((taxonId, taxonId), None)
                assert bp is not None
                matchCount += count
                matchBp += bp

        # count mismatches
        mismatchCount = 0
        mismatchBp = 0
        for predTaxonId in predHeaderTaxonIds[:-1]:
            for refTaxonId in refHeaderTaxonIds[:-1]:
                if predTaxonId == refTaxonId:
                    continue
                assert (predTaxonId is not None) and (refTaxonId is not None)
                count = tableCountMap.get((refTaxonId, predTaxonId), None)
                if count is not None:
                    bp = tableBpMap.get((refTaxonId, predTaxonId), None)
                    assert bp is not None
                    mismatchCount += count
                    mismatchBp += bp

        # count pred total, ref total
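        # (a sequence counts as assigned on an axis if its taxonId there is not None,
        # regardless of its assignment on the other axis)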
        predTotalCount = 0
        predTotalBp = 0
        refTotalCount = 0
        refTotalBp = 0
        for predTaxonId in predHeaderTaxonIds:
            for refTaxonId in refHeaderTaxonIds:
                count = tableCountMap.get((refTaxonId, predTaxonId), None)
                if count is None:
                    continue
                bp = tableBpMap.get((refTaxonId, predTaxonId), None)
                assert bp is not None
                if predTaxonId is not None:
                    predTotalCount += count
                    predTotalBp += bp
                if refTaxonId is not None:
                    refTotalCount += count
                    refTotalBp += bp

        # total
        # total
        totalCount = len(self._seqNameToBp)
        totalBp = sum(self._seqNameToBp.values())

        # write the confusion matrix to a file
        out = csv.OutFileBuffer(
            os.path.normpath(prefixOutputPath + '.' + str(rank) + '_cmp.csv'))

        header = 'ref/pred'
        for e in predHeader:
            header += ', ' + e
        out.writeText(header + '\n')

        for i in range(len(refHeaderTaxonIds)):
            line = refHeader[i]
            refTaxonId = refHeaderTaxonIds[i]
            for j in range(len(predHeaderTaxonIds)):
                predTaxonId = predHeaderTaxonIds[j]
                count = tableCountMap.get((refTaxonId, predTaxonId), None)
                line += ', '
                if count is not None:
                    bp = tableBpMap.get((refTaxonId, predTaxonId), None)
                    assert bp is not None
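                    # cell format: "<bp in kb>k (<seq. count>)", e.g. "123k (45)"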
                    line += '%sk (%s)' % (int(round(float(bp) / 1000.0)), count)
            out.writeText(line + '\n')

        out.writeText(',\n')
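        # summary rows: label, bp (in kb), seq. count, bp percentage ('%k' suffix), seq. count percentage ('%' suffix)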
        out.writeText('Matches, ' + str(int(round(float(matchBp) / 1000.0))) +
                      'k, ' + str(matchCount) + ', ' +
                      self._div(matchBp, matchBp + mismatchBp, 1) + ' %k' +
                      ', ' +
                      self._div(matchCount, matchCount + mismatchCount, 1) +
                      ' %\n')

        out.writeText('Mismatches, ' +
                      str(int(round(float(mismatchBp) / 1000.0))) + 'k, ' +
                      str(mismatchCount) + ', ' +
                      self._div(mismatchBp, matchBp + mismatchBp, 1) + ' %k' +
                      ', ' +
                      self._div(mismatchCount, matchCount + mismatchCount, 1) +
                      ' %\n')

        out.writeText('Pred. assigned, ' +
                      str(int(round(float(predTotalBp) / 1000.0))) + 'k, ' +
                      str(predTotalCount) + ', ' +
                      self._div(predTotalBp, totalBp, 1) + ' %k, ' +
                      self._div(predTotalCount, totalCount, 1) + ' %\n')

        out.writeText('Ref. assigned, ' +
                      str(int(round(float(refTotalBp) / 1000.0))) + 'k, ' +
                      str(refTotalCount) + ', ' +
                      self._div(refTotalBp, totalBp, 1) + ' %k, ' +
                      self._div(refTotalCount, totalCount, 1) + ' %\n')

        out.writeText('Total fasta, ' +
                      str(int(round(float(totalBp) / 1000.0))) + 'k, ' +
                      str(totalCount) + '\n')
        out.close()
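
For reference, a minimal sketch of how one cell of the generated <prefix>.<rank>_cmp.csv matrix could be parsed back into approximate base pairs and a sequence count. The helper name parseCmpCell and the use of the re module are assumptions for illustration, not part of the original module; only the "<bp in kb>k (<seq. count>)" cell format is taken from the writer code above.

import re

# matches one confusion matrix cell, e.g. "123k (45)"; empty cells do not match
_CELL_PATTERN = re.compile(r'^\s*(\d+)k \((\d+)\)\s*$')


def parseCmpCell(cell):
    """
        Parses one confusion matrix cell (a sketch, assuming the
        "<bp in kb>k (<seq. count>)" cell format written above).

        @return: tuple (approximate bp, sequence count) or None for an empty cell
    """
    match = _CELL_PATTERN.match(cell)
    if match is None:
        return None  # empty cell: no sequence maps to this (reference, prediction) pair
    return int(match.group(1)) * 1000, int(match.group(2))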
Code example #30
def maskDb(action,
           inDir,
           outDir,
           rank,
           clades,
           taxonomyFilePath,
           verbose=False):
    """
        Main function (function interface), see module description.

        @param action: the action to perform, one of [cl, mr, mg] ~ (generate a clade list, mask reference sequences, mask marker genes)
        @type action: str
        @param inDir: directory containing the input files
        @type inDir: str
        @param outDir: directory containing the output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing the clades that will be masked (one ncbi taxon id per line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: print progress messages
        @type verbose: bool
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exist: " + dirPath)
    assert rank in _RANKS, str('Unsupported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str(
        "Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), \
        str("Parameter 'clades' can be either a file path or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i - 1]] = _RANKS[i]
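    # e.g. if _RANKS ended with [..., 'genus', 'species'], then toLowerRank['genus'] == 'species'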

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)

    # leaf clades to mask
    if isinstance(clades, set):
        inCladesSet = set(map(int, clades))
    else:
        inCladesSet = set(map(int, csv.getColumnAsList(clades)))

    # clades in the reference
    refCladesSet = set()
    if action in ['cl', 'mr']:
        # get the list of all taxon ids that appear in the directory (as PPS reference)
        for fastaFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir),
                             r'*.f[na][as]')):  # *.fas or *.fna
            refCladesSet.add(_refFilePathToTaxonId(
                fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
    elif action in ['mg']:
        # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
        for mapFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir), r'*.tax')):  # *.tax
            refCladesSet.update(
                set(
                    map(_mgSeqIdToTaxonId,
                        csv.getColumnAsList(mapFilePath, sep='\t'))))
    else:
        assert False, str('Unsupported action: ' + action)

    # checks whether taxonIds are in the taxonomy
    for taxonId in inCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from clades list is not contained in the taxonomy!' %
            taxonId)
    for taxonId in refCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from the reference is not contained in the taxonomy!'
            % taxonId)

    # check whether the taxonIds are leaves (they don't have to be, unless you want to mask at the strain level)
    for taxonId in inCladesSet:
        if not taxonomy.isLeaf(taxonId):
            print(
                'Taxon id %s does not represent a leaf clade in the taxonomy.'
                % taxonId)

    if verbose:
        print('Initial checks done.')

    # taxonIds that should be excluded
    toExcludeSet = set()
    for taxonId in inCladesSet:
        taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
        if taxonIdAtRank is None:  # the lineage is not defined at this rank; try a lower rank
            print('Taxon id: "%s" is not defined at rank: "%s"' %
                  (taxonId, rank))
            currentRank = rank  # find a lower rank at which it's defined
            while currentRank in toLowerRank:
                currentRank = toLowerRank[currentRank]
                taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                if taxonIdAtRank is not None:
                    break
            if taxonIdAtRank is None:
                taxonIdAtRank = taxonId
                currentRank = _STRAIN
            print('Taxon id: %s will be masked at rank: %s' %
                  (taxonId, currentRank))

        # all child clades (and itself)
        toExcludeSet.add(int(taxonIdAtRank))
        toExcludeSet.update(
            set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

    # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
    toExcludeSet.intersection_update(refCladesSet)
    if verbose:
        print('Data to mask collected.')

    print('To exclude: %s' % len(toExcludeSet))

    # exclude data from the reference
    if action == 'cl':
        # generates a list of taxonIds
        out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
        for taxonId in toExcludeSet:
            out.writeText(str(taxonId) + '\n')
        out.close()
    elif action == 'mr':
        # mask the reference sequences (create symlinks to the files that were not excluded)
        for fastaFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir),
                             r'*.f[na][as]')):  # *.fas or *.fna
            taxonId = _refFilePathToTaxonId(
                fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
            if taxonId not in toExcludeSet:
                # assert os.name == 'posix'
                os.symlink(
                    fastaFilePath,
                    os.path.join(outDir, os.path.basename(fastaFilePath)))
    elif action == 'mg':
        # exclude sequences from the marker gene databases
        for mapFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir), r'*.tax')):

            # get entries that can stay in the mapping and fasta files
            allowedEntriesSet = set(
                map(_mgSeqIdToTaxonId,
                    csv.getColumnAsList(mapFilePath, sep='\t')))
            allowedEntriesSet.difference_update(toExcludeSet)

            # filter out entries from the mapping file
            csv.filterOutLines(mapFilePath,
                               os.path.join(outDir,
                                            os.path.basename(mapFilePath)),
                               allowedEntriesSet,
                               entryModifyFunction=_mgSeqIdToTaxonId,
                               colNum=0,
                               sep='\t')

            # filter out entries from the fasta file
            fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
            fas.filterOutSequences(fastaFilePath,
                                   os.path.join(
                                       outDir,
                                       os.path.basename(fastaFilePath)),
                                   allowedEntriesSet,
                                   seqNameModifyFunction=_mgSeqIdToTaxonId)
    else:
        assert False, 'Unsupported action!'

    taxonomy.close()
    if verbose:
        print('Data masking done.')
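
A hedged usage sketch of maskDb: all paths below are hypothetical placeholders, and it assumes 'genus' is among _RANKS; the taxon ids are real NCBI examples. It masks two leaf clades, together with everything that maps to the same genus, from a marker gene database.

if __name__ == '__main__':
    # hypothetical paths, for illustration only
    maskDb(action='mg',
           inDir='/path/to/markerGenes/db',
           outDir='/path/to/markerGenes/db_masked',
           rank='genus',
           clades=set([511145, 224308]),  # e.g. E. coli K-12 MG1655, B. subtilis 168
           taxonomyFilePath='/path/to/ncbitax_sqlite.db',
           verbose=True)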