Example #1
def removeSequences(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(
        csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if re.sub(pattern, r'\1', str(seqId)) not in removeSet:
            out.writeText(
                str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeSequences', removed
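
A minimal usage sketch; the marker gene names below are illustrative assumptions, not values taken from the pipeline configuration:

# Hypothetical driver, assuming Amphora-style marker gene names.
if __name__ == '__main__':
    for markerGene in ['dnaG', 'infC', 'rpoB']:
        removeSequences(markerGene)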
Example #2
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contain all sequences
        mapped to this taxonId. If a seqId appears more than one it is ignored since
        acession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(str('>' + seqId + '\n' + seq + '\n'))
            storedSeqCount += 1

            if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    # close the output buffers only after all input files were processed,
    # otherwise sequences of a taxonId spanning several files would be lost
    for buff in taxonIdToOutBuffer.values():
        buff.close()

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'
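
A sketch of how mergeSequences might be invoked; it assumes each mapping file holds tab separated "seqId<TAB>taxonId" lines and is paired with a fasta file in the same order (paths are illustrative):

# Illustrative paths; one '<taxonId>.fna' file per taxonId is created in 'merged/'.
mergeSequences(['db/part1.tax', 'db/part2.tax'],
               ['db/part1.fna', 'db/part2.fna'],
               'merged')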
Example #3
def concatenate(directory, outputFile):
    out = csv.OutFileBuffer(outputFile)
    for f in os.listdir(directory):
        path = os.path.join(directory, f)
        name = f.split('.')[0]
        seqIdToSeq = fasta.fastaFileToDict(path)
        out.writeText('>' + str(name) + '\n')
        for seqId, seq in seqIdToSeq.iteritems():
            out.writeText(str(seq) + 200*'N' + '\n')
    out.close()
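
The 200 'N' spacer presumably prevents downstream tools from treating the artificial joins as real sequence; a usage sketch with illustrative paths:

# Each input fasta becomes one pseudo-contig named after the file
# (everything before the first '.'), its sequences joined by runs of 200 Ns.
concatenate('genomesByTaxonId', 'allGenomesConcat.fna')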
Example #4
def filterOutReads():
    inFasta = ''
    outFasta = ''
    out = csv.OutFileBuffer(outFasta)
    notAllowedSet = set(['BA000019.2'])  # Nostoc sp. PCC 7120
    for seqId, seq in fas.fastaFileToDict(inFasta).iteritems():
        id = re.sub(r'([^_]+)_.*', r'\1', seqId)
        if id not in notAllowedSet:
            out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    out.close()
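
A note on the id extraction above, assuming reads are named "<accession>_<readNo>":

# re.sub(r'([^_]+)_.*', r'\1', 'BA000019.2_001234') -> 'BA000019.2',
# i.e. the accession prefix before the first underscore is what gets
# compared against notAllowedSet.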
Example #9
def removeEntries(mg):
    """
        Removes sequences from the marker gene files at the level from species, genus, family etc.
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg + '_bact+arch_dnaV.tax')
    out = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set()
    removeSetIds = set()
    removed = 0
    for s in removeSet:
        if s != '':
            removeSetInt.add(int(s))
    col0List = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1List = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for col0, col1 in zip(col0List, col1List):
        lineSetInt = set()
        for s in col1.split(';'):
            if s != '':
                lineSetInt.add(int(s))
        if len(removeSetInt.intersection(lineSetInt)) > 0: #the intersection is not empty
            removed += 1
            removeSetIds.add(col0)
        else:
            out.writeText(str(col0 + '\t' + col1 + '\n'))
    out.close()

    print mg, 'removedEntries', removed

    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    out = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
        else:
            out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))

    out.close()

    print mg, 'removedSeq', removed
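
A sketch of the "*.tax" input format the intersection test above assumes: column 0 holds the sequence id, column 1 a semicolon separated path of NCBI taxon ids (the line content below is illustrative):

# seqId<TAB>taxonomy path, e.g.:
# someMarkerSeq42	131567;2;1224;1236;91347;543;561;562
# The entry is dropped if any taxon id on its path occurs in the remove list.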
Example #10
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile, notAllowedTaxonIdList):
    outFasta = csv.OutFileBuffer(outFastaFile)
    outTax = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)
    notAllowedTaxonIdSet = set(notAllowedTaxonIdList)
    taxonIdToFilteredSeq = {}
    for taxonId in notAllowedTaxonIdSet:
        taxonIdToFilteredSeq[taxonId] = 0
    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        taxonId = int(seqIdToTaxonId[seqId])
        if taxonId not in notAllowedTaxonIdSet:
            outFasta.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
            outTax.writeText(str(seqId) + '\t' + str(taxonId) + '\n')
        else:
            taxonIdToFilteredSeq[taxonId] += 1
    outFasta.close()
    outTax.close()
    print("filtered taxonId -> seqCount: " + str(taxonIdToFilteredSeq))
Example #12
def _getLabelsCreateFasta():
    """
        Processes the original Mercier dataset with 59 strains. Takes only contigs that were mapped to the reference
        genomes. Outputs a fasta file and a mapping file.
    :rtype : None
    """
    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt' #contigs_1000.txt
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt' #contigs_1000_blast_labels.txt
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath, 0, 1, sep='\t', comment = '#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt' #genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath, 0, 2, sep=';', comment = '#')

    # to store mapped sequences
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna' #contigsMappedBlast1000.fna
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    # to store the taxonomic mapping of the mapped sequences
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt' #contigsMappedBlast1000Labels.txt
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            outFasta.writeText(str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))

    outFasta.close()
    print 'fasta created'

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            nameLabel = seqIdToNameLabels[seqId][0]
            ncbid = nameLabelToNcbid[nameLabel][0]
            outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))

    outLabels.close()
    print 'labels created'
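
A sketch of the two mapping files read above, matching the separators passed to csv.getMapping (all line content is illustrative):

# contigs_1000_blast_labels.txt (tab separated): contigId -> genome name
# contig00042	Nostoc_sp_PCC_7120
# genome_list2.txt (';' separated, taxon id in column 2): genome name -> ncbid
# Nostoc_sp_PCC_7120;NC_003272;103690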
Example #13
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i',
        '--input-data-dir',
        action='store',
        nargs=1,
        required=True,
        help=
        """Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o',
                        '--output-dir',
                        action='store',
                        nargs=1,
                        required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir',
                        dest='outDir')

    parser.add_argument(
        '-s',
        '--source-type',
        required=True,
        nargs=1,
        choices=["s", "a"],
        help=
        'To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t',
        '--taxonomy-file',
        nargs=1,
        type=file,
        required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(
                set(map(int,
                        str(args.filterOut[0]).split(','))))
    except ValueError:
        print(
            'The taxon ids to be filtered out are in a wrong format, comma separated integers are required!'
        )
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for d in [inDir, outDir]:
        assert os.path.isdir(d), 'Path: "' + d + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(
            os.path.join(os.path.normpath(inDir),
                         r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit(
            '.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(
            os.path.join(os.path.dirname(mapFilePath),
                         (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' +
                    str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(
                str('%s: The mapping file and the corresponding fasta file have a different number of entries: '
                    + '"%s" "%s", these files will be skipped!') %
                (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are there duplicates in the mapping file?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print(
                '%s: The mapping file contained duplicates! unique: %s non-unique: %s'
                % (base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' %
                      (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ' + str(topLevel))

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print(
            'Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s'
            % (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print(
                'WARN: stored %s non-Bacterial and non-Archaeal sequences'
                % (noBacArch))

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print 'runs only on posix systems'
        return

    #parse arguments
    parser = argparse.ArgumentParser(description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
                                     epilog='''''')

    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    pirsParam = ''
    if args.p:
        pirsParam = args.p[0]

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    #check that the optional pIRS parameters don't contain those predefined elsewhere (e.g. in the config)
    if (string.count(pirsParam,'-m') != 0 or string.count(pirsParam,'-v') != 0 or string.count(pirsParam,'-l') != 0
        or string.count(pirsParam,'-x') != 0 or string.count(pirsParam,'-i') != 0 or string.count(pirsParam,'-o') != 0):
        print 'pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set '
        return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir,'tmp')
    if not os.path.isdir(workingDir):
        print str('The working directory does not exist, create it first! (' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment = '#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir,'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir,'reads_2.fq'))

    for seqName in seqNameToFreq:
        seq = seqNameToSeq[seqName]
        coverage = float(seqNameToFreq[seqName][0])*coverageFrequencyMultiplier

        fastaFile = os.path.join(tmpDir,str(seqName + '.fna'))
        outBuffer = OutFileBuffer(fastaFile)
        outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
        outBuffer.close()

        cmd = str(os.path.join(pirsInstallDir,'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage) +
                  ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength)
                  + ' -o ' + seqName + ' ' + pirsParam)
        #print cmd
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)# stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
        proc.wait()
        if proc.returncode != 0:
            sys.stderr.write(str('command failed: ' + cmd))

        #append generated reads to the merged files
        reads1 = gzip.open(os.path.join(tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_1.fq.gz')), 'rb')
        file1Content = reads1.read()
        outReads1Merged.writeText(str(file1Content.replace('@read_',str('@read_' + seqName + '_')) + '\n'))
        reads1.close()

        reads2 = gzip.open(os.path.join(tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_2.fq.gz')), 'rb')
        file2Content = reads2.read()
        outReads2Merged.writeText(str(file2Content.replace('@read_',str('@read_' + seqName + '_')) + '\n'))
        reads2.close()

    outReads1Merged.close()
    outReads2Merged.close()
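
A minimal configuration sketch covering the keys read above; the section name 'Sim' follows Config(args.config[0], 'Sim') and all values are illustrative assumptions:

# configMetagenome.cfg
# [Sim]
# workingDir=/tmp/simMetagenome
# referenceSeq=/tmp/referenceGenomes.fna
# frequenciesInfo=/tmp/frequencies.tsv
# coverageFrequencyMultiplier=100.0
# pirsInstallDir=/opt/pirs
# insertSizeMean=500
# insertSizeSd=25
# readLength=100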
Example #17
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(
                    d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(
            os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile +
            ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(
            predictCmd,
            shell=True,
            bufsize=-1,
            cwd=ppsInstallDir,
            stdout=logOut,
            stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(
            set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
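
For orientation, the training sequence ids composed when merging the fasta files above have this shape (a summary of the code, placeholders in angle brackets):

# sample specific dir entries: '<taxonId>|<dirName>|<seqId>|label:<taxonId>'
# other training dir entries:  '<taxonId>|<dirName>|<seqId>'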
Example #19
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.
        """
        if self._initDone:
            return
        self._initDone = True

        fastaPathList = [] # fasta files containing regions that correspond to particular marker genes
        self._mgList = [] # list of names of marker genes
        mgToFastaPath = dict([]) # marker gene name -> fasta file path

        #collect regions from Amphora mg
        for fastaFile in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir),'*.gff')):
            fastaPathList.append(fastaFile)
        for path in fastaPathList:
            name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        #add 16S
        s16List = ['5S_rRNA', '16S_rRNA', '23S_rRNA']
        for mg in s16List:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        #For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        mgToSeqNameToTaxPathDict = dict([]) #mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        for seq in self._sequences.sequences:
            id = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg,tag,pred in zip(seq.getCandidateTaxPathSourceList(), seq.getCandidateTaxPathTagList(),
                                    seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(id + '_' + tag)] = pred

        #for each marker gene: choose only one sequence region for each mg and sequence
        #all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[mg] #sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(mgToFastaPath[mg]) #read the fasta file
            outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            seqBaseToSeqName = dict([]) # sequence base (scaffId_seqId) -> region name
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*',r'\1', seqName)
                if seqBase not in seqBaseToSeqName:
                    seqBaseToSeqName[seqBase] = []
                seqBaseToSeqName[seqBase].append(seqName)
            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)',r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()
                regionNames = seqBaseToSeqName[seqBase]
                candidateSeq = [] # sequence region is predicted at least at rank superkingdom
                for seqName in regionNames:
                    if seqName not in seqNameToPred:
                        taxPathDict = None
                    else:
                        taxPathDict = seqNameToPred[seqName]
                    if taxPathDict is not None:
                        candidateSeq.append(seqName)
                if len(candidateSeq) == 0:
                    continue
                candidateSeq2 = [] # sequence regions predicted at least at the same rank as the whole sequence
                for seqName in candidateSeq:
                    taxPathDict = seqNameToPred[seqName]
                    if ((seqBaseTaxPathDict == None)
                        or (len(taxPathDict) >= len(seqBaseTaxPathDict))): #predict at least at the same level
                        candidateSeq2.append(seqName)
                if len(candidateSeq2) > 0: #take the longest sequence
                    sMax = candidateSeq2[0]
                    for s in candidateSeq2[1:]:
                        if len(seqNameToSeq[s]) > len(seqNameToSeq[sMax]):
                            sMax = s
                else: #all sequence regions are predicted higher than the sequence
                    sMax = candidateSeq[0] #sequence region with the most specific prediction
                    for s in candidateSeq[1:]:
                        taxPathDictMax = seqNameToPred[sMax]
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictS == None:
                            continue
                        if taxPathDictMax == None:
                            sMax = s
                            continue
                        if len(taxPathDictMax) < len(taxPathDictS):
                            sMax = s

                    candidateSeq3 = [] #get all sequence regions with the most specific prediction
                    taxPathDictMax = seqNameToPred[sMax]
                    for s in candidateSeq:
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictMax == None:
                            candidateSeq3.append(s)
                        elif len(taxPathDictS) == len(taxPathDictMax):
                            candidateSeq3.append(s)
                    sMax = candidateSeq3[0]
                    for s in candidateSeq3[1:]: #take the longest sequence
                        if len(seqNameToSeq[sMax]) < len(seqNameToSeq[s]):
                            sMax = s

                out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))

            out.close()

        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

        #build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                + ' -out ' + mgToAlignPath[mg] + ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1) #stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
                predictProc.wait()
                print 'Muscle return code for', mg, ':', predictProc.returncode
                if predictProc.returncode != 0:
                    sys.stderr.write(str(alignCmd + ' \n'))

        #compute DM
        if dm:
            for mg in self._mgList:
                mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                                + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code dist:', mg, mothurProc.returncode
                #distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                #self._mgToDM[mg] = forEachLine(distFilePath, DM())
                #self._mgToDM[mg].printDM()

        #cluster
        if cluster:
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                                + ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code cluster:', mg, mothurProc.returncode
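        # The align, dist.seqs, and cluster steps above repeat the same
        # Popen/wait/returncode pattern; a hedged sketch of a shared helper
        # (the name runCmd is hypothetical):
        #
        # def runCmd(cmd, cwd):
        #     assert os.name == 'posix'
        #     proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=cwd)
        #     proc.wait()
        #     if proc.returncode != 0:
        #         sys.stderr.write(str('command failed: ' + cmd + '\n'))
        #     return proc.returncode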

        #read DM and clusters

        #sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            id = int(seq.id)
            self._seqIdToTaxPathDict[id] = seq.getTaxonomyPath()
            self._seqIdToWeight[id] = seq.getTaxonomyPathWeight()

        #similarity thresholds
        thresholdsFilePath = self._configMG.get('mgSimilarityThresholds')
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(thresholdsFilePath, 0, 1, sep='\t', comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            filePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(filePath, DM())
            filePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.fn.list'))
            self._mgToCluster[mg] = forEachLine(filePath, MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
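        # Both files are parsed via forEachLine, which is assumed to feed each line
        # of the file to the handler (DM, MCluster) and return the handler; a
        # minimal sketch of that assumed contract:
        #
        # def forEachLine(filePath, handler):
        #     for line in open(filePath):
        #         handler.parse(line)
        #     return handler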
Example #20
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
        help="""Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.',
        metavar='out_dir',
        dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s","a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.', metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except ValueError:
        print('The taxon ids to be filtered out are in the wrong format! Comma separated integers are required!')
        raise

    taxonomy = TaxonomyWrap(args.taxonomy[0].name)
    for dir in [inDir, outDir]:
        assert os.path.isdir(dir), 'Path: "' + dir + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna'))) # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'
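        # A hedged illustration of the two mapping-line formats handled above
        # (both ids below are made up):
        #
        # amphoraKey = 'gi|15604717|ref|NC_000117.1|ncbid:813'
        # assert int(amphoraKey.rsplit('|', 1)[1].split(':')[1]) == 813
        # silvaRow = 'AB001440.1.1538\tBacteria;...\t562'.split('\t')
        # assert (silvaRow[0], int(silvaRow[2])) == ('AB001440.1.1538', 562)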

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(str('%s: The mapping file and the corresponding fasta file have a different number of entries: ' +
                      '"%s" "%s", these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s, record skipped!' % (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: %s' % topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print('Stored entries: %s, filtered out: %s leaf, %s top level, not mapped: %s' %
              (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print('WARN: stored %s non-bacterial and non-archaeal sequences!' % noBacArch)

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
Example #21
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        The main accuracy doesn't consider training data used to train intermediate (misc) nodes;
        the accuracy of those is reported separately. The training data corresponding to the sample
        specific data are fragmented (via PPS) and included in the training data of the different
        fragment lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir,
              ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()
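    # The composite ids written above have the form
    # '<taxonId>|<trainDirName>|<origSeqId>', with '|label:<taxonId>' appended for
    # the sample specific data (the original PPS fragment ids are assumed to carry
    # a 'label:' suffix already). A hedged round-trip sketch (id made up):
    #
    # seqId = '813|sample_specific_dir|contig7|label:813'
    # label = int(seqId.rsplit('|', 1)[1].split(':', 1)[1])  # -> 813 (cf. the filtering below)
    # dName = seqId.split('|', 2)[1]                         # -> 'sample_specific_dir'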

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                            (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()


def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print 'runs only on posix systems'
        return

    #parse arguments
    parser = argparse.ArgumentParser(
        description=
        '''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')

    parser.add_argument('-c',
                        '--config',
                        nargs=1,
                        type=file,
                        required=True,
                        help='configuration file of the simulator',
                        metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument(
        '-p',
        '--pIRS-param',
        action='store',
        nargs='+',
        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    pirsParam = ''
    if args.p:
        pirsParam = args.p[0]

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(
        config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))
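    # A hedged example of the corresponding '[Sim]' section of the configuration
    # file; the keys match the reads above, all values are made up:
    #
    # [Sim]
    # workingDir=/path/to/workingDir
    # referenceSeq=/path/to/reference.fna
    # frequenciesInfo=/path/to/frequencies.tsv   ; tab separated: seqName <tab> frequency
    # coverageFrequencyMultiplier=100
    # pirsInstallDir=/path/to/pirs
    # insertSizeMean=225
    # insertSizeSd=25
    # readLength=100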

    #check that the optional pIRS parameters don't include options that are predefined elsewhere (e.g. in the config)
    if (string.count(pirsParam, '-m') != 0
            or string.count(pirsParam, '-v') != 0
            or string.count(pirsParam, '-l') != 0
            or string.count(pirsParam, '-x') != 0
            or string.count(pirsParam, '-i') != 0
            or string.count(pirsParam, '-o') != 0):
        print 'pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set '
        return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print str('The working directory does not exist, create it! (' +
                  str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))

    for seqName in seqNameToFreq:
        seq = seqNameToSeq[seqName]
        coverage = float(
            seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

        fastaFile = os.path.join(tmpDir, str(seqName + '.fna'))
        outBuffer = OutFileBuffer(fastaFile)
        outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
        outBuffer.close()

        cmd = str(
            os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' +
            fastaFile + ' -x ' + str(coverage) + ' -m ' + str(insertSizeMean) +
            ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength) + ' -o ' +
            seqName + ' ' + pirsParam)
        #print cmd
        proc = subprocess.Popen(
            cmd, shell=True, bufsize=-1,
            cwd=tmpDir)  # stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
        proc.wait()
        if proc.returncode != 0:
            sys.stderr.write(str('command failed: ' + cmd + '\n'))

        #append generated reads to the merged files
        reads1 = gzip.open(
            os.path.join(
                tmpDir,
                str(seqName + '_' + str(readLength) + '_' +
                    str(insertSizeMean) + '_1.fq.gz')), 'rb')
        file1Content = reads1.read()
        outReads1Merged.writeText(
            str(
                file1Content.replace('@read_', str('@read_' + seqName + '_')) +
                '\n'))
        reads1.close()

        reads2 = gzip.open(
            os.path.join(
                tmpDir,
                str(seqName + '_' + str(readLength) + '_' +
                    str(insertSizeMean) + '_2.fq.gz')), 'rb')
        file2Content = reads2.read()
        outReads2Merged.writeText(
            str(
                file2Content.replace('@read_', str('@read_' + seqName + '_')) +
                '\n'))
        reads2.close()

    outReads1Merged.close()
    outReads2Merged.close()