Beispiel #1
0
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        The taxon id of a sequence is taken from the last column that has an entry in every data line of
        the input file. Each output line consists of the sequence name, the entry 'root' (only if a path
        to the root was found for the taxon id) and one entry per given rank: the name of the node at
        this rank, or an empty entry if the rank is not on the path or its node is marked as a copy.

        The output buffer and the taxonomy are closed even if the conversion fails.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
        @raise ValueError: if a taxon id entry cannot be converted to an integer
    """
    def readColumn(colNum):
        # All columns are read the same way: tab separated, '#' passed as the comment marker.
        return csv.getColumnAsList(inFile,
                                   entryModifyFunction=None,
                                   colNum=colNum,
                                   sep='\t',
                                   comment='#')

    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    try:
        outBuff = csv.OutFileBuffer(outFile)
        try:
            namesList = readColumn(0)
            valCol = 1
            ncbidsList = readColumn(valCol)

            # Move on to the last column that is as long as the names column; the file is re-read
            # once per column (this is not efficient!).
            # The loop is skipped for an input without data lines: presumably every column list is
            # empty then as well, so the length test below could never fail (endless loop).
            while namesList:
                valCol += 1
                tmpList = readColumn(valCol)
                if len(tmpList) != len(namesList):
                    break
                ncbidsList = tmpList

            header = '#PPS file transformed to PP format, input file: ' + str(inFile) + '\n'
            header += '\t'.join(['#ID', 'root'] + list(taxonomicRanks)) + '\n'
            outBuff.writeText(header)

            for i, name in enumerate(namesList):
                # Indexing (not zip) keeps the IndexError for lines that lack a taxon id column.
                taxPathDict = taxonomy.getPathToRoot(int(ncbidsList[i]))
                if taxPathDict is None:
                    fields = [str(name), '']
                else:
                    fields = [str(name), 'root']

                for rank in taxonomicRanks:
                    if (taxPathDict is not None) and (rank in taxPathDict) and (
                            not taxPathDict[rank].isCopy()):
                        fields.append(taxPathDict[rank].name)
                    else:
                        fields.append('')
                outBuff.writeText('\t'.join(fields) + '\n')
        finally:
            outBuff.close()
    finally:
        taxonomy.close()
Beispiel #2
0
def main01():
    """
        Ad-hoc driver: converts one hard-coded PPS prediction file (simMC data set) to the PP format.

        The arguments are passed in the order that ppsOut2ppOut declares, i.e.
        (inFile, outFile, taxonomicRanks, databaseFile). No Taxonomy object is created here since
        ppsOut2ppOut opens and closes its own one from the database file.
    """
    taxonomicRanks = 'superkingdom,phylum,class,order,family,genus,species'.split(',')
    databaseFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'

    ppsOut2ppOut('/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.tax',
                 '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.PP.tax',
                 taxonomicRanks, databaseFile)
Beispiel #3
0
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        The taxon id of a sequence is taken from the last column that has an entry in every data line of
        the input file. Each output line consists of the sequence name, the entry 'root' (only if a path
        to the root was found for the taxon id) and one entry per given rank: the name of the node at
        this rank, or an empty entry if the rank is not on the path or its node is marked as a copy.

        The output buffer and the taxonomy are closed even if the conversion fails.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
        @raise ValueError: if a taxon id entry cannot be converted to an integer
    """
    def readColumn(colNum):
        # All columns are read the same way: tab separated, '#' passed as the comment marker.
        return csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=colNum, sep='\t', comment='#')

    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    try:
        outBuff = csv.OutFileBuffer(outFile)
        try:
            namesList = readColumn(0)
            valCol = 1
            ncbidsList = readColumn(valCol)

            # Move on to the last column that is as long as the names column; the file is re-read
            # once per column (this is not efficient!).
            # The loop is skipped for an input without data lines: presumably every column list is
            # empty then as well, so the length test below could never fail (endless loop).
            while namesList:
                valCol += 1
                tmpList = readColumn(valCol)
                if len(tmpList) != len(namesList):
                    break
                ncbidsList = tmpList

            header = '#PPS file transformed to PP format, input file: ' + str(inFile) + '\n'
            header += '\t'.join(['#ID', 'root'] + list(taxonomicRanks)) + '\n'
            outBuff.writeText(header)

            for i, name in enumerate(namesList):
                # Indexing (not zip) keeps the IndexError for lines that lack a taxon id column.
                taxPathDict = taxonomy.getPathToRoot(int(ncbidsList[i]))
                if taxPathDict is None:
                    fields = [str(name), '']
                else:
                    fields = [str(name), 'root']

                for rank in taxonomicRanks:
                    if (taxPathDict is not None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                        fields.append(taxPathDict[rank].name)
                    else:
                        fields.append('')
                outBuff.writeText('\t'.join(fields) + '\n')
        finally:
            outBuff.close()
    finally:
        taxonomy.close()
Beispiel #4
0
def main01():
    """
        Ad-hoc driver: converts one hard-coded PPS prediction file (simMC data set) to the PP format.

        The arguments are passed in the order that ppsOut2ppOut declares, i.e.
        (inFile, outFile, taxonomicRanks, databaseFile). No Taxonomy object is created here since
        ppsOut2ppOut opens and closes its own one from the database file.
    """
    taxonomicRanks = 'superkingdom,phylum,class,order,family,genus,species'.split(
        ',')
    databaseFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'

    ppsOut2ppOut(
        '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.tax',
        '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.PP.tax',
        taxonomicRanks, databaseFile)
Beispiel #5
0
def test():
    """
        Manual test run: calls createGeneDb for the marker gene 'rpsC' with hard-coded local
        directories and a hard-coded taxonomy database file.

        Gene names are not relaxed, no records are skipped and the run does not stop at the first error.
    """
    # other marker genes that were tried here: 'rpsI', 'rpsS', 'rpsK'
    geneName = 'rpsC'
    annotationPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/markerGenes/annotation')
    outputPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/markerGenes/mGenesExtracted')

    rankNames = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    dbPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/ncbiTaxonomy20111007/ncbitax_sqlite.db')
    taxonomy = Taxonomy(dbPath, rankNames)

    # positional arguments: relaxGeneNames=False, recSkipCount=0, firstErrorStop=False
    createGeneDb(geneName, annotationPath, outputPath, taxonomy, False, 0, False)
Beispiel #6
0
def main():
    """
        Command line entry point: parses the arguments, opens the taxonomy and calls createGeneDb.

        Required options: -m (marker gene name), -a (annotation directory), -o (output directory),
        -t (taxonomy database file). Optional: -r (relax gene names), -s (number of records to skip,
        0 if not given), -p (stop after the first error).
    """
    parser = argparse.ArgumentParser(description='Creates database for a marker gene', epilog=' ')

    parser.add_argument('-m', '--marker-gene-name', action='store', nargs=1, required=True,
                        help='The name of a specific marker gene.',
                        dest='marker')
    parser.add_argument('-a', '--annotation-dir', action='store', nargs=1, required=True,
                        help='The name of the directory that contains annotation files.',
                        dest='annotationDir')
    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
                        help='The name of the directory where the output files will be stored.',
                        dest='outDir')
    parser.add_argument('-t', '--taxonomyDb', action='store', nargs=1, required=True,
                        help='Taxonomy database file (SQLite).',
                        dest='taxonomyDb')
    parser.add_argument('-r', '--relax-gene-names', action='store_true',
                        help='If enabled, the script doesn`t control if the gene names are correct.',
                        dest='relaxGeneNames')
    parser.add_argument('-s', '--rec-skip', action='store', nargs=1,
                        help='The number of records that will be skipped at the beginning of the annotation file.',
                        dest='recSkip')
    parser.add_argument('-p', '--print-first-error', action='store_true',
                        help='The script stops after first error occurs',
                        dest='firstErrorStop')

    options = parser.parse_args()

    # Options declared with nargs=1 are stored as one-element lists.
    geneName = str(options.marker[0])
    annotationPath = os.path.normpath(str(options.annotationDir[0]))
    outputPath = os.path.normpath(str(options.outDir[0]))

    rankNames = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    taxonomy = Taxonomy(os.path.normpath(str(options.taxonomyDb[0])), rankNames)

    skipCount = int(options.recSkip[0]) if options.recSkip else 0
    stopAtFirstError = bool(options.firstErrorStop)
    relaxNames = bool(options.relaxGeneNames)

    createGeneDb(geneName, annotationPath, outputPath, taxonomy, relaxNames, skipCount, stopAtFirstError)
                        entry += str('\t' + taxPathDict[rank].name)
                    else:
                        entry += '\t'
                f.write(entry)
        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()

if __name__ == "__main__":
    # Test 2: transform one PPS output file to the PP format; the taxonomy database file and the
    # taxonomic ranks are read from the 'pPPS' section of a hard-coded configuration file.
    configFilePath = os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')
    config = Config(open(configFilePath), 'pPPS')
    databaseFile = os.path.normpath(config.get('databaseFile'))
    taxonomicRanks = config.get('taxonomicRanks').split(',')
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)

    # Input/output pairs used previously:
    #   'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_contigs.txt'
    #       -> 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_PP_contigs.txt'
    #   'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP'
    #       -> 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP.PP.out'
    ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP'
    outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP.PP.out'

    ppsOutToPPOut(ppsOutFile, outPPOutFile, taxonomicRanks, taxonomy)

    # Test 1 (disabled): map scaffold predictions to contigs.
    #scafContigFile = 'D:/A_Phylo/A_Metagenomic/reindeer/data/scaffolds-contigs.tab'
    #scafPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out'
    #contigPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out_contigs'
    #scafToContigOutput(scafContigFile, scafPPSOutFile, contigPPSOutFile)