Python countfastaの例、lib.ufitslib.countfasta Pythonの例

コード例 #1

0

ファイルを表示

    'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
    '--fastq_qmax', '55'
]
# Run the vsearch command list assembled just above (its opening lines fall
# outside this excerpt), then count the records in the filtered FASTQ
# (filter_out) and log the figure with thousands separators.
ufitslib.runSubprocess(cmd, ufitslib.log)
qtrimtotal = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

# Full-length dereplication: vsearch reads filter_fasta and writes derep_out
# inside tmp, with --sizeout passed so the output carries size annotations.
# args.maxee is concatenated into the file name, so it must already be a
# string at this point.
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = [
    'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
    derep_out
]
ufitslib.runSubprocess(cmd, ufitslib.log)
# `total` is reused for every later step; here it is the count of derep_out.
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Sort the dereplicated reads by size and pass --minsize so that, per the log
# message below, reads seen fewer than args.minsize times are dropped.
# NOTE(review): args.minsize goes into the argument list without str();
# presumably argparse delivers it as a string -- confirm.
sort_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.sort.fa')
ufitslib.log.info(
    "Sorting reads by size: removing reads seen less than %s times" %
    args.minsize)
cmd = [
    'vsearch', '--sortbysize', derep_out, '--minsize', args.minsize,
    '--output', sort_out
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(sort_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

コード例 #2

0

ファイルを表示

ファイル: ufits-unoise2.py プロジェクト: zhongmicai/ITS_clustering

    'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
    '--fastq_qmax', '55'
]
# Run the vsearch command list assembled just above (its opening lines fall
# outside this excerpt), then count the records in the filtered FASTQ
# (filter_out) and log the figure with thousands separators.
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Full-length dereplication. The input here is filter_out (the same file
# counted with countfastq above); records are relabelled with the 'Read_'
# prefix and --sizeout is passed so the output carries size annotations.
# args.maxee is concatenated into the file name, so it must be a string.
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = [
    'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
    '--sizeout', '--output', derep_out
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Denoise the dereplicated reads with the -unoise2 command of the program
# named by `usearch` (defined outside this excerpt), writing unoise_out.
# NOTE(review): '--minampsize' is spelled with two dashes while the other
# options in this command use one -- confirm the executable accepts it.
# NOTE(review): args.minampout is passed without str(); presumably a string.
ufitslib.log.info("Denoising reads with UNOISE2")
unoise_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.unoise.fa')
cmd = [
    usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '--minampsize',
    args.minampout
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(unoise_out)
ufitslib.log.info('{0:,}'.format(total) + ' denoised sequences')

# Clustering step: convert the OTU percentage given on the command line
# (e.g. 97) into a fraction (0.97).
radius = float(args.pct_otu) / 100.

コード例 #3

0

ファイルを表示

    if not utax_db:
        utax_db = args.utax_db
    if not usearch_db:
        usearch_db = args.usearch_db
else:
    utax_db = args.utax_db
    usearch_db = args.usearch_db

# The hybrid, usearch and utax methods all need a reference database: abort
# with exit status 1 when none of utax_db, usearch_db or --fasta_db is set.
if args.method in ['hybrid', 'usearch', 'utax']:
    if not utax_db and not usearch_db and not args.fasta_db:
        ufitslib.log.error("You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db")
        sys.exit(1)

# Count the records in the input FASTA and log the figure (with thousands
# separators) as the number of OTUs.
ufitslib.log.info("Loading FASTA Records")
total = ufitslib.countfasta(args.fasta)
ufitslib.log.info('{0:,}'.format(total) + ' OTUs')

# Output file names, all derived from `base` (defined outside this excerpt).
# NOTE(review): utax_out and usearch_out resolve to the same '.usearch.txt'
# path. If both are ever written in one run, the second write would
# overwrite the first -- confirm whether utax_out was meant to use its own
# suffix or is reassigned later.
blast_out = base + '.blast.txt'
rdp_out = base + '.rdp.txt'
utax_out = base + '.usearch.txt'
usearch_out = base + '.usearch.txt'
sintax_out = base + '.sintax.txt'

if not args.taxonomy:
    #start with less common uses, i.e. Blast, rdp
    if args.method == 'blast':
        #check if command line blast installed
        if not ufitslib.which('blastn'):
            ufitslib.log.error("BLASTN not found in your PATH, exiting.")

コード例 #4

0

ファイルを表示

        )
        sys.exit(1)
    # Resolve the mock community reference FASTA. The four preset names map
    # to files shipped in the DB directory under parentdir; any other value
    # of --mc is treated as a user-supplied path and made absolute.
    _bundled_mocks = {
        "mock3": 'ufits_mock3.fa',
        "mock2": 'ufits_mock2.fa',
        "mock1": 'ufits_mock1.fa',
        "synmock": 'ufits_synmock.fa',
    }
    if args.mc in _bundled_mocks:
        mock = os.path.join(parentdir, 'DB', _bundled_mocks[args.mc])
    else:
        mock = os.path.abspath(args.mc)

    # Count the records in the mock community FASTA chosen above.
    mock_ref_count = ufitslib.countfasta(mock)

    # Global alignment search with the program named by `usearch`: the mock
    # sequences are the query and the OTU FASTA (args.fasta) is the database.
    # Hits need identity >= 0.95 on the plus strand, up to 3 accepted hits
    # per query; results go to mock_out in .uc format.
    ufitslib.log.info("Mapping OTUs to Mock Community (USEARCH)")
    cmd = [
        usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.95',
        '-db', args.fasta, '-uc', mock_out, '-maxaccepts', '3'
    ]
    ufitslib.runSubprocess(cmd, ufitslib.log)
    # Re-order the hits with the external `sort` utility: field 4, numeric,
    # descending (presumably the identity column of the .uc file -- confirm).
    # NOTE(review): the return code of `sort` is ignored, so a failed sort
    # would leave mock_sort empty or partial without any error.
    with open(mock_sort, 'w') as output:
        subprocess.call(['sort', '-k4,4nr', mock_out], stdout=output)

    # Containers filled while parsing the sorted mapping (the parsing code is
    # outside this excerpt).
    found_dict = {}
    missing = []

コード例 #5

0

ファイルを表示

ファイル: ufits-dada2.py プロジェクト: zhongmicai/ITS_clustering

        uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref+'.extracted.fa')
        if not os.path.isfile(uchime_db):
            ufitslib.log.error("Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering")
            uchime_out = fastaout
    else:
        if os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            ufitslib.log.error("%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref)
            uchime_out = fastaout
    # Reference-based chimera filtering. It only runs when uchime_out is not
    # already an existing file; the error branches just above point
    # uchime_out at fastaout precisely to skip this step.
    if not os.path.isfile(uchime_out):
        ufitslib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
        cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', uchime_out]
        ufitslib.runSubprocess(cmd, ufitslib.log)
        total = ufitslib.countfasta(uchime_out)
        # validSeqs is defined outside this excerpt; presumably the sequence
        # count before this filter, so the difference is the number removed.
        uchime_chimeras = validSeqs - total
        ufitslib.log.info('{0:,}'.format(total) + ' iSeqs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed')

    # Write the surviving sequences to iSeqs in natural-sort order of the
    # entries returned by fasta2list, looking each record up through a
    # Biopython index of uchime_out.
    nonchimeras = ufitslib.fasta2list(uchime_out)
    inferredSeqs = SeqIO.index(uchime_out, 'fasta')
    with open(iSeqs, 'w') as iSeqout:
        for x in natsorted(nonchimeras):
            SeqIO.write(inferredSeqs[x], iSeqout, 'fasta')
    if not args.debug:
        # Remove the intermediate FASTA files unless debugging. uchime_out
        # may be the very same path as fastaout (when filtering was skipped),
        # hence the isfile() guard before the second removal.
        ufitslib.removefile(uchime_out)
        if os.path.isfile(fastaout):
            ufitslib.removefile(fastaout)

コード例 #6

0

ファイルを表示

ファイル: ufits-unoise2.py プロジェクト: nextgenusfs/ufits

# Quality filtering. The first vsearch run applies the expected-error limit
# (--fastq_maxee) and writes the passing reads both as FASTQ (filter_out,
# defined outside this excerpt) and as FASTA (filter_fasta). The second run
# writes every read of the input FASTQ to orig_fasta with no maxee filter.
# args.maxee is concatenated into file names, so it must be a string.
filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out+'.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)
# Count the reads that passed the filter and log with thousands separators.
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Full-length dereplication of the filtered FASTQ; records are relabelled
# with the 'Read_' prefix and --sizeout is passed so the output carries size
# annotations.
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = ['vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_', '--sizeout', '--output', derep_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Denoise with the -unoise2 command of the program named by `usearch`
# (defined outside this excerpt).
# NOTE(review): '--minampsize' is spelled with two dashes while the other
# options in this command use one -- confirm the executable accepts it.
ufitslib.log.info("Denoising reads with UNOISE2")
unoise_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.unoise.fa')
cmd = [usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '--minampsize', args.minampout]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(unoise_out)
ufitslib.log.info('{0:,}'.format(total) + ' denoised sequences')

# Cluster the denoised sequences into OTUs. The percentage from the command
# line (e.g. 97) becomes a fraction for -id, and centroids are relabelled
# with the 'OTU' prefix. The command is only built here; the call that runs
# it is outside this excerpt.
radius = float(args.pct_otu) / 100.
ufitslib.log.info("Clustering denoised sequences into OTUs at %s%%" % args.pct_otu)
uclust_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.uclust.fa')
cmd = [usearch, '-cluster_smallmem', unoise_out, '-id', str(radius), '-centroids', uclust_out, '-relabel', 'OTU']

コード例 #7

0

ファイルを表示

ファイル: ufits-filter.py プロジェクト: nextgenusfs/ufits

        ufitslib.log.error("If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option")
        sys.exit(1)
    # Resolve the mock community reference FASTA. The four preset names map
    # to files shipped in the DB directory under parentdir; any other value
    # of --mc is treated as a user-supplied path and made absolute.
    _bundled_mocks = {
        "mock3": 'ufits_mock3.fa',
        "mock2": 'ufits_mock2.fa',
        "mock1": 'ufits_mock1.fa',
        "synmock": 'ufits_synmock.fa',
    }
    if args.mc in _bundled_mocks:
        mock = os.path.join(parentdir, 'DB', _bundled_mocks[args.mc])
    else:
        mock = os.path.abspath(args.mc)

    # Count the records in the mock community FASTA chosen above.
    mock_ref_count = ufitslib.countfasta(mock)
    
    # Global alignment search with the program named by `usearch`: the mock
    # sequences are the query and the OTU FASTA (args.fasta) is the database.
    # Hits need identity >= 0.95 on the plus strand, up to 3 accepted hits
    # per query; results go to mock_out in .uc format.
    ufitslib.log.info("Mapping OTUs to Mock Community (USEARCH)")
    cmd = [usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.95', '-db', args.fasta, '-uc', mock_out, '-maxaccepts', '3']
    ufitslib.runSubprocess(cmd, ufitslib.log)
    # Re-order the hits with the external `sort` utility: field 4, numeric,
    # descending (presumably the identity column of the .uc file -- confirm).
    # NOTE(review): the return code of `sort` is ignored, so a failed sort
    # would leave mock_sort empty or partial without any error.
    with open(mock_sort, 'w') as output:
        subprocess.call(['sort', '-k4,4nr', mock_out], stdout = output)

    # Containers filled while parsing the sorted mapping (the parsing loop
    # starts right after this excerpt).
    found_dict = {}
    missing = []
    chimeras = []
    seen = []
    with open(mock_sort, 'rU') as map:

コード例 #8

0

ファイルを表示

ファイル: ufits-OTU_cluster_ref.py プロジェクト: nextgenusfs/ufits

# Quality filtering. The first vsearch run applies the expected-error limit
# (--fastq_maxee) and writes the passing reads both as FASTQ (filter_out,
# defined outside this excerpt) and as FASTA (filter_fasta). The second run
# writes every read of the input FASTQ to orig_fasta with no maxee filter.
# args.maxee is concatenated into file names, so it must be a string.
filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out+'.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)
# The post-filter read count is kept in its own name (qtrimtotal) because
# `total` is overwritten by each later step.
qtrimtotal = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

# Full-length dereplication of the filtered FASTA; --sizeout is passed so the
# output carries size annotations.
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Sort the dereplicated reads by size and pass --minsize so that, per the log
# message below, reads seen fewer than args.minsize times are dropped.
# NOTE(review): args.minsize goes into the argument list without str();
# presumably argparse delivers it as a string -- confirm.
sort_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.sort.fa')
ufitslib.log.info("Sorting reads by size: removing reads seen less than %s times" % args.minsize)
cmd = ['vsearch', '--sortbysize', derep_out, '--minsize', args.minsize, '--output', sort_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(sort_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

# Chimera detection, first pass: de novo detection with vsearch on the
# size-sorted reads. Non-chimeric sequences are relabelled with the 'Seq'
# prefix and written to chimera_out. The command is only built here; the
# call that runs it is outside this excerpt.
ufitslib.log.info("De novo chimera detection (VSEARCH)")
chimera_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.chimera_check.fa')
cmd = ['vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq', '--sizeout', '--nonchimeras', chimera_out]

コード例 #9

0

ファイルを表示

ファイル: ufits-OTU_cluster.py プロジェクト: nextgenusfs/ufits

    filter_fasta,
    "--fastq_qmax",
    "55",
]
# Run the vsearch command list assembled just above (its opening lines fall
# outside this excerpt). A second run then writes every read of the input
# FASTQ to orig_fasta with only the --fastq_qmax setting applied.
ufitslib.runSubprocess(cmd, ufitslib.log)
cmd = ["vsearch", "--fastq_filter", args.FASTQ, "--fastaout", orig_fasta, "--fastq_qmax", "55"]
ufitslib.runSubprocess(cmd, ufitslib.log)
# Count the records in the filtered FASTQ and log with thousands separators.
total = ufitslib.countfastq(filter_out)
ufitslib.log.info("{0:,}".format(total) + " reads passed")

# Full-length dereplication of the filtered FASTA; --sizeout is passed so the
# output carries size annotations. args.maxee is concatenated into the file
# name, so it must already be a string.
derep_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".derep.fa")
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = ["vsearch", "--derep_fulllength", filter_fasta, "--sizeout", "--output", derep_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info("{0:,}".format(total) + " reads passed")

# optional run UNOISE
if args.unoise:
    unoise_out = unoise_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".denoised.fa")
    ufitslib.log.info("Denoising Data with UNOISE")
    cmd = [
        usearch,
        "-cluster_fast",
        derep_out,
        "-centroids",
        unoise_out,
        "-id",
        "0.9",
        "--maxdiffs",