Example #1
def run_pipeline(args):
    """ Run entire pipeline """

    # Initialize reference data
    print("\nReading reference data")
    start = time()
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Build genome database for selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        build_genome_db(args, species)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to map reads to a representative genome for each species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use mpileup to identify SNPs
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

        # Split pileup into files for each species, format, and report summary statistics
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        format_pileup(args, species, contigs)
        snps_summary(args, species)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']: remove_tmp(args)
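
Every stage above repeats the same `start = time()` / minutes / peak-memory triple. A small context manager could factor that pattern out; this is a hypothetical refactoring sketch, not MIDAS code, and it assumes the `utility` module and `args['log']` handle seen in the snippet:

from contextlib import contextmanager
from time import time

@contextmanager
def timed_stage(args, message):
    # Hypothetical helper: print and log the stage banner, run the body,
    # then report elapsed minutes and peak memory via the utility module
    # assumed to be imported as in the snippets on this page.
    print("\n%s" % message)
    args['log'].write("\n%s\n" % message)
    start = time()
    yield
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

Usage would collapse each block above to, e.g., `with timed_stage(args, "Running mpileup"): pileup(args)`.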
Example #2
File: snps.py Project: zhaoc1/MIDAS
def pysam_pileup(args, species, contigs):
    start = time()
    print("\nCounting alleles")
    args['log'].write("\nCounting alleles\n")

    # run pileups per species in parallel
    argument_list = []
    for species_id in species:
        argument_list.append([args, species_id, contigs])
    aln_stats = utility.parallel(species_pileup, argument_list,
                                 args['threads'])

    # update alignment stats for species objects
    for species_id, stats in aln_stats:
        sp = species[species_id]
        sp.genome_length = stats['genome_length']
        sp.covered_bases = stats['covered_bases']
        sp.total_depth = stats['total_depth']
        sp.aligned_reads = stats['aligned_reads']
        sp.mapped_reads = stats['mapped_reads']
        if sp.genome_length > 0:
            sp.fraction_covered = sp.covered_bases / float(sp.genome_length)
        if sp.covered_bases > 0:
            sp.mean_coverage = sp.total_depth / float(sp.covered_bases)

    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())
Example #3
def run_pipeline(args):
    """ Run entire pipeline """

    # Initialize reference data
    print("\nReading reference data")
    start = time()
    if 'db' in args:
        if args.get('dbtoc'):
            args['iggdb'] = IGGdb(f"{args['dbtoc']}")
        else:
            args['iggdb'] = IGGdb(f"{args['db']}/metadata/species_info.tsv")
    species = initialize_species(args)
    genes = initialize_genes(args, species)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Build pangenome database for selected species
    if args['build_db']:
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        start = time()
        build_pangenome_db(args, species)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to align reads to pangenome database
    if args['align']:
        start = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        pangenome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Compute pangenome coverage for each species
    if args['cov']:
        start = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        pangenome_coverage(args, species, genes)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']: remove_tmp(args)
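
The `db`/`dbtoc` branching at the top of this example also appears verbatim in Example #6; a small hypothetical helper could centralize it (`IGGdb` and the metadata path are taken from the snippet):

def load_iggdb(args):
    # Hypothetical helper: prefer an explicit table of contents if given,
    # otherwise fall back to the default species_info.tsv under the db root.
    if args.get('dbtoc'):
        return IGGdb(args['dbtoc'])
    return IGGdb("%s/metadata/species_info.tsv" % args['db'])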
Example #4
def run_pipeline(args):
	""" Run entire pipeline """
	
	# Build genome database for selected GCs
	if args['build_db']:
		import species
		print("\nBuilding database of representative genomes")
		args['log'].write("\nBuilding database of representative genomes\n")
		start = time()
		genome_clusters = species.select_genome_clusters(args)
		build_genome_db(args, genome_clusters)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Use bowtie2 to map reads to a representative genome for each genome-cluster
	if args['align']:
		args['file_type'] = utility.auto_detect_file_type(args['m1'])
		print("\nMapping reads to representative genomes")
		args['log'].write("\nMapping reads to representative genomes\n")
		start = time()
		genome_align(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Use mpileup to identify SNPs
	if args['call']:
		start = time()
		print("\nRunning mpileup")
		args['log'].write("\nRunning mpileup\n")
		pileup(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

		# Split vcf into files for each GC, format, and report summary statistics
		print("\nFormatting output")
		args['log'].write("\nFormatting output\n")
		split_vcf(args)
		format_vcf(args)
		snps_summary(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Optionally remove temporary files
	if args['remove_temp']: remove_tmp(args)
Example #5
File: snps.py Project: palc/MIDAS
def run_pipeline(args):
    """ Run entire pipeline """

    # Build genome database for selected GCs
    if args['build_db']:
        from midas.run import species
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_genome_db(args, genome_clusters)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to map reads to a representative genome for each genome-cluster
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use mpileup to identify SNPs
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

        # Split vcf into files for each GC, format, and report summary statistics
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        split_vcf(args)
        format_vcf(args)
        snps_summary(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']: remove_tmp(args)
Example #6
def run_pipeline(args):
    """ Run entire pipeline """

    # Initialize reference data
    print("\nReading reference data")
    start = time()
    if 'db' in args:
        if args.get('dbtoc'):
            args['iggdb'] = IGGdb(f"{args['dbtoc']}")
        else:
            args['iggdb'] = IGGdb(f"{args['db']}/metadata/species_info.tsv")
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Build genome database for selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        build_genome_db(args, species)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to map reads to a representative genome for each species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use mpileup to identify SNPs
    if args['call']:
        index_bam(args)
        pysam_pileup(args, species, contigs)
        snps_summary(args, species)

    # Optionally remove temporary files
    if args['remove_temp']: remove_tmp(args)
Example #7
def estimate_abundance(args):
	""" Run entire pipeline """
	# impute missing args & get relative file paths
	species_info = read_annotations(args)
	
	# align reads
	start = time()
	print("\nAligning reads to marker-genes database")
	args['log'].write("\nAligning reads to marker-genes database\n")
	map_reads_hsblast(args)
	print("  %s minutes" % round((time() - start)/60, 2) )
	print("  %s Gb maximum memory") % utility.max_mem_usage()

	# find best hit for each read
	start = time()
	print("\nClassifying reads")
	args['log'].write("\nClassifying reads\n")
	best_hits = find_best_hits(args)
	unique_alns = assign_unique(args, best_hits, species_info)
	cluster_alns = assign_non_unique(args, best_hits, unique_alns)
	print("  %s minutes" % round((time() - start)/60, 2) )
	print("  %s Gb maximum memory") % utility.max_mem_usage()
	
	# estimate genome cluster abundance
	start = time()
	print("\nEstimating species abundance")
	args['log'].write("\nEstimating species abundance\n")
	total_gene_length = read_gene_lengths(args, species_info)
	species_abundance = normalize_counts(cluster_alns, total_gene_length)
	print("  %s minutes" % round((time() - start)/60, 2) )
	print("  %s Gb maximum memory") % utility.max_mem_usage()
	
	# write results
	write_abundance(args['outdir'], species_abundance, species_info)

	# clean up
	if args['remove_temp']:
		import shutil
		shutil.rmtree('%s/species/temp' % args['outdir'])
Example #8
def run_pipeline(args):
    """ Run entire pipeline """
    # read info files
    species_info = read_annotations(args)
    marker_info = read_marker_info(args)

    # align reads
    start = time()
    print("\nAligning reads to marker-genes database")
    args['log'].write("\nAligning reads to marker-genes database\n")
    map_reads_hsblast(args)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # find best hit for each read
    start = time()
    print("\nClassifying reads")
    args['log'].write("\nClassifying reads\n")
    best_hits = find_best_hits(args, marker_info)
    unique_alns = assign_unique(args, best_hits, species_info, marker_info)
    species_alns = assign_non_unique(args, best_hits, unique_alns, marker_info)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # estimate species abundance
    start = time()
    print("\nEstimating species abundance")
    args['log'].write("\nEstimating species abundance\n")
    total_gene_length = read_gene_lengths(args, species_info, marker_info)
    species_abundance = normalize_counts(species_alns, total_gene_length)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())

    # write results
    write_abundance(args['outdir'], species_abundance, species_info)

    # clean up
    if args['remove_temp']:
        import shutil
        shutil.rmtree('%s/species/temp' % args['outdir'])
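
`normalize_counts` is not shown on this page. Below is a minimal sketch of the step the comments describe (divide per-species counts by total marker-gene length, then rescale to relative abundance); the field names and the count-based coverage are assumptions, and MIDAS's real implementation may weight alignments differently:

def normalize_counts(species_alns, total_gene_length):
    # Hypothetical sketch: coverage = reads per bp of marker genes,
    # relative abundance = each species' share of total coverage.
    species_abundance = {}
    for species_id, alns in species_alns.items():
        count = len(alns)
        length = total_gene_length[species_id]
        cov = count / float(length) if length > 0 else 0.0
        species_abundance[species_id] = {'count': count, 'cov': cov}
    total_cov = sum(v['cov'] for v in species_abundance.values())
    for v in species_abundance.values():
        v['rel_abun'] = v['cov'] / total_cov if total_cov > 0 else 0.0
    return species_abundance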
Example #9
def index_bam(args):
    start = time()
    print("\nIndexing bamfile")
    args['log'].write("\nIndexing bamfile\n")
    command = '%s index -@ %d %s/snps/temp/genomes.bam' % (
        args['samtools'], int(args['threads']), args['outdir'])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())
Example #10
File: genes.py Project: palc/MIDAS
def run_pipeline(args):
	""" Run entire pipeline """
	
	# Build pangenome database for selected GCs
	if args['build_db']:
		from midas.run import species
		print("\nBuilding pangenome database")
		args['log'].write("\nBuilding pangenome database\n")
		start = time()
		genome_clusters = species.select_genome_clusters(args)
		build_pangenome_db(args, genome_clusters)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory" % utility.max_mem_usage())

	# Use bowtie2 to align reads to pangenome database
	if args['align']:
		start = time()
		print("\nAligning reads to pangenomes")
		args['log'].write("\nAligning reads to pangenomes\n")
		args['file_type'] = utility.auto_detect_file_type(args['m1'])
		pangenome_align(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory" % utility.max_mem_usage())

	# Compute pangenome coverage for each species
	if args['cov']:
		start = time()
		print("\nComputing coverage of pangenomes")
		args['log'].write("\nComputing coverage of pangenomes\n")
		compute_pangenome_coverage(args)
		genes_summary(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory" % utility.max_mem_usage())

	# Optionally remove temporary files
	if args['remove_temp']: remove_tmp(args)
Example #11
def run_pipeline(args):
	""" Run entire pipeline """
	
	# Build pangenome database for selected GCs
	if args['build_db']:
		import species
		print("\nBuilding pangenome database")
		args['log'].write("\nBuilding pangenome database\n")
		start = time()
		genome_clusters = species.select_genome_clusters(args)
		build_pangenome_db(args, genome_clusters)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Use bowtie2 to align reads to pangenome database
	if args['align']:
		start = time()
		print("\nAligning reads to pangenomes")
		args['log'].write("\nAligning reads to pangenomes\n")
		args['file_type'] = utility.auto_detect_file_type(args['m1'])
		pangenome_align(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Compute pangenome coverage for each species
	if args['cov']:
		start = time()
		print("\nComputing coverage of pangenomes")
		args['log'].write("\nComputing coverage of pangenomes\n")
		compute_pangenome_coverage(args)
		genes_summary(args)
		print("  %s minutes" % round((time() - start)/60, 2) )
		print("  %s Gb maximum memory") % utility.max_mem_usage()

	# Optionally remove temporary files
	if args['remove_temp']: remove_tmp(args)
Example #12
def pysam_pileup(args, species, contigs):
    start = time()
    print("\nCounting alleles")
    args['log'].write("\nCounting alleles\n")

    # We cannot pass args to a subprocess because args['log'] is a file object
    # and cannot be pickled, so we expose it as a global instead. That is
    # living dangerously. TODO: clean this up.
    global global_args
    global_args = args

    # run pileups per species in parallel
    argument_list = []
    # This conversion may not be needed for contigs; it was an attempt to
    # eliminate the non-serializable subprocess argument (args).
    tsprint("Reading contigs")
    contigs = {
        str(c.id): {
            'species_id': str(c.species_id),
            'length': str(c.length),
            'seq': "".join(c.seq)
        }
        for c in contigs.values()
    }
    for species_id in species:
        argument_list.append([species_id])

    global global_contigs
    global_contigs = contigs

    tsprint("Read contigs")

    mp = multiprocessing.Pool(int(args['threads']))
    # update alignment stats for species objects
    for species_id, stats in mp.starmap(species_pileup, argument_list):
        sp = species[species_id]
        sp.genome_length = int(stats['genome_length'])
        sp.covered_bases = int(stats['covered_bases'])
        sp.total_depth = int(stats['total_depth'])
        sp.aligned_reads = int(stats['aligned_reads'])
        sp.mapped_reads = int(stats['mapped_reads'])
        if sp.genome_length > 0:
            sp.fraction_covered = sp.covered_bases / float(sp.genome_length)
        if sp.covered_bases > 0:
            sp.mean_coverage = sp.total_depth / float(sp.covered_bases)

    print("  %s minutes" % round((time() - start) / 60, 2))
    print("  %s Gb maximum memory" % utility.max_mem_usage())