Example #1
0
def check_species(args):
    """Validate the arguments of the species step, exiting on the first failure.

    Reads the 'm1', 'm2', 'outdir', 'word_size', 'mapid' and 'aln_cov'
    entries of ``args``. When 'm1' is set, the detected input type is stored
    under args['file_type']. The directory '<outdir>/species' is created if
    it is not already a directory. Every failed check terminates the program
    through sys.exit() with an error message.
    """
    # detect the input type from the first reads file, when one is given
    if args['m1']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])

    # the database check is delegated to the utility module
    utility.check_database(args)

    # make sure the output directory for this step exists
    species_dir = '%s/species' % args['outdir']
    if not os.path.isdir(species_dir):
        os.makedirs(species_dir)

    # word size: 12 is the smallest accepted value
    word_size = args['word_size']
    if word_size < 12:
        message = (
            "\nError: Invalid word size: %s. Must be greater than or equal to 12\n"
            % word_size)
        sys.exit(message)

    # mapping identity: only range-checked when a truthy value was supplied
    mapid = args['mapid']
    if mapid:
        if mapid < 0 or mapid > 100:
            message = (
                "\nError: Invalid mapping identity: %s. Must be between 0 and 100\n"
                % mapid)
            sys.exit(message)

    # alignment coverage is a fraction
    aln_cov = args['aln_cov']
    if aln_cov < 0 or aln_cov > 1:
        message = (
            "\nError: Invalid alignment coverage: %s. Must be between 0 and 1\n"
            % aln_cov)
        sys.exit(message)

    # every reads file that was named has to exist on disk
    for key in ('m1', 'm2'):
        path = args[key]
        if path and not os.path.isfile(path):
            sys.exit("\nError: Input file does not exist: '%s'\n" % path)

    # check that the file extension matches the compression
    for key in ('m1', 'm2'):
        path = args[key]
        if path:
            utility.check_compression(path)
Example #2
0
def run_pipeline(args):
    """ Run entire pipeline

    Reference data (species and contigs) is always initialised first. The
    'build_db', 'align' and 'call' entries of ``args`` then enable the
    corresponding stages, and 'remove_temp' triggers removal of temporary
    files at the end. Stage banners are printed and, for the optional
    stages, also written to args['log']. The align stage stores the detected
    input type under args['file_type'].
    """

    def _report(since):
        # elapsed minutes since `since`, followed by the maximum memory usage
        print("  %s minutes" % round((time() - since) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Load the reference data needed by the later stages
    print("\nReading reference data")
    began = time()
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    _report(began)

    # Stage 1: genome database for the selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        began = time()
        build_genome_db(args, species)
        _report(began)

    # Stage 2: map reads to a representative genome for each species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        began = time()
        genome_align(args)
        _report(began)

    # Stage 3: mpileup, then per-species formatting and summary statistics
    if args['call']:
        began = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        _report(began)

        # the timer is not reset here, so this second report covers
        # mpileup plus formatting
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        format_pileup(args, species, contigs)
        snps_summary(args, species)
        _report(began)

    # Optional cleanup of temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #3
0
def run_pipeline(args):
    """ Run entire pipeline

    Executes the stages enabled in ``args`` ('build_db', 'align', 'call') in
    that order, printing the elapsed minutes and the maximum memory usage
    after each one. A banner for every stage is printed and also written to
    args['log']. The align stage stores the detected input type under
    args['file_type']. Temporary files are removed at the end when
    args['remove_temp'] is set.
    """

    # Build genome database for selected GCs
    if args['build_db']:
        import species
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_genome_db(args, genome_clusters)
        print("  %s minutes" % round((time() - start) / 60, 2))
        # The value must be formatted inside the call: print() returns None
        # on Python 3, so applying % to its result raises TypeError.
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to map reads to a representative genome for each genome-cluster
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use mpileup to identify SNPs
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

        # Split vcf into files for each GC, format, and report summary statistics.
        # The timer is not reset, so this report covers mpileup plus formatting.
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        split_vcf(args)
        format_vcf(args)
        snps_summary(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #4
0
File: snps.py Project: palc/MIDAS
def run_pipeline(args):
    """ Run entire pipeline

    The 'build_db', 'align' and 'call' entries of ``args`` select which
    stages run; each stage prints a banner (also written to args['log']),
    then the elapsed minutes and the maximum memory usage. The align stage
    stores the detected input type under args['file_type']. When
    args['remove_temp'] is set, temporary files are removed at the end.
    """

    # Stage 1: genome database for the selected genome clusters
    if args['build_db']:
        from midas.run import species
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        t0 = time()
        clusters = species.select_genome_clusters(args)
        build_genome_db(args, clusters)
        minutes = round((time() - t0) / 60, 2)
        print("  %s minutes" % minutes)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

    # Stage 2: map reads to a representative genome for each genome-cluster
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        t0 = time()
        genome_align(args)
        minutes = round((time() - t0) / 60, 2)
        print("  %s minutes" % minutes)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

    # Stage 3: mpileup, then split/format the vcf and summarise
    if args['call']:
        t0 = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        minutes = round((time() - t0) / 60, 2)
        print("  %s minutes" % minutes)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

        # t0 is deliberately reused: the second timing includes mpileup
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        split_vcf(args)
        format_vcf(args)
        snps_summary(args)
        minutes = round((time() - t0) / 60, 2)
        print("  %s minutes" % minutes)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

    # Optional cleanup of temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #5
0
def run_pipeline(args):
    """ Run entire pipeline

    Reference data is always loaded first. If ``args`` has a 'db' key, an
    IGGdb object is created and stored under args['iggdb'] before the
    species and contigs are initialised. The 'build_db', 'align' and 'call'
    entries then enable the corresponding stages, and 'remove_temp' triggers
    removal of temporary files at the end. The align stage stores the
    detected input type under args['file_type'].
    """

    # Load reference data
    print("\nReading reference data")
    began = time()
    if 'db' in args:
        # a truthy args['dbtoc'] is used as the path; otherwise fall back to
        # the species_info.tsv file under the database directory
        toc_path = (f"{args['dbtoc']}" if args.get('dbtoc')
                    else f"{args['db']}/metadata/species_info.tsv")
        args['iggdb'] = IGGdb(toc_path)
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    elapsed = round((time() - began) / 60, 2)
    print("  %s minutes" % elapsed)
    peak_mem = utility.max_mem_usage()
    print("  %s Gb maximum memory" % peak_mem)

    # Stage 1: genome database for the selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        began = time()
        build_genome_db(args, species)
        elapsed = round((time() - began) / 60, 2)
        print("  %s minutes" % elapsed)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

    # Stage 2: map reads to a representative genome for each species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        began = time()
        genome_align(args)
        elapsed = round((time() - began) / 60, 2)
        print("  %s minutes" % elapsed)
        peak_mem = utility.max_mem_usage()
        print("  %s Gb maximum memory" % peak_mem)

    # Stage 3: index the alignments, run the pileup and summarise
    # (this stage prints no timing information)
    if args['call']:
        index_bam(args)
        pysam_pileup(args, species, contigs)
        snps_summary(args, species)

    # Optional cleanup of temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #6
0
File: genes.py Project: palc/MIDAS
def run_pipeline(args):
    """ Run entire pipeline

    The 'build_db', 'align' and 'cov' entries of ``args`` select which
    stages run; each stage prints a banner (also written to args['log']),
    then the elapsed minutes and the maximum memory usage. The align stage
    stores the detected input type under args['file_type']. When
    args['remove_temp'] is set, temporary files are removed at the end.
    """

    def _usage_since(t0):
        # elapsed minutes since t0, followed by the maximum memory usage
        print("  %s minutes" % round((time() - t0) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Stage 1: pangenome database for the selected genome clusters
    if args['build_db']:
        from midas.run import species
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        t0 = time()
        clusters = species.select_genome_clusters(args)
        build_pangenome_db(args, clusters)
        _usage_since(t0)

    # Stage 2: align reads to the pangenome database
    if args['align']:
        t0 = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        pangenome_align(args)
        _usage_since(t0)

    # Stage 3: pangenome coverage for each species, plus summary
    if args['cov']:
        t0 = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        compute_pangenome_coverage(args)
        genes_summary(args)
        _usage_since(t0)

    # Optional cleanup of temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #7
0
def run_pipeline(args):
    """ Run entire pipeline

    Executes the stages enabled in ``args`` ('build_db', 'align', 'cov') in
    that order, printing the elapsed minutes and the maximum memory usage
    after each one. A banner for every stage is printed and also written to
    args['log']. The align stage stores the detected input type under
    args['file_type']. Temporary files are removed at the end when
    args['remove_temp'] is set.
    """

    # Build pangenome database for selected GCs
    if args['build_db']:
        import species
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_pangenome_db(args, genome_clusters)
        print("  %s minutes" % round((time() - start) / 60, 2))
        # The value must be formatted inside the call: print() returns None
        # on Python 3, so applying % to its result raises TypeError.
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to align reads to pangenome database
    if args['align']:
        start = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        pangenome_align(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Compute pangenome coverage for each species
    if args['cov']:
        start = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        compute_pangenome_coverage(args)
        genes_summary(args)
        print("  %s minutes" % round((time() - start) / 60, 2))
        print("  %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']:
        remove_tmp(args)
Example #8
0
def check_snps(args):
    """ Check validity of command line arguments

    Validates the SNP-step options in ``args`` and fills in defaults:
    - stores the detected input type under args['file_type'] when 'm1' is set
    - creates '<outdir>/snps' if it is not already a directory
    - enables 'build_db', 'align' and 'call' when none of them was selected
    - sets 'species_cov' to 3.0 when no species selection option was given

    Any failed check terminates the program through sys.exit() with an
    error message.
    """
    # check file type
    if args['m1']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
    # check database
    utility.check_database(args)
    # make sure selected species are valid
    check_selected_species(args)
    # create output directory
    if not os.path.isdir('%s/snps' % args['outdir']):
        os.makedirs('%s/snps' % args['outdir'])
    # pipeline options: with no stage selected, run all of them
    if not any([args['build_db'], args['align'], args['call']]):
        args['build_db'] = True
        args['align'] = True
        args['call'] = True
    # set default species selection
    if not any([args['species_id'], args['species_topn'], args['species_cov']
                ]):
        args['species_cov'] = 3.0
    # species selection options, but no profile file
    profile = '%s/species/species_profile.txt' % args['outdir']
    if not os.path.isfile(profile):
        if (args['species_topn'] or args['species_cov']) and args['build_db']:
            sys.exit("\nError: Could not find species abundance profile: %s\n\
To specify species with --species_topn or --species_cov you must have run: run_midas.py species\n\
Alternatively, you can manually specify one or more species using --species_id\n"
                     % profile)
    # no database but --align specified
    if (args['align'] and not args['build_db'] and
            not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir'])):
        error = "\nError: You've specified --align, but no database has been built"
        error += "\nTry running with --build_db\n"
        sys.exit(error)
    # NOTE(review): the two messages below mention --pileup while the option
    # is stored under args['call'] -- confirm the actual flag name.
    # no bamfile but --call specified
    if (args['call'] and not args['align'] and
            not os.path.isfile('%s/snps/temp/genomes.bam' % args['outdir'])):
        error = "\nError: You've specified --pileup, but no alignments were found"
        error += "\nTry running with --align\n"
        sys.exit(error)
    # no genomes but --call specified
    if (args['call'] and not args['build_db'] and
            not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir'])):
        error = "\nError: You've specified --pileup, but the no genome database was found"
        error += "\nTry running with --build_db\n"
        sys.exit(error)
    # no reads
    if args['align'] and not args['m1']:
        sys.exit(
            "\nError: To align reads, you must specify path to input FASTA/FASTQ\n"
        )
    # check input file paths
    for arg in ['m1', 'm2']:
        if args[arg] and not os.path.isfile(args[arg]):
            sys.exit("\nError: Input file does not exist: '%s'\n" % args[arg])
    # check compression
    if args['m1']:
        utility.check_compression(args['m1'])
    if args['m2']:
        utility.check_compression(args['m2'])
    # input options
    if args['m2'] and not args['m1']:
        sys.exit(
            "\nError: Must specify -1 and -2 if aligning paired end reads\n")
    if args['m2'] and args['interleaved']:
        sys.exit("\nError: Cannot specify --interleaved together with -2\n")
    # sanity check input values
    if args['mapid'] < 1 or args['mapid'] > 100:
        # this check is on 'mapid'; the message used to name MAPQ, which made
        # it indistinguishable from the real MAPQ check below
        sys.exit("\nError: MAPID must be between 1 and 100\n")
    if args['mapq'] < 0 or args['mapq'] > 100:
        sys.exit("\nError: MAPQ must be between 0 and 100\n")
    if args['baseq'] < 0 or args['baseq'] > 100:
        sys.exit("\nError: BASEQ must be between 0 and 100\n")
    if args['aln_cov'] < 0 or args['aln_cov'] > 1:
        sys.exit("\nError: ALN_COV must be between 0 and 1\n")