def main():
    """Assemble every configured sample with ABySS and link the contigs.

    Parses the command-line options, makes sure the output directory and its
    ``contigs`` sub-directory exist, locates the ``abyss-pe`` and ``ABYSS``
    executables and then, for each (sample, directory) pair returned by
    ``get_input_data``, runs a paired-end or single-end assembly, optionally
    cleans the assembly folder, post-processes the contigs with
    ``convert_abyss_contigs_to_velvet`` and creates the symlinks.

    Raises:
        EnvironmentError: if ``abyss-pe`` or ``ABYSS`` cannot be located.
        IOError: if a sample has no read 1 data, so no assembly can be run.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    try:
        abyss_pe = which('abyss-pe')[0]
        abyss_se = which('ABYSS')[0]
    except Exception:
        raise EnvironmentError("Cannot find abyss-pe or ABYSS. Ensure they "
                               "are installed and in your $PATH")
    # run abyss in (mostly) single-threaded mode for RAM and simplicity
    # reasons. abyss-map will run using as many cores as user specifies.
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies; there is no
        # existence check, so os.makedirs raises if it is already there
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        # run the paired-end assembly when both reads are present,
        # otherwise fall back to a single-end assembly of read 1
        if reads.r1 and reads.r2:
            output = run_abyss_pe(
                abyss_pe, args.kmer, reads, args.cores, sample_dir, log
            )
            if args.clean:
                cleanup_abyss_assembly_folder(output, log)
        elif reads.r1:
            output = run_abyss_se(abyss_se, args.kmer, reads, sample_dir, log)
            if args.clean:
                cleanup_abyss_assembly_folder(output, log, single_end=True)
        else:
            # Without this guard ``output`` would be undefined for the first
            # sample, or silently reused from the previous sample afterwards.
            raise IOError(
                "No read 1 data found for {}; cannot run an "
                "assembly.".format(sample)
            )
        contigs_file = get_contigs_file_from_output(output)
        # remove degenerate bases, contigs < 100 bp, and rename
        # contigs to velvet-style naming
        contigs_file = convert_abyss_contigs_to_velvet(contigs_file)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Assemble every configured sample with velvet and link the contigs.

    Parses the command-line options, makes sure the output directory and its
    ``contigs`` sub-directory exist, locates ``velveth`` and ``velvetg`` and
    then, for each (sample, directory) pair returned by ``get_input_data``,
    runs ``velveth`` followed by ``velvetg`` on the paired-end reads,
    optionally cleans the assembly folder and creates the symlinks.

    Raises:
        EnvironmentError: if ``velveth`` or ``velvetg`` cannot be located.
        IOError: if a sample does not have both read 1 and read 2 data; only
            the paired-end assembly is implemented here.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    try:
        velveth = which('velveth')[0]
        velvetg = which('velvetg')[0]
    except Exception:
        raise EnvironmentError("Cannot find velveth or velvetg. Ensure they "
                               "are installed and in your $PATH")
    # run velvet in single-threaded mode for RAM and simplicity
    # reasons.
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies; there is no
        # existence check, so os.makedirs raises if it is already there
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        if not (reads.r1 and reads.r2):
            # Only the paired-end path exists. Without this guard ``output``
            # would be undefined for the first sample, or silently reused
            # from the previous sample afterwards.
            raise IOError(
                "No paired-end reads (R1 and R2) found for {}; cannot run "
                "a velvet assembly.".format(sample)
            )
        # velveth output feeds directly into velvetg
        output = run_velveth(velveth, args.kmer, reads, sample_dir, log)
        output = run_velvetg(velvetg, args.kmer, output, log)
        if args.clean:
            cleanup_velvet_assembly_folder(output, log)
        contigs_file = get_contigs_file_from_output(output)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(sample_dir, contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def get_velvet_optimiser(name='VelvetOptimiser'):
    """Return the path to the VelvetOptimiser executable found via ``which``.

    ``which`` is first called for velvetg, velveth and VelvetOpt. Their
    results are not used, so those calls only act as a check if ``which``
    itself raises for a missing program (presumably it does -- confirm
    against the ``which`` helper). The optimiser is then looked up as
    ``name`` and, if that lookup raises EnvironmentError, as ``name.pl``.

    Args:
        name: base name of the optimiser executable.

    Returns:
        The first entry of the ``which`` result for the optimiser.

    Raises:
        Whatever ``which`` raises when the ``.pl`` fallback lookup also fails.
    """
    # ensure velvetg, velveth and VelvetOpt are in $PATH
    for dependency in ("velvetg", "velveth", "VelvetOpt"):
        which(dependency)
    # we need velvetoptimiser - ensure that is in $PATH and return
    try:
        return which("{}".format(name))[0]
    except EnvironmentError:
        # fall back to the script name carrying its .pl extension
        return which("{}.pl".format(name))[0]
def main():
    """Assemble every configured sample with Trinity and link the contigs.

    Parses the command-line options, makes sure the output directory and its
    ``contigs`` sub-directory exist, locates the Trinity executable and then,
    for each (sample, directory) pair returned by ``get_input_data``, copies
    the read data into a fresh sample directory, runs a paired-end or
    single-end assembly, optionally cleans the assembly folder and creates
    the symlinks.

    Raises:
        EnvironmentError: if neither ``trinity`` nor ``Trinity.pl`` can be
            located.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    # Get path to trinity. Standard name is `Trinity.pl`.
    # I usually symlink to `trinity`
    # TODO: Change this to system "which" - this is just too flaky in
    # certain cases
    try:
        trinity = which('trinity')[0]
    except Exception:
        # The fallback needs its own handler: an exception raised inside an
        # ``except`` clause is not caught by sibling clauses of the same try.
        try:
            trinity = which('Trinity.pl')[0]
        except Exception:
            raise EnvironmentError("Cannot find Trinity. Ensure it is installed and in your $PATH")
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies; there is no
        # existence check, so os.makedirs raises if it is already there
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        if reads.r1:
            # every assembly mode starts by copying the read data over
            copy_read_data(reads, sample_dir, log)
            if reads.r2:
                # PE data: singletons, when present, are combined first
                if reads.singleton:
                    combine_read_data(reads, log)
                output = run_trinity_pe(
                    trinity, reads, args.cores, args.min_kmer_coverage, log
                )
            else:
                # here, we don't have PE data, so run the assembly for SE data
                output = run_trinity_se(
                    trinity, reads, args.cores, args.min_kmer_coverage, log
                )
            if args.clean:
                cleanup_trinity_assembly_folder(output, log)
        # generate symlinks to assembled contigs
        generate_symlinks(contig_dir, sample, reads, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Align each sample's reads to its assembly and write trimmed contigs.

    Parses the command-line options, locates ``bwa``, makes sure the
    ``contigs-trimmed`` directory exists under ``args.assemblies`` and then,
    for each (sample, reads) pair returned by ``get_input_data``, indexes the
    assembly, aligns paired-end and/or single-end reads (with the bwa-mem
    helpers when ``args.bwa_mem`` is set), merges the resulting BAM files,
    computes coverage with the GATK helpers, filters the assembly and
    symlinks the trimmed contigs.

    Raises:
        EnvironmentError: if ``bwa`` cannot be located.
        IOError: if a sample's ``contigs.fasta`` is missing, or if no BAM
            file was produced for a sample.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames")
    samples = get_input_data(args.assemblo_config, None)
    # Get path to bwa
    try:
        bwa = which('bwa')[0]
    except Exception:
        raise EnvironmentError("Cannot find bwa. Ensure it is installed and in your $PATH")
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.assemblies, 'contigs-trimmed')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    # the aligner choice does not depend on the sample, so pick it once
    if args.bwa_mem:
        align_pe, align_se = bwa_mem_pe_align, bwa_mem_se_align
    else:
        align_pe, align_se = bwa_pe_align, bwa_se_align
    for sample, reads in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # ensure that assembly exists
        assembly_pth = os.path.join(args.assemblies, sample)
        assembly = os.path.join(assembly_pth, "contigs.fasta")
        if not os.path.exists(assembly):
            raise IOError("Assembly for {} does not appear to exist.".format(sample))
        if args.clean:
            cleanup_trinity_assembly_folder(log, assembly_pth)
        # determine the types of raw read data that we have
        fastq = get_input_files(reads, args.subfolder, log)
        # create the bwa index
        bwa_create_index_files(log, assembly)
        samtools_create_faidx(log, sample, assembly_pth, assembly)
        picard_create_reference_dict(log, sample, assembly_pth, assembly)
        bam = None
        bam_se = None
        if fastq.r1 and fastq.r2:
            bam = align_pe(
                log, sample, assembly_pth, assembly, args.cores,
                fastq.r1, fastq.r2
            )
            bam = picard_clean_up_bam(log, sample, assembly_pth, bam, "pe")
            bam = picard_add_rg_header_info(
                log, sample, assembly_pth, "Generic", bam, "pe"
            )
        # get singleton reads for alignment; if we only have se reads,
        # those will be in fastq.r1 only
        if fastq.singleton:
            se_reads = fastq.singleton
        elif fastq.r1 and not fastq.r2:
            se_reads = fastq.r1
        else:
            se_reads = None
        if se_reads:
            bam_se = align_se(
                log, sample, assembly_pth, assembly, args.cores, se_reads
            )
            bam_se = picard_clean_up_bam(
                log, sample, assembly_pth, bam_se, 'se'
            )
            bam_se = picard_add_rg_header_info(
                log, sample, assembly_pth, "Generic", bam_se, "se"
            )
        # merge PE and SE alignments, or use whichever one exists
        if bam and bam_se:
            bam = picard_merge_two_bams(log, sample, assembly_pth, bam, bam_se)
        elif bam_se:
            bam = bam_se
        if not bam:
            raise IOError("There is no BAM file. Check bwa log files for problems.")
        samtools_index(log, sample, assembly_pth, bam)
        coverage = gatk_coverage(
            log, sample, assembly_pth, assembly, args.cores, bam
        )
        overall_contigs = get_coverage_from_gatk(
            log, sample, assembly_pth, coverage, args.velvet
        )
        remove_gatk_coverage_files(log, assembly_pth, coverage)
        trimmed_fasta_path = filter_screened_contigs_from_assembly(
            log, sample, assembly_pth, assembly, overall_contigs
        )
        symlink_trimmed_contigs(log, sample, contig_dir, trimmed_fasta_path)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))