def main(args):
    # setup logging
    log, my_name = setup_logging(args)
    # create the fasta dictionary
    loci = get_fasta_dict(log, args)
    log.info("Aligning with {}".format(str(args.aligner).upper()))
    opts = [
        [args.window, args.threshold, args.no_trim, args.proportion,
         args.max_divergence, args.min_length]
        for i in range(len(loci))
    ]
    # combine loci and options
    params = zip(loci.items(), opts)
    log.info("Alignment begins. 'X' indicates dropped alignments (these are reported after alignment)")
    # During alignment, drop into sys.stdout for the progress indicator
    # because logging in multiprocessing is more painful than what
    # we really need. Return to logging when alignment completes.
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        alignments = pool.map(align, params)
    else:
        alignments = map(align, params)
    # kick the stdout down one line since we were using sys.stdout
    print("")
    # drop back into logging
    log.info("Alignment ends")
    # write the output files
    write_alignments_to_outdir(log, args.output, alignments, args.output_format)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
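# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. Nearly every
# main() below calls setup_logging(); it is assumed to behave roughly like
# the following: build a logger at the requested verbosity, optionally add a
# file handler, and return the logger plus the calling script's name. Note
# that most scripts pass the parsed args object, while one passes
# (args.verbosity, args.log_path) directly; the attribute names used here
# (verbosity, log_path) are assumptions.
# ---------------------------------------------------------------------------
import logging
import os
import sys


def sketch_setup_logging(args):
    my_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
    log = logging.getLogger(my_name)
    formatter = logging.Formatter("%(name)s: %(levelname)-8s %(message)s")
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(formatter)
    log.addHandler(console)
    if getattr(args, "log_path", None):
        fhandler = logging.FileHandler(
            os.path.join(args.log_path, "{}.log".format(my_name))
        )
        fhandler.setFormatter(formatter)
        log.addHandler(fhandler)
    log.setLevel(getattr(logging, getattr(args, "verbosity", "INFO")))
    return log, my_name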
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args.verbosity, args.log_path)
    text = " Starting {} ".format(my_name)
    log.info(text.center(65, "="))
    alignments = []
    log.info("Getting aligned sequences for trimming")
    for ftype in get_file_extensions(args.input_format):
        alignments.extend(glob.glob(os.path.join(args.input, "*{}".format(ftype))))
    # package up needed arguments for map()
    package = [args.input_format, args.window, args.threshold, args.proportion,
               args.max_divergence, args.min_length]
    params = zip([package] * len(alignments), alignments)
    log.info("Alignment begins. 'X' indicates dropped alignments (these are reported after alignment)")
    # if --multiprocessing, use Pool.map(); otherwise use the builtin map().
    # This could also be extended to an MPI map, but that is not really
    # needed on a multicore machine.
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores - 1)
        alignments = pool.map(get_and_trim_alignments, params)
    else:
        alignments = map(get_and_trim_alignments, params)
    # kick the stdout down one line since we were using sys.stdout
    print("")
    # drop back into logging
    log.info("Alignment ends")
    # write the output files
    write_alignments_to_outdir(log, args.output, alignments, args.output_format)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_files(args.input, args.input_format)
    for f in files:
        try:
            aln = AlignIO.read(f, args.input_format)
            if args.containing:
                containing = align_contains_taxa(args, aln)
            else:
                containing = True
            if args.min_length:
                length = align_min_length(args, aln)
            else:
                length = True
            if args.min_taxa:
                taxa = align_min_taxa(args, aln)
            else:
                taxa = True
            if containing and taxa and length:
                log.info("Good alignment: {0}".format(os.path.basename(f)))
            if containing and taxa and length and args.output:
                name = os.path.basename(f)
                shutil.copy(f, os.path.join(args.output, name))
        except ValueError, e:
            if e.message == 'No records found in handle':
                print 'No records found in {0}'.format(os.path.basename(f))
            else:
                raise ValueError('Something is wrong with alignment {0}'.format(os.path.basename(f)))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    #text = " Starting {} ".format(my_name)
    #log.info(text.center(65, "="))
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # compile our regexes once
    n_bases = re.compile("N|n+")
    x_bases = re.compile("X|x+")
    work = [
        [file, n_bases, x_bases, args.input_format, args.output,
         args.do_not_screen_n, args.do_not_screen_x]
        for file in files
    ]
    log.info("Screening alignments for problematic bases using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(screen_files, work)
        pool.close()
    else:
        results = map(screen_files, work)
    count = 0
    for result in results:
        if result is None:
            count += 1
        else:
            log.warn("Removed locus {} due to presence of {} bases".format(
                result[0],
                result[1]
            ))
    log.info("Copied {} good alignments".format(count))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
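# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. Several of
# these scripts call get_alignment_files(); it is assumed to glob the
# alignment directory for the extensions associated with the requested
# format. The extension map below is an assumption.
# ---------------------------------------------------------------------------
import glob
import os

SKETCH_ALIGNMENT_EXTENSIONS = {
    "fasta": (".fasta", ".fa"),
    "nexus": (".nex", ".nexus"),
    "phylip": (".phy", ".phylip"),
}


def sketch_get_alignment_files(log, alignment_dir, input_format):
    log.info("Getting alignment files")
    files = []
    for ext in SKETCH_ALIGNMENT_EXTENSIONS[input_format]:
        files.extend(glob.glob(os.path.join(alignment_dir, "*{}".format(ext))))
    return sorted(files)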
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read alignments
    log.info("Reading input alignments in NEXUS format")
    nexus_files = glob.glob(os.path.join(args.alignments, '*.nex*'))
    data = [(os.path.basename(fname), Nexus.Nexus(fname)) for fname in nexus_files]
    log.info("Concatenating files")
    concatenated = Nexus.combine(data)
    if not args.nexus:
        concat_file = os.path.join(args.output, os.path.basename(args.alignments) + ".phylip")
        if args.charsets:
            sets = concatenated.append_sets()
            charset_file = os.path.join(args.output, os.path.basename(args.alignments) + ".charsets")
            log.info("Writing charsets to {}".format(charset_file))
            with open(charset_file, 'w') as outf:
                outf.write(sets)
        log.info("Writing concatenated PHYLIP alignment to {}".format(concat_file))
        concatenated.export_phylip(concat_file)
    else:
        concat_file = os.path.join(args.output, os.path.basename(args.alignments) + ".nexus")
        if args.charsets:
            log.info("Writing concatenated alignment to NEXUS format (with charsets)")
            concatenated.write_nexus_data(concat_file)
        else:
            log.info("Writing concatenated alignment to NEXUS format (without charsets)")
            concatenated.write_nexus_data(concat_file, append_sets=False)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_files(args.alignments, args.input_format)
    if len(files) == 0:
        raise IOError("There are no {}-formatted alignments in {}.".format(
            args.input_format,
            args.alignments
        ))
    if args.shorten_name and not args.name_conf:
        name_map = shorten_name(args, files[0])
    elif args.shorten_name and args.name_conf:
        conf = ConfigParser.ConfigParser()
        conf.readfp(open(args.name_conf))
        name_map = dict(conf.items('taxa'))
    else:
        name_map = None
    params = [[f, args, name_map] for f in files]
    sys.stdout.write('Converting')
    sys.stdout.flush()
    if args.cores > 1:
        pool = Pool(args.cores)
        pool.map(convert_files_worker, params)
    else:
        map(convert_files_worker, params)
    print ""
    if args.shorten_name:
        log.info("Taxa renamed (from) => (to):")
        for k, v in name_map.iteritems():
            log.info("\t{0} => {1}".format(k, v))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    sys.stdout.write("Running")
    sys.stdout.flush()
    with open(args.output, 'w') as outf:
        for f in files:
            aln = AlignIO.read(f, args.input_format)
            locus = os.path.splitext(os.path.basename(f))[0]
            for taxon in aln:
                if taxon.id == args.taxon:
                    seq = str(taxon.seq).replace('-', '').replace('?', '')
                    record = SeqRecord(Seq(seq), id=locus, name="", description="")
                    if not len(seq) == 0:
                        outf.write(record.format("fasta"))
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    else:
                        log.info("Could not write {}".format(locus))
    print ""
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main(): # get args and options args = get_args() # setup logging log, my_name = setup_logging(args) # get the input data log.info("Getting input filenames and creating output directories") input = get_input_data(args.config, args.dir) # create the output directory if it does not exist if not os.path.isdir(args.output): os.makedirs(args.output) else: pass # make the symlink directory within the output directory contig_dir = os.path.join(args.output, 'contigs') if not os.path.isdir(contig_dir): os.makedirs(contig_dir) else: pass try: abyss_pe = which('abyss-pe')[0] abyss_se = which('ABYSS')[0] except: raise EnvironmentError("Cannot find abyss-pe or ABYSS. Ensure they " "are installed and in your $PATH") # run abyss in (mostly) single-threaded mode for RAM and simplicity # reasons. abyss-map will run using as many cores as user specifies. for group in input: sample, dir = group # pretty print taxon status text = " Processing {} ".format(sample) log.info(text.center(65, "-")) # make a directory for sample-specific assemblies sample_dir = os.path.join(args.output, sample) os.makedirs(sample_dir) # determine how many files we're dealing with reads = get_input_files(dir, args.subfolder, log) # copy the read data over, combine singletons with read 1 # and run the assembly for PE data. if reads.r1 and reads.r2: output = run_abyss_pe(abyss_pe, args.kmer, reads, args.cores, sample_dir, log) if args.clean: cleanup_abyss_assembly_folder(output, log) elif reads.r1 and not reads.r2: output = run_abyss_se(abyss_se, args.kmer, reads, sample_dir, log) if args.clean: cleanup_abyss_assembly_folder(output, log, single_end=True) contigs_file = get_contigs_file_from_output(output) # remove degenerate bases, contigs < 100 bp, and rename # contigs to velvet-style naming contigs_file = convert_abyss_contigs_to_velvet(contigs_file) # create generic link in assembly folder for covg. computation generate_within_dir_symlink(contigs_file) # link to the standard (non-trimmed) assembly in ../contigs generate_symlinks(contig_dir, sample, contigs_file, log) text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
def main(): # get args and options args = get_args() # setup logging log, my_name = setup_logging(args) # get the input data log.info("Getting input filenames and creating output directories") input = get_input_data(args.config, args.dir) # create the output directory if it does not exist if not os.path.isdir(args.output): os.makedirs(args.output) else: pass # make the symlink directory within the output directory contig_dir = os.path.join(args.output, 'contigs') if not os.path.isdir(contig_dir): os.makedirs(contig_dir) else: pass try: velveth = which('velveth')[0] velvetg = which('velvetg')[0] except: raise EnvironmentError("Cannot find velveth or velvetg. Ensure they " "are installed and in your $PATH") # run velvet in single-threaded mode for RAM and simplicity # reasons. for group in input: sample, dir = group # pretty print taxon status text = " Processing {} ".format(sample) log.info(text.center(65, "-")) # make a directory for sample-specific assemblies sample_dir = os.path.join(args.output, sample) os.makedirs(sample_dir) # determine how many files we're dealing with reads = get_input_files(dir, args.subfolder, log) # copy the read data over, combine singletons with read 1 # and run the assembly for PE data. if reads.r1 and reads.r2: output = run_velveth(velveth, args.kmer, reads, sample_dir, log) output = run_velvetg(velvetg, args.kmer, output, log) elif reads.r1 and not reads.r2 and not reads.singleton: pass if args.clean: cleanup_velvet_assembly_folder(output, log) contigs_file = get_contigs_file_from_output(output) # create generic link in assembly folder for covg. computation generate_within_dir_symlink(sample_dir, contigs_file) # link to the standard (non-trimmed) assembly in ../contigs generate_symlinks(contig_dir, sample, contigs_file, log) text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
def main(): # get args args = get_args() # setup logging log, my_name = setup_logging(args) log.info("Logging set-up") matches = get_match_records(log, args.input_xml, args.chromosome) matches_names = fix_labels(matches, log) vcf.output = create_vcf(log, args.input_vcf, args.output_vcf, matches_names) log.info("Done!")
def main():
    # get args and options
    args = get_args()
    # if we're resuming, we need to set output = resume
    # so that we can check previously created files.
    if args.resume:
        args.output = args.resume
    # setup logging
    log, my_name = setup_logging(args)
    log.info("Creating the output directory")
    # get the input data
    log.info("Fetching input filenames")
    assemblies = sorted(glob.glob(os.path.join(args.assemblies, "*")))
    # remove the contigs/contigs-trimmed directories
    extra = set(['contigs', 'contigs-trimmed'])
    assemblies = [assembly for assembly in assemblies if os.path.basename(assembly) not in extra]
    loci = get_match_count_loci(log, args.match_count_output)
    # setup database connection
    conn = sqlite3.connect(args.locus_db)
    cur = conn.cursor()
    for assembly in assemblies:
        organism = os.path.basename(assembly)
        if args.resume and os.path.exists(os.path.join(args.output, "{}.reads-on-target.txt".format(organism))):
            log.warn("Skipping previously processed {} data (--resume)".format(organism))
        else:
            reference = os.path.join(assembly, "contigs.fasta")
            bams = glob.glob(os.path.join(assembly, "*.bam"))
            try:
                assert len(bams) == 1
                bam = bams[0]
            except:
                raise IOError("There appears to be more than one BAM file for {}".format(organism))
            # pretty print taxon status
            text = " Processing {} ".format(organism)
            log.info(text.center(65, "-"))
            locus_map = get_sqlite_loci_for_taxon(log, args.locus_db, cur, organism, loci)
            locus_map_names = set(locus_map.keys())
            create_per_base_coverage_file(log, args.output, assembly, organism, locus_map, locus_map_names)
            coverages_dict = create_per_locus_coverage_file(log, args.output, assembly, organism, locus_map, locus_map_names)
            # pass the same intervals as targets and baits - we don't care
            # that much here about bait performance
            hs_metrics_file = picard_calculate_hs_metrics(log, organism, args.output, reference, bam,
                                                          coverages_dict["interval_list"],
                                                          coverages_dict["interval_list"])
            on_target_dict = picard_get_percent_reads_on_target(log, hs_metrics_file, organism)
            log.info("\t{} contigs, mean trimmed length = {:.1f}, mean trimmed coverage = {:.1f}x, on-target bases (uce contigs) = {:.1f}%, unique reads aligned (all contigs) = {:.1f}%".format(
                coverages_dict["count"],
                coverages_dict["mean_length_trimmed"],
                coverages_dict["mean_trim_cov"],
                float(on_target_dict["PCT_SELECTED_BASES"]) * 100,
                float(on_target_dict["PCT_PF_UQ_READS_ALIGNED"]) * 100,
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    env = Environment(loader=FileSystemLoader(args.templates))
    for i in xrange(args.trees):
        submit_script_pth = compute_starting_parsimony_tree(log, args, env, i)
        submit_parsimony_job(log, args, env, i, submit_script_pth)
    # convert the phylip file to binary format
    submit_script_pth = prep_parser_script(log, args, env)
    submit_parser_job(log, args, env, submit_script_pth)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    env = Environment(loader=FileSystemLoader(args.templates))
    # check for the binary alignment
    binary_name = check_for_binary_phylip(log, args)
    # check for starting trees
    starting_trees = get_starting_trees(log, args)
    # create the binary file
    for starting_tree in starting_trees:
        prep_examl_script(log, args, env, starting_tree, binary_name)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read config file output by match_count_config.py
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # make case sensitive
    config.optionxform = str
    config.read(args.match_count_output)
    # read the incomplete matrix file that contains loci that are incomplete
    if args.incomplete_matrix:
        incomplete = ConfigParser.RawConfigParser(allow_no_value=True)
        incomplete.optionxform = str
        incomplete.read(args.incomplete_matrix)
        missing = get_missing_loci_from_conf_file(incomplete)
    else:
        missing = None
    # get the taxa in the alignment
    organisms = get_names_from_config(log, config, 'Organisms')
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [
        [
            file,
            args.input_format,
            organisms,
            args.check_missing,
            missing,
            args.verbatim,
            args.min_taxa,
            args.output,
            args.output_format
        ]
        for file in files
    ]
    log.info("Adding missing data designators using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(add_designators, work)
    else:
        results = map(add_designators, work)
    for result in results:
        if result is not None:
            log.info("Dropped {} because of too few taxa (N < {})".format(
                result,
                args.min_taxa
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
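# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. The
# incomplete-matrix file read above is written elsewhere in this code as one
# [organism] section per taxon with one missing locus name per line, so
# get_missing_loci_from_conf_file() is assumed to return a dict mapping each
# organism to the set of loci it is missing.
# ---------------------------------------------------------------------------
def sketch_get_missing_loci_from_conf_file(incomplete):
    missing = {}
    for organism in incomplete.sections():
        missing[organism] = set(locus for locus, _ in incomplete.items(organism))
    return missing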
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # change to working dir
    starting_dir = os.getcwd()
    # convert data to binary
    binary_file_pth = convert_phylip_to_examl_binary(log, args)
    for iter in xrange(args.trees):
        # compute starting tree on data
        seed, starting_tree_pth = compute_starting_parsimony_tree(log, args, iter, binary_file_pth)
        # run examl against binary data with starting tree
        run_examl_against_binary_data(log, args, iter, binary_file_pth, starting_tree_pth)
    # return to starting dir
    os.chdir(starting_dir)
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    conf = ConfigParser.ConfigParser()
    conf.optionxform = str
    conf.read(args.config)
    items = conf.items("samples")
    for item in items:
        name, file_names = item
        files = file_names.strip().split(",")
        with open(os.path.join(args.output, name), 'wb') as outfile:
            for infile in sorted(files):
                shutil.copyfileobj(open(infile), outfile)
                log.info("Copied {} to {}".format(
                    os.path.basename(infile),
                    name
                ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main(): args = get_args() # setup logging log, my_name = setup_logging(args) # parse the config file - allowing no values (e.g. no ":" in config file) config = ConfigParser.RawConfigParser(allow_no_value=True) config.optionxform = str config.read(args.taxon_list_config) # connect to the database conn = sqlite3.connect(args.locus_db) c = conn.cursor() # attach to external database, if passed as option if args.extend_locus_db: log.info("Attaching extended database {}".format(os.path.basename(args.extend_locus_db))) query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_locus_db) c.execute(query) organisms = get_taxa_from_config(config, args.taxon_group) log.info("There are {} taxa in the taxon-group '[{}]' in the config file {}".format( len(organisms), args.taxon_group, os.path.basename(args.taxon_list_config) )) uces = get_uce_names(log, c) log.info("There are {} total UCE loci in the database".format(len(uces))) all_counts = [] if args.optimize: shared_uces, organisms = sample_match_groups(args, c, organisms, uces, all_counts) else: shared_uces = dont_sample_match_groups(log, args, c, organisms, uces) if args.output and organisms and not args.silent: log.info("Writing the taxa and loci in the data matrix to {}".format(args.output)) with open(args.output, 'w') as outf: outf.write("[Organisms]\n{0}\n[Loci]\n{1}\n".format( '\n'.join(sorted(organisms)), '\n'.join(sorted(shared_uces)) )) text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [(args, f) for f in files]
    sys.stdout.write("Running")
    sys.stdout.flush()
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(worker, work)
    else:
        results = map(worker, work)
    # flatten results
    all_taxa = set([item for sublist in results for item in sublist])
    print ""
    log.info("Taxon names in alignments: {0}".format(
        ','.join(list(all_taxa))
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
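# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. The worker()
# mapped over the alignment files above is assumed to read one alignment,
# tick the progress indicator, and return the taxon names it contains, which
# main() then flattens into all_taxa.
# ---------------------------------------------------------------------------
import sys

from Bio import AlignIO


def sketch_worker(work):
    args, f = work
    aln = AlignIO.read(f, args.input_format)
    sys.stdout.write(".")
    sys.stdout.flush()
    return set(taxon.id for taxon in aln)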
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [[file, args.input_format] for file in files]
    log.info("Computing summary statistics using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        summary = pool.map(get_stats, work)
    else:
        summary = map(get_stats, work)
    # alignments
    a_vars = get_lengths(summary)
    log_length_summary(log, len(summary), a_vars)
    # taxa
    t_vars = get_taxa(summary)
    log_taxa_summary(log, t_vars)
    # missing
    m_vars = get_percent_missing(summary)
    log_missing_summary(log, m_vars)
    # characters
    all_bases, sum_characters = total_characters(summary)
    sum_nucleotides = total_nucleotides(summary)
    log_char_summary(log, sum_characters, sum_nucleotides)
    # matrix
    percentages = get_matrix_percentages(t_vars[0])
    log_matrix_summary(log, percentages)
    # taxa dist.
    log_taxa_dist(log, args.show_taxon_counts, t_vars[0])
    # character dist
    log_character_dist(log, all_bases)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # determine the minimum count of taxa needed in each alignment, given --percent
    min_count = int(math.floor(args.percent * args.taxa))
    work = [[file, args.input_format, min_count, args.output] for file in files]
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(copy_over_files, work)
    else:
        results = map(copy_over_files, work)
    log.info("Copied {0} alignments of {1} total containing ≥ {2} proportion of taxa (n = {3})".format(
        sum(results),
        len(results),
        args.percent,
        min_count
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
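# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. Because
# main() sums the mapped results, copy_over_files() is assumed to return 1
# after copying an alignment that contains at least min_count taxa to the
# output directory, and 0 otherwise.
# ---------------------------------------------------------------------------
import os
import shutil

from Bio import AlignIO


def sketch_copy_over_files(work):
    f, input_format, min_count, output = work
    aln = AlignIO.read(f, input_format)
    if len(aln) >= min_count:
        shutil.copy(f, os.path.join(output, os.path.basename(f)))
        return 1
    return 0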
def main(): # get args and options args = get_args() # setup logging log, my_name = setup_logging(args) # get the input data log.info("Getting input filenames and creating output directories") input = get_input_data(args.config, args.dir) # create the output directory if it does not exist if not os.path.isdir(args.output): os.makedirs(args.output) else: pass # make the symlink directory within the output directory contig_dir = os.path.join(args.output, 'contigs') if not os.path.isdir(contig_dir): os.makedirs(contig_dir) else: pass # Get path to trinity. Standard name is `Trinity.pl`. # I usually symlink to `trinity` #TODO: Change this to system "which" - this is just to flaky in certain cases try: trinity = which('trinity')[0] except EnvironmentError: trinity = which('Trinity.pl')[0] except: raise EnvironmentError("Cannot find Trinity. Ensure it is installed and in your $PATH") for group in input: sample, dir = group # pretty print taxon status text = " Processing {} ".format(sample) log.info(text.center(65, "-")) # make a directory for sample-specific assemblies sample_dir = os.path.join(args.output, sample) os.makedirs(sample_dir) # determine how many files we're dealing with reads = get_input_files(dir, args.subfolder, log) # copy the read data over, combine singletons with read 1 # and run the assembly for PE data. if reads.r1 and reads.r2 and reads.singleton: copy_read_data(reads, sample_dir, log) combine_read_data(reads, log) output = run_trinity_pe(trinity, reads, args.cores, args.min_kmer_coverage, log) if args.clean: cleanup_trinity_assembly_folder(output, log) # we don't need to combine singleton files here. copy # the read data over and run the assembly for PE data elif reads.r1 and reads.r2: copy_read_data(reads, sample_dir, log) output = run_trinity_pe(trinity, reads, args.cores, args.min_kmer_coverage, log) if args.clean: cleanup_trinity_assembly_folder(output, log) # here, we don't have PE data, so copy the file over # and run the assembly for SE data elif reads.r1: copy_read_data(reads, sample_dir, log) output = run_trinity_se(trinity, reads, args.cores, args.min_kmer_coverage, log) if args.clean: cleanup_trinity_assembly_folder(output, log) # generate symlinks to assembled contigs generate_symlinks(contig_dir, sample, reads, log) text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
def main(): args = get_args() # setup logging log, my_name = setup_logging(args) # parse the config file - allowing no values (e.g. no ":" in config file) config = ConfigParser.RawConfigParser(allow_no_value=True) config.optionxform = str config.read(args.match_count_output) # connect to the database conn = sqlite3.connect(args.locus_db) c = conn.cursor() # attach to external database, if passed as option if args.extend_locus_db: log.info("Attaching extended database {}".format(os.path.basename(args.extend_locus_db))) query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_locus_db) c.execute(query) organisms = get_names_from_config(config, 'Organisms') log.info("There are {} taxa in the match-count-config file named {}".format( len(organisms), os.path.basename(args.match_count_output) )) uces = get_names_from_config(config, 'Loci') if not args.incomplete_matrix: log.info("There are {} shared UCE loci in a COMPLETE matrix".format(len(uces))) else: log.info("There are {} UCE loci in an INCOMPLETE matrix".format(len(uces))) regex = re.compile("[N,n]{1,21}") if args.incomplete_matrix: incomplete_outf = open(args.incomplete_matrix, 'w') with open(args.output, 'w') as uce_fasta_out: for organism in organisms: text = "Getting UCE loci for {0}".format(organism) log.info(text.center(65, "-")) written = [] # going to need to do something more generic w/ suffixes name = organism.replace('_', '-') if args.incomplete_matrix: if not organism.endswith('*'): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True) elif args.extend_locus_contigs: # remove the asterisk name = name.rstrip('*') reads = find_file(args.extend_locus_contigs, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True) else: if not name.endswith('*'): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces) elif name.endswith('*') and args.extend_locus_contigs: # remove the asterisk name = name.rstrip('*') reads = find_file(args.extend_locus_contigs, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True) count = 0 log.info("There are {} UCE loci for {}".format(len(node_dict), organism)) log.info("Parsing and renaming contigs for {}".format(organism)) for seq in SeqIO.parse(open(reads, 'rU'), 'fasta'): name = get_contig_name(seq.id).lower() if name in node_dict.keys(): seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip('*')) seq.name = '' seq.description = '' # deal with strandedness because aligners sometimes dont, which # is annoying if node_dict[name][1] == '-': seq.seq = seq.seq.reverse_complement() # Replace any occurrences of <21 Ns in a given sequence with # blanks. These should gap out during alignment. Also, replace # leading/trailing lowercase bases from velvet assemblies. # Lowercase bases indicate low coverage, and these # have been problematic in downstream alignments). 
seq, count = replace_and_remove_bases(regex, seq, count) uce_fasta_out.write(seq.format('fasta')) written.append(str(node_dict[name][0])) else: pass if count > 0: log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism)) if args.incomplete_matrix and missing: log.info("Writing missing locus information to {}".format(args.incomplete_matrix)) incomplete_outf.write("[{0}]\n".format(organism)) for name in missing: incomplete_outf.write("{0}\n".format(name)) written.append(name) assert set(written) == set(uces), "UCE names do not match" text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
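# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of the original source. Based on the
# comments in the function above, replace_and_remove_bases() is assumed to
# (1) strip leading/trailing lowercase (low-coverage) bases from velvet
# contigs and (2) delete the short runs of Ns matched by the supplied regex,
# incrementing the counter whenever a sequence was modified. The real
# implementation may differ in detail.
# ---------------------------------------------------------------------------
from Bio.Seq import Seq


def sketch_replace_and_remove_bases(regex, seq, count):
    old_seq = str(seq.seq)
    # drop leading/trailing lowercase (low-coverage) bases
    new_seq = old_seq.lstrip("acgtn").rstrip("acgtn")
    # remove short runs of ambiguous bases; these gap out during alignment
    new_seq = regex.sub("", new_seq)
    if new_seq != old_seq:
        count += 1
    seq.seq = Seq(new_seq)
    return seq, count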
def main():
    args = get_args()
    log, my_name = setup_logging(args)
    regex = re.compile(args.regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError("The directory {} already exists. Please check and remove by hand.".format(args.output))
    uces = set(new_get_probe_name(seq.id, regex) for seq in SeqIO.parse(open(args.probes, 'rU'), 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        log,
        os.path.join(args.output, 'probe.matches.sqlite'),
        organisms,
        uces
    )
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz'
        )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(
            contig,
            args.probes,
            args.min_coverage,
            args.min_identity,
            output
        )
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_contig_name(lz.name1)
                uce_name = new_get_probe_name(lz.name2, regex)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uce_dupe_contigs, uce_dupe_uces = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(uce_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(uce_dupe_uces) != 0:
                dupefile.write("[{} - probes hitting multiple contigs]\n".format(critter))
                for uce in uce_dupe_uces:
                    dupefile.write("{}:{}\n".format(uce, ', '.join(revmatches[uce])))
                dupefile.write("\n")
            if len(contigs_matching_mult_uces) != 0:
                dupefile.write("[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_uces:
                    dupefile.write("{}:{}\n".format(dupe, ', '.join(matches[dupe])))
                dupefile.write("\n")
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(
            log,
            critter,
            matches,
            contigs,
            probe_dupes,
            contigs_matching_mult_uces,
            uce_dupe_uces
        )
    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The UCE match database is in {}".format(os.path.join(args.output, "probe.matches.sqlite")))
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main(): args = get_args() # setup logging log, my_name = setup_logging(args) # parse the config file - allowing no values (e.g. no ":" in config file) config = ConfigParser.RawConfigParser(allow_no_value=True) config.optionxform = str config.read(args.config) # connect to the database conn = sqlite3.connect(args.locus_db) c = conn.cursor() # attach to external database, if passed as option organisms = get_names_from_config(config, "Organisms") log.info( "There are {} taxa in the match-count-config file named {}".format( len(organisms), os.path.basename(args.config) ) ) exons = get_names_from_config(config, "Loci") log.info("There are {} exon loci in the matrix".format(len(exons))) regex = re.compile("[N,n]{1,21}") out_dir = "/".join(args.output.split("/")[:-1]) temp_conf = os.path.join(out_dir, "config_extended") incomplete_outf = open(temp_conf, "w") with open(args.output, "w") as exon_fasta_out: for organism in organisms: text = "Getting exon loci for {0}".format(organism) log.info(text.center(65, "-")) written = [] # going to need to do something more generic w/ suffixes name = organism.replace("_", "-") if not organism.endswith("*"): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_exons(c, organism, exons, extend=False, notstrict=True) count = 0 log.info("There are {} exon loci for {}".format(len(node_dict), organism)) log.info("Parsing and renaming contigs for {}".format(organism)) for seq in SeqIO.parse(open(reads, "rU"), "fasta"): name = get_contig_name(seq.id).lower() # print "name:", name # print node_dict.keys() if name in node_dict.keys(): seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip("*")) seq.name = "" seq.description = "" # deal with strandedness because aligners sometimes dont, which # is annoying if node_dict[name][1] == "-": seq.seq = seq.seq.reverse_complement() # Replace any occurrences of <21 Ns in a given sequence with # blanks. These should gap out during alignment. Also, replace # leading/trailing lowercase bases from velvet assemblies. # Lowercase bases indicate low coverage, and these # have been problematic in downstream alignments). seq, count = replace_and_remove_bases(regex, seq, count) exon_fasta_out.write(seq.format("fasta")) # print "node_dict:", node_dict[name][0] written.append(str(node_dict[name][0])) else: pass if count > 0: log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism)) if missing: log.info("Writing missing locus information to {}".format(temp_conf)) incomplete_outf.write("[{0}]\n".format(organism)) for name in missing: incomplete_outf.write("{0}\n".format(name)) written.append(name) # print written # print exons assert set(written) == set(exons), "exon names do not match" text = " Completed {} ".format(my_name) log.info(text.center(65, "="))
def main():
    args = get_args()
    log, my_name = setup_logging(args)
    pre_regex = args.regex
    regex = re.compile("^(%s)(?:.*)" % pre_regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError("The directory {} already exists. Please check and remove by hand.".format(args.output))
    exons = set(new_get_probe_name(seq.id, regex) for seq in SeqIO.parse(open(args.reference, 'rU'), 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    for f in fasta_files:
        replace_bad_fasta_chars = "sed -i -e '/>/! s=[K,Y,R,S,M,W,k,y,r,s,m,w]=N=g' %s" % f
        os.system(replace_bad_fasta_chars)
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        log,
        os.path.join(args.output, 'probe.matches.sqlite'),
        organisms,
        exons
    )
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    kmers = {}
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz'
        )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(
            contig,
            args.reference,
            args.min_coverage,
            args.min_identity,
            output
        )
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                contig_name = get_contig_name(lz.name1)
                exon_name = new_get_probe_name(lz.name2, regex)
                if args.dupefile and exon_name in dupes:
                    probe_dupes.add(exon_name)
                else:
                    matches[contig_name].add(exon_name)
                    orientation[exon_name].add(lz.strand2)
                    revmatches[exon_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_exons = check_contigs_for_dupes(matches)
        exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_exons.union(exon_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(exon_dupe_exons) != 0:
                dupefile.write("[{} - probes hitting multiple contigs]\n".format(critter))
                for exon in exon_dupe_exons:
                    dupefile.write("{}:{}\n".format(exon, ', '.join(revmatches[exon])))
                dupefile.write("\n")
            if len(contigs_matching_mult_exons) != 0:
                dupefile.write("[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    dupefile.write("{}:{}\n".format(dupe, ', '.join(matches[dupe])))
                dupefile.write("\n")
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        # added loop to return the kmer count (sum of all kmers of target contigs)
        for lz in lastz.Reader(output):
            for element in matches:
                if re.search(r"^(\d*)\s\d*\s\d*.*", lz[1]).groups()[0] == element:
                    kmer_value = get_kmer_value(lz.name1)
                    kmers.setdefault(contig, [])
                    kmers[contig].append(kmer_value)
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(
            log,
            critter,
            matches,
            contigs,
            probe_dupes,
            contigs_matching_mult_exons,
            exon_dupe_exons
        )
    kmerfile = open(os.path.join(args.output, 'kmer_count.txt'), 'w')
    for key in kmers:
        count = 0
        for element in kmers[key]:
            count += int(element)
        kmerfile.write("%s : %d\n" % (os.path.basename(key).split('.')[0], count))
    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The exon match database is in {}".format(os.path.join(args.output, "probe.matches.sqlite")))
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
    # Access the SQL file and export a tab-separated text file
    sql_file = os.path.join(args.output, 'probe.matches.sqlite')
    tsf_out = os.path.join(args.output, 'match_table.txt')
    sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" % (args.sqlite3, sql_file, tsf_out)
    os.system(sql_cmd)
    # Create the config file for the extraction of the desired loci
    output_folder = args.output
    create_conf_cmd = "echo \"[Organisms]\" > %s/config; ls %s/*.lastz | rev | cut -d/ -f1 | rev | cut -d \"_\" -f 1 >> %s/config; echo \"[Loci]\" >> %s/config; tail -n+2 %s/match_table.txt | cut -f 1 >> %s/config" % (output_folder, output_folder, output_folder, output_folder, output_folder, output_folder)
    os.system(create_conf_cmd)
    remove_lastz = "sed -i 's/.lastz//g' %s/config" % output_folder
    os.system(remove_lastz)
def main(): # get args and options args = get_args() # setup logging log, my_name = setup_logging(args) # get the input data log.info("Getting input filenames") input = get_input_data(args.assemblo_config, None) # Get path to bwa try: bwa = which('bwa')[0] except: raise EnvironmentError("Cannot find bwa. Ensure it is installed and in your $PATH") # make the symlink directory within the output directory contig_dir = os.path.join(args.assemblies, 'contigs-trimmed') if not os.path.isdir(contig_dir): os.makedirs(contig_dir) else: pass for group in input: sample, reads = group # pretty print taxon status text = " Processing {} ".format(sample) log.info(text.center(65, "-")) # ensure that assembly exists assembly_pth = os.path.join(args.assemblies, sample) assembly = os.path.join(assembly_pth, "contigs.fasta") if not os.path.exists(assembly): raise IOError("Assembly for {} does not appear to exist.".format(sample)) if args.clean: cleanup_trinity_assembly_folder(log, assembly_pth) # determine the types of raw read data that we have fastq = get_input_files(reads, args.subfolder, log) # create the bwa index bwa_create_index_files(log, assembly) samtools_create_faidx(log, sample, assembly_pth, assembly) picard_create_reference_dict(log, sample, assembly_pth, assembly) bam = False bam_se = False if args.bwa_mem and fastq.r1 and fastq.r2: bam = bwa_mem_pe_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1, fastq.r2) bam = picard_clean_up_bam(log, sample, assembly_pth, bam, "pe") bam = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam, "pe") elif not args.bwa_mem and fastq.r1 and fastq.r2: bam = bwa_pe_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1, fastq.r2) bam = picard_clean_up_bam(log, sample, assembly_pth, bam, "pe") bam = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam, "pe") # get singleton reads for alignment if args.bwa_mem and fastq.singleton: bam_se = bwa_mem_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.singleton) bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se') bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se") # if we only have se reads, those will be in fastq.r1 only elif args.bwa_mem and not fastq.r2 and fastq.r1: bam_se = bwa_mem_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1) bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se') bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se") elif not args.bwa_mem and fastq.singleton: bam_se = bwa_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.singleton) bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se') bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se") elif not args.bwa_mem and not fastq.r2 and fastq.r1: bam_se = bwa_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1) bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se') bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se") if bam and bam_se: bam = picard_merge_two_bams(log, sample, assembly_pth, bam, bam_se) elif bam_se and not bam: bam = bam_se if not bam: raise IOError("There is no BAM file. 
Check bwa log files for problems.") samtools_index(log, sample, assembly_pth, bam) coverage = gatk_coverage(log, sample, assembly_pth, assembly, args.cores, bam) overall_contigs = get_coverage_from_gatk(log, sample, assembly_pth, coverage, args.velvet) remove_gatk_coverage_files(log, assembly_pth, coverage) trimmed_fasta_path = filter_screened_contigs_from_assembly(log, sample, assembly_pth, assembly, overall_contigs) symlink_trimmed_contigs(log, sample, contig_dir, trimmed_fasta_path) # end text = " Completed {} ".format(my_name) log.info(text.center(65, "="))