def main():
    """Screen alignments for problematic (N/X) bases, copying clean loci.

    Alignments containing ambiguous (N/n) or masked (X/x) bases are dropped
    by ``screen_files`` (subject to --do-not-screen-n / --do-not-screen-x)
    and logged; clean alignments are copied to ``args.output``.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # compile our regexes once, outside the per-file loop.
    # [Nn]+ matches any run of ambiguous bases in either case; the previous
    # "N|n+" alternation bound "+" to the lowercase branch only (a lone "N"
    # or a run of "n"s) — search-equivalent for detection, but a typo.
    n_bases = re.compile("[Nn]+")
    x_bases = re.compile("[Xx]+")
    work = [
        [file, n_bases, x_bases, args.input_format, args.output,
         args.do_not_screen_n, args.do_not_screen_x]
        for file in files
    ]
    # the original message called .format(args.cores) with no placeholder,
    # silently discarding the core count
    log.info("Screening alignments for problematic bases using {} cores".format(
        args.cores
    ))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(screen_files, work)
        # close AND join so worker processes are reaped before we report
        pool.close()
        pool.join()
    else:
        results = map(screen_files, work)
    # a worker returns None for a clean (copied) alignment, or a
    # (locus, base-type) tuple for a removed one
    count = 0
    for result in results:
        if result is None:
            count += 1
        else:
            log.warn("Removed locus {} due to presence of {} bases".format(
                result[0], result[1]
            ))
    log.info("Copied {} good alignments".format(count))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Extract one taxon's sequence from every alignment into a single FASTA.

    For each alignment file, the record whose id matches --taxon is ungapped
    ('-' and '?' removed) and written to ``args.output`` as a FASTA record
    named after the locus (the alignment's basename without extension).
    Loci where the taxon's sequence is empty after ungapping are skipped
    and logged.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    sys.stdout.write("Running")
    sys.stdout.flush()
    with open(args.output, 'w') as outf:
        for f in files:
            aln = AlignIO.read(f, args.input_format)
            # locus name comes from the alignment filename (sans extension)
            locus = os.path.splitext(os.path.basename(f))[0]
            for taxon in aln:
                if taxon.id == args.taxon:
                    # strip gap and missing-data characters
                    seq = str(taxon.seq).replace('-', '').replace('?', '')
                    record = SeqRecord(Seq(seq), id=locus, name="", description="")
                    # only write non-empty sequences (was: not len(seq) == 0)
                    if seq:
                        outf.write(record.format("fasta"))
                        # one dot per locus written, as a progress indicator
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    else:
                        log.info("Could not write {}".format(locus))
    # terminate the progress-dot line (portable replacement for `print ""`)
    sys.stdout.write("\n")
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Add missing-data designators to alignments, in parallel.

    Reads the match-count config for the full organism list, optionally reads
    an incomplete-matrix file for loci with missing taxa, then dispatches
    ``add_designators`` over every alignment.  Alignments dropped for having
    fewer than --min-taxa taxa are logged.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read config file output by match_count_config.py
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # make option names case sensitive (taxon names carry case)
    config.optionxform = str
    config.read(args.match_count_output)
    # read the incomplete matrix file that contains loci that are incomplete
    if args.incomplete_matrix:
        incomplete = ConfigParser.RawConfigParser(allow_no_value=True)
        incomplete.optionxform = str
        incomplete.read(args.incomplete_matrix)
        missing = get_missing_loci_from_conf_file(incomplete)
    else:
        missing = None
    # get the taxa in the alignment
    organisms = get_names_from_config(log, config, 'Organisms')
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [
        [file, args.input_format, organisms, args.check_missing, missing,
         args.verbatim, args.min_taxa, args.output, args.output_format]
        for file in files
    ]
    log.info("Adding missing data designators using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(add_designators, work)
        # the pool was previously leaked; close and reap the workers
        pool.close()
        pool.join()
    else:
        results = map(add_designators, work)
    # a worker returns the locus name when it dropped the alignment, else None
    for result in results:
        if result is not None:
            log.info("Dropped {} because of too few taxa (N < {})".format(
                result, args.min_taxa
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Collect and report the set of taxon names present across alignments."""
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [(args, f) for f in files]
    sys.stdout.write("Running")
    sys.stdout.flush()
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(worker, work)
        # the pool was previously leaked; close and reap the workers
        pool.close()
        pool.join()
    else:
        results = map(worker, work)
    # flatten the per-file name lists into one deduplicated set
    all_taxa = {name for sublist in results for name in sublist}
    # terminate the progress line (portable replacement for `print ""`)
    sys.stdout.write("\n")
    # sort for deterministic log output (set iteration order is arbitrary)
    log.info("Taxon names in alignments: {0}".format(
        ','.join(sorted(all_taxa))
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Compute and log summary statistics over a set of alignments.

    Per-alignment stats are gathered (in parallel when --cores > 1) by
    ``get_stats``; the aggregate length, taxon, missing-data, character,
    and matrix-completeness summaries are then logged.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [[file, args.input_format] for file in files]
    log.info("Computing summary statistics using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        summary = pool.map(get_stats, work)
        # the pool was previously leaked; close and reap the workers
        pool.close()
        pool.join()
    else:
        summary = map(get_stats, work)
    # alignments
    a_vars = get_lengths(summary)
    log_length_summary(log, len(summary), a_vars)
    # taxa
    t_vars = get_taxa(summary)
    log_taxa_summary(log, t_vars)
    # missing
    m_vars = get_percent_missing(summary)
    log_missing_summary(log, m_vars)
    # characters
    all_bases, sum_characters = total_characters(summary)
    sum_nucleotides = total_nucleotides(summary)
    log_char_summary(log, sum_characters, sum_nucleotides)
    # matrix
    percentages = get_matrix_percentages(t_vars[0])
    log_matrix_summary(log, percentages)
    # taxa dist.
    log_taxa_dist(log, args.show_taxon_counts, t_vars[0])
    # character dist
    log_character_dist(log, all_bases)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Copy alignments containing at least --percent of the total taxa.

    The minimum taxon count is floor(percent * taxa); ``copy_over_files``
    returns 1 for each alignment copied, so sum(results) is the copy count.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # determine the minimum count of taxa needed in each alignment, given --percent
    min_count = int(math.floor(args.percent * args.taxa))
    work = [[file, args.input_format, min_count, args.output] for file in files]
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(copy_over_files, work)
        # the pool was previously leaked; close and reap the workers
        pool.close()
        pool.join()
    else:
        results = map(copy_over_files, work)
    # ASCII ">=" replaces the non-ASCII "≥", which is a SyntaxError under
    # Python 2 unless the file declares a source encoding
    log.info("Copied {0} alignments of {1} total containing >= {2} proportion of taxa (n = {3})".format(
        sum(results), len(results), args.percent, min_count
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))