def get_all_tax_intervals(distribution_files, name_infix):
    # several ways of doing this - e.g. just make a set out of a list of all of them;
    # however, will do it using the distribution builder
    global RUN_ROOT, BUILD_ROOT
    interval_readers = [
        Distribution.load(distribution_file).get_distribution().keys()
        for distribution_file in distribution_files
    ]

    distob = Distribution([], 1, interval_readers)
    distob.interval_locator_funcs = (bin_discrete_value, bin_discrete_value)
    distob.assignments_files = ("kingdom_binning.txt", "family_binning.txt")
    distdata = build(distob)
    distob.summary()

    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa_%s.pickle" % name_infix)
    else:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa.pickle")

    distob.save(save_filename)
    return save_filename
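# Usage sketch (hypothetical file names; assumes BUILD_ROOT is configured and the
# listed pickles were written by build_sample_tax_distribution below):
#
#   distribution_files = [os.path.join(BUILD_ROOT, "run1_sample1.pickle"),
#                         os.path.join(BUILD_ROOT, "run1_sample2.pickle")]
#   all_taxa_file = get_all_tax_intervals(distribution_files, "eukaryota")
#   # -> BUILD_ROOT/all_taxa_eukaryota.pickle, containing the union of
#   #    (kingdom, family) intervals seen across the per-sample distributions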
def get_tax_interval_measure_space(measure, all_taxa_distribution, run_distributions):
    all_intervals_list = Distribution.load(all_taxa_distribution).get_distribution().keys()
    all_intervals_list.sort()

    sample_measures = Distribution.get_projections(run_distributions, all_intervals_list, measure)
    zsample_measures = itertools.izip(*sample_measures)

    # header row : one column name per run distribution
    sample_name_iter = [
        tuple([
            os.path.splitext(os.path.basename(run_distribution))[0]
            for run_distribution in run_distributions
        ])
    ]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)

    interval_name_iter = itertools.chain([("kingdom", "family")], all_intervals_list)
    zsample_measures_with_rownames = itertools.izip(interval_name_iter, zsample_measures)

    # note : both returned iterators draw on the same underlying stream, so a
    # caller should consume one or the other, not both
    return (zsample_measures, zsample_measures_with_rownames)
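# Usage sketch (hypothetical file names, reusing the variables from the sketch
# above). Only one of the two returned iterators should be consumed:
#
#   (_, rows) = get_tax_interval_measure_space("frequency", all_taxa_file, distribution_files)
#   for (interval, measures) in rows:
#       print interval, measures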
def summarise_distributions(distributions, options):
    measure = "frequency"
    if options["summary_type"] in ["zipfian", "entropy"]:
        measure = "unsigned_information"

    kmer_intervals = Distribution.get_intervals(distributions, options["num_processes"])

    if options["alphabet"] is not None:
        kmer_intervals1 = [
            interval for interval in kmer_intervals
            if re.search("^[%(alphabet)s]+$" % options, interval[0], re.IGNORECASE) is not None
        ]
        print "(restricting kmers to those from alphabet %s , deleted %d / %d kmers)" % (
            options["alphabet"], len(kmer_intervals) - len(kmer_intervals1), len(kmer_intervals))
        kmer_intervals = kmer_intervals1

    #print "summarising %s , %s across %s"%(measure, str(kmer_intervals), str(distributions))
    print "summarising %s , %d kmers across %s" % (measure, len(kmer_intervals), str(distributions))

    sample_measures = Distribution.get_projections(distributions, kmer_intervals, measure, False, options["num_processes"])
    zsample_measures = itertools.izip(*sample_measures)
    sample_name_iter = [tuple([os.path.splitext(os.path.basename(distribution))[0] for distribution in distributions])]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)
    # note : ("kmer_pattern") is just a parenthesised string, so use a plain list literal
    interval_name_iter = itertools.chain(["kmer_pattern"], kmer_intervals)

    outfile = open(options["output_filename"], "w")
    if options["summary_type"] in ["entropy", "frequency"]:
        zsample_measures_with_rownames = itertools.izip(interval_name_iter, zsample_measures)
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s" % (interval_measure[0], string.join((str(item) for item in interval_measure[1]), "\t"))
    elif options["summary_type"] in ["ranks", "zipfian"]:
        # duplicate interval_name_iter - needed twice (ranks output and measures output)
        interval_name_iter_dup = itertools.tee(interval_name_iter, 2)
        # triplicate zsample_measures (0 used to get ranks; 1 used to output measures; 2 used to get distances)
        zsample_measures_dup = itertools.tee(zsample_measures, 3)

        ranks = Distribution.get_rank_iter(zsample_measures_dup[0])
        # duplicate ranks (0 used to output; 1 used to get distances)
        ranks_dup = itertools.tee(ranks, 2)
        ranks_with_rownames = itertools.izip(interval_name_iter_dup[0], ranks_dup[0])

        # output ranks
        print >> outfile, "*** ranks *** :"
        for interval_rank in ranks_with_rownames:
            print >> outfile, "%s\t%s" % (interval_rank[0], string.join((str(item) for item in interval_rank[1]), "\t"))

        # output measures
        print >> outfile, "*** entropies *** :"
        zsample_measures_with_rownames = itertools.izip(interval_name_iter_dup[1], zsample_measures_dup[1])
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s" % (interval_measure[0], string.join((str(item) for item in interval_measure[1]), "\t"))

        # get distances
        print >> outfile, "*** distances *** :"
        (distance_matrix, point_names_sorted) = Distribution.get_zipfian_distance_matrix(zsample_measures_dup[2], ranks_dup[1])
        Distribution.print_distance_matrix(distance_matrix, point_names_sorted, outfile)
    else:
        print "warning, unknown summary type %(summary_type)s, no summary available" % options

    # single close covers all branches (the original closed twice on the
    # frequency/entropy path)
    outfile.close()
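# Usage sketch (hypothetical option values and file list; the keys shown are the
# ones this function actually reads):
#
#   options = {
#       "summary_type": "zipfian",        # one of frequency / entropy / ranks / zipfian
#       "num_processes": 4,
#       "alphabet": "ACGT",               # or None to keep all kmers
#       "output_filename": "summary.txt",
#   }
#   summarise_distributions([os.path.join("./build", "sample1.fastq.pickle")], options)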
def use_kmer_prbdf(picklefile):
    distob = Distribution.load(picklefile)
    distdata = distob.get_distribution()
    for (interval, freq) in distdata.items():
        print interval, freq
def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir,
                            reverse_complement, pattern_window_length, input_driver_config):
    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_distribution - skipping %s as already done" % datafile)
        distob = Distribution.load(get_save_filename(datafile, builddir))
        distob.summary()
    else:
        print("build_kmer_distribution - processing %s" % datafile)
        filetype = get_file_type(datafile)

        distob = Distribution([datafile], num_processes)
        distob.interval_locator_parameters = (None,)
        distob.interval_locator_funcs = (bin_discrete_value,)
        distob.assignments_files = ("kmer_binning.txt",)
        distob.file_to_stream_func = seq_from_sequence_file
        distob.file_to_stream_func_xargs = [filetype, sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_sequence
        distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns

        if filetype == ".cnt":
            #print "DEBUG setting methods for count file"
            distob.file_to_stream_func = tag_count_from_tag_count_file
            distob.file_to_stream_func_xargs = [input_driver_config, sampling_proportion]
            distob.weight_value_provider_func = kmer_count_from_tag_count
            distdata = build(distob, use="singlethread")
        else:
            distdata = build(distob, proc_pool_size=num_processes)

        distob.save(get_save_filename(datafile, builddir))
        print "Distribution %s has %d points distributed over %d intervals, stored in %d parts" % (
            get_save_filename(datafile, builddir), distob.point_weight, len(distdata), len(distob.part_dict))

    return get_save_filename(datafile, builddir)
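# Usage sketch (hypothetical argument values; the kmer_patterns list and
# input_driver_config are passed through unchanged to the provider functions
# wired in above, so their exact form depends on those providers):
#
#   pickle_name = build_kmer_distribution("sample1.fastq", ["NNNNNN"],
#                                         sampling_proportion=None,
#                                         num_processes=4, builddir="./build",
#                                         reverse_complement=True,
#                                         pattern_window_length=6,
#                                         input_driver_config=None)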
def build_sample_tax_distribution(datafile, run_name, sample_name, tax_pattern=None, name_infix="", exclusions=None):
    """
    each record - i.e. taxon - is a bin. Build a distribution of reads across these
    bins, for each sample in a run. This is already provided by the summary files -
    we just collate all summary files and store them in our own sparse prbdf structure.
    (tax_pattern and name_infix are there for selecting out and naming sub-sets of taxa)
    """
    global RUN_ROOT, BUILD_ROOT
    #print "building sample tax distribution for %s:%s using %s"%(run_name, sample_name, datafile)

    data_stream = from_tab_delimited_file(datafile)
    header = data_stream.next()
    sample_index = header.index(sample_name)

    # each filtered stream yields (taxname, record[1], count) tuples for taxa with
    # a non-zero count in this sample
    if exclusions is None:
        if tax_pattern is None:
            data_stream = ((record[0], record[1], record[sample_index])
                           for record in data_stream
                           if float(record[sample_index]) > 0)
        else:
            data_stream = ((record[0], record[1], record[sample_index])
                           for record in data_stream
                           if float(record[sample_index]) > 0 and
                           re.search(tax_pattern, record[0], re.IGNORECASE) is not None)
    elif exclusions == "nohit":
        if tax_pattern is None:
            data_stream = ((record[0], record[1], record[sample_index])
                           for record in data_stream
                           if float(record[sample_index]) > 0 and
                           re.search(r"no\s*hit", record[0], re.IGNORECASE) is None)
        else:
            data_stream = ((record[0], record[1], record[sample_index])
                           for record in data_stream
                           if float(record[sample_index]) > 0 and
                           re.search(tax_pattern, record[0], re.IGNORECASE) is not None and
                           re.search(r"no\s*hit", record[0], re.IGNORECASE) is None)
    else:
        raise Exception("unsupported exclusions spec %s" % exclusions)

    distob = Distribution(None, 1, [data_stream])
    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.assignments_files = ["kingdom_binning.txt", "family_binning.txt"]
    distob.weight_value_provider_func = my_weight_value_provider
    distdata = build(distob, "singlethread")

    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT, "%s_%s_%s.pickle" % (run_name, sample_name, name_infix))
    else:
        save_filename = os.path.join(BUILD_ROOT, "%s_%s.pickle" % (run_name, sample_name))

    distob.save(save_filename)
    #print "Distribution %s:%s has %d points distributed over %d intervals, stored in %d parts"%(run_name, sample_name,distob.point_weight, len(distdata), len(distob.part_dict))
    return save_filename
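# Usage sketch (hypothetical summary file and names; the summary file is expected
# to be tab-delimited with a header row that contains the sample name):
#
#   pickle_name = build_sample_tax_distribution("all_samples_summary.txt", "run1",
#                                               "sample1", tax_pattern=None,
#                                               name_infix="", exclusions="nohit")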
def use_prbdf(picklefile):
    distob = Distribution.load(picklefile)
    distob.list()
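# Example (hypothetical path; assumes the pickle was written by one of the
# build_* functions above):
#
#   use_prbdf(os.path.join(BUILD_ROOT, "run1_sample1.pickle"))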