def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir,
                            reverse_complement, pattern_window_length, input_driver_config):
    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_distribution - skipping %s as already done" % datafile)
        distob = Distribution.load(get_save_filename(datafile, builddir))
        distob.summary()
    else:
        print("build_kmer_distribution - processing %s" % datafile)
        filetype = get_file_type(datafile)
        distob = Distribution([datafile], num_processes)
        distob.interval_locator_parameters = (None,)
        distob.interval_locator_funcs = (bin_discrete_value,)
        distob.assignments_files = ("kmer_binning.txt",)
        distob.file_to_stream_func = seq_from_sequence_file
        distob.file_to_stream_func_xargs = [filetype, sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_sequence
        distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns
        if filetype == ".cnt":
            # count files use the tag-count stream and weight providers, and are built single-threaded
            distob.file_to_stream_func = tag_count_from_tag_count_file
            distob.file_to_stream_func_xargs = [input_driver_config, sampling_proportion]
            distob.weight_value_provider_func = kmer_count_from_tag_count
            distdata = build(distob, use="singlethread")
        else:
            distdata = build(distob, proc_pool_size=num_processes)
        distob.save(get_save_filename(datafile, builddir))
        print("Distribution %s has %d points distributed over %d intervals, stored in %d parts" %
              (get_save_filename(datafile, builddir), distob.point_weight, len(distdata), len(distob.part_dict)))
    return get_save_filename(datafile, builddir)

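
# A minimal usage sketch for build_kmer_distribution, assuming a fastq input and the
# binning setup used above; the path, pattern and tuning values below are illustrative
# placeholders, not taken from any real run configuration.
def _demo_build_kmer_distribution():
    saved = build_kmer_distribution(
        "/data/run1/sample1.fastq.gz",        # hypothetical input file
        kmer_patterns=["GCGGCG"],             # hypothetical fixed patterns to tally
        sampling_proportion=0.01,             # sample roughly 1% of sequences
        num_processes=4,
        builddir="/data/run1/build",          # hypothetical build directory
        reverse_complement=True,              # also count reverse-complement matches
        pattern_window_length=6,
        input_driver_config=None)
    return saved                              # path of the saved Distribution pickle
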
def get_all_tax_intervals(distribution_files, name_infix):
    # there are several ways of doing this - e.g. just make a set out of a list of all
    # of them - however we will do it using the distribution builder
    global RUN_ROOT, BUILD_ROOT
    interval_readers = [
        Distribution.load(distribution_file).get_distribution().keys()
        for distribution_file in distribution_files
    ]
    distob = Distribution([], 1, interval_readers)
    distob.interval_locator_funcs = (bin_discrete_value, bin_discrete_value)
    distob.assignments_files = ("kingdom_binning.txt", "family_binning.txt")
    distdata = build(distob)
    distob.summary()
    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa_%s.pickle" % name_infix)
    else:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa.pickle")
    distob.save(save_filename)
    return save_filename

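
# Hedged example of collating the intervals from several per-sample distribution pickles,
# assuming they were produced by earlier builds; the filenames are hypothetical.
def _demo_get_all_tax_intervals():
    pickles = ["run1_sample1.pickle", "run1_sample2.pickle"]    # hypothetical inputs
    # saves all_taxa_plants.pickle under BUILD_ROOT and returns its path
    return get_all_tax_intervals(pickles, name_infix="plants")
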
def build_locus_distribution(datafiles, weighting_method=None, locus_type="locus"):
    distob = prism(datafiles, 1)
    #distob.DEBUG = True
    if locus_type == "locus":
        distob.file_to_stream_func = my_top_locus_provider
        # pick out the first field (query), then hit accession and subject start and end
        distob.file_to_stream_func_xargs = [weighting_method, 0, 1, 8, 9]
        distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
        distob.spectrum_value_provider_func = my_locus_spectrum_value_provider
    elif locus_type == "description":
        distob.file_to_stream_func = my_top_description_provider
        # pick out the first field (query), then the hit description
        distob.file_to_stream_func_xargs = [weighting_method, 0, 8]
        distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
        distob.spectrum_value_provider_func = my_description_spectrum_value_provider
    distdata = build(distob, "singlethread")
    print("saving distribution to %s.locus.pickle" % os.path.commonprefix(datafiles))
    distob.save("%s.locus.pickle" % os.path.commonprefix(datafiles))
    print("""
   seq count %d
   locus count %d
""" % (distob.total_spectrum_value, len(distob.spectrum.keys())))
    distob.list()
    return distdata

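
# Sketch of a locus-mode build over a set of tabular hit files (the xargs above assume
# query in column 0, accession in 1, subject start/end in 8 and 9); the filenames are
# hypothetical and the unweighted default is used.
def _demo_build_locus_distribution():
    results = ["sample1.hits.txt", "sample2.hits.txt"]   # hypothetical input files
    # saves <common prefix of the inputs>.locus.pickle and returns the built spectrum data
    return build_locus_distribution(results, weighting_method=None, locus_type="locus")
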
def build_tax_distribution(datafile, weighting_method=None, column_numbers=[0, 7, 6]):
    distob = prism([datafile], 1)
    distob.file_to_stream_func = my_top_hit_provider
    #distob.DEBUG = True
    # i.e. pick out the first field, then kingdom and common names
    distob.file_to_stream_func_xargs = [weighting_method] + column_numbers
    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.spectrum_value_provider_func = my_spectrum_value_provider
    distdata = build(distob, "singlethread")
    distob.save("%s.tax.pickle" % datafile)
    return distdata

def build_tax_distribution(datafile, weighting_method, columns, moniker):
    use_columns = [int(item) for item in re.split(",", columns)]
    distob = prism([datafile], 1)
    #distob.DEBUG = True
    distob.file_to_stream_func = my_taxonomy_tuple_provider
    distob.file_to_stream_func_xargs = use_columns
    distob.interval_locator_funcs = [bin_discrete_value]
    distob.spectrum_value_provider_func = my_value_provider
    distob.spectrum_value_provider_func_xargs = [weighting_method]
    distdata = build(distob, "singlethread")
    print("saving distribution to %s.taxonomy%s.pickle" % (datafile, moniker))
    distob.save("%s.taxonomy%s.pickle" % (datafile, moniker))
    print("""
   seq count %d
   taxonomy bin count %d
""" % (distob.total_spectrum_value, len(distob.spectrum.keys())))
    distob.list()
    return distdata

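
# Minimal sketch of the taxonomy build above: "columns" is a comma-separated string of
# zero-based column indices, parsed with re.split; the file name, column choice and
# moniker are hypothetical.
def _demo_build_tax_distribution():
    return build_tax_distribution("summary.txt",        # hypothetical summary file
                                  weighting_method=None,
                                  columns="0,3",        # parsed to [0, 3] by re.split(",", columns)
                                  moniker="_family")    # saves summary.txt.taxonomy_family.pickle
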
def build_sample_tax_distribution(datafile, run_name, sample_name, tax_pattern=None, name_infix="", exclusions=None):
    """
    Each record - i.e. taxon - is a bin. Build a distribution of reads across these bins,
    for each sample in a run. This is already provided by the summary files - we just
    collate all summary files and store the result in our own sparse prbdf structure.
    (tax_pattern and name_infix are there for selecting out and naming sub-sets of taxa)
    """
    global RUN_ROOT, BUILD_ROOT
    #print("building sample tax distribution for %s:%s using %s" % (run_name, sample_name, datafile))
    data_stream = from_tab_delimited_file(datafile)
    header = next(data_stream)
    sample_index = header.index(sample_name)
    if exclusions is None:
        if tax_pattern is None:
            # (taxname, taxname, count) records with a non-zero count for this sample
            data_stream = ((record[0], record[1], record[sample_index]) for record in data_stream
                           if float(record[sample_index]) > 0)
        else:
            # as above, restricted to taxa matching tax_pattern
            data_stream = ((record[0], record[1], record[sample_index]) for record in data_stream
                           if float(record[sample_index]) > 0
                           and re.search(tax_pattern, record[0], re.IGNORECASE) is not None)
    elif exclusions == "nohit":
        if tax_pattern is None:
            data_stream = ((record[0], record[1], record[sample_index]) for record in data_stream
                           if float(record[sample_index]) > 0
                           and re.search(r"no\s*hit", record[0], re.IGNORECASE) is None)
        else:
            data_stream = ((record[0], record[1], record[sample_index]) for record in data_stream
                           if float(record[sample_index]) > 0
                           and re.search(tax_pattern, record[0], re.IGNORECASE) is not None
                           and re.search(r"no\s*hit", record[0], re.IGNORECASE) is None)
    else:
        raise Exception("unsupported exclusions spec %s" % exclusions)

    distob = Distribution(None, 1, [data_stream])
    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.assignments_files = ["kingdom_binning.txt", "family_binning.txt"]
    distob.weight_value_provider_func = my_weight_value_provider
    distdata = build(distob, "singlethread")
    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT, "%s_%s_%s.pickle" % (run_name, sample_name, name_infix))
    else:
        save_filename = os.path.join(BUILD_ROOT, "%s_%s.pickle" % (run_name, sample_name))
    distob.save(save_filename)
    #print("Distribution %s:%s has %d points distributed over %d intervals, stored in %d parts" % (run_name, sample_name, distob.point_weight, len(distdata), len(distob.part_dict)))
    return save_filename

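
# Hedged example of a per-sample build that keeps only taxa matching a pattern and drops
# "no hit" rows; the run, sample and file names are placeholders.
def _demo_build_sample_tax_distribution():
    return build_sample_tax_distribution("all_samples_summary.txt",  # hypothetical collated summary
                                         run_name="run1",
                                         sample_name="sample1",      # must match a column header in the summary
                                         tax_pattern="viridiplantae",
                                         name_infix="plants",
                                         exclusions="nohit")
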
def build_kmer_spectrum(datafile, kmer_patterns, sampling_proportion, num_processes, builddir,
                        reverse_complement, pattern_window_length, input_driver_config,
                        input_filetype=None, weighting_method=None, assemble=False, number_to_assemble=100):
    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_spectrum - skipping %s as already done" % datafile)
        kmer_prism = prism.load(get_save_filename(datafile, builddir))
        kmer_prism.summary()
    else:
        print("build_kmer_spectrum - processing %s" % datafile)
        filetype = input_filetype
        if filetype is None:
            filetype = get_file_type(datafile)
        kmer_prism = prism([datafile], num_processes)
        kmer_prism.interval_locator_parameters = (None,)
        kmer_prism.interval_locator_funcs = (bin_discrete_value,)
        kmer_prism.assignments_files = ("kmer_binning.txt",)
        kmer_prism.file_to_stream_func = seq_from_sequence_file
        kmer_prism.file_to_stream_func_xargs = [filetype, sampling_proportion]
        kmer_prism.spectrum_value_provider_func = kmer_count_from_sequence
        if weighting_method is None:
            kmer_prism.spectrum_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns
        elif weighting_method == "tag_count":
            kmer_prism.spectrum_value_provider_func_xargs = [reverse_complement, pattern_window_length,
                                                             parse_weight_from_sequence_description] + kmer_patterns
        if filetype == ".cnt":
            # count files use the tag-count stream and value providers, and are built single-threaded
            kmer_prism.file_to_stream_func = tag_count_from_tag_count_file
            kmer_prism.file_to_stream_func_xargs = [input_driver_config, sampling_proportion]
            kmer_prism.spectrum_value_provider_func = kmer_count_from_tag_count
            spectrum_data = build(kmer_prism, use="singlethread")
        else:
            spectrum_data = build(kmer_prism, proc_pool_size=num_processes)
        kmer_prism.save(get_save_filename(datafile, builddir))
        print("spectrum %s has %d points distributed over %d intervals, stored in %d parts" %
              (get_save_filename(datafile, builddir), kmer_prism.total_spectrum_value,
               len(spectrum_data), len(kmer_prism.part_dict)))
    if assemble:
        print("assembling low entropy kmers (lowest %d)..." % number_to_assemble)
        # sort in descending order of count and pick the first number_to_assemble -
        # yields e.g. [(('CGCCGC',), 26870.0), (('GCGGCG',), 25952.0), ...]
        kmer_list = sorted(kmer_prism.spectrum.items(), key=lambda item: item[1], reverse=True)[0:number_to_assemble]
        print("(%s)" % str(kmer_list))
        kmer_list = [item[0][0] for item in kmer_list]
        assemble_kmer_spectrum(kmer_list, datafile, input_filetype, None, weighting_method=weighting_method)
    return get_save_filename(datafile, builddir)

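
# Standalone sketch of the kmer selection used in the assemble step above: spectrum keys
# are 1-tuples of kmer strings and values are (possibly weighted) counts; the data here
# is made up for illustration.
def _demo_select_top_kmers(number_to_assemble=3):
    spectrum = {("CGCCGC",): 26870.0, ("GCGGCG",): 25952.0, ("ATATAT",): 104.0, ("ACGTAC",): 9.0}
    ranked = sorted(spectrum.items(), key=lambda item: item[1], reverse=True)[0:number_to_assemble]
    return [item[0][0] for item in ranked]   # e.g. ['CGCCGC', 'GCGGCG', 'ATATAT']
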