Example #1
def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir, reverse_complement, pattern_window_length, input_driver_config):

    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_distribution- skipping %s as already done"%datafile)
        distob = Distribution.load(get_save_filename(datafile, builddir))
        distob.summary()
        
    else:
        print("build_kmer_distribution- processing %s"%datafile)
        filetype = get_file_type(datafile)
        distob = Distribution([datafile], num_processes)
        distob.interval_locator_parameters = (None,)
        distob.interval_locator_funcs = (bin_discrete_value,)
        distob.assignments_files = ("kmer_binning.txt",)
        distob.file_to_stream_func = seq_from_sequence_file
        distob.file_to_stream_func_xargs = [filetype, sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_sequence
        distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns        
        
        if filetype == ".cnt":
            #print "DEBUG setting methods for count file"
            distob.file_to_stream_func = tag_count_from_tag_count_file
            distob.file_to_stream_func_xargs = [input_driver_config,sampling_proportion]
            distob.weight_value_provider_func = kmer_count_from_tag_count 
            distdata = build(distob, use="singlethread")
        else:
            distdata = build(distob, proc_pool_size=num_processes)
            
        distob.save(get_save_filename(datafile, builddir))
            
        print "Distribution %s has %d points distributed over %d intervals, stored in %d parts"%(get_save_filename(datafile, builddir), distob.point_weight, len(distdata), len(distob.part_dict))

    return get_save_filename(datafile, builddir)
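A minimal usage sketch for the function above, assuming it and its helpers are importable from a module (the module name kmer_analysis, the file names, and the pattern syntax are placeholders, not from the original source):

# Hypothetical driver; module name, file names and pattern syntax are assumptions.
from kmer_analysis import build_kmer_distribution

saved = build_kmer_distribution(
    "sample1.fastq.gz",        # assumed sequence input file
    kmer_patterns=["NNNNNN"],  # e.g. all 6-mers (assumed pattern syntax)
    sampling_proportion=0.1,   # sample roughly 10% of sequences
    num_processes=4,
    builddir="/tmp/kmer_build",
    reverse_complement=True,
    pattern_window_length=6,
    input_driver_config=None)
print("distribution saved to %s" % saved)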
Example #2
def get_all_tax_intervals(distribution_files, name_infix):
    # There are several ways of doing this - e.g. just build a set from a
    # list of all of them - but here we use the distribution builder.
    global RUN_ROOT, BUILD_ROOT

    interval_readers = [
        Distribution.load(distribution_file).get_distribution().keys()
        for distribution_file in distribution_files
    ]

    distob = Distribution([], 1, interval_readers)

    distob.interval_locator_funcs = (
        bin_discrete_value,
        bin_discrete_value,
    )
    distob.assignments_files = ("kingdom_binning.txt", "family_binning.txt")
    distdata = build(distob)
    distob.summary()
    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT,
                                     "all_taxa_%s.pickle" % name_infix)
    else:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa.pickle")
    distob.save(save_filename)
    return save_filename
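The comment at the top of get_all_tax_intervals mentions a simpler set-based alternative; here is a minimal sketch of that approach, assuming the same Distribution.load / get_distribution API used above:

# Set-union alternative mentioned in the comment above (a sketch, not from the source).
def get_all_tax_intervals_via_set(distribution_files):
    all_intervals = set()
    for distribution_file in distribution_files:
        all_intervals.update(
            Distribution.load(distribution_file).get_distribution().keys())
    return all_intervals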
Example #3
def build_locus_distribution(datafiles, weighting_method = None, locus_type="locus"):
    distob = prism(datafiles, 1)

    #distob.DEBUG = True
    if locus_type == "locus":
        distob.file_to_stream_func = my_top_locus_provider
        distob.file_to_stream_func_xargs = [weighting_method, 0, 1, 8, 9]  # i.e. pick out the first field (query), then hit accession and subject start and end
        distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
        distob.spectrum_value_provider_func = my_locus_spectrum_value_provider
    elif locus_type=="description":
        distob.file_to_stream_func = my_top_description_provider
        distob.file_to_stream_func_xargs = [weighting_method, 0, 8]  # i.e. pick out the first field (query) and the description field
        distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
        distob.spectrum_value_provider_func = my_description_spectrum_value_provider
    
    
    distdata = build(distob,"singlethread")

    print "saving distribution to %s.locus.pickle"%os.path.commonprefix(datafiles)
    distob.save("%s.locus.pickle"%os.path.commonprefix(datafiles))
    print """
    seq count %d
    locus count %d
    
    """%(distob.total_spectrum_value, len(distob.spectrum.keys()))
    distob.list()
    
    return distdata
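A hedged call sketch for both modes; the file names are placeholders, and the inputs are assumed to be the tab-delimited BLAST-style files that the stream providers expect:

# Hypothetical invocation; file names are assumptions.
blast_files = ["sampleA.blast.txt", "sampleB.blast.txt"]
locus_data = build_locus_distribution(blast_files, locus_type="locus")
description_data = build_locus_distribution(blast_files, locus_type="description")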
Example #4
def build_tax_distribution(datafile,
                           weighting_method=None,
                           column_numbers=[0, 7, 6]):
    distob = prism([datafile], 1)
    distob.file_to_stream_func = my_top_hit_provider
    #distob.DEBUG = True
    distob.file_to_stream_func_xargs = [
        weighting_method
    ] + column_numbers  # i.e. pick out the first field, then the kingdom and common-name columns
    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.spectrum_value_provider_func = my_spectrum_value_provider
    distdata = build(distob, "singlethread")
    distob.save("%s.tax.pickle" % datafile)
    return distdata
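A minimal call sketch for this variant; per the inline comment, the default column_numbers=[0, 7, 6] pick the query field plus the kingdom and common-name columns. The file name and the alternative column layout are assumptions:

# Hypothetical usage; file name and alternative column layout are assumptions.
distdata = build_tax_distribution("results.blast.txt")  # default columns [0, 7, 6]
distdata = build_tax_distribution("results.blast.txt", column_numbers=[0, 5, 4])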
Example #5
def build_tax_distribution(datafile, weighting_method, columns, moniker):
    use_columns = [int(item) for item in re.split(",", columns)]

    distob = prism([datafile], 1)

    #distob.DEBUG = True
    distob.file_to_stream_func = my_taxonomy_tuple_provider
    distob.file_to_stream_func_xargs = use_columns
    distob.interval_locator_funcs = [bin_discrete_value]
    distob.spectrum_value_provider_func = my_value_provider
    distob.spectrum_value_provider_func_xargs = [weighting_method]

    distdata = build(distob, "singlethread")

    print("saving distribution to %s.taxonomy%s.pickle" % (datafile, moniker))
    distob.save("%s.taxonomy%s.pickle" % (datafile, moniker))
    print("""
    seq count %d
    taxonomy bin count %d
    
    """ % (distob.total_spectrum_value, len(distob.spectrum.keys())))
    distob.list()

    return distdata
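A sketch of calling the keyword-driven variant above: the columns string is split on commas into integer indices, and moniker is interpolated into the output pickle name. All values shown are assumptions:

# Hypothetical usage; file name, columns and moniker are assumptions.
distdata = build_tax_distribution("results.summary.txt",
                                  weighting_method=None,
                                  columns="0,3",  # parsed to [0, 3]
                                  moniker="_family")
# writes results.summary.txt.taxonomy_family.pickle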
Example #6
def build_sample_tax_distribution(datafile,
                                  run_name,
                                  sample_name,
                                  tax_pattern=None,
                                  name_infix="",
                                  exclusions=None):
    """
    each record - i.e. taxa - is a bin. Build a distribution of reads across
    these bins, for each sample in a run. This is already provided by the summary files - we just collate
    all summary files and store it our own sparse prbdf structure

    (tax_pattern and name_infix are there for selecting out and naming sub-sets of taxa)
    """
    global RUN_ROOT, BUILD_ROOT

    #print "building sample tax distribution for %s:%s using %s"%(run_name, sample_name, datafile)

    data_stream = from_tab_delimited_file(datafile)
    header = next(data_stream)
    sample_index = header.index(sample_name)
    if exclusions is None:
        if tax_pattern is None:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0)  # taxname, count
        else:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0 and re.search(
                    tax_pattern, record[0], re.IGNORECASE) is not None
            )  # taxname, count
    elif exclusions == "nohit":
        if tax_pattern is None:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream if float(record[sample_index]) > 0
                and re.search("no\s*hit", record[0], re.IGNORECASE) is None)
        else:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0 and re.search(
                    tax_pattern, record[0], re.IGNORECASE) is not None
                and re.search("no\s*hit", record[0], re.IGNORECASE) is None
            )  # taxname, count
    else:
        raise Exception("unsupported exclusions spec %s" % exclusions)

    distob = Distribution(None, 1, [data_stream])

    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.assignments_files = ["kingdom_binning.txt", "family_binning.txt"]
    distob.weight_value_provider_func = my_weight_value_provider
    distdata = build(distob, "singlethread")
    if len(name_infix) > 0:
        save_filename = os.path.join(
            BUILD_ROOT,
            "%s_%s_%s.pickle" % (run_name, sample_name, name_infix))
    else:
        save_filename = os.path.join(BUILD_ROOT,
                                     "%s_%s.pickle" % (run_name, sample_name))
    distob.save(save_filename)

    #print "Distribution %s:%s has %d points distributed over %d intervals, stored in %d parts"%(run_name, sample_name,distob.point_weight, len(distdata), len(distob.part_dict))

    return save_filename
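A hedged sketch of driving this across the samples of a run, assuming RUN_ROOT and BUILD_ROOT are set and the sample names match columns in the summary file (all path and sample names here are placeholders):

# Hypothetical driver; the summary path and sample names are assumptions.
import os

summary_file = os.path.join(RUN_ROOT, "myrun", "summary.txt")
for sample_name in ("sample1", "sample2"):
    saved = build_sample_tax_distribution(summary_file, "myrun", sample_name,
                                          exclusions="nohit")
    print("saved %s" % saved)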
Example #7
def build_kmer_spectrum(datafile,
                        kmer_patterns,
                        sampling_proportion,
                        num_processes,
                        builddir,
                        reverse_complement,
                        pattern_window_length,
                        input_driver_config,
                        input_filetype=None,
                        weighting_method=None,
                        assemble=False,
                        number_to_assemble=100):

    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_spectrum- skipping %s as already done" % datafile)
        kmer_prism = prism.load(get_save_filename(datafile, builddir))
        kmer_prism.summary()

    else:
        print("build_kmer_spectrum- processing %s" % datafile)
        filetype = input_filetype
        if filetype is None:
            filetype = get_file_type(datafile)
        kmer_prism = prism([datafile], num_processes)
        kmer_prism.interval_locator_parameters = (None, )
        kmer_prism.interval_locator_funcs = (bin_discrete_value, )
        kmer_prism.assignments_files = ("kmer_binning.txt", )
        kmer_prism.file_to_stream_func = seq_from_sequence_file
        kmer_prism.file_to_stream_func_xargs = [filetype, sampling_proportion]
        kmer_prism.spectrum_value_provider_func = kmer_count_from_sequence

        if weighting_method is None:
            kmer_prism.spectrum_value_provider_func_xargs = [
                reverse_complement, pattern_window_length, 1
            ] + kmer_patterns
        elif weighting_method == "tag_count":
            kmer_prism.spectrum_value_provider_func_xargs = [
                reverse_complement, pattern_window_length,
                parse_weight_from_sequence_description
            ] + kmer_patterns

        if filetype == ".cnt":
            #print "DEBUG setting methods for count file"
            kmer_prism.file_to_stream_func = tag_count_from_tag_count_file
            kmer_prism.file_to_stream_func_xargs = [
                input_driver_config, sampling_proportion
            ]
            kmer_prism.spectrum_value_provider_func = kmer_count_from_tag_count
            spectrum_data = build(kmer_prism, use="singlethread")
        else:
            spectrum_data = build(kmer_prism, proc_pool_size=num_processes)

        kmer_prism.save(get_save_filename(datafile, builddir))

        print(
            "spectrum %s has %d points distributed over %d intervals, stored in %d parts"
            % (get_save_filename(datafile,
                                 builddir), kmer_prism.total_spectrum_value,
               len(spectrum_data), len(kmer_prism.part_dict)))

        if assemble:
            print("assembling low entropy kmers (lowest %d)..." %
                  number_to_assemble)
            kmer_list = sorted(
                kmer_prism.spectrum.items(), lambda x, y: cmp(y[1], x[1])
            )[0:
              number_to_assemble]  # sort in descending order and pick the first number_to_assemble
            # yields e.g.
            #[(('CGCCGC',), 26870.0), (('GCGGCG',), 25952.0),....
            print("(%s)" % str(kmer_list))
            kmer_list = [item[0][0] for item in kmer_list]
            assemble_kmer_spectrum(kmer_list,
                                   datafile,
                                   input_filetype,
                                   None,
                                   weighting_method=weighting_method)

    return get_save_filename(datafile, builddir)
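A usage sketch for the spectrum builder with assembly enabled; the .cnt input exercises the tag-count branch, and all argument values are assumptions:

# Hypothetical invocation; file names and parameter values are assumptions.
saved = build_kmer_spectrum(
    "tags.cnt",                  # tag-count input, takes the ".cnt" branch
    kmer_patterns=["NNNNNN"],    # assumed pattern syntax
    sampling_proportion=None,
    num_processes=1,
    builddir="/tmp/spectrum_build",
    reverse_complement=False,
    pattern_window_length=6,
    input_driver_config=None,
    weighting_method="tag_count",
    assemble=True,
    number_to_assemble=50)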