Code example #1
0
def get_all_tax_intervals(distribution_files, name_infix):
    """
    Collect the union of taxa intervals across all the given pickled
    distributions and save it as a single combined distribution.

    distribution_files : paths of pickled Distribution objects whose interval
                         keys are merged
    name_infix         : optional infix for the output filename (used when
                         naming sub-sets of taxa); empty string means the
                         plain "all_taxa.pickle" name

    Returns the path of the saved pickle file under BUILD_ROOT.
    """
    # several ways of doing this - e.g. just make a set out of a list of all
    # of them, however will do it using the distribution builder
    global RUN_ROOT, BUILD_ROOT

    # each reader is the list of interval keys from one pickled distribution
    interval_readers = [
        Distribution.load(distribution_file).get_distribution().keys()
        for distribution_file in distribution_files
    ]

    distob = Distribution([], 1, interval_readers)

    # intervals are (kingdom, family) pairs, both binned as discrete values
    distob.interval_locator_funcs = (
        bin_discrete_value,
        bin_discrete_value,
    )
    distob.assignments_files = ("kingdom_binning.txt", "family_binning.txt")
    # build() is called for its side effects on distob; its return value
    # was previously bound to an unused local
    build(distob)
    distob.summary()
    if len(name_infix) > 0:
        save_filename = os.path.join(BUILD_ROOT,
                                     "all_taxa_%s.pickle" % name_infix)
    else:
        save_filename = os.path.join(BUILD_ROOT, "all_taxa.pickle")
    distob.save(save_filename)
    return save_filename
Code example #2
0
def get_tax_interval_measure_space(measure, all_taxa_distribution,
                                   run_distributions):
    """
    Project each run distribution onto the sorted union of all taxa
    intervals, producing a column-per-sample measure table.

    measure               : name of the measure to project (passed through to
                            Distribution.get_projections)
    all_taxa_distribution : pickle of the combined all-taxa distribution,
                            supplying the full interval list
    run_distributions     : pickles of the per-sample distributions

    Returns (zsample_measures, zsample_measures_with_rownames):
    the first yields a header tuple of sample names followed by one tuple of
    measures per interval; the second pairs each of those rows with its
    interval name (header row named ("kingdom", "family")).
    """
    all_intervals_list = Distribution.load(
        all_taxa_distribution).get_distribution().keys()

    all_intervals_list.sort()

    sample_measures = Distribution.get_projections(run_distributions,
                                                   all_intervals_list, measure)
    # transpose: one tuple per interval, one element per sample
    zsample_measures = itertools.izip(*sample_measures)
    # single header row: sample names are the pickle basenames sans extension
    sample_name_iter = [
        tuple([
            os.path.splitext(os.path.basename(run_distribution))[0]
            for run_distribution in run_distributions
        ])
    ]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)

    # tee so the two returned iterables are independent: previously both
    # return values shared one underlying iterator, so consuming either one
    # silently drained the other
    (zsample_measures, zsample_measures_for_rownames) = itertools.tee(
        zsample_measures, 2)

    interval_name_iter = itertools.chain([("kingdom", "family")],
                                         all_intervals_list)
    zsample_measures_with_rownames = itertools.izip(
        interval_name_iter, zsample_measures_for_rownames)
    return (zsample_measures, zsample_measures_with_rownames)
Code example #3
0
File: kmer_prism.py  Project: AgResearch/DECONVQC
def summarise_distributions(distributions, options):

    measure = "frequency"
    if options["summary_type"] in ["zipfian","entropy"]:
        measure = "unsigned_information"

    kmer_intervals = Distribution.get_intervals(distributions, options["num_processes"])

    if options["alphabet"] is not None:
        kmer_intervals1 = [ interval for interval in kmer_intervals if re.search("^[%(alphabet)s]+$"%options , interval[0], re.IGNORECASE) is not None ]
        print "(restricting kmers to those from alphabet %s , deleted %d / %d kmers)"%(options["alphabet"],len(kmer_intervals) - len(kmer_intervals1) , len(kmer_intervals))
        kmer_intervals  = kmer_intervals1
        

    #print "summarising %s , %s across %s"%(measure, str(kmer_intervals), str(distributions))
    print "summarising %s , %d kmers across %s"%(measure, len(kmer_intervals), str(distributions))


    sample_measures = Distribution.get_projections(distributions, kmer_intervals, measure, False, options["num_processes"])
    zsample_measures = itertools.izip(*sample_measures)
    sample_name_iter = [tuple([os.path.splitext(os.path.basename(distribution))[0] for distribution in distributions])]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)
    interval_name_iter = itertools.chain([("kmer_pattern")],kmer_intervals)
    
    outfile=open(options["output_filename"], "w")

    if options["summary_type"] in ["entropy", "frequency"]:
        zsample_measures_with_rownames = itertools.izip(interval_name_iter, zsample_measures)
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))
        outfile.close()
    elif options["summary_type"] in ["ranks", "zipfian"]:
        # duplicate interval_name_iter - needed 3 times
        interval_name_iter_dup = itertools.tee(interval_name_iter, 3)

        # triplicate zsample_measures (0 used to get ranks; 1 used to output measures; 3 used to get distances)
        zsample_measures_dup = itertools.tee(zsample_measures,3)
        ranks = Distribution.get_rank_iter(zsample_measures_dup[0])

        # duplicate ranks (0 used to output; 1 used to get distances)
        ranks_dup = itertools.tee(ranks, 2)
        ranks_with_rownames = itertools.izip(interval_name_iter_dup[0], ranks_dup[0])

        # output ranks
        print >> outfile , "*** ranks *** :"
        for interval_rank in ranks_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_rank[0], string.join((str(item) for item in interval_rank[1]),"\t"))

        # output measures
        print >> outfile , "*** entropies *** :"
        zsample_measures_with_rownames = itertools.izip(interval_name_iter_dup[1], zsample_measures_dup[1])
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))

        # get distances
        print >> outfile , "*** distances *** :"
        (distance_matrix, point_names_sorted) = Distribution.get_zipfian_distance_matrix(zsample_measures_dup[2], ranks_dup[1])
        Distribution.print_distance_matrix(distance_matrix, point_names_sorted, outfile)
    else:
        print "warning, unknown summary type %(summary_type)s, no summary available"%options
        
        
        outfile.close()
Code example #4
0
File: kmer_prism.py  Project: AgResearch/DECONVQC
def use_kmer_prbdf(picklefile):
    distob = Distribution.load(picklefile)
    distdata = distob.get_distribution()
    for (interval, freq) in distdata.items():
        print interval, freq
Code example #5
0
File: kmer_prism.py  Project: AgResearch/DECONVQC
def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir, reverse_complement, pattern_window_length, input_driver_config):
    """
    Build (or reuse) the pickled kmer distribution for one data file.

    If the pickle already exists under builddir it is loaded and summarised;
    otherwise a Distribution is configured for the file's type (".cnt" count
    files get a dedicated reader/weight provider and a single-threaded build,
    everything else builds with a process pool) and saved.

    Returns the path of the saved pickle.
    """
    # hoisted: previously recomputed up to four times in this function
    save_filename = get_save_filename(datafile, builddir)

    if os.path.exists(save_filename):
        print("build_kmer_distribution- skipping %s as already done"%datafile)
        distob = Distribution.load(save_filename)
        distob.summary()

    else:
        print("build_kmer_distribution- processing %s"%datafile)
        filetype = get_file_type(datafile)
        distob = Distribution([datafile], num_processes)
        distob.interval_locator_parameters = (None,)
        distob.interval_locator_funcs = (bin_discrete_value,)
        distob.assignments_files = ("kmer_binning.txt",)
        distob.file_to_stream_func = seq_from_sequence_file
        distob.file_to_stream_func_xargs = [filetype,sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_sequence
        distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns

        if filetype == ".cnt":
            # count files use a different reader and weight provider, and
            # must be built single-threaded
            distob.file_to_stream_func = tag_count_from_tag_count_file
            distob.file_to_stream_func_xargs = [input_driver_config,sampling_proportion]
            distob.weight_value_provider_func = kmer_count_from_tag_count
            distdata = build(distob, use="singlethread")
        else:
            distdata = build(distob, proc_pool_size=num_processes)

        distob.save(save_filename)

        # parenthesised single-arg form for consistency with the print calls
        # above (identical behaviour under Python 2)
        print("Distribution %s has %d points distributed over %d intervals, stored in %d parts"%(save_filename, distob.point_weight, len(distdata), len(distob.part_dict)))

    return save_filename
Code example #6
0
def build_sample_tax_distribution(datafile,
                                  run_name,
                                  sample_name,
                                  tax_pattern=None,
                                  name_infix="",
                                  exclusions=None):
    """
    each record - i.e. taxa - is a bin. Build a distribution of reads across
    these bins, for each sample in a run. This is already provided by the summary files - we just collate
    all summary files and store it our own sparse prbdf structure

    (tax_pattern and name_infix are there for selecting out and naming sub-sets of taxa)

    datafile    : tab-delimited summary file with a header row containing
                  sample_name as a column
    exclusions  : None, or "nohit" to drop records whose name matches "no hit"

    Raises Exception for any other exclusions value.
    Returns the path of the saved pickle file under BUILD_ROOT.
    """
    global RUN_ROOT, BUILD_ROOT

    #print "building sample tax distribution for %s:%s using %s"%(run_name, sample_name, datafile)

    data_stream = from_tab_delimited_file(datafile)
    header = data_stream.next()
    sample_index = header.index(sample_name)
    # each generator yields (taxname, taxname2, count) triples for records
    # with a non-zero count in this sample's column, further filtered by
    # tax_pattern and/or the "no hit" exclusion as requested
    if exclusions is None:
        if tax_pattern is None:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0)  # taxname, count
        else:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0 and re.search(
                    tax_pattern, record[0], re.IGNORECASE) is not None
            )  # taxname, count
    elif exclusions == "nohit":
        if tax_pattern is None:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream if float(record[sample_index]) > 0
                and re.search("no\s*hit", record[0], re.IGNORECASE) is None)
        else:
            data_stream = (
                (record[0], record[1], record[sample_index])
                for record in data_stream
                if float(record[sample_index]) > 0 and re.search(
                    tax_pattern, record[0], re.IGNORECASE) is not None
                and re.search("no\s*hit", record[0], re.IGNORECASE) is None
            )  # taxname, count
    else:
        raise Exception("unsupported exclusions spec %s" % exclusions)

    distob = Distribution(None, 1, [data_stream])

    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.assignments_files = ["kingdom_binning.txt", "family_binning.txt"]
    distob.weight_value_provider_func = my_weight_value_provider
    distdata = build(distob, "singlethread")
    # (a dead unconditional save_filename assignment that the if/else below
    # immediately overwrote has been removed)
    if len(name_infix) > 0:
        save_filename = os.path.join(
            BUILD_ROOT,
            "%s_%s_%s.pickle" % (run_name, sample_name, name_infix))
    else:
        save_filename = os.path.join(BUILD_ROOT,
                                     "%s_%s.pickle" % (run_name, sample_name))
    distob.save(save_filename)

    #print "Distribution %s:%s has %d points distributed over %d intervals, stored in %d parts"%(run_name, sample_name,distob.point_weight, len(distdata), len(distob.part_dict))

    return save_filename
Code example #7
0
def use_prbdf(picklefile):
    """Load a pickled Distribution and list its contents."""
    Distribution.load(picklefile).list()