Code Example #1
def build_tax_distribution(datafile):
    distob = Distribution([datafile], 1)
    distob.file_to_stream_func = my_top_hit_provider
    #distob.DEBUG = True
    distob.file_to_stream_func_xargs = [0,7,6] # i.e. pick out first field, then kingdom, comnames
    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distdata = build(distob,"singlethread")
    distob.save("%s.pickle"%datafile)
    return distdata
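For reference, a minimal usage sketch follows: the data file name is an assumption, and the snippet simply reloads the pickle that build_tax_distribution saves alongside the input file.

# Hypothetical usage of build_tax_distribution; the data file name is an assumption.
datafile = "sample1_tax_summary.txt"
distdata = build_tax_distribution(datafile)
# build_tax_distribution saves the Distribution object next to the input file
distob = Distribution.load("%s.pickle" % datafile)
print distob.get_distribution()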
Code Example #2
File: kmer_entropy.py Project: AgResearch/prbdf
def summarise_distributions(distributions, options):

    measure = "frequency"
    if options["summary_type"] in ["zipfian","entropy"]:
        measure = "unsigned_information"

    kmer_intervals = Distribution.get_intervals(distributions, options["num_processes"])

    #print "summarising %s , %s across %s"%(measure, str(kmer_intervals), str(distributions))
    print "summarising %s , %d kmers across %s"%(measure, len(kmer_intervals), str(distributions))


    sample_measures = Distribution.get_projections(distributions, kmer_intervals, measure, False, options["num_processes"])
    zsample_measures = itertools.izip(*sample_measures)
    sample_name_iter = [tuple([os.path.splitext(os.path.basename(distribution))[0] for distribution in distributions])]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)
    interval_name_iter = itertools.chain([("kmer_pattern")],kmer_intervals)
    
    outfile=open(options["output_filename"], "w")

    if options["summary_type"] in ["entropy", "frequency"]:
        zsample_measures_with_rownames = itertools.izip(interval_name_iter, zsample_measures)
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))
    elif options["summary_type"] in ["ranks", "zipfian"]:
        # duplicate interval_name_iter - it is consumed more than once below
        interval_name_iter_dup = itertools.tee(interval_name_iter, 3)

        # triplicate zsample_measures (0 used to get ranks; 1 used to output measures; 2 used to get distances)
        zsample_measures_dup = itertools.tee(zsample_measures,3)
        ranks = Distribution.get_rank_iter(zsample_measures_dup[0])

        # duplicate ranks (0 used to output; 1 used to get distances)
        ranks_dup = itertools.tee(ranks, 2)
        ranks_with_rownames = itertools.izip(interval_name_iter_dup[0], ranks_dup[0])

        # output ranks
        print >> outfile , "*** ranks *** :"
        for interval_rank in ranks_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_rank[0], string.join((str(item) for item in interval_rank[1]),"\t"))

        # output measures
        print >> outfile , "*** entropies *** :"
        zsample_measures_with_rownames = itertools.izip(interval_name_iter_dup[1], zsample_measures_dup[1])
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))

        # get distances
        print >> outfile , "*** distances *** :"
        (distance_matrix, point_names_sorted) = Distribution.get_zipfian_distance_matrix(zsample_measures_dup[2], ranks_dup[1])
        Distribution.print_distance_matrix(distance_matrix, point_names_sorted, outfile)
    else:
        print "warning, unknown summary type %(summary_type)s, no summary available"%options

    outfile.close()
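A hedged sketch of a call to summarise_distributions: the options dictionary only needs the keys the function reads (summary_type, num_processes, output_filename); the pickle file names and option values below are illustrative assumptions.

# Illustrative call; pickle file names and option values are assumptions.
distributions = ["sample1.kmerdist.pickle", "sample2.kmerdist.pickle"]
options = {
    "summary_type": "zipfian",          # one of frequency / entropy / ranks / zipfian
    "num_processes": 4,
    "output_filename": "kmer_summary.txt",
}
summarise_distributions(distributions, options)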
Code Example #3
def get_sample_tax_frequency_distribution(sample_tax_summaries):
    sample_tax_lists = [ Distribution.load(sample_tax_summary).get_distribution().keys() for sample_tax_summary in sample_tax_summaries ] 
    all_taxa = set( reduce(lambda x,y:x+y, sample_tax_lists))
    all_taxa_list = list(all_taxa)
    all_taxa_list.sort(tax_cmp)

    #print all_taxa_list

    sample_tax_frequency_distributions = [["%s\t%s"%item for item in all_taxa_list]] + [ Distribution.load(sample_tax_summary).get_frequency_projection(all_taxa_list) for sample_tax_summary in sample_tax_summaries]

    #print sample_tax_frequency_distributions

    fd_iter = itertools.izip(*sample_tax_frequency_distributions)
    heading = itertools.izip(*[["Kingdom\tFamily"]]+[[re.split("\.",os.path.basename(path.strip()))[0]] for path in sample_tax_summaries])
    #print heading

    fd_iter = itertools.chain(heading, fd_iter)

    for record in fd_iter:
        print string.join([str(item) for item in record],"\t")
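Since get_sample_tax_frequency_distribution writes its tab-delimited table to stdout, a caller only needs the list of per-sample tax summary pickles; the paths in this sketch are assumptions.

# Hypothetical invocation; the pickle paths are assumptions.
sample_tax_summaries = ["sample1.tax_summary.pickle", "sample2.tax_summary.pickle"]
get_sample_tax_frequency_distribution(sample_tax_summaries)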
Code Example #4
File: kmer_entropy.py Project: AgResearch/prbdf
def use_kmer_prbdf(picklefile):
    distob = Distribution.load(picklefile)
    distdata = distob.get_distribution()
    for (interval, freq) in distdata.items():
        print interval, freq
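As a variation on use_kmer_prbdf, the sketch below sorts the same distribution dictionary to show the most frequent intervals rather than printing them in arbitrary order; the pickle name is an assumption.

# Assumed pickle name; prints the ten most frequent k-mer intervals.
distdata = Distribution.load("sample1.fastq.kmerdist.pickle").get_distribution()
for (interval, freq) in sorted(distdata.items(), key=lambda item: item[1], reverse=True)[:10]:
    print interval, freq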
Code Example #5
File: kmer_entropy.py Project: AgResearch/prbdf
def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir, reverse_complement, pattern_window_length, input_driver_config):

    if os.path.exists(get_save_filename(datafile, builddir)):
        print("build_kmer_distribution- skipping %s as already done"%datafile)
        distob = Distribution.load(get_save_filename(datafile, builddir))
        distob.summary()
        
    else:
        filetype = get_file_type(datafile)
        distob = Distribution([datafile], num_processes)
        distob.interval_locator_parameters = (None,)
        distob.interval_locator_funcs = (bin_discrete_value,)
        distob.assignments_files = ("kmer_binning.txt",)
        distob.file_to_stream_func = seq_from_sequence_file
        distob.file_to_stream_func_xargs = [filetype,sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_sequence
        distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns        
        
        if filetype == ".cnt":
            print "DEBUG setting methods for count file"
            distob.file_to_stream_func = tag_count_from_tag_count_file
            distob.file_to_stream_func_xargs = [input_driver_config,sampling_proportion]
            distob.weight_value_provider_func = kmer_count_from_tag_count
            
        #distdata = build(distob, use="singlethread")
        distdata = build(distob, proc_pool_size=num_processes)
        distob.save(get_save_filename(datafile, builddir))
            
        print "Distribution %s has %d points distributed over %d intervals, stored in %d parts"%(get_save_filename(datafile, builddir), distob.point_weight, len(distdata), len(distob.part_dict))

    return get_save_filename(datafile, builddir)
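Putting examples #2 and #5 together, a hedged end-to-end sketch might look like the following; the input file names, k-mer pattern specification, sampling proportion, and build directory are all assumptions rather than project defaults.

# Illustrative pipeline; argument values are assumptions, not project defaults.
datafiles = ["sample1.fastq", "sample2.fastq"]
pickles = [build_kmer_distribution(datafile,
                                   kmer_patterns=["[ACGT]{6}"],   # assumed pattern spec
                                   sampling_proportion=0.1,
                                   num_processes=4,
                                   builddir="./build",
                                   reverse_complement=True,
                                   pattern_window_length=6,
                                   input_driver_config=None)
           for datafile in datafiles]
summarise_distributions(pickles, {"summary_type": "entropy",
                                  "num_processes": 4,
                                  "output_filename": "entropy_summary.txt"})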