def entropy_analysis(alignment_path, output_file=None, verbose=True, uniqued=False,
                     freq_from_defline=None, weighted=False, qual_stats_dict=None,
                     amino_acid_sequences=False):
    """Compute per-position entropy for the alignment at `alignment_path`.

    Returns the list of entropy values in positional order; if `output_file`
    is given, also writes the (position, entropy) pairs sorted by entropy."""
    if freq_from_defline is None:
        freq_from_defline = lambda x: int([t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    # processing the alignment file..
    while alignment.next():
        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise EntropyError("Not all reads have the same length.")

        # print out process info
        if alignment.pos % 10000 == 0:
            progress.update('Reads processed: %s' % (pretty_print(alignment.pos)))

        # fill 'lines' variable
        if not uniqued:
            lines.append(alignment.seq)
        else:
            try:
                frequency = freq_from_defline(alignment.id)
            except IndexError:
                raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")

            for i in range(0, frequency):
                lines.append(alignment.seq)

        previous_alignment_length = len(alignment.seq)

    progress.end()
    if verbose:
        run.info('Number of reads', pretty_print(alignment.pos))
    alignment.close()

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []

    for position in range(0, len(lines[0])):
        progress.update(P(int(position + 1), len(lines[0])))

        # a fully conserved column gets zero entropy without further computation
        if len(set([x[position] for x in lines])) == 1:
            entropy_tpls.append((position, 0.0))
        else:
            column = "".join([x[position] for x in lines])

            if weighted:
                if not qual_stats_dict:
                    raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
                e = entropy(column, l_qual=qual_stats_dict[position],
                            amino_acid_sequences=amino_acid_sequences)
            else:
                e = entropy(column, amino_acid_sequences=amino_acid_sequences)

            if e < 0.00001:
                entropy_tpls.append((position, 0.0))
            else:
                entropy_tpls.append((position, e))

    sorted_entropy_tpls = sorted(entropy_tpls, key=operator.itemgetter(1), reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [e[1] for e in entropy_tpls if e[1] > 0]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis',
                     'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).'
                     % (len(entropy_components_larger_than_0),
                        numpy.mean(entropy_components_larger_than_0),
                        numpy.max(entropy_components_larger_than_0),
                        numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis', 'None of the nucleotide positions possessed any entropy!')

    if output_file:
        entropy_output = open(output_file, 'w')
        for _component, _entropy in sorted_entropy_tpls:
            entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)
        entropy_output.close()

    return [x[1] for x in entropy_tpls]
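# --- Illustration only: a minimal sketch of the kind of per-column computation
# the entropy() helper used above performs. The project's actual helper (not
# shown in this excerpt) also supports quality weighting via `l_qual` and an
# amino-acid character set; this unweighted Shannon-entropy version is an
# assumption added for clarity, not the project's implementation.
import math
from collections import Counter

def shannon_entropy_sketch(column):
    """Shannon entropy (log base 2) of the character frequencies in `column`."""
    counts = Counter(column)
    total = float(len(column))
    return -sum((n / total) * math.log(n / total, 2) for n in counts.values())

# Example: a fully conserved column has zero entropy, a 50/50 column has 1 bit:
#   shannon_entropy_sketch('AAAAAAAA')  -> 0.0
#   shannon_entropy_sketch('AAAATTTT')  -> 1.0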
#
# Please read the COPYING file.

import numpy as np
from scipy import log2 as log
import matplotlib.pyplot as plt

from Oligotyping.utils.random_colors import get_list_of_colors
from Oligotyping.utils.utils import get_unique_sequences_from_FASTA
from Oligotyping.utils.utils import Progress
from Oligotyping.utils.utils import Run
from Oligotyping.utils.utils import NUCL_COLORS

run = Run()
progress = Progress()


def entropy_distribution_bar(alignment, entropy_values, output_file, quick=False,
                             no_display=False, qual_stats_dict=None, weighted=False,
                             verbose=False):
    """Draw a bar figure of the per-position entropy values for the alignment."""
    progress.verbose = verbose
    progress.new('Entropy Distribution Figure')
    progress.update('Computing ')

    y_maximum = max(entropy_values) + (max(entropy_values) / 10.0)
    y_maximum = 1 if y_maximum < 1 else y_maximum

    number_of_uniques_to_show = int(y_maximum * 100)

    if alignment is None:
        quick = True
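# --- Illustration only: how the two functions in this excerpt might be wired
# together, based solely on the signatures shown above. The file paths are
# hypothetical, and the assumption that the `alignment` argument of
# entropy_distribution_bar() takes the same FASTA alignment path that
# entropy_analysis() reads is not confirmed by this excerpt.
if __name__ == '__main__':
    alignment_path = 'example_alignment.fa'     # hypothetical input path
    entropy_values = entropy_analysis(alignment_path,
                                      output_file='entropy.txt',
                                      verbose=True)
    entropy_distribution_bar(alignment_path,
                             entropy_values,
                             output_file='entropy.png',
                             no_display=True)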