Example #1
0
def entropy_analysis(alignment_path, output_file = None, verbose = True, uniqued = False, freq_from_defline = None, weighted = False, qual_stats_dict = None, amino_acid_sequences = False):
    if freq_from_defline is None:
        # default parser: pull N out of a 'freq:N' field in a '|'-delimited defline
        freq_from_defline = lambda x: int([t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose
   
    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    # processing the alignment file..
    while alignment.next():
        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise EntropyError("Not all reads have the same length.")

        # print out process info
        if alignment.pos % 10000 == 0:
            progress.update('Reads processed: %s' % (pretty_print(alignment.pos)))
        
        # fill 'lines' variable
        if not uniqued:
            lines.append(alignment.seq)
        else:
            try:
                frequency = freq_from_defline(alignment.id)
            except IndexError:
                raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")
                
            for i in range(0, frequency):
                lines.append(alignment.seq)

        previous_alignment_length = len(alignment.seq)

    progress.end()
    if verbose:
        run.info('Number of reads', pretty_print(alignment.pos))

    alignment.close()


    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []

    for position in range(0, len(lines[0])):
        progress.update(P(int(position + 1), len(lines[0])))
   
        if len(set([x[position] for x in lines])) == 1:
            entropy_tpls.append((position, 0.0))
        else:
            column = "".join([x[position] for x in lines])

            if weighted:
                if not qual_stats_dict: 
                    raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
                e = entropy(column, l_qual = qual_stats_dict[position], amino_acid_sequences = amino_acid_sequences)
            else:
                e = entropy(column, amino_acid_sequences = amino_acid_sequences)

            if e < 0.00001:
                entropy_tpls.append((position, 0.0))
            else:
                entropy_tpls.append((position, e))

    sorted_entropy_tpls = sorted(entropy_tpls, key=operator.itemgetter(1), reverse=True)

    progress.end()


    if verbose:
        entropy_components_larger_than_0 = [e[1] for e in entropy_tpls if e[1] > 0]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis', 'None of the nucleotide positions possessed any entropy!')


    if output_file:
        entropy_output = open(output_file, 'w')
        for _component, _entropy in sorted_entropy_tpls:
            entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)
        entropy_output.close()
    
    return [x[1] for x in entropy_tpls]
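
The call below is a hypothetical usage sketch for the entropy_analysis() function above; the FASTA paths are placeholders, and the helpers the function relies on (u.SequenceSource, Progress, Run, pretty_print, P, entropy) are assumed to be provided by the surrounding Oligotyping module, as in the imports shown further down.

# Hypothetical usage sketch -- file paths are placeholders, not part of the source.
entropy_values = entropy_analysis('alignment.fa',
                                  output_file='alignment-ENTROPY.txt',
                                  verbose=True)

# For a uniqued FASTA whose deflines carry a 'freq:N' field (e.g. '>read_001|freq:42'),
# the default parser expands each sequence to its observed frequency before the
# per-column analysis:
unique_entropy_values = entropy_analysis('alignment-unique.fa', uniqued=True)
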
Example #2
0
def entropy_analysis(alignment_path,
                     output_file=None,
                     verbose=True,
                     uniqued=False,
                     freq_from_defline=None,
                     weighted=False,
                     qual_stats_dict=None,
                     amino_acid_sequences=False):
    if freq_from_defline is None:
        # default parser: pull N out of a 'freq:N' field in a '|'-delimited defline
        freq_from_defline = lambda x: int(
            [t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    # processing the alignment file..
    while alignment.next():
        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise EntropyError("Not all reads have the same length.")

        # print out process info
        if alignment.pos % 10000 == 0:
            progress.update('Reads processed: %s' %
                            (pretty_print(alignment.pos)))

        # fill 'lines' variable
        if not uniqued:
            lines.append(alignment.seq)
        else:
            try:
                frequency = freq_from_defline(alignment.id)
            except IndexError:
                raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")

            for i in range(0, frequency):
                lines.append(alignment.seq)

        previous_alignment_length = len(alignment.seq)

    progress.end()
    if verbose:
        run.info('Number of reads', pretty_print(alignment.pos))

    alignment.close()

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []

    for position in range(0, len(lines[0])):
        progress.update(P(int(position + 1), len(lines[0])))

        if len(set([x[position] for x in lines])) == 1:
            entropy_tpls.append((position, 0.0))
        else:
            column = "".join([x[position] for x in lines])

            if weighted:
                if not qual_stats_dict:
                    raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
                e = entropy(column,
                            l_qual=qual_stats_dict[position],
                            amino_acid_sequences=amino_acid_sequences)
            else:
                e = entropy(column, amino_acid_sequences=amino_acid_sequences)

            if e < 0.00001:
                entropy_tpls.append((position, 0.0))
            else:
                entropy_tpls.append((position, e))

    sorted_entropy_tpls = sorted(entropy_tpls,
                                 key=operator.itemgetter(1),
                                 reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [
            e[1] for e in entropy_tpls if e[1] > 0
        ]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis',
                     'None of the nucleotide positions possessed any entropy!')

    if output_file:
        entropy_output = open(output_file, 'w')
        for _component, _entropy in sorted_entropy_tpls:
            entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)
        entropy_output.close()

    return [x[1] for x in entropy_tpls]
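
As a point of reference, the per-column entropy() call used in both examples is assumed to compute Shannon entropy (log base 2) over the residue frequencies of one alignment column; the shannon_entropy() helper below is a minimal illustrative stand-in under that assumption, not the library's implementation (which also supports quality weighting via l_qual).

# Minimal illustrative sketch of a per-column Shannon entropy, assuming the
# real entropy() helper works on the same principle; not the library's code.
import numpy

def shannon_entropy(column, amino_acid_sequences=False):
    valid = set('ACDEFGHIKLMNPQRSTVWY-') if amino_acid_sequences else set('ATCG-')
    residues = [c for c in column.upper() if c in valid]
    if not residues:
        return 0.0
    counts = numpy.array([residues.count(r) for r in set(residues)], dtype=float)
    probabilities = counts / counts.sum()
    return float(-(probabilities * numpy.log2(probabilities)).sum())

# A fully conserved column has zero entropy; an even two-way split yields 1 bit:
# shannon_entropy('AAAA') -> 0.0, shannon_entropy('AATT') -> 1.0
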
#
# Please read the COPYING file.

import numpy as np
from numpy import log2 as log  # numpy provides log2 directly; newer SciPy versions no longer re-export it
import matplotlib.pyplot as plt

from Oligotyping.utils.random_colors import get_list_of_colors
from Oligotyping.utils.utils import get_unique_sequences_from_FASTA
from Oligotyping.utils.utils import Progress
from Oligotyping.utils.utils import Run
from Oligotyping.utils.utils import NUCL_COLORS


run = Run()
progress = Progress()


def entropy_distribution_bar(alignment, entropy_values, output_file, quick = False, no_display = False, qual_stats_dict = None, weighted = False, verbose = False):
    progress.verbose = verbose
    progress.new('Entropy Distribution Figure')
    progress.update('Computing ')

    y_maximum = max(entropy_values) + (max(entropy_values) / 10.0)
    y_maximum = 1 if y_maximum < 1 else y_maximum

    number_of_uniques_to_show = int(y_maximum * 100)

    if alignment is None:
        quick = True