Ejemplo n.º 1
0
    def print_b6_file_stats(self):
        """Write a summary table of the B6 matrix to standard output.

        If ``self.matrix`` is still an empty list, ``load_b6_matrix`` is
        called first. The report shows the total number of hits followed by
        the mean, standard deviation, minimum and maximum of ten columns.

        All output goes through ``sys.stdout.write`` so the method produces
        the same text on Python 2 and Python 3 (the ``print`` statement is a
        syntax error on Python 3).
        """
        if self.matrix == []:
            self.load_b6_matrix()

        write = sys.stdout.write

        def tabular(label, value):
            # Pad the label with dots so that all values start in one column.
            write('%s %s: %s\n' % (label, '.' * (20 - len(label)), value))

        def info(column):
            values = self.matrix[column]
            return '%-10.2f %-10.2f %-10.2f %-10.2f' % (numpy.mean(values),
                                                        numpy.std(values),
                                                        numpy.min(values),
                                                        numpy.max(values))

        write('\n')
        tabular('Total Hits', pretty_print(len(self.matrix[IDENTITY])))
        write('\n')
        write('                        mean       std         min         max\n')
        write('\n')
        for label, column in (('Identity', IDENTITY),
                              ('Alignment Length', ALIGNMENT_LENGTH),
                              ('Mismatches', MISMATCHES),
                              ('Gaps', GAPS),
                              ('Query Start', Q_START),
                              ('Query End', Q_END),
                              ('Target Start', S_START),
                              ('Target End', S_END),
                              ('E-Value', E_VALUE),
                              ('Bit Score', BIT_SCORE)):
            tabular(label, info(column))
        write('\n')
Ejemplo n.º 2
0
def get_oligo_reps_dict(html_dict, html_output_directory):
    """Gather per-oligo representative data for the HTML output.

    For every oligo in ``html_dict['oligos']`` this copies the
    ``_unique.png`` image into ``html_output_directory``, reads sequences
    from the ``_unique`` file (the loop stops once ``pos`` exceeds 20),
    reads the per-column entropy file and the per-column color pickle, and
    loads the BLAST results pickle when it exists.

    Side effect: ``html_dict['blast_results_found']`` is set to True as soon
    as one BLAST results file is found.

    Returns:
        A dict with the keys 'imgs', 'fancy_seqs', 'clear_seqs',
        'frequency', 'component_references' and 'blast_results', each
        mapping an oligo to its data.
    """
    oligos = html_dict['oligos']
    rep_dir = html_dict['output_directory_for_reps']

    oligo_reps_dict = {
        'imgs': {},
        'fancy_seqs': {},
        'clear_seqs': {},
        'frequency': {},
        'component_references': {},
        'blast_results': {},
    }

    for i, oligo in enumerate(oligos):
        alignment_base_path = os.path.join(rep_dir, '%.5d_' % i + oligo)

        diversity_image_path = alignment_base_path + '_unique.png'
        diversity_image_dest = os.path.join(
            html_output_directory, os.path.basename(diversity_image_path))
        shutil.copy2(diversity_image_path, diversity_image_dest)
        oligo_reps_dict['imgs'][oligo] = os.path.basename(diversity_image_dest)

        clear_seqs = oligo_reps_dict['clear_seqs'][oligo] = []
        fancy_seqs = oligo_reps_dict['fancy_seqs'][oligo] = []
        frequencies = oligo_reps_dict['frequency'][oligo] = []

        uniques = u.SequenceSource(alignment_base_path + '_unique')
        while uniques.next() and uniques.pos <= 20:
            clear_seqs.append(uniques.seq)
            fancy_seqs.append(
                get_decorated_sequence(uniques.seq,
                                       html_dict['entropy_components']))
            frequencies.append(
                pretty_print(uniques.id.split('|')[1].split(':')[1]))
        # Release the underlying file handle once the sequences are read.
        uniques.close()

        entropy_values_per_column = [0] * html_dict['alignment_length']
        with open(alignment_base_path + '_unique_entropy') as entropy_file:
            for line in entropy_file:
                column, entropy = line.strip().split('\t')
                entropy_values_per_column[int(column)] = float(entropy)

        # NOTE(review): unpickling executes arbitrary code; this is only safe
        # if the files under rep_dir come from a trusted run -- confirm.
        with open(alignment_base_path +
                  '_unique_color_per_column.cPickle') as color_file:
            color_per_column = cPickle.load(color_file)

        # The comprehension uses its own index name so it cannot clobber the
        # outer loop variable on Python 2.
        oligo_reps_dict['component_references'][oligo] = ''.join([
            '<span style="background-color: %s;"><a onmouseover="popup(\'\\column: %d<br />entropy: %.4f\', 100)" href="">|</a></span>'
            % (color_per_column[col], col, entropy_values_per_column[col])
            for col in range(0, html_dict['alignment_length'])
        ])

        blast_results_path = alignment_base_path + '_unique_BLAST.cPickle'
        if os.path.exists(blast_results_path):
            html_dict['blast_results_found'] = True
            with open(blast_results_path) as blast_file:
                oligo_reps_dict['blast_results'][oligo] = cPickle.load(
                    blast_file)
        else:
            oligo_reps_dict['blast_results'][oligo] = None

    return oligo_reps_dict
Ejemplo n.º 3
0
    def print_b6_file_stats(self):
        """Print the hit count and per-column statistics of the B6 matrix.

        Calls ``load_b6_matrix`` first when ``self.matrix`` is an empty list.
        For each reported column the mean, standard deviation, minimum and
        maximum are shown on one dot-padded row.
        """
        if self.matrix == []:
            self.load_b6_matrix()

        def emit_row(label, value):
            # Dots fill the gap between the label and the value column.
            dots = '.' * (20 - len(label))
            sys.stdout.write('%s %s: %s\n' % (label, dots, value))

        def summarize(column):
            stats = (numpy.mean(self.matrix[column]),
                     numpy.std(self.matrix[column]),
                     numpy.min(self.matrix[column]),
                     numpy.max(self.matrix[column]))
            return '%-10.2f %-10.2f %-10.2f %-10.2f' % stats

        print()
        emit_row('Total Hits', pretty_print(len(self.matrix[IDENTITY])))
        print()
        print('                        mean       std         min         max')
        print()

        report_rows = (
            ('Identity', IDENTITY),
            ('Alignment Length', ALIGNMENT_LENGTH),
            ('Mismatches', MISMATCHES),
            ('Gaps', GAPS),
            ('Query Start', Q_START),
            ('Query End', Q_END),
            ('Target Start', S_START),
            ('Target End', S_END),
            ('E-Value', E_VALUE),
            ('Bit Score', BIT_SCORE),
        )
        for label, column in report_rows:
            emit_row(label, summarize(column))
        print()
Ejemplo n.º 4
0
def get_oligo_reps_dict(html_dict, html_output_directory):
    """Build the per-oligo dictionaries used by the HTML report.

    For each oligo listed in ``html_dict['oligos']`` the function copies its
    ``_unique.png`` image to ``html_output_directory``, reads sequences from
    the ``_unique`` file (stopping once ``pos`` goes past 20), reads the
    entropy value of every alignment column, loads the pickled per-column
    colors, and loads the pickled BLAST results if that file exists.

    ``html_dict['blast_results_found']`` is set to True when at least one
    BLAST results file is present (the input dict is modified in place).

    Returns:
        dict with the keys 'imgs', 'fancy_seqs', 'clear_seqs', 'frequency',
        'component_references' and 'blast_results'; each value maps an oligo
        to the corresponding data.
    """
    oligos, rep_dir = html_dict['oligos'], html_dict['output_directory_for_reps']

    oligo_reps_dict = {}
    for key in ('imgs', 'fancy_seqs', 'clear_seqs', 'frequency', 'component_references', 'blast_results'):
        oligo_reps_dict[key] = {}

    for i in range(0, len(oligos)):
        oligo = oligos[i]

        alignment_base_path = os.path.join(rep_dir, '%.5d_' % i + oligo)

        diversity_image_path = alignment_base_path + '_unique.png'
        diversity_image_dest = os.path.join(html_output_directory, os.path.basename(diversity_image_path))
        shutil.copy2(diversity_image_path, diversity_image_dest)
        oligo_reps_dict['imgs'][oligo] = os.path.basename(diversity_image_dest)

        unique_sequences_path = alignment_base_path + '_unique'
        uniques = u.SequenceSource(unique_sequences_path)
        oligo_reps_dict['fancy_seqs'][oligo] = []
        oligo_reps_dict['clear_seqs'][oligo] = []
        oligo_reps_dict['frequency'][oligo] = []
        while uniques.next() and uniques.pos <= 20:
            oligo_reps_dict['clear_seqs'][oligo].append(uniques.seq)
            oligo_reps_dict['fancy_seqs'][oligo].append(get_decorated_sequence(uniques.seq, html_dict['entropy_components']))
            oligo_reps_dict['frequency'][oligo].append(pretty_print(uniques.id.split('|')[1].split(':')[1]))
        # The source is not needed past this point; close it so its file
        # handle does not stay open for the rest of the run.
        uniques.close()

        entropy_file_path = alignment_base_path + '_unique_entropy'
        entropy_values_per_column = [0] * html_dict['alignment_length']
        with open(entropy_file_path) as entropy_file:
            for column, entropy in [x.strip().split('\t') for x in entropy_file]:
                entropy_values_per_column[int(column)] = float(entropy)

        # NOTE(review): pickle files are trusted here; loading a pickle from
        # an untrusted location can run arbitrary code -- confirm the source.
        with open(alignment_base_path + '_unique_color_per_column.cPickle') as color_file:
            color_per_column = cPickle.load(color_file)

        # 'column_index' keeps the comprehension from rebinding the outer
        # loop's 'i' (list comprehensions leak their variable on Python 2).
        span_template = '<span style="background-color: %s;"><a onmouseover="popup(\'\\column: %d<br />entropy: %.4f\', 100)" href="">|</a></span>'
        oligo_reps_dict['component_references'][oligo] = ''.join(
            [span_template % (color_per_column[column_index], column_index, entropy_values_per_column[column_index])
             for column_index in range(0, html_dict['alignment_length'])])

        blast_results_dict = alignment_base_path + '_unique_BLAST.cPickle'
        if os.path.exists(blast_results_dict):
            html_dict['blast_results_found'] = True
            with open(blast_results_dict) as blast_file:
                oligo_reps_dict['blast_results'][oligo] = cPickle.load(blast_file)
        else:
            oligo_reps_dict['blast_results'][oligo] = None

    return oligo_reps_dict
Ejemplo n.º 5
0
    def load_b6_matrix(self):
        """Read every entry into ``self.matrix`` as twelve column lists.

        Twelve empty lists are appended to ``self.matrix``; each entry line
        is then split on tabs and field ``i`` is converted with
        ``self.conversion[i]`` and appended to ``self.matrix[i]``. Progress
        is reported on stderr for the first entry and every 10000th one.

        Returns:
            True, always.
        """
        column_count = 12
        self.matrix.extend([] for _ in range(column_count))

        while self.next(raw = True):
            if self.pos % 10000 == 0 or self.pos == 1:
                progress_message = '\r[b6_matrix] Reading: %s' % (pretty_print(self.pos))
                sys.stderr.write(progress_message)
                sys.stderr.flush()

            fields = self.entry_line.split('\t')
            for index in range(column_count):
                target = self.matrix[index]
                raw_value = fields[index]
                target.append(self.conversion[index](raw_value))

        # Finish the carriage-return progress line.
        sys.stderr.write('\n')
        return True
Ejemplo n.º 6
0
    def load_b6_matrix(self):
        """Fill ``self.matrix`` column by column from the entries.

        Appends twelve empty lists to ``self.matrix`` and then, for each
        entry returned by ``self.next(raw=True)``, splits ``self.entry_line``
        on tabs, converts the first twelve fields with the matching callable
        in ``self.conversion`` and stores them in the matching column list.
        A progress counter is written to stderr at entry 1 and at every
        multiple of 10000.

        Returns:
            True once all entries have been read.
        """
        for _ in range(0, 12):
            self.matrix.append([])

        def convert(text, column):
            return self.conversion[column](text)

        while True:
            if not self.next(raw=True):
                break

            at_checkpoint = self.pos % 10000 == 0 or self.pos == 1
            if at_checkpoint:
                sys.stderr.write('\r[b6_matrix] Reading: %s' %
                                 (pretty_print(self.pos)))
                sys.stderr.flush()

            cells = self.entry_line.split('\t')
            for column in range(0, 12):
                self.matrix[column].append(convert(cells[column], column))

        sys.stderr.write('\n')
        return True
Ejemplo n.º 7
0
def length_distribution(fasta, output=None, title=None):
    """Plot the distribution of sequence lengths found in a FASTA source.

    Every entry is read through ``u.SequenceSource`` and measured with '-'
    characters removed. The figure has three stacked panels: a one-line
    text summary (total, mean, std, min, max), the number of sequences per
    length, and the cumulative percentage of reads per length. The figure
    is saved as ``<output>.pdf``; if that fails, as ``<output>.png``. It is
    then shown when the backend allows it.

    Args:
        fasta: value passed to ``u.SequenceSource``.
        output: path prefix for the saved figure; defaults to the source's
            ``fasta_file_path``.
        title: figure title; defaults to the source's ``fasta_file_path``.

    Raises:
        ValueError: if the source yields no sequences (``max`` of an empty
            list).
    """
    fasta = u.SequenceSource(fasta)

    sequence_lengths = []

    fasta.reset()

    while fasta.next():
        if fasta.pos % 1000 == 0 or fasta.pos == 1:
            sys.stderr.write('\r[fastalib] Reading: %s' % (fasta.pos))
            sys.stderr.flush()
        sequence_lengths.append(len(fasta.seq.replace('-', '')))

    fasta.reset()

    sys.stderr.write('\n')

    longest = max(sequence_lengths)
    # Extend the x range past the longest sequence by 1% of its length, or
    # by 10 positions when that 1% truncates to zero.
    max_seq_len = longest + (int(longest / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)
    for l in sequence_lengths:
        seq_len_distribution[l] += 1
    tallest = max(seq_len_distribution)

    plt.figure(figsize=(12, 8))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(20, 1)

    # Panel 1: text summary.
    plt.subplot(gs[1:3])
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    total_seqs = len(sequence_lengths)
    plt.text(0.02,
             0.5,
             'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s' %
             (pretty_print(total_seqs), numpy.mean(sequence_lengths),
              numpy.std(sequence_lengths), min(sequence_lengths), longest),
             va='center',
             alpha=0.8,
             size=12)

    # Panel 2: number of sequences per length.
    plt.subplot(gs[4:11])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(range(0, max_seq_len + 1),
                     seq_len_distribution,
                     y2=0,
                     color='black',
                     alpha=0.30)
    plt.ylabel('number of sequences')

    # Floor division keeps the steps integral on Python 3 as well; range()
    # does not accept a float step.
    xtickstep = (max_seq_len // 50) or 1
    ytickstep = (tallest // 20) or 1

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep),
               rotation=90,
               size='xx-small')
    plt.yticks(range(0, tallest + 1, ytickstep),
               list(range(0, tallest + 1, ytickstep)),
               size='xx-small')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=tallest + (tallest / 20.0))

    plt.figtext(0.5,
                0.96,
                '%s' % (title or fasta.fasta_file_path),
                weight='black',
                size='xx-large',
                ha='center')

    # Panel 3: cumulative percentage of reads per length.
    plt.subplot(gs[12:19])
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)
    plt.grid(True)

    # seq_len_distribution already holds the count for every length, so the
    # running total is built from it directly.
    percentages = []
    total_percentage = 0
    for i in range(0, max_seq_len):
        total_percentage += seq_len_distribution[i] * 100.0 / total_seqs
        percentages.append(total_percentage)

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep),
               rotation=90,
               size='xx-small')
    plt.yticks(range(0, 101, 5), ['%d%%' % y for y in range(0, 101, 5)],
               size='xx-small')
    plt.ylabel('percent of reads')

    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=100)
    plt.plot(percentages)
    plt.fill_between(range(0, max_seq_len + 1),
                     percentages + [100],
                     y2=0,
                     color='blue',
                     alpha=0.30)

    if output is None:
        output = fasta.fasta_file_path

    try:
        plt.savefig(output + '.pdf')
    except Exception:
        # Fall back to PNG when the PDF cannot be written.
        plt.savefig(output + '.png')

    try:
        plt.show()
    except Exception:
        # Showing the figure is best effort; the file is already saved.
        pass

    fasta.close()

    return
Ejemplo n.º 8
0
def sumvals(arg, clean=None):
    """Return the sum of the values of ``arg``.

    When ``clean`` is truthy the raw sum is returned; otherwise the sum is
    passed through ``pretty_print`` first.
    """
    total = sum(arg.values())
    return total if clean else pretty_print(total)
Ejemplo n.º 9
0
def entropy_analysis(alignment_path, output_file = None, verbose = True, uniqued = False, freq_from_defline = None, weighted = False, qual_stats_dict = None, amino_acid_sequences = False):
    """Compute the entropy of every column of an alignment.

    Args:
        alignment_path: value passed to ``u.SequenceSource``.
        output_file: if given, ``position<TAB>entropy`` lines are written to
            this path, sorted by entropy in descending order.
        verbose: controls progress reporting and ``run.info`` messages.
        uniqued: when true, each read is counted ``freq_from_defline(id)``
            times instead of once.
        freq_from_defline: callable extracting the frequency from a defline;
            the default reads the ``freq:<n>`` field of a '|'-separated id.
        weighted: when true, ``qual_stats_dict[position]`` is handed to
            ``entropy`` as ``l_qual``.
        qual_stats_dict: per-position quality data, needed when ``weighted``.
        amino_acid_sequences: forwarded to ``entropy``.

    Returns:
        A list with one entropy value per column, in column order. Values
        below 0.00001 are reported as 0.0.

    Raises:
        EntropyError: if reads differ in length, if a defline lacks the
            frequency in ``uniqued`` mode, if ``weighted`` is set without
            ``qual_stats_dict``, or if no reads were collected.
    """
    if freq_from_defline is None:
        def freq_from_defline(defline):
            return int([t.split(':')[1] for t in defline.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    # processing the alignment file..
    while alignment.next():
        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise EntropyError("Not all reads have the same length.")

        # print out process info
        if alignment.pos % 10000 == 0:
            progress.update('Reads processed: %s' % (pretty_print(alignment.pos)))

        # fill 'lines' variable
        if not uniqued:
            lines.append(alignment.seq)
        else:
            try:
                frequency = freq_from_defline(alignment.id)
            except IndexError:
                raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")

            # expand the unique read back to its original abundance
            lines.extend([alignment.seq] * frequency)

        previous_alignment_length = len(alignment.seq)

    progress.end()
    if verbose:
        run.info('Number of reads', pretty_print(alignment.pos))

    alignment.close()

    if not lines:
        # Without this check the column loop below would fail with a bare
        # IndexError on lines[0].
        raise EntropyError("No reads were found in the alignment.")

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []
    alignment_length = len(lines[0])

    for position in range(0, alignment_length):
        progress.update(P(int(position + 1), alignment_length))

        # extract the column once and reuse it for both checks
        residues = [x[position] for x in lines]

        if len(set(residues)) == 1:
            entropy_tpls.append((position, 0.0))
            continue

        column = "".join(residues)

        if weighted:
            if not qual_stats_dict:
                raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
            e = entropy(column, l_qual = qual_stats_dict[position], amino_acid_sequences = amino_acid_sequences)
        else:
            e = entropy(column, amino_acid_sequences = amino_acid_sequences)

        entropy_tpls.append((position, 0.0 if e < 0.00001 else e))

    sorted_entropy_tpls = sorted(entropy_tpls, key=operator.itemgetter(1), reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [e[1] for e in entropy_tpls if e[1] > 0]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis', 'None of the nucleotide positions posessed any entropy!')

    if output_file:
        # 'with' closes the file even if a write fails
        with open(output_file, 'w') as entropy_output:
            for _component, _entropy in sorted_entropy_tpls:
                entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
            if verbose:
                run.info('Entropy analysis output file path', output_file)

    return [x[1] for x in entropy_tpls]
Ejemplo n.º 10
0
def length_distribution(fasta, output = None, title = None):
    """Draw and save a sequence length distribution figure.

    Sequences are read through ``u.SequenceSource`` and their lengths are
    taken after removing '-' characters. Three panels are drawn: a text
    line with total / mean / std / min / max, the count of sequences at
    each length, and the cumulative percentage of reads by length. The
    figure is written to ``<output>.pdf`` (``<output>.png`` if that fails)
    and shown afterwards when possible.

    Args:
        fasta: value passed to ``u.SequenceSource``.
        output: path prefix of the saved figure; the source's
            ``fasta_file_path`` is used when omitted.
        title: figure title; the source's ``fasta_file_path`` is used when
            omitted.

    Raises:
        ValueError: when no sequence is read (``max`` of an empty list).
    """
    fasta = u.SequenceSource(fasta)

    sequence_lengths = []

    fasta.reset()

    while fasta.next():
        if fasta.pos % 1000 == 0 or fasta.pos == 1:
            sys.stderr.write('\r[fastalib] Reading: %s' % (fasta.pos))
            sys.stderr.flush()
        sequence_lengths.append(len(fasta.seq.replace('-', '')))

    fasta.reset()

    sys.stderr.write('\n')

    # x range: the longest length plus 1% of it (10 if that truncates to 0)
    max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)

    for l in sequence_lengths:
        seq_len_distribution[l] += 1

    highest_count = max(seq_len_distribution)

    plt.figure(figsize = (12, 8))
    plt.rcParams.update({'axes.linewidth' : 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(20, 1)

    # ---- text summary panel ----

    plt.subplot(gs[1:3])
    plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    total_seqs = len(sequence_lengths)
    plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'
        % (pretty_print(total_seqs),
           numpy.mean(sequence_lengths), numpy.std(sequence_lengths),
           min(sequence_lengths),
           max(sequence_lengths)),
        va = 'center', alpha = 0.8, size = 12)

    # ---- sequences per length panel ----

    plt.subplot(gs[4:11])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom = 0.01, top = 0.95, right = 0.98)

    plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)
    plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.30)
    plt.ylabel('number of sequences')

    # '//' keeps the tick steps integers on Python 3 too (range() rejects
    # float steps); on Python 2 the result is the same as before.
    xtickstep = (max_seq_len // 50) or 1
    ytickstep = (highest_count // 20) or 1

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    plt.yticks(range(0, highest_count + 1, ytickstep),
               list(range(0, highest_count + 1, ytickstep)),
               size='xx-small')
    plt.xlim(xmin = 0, xmax = max_seq_len)
    plt.ylim(ymin = 0, ymax = highest_count + (highest_count / 20.0))

    plt.figtext(0.5, 0.96, '%s' % (title or fasta.fasta_file_path), weight = 'black', size = 'xx-large', ha = 'center')

    # ---- cumulative percentage panel ----

    plt.subplot(gs[12:19])
    plt.subplots_adjust(left=0.05, bottom = 0.01, top = 0.95, right = 0.98)
    plt.grid(True)

    length_abundance = {}
    for l in sequence_lengths:
        length_abundance[l] = length_abundance.get(l, 0) + 1

    percentages = []
    total_percentage = 0
    for i in range(0, max_seq_len):
        # 'in' replaces dict.has_key(), which no longer exists on Python 3
        if i in length_abundance:
            total_percentage += length_abundance[i] * 100.0 / total_seqs
        percentages.append(total_percentage)

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    plt.yticks(range(0, 101, 5),
               ['%d%%' % y for y in range(0, 101, 5)],
               size='xx-small')
    plt.ylabel('percent of reads')

    plt.xlim(xmin = 0, xmax = max_seq_len)
    plt.ylim(ymin = 0, ymax = 100)
    plt.plot(percentages)
    plt.fill_between(range(0, max_seq_len + 1), percentages + [100], y2 = 0, color = 'blue', alpha = 0.30)

    # ---- output ----

    if output is None:
        output = fasta.fasta_file_path

    try:
        plt.savefig(output + '.pdf')
    except Exception:
        # PDF could not be written; try PNG instead
        plt.savefig(output + '.png')

    try:
        plt.show()
    except Exception:
        # displaying is optional; the figure has been saved already
        pass

    fasta.close()

    return
Ejemplo n.º 11
0
    def visualize_b6_output(self, title_hint, Q_LENGTH=101):
        """Draw a summary figure for the loaded B6 matrix and save it.

        The upper row shows the percentage of hits starting / ending at
        each query position and a bar chart of hits per percent identity
        level (100% down to 90%). The lower row holds box plots for query
        and target start/end positions, percent identity, alignment length,
        mismatches/gaps and bit score.

        The figure is saved as ``<self.b6_source>.tiff``, or as ``.png`` if
        that fails, and is then shown when the backend allows it.

        Args:
            title_hint: text used in the title; when falsy, the base name
                of ``self.b6_source`` is used instead.
            Q_LENGTH: query length used for the x axis of the upper panel.
        """
        if self.matrix == []:
            self.load_b6_matrix()

        import matplotlib.pyplot as plt
        import matplotlib.gridspec as gridspec

        def _setp(b, c='red'):
            plt.setp(b['medians'], color=c)
            plt.setp(b['whiskers'], color='black', alpha=0.6)
            plt.setp(b['boxes'], color='black', alpha=0.8)
            plt.setp(b['caps'], color='black', alpha=0.6)
            plt.setp(b['fliers'], color='#EEEEEE', alpha=0.01)

        plt.figure(figsize=(24, 12))
        plt.rcParams.update({'axes.linewidth': 0.9})
        plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

        gs = gridspec.GridSpec(2, 19)

        #
        # UPPER PANEL, Q_START AND Q_END
        #

        ax1 = plt.subplot(gs[0:15])
        plt.grid(True)

        plt.subplots_adjust(left=0.03, bottom=0.05, top=0.92, right=0.97)

        plt.title(
            'Alignment Start / End Positions for "%s" (Number of Hits: %s)' %
            (title_hint if title_hint else os.path.basename(self.b6_source),
             pretty_print(len(self.matrix[0]))))

        longest_end = max(self.matrix[Q_END])
        p1 = [0] * longest_end
        p2 = [0] * longest_end

        # Positions are 1-based; shift to 0-based list indices.
        for i in self.matrix[Q_START]:
            p1[i - 1] += 1
        for i in self.matrix[Q_END]:
            p2[i - 1] += 1

        # Totals and peaks are computed once instead of once per element.
        total_starts = sum(p1)
        total_ends = sum(p2)
        p1 = [x * 100.0 / total_starts for x in p1]
        p2 = [x * 100.0 / total_ends for x in p2]
        peak_start = max(p1)
        peak_end = max(p2)

        for i in range(0, len(p1)):
            plt.bar([i], [100],
                    color='green',
                    alpha=(p1[i] / peak_start) * 0.8,
                    width=1,
                    edgecolor='green')
        for i in range(0, len(p2)):
            plt.bar([i], [100],
                    color='purple',
                    alpha=(p2[i] / peak_end) * 0.8,
                    width=1,
                    linewidth=0)

        ax1.plot(p1, c='black', linewidth=3)
        ax1.plot(p1, c='green', label='Alignment Start Position')
        ax1.plot(p2, c='black', linewidth=3)
        ax1.plot(p2, c='red', label='Alignment End Position')
        plt.fill_between(list(range(0, len(p1))),
                         p1,
                         y2=0,
                         color='black',
                         alpha=0.5)
        plt.fill_between(list(range(0, len(p2))),
                         p2,
                         y2=0,
                         color='black',
                         alpha=0.5)

        plt.ylabel('Percent of Hits')
        plt.xlabel('Position')
        # range() needs a non-zero integer step: '/' gives a float on
        # Python 3, and Q_LENGTH below 100 would give a step of 0.
        tick_step = (Q_LENGTH // 100) or 1
        plt.xticks(list(range(0, Q_LENGTH, tick_step)),
                   list(range(1, Q_LENGTH + 1, tick_step)),
                   rotation=90,
                   size='xx-small')
        plt.yticks([t for t in range(0, 101, 10)],
                   ['%s%%' % t for t in range(0, 101, 10)],
                   size='xx-small')
        plt.ylim(ymin=0, ymax=100)
        plt.xlim(xmin=0, xmax=Q_LENGTH - 1)

        plt.legend()

        #UPPER PANEL RIGHT SIDE

        ax1b = plt.subplot(gs[16:19])
        plt.title('Percent Identity Breakdown')

        plt.grid(True)
        identities = self.matrix[IDENTITY]
        # Percentage of hits with identity >= p, for p = 90 .. 100.
        percent_brake_down = [
            sum(1 for x in identities if x >= p) * 100.0 / len(identities)
            for p in range(90, 101)
        ]

        # Differences of neighbouring levels give the share of hits inside
        # each one-percent bin; the last entry is the >= 100 bin itself.
        percent_differences = [
            percent_brake_down[i] - percent_brake_down[i + 1]
            for i in range(0, len(percent_brake_down) - 1)
        ]
        percent_differences.append(percent_brake_down[-1])
        # Bins were built from 90 up to 100 while the tick labels run from
        # 100% down to 90%, so reverse the order. Sorting by height would
        # detach bars from their labels whenever the bins are not monotonic.
        percent_differences.reverse()

        ax1b.bar([t + .05 for t in range(0, 11)],
                 percent_differences,
                 width=.9,
                 color='orange')
        plt.xlim(xmax=11)
        plt.ylim(ymax=100, ymin=0)
        plt.xticks([t + .5 for t in range(0, 11)],
                   ['%s%%' % t for t in range(100, 89, -1)],
                   rotation=90,
                   size='xx-small')
        plt.yticks([t for t in range(0, 101, 10)],
                   ['%s%%' % t for t in range(0, 101, 10)],
                   size='xx-small')
        plt.xlabel('Percent Identity Level')
        plt.ylabel('Percent of Hits')

        # BOX 1
        ax2 = plt.subplot(gs[19:22])
        plt.grid(True)
        plt.title('Query Alignment Start / End Positions')
        plt.ylabel('Position in Query')
        b2 = ax2.boxplot([self.matrix[Q_START], self.matrix[Q_END]],
                         positions=[0.5, 1.5],
                         sym=',',
                         widths=0.7)
        _setp(b2)
        plt.xticks([0.5, 1.5], ['Start', 'End'])

        # BOX 2
        ax3 = plt.subplot(gs[23:26])
        plt.grid(True)
        plt.title('Target Alignment Start / End Positions')
        plt.ylabel('Position in Target')
        b3 = ax3.boxplot([self.matrix[S_START], self.matrix[S_END]],
                         positions=[0.5, 1.5],
                         sym=',',
                         widths=0.7)
        _setp(b3)
        plt.xticks([0.5, 1.5], ['Start', 'End'])

        # BOX 3
        ax4 = plt.subplot(gs[27:29])
        plt.grid(True)
        plt.title('Percent Identity to Target')
        plt.ylabel('Percent')
        b4 = ax4.boxplot(self.matrix[IDENTITY],
                         positions=[0.5],
                         sym=',',
                         widths=0.7)
        _setp(b4, 'purple')
        plt.xticks([0.5], [])
        plt.ylim(ymax=101, ymin=0)

        # BOX 4
        ax5 = plt.subplot(gs[30:32])
        plt.grid(True)
        plt.title('Alignment Length')
        plt.ylabel('Nucleotide')
        b5 = ax5.boxplot(self.matrix[ALIGNMENT_LENGTH],
                         positions=[0.5],
                         sym=',',
                         widths=0.7)
        _setp(b5, 'orange')
        plt.xticks([0.5], [])

        # BOX 5
        ax6 = plt.subplot(gs[33:35])
        plt.grid(True)
        plt.title('Mismatches and Gaps')
        plt.ylabel('Number')
        b6 = ax6.boxplot([self.matrix[MISMATCHES], self.matrix[GAPS]],
                         positions=[0.5, 1.5],
                         sym=',',
                         widths=0.7)
        _setp(b6, 'brown')
        plt.xticks([0.5, 1.5], ['Mismatches', 'Gaps'])

        # BOX 6
        ax7 = plt.subplot(gs[36:38])
        plt.grid(True)
        plt.title('Bit Score')
        b7 = ax7.boxplot(self.matrix[BIT_SCORE],
                         positions=[0.5],
                         sym=',',
                         widths=0.7)
        _setp(b7, 'green')
        plt.xticks([0.5], [])

        try:
            plt.savefig(self.b6_source + '.tiff')
        except Exception:
            # Fall back to PNG when TIFF output is not available.
            plt.savefig(self.b6_source + '.png')

        try:
            plt.show()
        except Exception:
            # Showing is best effort; the figure is already on disk.
            pass

        return
Ejemplo n.º 12
0
    def visualize_b6_output(self, title_hint, Q_LENGTH = 101):
        """Plot an overview of the loaded B6 hits and save it to disk.

        Upper row: percentage of hits that start / end at each query
        position, plus a bar chart with the share of hits at each percent
        identity level from 100% down to 90%. Lower row: box plots of query
        and target start/end positions, percent identity, alignment length,
        mismatches and gaps, and bit score.

        The figure is written to ``<self.b6_source>.tiff``; if that fails
        it is written to ``<self.b6_source>.png``. Afterwards the figure is
        shown if the backend supports it.

        Args:
            title_hint: label used in the plot title; the base name of
                ``self.b6_source`` is used when it is falsy.
            Q_LENGTH: query length that sets the x axis of the upper panel.
        """
        if self.matrix == []:
            self.load_b6_matrix()

        import matplotlib.pyplot as plt
        import matplotlib.gridspec as gridspec

        def _setp(b, c = 'red'):
            plt.setp(b['medians'], color=c)
            plt.setp(b['whiskers'], color='black', alpha=0.6)
            plt.setp(b['boxes'], color='black', alpha=0.8)
            plt.setp(b['caps'], color='black', alpha=0.6)
            plt.setp(b['fliers'], color='#EEEEEE', alpha=0.01)

        plt.figure(figsize = (24, 12))
        plt.rcParams.update({'axes.linewidth' : 0.9})
        plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

        gs = gridspec.GridSpec(2, 19)

        #
        # UPPER PANEL, Q_START AND Q_END
        #

        ax1 = plt.subplot(gs[0:15])
        plt.grid(True)

        plt.subplots_adjust(left=0.03, bottom = 0.05, top = 0.92, right = 0.97)

        plt.title('Alignment Start / End Positions for "%s" (Number of Hits: %s)'\
              % (os.path.basename(self.b6_source) if not title_hint else title_hint, pretty_print(len(self.matrix[0]))))

        p1 = [0] * max(self.matrix[Q_END])
        p2 = [0] * max(self.matrix[Q_END])

        # positions are 1-based, list indices are 0-based
        for i in self.matrix[Q_START]:
            p1[i - 1] += 1
        for i in self.matrix[Q_END]:
            p2[i - 1] += 1

        # compute each total and maximum once rather than for every element
        p1_total = sum(p1)
        p2_total = sum(p2)
        p1 = [x * 100.0 / p1_total for x in p1]
        p2 = [x * 100.0 / p2_total for x in p2]
        p1_max = max(p1)
        p2_max = max(p2)

        for i in range(0, len(p1)):
            plt.bar([i], [100], color='green', alpha = (p1[i] / p1_max) * 0.8, width = 1, edgecolor='green')
        for i in range(0, len(p2)):
            plt.bar([i], [100], color='purple', alpha = (p2[i] / p2_max) * 0.8, width = 1, linewidth = 0)

        ax1.plot(p1, c = 'black', linewidth = 3)
        ax1.plot(p1, c = 'green', label = 'Alignment Start Position')
        ax1.plot(p2, c = 'black', linewidth = 3)
        ax1.plot(p2, c = 'red', label = 'Alignment End Position')
        plt.fill_between(range(0, len(p1)), p1, y2 = 0, color = 'black', alpha = 0.5)
        plt.fill_between(range(0, len(p2)), p2, y2 = 0, color = 'black', alpha = 0.5)

        plt.ylabel('Percent of Hits')
        plt.xlabel('Position')
        # integer, non-zero step: '/' is float division on Python 3 and a
        # Q_LENGTH below 100 would otherwise produce a step of 0
        xtick_step = (Q_LENGTH // 100) or 1
        plt.xticks(list(range(0, Q_LENGTH, xtick_step)), list(range(1, Q_LENGTH + 1, xtick_step)), rotation=90, size='xx-small')
        plt.yticks([t for t in range(0, 101, 10)], ['%s%%' % t for t in range(0, 101, 10)], size='xx-small')
        plt.ylim(ymin = 0, ymax = 100)
        plt.xlim(xmin = 0, xmax = Q_LENGTH - 1)

        plt.legend()

        #UPPER PANEL RIGHT SIDE

        ax1b = plt.subplot(gs[16:19])
        plt.title('Percent Identity Breakdown')

        plt.grid(True)
        # percentage of hits whose identity is >= p, for p = 90 .. 100
        percent_brake_down = []
        for p in range(90, 101):
            percent_brake_down.append(sum(1 for x in self.matrix[IDENTITY] if x >= p) * 100.0 / len(self.matrix[IDENTITY]))

        # share of hits inside each one-percent bin; the last entry is the
        # >= 100 level itself
        percent_differences = []
        for i in range(0, len(percent_brake_down)):
            if i < len(percent_brake_down) - 1:
                percent_differences.append(percent_brake_down[i] - percent_brake_down[i + 1])
            else:
                percent_differences.append(percent_brake_down[i])
        # The bins run from 90 to 100 but the x labels run from 100% to 90%,
        # so the list is reversed. Sorting by value would pair bars with the
        # wrong labels whenever the bins are not already in descending order.
        percent_differences.reverse()

        ax1b.bar([t + .05 for t in range(0, 11)], percent_differences, width = .9, color = 'orange')
        plt.xlim(xmax = 11)
        plt.ylim(ymax = 100, ymin = 0)
        plt.xticks([t + .5 for t in range(0, 11)], ['%s%%' % t for t in range(100, 89, -1)], rotation=90, size='xx-small')
        plt.yticks([t for t in range(0, 101, 10)], ['%s%%' % t for t in range(0, 101, 10)], size='xx-small')
        plt.xlabel('Percent Identity Level')
        plt.ylabel('Percent of Hits')

        # BOX 1
        ax2 = plt.subplot(gs[19:22])
        plt.grid(True)
        plt.title('Query Alignment Start / End Positions')
        plt.ylabel('Position in Query')
        b2 = ax2.boxplot([self.matrix[Q_START], self.matrix[Q_END]], positions=[0.5, 1.5], sym=',', widths=0.7)
        _setp(b2)
        plt.xticks([0.5, 1.5], ['Start', 'End'])

        # BOX 2
        ax3 = plt.subplot(gs[23:26])
        plt.grid(True)
        plt.title('Target Alignment Start / End Positions')
        plt.ylabel('Position in Target')
        b3 = ax3.boxplot([self.matrix[S_START], self.matrix[S_END]], positions=[0.5, 1.5], sym=',', widths=0.7)
        _setp(b3)
        plt.xticks([0.5, 1.5], ['Start', 'End'])

        # BOX 3
        ax4 = plt.subplot(gs[27:29])
        plt.grid(True)
        plt.title('Percent Identity to Target')
        plt.ylabel('Percent')
        b4 = ax4.boxplot(self.matrix[IDENTITY], positions=[0.5], sym=',', widths=0.7)
        _setp(b4, 'purple')
        plt.xticks([0.5], [])
        plt.ylim(ymax = 101, ymin = 0)

        # BOX 4
        ax5 = plt.subplot(gs[30:32])
        plt.grid(True)
        plt.title('Alignment Length')
        plt.ylabel('Nucleotide')
        b5 = ax5.boxplot(self.matrix[ALIGNMENT_LENGTH], positions=[0.5], sym=',', widths=0.7)
        _setp(b5, 'orange')
        plt.xticks([0.5], [])

        # BOX 5
        ax6 = plt.subplot(gs[33:35])
        plt.grid(True)
        plt.title('Mismatches and Gaps')
        plt.ylabel('Number')
        b6 = ax6.boxplot([self.matrix[MISMATCHES], self.matrix[GAPS]], positions=[0.5, 1.5], sym=',', widths=0.7)
        _setp(b6, 'brown')
        plt.xticks([0.5, 1.5], ['Mismatches', 'Gaps'])

        # BOX 6
        ax7 = plt.subplot(gs[36:38])
        plt.grid(True)
        plt.title('Bit Score')
        b7 = ax7.boxplot(self.matrix[BIT_SCORE], positions=[0.5], sym=',', widths=0.7)
        _setp(b7, 'green')
        plt.xticks([0.5], [])

        try:
            plt.savefig(self.b6_source + '.tiff')
        except Exception:
            # TIFF output failed; save a PNG instead
            plt.savefig(self.b6_source + '.png')

        try:
            plt.show()
        except Exception:
            # displaying is optional; the figure has been saved already
            pass

        return
Ejemplo n.º 13
0
def generate_html_output(run_info_dict,
                         html_output_directory=None,
                         entropy_figure=None):
    """Generate the static HTML report for a run and return the index path.

    Copies the packaged static assets and the result files referenced by
    `run_info_dict` into the output directory, assembles the template
    context (`html_dict`, a deep copy of `run_info_dict` with extra keys),
    renders one page per oligo when 'output_directory_for_reps' is present,
    and finally renders 'index.html'.

    Args:
        run_info_dict: mapping describing the run; read by key throughout
            (file paths, purity scores, flags such as 'generate_sets').
        html_output_directory: destination directory. Defaults to
            'HTML-OUTPUT' under run_info_dict['output_directory']; it is
            created when it does not exist.
        entropy_figure: optional path of the entropy figure, with or
            without a '.png' / '.pdf' suffix. When omitted,
            run_info_dict['entropy'] is used instead.

    Returns:
        Path of the generated 'index.html'.

    Raises:
        Whatever `shutil.copy2` raises when an essential file cannot be
        copied, and KeyError when a required key is missing.
    """
    if not html_output_directory:
        html_output_directory = os.path.join(run_info_dict['output_directory'],
                                             'HTML-OUTPUT')

    if not os.path.exists(html_output_directory):
        os.makedirs(html_output_directory)

    html_dict = copy.deepcopy(run_info_dict)

    # (path relative to `absolute`, file name inside the output directory)
    static_assets = (
        ('static/style.css', 'style.css'),
        ('static/header_1.png', 'header.png'),
        ('static/missing_image.png', 'missing.png'),
        ('static/colorbar.png', 'colorbar.png'),
        ('scripts/jquery-1.7.1.js', 'jquery-1.7.1.js'),
        ('scripts/popup.js', 'popup.js'),
        ('scripts/g.pie.js', 'g.pie.js'),
        ('scripts/g.raphael.js', 'g.raphael.js'),
        ('scripts/raphael.js', 'raphael.js'),
        ('scripts/morris.js', 'morris.js'),
    )
    for asset_source, asset_name in static_assets:
        shutil.copy2(os.path.join(absolute, asset_source),
                     os.path.join(html_output_directory, asset_name))

    def copy_as(source, dest_name, essential=True):
        """Copy `source` into the output directory as `dest_name`.

        A failed copy raises for essential files and only prints a warning
        for non-essential ones. Returns the base name of the destination
        in both cases.
        """
        dest = os.path.join(html_output_directory, dest_name)

        if essential:
            shutil.copy2(source, dest)
        else:
            # it is ok if you fail to copy files that are not
            # essential..
            try:
                shutil.copy2(source, dest)
            except EnvironmentError:
                sys.stderr.write(
                    '\n\n[HTML] Warning: Source file not found\n\tSource: "%s"\n\tDest: "%s"\n\n'
                    % (source, dest))

        return os.path.basename(dest)

    def copy_entropy_figure(base_path, ext):
        """Copy `base_path`.`ext` as 'entropy.`ext`'; only the PNG is essential."""
        return copy_as(base_path + '.%s' % ext,
                       'entropy.%s' % ext,
                       essential=(ext == 'png'))

    # embarrassingly ad-hoc:
    if entropy_figure and entropy_figure.endswith(('.pdf', '.png')):
        entropy_figure = entropy_figure[:-4]

    for ext in ['png', 'pdf']:
        figure_key = 'entropy_figure_%s' % ext
        if entropy_figure:
            html_dict[figure_key] = copy_entropy_figure(entropy_figure, ext)
        else:
            try:
                html_dict[figure_key] = copy_entropy_figure(
                    run_info_dict['entropy'], ext)
            except EnvironmentError:
                # the recorded path may end with a 4-character suffix;
                # retry with that suffix stripped
                html_dict[figure_key] = copy_entropy_figure(
                    run_info_dict['entropy'][:-4], ext)

    if run_info_dict['gexf_network_file_path']:
        html_dict['gexf_network_file_path'] = copy_as(
            run_info_dict['gexf_network_file_path'], 'network.gexf')

    if run_info_dict['sample_mapping']:
        html_dict['sample_mapping'] = copy_as(run_info_dict['sample_mapping'],
                                              'sample_mapping.txt')
    else:
        html_dict['sample_mapping'] = None

    html_dict['matrix_count_file_path'] = copy_as(
        run_info_dict['matrix_count_file_path'], 'matrix_counts.txt')
    html_dict['matrix_percent_file_path'] = copy_as(
        run_info_dict['matrix_percent_file_path'], 'matrix_percents.txt')
    html_dict['read_distribution_table_path'] = copy_as(
        run_info_dict['read_distribution_table_path'], 'read_distribution.txt')
    html_dict['environment_file_path'] = copy_as(
        run_info_dict['environment_file_path'], 'environment.txt')
    html_dict['oligos_fasta_file_path'] = copy_as(
        run_info_dict['oligos_fasta_file_path'], 'oligos.fa.txt')
    html_dict['oligos_nexus_file_path'] = copy_as(
        run_info_dict['oligos_nexus_file_path'], 'oligos.nex.txt')

    def get_figures_dict(html_dict_prefix):
        """Load a pickled figures dict and copy its figures to the output.

        Returns None when '<prefix>_file_path' is not in `html_dict`.
        Otherwise every leaf of the three-level dict is replaced by the
        extension-less name of the copied figure, or by None when the
        '.pdf' / '.png' pair is not present on disk.
        """
        html_dict_key = '%s_file_path' % html_dict_prefix
        if html_dict_key not in html_dict:
            return None

        # NOTE(review): unpickling runs arbitrary code if the file has been
        # tampered with; the path is expected to point at a file written by
        # this pipeline.
        with open(html_dict[html_dict_key]) as figures_file:
            figures_dict = cPickle.load(figures_file)

        for _map in figures_dict:
            for _func in figures_dict[_map]:
                for _op in figures_dict[_map][_func]:
                    figure_base = figures_dict[_map][_func][_op]
                    if os.path.exists(figure_base + '.pdf') \
                            and os.path.exists(figure_base + '.png'):
                        dest_base = '-'.join([_map, _func, _op])
                        copy_as(figure_base + '.pdf', '%s.pdf' % dest_base)
                        prefix = copy_as(figure_base + '.png',
                                         '%s.png' % dest_base)
                        # keep the copied name without its extension
                        figures_dict[_map][_func][_op] = '.'.join(
                            prefix.split('.')[:-1])
                    else:
                        figures_dict[_map][_func][_op] = None
        return figures_dict

    html_dict['figures_dict'] = get_figures_dict('figures_dict')
    html_dict['exclusive_figures_dict'] = get_figures_dict(
        'exclusive_figures_dict')

    if html_dict['generate_sets']:
        html_dict['across_samples_MN_file_path'] = copy_as(
            run_info_dict['across_samples_MN_file_path'],
            'across_samples_max_normalized.txt')
        html_dict['across_samples_SN_file_path'] = copy_as(
            run_info_dict['across_samples_SN_file_path'],
            'across_samples_sum_normalized.txt')
        html_dict['oligo_sets_stackbar_figure'] = copy_as(
            run_info_dict['stack_bar_with_agglomerated_oligos_file_path'],
            'stackbar_with_oligo_sets.png')
        html_dict['oligos_across_samples_figure'] = copy_as(
            run_info_dict['oligos_across_samples_file_path'],
            'oligos_across_samples.png')
        html_dict['oligotype_sets_figure'] = copy_as(
            run_info_dict['oligotype_sets_across_samples_figure_path'],
            'oligotype_sets.png')
        html_dict['matrix_count_oligo_sets_file_path'] = copy_as(
            run_info_dict['matrix_count_oligo_sets_file_path'],
            'matrix_counts_oligo_sets.txt')
        html_dict['matrix_percent_oligo_sets_file_path'] = copy_as(
            run_info_dict['matrix_percent_oligo_sets_file_path'],
            'matrix_percents_oligo_sets.txt')
        html_dict['oligotype_sets_file'] = copy_as(
            run_info_dict['oligotype_sets_file_path'], 'oligotype_sets.txt')
        # second tab-separated column of each line: comma-separated members
        with open(run_info_dict['oligotype_sets_file_path']) as sets_file:
            html_dict['oligotype_sets'] = [
                l.strip().split('\t')[1].split(',') for l in sets_file
            ]

    if 'representative_seqs_fasta_file_path' in html_dict:
        html_dict['representative_seqs_fasta_file_path'] = copy_as(
            run_info_dict['representative_seqs_fasta_file_path'],
            'oligo-representatives.fa.txt')
    else:
        html_dict['representative_seqs_fasta_file_path'] = None
    if 'blast_ref_db' in run_info_dict and os.path.exists(
            run_info_dict['blast_ref_db']):
        html_dict['blast_ref_db_path'] = copy_as(run_info_dict['blast_ref_db'],
                                                 'reference_db.fa')
    html_dict['entropy_components'] = [
        int(x) for x in html_dict['bases_of_interest_locs'].split(',')
    ]
    html_dict['samples_dict'] = get_samples_dict_from_environment_file(
        run_info_dict['environment_file_path'])
    html_dict['samples'] = sorted(html_dict['samples_dict'].keys())
    html_dict['blast_results_found'] = False

    # get alignment length
    html_dict['alignment_length'] = get_alignment_length(
        run_info_dict['alignment'])
    # include pretty names
    html_dict['pretty_names'] = pretty_names
    # get purity score colors dict; a score is mapped onto one of the 26
    # gradient colors through int(score * 25)
    html_dict['score_color_dict'] = {}
    gradient = get_list_of_colors(26, colormap='RdYlGn')
    for oligo in run_info_dict['final_purity_score_dict']:
        html_dict['score_color_dict'][oligo] = gradient[int(
            run_info_dict['final_purity_score_dict'][oligo] * 25)]
    # get total purity score color dict
    html_dict['total_score_color'] = gradient[int(
        float(run_info_dict['total_purity_score_dict']) * 25)]
    # get colors dict
    html_dict['color_dict'] = get_colors_dict(
        run_info_dict['colors_file_path'])
    # get abundant oligos list
    html_dict['oligos'] = get_oligos_list(
        run_info_dict['oligos_fasta_file_path'])
    # get oligo frequencies (summed over all samples that contain the oligo)
    html_dict['frequency'] = {}
    for oligo in html_dict['oligos']:
        html_dict['frequency'][oligo] = pretty_print(
            sum(d[oligo] for d in html_dict['samples_dict'].values()
                if oligo in d))
    # get purity score
    html_dict['purity_score'] = run_info_dict['final_purity_score_dict']
    # get total purity score
    html_dict['total_purity_score'] = run_info_dict['total_purity_score_dict']
    # get unique sequence dict (which will contain the most frequent unique sequence for given oligotype)
    if 'output_directory_for_reps' in html_dict:
        html_dict['rep_oligo_seqs_clean_dict'], html_dict[
            'rep_oligo_seqs_fancy_dict'] = get_unique_sequences_dict(html_dict)
        html_dict['oligo_reps_dict'] = get_oligo_reps_dict(
            html_dict, html_output_directory)
        html_dict['component_reference'] = ''.join([
            '<a onmouseover="popup(\'\\#%d\', 50)" href="">|</a>' % i
            for i in range(0, html_dict['alignment_length'])
        ])

    # get javascript code for sample pie-charts
    html_dict['pie_charts_js'] = render_to_string('pie_charts_js.tmpl',
                                                  html_dict)

    # FIXME: code below is very inefficient and causes a huge
    # memory issue. fix it by not using deepcopy.
    # generate individual oligotype pages
    if 'output_directory_for_reps' in html_dict:
        oligos = html_dict['oligos']
        total = len(oligos)
        for i, oligo in enumerate(oligos):
            tmp_dict = copy.deepcopy(html_dict)
            tmp_dict['oligo'] = oligo
            tmp_dict['distribution'] = get_oligo_distribution_dict(
                oligo, html_dict)
            oligo_page = os.path.join(html_output_directory,
                                      'oligo_%s.html' % oligo)

            # 1-based position plus links to the neighbouring pages
            tmp_dict['index'] = i + 1
            tmp_dict['total'] = total
            tmp_dict['prev'] = \
                'oligo_%s.html' % oligos[i - 1] if i > 0 else None
            tmp_dict['next'] = \
                'oligo_%s.html' % oligos[i + 1] if i < total - 1 else None

            rendered = render_to_string('single_oligo.tmpl', tmp_dict)

            with open(oligo_page, 'w') as page:
                page.write(rendered.encode("utf-8"))

    # generate index
    index_page = os.path.join(html_output_directory, 'index.html')
    rendered = render_to_string('index_for_oligo.tmpl', html_dict)

    with open(index_page, 'w') as page:
        page.write(rendered.encode("utf-8"))

    return index_page
Ejemplo n.º 14
0
def pretify(arg):
    """Return `arg` as formatted by `pretty_print`."""
    formatted = pretty_print(arg)
    return formatted
Ejemplo n.º 15
0
def topology(topology_dict_path, output_file=None, title=None):
    """Draw the topology graph stored at `topology_dict_path`.

    The graph and its node dictionary come from `topology_graph`. Nodes are
    laid out with graphviz 'fdp', sized from the 'size' attribute of the
    edges whose first endpoint they are, and labelled with the edge
    'label' and size. When the root node entry has a 'freq_curve_img_path'
    key, the image of every node is drawn on top of it.

    Args:
        topology_dict_path: passed to `topology_graph` unchanged.
        output_file: when given the figure is saved there, otherwise it is
            shown interactively.
        title: text placed in the upper-left corner; defaults to
            "Topology".
    """
    G, nodes_dict = topology_graph(topology_dict_path)

    number_of_edges = G.number_of_edges()
    number_of_nodes = G.number_of_nodes()

    print("Loaded %d edges and %d nodes." % (number_of_edges, number_of_nodes))

    plt.figure(figsize=(24, 16))

    # use graphviz to find radial layout
    # twopi, gvcolor, wc, ccomps, tred, sccmap, fdp, circo, neato, acyclic, nop, gvpr, dot
    pos = nx.graphviz_layout(G, prog="fdp")

    # node size is proportional to number of reads went into it
    sizes = dict.fromkeys(G.nodes(), 0.0)
    for (u, v, d) in G.edges(data=True):
        sizes[u] = d['size']
    max_size = max(sizes.values())
    # scale so that the largest node gets 10000; when every size is 0 the
    # factor is 0 and all nodes fall back to the minimum below
    k = 10000.0 / max_size if max_size else 0.0
    for node in sizes:
        scaled = sizes[node] * k
        sizes[node] = scaled if scaled > 500 else 500

    # NOTE(review): `parent_nodes` and `final_nodes` are not defined in this
    # function; presumably they are module-level names -- confirm.
    parent_nodes_network = nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=parent_nodes,
        node_shape='o',
        node_size=[sizes[i] for i in parent_nodes],
        node_color='#EFEFEF')
    final_nodes_network = nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=final_nodes,
        node_shape='o',
        node_size=[sizes[i] for i in final_nodes],
        node_color='#FAFFFA')
    parent_nodes_network.set_edgecolor('#888888')
    final_nodes_network.set_edgecolor('#88BB00')
    nx.draw_networkx_edges(G,
                           pos,
                           alpha=0.4,
                           node_size=10,
                           width=1,
                           edge_color='#808080')
    nx.draw_networkx_labels(
        G,
        pos,
        font_size=8,
        font_weight='bold',
        labels=dict((u, '%s\n(%s)' % (d['label'], pretty_print(d['size'])))
                    for u, v, d in G.edges(data=True)))

    # adjust the plot limits
    xmax = 1.02 * max(x for x, y in pos.values())
    ymax = 1.02 * max(y for x, y in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)
    plt.xticks([])
    plt.yticks([])

    plt.subplots_adjust(hspace=0,
                        wspace=0,
                        right=0.995,
                        left=0.005,
                        top=0.995,
                        bottom=0.005)

    plt.text(0.03,
             0.97,
             title or "Topology",
             fontsize='xx-large',
             fontname="Arial",
             fontweight="bold",
             transform=plt.gca().transAxes)

    ax = plt.gca()
    plt.setp(ax, frame_on=False)
    #plt.axis('off')

    if 'freq_curve_img_path' in nodes_dict['root']:
        figure = plt.gcf()
        # captured before the loop: plt.axes() below changes the current axes
        to_display = ax.transData.transform
        to_figure = figure.transFigure.inverted().transform

        for node in nodes_dict:
            (x, y) = pos[node]
            xt, yt = to_display((x, y))  # data -> display coordinates
            xf, yf = to_figure((xt, yt))  # display -> figure coordinates
            print('%s %s' % (xf, yf))
            # the root image is drawn larger than the others
            if node == 'root':
                imsize = 0.04
            else:
                imsize = 0.025
            img = mpimg.imread(nodes_dict[node]['freq_curve_img_path'])
            # small axes centred on the node, in figure coordinates
            a = plt.axes(
                [xf - imsize / 2.0, yf - imsize / 2.0, imsize, imsize])
            a.imshow(img)
            a.axis('off')

    if output_file:
        plt.savefig(output_file)
    else:
        plt.show()
Ejemplo n.º 16
0
def sumvals(arg, clean=None):
    """Sum the values of the mapping `arg`.

    A truthy `clean` returns the raw sum; otherwise the sum is passed
    through `pretty_print` first.
    """
    total = sum(arg.values())
    return total if clean else pretty_print(total)
Ejemplo n.º 17
0
 def show_progress(self, end=False):
     """Rewrite the progress line on stderr with the current `self.pos`.

     The line starts with a carriage return so successive calls overwrite
     each other; a truthy `end` finishes it with a newline.
     """
     message = '\r[b6lib] Reading: %s' % (pretty_print(self.pos))
     sys.stderr.write(message)
     sys.stderr.flush()
     if not end:
         return
     sys.stderr.write('\n')
Ejemplo n.º 18
0
 def show_progress(self, end=False):
     """Report the reading position (`self.pos`) on stderr.

     Each call overwrites the previous report through a leading carriage
     return; pass a truthy `end` to terminate the line.
     """
     position = pretty_print(self.pos)
     sys.stderr.write('\r[b6lib] Reading: %s' % (position))
     sys.stderr.flush()
     if not end:
         return
     sys.stderr.write('\n')
Ejemplo n.º 19
0
def pretify(arg):
    """Format `arg` with `pretty_print` and return the result."""
    pretty = pretty_print(arg)
    return pretty
Ejemplo n.º 20
0
def generate_html_output(run_info_dict, html_output_directory=None, entropy_figure=None):
    """Generate the static HTML report for a run and return the index path.

    Copies the packaged static assets and the result files referenced by
    `run_info_dict` into the output directory, assembles the template
    context (`html_dict`, a deep copy of `run_info_dict` with extra keys),
    renders one page per oligo when 'output_directory_for_reps' is present,
    and finally renders 'index.html'.

    Args:
        run_info_dict: mapping describing the run; read by key throughout
            (file paths, flags such as 'generate_sets').
        html_output_directory: destination directory. Defaults to
            'HTML-OUTPUT' under run_info_dict['output_directory']; it is
            created when it does not exist.
        entropy_figure: optional path of the entropy figure, with or
            without a '.png' / '.pdf' suffix. When omitted,
            run_info_dict['entropy'] is used instead.

    Returns:
        Path of the generated 'index.html'.

    Raises:
        Whatever `shutil.copy2` raises when an essential file cannot be
        copied, and KeyError when a required key is missing.
    """
    if not html_output_directory:
        html_output_directory = os.path.join(run_info_dict['output_directory'], 'HTML-OUTPUT')

    if not os.path.exists(html_output_directory):
        os.makedirs(html_output_directory)

    html_dict = copy.deepcopy(run_info_dict)

    # (path relative to `absolute`, file name inside the output directory)
    static_assets = (
        ('static/style.css', 'style.css'),
        ('static/header_1.png', 'header.png'),
        ('static/missing_image.png', 'missing.png'),
        ('scripts/jquery-1.7.1.js', 'jquery-1.7.1.js'),
        ('scripts/popup.js', 'popup.js'),
        ('scripts/g.pie.js', 'g.pie.js'),
        ('scripts/g.raphael.js', 'g.raphael.js'),
        ('scripts/raphael.js', 'raphael.js'),
        ('scripts/morris.js', 'morris.js'),
    )
    for asset_source, asset_name in static_assets:
        shutil.copy2(os.path.join(absolute, asset_source), os.path.join(html_output_directory, asset_name))

    def copy_as(source, dest_name, essential=True):
        """Copy `source` into the output directory as `dest_name`.

        A failed copy raises for essential files and only prints a warning
        for non-essential ones. Returns the base name of the destination
        in both cases.
        """
        dest = os.path.join(html_output_directory, dest_name)

        if essential:
            shutil.copy2(source, dest)
        else:
            # it is ok if you fail to copy files that are not
            # essential..
            try:
                shutil.copy2(source, dest)
            except EnvironmentError:
                sys.stderr.write('\n\n[HTML] Warning: Source file not found\n\tSource: "%s"\n\tDest: "%s"\n\n' % (source, dest))

        return os.path.basename(dest)

    def copy_entropy_figure(base_path, ext):
        """Copy `base_path`.`ext` as 'entropy.`ext`'; only the PNG is essential."""
        return copy_as(base_path + '.%s' % ext, 'entropy.%s' % ext, essential=(ext == 'png'))

    # embarrassingly ad-hoc:
    if entropy_figure and entropy_figure.endswith(('.pdf', '.png')):
        entropy_figure = entropy_figure[:-4]

    for ext in ['png', 'pdf']:
        figure_key = 'entropy_figure_%s' % ext
        if entropy_figure:
            html_dict[figure_key] = copy_entropy_figure(entropy_figure, ext)
        else:
            try:
                html_dict[figure_key] = copy_entropy_figure(run_info_dict['entropy'], ext)
            except EnvironmentError:
                # the recorded path may end with a 4-character suffix;
                # retry with that suffix stripped
                html_dict[figure_key] = copy_entropy_figure(run_info_dict['entropy'][:-4], ext)

    if run_info_dict['gexf_network_file_path']:
        html_dict['gexf_network_file_path'] = copy_as(run_info_dict['gexf_network_file_path'], 'network.gexf')

    if run_info_dict['sample_mapping']:
        html_dict['sample_mapping'] = copy_as(run_info_dict['sample_mapping'], 'sample_mapping.txt')
    else:
        html_dict['sample_mapping'] = None

    html_dict['matrix_count_file_path'] = copy_as(run_info_dict['matrix_count_file_path'], 'matrix_counts.txt')
    html_dict['matrix_percent_file_path'] = copy_as(run_info_dict['matrix_percent_file_path'], 'matrix_percents.txt')
    html_dict['read_distribution_table_path'] = copy_as(run_info_dict['read_distribution_table_path'], 'read_distribution.txt')
    html_dict['environment_file_path'] = copy_as(run_info_dict['environment_file_path'], 'environment.txt')
    html_dict['oligos_fasta_file_path'] = copy_as(run_info_dict['oligos_fasta_file_path'], 'oligos.fa.txt')
    html_dict['oligos_nexus_file_path'] = copy_as(run_info_dict['oligos_nexus_file_path'], 'oligos.nex.txt')

    def get_figures_dict(html_dict_prefix):
        """Load a pickled figures dict and copy its figures to the output.

        Returns None when '<prefix>_file_path' is not in `html_dict`.
        Otherwise every leaf of the three-level dict is replaced by the
        extension-less name of the copied figure, or by None when the
        '.pdf' / '.png' pair is not present on disk.
        """
        html_dict_key = '%s_file_path' % html_dict_prefix
        if html_dict_key not in html_dict:
            return None

        # NOTE(review): unpickling runs arbitrary code if the file has been
        # tampered with; the path is expected to point at a file written by
        # this pipeline.
        with open(html_dict[html_dict_key]) as figures_file:
            figures_dict = cPickle.load(figures_file)

        for _map in figures_dict:
            for _func in figures_dict[_map]:
                for _op in figures_dict[_map][_func]:
                    figure_base = figures_dict[_map][_func][_op]
                    if os.path.exists(figure_base + '.pdf') and os.path.exists(figure_base + '.png'):
                        dest_base = '-'.join([_map, _func, _op])
                        copy_as(figure_base + '.pdf', '%s.pdf' % dest_base)
                        prefix = copy_as(figure_base + '.png', '%s.png' % dest_base)
                        # keep the copied name without its extension
                        figures_dict[_map][_func][_op] = '.'.join(prefix.split('.')[:-1])
                    else:
                        figures_dict[_map][_func][_op] = None
        return figures_dict

    html_dict['figures_dict'] = get_figures_dict('figures_dict')
    html_dict['exclusive_figures_dict'] = get_figures_dict('exclusive_figures_dict')

    if html_dict['generate_sets']:
        html_dict['across_samples_MN_file_path'] = copy_as(run_info_dict['across_samples_MN_file_path'], 'across_samples_max_normalized.txt')
        html_dict['across_samples_SN_file_path'] = copy_as(run_info_dict['across_samples_SN_file_path'], 'across_samples_sum_normalized.txt')
        html_dict['oligo_sets_stackbar_figure'] = copy_as(run_info_dict['stack_bar_with_agglomerated_oligos_file_path'], 'stackbar_with_oligo_sets.png')
        html_dict['oligos_across_samples_figure'] = copy_as(run_info_dict['oligos_across_samples_file_path'], 'oligos_across_samples.png')
        html_dict['oligotype_sets_figure'] = copy_as(run_info_dict['oligotype_sets_across_samples_figure_path'], 'oligotype_sets.png')
        html_dict['matrix_count_oligo_sets_file_path'] = copy_as(run_info_dict['matrix_count_oligo_sets_file_path'], 'matrix_counts_oligo_sets.txt')
        html_dict['matrix_percent_oligo_sets_file_path'] = copy_as(run_info_dict['matrix_percent_oligo_sets_file_path'], 'matrix_percents_oligo_sets.txt')
        html_dict['oligotype_sets_file'] = copy_as(run_info_dict['oligotype_sets_file_path'], 'oligotype_sets.txt')
        # second tab-separated column of each line: comma-separated members
        with open(run_info_dict['oligotype_sets_file_path']) as sets_file:
            html_dict['oligotype_sets'] = [l.strip().split('\t')[1].split(',') for l in sets_file]

    if 'representative_seqs_fasta_file_path' in html_dict:
        html_dict['representative_seqs_fasta_file_path'] = copy_as(run_info_dict['representative_seqs_fasta_file_path'], 'oligo-representatives.fa.txt')
    else:
        html_dict['representative_seqs_fasta_file_path'] = None
    if 'blast_ref_db' in run_info_dict and os.path.exists(run_info_dict['blast_ref_db']):
        html_dict['blast_ref_db_path'] = copy_as(run_info_dict['blast_ref_db'], 'reference_db.fa')
    html_dict['entropy_components'] = [int(x) for x in html_dict['bases_of_interest_locs'].split(',')]
    html_dict['samples_dict'] = get_samples_dict_from_environment_file(run_info_dict['environment_file_path'])
    html_dict['samples'] = sorted(html_dict['samples_dict'].keys())
    html_dict['blast_results_found'] = False

    # get alignment length
    html_dict['alignment_length'] = get_alignment_length(run_info_dict['alignment'])
    # include pretty names
    html_dict['pretty_names'] = pretty_names
    # get colors dict
    html_dict['color_dict'] = get_colors_dict(run_info_dict['colors_file_path'])
    # get abundant oligos list
    html_dict['oligos'] = get_oligos_list(run_info_dict['oligos_fasta_file_path'])
    # get oligo frequencies (summed over all samples that contain the oligo)
    html_dict['frequency'] = {}
    for oligo in html_dict['oligos']:
        html_dict['frequency'][oligo] = pretty_print(sum(d[oligo] for d in html_dict['samples_dict'].values() if oligo in d))
    # get unique sequence dict (which will contain the most frequent unique sequence for given oligotype)
    if 'output_directory_for_reps' in html_dict:
        html_dict['rep_oligo_seqs_clean_dict'], html_dict['rep_oligo_seqs_fancy_dict'] = get_unique_sequences_dict(html_dict)
        html_dict['oligo_reps_dict'] = get_oligo_reps_dict(html_dict, html_output_directory)
        html_dict['component_reference'] = ''.join(['<a onmouseover="popup(\'\\#%d\', 50)" href="">|</a>' % i for i in range(0, html_dict['alignment_length'])])

    # get javascript code for sample pie-charts
    html_dict['pie_charts_js'] = render_to_string('pie_charts_js.tmpl', html_dict)

    # FIXME: code below is very inefficient and causes a huge
    # memory issue. fix it by not using deepcopy.
    # generate individual oligotype pages
    if 'output_directory_for_reps' in html_dict:
        oligos = html_dict['oligos']
        total = len(oligos)
        for i, oligo in enumerate(oligos):
            tmp_dict = copy.deepcopy(html_dict)
            tmp_dict['oligo'] = oligo
            tmp_dict['distribution'] = get_oligo_distribution_dict(oligo, html_dict)
            oligo_page = os.path.join(html_output_directory, 'oligo_%s.html' % oligo)

            # 1-based position plus links to the neighbouring pages
            tmp_dict['index'] = i + 1
            tmp_dict['total'] = total
            tmp_dict['prev'] = 'oligo_%s.html' % oligos[i - 1] if i > 0 else None
            tmp_dict['next'] = 'oligo_%s.html' % oligos[i + 1] if i < total - 1 else None

            rendered = render_to_string('single_oligo.tmpl', tmp_dict)

            with open(oligo_page, 'w') as page:
                page.write(rendered.encode("utf-8"))

    # generate index
    index_page = os.path.join(html_output_directory, 'index.html')
    rendered = render_to_string('index_for_oligo.tmpl', html_dict)

    with open(index_page, 'w') as page:
        page.write(rendered.encode("utf-8"))

    return index_page
Ejemplo n.º 21
0
def entropy_analysis(alignment_path,
                     output_file=None,
                     verbose=True,
                     uniqued=False,
                     freq_from_defline=None,
                     weighted=False,
                     qual_stats_dict=None,
                     amino_acid_sequences=False):
    """Compute the entropy of every column of an alignment.

    Args:
        alignment_path: path handed to `u.SequenceSource`.
        output_file: optional path; when given, one 'position<TAB>entropy'
            line per column is written, sorted by decreasing entropy.
        verbose: enables progress output and the `run.info` summaries.
        uniqued: when true every read is counted `freq_from_defline(id)`
            times instead of once.
        freq_from_defline: callable mapping a defline to its frequency.
            The default takes the value after ':' in the first
            '|'-separated field that starts with 'freq'.
        weighted: when true, `qual_stats_dict[position]` is passed to
            `entropy` as `l_qual`.
        qual_stats_dict: per-position quality statistics, required for
            columns with more than one distinct character when `weighted`
            is set.
        amino_acid_sequences: forwarded to `entropy`.

    Returns:
        List of entropy values in column order; values below 0.00001 and
        columns holding a single distinct character are reported as 0.0.

    Raises:
        EntropyError: reads of different lengths, a defline without
            frequency information in `uniqued` mode, an alignment without
            reads, or weighted entropy without quality statistics.
    """
    if freq_from_defline is None:
        freq_from_defline = lambda x: int(
            [t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    # make sure the source is closed even when a read is rejected
    try:
        progress.new('Processing the Alignment')

        # processing the alignment file..
        while alignment.next():
            # check the alignment lengths along the way:
            if previous_alignment_length:
                if previous_alignment_length != len(alignment.seq):
                    raise EntropyError("Not all reads have the same length.")

            # print out process info
            if alignment.pos % 10000 == 0:
                progress.update('Reads processed: %s' %
                                (pretty_print(alignment.pos)))

            # fill 'lines' variable
            if not uniqued:
                lines.append(alignment.seq)
            else:
                try:
                    frequency = freq_from_defline(alignment.id)
                except IndexError:
                    raise EntropyError(
                        "Reads declared as unique, but they do not have proper deflines. See help for --uniqued."
                    )

                # a uniqued read stands for `frequency` identical reads
                lines.extend([alignment.seq] * frequency)

            previous_alignment_length = len(alignment.seq)

        progress.end()
        if verbose:
            run.info('Number of reads', pretty_print(alignment.pos))
    finally:
        alignment.close()

    if not lines:
        # without this guard the column loop fails with a bare IndexError
        raise EntropyError("The alignment does not contain any reads.")

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []
    alignment_length = len(lines[0])

    for position in range(0, alignment_length):
        progress.update(P(int(position + 1), alignment_length))

        column_chars = [x[position] for x in lines]

        if len(set(column_chars)) == 1:
            # a single distinct character carries no entropy
            entropy_tpls.append((position, 0.0))
            continue

        column = "".join(column_chars)

        if weighted:
            if not qual_stats_dict:
                raise EntropyError(
                    "Weighted entropy is selected, but no qual stats are provided"
                )
            e = entropy(column,
                        l_qual=qual_stats_dict[position],
                        amino_acid_sequences=amino_acid_sequences)
        else:
            e = entropy(column, amino_acid_sequences=amino_acid_sequences)

        # negligible values are reported as exactly 0.0
        entropy_tpls.append((position, 0.0 if e < 0.00001 else e))

    sorted_entropy_tpls = sorted(entropy_tpls,
                                 key=operator.itemgetter(1),
                                 reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [
            e[1] for e in entropy_tpls if e[1] > 0
        ]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis',
                     'None of the nucleotide positions posessed any entropy!')

    if output_file:
        with open(output_file, 'w') as entropy_output:
            for _component, _entropy in sorted_entropy_tpls:
                entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)

    return [x[1] for x in entropy_tpls]
def topology(topology_dict_path, output_file = None, title = None):
    """Draw the topology graph loaded from `topology_dict_path`.

    The graph and its node dictionary are obtained from `topology_graph()`.
    Nodes are positioned with the graphviz "fdp" layout, sized from the
    'size' attribute of the edges that start at them, and labelled with the
    'label' and 'size' attributes of those edges. When the 'root' entry of
    the node dictionary has a 'freq_curve_img_path' key, the image stored
    under that key is drawn on top of every node.

    Args:
        topology_dict_path: passed through unchanged to `topology_graph()`.
        output_file: when truthy the figure is saved to this path, otherwise
            it is shown with `plt.show()`.
        title: text drawn in the upper left corner of the axes; "Topology"
            is used when it is empty or None.
    """
    G, nodes_dict = topology_graph(topology_dict_path)

    number_of_edges = G.number_of_edges()
    number_of_nodes = G.number_of_nodes()

    print("Loaded %d edges and %d nodes." % (number_of_edges, number_of_nodes))

    plt.figure(figsize=(24, 16))

    # use graphviz to find the layout; other programs that could be used:
    # twopi, gvcolor, wc, ccomps, tred, sccmap, fdp, circo, neato, acyclic, nop, gvpr, dot
    pos = nx.graphviz_layout(G, prog="fdp")

    # the edge list is needed for both sizes and labels; walk the graph once
    edges = list(G.edges(data=True))

    # node size is proportional to the 'size' attribute of its edge; nodes
    # that do not start any edge keep 0.0 and end up with the minimum size
    sizes = dict.fromkeys(G.nodes(), 0.0)
    for u, _v, d in edges:
        sizes[u] = d['size']
    max_size = max(sizes.values())
    # scale so the largest node gets 10000; when every size is 0 there is
    # nothing to scale and all nodes fall back to the 500 floor below
    k = 10000.0 / max_size if max_size else 0.0
    for node in sizes:
        scaled = sizes[node] * k
        sizes[node] = scaled if scaled > 500 else 500

    # NOTE(review): `parent_nodes` and `final_nodes` are not assigned anywhere
    # in this function; they have to exist at module level or the two calls
    # below raise NameError -- confirm where they are meant to come from.
    parent_nodes_network = nx.draw_networkx_nodes(G, pos, nodelist = parent_nodes, node_shape = 'o', node_size = [sizes[i] for i in parent_nodes], node_color = '#EFEFEF')
    final_nodes_network = nx.draw_networkx_nodes(G, pos, nodelist = final_nodes, node_shape = 'o', node_size = [sizes[i] for i in final_nodes], node_color = '#FAFFFA')
    parent_nodes_network.set_edgecolor('#888888')
    final_nodes_network.set_edgecolor('#88BB00')
    nx.draw_networkx_edges(G, pos, alpha=0.4, node_size=10, width = 1, edge_color='#808080')

    labels = dict((u, '%s\n(%s)' % (d['label'], pretty_print(d['size']))) for u, _v, d in edges)
    nx.draw_networkx_labels(G, pos, font_size=8, font_weight = 'bold', labels = labels)

    # adjust the plot limits
    xmax = 1.02 * max(x for x, y in pos.values())
    ymax = 1.02 * max(y for x, y in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)
    plt.xticks([])
    plt.yticks([])

    plt.subplots_adjust(hspace = 0, wspace = 0, right = 0.995, left = 0.005, top = 0.995, bottom = 0.005)

    plt.text(0.03, 0.97, title or "Topology", fontsize='xx-large',
             fontname="Arial", fontweight="bold", transform=plt.gca().transAxes)

    ax = plt.gca()
    plt.setp(ax, frame_on=False)

    if 'freq_curve_img_path' in nodes_dict['root']:
        fig = plt.gcf()
        to_figure = fig.transFigure.inverted()

        for node in nodes_dict:
            # data coordinates -> display coordinates -> figure fractions,
            # which is what plt.axes() expects for the inset rectangle
            xt, yt = ax.transData.transform(pos[node])
            xf, yf = to_figure.transform((xt, yt))
            print("%s %s" % (xf, yf))

            # the root image is drawn larger than the others
            imsize = 0.04 if node == 'root' else 0.025

            img = mpimg.imread(nodes_dict[node]['freq_curve_img_path'])
            # inset axes centred on the node position
            a = plt.axes([xf - imsize / 2.0, yf - imsize / 2.0, imsize, imsize])
            a.imshow(img)
            a.axis('off')

    if output_file:
        plt.savefig(output_file)
    else:
        plt.show()