Esempio n. 1
0
def visualize_nucl_substitution_matrix(shms_df, output_fname, log):
    """Plot a heatmap of nucleotide substitution frequencies.

    Every substitution SHM whose read and gene nucleotides both pass
    ``nucl_is_valid`` is counted in a 4x4 matrix indexed as
    ``[read nucleotide][gene nucleotide]``.  Each cell is then divided by
    the number of counted SHMs and the matrix is drawn with ``sns.heatmap``.

    Args:
        shms_df: container that is iterated for keys and indexed by each key
            to obtain an iterable of SHM objects exposing
            ``is_substitution()``, ``read_nucl`` and ``gene_nucl``.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``; ``log.info`` is called
            when there is nothing to plot.
    """
    nucl_list = ['A', 'C', 'G', 'T']
    nucl_matrix = [[0] * len(nucl_list) for _ in nucl_list]
    num_shms = 0
    for it in shms_df:
        for shm in shms_df[it]:
            if not shm.is_substitution():
                continue
            if nucl_is_valid(shm.read_nucl) and nucl_is_valid(shm.gene_nucl):
                row = nucl_list.index(shm.read_nucl)
                col = nucl_list.index(shm.gene_nucl)
                nucl_matrix[row][col] += 1
                num_shms += 1
    if num_shms == 0:
        # Normalising by zero would raise ZeroDivisionError, so there is
        # nothing meaningful to draw.
        log.info("Output contains no valid substitution SHMs. "
                 "Plot drawing was skipped")
        return
    nucl_matrix = [[float(count) / num_shms for count in row]
                   for row in nucl_matrix]
    # NOTE(review): rows are indexed by the read nucleotide but the y axis is
    # labelled "From" -- confirm the intended orientation.
    fig, ax = plt.subplots()
    sns.heatmap(nucl_matrix,
                cmap=plt.cm.Blues,
                xticklabels=nucl_list,
                yticklabels=nucl_list,
                square=True,
                ax=ax)
    ax.tick_params(labelsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12, rotation='horizontal')
    plt.xlabel("To", fontsize=14)
    plt.ylabel("From", fontsize=14, rotation='horizontal')
    utils.output_figure(output_fname, "Nucleotide substitution heatmap", log)
Esempio n. 2
0
def visualize_largest_group_aa_variability(labeling_df, region, region_name,
                                           output_fname, log):
    """Bar-plot per-position hydrophobicity for the largest region group.

    The group returned by ``get_region_largest_group`` is translated with
    ``Seq(...).translate(to_stop=True)``; every residue is mapped through
    ``utils.hydrophoby_dict`` and plotted against its 1-based position.
    Nothing is drawn when the group is empty or when the length of its
    first sequence is not a multiple of three.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the printed message and figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    sequences = list(labeling_df[region])
    largest_group = get_region_largest_group(sequences)
    if len(largest_group) == 0:
        return
    nucl_len = len(largest_group[0])
    if nucl_len % 3 != 0:
        # NOTE(review): the message wording looks inverted for this branch.
        print("Largest " + region_name + " is not out-of-frame")
        return
    translated = [
        Seq(nucl_seq).translate(to_stop=True) for nucl_seq in largest_group
    ]
    positions = []
    scores = []
    for protein in translated:
        protein_scores = [utils.hydrophoby_dict[residue] for residue in protein]
        # Positions are 1-based along each translated sequence.
        positions.extend(range(1, len(protein_scores) + 1))
        scores.extend(protein_scores)
    plot_data = {'Position': positions, 'Hidrophobicity': scores}
    plt.figure()
    sns.barplot(x='Position',
                y='Hidrophobicity',
                data=plot_data,
                order=range(1, nucl_len // 3 + 1),
                color='blue')
    plt.xlabel('Position (aa)', fontsize=14)
    plt.ylabel('Hidrophobicity', fontsize=14)
    # Pad the y range by 10 beyond the extremes of the hydrophobicity table.
    lower = min(utils.hydrophoby_dict.values()) - 10
    upper = max(utils.hydrophoby_dict.values()) + 10
    plt.ylim(lower, upper)
    utils.output_figure(output_fname, region_name + " aa variability", log)
Esempio n. 3
0
def visualize_largest_region_nucls(labeling_df, region, region_name,
                                   output_fname, log):
    """Draw overlaid bars of per-position nucleotide shares (A, C, G, T).

    The four series from ``get_nucls_lists`` are combined with ``+`` into
    cumulative layers (A+C+G+T, C+G+T, G+T, T) and drawn back to front so
    that each nucleotide shows as one band.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the axis caption and figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    sequences = list(labeling_df[region])
    largest_group = get_region_largest_group(sequences)
    if len(largest_group) == 0:
        return
    nucl_dict = get_nucls_lists(largest_group)
    width = len(largest_group[0])
    x = np.array(range(0, width))
    tick_labels = [str(pos) for pos in range(1, width + 1)]
    # presumably the values are numpy arrays of percentages so that `+` adds
    # element-wise -- TODO confirm against get_nucls_lists
    share_a = nucl_dict['A']
    share_c = nucl_dict['C']
    share_g = nucl_dict['G']
    share_t = nucl_dict['T']
    layers = [
        (share_a + share_c + share_g + share_t, "A", 'b'),
        (share_c + share_g + share_t, "C", 'g'),
        (share_g + share_t, "G", 'r'),
        (share_t, "T", 'orange'),
    ]
    sns.set_color_codes("muted")
    f, ax = plt.subplots(figsize=(15, 6))
    # Tallest layer first; each later layer paints over the lower part.
    for heights, label, colour in layers:
        sns.barplot(x=x, y=heights, label=label, color=colour)
    plt.ylim(0, 115)
    ax.legend(ncol=4, loc="upper center", frameon=True, fontsize=16)
    plt.xlabel(region_name + ' position (nt)', fontsize=16)
    plt.ylabel('Nucleotide %', fontsize=16)
    plt.xticks(x, tick_labels, fontsize=14)
    plt.yticks(fontsize=14)
    figure_title = region_name + " nucleotide distribution"
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 4
0
def output_shm_stats_for_isotype(num_shms, shm_pos, isotype, output_prefix,
                                 log):
    """Draw a two-panel SHM summary for one isotype.

    The upper panel is a histogram of ``shm_pos`` and the lower panel a
    histogram of ``num_shms``; both use ``isotype_colors[isotype]`` and 50
    bins.  The figure is handed to ``utils.output_figure`` under the name
    ``output_prefix + "_" + isotype + "V"``.

    Args:
        num_shms: values for the lower histogram.
        shm_pos: values for the upper histogram.
        isotype: key into ``isotype_colors``; also used in captions.
        output_prefix: prefix of the output file name.
        log: forwarded to ``utils.output_figure``.
    """

    def style_ticks():
        # Both panels use the same tick font size.
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)

    plt.figure(1)

    # Upper panel: SHM positions.
    plt.subplot(211)
    colour = isotype_colors[isotype]
    plt.hist(shm_pos, color=colour, alpha=.75, bins=50)
    position_caption = "Relative position of " + isotype + "V SHM in read"
    plt.xlabel(position_caption, fontsize=16)
    plt.ylabel("# SHMs", fontsize=16)
    style_ticks()
    plt.title("SHMs in " + isotype + "V", fontsize=18)

    # Lower panel: number of SHMs.
    plt.subplot(212)
    plt.hist(num_shms, color=colour, bins=50, alpha=.75)
    plt.xlabel("# SHMs in " + isotype + "V", fontsize=16)
    plt.ylabel("# sequences", fontsize=16)
    style_ticks()

    figure_title = "Distribution of # SHMs in " + isotype + "V segments"
    utils.output_figure(output_prefix + "_" + isotype + "V", figure_title,
                        log)
Esempio n. 5
0
def visualize_length_abundance_dist(labeling_df, region, region_name,
                                    output_fname, log):
    """Draw a joint plot of abundance versus length for repeated sequences.

    Sequences in ``labeling_df[region]`` are counted; those seen exactly
    once are dropped, and the remaining (count, length) pairs are passed to
    ``sns.jointplot``.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    occurrences = {}
    for sequence in list(labeling_df[region]):
        occurrences[sequence] = occurrences.get(sequence, 0) + 1
    # Singletons are left out of the joint distribution.
    repeated = [(count, len(sequence))
                for sequence, count in occurrences.items()
                if count != 1]
    abun = np.asarray([count for count, _ in repeated])
    lens = np.asarray([length for _, length in repeated])
    f, ax = plt.subplots()
    sns.jointplot(abun, lens, size=6)
    figure_title = region_name + " joint distribution of abundances & lengths"
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 6
0
def _indel_run_lengths(read_shms):
    """Return ``(insertion_runs, deletion_runs)`` for the SHMs of one read.

    A run is a sequence of insertions with consecutive ``read_pos`` values
    or of deletions with consecutive ``gene_pos`` values; each returned
    list holds the lengths of those runs.
    """
    insertion_runs = []
    deletion_runs = []
    ins_len = 0
    del_len = 0
    prev_read_pos = None
    prev_gene_pos = None
    for shm in read_shms:
        if shm.is_deletion():
            if del_len > 0 and shm.gene_pos - prev_gene_pos == 1:
                del_len += 1
            else:
                if del_len > 0:
                    deletion_runs.append(del_len)
                del_len = 1
            prev_gene_pos = shm.gene_pos
        if shm.is_insertion():
            if ins_len > 0 and shm.read_pos - prev_read_pos == 1:
                ins_len += 1
            else:
                if ins_len > 0:
                    insertion_runs.append(ins_len)
                ins_len = 1
            prev_read_pos = shm.read_pos
    # Flush the runs that were still open at the end of the read.
    if ins_len > 0:
        insertion_runs.append(ins_len)
    if del_len > 0:
        deletion_runs.append(del_len)
    return insertion_runs, deletion_runs


def visualize_indel_shm_lengths(shm_df, output_fname, log):
    """Plot a histogram of insertion and deletion run lengths.

    Run lengths are collected independently for every read, so an indel at
    the end of one read is never merged with an indel at the start of the
    next.  A series is plotted only when it has more than 10 runs; when
    neither qualifies, a message is logged and no figure is produced.

    Args:
        shm_df: container that is iterated for keys and indexed by each key
            to obtain an iterable of SHM objects exposing
            ``is_deletion()``, ``is_insertion()``, ``gene_pos`` and
            ``read_pos``.
        output_fname: forwarded to ``utils.output_figure``.
        log: logger; also forwarded to ``utils.output_figure``.
    """
    insertion_length = []
    deletions_lengths = []
    for it in shm_df:
        ins_runs, del_runs = _indel_run_lengths(shm_df[it])
        insertion_length.extend(ins_runs)
        deletions_lengths.extend(del_runs)
    dt = []
    labels = []
    if len(deletions_lengths) > 10:
        dt.append(deletions_lengths)
        labels.append("Deletions")
    if len(insertion_length) > 10:
        dt.append(insertion_length)
        labels.append("Insertions")
    if len(dt) == 0:
        log.info(
            "Output contains very low number of indel SHMs. Plot drawing was skipped"
        )
        return
    plt.hist(dt, label=labels, bins=50)
    plt.legend(loc='upper center', ncol=len(dt), fontsize=14)
    plt.xlabel("Insertion / deletion SHM length", fontsize=16)
    plt.ylabel("# insertion / deletion SHMs", fontsize=16)
    # The x range covers both series, including one too small to be plotted;
    # at least one list is non-empty at this point.
    xlim_right = max(deletions_lengths + insertion_length)
    plt.xlim(.5, xlim_right + .5)
    plt.xticks(range(0, xlim_right + 1), fontsize=14)
    plt.yticks(fontsize=14)
    utils.output_figure(output_fname,
                        "Distribution of insertion/deletion SHM lengths", log)
Esempio n. 7
0
def visualize_special_shm_positions(shm_df, syn_output_fname,
                                    special_output_fname, log):
    """Plot relative read positions of synonymous and indel SHMs.

    Only alignments for which ``it.is_variable()`` is true are considered.
    Each SHM is put in the first matching class (synonymous, stop codon,
    deletion, insertion) and its position is expressed as
    ``read_pos / read_len``.  Synonymous positions are handed to
    ``output_synonymous_shms``; deletions and insertions are drawn together
    when a class has more than 10 entries.

    Args:
        shm_df: container iterated for alignment keys (exposing
            ``is_variable()`` and ``read_len``) and indexed by each key to
            obtain an iterable of SHM objects.
        syn_output_fname: output name for the synonymous-SHM plot.
        special_output_fname: output name for the indel plot.
        log: logger; also forwarded to the output helpers.
    """
    synonymous_pos = []
    deletion_pos = []
    insertion_pos = []
    for it in shm_df:
        if not it.is_variable():
            continue
        read_len = float(it.read_len)
        for shm in shm_df[it]:
            relative_pos = float(shm.read_pos) / read_len
            if shm.synonymous:
                synonymous_pos.append(relative_pos)
            elif shm.to_stop_codon:
                # Stop-codon SHMs are not plotted, but the branch must stay
                # so they are not classified as indels below.
                pass
            elif shm.is_deletion():
                deletion_pos.append(relative_pos)
            elif shm.is_insertion():
                insertion_pos.append(relative_pos)
    output_synonymous_shms(synonymous_pos, syn_output_fname, log)
    pos = []
    labels = []
    colors = []
    plt.figure(figsize=(12, 9))
    if len(deletion_pos) > 10:
        pos.append(deletion_pos)
        labels.append('Deletions')
        colors.append('b')
    if len(insertion_pos) > 10:
        pos.append(insertion_pos)
        labels.append('Insertions')
        colors.append('g')
    if len(pos) == 0:
        log.info(
            "Output contains very low number of special SHMs. Plot drawing will be skipped"
        )
        return
    # Floor division keeps the bin count an integer under Python 3; plain
    # `/` would pass a float to plt.hist.
    plt.hist(pos, color=colors, label=labels, bins=100 // len(pos))
    plt.xlim(0, .75)
    plt.legend(loc='upper center',
               ncol=len(pos),
               fontsize=12,
               bbox_to_anchor=(0.5, -0.07))
    plt.xlabel("Relative position of V SHM in read", fontsize=14)
    plt.ylabel("# SHMs", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    utils.output_figure(special_output_fname,
                        "Distribution of indel V SHM positions in read", log)
Esempio n. 8
0
def visualize_region_lengths(labeling_df, region, region_name, output_fname,
                             log):
    """Plot a histogram of region lengths in nucleotides.

    Sequences of length 0 or 1 are left out.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the axis captions and figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    lengths = []
    for sequence in list(labeling_df[region]):
        seq_len = len(sequence)
        if seq_len > 1:
            lengths.append(seq_len)
    plt.figure()
    plt.hist(lengths)
    plt.xlabel(region_name + ' length (nt)', fontsize=14)
    plt.ylabel('# ' + region_name + 's', fontsize=14)
    figure_title = region_name + " length distribution"
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 9
0
def output_shm_stats_for_isotype(num_shms, locus, output_fname, log):
    """Plot a histogram of the number of SHMs per sequence for one locus.

    Args:
        num_shms: values to histogram (50 bins).
        locus: key into ``isotype_colors``; also used in the captions.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    plt.figure()
    bar_colour = isotype_colors[locus]
    plt.hist(num_shms, color=bar_colour, bins=50, alpha=.75)
    segment = locus + "V"
    plt.xlabel("# SHMs in " + segment, fontsize=16)
    plt.ylabel("# sequences", fontsize=16)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=14)
    plt.title('# SHMs in ' + segment + ' sequences', fontsize=14)
    figure_title = "Distribution of # SHMs in " + locus + "V segments"
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 10
0
 def OutputHeatmap(self, output_fname, log):
     """Draw ``self.abundant_vj_matrix`` as a heatmap.

     Columns are labelled with ``self.sorted_js`` and rows with
     ``self.used_vs``; the figure is handed to ``utils.output_figure``.
     """
     plt.figure(figsize=(10, 15))
     matrix = np.array(self.abundant_vj_matrix)
     sns.heatmap(matrix,
                 xticklabels=self.sorted_js,
                 yticklabels=self.used_vs,
                 cmap='jet')
     # Row labels stay horizontal, column labels are turned vertical.
     plt.yticks(rotation=0, fontsize=10)
     plt.xticks(rotation=90, fontsize=10)
     figure_title = "VJ heatmap for the most abundant VJ combinations"
     utils.output_figure(output_fname, figure_title, log)
Esempio n. 11
0
def output_synonymous_shms(synonymous_pos, output_fname, log):
    """Plot a histogram of synonymous SHM positions.

    Nothing is drawn when fewer than 100 positions are supplied.

    Args:
        synonymous_pos: sized collection of relative positions.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    enough_points = len(synonymous_pos) >= 100
    if not enough_points:
        return
    plt.hist(synonymous_pos, color='r', bins=100)
    plt.xlabel("Relative position of V SHM in read", fontsize=14)
    plt.ylabel("#SHMs", fontsize=14)
    plt.xlim(0, .75)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=12)
    figure_title = "Distribution of synonymous SHM positions in V segment"
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 12
0
 def OutputJUsage(self, output_fname, log):
     """Draw a bar chart of J gene usage.

     For every gene in ``self.sorted_js`` the bar height is
     ``self.j_dict[gene] / len(self.vj_df) * 100``.
     """
     plt.figure(figsize=(10, 8))
     percentages = []
     for j_gene in self.sorted_js:
         share = float(self.j_dict[j_gene]) / len(self.vj_df)
         percentages.append(share * 100)
     slots = range(len(self.sorted_js))
     plt.bar(slots, percentages)
     plt.xticks(slots, self.sorted_js, rotation=90, fontsize=10)
     plt.ylabel('% of sequences', fontsize=14)
     utils.output_figure(output_fname, "Usage of J genes", log)
def visualize_region_lengths(labeling_df, region, region_name, output_fname,
                             log):
    """Plot the distribution of region lengths with ``sns.distplot``.

    Sequences of length 0 or 1 are left out and the x axis is fixed to
    0..100.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the axis captions and figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    sequences = list(labeling_df[region])
    lengths = [seq_len for seq_len in map(len, sequences) if seq_len > 1]
    plt.subplots(figsize=(8, 8))
    sns.distplot(lengths, kde=False, rug=False)
    plt.xlabel(region_name + ' length', fontsize=16)
    plt.ylabel('# ' + region_name + 's', fontsize=16)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=14)
    plt.xlim(0, 100)
    figure_title = region_name + " length distribution"
    utils.output_figure(output_fname, figure_title, log)
def visualize_largest_group_aa_variability(labeling_df, region, region_name,
                                           output_fname, log):
    """Plot the dominant amino acid and its share at every position.

    The largest group from ``get_region_largest_group`` is translated with
    ``Seq(...).translate(to_stop=True)``.  For each amino-acid position the
    most frequent residue and its percentage among the sequences covering
    that position are computed, then drawn as bars coloured per residue.
    Nothing is drawn when the group is empty, when the length of its first
    sequence is not a multiple of three, or when no position is covered by
    any translation.

    Args:
        labeling_df: indexable by ``region`` to obtain the region sequences.
        region: key used to index ``labeling_df``.
        region_name: label used in the printed message, the y caption and
            the figure title.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    region_seq = list(labeling_df[region])
    max_group = get_region_largest_group(region_seq)
    if len(max_group) == 0:
        return
    group_len = len(max_group[0])
    if group_len % 3 != 0:
        # Parenthesised single-argument form works on Python 2 and 3.
        # NOTE(review): the message wording looks inverted for this branch.
        print("Largest " + region_name + " is not out-of-frame")
        return
    aa_seqs = [Seq(cdr).translate(to_stop=True) for cdr in max_group]
    # Floor division: range() needs an integer on Python 3.
    aa_list = [dict() for _ in range(group_len // 3)]
    for aa_seq in aa_seqs:
        for pos, residue in enumerate(aa_seq):
            aa_list[pos][residue] = aa_list[pos].get(residue, 0) + 1
    # to_stop=True can shorten translations, leaving trailing positions that
    # no sequence covers; they have no dominant residue, so drop them.
    aa_list = [counts for counts in aa_list if counts]
    if not aa_list:
        return
    aa_large_abun = []
    aa_large_acid = []
    for counts in aa_list:
        ranked = sorted(counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        total = 0
        for _, count in ranked:
            total += count
        top_acid, top_count = ranked[0]
        aa_large_abun.append(float(top_count) / float(total) * 100)
        aa_large_acid.append(top_acid)
    aa_colors = get_aa_colors()
    positions = list(range(len(aa_large_abun)))
    # One bar layer per dominant residue so each gets its own colour;
    # positions dominated by another residue contribute zero height.
    for aa in sorted(set(aa_large_acid)):
        heights = [
            abun if acid == aa else 0
            for acid, abun in zip(aa_large_acid, aa_large_abun)
        ]
        df = pd.DataFrame({'x': positions, 'y': heights})
        sns.barplot(x='x',
                    y='y',
                    data=df,
                    color=aa_colors[amino_acids.index(aa)])
    plt.xticks(range(0, len(aa_large_abun)), aa_large_acid, fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel('The most abundant amino acid', fontsize=16)
    plt.ylabel('% ' + region_name + 's', fontsize=16)
    utils.output_figure(output_fname, region_name + " aa variability", log)
Esempio n. 15
0
def OutputGeneMutability(gene_mutability_dict, output_fname, gene_type, log):
    """Draw a box plot of mutability values grouped by gene.

    Args:
        gene_mutability_dict: container iterated for gene keys and indexed
            by each key to obtain an iterable of mutability values.
        output_fname: forwarded to ``utils.output_figure``.
        gene_type: label used in the figure title.
        log: forwarded to ``utils.output_figure``.
    """
    gene_column = []
    mutability_column = []
    for gene in gene_mutability_dict:
        for value in gene_mutability_dict[gene]:
            gene_column.append(gene)
            mutability_column.append(value)
    df_dict = {'Gene': gene_column, 'Mutability': mutability_column}
    plt.figure(figsize=(10, 8))
    sns.boxplot(x='Gene', y='Mutability', data=df_dict)
    # The upper limit never drops below 0.55.
    upper = max(0.55, max(mutability_column))
    plt.ylim(-0.05, upper)
    plt.xticks(rotation=90)
    plt.ylabel('Mutability')
    figure_title = "Mutability of " + gene_type + ' genes'
    utils.output_figure(output_fname, figure_title, log)
Esempio n. 16
0
def OutputGeneSHMPlot(gene_shms, gene_name, gene_length, num_aligned_seqs,
                      output_fname, log):
    """Plot per-position substitution fractions for one gene.

    Substitution SHMs whose ``read_nucl`` passes ``nucl_is_valid`` are
    counted per gene position and read nucleotide.  The counts are drawn as
    four overlaid bar series (A+C+G+T, C+G+T, G+T, T), each divided by
    ``num_aligned_seqs``.

    Args:
        gene_shms: iterable of SHM objects exposing ``is_substitution()``,
            ``read_nucl`` and ``gene_pos``.
        gene_name: label used in the plot title and figure title.
        gene_length: number of positions along the x axis.
        num_aligned_seqs: divisor for the counts; a value of zero raises
            ZeroDivisionError when ``gene_length`` is positive.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.

    Returns:
        dict mapping 'A', 'C', 'G', 'T' to the list of raw per-position
        counts.
    """
    nucl_dict = {
        'A': [0] * gene_length,
        'C': [0] * gene_length,
        'G': [0] * gene_length,
        'T': [0] * gene_length
    }
    for shm in gene_shms:
        if not shm.is_substitution() or not nucl_is_valid(shm.read_nucl):
            continue
        nucl_dict[shm.read_nucl][shm.gene_pos] += 1
    x = range(gene_length)
    plt.figure()
    # Each layer sums the counts of its own nucleotide and all those drawn
    # after it; later, shorter layers paint over the lower part.
    layers = [
        ('A', 'blue', ('A', 'C', 'G', 'T')),
        ('C', 'green', ('C', 'G', 'T')),
        ('G', 'red', ('G', 'T')),
        ('T', 'orange', ('T', )),
    ]
    for label, color, members in layers:
        columns = zip(*[nucl_dict[nucl] for nucl in members])
        heights = [float(sum(column)) / num_aligned_seqs for column in columns]
        plt.bar(x, heights, color=color, label=label)
    plt.legend(loc='upper center', ncol=4)
    plt.ylim(0, 1.1)
    plt.xlabel('Position in V gene', fontsize=14)
    plt.ylabel('Fraction of sequences', fontsize=14)
    plt.title(
        str(num_aligned_seqs) + ' sequences were aligned to ' + gene_name)
    utils.output_figure(output_fname, "SHM position in " + gene_name, log)
    return nucl_dict
Esempio n. 17
0
def output_shms_pos(all_shms_pos, colors, output_prefix, log):
    """Plot one SHM-position histogram per isotype.

    Isotypes with fewer than 10 values are skipped.  Each remaining
    isotype is drawn with 100 bins in ``colors[isotype]`` and handed to
    ``utils.output_figure`` as ``output_prefix + "_" + isotype + "V_pos"``.

    Args:
        all_shms_pos: mapping from isotype to a sized collection of values.
        colors: mapping from isotype to a histogram colour.
        output_prefix: prefix of every output file name.
        log: forwarded to ``utils.output_figure``.
    """
    for isotype in all_shms_pos:
        values = all_shms_pos[isotype]
        if len(values) >= 10:
            plt.hist(values, bins=100, color=colors[isotype], alpha=.75)
            plt.xlabel("#SHM in " + isotype + "V gene segment", fontsize=16)
            plt.ylabel("# sequences", fontsize=16)
            for set_ticks in (plt.xticks, plt.yticks):
                set_ticks(fontsize=14)
            plt.xlim(0, .75)
            target = output_prefix + "_" + isotype + "V_pos"
            figure_title = ("Distribution of SHM relative positions in " +
                            isotype + "V segments")
            utils.output_figure(target, figure_title, log)
Esempio n. 18
0
def output_num_shms(num_all_shms, colors, output_prefix, log):
    """Plot a grouped histogram of SHM counts, one series per isotype.

    Isotypes with no values are left out.  When nothing remains, a message
    is logged and no figure is produced; ``plt.hist`` cannot draw an empty
    dataset with an empty colour list.

    Args:
        num_all_shms: mapping from isotype to a sized collection of values.
        colors: mapping from isotype to a histogram colour.
        output_prefix: prefix of the output name (``"_shms_number"`` is
            appended).
        log: logger; also forwarded to ``utils.output_figure``.
    """
    pos = []
    labels = []
    cols = []
    for isotype in num_all_shms:
        if len(num_all_shms[isotype]) > 0:
            pos.append(num_all_shms[isotype])
            labels.append(str(isotype))
            cols.append(colors[isotype])
    if not pos:
        log.info("Output contains no SHM counts. Plot drawing was skipped")
        return
    plt.hist(pos, bins=50, color=cols, alpha=.75, label=labels)
    plt.legend(loc='upper center', ncol=len(pos), fontsize=16)
    plt.xlabel("# of SHM in V gene segment", fontsize=16)
    plt.ylabel("# SHMs", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlim(0, 150)
    output_fname = output_prefix + "_shms_number"
    utils.output_figure(output_fname, "Distribution of # SHMs in V segments",
                        log)
Esempio n. 19
0
def visualize_vj_heatmap(labeling_df, output_pdf, log):
    """Draw a heatmap of VJ combinations taken from the labeling table.

    The ``'V_hit'`` and ``'J_hit'`` columns are read; when either is empty
    a message is logged and nothing is drawn.  Otherwise a ``VJMatrix`` is
    built, its size is logged, and the table from ``CreateTable(100)`` is
    rendered with seaborn.

    Args:
        labeling_df: indexable by ``'V_hit'`` and ``'J_hit'``.
        output_pdf: forwarded to ``utils.output_figure``.
        log: logger; also forwarded to ``utils.output_figure``.
    """
    v_hits = list(labeling_df['V_hit'])
    j_hits = list(labeling_df['J_hit'])
    if not v_hits or not j_hits:
        log.info("VJ data-frame contains 0 records. VJ usage visualization will be skipped")
        return
    vj_matrix = VJMatrix(v_hits, j_hits)
    num_pairs = str(len(vj_matrix.vj_dict))
    num_v = str(len(vj_matrix.v_set))
    num_j = str(len(vj_matrix.j_set))
    log.info(num_pairs + " VJ pairs were extracted. Pairs are presented by " +
             num_v + " V genes & " + num_j + " J genes")
    table, v, j = vj_matrix.CreateTable(100)
    # Changes the global matplotlib font size.
    mplt.rcParams.update({'font.size': 20})
    figure, ax = plt.subplots(figsize=(10, 15))
    sns.heatmap(table, cmap=plt.cm.jet, xticklabels=v, yticklabels=j, ax=ax)
    ax.tick_params(labelsize=16)
    # x ticks sit on the left cell edges, y ticks on the cell centres.
    x_ticks = [float(col) for col in range(len(v))]
    y_ticks = [row + .5 for row in range(len(j))]
    plt.xticks(x_ticks, v, rotation=60, fontsize=14)
    plt.yticks(y_ticks, j, rotation='horizontal', fontsize=14)
    figure_title = "VJ heatmap for the most abundant VJ combinations"
    utils.output_figure(output_pdf, figure_title, log)
Esempio n. 20
0
def visualize_aa_substitution_matrix(shms_df, output_fname, log):
    """Plot a heatmap of amino-acid substitution frequencies.

    Within each read, an SHM is counted only when its codon index
    (``read_pos // 3``) differs from that of the previous SHM, so several
    SHMs in one codon contribute a single amino-acid change.  Pairs whose
    ``gene_aa`` and ``read_aa`` both pass ``aa_is_valid`` are tallied,
    normalised by the total and drawn with ``sns.heatmap``.

    Args:
        shms_df: container that is iterated for keys and indexed by each key
            to obtain an iterable of SHM objects exposing ``read_pos``,
            ``gene_aa`` and ``read_aa``.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.

    Returns:
        The frequency matrix as a list of lists, indexed
        ``[read amino acid][gene amino acid]`` in ``get_aa_list()`` order.
    """
    dict_aa = dict()
    num_shms = 0
    for it in shms_df:
        prev_pos = -1
        for shm in shms_df[it]:
            # Floor division yields the codon index on Python 2 and 3; with
            # `/` Python 3 compared fractional values, so SHMs in the same
            # codon were counted separately.
            if prev_pos // 3 != shm.read_pos // 3:
                if aa_is_valid(shm.gene_aa) and aa_is_valid(shm.read_aa):
                    aa_pair = shm.gene_aa + shm.read_aa
                    dict_aa[aa_pair] = dict_aa.get(aa_pair, 0) + 1
                    num_shms += 1
            prev_pos = shm.read_pos
    aa_list = get_aa_list()
    aa_freq = [[0] * len(aa_list) for _ in aa_list]
    # num_shms is positive whenever dict_aa is non-empty, so the division
    # below cannot hit zero.
    for aa_pair in dict_aa:
        row = aa_list.index(aa_pair[1])
        col = aa_list.index(aa_pair[0])
        aa_freq[row][col] = float(dict_aa[aa_pair]) / float(num_shms)
    # NOTE(review): rows are indexed by the read amino acid but the y axis is
    # labelled "From" -- confirm the intended orientation.
    fig, ax = plt.subplots()
    sns.heatmap(aa_freq,
                cmap=plt.cm.jet,
                xticklabels=aa_list,
                yticklabels=aa_list,
                square=True,
                ax=ax)
    ax.tick_params(labelsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12, rotation='horizontal')
    plt.xlabel("To", fontsize=14)
    plt.ylabel("From", fontsize=14, rotation='horizontal')
    utils.output_figure(output_fname, "Amino acid substitution heatmap", log)
    return aa_freq
Esempio n. 21
0
def OutputFractionOfSynonymousSHMs(shm_df, output_fname, log):
    """Plot the per-read fraction of synonymous SHMs for two groups.

    For every key of ``shm_df`` the share of SHMs with a truthy
    ``synonymous`` attribute is computed (0 when the read has no SHMs).
    Fractions go into the 'V gene' series when ``it.is_variable()`` is
    true and into the 'J gene' series otherwise.

    Args:
        shm_df: container iterated for keys (exposing ``is_variable()``)
            and indexed by each key to obtain a sized iterable of SHMs.
        output_fname: forwarded to ``utils.output_figure``.
        log: forwarded to ``utils.output_figure``.
    """
    v_shm_fractions = []
    j_shm_fractions = []
    for it in shm_df:
        read_shms = shm_df[it]
        synonymous_count = sum(1 for shm in read_shms if shm.synonymous)
        total = len(read_shms)
        if total != 0:
            fraction = float(synonymous_count) / total
        else:
            fraction = 0
        bucket = v_shm_fractions if it.is_variable() else j_shm_fractions
        bucket.append(fraction)
    plt.hist([v_shm_fractions, j_shm_fractions], label=['V gene', 'J gene'])
    plt.xlabel('Fraction of synonymous SHMs', fontsize=14)
    plt.ylabel('# sequences', fontsize=14)
    plt.legend(loc='upper right', fontsize=14)
    figure_title = "Fractions of synonymous SHMs in V and J genes"
    utils.output_figure(output_fname, figure_title, log)