Ejemplo n.º 1
0
def create_maf_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, groups=None, group_kind=None
):
    """Create the distribution of the maf of the snvs found in seqs.

    The reference allele is not taken into account.

    seqs -- iterable of sequences; the "snv" features of each one are used.
    distrib_fhand, plot_fhand, summary_fhand -- optional output handles that
        are forwarded untouched to create_distribution.
    groups, group_kind -- optional restriction forwarded to
        calculate_maf_frequency. When both are given they are also shown in
        the title of the distribution.

    Nothing is generated when no snv yields a usable maf.
    """
    title = "maf"
    if groups and group_kind:
        title = "maf (%s: %s)" % (group_kind, ",".join(groups))

    mafs = CachedArray("f")
    # Emptiness is tracked while filling the array, so the whole array does
    # not have to be copied into a list afterwards just to test it.
    have_values = False
    for seq in seqs:
        for snv in seq.get_features("snv"):
            maf = calculate_maf_frequency(snv, groups=groups, group_kind=group_kind)
            # Any falsy result (None, but also 0) is left out.
            if maf:
                mafs.append(maf)
                have_values = True

    if have_values:
        create_distribution(
            mafs,
            labels={"title": title},
            distrib_fhand=distrib_fhand,
            bins=None,
            plot_fhand=plot_fhand,
            range_=None,
            summary_fhand=summary_fhand,
            calculate_freqs=False,
            remove_outliers=False,
        )
Ejemplo n.º 2
0
def create_het_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, group_kind=None, groups=None, ploidy=2
):
    """Create the distribution of the heterozygosity of the snvs in seqs.

    The reference allele is not taken into account.

    seqs -- iterable of sequences; the "snv" features of each one are used.
    distrib_fhand, plot_fhand, summary_fhand -- optional output handles that
        are forwarded untouched to create_distribution.
    group_kind, groups -- optional restriction forwarded to
        calculate_heterozygosity. When both are given they are also shown in
        the title of the distribution.
    ploidy -- forwarded to calculate_heterozygosity.

    Nothing is generated when no snv yields a heterozygosity.
    """
    title = "heterozygosity"
    if groups and group_kind:
        title = "heterozygosity (%s: %s)" % (group_kind, ",".join(groups))

    hets = CachedArray("f")
    # Emptiness is tracked while filling the array, so the whole array does
    # not have to be copied into a list afterwards just to test it.
    have_values = False
    for seq in seqs:
        for snv in seq.get_features("snv"):
            # A value already stored in the snv is only reused when no
            # group_kind was requested; otherwise it is calculated again.
            if not group_kind and "heterozygosity" in snv.qualifiers:
                het = snv.qualifiers["heterozygosity"]
            else:
                het = calculate_heterozygosity(snv, ploidy, group_kind=group_kind, groups=groups)
            # Unlike None, a heterozygosity of 0 is a value worth keeping.
            if het is not None:
                hets.append(het)
                have_values = True

    if have_values:
        create_distribution(
            hets,
            labels={"title": title},
            distrib_fhand=distrib_fhand,
            bins=None,
            plot_fhand=plot_fhand,
            range_=None,
            summary_fhand=summary_fhand,
            calculate_freqs=False,
            remove_outliers=False,
        )
Ejemplo n.º 3
0
def create_pic_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, read_groups=None, group_kind=None, groups=None
):
    """Create the distribution of the pic of the snvs found in seqs.

    The reference allele is not taken into account.

    seqs -- iterable of sequences; the "snv" features of each one are used.
    distrib_fhand, plot_fhand, summary_fhand -- optional output handles that
        are forwarded untouched to create_distribution.
    read_groups -- never read by this function; it is kept in the signature
        so existing callers keep working.
    group_kind, groups -- optional restriction forwarded to calculate_pic.
        When both are given they are also shown in the title of the
        distribution.

    Nothing is generated when no snv yields a pic.
    """
    title = "pic"
    if groups and group_kind:
        title = "pic (%s: %s)" % (group_kind, ",".join(groups))

    pics = CachedArray("f")
    # Emptiness is tracked while filling the array, so the whole array does
    # not have to be copied into a list afterwards just to test it.
    have_values = False
    for seq in seqs:
        for snv in seq.get_features("snv"):
            # A value already stored in the snv is only reused when no
            # group_kind was requested; otherwise it is calculated again.
            if not group_kind and "pic" in snv.qualifiers:
                pic = snv.qualifiers["pic"]
            else:
                pic = calculate_pic(snv, group_kind=group_kind, groups=groups)
            # Unlike None, a pic of 0 is a value worth keeping.
            if pic is not None:
                pics.append(pic)
                have_values = True

    if have_values:
        create_distribution(
            pics,
            labels={"title": title},
            distrib_fhand=distrib_fhand,
            bins=None,
            plot_fhand=plot_fhand,
            range_=None,
            summary_fhand=summary_fhand,
            calculate_freqs=False,
            remove_outliers=False,
        )
    def test_basic_distribution(self):
        'It tests the distribution'
        summary_fhand = StringIO()
        distrib_fhand = StringIO()

        numbers = CachedArray(typecode='I')
        numbers.extend([1, 2, 3, 4, 5, 6, 7, 8, 9, 101, 2, 3, 4, 5, 6, 7, 8, 9])

        create_distribution(numbers, distrib_fhand=distrib_fhand,
                            summary_fhand=summary_fhand)
        result = '''Statistics for histogram
-------------------------
minimum: 1
maximum: 101
average: 10.5556
variance: 486.9136
sum: 190
items: 18'''
        assert result in summary_fhand.getvalue()
def main():
    '''Draw the distribution of the scores found in a blast result.

    Without incompatibility only the similarity is used: its distribution
    is calculated and drawn as a scatter plot into the output file.
    With incompatibility a 2D histogram of similarity against
    d_incompatibility is shown on screen as an image.
    '''
    in_fhand, out_fhand, with_incompat, low_mem = set_parameters()
    n_bins = 20
    similarity_range = (95, 100)

    parsed_blasts = BlastParser(in_fhand)

    # The scores that will be taken from every alignment result
    if with_incompat:
        wanted_scores = ['similarity', 'd_incompatibility']
    else:
        wanted_scores = ['similarity']
    scores = alignment_results_scores(parsed_blasts, wanted_scores)

    if not with_incompat:
        # One dimension: distribution plus scatter plot
        outcome = create_distribution(scores, range=similarity_range,
                                      bins=n_bins, low_memory=low_mem)
        counts = outcome['distrib'][0]
        edges = outcome['bin_edges'][0]
        # The last edge is dropped so both axes have the same length
        draw_scatter(x_axe=edges[:-1], y_axe=counts, fhand=out_fhand)
        return

    # Two dimensions: only the counts of the histogram are kept
    counts = numpy.histogram2d(scores[0], scores[1], bins=n_bins)[0]
    pylab.figure()
    pylab.subplot(111)
    heatmap = pylab.imshow(counts)
    heatmap.set_interpolation('bilinear')
    pylab.show()
Ejemplo n.º 6
0
def bam_distribs(bam_fhand, kind, basename=None, range_=None,
                 grouping=None, sample_size=None, summary_fhand=None,
                 labels=None, plot_file_format='svg'):
    '''It makes the distributions of a value taken from a bam file.

    One distribution is made for every group returned by the calculator of
    the requested kind. The grouping can be any of the readgroup items:
    platform, sample and library.

    bam_fhand -- file of the bam; only its name is used, unless sample_size
        is given, in which case it is handed to sample_bam.
    kind -- 'coverage', 'mapq' or 'edit_distance'. Any other value raises a
        KeyError.
    basename -- when given, every group writes its distribution to
        "<basename>.<kind>_<group>.dat" and its plot to
        "<basename>.<kind>_<group>.<plot_file_format>".
    range_ -- forwarded to create_distribution.
    grouping -- readgroup tag used to group the values. When None, 'PL' is
        used if the read groups hold more than one platform, 'SM' otherwise.
    sample_size -- when given, the distributions are made with a sample of
        the bam written to a temporary file.
    summary_fhand -- forwarded to create_distribution.
    labels -- accepted but ignored: the built-in labels of the kind are
        always used.
        NOTE(review): confirm whether caller labels should be honoured.
    plot_file_format -- extension of the plot files.

    It returns a dict with the result of create_distribution for every
    group, indexed by (grouping name, group name).
    '''
    value_calculator = {'coverage':_get_bam_coverage,
                       'mapq':_get_bam_mapping_quality,
                       'edit_distance': _get_bam_edit_distance}
    coverage_labels = {'title': "Coverage for %s %s",
                       'xlabel': 'Coverage',
                       'ylabel': 'Num. of positions',
                       'sum':None,
                       'items':'total sequence length'
                       }
    mapping_labels = {'title': "Mapping qualities for %s %s",
                       'xlabel': "mapping quality",
                       'ylabel': 'Num. of reads',
                       'sum':None, 'items':'number reads in the sam file'
                       }
    edit_distance_labels = {'title': "Edit distances for %s %s",
                       'xlabel': "edit distance",
                       'ylabel': 'Num. of reads',
                       'sum':None, 'items':'number reads in the sam file'
                       }
    plot_labels = {'coverage': coverage_labels,
                   'mapq':mapping_labels,
                   'edit_distance': edit_distance_labels}

    # Looked up first, so an unknown kind fails (with the same KeyError)
    # before any time is spent sampling and indexing the bam.
    calculate_values = value_calculator[kind]
    kind_labels = plot_labels[kind]
    remove_outliers = kind == 'coverage'

    sampled_bam_fhand = None
    bam = None
    try:
        if sample_size is not None:
            sampled_bam_fhand = NamedTemporaryFile(suffix='.bam')
            sample_bam(bam_fhand, sampled_bam_fhand, sample_size)
            sample_fpath = sampled_bam_fhand.name
        else:
            sample_fpath = bam_fhand.name

        create_bam_index(bam_fpath=sample_fpath)
        bam = pysam.Samfile(sample_fpath, 'rb')
        rgs = get_read_group_info(bam)

        if grouping is None:
            platforms = set(rg['PL'] for rg in rgs.values())
            if len(platforms) > 1:
                grouping = 'PL'
            else:
                grouping = 'SM'

        item_values = calculate_values(bam, rgs, grouping)

        # Human readable name of the grouping, used in the titles and in
        # the keys of the result. Any other tag is used as it is.
        grouping_names = {'PL': 'platform', 'SM': 'sample'}
        grouping_name = grouping_names.get(grouping, grouping)

        results = {}
        for group_name, values in item_values.items():
            distrib_fhand = None
            plot_fhand = None
            try:
                if basename is not None:
                    distrib_fhand = open('%s.%s_%s.dat' % (basename, kind,
                                                           group_name), 'w')
                    plot_fhand = open('%s.%s_%s.%s' % (basename, kind,
                                                       group_name,
                                                       plot_file_format),
                                      'w')

                # A copy, so the title template is not lost for the next
                # group.
                group_labels = copy.deepcopy(kind_labels)
                group_labels['title'] = group_labels['title'] % (grouping_name,
                                                                 group_name)
                distrib = create_distribution(values, labels=group_labels,
                                              distrib_fhand=distrib_fhand,
                                              plot_fhand=plot_fhand,
                                              range_=range_,
                                              summary_fhand=summary_fhand,
                                              remove_outliers=remove_outliers)
            finally:
                # The files opened here are ours to close; closing them
                # also makes sure everything written reaches the disk.
                for fhand in (distrib_fhand, plot_fhand):
                    if fhand is not None:
                        fhand.close()
            results[(grouping_name, group_name)] = distrib
        return results
    finally:
        if bam is not None:
            bam.close()
        # Closing the temporary file is what removes the sampled bam.
        if sampled_bam_fhand is not None:
            sampled_bam_fhand.close()