Example #1
def do_count(args, alignment_parser):
    """Count the number and density covering each merged gene in an annotation made made using the `generate` subcommand).
    
    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram    
    """
    # we expect many zero-length SegmentChains, so silence warnings about them for now
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]
    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    dtmp = pd.DataFrame(dtmp)
    dtmp.to_csv(fout,
                sep="\t",
                header=True,
                index=False,
                columns=column_order,
                na_rep="nan",
                float_format="%.8f")

    fout.close()
    printer.write("Done.")
Example #2
def roi_row_to_cds(row):
    """Helper function to extract coding portions from maximal spanning windows
    flanking CDS starts that are created by the |metagene| ``generate`` subprogram.
    
    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the |metagene|
        ``generate`` subprogram
        
    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    chainstr, alignment_offset, zero_point = row[1][["region", "alignment_offset", "zero_point"]]
    chain = SegmentChain.from_str(chainstr)
    cds_start = zero_point - alignment_offset
    subchain = chain.get_subchain(cds_start, chain.length)
    return subchain
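
# A minimal usage sketch. Assumptions: pandas is available as `pd`, SegmentChain
# is the plastid class already used above, and the one-row table below stands in
# for a real ROI file written by the |metagene| ``generate`` subprogram.
import pandas as pd

demo_rois = pd.DataFrame({
    "region": ["chrI:1000-1200(+)"],   # hypothetical 200 nt window
    "alignment_offset": [0],
    "zero_point": [50],                # CDS start lies 50 nt into the window
})
# iterrows() yields the (int, Series) pairs the function expects
cds_chains = [roi_row_to_cds(row) for row in demo_rois.iterrows()]
# cds_chains[0] should be the coding portion of the window, chrI:1050-1200(+)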
Example #3
def roi_row_to_cds(row):
    """Helper function to extract coding portions from maximal spanning windows
    flanking CDS starts that are created by the |metagene| ``generate`` subprogram.
    
    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the |metagene|
        ``generate`` subprogram
        
    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    chainstr, alignment_offset, zero_point = row[1][[
        "region", "alignment_offset", "zero_point"
    ]]
    chain = SegmentChain.from_str(chainstr)
    cds_start = zero_point - alignment_offset
    subchain = chain.get_subchain(cds_start, chain.length)
    return subchain
Example #4
    ],
    "YPL249C-A": [
        "YPL249C-A:0-53^291-334(-)",
    ],
    'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'],
    'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'],
    'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'],
    'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'],
    'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
    'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'],
    'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
    'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
    'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
}
known_juncs = {
    K: [SegmentChain.from_str(X) for X in V]
    for K, V in known_juncs.items()
}
"""Annotated splice junctions"""

all_known_juncs = []
for v in known_juncs.values():
    all_known_juncs.extend(v)
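
# Format note (illustrative): a junction string such as "YNL130C:0-53^145-180(-)"
# names the chromosome/transcript, two exonic segments (0-53 and 145-180, in
# 0-based half-open coordinates), and the strand; "^" marks the splice junction.
_demo_junc = SegmentChain.from_str("YNL130C:0-53^145-180(-)")
assert _demo_junc.strand == "-"
assert _demo_junc.length == 53 + (180 - 145)   # total exonic length across the junction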

known_juncs_as_tuples = {
    "YNL130C": [
        ("YNL130C", 53, 145, "-"),
    ],
    "YPL249C-A": [
        ("YPL249C-A", 53, 291, "-"),
    ],
Example #5
def do_count(roi_table, ga, norm_start, norm_end, min_counts, min_len, max_len,
             aggregate=False, printer=NullWriter()):
    """Calculate a :term:`metagene profile` for each read length in the dataset
    
    Parameters
    ----------
    roi_table : :class:`pandas.DataFrame`
        Table specifying regions of interest, generated
        by :py:func:`plastid.bin.metagene.do_generate`
    
    ga : |BAMGenomeArray|
        Count data
    
    norm_start : int
        Coordinate in window specifying normalization region start
    
    norm_end : int
        Coordinate in window specifying normalization region end
    
    min_counts : float
        Minimum number of counts in `window[norm_start:norm_end]`
        required for inclusion in metagene profile

    min_len : int
        Minimum read length to include
    
    max_len : int
        Maximum read length to include

    aggregate : bool, optional
        Estimate P-site from aggregate reads at each position, instead of median
        normalized read density. Potentially noisier, but helpful for lower-count
        data or read lengths with few counts. (Default: False)
                             
    printer : file-like, optional
        filehandle to write logging info to (Default: :func:`~plastid.util.io.openers.NullWriter`)
        
               
    Returns
    -------
    dict
        Dictionary of :class:`numpy.ndarray` s of raw counts at each position (column)
        for each window (row)
    
    dict
        Dictionary of :class:`numpy.ndarray` s of normalized counts at each position (column)
        for each window (row), normalized by the total number of counts in that row
        from `norm_start` to `norm_end`
    
    :class:`pandas.DataFrame`
        Metagene profile of median normalized counts at each position across
        all windows, and the number of windows included in the calculation of each
        median, stratified by read length
    """
    window_size    = roi_table["window_size"][0]
    upstream_flank = roi_table["zero_point"][0]
    
    raw_count_dict  = OrderedDict()
    norm_count_dict = OrderedDict()
    shape = (len(roi_table),window_size)
    for i in range(min_len,max_len+1):
        # mask all by default
        raw_count_dict[i] = numpy.ma.MaskedArray(numpy.tile(numpy.nan,shape),
                                                 mask=numpy.tile(True,shape),
                                                 dtype=float)
    
    for i,row in roi_table.iterrows():
        if i % 1000 == 0:
            printer.write("Counted %s ROIs ..." % (i+1))
            
        roi    = SegmentChain.from_str(row["region"])
        mask   = SegmentChain.from_str(row["masked"])
        roi.add_masks(*mask)
        valid_mask = roi.get_masked_counts(ga).mask
        
        offset = int(round((row["alignment_offset"])))
        assert offset + roi.length <= window_size
        
        count_vectors = {}
        for k in raw_count_dict:
            count_vectors[k] = []

        for seg in roi:
            reads = ga.get_reads(seg)
            read_dict = {}
            for k in raw_count_dict:
                read_dict[k] = []

            for read in filter(lambda x: len(x.positions) in read_dict,reads):
                read_dict[len(read.positions)].append(read)
            
            for k in read_dict:
                count_vector = ga.map_fn(read_dict[k],seg)[1]
                count_vectors[k].extend(count_vector)
                
        for k in raw_count_dict:
            if roi.strand == "-":
                count_vectors[k] = count_vectors[k][::-1]

            raw_count_dict[k].data[i,offset:offset+roi.length] = numpy.array(count_vectors[k])
            raw_count_dict[k].mask[i,offset:offset+roi.length] = valid_mask
    
    profile_table = { "x" : numpy.arange(-upstream_flank,window_size-upstream_flank) }
    
    printer.write("Counted %s ROIs total." % (i+1))
    for k in raw_count_dict:
        k_raw = raw_count_dict[k]
        
        denominator = numpy.nansum(k_raw[:,norm_start:norm_end],axis=1)
        norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T
        
        # copy mask from raw counts, then add nans and infs
        norm_counts = numpy.ma.MaskedArray(norm_count_dict[k],
                                           mask=k_raw.mask)
        norm_counts.mask[numpy.isnan(norm_counts)] = True
        norm_counts.mask[numpy.isinf(norm_counts)] = True
        
        with warnings.catch_warnings():
            # ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4
            warnings.filterwarnings("ignore",".*mean of empty.*",RuntimeWarning)
            try:
                if not aggregate:
                    profile = numpy.ma.median(norm_counts[denominator >= min_counts],axis=0)
                else:
                    profile = numpy.nansum(k_raw[denominator >= min_counts],axis=0)
                    
            # in numpy under Python 3.5, this is an IndexError instead of a warning
            except IndexError:
                profile = numpy.zeros_like(profile_table["x"],dtype=float)
            # in newer versions of numpy, this is a ValueError instead of an IndexError
            except ValueError:
                profile = numpy.zeros_like(profile_table["x"],dtype=float)

        num_genes = ((~norm_counts.mask)[denominator >= min_counts]).sum(0)

        profile_table["%s-mers" % k]            = profile
        profile_table["%s_regions_counted" % k] = num_genes
        
    profile_table = pd.DataFrame(profile_table)
    
    return raw_count_dict, norm_count_dict, profile_table
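
# Illustrative call (not runnable as-is): assumes `alignments` is a
# |BAMGenomeArray| of ribosome profiling reads and `rois` is the ROI table
# written by the |metagene| ``generate`` subcommand; the numbers below are
# placeholders, not recommended defaults.
#
#     raw, norm, profile = do_count(rois, alignments,
#                                   norm_start=70, norm_end=150,
#                                   min_counts=10, min_len=25, max_len=35)
#     profile.to_csv("metagene_profile.txt", sep="\t", index=False)
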
known_juncs = {
                 "YNL130C"   : ["YNL130C:0-53^145-180(-)",],
                 "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",],
                 
                'YBR215W_mRNA_0'  : ['YBR215W_mRNA_0:0-108^192-2175(+)'],
                'YHL001W_mRNA_0'  : ['YHL001W_mRNA_0:0-146^544-961(+)'],
                'YIL018W_mRNA_0'  : ['YIL018W_mRNA_0:0-30^430-1280(+)'],
                'YIL133C_mRNA_0'  : ['YIL133C_mRNA_0:0-648^938-1007(-)'],
                'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
                'YKL006W_mRNA_0'  : ['YKL006W_mRNA_0:0-157^555-954(+)'],
                'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
                'YNL130C_mRNA_0'  : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
                'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
               }
known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() }
"""Annotated splice junctions"""

all_known_juncs = []
for v in known_juncs.values():
    all_known_juncs.extend(v)
    
known_juncs_as_tuples = {
                 "YNL130C"   : [("YNL130C",53,145,"-"),],
                 "YPL249C-A" : [("YPL249C-A",53,291,"-"),],
                 
                'YBR215W_mRNA_0'  : [('YBR215W_mRNA_0',108,192,'+'),],
                'YHL001W_mRNA_0'  : [('YHL001W_mRNA_0',146,544,'+'),],
                'YIL018W_mRNA_0'  : [('YIL018W_mRNA_0',30,430,'+'),],
                'YIL133C_mRNA_0'  : [('YIL133C_mRNA_0',648,938,'-'),],
                'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
Example #7
def check_window(tx_ivc,
                 known_roi,
                 known_offset,
                 known_ref_point,
                 flank_up,
                 flank_down,
                 test_method,
                 test_name,
                 ref_delta=0):
    """Helper function to test output of window landmark functions
    
    Parameters
    ----------
    tx_ivc : |SegmentChain|
        Test transcript from which the window will be derived
    
    known_roi : |SegmentChain|
        Reference output for ROI
    
    known_offset : int
        Known offset to start of ROI
    
    known_ref_point : (str,int,str) or numpy.nan
        Known offset to landmark in ROI as ("chromosome_name",position,"strand")
    
    flank_up : int
        Flank upstream of landmark to include in ROI
    
    flank_down : int
        Flank downstream of landmark to include in ROI
    
    test_method : function
        Function to test (e.g. :py:func:`window_cds_start`, :py:func:`window_cds_stop`)
    
    test_name : str
        Name of test (for generating rich error output)
    
    ref_delta : int, optional
        Distance from reference landmark at which to center windows
    """
    err_str = ("Failed %s on %s (strand: '%s', up: %s, down: %s). " %
               (test_name, str(tx_ivc), tx_ivc.spanning_segment.strand,
                flank_up, flank_down)) + "%s unequal (%s vs %s)"
    test_roi, test_offset, test_ref_point = test_method(tx_ivc,
                                                        flank_up,
                                                        flank_down,
                                                        ref_delta=ref_delta)
    check_equality(SegmentChain.from_str(known_roi), test_roi)

    # if no landmark
    if numpy.isnan(known_offset) or isinstance(
            known_ref_point, float) and numpy.isnan(known_ref_point):
        assert_true(numpy.isnan(test_offset),
                    msg=err_str % ("offset", known_offset, test_offset))
        assert_true(numpy.isnan(test_ref_point),
                    msg=err_str %
                    ("ref_point", known_ref_point, test_ref_point))
    # if landmark
    else:
        assert_equal(known_offset,
                     test_offset,
                     msg=err_str % ("offset", known_offset, test_offset))
        assert_equal(known_ref_point,
                     test_ref_point,
                     msg=err_str %
                     ("ref_point", known_ref_point, test_ref_point))
Example #8
def check_maximal_window(test_name, genome_hash, test_group, result_groups,
                         flank_up, flank_down):
    """
    test_name : str
        Descriptive name of test

    genome_hash : GenomeHash
        Mask hash

    test_group : list
        List of transcript IDs, referring to transcripts in the GFF text above

    result_groups : list
        list of tuples of (region_str, alignment_offset, zero_point) expected from
        maximal spanning window output

    flank_up : int
        Bases to include upstream of landmark in window

    flank_down : int
        Bases to include downstream of landmark in window
    """
    # table keys:
    #    gene_id
    #    window_size
    #    roi
    #    masked
    #    alignment_offset
    #    zero_point
    err_str = ("Failed %s (up: %s, down: %s). " %
               (test_name, flank_up, flank_down)) + "%s unequal (%s vs %s)"
    tx_ivcs = (_TRANSCRIPTS[X] for X in test_group)
    roi_table = group_regions_make_windows(tx_ivcs, genome_hash, flank_up,
                                           flank_down, window_cds_start)
    # DataFrame.sort(columns=...) was removed from pandas; sort_values is the current API
    roi_table.sort_values("region", inplace=True)
    trows = [X[1] for X in roi_table.iterrows()]
    result_groups = sorted(result_groups, key=lambda x: x[0])
    REGION = 0

    c = 0

    for n, result_group in enumerate(result_groups):
        # if no landmark
        if numpy.isnan(result_group[1]) or numpy.isnan(result_group[2]):
            c += 1  # increment counter for input that will have no output

        # if landmark
        else:
            check_equality(SegmentChain.from_str(result_group[0]),
                           SegmentChain.from_str(trows[n - c]["region"]),
                           test_name)
            assert_equal(
                result_group[1],
                trows[n - c]["alignment_offset"],
                msg=err_str %
                ("offset", result_group[1], trows[n - c]["alignment_offset"]))
            assert_equal(
                result_group[2],
                trows[n - c]["zero_point"],
                msg=err_str %
                ("ref_point", result_group[2], trows[n - c]["zero_point"]))
            if len(result_group) == 4:
                assert_equal(result_group[3],
                             trows[n - c]["masked"],
                             msg=err_str %
                             ("mask", result_group[3], trows[n - c]["masked"]))

    assert_equal(n + 1 - c, len(roi_table))
Example #9
    crossmap = GenomeHash(_MASKS)
    for flank_up, flank_down in _FLANKS:
        for test_name, test_group in _DO_GENERATE_MAX_WINDOW.items():
            result_group = _DO_GENERATE_MAX_WINDOW_RESULTS_MASKED[
                "%s_%s_%s" % (test_name, flank_up, flank_down)]
            yield check_maximal_window, test_name, crossmap, test_group, [
                result_group
            ], flank_up, flank_down
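
# Note: each yielded tuple above is a nose-style generated test: the first
# element is the test callable and the remaining elements are its arguments.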


#===============================================================================
# INDEX: test data
#===============================================================================

_MASKS = [
    SegmentChain.from_str("2L:7985694-7985744(+)"),
    SegmentChain.from_str("3R:4519879-4519891(-)"),
    SegmentChain.from_str("4:50-50000(+)"),
]

_TRANSCRIPTS_GFF = """##gff-version 3
3R    FlyBase    mRNA    4517211    4523544    .    -    .    ID=FBtr0081950;Name=hb-RB;Parent=FBgn0001180;Alias=FBtr0002097,FBtr0002098,CG9786-RB,hb[+]R2.8;Dbxref=FlyBase_Annotation_IDs:CG9786-RB,REFSEQ:NM_169234;score_text=Strongly Supported;score=11
3R    FlyBase    exon    4517211    4519894    .    -    .    Name=hb:2;Parent=FBtr0081950;parent_type=mRNA
3R    FlyBase    CDS    4517600    4519876    .    -    0    Name=hb-cds;Parent=FBtr0081950;parent_type=mRNA
3R    FlyBase    exon    4523048    4523544    .    -    .    Name=hb:4;Parent=FBtr0081950;parent_type=mRNA

3R    FlyBase    mRNA    4516702    4520322    .    -    .    ID=FBtr0081951;Name=hb-RA;Parent=FBgn0001180;Alias=FBtr0002096,FBtr0002097,CG9786-RA,hb[+]R3.2;Dbxref=FlyBase_Annotation_IDs:CG9786-RA,REFSEQ:NM_169233;score_text=Strongly Supported;score=11
3R    FlyBase    exon    4516702    4519894    .    -    .    Name=hb:1;Parent=FBtr0081951;parent_type=mRNA
3R    FlyBase    CDS    4517600    4519876    .    -    0    Name=hb-cds;Parent=FBtr0081951;parent_type=mRNA
3R    FlyBase    exon    4520178    4520322    .    -    .    Name=hb:3;Parent=FBtr0081951;parent_type=mRNA
Example #10
    ],
    "YPL249C-A": [
        "YPL249C-A:0-53^291-334(-)",
    ],
    'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'],
    'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'],
    'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'],
    'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'],
    'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
    'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'],
    'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
    'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
    'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
}
known_juncs = {
    K: [SegmentChain.from_str(X) for X in V]
    for K, V in known_juncs.items()
}
"""Annotated splice junctions"""

known_juncs_tuples = {
    "YNL130C": [
        ("YNL130C", 53, 145, "-"),
    ],
    "YPL249C-A": [
        ("YPL249C-A", 53, 291, "-"),
    ],
    'YBR215W_mRNA_0': [
        ('YBR215W_mRNA_0', 108, 192, '+'),
    ],
    'YHL001W_mRNA_0': [
Example #11
known_juncs = {
                 "YNL130C"   : ["YNL130C:0-53^145-180(-)",],
                 "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",],
                 
                'YBR215W_mRNA_0'  : ['YBR215W_mRNA_0:0-108^192-2175(+)'],
                'YHL001W_mRNA_0'  : ['YHL001W_mRNA_0:0-146^544-961(+)'],
                'YIL018W_mRNA_0'  : ['YIL018W_mRNA_0:0-30^430-1280(+)'],
                'YIL133C_mRNA_0'  : ['YIL133C_mRNA_0:0-648^938-1007(-)'],
                'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
                'YKL006W_mRNA_0'  : ['YKL006W_mRNA_0:0-157^555-954(+)'],
                'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
                'YNL130C_mRNA_0'  : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
                'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
               }
known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() }
"""Annotated splice junctions"""

known_juncs_tuples = {
                 "YNL130C"   : [("YNL130C",53,145,"-"),],
                 "YPL249C-A" : [("YPL249C-A",53,291,"-"),],
                 
                'YBR215W_mRNA_0'  : [('YBR215W_mRNA_0',108,192,'+'),],
                'YHL001W_mRNA_0'  : [('YHL001W_mRNA_0',146,544,'+'),],
                'YIL018W_mRNA_0'  : [('YIL018W_mRNA_0',30,430,'+'),],
                'YIL133C_mRNA_0'  : [('YIL133C_mRNA_0',648,938,'-'),],
                'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
                'YKL006W_mRNA_0'  : [('YKL006W_mRNA_0',157,555,'+'),],
                'YMR194C_B_mRNA_0': [('YMR194C_B_mRNA_0',325,397,'-'),],
                'YNL130C_mRNA_0'  : [('YNL130C_mRNA_0',1204,1296,'-'),],
                'YPL249C_A_mRNA_0': [('YPL249C_A_mRNA_0',415,653,'-'),],