def do_count(args, alignment_parser):
    """Count the number and density of reads covering each merged gene in an
    annotation made using the ``generate`` subcommand.

    For every gene listed in the position file, and for each of its ``exon``,
    ``utr5``, ``cds``, and ``utr3`` regions, the total counts, the region
    length, and the RPKM are tabulated. The table is written, tab-delimited,
    to ``<args.outbase>.txt``.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram

    alignment_parser : object
        Parser whose ``get_genome_array_from_args()`` method is used to build
        the genome array of count data from `args`
    """
    # we expect many zero-length SegmentChains, so turn these off for now
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]

    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    # scale factor: per 1000 positions of length, per 1e6 counts in the dataset
    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    # one list per output column; column_order fixes the column layout
    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            # zero-length regions have no defined density
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    table = pd.DataFrame(dtmp)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    try:
        table.to_csv(
            fout,
            sep="\t",
            header=True,
            index=False,
            columns=column_order,
            na_rep="nan",
            float_format="%.8f"
        )
    finally:
        # release the handle even if writing the table fails
        fout.close()

    printer.write("Done.")
def roi_row_to_cds(row):
    """Pull the coding portion out of a maximal spanning window that flanks
    a CDS start, as produced by the |metagene| ``generate`` subprogram.

    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the
        |metagene| ``generate`` subprogram

    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    wanted = ["region", "alignment_offset", "zero_point"]
    region_str, offset, landmark = row[1][wanted]

    window = SegmentChain.from_str(region_str)

    # the subchain begins `landmark - offset` positions into the window
    # and runs to the window's end
    return window.get_subchain(landmark - offset, window.length)
def roi_row_to_cds(row):
    """Return the coding part of a maximal spanning window around a CDS
    start, where the window comes from the |metagene| ``generate`` subprogram.

    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the
        |metagene| ``generate`` subprogram

    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    columns = ["region", "alignment_offset", "zero_point"]
    roi_str, roi_offset, roi_zero = row[1][columns]

    roi = SegmentChain.from_str(roi_str)
    start_in_roi = roi_zero - roi_offset  # start coordinate within the ROI
    end_in_roi = roi.length  # keep everything through the end of the ROI
    return roi.get_subchain(start_in_roi, end_in_roi)
], "YPL249C-A": [ "YPL249C-A:0-53^291-334(-)", ], 'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K: [SegmentChain.from_str(X) for X in V] for K, V in known_juncs.items() } """Annotated splice junctions""" all_known_juncs = [] for v in known_juncs.values(): all_known_juncs.extend(v) known_juncs_as_tuples = { "YNL130C": [ ("YNL130C", 53, 145, "-"), ], "YPL249C-A": [ ("YPL249C-A", 53, 291, "-"), ],
def do_count(roi_table, ga, norm_start, norm_end, min_counts, min_len, max_len,
             aggregate=False, printer=NullWriter()):
    """Calculate a :term:`metagene profile` for each read length in the dataset

    Parameters
    ----------
    roi_table : :class:`pandas.DataFrame`
        Table specifying regions of interest, generated
        by :py:func:`plastid.bin.metagene.do_generate`. Rows are processed
        by position, so the table's index labels do not need to be `0..N-1`.

    ga : |BAMGenomeArray|
        Count data

    norm_start : int
        Coordinate in window specifying normalization region start

    norm_end : int
        Coordinate in window specifying normalization region end

    min_counts : float
        Minimum number of counts in `window[norm_start:norm_end]`
        required for inclusion in metagene profile

    min_len : int
        Minimum read length to include

    max_len : int
        Maximum read length to include

    aggregate : bool, optional
        Estimate P-site from aggregate reads at each position, instead of
        median normalized read density. Potentially noisier, but helpful for
        lower-count data or read lengths with few counts. (Default: False)

    printer : file-like, optional
        filehandle to write logging info to
        (Default: :func:`~plastid.util.io.openers.NullWriter`)

    Returns
    -------
    dict
        Dictionary, keyed by read length, of :class:`numpy.ndarray` s of raw
        counts at each position (column) for each window (row)

    dict
        Dictionary, keyed by read length, of :class:`numpy.ndarray` s of
        normalized counts at each position (column) for each window (row),
        normalized by the total number of counts in that row from
        `norm_start` to `norm_end`

    :class:`pandas.DataFrame`
        Metagene profile of median normalized counts at each position across
        all windows, and the number of windows included in the calculation of
        each median, stratified by read length

    Raises
    ------
    AssertionError
        If a region of interest, placed at its alignment offset, would extend
        past the end of the window
    """
    # positional lookups: do not assume the table carries a 0..N-1 index
    window_size = roi_table["window_size"].iloc[0]
    upstream_flank = roi_table["zero_point"].iloc[0]

    raw_count_dict = OrderedDict()
    norm_count_dict = OrderedDict()

    num_rois = len(roi_table)
    shape = (num_rois, window_size)
    for read_length in range(min_len, max_len + 1):
        # mask all by default
        raw_count_dict[read_length] = numpy.ma.MaskedArray(
            numpy.tile(numpy.nan, shape),
            mask=numpy.tile(True, shape),
            dtype=float
        )

    # `i` is the row's position in the table, which is also its row in the
    # count arrays; the index label yielded by iterrows() is ignored
    for i, (_, row) in enumerate(roi_table.iterrows()):
        if i % 1000 == 0:
            printer.write("Counted %s ROIs ..." % (i + 1))

        roi = SegmentChain.from_str(row["region"])
        mask = SegmentChain.from_str(row["masked"])
        roi.add_masks(*mask)
        valid_mask = roi.get_masked_counts(ga).mask

        offset = int(round((row["alignment_offset"])))
        assert offset + roi.length <= window_size

        count_vectors = {k: [] for k in raw_count_dict}

        for seg in roi:
            reads = ga.get_reads(seg)

            # bin reads by length, discarding lengths outside the requested range
            read_dict = {k: [] for k in raw_count_dict}
            for read in reads:
                read_length = len(read.positions)
                if read_length in read_dict:
                    read_dict[read_length].append(read)

            for k in read_dict:
                count_vector = ga.map_fn(read_dict[k], seg)[1]
                count_vectors[k].extend(count_vector)

        for k in raw_count_dict:
            # reverse minus-strand vectors before storing them in the window
            if roi.strand == "-":
                count_vectors[k] = count_vectors[k][::-1]

            raw_count_dict[k].data[i, offset:offset + roi.length] = numpy.array(count_vectors[k])
            raw_count_dict[k].mask[i, offset:offset + roi.length] = valid_mask

    profile_table = {"x": numpy.arange(-upstream_flank, window_size - upstream_flank)}

    printer.write("Counted %s ROIs total." % num_rois)

    for k in raw_count_dict:
        k_raw = raw_count_dict[k]

        denominator = numpy.nansum(k_raw[:, norm_start:norm_end], axis=1)
        norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T

        # copy mask from raw counts, then add nans and infs
        norm_counts = numpy.ma.MaskedArray(norm_count_dict[k], mask=k_raw.mask)
        norm_counts.mask[numpy.isnan(norm_counts)] = True
        norm_counts.mask[numpy.isinf(norm_counts)] = True

        with warnings.catch_warnings():
            # ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4
            warnings.filterwarnings("ignore", ".*mean of empty.*", RuntimeWarning)
            try:
                if aggregate == False:
                    profile = numpy.ma.median(norm_counts[denominator >= min_counts], axis=0)
                else:
                    profile = numpy.nansum(k_raw[denominator >= min_counts], axis=0)

            # in numpy under Python3.5, this is an IndexError instead of a warning;
            # in new versions of numpy, it is a ValueError instead of an IndexError
            except (IndexError, ValueError):
                profile = numpy.zeros_like(profile_table["x"], dtype=float)

        # number of unmasked windows contributing at each position
        num_genes = ((~norm_counts.mask)[denominator >= min_counts]).sum(0)

        profile_table["%s-mers" % k] = profile
        profile_table["%s_regions_counted" % k] = num_genes

    profile_table = pd.DataFrame(profile_table)

    return raw_count_dict, norm_count_dict, profile_table
known_juncs = { "YNL130C" : ["YNL130C:0-53^145-180(-)",], "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",], 'YBR215W_mRNA_0' : ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0' : ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0' : ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0' : ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0' : ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0' : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() } """Annotated splice junctions""" all_known_juncs = [] for v in known_juncs.values(): all_known_juncs.extend(v) known_juncs_as_tuples = { "YNL130C" : [("YNL130C",53,145,"-"),], "YPL249C-A" : [("YPL249C-A",53,291,"-"),], 'YBR215W_mRNA_0' : [('YBR215W_mRNA_0',108,192,'+'),], 'YHL001W_mRNA_0' : [('YHL001W_mRNA_0',146,544,'+'),], 'YIL018W_mRNA_0' : [('YIL018W_mRNA_0',30,430,'+'),], 'YIL133C_mRNA_0' : [('YIL133C_mRNA_0',648,938,'-'),], 'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
def check_window(tx_ivc,
                 known_roi,
                 known_offset,
                 known_ref_point,
                 flank_up,
                 flank_down,
                 test_method,
                 test_name,
                 ref_delta=0):
    """Helper that compares the output of a window landmark function
    against reference values

    Parameters
    ----------
    tx_ivc : |SegmentChain|
        Test Transcript from which window will be derived

    known_roi : str
        Reference output for ROI, as a string parseable by
        `SegmentChain.from_str`

    known_offset : int or numpy.nan
        Known offset to start of ROI

    known_ref_point : (str,int,str) or numpy.nan
        Known offset to landmark in ROI as ("chromosome_name",position,"strand")

    flank_up : int
        Flank upstream of landmark to include in ROI

    flank_down : int
        Flank downstream of landmark to include in ROI

    test_method : function
        Function to test (e.g. :py:func:`window_cds_start`, :py:func:`window_cds_stop`)

    test_name : str
        Name of test (for generating rich error output)

    ref_delta : int, optional
        Distance from reference landmark at which to center windows
    """
    prefix = "Failed %s on %s (strand: '%s', up: %s, down: %s). " % (
        test_name, str(tx_ivc), tx_ivc.spanning_segment.strand, flank_up, flank_down
    )
    template = prefix + "%s unequal (%s vs %s)"

    def mismatch(label, expected, found):
        # failure message naming the quantity that differed
        return template % (label, expected, found)

    found_roi, found_offset, found_ref_point = test_method(
        tx_ivc, flank_up, flank_down, ref_delta=ref_delta
    )
    check_equality(SegmentChain.from_str(known_roi), found_roi)

    # a NaN offset, or a float NaN reference point, means no landmark is expected
    landmark_missing = numpy.isnan(known_offset)
    if not landmark_missing:
        landmark_missing = (
            isinstance(known_ref_point, float) and numpy.isnan(known_ref_point)
        )

    if landmark_missing:
        assert_true(
            numpy.isnan(found_offset),
            msg=mismatch("offset", known_offset, found_offset)
        )
        assert_true(
            numpy.isnan(found_ref_point),
            msg=mismatch("ref_point", known_ref_point, found_ref_point)
        )
        return

    # landmark present: both values must match the references exactly
    assert_equal(
        known_offset,
        found_offset,
        msg=mismatch("offset", known_offset, found_offset)
    )
    assert_equal(
        known_ref_point,
        found_ref_point,
        msg=mismatch("ref_point", known_ref_point, found_ref_point)
    )
def check_maximal_window(test_name, genome_hash, test_group, result_groups, flank_up, flank_down):
    """Helper function to test maximal spanning windows built around CDS
    starts by `group_regions_make_windows`

    Parameters
    ----------
    test_name : str
        Descriptive name of test

    genome_hash : GenomeHash
        Mask hash

    test_group : list
        List of transcript IDs, used as keys into `_TRANSCRIPTS`

    result_groups : list
        list of tuples of `(region_str, alignment_offset, zero_point)`
        expected from maximal spanning window output. A tuple may carry a
        fourth element, which is compared against the `masked` column.
        Tuples whose offset or zero point is NaN are expected to produce
        no row of output.

    flank_up : int
        Bases to include upstream of landmark in window

    flank_down : int
        bases to include downstream of landmark in window
    """
    # table keys:
    #    gene_id
    #    window_size
    #    roi
    #    masked
    #    alignment_offset
    #    zero_point
    err_str = ("Failed %s (up: %s, down: %s). " %
               (test_name, flank_up, flank_down)) + "%s unequal (%s vs %s)"

    tx_ivcs = (_TRANSCRIPTS[X] for X in test_group)
    roi_table = group_regions_make_windows(
        tx_ivcs, genome_hash, flank_up, flank_down, window_cds_start
    )

    # DataFrame.sort() was removed from pandas; use sort_values() when present
    # and fall back to the legacy call on old installations
    if hasattr(roi_table, "sort_values"):
        roi_table = roi_table.sort_values(by=["region"])
    else:
        roi_table = roi_table.sort(columns=["region"])

    trows = [X[1] for X in roi_table.iterrows()]
    result_groups = sorted(result_groups, key=lambda x: x[0])

    # number of expected results so far that yield no output row
    skipped = 0
    for n, result_group in enumerate(result_groups):
        # if no landmark, the input is expected to be absent from the output
        if numpy.isnan(result_group[1]) or numpy.isnan(result_group[2]):
            skipped += 1
            continue

        # if landmark, compare against the matching output row
        found = trows[n - skipped]
        check_equality(
            SegmentChain.from_str(result_group[0]),
            SegmentChain.from_str(found["region"]),
            test_name
        )
        assert_equal(
            result_group[1],
            found["alignment_offset"],
            msg=err_str % ("offset", result_group[1], found["alignment_offset"])
        )
        assert_equal(
            result_group[2],
            found["zero_point"],
            msg=err_str % ("ref_point", result_group[2], found["zero_point"])
        )
        if len(result_group) == 4:
            assert_equal(
                result_group[3],
                found["masked"],
                msg=err_str % ("mask", result_group[3], found["masked"])
            )

    # every expected result with a landmark must account for exactly one row
    assert_equal(len(result_groups) - skipped, len(roi_table))
crossmap = GenomeHash(_MASKS) for flank_up, flank_down in _FLANKS: for test_name, test_group in _DO_GENERATE_MAX_WINDOW.items(): result_group = _DO_GENERATE_MAX_WINDOW_RESULTS_MASKED[ "%s_%s_%s" % (test_name, flank_up, flank_down)] yield check_maximal_window, test_name, crossmap, test_group, [ result_group ], flank_up, flank_down #=============================================================================== # INDEX: test data #=============================================================================== _MASKS = [ SegmentChain.from_str("2L:7985694-7985744(+)"), SegmentChain.from_str("3R:4519879-4519891(-)"), SegmentChain.from_str("4:50-50000(+)"), ] _TRANSCRIPTS_GFF = """##gff-version 3 3R FlyBase mRNA 4517211 4523544 . - . ID=FBtr0081950;Name=hb-RB;Parent=FBgn0001180;Alias=FBtr0002097,FBtr0002098,CG9786-RB,hb[+]R2.8;Dbxref=FlyBase_Annotation_IDs:CG9786-RB,REFSEQ:NM_169234;score_text=Strongly Supported;score=11 3R FlyBase exon 4517211 4519894 . - . Name=hb:2;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase CDS 4517600 4519876 . - 0 Name=hb-cds;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase exon 4523048 4523544 . - . Name=hb:4;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase mRNA 4516702 4520322 . - . ID=FBtr0081951;Name=hb-RA;Parent=FBgn0001180;Alias=FBtr0002096,FBtr0002097,CG9786-RA,hb[+]R3.2;Dbxref=FlyBase_Annotation_IDs:CG9786-RA,REFSEQ:NM_169233;score_text=Strongly Supported;score=11 3R FlyBase exon 4516702 4519894 . - . Name=hb:1;Parent=FBtr0081951;parent_type=mRNA 3R FlyBase CDS 4517600 4519876 . - 0 Name=hb-cds;Parent=FBtr0081951;parent_type=mRNA 3R FlyBase exon 4520178 4520322 . - . Name=hb:3;Parent=FBtr0081951;parent_type=mRNA
], "YPL249C-A": [ "YPL249C-A:0-53^291-334(-)", ], 'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K: [SegmentChain.from_str(X) for X in V] for K, V in known_juncs.items() } """Annotated splice junctions""" known_juncs_tuples = { "YNL130C": [ ("YNL130C", 53, 145, "-"), ], "YPL249C-A": [ ("YPL249C-A", 53, 291, "-"), ], 'YBR215W_mRNA_0': [ ('YBR215W_mRNA_0', 108, 192, '+'), ], 'YHL001W_mRNA_0': [
known_juncs = { "YNL130C" : ["YNL130C:0-53^145-180(-)",], "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",], 'YBR215W_mRNA_0' : ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0' : ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0' : ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0' : ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0' : ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0' : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() } """Annotated splice junctions""" known_juncs_tuples = { "YNL130C" : [("YNL130C",53,145,"-"),], "YPL249C-A" : [("YPL249C-A",53,291,"-"),], 'YBR215W_mRNA_0' : [('YBR215W_mRNA_0',108,192,'+'),], 'YHL001W_mRNA_0' : [('YHL001W_mRNA_0',146,544,'+'),], 'YIL018W_mRNA_0' : [('YIL018W_mRNA_0',30,430,'+'),], 'YIL133C_mRNA_0' : [('YIL133C_mRNA_0',648,938,'-'),], 'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),], 'YKL006W_mRNA_0' : [('YKL006W_mRNA_0',157,555,'+'),], 'YMR194C_B_mRNA_0': [('YMR194C_B_mRNA_0',325,397,'-'),], 'YNL130C_mRNA_0' : [('YNL130C_mRNA_0',1204,1296,'-'),], 'YPL249C_A_mRNA_0': [('YPL249C_A_mRNA_0',415,653,'-'),],