def test_add_commands(self):
    """Check that RunRecord.addCommands logs long argument lists correctly.

    An empty argument list must be logged as 'No arguments given', and a
    long list must be spread over several 'command-line' log entries.
    """
    recorded_lines = [
        'ChipPy.test_add_commands\tINFO\tcommand-line\tNo arguments given',
        'ChipPy.test_add_commands\tINFO\tcommand-line\tThis is a list of command arguments',
        'ChipPy.test_add_commands\tINFO\tcommand-line\tthat probably do not exist in the real',
        'ChipPy.test_add_commands\tINFO\tcommand-line\tworld',
    ]

    # Logging is switched on only for the duration of this test; the
    # finally clause restores the CRITICAL-level disable even when an
    # assertion fails.
    logging.disable(logging.NOTSET)
    try:
        rr = RunRecord('test_add_commands')
        rr.addCommands([])
        cmd_line = 'This is a list of command arguments that probably ' + \
                   'do not exist in the real world'
        rr.addCommands(cmd_line.split(' '))

        # 'with' guarantees the log file handle is released.
        with open(LOG_FN, 'r') as log_file:
            for n, line in enumerate(log_file):
                line_parts = [lp.strip() for lp in line.split('\t')]
                # The first tab-separated field is not compared.
                assert '\t'.join(line_parts[1:]) == recorded_lines[n]
    finally:
        logging.disable(logging.CRITICAL)
def main():
    """Plot how counts distributions vary with rank.

    One CountsStudy is built per collection file, optionally RPM
    normalised, converted to a ranked score array and passed to make_plot.
    """
    rr = RunRecord('counts_distribution')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Counts Distribution')

    studies = [CountsStudy(collection_fn) for collection_fn in args.collections]

    fig_details = FigureDetails(
        x_size=args.fig_width,
        y_size=args.fig_height,
        title=args.title,
        x_text=args.xlabel,
        y_text=args.ylabel)

    # Normalise every study before any scores are extracted.
    if args.normalise_by_RPM:
        for study in studies:
            study.normaliseByRPM()

    score_groups = [
        study.scoresAsRankedArray(metric=args.counts_region,
                                  log2=args.y_axis_is_log)
        for study in studies
    ]

    make_plot(score_groups, fig_details, args.plot_type, args.plot_filename)

    rr.display()
def main():
    """Add an expression data file to the DB as a new sample.

    The sample name and the reference file must both be new to the DB.
    The file is parsed according to the sample type (absolute expression,
    differential expression or target genes) before being handed to
    add_data.
    """
    rr = RunRecord('add_expression_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Add Expression to DB')
    session = db_query.make_session(args.db_path)

    name = args.name
    description = args.description
    expression_fn = args.expression_data

    # Sample and ReferenceFile must both be unique
    existing_samples = db_query.get_sample_entries(session)
    if name in existing_samples:
        rr.dieOnCritical('Sample name already exists', name)

    existing_reffiles = db_query.get_reffile_entries(
        session, reffile_name=expression_fn)
    if expression_fn in existing_reffiles:
        rr.dieOnCritical('ReferenceFile already loaded', expression_fn)

    requested_type = sample_types[args.sample_type]
    if requested_type == sample_types['abs_expr']:
        expr_table = gene_expr_to_table(
            expression_fn,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=True,
            sep=args.sep)
    elif requested_type == sample_types['diff_expr']:
        # Validation is off here: it breaks with some of Rohan's diff
        # files, which include all probesets but the mean score only once.
        expr_table = gene_expr_diff_to_table(
            expression_fn,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            sig_label=args.significance_heading,
            pval_label=args.p_value_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=False,
            sep=args.sep)
    elif requested_type == sample_types['target_genes']:
        expr_table = LoadTable(expression_fn, sep=args.sep)
    else:
        rr.dieOnCritical('Unknown sample type', args.sample_type)

    success = add_data(
        session, name, description, expression_fn, expr_table,
        sample_type=args.sample_type,
        reffile1=args.reffile1,
        reffile2=args.reffile2)

    rr.addInfo(name + ' added to DB', success)
    rr.display()
def main():
    """Draw two plots of a difference sample against absolute expression.

    Each plot has the score differential on the y-axis and one of the two
    absolute expression samples on its x-axis.
    """
    rr = RunRecord('diff_abs_plots')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(
        window_title='Difference vs Absolute Expression Plots')

    groups_dict = {
        'extremes_colour': args.extremes_colour,
        'signif_colour': args.signif_colour,
        'bulk_colour': args.bulk_colour,
        'hide_extremes': args.hide_extremes,
        'hide_signif': args.hide_signif,
        'hide_bulk': args.hide_bulk,
    }

    # Should do number restrictions in load step
    # Load all genes into RawPlotData objects
    print('Loading data for first plot')
    first_raw = load_sample_genes(args.db_path, args.diff_sample,
                                  args.abs_expr_sample1, args.sample_extremes)

    print('Loading data for second plot')
    second_raw = load_sample_genes(args.db_path, args.diff_sample,
                                   args.abs_expr_sample2, args.sample_extremes)

    print('Building plot points')
    # lists of plot_dot objects with 'x', 'y', 'colour', 'area'
    first_dots = build_plot_points(first_raw, args.ranks, args.num_genes)
    second_dots = build_plot_points(second_raw, args.ranks, args.num_genes)

    plot_dict = {
        'out_name': args.plot1_name,
        'title': args.title,
        'y_text': args.ylabel,
        'y_units': args.yaxis_units,
        'x_text': args.xaxis_text1,
        'x_units': args.xaxis_units,
        'diff_name': first_raw.diff_name,
        'sample_name': first_raw.sample_name,
    }

    print('Generating plot 1')
    make_plot(first_dots, plot_dict, groups_dict)

    # The second plot reuses the same settings apart from these three.
    plot_dict.update(sample_name=second_raw.sample_name,
                     out_name=args.plot2_name,
                     x_text=args.xaxis_text2)

    print('Generating plot 2')
    make_plot(second_dots, plot_dict, groups_dict)

    rr.display()
def main():
    """Drop a sample's records from the DB and log whether it worked."""
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Drop Expression Data')

    session = db_query.make_session(args.db_path)
    dropped = db_query.drop_sample_records(session, args.sample)

    action = 'Removing ' + args.sample
    if dropped:
        rr.addInfo(action, 'Success')
    else:
        rr.addWarning(action, 'Failure')

    rr.display()
def main():
    """Log summary statistics for a whole DB or for one named sample.

    Without a sample name the totals for samples, genes, exons, expression
    entries and reference files are reported. With one, only that sample's
    expression, differential and target gene counts and its reference
    file entries are reported.
    """
    rr = RunRecord('db_summary')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='DB Summary')
    session = make_session(args.db_path)

    sample_name = args.sample or None
    whole_db = sample_name is None

    chroms = get_chroms(session)
    species = get_species(session)

    # Gather all figures first, then report them.
    if whole_db:
        n_samples = get_sample_counts(session)
        all_sample_names = get_all_sample_names(session)
        n_genes = get_gene_counts(session)
        n_exons = get_exon_counts(session)
        n_expr = get_expression_counts(session)
        n_diff = get_diff_counts(session)
        n_target = get_targetgene_counts(session)
        n_reffiles = get_reffile_counts(session)
    else:
        n_expr = get_expression_counts(session, sample_name)
        n_diff = get_diff_counts(session, sample_name)
        n_target = get_targetgene_counts(session, sample_name)
        reffiles_entries = get_reffile_entries(session,
                                               sample_name=sample_name)

    rr.addInfo('ChipPy DB name', args.db_path)
    rr.addInfo('Species name', species)
    rr.addInfo('Chroms list', chroms)

    if whole_db:
        rr.addInfo('Total # of sample entries', n_samples)
        rr.addInfo('Sample names', all_sample_names)
        rr.addInfo('Total # of gene entries', n_genes)
        rr.addInfo('Total # of exon entries', n_exons)

    rr.addInfo('Total # of absolute-scored gene entries', n_expr)
    rr.addInfo('Total # of differential gene entries', n_diff)
    rr.addInfo('Total # of target gene entries', n_target)

    if whole_db:
        rr.addInfo('Total # of reference files', n_reffiles)
    elif len(reffiles_entries) == 0:
        rr.addError('Reference file name', 'Not Available')
    else:
        rr.addInfo('Reference file name', reffiles_entries)

    rr.display()
def main():
    """Create a new ChipPy DB populated with Ensembl gene data.

    The DB file name is built from the save prefix, Ensembl release and
    species. An existing file of that name is never overwritten; an error
    is logged instead.
    """
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse()

    save_dir = args.save_db_dir
    create_path(save_dir)
    if not os.path.isdir(save_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    name_parts = [args.save_db_prefix, 'chippy',
                  str(args.ensembl_release), args.species]
    db_path = os.path.join(save_dir, '_'.join(name_parts) + '.db')

    if os.path.exists(db_path):
        rr.addError('Chippy DB with this name already exists', db_path)
    else:
        session = make_session(db_path)
        account = HostAccount(args.hostname, args.username, args.password,
                              port=args.port)
        add_ensembl_gene_data(session, args.species,
                              ensembl_release=args.ensembl_release,
                              account=account)

        if create_dummy_expr(session):
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                        'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        print(os.path.realpath(db_path))

    if args.show_log:
        rr.display()
def main():
    """Plot how expression distributions vary with rank.

    One ExprStudy is built per absolute expression sample; each is turned
    into a ranked score array and the set is passed to make_plot.
    """
    rr = RunRecord('expr_distribution')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Expression Distribution')

    studies = [ExprStudy(sample, args.db_path)
               for sample in args.abs_expr_samples]

    fig_details = FigureDetails(
        x_size=args.fig_width,
        y_size=args.fig_height,
        title=args.title,
        x_text=args.xlabel,
        y_text=args.ylabel)

    score_groups = [study.scoresAsRankedArray(log2=args.y_axis_is_log)
                    for study in studies]

    make_plot(score_groups, fig_details, args.plot_type, args.plot_filename)

    rr.display()
def main():
    """Plot counts against expression for one sample and one collection.

    Either quantity can be shown as raw values or as ranks, and either
    axis can be labelled as log base 2.
    """
    rr = RunRecord('counts_vs_expr')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(
        use_scrollbars=True,
        use_save_load_button=True,
        window_title='Counts vs Expression Plots')

    n_collections = len(args.collections)
    if n_collections > 1:
        rr.dieOnCritical('Only 1 collection allowed. You chose',
                         n_collections)

    # Load all required data
    print('Loading expression and counts data')
    sample = args.abs_expr_sample
    collection = args.collections[0]
    matched_studies = MatchedStudy(
        sample, collection, args.db_path, args.region_feature,
        include_target=args.include_targets,
        exclude_target=args.exclude_targets)

    print('Creating plot points')
    plot_points = matched_studies.get_matched_genes_as_xy_plotpoints(
        args.x_axis_type, args.expr_is_ranks, args.counts_is_ranks)

    fig = FigureDetails(x_size=args.fig_width, y_size=args.fig_height,
                        title=sample + ' vs ' + collection)

    # Build both labels first, then assign them to the right axes.
    expr_label = 'Expression'
    if args.expr_is_ranks:
        expr_label += ' Ranks'
    counts_label = 'Counts'
    if args.counts_is_ranks:
        counts_label += ' Ranks'

    if args.x_axis_type.lower() == 'expression':
        x_label, y_label = expr_label, counts_label
    else:
        x_label, y_label = counts_label, expr_label

    log_suffix = ' (log base 2)'
    if args.x_axis_is_log:
        x_label += log_suffix
    if args.y_axis_is_log:
        y_label += log_suffix

    fig.x_text = x_label
    fig.y_text = y_label

    make_plot(plot_points,
              plot_fn=args.plot_filename,
              fig_details=fig,
              x_axis_is_log=args.x_axis_is_log,
              y_axis_is_log=args.y_axis_is_log,
              x_axis_type=args.x_axis_type,
              counts_is_ranks=args.counts_is_ranks,
              expr_is_ranks=args.expr_is_ranks)

    rr.display()
def main():
    """Validate the export options and pass them to get_collection.

    The original description of the output is: a pickle spanning
    window_start to window_finish containing chromatin mapping averages
    per base, one per gene, ranked by expression. The export itself is
    done by get_collection; this function only checks the command-line
    values and manages the DB session.

    Invalid multitest_signif_val or non-positive window sizes are
    reported through rr.dieOnCritical.
    """
    rr = RunRecord('export_counts')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Export Counts')

    session = db_query.make_session(args.db_path)
    # try/finally so the session is closed even when validation or the
    # export itself fails.
    try:
        sample_name = args.expr_sample
        print('Loading counts data for %s' % (sample_name,))

        include_name = None
        exclude_name = None
        if args.include_targets:
            include_name = args.include_targets
            rr.addInfo('include gene targets', include_name)
        if args.exclude_targets:
            exclude_name = args.exclude_targets
            rr.addInfo('exclude gene targets', exclude_name)

        if (args.multitest_signif_val is not None) and not \
                (-1 <= args.multitest_signif_val <= 1):
            rr.dieOnCritical('Multitest_signif_val should be -1, 0, 1',
                             args.multitest_signif_val)

        if args.chr_prefix != '':
            # If it writes nothing then cogent.Table fails because it's
            # fragile
            rr.addInfo('BAM/BED chromosome prefix given', args.chr_prefix)

        # Explicit checks rather than assert: assert statements are
        # removed when Python runs with -O, which would silently skip
        # the validation.
        window_upstream = args.window_upstream
        if not window_upstream > 0:
            rr.dieOnCritical('upstream window must be of at least size 1 bp',
                             window_upstream)
        window_downstream = args.window_downstream
        if not window_downstream > 0:
            rr.dieOnCritical(
                'downstream window must be of at least size 1 bp',
                window_downstream)

        get_collection(session, sample_name, args.feature_type,
                       args.BAMorBED, args.chr_prefix, window_upstream,
                       window_downstream, args.multitest_signif_val,
                       args.collection, args.overwrite, args.tab_delimited,
                       include_name, exclude_name,
                       bedgraph=args.make_bedgraph,
                       BED_windows=args.BED_windows,
                       chrom_size=args.max_chrom_size,
                       no_overlap=args.no_overlap)
    finally:
        session.close()

    rr.display()
def makeCommands(self):
    """Build the command-line argument list from the form's current state.

    Collects, in order: every required row that has an input widget,
    every optional row whose include checkbox is checked, and the default
    of every non-displayed argument that has one. Arguments whose name
    starts with '-' go into the non-positional list (name followed by its
    value or values); all others contribute only their values to the
    positional list.

    Returns the non-positional parts followed by the positional parts.
    The same combined list is logged through RunRecord.addCommands.
    """
    non_positional_parts = []
    positional_parts = []

    def add_argument(name, value):
        """Append one argument's name/value to the matching list."""
        if name.startswith('-'):
            non_positional_parts.append(name)
            target = non_positional_parts
        else:
            # positional args have no name
            target = positional_parts
        if value is None:
            return
        if isinstance(value, list):
            target.extend(value)
        else:
            target.append(value)

    required = self.requiredLayout
    for row in range(required.rowCount()):
        # rows without an item in column 2 carry no input
        if required.itemAtPosition(row, 2) is None:
            continue
        name, value = self._checkNameValue(
            required.itemAtPosition(row, 1),
            required.itemAtPosition(row, 2))
        add_argument(name, value)

    optional = self.optionalLayout
    for row in range(optional.rowCount()):
        include_item = optional.itemAtPosition(row, 0)
        if include_item is None:
            continue
        # column 0 holds the checkbox that opts the argument in
        if include_item.widget().checkState():
            name, value = self._checkNameValue(
                optional.itemAtPosition(row, 1),
                optional.itemAtPosition(row, 2))
            add_argument(name, value)

    # deal with non-displayed args
    for arg in self.argobs:
        if arg.display or arg.default is None:
            continue
        add_argument(arg.long_form, str(arg.default))

    # join space separated components of strings with quotes
    # NOTE(review): only non-positional parts are quoted; positional
    # values containing spaces are left as-is - confirm this is intended.
    for i, part in enumerate(non_positional_parts):
        if isinstance(part, str) and ' ' in part:
            non_positional_parts[i] = "'" + part + "'"

    all_parts = non_positional_parts + positional_parts

    # Log the whole command once. Logging the two lists separately made
    # addCommands record 'No arguments given' whenever one of them was
    # empty, even though arguments were present in the other.
    rr = RunRecord()
    rr.addCommands(all_parts)

    return all_parts
def main(ui=None):
    """Convert a WIG file into a per-gene expression (.exp) file.

    1) Get all gene entries from the DB.
    2) Read the WIG file one chromosome at a time into an array and, each
       time the chromosome changes, hand that array to
       get_gene_scores_from_chrom to update the gene scores.
    3) Write out gene ids and scores as a tab-delimited file.

    ui is optional; when given, its display(msg=..., progress=...) method
    is called every 100 lines to report progress.
    """
    rr = RunRecord('expr_wig_to_exp')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Expression WIG to EXP')

    chrom_size = args.max_chrom_size
    prefix = args.chr_prefix

    session = db_query.make_session(args.db_path)
    genes = db_query.get_gene_entries(session)

    all_genes = {}  # genes indexed by ensembl_id
    genes_by_chrom = {}  # chrom: list(gene_id)
    genes_scores = {}  # each gene has an expression score
    for gene in genes:
        genes_by_chrom.setdefault(gene.chrom, []).append(gene.ensembl_id)
        genes_scores[gene.ensembl_id] = 0
        all_genes[gene.ensembl_id] = gene

    wig_fn = args.wig
    is_gzipped = wig_fn.endswith('.gz')
    if is_gzipped:
        wig_file = gzip.GzipFile(wig_fn, 'rb')
    else:
        try:
            wig_file = open(wig_fn, 'r')
        except IOError:
            rr.dieOnCritical('Could not open file', wig_fn)

    try:
        # Total line count is only used to pace the progress display. It
        # stays None when it cannot be determined (gzipped input, or wc
        # failed), in which case progress is reported without a total.
        total_lines = None
        if not is_gzipped:
            returncode, stdout, stderr = run_command('wc -l ' + wig_fn)
            if returncode:
                rr.addWarning('could not run wc to count WIG lines', 'error')
            else:
                total_lines = int(stdout.strip().split(' ')[0])
                rr.addInfo('total lines in ' + wig_fn, total_lines)

        # Read each piece of the file into an artificial chromosome
        # (Numpy array) and slice out the gene regions that we have for
        # each gene in that chrom
        chrom_array = numpy.zeros(chrom_size, dtype=numpy.float32)
        current_chrom = None
        step_type = None
        pos = 0
        step = 1

        for i, line in enumerate(wig_file):
            if ui is not None and i % 100 == 0:
                if total_lines:
                    msg = 'Reading wiggle entries [' + str(i) + \
                          ' / ' + str(total_lines) + ']'
                    progress = min(1.0, float(i) / float(total_lines))
                else:
                    msg = 'Reading wiggle entries [' + str(i) + ']'
                    progress = 0.0
                ui.display(msg=msg, progress=progress)

            if not line.strip() or \
                    line.startswith(('track', 'browser', '#')):
                continue

            if line.startswith('fixed') or line.startswith('variable'):
                # e.g. fixedStep chrom=chr10 start=56001 step=20 span=20
                # span is not read: each value is stored at its start
                # position only.
                fields = line.split()
                chrom = _wig_chrom(fields, prefix)

                if current_chrom is None:
                    current_chrom = chrom
                elif current_chrom != chrom:
                    # Empty chrom_array into genes. The array holds the
                    # data of the chromosome just finished, so score it
                    # against current_chrom, not the new one.
                    get_gene_scores_from_chrom(chrom_array, current_chrom,
                                               all_genes, genes_by_chrom,
                                               genes_scores)
                    current_chrom = chrom
                    chrom_array[:] = 0

                if line.startswith('fixed'):
                    step_type = 'fixed'
                    pos = int(_wig_field(fields, 'start'))
                    step = int(_wig_field(fields, 'step'))
                else:
                    step_type = 'variable'
            elif step_type is None:
                rr.dieOnCritical('WIG data found before a fixedStep or '
                                 'variableStep declaration', line.strip())
            elif step_type == 'fixed':
                chrom_array[pos] = float(line.strip())
                pos += step
            else:  # step_type == 'variable'
                line_parts = line.split()
                chrom_array[int(line_parts[0])] = float(line_parts[1])
    finally:
        wig_file.close()

    # empty chrom_array into genes_score from the final section
    if current_chrom is not None:
        get_gene_scores_from_chrom(chrom_array, current_chrom, all_genes,
                                   genes_by_chrom, genes_scores)

    # output genes and scores
    if args.exp:
        out_fn = args.exp
    else:
        if '.gz' in wig_fn:
            wig_fn = '.'.join(wig_fn.split('.')[:-1])
        out_fn = '.'.join(wig_fn.split('.')[:-1])  # cut off wig extension
        out_fn += '.exp'  # add .exp extension

    with open(out_fn, 'w') as out:
        out.write('gene\texp\n')  # header
        for gene_id, score in genes_scores.items():
            out.write(gene_id + '\t' + str(score) + '\n')


def _wig_field(fields, key):
    """Return the value of the first 'key=value' token in fields."""
    token_start = key + '='
    for token in fields:
        if token.startswith(token_start):
            return token[len(token_start):]
    raise ValueError("WIG declaration line has no '%s' field" % key)


def _wig_chrom(fields, prefix):
    """Return the chromosome name from a WIG declaration line's tokens."""
    token = [val for val in fields if val.startswith('chrom')][0]
    # NOTE(review): lstrip removes any leading *characters* found in
    # 'chrom=' + prefix, not that literal text, so e.g. a 'chr' prefix is
    # dropped even when prefix is empty. This leading behaviour is kept
    # from the original code; only the accidental trimming of the same
    # characters from the END of the name has been removed. Confirm the
    # intended prefix handling.
    chrom = token.lstrip('chrom=' + prefix)
    if chrom == 'M':
        chrom = 'MT'
    return chrom
def main():
    """Plot counts around genes for one or more collections.

    1) Set counts_func
    2) Load studies
    3) Load divisor study if provided
    4) Normalise studies if required
    5) Set genes_of_interest
    6) Filter studies by genes_of_interest and statistical cutoff
    7) Create plotlines from studies
    8) Smooth or bin plotlines as required
    9) Do plot division (if required)
    10) Set basic plotting info
    11) Set lines colors as needed
    12) Create Plot
    13) Save Plot
    14) Save genes in plot to file in rank order (optional)
    """
    rr = RunRecord('plot_counts')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Plot Counts')

    # 1: Set feature counting metric
    counts_func = set_counts_function(args.counts_metric)

    # 2: Load studies
    print('Loading counts data')
    studies, window_upstream, window_downstream = \
        load_studies(args.collections, counts_func)

    # 3: Load divisor study if provided
    if args.div is not None:
        div_studies, div_window_upstream, div_window_downstream = \
            load_studies([args.div], counts_func)
        if div_window_upstream == window_upstream and \
                div_window_downstream == window_downstream:
            print('Windows match - using div study')
            studies.append(div_studies[0])
            # alter name so we divide by the same study
            div_studies[0].collection_label += '_div'
            div_name = div_studies[0].collection_label
        else:
            rr.dieOnCritical('Differing Data and Div up/down-stream ' +
                             'window sizes',
                             [div_window_upstream, div_window_downstream,
                              window_upstream, window_downstream])
    else:
        div_name = None

    # 4: RPM Normalise counts by default
    if not args.no_normalise and args.counts_metric == 'mean':
        print('Normalising by counts RPM')
        for study in studies:
            study.normaliseByRPM()

    # 5: Specify genes of interest to direct study
    for study in studies:
        study.filterByGenes(args.db_path, chrom=args.chrom,
                            include_samples=args.include_targets,
                            exclude_samples=args.exclude_targets)

    # 6: Filter all genes in studies by statistical cutoff
    if args.data_cutoff > 0.0:
        for study in studies:
            study.filterByCutoff(args.data_cutoff)

    # 7: Create plot lines for each study in studies
    try:
        group_size = int(args.group_size)
    except ValueError:
        # a non-numeric group size means one group holding everything
        group_size = 'All'

    plot_lines = []
    for study in studies:
        plot_lines.extend(study.asPlotLines(group_size, args.group_location,
                                            p=args.line_cutoff))

    rr.addInfo('Total number of lines from all studies', len(plot_lines))

    # 8: smooth and/or bin plot lines as required
    if args.binning and args.binning > 0:
        for line in plot_lines:
            line.applyBinning(args.binning)
        rr.addInfo('lines binned to width', args.binning)

    if args.smoothing and args.smoothing > 0:
        for line in plot_lines:
            line.applySmoothing(args.smoothing)
        rr.addInfo('lines smoothed to width', args.smoothing)

    # 9: Do plot division if required
    if div_name:
        plot_lines = div_plots(plot_lines, div_name, div_by=args.div_by)

    rr.addInfo('Total number of lines to plot', len(plot_lines))

    # 10: set basic plotting info
    ylim = None
    if args.ylim is not None:
        if ',' not in args.ylim:
            # report the offending value, not the still-unset ylim
            rr.dieOnCritical('ylim must be comma separated', args.ylim)
        ylim = [float(val) for val in args.ylim.strip().split(',')]

    # if we have a plot series, create a directory to write plots
    if args.plot_series and not args.test_run:
        set_up_series_plots_dir(args.plot_filename)
        filename_series = []
    else:
        filename_series = None

    print('Prepping for plot')

    vline = dict(x=0, linewidth=args.vline_width,
                 linestyle=args.vline_style, color='w')

    plot = PlottableGroups(
        height=args.fig_height / 2.5,
        width=args.fig_width / 2.5,
        bgcolor=args.bgcolor,
        grid_off=args.grid_off,
        yaxis_lims=ylim,
        xaxis_lims=(-window_upstream, window_downstream),
        xy_tick_spaces=(args.xgrid_lines, args.ygrid_lines),
        xy_tick_intervals=(args.xtick_interval, args.ytick_interval),
        offset_ticks=args.offset_ticks,
        linewidth=args.line_width,
        title_size=args.title_size,
        font=args.font,
        xy_label_fontsizes=(args.xfont_size, args.yfont_size),
        vline=vline,
        ioff=True,
        colorbar=args.colorbar,
        clean=args.clean_plot)

    x = numpy.arange(-window_upstream, window_downstream)

    # 11: set line colors
    plot_lines = set_plot_colors(plot_lines, studies, div_name,
                                 args.bgcolor, args.grey_scale,
                                 restrict_colors=args.restrict_colors)

    # 12: Create plot
    plot(x, plot_lines=plot_lines, filename_series=filename_series,
         xlabel=args.xlabel, ylabel=args.ylabel, title=args.title,
         colorbar=args.colorbar, labels_size=args.legend_font_size,
         show_legend=args.legend, plot_CI=args.confidence_intervals)

    # 13: Save plots
    # if series, create directory
    # NOTE(review): this repeats the call made in step 10; kept because
    # its side effects are not visible here - confirm it is needed.
    if args.plot_series and not args.test_run:
        set_up_series_plots_dir(args.plot_filename)

    if args.plot_filename and not args.test_run:
        plot_fn = args.plot_filename
        plot_fn_lower = plot_fn.lower()
        if '.pdf' in plot_fn_lower:
            plot.savefig(plot_fn, image_format='pdf')
        elif '.png' in plot_fn_lower:
            plot.savefig(plot_fn, image_format='png')
        elif '.jpg' in plot_fn_lower or '.jpeg' in plot_fn_lower:
            plot.savefig(plot_fn, image_format='jpg')
        else:
            # unrecognised or missing extension: default to PDF
            plot.savefig(plot_fn + '.pdf', image_format='pdf')
    else:
        plot.show()

    # 14: Save ENSEMBL gene ids by rank if requested
    if args.write_genes_by_rank:
        fn_parts = args.write_genes_by_rank.split('.')
        if len(fn_parts) > 1:
            fn_base = '.'.join(fn_parts[:-1])
            f_ext = fn_parts[-1]
        else:
            # No extension given: keep the whole name as the base.
            # Joining fn_parts[:-1] here would give an empty base name.
            fn_base = fn_parts[0]
            f_ext = 'txt'

        plot_lines.sort(key=lambda plot_line: plot_line.rank)

        # NOTE(review): every study's file receives the labels of ALL
        # plot lines, not only that study's - confirm this is intended.
        for study in studies:
            fn = fn_base + '_' + \
                 study.collection_label.replace(' ', '_') + '.' + f_ext
            with open(fn, 'w') as out:
                out.write('gene' + '\n')
                for line in plot_lines:
                    for label in line.getLabelsAsList():
                        out.write(label + '\n')

    rr.display()