def load_expr(self, expr_study, db_path, include_targets=None,
        exclude_targets=None):
    """ Load ranked expression records for one sample from a ChippyDB.

    The text before the first ' : ' in expr_study is used as the sample
    name. Protein coding genes for that sample are queried ranked by
    mean score, optionally restricted by include_targets /
    exclude_targets, and stored in self.expr_genes as ExprGene records.
    """
    rr = RunRecord('load_expr')

    sample_name = expr_study.split(' : ')[0]
    session = db_query.make_session(db_path)

    # Reset first so a failed query never leaves stale records behind
    self.expr_genes = []

    # Single pre-formatted argument prints the same on Python 2 and 3
    print('Querying sample from ChippyDB %s' % (sample_name,))

    try:
        sample_genes = db_query.get_genes_by_ranked_expr(
                session, sample_name, biotype='protein_coding',
                data_path=None, rank_by='mean',
                include_targets=include_targets,
                exclude_targets=exclude_targets)

        # Copy the needed values out while the session is still open
        self.expr_genes = [
                ExprGene(gene.MeanScore, gene.Rank, gene.ensembl_id,
                        sample_name)
                for gene in sample_genes]
    finally:
        # The session was previously left open; release it on every path
        session.close()

    rr.addInfo('genes found in ' + sample_name, len(sample_genes))
def main():
    """ Load one sample's data file into the DB once it is known to be new """
    rr = RunRecord('add_expression_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Add Expression to DB')
    session = db_query.make_session(args.db_path)

    name = args.name
    description = args.description
    ref_file = args.expression_data
    sample_type = args.sample_type

    # Neither the sample name nor the reference file may already be loaded
    existing_samples = db_query.get_sample_entries(session)
    if name in existing_samples:
        rr.dieOnCritical('Sample name already exists', name)

    existing_reffiles = db_query.get_reffile_entries(session,
            reffile_name=ref_file)
    if ref_file in existing_reffiles:
        rr.dieOnCritical('ReferenceFile already loaded', ref_file)

    requested = sample_types[sample_type]

    if requested == sample_types['abs_expr']:
        expr_table = gene_expr_to_table(
                args.expression_data,
                stable_id_label=args.gene_id_heading,
                probeset_label=args.probeset_heading,
                exp_label=args.expression_heading,
                allow_probeset_many_gene=args.allow_probeset_many_gene,
                validate=True,
                sep=args.sep)
    elif requested == sample_types['diff_expr']:
        # Validation is switched off here: some of Rohan's diff files list
        # every probeset but carry the mean score only once.
        expr_table = gene_expr_diff_to_table(
                args.expression_data,
                stable_id_label=args.gene_id_heading,
                probeset_label=args.probeset_heading,
                exp_label=args.expression_heading,
                sig_label=args.significance_heading,
                pval_label=args.p_value_heading,
                allow_probeset_many_gene=args.allow_probeset_many_gene,
                validate=False,
                sep=args.sep)
    elif requested == sample_types['target_genes']:
        expr_table = LoadTable(args.expression_data, sep=args.sep)
    else:
        rr.dieOnCritical('Unknown sample type', args.sample_type)

    success = add_data(session, name, description, args.expression_data,
            expr_table, sample_type=args.sample_type,
            reffile1=args.reffile1, reffile2=args.reffile2)

    rr.addInfo(name + ' added to DB', success)
    rr.display()
def _create_session():
    """ Open a session on the sqlite DB named by the CHIPPY_DB variable """
    db_path = os.environ.get('CHIPPY_DB')
    if db_path is None:
        raise RuntimeError(
                'You need to set an environment variable '
                'CHIPPY_DB that indicates where to find the database')
    return db_query.make_session('sqlite:///%s' % db_path)
def start_chippy_db(self):
    """ Run start_chippy_db.py and adopt the DB path it writes to stdout """
    command = self._make_cmd_str('start_chippy_db.py', include_db=False)
    returncode, stdout, stderr = run_command(command)

    # Nothing to adopt unless the script succeeded and reported a usable DB
    if returncode != 0:
        return
    if not self.check_valid_db(stdout):
        return

    self.current_db = stdout

    # Confirm the DB can be read by loading its details into the view
    session = db_query.make_session(self.current_db)
    self.populateDBInfo(session)
    session.close()

    self.switch_menu_actions(True)
def check_valid_db(self, db_path):
    """ True when db_path is non-empty and a species can be read from it """
    if db_path is None or db_path == '':
        return False

    # A DB counts as valid when the species query yields something
    session = db_query.make_session(db_path)
    species = db_query.get_species(session)
    session.close()
    return species is not None
def main():
    """ Drop the records of one sample from the DB and log the outcome """
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Drop Expression Data')
    session = db_query.make_session(args.db_path)

    dropped = db_query.drop_sample_records(session, args.sample)
    label = 'Removing ' + args.sample
    if not dropped:
        rr.addWarning(label, 'Failure')
    else:
        rr.addInfo(label, 'Success')

    rr.display()
def open_chippy_db(self):
    """ Let the user pick a DB file and load its details into the view """
    rr = RunRecord('open_chippy_db')
    db_path = str(QFileDialog.getOpenFileName())

    if self.check_valid_db(db_path):
        self.current_db = os.path.realpath(db_path)

        session = db_query.make_session(self.current_db)
        self.populateDBInfo(session)
        self.populateDBTable(session)
        rr.addInfo('DB opened successfully', db_path)
        self.populateLogTable()
        session.close()

        self.switch_menu_actions(True)
    else:
        # Leave the current DB untouched and just surface the warning
        rr.addWarning('DB has invalid format', db_path)
        self.populateLogTable()
def main():
    """ Log summary counts for the whole DB, or for one named sample """
    rr = RunRecord('db_summary')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='DB Summary')
    session = make_session(args.db_path)

    sample_name = args.sample or None
    whole_db = sample_name is None

    chroms = get_chroms(session)
    species = get_species(session)

    # Run every query before anything is reported
    if whole_db:
        total_samples_count = get_sample_counts(session)
        sample_names = get_all_sample_names(session)
        total_genes_count = get_gene_counts(session)
        total_exon_count = get_exon_counts(session)
        total_expr_count = get_expression_counts(session)
        total_diff_genes_count = get_diff_counts(session)
        total_target_genes_count = get_targetgene_counts(session)
        total_reffiles_count = get_reffile_counts(session)
    else:
        total_expr_count = get_expression_counts(session, sample_name)
        total_diff_genes_count = get_diff_counts(session, sample_name)
        total_target_genes_count = get_targetgene_counts(session,
                sample_name)
        reffiles_entries = get_reffile_entries(session,
                sample_name=sample_name)

    # Assemble the report lines in the order they are shown
    report = [
        ('ChipPy DB name', args.db_path),
        ('Species name', species),
        ('Chroms list', chroms),
    ]
    if whole_db:
        report.extend([
            ('Total # of sample entries', total_samples_count),
            ('Sample names', sample_names),
            ('Total # of gene entries', total_genes_count),
            ('Total # of exon entries', total_exon_count),
        ])
    report.extend([
        ('Total # of absolute-scored gene entries', total_expr_count),
        ('Total # of differential gene entries', total_diff_genes_count),
        ('Total # of target gene entries', total_target_genes_count),
    ])
    if whole_db:
        report.append(('Total # of reference files', total_reffiles_count))

    for label, value in report:
        rr.addInfo(label, value)

    if not whole_db:
        if len(reffiles_entries) == 0:
            rr.addError('Reference file name', 'Not Available')
        else:
            rr.addInfo('Reference file name', reffiles_entries)

    rr.display()
def populateDBTable(self, session=None):
    """ Fill self.db_table with one row per sample held in the DB.

    session: an open DB session, or None to open one on self.current_db
        (nothing is done when that DB fails check_valid_db).

    Each row shows the sample name, description, type, number of genes
    and a comma separated list of its reference file names. The session
    is closed before the table is filled, including a session supplied
    by the caller.
    """
    if session is None:
        if not self.check_valid_db(self.current_db):
            return
        session = db_query.make_session(self.current_db)

    names_descriptions = db_query.get_sample_names_descriptions(session)

    # These sets do not depend on the sample being examined, so they are
    # fetched once rather than once per sample.
    expr_names = set(db_query.get_expr_sample_names(session))
    diff_names = set(db_query.get_diff_sample_names(session))
    target_names = set(db_query.get_target_gene_names(session))

    rows = []
    for name, description in names_descriptions.items():
        if name in expr_names:
            sample_type = 'Expression'
            num_genes = db_query.get_expression_counts(session,
                    sample_name=name)
        elif name in diff_names:
            sample_type = 'Differential'
            num_genes = db_query.get_diff_counts(session,
                    sample_name=name)
        elif name in target_names:
            sample_type = 'Target Genes'
            num_genes = db_query.get_targetgene_counts(session,
                    sample_name=name)
        else:
            # Previously nothing was recorded for such a sample, which
            # shifted the type/count columns of every following row.
            sample_type = 'Unknown'
            num_genes = 0

        # get_reffile_entries returns reffile objects
        reffiles = db_query.get_reffile_entries(session, sample_name=name)
        file_names = ', '.join([r.name for r in reffiles])

        rows.append((name, description, sample_type, num_genes, file_names))

    session.close()

    # Clear whatever was shown before, then size the table for the new rows
    self.db_table.setRowCount(0)
    self.db_table.setRowCount(len(rows))
    for row, (n, d, t, g, f) in enumerate(rows):
        self.db_table.setItem(row, 0, QTableWidgetItem(QString(n)))
        self.db_table.setItem(row, 1, QTableWidgetItem(QString(d)))
        self.db_table.setItem(row, 2, QTableWidgetItem(QString(t)))
        self.db_table.setItem(row, 3, QTableWidgetItem(QString(str(g))))
        self.db_table.setItem(row, 4, QTableWidgetItem(QString(f)))
def main():
    """ Create a new ChipPy DB file and fill it with Ensembl gene data """
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse()

    create_path(args.save_db_dir)
    if not os.path.isdir(args.save_db_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    release = args.ensembl_release
    species = args.species
    chippy_db_name = ''.join([args.save_db_prefix, '_chippy_',
            str(release), '_', species, '.db'])
    db_path = os.path.join(args.save_db_dir, chippy_db_name)

    if os.path.exists(db_path):
        rr.addError('Chippy DB with this name already exists', db_path)
    else:
        session = make_session(db_path)
        account = HostAccount(args.hostname, args.username, args.password,
                port=args.port)
        add_ensembl_gene_data(session, species, ensembl_release=release,
                account=account)

        if create_dummy_expr(session):
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                    'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        # The resolved path is written to stdout
        print(os.path.realpath(db_path))

    if args.show_log:
        rr.display()
def filterByGenes(self, db_path, chrom=None, include_samples=None,
        exclude_samples=None):
    """ Keep only the results whose genes match the given selection.

    db_path: DB to query for the matching gene ids.
    chrom: restrict to genes on this chromosome when given.
    include_samples / exclude_samples: passed to get_gene_ids as
        include_targets / exclude_targets.

    Does nothing when no restriction is given. Replaces
    self.data_collection with the filtered collection and calls
    rr.dieOnCritical when nothing remains.
    """
    rr = RunRecord('filterByGenes')

    if not include_samples and not exclude_samples and not chrom:
        return

    rr.addInfo('Starting no. of genes', self.data_collection.N)

    for sample in include_samples or []:
        rr.addInfo('Restricting plot by include sample', sample)
    for sample in exclude_samples or []:
        rr.addInfo('Restricting plot by exclude sample', sample)
    if chrom is not None:
        rr.addInfo('Restricting plot to chromosome', chrom)

    session = make_session(db_path)
    try:
        filter_gene_ids = get_gene_ids(session, chrom=chrom,
                include_targets=include_samples,
                exclude_targets=exclude_samples)
        self.data_collection =\
                self.data_collection.filteredByLabel(filter_gene_ids)
    finally:
        # The session was previously left open; release it on every path
        session.close()

    # Only touch the collection after the None test: reading .N first
    # made the None guard below unreachable.
    if self.data_collection is not None:
        rr.addInfo('Remaining genes', self.data_collection.N)

    if self.data_collection is None or\
            len(self.data_collection.ranks) == 0:
        rr.dieOnCritical('Genes remaining after filtering', '0')
def main():
    """
        Check the command line options, then hand them to get_collection.

        Original description: returns a pickle of size window_start to
        window_finish containing chromatin mapping averages per base,
        one per gene, ranked by expression.
    """
    rr = RunRecord('export_counts')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Export Counts')

    session = db_query.make_session(args.db_path)

    sample_name = args.expr_sample
    # Single pre-formatted argument prints the same on Python 2 and 3
    print('Loading counts data for %s' % (sample_name,))

    include_name = args.include_targets or None
    if include_name:
        rr.addInfo('include gene targets', include_name)

    exclude_name = args.exclude_targets or None
    if exclude_name:
        rr.addInfo('exclude gene targets', exclude_name)

    signif = args.multitest_signif_val
    if (signif is not None) and not (-1 <= signif <= 1):
        rr.dieOnCritical('Multitest_signif_val should be -1, 0, 1', signif)

    if args.chr_prefix != '':
        # If it writes nothing then cogent.Table fails because it's fragile
        rr.addInfo('BAM/BED chromosome prefix given', args.chr_prefix)

    window_upstream = args.window_upstream
    assert window_upstream > 0, \
            'upstream window must be of at least size 1 bp'
    window_downstream = args.window_downstream
    assert window_downstream > 0, \
            'downstream window must be of at least size 1 bp'

    get_collection(session, sample_name, args.feature_type, args.BAMorBED,
            args.chr_prefix, window_upstream, window_downstream,
            args.multitest_signif_val, args.collection, args.overwrite,
            args.tab_delimited, include_name, exclude_name,
            bedgraph=args.make_bedgraph, BED_windows=args.BED_windows,
            chrom_size=args.max_chrom_size, no_overlap=args.no_overlap)

    session.close()
    rr.display()
return calc_stat def summed(data): freqs = data.asfreqs() c, r = freqs.transformed(counts_func=column_sum) return c def averaged(data): c, r = data.transformed(counts_func=column_mean) return c session = db_query.make_session('sqlite:///%s' % db_path) samples = db_query.get_target_sample(session) script_info = {} script_info['title'] = 'Compare read counts between histone variants' script_info['script_description'] = "Takes read counts that are centred on"\ " on gene TSS, that exist in two separate mapped read samples." script_info['version'] = __version__ script_info['authors'] = __author__ script_info['output_description'] = "Generates a single pdf figure." # alternate option organisation # essential source files opt_collection1 = make_option('-1', '--collection1', help='path to the plottable data from sample 1'\
def load_sample_genes(db_path, diff_sample, sample, sample_extremes):
    """ Collect the gene groups of a difference sample and an absolute
        sample into a RawPlotData instance with attributes:
        diff_sig_plus1, diff_sig_zero, diff_sig_minus1,
        sample_top, sample_mid, sample_bot
    """
    rr = RunRecord('load_sample_genes')

    if sample_extremes > 0.5:
        rr.addWarning(
                'sample_extremes option must be less than or equal to 0.5',
                sample_extremes)
        sample_extremes = 0.05
        rr.addInfo('setting extremes to default', sample_extremes)

    raw_plot_data = RawPlotData(diff_sample, sample)

    def _query(getter, *params, **options):
        """ Run one query in a session of its own """
        session = make_session(db_path)
        found = getter(session, *params, **options)
        session.close()
        return found

    ranked_options = dict(biotype='protein_coding', data_path=None,
            rank_by='mean')

    # significantly up-regulated (1), down-regulated (-1), then neither (0)
    raw_plot_data.diff_sig_plus1 = _query(get_genes_by_ranked_diff,
            diff_sample, 1, **ranked_options)
    raw_plot_data.diff_sig_minus1 = _query(get_genes_by_ranked_diff,
            diff_sample, -1, **ranked_options)
    raw_plot_data.diff_sig_zero = _query(get_genes_by_ranked_diff,
            diff_sample, 0)

    # absolute expression genes, highest mean score first
    sample_genes = _query(get_genes_by_ranked_expr, sample)
    sample_genes.sort(key=lambda x: x.MeanScore, reverse=True)

    total = len(sample_genes)
    sample_cutoff = int(total * sample_extremes)
    rr.addInfo('sample cutoff set', sample_cutoff)

    # split into top extreme, bulk and bottom extreme
    raw_plot_data.sample_mid = sample_genes[sample_cutoff:total-sample_cutoff]
    raw_plot_data.sample_top = sample_genes[:sample_cutoff]
    if sample_cutoff:
        raw_plot_data.sample_bot = sample_genes[-sample_cutoff:]
    else:
        # a [-0:] slice would return every gene rather than none
        raw_plot_data.sample_bot = []

    # Report diff counts
    rr.addInfo('Difference sample name', raw_plot_data.diff_name)
    rr.addInfo('diff genes for signif 1', len(raw_plot_data.diff_sig_plus1))
    rr.addInfo('diff genes for signif 0', len(raw_plot_data.diff_sig_zero))
    rr.addInfo('diff genes for signif -1', len(raw_plot_data.diff_sig_minus1))

    # Report sample counts
    rr.addInfo('Absolute sample name', raw_plot_data.sample_name)
    rr.addInfo('top extreme genes for sample', len(raw_plot_data.sample_top))
    rr.addInfo('bulk, non-extreme genes for sample',
            len(raw_plot_data.sample_mid))
    rr.addInfo('bottom extreme genes for sample',
            len(raw_plot_data.sample_bot))

    return raw_plot_data
def _wig_declaration_fields(line):
    """ Return the key=value tokens of a WIG declaration line as a dict """
    fields = {}
    for token in line.split()[1:]:
        key, sep, value = token.partition('=')
        if sep:
            fields[key] = value
    return fields


def _wig_chrom_name(raw_name, prefix):
    """ Remove a leading prefix from a chromosome name and map M to MT """
    if prefix and raw_name.startswith(prefix):
        raw_name = raw_name[len(prefix):]
    if raw_name == 'M':
        return 'MT'
    return raw_name


def main(ui=None):
    """
        1) Get all gene entries from the DB.
        2) Read the WIG file one chromosome at a time into an array and
            score the genes of that chromosome from it.
        3) Write out genes and expr values as a tab delimited file.

        ui: object whose display(msg=..., progress=...) method is called
            to report progress. NOTE(review): presumably injected by a
            decorator that is not visible here - confirm.
    """
    rr = RunRecord('expr_wig_to_exp')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Expression WIG to EXP')

    chrom_size = args.max_chrom_size
    prefix = args.chr_prefix

    session = db_query.make_session(args.db_path)
    genes = db_query.get_gene_entries(session)

    all_genes = {} # genes indexed by ensembl_id
    genes_by_chrom = {} # chrom: list(gene_id)
    genes_scores = {} # each gene has an expression score
    for gene in genes:
        genes_by_chrom.setdefault(gene.chrom, []).append(gene.ensembl_id)
        genes_scores[gene.ensembl_id] = 0
        all_genes[gene.ensembl_id] = gene

    wig_fn = args.wig
    is_gzipped = wig_fn.endswith('.gz')
    if is_gzipped:
        wig_file = gzip.GzipFile(wig_fn, 'rb')
    else:
        try:
            wig_file = open(wig_fn, 'r')
        except IOError:
            rr.dieOnCritical('Could not open file', wig_fn)
            return

    # Line total is only used to pace the progress bar. It stays at the
    # fallback of 1 for gzipped input (never counted) and when wc fails.
    total_lines = 1
    if not is_gzipped:
        command = 'wc -l ' + wig_fn
        returncode, stdout, stderr = run_command(command)
        if returncode:
            rr.addWarning('could not run wc to count WIG lines', 'error')
        else:
            # wc reports 0 for a single line with no trailing newline
            total_lines = max(int(stdout.strip().split(' ')[0]), 1)
        rr.addInfo('total lines in '+wig_fn, total_lines)

    # Read each piece of the file into an artificial chromosome (Numpy array)
    # and slice out the gene regions that we have for each gene in that chrom
    chrom_array = numpy.zeros(chrom_size, dtype=numpy.float32)
    current_chrom = None
    step_type = None
    pos = None
    step = None

    try:
        for i, line in enumerate(wig_file):
            if i % 100 == 0:
                msg = 'Reading wiggle entries [' + str(i) +\
                        ' / ' + str(total_lines) + ']'
                progress = min(float(i) / float(total_lines), 1.0)
                ui.display(msg=msg, progress=progress)

            # blank lines, comments and track/browser lines carry no data
            if not line.strip() or \
                    line.startswith(('track', 'browser', '#')):
                continue

            if line.startswith(('fixed', 'variable')):
                # e.g. fixedStep chrom=chr10 start=56001 step=20 span=20
                fields = _wig_declaration_fields(line)
                chrom = _wig_chrom_name(fields['chrom'], prefix)

                if current_chrom is not None and current_chrom != chrom:
                    # The array holds the signal of the chromosome just
                    # finished, so it is scored against that chromosome
                    # and not the one that is about to start.
                    get_gene_scores_from_chrom(chrom_array, current_chrom,
                            all_genes, genes_by_chrom, genes_scores)
                    chrom_array[:] = 0
                current_chrom = chrom

                if line.startswith('fixed'):
                    step_type = 'fixed'
                    pos = int(fields['start'])
                    # the WIG format defaults step to 1 when omitted;
                    # span is not applied, as before
                    step = int(fields.get('step', 1))
                else:
                    step_type = 'variable'
                continue

            if step_type == 'fixed':
                chrom_array[pos] = float(line.strip())
                pos += step
            elif step_type == 'variable':
                line_parts = line.split()
                chrom_array[int(line_parts[0])] = float(line_parts[1])
            else:
                rr.dieOnCritical('WIG data found before a step declaration',
                        line.strip())
    finally:
        wig_file.close()

    # empty chrom_array into genes_score from the final section
    if current_chrom is not None:
        get_gene_scores_from_chrom(chrom_array, current_chrom, all_genes,
                genes_by_chrom, genes_scores)

    # output genes and scores
    if args.exp:
        out_fn = args.exp
    else:
        if is_gzipped:
            wig_fn = '.'.join(wig_fn.split('.')[:-1]) # cut off gz extension
        out_fn = '.'.join(wig_fn.split('.')[:-1]) # cut off wig extension
        out_fn += '.exp' # add .exp extension

    with open(out_fn, 'w') as out:
        out.write('gene\texp\n') # header
        for gene_id, score in genes_scores.items():
            out.write(gene_id + '\t' + str(score) + '\n')