def _check_diff_headers(header_row, sig_label='', pval_label=''):
    """
    Locate the significance and P-value columns in a header row.

    Arguments:
        header_row: list of column header strings.
        sig_label: exact header text of the significance column.
        pval_label: exact header text of the P-value column.

    Returns:
        (sig_col, pval_col): the index of each label within header_row.

    If either label is absent, the label that was expected and the header
    row that was searched are both reported through RunRecord, ending with
    dieOnCritical() (presumed not to return - confirm against RunRecord).
    """
    rr = RunRecord('_check_diff_headers')

    try:
        sig_col = header_row.index(sig_label)
    except ValueError:
        # Say which label was looked for, as the P-value branch does
        rr.addCritical('Expected to see significance column header',
                sig_label)
        rr.dieOnCritical('Significance column header not found in',
                header_row)

    try:
        pval_col = header_row.index(pval_label)
    except ValueError:
        rr.addCritical('Expected to see P-value column header', pval_label)
        rr.dieOnCritical('P-value column header not found in', header_row)

    return sig_col, pval_col
def _rows_from_handle(data_file, sep):
    """ Return each line of an open file as a list of sep-separated fields """
    rows = []
    for data in data_file:
        # GzipFile yields bytes under Python 3; str() of bytes would give
        # the "b'...'" repr rather than the text. Under Python 2 bytes is
        # str, so this branch is never taken and behaviour is unchanged.
        if isinstance(data, bytes) and not isinstance(data, str):
            data = data.decode('utf-8')
        rows.append(str(data).strip().split(sep))
    return rows


def _read_data_file(data_path, sep='\t', stable_id_label='ENSEMBL',
        probeset_label='probeset', exp_label='exp', sig_label='sig',
        pval_label='p_val', is_diff=False):
    """
    Get the data out of the file - shared by both:
    gene_expr_to_table() and gene_expr_diff_to_table()

    Arguments:
        data_path: path of the data file. Any path containing '.gz'
            (case-insensitive) is opened with GzipFile, otherwise open().
        sep: field separator used to split each line.
        stable_id_label, probeset_label, exp_label: header labels handed
            to _check_expr_headers() to locate the gene, probeset and
            expression columns.
        sig_label, pval_label: header labels handed to
            _check_diff_headers(); only used when is_diff is True.
        is_diff: when True, also collect significance and P-value columns.

    Returns:
        (genes, probes, exp, probes_present) or, when is_diff is True,
        (genes, probes, exp, sigs, pvals, probes_present).
        Each entry of probes is a list of probe ids when probes_present is
        True, otherwise a single fake id string 'P<row number>'.
        Each entry of exp is a list of floats.

    Rows left with no expression scores or no probes once 'NA' entries
    are removed are skipped. An empty file or a non-numeric expression
    score is reported through RunRecord.dieOnCritical().
    """
    rr = RunRecord('_read_data_file')

    if '.gz' in data_path.lower():
        with GzipFile(data_path, 'r') as data_file:
            rows = _rows_from_handle(data_file, sep)
    else:
        with open(data_path) as data_file:
            rows = _rows_from_handle(data_file, sep)

    if not rows:
        # Without a header row nothing below can work
        rr.dieOnCritical('No data found in file', data_path)

    # check that headers are valid and whether probesets are present
    gene_col, probe_col, exp_col, probes_present = \
            _check_expr_headers(rows[0], stable_id_label=stable_id_label,
                    probeset_label=probeset_label, exp_label=exp_label)
    if not probes_present:
        rr.addInfo('No probeset header found. Reading as', 'RNA-seq')
    else:
        rr.addInfo('Probesets found. Reading as', 'Micro-array')

    if is_diff:
        sig_col, pval_col = _check_diff_headers(rows[0],
                sig_label=sig_label, pval_label=pval_label)

    genes = []
    probes = []
    exp = []
    sigs = []
    pvals = []

    # get data from each row; i is the row's index in the file, header = 0
    for i, row in enumerate(rows[1:], 1):
        # Nuke any scores and probes marked with 'NA' by R
        exp_strs = [s for s in row[exp_col].split('|') if s != 'NA']
        if probes_present:
            tmp_probes = [p for p in row[probe_col].split('|') if p != 'NA']
        else:
            # give RNAseq data a fake probe id (a plain string, not a list)
            tmp_probes = 'P' + str(i)

        if not exp_strs or not tmp_probes:
            continue

        try:
            # Built eagerly so a bad score raises ValueError here, inside
            # the try, on Python 3 as well as Python 2
            tmp_expr = [float(s) for s in exp_strs]
        except ValueError:
            rr.addCritical('Expected expression score float on line', i)
            rr.dieOnCritical('Line has incorrect format', row)

        genes.append(str(row[gene_col]))
        probes.append(tmp_probes)
        exp.append(tmp_expr)
        if is_diff:
            sigs.append(str(row[sig_col]))
            pvals.append(float(row[pval_col]))

    if is_diff:
        return genes, probes, exp, sigs, pvals, probes_present
    return genes, probes, exp, probes_present
def div_plots(plot_lines, div_study_name, div_by='all'):
    """
    Divides the counts values in plot_lines by those in the divisor lines

    Arguments:
        plot_lines: iterable of objects with .study, .rank and .counts.
        div_study_name: lines whose .study equals this are the divisors;
            all other lines are divided.
        div_by: how divisor counts are chosen (case-insensitive):
            'all'           each line is divided by the divisor line of
                            the same rank; the two sets of ranks must be
                            identical.
            'mean-counts'   element-wise mean of all divisor counts.
            'median-counts' element-wise median of all divisor counts.
            'top-expr'      counts of the lowest-ranked divisor line.
            'median-expr'   counts of the middle divisor line by rank.
            'bottom-expr'   counts of the highest-ranked divisor line.

    Returns:
        list of the non-divisor lines. Each line's .counts is replaced in
        place with the result of safe_line_division().

    Problems (no lines to divide, no divisor lines, mismatched ranks in
    'all' mode, unknown div_by) are reported through
    RunRecord.dieOnCritical().
    """
    rr = RunRecord('div_plots')

    # build two dicts of lines indexed by rank. Only one divisor line is
    # kept per rank (a later one replaces an earlier one).
    ranked_plot_lines = {}
    dividing_plot_lines = {}
    for line in plot_lines:
        if line.study == div_study_name:
            dividing_plot_lines[line.rank] = line
        else:
            ranked_plot_lines.setdefault(line.rank, []).append(line)

    # sanity checks
    if not ranked_plot_lines:
        rr.dieOnCritical('No plot lines.', 'Same study as div plot?')
    if not dividing_plot_lines:
        # Otherwise 'all' returns nothing and the other modes index into,
        # or average over, an empty array
        rr.dieOnCritical('No div plot lines found for study',
                div_study_name)

    mode = div_by.lower()
    out_lines = []

    if mode == 'all':
        # default: divide scores line for line - maybe important if trends
        # change. The full rank lists are compared first: pairing them
        # with zip() would silently drop the unmatched tail when one side
        # has more ranks than the other.
        ranked_keys = sorted(ranked_plot_lines)
        if ranked_keys != sorted(dividing_plot_lines):
            rr.addCritical('Div and study plot lines do not match',
                    [len(ranked_plot_lines), len(dividing_plot_lines)])
            rr.addCritical('Differences in filtering may be responsible')
            rr.dieOnCritical('Try choosing a representative group instead',
                    ['mean-counts', 'median-counts', 'top-expr',
                     'median-expr', 'bottom-expr'])

        for ranked_index in ranked_keys:
            divisor = dividing_plot_lines[ranked_index].counts
            for line in ranked_plot_lines[ranked_index]:
                line.counts = safe_line_division(line.counts, divisor)
                out_lines.append(line)
        return out_lines

    # divide all lines by the same counts array
    # build counts array, ordered by rank, to choose dividing counts from
    counts_array = numpy.array([dividing_plot_lines[key].counts
            for key in sorted(dividing_plot_lines)])

    div_counts = None
    if mode == 'mean-counts':  # mean counts for all div
        div_counts = numpy.mean(counts_array, axis=0)
    elif mode == 'median-counts':  # median counts for all div
        div_counts = numpy.median(counts_array, axis=0)
    elif mode == 'top-expr':  # top expressed genes counts line from div
        div_counts = counts_array[0]
    elif mode == 'median-expr':  # median expressed genes counts line
        div_counts = counts_array[len(counts_array) // 2]
    elif mode == 'bottom-expr':  # bottom expressed genes counts line
        div_counts = counts_array[-1]
    else:
        rr.dieOnCritical('Unrecognised div_by type', div_by)

    for ranked_index in ranked_plot_lines:
        for line in ranked_plot_lines[ranked_index]:
            line.counts = safe_line_division(line.counts, div_counts)
            out_lines.append(line)

    return out_lines