Example #1
0
def _check_diff_headers(header_row, sig_label='', pval_label=''):
    """ Locate the significance and p-value columns in a header row.

        Columns are looked up by label rather than by fixed position, so
        the result is valid whether or not other optional columns (such
        as probesets) are present in the file.

        header_row: sequence of column header strings.
        sig_label: header text of the significance column.
        pval_label: header text of the p-value column.

        Returns the tuple (sig_col, pval_col) of positions of the two
        labels within header_row. A missing label is reported as critical
        through RunRecord (presumably this halts the run - confirm
        against RunRecord.dieOnCritical).
    """
    run_log = RunRecord('_check_diff_headers')

    try:
        sig_position = header_row.index(sig_label)
    except ValueError:
        run_log.dieOnCritical('Significance column header not found in',
                header_row)

    try:
        pval_position = header_row.index(pval_label)
    except ValueError:
        # say which label was expected before reporting the fatal error
        run_log.addCritical('Expected to see P-value column header',
                pval_label)
        run_log.dieOnCritical('P-value column header not found in',
                header_row)

    return sig_position, pval_position
Example #2
0
def _read_data_file(data_path, sep='\t', stable_id_label='ENSEMBL',
        probeset_label='probeset', exp_label='exp', sig_label='sig',
        pval_label='p_val', is_diff=False):
    """
        Get the data out of the file - shared by both:
        gene_expr_to_table() and gene_expr_diff_to_table()

        data_path: path of the delimited text file; read through GzipFile
            when '.gz' appears in the (lower-cased) path.
        sep: field separator used to split each line.
        stable_id_label, probeset_label, exp_label: header labels passed
            to _check_expr_headers() to locate the gene, probeset and
            expression columns.
        sig_label, pval_label: header labels passed to
            _check_diff_headers(); only used when is_diff is True.
        is_diff: when True, significance and p-value columns are also
            read.

        Probeset and expression fields may hold several '|'-separated
        entries; entries equal to 'NA' are dropped and rows left with no
        expression scores (or no probes) are skipped.

        Returns (genes, probes, exp, probes_present), or
        (genes, probes, exp, sigs, pvals, probes_present) when is_diff
        is True. Each list has one entry per retained data row.
    """
    rr = RunRecord('_read_data_file')

    # Gzipped and plain files are parsed identically; only the opener
    # differs.
    opener = GzipFile if '.gz' in data_path.lower() else open
    rows = []
    with opener(data_path, 'r') as data_file:
        for line in data_file:
            # GzipFile yields bytes on Python 3, and str() of bytes gives
            # the "b'...'" repr rather than the text - decode instead.
            if not isinstance(line, str):
                line = line.decode('utf-8')
            rows.append(line.strip().split(sep))

    if not rows:
        # without a header line there is nothing to validate or read
        rr.dieOnCritical('No data found in file', data_path)

    header = rows[0]

    # check that headers are valid and whether probesets are present
    gene_col, probe_col, exp_col, probes_present = _check_expr_headers(header,
            stable_id_label=stable_id_label, probeset_label=probeset_label,
            exp_label=exp_label)
    if not probes_present:
        rr.addInfo('No probeset header found. Reading as', 'RNA-seq')
    else:
        rr.addInfo('Probesets found. Reading as', 'Micro-array')

    if is_diff:
        sig_col, pval_col = _check_diff_headers(header, sig_label=sig_label,
                pval_label=pval_label)

    genes = []
    probes = []
    exp = []
    sigs = []
    pvals = []

    # get data from each row; i is the row's position in the file so that
    # reported line numbers and fake probe ids count the header as row 0
    for i, row in enumerate(rows[1:], 1):
        # Nuke any scores and probes marked with 'NA' by R
        exp_strs = [score for score in row[exp_col].split('|')
                if score != 'NA']
        if probes_present:
            tmp_probes = [probe for probe in row[probe_col].split('|')
                    if probe != 'NA']
        else:
            # give RNAseq data a fake probe id (a plain string, not a list)
            tmp_probes = 'P' + str(i)

        if not exp_strs or not tmp_probes:
            continue

        try:
            # build the list eagerly: a lazy map() would defer the
            # ValueError until long after this handler has been left
            tmp_expr = [float(score) for score in exp_strs]
        except ValueError:
            rr.addCritical('Expected expression score float on line', i)
            rr.dieOnCritical('Line has incorrect format', row)

        genes.append(str(row[gene_col]))
        probes.append(tmp_probes)
        exp.append(tmp_expr)
        if is_diff:
            sigs.append(str(row[sig_col]))
            pvals.append(float(row[pval_col]))

    if is_diff:
        return genes, probes, exp, sigs, pvals, probes_present
    return genes, probes, exp, probes_present
Example #3
0
def div_plots(plot_lines, div_study_name, div_by='all'):
    """ Divides the counts values in plot_lines by those in the divisor
        lines.

        plot_lines: iterable of line objects exposing .study, .rank and
            .counts attributes.
        div_study_name: lines whose .study equals this name supply the
            divisor counts; all other lines are the ones divided.
        div_by: how the divisor is chosen (case-insensitive):
            'all' - each line is divided by the div line of equal rank;
            'mean-counts' / 'median-counts' - by the column-wise mean or
                median over all div lines;
            'top-expr' / 'median-expr' / 'bottom-expr' - by the first,
                middle or last div line when ordered by rank.

        Returns the list of divided lines. The line objects are modified
        in place: .counts is replaced by the result of
        safe_line_division(). Problems (no lines to divide, no div lines,
        rank mismatch in 'all' mode, unknown div_by) are reported as
        critical through RunRecord.
    """
    rr = RunRecord('div_plots')
    # build two dicts of lines indexed by rank
    ranked_plot_lines = {}
    dividing_plot_lines = {}
    for line in plot_lines:
        if line.study == div_study_name:
            dividing_plot_lines[line.rank] = line
        else:
            ranked_plot_lines.setdefault(line.rank, []).append(line)

    # sanity checks - these must run once all lines have been sorted into
    # the two dicts, otherwise a div line arriving first looks like
    # "no plot lines"
    if not ranked_plot_lines:
        rr.dieOnCritical('No plot lines.', 'Same study as div plot?')
    if not dividing_plot_lines:
        rr.dieOnCritical('No div plot lines found for study', div_study_name)

    div_mode = div_by.lower()
    out_lines = []
    if div_mode == 'all':
        # default: divide scores line for line - maybe important if trends
        # change. Compare the complete rank lists: zipping them would stop
        # at the shorter one and silently drop unmatched lines.
        ranked_keys = sorted(ranked_plot_lines)
        div_keys = sorted(dividing_plot_lines)
        if ranked_keys != div_keys:
            rr.addCritical('Div and study plot lines do not match',
                    [len(ranked_plot_lines), len(dividing_plot_lines)])
            rr.addCritical('Differences in filtering may be responsible')
            rr.dieOnCritical('Try choosing a representative group instead',
                    ['mean-counts', 'median-counts',
                    'top-expr', 'median-expr', 'bottom-expr'])
        for rank in ranked_keys:
            for line in ranked_plot_lines[rank]:
                line.counts = safe_line_division(line.counts,
                        dividing_plot_lines[rank].counts)
                out_lines.append(line)
    else: # divide all lines by the same counts array
        # build counts array (one row per div line, ordered by rank) to
        # choose dividing counts from
        counts_lines = [dividing_plot_lines[key].counts
                for key in sorted(dividing_plot_lines)]
        counts_array = numpy.array(counts_lines)

        div_counts = None
        if div_mode == 'mean-counts': # mean counts for all div
            div_counts = numpy.mean(counts_array, axis=0)
        elif div_mode == 'median-counts': # median counts for all div
            div_counts = numpy.median(counts_array, axis=0)
        elif div_mode == 'top-expr': # first div line by rank
            div_counts = counts_array[0]
        elif div_mode == 'median-expr': # middle div line by rank
            div_counts = counts_array[len(counts_array) // 2]
        elif div_mode == 'bottom-expr': # last div line by rank
            div_counts = counts_array[-1]
        else:
            rr.dieOnCritical('Unrecognised div_by type', div_by)

        for ranked_index in ranked_plot_lines:
            for line in ranked_plot_lines[ranked_index]:
                line.counts = safe_line_division(line.counts, div_counts)
                out_lines.append(line)

    return out_lines