Ejemplo n.º 1
0
def replacePearsonPvalueWithZscore():
    """Replace the p-value of every tissue comparison score with a z-score.

    Operates in place on the module-level ``tissue_comparison_scores``
    mapping (tissue -> list of ``(rho, p, sample)`` entries). Every entry is
    rewritten as ``[rho, z, sample]`` where ``z = (rho - mean) / stdev`` and
    the mean and standard deviation are computed over the rho values of all
    tissues and all samples together (not per sample).

    Returns None. When no rho values exist the mapping is left unchanged.
    """
    ### Pool the rho values of every tissue and sample. Collecting them
    ### directly (rather than pre-seeding a per-sample dictionary from the
    ### first tissue only) avoids a KeyError for samples that are present
    ### in some tissues but not in the first one.
    all_dataset_rho_values = []
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            all_dataset_rho_values.append(r)
    if not all_dataset_rho_values:
        return  ### nothing to rescale

    global_rho_avg = statistics.avg(all_dataset_rho_values)
    global_rho_stdev = statistics.stdev(all_dataset_rho_values)

    ### Replace the p-value for each rho
    for tissue in tissue_comparison_scores:
        scores = []
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            ### z-score relative to all analyzed samples rather than to the
            ### background of the individual sample
            z = (r - global_rho_avg) / global_rho_stdev
            scores.append([r, z, sample])
        tissue_comparison_scores[tissue] = scores
Ejemplo n.º 2
0
def statisticallyFilterFile(input_file, output_file, threshold):
    if 'exp.' in input_file:
        counts_file = string.replace(input_file, 'exp.', 'geneCount.')
    else:
        counts_file = input_file[:-4] + '-geneCount.txt'
    sample_expressed_genes = {}
    header = True
    junction_max = []
    count_sum_array = []
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data, ',')
        else:
            t = string.split(data, '\t')
        if header:
            header_len = len(t)
            full_header = t
            samples = t[1:]
            header = False
            count_sum_array = [0] * len(samples)
        else:
            if len(t) == (header_len + 1):
                ### Correct header with a missing UID column
                samples = full_header
                count_sum_array = [0] * len(samples)
                print 'fixing bad header'
            try:
                values = map(float, t[1:])
            except Exception:
                if 'NA' in t[1:]:
                    tn = [0 if x == 'NA' else x
                          for x in t[1:]]  ### Replace NAs
                    values = map(float, tn)
                else:
                    tn = [0 if x == '' else x for x in t[1:]]  ### Replace NAs
                    values = map(float, tn)

            binarized_values = []
            for v in values:
                if v > threshold: binarized_values.append(1)
                else: binarized_values.append(0)
            count_sum_array = [
                sum(value)
                for value in zip(*[count_sum_array, binarized_values])
            ]

    index = 0
    distribution = []
    count_sum_array_db = {}
    samples_to_retain = []
    samples_to_exclude = []
    for sample in samples:
        count_sum_array_db[sample] = count_sum_array[index]
        distribution.append(count_sum_array[index])
        index += 1
    from stats_scripts import statistics
    distribution.sort()
    avg = int(statistics.avg(distribution))
    stdev = int(statistics.stdev(distribution))
    min_exp = int(min(distribution))
    cutoff = avg - (stdev * 2)
    dev = 2
    print 'The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (
        threshold, avg, stdev, min_exp)
    if cutoff < 0:
        if (stdev - avg) > 0:
            cutoff = avg - (stdev / 2)
            dev = 0.5
            print cutoff, 'genes expressed selected as a default cutoff to include cells (2-stdev away)'
        else:
            cutoff = avg - stdev
            dev = 1
            print cutoff, 'genes expressed selected as a default cutoff to include cells (1-stdev away)'
    if min_exp > cutoff:
        cutoff = avg - stdev
        dev = 1

    print 'Using a default cutoff of >=500 genes per cell expressed/cell'
    cutoff = 499
    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:' + str(threshold) + ')\n')
    for sample in samples:  ### keep the original order
        if count_sum_array_db[sample] > cutoff:
            samples_to_retain.append(sample)
        else:
            samples_to_exclude.append(sample)
        eo.write(sample + '\t' + str(count_sum_array_db[sample]) + '\n')

    if len(samples_to_retain) < 4:  ### Don't remove any if too few samples
        samples_to_retain += samples_to_exclude
    else:
        print len(
            samples_to_exclude
        ), 'samples removed (< 500 genes expressed)'  # (%s)' % (dev,string.join(samples_to_exclude,', '))
    eo.close()
    print 'Exporting the filtered expression file to:'
    print output_file
    filterFile(input_file, output_file, samples_to_retain)
Ejemplo n.º 3
0
def plotFeatureBoxPlots(qc_db, dataset_name, feature_type):
    pylab.figure()
    pylab.xlabel('Biological Sample Names')
    pylab.ylabel('Read Counts - Log2')
    pylab.title('Expression BoxPlots for %ss - %s' %
                (feature_type, dataset_name))
    #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35)
    pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35)

    #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend
    #pylab.axis(axes)

    boxplots = []
    samples = []

    sample_sorted_list = []

    for sample_name in qc_db:
        try:
            qc = qc_db[sample_name][feature_type]
        except Exception:
            print 'No junction data found for at least one sample:', sample_name
            forceExit
        sample_sorted_list.append(
            [statistics.avg(qc),
             statistics.stdev(qc), sample_name])
    sample_sorted_list.sort()
    sample_sorted_list.reverse()

    filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name, feature_type)
    export_obj = export.ExportFile(root_dir + filename[:-4] + '.txt')
    export_obj.write('SampleID\tAverage Expression\n')

    firstEntry = True
    for (mean, stdev, sample_name) in sample_sorted_list:
        ls = []
        x_ls = []
        y_ls = []
        qc = qc_db[sample_name][feature_type]
        boxplots.append(qc)
        samples.append(sample_name)
        export_obj.write(sample_name + '\t' + str(mean) + '\n')
        if firstEntry:
            threshold = mean - 2 * stdev
            firstEntry = False
        else:
            if mean < threshold:
                print sample_name, 'expression is considered very low (2 standard deviations away from the max).'
    pylab.boxplot(boxplots,
                  notch=0,
                  whis=1.5,
                  positions=None,
                  widths=None,
                  patch_artist=False)
    #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False)
    xtickNames = pylab.setp(pylab.gca(), xticklabels=samples)
    pylab.setp(xtickNames, rotation=90, fontsize=10)
    export_obj.close()

    #print 'Exporting:',filename
    pylab.savefig(root_dir + filename)
    filename = filename[:-3] + 'png'
    pylab.savefig(root_dir + filename)  #,dpi=200
    graphic_link.append(
        ['QC - BoxPlot-' + feature_type + ' Expression', root_dir + filename])
    try:
        import gc
        pylab.figure.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
Ejemplo n.º 4
0
def importTableEntries(filename,
                       filter_db,
                       ensembl_exon_db,
                       gene_db,
                       root_dir,
                       transpose,
                       display,
                       showIntrons,
                       analysisType='plot'):
    """Import an expression/splicing table and export one plot or heatmap per gene.

    The tab-delimited table ``filename`` is read row by row. Rows whose ID is
    in ``filter_db`` (or, for heatmap analyses, whose gene prefix is in
    ``gene_db``) are stored per gene. Afterwards, for every gene of
    ``ensembl_exon_db`` with stored rows, either a '<symbol>-heatmap.txt'
    file is written and clustered with ``clustering.runHCexplicit`` (when
    'heatmap' occurs in ``analysisType``) or ``plotExonExpression`` is called
    with the per-group averages and standard errors.

    Parameters:
        filename: path of the table. The substrings 'exp.', 'counts.',
            'AltResults' and '_vs_' in it determine where the groups file is
            looked up and at which column the numeric values start.
        filter_db: container of row IDs to retain (membership test only).
        ensembl_exon_db: gene -> iterable of ``(index, ed, id)`` entries;
            ``ed`` has to provide an ``ExonID()`` method.
        gene_db: gene -> symbol used for output names; also membership
            tested with the gene prefix of a row ID for heatmap analyses.
        root_dir: prefix of the exported heatmap files; also passed to
            ``plotExonExpression``.
        transpose: when True the plotted matrix is transposed; overwritten
            in the heatmap branch.
        display: passed on to the plotting call; overwritten in the heatmap
            branch and forced to False when more than 10 genes are present.
        showIntrons: 'yes' to include entries whose exon ID contains no 'E'.
        analysisType: a value containing 'heatmap' disables group averaging
            and selects the heatmap export; defaults to 'plot'.

    Reads the module-level name ``platform`` and assigns the module-level
    name ``colors``. No value is returned explicitly.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db = {}  ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {}  ### ordered from first to last exon region
    uid_gene_db = {}  ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception:
        ### Fall back to no biotype information if UI cannot be imported or queried
        biotypes = {}
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            ### NOTE(review): both branches below apply the same test (introns
            ### requested, or 'E' in the exon ID)
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else:  ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            ### The ID to gene association is stored for every entry, also for excluded ones
            uid_gene_db[id] = gene

    if '_vs_' in filename:  ### If only two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(
            exp_dir, 'ExpressionInput'
        )[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {}  ### gene -> list of value rows (per sample or per group)
    stdev_gene_matrix_db = {}  ### gene -> list of per-group standard error rows
    row_header_gene = {}  ### gene -> row IDs in the order they were read
    ids = {}
    x = 0  ### 0 while the next non-comment line is expected to be the header

    if 'heatmap' in analysisType:
        average_samples = False

    ### Dataset name = file name without directory and without the last four characters
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    ### NOTE(review): the file object opened here is never closed explicitly
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        ### NOTE(review): data[0] raises an IndexError for an empty line
        if data[0] == '#': x = 0
        elif x == 0:
            ### Header row
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(
                t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                ### Locate the groups file that belongs to this table
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(
                        filename, 'AltResults'
                    )[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        ### NOTE(review): alt_groups_dir is only assigned above
                        ### when '_vs_' is in filename
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.',
                                                    'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.',
                                                    'groups.')
                    new_column_header = column_header
                    start = 1  ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(
                    groups_dir, '-steady-state.txt',
                    '.txt')  ### groups is for the non-steady-state file

                ### Ordered mapping of group name -> column indexes of its samples
                try:
                    group_index_db = collections.OrderedDict()
                except Exception:
                    ### collections has no OrderedDict here - use the ordereddict module
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                ### (group_db returned by assignGroupColors above is replaced here)
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(
                    groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        ### Retry the lookup with known file extensions removed from the sample name
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(
                                item, '.CEL',
                                '')  ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try:
                            group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [
                                    sample_index
                                ]  ### dictionary of group to input file sample indexes
                            except Exception:
                                pass  ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db)  ### store group names
                ### NOTE(review): new_sample_list is not used again in this function
                new_sample_list = map(
                    lambda item: group_db[item], sample_list
                )  ### group name of each sample, in the order of the sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1  ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            ### Data row
            if ' ' not in t and '' not in t:  ### Skip rows with missing data
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType
                                        and ens_geneID in gene_db):
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try:
                            row_header_gene[gene].append(uid)
                        except Exception:
                            row_header_gene[gene] = [uid]
                        if average_samples == False:
                            ### Store the individual sample values
                            values = map(float, t[start:])
                            try:
                                matrix_gene_db[gene].append(values)
                            except Exception:
                                matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2),
                                             t[start:])
                            else:
                                values = map(float, t[start:])

                            if 'AltResults' in filename:  ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            ### One average and one standard error per group
                            for group_name in group_index_db:
                                group_values = map(
                                    lambda x: values[x],
                                    group_index_db[group_name]
                                )  ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try:
                                    st_err = statistics.stdev(
                                        group_values) / math.sqrt(
                                            len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try:
                                matrix_gene_db[gene].append(avg_ls)
                            except Exception:
                                matrix_gene_db[gene] = [avg_ls]
                            try:
                                stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception:
                                stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        ### NOTE(review): any error while processing this row is
                        ### silently ignored and the row is dropped
                        #print traceback.format_exc()
                        pass
            x += 1

    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        ### NOTE(review): bare name - presumably undefined, so evaluating it
        ### raises a NameError to abort; confirm
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0  ### Only display a certain number of genes

    ### Determine the last gene of the iteration order
    for last_gene in uid_list:
        pass
    for gene in uid_list:
        fig = pylab.figure(
        )  ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try:
            matrix = matrix_gene_db[gene]
        except Exception:
            #print gene_symbol, 'not in alternative expression database'
            continue  ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]

        try:
            stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception:
            ### No standard errors stored for this gene (e.g. heatmap analyses);
            ### stdev_matrix then keeps whatever value it had before, if any
            pass
        ### Reorder the imported rows according to the exon order in uid_list
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(
                    uid
                )  ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try:
                    new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try:
                    new_stdev.append(stdev_matrix[i])
                except Exception:
                    pass
            except Exception:
                ### ID was not imported for this gene (or has no annotation) - skip it
                pass

        ### Only use the reordered rows when at least one ID was matched
        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            ### Export the values of this gene and cluster the exported file
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([ed.ExonID()] + map(str, values), '\t') +
                        '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            from visualization_scripts import clustering

            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            ### The transpose and display arguments are overwritten in this branch
            transpose = False
            graphic_links = []
            if ki > 100: transpose = True
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir,
                                                     graphic_links,
                                                     row_method,
                                                     row_metric,
                                                     column_method,
                                                     column_metric,
                                                     color_gradient,
                                                     transpose,
                                                     display=display,
                                                     Normalize=True,
                                                     compressAxis=False,
                                                     contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array,
                             zip(*matrix))  ### converts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palette to use
                    colors = []
                    """
                    k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow')  #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header))
                                      )  # color will now be an RGBA tuple

                plotExonExpression(fig,
                                   matrix,
                                   stdev_matrix,
                                   row_header,
                                   column_header,
                                   dataset_name,
                                   annotation_list,
                                   gene_symbol,
                                   root_dir,
                                   display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                sys.exit()
                ### NOTE(review): the print below follows sys.exit() and is not reached
                print gene_symbol, 'failed'
        try:
            pylab.close()
        except Exception:
            pass
        ### NOTE(review): this check is evaluated for every gene that reaches
        ### this point, inside the loop - confirm whether it was meant to run
        ### once after the loop
        if successfully_output_genes > 0:
            #try: print 'Gene graphs exported to ExonPlots...'
            #except Exception: pass
            pass
        else:
            print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
            ### NOTE(review): bare name - presumably undefined, so evaluating
            ### it raises a NameError to abort; confirm
            forceNoExonExpError
        ### Best-effort release of the figure created for this gene
        try:
            import gc
            fig.clf()
            pylab.close()
            gc.collect()
        except Exception:
            pass