Esempio n. 1
0
def reorder(data,
            data_headers,
            array_order,
            comp_group_list,
            probeset_db,
            include_raw_data,
            array_type,
            norm,
            fl,
            logvalues=True,
            blanksPresent=False):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'

    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']

    ### Define expression variables
    try:
        probability_statistic = fl.ProbabilityStatistic()
    except Exception:
        probability_statistic = 'unpaired t-test'
    try:
        gene_exp_threshold = math.log(fl.GeneExpThreshold(), 2)
    except Exception:
        gene_exp_threshold = 0
    try:
        gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception:
        gene_rpkm_threshold = 0
    try:
        FDR_statistic = fl.FDRStatistic()
    except Exception:
        FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog = True
    if blanksPresent:
        calculateAsNonLog = False

    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try:
            gene = probeset_db[row_id][0]
        except TypeError:
            gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try:
                    new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(
                        data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError:
                new_item = ''  #this is for a spacer added in the above function
            try:
                grouped_ordered_array_list[group].append(new_item)
            except KeyError:
                grouped_ordered_array_list[group] = [new_item]
            try:
                data_headers2[group].append(data_headers[y])
            except KeyError:
                data_headers2[group] = [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[
                group2]  #baseline expression
            if blanksPresent:  ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try:
                avg1 = statistics.avg(data_list1)
            except Exception:
                avg1 = ''
            try:
                avg2 = statistics.avg(data_list2)
            except Exception:
                avg2 = ''
            try:
                if (logvalues == False
                        and array_type != 'RNASeq') or (logvalues == False
                                                        and calculateAsNonLog):
                    fold = avg1 / avg2
                    log_fold = math.log(fold, 2)
                    if fold < 1: fold = -1.0 / fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold)
            except Exception:
                log_fold = ''
                fold = ''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2,
                                                      probability_statistic)
            except Exception:
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':  ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try:
                        avg1 = math.pow(2, avg1) - adj
                        avg2 = math.pow(2, avg2) - adj
                    except Exception:
                        avg1 = ''
                        avg2 = ''
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                    #if row_id=='ENSG00000085514':
                    #if fold=='Insufficient Expression':
                    #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                    #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold != 0 and calculateAsNonLog:  ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1 < gene_rpkm_threshold and a2 < gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(
                        data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(
                        data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]

        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent:  ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list) > 0: grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                avg = ''
                #print row_id, group, data_list;kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]
        except Exception:
            max_fold = 'NA'
        try:
            ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception:
            ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs

        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()

        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort(
        )  #now the list is sorted by group number

        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][
                    1]  #the group name is listed as the first entry
                for value in group_summary_data:
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][
                        0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group, data_headers2[
                group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program

    for entry in data_headers3:
        x = 0  #indicates the times through a loop
        y = 0  #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[
                    x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index

        for info in stat_result_list:
            if info[0][
                    0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)

    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round = 0
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs

        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try:
                statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n   ... using unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            round += 1

        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy
            import qvalue
            pvals = []
            keys = []
            for key in pval_db:
                pvals.append(pval_db[key].Pval())
                keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)):
                pval_db[keys[i]].SetAdjP(pvals[i])

        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP(
            )  ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval(
                )  ### Replace the non-moderated with a moderated p-value

    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
Esempio n. 2
0
def exportTransitResults(array_group_list, array_raw_group_values,
                         array_group_db, avg_const_exp_db, adj_fold_dbase,
                         exon_db, dataset_name, apt_dir):
    """Export processed raw expression values (e.g. add global fudge factor or eliminate probe sets based on filters) to txt files
    for analysis with MiDAS"""
    #array_group_list contains group names in order of analysis
    #array_raw_group_values contains expression values for the x number of groups in above list
    #array_group_db key is the group name and values are the list of array names
    #avg_const_exp_db contains the average expression values for all arrays for all constitutive probesets, with gene as the key

    ordered_array_header_list = []
    for group in array_group_list:  ###contains the correct order for each group
        for array_id in array_group_db[group]:
            ordered_array_header_list.append(str(array_id))
    ordered_exp_val_db = {
    }  ###new dictionary containing all expression values together, but organized based on group
    probeset_affygene_db = {
    }  ###lists all altsplice probesets and corresponding affygenes
    for probeset in array_raw_group_values:
        try:
            include_probeset = 'yes'
            ###Examines user input parameters for inclusion of probeset types in the analysis
            if include_probeset == 'yes':
                if probeset in adj_fold_dbase:  ###indicates that this probeset is analyzed for splicing (e.g. has a constitutive probeset)
                    for group_val_list in array_raw_group_values[probeset]:
                        non_log_group_exp_vals = statistics.log_fold_conversion(
                            group_val_list)
                        for val in non_log_group_exp_vals:
                            try:
                                ordered_exp_val_db[probeset].append(str(val))
                            except KeyError:
                                ordered_exp_val_db[probeset] = [str(val)]
                    affygene = exon_db[probeset].GeneID()
                    try:
                        probeset_affygene_db[affygene].append(probeset)
                    except KeyError:
                        probeset_affygene_db[affygene] = [probeset]
        except KeyError:
            ###Indicates that the expression dataset file was not filtered for whether annotations exist in the exon annotation file
            ###In that case, just ignore the entry
            null = ''

    gene_count = 0
    ordered_gene_val_db = {}
    for affygene in avg_const_exp_db:  ###now, add all constitutive gene level expression values (only per anlayzed gene)
        if affygene in probeset_affygene_db:  ###ensures we only include gene data where there are altsplice examined probesets
            non_log_ordered_exp_const_val = statistics.log_fold_conversion(
                avg_const_exp_db[affygene])
            gene_count += 1
            for val in non_log_ordered_exp_const_val:
                try:
                    ordered_gene_val_db[affygene].append(str(val))
                except KeyError:
                    ordered_gene_val_db[affygene] = [str(val)]

    convert_probesets_to_numbers = {}
    convert_affygene_to_numbers = {}
    array_type = 'junction'
    probeset_affygene_number_db = {}
    x = 0
    y = 0
    for affygene in probeset_affygene_db:
        x += 1
        y = x  ###each affygene has a unique number, from other affygenes and probesets and probesets count up from each affygene
        x_copy = x
        example_gene_probeset = probeset_affygene_db[affygene][0]
        #if exon_db[example_gene_probeset].ArrayType() == 'exon': x_copy = exon_db[example_gene_probeset].SecondaryGeneID()
        if x_copy not in exon_db:
            convert_affygene_to_numbers[affygene] = str(x_copy)
        else:
            print affygene, x_copy, 'new numeric for MIDAS already exists as a probeset ID number'
            kill
        for probeset in probeset_affygene_db[affygene]:
            y = y + 1
            y_copy = y
            if exon_db[probeset].ArrayType() == 'exon':
                y_copy = probeset  ### Only appropriate when the probeset ID is a number
                array_type = 'exon'
            convert_probesets_to_numbers[probeset] = str(y_copy)
            try:
                probeset_affygene_number_db[str(x_copy)].append(str(y_copy))
            except KeyError:
                probeset_affygene_number_db[str(x_copy)] = [str(y_copy)]
        x = y

    metafile = 'AltResults/MIDAS/meta-' + dataset_name[0:-1] + '.txt'
    data1 = export.createExportFile(metafile, 'AltResults/MIDAS')
    title = 'probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n'
    data1.write(title)
    for affygene in probeset_affygene_number_db:
        probeset_list = probeset_affygene_number_db[affygene]
        probe_number = str(len(probeset_list) * 6)
        probeset_list = [string.join(probeset_list, ' ')]
        probeset_list.append(affygene)
        probeset_list.append(affygene)
        probeset_list.reverse()
        probeset_list.append(probe_number)
        probeset_list = string.join(probeset_list, '\t')
        probeset_list = probeset_list + '\n'
        data1.write(probeset_list)
    data1.close()

    junction_exp_file = 'AltResults/MIDAS/' + array_type + '-exp-' + dataset_name[
        0:-1] + '.txt'
    fn2 = filepath(junction_exp_file)
    data2 = open(fn2, 'w')
    ordered_array_header_list.reverse()
    ordered_array_header_list.append('probeset_id')
    ordered_array_header_list.reverse()
    title = string.join(ordered_array_header_list, '\t')
    data2.write(title + '\n')
    for probeset in ordered_exp_val_db:
        probeset_number = convert_probesets_to_numbers[probeset]
        exp_values = ordered_exp_val_db[probeset]
        exp_values.reverse()
        exp_values.append(probeset_number)
        exp_values.reverse()
        exp_values = string.join(exp_values, '\t')
        exp_values = exp_values + '\n'
        data2.write(exp_values)
    data2.close()

    gene_exp_file = 'AltResults/MIDAS/gene-exp-' + dataset_name[0:-1] + '.txt'
    fn3 = filepath(gene_exp_file)
    data3 = open(fn3, 'w')
    title = string.join(ordered_array_header_list, '\t')
    data3.write(title + '\n')
    for affygene in ordered_gene_val_db:
        try:
            affygene_number = convert_affygene_to_numbers[affygene]
        except KeyError:
            print len(convert_affygene_to_numbers), len(ordered_gene_val_db)
            kill
        exp_values = ordered_gene_val_db[affygene]
        exp_values.reverse()
        exp_values.append(affygene_number)
        exp_values.reverse()
        exp_values = string.join(exp_values, '\t')
        exp_values = exp_values + '\n'
        data3.write(exp_values)
    data3.close()

    exportMiDASArrayNames(array_group_list, array_group_db, dataset_name,
                          'new')

    coversionfile = 'AltResults/MIDAS/probeset-conversion-' + dataset_name[
        0:-1] + '.txt'
    fn5 = filepath(coversionfile)
    data5 = open(fn5, 'w')
    title = 'probeset\tprobeset_number\n'
    data5.write(title)
    for probeset in convert_probesets_to_numbers:  ###contains the correct order for each group
        probeset_number = convert_probesets_to_numbers[probeset]
        values = probeset + '\t' + probeset_number + '\n'
        data5.write(values)
    data5.close()
    """
    ### This code is obsolete... used before AltAnalyze could connect to APT directly.
    commands = 'AltResults/MIDAS/commands-'+dataset_name[0:-1]+'.txt'
    data = export.createExportFile(commands,'AltResults/MIDAS')
    path = filepath('AltResults/MIDAS'); path = string.replace(path,'\\','/'); path = 'cd '+path+'\n\n'
    metafile = 'meta-'+dataset_name[0:-1]+'.txt'
    junction_exp_file = array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    gene_exp_file = 'gene-exp-'+dataset_name[0:-1]+'.txt'
    celfiles = 'celfiles-'+dataset_name[0:-1]+'.txt'
    command_line = 'apt-midas -c '+celfiles+' -g '+gene_exp_file+' -e '+junction_exp_file+' -m '+metafile+' -o '+dataset_name[0:-1]+'-output'
    data.write(path); data.write(command_line); data.close()
    """

    status = runMiDAS(apt_dir, array_type, dataset_name, array_group_list,
                      array_group_db)
    return status
def exportTransitResults(array_group_list,array_raw_group_values,array_group_db,avg_const_exp_db,adj_fold_dbase,exon_db,dataset_name,apt_dir):
    """Export processed raw expression values (e.g. add global fudge factor or eliminate probe sets based on filters) to txt files
    for analysis with MiDAS"""
    #array_group_list contains group names in order of analysis
    #array_raw_group_values contains expression values for the x number of groups in above list
    #array_group_db key is the group name and values are the list of array names
    #avg_const_exp_db contains the average expression values for all arrays for all constitutive probesets, with gene as the key

    ordered_array_header_list=[]
    for group in array_group_list: ###contains the correct order for each group
        for array_id in array_group_db[group]:
            ordered_array_header_list.append(str(array_id))
    ordered_exp_val_db = {} ###new dictionary containing all expression values together, but organized based on group
    probeset_affygene_db = {} ###lists all altsplice probesets and corresponding affygenes
    for probeset in array_raw_group_values:
        try:
            include_probeset = 'yes'
            ###Examines user input parameters for inclusion of probeset types in the analysis
            if include_probeset == 'yes':
                if probeset in adj_fold_dbase: ###indicates that this probeset is analyzed for splicing (e.g. has a constitutive probeset)
                    for group_val_list in array_raw_group_values[probeset]:
                        non_log_group_exp_vals = statistics.log_fold_conversion(group_val_list)
                        for val in non_log_group_exp_vals:
                            try: ordered_exp_val_db[probeset].append(str(val))
                            except KeyError: ordered_exp_val_db[probeset] = [str(val)]
                    affygene = exon_db[probeset].GeneID()
                    try: probeset_affygene_db[affygene].append(probeset)
                    except KeyError: probeset_affygene_db[affygene] = [probeset]
        except KeyError:
            ###Indicates that the expression dataset file was not filtered for whether annotations exist in the exon annotation file
            ###In that case, just ignore the entry
            null = ''

    gene_count = 0
    ordered_gene_val_db={}
    for affygene in avg_const_exp_db: ###now, add all constitutive gene level expression values (only per anlayzed gene)
        if affygene in probeset_affygene_db: ###ensures we only include gene data where there are altsplice examined probesets
            non_log_ordered_exp_const_val = statistics.log_fold_conversion(avg_const_exp_db[affygene])
            gene_count+=1
            for val in non_log_ordered_exp_const_val:
                try: ordered_gene_val_db[affygene].append(str(val))
                except KeyError: ordered_gene_val_db[affygene] = [str(val)]

    convert_probesets_to_numbers={}
    convert_affygene_to_numbers={}; array_type = 'junction'
    probeset_affygene_number_db={}; x=0; y=0
    for affygene in probeset_affygene_db:
        x+=1; y = x  ###each affygene has a unique number, from other affygenes and probesets and probesets count up from each affygene
        x_copy = x
        example_gene_probeset = probeset_affygene_db[affygene][0]
        #if exon_db[example_gene_probeset].ArrayType() == 'exon': x_copy = exon_db[example_gene_probeset].SecondaryGeneID()
        if x_copy not in exon_db:
            convert_affygene_to_numbers[affygene] = str(x_copy)
        else: print affygene, x_copy,'new numeric for MIDAS already exists as a probeset ID number'; kill
        for probeset in probeset_affygene_db[affygene]:
            y = y+1; y_copy = y
            if exon_db[probeset].ArrayType() == 'exon':
                y_copy = probeset ### Only appropriate when the probeset ID is a number
                array_type = 'exon'
            convert_probesets_to_numbers[probeset] = str(y_copy)
            try: probeset_affygene_number_db[str(x_copy)].append(str(y_copy))
            except KeyError: probeset_affygene_number_db[str(x_copy)] = [str(y_copy)]
        x=y
    
    metafile = 'AltResults/MIDAS/meta-'+dataset_name[0:-1]+'.txt'
    data1 = export.createExportFile(metafile,'AltResults/MIDAS')
    title = 'probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n'    
    data1.write(title)
    for affygene in probeset_affygene_number_db:
        probeset_list = probeset_affygene_number_db[affygene]; probe_number = str(len(probeset_list)*6)
        probeset_list = [string.join(probeset_list,' ')]
        probeset_list.append(affygene); probeset_list.append(affygene); probeset_list.reverse(); probeset_list.append(probe_number)
        probeset_list = string.join(probeset_list,'\t'); probeset_list=probeset_list+'\n'
        data1.write(probeset_list)
    data1.close()

    junction_exp_file = 'AltResults/MIDAS/'+array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    fn2=filepath(junction_exp_file)
    data2 = open(fn2,'w')
    ordered_array_header_list.reverse(); ordered_array_header_list.append('probeset_id'); ordered_array_header_list.reverse()
    title = string.join(ordered_array_header_list,'\t')
    data2.write(title+'\n')
    for probeset in ordered_exp_val_db:
        probeset_number = convert_probesets_to_numbers[probeset]
        exp_values = ordered_exp_val_db[probeset]; exp_values.reverse(); exp_values.append(probeset_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values +'\n'
        data2.write(exp_values)
    data2.close()

    gene_exp_file = 'AltResults/MIDAS/gene-exp-'+dataset_name[0:-1]+'.txt'
    fn3=filepath(gene_exp_file)
    data3 = open(fn3,'w')
    title = string.join(ordered_array_header_list,'\t')
    data3.write(title+'\n')
    for affygene in ordered_gene_val_db:
        try: affygene_number = convert_affygene_to_numbers[affygene]
        except KeyError: print len(convert_affygene_to_numbers), len(ordered_gene_val_db); kill
        exp_values = ordered_gene_val_db[affygene]; exp_values.reverse(); exp_values.append(affygene_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values +'\n'
        data3.write(exp_values)
    data3.close()

    exportMiDASArrayNames(array_group_list,array_group_db,dataset_name,'new')
    
    coversionfile = 'AltResults/MIDAS/probeset-conversion-'+dataset_name[0:-1]+'.txt'
    fn5=filepath(coversionfile)
    data5 = open(fn5,'w')
    title = 'probeset\tprobeset_number\n'; data5.write(title)
    for probeset in convert_probesets_to_numbers: ###contains the correct order for each group
        probeset_number = convert_probesets_to_numbers[probeset]
        values = probeset+'\t'+probeset_number+'\n'
        data5.write(values)
    data5.close()

    """
    ### This code is obsolete... used before AltAnalyze could connect to APT directly.
    commands = 'AltResults/MIDAS/commands-'+dataset_name[0:-1]+'.txt'
    data = export.createExportFile(commands,'AltResults/MIDAS')
    path = filepath('AltResults/MIDAS'); path = string.replace(path,'\\','/'); path = 'cd '+path+'\n\n'
    metafile = 'meta-'+dataset_name[0:-1]+'.txt'
    junction_exp_file = array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    gene_exp_file = 'gene-exp-'+dataset_name[0:-1]+'.txt'
    celfiles = 'celfiles-'+dataset_name[0:-1]+'.txt'
    command_line = 'apt-midas -c '+celfiles+' -g '+gene_exp_file+' -e '+junction_exp_file+' -m '+metafile+' -o '+dataset_name[0:-1]+'-output'
    data.write(path); data.write(command_line); data.close()
    """

    status = runMiDAS(apt_dir,array_type,dataset_name,array_group_list,array_group_db)    
    return status
Esempio n. 4
0
def reorder(data,data_headers,array_order,comp_group_list,probeset_db,include_raw_data,array_type,norm,fl,logvalues=True,blanksPresent=False):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple                   
    expbuilder_value_db = {}; group_name_db = {}; summary_filtering_stats = {}; pval_summary_db= {}
    replicates = 'yes'
    
    stat_result_names = ['avg-','log_fold-','fold-','rawp-','adjp-']
    group_summary_result_names = ['avg-']
    
    ### Define expression variables
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = math.log(fl.GeneExpThreshold(),2)
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception: gene_rpkm_threshold = 0
    try: FDR_statistic = fl.FDRStatistic()
    except Exception: FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog=True
    if blanksPresent:
        calculateAsNonLog=False
    
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = '' #not needed if not altsplice data
        data_headers2 = {} #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError: print row_id,data[row_id],len(data[row_id]),y,len(array_order),array_order;kill
                if logvalues==False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2,new_item)
            except TypeError: new_item = ''  #this is for a spacer added in the above function
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group]= [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1] 
            data_list2 = grouped_ordered_array_list[group2] #baseline expression
            if blanksPresent: ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try: avg1 = statistics.avg(data_list1)
            except Exception: avg1 = ''
            try: avg2 = statistics.avg(data_list2)
            except Exception: avg2=''
            try:
                if (logvalues == False and array_type != 'RNASeq') or (logvalues==False and calculateAsNonLog):
                    fold = avg1/avg2
                    log_fold = math.log(fold,2)
                    if fold<1: fold = -1.0/fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold) 
            except Exception:
                log_fold=''; fold=''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1,data_list2,probability_statistic)
            except Exception: p = 1; sg = 1; N1=0; N2=0
            comp = group1,group2
            if array_type == 'RNASeq': ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try: avg1 = math.pow(2,avg1)-adj; avg2 = math.pow(2,avg2)-adj
                    except Exception: avg1=''; avg2=''
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                    #if row_id=='ENSG00000085514':
                    #if fold=='Insufficient Expression':
                    #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                    #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold!=0 and calculateAsNonLog: ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1<gene_rpkm_threshold and a2<gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold,fold,p)
                stat_results[comp] = groups_name,gs,group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1,data_list2) ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1,data_list2) ### Assuming unequal variance
            except Exception:
                null=[]; replicates = 'no' ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name,[avg1]
            group_summary_results[group2] = group2_name,[avg2]

        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []; avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent: ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list)>0: grouped_exp_data.append(data_list)
            try: avg = statistics.avg(data_list); avg_exp_data.append(avg)
            except Exception:
                avg = ''
                #print row_id, group, data_list;kill
        try: avg_exp_data.sort(); max_fold = avg_exp_data[-1]-avg_exp_data[0]
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold,0,ftestp)
        summary_filtering_stats[row_id] = gs
        
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry,stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group,grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort() #now the list is sorted by group number
        
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes': ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2,value)-adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1] #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number: #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]; gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('') 
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id])-1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id])-2)
                    pval_summary_db[(row_id,comp)] = gs

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []; data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group,data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers,'\n',array_order,'\n',comp_group_list,'\n'; kill_program
    
    for entry in data_headers3:
        x = 0 #indicates the times through a loop
        y = 0 #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes': ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1 #increment the loop index

        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)       
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0]);group2 = int(comp[1])
        comp = str(comp[0]),str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers: temp_raw.append(g2_name+':'+header)
        for header in g1_headers: temp_raw.append(g1_name+':'+header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round=0
    for info in comp_group_list:
        compid = int(info[0]),int(info[1]); pval_db={}
        for (rowid,comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid,comp)]
                pval_db[rowid] = gs

        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db,probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n   ... using unmoderated unpaired test instead!'
                null=[] ### Occurs when not enough replicates
            round+=1
            
        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy; import qvalue; pvals = []; keys = []
            for key in pval_db: pvals.append(pval_db[key].Pval()); keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)): pval_db[keys[i]].SetAdjP(pvals[i])
            
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP() ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval() ### Replace the non-moderated with a moderated p-value
                
    pval_summary_db=[]            
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers