def replacePearsonPvalueWithZscore():
    """Rewrite the global tissue_comparison_scores in place, replacing each
    (rho, p-value, sample) tuple with [rho, z, sample] where z standardizes
    rho against the mean/SD of all rho values across the whole dataset."""
    rho_lists_by_sample = {}
    ### Seed one list per sample, using only the first tissue's entries
    for tissue in tissue_comparison_scores:
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            rho_lists_by_sample[sample] = []
        break
    ### Collect every rho observed for each sample across all tissues
    for tissue in tissue_comparison_scores:
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            rho_lists_by_sample[sample].append(rho)
    ### Per-sample mean/SD (computed but not used for the z-score below,
    ### which is taken relative to the global background instead)
    sample_stats = {}
    all_rhos = []
    for sample in rho_lists_by_sample:
        sample_rhos = rho_lists_by_sample[sample]
        all_rhos += sample_rhos
        sample_stats[sample] = statistics.avg(sample_rhos), statistics.stdev(sample_rhos)
    background_mean = statistics.avg(all_rhos)
    background_sd = statistics.stdev(all_rhos)
    ### Replace the p-value slot with the globally standardized rho
    for tissue in tissue_comparison_scores:
        rescored = []
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            z = (rho - background_mean) / background_sd ### Instead of doing this for the sample background, do it relative to all analyzed samples
            rescored.append([rho, z, sample])
        tissue_comparison_scores[tissue] = rescored
def reorderArraysOnly(filtered_exp_db,filetype,counts): 
    """Reorder each probeset's expression (or dabg p-value) values into group order,
    write per-comparison and full-dataset export lines, and record probesets that
    pass the dabg/expression thresholds in the module-level summary dictionaries.

    filtered_exp_db: {probeset: [values in original column order]}
    filetype: 'expression' or 'dabg' - selects which summary/export is produced
    counts: 'yes'/'no' - when 'no' and data is non-log, values are log transformed

    Relies on module globals: expr_group_list, comp_group_list, comparision_export_db,
    fulldataset_export_object, exp_analysis_type, exp_data_format, dabg_p_threshold,
    expression_threshold, dabg_summary, expression_summary.
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    ###expr_group_list gives the final level order sorted, followed by the original index order as a tuple
    for probeset in filtered_exp_db:
        grouped_ordered_array_list = {}; group_list = []
        for x in expr_group_list:
            y = x[1]; group = x[2] ### this is the new first index
            ### for example y = 5, therefore the filtered_exp_db[probeset][5] entry is now the first
            try:
                try: new_item = filtered_exp_db[probeset][y]
                except TypeError: print y,x,expr_group_list; kill
            except IndexError: print probeset,y,x,expr_group_list,'\n',filtered_exp_db[probeset];kill
            ###Used for comparision analysis
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
        ### For the exon-level expression data, export the group pair data for all pairwise comparisons to different comp files
        ###*******Include a database with the raw values saved for permuteAltAnalyze*******
        for info in comp_group_list:
            group1 = int(info[0]); group2 = int(info[1]); comp = str(info[0]),str(info[1])
            g1_data = grouped_ordered_array_list[group1]
            g2_data = grouped_ordered_array_list[group2]
            #print probeset, group1, group2, g1_data, g2_data, info;kill
            data = comparision_export_db[comp]
            values = [probeset]+g2_data+g1_data; values = string.join(values,'\t')+'\n' ###groups are reversed since so are the labels
            #raw_data_comps[probeset,comp] = temp_raw
            data.write(values)
        ### Export all values grouped from the array
        for group in grouped_ordered_array_list: group_list.append(group)
        group_list.sort(); combined_value_list=[]; avg_values=[]
        for group in group_list:
            g_data = grouped_ordered_array_list[group]
            if exp_analysis_type == 'expression':
                try: avg_gdata = statistics.avg(g_data); avg_values.append(avg_gdata)
                except Exception:
                    print g_data
                    print avg_values
                    kill
            combined_value_list+=g_data
        if exp_data_format == 'non-log' and counts == 'no':
            try: combined_value_list = logTransform(combined_value_list)
            except Exception:
                print probeset, combined_value_list,comp_group_list,expr_group_list
                print filtered_exp_db[probeset]; kill
        if filetype == 'expression':
            ### Export the expression values for all samples grouped (if meeting the above thresholds)
            values = string.join([probeset]+combined_value_list,'\t')+'\n'
            fulldataset_export_object.write(values) ### Don't need this for dabg data
        if exp_analysis_type == 'expression': avg_values.sort() ### Sort to get the lowest dabg and largest average expression
        if filetype == 'dabg':
            ### avg_values[0] is the smallest group mean after the sort above
            if avg_values[0]<=dabg_p_threshold: dabg_summary[probeset]=[] ### store probeset if the minimum p<user-threshold
        else:
            #if 'ENSMUSG00000018263:' in probeset: print probeset,[avg_values[-1],expression_threshold]
            ### avg_values[-1] is the largest group mean
            if avg_values[-1]>=expression_threshold: expression_summary[probeset]=[] ### store probeset if the minimum p<user-threshold
def plotFeatureBoxPlots(qc_db,dataset_name,feature_type): pylab.figure() pylab.xlabel('Biological Sample Names') pylab.ylabel('Read Counts - Log2') pylab.title('Expression BoxPlots for %ss - %s' % (feature_type,dataset_name)) #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35) pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35) #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend #pylab.axis(axes) boxplots=[] samples=[] sample_sorted_list=[] for sample_name in qc_db: try: qc = qc_db[sample_name][feature_type] except Exception: print 'No junction data found for at least one sample:',sample_name; forceExit sample_sorted_list.append([statistics.avg(qc),statistics.stdev(qc),sample_name]) sample_sorted_list.sort() sample_sorted_list.reverse() filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name,feature_type) export_obj = export.ExportFile(root_dir + filename[:-4]+'.txt') export_obj.write('SampleID\tAverage Expression\n') firstEntry=True for (mean,stdev,sample_name) in sample_sorted_list: ls=[]; x_ls=[]; y_ls=[] qc = qc_db[sample_name][feature_type] boxplots.append(qc) samples.append(sample_name) export_obj.write(sample_name+'\t'+str(mean)+'\n') if firstEntry: threshold=mean-2*stdev firstEntry=False else: if mean<threshold: print sample_name,'expression is considered very low (2 standard deviations away from the max).' 
pylab.boxplot(boxplots, notch=0, whis=1.5, positions=None, widths=None, patch_artist=False) #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False) xtickNames = pylab.setp(pylab.gca(), xticklabels=samples) pylab.setp(xtickNames, rotation=90, fontsize=10) export_obj.close() #print 'Exporting:',filename pylab.savefig(root_dir + filename) filename = filename[:-3]+'png' pylab.savefig(root_dir + filename) #,dpi=200 graphic_link.append(['QC - BoxPlot-'+feature_type+' Expression',root_dir+filename]) try: import gc pylab.figure.clf() pylab.close() gc.collect() except Exception: pass
def studAvg():
    """Prompt for a student name and print that student's average grade,
    computed by the module-level avg() over studentdict; if the name is not
    a key of studentdict, report the miss and dump the whole dictionary."""
    name = input('Student Name:')
    try:
        grade_average = avg(studentdict[name])
    except KeyError:
        print('Student', name, 'not found')
        print(studentdict)
    else:
        print(name, 'has an average grade of', grade_average)
def combine_profiles(profile_list):
    """Average matching keys position-by-position across a list of profile
    dictionaries ({key: [values]}) and return {key: [averaged values]}.

    Keys and per-key lengths are taken from the FIRST dictionary only; a
    dictionary missing a key simply contributes nothing at each position.
    """
    entry_lengths = {}
    for db in profile_list:
        for key in db:
            entry_lengths[key] = len(db[key])
        break  ### sizes/keys come from the first profile only
    combined = {}
    for key in entry_lengths:
        averaged = []
        for position in range(entry_lengths[key]):
            observed = [db[key][position] for db in profile_list if key in db]
            averaged.append(statistics.avg(observed))
        combined[key] = averaged
    return combined
def parse_input_data(filename,data_type):
    """Parse an expression/p-value/export input file and either (a) record a
    per-group pass/fail flag (k) for each probeset in the module-level
    expression_status_db / pvalue_status_db, or (b) for data_type == 'export',
    re-write filtered probeset lines to a new AltExpression export file.

    filename: tab-delimited file; row 1 holds 'group:sample' column headers.
    data_type: 'expression', 'p-value' or 'export'.
    Returns output_file (the export filename; [] when not exporting).

    Relies on module globals: array_type, species, root_dir, altanalzye_input,
    original_exp_threshold, filter_method, p, exp_data_format,
    normalization_method and the various *_threshold values, plus
    array_group_db/array_group_name_db built from the header row.
    """
    fn=filepath(filename); first_line = 1; array_group_name_db = {}; z=0; array_group_db = {}; output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename) ### e.g., expression or counts
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line); t = string.split(data,'\t'); probeset = t[0]; z+=1
        if first_line == 1:
            first_line = 0 #makes this value null for the next loop of actual array data
            ###Below ocucrs if the data is raw opposed to precomputed
            if data_type == 'export':
                ### Build the AltExpression output path according to the platform
                if array_type == 'exon': folder = 'ExonArray'+'/'+species + '/'
                elif array_type == 'gene': folder = 'GeneArray'+'/'+species + '/'
                elif array_type == 'junction': folder = 'JunctionArray'+'/'+species + '/'
                elif array_type == 'RNASeq': folder = 'RNASeq'+'/'+species + '/'
                else: folder = array_type + '/'
                parent_path = root_dir+'AltExpression/'+folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) +'_'+ filter_method+'.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(int(100*p)) +'_'+ filter_method+'.txt'
                output_file_dir = parent_path+output_file
                print "...Exporting",output_file_dir
                export_data = export.createExportFile(output_file_dir,root_dir+'AltExpression/'+folder)
                fn=filepath(output_file_dir); export_data = open(fn,'w'); export_data.write(line)
            if ':' in t[1]:
                ### Header entries look like 'group:sample'; map each group to its column indexes
                array_group_list = []; x=0 ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group,array_name = string.split(entry,':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name] ### below only occurs with a new group addition
                        array_group_list.append(array_group) #use this to generate comparisons in the below linked function
                    x += 1
                #print '##### array_group_list',array_group_list
        elif len(probeset)>0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array={}; array_index_list = [] ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try: exp_val = float(t[array_index+1])
                    except IndexError: print t, z,'\n',array_index,'\n',group, probeset;kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)
                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                ### k=1 means the group mean passes the relevant RPKM threshold
                                if ':I' in probeset: k=1 ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat>=gene_rpkm_threshold: k=1
                                    else: k=0
                                elif avg_stat>=exon_rpkm_threshold: k=1
                                elif '-' in probeset: k=1 ### Don't consider RPKM for junctions, just counts
                                else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else:
                                ### Otherwise, we are looking at count data
                                if '-' in probeset: ### junction meeting minimum read-count number
                                    if avg_stat>=junction_exp_threshold: k=1 ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else: k=0
                                elif ':' not in probeset:
                                    if avg_stat>=gene_exp_threshold: k=1
                                    else: k=0
                                else: ### exon or intron meeting minimum read-count number
                                    if avg_stat>=exon_exp_threshold: k=1
                                    else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat>=nonlog_exp_threshold: k=1
                            else: k=0
                    elif avg_stat>=log_expression_threshold: k=1
                    else: k=0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression': ### Treat as dabp p-value
                        try: pvalue_status_db[probeset].append(k)
                        except KeyError: pvalue_status_db[probeset] = [k]
                    else:
                        try: expression_status_db[probeset].append(k)
                        except KeyError: expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat<=p: k=1
                    else: k=0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try: pvalue_status_db[probeset].append(k)
                    except KeyError: pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from logstatus to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]; exp_values_log2=[]
                for exp_val in exp_values:
                    exp_values_log2.append(str(math.log(float(exp_val),2))) ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset]+exp_values_log2,'\t')+'\n'
            ### Only write probesets present in export_db (the filtered set)
            try: null = export_db[probeset]; export_data.write(line)
            except KeyError: null = [] ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
def performGroupNormalization(filename,export_dir,platform):
    """Re-write an expression file as per-group mean-centered log folds.

    For each gene row, samples are re-ordered by group (from the companion
    batch/groups file) and each value is replaced by its difference from the
    mean of its own group; the result is written to export_dir.

    filename: input expression file path.
    export_dir: output path; 'exp.' is swapped for 'batch.' to locate groups.
    platform: e.g. 'RNASeq' - forces log2 conversion of non-log data.
    """
    expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = string.replace(export_dir,'exp.','batch.')
    fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)
    for line in open(fn,'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0]=='#' and row_number==0: row_number = 0 ### skip leading comment lines
        elif row_number==0:
            ### Header row: translate each group's sample names into column indexes
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x), group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list+=group_samples
                except Exception:
                    ### NOTE(review): this compares sample_list against t[1:], which is the
                    ### same list, so 'missing' is always empty - likely meant to check
                    ### group_samples against the header; verify before relying on it
                    missing=[]
                    for x in sample_list:
                        if x not in t[1:]: missing.append(x)
                    print 'missing:',missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'; forceExit
            title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order)
            export_data.write(title)
            row_number=1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts
                try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            else:
                try: all_values = map(float,t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []
            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples
                except Exception: print len(values), sample_index_list;kill
                try: avg = statistics.avg(values)
                except Exception:
                    ### Fallback: drop non-numeric entries and retry; default to the
                    ### first value or 0 when averaging still fails
                    values2=[]
                    for v in values:
                        try: values2.append(float(v))
                        except Exception: pass
                    values = values2
                    try: avg = statistics.avg(values)
                    except Exception:
                        if len(values)>0: avg = values[0]
                        else: avg = 0
                try: log_folds = map(lambda x: (x-avg), values)
                except Exception:
                    ### Mixed numeric/blank rows: subtract where possible, blank otherwise
                    log_folds=[]
                    for x in values:
                        try: log_folds.append(x-avg)
                        except Exception: log_folds.append('')
                gene_log_folds+=log_folds
            gene_log_folds = map(lambda x: str(x),gene_log_folds)
            export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n')
    export_data.close()
def importTableEntries(filename,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType='plot'):
    """Import exon/junction-level expression (or splicing-score) values for the
    genes in ensembl_exon_db and either plot per-gene exon expression
    (analysisType='plot') or export per-gene matrices and cluster them as
    heatmaps (analysisType containing 'heatmap').

    filename: exp./counts./AltResults input file.
    filter_db: feature IDs to retain; gene_db: {gene: symbol} for labeling.
    root_dir: output directory for plots/heatmap text files.
    transpose/display/showIntrons: plotting options ('yes' enables introns).

    Relies on module globals: platform, colors (declared global here), pylab,
    numpy, statistics, and several helper functions (assignGroupColors,
    reformatAltHeaders, plotExonExpression, ...).
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db={} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list={} ### ordered from first to last exon region
    uid_gene_db={} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes={}
    ### Select which feature IDs (exons vs. exons+introns) to consider per gene
    for gene in ensembl_exon_db:
        uid_list[gene]=[]
        for (index,ed,id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
                uid_gene_db[id]=gene
    if '_vs_' in filename: ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir+'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0]+'ExpressionInput/groups.'+findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir,'exp.','')
    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db={}
    stdev_gene_matrix_db={}
    row_header_gene={}
    ids={}
    x=0
    if 'heatmap' in analysisType:
        average_samples = False ### heatmaps show individual samples, not group means
    if '/' in filename: dataset_name = string.split(filename,'/')[-1][:-4]
    else: dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():
        data = line.strip()
        t = string.split(data,'\t')
        if data[0]=='#': x=0
        elif x==0:
            ### Header row: resolve sample columns and (optionally) group indexes
            if platform == 'RNASeq': removeExtension=True
            else: removeExtension=False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:],'',removeExtension=removeExtension)
            x=1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults=True
                    groups_dir = string.split(filename, 'AltResults')[0]+'ExpressionInput/groups.'+findFilename(filename)
                    if verifyFile(groups_dir)==False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename,'exp.','groups.')
                    else:
                        groups_dir = string.replace(filename,'counts.','groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir,'stats.','groups.')
                groups_dir = string.replace(groups_dir,'-steady-state.txt','.txt') ### groups is for the non-steady-state file
                try: group_index_db=collections.OrderedDict()
                except Exception:
                    import ordereddict ### pre-2.7 fallback
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed=False
                    try: sample_index = new_column_header.index(item); proceed=True
                    except Exception:
                        ### Retry the lookup with common file extensions stripped
                        try:
                            item = string.replace(item,'.bed','')
                            item = string.replace(item,'.CEL','') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item,'.cel','')
                            item = string.replace(item,'.txt','')
                            item = string.replace(item,'.TXT','')
                            item = string.replace(item,'.TAB','')
                            item = string.replace(item,'.tab','')
                            sample_index = new_column_header.index(item)
                            proceed=True
                        except Exception: pass
                        #print [item]
                        #print column_header
                        #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try: group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start-1:]
            row_number=1
        else:
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start-1]
                if ';' in uid:
                    uid = string.split(uid,';')[0]
                ids[uid]=None
                ens_geneID = string.split(uid,':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    try:
                        if len(biotypes)==1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float,t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene]=[values]
                        else:
                            if platform == 'RNASeq' and altresults==False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x),2), t[start:])
                            else:
                                values = map(float,t[start:])
                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x-mean, values)
                            ### Collapse samples to per-group mean and standard error
                            avg_ls=[]; std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try: st_err = statistics.stdev(group_values)/math.sqrt(len(group_values))
                                except Exception: ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene]=[avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene]=[std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x+=1
    global colors
    original_column_header = list(column_header)
    if len(uid_list)==0:
        print 'No genes found in the exon expression database'; forceNoExonExpError
    successfully_output_genes=0
    display_count=0 ### Only display a certain number of genes
    for last_gene in uid_list: pass ### grabs an arbitrary "last" key to decide when to display
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list=[]
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        ### Re-order rows to match the annotated exon-region order for this gene
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception: print uid, i,len(matrix);sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass
        if len(new_matrix)>0:
            matrix = new_matrix
        if len(new_header)>0:
            row_header = new_header
        if 'heatmap' in analysisType:
            ### Export this gene's matrix to a text file and cluster it
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header,'\t')+'\n')
            ki=0
            if len(annotation_list)>0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = None
            else: ### Just junctions analyzed here... no sorted junctions yet
                ki=0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = 'average'
            export_obj.close()
            import clustering
            column_metric = 'euclidean'; column_method = 'hopach'
            color_gradient = 'red_black_sky'; transpose = False; graphic_links=[]
            if ki>100: transpose = True ### too many rows to label - rotate the heatmap
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis = False, contrast = 2.5)
            successfully_output_genes+=1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time()-start_time,1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### coverts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)
            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list)>10:
                    #if display_count==5: display=False
                    display=False
                if display_count==0: ### store a consistent color palete to use
                    colors=[]
                    """ k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1.*i/len(row_header))) # color will now be an RGBA tuple
                plotExonExpression(fig,matrix,stdev_matrix,row_header,column_header,dataset_name,annotation_list,gene_symbol,root_dir,display=display)
                successfully_output_genes+=1
                display_count+=1
            except Exception:
                ### NOTE(review): the sys.exit() makes the following print unreachable
                print traceback.format_exc();sys.exit()
                print gene_symbol, 'failed'
        try: pylab.close()
        except Exception: pass
    if successfully_output_genes>0:
        #try: print 'Gene graphs exported to ExonPlots...'
        #except Exception: pass
        pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'; forceNoExonExpError
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception: pass
def reorder(data,data_headers,array_order,comp_group_list,probeset_db,include_raw_data,array_type,norm,fl,logvalues=True,blanksPresent=False):
    """Reorder each row's values into group order and compute per-comparison
    statistics (group means, folds, p-values, adjusted p-values).

    data: {row_id: [values in original column order]}; data_headers: column names.
    array_order: tuples whose [1]=original index, [2]=group number, [3]=group name.
    comp_group_list: pairwise group comparisons, e.g. [(1,2),(3,4)].
    probeset_db: optional {row_id: [gene,...]} annotation; include_raw_data: 'yes'/'no'.
    array_type/norm/fl: platform, normalization ('RPKM' handled specially) and a
    fileline object supplying statistical options/thresholds.
    logvalues: False when input is non-log; blanksPresent allows empty cells.

    Returns (expbuilder_value_db, array_fold_headers, summary_filtering_stats,
    raw_data_comp_headers) - the reordered values+stats per row, matching column
    headers, per-row ANOVA/max-fold stats, and per-comparison raw-data headers.
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}; group_name_db = {}; summary_filtering_stats = {}; pval_summary_db= {}
    replicates = 'yes'
    stat_result_names = ['avg-','log_fold-','fold-','rawp-','adjp-']
    group_summary_result_names = ['avg-']
    ### Define expression variables
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = math.log(fl.GeneExpThreshold(),2)
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception: gene_rpkm_threshold = 0
    try: FDR_statistic = fl.FDRStatistic()
    except Exception: FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog=True
    if blanksPresent: calculateAsNonLog=False
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = '' #not needed if not altsplice data
        data_headers2 = {} #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1] #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError: print row_id,data[row_id],len(data[row_id]),y,len(array_order),array_order;kill
                if logvalues==False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2,new_item)
            except TypeError: new_item = '' #this is for a spacer added in the above function
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group]= [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2] #baseline expression
            if blanksPresent: ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try: avg1 = statistics.avg(data_list1)
            except Exception: avg1 = ''
            try: avg2 = statistics.avg(data_list2)
            except Exception: avg2=''
            try:
                ### Non-log data: ratio fold; log data: difference then convert
                if (logvalues == False and array_type != 'RNASeq') or (logvalues==False and calculateAsNonLog):
                    fold = avg1/avg2
                    log_fold = math.log(fold,2)
                    if fold<1: fold = -1.0/fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold)
            except Exception: log_fold=''; fold=''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1,data_list2,probability_statistic)
            except Exception: p = 1; sg = 1; N1=0; N2=0
            comp = group1,group2
            if array_type == 'RNASeq': ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try: avg1 = math.pow(2,avg1)-adj; avg2 = math.pow(2,avg2)-adj
                    except Exception: avg1=''; avg2=''
                ### Mark comparisons where neither group passes the expression threshold
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                #if row_id=='ENSG00000085514':
                #if fold=='Insufficient Expression':
                #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold!=0 and calculateAsNonLog: ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1<gene_rpkm_threshold and a2<gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold,fold,p)
                stat_results[comp] = groups_name,gs,group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1,data_list2) ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1,data_list2) ### Assuming unequal variance
            except Exception:
                null=[]; replicates = 'no' ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name,[avg1]
            group_summary_results[group2] = group2_name,[avg2]
        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []; avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent: ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list)>0:
                grouped_exp_data.append(data_list)
                try: avg = statistics.avg(data_list); avg_exp_data.append(avg)
                except Exception:
                    avg = ''
                    #print row_id, group, data_list;kill
        try: avg_exp_data.sort(); max_fold = avg_exp_data[-1]-avg_exp_data[0]
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold,0,ftestp)
        summary_filtering_stats[row_id] = gs
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry,stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group,grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort() #now the list is sorted by group number
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes': ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2,value)-adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1] #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:
                    #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]; gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id])-1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id])-2)
                    pval_summary_db[(row_id,comp)] = gs
    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []; data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group,data_headers2[group] #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        ### data_headers2 only exists if the loop above ran at least once
        print data_headers,'\n',array_order,'\n',comp_group_list,'\n'; kill_program
    for entry in data_headers3:
        x = 0 #indicates the times through a loop
        y = 0 #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes': ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1 #increment the loop index
        for info in stat_result_list:
            if info[0][0] == group_number:
                #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)
    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0]);group2 = int(comp[1])
        comp = str(comp[0]),str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers: temp_raw.append(g2_name+':'+header)
        for header in g1_headers: temp_raw.append(g1_name+':'+header)
        raw_data_comp_headers[comp] = temp_raw
    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round=0 ### NOTE(review): shadows the builtin round(); used only as a first-pass flag here
    for info in comp_group_list:
        compid = int(info[0]),int(info[1]); pval_db={}
        for (rowid,comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid,comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db,probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n ... using unmoderated unpaired test instead!'
                null=[] ### Occurs when not enough replicates
            round+=1
        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy; import qvalue; pvals = []; keys = []
            for key in pval_db:
                pvals.append(pval_db[key].Pval()); keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)):
                pval_db[keys[i]].SetAdjP(pvals[i])
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP() ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval() ### Replace the non-moderated with a moderated p-value
    pval_summary_db=[]
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
def nonLogAvg(data_list):
    """Convert log2 expression values back to non-log space (2**x - 1)
    and return their average (via the project statistics module)."""
    unlogged_values = [math.pow(2, log_value) - 1 for log_value in data_list]
    return statistics.avg(unlogged_values)
def statisticallyFilterFile(input_file,output_file,threshold): if 'exp.' in input_file: counts_file = string.replace(input_file,'exp.','geneCount.') else: counts_file = input_file[:-4]+'-geneCount.txt' sample_expressed_genes={} header=True junction_max=[] count_sum_array=[] for line in open(input_file,'rU').xreadlines(): data = cleanUpLine(line) if '.csv' in input_file: t = string.split(data,',') else: t = string.split(data,'\t') if header: samples = t[1:] header=False count_sum_array=[0]*len(samples) else: try: values = map(float,t[1:]) except Exception: if 'NA' in t[1:]: tn = [0 if x=='NA' else x for x in t[1:]] ### Replace NAs values = map(float,tn) else: tn = [0 if x=='' else x for x in t[1:]] ### Replace NAs values = map(float,tn) binarized_values = [] for v in values: if v>threshold: binarized_values.append(1) else: binarized_values.append(0) count_sum_array = [sum(value) for value in zip(*[count_sum_array,binarized_values])] index=0 distribution=[] count_sum_array_db={} samples_to_retain =[] samples_to_exclude = [] for sample in samples: count_sum_array_db[sample] = count_sum_array[index] distribution.append(count_sum_array[index]) index+=1 import statistics distribution.sort() avg = int(statistics.avg(distribution)) stdev = int(statistics.stdev(distribution)) min_exp = int(min(distribution)) cutoff = avg - (stdev*2) dev = 2 print 'The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (threshold,avg,stdev,min_exp) if cutoff<0: if (stdev-avg)>0: cutoff = avg - (stdev/2); dev = 0.5 else: cutoff = avg - stdev; dev = 1 if min_exp>cutoff: cutoff = avg - stdev; dev = 1 import export eo = export.ExportFile(counts_file) eo.write('Sample\tGenes Expressed(threshold:'+str(threshold)+')\n') for sample in samples: ### keep the original order if count_sum_array_db[sample]>cutoff: samples_to_retain.append(sample) else: samples_to_exclude.append(sample) eo.write(sample+'\t'+str(count_sum_array_db[sample])+'\n') eo.close() print 
len(samples_to_exclude), 'samples removed (# exp. genes, < %d SD away) (%s)' % (dev,string.join(samples_to_exclude,', ')) print 'Exporting the filtered expression file to:' print output_file filterFile(input_file,output_file,samples_to_retain)
def generateConstitutiveExpression(exp_dbase,constitutive_gene_db,probeset_gene_db,pre_filtered_db,array_names,filename):
    """Generate Steady-State expression values for each gene for analysis in the main module of this package.

    Three passes select, per gene, the probesets used for the steady-state
    (gene-level) estimate; the per-array averages of those probesets are then
    written to a '-steady-state.txt' export.

    exp_dbase            -- probeset -> list of expression values (one per array)
    constitutive_gene_db -- gene -> constitutive probeset list
    probeset_gene_db     -- gene -> all (exon-aligning) probesets for the gene
    pre_filtered_db      -- probesets with detected expression
    array_names          -- ordered sample/array names for the export header
    filename             -- basis for the steady-state export path

    NOTE(review): relies on module-level globals (average_all_probesets,
    avg_all_probes_for_steady_state, excludeLowExpressionExons, array_type,
    species, normalize_feature_exp, UserOptions, full_dataset_export_dir) and
    MUTATES the global average_all_probesets in place.
    """
    steady_state_db={}; k=0; l=0
    remove_nonexpressed_genes = 'no' ### By default set to 'no'
    ###1st Pass: Identify probesets for steady-state calculation
    for gene in probeset_gene_db:
        if avg_all_probes_for_steady_state == 'yes':
            average_all_probesets[gene] = probeset_gene_db[gene] ### These are all exon aligning (not intron) probesets
        else:
            if gene not in constitutive_gene_db:
                average_all_probesets[gene] = probeset_gene_db[gene]
            else:
                constitutive_probeset_list = constitutive_gene_db[gene]
                constitutive_filtered=[]
                ###Added this extra code to eliminate constitutive probesets not in exp_dbase (gene level filters are more efficient when dealing with this many probesets)
                # NOTE(review): despite the comment above, the membership test below is
                # against probeset_gene_db[gene], not exp_dbase — confirm intent
                for probeset in constitutive_probeset_list:
                    if probeset in probeset_gene_db[gene]: constitutive_filtered.append(probeset)
                if len(constitutive_filtered)>0:
                    average_all_probesets[gene] = constitutive_filtered
                else:
                    average_all_probesets[gene] = probeset_gene_db[gene]
    ###2nd Pass: Remove probesets that have no detected expression (keep all if none are expressed)
    if excludeLowExpressionExons:
        non_expressed_genes={} ### keep track of these for internal QC
        for gene in average_all_probesets:
            gene_probe_list=[]; x = 0
            for probeset in average_all_probesets[gene]:
                if probeset in pre_filtered_db:
                    gene_probe_list.append(probeset); x += 1
            ###If no constitutive and there are probes with detected expression: replace entry
            if x >0:
                average_all_probesets[gene] = gene_probe_list
            elif remove_nonexpressed_genes == 'yes':
                non_expressed_genes[gene]=[]
    if remove_nonexpressed_genes == 'yes':
        for gene in non_expressed_genes:
            del average_all_probesets[gene]
    ###3rd Pass: Make sure the probesets are present in the input set (this is not typical unless a user is loading a pre-filtered probeset expression dataset)
    for gene in average_all_probesets:
        v=0
        for probeset in average_all_probesets[gene]:
            try: null = exp_dbase[probeset]; v+=1
            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
        if v==0: ###Therefore, no probesets were found that were previously predicted to be best constitutive
            try: average_all_probesets[gene] = probeset_gene_db[gene] ###expand the average_all_probesets to include any exon linked to the gene
            except KeyError: print gene, probeset, len(probeset_gene_db), len(average_all_probesets);kill
    # Grab the array count from any one probeset's value list (all should match)
    for probeset in exp_dbase:
        array_count = len(exp_dbase[probeset]); break
    try: null = array_count
    except Exception:
        # exp_dbase was empty; `forceError` is an undefined name — presumably an
        # intentional hard-stop via NameError (TODO confirm project convention)
        print 'WARNING...CRITICAL ERROR. Make sure the correct array type is selected and that all input expression files are indeed present (array_count ERROR).'; forceError
    ###Calculate avg expression for each array for each probeset (using constitutive values)
    gene_count_db={}
    for gene in average_all_probesets:
        x = 0 ###For each array, average all probeset expression values
        gene_sum=0
        probeset_list = average_all_probesets[gene]#; k+= len(average_all_probesets[gene])
        if array_type != 'RNASeq': ### Just retain the list of probesets for RNA-seq
            while x < array_count:
                exp_list=[] ### average all exp values for constituitive probesets for each array
                for probeset in probeset_list:
                    try:
                        exp_val = exp_dbase[probeset][x]
                        exp_list.append(exp_val)
                    except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                try:
                    # NOTE(review): this retry loop repeats the identical lookup, so
                    # exp_list stays empty and statistics.avg([]) raises
                    # ZeroDivisionError (caught below) — looks redundant; confirm
                    if len(exp_list)==0:
                        for probeset in probeset_list:
                            try:
                                exp_val = exp_dbase[probeset][x]
                                exp_list.append(exp_val)
                            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                    avg_const_exp=statistics.avg(exp_list) ### Add only one avg-expression value for each array, this loop
                    try: steady_state_db[gene].append(avg_const_exp)
                    except KeyError: steady_state_db[gene] = [avg_const_exp]
                except ZeroDivisionError: null=[] ### Occurs when processing a truncated dataset (for testing usually) - no values for the gene should be included
                x += 1
    l = len(probeset_gene_db) - len(steady_state_db)
    steady_state_export = filename[0:-4]+'-steady-state.txt'
    steady_state_export = string.replace(steady_state_export,'counts.','exp.')
    fn=filepath(steady_state_export); data = open(fn,'w'); title = 'Gene_ID'
    if array_type == 'RNASeq':
        import RNASeq
        steady_state_db, pre_filtered_db = RNASeq.calculateGeneLevelStatistics(steady_state_export,species,average_all_probesets,normalize_feature_exp,array_names,UserOptions,excludeLowExp=excludeLowExpressionExons)
        ### This "pre_filtered_db" replaces the above since the RNASeq module performs the exon and junction-level filtering, not ExonArray (RPKM and count based)
        ### Use pre_filtered_db to exclude non-expressed features for multi-group alternative exon analysis
        removeNonExpressedProbesets(pre_filtered_db,full_dataset_export_dir)
        reload(RNASeq)
    # Write the tab-delimited steady-state matrix: header row, then one row per gene
    for array in array_names:
        title = title +'\t'+ array
    data.write(title+'\n')
    for gene in steady_state_db:
        ss_vals = gene
        for exp_val in steady_state_db[gene]:
            ss_vals = ss_vals +'\t'+ str(exp_val)
        data.write(ss_vals+'\n')
    data.close()
    # Drop large structures to free memory before returning
    exp_dbase={}; steady_state_db={}; pre_filtered_db ={}
    #print k, "probesets were not found in the expression file, that could be used for the constitutive expression calculation"
    #print l, "genes were also not included that did not have such expression data"
    print "Steady-state data exported to",steady_state_export
# As you import something, you can assign it a custom name using `as` print('Let\'s alias something as we import it!') print('importing e...') from math import e print('importing e as wahoo...') from math import e as wahoo print('T/F: e and wahoo are equal:', e == wahoo) print() # To summarize, the following three approaches all achieve the exact same thing print('approach 1') import statistics avg = statistics.mean print('The average of [1,2,3] is:', avg([1, 2, 3])) # or... # print('The average of [1,2,3] is:', statistics.mean([1, 2, 3])) print() print('approach 2') from statistics import mean avg = mean print('The average of [1,2,3] is:', avg([1, 2, 3])) # or... # print('The average of [1,2,3] is:', mean([1, 2, 3])) print() print('approach 3') from statistics import mean as avg print('The average of [1,2,3] is:', avg([1, 2, 3]))
def reorder(data, data_headers, array_order, comp_group_list, probeset_db, include_raw_data, array_type, norm, fl, logvalues=True):
    """Reorder each row of expression values by experimental group and compute
    per-comparison statistics (avg, log-fold, fold, raw p, adjusted p) plus a
    per-row max-fold/ANOVA summary.

    Returns (expbuilder_value_db, array_fold_headers, summary_filtering_stats,
    raw_data_comp_headers):
      expbuilder_value_db     -- row_id -> [optional raw values] + per-group avgs
                                 + per-comparison stats (adj-p filled in at the end)
      array_fold_headers      -- column headers matching expbuilder_value_db
      summary_filtering_stats -- row_id -> GroupStats(max_fold, 0, ANOVA p)
      raw_data_comp_headers   -- (group1,group2) string tuple -> raw-export headers

    NOTE(review): bare names `kill` / `kill_program` / `forceError` are undefined —
    presumably intentional hard-stops via NameError (confirm project convention).
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'
    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']
    ### Define expression variables (fall back to defaults if fl lacks the accessor)
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = fl.GeneExpThreshold()
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception: gene_rpkm_threshold = 0
    calculateAsNonLog = True
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError: new_item = ''  #this is for a spacer added in the above function
            # Accumulate values and headers per group
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group] = [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2]  #baseline expression
            avg1 = statistics.avg(data_list1)
            try: avg2 = statistics.avg(data_list2)
            except ValueError:
                print data_list2, row_id
                forceError
            # Fold/log-fold: ratio form for non-log inputs, difference form for log2 inputs
            if (logvalues == False and array_type != 'RNASeq') or (logvalues == False and calculateAsNonLog):
                fold = avg1 / avg2
                log_fold = math.log(fold, 2)
                if fold < 1: fold = -1.0 / fold
            else:
                log_fold = avg1 - avg2
                fold = statistics.log_fold_conversion(log_fold)
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2, probability_statistic)
            except Exception:
                # Statistic failed (e.g. too few values) - default to p=1
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':  ### Also non-log but treated differently
                if norm == 'RPKM': adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    avg1 = math.pow(2, avg1) - adj
                    avg2 = math.pow(2, avg2) - adj
                # Suppress folds when both group averages are below the expression floor
                if norm == 'RPKM':
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
            #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]
        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                print row_id, group, data_list
                kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]  # largest pairwise group difference
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort()  #now the list is sorted by group number
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1]  #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:
                    #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs
    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        # data_headers2 is only bound inside the row loop, so an empty `data`
        # raises UnboundLocalError here (caught below as a hard-stop)
        for group in data_headers2:
            data_tuple = group, data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program
    for entry in data_headers3:
        x = 0  #indicates the times through a loop
        y = 0  #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index
        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)
    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw
    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round = 0  # counts moderation attempts so the failure warning prints only once
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n ... using unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            round += 1
        statistics.adjustPermuteStats(pval_db)
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP()  ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval()  ### Replace the non-moderated with a moderated p-value
    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
def nonLogAvg(data_list):
    """Return the average of the values after undoing the log2 transform (2**x - 1)."""
    non_log_values = []
    for log_value in data_list:
        non_log_values.append(math.pow(2, log_value) - 1)
    return statistics.avg(non_log_values)