def getGeneNativeLFEProfiles(taxId, args):
    """Yield row 0 of each sampled profile for the given species.

    The series results are read with both numeric arguments set to 0 and
    converted with a group count of 0; the original code names row 0 of
    'profile-data' "nativeLFE".

    Args:
        taxId: species identifier passed to the series reader.
        args: options object; reads `computation_tag`, `shuffle_type` and
            `profile` (indexed 0..3).

    Yields:
        result['profile-data'][0] for every sampled result.
    """
    rawSeries = readSeriesResultsForSpecies_cached(
        (args.computation_tag,), taxId, 0, 0,
        shuffleType=args.shuffle_type)
    mfeProfiles = convertResultsToMFEProfiles(rawSeries, 0)
    sampledProfiles = sampleProfilesFixedIntervals(
        mfeProfiles,
        args.profile[3], args.profile[0], args.profile[1], args.profile[2])

    for sampled in sampledProfiles:
        yield sampled['profile-data'][0]
def countShuffledProfiles(taxId, profile, computationTag, shuffleType):
    """Count the shuffled profiles available for one species.

    Rows 1.. of every sampled 'profile-data' array are accumulated in a
    MeanProfile; the count at position 0 divided by `numShuffledGroups` is
    reported.

    NOTE(review): `numShuffledGroups` is not a parameter of this function; it
    is presumably a module-level variable - confirm it is defined before this
    function is called.

    Args:
        taxId: species identifier passed to the series reader.
        profile: profile specification, indexed 0..3.
        computationTag: series identifier (wrapped in a 1-tuple for the reader).
        shuffleType: forwarded to the reader as `shuffleType`.

    Returns:
        Tuple `(taxId, numShuffledSeqs)`.
    """
    shuffledAccumulator = MeanProfile(profileLength(profile))

    seriesResults = readSeriesResultsForSpeciesWithSequence(
        (computationTag,), taxId, numShuffledGroups, numShuffledGroups,
        shuffleType=shuffleType)
    mfeProfiles = convertResultsToMFEProfiles(seriesResults, numShuffledGroups)
    sampledProfiles = sampleProfilesFixedIntervals(
        mfeProfiles, profile[3], profile[0], profile[1], profile[2])

    for sampled in sampledProfiles:
        # Row 0 is skipped; only rows 1.. are counted.
        shuffledAccumulator.add(sampled["profile-data"][1:])

    print(shuffledAccumulator.counts())

    perGroupCount = shuffledAccumulator.counts()[0] / numShuffledGroups
    return (taxId, perGroupCount)
from mysql_rnafold import Sources
from process_series_data import (
    readSeriesResultsForSpeciesWithSequence,
    convertResultsToMFEProfiles,
    sampleProfilesFixedIntervals,
    profileLength,
    profileElements,
    MeanProfile,
    calcSampledGCcontent,
)

# Print the [150:600) slice of every CDS that has series results for taxid
# 85962 (sliding-window-40 v2 series, 20 shuffles), sampling every 10 positions.
_seqStart = 150
_seqEnd = 600
_seriesResults = readSeriesResultsForSpeciesWithSequence(
    (Sources.RNAfoldEnergy_SlidingWindow40_v2,), 85962, 20, 20)
_mfeProfiles = convertResultsToMFEProfiles(_seriesResults, 20)

for result in sampleProfilesFixedIntervals(_mfeProfiles, _seqStart, _seqEnd, 10):
    fullCDS = result["cds-seq"]
    seq = fullCDS[_seqStart:_seqEnd]
    print(seq)
def performPlots(self):
    """Compute and plot the mean LFE profiles of one species, per shuffle type.

    For every shuffle type in `args.shuffle_types` this method:
      * streams the per-CDS profile data and accumulates mean profiles for
        row 0 ("native"), rows 1.. ("shuffled"), their 25th/75th percentiles
        and the sampled GC content;
      * runs Wilcoxon signed-rank tests on the native-minus-shuffled deltas
        (per gene, per position, against the minimum-dLFE "transition peak"
        position and against the profile edge position);
      * optionally merges codonw metrics (`args.codonw`);
      * produces the plots and stores the resulting tables in an HDF5 file.

    Reads `self._args` and `self._taxId`. The names `pa`, `externalProperty`,
    `externalPropertyMedian` and `confWindowWidth` are not defined in this
    method and are expected to come from the enclosing module.

    Returns:
        Tuple `(taxid, (x1, x2, x3))` where x1..x3 are Counters of the first,
        second and third character of every processed CDS (for the last
        shuffle type processed).

    Raises:
        Exception: if fewer than 100 CDSs were processed for a shuffle type.
    """
    args = self._args
    taxid = self._taxId

    def _concatFrames(emptyTemplate, frames):
        # A single concat replaces repeated DataFrame.append calls, which copy
        # the whole frame every time and no longer exist in pandas >= 2.0.
        return pd.concat(frames) if frames else emptyTemplate

    def _numericCorr(frame, **kwargs):
        # Older pandas silently dropped non-numeric columns in corr(); newer
        # versions raise on them, so drop them explicitly.
        return frame.select_dtypes(include=[np.number]).corr(**kwargs)

    # ------------------------------------------------------------------------------------

    numShuffledGroups = args.num_shuffles
    shuffleTypes = args.shuffle_types
    print("*********** {} ***********".format(shuffleTypes))

    combinedData = {}

    for shuffleType in shuffleTypes:
        n = 0

        x1 = Counter()
        x2 = Counter()
        x3 = Counter()

        print("Processing species %d (%s), shuffleType=%d" %
              (taxid, getSpeciesFileName(taxid), shuffleType))

        profLen = profileLength(args.profile)

        nativeMeanProfile = MeanProfile(profLen)
        shuffledMeanProfile = MeanProfile(profLen)
        shuffled25Profile = MeanProfile(profLen)
        shuffled75Profile = MeanProfile(profLen)

        xRange = profileElements(args.profile)

        nativeMeanProfile_HighPAOnly = None
        nativeMeanProfile_MediumPAOnly = None
        nativeMeanProfile_LowPAOnly = None
        if args.pax_db:
            nativeMeanProfile_HighPAOnly = MeanProfile(profLen)
            nativeMeanProfile_MediumPAOnly = MeanProfile(profLen)
            nativeMeanProfile_LowPAOnly = MeanProfile(profLen)

        meanProfile_HighExtPropOnly = None
        meanProfile_LowExtPropOnly = None
        if args.external_property:
            meanProfile_HighExtPropOnly = MeanProfile(profLen)
            meanProfile_LowExtPropOnly = MeanProfile(profLen)

        GCProfile = MeanProfile(profLen)

        # Empty, typed templates; used only if no rows are collected.
        emptyDeltas = pd.DataFrame({
            'pos': pd.Series(dtype='int'),
            'delta': pd.Series(dtype='float')
        })
        emptyGeneLevel = pd.DataFrame({
            'gc': pd.Series(dtype='float'),
            'logpval': pd.Series(dtype='float'),
            'abslogpval': pd.Series(dtype='float'),
            'protid': pd.Series(dtype='string')
        })

        deltaFrames = []      # one (pos, delta) frame per gene
        geneLevelFrames = []  # one single-row frame per gene
        fullDeltas = []       # per gene: native - each shuffled profile

        cdsLengths = []
        fullSeqs = []
        dfCodonw = None

        # ------------------------------------
        # Process all CDS for this species
        # ------------------------------------
        seriesResults = readSeriesResultsForSpeciesWithSequence(
            (args.computation_tag, ), taxid, numShuffledGroups,
            numShuffledGroups, shuffleType=shuffleType)

        for result in sampleProfilesFixedIntervals(
                convertResultsToMFEProfiles(seriesResults, numShuffledGroups),
                args.profile[3], args.profile[0], args.profile[1],
                args.profile[2]):

            fullCDS = result["cds-seq"]
            seq = fullCDS[args.profile[3]:args.profile[0]]

            if not seq:
                continue

            protId = result["cds"].getProtId()

            fullSeqs.append(
                SeqRecord(Seq(fullCDS, NucleotideAlphabet), id=protId))

            profileData = result["profile-data"]
            assert (profileData.shape[0] >= numShuffledGroups)

            nativeRow = profileData[0, None]  # row 0, kept 2-D

            # Prepare mean MFE profiles
            nativeMeanProfile.add(nativeRow)
            shuffledMeanProfile.add(profileData[1:])

            # Prepare GC profile
            gc = calcSampledGCcontent(seq, args.profile[1])
            if gc.size > profLen:  # truncate the profile if necessary
                gc = np.resize(gc, (profLen, ))
            GCProfile.add(np.expand_dims(gc, 0))

            # Prepare percentile mean profiles
            shuffled25Profile.add(
                np.expand_dims(np.percentile(profileData[1:], 25, axis=0), 0))
            shuffled75Profile.add(
                np.expand_dims(np.percentile(profileData[1:], 75, axis=0), 0))

            # Native minus mean-of-shuffled, per window. Used both for the
            # genome-wide table and for the gene-level test below.
            deltas = profileData[0] - np.mean(profileData[1:], axis=0)

            # Prepare data for genome-wide wilcoxon test
            newPositions = range(args.profile[3], args.profile[0],
                                 args.profile[1])
            deltaFrames.append(
                pd.DataFrame({
                    'pos': pd.Series(newPositions, dtype='int'),
                    'delta': pd.Series(deltas, dtype='float')
                }))

            # Store the full (shuffles x windows) matrix of deltas for the
            # per-position wilcoxon tests
            fullDeltas.append(profileData[0] - profileData[1:])

            # Prepare data for GC vs. selection test
            meanGC = calcSampledGCcontent(seq, 10000)[0]
            if not (meanGC >= 0.05 and meanGC <= 0.95):
                meanGC = None

            pvalue = wilcoxon(deltas).pvalue
            direction = np.mean(deltas)

            if pvalue > 0.0:
                directedLogPval = log10(pvalue) * direction * -1.0
            else:
                directedLogPval = -250.0 * direction * -1.0

            paval = None
            if args.pax_db:
                paval = pa.get(protId)
                # Test for a missing value first: comparing None with a float
                # raises TypeError on Python 3.
                if paval is None:
                    pass
                elif paval >= 0.8:
                    nativeMeanProfile_HighPAOnly.add(nativeRow)
                elif paval <= 0.2:
                    nativeMeanProfile_LowPAOnly.add(nativeRow)
                else:
                    nativeMeanProfile_MediumPAOnly.add(nativeRow)

            if args.external_property:
                # NOTE(review): the original looked up an undefined placeholder
                # name here; protId is assumed to be the intended key (as in
                # the pa lookup above) - confirm against how externalProperty
                # is built.
                extPropValue = externalProperty.get(protId, None)
                if extPropValue is None:
                    pass  # no value for this gene; leave it out of both groups
                elif extPropValue >= externalPropertyMedian:
                    meanProfile_HighExtPropOnly.add(nativeRow)
                else:
                    meanProfile_LowExtPropOnly.add(nativeRow)

            cds_length_nt = len(fullCDS)
            cdsLengths.append(cds_length_nt)

            # NOTE: 'abslogpval' holds the raw p-value (key kept as-is because
            # it is part of the stored data).
            geneLevelFrames.append(
                pd.DataFrame({
                    'gc': pd.Series([meanGC]),
                    'logpval': pd.Series([directedLogPval]),
                    'abslogpval': pd.Series([pvalue]),
                    'protid': pd.Series([protId]),
                    'pa': pd.Series([paval]),
                    'cds_length_nt': pd.Series([cds_length_nt])
                }))

            x1.update((fullCDS[0], ))
            x2.update((fullCDS[1], ))
            x3.update((fullCDS[2], ))

            n += 1

        deltasForWilcoxon = _concatFrames(emptyDeltas, deltaFrames)
        geneLevelScatter = _concatFrames(emptyGeneLevel, geneLevelFrames)

        # Refuse to proceed if the data found is unreasonably small
        if n < 100:
            raise Exception(
                "Found insufficient data to process taxid=%d (n=%d)" %
                (taxid, n))

        CUBmetricsProfile = None
        if args.codonw:
            # The temporary file is only needed while codonw reads it
            with NamedTemporaryFile(mode="w") as fFullSeqs:
                # write the full sequences into the file
                SeqIO.write(fullSeqs, fFullSeqs.name, "fasta")
                # run codonw and get the gene-level results
                dfCodonw = readCodonw(fFullSeqs.name)

            print('****************************************************')
            print(dfCodonw.columns)
            print(dfCodonw.head())
            print(geneLevelScatter.columns)
            print(geneLevelScatter.head())

            geneLevelScatter = pd.merge(dfCodonw,
                                        geneLevelScatter,
                                        left_index=True,
                                        right_index=False,
                                        right_on='protid')
            print(_numericCorr(geneLevelScatter))

            CUBmetricsProfile = meanCodonwProfile(
                fullSeqs, confWindowWidth, 'begin', args.profile[3],
                args.profile[0], args.profile[1])  # TODO - use real values!
            print(CUBmetricsProfile)

        #------------------------------------------------------------------------------------------------------------------
        # Test for significance of the mean dLFE (positive or negative) at each position along the genome
        # (This will answer questions such as "how many genomes have (significantly) negative dLFE at position X?")
        #------------------------------------------------------------------------------------------------------------------
        emptyWilcoxon = pd.DataFrame({
            'pos': pd.Series(dtype='int'),
            'logpval': pd.Series(dtype='float'),
            'N': pd.Series(dtype='int')
        })
        wilcoxonRows = []

        print("Processing full deltas...")

        # Perform statistical tests based on the deltas for each position (from all proteins)
        for pos in range(profLen):
            # Collect all deltas for this position (one array per protein long
            # enough to have deltas at this position)
            data = [x[:, pos] for x in fullDeltas if x.shape[1] > pos]
            dataar = np.concatenate(data)  # flatten all deltas
            assert (dataar.ndim == 1)

            # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical)
            wilcoxonPval = wilcoxon(dataar).pvalue  # 2-sided test
            if wilcoxonPval > 0.0:
                logWilcoxonPval = log10(wilcoxonPval)
            else:
                logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)

            N = dataar.shape[0]

            wilcoxonRows.append(
                pd.DataFrame({
                    'pos': pd.Series(xRange[pos]),
                    'logpval': pd.Series([logWilcoxonPval]),
                    'N': pd.Series([N])
                }))

        wilcoxonDf = _concatFrames(emptyWilcoxon, wilcoxonRows)

        #------------------------------------------------------------------------------------------------------------------
        # Find "transition peak"
        #------------------------------------------------------------------------------------------------------------------
        # Calculate the dLFE
        print(
            "-TransitionPeak-TransitionPeak-TransitionPeak-TransitionPeak-"
        )
        meanDeltaLFE = nativeMeanProfile.value() - shuffledMeanProfile.value()
        peakPos = np.argmin(meanDeltaLFE)
        peakVal = meanDeltaLFE[peakPos]

        # x-coordinate of the peak. wilcoxonDf stores xRange[pos] in its 'pos'
        # column, so the peak is looked up by the same mapping rather than a
        # hard-coded step of 10.
        peakX = xRange[peakPos]

        emptyPeak = pd.DataFrame({
            'pos': pd.Series(dtype='int'),
            'logpval': pd.Series(dtype='float')
        })
        peakRows = []

        if peakVal <= 0.0:
            print("{} {}".format(peakPos, peakVal))
            peakWilcoxon = wilcoxonDf[wilcoxonDf['pos'] == peakX]
            if not peakWilcoxon.empty:
                logpval = peakWilcoxon.logpval.iloc[0]
                print(type(logpval))
                print(logpval)

                if logpval < -2.0:
                    # Compare every position against the peak position, using
                    # only proteins long enough to have deltas at both.
                    for otherPos in range(profLen):
                        eligible = [
                            x for x in fullDeltas
                            if x.shape[1] > max(peakPos, otherPos)
                        ]
                        peakData = np.concatenate(
                            [x[:, peakPos] for x in eligible])
                        assert (peakData.ndim == 1)
                        otherData = np.concatenate(
                            [x[:, otherPos] for x in eligible])
                        assert (len(peakData) == len(otherData))

                        datax = otherData - peakData
                        print("/-: {} {} {}".format(peakPos, otherPos,
                                                    np.mean(datax)))

                        if np.allclose(otherData, peakData):
                            logWilcoxonPval = 0.0
                        else:
                            # Paired Wilcoxon signed-rank test
                            # 2-sided test (not ideal in this situation...)
                            wilcoxonPval = wilcoxon(peakData,
                                                    otherData).pvalue
                            if wilcoxonPval > 0.0:
                                logWilcoxonPval = log10(wilcoxonPval)
                            elif wilcoxonPval == 0.0:
                                logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)
                            else:
                                logWilcoxonPval = None  # e.g. NaN p-value

                        if logWilcoxonPval is not None:
                            peakRows.append(
                                pd.DataFrame({
                                    'pos': pd.Series(xRange[otherPos]),
                                    'logpval': pd.Series([logWilcoxonPval])
                                }))

        guPeakDf = _concatFrames(emptyPeak, peakRows)
        print(guPeakDf)

        #------------------------------------------------------------------------------------------------------------------
        # Calculate edge-referenced wilcoxon
        #------------------------------------------------------------------------------------------------------------------
        edgePos = profileEdgeIndex(args.profile)
        # Deltas at the edge position, or None for proteins too short to have
        # one. (The original tested against a stale `pos` left over from an
        # earlier loop instead of edgePos.)
        data0 = [
            x[:, edgePos] if x.shape[1] > edgePos else None
            for x in fullDeltas
        ]
        emptyEdge = pd.DataFrame({
            'pos': pd.Series(dtype='int'),
            'logpval': pd.Series(dtype='float')
        })
        edgeRows = []

        for pos in range(profLen):
            data1 = [
                x[:, pos] if x.shape[1] > pos else None for x in fullDeltas
            ]
            assert (len(data0) == len(data1))
            if data1[0] is not None:
                print("]]]]]]]]]]]]] {}".format(data1[0].shape))

            diffs = []
            for d0, d1 in zip(data0, data1):
                if (d0 is not None) and (d1 is not None):
                    d = d1 - d0
                    diffs.append(d[~np.isnan(d)])

            alldiffs = np.concatenate(diffs)
            print(pos)
            print(alldiffs.shape)

            if np.allclose(alldiffs, 0):
                logWilcoxonPval = 0.0
            else:
                # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical)
                # 2-sided test (not ideal in this situation...)
                wilcoxonPval = wilcoxon(alldiffs).pvalue
                if wilcoxonPval > 0.0:
                    logWilcoxonPval = log10(wilcoxonPval)
                elif wilcoxonPval == 0.0:
                    logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)
                else:
                    logWilcoxonPval = None  # e.g. NaN p-value

            if logWilcoxonPval is not None:
                edgeRows.append(
                    pd.DataFrame({
                        'pos': pd.Series(xRange[pos]),
                        'logpval': pd.Series([logWilcoxonPval])
                    }))

        edgeWilcoxonDf = _concatFrames(emptyEdge, edgeRows)
        print(edgeWilcoxonDf)

        # Count the minimum number of sequences covering any position
        # (capped at 1000000, as in the original)
        minCount = 1000000
        for pos in range(profLen):
            countAtPos = sum(1 for x in fullDeltas if x.shape[1] > pos)
            if countAtPos < minCount:
                minCount = countAtPos

        #------------------------------------------------------------------------------------------------------------------
        # Store the results
        #------------------------------------------------------------------------------------------------------------------
        profileId = "%d_%d_%s_t%d" % (args.profile[0], args.profile[1],
                                      args.profile[2], shuffleType)

        df = pd.DataFrame(
            {
                "native": nativeMeanProfile.value(),
                "shuffled": shuffledMeanProfile.value(),
                "gc": GCProfile.value(),
                "position": xRange,
                "shuffled25": shuffled25Profile.value(),
                "shuffled75": shuffled75Profile.value()
            },
            index=xRange)
        print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
        if CUBmetricsProfile is not None:
            df = pd.merge(df,
                          CUBmetricsProfile,
                          how='left',
                          left_on='position',
                          right_on='windowStart')
            df = df.set_index('position')

        combinedData[shuffleType] = df

        dfProfileCorrs = None
        if args.codonw:
            plotMFEProfileMultiple(taxid,
                                   profileId,
                                   df, ('GC', 'Nc', 'CAI', 'CBI', 'Fop'),
                                   scaleBar=confWindowWidth)

            smfe = df['native'] - df['shuffled']
            spearman_gc = spearmanr(df['GC'], smfe)
            spearman_Nc = spearmanr(df['Nc'], smfe)
            spearman_CAI = spearmanr(df['CAI'], smfe)
            spearman_Fop = spearmanr(df['Fop'], smfe)
            dfProfileCorrs = pd.DataFrame(
                {
                    "spearman_smfe_gc_rho": spearman_gc.correlation,
                    "spearman_smfe_gc_pval": spearman_gc.pvalue,
                    "spearman_smfe_Nc_rho": spearman_Nc.correlation,
                    "spearman_smfe_Nc_pval": spearman_Nc.pvalue,
                    "spearman_smfe_CAI_rho": spearman_CAI.correlation,
                    "spearman_smfe_CAI_pval": spearman_CAI.pvalue,
                    "spearman_smfe_Fop_rho": spearman_Fop.correlation,
                    "spearman_smfe_Fop_pval": spearman_Fop.pvalue
                },
                index=(taxid, ))

        # NOTE: the key 'media_cds_length_nt' (sic) is kept because it is part
        # of the stored table.
        statisticsDF = pd.DataFrame({
            'mean_mean_gc': pd.Series([np.mean(GCProfile.value())]),
            'taxid': pd.Series([taxid], dtype='int'),
            'cds_count': pd.Series([len(cdsLengths)], dtype='int'),
            'media_cds_length_nt': pd.Series([np.median(cdsLengths)])
        })

        plotMFEProfileWithGC(taxid, profileId, df)

        plotMFEProfileV3(taxid,
                         profileId,
                         df,
                         dLFEData=meanDeltaLFE,
                         wilcoxon=wilcoxonDf,
                         transitionPeak=guPeakDf,
                         transitionPeakPos=peakX,
                         edgeWilcoxon=edgeWilcoxonDf,
                         ProfilesCount=minCount)

        # Plot the number of genes included in each profile position
        plotXY(
            taxid, profileId,
            pd.DataFrame({"num_genes": nativeMeanProfile.counts()},
                         index=xRange), "position", "num_genes",
            "Number of genes included, per starting position")

        # (The gene-level scatter plots, the by-PA profile plot and the OLS
        # model fit were commented out in the original and are not run.)

        spearman_rho = _numericCorr(geneLevelScatter, method='spearman')
        print(spearman_rho)
        spearman_rho.to_csv('mfe_v2_spearman_%d_%s_t%d.csv' %
                            (taxid, profileId, shuffleType))

        print(statisticsDF)

        # -----------------------------------------------------------------------------
        # Save mean profiles as H5
        # Format (for compatibility with plot_xy.py and old convert_data_for_plotting.py):
        #          gc  native  position  shuffled
        # 1    0.451  -4.944         1    -5.886
        # 2    0.459  -5.137         2    -6.069
        # 3    0.473  -5.349         3    -6.262
        if args.computation_tag == Sources.RNAfoldEnergy_SlidingWindow40_v2:
            h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}.h5".format(
                taxid, args.profile[0], args.profile[1], args.profile[2],
                args.profile[3], shuffleType)
        else:
            h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}_series{}.h5".format(
                taxid, args.profile[0], args.profile[1], args.profile[2],
                args.profile[3], shuffleType, args.computation_tag)

        # Every table in the file shares the same key suffix
        keySuffix = "%d_%d_%d_%s_%d" % (taxid, args.profile[0],
                                        args.profile[1], args.profile[2],
                                        args.profile[3])

        # Compression parameters are described here: http://www.pytables.org/usersguide/libref/helper_classes.html#filtersclassdescr
        # ...and discussed thoroughly in the performance FAQs
        with pd.io.pytables.HDFStore(h5fn, complib="zlib",
                                     complevel=1) as store:
            store["df_" + keySuffix] = df
            store["deltas_" + keySuffix] = deltasForWilcoxon
            store["spearman_rho_" + keySuffix] = spearman_rho
            store["statistics_" + keySuffix] = statisticsDF
            if args.codonw:
                store["profiles_spearman_rho_" + keySuffix] = dfProfileCorrs
            store["wilcoxon_" + keySuffix] = wilcoxonDf
            store["transition_peak_wilcoxon_" + keySuffix] = guPeakDf
            store["edge_wilcoxon_" + keySuffix] = edgeWilcoxonDf

            store.flush()

        # ------------------------------------------------------------------------------------
        # Print final report
        print("Got %d results" % n)

        print(x1)
        print(x2)
        print(x3)

    print("//" * 20)
    print(combinedData.keys())

    if len(combinedData) > 1:
        profileId = "%d_%d_%s" % (args.profile[0], args.profile[1],
                                  args.profile[2])
        plotMFEProfileForMultipleRandomizations(taxid, profileId,
                                                combinedData)

    return (taxid, (x1, x2, x3))
#deltasForWilcoxon = np.zeros((0,2), dtype=float) deltasForWilcoxon = pd.DataFrame({'pos':pd.Series(dtype='int'), 'delta':pd.Series(dtype='float')}) geneLevelScatter = pd.DataFrame({'gc':pd.Series(dtype='float'), 'logpval':pd.Series(dtype='float'), 'abslogpval':pd.Series(dtype='float'), 'protid':pd.Series(dtype='string')}) cdsLengths = [] fullSeqs = [] dfCodonw = None # ------------------------------------ # Process all CDS for this species # ------------------------------------ for result in sampleProfilesFixedIntervals( convertResultsToMFEProfiles( readSeriesResultsForSpeciesWithSequence(args.computation_tag, taxid, numShuffledGroups, numShuffledGroups ) , numShuffledGroups) , args.profile[3], args.profile[0], args.profile[1]): fullCDS = result["cds-seq"] seq = fullCDS[args.profile[3]:args.profile[0]] protId = result["cds"].getProtId() fullSeqs.append( SeqRecord( Seq(fullCDS, NucleotideAlphabet), id=protId) ) profileData = result["profile-data"] assert(profileData.shape[0] >= numShuffledGroups) # Prepare mean MFE profiles
def calculate2dProfile(args):
    """Collect per-window native-vs-shuffled statistics and plot a 2D profile.

    For every requested species, the first `maxLength` positions of each
    sequence are sampled every `profileStep` positions. For each sequence a
    per-window Wilcoxon signed-rank test of (row 0 - rows 1..) is computed,
    together with a control built from a fixed row (`controlRow`). Only the
    control array is currently plotted.

    Args:
        args: options object; reads `all_species`, `taxid`, `computation_tag`
            and `num_shuffles`.

    Returns:
        0 (always).
    """
    maxLength = 300
    profileStep = 10
    # Must be an integer: it is used as an array dimension below (true
    # division would yield a float on Python 3, which np.resize rejects).
    maxWindows = maxLength // profileStep
    # Row used as the "control" in place of row 0.
    # NOTE(review): hard-coded; requires num_shuffles >= 8.
    controlRow = 8
    # One row per shuffled sequence, plus one for the native sequence
    numRows = args.num_shuffles + 1

    if args.all_species:
        taxids = list(allSpeciesSource())
    else:
        taxids = args.taxid

    for taxid in taxids:
        count = 0
        nativeArrayData = []
        controlArrayData = []
        # Per row: number of sequences for which (row - all rows) differs
        # significantly from 0
        testt = dict((rowIdx, 0) for rowIdx in range(numRows))

        for result in sampleProfilesFixedIntervals(
                convertResultsToMFEProfiles(
                    readSeriesResultsForSpecies(args.computation_tag, taxid,
                                                args.num_shuffles,
                                                args.num_shuffles),
                    args.num_shuffles),
                startPosition=0,
                endPosition=maxLength,
                interval=profileStep):

            profileData = result["profile-data"]

            # Check the sequence-id
            seqId = result["content"][0]["id"].replace(":", "/")
            # the first result should belong to shuffle-id -1 (i.e., the native sequence)
            shuffleId = int(seqId.split("/")[3])
            assert (shuffleId == -1)

            expectedProfileLength = min(
                len(result["content"][0]["MFE-profile"]) / profileStep,
                maxWindows)
            numWindows = profileData.shape[1]
            assert (abs(numWindows - expectedProfileLength) <= 1)

            # we require one vector per shuffled sequence, plus one for the native sequence
            if profileData.shape[0] != numRows:
                print("Warning: ignoring record '%s' containing %d records" %
                      (seqId, profileData.shape[0]))
                continue

            # NativeLFE - ShuffledLFE (for each of the shuffles, and for each window)
            nativeDiffs = profileData[0, ] - profileData[1:, ]
            # Same, with the control row standing in for the native row
            controlDiffs = profileData[controlRow, ] - profileData[1:, ]
            assert (nativeDiffs.shape[0] == args.num_shuffles)
            assert (controlDiffs.shape[0] == args.num_shuffles)

            for rowIdx in range(numRows):
                deltas = profileData[rowIdx, ] - profileData
                _, pval = wilcoxon(deltas.ravel())
                if pval < 0.05:
                    testt[rowIdx] += 1

            # TODO - prove this is equivalent to checking the sign of the sum of ranks
            direction = np.sign(np.apply_along_axis(np.mean, 0, nativeDiffs))

            # The wilcoxon test is performed separately for each window (with N=args.num_shuffles, typically = 20)
            # Note that Nr <= N (because ties are ignored when using the default settings), and Nr should be at least 10 or 20 for the distribution to approach normal.
            # See:
            # Explanation of Python impl. (using T statistic): https://stackoverflow.com/a/18966286
            # Wilcoxon signed-rank test tutorial: http://vassarstats.net/textbook/ch12a.html
            wilc = np.apply_along_axis(wilcoxon, 0, nativeDiffs)
            assert (wilc.shape == (2, numWindows))

            # NOTE(review): the control currently uses the per-window mean
            # instead of a Wilcoxon test, so controlWilc is 1-D and
            # controlWilc[1, ] below is a single value (the mean for the
            # second window), repeated by np.resize. This looks like a
            # leftover from an experiment - confirm the intended control.
            controlWilc = np.apply_along_axis(np.mean, 0, controlDiffs)

            # test statistic T (not W) - The sum of the ranks of the differences above or below zero, whichever is smaller
            assert (np.all(wilc[0, ] >= 0.0))

            # P-values
            assert (np.all(((wilc[1, ] >= 0.0) & (wilc[1, ] <= 1.0))
                           | np.isnan(wilc[1, ])))

            # np.resize repeats the data as needed to fill the target shape
            out = np.resize(
                np.log10(wilc[1, ]) * direction * -1, (2, maxWindows))
            nativeArrayData.append(out)

            controlOut = np.resize(controlWilc[1, ], (2, maxWindows))
            controlArrayData.append(controlOut)

            count += 1

        if not nativeArrayData:
            print("Warning: no data found for taxid=%d" % taxid)
            continue

        controlAr = np.vstack(controlArrayData)

        print(testt)

        plot2dProfile(controlAr, taxid)

        print(count)

    return 0