def plotProfile(taxId, data): fig, (ax1,ax2) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': [2, 1]}) data[['native', 'shuffled']].plot(ax=ax1) #plt.title(getSpeciesName(taxId)) plt.xlabel('Position (nt, window start, from cds %s)' % args.profile.ProfileReference) ax1.set_title("Mean LFE for %s" % getSpeciesName(taxId)) ax1.set_ylabel('Mean LFE') ax1.legend() ax1.grid(True) data['gc'].plot(ax=ax2) ax2.set_title("GC%") ax2.set_ylabel('GC% (in window)') ax2.grid(True) profileId = str(args.profile.ProfileId).replace(':', '-') plt.savefig("mfe_40nt_cds_%s_%s.pdf" % (profileId, getSpeciesFileName(taxId)) ) plt.savefig("mfe_40nt_cds_%s_%s.svg" % (profileId, getSpeciesFileName(taxId)) ) plt.close(fig)
def plotGCContentHist( medianGCContent, taxId ): histBins = np.arange(0.24, 0.76, 0.02) hist, _ = np.histogram( medianGCContent, bins = histBins ) print(len(medianGCContent)) print(hist) print(histBins) fig = plt.figure() #plt.hist( hist, histBins, normed=0 ) plt.bar( np.arange(0.24, 0.74, 0.02), hist, width=0.02) plt.xlabel('Median GC% (5\' cds section)') plt.ylabel('CDS count') plt.xticks(np.arange(0.24, 0.76, 0.04)) plt.savefig("mediangc_%s.pdf" % getSpeciesFileName(taxId) ) plt.savefig("mediangc_%s.svg" % getSpeciesFileName(taxId) ) plt.close(fig)
def getTaxName(taxId): return data_helpers.getSpeciesFileName(taxId)
def performPlots(self): args = self._args taxid = self._taxId # ------------------------------------------------------------------------------------ numShuffledGroups = args.num_shuffles shuffleTypes = args.shuffle_types print("*********** {} ***********".format(shuffleTypes)) combinedData = {} for shuffleType in shuffleTypes: n = 0 x1 = Counter() x2 = Counter() x3 = Counter() print("Processing species %d (%s), shuffleType=%d" % (taxid, getSpeciesFileName(taxid), shuffleType)) nativeMeanProfile = MeanProfile(profileLength(args.profile)) shuffledMeanProfile = MeanProfile(profileLength(args.profile)) shuffled25Profile = MeanProfile(profileLength(args.profile)) shuffled75Profile = MeanProfile(profileLength(args.profile)) xRange = profileElements(args.profile) nativeMeanProfile_HighPAOnly = None nativeMeanProfile_MediumPAOnly = None nativeMeanProfile_LowPAOnly = None if (args.pax_db): nativeMeanProfile_HighPAOnly = MeanProfile( profileLength(args.profile)) nativeMeanProfile_MediumPAOnly = MeanProfile( profileLength(args.profile)) nativeMeanProfile_LowPAOnly = MeanProfile( profileLength(args.profile)) meanProfile_HighExtPropOnly = None meanProfile_LowExtPropOnly = None if (args.external_property): meanProfile_HighExtPropOnly = MeanProfile( profileLength(args.profile)) meanProfile_LowExtPropOnly = MeanProfile( profileLength(args.profile)) GCProfile = MeanProfile(profileLength(args.profile)) #deltasForWilcoxon = np.zeros((0,2), dtype=float) deltasForWilcoxon = pd.DataFrame({ 'pos': pd.Series(dtype='int'), 'delta': pd.Series(dtype='float') }) fullDeltas = [] geneLevelScatter = pd.DataFrame({ 'gc': pd.Series(dtype='float'), 'logpval': pd.Series(dtype='float'), 'abslogpval': pd.Series(dtype='float'), 'protid': pd.Series(dtype='string') }) cdsLengths = [] fullSeqs = [] dfCodonw = None # ------------------------------------ # Process all CDS for this species # ------------------------------------ for result in sampleProfilesFixedIntervals( convertResultsToMFEProfiles( readSeriesResultsForSpeciesWithSequence( (args.computation_tag, ), taxid, numShuffledGroups, numShuffledGroups, shuffleType=shuffleType), numShuffledGroups), args.profile[3], args.profile[0], args.profile[1], args.profile[2]): fullCDS = result["cds-seq"] seq = fullCDS[args.profile[3]:args.profile[0]] if not seq: continue protId = result["cds"].getProtId() #print("Length: {}nt".format(result["cds"].length())) fullSeqs.append( SeqRecord(Seq(fullCDS, NucleotideAlphabet), id=protId)) profileData = result["profile-data"] assert (profileData.shape[0] >= numShuffledGroups) #print(profileData.shape) #print(profileData) #print(profileData[:,0].T) # Prepare mean MFE profiles nativeMeanProfile.add(profileData[0, None]) shuffledMeanProfile.add(profileData[1:]) # Prepare GC profile gc = calcSampledGCcontent(seq, args.profile[1]) if (gc.size > profileLength( args.profile)): # truncate the profile if necessary gc = np.resize(gc, (profileLength(args.profile), )) GCProfile.add(np.expand_dims(gc, 0)) # Prepare percentile mean profiles shuffled25Profile.add( np.expand_dims(np.percentile(profileData[1:], 25, axis=0), 0)) shuffled75Profile.add( np.expand_dims(np.percentile(profileData[1:], 75, axis=0), 0)) # Prepare data for genome-wide wilcoxon test #newDeltas = profileData[0,0::4] - np.mean(profileData[1:,0::4], axis=0) newDeltas = profileData[0, 0::1] - np.mean( profileData[1:, 0::1], axis=0) #print("newDeltas: {}".format(newDeltas.shape)) #newPositions = range(args.profile[3], profileLength(args.profile), 40) newPositions = range(args.profile[3], args.profile[0], args.profile[1]) deltaspd = pd.DataFrame({ 'pos': pd.Series(newPositions, dtype='int'), 'delta': pd.Series(newDeltas, dtype='float') }) #print("deltaspd: {}".format(deltaspd.shape)) deltasForWilcoxon = deltasForWilcoxon.append(deltaspd) fullDeltas.append( profileData[0, 0::1] - profileData[1:, 0::1] ) # store the 20x31 matrix of deltas for full wilcoxon test # Prepare data for GC vs. selection test meanGC = calcSampledGCcontent(seq, 10000)[0] if (not (meanGC >= 0.05 and meanGC <= 0.95)): meanGC = None #deltas = profileData[0,0::4] - np.mean(profileData[1:,0::4], axis=0) deltas = profileData[0, 0::1] - np.mean(profileData[1:, 0::1], axis=0) #print("deltas: {}".format(deltas.shape)) pvalue = wilcoxon(deltas).pvalue direction = np.mean(deltas) directedLogPval = None if (pvalue > 0.0): directedLogPval = log10(pvalue) * direction * -1.0 else: directedLogPval = -250.0 * direction * -1.0 paval = None if (args.pax_db): paval = pa.get(protId) if (paval >= 0.8): nativeMeanProfile_HighPAOnly.add(profileData[0, None]) elif (paval <= 0.2): nativeMeanProfile_LowPAOnly.add(profileData[0, None]) elif (not paval is None): nativeMeanProfile_MediumPAOnly.add(profileData[0, None]) if (args.external_property): extPropValue = externalProperty.get(xxxxxx, None) if extPropValue >= externalPropertyMedian: meanProfile_HighExtPropOnly.add(profileData[0, None]) else: meanProfile_LowExtPropOnly.add(profileData[0, None]) cds_length_nt = len(fullCDS) cdsLengths.append(cds_length_nt) geneLevelScatter = geneLevelScatter.append( pd.DataFrame({ 'gc': pd.Series([meanGC]), 'logpval': pd.Series([directedLogPval]), 'abslogpval': pd.Series([pvalue]), 'protid': pd.Series([protId]), 'pa': pd.Series([paval]), 'cds_length_nt': pd.Series([cds_length_nt]) })) x1.update((fullCDS[0], )) x2.update((fullCDS[1], )) x3.update((fullCDS[2], )) del fullCDS del result n += 1 del (pvalue) del (direction) del (seq) del (deltas) # Refuse to proceed if the data found is unreasonably small if (n < 100): raise Exception( "Found insufficient data to process taxid=%d (n=%d)" % (taxid, n)) CUBmetricsProfile = None if (args.codonw): fFullSeqs = NamedTemporaryFile( mode="w") # create a temporary file SeqIO.write(fullSeqs, fFullSeqs.name, "fasta") # write the full sequences into the file dfCodonw = readCodonw( fFullSeqs.name ) # run codonw and get the gene-level results print('****************************************************') print(dfCodonw.columns) print(dfCodonw.head()) print(geneLevelScatter.columns) print(geneLevelScatter.head()) geneLevelScatter = pd.merge(dfCodonw, geneLevelScatter, left_index=True, right_index=False, right_on='protid') print(geneLevelScatter.corr()) #args.profile[3], args.profile[0], args.profile[1] CUBmetricsProfile = meanCodonwProfile( fullSeqs, confWindowWidth, 'begin', args.profile[3], args.profile[0], args.profile[1]) # TODO - use real values! print(CUBmetricsProfile) #else: # geneLevelScatter['CAI'] = pd.Series( np.zeros(len(geneLevelScatter)), index=geneLevelScatter.index) # ------------------------------------ # Display summary for this species # ------------------------------------ #print("Native:") #print(nativeMeanProfile.value()) #print(nativeMeanProfile.counts()) #print("Shuffled:") #print(shuffledMeanProfile.value()) #print(shuffledMeanProfile.counts()) #print(deltasForWilcoxon.shape) #------------------------------------------------------------------------------------------------------------------ # Test for significance of the mean dLFE (postive or negative) at each position along the genome # (This will answer questions such as "how many genomes have (significantly) negative dLFE at position X?") #------------------------------------------------------------------------------------------------------------------ wilcoxonDf = pd.DataFrame({ 'pos': pd.Series(dtype='int'), 'logpval': pd.Series(dtype='float'), 'N': pd.Series(dtype='int') }) if (True): print("Processing full deltas...") # Perform statistical tests based on the deltas for each position (from all proteins) for pos in range(profileLength(args.profile)): # Collect all deltas for this position (data will be an list of arrays of length 20 - one for each protein long enough to have deltas at this position) data = [x[:, pos] for x in fullDeltas if x.shape[1] > pos] dataar = np.concatenate(data) # flatten all deltas assert (dataar.ndim == 1) # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical) wilcoxonPval = wilcoxon(dataar).pvalue # 2-sided test if wilcoxonPval > 0.0: logWilcoxonPval = log10(wilcoxonPval) else: logWilcoxonPval = -324.0 # ~minimum value for log10(0.000.....) N = dataar.shape[0] wilcoxonDf = wilcoxonDf.append( pd.DataFrame({ 'pos': pd.Series(xRange[pos]), 'N': pd.Series([N]), 'logpval': pd.Series([logWilcoxonPval]) })) #alldeltas = np.concatenate(fullDeltas) #print(wilcoxonDf) del (data) del (dataar) #------------------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------------------ # Find "transition peak" #------------------------------------------------------------------------------------------------------------------ # Calculate the dLFE print( "-TransitionPeak-TransitionPeak-TransitionPeak-TransitionPeak-" ) meanDeltaLFE = nativeMeanProfile.value( ) - shuffledMeanProfile.value() peakPos = np.argmin(meanDeltaLFE) peakVal = meanDeltaLFE[peakPos] guPeakDf = pd.DataFrame({ 'pos': pd.Series(dtype='int'), 'logpval': pd.Series(dtype='float') }) if peakVal <= 0.0: print("{} {}".format(peakPos, peakVal)) if not wilcoxonDf[wilcoxonDf['pos'] == peakPos * 10].empty: logpval = wilcoxonDf[wilcoxonDf['pos'] == peakPos * 10].logpval.loc[0] print(type(logpval)) #print(logpval.shape) print(logpval) if logpval < -2.0: # # Collect all deltas for this position (data will be an list of arrays of length 20 - one for each protein long enough to have deltas at this position) for otherPos in range(profileLength(args.profile)): data1 = [ x[:, peakPos] for x in fullDeltas if x.shape[1] > max(peakPos, otherPos) ] peakData = np.concatenate( data1) # flatten all deltas assert (peakData.ndim == 1) data2 = [ x[:, otherPos] for x in fullDeltas if x.shape[1] > max(peakPos, otherPos) ] otherData = np.concatenate( data2) # flatten all deltas assert (len(peakData) == len(otherData)) datax = otherData - peakData print("/-: {} {} {}".format( peakPos, otherPos, np.mean(datax))) #if( peakPos==otherPos ): # print(datax) wilcoxonPval = None if np.allclose(otherData, peakData): logWilcoxonPval = 0.0 else: # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical) wilcoxonPval = wilcoxon( peakData, otherData ).pvalue # 2-sided test (not ideal in this situation...) if wilcoxonPval > 0.0: logWilcoxonPval = log10(wilcoxonPval) elif wilcoxonPval == 0.0: logWilcoxonPval = -324.0 # ~minimum value for log10(0.000.....) else: logWilcoxonPval = None if not logWilcoxonPval is None: guPeakDf = guPeakDf.append( pd.DataFrame({ 'pos': pd.Series(xRange[otherPos]), 'logpval': pd.Series([logWilcoxonPval]) })) print(guPeakDf) #------------------------------------------------------------------------------------------------------------------ # Calculate edge-referenced wilcoxon #------------------------------------------------------------------------------------------------------------------ edgePos = profileEdgeIndex(args.profile) data0 = [ x[:, edgePos] if x.shape[1] > pos else None for x in fullDeltas ] edgeWilcoxonDf = pd.DataFrame({ 'pos': pd.Series(dtype='int'), 'logpval': pd.Series(dtype='float') }) for pos in range(profileLength(args.profile)): data1 = [ x[:, pos] if x.shape[1] > pos else None for x in fullDeltas ] assert (len(data0) == len(data1)) if not data1[0] is None: print("]]]]]]]]]]]]] {}".format(data1[0].shape)) diffs = [] for d0, d1 in zip(data0, data1): if (not d0 is None) and (not d1 is None): #print("{} {}".format(d0.shape, d1.shape)) d = d1 - d0 diffs.append(d[~np.isnan(d)]) alldiffs = np.concatenate(diffs) #print(alldiffs.shape) print(pos) #print(alldiffs[:100]) print(alldiffs.shape) wilcoxonPval = None if np.allclose(alldiffs, 0): logWilcoxonPval = 0.0 else: # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical) wilcoxonPval = wilcoxon( alldiffs ).pvalue # 2-sided test (not ideal in this situation...) if wilcoxonPval > 0.0: logWilcoxonPval = log10(wilcoxonPval) elif wilcoxonPval == 0.0: logWilcoxonPval = -324.0 # ~minimum value for log10(0.000.....) else: logWilcoxonPval = None if not logWilcoxonPval is None: edgeWilcoxonDf = edgeWilcoxonDf.append( pd.DataFrame({ 'pos': pd.Series(xRange[pos]), 'logpval': pd.Series([logWilcoxonPval]) })) print(edgeWilcoxonDf) # Count the mininum number of sequences minCount = 1000000 for pos in range(profileLength(args.profile)): countAtPos = sum( [1 if x.shape[1] > pos else 0 for x in fullDeltas]) if countAtPos < minCount: minCount = countAtPos #------------------------------------------------------------------------------------------------------------------ # Store the results #------------------------------------------------------------------------------------------------------------------ #native = np.asarray(nativeMean[1:], dtype="float") #shuffled = np.asarray(shuffledMean[1:], dtype="float") #gc = np.asarray(gcMean[1:], dtype="float") #xrange = [x for x in args.profile.Elements() if x<profileInfo.cdsLength()] profileId = "%d_%d_%s_t%d" % (args.profile[0], args.profile[1], args.profile[2], shuffleType) df = pd.DataFrame( { "native": nativeMeanProfile.value(), "shuffled": shuffledMeanProfile.value(), "gc": GCProfile.value(), "position": xRange, "shuffled25": shuffled25Profile.value(), "shuffled75": shuffled75Profile.value() }, index=xRange) print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^") if (not CUBmetricsProfile is None): df = pd.merge(df, CUBmetricsProfile, how='left', left_on='position', right_on='windowStart') df = df.set_index('position') combinedData[shuffleType] = df #print(df) dfProfileCorrs = None if (args.codonw): plotMFEProfileMultiple(taxid, profileId, df, ('GC', 'Nc', 'CAI', 'CBI', 'Fop'), scaleBar=confWindowWidth) smfe = df['native'] - df['shuffled'] spearman_gc = spearmanr(df['GC'], smfe) spearman_Nc = spearmanr(df['Nc'], smfe) spearman_CAI = spearmanr(df['CAI'], smfe) spearman_Fop = spearmanr(df['Fop'], smfe) dfProfileCorrs = pd.DataFrame( { "spearman_smfe_gc_rho": spearman_gc.correlation, "spearman_smfe_gc_pval": spearman_gc.pvalue, "spearman_smfe_Nc_rho": spearman_Nc.correlation, "spearman_smfe_Nc_pval": spearman_Nc.pvalue, "spearman_smfe_CAI_rho": spearman_CAI.correlation, "spearman_smfe_CAI_pval": spearman_CAI.pvalue, "spearman_smfe_Fop_rho": spearman_Fop.correlation, "spearman_smfe_Fop_pval": spearman_Fop.pvalue }, index=(taxid, )) lengthsDist = np.array(cdsLengths) statisticsDF = pd.DataFrame({ 'mean_mean_gc': pd.Series([np.mean(GCProfile.value())]), 'taxid': pd.Series([taxid], dtype='int'), 'cds_count': pd.Series([len(cdsLengths)], dtype='int'), 'media_cds_length_nt': pd.Series([np.median(cdsLengths)]) }) plotMFEProfileWithGC(taxid, profileId, df) plotMFEProfileV3(taxid, profileId, df, dLFEData=meanDeltaLFE, wilcoxon=wilcoxonDf, transitionPeak=guPeakDf, transitionPeakPos=peakPos * 10, edgeWilcoxon=edgeWilcoxonDf, ProfilesCount=minCount) # Plot the number of genes included in each profile position plotXY( taxid, profileId, pd.DataFrame({"num_genes": nativeMeanProfile.counts()}, index=xRange), "position", "num_genes", "Number of genes included, per starting position") # scatterPlotWithKernel( # taxid, # profileId, # geneLevelScatter, # "gc", # "logpval", # "GC vs. MFE selection - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "GC3s", # "logpval", # "GC3s vs. MFE selection - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "gc", # "Nc", # "GC vs. ENc - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "GC3s", # "Nc", # "GC3s vs. ENc - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "Nc", # "logpval", # "ENc vs. MFE selection - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "CBI", # "logpval", # "CBI vs. MFE selection - %s" # ) # if( args.pax_db ): # #print(geneLevelScatter.head()) # scatterPlotWithColor( # taxid, # profileId, # shuffleType, # geneLevelScatter, # "gc", # "logpval", # "pa", # "GC vs. PA - %s" # ) # if( args.codonw ): # scatterPlot( # taxid, # profileId, # geneLevelScatter, # "Nc", # "pa", # "ENc vs. PA - %s" # ) # dfProfileByPA = pd.DataFrame( { "native": nativeMeanProfile.value(), "shuffled": shuffledMeanProfile.value(), "position": xRange, "shuffled25":shuffled25Profile.value(), "shuffled75":shuffled75Profile.value(), "native_pa_high":nativeMeanProfile_HighPAOnly.value(), "native_pa_med":nativeMeanProfile_MediumPAOnly.value(), "native_pa_low":nativeMeanProfile_LowPAOnly.value() }, index=xRange ) # plotMFEProfileByPA(taxid, profileId, dfProfileByPA) # # Try to fit a linear model to describe the gene-level data # if( args.codonw ): # if( args.pax_db ): # model = ols("logpval ~ gc + cds_length_nt + Nc + GC3s + CAI + pa", data=geneLevelScatter).fit() # else: # model = ols("logpval ~ gc + cds_length_nt + Nc + GC3s + CAI", data=geneLevelScatter).fit() # else: # model = ols("logpval ~ gc + cds_length_nt", data=geneLevelScatter).fit() # print(model.params) # print(model.summary()) # print("r = %f" % model.rsquared**.5) # print("r_adj = %f" % model.rsquared_adj**.5) spearman_rho = geneLevelScatter.corr(method='spearman') print(spearman_rho) spearman_rho.to_csv('mfe_v2_spearman_%d_%s_t%d.csv' % (taxid, profileId, shuffleType)) # vars = ['gc', 'logpval', 'pa', 'cds_length_nt'] # spearman_rho = np.zeros((len(vars),len(vars)), dtype=float) # spearman_pval = np.zeros((len(vars),len(vars)), dtype=float) # for n1, var1 in enumerate(vars): # for n2, var2 in enumerate(vars): # rho, pval = spearmanr(geneLevelScatter[var1], geneLevelScatter[var2], nan_policy='omit') # spearman_rho[n1,n2] = rho # spearman_pval[n1,n2] = pval # print(spearman_rho) # print(spearman_pval) print(statisticsDF) # ----------------------------------------------------------------------------- # Save mean profiles as H5 # Format (for compatible with plot_xy.py and old convert_data_for_plotting.py: # gc native position shuffled # 1 0.451 -4.944 1 -5.886 # 2 0.459 -5.137 2 -6.069 # 3 0.473 -5.349 3 -6.262 if (args.computation_tag == Sources.RNAfoldEnergy_SlidingWindow40_v2): h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}.h5".format( taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3], shuffleType) else: h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}_series{}.h5".format( taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3], shuffleType, args.computation_tag) # Compression parameters are described here: http://www.pytables.org/usersguide/libref/helper_classes.html#filtersclassdescr # ...and discussed thoroughly in the performance FAQs with pd.io.pytables.HDFStore(h5fn, complib="zlib", complevel=1) as store: store["df_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = df store["deltas_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = deltasForWilcoxon store["spearman_rho_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = spearman_rho store["statistics_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = statisticsDF if (args.codonw): store["profiles_spearman_rho_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = dfProfileCorrs store["wilcoxon_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = wilcoxonDf store["transition_peak_wilcoxon_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = guPeakDf store["edge_wilcoxon_%d_%d_%d_%s_%d" % (taxid, args.profile[0], args.profile[1], args.profile[2], args.profile[3])] = edgeWilcoxonDf store.flush() # ------------------------------------------------------------------------------------ # Print final report print("Got %d results" % n) print(x1) print(x2) print(x3) print("//" * 20) print(combinedData.keys()) if len(combinedData) > 1: profileId = "%d_%d_%s" % (args.profile[0], args.profile[1], args.profile[2]) plotMFEProfileForMultipleRandomizations(taxid, profileId, combinedData) return (taxid, (x1, x2, x3))
assert(rank >= 0.0 and rank <= 1.0 ) pa[row[0]] = rank # ------------------------------------------------------------------------------------ x1 = Counter() x2 = Counter() x3 = Counter() numShuffledGroups = args.num_shuffles n = 0 for taxid in args.taxid: print("Processing species %d (%s)" % (taxid, getSpeciesFileName(taxid))) nativeMeanProfile = MeanProfile( profileLength(args.profile) ) shuffledMeanProfile = MeanProfile( profileLength(args.profile) ) shuffled25Profile = MeanProfile( profileLength(args.profile) ) shuffled75Profile = MeanProfile( profileLength(args.profile) ) nativeMeanProfile_HighPAOnly = None nativeMeanProfile_MediumPAOnly = None nativeMeanProfile_LowPAOnly = None if( args.pax_db ): nativeMeanProfile_HighPAOnly = MeanProfile( profileLength(args.profile) ) nativeMeanProfile_MediumPAOnly = MeanProfile( profileLength(args.profile) ) nativeMeanProfile_LowPAOnly = MeanProfile( profileLength(args.profile) )