def getGeneNativeLFEProfiles(taxId, args):
    """Yield the native profile row of every sampled gene of species `taxId`.

    The series results stored under `args.computation_tag` are read through
    the cached reader (with `args.shuffle_type`), converted to MFE profiles
    and sampled at fixed intervals using the four elements of `args.profile`.
    For each sampled result, row 0 of its 'profile-data' entry is yielded.
    """
    # Stage 1: raw series results (both count arguments are 0, as in the
    # original call -- presumably meaning "no shuffled groups"; confirm
    # against the reader's signature).
    seriesResults = readSeriesResultsForSpecies_cached(
        (args.computation_tag,), taxId, 0, 0, shuffleType=args.shuffle_type)

    # Stage 2: conversion to MFE profiles
    mfeProfiles = convertResultsToMFEProfiles(seriesResults, 0)

    # Stage 3: fixed-interval sampling; args.profile is passed in the order
    # [3], [0], [1], [2]
    sampledResults = sampleProfilesFixedIntervals(
        mfeProfiles,
        args.profile[3],
        args.profile[0],
        args.profile[1],
        args.profile[2])

    for sampledResult in sampledResults:
        yield sampledResult['profile-data'][0]
コード例 #2
0
def countShuffledProfiles(taxId, profile, computationTag, shuffleType):
    """Count the shuffled sequences contributing to the profile of `taxId`.

    Every sampled result's shuffled rows (all rows of 'profile-data' except
    row 0) are added to a MeanProfile accumulator. The accumulator's counts
    are printed, and the count at the first profile position divided by
    `numShuffledGroups` is returned together with the taxId.

    NOTE: `numShuffledGroups` is not a parameter; it is read from the
    enclosing (presumably module-level) scope.

    Returns:
        Tuple `(taxId, numShuffledSeqs)`.
    """
    accumulator = MeanProfile(profileLength(profile))

    seriesResults = readSeriesResultsForSpeciesWithSequence(
        (computationTag, ),
        taxId,
        numShuffledGroups,
        numShuffledGroups,
        shuffleType=shuffleType)
    mfeProfiles = convertResultsToMFEProfiles(seriesResults, numShuffledGroups)
    sampledResults = sampleProfilesFixedIntervals(
        mfeProfiles, profile[3], profile[0], profile[1], profile[2])

    for sampledResult in sampledResults:
        # Row 0 is skipped; only the remaining (shuffled) rows are accumulated
        accumulator.add(sampledResult["profile-data"][1:])

    print(accumulator.counts())

    firstPositionCount = accumulator.counts()[0]
    return (taxId, firstPositionCount / numShuffledGroups)
コード例 #3
0
ファイル: test_2.py プロジェクト: michaelpeeri/rnafold-public
from mysql_rnafold import Sources
from process_series_data import readSeriesResultsForSpeciesWithSequence, convertResultsToMFEProfiles, sampleProfilesFixedIntervals, profileLength, profileElements, MeanProfile, calcSampledGCcontent


# Print nucleotides 150..600 of the CDS attached to every sampled result of
# species 85962 (series RNAfoldEnergy_SlidingWindow40_v2; 20 is passed for
# both count arguments of the reader and to the profile conversion).
seriesResults = readSeriesResultsForSpeciesWithSequence(
    (Sources.RNAfoldEnergy_SlidingWindow40_v2,), 85962, 20, 20)
mfeProfiles = convertResultsToMFEProfiles(seriesResults, 20)

for result in sampleProfilesFixedIntervals(mfeProfiles, 150, 600, 10):
    fullCDS = result["cds-seq"]
    seq = fullCDS[150:600]
    print(seq)

コード例 #4
0
    def performPlots(self):
        """Compute, plot and store the LFE profile statistics of one species.

        For every shuffle type in ``self._args.shuffle_types`` the method
        iterates over the sampled per-CDS profiles of ``self._taxId`` and:

        * accumulates mean native, shuffled, 25th/75th percentile and GC
          profiles;
        * collects native-minus-shuffled deltas and runs Wilcoxon signed-rank
          tests per gene, per position, relative to the "transition peak"
          position and relative to the profile edge;
        * optionally (``args.codonw``) merges gene-level codonw results and a
          codonw profile into the data;
        * produces the plots and writes the resulting tables to an HDF5 file
          and a Spearman correlation CSV in the current directory.

        ``args.profile`` is used positionally: element 3 as the start, element
        0 as the end and element 1 as the step of the sampled region; element
        2 is forwarded as the last sampling argument and formatted as a
        string in file names.

        Returns:
            ``(taxid, (x1, x2, x3))`` where the three Counters hold the
            nucleotides found at CDS positions 0, 1 and 2 for the last
            shuffle type processed.

        Raises:
            Exception: if fewer than 100 CDSs were processed for a shuffle
                type.
        """
        args = self._args
        taxid = self._taxId

        # ------------------------------------------------------------------------------------

        numShuffledGroups = args.num_shuffles
        shuffleTypes = args.shuffle_types
        print("*********** {} ***********".format(shuffleTypes))

        combinedData = {}

        for shuffleType in shuffleTypes:
            n = 0

            # Nucleotide frequencies at the first three CDS positions
            x1 = Counter()
            x2 = Counter()
            x3 = Counter()

            print("Processing species %d (%s), shuffleType=%d" %
                  (taxid, getSpeciesFileName(taxid), shuffleType))

            nativeMeanProfile = MeanProfile(profileLength(args.profile))
            shuffledMeanProfile = MeanProfile(profileLength(args.profile))

            shuffled25Profile = MeanProfile(profileLength(args.profile))
            shuffled75Profile = MeanProfile(profileLength(args.profile))

            xRange = profileElements(args.profile)

            # Profiles stratified by PA value. They are accumulated below but
            # not read again by the live code in this method.
            nativeMeanProfile_HighPAOnly = None
            nativeMeanProfile_MediumPAOnly = None
            nativeMeanProfile_LowPAOnly = None
            if (args.pax_db):
                nativeMeanProfile_HighPAOnly = MeanProfile(
                    profileLength(args.profile))
                nativeMeanProfile_MediumPAOnly = MeanProfile(
                    profileLength(args.profile))
                nativeMeanProfile_LowPAOnly = MeanProfile(
                    profileLength(args.profile))

            meanProfile_HighExtPropOnly = None
            meanProfile_LowExtPropOnly = None
            if (args.external_property):
                meanProfile_HighExtPropOnly = MeanProfile(
                    profileLength(args.profile))
                meanProfile_LowExtPropOnly = MeanProfile(
                    profileLength(args.profile))

            GCProfile = MeanProfile(profileLength(args.profile))

            deltasForWilcoxon = pd.DataFrame({
                'pos': pd.Series(dtype='int'),
                'delta': pd.Series(dtype='float')
            })

            # One (shuffles x positions) matrix of deltas per processed CDS
            fullDeltas = []

            geneLevelScatter = pd.DataFrame({
                'gc':
                pd.Series(dtype='float'),
                'logpval':
                pd.Series(dtype='float'),
                'abslogpval':
                pd.Series(dtype='float'),
                'protid':
                pd.Series(dtype='string')
            })

            cdsLengths = []

            fullSeqs = []
            dfCodonw = None

            # ------------------------------------
            # Process all CDS for this species
            # ------------------------------------
            for result in sampleProfilesFixedIntervals(
                    convertResultsToMFEProfiles(
                        readSeriesResultsForSpeciesWithSequence(
                            (args.computation_tag, ),
                            taxid,
                            numShuffledGroups,
                            numShuffledGroups,
                            shuffleType=shuffleType),
                        numShuffledGroups), args.profile[3], args.profile[0],
                    args.profile[1], args.profile[2]):

                fullCDS = result["cds-seq"]
                seq = fullCDS[args.profile[3]:args.profile[0]]

                # Skip CDSs with no sequence inside the sampled region
                if not seq:
                    continue

                protId = result["cds"].getProtId()

                fullSeqs.append(
                    SeqRecord(Seq(fullCDS, NucleotideAlphabet), id=protId))

                # Row 0 holds the native profile, rows 1.. the shuffled ones
                profileData = result["profile-data"]
                assert (profileData.shape[0] >= numShuffledGroups)

                # Prepare mean MFE profiles
                nativeMeanProfile.add(profileData[0, None])
                shuffledMeanProfile.add(profileData[1:])

                # Prepare GC profile
                gc = calcSampledGCcontent(seq, args.profile[1])
                if (gc.size > profileLength(
                        args.profile)):  # truncate the profile if necessary
                    gc = np.resize(gc, (profileLength(args.profile), ))
                GCProfile.add(np.expand_dims(gc, 0))

                # Prepare percentile mean profiles
                shuffled25Profile.add(
                    np.expand_dims(np.percentile(profileData[1:], 25, axis=0),
                                   0))
                shuffled75Profile.add(
                    np.expand_dims(np.percentile(profileData[1:], 75, axis=0),
                                   0))

                # Prepare data for genome-wide wilcoxon test
                newDeltas = profileData[0, 0::1] - np.mean(
                    profileData[1:, 0::1], axis=0)
                newPositions = range(args.profile[3], args.profile[0],
                                     args.profile[1])
                deltaspd = pd.DataFrame({
                    'pos':
                    pd.Series(newPositions, dtype='int'),
                    'delta':
                    pd.Series(newDeltas, dtype='float')
                })
                # pd.concat([a, b]) is what DataFrame.append(b) did; append
                # itself was removed in pandas 2.0.
                deltasForWilcoxon = pd.concat([deltasForWilcoxon, deltaspd])

                fullDeltas.append(
                    profileData[0, 0::1] - profileData[1:, 0::1]
                )  # store the 20x31 matrix of deltas for full wilcoxon test

                # Prepare data for GC vs. selection test
                meanGC = calcSampledGCcontent(seq, 10000)[0]
                if (not (meanGC >= 0.05 and meanGC <= 0.95)):
                    meanGC = None
                deltas = profileData[0, 0::1] - np.mean(profileData[1:, 0::1],
                                                        axis=0)
                pvalue = wilcoxon(deltas).pvalue
                direction = np.mean(deltas)
                directedLogPval = None

                if (pvalue > 0.0):
                    directedLogPval = log10(pvalue) * direction * -1.0
                else:
                    # log10(0) is undefined; substitute a fixed floor value
                    directedLogPval = -250.0 * direction * -1.0

                paval = None
                if (args.pax_db):
                    # NOTE(review): `pa` is not defined in this method; it is
                    # expected to come from the enclosing scope.
                    paval = pa.get(protId)
                    # Check for a missing value first: comparing None with a
                    # float raises TypeError on Python 3.
                    if paval is not None:
                        if (paval >= 0.8):
                            nativeMeanProfile_HighPAOnly.add(
                                profileData[0, None])
                        elif (paval <= 0.2):
                            nativeMeanProfile_LowPAOnly.add(
                                profileData[0, None])
                        else:
                            nativeMeanProfile_MediumPAOnly.add(
                                profileData[0, None])

                if (args.external_property):
                    # NOTE(review): `xxxxxx` is an unresolved placeholder in
                    # the original code (as are `externalProperty` and
                    # `externalPropertyMedian` from this method's point of
                    # view); this branch cannot run until the intended lookup
                    # key is supplied.
                    extPropValue = externalProperty.get(xxxxxx, None)
                    if extPropValue >= externalPropertyMedian:
                        meanProfile_HighExtPropOnly.add(profileData[0, None])
                    else:
                        meanProfile_LowExtPropOnly.add(profileData[0, None])

                cds_length_nt = len(fullCDS)
                cdsLengths.append(cds_length_nt)

                # NOTE(review): the 'abslogpval' column receives the raw
                # p-value, not a logarithm.
                geneLevelScatter = pd.concat([
                    geneLevelScatter,
                    pd.DataFrame({
                        'gc': pd.Series([meanGC]),
                        'logpval': pd.Series([directedLogPval]),
                        'abslogpval': pd.Series([pvalue]),
                        'protid': pd.Series([protId]),
                        'pa': pd.Series([paval]),
                        'cds_length_nt': pd.Series([cds_length_nt])
                    })
                ])

                x1.update((fullCDS[0], ))
                x2.update((fullCDS[1], ))
                x3.update((fullCDS[2], ))
                del fullCDS

                del result
                n += 1

            # Refuse to proceed if the data found is unreasonably small.
            # This check runs before the `del` statements below so that an
            # empty result set reports this error instead of a NameError on
            # loop variables that were never bound.
            if (n < 100):
                raise Exception(
                    "Found insufficient data to process taxid=%d (n=%d)" %
                    (taxid, n))

            del (pvalue)
            del (direction)
            del (seq)
            del (deltas)

            CUBmetricsProfile = None
            if (args.codonw):
                # The temporary FASTA file only needs to exist while codonw
                # reads it; the context manager closes and removes it.
                with NamedTemporaryFile(mode="w") as fFullSeqs:
                    SeqIO.write(
                        fullSeqs, fFullSeqs.name,
                        "fasta")  # write the full sequences into the file
                    dfCodonw = readCodonw(
                        fFullSeqs.name
                    )  # run codonw and get the gene-level results

                print('****************************************************')
                print(dfCodonw.columns)
                print(dfCodonw.head())

                print(geneLevelScatter.columns)
                print(geneLevelScatter.head())

                geneLevelScatter = pd.merge(dfCodonw,
                                            geneLevelScatter,
                                            left_index=True,
                                            right_index=False,
                                            right_on='protid')
                print(geneLevelScatter.corr())

                CUBmetricsProfile = meanCodonwProfile(
                    fullSeqs, confWindowWidth, 'begin', args.profile[3],
                    args.profile[0],
                    args.profile[1])  # TODO - use real values!
                print(CUBmetricsProfile)

            #------------------------------------------------------------------------------------------------------------------
            # Test for significance of the mean dLFE (positive or negative) at each position along the genome
            # (This will answer questions such as "how many genomes have (significantly) negative dLFE at position X?")
            #------------------------------------------------------------------------------------------------------------------
            wilcoxonDf = pd.DataFrame({
                'pos': pd.Series(dtype='int'),
                'logpval': pd.Series(dtype='float'),
                'N': pd.Series(dtype='int')
            })
            print("Processing full deltas...")

            # Perform statistical tests based on the deltas for each position (from all proteins)
            for pos in range(profileLength(args.profile)):

                # Collect all deltas for this position (data will be a list of arrays - one for each protein long enough to have deltas at this position)
                data = [x[:, pos] for x in fullDeltas if x.shape[1] > pos]
                dataar = np.concatenate(data)  # flatten all deltas
                assert (dataar.ndim == 1)

                # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical)
                wilcoxonPval = wilcoxon(dataar).pvalue  # 2-sided test
                if wilcoxonPval > 0.0:
                    logWilcoxonPval = log10(wilcoxonPval)
                else:
                    logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)

                N = dataar.shape[0]

                wilcoxonDf = pd.concat([
                    wilcoxonDf,
                    pd.DataFrame({
                        'pos': pd.Series(xRange[pos]),
                        'N': pd.Series([N]),
                        'logpval': pd.Series([logWilcoxonPval])
                    })
                ])

            del (data)
            del (dataar)

            #------------------------------------------------------------------------------------------------------------------
            # Find "transition peak"
            #------------------------------------------------------------------------------------------------------------------
            # Calculate the dLFE
            print(
                "-TransitionPeak-TransitionPeak-TransitionPeak-TransitionPeak-"
            )
            meanDeltaLFE = nativeMeanProfile.value(
            ) - shuffledMeanProfile.value()
            peakPos = np.argmin(meanDeltaLFE)
            peakVal = meanDeltaLFE[peakPos]

            guPeakDf = pd.DataFrame({
                'pos': pd.Series(dtype='int'),
                'logpval': pd.Series(dtype='float')
            })

            # NOTE(review): `peakPos * 10` converts the peak index to a
            # position assuming the profile starts at 0 with a step of 10;
            # the rows of wilcoxonDf are keyed by xRange[pos]. Confirm this
            # holds for every profile specification in use.
            if peakVal <= 0.0:
                print("{} {}".format(peakPos, peakVal))
                if not wilcoxonDf[wilcoxonDf['pos'] == peakPos * 10].empty:
                    logpval = wilcoxonDf[wilcoxonDf['pos'] == peakPos *
                                         10].logpval.loc[0]
                    print(type(logpval))
                    print(logpval)

                    # Only compare against the peak when the peak itself is
                    # significant (p < 0.01)
                    if logpval < -2.0:

                        for otherPos in range(profileLength(args.profile)):

                            # Use only proteins long enough to have deltas at
                            # both the peak and the other position, so the two
                            # samples stay paired
                            data1 = [
                                x[:, peakPos] for x in fullDeltas
                                if x.shape[1] > max(peakPos, otherPos)
                            ]
                            peakData = np.concatenate(
                                data1)  # flatten all deltas
                            assert (peakData.ndim == 1)

                            data2 = [
                                x[:, otherPos] for x in fullDeltas
                                if x.shape[1] > max(peakPos, otherPos)
                            ]
                            otherData = np.concatenate(
                                data2)  # flatten all deltas

                            assert (len(peakData) == len(otherData))
                            datax = otherData - peakData

                            print("/-: {} {} {}".format(
                                peakPos, otherPos, np.mean(datax)))

                            wilcoxonPval = None
                            if np.allclose(otherData, peakData):
                                # Identical samples: report p=1 without
                                # running the test
                                logWilcoxonPval = 0.0
                            else:
                                # Perform paired Wilcoxon signed-rank test on the two samples
                                wilcoxonPval = wilcoxon(
                                    peakData, otherData
                                ).pvalue  # 2-sided test (not ideal in this situation...)
                                if wilcoxonPval > 0.0:
                                    logWilcoxonPval = log10(wilcoxonPval)
                                elif wilcoxonPval == 0.0:
                                    logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)
                                else:
                                    # Reached only when the p-value is NaN
                                    logWilcoxonPval = None

                            if logWilcoxonPval is not None:
                                guPeakDf = pd.concat([
                                    guPeakDf,
                                    pd.DataFrame({
                                        'pos':
                                        pd.Series(xRange[otherPos]),
                                        'logpval':
                                        pd.Series([logWilcoxonPval])
                                    })
                                ])

                        print(guPeakDf)

            #------------------------------------------------------------------------------------------------------------------
            # Calculate edge-referenced wilcoxon
            #------------------------------------------------------------------------------------------------------------------

            edgePos = profileEdgeIndex(args.profile)
            # Select the edge column of every protein that has one. The bounds
            # check uses edgePos itself (the original tested a stale loop
            # variable left over from the per-position loop above).
            data0 = [
                x[:, edgePos] if x.shape[1] > edgePos else None
                for x in fullDeltas
            ]
            edgeWilcoxonDf = pd.DataFrame({
                'pos': pd.Series(dtype='int'),
                'logpval': pd.Series(dtype='float')
            })

            for pos in range(profileLength(args.profile)):
                data1 = [
                    x[:, pos] if x.shape[1] > pos else None for x in fullDeltas
                ]
                assert (len(data0) == len(data1))
                if data1[0] is not None:
                    print("]]]]]]]]]]]]] {}".format(data1[0].shape))

                # Differences between this position and the edge, for the
                # proteins that have both, with NaNs dropped
                diffs = []
                for d0, d1 in zip(data0, data1):
                    if (d0 is not None) and (d1 is not None):
                        d = d1 - d0
                        diffs.append(d[~np.isnan(d)])

                alldiffs = np.concatenate(diffs)
                print(pos)
                print(alldiffs.shape)

                wilcoxonPval = None
                if np.allclose(alldiffs, 0):
                    logWilcoxonPval = 0.0
                else:
                    # Perform 1-sample Wilcoxon signed-rank test on the deltas (testing whether the deltas are symmetrical)
                    wilcoxonPval = wilcoxon(
                        alldiffs
                    ).pvalue  # 2-sided test (not ideal in this situation...)
                    if wilcoxonPval > 0.0:
                        logWilcoxonPval = log10(wilcoxonPval)
                    elif wilcoxonPval == 0.0:
                        logWilcoxonPval = -324.0  # ~minimum value for log10(0.000.....)
                    else:
                        logWilcoxonPval = None

                if logWilcoxonPval is not None:
                    edgeWilcoxonDf = pd.concat([
                        edgeWilcoxonDf,
                        pd.DataFrame({
                            'pos': pd.Series(xRange[pos]),
                            'logpval': pd.Series([logWilcoxonPval])
                        })
                    ])
            print(edgeWilcoxonDf)

            # Count the minimum number of sequences available at any position
            minCount = 1000000
            for pos in range(profileLength(args.profile)):
                countAtPos = sum(1 for x in fullDeltas if x.shape[1] > pos)
                if countAtPos < minCount:
                    minCount = countAtPos

            #------------------------------------------------------------------------------------------------------------------
            # Store the results
            #------------------------------------------------------------------------------------------------------------------

            profileId = "%d_%d_%s_t%d" % (args.profile[0], args.profile[1],
                                          args.profile[2], shuffleType)

            df = pd.DataFrame(
                {
                    "native": nativeMeanProfile.value(),
                    "shuffled": shuffledMeanProfile.value(),
                    "gc": GCProfile.value(),
                    "position": xRange,
                    "shuffled25": shuffled25Profile.value(),
                    "shuffled75": shuffled75Profile.value()
                },
                index=xRange)
            print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
            if CUBmetricsProfile is not None:
                df = pd.merge(df,
                              CUBmetricsProfile,
                              how='left',
                              left_on='position',
                              right_on='windowStart')
                df = df.set_index('position')

            combinedData[shuffleType] = df

            dfProfileCorrs = None
            if (args.codonw):
                plotMFEProfileMultiple(taxid,
                                       profileId,
                                       df, ('GC', 'Nc', 'CAI', 'CBI', 'Fop'),
                                       scaleBar=confWindowWidth)

                # Correlate the dLFE profile with each codonw profile
                smfe = df['native'] - df['shuffled']
                spearman_gc = spearmanr(df['GC'], smfe)
                spearman_Nc = spearmanr(df['Nc'], smfe)
                spearman_CAI = spearmanr(df['CAI'], smfe)
                spearman_Fop = spearmanr(df['Fop'], smfe)
                dfProfileCorrs = pd.DataFrame(
                    {
                        "spearman_smfe_gc_rho": spearman_gc.correlation,
                        "spearman_smfe_gc_pval": spearman_gc.pvalue,
                        "spearman_smfe_Nc_rho": spearman_Nc.correlation,
                        "spearman_smfe_Nc_pval": spearman_Nc.pvalue,
                        "spearman_smfe_CAI_rho": spearman_CAI.correlation,
                        "spearman_smfe_CAI_pval": spearman_CAI.pvalue,
                        "spearman_smfe_Fop_rho": spearman_Fop.correlation,
                        "spearman_smfe_Fop_pval": spearman_Fop.pvalue
                    },
                    index=(taxid, ))

            # NOTE: the key 'media_cds_length_nt' is kept as-is because it is
            # part of the stored data format.
            statisticsDF = pd.DataFrame({
                'mean_mean_gc':
                pd.Series([np.mean(GCProfile.value())]),
                'taxid':
                pd.Series([taxid], dtype='int'),
                'cds_count':
                pd.Series([len(cdsLengths)], dtype='int'),
                'media_cds_length_nt':
                pd.Series([np.median(cdsLengths)])
            })

            plotMFEProfileWithGC(taxid, profileId, df)

            plotMFEProfileV3(taxid,
                             profileId,
                             df,
                             dLFEData=meanDeltaLFE,
                             wilcoxon=wilcoxonDf,
                             transitionPeak=guPeakDf,
                             transitionPeakPos=peakPos * 10,
                             edgeWilcoxon=edgeWilcoxonDf,
                             ProfilesCount=minCount)

            # Plot the number of genes included in each profile position
            plotXY(
                taxid, profileId,
                pd.DataFrame({"num_genes": nativeMeanProfile.counts()},
                             index=xRange), "position", "num_genes",
                "Number of genes included, per starting position")

            spearman_rho = geneLevelScatter.corr(method='spearman')
            print(spearman_rho)
            spearman_rho.to_csv('mfe_v2_spearman_%d_%s_t%d.csv' %
                                (taxid, profileId, shuffleType))

            print(statisticsDF)

            # -----------------------------------------------------------------------------
            # Save mean profiles as H5

            # Format (for compatibility with plot_xy.py and old convert_data_for_plotting.py):
            #         gc  native  position  shuffled
            # 1    0.451  -4.944         1    -5.886
            # 2    0.459  -5.137         2    -6.069
            # 3    0.473  -5.349         3    -6.262

            if (args.computation_tag ==
                    Sources.RNAfoldEnergy_SlidingWindow40_v2):
                h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}.h5".format(
                    taxid, args.profile[0], args.profile[1], args.profile[2],
                    args.profile[3], shuffleType)
            else:
                h5fn = "gcdata_v2_taxid_{}_profile_{}_{}_{}_{}_t{}_series{}.h5".format(
                    taxid, args.profile[0], args.profile[1], args.profile[2],
                    args.profile[3], shuffleType, args.computation_tag)

            # All tables of this run share the same key suffix
            keySuffix = "%d_%d_%d_%s_%d" % (taxid, args.profile[0],
                                            args.profile[1], args.profile[2],
                                            args.profile[3])

            # Compression parameters are described here:  http://www.pytables.org/usersguide/libref/helper_classes.html#filtersclassdescr
            # ...and discussed thoroughly in the performance FAQs
            with pd.io.pytables.HDFStore(h5fn, complib="zlib",
                                         complevel=1) as store:
                store["df_" + keySuffix] = df
                store["deltas_" + keySuffix] = deltasForWilcoxon
                store["spearman_rho_" + keySuffix] = spearman_rho
                store["statistics_" + keySuffix] = statisticsDF
                if (args.codonw):
                    store["profiles_spearman_rho_" +
                          keySuffix] = dfProfileCorrs
                store["wilcoxon_" + keySuffix] = wilcoxonDf
                store["transition_peak_wilcoxon_" + keySuffix] = guPeakDf
                store["edge_wilcoxon_" + keySuffix] = edgeWilcoxonDf

                store.flush()

            # ------------------------------------------------------------------------------------
            # Print final report

            print("Got %d results" % n)

            print(x1)
            print(x2)
            print(x3)

        print("//" * 20)
        print(combinedData.keys())

        # With several shuffle types, also plot them together
        if len(combinedData) > 1:
            profileId = "%d_%d_%s" % (args.profile[0], args.profile[1],
                                      args.profile[2])
            plotMFEProfileForMultipleRandomizations(taxid, profileId,
                                                    combinedData)

        return (taxid, (x1, x2, x3))
コード例 #5
0
    # NOTE(review): this is an excerpt from the interior of a larger function;
    # `args`, `taxid` and `numShuffledGroups` are bound outside the visible
    # lines.

    # Accumulator of (position, delta) rows, created empty with typed columns
    #deltasForWilcoxon = np.zeros((0,2), dtype=float)
    deltasForWilcoxon = pd.DataFrame({'pos':pd.Series(dtype='int'), 'delta':pd.Series(dtype='float')})

    # Accumulator of gene-level rows, created empty with typed columns
    geneLevelScatter = pd.DataFrame({'gc':pd.Series(dtype='float'), 'logpval':pd.Series(dtype='float'), 'abslogpval':pd.Series(dtype='float'), 'protid':pd.Series(dtype='string')})

    cdsLengths = []

    # SeqRecord objects for the full CDS of every result processed below
    fullSeqs = []
    dfCodonw = None

    # ------------------------------------
    # Process all CDS for this species
    # ------------------------------------
    # Here the computation tag is passed directly (not wrapped in a tuple) and
    # only three values of args.profile are forwarded to the sampler.
    for result in sampleProfilesFixedIntervals(
            convertResultsToMFEProfiles(
                readSeriesResultsForSpeciesWithSequence(args.computation_tag, taxid, numShuffledGroups, numShuffledGroups )
                , numShuffledGroups)
            , args.profile[3], args.profile[0], args.profile[1]):

        fullCDS = result["cds-seq"]
        # Part of the CDS between args.profile[3] and args.profile[0]
        seq = fullCDS[args.profile[3]:args.profile[0]]

        protId = result["cds"].getProtId()
        
        fullSeqs.append( SeqRecord( Seq(fullCDS, NucleotideAlphabet), id=protId) )


        profileData = result["profile-data"]
        # Sanity check: at least numShuffledGroups rows are required
        assert(profileData.shape[0] >= numShuffledGroups)

        # Prepare mean MFE profiles
コード例 #6
0
def calculate2dProfile(args):
    """Compare native and shuffled LFE profiles per window and plot the result.

    For every taxid (all species from allSpeciesSource() when
    args.all_species is set, otherwise args.taxid), the sampled profiles of
    each sequence are read and, per sequence:

    * the record is skipped (with a warning) unless it holds exactly
      args.num_shuffles + 1 profile rows (row 0 is asserted to be the
      native sequence, shuffle-id -1);
    * for every row, a Wilcoxon test is run on the differences between that
      row and all rows; the number of sequences with p < 0.05 is tallied per
      row and printed once per taxid;
    * a per-window Wilcoxon test is run on (native - shuffled) and stored as
      signed -log10(p-value);
    * the per-window mean of (control row - shuffled) is stored as the
      control profile, which is what gets passed to plot2dProfile().

    Args:
        args: namespace providing all_species, taxid, computation_tag and
            num_shuffles.

    Returns:
        Always 0.
    """
    maxLength = 300
    profileStep = 10
    # Floor division keeps this an int on Python 3 too; np.resize() rejects
    # a float dimension.
    numWindows = maxLength // profileStep
    # Row of profileData used as an arbitrary "control" in place of the native
    # row. NOTE(review): requires args.num_shuffles >= 8 -- confirm intended.
    controlRow = 8

    if args.all_species:
        taxids = list(allSpeciesSource())
    else:
        taxids = args.taxid

    for taxid in taxids:
        # One row for the native sequence plus one per shuffle.
        numSeries = args.num_shuffles + 1

        count = 0
        nativeArrayData = []
        controlArrayData = []

        # Per-row tally of sequences for which the row-vs-all test is significant
        testt = {i: 0 for i in range(numSeries)}

        for result in sampleProfilesFixedIntervals(
                convertResultsToMFEProfiles(
                    readSeriesResultsForSpecies(args.computation_tag, taxid,
                                                args.num_shuffles,
                                                args.num_shuffles),
                    args.num_shuffles),
                startPosition=0,
                endPosition=maxLength,
                interval=profileStep):
            profileData = result["profile-data"]

            # Check the sequence-id: the first result should belong to
            # shuffle-id -1 (i.e., the native sequence)
            seqId = result["content"][0]["id"].replace(":", "/")
            shuffleId = int(seqId.split("/")[3])
            assert shuffleId == -1

            expectedProfileLength = min(
                len(result["content"][0]["MFE-profile"]) // profileStep,
                numWindows)
            profileLength = profileData.shape[1]
            assert abs(profileLength - expectedProfileLength) <= 1

            # We require one vector per shuffled sequence, plus one for the
            # native sequence
            if profileData.shape[0] != numSeries:
                print("Warning: ignoring record '%s' containing %d records" %
                      (seqId, profileData.shape[0]))
                continue

            # NativeLFE - ShuffledLFE (for each of the shuffles, and for each
            # window)
            nativeDiffs = profileData[0, ] - profileData[1:, ]
            # Same, but using the control row instead of the native row
            controlDiffs = profileData[controlRow, ] - profileData[1:, ]
            assert nativeDiffs.shape[0] == args.num_shuffles
            assert controlDiffs.shape[0] == args.num_shuffles

            # Test every row (native and shuffles alike) against all rows
            for i in range(numSeries):
                deltas = profileData[i, ] - profileData
                _, pval = wilcoxon(deltas.ravel())
                if pval < 0.05:
                    testt[i] += 1

            # TODO - prove this is equivalent to checking the sign of the sum
            # of ranks
            direction = np.sign(np.apply_along_axis(np.mean, 0, nativeDiffs))

            # The wilcoxon test is performed separately for each window (with
            # N=args.num_shuffles, typically = 20).
            # Note that Nr <= N (because ties are ignored when using the
            # default settings), and Nr should be at least 10 or 20 for the
            # distribution to approach normal.
            # See:
            # Explanation of Python impl. (using T statistic):  https://stackoverflow.com/a/18966286
            # Wilcoxon signed-rank test tutorial:               http://vassarstats.net/textbook/ch12a.html
            wilc = np.apply_along_axis(wilcoxon, 0, nativeDiffs)
            assert wilc.shape == (2, profileLength)

            # The control uses the plain per-window mean difference (not a
            # wilcoxon test), so this is a 1-D vector of length profileLength.
            controlMeans = np.apply_along_axis(np.mean, 0, controlDiffs)

            # Test statistic T (not W) - the sum of the ranks of the
            # differences above or below zero, whichever is smaller
            assert np.all(wilc[0, ] >= 0.0)
            # P-values
            assert np.all(((wilc[1, ] >= 0.0) & (wilc[1, ] <= 1.0))
                          | np.isnan(wilc[1, ]))

            # NOTE(review): np.resize() fills the (2, numWindows) target by
            # repeating the input vector; it does not zero-pad short profiles.
            out = np.resize(np.log10(wilc[1, ]) * direction * -1,
                            (2, numWindows))
            # Collected per sequence; currently only used to detect "no data"
            # below (the native plot is disabled).
            nativeArrayData.append(out)

            # Use the whole vector of means: indexing it with [1, ] would pick
            # a single scalar and fill the entire control array with it.
            controlOut = np.resize(controlMeans, (2, numWindows))
            controlArrayData.append(controlOut)

            count += 1

        if not nativeArrayData:
            print("Warning: no data found for taxid=%d" % taxid)
            continue

        controlAr = np.vstack(controlArrayData)

        print(testt)
        plot2dProfile(controlAr, taxid)

        print(count)
    return 0