Ejemplos de Corr en Python, ejemplos de thinkstats2.Corr en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: brfss_corr.py Proyecto: wu12345/ThinkStats2

def ComputeCorrelations(heights, weights):
    """Report correlation measures and a least squares fit.

    Prints Pearson's correlation of heights against weights and against
    log(weights), Spearman's correlation, and the intercept/slope of the
    least squares fit of log(weights) on heights.  Several statistics are
    asserted against hard-coded reference values, and sqrt(R^2) of the fit
    is asserted to agree with the log-weight Pearson correlation.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    logged = np.log(weights)
    rho_log = thinkstats2.Corr(heights, logged)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    # fit log(weights) as a linear function of heights
    intercept, gradient = thinkstats2.LeastSquares(heights, logged)
    print('Least squares inter, slope (log weights):', intercept, gradient)

    residuals = thinkstats2.Residuals(heights, logged, intercept, gradient)
    r_squared = thinkstats2.CoefDetermination(logged, residuals)
    r_value = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', r_value)

    # consistency check between the fit and the correlation
    assert almostEquals(r_value, rho_log)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: thinkstats2_test.py Proyecto: Patsonstats/ThinkStats2-1

    def testCov(self):
        """Check Cov, Corr and SpearmanCorr against known values."""
        values = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
        as_array = np.array(values)
        other = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]
        check = self.assertAlmostEqual

        # covariance of a sequence with itself and with its negation
        check(thinkstats2.Cov(values, as_array), 8.25)
        check(thinkstats2.Cov(values, -as_array), -8.25)

        # Pearson correlation
        check(thinkstats2.Corr(values, as_array), 1)
        check(thinkstats2.Corr(values, -as_array), -1)
        check(thinkstats2.Corr(values, other), -0.1878787878)

        # Spearman rank correlation
        check(thinkstats2.SpearmanCorr(values, -as_array), -1)
        check(thinkstats2.SpearmanCorr(values, other), -0.1878787878)

Ejemplo n.º 3

0

Mostrar archivo

    def TestStatistic(self, data):
        """Return the absolute value of the correlation of the two samples.

        data: tuple of xs and ys
        """
        xs, ys = data
        return abs(thinkstats2.Corr(xs, ys))

Ejemplo n.º 4

0

Mostrar archivo

Archivo: scatter.py Proyecto: avinashalapati09/dsc530

def Correlations(df):
    """Print covariance and correlation statistics for htm3 vs. wtkg2.

    Shows the pandas results next to the thinkstats2 results, followed by
    the Spearman and Pearson correlations against log(wtkg2).

    df: object with htm3 and wtkg2 columns supporting .cov and .corr
    """
    heights, weights = df.htm3, df.wtkg2

    pandas_cov = heights.cov(weights)
    print('pandas cov', pandas_cov)
    ts_cov = thinkstats2.Cov(heights, weights)
    print('thinkstats2 Cov', ts_cov)
    print()

    pandas_corr = heights.corr(weights)
    print('pandas corr', pandas_corr)
    ts_corr = thinkstats2.Corr(heights, weights)
    print('thinkstats2 Corr', ts_corr)
    print()

    pandas_spearman = heights.corr(weights, method='spearman')
    print('pandas corr spearman', pandas_spearman)
    ts_spearman = thinkstats2.SpearmanCorr(heights, weights)
    print('thinkstats2 SpearmanCorr', ts_spearman)
    ts_spearman_log = thinkstats2.SpearmanCorr(heights, np.log(weights))
    print('thinkstats2 SpearmanCorr log wtkg3', ts_spearman_log)
    print()

    ts_corr_log = thinkstats2.Corr(heights, np.log(weights))
    print('thinkstats2 Corr log wtkg3', ts_corr_log)
    print()

Ejemplo n.º 5

0

Mostrar archivo

Archivo: Week7_chap07sol_7.1_rkarna.py Proyecto: rkarna/ThinkStats2

def main(script):
    """Print age/birth-weight correlations and save a scatter plot.

    script: not used in the body
    """
    thinkstats2.RandomSeed(17)

    # only the first frame returned by MakeFrames is used below
    live, _firsts, _others = first.MakeFrames()
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages, weights = live.agepreg, live.totalwgt_lb

    pearson = thinkstats2.Corr(ages, weights)
    print('thinkstats2 Corr', pearson)
    spearman = thinkstats2.SpearmanCorr(ages, weights)
    print('thinkstats2 SpearmanCorr', spearman)

    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])

Ejemplo n.º 6

0

Mostrar archivo

Archivo: correlate1.py Proyecto: wu12345/ThinkStats2

def PValue(xs, ys, n=10):
    """Return the fraction of simulated correlations at least as extreme
    (in absolute value) as the correlation of xs and ys.

    xs: sequence
    ys: sequence
    n: number of calls to SimulateNull
    """
    actual = thinkstats2.Corr(xs, ys)

    # SimulateNull receives list copies, never the caller's own sequences
    shuffled_xs, shuffled_ys = list(xs), list(ys)

    simulated = [SimulateNull(shuffled_xs, shuffled_ys) for _ in range(n)]

    extreme = sum(1 for corr in simulated if abs(corr) >= abs(actual))
    return extreme / float(n)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: correlate1.py Proyecto: wu12345/ThinkStats2

def main(name, data_dir='.'):
    """Plot the data, then print correlations, null simulations and a p-value.

    name: not used in the body
    data_dir: passed to ReadData to locate the data
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function, so this no longer
    # fails to parse on Python 3.
    print('Pearson %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    # each simulation gets fresh list copies of the data
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))

Ejemplo n.º 8

0

Mostrar archivo

def main():
    """Generate correlated data, print summary statistics and plot it."""
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform of xs; with a=1.0 and b=0.0 the values are
    # numerically unchanged
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('covariance %s' % (thinkstats2.Cov(xs, ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman corr %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()

Ejemplo n.º 9

0

Mostrar archivo

Archivo: agemodel.py Proyecto: wu12345/ThinkStats2

def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept and
    slope, and the coefficient of determination, followed by a blank line.

    ages: sequence
    weights: sequence

    returns: tuple of (inter, slope, R2)
    """
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.

    # compute the correlation between age and weight
    print('Pearson correlation %s' % (thinkstats2.Corr(ages, weights),))
    print('Spearman correlation %s' %
          (thinkstats2.SpearmanCorr(ages, weights),))

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope): %s %s' % (inter, slope))

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2 %s' % (R2,))
    # blank separator line (print('') rather than print() so Python 2
    # does not print an empty tuple)
    print('')
    return inter, slope, R2

Ejemplo n.º 10

0

Mostrar archivo

def main():
    """Generate data with SatIqData, print fit statistics and plot the fit."""
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter %s' % (inter,))
    print('slope %s' % (slope,))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 %s' % (R2,))

    # fitted line drawn under the scatter of the data
    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()

Ejemplo n.º 11

0

Mostrar archivo

def ComputeAirlineArrivalDelayCorrelations(flights):
    """Print covariance and correlations of AIRLINE_CODE vs. ARRIVAL_DELAY.

    Shows pandas and thinkstats2 results side by side for covariance,
    Pearson correlation and Spearman correlation.

    flights: object with dropna plus AIRLINE_CODE and ARRIVAL_DELAY columns
    """
    # NOTE(review): rows are filtered on 'AIRLINE' but the statistics read
    # AIRLINE_CODE -- confirm the two columns are missing together.
    flights = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])
    codes, delays = flights.AIRLINE_CODE, flights.ARRIVAL_DELAY

    pandas_cov = codes.cov(delays)
    print('pandas cov', pandas_cov)
    ts_cov = thinkstats2.Cov(codes, delays)
    print('thinkstats2 Cov', ts_cov)
    print()

    pandas_pearson = codes.corr(delays)
    print('pandas corr Pearson', pandas_pearson)
    ts_pearson = thinkstats2.Corr(codes, delays)
    print('thinkstats2 Corr Pearson', ts_pearson)
    print()

    pandas_spearman = codes.corr(delays, method='spearman')
    print('pandas corr spearman', pandas_spearman)
    ts_spearman = thinkstats2.SpearmanCorr(codes, delays)
    print('thinkstats2 SpearmanCorr', ts_spearman)
    print()

Ejemplo n.º 12

0

Mostrar archivo

def SerialCorr(series, lag=1):
    """Return the correlation of a series with itself shifted by lag.

    The first lag entries are dropped from both the series and its
    shifted counterpart before correlating.

    series: object supporting slicing and .shift
    lag: offset passed to shift and used as the slice start
    """
    current = series[lag:]
    lagged = series.shift(lag)[lag:]
    return thinkstats2.Corr(current, lagged)

Ejemplo n.º 13

0

Mostrar archivo

 def TestStatistic(self, data):
     """Return the absolute correlation of the (xs, ys) pair in data."""
     first_sample, second_sample = data
     return abs(thinkstats2.Corr(first_sample, second_sample))

Ejemplo n.º 14

0

Mostrar archivo

Archivo: analysis.py Proyecto: pastormt/Album-Reviews-Analysis

 def TestStatistic(self, data):
     """Return the (signed) correlation of the (xs, ys) pair in data."""
     first_sample, second_sample = data
     return ts2.Corr(first_sample, second_sample)

Ejemplo n.º 15

0

Mostrar archivo

import thinkstats2
import thinkplot
import first
import numpy as np

# Load the frames and keep only rows with both age and birth weight present.
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

# Pearson and Spearman correlation of mother's age with birth weight.
rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
print('Pearson\'s Correlation, Mother\'s age and Birth weight: ', rho)
print('Spearman\'s Rank Correlation, Mother\'s age and Birth weight: ', rho_s)

# Scatter plot saved as age_weight_scatter.png, with the legend disabled.
thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
#thinkplot.Show(xlabel = 'Mother\'s age', ylabel = 'Birth weight')
# The x label previously read "Mothers's age"; corrected to match the
# wording used everywhere else in this script.
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel='Mother\'s age',
                     ylabel='Birth weight')

# Bin rows by age in steps of 2.5 and plot birth weight percentiles per bin.
thinkplot.LEGEND = True
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)
ages = [group.agepreg.mean() for i, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)

Ejemplo n.º 16

0

Mostrar archivo

def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Fits birth weight against mother's age by least squares, prints
    summary statistics of the fit, and saves three figures with roots
    'linear3', 'linear5' and 'linear4'.

    live: DataFrame
    """
    ages = live.agepreg
    weights = live.totalwgt_lb
    # least squares fit on the full data and its residuals
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    r2 = thinkstats2.CoefDetermination(weights, res)

    # print the correlation next to R^2 and its square root
    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(res))

    # plot the 90% and 50% confidence intervals (saved as 'linear3')
    inters, slopes = SamplingDistributions(live, iters=1001)
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=90,
                            alpha=0.3,
                            label='90% CI')
    thinkplot.Text(42, 7.53, '90%')
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=50,
                            alpha=0.5,
                            label='50% CI')
    thinkplot.Text(42, 7.59, '50%')

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # plot the data with two sets of confidence intervals, one of them
    # computed with the residuals passed in (saved as 'linear5')
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)
    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # plot the sampling distribution of slope under null hypothesis
    # and alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    # presumably the sampling CDF evaluated at slope 0 -- TODO confirm
    # against thinkstats2.Cdf indexing
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((ages, weights))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    # compare each fitted parameter with the mean of its sampled values
    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    # vertical line at slope 0, then the two CDFs (saved as 'linear4')
    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')

Ejemplo n.º 17

0

Mostrar archivo

# Group rows by the precomputed bin indices.
groups = live_ss.groupby(indices)

# Per-group mean age and per-group CDF of birth weight.
age_means = [frame.agepreg.mean() for _, frame in groups]
wgt_cdfs = [thinkstats2.Cdf(frame.totalwgt_lb) for _, frame in groups]

# One line per percentile, plotted against the group mean ages.
percentiles = [75, 50, 25]
thinkplot.PrePlot(len(percentiles))
for percent in percentiles:
    wgt_percentile = [cdf.Percentile(percent) for cdf in wgt_cdfs]
    label = "%dth" % percent
    thinkplot.Plot(age_means, wgt_percentile, label=label)
thinkplot.Config(
    xlabel="Mother age (years)",
    ylabel="Birth weight (lbs)",
    legend=True,
)

# Pearson and Spearman correlation of age with birth weight.
p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb)
s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb)
print("Pearson's Correlation:", p_corr)
print("Spearman's Correlation:", s_corr)


#--- Chapter8 Ex2
def SimulateSample(lam=2, n=10, iters=1000):
    """Return one estimate per simulated sample: 1 / (sample mean).

    Each of the iters samples holds n draws from
    np.random.exponential with scale 1.0 / lam.

    lam: value whose reciprocal is used as the exponential scale
    n: number of draws per sample
    iters: number of samples to simulate

    returns: list of estimates, one per sample
    """
    return [1 / np.mean(np.random.exponential(1.0 / lam, n))
            for _ in np.arange(iters)]

Ejemplo n.º 18

0

Mostrar archivo

def CorrelationPlots(df,
                     xlabel,
                     ylabel,
                     xjitter=0,
                     yjitter=0,
                     axis=None,
                     nbins=5,
                     **options):
    """Draw a 2x2 grid of plots relating two columns and print correlations.

    The four subplots are a scatter plot, a hexbin plot, percentile lines
    of the y column per x bin, and CDFs of the y column per x bin.

    df: DataFrame-like object supporting dropna, column lookup and groupby
    xlabel: name of the x column, also used as the x axis label
    ylabel: name of the y column, also used as the y axis label
    xjitter: amount passed to thinkstats2.Jitter for the x values
    yjitter: amount passed to thinkstats2.Jitter for the y values
    axis: axis limits; defaults to the min/max of the jittered values
    nbins: used as the step of np.arange for the percentile bins and in
           the step computation for the CDF bins
    options: accepted but not used in the body
    """

    # drop rows missing either column
    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = cleaned[xlabel]
    ys = cleaned[ylabel]

    # from here on xs and ys are the outputs of thinkstats2.Jitter
    xs = thinkstats2.Jitter(xs, xjitter)
    ys = thinkstats2.Jitter(ys, yjitter)

    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)
    if axis is None:
        axis = [xmin, xmax, ymin, ymax]

    PrePlot(num=4, rows=2, cols=2)

    # make scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # make HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # plot percentiles
    SubPlot(3)

    # bin range is bounded by the 1st and 99th percentiles of xs
    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)

    # NOTE(review): nbins is the arange step here, not a bin count --
    # confirm this is intended
    bins = np.arange(lower, upper, nbins)
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        label = '%dth' % percent
        Plot(mean_xs, y_percentiles, label=label)

    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # plot CDFs
    # NOTE(review): the divisor is zero when nbins == 2 -- confirm callers
    # never pass that value
    n = (upper - lower) // (nbins - 2)
    bins = np.arange(lower, upper, n)
    # this binning uses the column from cleaned, not the Jitter output
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    # plot the cdfs, labelling the first and last groups as open-ended
    SubPlot(4)
    PrePlot(len(cdfs))
    for i, cdf in enumerate(cdfs):
        if i == 0:
            label = '<%d ' % bins[0] + xlabel
        elif i == len(cdfs) - 1:
            label = '>%d ' % bins[-1] + xlabel
        else:
            label = '%d - %d ' % (bins[i - 1], bins[i]) + xlabel
        Cdf(cdf, label=label)
        Config(xlabel=ylabel, ylabel='CDF', legend=True)

    # print statistics (computed on the Jitter outputs)
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n',
          thinkstats2.SpearmanCorr(xs, ys))

Ejemplo n.º 19

0

Mostrar archivo

Archivo: 10_HO1.py Proyecto: fullern1/previouscode

    return greq, less


def SplitFrames(df):
    """Return the agepreg and totalwgt_lb columns of the complete rows.

    Rows missing either column are dropped first.

    df: DataFrame with agepreg and totalwgt_lb columns

    returns: tuple of (age column, weight column)
    """
    complete = df.dropna(subset=['agepreg', 'totalwgt_lb'])
    return complete.agepreg, complete.totalwgt_lb


def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of weight against age within the given limits.

    age: x values
    wgt: y values
    xmin, xmax: x axis limits
    ymin, ymax: y axis limits
    """
    thinkplot.Scatter(age, wgt, alpha=1.0)

    # same keyword arguments, in the same order, gathered in one place
    settings = {
        'xlabel': 'Age (Years)',
        'ylabel': 'Birth Weight (lbs)',
        'xlim': [xmin, xmax],
        'ylim': [ymin, ymax],
        'legend': False,
    }
    thinkplot.Config(**settings)
    thinkplot.Show()


# Split both frames into age/weight columns and show a scatter plot of each.
greq, less = MakeFrames()
greqage, greqwgt = SplitFrames(greq)
lessage, lesswgt = SplitFrames(less)
PlotScatter(greqage, greqwgt, 30, 50, 0, 14)
PlotScatter(lessage, lesswgt, 5, 30, 0, 14)

# Single-argument print(...) emits the same text under the Python 2 print
# statement and the Python 3 print function, so this parses on both.
print("Greq 30 Pearson's corr: %s" % (thinkstats2.Corr(greqage, greqwgt),))
print("Greq 30 Spearman corr: %s" %
      (thinkstats2.SpearmanCorr(greqage, greqwgt),))
print("Less 30 Pearson's corr: %s" % (thinkstats2.Corr(lessage, lesswgt),))
print("Less 30 Spearman corr: %s" %
      (thinkstats2.SpearmanCorr(lessage, lesswgt),))

Ejemplo n.º 20

0

Mostrar archivo

Archivo: DSC530_Paulovici_Exercise_8_2.py Proyecto: kevinpau/Bellevue_University_DSC_530

# Group rows by the precomputed bin indices.
groups = data.groupby(indices)

# Per-group mean height and residual CDF; the first and last groups are
# dropped by the [1:-1] slices.
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# plot the percentiles
for p in [75, 50, 25]:
    ys = [cdf.Percentile(p) for cdf in cdfs]
    label = str(p) + 'th'
    thinkplot.Plot(means, ys, label=label)

thinkplot.Config(xlabel='height (cm)', ylabel='residual weight (kg)')

#%%
# calculate correlation and coefficient of determination
rho = thinkstats2.Corr(heights, logWeight)
r2 = thinkstats2.CoefDetermination(logWeight, res)

# check if R^2 = rho^2
print("Correlation: {:.3f}".format(rho))
print("Coefficient of determination: {:.3f}".format(r2))

# the printed difference now matches its label (it was rho^2 - R^2 before)
print("R^2 - rho^2: {:.3f}".format(r2 - rho**2))

#%%
# calc standard deviation (RMSE) of prediction w/o height
std_ys = thinkstats2.Std(logWeight)
print("Standard deviation w/o height: {:.3f}".format(std_ys))

#%%
# calc standard deviation (RMSE) of prediction w/ height

Ejemplo n.º 21

0

Mostrar archivo

Archivo: correlate1.py Proyecto: wu12345/ThinkStats2

def SimulateNull(xs, ys):
    """Shuffle both sequences in place and return their correlation.

    xs: mutable sequence, shuffled in place
    ys: mutable sequence, shuffled in place
    """
    for values in (xs, ys):
        random.shuffle(values)
    return thinkstats2.Corr(xs, ys)

Ejemplo n.º 22

0

Mostrar archivo

# CDF of Age
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# plot normal distribution: a line with intercept mean and slope std
# as the model, then the normal probability values of Age
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# scatter plots and correlation
# year vs. age
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# the correlation was previously computed and discarded; print it so the
# result is visible when this runs as a script
print(thinkstats2.Corr(df.Year, df.Age))
# drug vs. age
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# testing a difference in gender
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')

Ejemplo n.º 23

0

Mostrar archivo

Archivo: ch10.py Proyecto: smithb16/ThinkStats2

    # bin the data
    bins = np.arange(120, 200, 6)
    indices = np.digitize(df.htm3, bins)
    groups = df.groupby(indices)

    # make cdfs
    height_means = [group.htm3.mean() for _, group in groups][1:-1]
    cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]

    # make plot of percentiles
    PlotPercentileLines(height_means, cdfs,
                        xlabel='height(cm)',
                        ylabel='residual log_10 weight (log_10 kg)')

    ## calculate correlation
    rho = thinkstats2.Corr(heights, log_weights)
    print('rho:\n',rho)

    ## coefficient of determination
    res = df.residual
    r2 = CoefDetermination(log_weights, res)
    print('r2:\n',r2)

    ## confirm that R^2 = rho^2
    print('rho**2:\n',rho**2)
    print('r2:\n',r2)

    ## Std(ys)
    print('Std(log_weights):\n',Std(log_weights))
    print('Std(res):\n',Std(res))
    ratio = 1 - (Std(res) / Std(log_weights))