def test_replace_mixed(self):
    mf = self.mixed_frame
    mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
    mf.iloc[-10:, mf.columns.get_loc('A')] = nan

    result = self.mixed_frame.replace(np.nan, -18)
    expected = self.mixed_frame.fillna(value=-18)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

    result = self.mixed_frame.replace(np.nan, -1e8)
    expected = self.mixed_frame.fillna(value=-1e8)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

    # int block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    df.replace(0, 0.5, inplace=True)
    assert_frame_equal(df, expected)

    # int block splitting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64'),
                    'C': Series([1, 2], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64'),
                          'C': Series([1, 2], dtype='int64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    # to object block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                          'B': Series([0, 1], dtype='int64')})
    result = df.replace(2, 'foo')
    assert_frame_equal(result, expected)

    expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                          'B': Series([0, 'foo'], dtype='object')})
    result = df.replace([1, 2], ['foo', 'bar'])
    assert_frame_equal(result, expected)

    # test case from
    df = DataFrame({'A': Series([3, 0], dtype='int64'),
                    'B': Series([0, 3], dtype='int64')})
    result = df.replace(3, df.mean().to_dict())
    expected = df.copy().astype('float64')
    m = df.mean()
    expected.iloc[0, 0] = m[0]
    expected.iloc[1, 1] = m[1]
    assert_frame_equal(result, expected)
def kurtosis(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statements updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    h = len(frame)
    print(h)
    t = frame.mean()
    d = frame.std()
    e = ((frame - t) / d) ** 4
    g = e.sum()
    # Bias-corrected sample excess kurtosis
    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)
    i = ((p1 / p2) * g) - (p3 / p4)
    print('kurtosis =', i)
def summarize_he(analytical_sets):
    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame(he)
    labels = list(he_df.columns)

    if len(labels) == 2:
        # use Mann-Whitney / Wilcoxon test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon(he_df[labels[0]], he_df[labels[1]])
    elif len(labels) > 2:
        # use Kruskal-Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal(*[he_df[x] for x in labels])

    results['warning'] = ''
    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()

    return results
def mydeviate(path, columns, Deviation=0, MeanAbsDeviation=1, MeanSqDev=0):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    t = frame.mean()
    if Deviation == 1:
        print(frame - t)           # deviation from the mean
    if MeanAbsDeviation == 1:
        # the original computed abs(data) - mean; the absolute deviation
        # needs abs(data - mean)
        print((frame - t).abs())
    if MeanSqDev == 1:
        print((frame - t) ** 2)    # squared deviation
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    # dict.has_key() was removed in Python 3; use the `in` operator instead.
    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]

    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
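# Usage sketch for LogAggregate (the toy log records below are purely
# illustrative, not from the original code). Numeric columns are used so
# DataFrame.mean()/median() work on recent pandas versions:
from pandas import DataFrame

logs = [{'status': 200, 'latency_ms': 120.0},
        {'status': 200, 'latency_ms': 80.0},
        {'status': 500, 'latency_ms': 40.0}]
agg = LogAggregate(logs)
print(agg.get_median(key='latency_ms'))                      # overall median
print(agg.get_average(group_by='status', key='latency_ms'))  # mean per status code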
def skewness(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statement updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    h = len(frame)
    t = frame.mean()
    d = frame.std()
    e = ((frame - t) / d) ** 3
    g = e.sum()
    # Adjusted Fisher-Pearson standardized moment coefficient
    i = (h * g) / ((h - 1) * (h - 2))
    print('skewness =', i)
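# Sanity check for the two moment functions above (a minimal sketch on
# synthetic data): both formulas apply the same bias corrections as pandas'
# built-ins, so the manual results should match DataFrame.skew() and
# DataFrame.kurt() up to floating-point error.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'x': np.random.randn(100)})
h, t, d = len(frame), frame.mean(), frame.std()
z3 = (((frame - t) / d) ** 3).sum()
z4 = (((frame - t) / d) ** 4).sum()
skew_manual = (h * z3) / ((h - 1) * (h - 2))
kurt_manual = (h * (h + 1) / ((h - 1) * (h - 2) * (h - 3))) * z4 \
    - 3 * (h - 1) ** 2 / ((h - 2) * (h - 3))
print(skew_manual - frame.skew())  # ~0
print(kurt_manual - frame.kurt())  # ~0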
def preprocessing(self, df: pd.DataFrame):
    # Features wrangling
    df_features = df.iloc[:, 5:]
    df_features = df_features.fillna(df.mean())
    df_features = np.array(df_features)
    # np.apply_along_axis returns a new array; keep the result instead of
    # discarding it (it does not reliably modify its input in place).
    df_features = np.apply_along_axis(self._normalize, 0, df_features)

    # Labels wrangling
    df_labels = np.array(df.loc[:, "Hogwarts House"])
    return df_features, df_labels
def summary_statistics(data_set: pd.DataFrame) -> pd.DataFrame:
    summary_data = dict()
    summary_data['mean'] = data_set.mean(numeric_only=True)
    summary_data['std'] = data_set.std(ddof=1, numeric_only=True)
    summary_data['min'] = data_set.min(numeric_only=True)
    summary_data['max'] = data_set.max(numeric_only=True)
    return pd.DataFrame(summary_data).T
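# Minimal usage sketch for summary_statistics (synthetic data, assuming only
# the pandas import above):
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10, 20, 30], 'c': list('xyz')})
print(summary_statistics(toy))
# Rows are mean/std/min/max; the non-numeric column 'c' is excluded by
# numeric_only=True.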
class CalWeight:
    def __init__(self, step, risk_aversion):
        self.risk_aversion = risk_aversion
        if step == 0:
            self.start = '2012-01-01'
            self.end = '2014-12-31'
        elif step == 1:
            self.start = '2012-01-01'
            self.end = '2015-2-28'
        elif step == 2:
            self.start = '2012-01-01'
            self.end = '2015-4-30'
        secIDs = ['000300.ZICN', '000905.ZICN', '399006.ZICN',
                  'SPX.ZIUS', '000012.ZICN', '000013.ZICN']
        self.rtn_table = DataFrame()
        for secID in secIDs:
            cp = self.get_return(secID)
            cp.name = secID
            self.rtn_table = pd.concat([self.rtn_table, cp], axis=1)
        self.rtn_table.fillna(0, inplace=True)
        self.cov_mat = self.rtn_table.cov() * 250
        self.exp_rtn = self.rtn_table.mean() * 250

    def get_return(self, ticker):
        tmp_lst = []
        fname = PERFIX + 'data_' + ticker + '.csv'
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                tmp_lst.append(row)
        df = pd.DataFrame(tmp_lst[1:], columns=tmp_lst[0])
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index("Date")
        df = df[self.start:self.end]
        temp = df['Close'].astype('float64').pct_change().fillna(0.)
        return temp

    def get_weight(self):
        risk_aversion = self.risk_aversion
        P = risk_aversion * matrix(self.cov_mat.values)
        q = -1 * matrix(self.exp_rtn.values)
        G = matrix(np.vstack((np.diag(np.ones(len(self.exp_rtn))),
                              np.diag(-np.ones(len(self.exp_rtn))))))
        h = matrix(np.array([np.ones(len(self.exp_rtn)),
                             np.zeros(len(self.exp_rtn))]).reshape(len(self.exp_rtn) * 2, 1))
        A = matrix(np.ones(len(self.exp_rtn)), (1, len(self.exp_rtn)))
        b = matrix([1.0])
        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        return DataFrame(index=self.exp_rtn.index,
                         data=np.round(sol['x'], 2),
                         columns=['weight'])  # weights rounded to two decimal places
def _plot_stats_attribute(stats_list: Sequence[Stats], attribute: str, label, ax=None):
    """Plot a certain attribute of a collection of histories."""
    data = np.asarray([getattr(h, attribute) for h in stats_list])
    df = DataFrame(data.T)
    df_mean = df.mean(axis=1)
    df_std = df.std(axis=1)
    sns_ax = sns.lineplot(df_mean.index, df_mean, label=label, ax=ax)
    sns_ax.fill_between(df_mean.index, df_mean - df_std, df_mean + df_std,
                        alpha=0.3)
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If None, a new figure is made.
    """
    # Remember whether we created the figure ourselves; `ax` is reassigned
    # below, so the original `ax is None` test cannot be repeated at the end.
    own_figure = ax is None
    if own_figure:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n', fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()

    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std, y=fake_std, ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if own_figure:
        plt.show()
def variation_statistic(gene_data: pd.DataFrame) -> pd.Series:
    """
    Calculate std/mean for each gene and replace nan with 0.

    :gene_data: Expression DF with genes in rows. Calculations are performed
        for each row across features.
    :return: Series with statistic for each row
    """
    # TODO How to deal with genes with zero expression? Are they informative?
    statistic = gene_data.std(axis=1) / gene_data.mean(axis=1)
    return statistic.replace(np.nan, 0)
def fit(self, X: pd.DataFrame, y=None):
    self._reset()
    if self.with_mean:
        self.mean_ = X.mean()
    if self.with_std:
        self.scale_ = X.std(ddof=0)
    return self
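# The fit method above only learns the statistics; a matching transform step
# is the natural counterpart. A minimal sketch of such a method (the name
# `transform` and the with_mean/with_std semantics mirror scikit-learn's
# StandardScaler, which this class appears to imitate; this is an assumption,
# not the original author's code):
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    result = X
    if self.with_mean:
        result = result - self.mean_   # center using the fitted column means
    if self.with_std:
        result = result / self.scale_  # scale using the fitted column stds
    return result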
def test_mean_datetimelike_numeric_only_false(self):
    df = DataFrame(
        {
            "A": np.arange(3),
            "B": pd.date_range("2016-01-01", periods=3),
            "C": pd.timedelta_range("1D", periods=3),
        }
    )

    # datetime(tz) and timedelta work
    result = df.mean(numeric_only=False)
    expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
    tm.assert_series_equal(result, expected)

    # mean of period is not allowed
    df["D"] = pd.period_range("2016", periods=3, freq="A")

    with pytest.raises(TypeError, match="mean is not implemented for Period"):
        df.mean(numeric_only=False)
def mean_centered(self, utility_matrix: pd.DataFrame) -> pd.DataFrame:
    """
    Subtract each column's mean from the utility matrix, then compute the
    cosine similarity matrix on the centered values.

    :param utility_matrix: user-item utility matrix
    :return: cosine similarity matrix of the mean-centered utilities
    """
    mean_centered_utility_matrix = utility_matrix.sub(utility_matrix.mean())
    return self.data.similarity_matrix_cosine(mean_centered_utility_matrix)
def table(self, extractedData: DataFrame, name: str):
    filename = self.filename
    self.filename = filename + '_all.txt'
    self._table(extractedData, name)
    mean = extractedData.mean(axis=0)
    std = extractedData.std(axis=0)
    self.filename = filename + '_average.txt'
    self._table(mean, name, std)
def get_topwords(self, countries, thresh=10, tf_idf=False):
    # DataFrame.append and Series.order were removed from pandas; collect the
    # rows first and sort with sort_values instead.
    rows = []
    for r in range(len(self.df)):
        if self.df.loc[r, 'country_id'] in countries:
            if tf_idf:
                rows.append(self.tf_idf.loc[r, :])
            else:
                rows.append(self.df.loc[r, :])
    tw = DataFrame(rows)
    return tw.mean().sort_values(ascending=False)[:thresh]
def scatter_peaks_no_peaks(
    top_eco: pd.DataFrame,
    top_naked: pd.DataFrame,
    non_top_eco: pd.DataFrame,
    non_top_naked: pd.DataFrame,
    ax: plt.Axes = None,
):
    if not ax:
        _, ax = plt.subplots(figsize=(12, 12))
    ax.set_xlabel("Chromatin")
    ax.set_ylabel("Naked")
    ax.scatter(non_top_eco, non_top_naked, alpha=0.2, label="All Points")
    ax.scatter(top_eco, top_naked, label="Open ATAC")
    ax.axvline(non_top_eco.mean(), color="C0")
    ax.axvline(top_eco.mean(), color="C1")
    ax.axhline(non_top_naked.mean(), color="C0")
    ax.axhline(top_naked.mean(), color="C1")
    ax.legend(loc="upper right", frameon=False, shadow=False)
    # Concatenate the two series into a single frame so that the dropna()
    # call stays synchronized between the two columns.
    top = pd.DataFrame({"chrom": top_eco, "naked": top_naked}).dropna(axis=0)
    all_ = pd.DataFrame({"chrom": non_top_eco,
                         "naked": non_top_naked}).dropna(axis=0)
    r_top, _ = scipy.stats.pearsonr(top.loc[:, "chrom"], top.loc[:, "naked"])
    r_all, _ = scipy.stats.pearsonr(all_.loc[:, "chrom"], all_.loc[:, "naked"])
    ax.text(0.01, 0.8, f"R (top) = {r_top} \nR (rest) = {r_all}",
            transform=ax.transAxes)
    return ax
def get_sharpe_ratios(df_returns: DataFrame,
                      risk_free_rate: float = 0.0,
                      periods_per_annum: int = 252) -> Series:
    """
    Helper function to calculate the (annualized) Sharpe Ratios of the
    financial instruments contained in the input dataframe.
    """
    numer = (df_returns.mean(axis=0) - risk_free_rate) * periods_per_annum
    denom = np.sqrt(df_returns.var(axis=0) * periods_per_annum)
    return numer / denom
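# Usage sketch for get_sharpe_ratios with synthetic daily returns (the
# numbers are purely illustrative):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
rets = pd.DataFrame(rng.normal(5e-4, 0.01, size=(252, 2)), columns=['A', 'B'])
print(get_sharpe_ratios(rets))  # one annualized Sharpe ratio per column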
def pd_03():
    df = DataFrame(np.random.randn(6, 3))
    # .ix was removed from pandas; use .iloc for positional indexing.
    df.iloc[2:, 1] = np.nan
    df.iloc[4:, 2] = np.nan
    print(df)
    print(df.fillna(method='ffill'))
    print(df.fillna(method='ffill', limit=2))
    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))
    print(df.fillna(df.mean()))
def get_mean_by_bin(df: pd.DataFrame) -> pd.Series:
    """
    Takes all the sweep data from the input dataframe as returned by
    read_hackrf_sweep_file_and_merge and gets the average db for each bin.
    Returns as a pandas Series.

    :param df: pd.DataFrame from experiment in question
    :return: pd.Series of averages
    """
    return df.mean(axis=0)
def test_replace_series_dict(self):
    # from GH 3064
    df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
    result = df.replace(0, {"zero": 0.5, "one": 1.0})
    expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
    tm.assert_frame_equal(result, expected)

    result = df.replace(0, df.mean())
    tm.assert_frame_equal(result, expected)

    # series to series/dict
    df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
    s = Series({"zero": 0.0, "one": 2.0})
    result = df.replace(s, {"zero": 0.5, "one": 1.0})
    expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
    tm.assert_frame_equal(result, expected)

    result = df.replace(s, df.mean())
    tm.assert_frame_equal(result, expected)
def test_mean_datetimelike(self):
    # GH#24757 check that datetimelike are excluded by default, handled
    # correctly with numeric_only=True
    df = DataFrame({
        "A": np.arange(3),
        "B": pd.date_range("2016-01-01", periods=3),
        "C": pd.timedelta_range("1D", periods=3),
        "D": pd.period_range("2016", periods=3, freq="A"),
    })
    result = df.mean(numeric_only=True)
    expected = Series({"A": 1.0})
    tm.assert_series_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning):
        # in the future datetime columns will be included
        result = df.mean()
    expected = Series({"A": 1.0, "C": df.loc[1, "C"]})
    tm.assert_series_equal(result, expected)
def normalize_data(self, data: pd.DataFrame, idx) -> pd.DataFrame:
    """
    All values should be normalized to range(-1,1).

    :param data: The data to normalize.
    :param idx: An id for remembering normalization values in class.
    :return: Normalized DataFrame.
    """
    self.mean[idx] = data.mean()
    self.ptp[idx] = data.max() - data.min()
    return (data - self.mean[idx]) / self.ptp[idx]
def test_mean_excludes_datetimes(self, tz):
    # https://github.com/pandas-dev/pandas/issues/24752
    # Our long-term desired behavior is unclear, but the behavior in
    # 0.24.0rc1 was buggy.
    df = DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2})
    with tm.assert_produces_warning(FutureWarning):
        result = df.mean()

    expected = Series(dtype=np.float64)
    tm.assert_series_equal(result, expected)
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
    """
    Removes all rows from the given DataFrame containing outliers in any of
    the columns.

    :param df: Input DataFrame.
    :param zscore: z-score to use when calculating outliers.
    :return: The DataFrame with all outliers removed.
    """
    scores = (df - df.mean()) / df.std(ddof=0).values
    return df[(np.abs(scores) < zscore).all(axis=1)]
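# Usage sketch for remove_outliers: rows whose value in any column lies more
# than `zscore` population standard deviations from that column's mean are
# dropped. Note that with only four rows the population z-score is bounded by
# sqrt(3) ~ 1.73, so a threshold below that is needed to see the effect:
import numpy as np
import pandas as pd

toy = pd.DataFrame({'x': [1.0, 1.1, 0.9, 50.0], 'y': [2.0, 2.1, 1.9, 2.0]})
print(remove_outliers(toy, zscore=1.5))  # the row with x == 50.0 is removed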
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    # For each colour channel collect the first three moments:
    # mean, standard deviation, and skewness.
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
def __init__(self, df):
    scaler = MinMaxScaler(feature_range=(0, 100))
    # Keep the columns and, in particular, the index identical to df.
    df_scaled = DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    df_corr = df_scaled.corr()
    # DataFrame.append was removed from pandas; stack the statistic rows with
    # pd.concat instead.
    df_stat = DataFrame([df_scaled.apply(lambda x: np.sqrt(np.var(x)))],
                        index=['sd'])  # standard deviation
    df_stat = pd.concat([df_stat,
                         Series(df_scaled.mean(), name='mean').to_frame().T])  # mean
    df_stat = pd.concat([df_stat,
                         Series(df_scaled.apply(self.get_entropy), name='entropy').to_frame().T])  # entropy
    self.df_stat = pd.concat([df_stat,
                              Series(df_corr.applymap(lambda x: 1 - x).sum(),
                                     name='critic_part').to_frame().T])  # CRITIC part
    self.df = df
    self.scaler = scaler          # normalization scaler
    self.df_scaled = df_scaled    # normalized data, same layout as df
def normalize(
        data: pd.DataFrame,
        normalization_type: NormalizationType = NormalizationType.STANDARD):
    if normalization_type == NormalizationType.STANDARD:
        data_mean = data.mean(axis=0)
        data_std = data.std(axis=0)
        return (data - data_mean) / data_std, data_mean, data_std
    else:
        data_min = data.min(axis=0)
        data_max = data.max(axis=0)
        return (data - data_min) / (data_max - data_min), data_min, data_max
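# Usage sketch for normalize. NormalizationType.STANDARD z-scores each
# column; any other enum value falls through to min-max scaling (the MINMAX
# member name below is an assumption; only STANDARD appears in the original):
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
z, mu, sigma = normalize(toy, NormalizationType.STANDARD)
mm, lo, hi = normalize(toy, NormalizationType.MINMAX)  # hypothetical member
# z has zero mean / unit std per column; mm is scaled into [0, 1], with the
# column minima (lo) and maxima (hi) returned for inverting the transform.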
from typing import Tuple

def rescale(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    # The original annotation said np.ndarray, but a 3-tuple is returned.
    means = data.mean(axis=0)
    variances = data.var(axis=0)
    scaled_data = data.copy(deep=True)
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            scaled_data.iloc[row, col] = (
                (data.iloc[row, col] - means[col]) / np.sqrt(variances[col]))
    return scaled_data, means, np.sqrt(variances)
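# The element-wise double loop above is slow Python; pandas broadcasts the
# same computation column-wise in one expression. A minimal equivalent sketch
# (same statistics, same output, no loop):
import numpy as np
import pandas as pd

def rescale_vectorized(data: pd.DataFrame):
    means = data.mean(axis=0)
    stds = np.sqrt(data.var(axis=0))
    return (data - means) / stds, means, stds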
def _print_full_performance(performance: pd.DataFrame, verbosity='mean') -> None:
    """
    Prints the accuracy, precision, recall and F1 score.

    Args:
        performance: performance of the predictor as a DataFrame
        verbosity: if 'mean', prints only the mean value(s); otherwise prints
            the scores for each label
    """
    if verbosity == 'mean':
        print(performance.mean().astype(float).round(3))
    else:
        print(performance.astype(float).round(3))
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.
    """
    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(data=data[i].spread[val]['val'],
                                    index=data[i].times)
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():
        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
def cv(df: pd.DataFrame, fill_value: Optional[float] = None) -> pd.Series:
    """
    Computes the Coefficient of variation for each column.
    Used by DataContainer objects to compute metrics.
    """
    res = df.std() / df.mean()
    if fill_value is not None:
        res = res.fillna(fill_value)
    return res
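# Usage sketch for cv: the coefficient of variation is std/mean per column;
# fill_value handles columns where the ratio is undefined.
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [0.0, 0.0, 0.0]})
print(cv(toy, fill_value=0.0))  # column 'b' has 0/0 -> NaN -> filled with 0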
def avg_columns(df: pd.DataFrame) -> pd.Series:
    '''Calculates mean of all columns in DataFrame.

    Args:
        df: Data frame of values to average.

    Returns:
        Series of means indexed by columns.
    '''
    return df.mean()
def get_average_curve(input_csv: pd.DataFrame) -> pd.DataFrame:
    r"""
    Find the generalized curve to represent the class

    :param input_csv: raw class data
    :return: data points for generalized curve
    """
    average_series = input_csv.mean(axis=0)
    generalised = pd.DataFrame(average_series).transpose()
    return generalised
def normalize(self, signal: pd.DataFrame) -> pd.DataFrame:
    """Apply normalization

    Args:
        signal (pd.DataFrame): Raw signal

    Returns:
        signal (pd.DataFrame): Normalized signal
    """
    df_mean = signal.mean()
    df_std = signal.std()
    signal = (signal - df_mean) / df_std
    return signal
def select_centroids_smart(df: pd.DataFrame, k: int,
                           get_dist=get_euclidean_distances) -> np.ndarray:
    # Start from the mean point, then repeatedly add the point furthest (by
    # summed distance) from the centroids chosen so far.
    points = pd.DataFrame(df.mean(axis=0)).T
    i = 1
    while i < k:
        dists = get_dist(df, points).sum(axis=1)
        furthest = np.argmax(dists)
        next_point = pd.DataFrame(df.iloc[furthest]).T
        # DataFrame.append was removed from pandas; use pd.concat instead.
        points = pd.concat([points, next_point])
        df = drop_df(df, df.iloc[furthest])
        i += 1
    return points.values
def fillNan(matrix: pd.DataFrame, fill_type: str = 'value', value: float = 0):
    """
    :param matrix: input matrix, possibly containing NaNs
    :param fill_type: one of ['value', 'col_avg', 'row_avg']
    :param value: float used when fill_type == 'value'
    :return: (filled matrix, boolean mask of the cells that were NaN)
    """
    # The parameter was renamed from `type` to avoid shadowing the builtin.
    filled_matrix = matrix.isna()
    result_matrix = matrix.copy()
    if fill_type == 'value':
        result_matrix = matrix.fillna(value)
    elif fill_type == 'col_avg':
        col_avg = matrix.mean(axis=0)
        result_matrix = matrix.fillna(col_avg)
    elif fill_type == 'row_avg':
        # fillna broadcasts along columns, so transpose to fill by row mean.
        row_avg = matrix.mean(axis=1)
        result_matrix = matrix.T.fillna(row_avg).T
    return result_matrix, filled_matrix
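# Usage sketch for fillNan (synthetic 2x2 matrix):
import numpy as np
import pandas as pd

m = pd.DataFrame({'a': [1.0, np.nan], 'b': [3.0, 4.0]})
filled, mask = fillNan(m, fill_type='col_avg')
# filled.loc[1, 'a'] == 1.0 (the column mean of 'a');
# mask marks the single cell that was NaN.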
def _maximum_likelihood_pairs(pairings: DataFrame, ret_largest: bool = True):
    """
    Given a pairings frame, choose the most likely pairing.
    """
    pm = pairings.mean(axis=1)
    if pm.gt(0).sum() == 0:
        raise ValueError("There is no crossover between these datasets")
    elif pm.gt(0).sum() == 1 or ret_largest:
        return pm.idxmax(), pm.max()
    else:
        return pm[pm.gt(0)]
def resumo_disciplinas(dados: pd.DataFrame) -> pd.DataFrame:
    media_disciplina = dados.mean(axis=0).round(2)
    numero_acessos = dados[dados != 0].count(axis=0)
    tabela = pd.DataFrame(
        columns=media_disciplina.index,
        data=[media_disciplina.values, numero_acessos],
        index=['Média de Acesso por Caderno', 'Número de Acesso por Caderno'])
    return tabela
def test_replace_series_dict(self):
    # from GH 3064
    df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
    result = df.replace(0, {'zero': 0.5, 'one': 1.0})
    expected = DataFrame(
        {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
    assert_frame_equal(result, expected)

    result = df.replace(0, df.mean())
    assert_frame_equal(result, expected)

    # series to series/dict
    df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
    s = Series({'zero': 0.0, 'one': 2.0})
    result = df.replace(s, {'zero': 0.5, 'one': 1.0})
    expected = DataFrame(
        {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
    assert_frame_equal(result, expected)

    result = df.replace(s, df.mean())
    assert_frame_equal(result, expected)
def BackTestSignal(dfXAlpha, dfXReturn, XPrice, strategy, riskmgr=None, freq=252):
    dfAlphaWeight = strategy.GenSingleAlphaWeight(dfXAlpha)
    if riskmgr is not None:
        dfAlphaWeight = riskmgr.AdjustAlphaWeight(dfAlphaWeight)
    dfSignalReturn = GenSingleFactorReturn(dfAlphaWeight, dfXReturn)
    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    sharpe = CalcSharpeRatio(dfSignalReturn['Return'], freq)

    # Detailed data
    dfLongCount = DataFrame(columns=['LongCount'],
                            data=dfAlphaWeight.apply(lambda s: s[s > 0].count(), axis=1))
    dfShortCount = DataFrame(columns=['ShortCount'],
                             data=dfAlphaWeight.apply(lambda s: s[s < 0].count(), axis=1))
    dfLongExposure = DataFrame(columns=['LongExposure'],
                               data=dfAlphaWeight.apply(lambda s: s[s > 0].sum(), axis=1))
    dfShortExposure = DataFrame(columns=['ShortExposure'],
                                data=dfAlphaWeight.apply(lambda s: s[s < 0].sum(), axis=1))
    dfNetExposure = DataFrame(columns=['NetExposure'],
                              data=dfAlphaWeight.apply(sum, axis=1))
    dfTotalDollarInvest = DataFrame(columns=['I'],
                                    data=dfAlphaWeight.apply(lambda s: abs(s).sum(), axis=1))
    dfTotalDollarTraded = DataFrame(columns=['D'],
                                    data=(dfAlphaWeight - dfAlphaWeight.shift(1)).apply(
                                        lambda s: abs(s).sum(), axis=1))
    dfSharesTraded = dfAlphaWeight / XPrice
    dfTotalSharesTraded = DataFrame(columns=['Q'],
                                    data=(dfSharesTraded - dfSharesTraded.shift(1)).apply(
                                        lambda s: abs(s).sum(), axis=1))
    TurnOver = dfTotalDollarTraded.mean()[0] / dfTotalDollarInvest.mean()[0]
    CentsPerShare = 100 * dfSignalReturn['Return'].iloc[1:].mean() / dfTotalSharesTraded.mean()[0]

    dfMetrics = DataFrame(list(sharpe)).T
    dfMetrics.columns = ['Simple Sharpe', 'Geo. Sharpe', 'Simple Mean',
                         'Geo. Mean', 'Annual Vol']
    dfMetrics['Turnover'] = TurnOver
    dfMetrics['CentsPerShare'] = CentsPerShare
    dfMetrics['AvgHolding'] = 1.0 / TurnOver
    dfMetrics.index = [dfXAlpha.index.name]

    # Attach each per-date detail column to the signal returns frame.
    for detail in (dfLongCount, dfShortCount, dfLongExposure, dfShortExposure,
                   dfNetExposure, dfTotalDollarInvest, dfTotalDollarTraded,
                   dfTotalSharesTraded):
        dfSignalReturn = pd.merge(dfSignalReturn, detail,
                                  left_index=True, right_index=True, how='outer')

    return dfMetrics, dfSignalReturn, dfAlphaWeight
def test_align_int_fill_bug(self):
    # GH #910
    X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
    Y = np.ones((10, 1), dtype=int)

    df1 = DataFrame(X)
    df1['0.X'] = Y.squeeze()

    df2 = df1.astype(float)

    result = df1 - df1.mean()
    expected = df2 - df2.mean()
    assert_frame_equal(result, expected)
def stndize(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statements updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    print((frame - t) / z)  # z-scored data
def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                              expected_vals):
    dr = date_range(start='2015-1-17', end='2015-2-2')
    dr.name = 'date'
    tickers = ['A', 'B', 'C', 'D']
    r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
    # 17 daily prices per ticker: r**1, r**2, ..., r**17
    prices = DataFrame(index=dr, columns=tickers,
                       data=[[r1**i, r2**i, r3**i, r4**i]
                             for i in range(1, 18)])
    dr2 = date_range(start='2015-1-21', end='2015-1-29')
    # the same factor row [3, 4, 2, 1] repeated for each of the 9 dates
    factor = DataFrame(index=dr2, columns=tickers,
                       data=[[3, 4, 2, 1]] * len(dr2)).stack()
    factor.index = factor.index.set_names(['date', 'asset'])
    factor.name = 'factor'

    cmrt = common_start_returns(
        factor, prices, before, after, False, mean_by_date,
        factor if demeaned else None)
    cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
    expected = DataFrame(index=range(-before, after + 1),
                         columns=['mean', 'std'], data=expected_vals)
    assert_frame_equal(cmrt, expected)
def calc_kelly_leverages(securities: Set[str],
                         start_date: date,
                         end_date: date,
                         risk_free_rate: float = 0.04) -> Dict[str, float]:
    """Calculates the optimal leverages for the given securities and
    time frame. Returns a dict of (security, leverage) pairs with the
    calculated optimal leverages.

    Note: risk_free_rate is annualized
    """
    f = {}
    ret = {}
    excess_return = {}

    # Download the historical prices from Yahoo Finance and calculate the
    # excess return (return of security - risk free rate) for each security.
    for symbol in securities:
        try:
            hist_prices = get_historical_data(
                symbol, start=start_date, end=end_date,
                output_format='pandas')
        except IOError as e:
            raise ValueError(f'Unable to download data for {symbol}. '
                             f'Reason: {str(e)}')

        f[symbol] = hist_prices
        ret[symbol] = hist_prices['close'].pct_change()
        # risk_free_rate is annualized
        excess_return[symbol] = (ret[symbol] - (risk_free_rate / 252))

    # Create a new DataFrame based on the excess returns.
    df = DataFrame(excess_return).dropna()

    # Calculate the covariance and mean of the DataFrame
    C = 252 * df.cov()
    M = 252 * df.mean()

    # Calculate the Kelly-optimal leverages using matrix multiplication
    F = inv(C).dot(M)

    # Return a dict of (security, leverage) pairs
    return {security: leverage
            for security, leverage in zip(df.columns.values.tolist(), F)}
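# The heart of the calculation above is F = C^{-1} M: the inverse of the
# annualized covariance matrix times the annualized mean excess returns.
# A self-contained sketch with synthetic returns (no data download; the
# numbers are illustrative only):
import numpy as np
import pandas as pd
from numpy.linalg import inv

rng = np.random.default_rng(1)
excess = pd.DataFrame(rng.normal(4e-4, 0.01, size=(500, 2)), columns=['X', 'Y'])
C = 252 * excess.cov()
M = 252 * excess.mean()
F = inv(C).dot(M)  # Kelly-optimal leverage per security
print(dict(zip(excess.columns, F)))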
def test_ops(self):
    # test ops and reversed ops in evaluation
    # GH7198

    # smaller hits python, larger hits numexpr
    for n in [4, 4000]:
        df = DataFrame(1, index=range(n), columns=list('abcd'))
        df.iloc[0] = 2
        m = df.mean()

        for op_str, op, rop in [('+', '__add__', '__radd__'),
                                ('-', '__sub__', '__rsub__'),
                                ('*', '__mul__', '__rmul__'),
                                ('/', '__truediv__', '__rtruediv__')]:
            base = (DataFrame(np.tile(m.values, n)  # noqa
                              .reshape(n, -1),
                              columns=list('abcd')))
            expected = eval("base{op}df".format(op=op_str))

            # ops as strings
            result = eval("m{op}df".format(op=op_str))
            assert_frame_equal(result, expected)

            # these are commutative
            if op in ['+', '*']:
                result = getattr(df, op)(m)
                assert_frame_equal(result, expected)
            # these are not
            elif op in ['-', '/']:
                result = getattr(df, rop)(m)
                assert_frame_equal(result, expected)

    # GH7192
    df = DataFrame(dict(A=np.random.randn(25000)))
    df.iloc[0:5] = np.nan
    expected = (1 - np.isnan(df.iloc[0:25]))
    result = (1 - np.isnan(df)).iloc[0:25]
    assert_frame_equal(result, expected)
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype=float)
    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace=False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)
    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis=1)
    result['Std'] = summary.std(axis=1)
    result['Median'] = summary.median(axis=1)
    result['Max'] = summary.max(axis=1)
    result['Min'] = summary.min(axis=1)
    return (result, summary)
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called
    avg_medal_count that indicates the average number of gold, silver, and
    bronze medals earned amongst countries who earned at least one medal of
    any kind at the 2014 Sochi olympics. Note that the countries list already
    only includes countries that have earned at least one medal. No additional
    filtering is necessary.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name': countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)

    # YOUR CODE HERE
    # df['average_medal_count'] = df.mean(axis=1)
    # avg_medal_count_by_country = df[['country_name', 'average_medal_count']]
    avg_medal_count = df.mean()
    # Or, we could do it this way
    avg_medal_count = df[['gold', 'silver', 'bronze']].apply(numpy.mean)
    print(avg_medal_count)
    return avg_medal_count
class GetGenes(object):
    def __init__(self, data):
        self.dataframe = DataFrame(data)

    # Read a text file and return a data frame. Records should be separated
    # by TAB. There should not be duplicate column names.
    def import_file(self, filename):
        # convert numeric strings to float, leave everything else as-is
        def convert(x):
            try:
                x = float(x)
            except ValueError:
                pass
            return x

        table = []
        for line in open(filename):
            if line.strip():  # skip empty lines
                line = line.rstrip('\n').split('\t')
                line = list(map(convert, line))
                table.append(line)
        self.dataframe = DataFrame(table[1:], columns=table[0])
        return

    def houseKeepingGenes(self, geneNum):
        # compute the CV (coefficient of variation) of the data
        std = array(self.dataframe.std(axis=1))
        mean = array(self.dataframe.mean(axis=1))
        CV = std / mean
        CV = list(map(abs, CV))  # convert to positive numbers
        # get the first N minimum values
        mins = nsmallest(geneNum, CV)
        print("The GOOD genes are:\n")
        for item in mins:
            # .ix was removed from pandas; use .iloc for positional access.
            print(self.dataframe.iloc[CV.index(item)][0])
        return
## Axis indexes with duplicate values
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# Check whether the index is unique
obj.index.is_unique
# If an index label maps to multiple values, selecting it returns all of them.
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
# Over columns
df.sum()
# Over rows
df.sum(axis=1)
# NA values are excluded by default; disable that with skipna
df.mean(axis=1, skipna=False)
# Index of the maximum value
df.idxmax()
# Cumulative sum
df.cumsum()
df.describe()
# Correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
# Count occurrences
def run(self, Model='svc', kernel='linear', is_cross_validation=True,
        cross_validationMethod='LOO', DecompositionMethod='PCA',
        decompositionLevel='FeatureType', n_components=30,
        FeatureSelection='TopExplainedVarianceComponents', n_features=10,
        isPerm=0, isBetweenSubjects=True, isConcatTwoLabels=False,
        isSaveCsv=None, isSavePickle=None, isSaveFig=None,
        isSelectSubFeatures=False, SubFeatures='ExpressionLevel'):
    # -- TODO:
    # -- # Greedy selection on features + other feature selection types...
    # -- # Make sure features are selected based on train data only!!!
    # -- # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
    # -- # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
    # -- # Remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
    # -- # Add feature analysis by facial part (see excel)
    # -- # Select best model (svm, otherwise ridge regression)
    # -- # Compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.; model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
    # -- # Check how the model weights behave - feature selection analysis
    # -- # Calc model error
    # -- # Divide data into subparts for training and testing - try within/between subject, and analyze distribution of features when data is divided
    # -- # LOO - also on bool labels (patients vs controls and mental status bool)
    # -- # Add mental status rank scores (0-4)
    # -- # Make sure p-val returns the right value in 'scores'
    # -- # Run it over random data (permutation test)
    # -- # Continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

    ## init
    if isSelectSubFeatures:
        print('Features : ' + SubFeatures)
        f = self.FeaturesDF.copy()
        featureNames = self.FeaturesDF.index.names
        try:
            f = f.loc[SubFeatures]
            f.index = MultiIndex.from_product([[SubFeatures], f.index],
                                              names=featureNames)
        except KeyError:
            f.index = f.index.swaplevel(0, 1)
            f = f.loc[SubFeatures]
            f.index = MultiIndex.from_product([f.index, [SubFeatures]],
                                              names=featureNames)
        self.FeaturesDF = f.copy()
    else:
        SubFeatures = 'allFeatureTypes'
    FeatureTypeList = [j for j in tuple(self.FeaturesDF.index)]
    self.FullResults = DF()

    # set learning params (cross validation method, and model for learning)
    isBoolLabel = self.LabelsObject.isBoolLabel
    isBoolScores = isBoolLabel
    if DecompositionMethod is None and FeatureSelection in (
            'TopExplainedVarianceComponents', 'TopNComponents'):
        print("ERROR - feature selection method cannot be '" + FeatureSelection
              + "' when X is not decomposed")
        FeatureSelection = input("Choose a different feature selection method "
                                 "('RFE','f_regression','dPrime','AllFeatures'): ")
    model, isBoolModel = learningUtils.setModel(Model)
    selectFeatures = learningUtils.setFeatureSelection(FeatureSelection, n_features)
    # presumably min(n_components, n_features) was intended (the original
    # min(n_features, n_features) is a no-op) - a model cannot have more
    # components than features
    n_components = min(n_components, n_features)
    decompositionTitle, decomposeFunction = learningUtils.setDecomposition(
        DecompositionMethod, n_components, decompositionLevel)
    isDecompose = decompositionTitle != 'noDecomposition'

    # save learning params
    self.Learningdetails = {'Model': Model, 'Kernel': kernel,
                            'CrossVal': cross_validationMethod,
                            'FeatureSelection': FeatureSelection,
                            'Decomposition': decompositionTitle,
                            # dict.keys() is not indexable in Python 3
                            'LabelBy': list(self.Details['LabelDetails'].keys())[0],
                            'FeatureMethod': self.Details['FeatureMethod'],
                            'PieceLength': self.Details['PieceLength']}
    print('\n------------Learning Details------------')
    print(DF.from_dict(self.Learningdetails, orient='index'))
    print('\n----' + cross_validationMethod + ' Cross validation Results:----')

    # define global variables over modules (to be used in myUtils)
    globalVars.transformMargins = 0  # lambda x: x
    globalVars.isBoolLabel = isBoolLabel
    globalVars.isBoolModel = isBoolModel
    global trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects
    trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects = \
        labelUtils.initTrainTestLabels_all(self.LabelsObject)
    trainLabels_all2, testLabels_all2, TrueLabels2, isAddDroppedSubjects2 = \
        labelUtils.initTrainTestLabels_all(self.LabelsObject2)
    LabelingList = trainLabels_all.columns  # ['N1']
    self.ResultsDF = DF()
    self.BestFeatures = DF(columns=LabelingList)  # dict of BestFeaturesDF according to Labeling methods
    YpredictedOverAllLabels = pandas.Panel(
        items=range(len(trainLabels_all)), major_axis=LabelingList,
        minor_axis=TrueLabels.index)  # panel: items=cv_ind, major=labels, minor=#TODO

    ## Create train and test sets according to LabelBy, repeat learning each
    ## time on different Labels from LabelingList
    isMultivarLabels = False
    LabelingIndex = enumerate(LabelingList)
    if isMultivarLabels:
        LabelingIndex = enumerate([LabelingList])
    for label_ind, Labeling in LabelingIndex:
        """if isPerm:  # TODO - fix this to work with continuous / bool data
            try:
                trainLabels = self.LabelsObject.permedLabelsDF[Labeling]
            except AttributeError:
                self.LabelsObject.permLabels()
                trainLabels = self.LabelsObject.permedLabelsDF[Labeling]"""
        # set subjects list according to labels and features
        X, SubjectsList, droppedSubjects, Xdropped = featuresUtils.initX(
            self.FeaturesDF, trainLabels_all, Labeling)
        X2, SubjectsList2, droppedSubjects2, Xdropped2 = featuresUtils.initX(
            self.FeaturesDF, trainLabels_all2, Labeling, is2=1)

        # init train and test labels
        trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(
            Labeling, SubjectsList, trainLabels_all, testLabels_all)
        trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(
            Labeling, SubjectsList2, trainLabels_all2, testLabels_all2)

        # make sure only labeled subjects are used for classification
        X = X.query('subject == ' + str(list(trainLabels.index)))
        X.index.get_level_values(X.index.names[0])
        SubjectIndex = list(set(X.index.get_level_values('subject')))
        X2 = X2.query('subject == ' + str(list(trainLabels2.index)))
        X2.index.get_level_values(X2.index.names[0])
        SubjectIndex2 = list(set(X2.index.get_level_values('subject')))

        # init vars
        if isBetweenSubjects:
            cv_param = len(SubjectIndex)
            self.Learningdetails['CrossValSubjects'] = 'between'
            isWithinSubjects = False
        else:
            isWithinSubjects = True
            X = X.swaplevel(0, 1)
            PieceIndex = list(set(X.index.get_level_values('Piece_ind')))
            cv_param = len(PieceIndex)
            self.Learningdetails['CrossValSubjects'] = 'within'
        self.Learningdetails['NumOfFeatures'] = n_features
        try:
            print('\n**' + Labeling + '**')
        except TypeError:
            print('\n*******')
            print(Labeling)
        cv, crossValScores = learningUtils.setCrossValidation(
            cross_validationMethod, cv_param, trainLabels, isWithinSubjects)

        ## Learning - feature selection
        ## for different scoring types, with cross validation
        BestFeaturesForLabel = self.BestFeaturesForLabel(
            FeatureTypeList, LabelingList, n_features)  # saves dataframe with best features for each label, for later analysis
        cv_ind = 0  # used for transforming from margins returned from svm to continuous labels (e.g. PANSS)
        trainScores = DF()
        test_index = X.index
        testScores = concat([DF(index=test_index), DF(index=['std_train_err'])])
        testScores2 = concat([DF(index=testLabels.index), DF(index=['std_train_err'])])
        testProbas = DF(index=X.index)
        testProbas2 = DF(index=SubjectIndex)
        # impt = Imputer(missing_values='NaN', strategy='median', axis=0)
        globalVars.LabelRange = LabelRange
        ModelWeights1 = DF(columns=range(len(cv)), index=X.columns)
        Components = pandas.Panel(items=range(len(cv)), major_axis=X.columns,
                                  minor_axis=range(n_features))  # TODO fix this for 1st and 2nd learning
        ExplainedVar = DF(columns=range(len(cv)))
        ModelWeights2 = DF(columns=range(len(cv)))
        bestNfeaturesPanel = Panel(items=LabelingList, minor_axis=range(len(cv)),
                                   major_axis=range(n_features))
        # bestNfeaturesPanel = Panel(items=LabelingList, major_axis=range(len(cv)), minor_axis=MultiIndex.from_tuples(('a','b')))

        for train, test in cv:
            if not is_cross_validation:
                train = np.append(train, test)
                # test = np.append(train, test)
                self.Learningdetails['CrossVal'] = 'NONE'
            # if cv_ind > 0:
            #     break
            if isBetweenSubjects:
                # set X and Y
                train_subjects = trainLabels.iloc[train].index
                test_subjects = testLabels.iloc[test].index
                Xtrain, Xtest, Ytrain, YtrainTrue, Ytest = \
                    learningUtils.setXYTrainXYTest(
                        X, Labeling, trainLabels, testLabels, TrueLabels,
                        train_subjects, test_subjects)
                Xtrain2, Xtest2, Ytrain2, YtrainTrue2, Ytest2 = \
                    learningUtils.setXYTrainXYTest(
                        X2, Labeling, trainLabels2, testLabels2, TrueLabels2,
                        train_subjects, test_subjects)
                if isConcatTwoLabels:  # used when there is more than one doctor
                    Xtrain = concat([Xtrain, Xtrain2])
                    Xtest = concat([Xtest, Xtest2])
                    Ytrain = concat([Ytrain, Ytrain2])
                    YtrainTrue = concat([YtrainTrue, YtrainTrue2])
                    Ytest = concat([Ytest, Ytest2])
                    Xdropped = concat([Xdropped, Xdropped2])
                    SubjectsList = list(set(SubjectsList).intersection(set(SubjectsList2)))
                    # diff from SubjectsList to make sure no subjects are both
                    # in train and test
                    droppedSubjects = list(set(droppedSubjects)
                                           .union(set(droppedSubjects2))
                                           .difference(set(SubjectsList)))

                # select N best features:
                Xtrain, Xtest, bestNfeatures, components, explainedVar = \
                    learningUtils.decomposeAndSelectBestNfeatures(
                        Xtrain, Xtest, Ytrain, n_features, selectFeatures,
                        decomposeFunction)
                BestFeaturesForLabel.add(bestNfeatures)  # TODO - delete this??
                bestNfeaturesPanel[Labeling][cv_ind] = bestNfeatures
                """for feature_ind, feature_name in enumerate(bestNfeatures):
                    try:
                        bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind] = feature_ind
                    except KeyError:
                        bestNfeaturesPanel[Labeling].columns = bestNfeaturesPanel[Labeling].columns.append(feature_name)  # continue here!!
                use bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind
                [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))"""

                # train 1
                TrainModel = model
                TrainModel.fit(Xtrain.sort_index(), Ytrain.T.sort_index())
                """try:
                    #Components[cv_ind]=components.T
                    #ExplainedVar[cv_ind]=explainedVar
                    isDecompose=True"""
                if cv_ind == 0:
                    ModelWeights1 = DF(columns=range(len(cv)),
                                       index=range(len(bestNfeatures)))
                ModelWeights1[cv_ind] = TrainModel.coef_.flatten()

                # get ROC scores without cross validation:
                # train 2
                if isBoolLabel:
                    PiecePrediction_train = DF(TrainModel.predict_proba(Xtrain).T[1],
                                               index=Xtrain.index, columns=['prediction'])
                    TrainModel2 = svm.SVC(kernel='linear', probability=True,
                                          class_weight={0: 1, 1: 1})
                else:
                    PiecePrediction_train = DF(TrainModel.decision_function(Xtrain),
                                               index=Xtrain.index, columns=['prediction'])
                    TrainModel2 = linear_model.LinearRegression()
                Xtrain2, Ytrain2, YtrainTrue2 = learningUtils.getX2Y2(
                    Xtrain, Ytrain, YtrainTrue, PiecePrediction_train, isBoolLabel)
                TrainModel2.fit(Xtrain2, Ytrain2)
                if cv_ind == 0:
                    ModelWeights2 = DF(columns=range(len(cv)), index=Xtrain2.columns)
                ModelWeights2[cv_ind] = TrainModel2.coef_.flatten()

                # test 1
                if isAddDroppedSubjects:
                    # take test subjects from cv + subjects that were dropped
                    # for the labeling used for test
                    if isDecompose:
                        dXdropped = DF(decomposeFunc(Xdropped).values,
                                       index=Xdropped.index)
                    XtestDropped = dXdropped[bestNfeatures]
                    YtestDropped = Series(XtestDropped.copy().icol(0))
                    # YTrueDropped = Series(Xdropped.copy().icol(0))
                    for subject in droppedSubjects:
                        YtestDropped[subject] = testLabels_all[Labeling].loc[subject]
                        # YTrueAll.loc[subject] = TrueLabels[Labeling].loc[subject]
                    Ytest = concat([Ytest, YtestDropped]).sort_index()
                    Xtest = concat([Xtest, XtestDropped]).sort_index()
                if isPerm:  # TODO - check this!!
                    Ytest = y_perms.loc[Ytest.index]
                Xtest = Xtest.fillna(0.)
            elif isWithinSubjects:
                # train 1
                train_pieces = PieceIndex[train]
                test_pieces = PieceIndex[test]
                # TODO - make sure that if test/train > piece index, it is
                # ignored and the process repeated
                XtrainAllFeatures = X.query('Piece_ind == ' + str(list(train_pieces)))
                Ytrain = Series(index=X.index)
                Ytest = Series(index=X.index)
                YtrainTrue = Series(index=X.index)
                for subject in PieceIndex:
                    for piece in train_pieces:
                        Ytrain.loc[piece].loc[subject] = trainLabels[subject]
                        YtrainTrue.loc[piece].loc[subject] = TrueLabels[Labeling].loc[subject]
                        Ytest.loc[piece].loc[subject] = testLabels[subject]
                Ytrain = Ytrain.dropna()
                YtrainTrue = YtrainTrue.dropna()
                for subject in test_subjects:
                    Ytest.loc[piece].loc[subject] = testLabels[subject]

            # train scores 1
            if cv_ind == 0:
                trainScores, YtrainPredicted = learningUtils.getTrainScores(
                    Ytrain, Xtrain, YtrainTrue, TrainModel)
                plt.figure(1)
                if len(LabelingList) > 1:
                    plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
                if isBoolLabel:
                    testScores, testProbas = learningUtils.getTestScores(
                        Ytest, Xtest, TrainModel)
                else:
                    testScores[cv_ind], testProbas = learningUtils.getTestScores(
                        Ytest, Xtest, TrainModel)
                    plt.title(Labeling, fontsize=10)
            else:
                plt.figure(3)
                new_trainScores, YtrainPredicted = learningUtils.getTrainScores(
                    Ytrain, Xtrain, YtrainTrue, TrainModel)
                trainScores = concat([trainScores, new_trainScores], axis=1)
                # test 1
                testScores[cv_ind], testProbas_new = learningUtils.getTestScores(
                    Ytest, Xtest, TrainModel)
                testProbas = concat([testProbas, testProbas_new])

            # train 2
            if isBoolLabel:
                PiecePrediction_test = DF(TrainModel.predict_proba(Xtest).T[1],
                                          index=Xtest.index, columns=['prediction'])
            else:
                PiecePrediction_test = DF(TrainModel.decision_function(Xtest),
                                          index=Xtest.index, columns=['prediction'])
            Xtest2, Ytest2, YtestTrue2 = learningUtils.getX2Y2(
                Xtest, Ytest, Ytest, PiecePrediction_test, isBoolLabel)
            if cv_ind == 0:
                trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                    Ytrain2, Xtrain2, YtrainTrue2, TrainModel2)
                YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                # plt.figure(1)
                # if len(LabelingList) > 1:
                #     plt.subplot(round(len(LabelingList)/2), 2, label_ind+1)
                # test 2
                if isBoolLabel:
                    testScores2, testProbas2 = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                else:
                    testScores2[cv_ind], testProbas2 = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                # plt.title(Labeling, fontsize=10)
            else:
                new_trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                    Ytrain2, Xtrain2, YtrainTrue2, TrainModel2)
                YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                trainScores2 = concat([trainScores2, new_trainScores2], axis=1)
                if len(Xtest2) > 0:  # if there is more than one segment for subject
                    testScores2[cv_ind], testProbas2_new = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                    testProbas2 = concat([testProbas2, testProbas2_new])
            cv_ind += 1

        # crossValScores = crossValScores.append(CVscoresDF, ignore_index=True)
        # information about entire train-test data
        fig2 = plt.figure(2)
        if len(LabelingList) > 1:
            plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
        # if isAddDroppedSubjects:
        #     testLabelsSummary = testLabels_all[Labeling].loc[AllSubjects]
        # else:
        #     testLabelsSummary = testLabels
        scoresSummary, rocDF = learningUtils.getScoresSummary(
            trainScores2, testScores2, testProbas2, TrueLabels[Labeling])

        # reset global vars
        globalVars.fitYscale = 'notDefined'
        globalVars.beta = DF()
        plt.title(Labeling, fontsize=10)
        plt.xlabel('Ytrue', fontsize=8)
        plt.ylabel('Ypredicted', fontsize=8)
        plt.tick_params(labelsize=6)
        # print(crossValScores.T)
        scores = scoresSummary.fillna(0.)
        # analyze feature weights
        ModelWeights1 = ModelWeights1.dropna(how='all')
        # FeatureAnalysisIndex: 0 for featureType, 1 for AUs (if not
        # decomposed) or component rank (if decomposed)
        WeightedFeatures1_index0 = analysisUtils.getFeaturesWeights(
            0, bestNfeaturesPanel[Labeling], ModelWeights1)
        WeightedFeatures1_index1 = analysisUtils.getFeaturesWeights(
            1, bestNfeaturesPanel[Labeling], ModelWeights1)
        WeightedFeatures1 = concat([DF(index=['-------(A) Index0-------']),
                                    WeightedFeatures1_index0,
                                    DF(index=['-------(B) Index1 -------']),
                                    WeightedFeatures1_index1])
        WeightedFeatures2 = DF(ModelWeights2.mean(axis=1)).fillna(0)
        # WeightedFeatures2 = DF([ModelWeights2.mean(axis=1), ModelWeights2.std(axis=1)], index=['mean', 'std']).T.fillna(0)
        BestFeatures = concat([DF(index=['------------- Learning 1 -------------']),
                               WeightedFeatures1,
                               DF(index=['------------- Learning 2 -------------']),
                               WeightedFeatures2])
        self.BestFeatures[Labeling] = Series(BestFeatures.values.flatten(),
                                             index=BestFeatures.index)

        # analyze decomposition
        if isDecompose:
            Components_mean = Components.mean(axis=0)
            Components_std = Components.std(axis=0)
            normalize = lambda df: DF(StandardScaler().fit_transform(df.T).T,
                                      index=df.index, columns=df.columns)
            """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType'))
            #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType'))
            #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal'))
            #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal'))
            #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T #todo- check!
            #ExplainedVar_mean.index=['ExplainedVar_mean']
            #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T #todo- check!
            #ExplainedVar_std.index=['ExplainedVar_std']
            #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS])
            try:
                self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
            except AttributeError:
                self.LabelComponents=dict.fromkeys(LabelingList)
                self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])"""
            """print(Components_mean)
            print(ExplainedVar_mean)
            print(WeightedFeatures1)"""

        # BestFeaturesForLabel.analyze(ByLevel=0)  # TODO change to regression coeff
        LabelFullResults = concat([DF(index=[Labeling]), scores])
        self.FullResults = concat([self.FullResults, LabelFullResults])
        self.ResultsDF = concat([self.ResultsDF,
                                 DF(scores[0], columns=[Labeling])], axis=1)
        # self.BestFeatures[Labeling] = BestFeaturesForLabel.WeightedMean
        # plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')

    testScores3 = pandas.Panel(items=range(len(X2.index)))  # for each cv score...
    FullSubjectsList = YpredictedOverAllLabels[0].columns
    YdroppNans = YpredictedOverAllLabels.dropna(axis=0, how='all')
    YdroppNans = YdroppNans.dropna(axis=1, how='all')
    YpredictedOverAllLabels = YdroppNans.dropna(axis=2, how='all')
    notNans_cv_ind = YpredictedOverAllLabels.items
    notNans_trainSubjects = YpredictedOverAllLabels.minor_axis
    notNans_LabelsList = YpredictedOverAllLabels.major_axis
    notNans_TrueLabels = TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
    cv_ind = 0
    for train, test in cv:
        if cv_ind in notNans_cv_ind:
            print(test)
            train = list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
            test = list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
            if len(train) > 0 and len(test) > 0:
                AllLabelsYTrainPredicted = YpredictedOverAllLabels[cv_ind][train]
                AllLabelsYTrainPredicted = AllLabelsYTrainPredicted.fillna(0)
                AllLabelsYTrainTrue = notNans_TrueLabels[train]
                AllLabelsYTestPredicted = YpredictedOverAllLabels[cv_ind][test]
                AllLabelsYTestTrue = notNans_TrueLabels[test]
                pseudoInverse_AllLabelsYTrainTrue = DF(
                    np.linalg.pinv(AllLabelsYTrainTrue),
                    columns=AllLabelsYTrainTrue.index,
                    index=AllLabelsYTrainTrue.columns)
                global AllLabelsTransformationMatrix
                AllLabelsTransformationMatrix = DF(
                    AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),
                    columns=pseudoInverse_AllLabelsYTrainTrue.columns)  # change to real code!!
                TrainModel3 = lambda y: y.T.dot(AllLabelsTransformationMatrix)
                # testScores3[cv_ind] = learningUtils.getTestScores(AllLabelsYTrainTrue, AllLabelsYTrainPredicted, TrainModel3)
        cv_ind += 1
    self.BestNFeaturesAll = bestNfeaturesPanel
    self.ResultsDF = self.ResultsDF.fillna(0.)

    ## Print and save results
    print('\n')
    print(self.ResultsDF)
    print('\n')
    D = self.Learningdetails
    savePath = resultsPath + '\\' + D['Model'] + '_' + D['CrossVal'] \
        + '_LabelBy' + D['LabelBy'] + '_FSelection' + FeatureSelection \
        + '_Decompostion' + D['Decomposition'] + 'PieceSize' + D['PieceLength'] \
        + '_' + SubFeatures
    if isPerm:
        savePath = savePath + '_PERMStest'
    saveName = savePath + '\\' + str(n_features) + '_features'
    self.Learningdetails['saveDir'] = savePath
    dir = os.path.dirname(saveName)
    if not os.path.exists(dir):
        os.makedirs(dir)
    # raw_input was renamed to input in Python 3
    if isSavePickle is None:
        isSavePickle = int(input('Save Results to pickle? '))
    if isSaveCsv is None:
        isSaveCsv = int(input('save Results to csv? '))
    if isSaveFig is None:
        isSaveFig = int(input('save Results to figure? '))
    if isSavePickle:
        self.ResultsDF.to_pickle(saveName + '.pickle')
        self.BestFeatures.to_pickle(saveName + '_bestFeatures.pickle')
    if isSaveCsv:
        DetailsDF = DF.from_dict(self.Learningdetails, orient='index')
        ResultsCSV = concat([self.ResultsDF,
                             DF(index=['-------Label Details-------']), self.N,
                             DF(index=['-------Learning Details-------']), DetailsDF,
                             DF(index=['-------Selected Features Analysis------']),
                             self.BestFeatures])
        ResultsCSV.to_csv(saveName + '.csv')
    if isBoolLabel:
        ROCfig = learningUtils.save_plotROC(rocDF, isSave=True,
                                            saveName=saveName, title=SubFeatures)
    if isSaveCsv or isSavePickle:
        print('successfully saved as:\n' + saveName)
    if isSaveFig:
        plt.figure(1)
        plt.savefig(saveName + 'Train.png')
        plt.figure(2)
        plt.savefig(saveName + 'Test.png')
    plt.close()
    plt.close()
sys.stdout = Logger(title)
if t == 2:
    authors = y_options.get(0)[1]
    recipients = y_options.get(1)[1]
    accuracies = []
    results = {}
    clf, author_score = load_classifier(clfs.get(7), X, y_options.get(0))
    for a in np.unique(authors):
        s_targets = list(compress(recipients, authors == a))
        s_data = list(compress(X, authors == a))
        _, score = classify(clfs.get(c),
                            ("Recipients of {}".format(class_labels[a]), s_targets),
                            s_data)
        accuracies.append(np.mean(score))
        results[a] = dict(zip(np.unique(s_targets), np.atleast_1d(score)))
    df = DataFrame(results).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)],
                         class_labels[np.unique(authors)], title)
    df = df.T.fillna(df.mean(axis=1)).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)],
                         class_labels[np.unique(authors)], title + " (filled)")
    print("##################################")
    print("Mean Recipient Score: {}".format(np.mean(accuracies)))
    print("##################################")
    print("FINAL SCORE: {}".format(np.mean(author_score * np.mean(accuracies))))
    print("##################################")
else:
    classify(clfs.get(c), y, X, save=save_results if t == 0 else False)
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()        # column sums
df.sum(axis=1)  # sum row by row
df
(7.10 - 4.5) / 2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()     # accumulation
df.describe()   # multiple summary statistics in one shot

obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()

## Correlation and Covariance
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
BPbin = []
for y in frame2.sbp:
    BPbin.append(int((y - min(frame2.sbp)) / 3))
frame2['bin'] = BPbin
frame2 = frame2[:(len(frame2)) - 2]  # removes trailing incomplete cardiac cycle
print(frame2)

groupedRR = frame2['RR'].groupby(frame2['bin'])
RRarray = groupedRR.mean()
groupedSBP = frame2['sbp'].groupby(frame2['bin'])
SBParray = np.asarray(groupedSBP.mean())
print(SBParray)
bin_weight = groupedSBP.size() / frame2['hb'].max()
frame3 = frame2.mean()

# linear regression: RR vs SBP
slope, intercept, r_value, p_value, std_err = linregress(SBParray, RRarray)
frame3['BRS slope'] = slope
frame3['R^2'] = r_value**2
print(frame3)
# use the fitted slope rather than the hard-coded 0.012020 of the original
bestfit = [(i * slope) + intercept for i in SBParray]

# plots
fig = plt.figure()
# ECG plot
ax1 = fig.add_subplot(2, 1, 1)
# 'Panel' objects are 3D.
wp = Panel({'Item1': DataFrame(randn(4, 3)),
            'Item2': DataFrame(randn(4, 2))})
pprint(wp)
# There are also 'TimeSeries', 'SparseSeries', and 'SparsePanel' objects.
# In newer versions, there is experimental support for higher-dimensional
# panels.

# Stats can also be performed on Pandas objects.
df = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
pprint(df)
# You can choose which axis number to perform the operation along.
pprint(df.mean(0))
pprint(df.mean(1))
# Much more to Pandas, but that's the basic idea.
# For more information, see:
#   http://pandas.pydata.org/pandas-docs/stable/index.html
# Also, definitely have a look at StatsModels:
#   http://statsmodels.sourceforge.net/
#   http://statsmodels.sourceforge.net/stable/
# <demo> --- stop ---
# Returns all repos data from a given user
def getRepos(user):
    myrepos = requests.get(
        "https://api.github.com/users/" + user + "/repos",
        headers={'Authorization': 'token 5218551eb082bffa572318de0c2de10d255170b1'}
    ).json()
    return myrepos

# Getting number of stars
data = DataFrame()
i = 0
for user in topGitUsers:
    userRepos = getRepos(user)
    i += 1
    print(i)  # check progress
    if len(userRepos) > 0:
        stars = []
        listUserStars = [('', 0)]
        for repo in userRepos:
            stars.append(repo['stargazers_count'])
        userStars = DataFrame(stars)
        userMeanSt = userStars.mean(axis=0)
        listUserStars.append((user, userMeanSt))
        result = DataFrame({'userId': user, 'Mean of stars': userMeanSt})
        # DataFrame.append was removed from pandas; use pd.concat instead.
        data = pd.concat([data, result])
    else:
        print(user + ': No repos found for this user')
data.to_csv('gitTopUsersMean.csv')
def main():
    """
    Calculation and aggregation of summary statistics
    """
    # Summary of statistics (the return value is not an ndarray)
    df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'), columns=['one', 'two'])
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # excludes NaN
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())

    # values that are not numbers
    obj = Series(list('aabc') * 4)
    print(obj.describe())

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt',
               'cummin', 'cummax', 'cumprod', 'diff', 'pct_change']
    for method in methods:
        print(u'「{0}」'.format(method))
        print(getattr(df, method)())
        print('')

    # Correlation and covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT']  # , 'GOOG']
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200 for url
        # 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print('')
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print('')
        print(returns.corr())
        print(returns.cov())
        print('')
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique values, frequencies, membership
    print('', '')
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    print(pd.value_counts(obj.values, sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])
    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print(data)
    print(data.apply(pd.value_counts).fillna(0))
if False:
    kw = dict(method='time')
    df = df.reindex(index).interpolate(**kw).ix[index]
dfs.update({model: df})

dfs = Panel.fromDict(dfs).swapaxes(0, 2)

# In[ ]:

from pandas import DataFrame

means = dict()
for station, df in dfs.iteritems():
    df.dropna(axis=1, how='all', inplace=True)
    mean = df.mean()
    df = df - mean + mean['OBS_DATA']
    means.update({station: mean['OBS_DATA'] - mean.drop('OBS_DATA')})

bias = DataFrame.from_dict(means).dropna(axis=1, how='all')
bias = bias.applymap('{:.2f}'.format).replace('nan', '--')

columns = dict()
[columns.update({station: get_coops_longname(station)})
 for station in bias.columns.values]
bias.rename(columns=columns, inplace=True)
to_html(bias.T, 'style.css')
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df = DataFrame(abs(np.random.randn(30).reshape(6, 5)) * 100)
plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center', color='white', linewidth=1.5)
# plt.hold was removed from matplotlib; repeated plotting calls now draw on
# the same axes by default. fmt=None is also no longer accepted - use 'none'.
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2, capsize=7.5, fmt='none')
plt.show()