Example No. 1
def sd_ratio(df1: pd.DataFrame,
             df2: pd.DataFrame,
             robust: bool = False,
             fill_value: Optional[float] = None) -> pd.Series:
    """
    Computes the ratio between the standard deviation of the columns of
    DataFrame1 and DataFrame2.

    Used to compute the D-Ratio metric. NaN values are replaced with ``fill_value`` when it is provided.

    Parameters
    ----------
    df1 : DataFrame with shape (n1, m)
    df2 : DataFrame with shape (n2, m)
    robust : bool
        If True uses the MAD as an estimator of the standard deviation. Else
        computes the sample standard deviation.
    fill_value : Number used to impute NaN values.

    Returns
    -------
    ratio : pd.Series

    """
    if robust:
        ratio = mad(df1) / mad(df2)
    else:
        ratio = df1.std() / df2.std()

    if fill_value is not None:
        ratio = ratio.fillna(fill_value)
    return ratio
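A minimal usage sketch (hedged): it assumes the sd_ratio function above is in scope together with its pandas/numpy imports, and since the mad helper it calls is not shown here, a plausible stand-in based on scipy.stats.median_abs_deviation is defined; the toy DataFrames are invented.

import numpy as np
import pandas as pd
from scipy.stats import median_abs_deviation

def mad(df: pd.DataFrame) -> pd.Series:
    # column-wise MAD, scaled to be comparable to the standard deviation
    return pd.Series(median_abs_deviation(df, axis=0, scale="normal"),
                     index=df.columns)

rng = np.random.default_rng(0)
qc_samples = pd.DataFrame(rng.normal(0.0, 1.0, size=(20, 3)), columns=list("ABC"))
test_samples = pd.DataFrame(rng.normal(0.0, 2.0, size=(20, 3)), columns=list("ABC"))

print(sd_ratio(qc_samples, test_samples))               # ratio of sample stds, roughly 0.5
print(sd_ratio(qc_samples, test_samples, robust=True))  # MAD-based ratio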
Example No. 2
def attributes_sanity_check(df: pd.DataFrame):
    """Utility function to check if the standard deviation of one (or more) attributes is zero.
    
    This utility function can be used to check if any attribute has a standard deviation of zero. This would lead to 
    NaN's, when normalizing the features and thus would lead to NaN's when training the model. The function will raise
    a `RuntimeError` if one or more zeros have been detected and will print the list of corresponding attribute names
    to the console.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame of catchment attributes as columns.

    Raises
    ------
    RuntimeError
        If one or more attributes have a standard deviation of zero.
    """
    # Collect attributes whose standard deviation is zero or NaN
    attributes = []
    if any(df.std() == 0.0) or any(df.std().isnull()):
        for k, v in df.std().iteritems():
            if (v == 0) or (np.isnan(v)):
                attributes.append(k)
    if attributes:
        msg = [
            "The following attributes have a std of zero or NaN, which results in NaN's ",
            "when normalizing the features. Remove the attributes from the attribute feature list ",
            "and restart the run. \n", f"Attributes: {attributes}"
        ]
        raise RuntimeError("".join(msg))
Example No. 3
def scale(df: pd.DataFrame, method: str) -> pd.DataFrame:
    """
    Scales features using different methods.

    Parameters
    ----------
    df: pandas.DataFrame
    method: {"autoscaling", "rescaling", "pareto"}
        Scaling method. `autoscaling` performs mean centering scaling of
        features to unitary variance. `rescaling` scales data to a 0-1 range.
        `pareto` performs mean centering and scaling using the square root of
        the standard deviation

    Returns
    -------
    scaled: pandas.DataFrame
    """
    if method == "autoscaling":
        scaled = (df - df.mean()) / df.std()
    elif method == "rescaling":
        scaled = (df - df.min()) / (df.max() - df.min())
    elif method == "pareto":
        scaled = (df - df.mean()) / df.std().apply(np.sqrt)
    else:
        msg = "Available methods are `autoscaling`, `rescaling` and `pareto`."
        raise ValueError(msg)
    # replace nans generated when dividing by zero
    scaled[scaled.isna()] = 0
    return scaled
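A small usage sketch of the three methods on a toy DataFrame (assumes the scale function above and its numpy/pandas imports are in scope):

import pandas as pd

toy = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0],
                    "y": [10.0, 10.0, 10.0, 10.0]})  # zero-variance column

print(scale(toy, "autoscaling"))  # mean 0, unit variance; the constant column becomes 0
print(scale(toy, "rescaling"))    # values mapped into the 0-1 range
print(scale(toy, "pareto"))       # mean centered, divided by sqrt(std)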
Example No. 4
def stats_nodules_deca(values_dic):
    """ Function that calculates the extremes, mean and std of several metrics
    given by the values_dic

    Parameters:
        values_dic (dictionary):
            volume - list with the volumes of the nodules
            volume% - list with the percentage volumes of the nodules
            larg_diam - list with the largest diameters of the nodules
            'Feret diam' - list with the Feret diameters of the nodules


    Returns:
        stats_dic_out (dictionary):
            'volume' : mean and std of the volume
            'volume extremes' : largest and smallest value of the volume
            'volume%' : mean and std of the volume percentage
            'larg_diam' : mean and std of the largest diameters
            'larg_diam_extremes' : largest and smallest value of the largest diameters
            'Feret diam' : mean and std of the Feret diameters
            'Feret diam extremes' : largest and smallest value of the Feret diameters

    """

    # ------- Calculate stats
    stats_dic_out={'volume':[0,0],# mean first, std second
                   'volume extremes':[0,0], #largest and smallest
                   'volume%':[0,0],  # mean first, std second
                   'larg_diam':[0,0],# mean first, std second
                   'larg_diam_extremes':[0,0], #largest and smallest
                   'Feret diam':[0,0], # mean first, std second
                   'Feret diam extremes':[0,0]} #largest and smallest

    stats_dic_out['volume'][0] = pd.Series(values_dic['volume']).mean()
    stats_dic_out['volume'][1] = pd.Series(values_dic['volume']).std()
    stats_dic_out['volume extremes'][0] = max(values_dic['volume'])
    stats_dic_out['volume extremes'][1] = min(values_dic['volume'])

    stats_dic_out['volume%'][0] = pd.Series(values_dic['volume%']).mean()
    stats_dic_out['volume%'][1] = pd.Series(values_dic['volume%']).std()

    stats_dic_out['larg_diam'][0] = pd.Series(values_dic['larg_diam']).mean()
    stats_dic_out['larg_diam'][1] = pd.Series(values_dic['larg_diam']).std()
    stats_dic_out['larg_diam_extremes'][0] = max(values_dic['larg_diam'])
    stats_dic_out['larg_diam_extremes'][1] = min(values_dic['larg_diam'])

    stats_dic_out['Feret diam'][0] = pd.Series(values_dic['Feret diam']).mean()
    stats_dic_out['Feret diam'][1] = pd.Series(values_dic['Feret diam']).std()
    stats_dic_out['Feret diam extremes'][0] = max(values_dic['Feret diam'])
    stats_dic_out['Feret diam extremes'][1] = min(values_dic['Feret diam'])

    return stats_dic_out
Example No. 5
def standardize_features(train: pd.DataFrame,
                         test: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Standardize all features except the label to mean 0 and variance 1.
    The test set is transformed using the mean and std of the training set.

    :param train: feature array of training set from where the std and the mean will be estimated
    :param test: feature array of the test set
    :return: tuple of the stand. train and test arrays
    """
    train_std = (train - train.mean()) / train.std()
    test_std = (test - train.mean()) / train.std()
    return train_std, test_std
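A hedged illustration of the train/test behaviour on invented data (assumes standardize_features above is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
train = pd.DataFrame(rng.normal(5.0, 2.0, size=(100, 2)), columns=["f1", "f2"])
test = pd.DataFrame(rng.normal(5.0, 2.0, size=(20, 2)), columns=["f1", "f2"])

train_std, test_std = standardize_features(train, test)
print(train_std.mean())  # ~0 by construction
print(train_std.std())   # ~1 by construction
print(test_std.mean())   # close to, but not exactly, 0: the train statistics are reused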
Example No. 6
def compare(df1: pd.DataFrame, df2: pd.DataFrame):
    """Compare the values of two DataFrames with the same columns in several ways and return the results as a DataFrame."""
    assert (df1.columns == df2.columns).all()

    std = (df1.std() + df2.std()) / 2
    df_result = pd.DataFrame(index=df1.columns)
    df_result["mean_ae/std"] = np.abs(df1.mean() - df2.mean()) / std
    df_result["median_ae/std"] = np.abs(df1.median() - df2.median()) / std
    df_result["mode1"] = df1.mode().transpose()[0]
    df_result["mode2"] = df2.mode().transpose()[0]

    df_result = df_result.sort_values("median_ae/std", ascending=False)
    return df_result
Example No. 7
def _create_summary(data: pd.DataFrame, original):
    summary = pd.DataFrame(0,
                           index=data.columns,
                           columns=[
                               "original", "mean", "std.error", "perc.025",
                               "perc.975", "t stat."
                           ])
    summary.loc[:, "mean"] = data.mean(axis=0)
    summary.loc[:, "std.error"] = data.std(axis=0)
    summary.loc[:, "perc.025"] = data.quantile(0.025, axis=0)
    summary.loc[:, "perc.975"] = data.quantile(0.975, axis=0)
    summary.loc[:, "original"] = original
    summary.loc[:, "t stat."] = original / data.std(axis=0)
    return summary
Example No. 8
    def test_std_timedelta64_skipna_false(self):
        # GH#37392
        tdi = pd.timedelta_range("1 Day", periods=10)
        df = DataFrame({"A": tdi, "B": tdi})
        df.iloc[-2, -1] = pd.NaT

        result = df.std(skipna=False)
        expected = Series([df["A"].std(), pd.NaT],
                          index=["A", "B"],
                          dtype="timedelta64[ns]")
        tm.assert_series_equal(result, expected)

        result = df.std(axis=1, skipna=False)
        expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
        tm.assert_series_equal(result, expected)
Example No. 9
def standardize(
        data: pd.DataFrame) -> Tuple[pd.DataFrame, Tuple[pd.Series, pd.Series]]:
    """
    Applies standardization to input data. Result should have mean zero and standard
    deviation of one.

    Args
    ----
      data: pd.DataFrame

    Returns
    -------
      Tuple[pd.DataFrame, Tuple[pd.Series, pd.Series]]
        data: pd.DataFrame
            standardized data with zero mean and std of one.
        Tuple[pd.Series, pd.Series]
          per-column mean and standard deviation used for the standardization.
          These values can be used to recover the original dataframe.

    Raises
    ------
      ValueError: if data has only one row.
    """
    if data.shape[0] == 1:
        raise ValueError('Input data must have more than one value')
    mu = data.mean(skipna=True)
    std = data.std(skipna=True, ddof=0)
    data = (data - mu) / std.fillna(1)
    return data, (mu, std)
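The returned mean and std can be used to undo the transformation; a minimal round-trip sketch on toy data (assumes standardize above is in scope):

import pandas as pd

raw = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scaled, (mu, sigma) = standardize(raw)

restored = scaled * sigma.fillna(1) + mu  # mirrors the forward pass, which divides by std.fillna(1)
print((restored - raw).abs().max())       # ~0 for both columns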
Example No. 10
def analyze(df: pd.DataFrame):
    """Perform a rough analysis of the contents and return the results as a DataFrame."""
    if isinstance(df, pd.DataFrame):
        df_result = pd.DataFrame(index=df.columns)
        df_result["dtype"] = df.dtypes
        df_result["null"] = df.isnull().sum()
        df_result["nunique"] = df.nunique()
        df_result["min"] = df.min()
        df_result["median"] = df.median()
        df_result["max"] = df.max()
        df_result["mode"] = df.mode().transpose()[0]
        df_result["mean"] = df.mean()
        df_result["std"] = df.std()
        # # Look at the absolute values of the RobustScaler-transformed data to gauge how extreme the outliers are.
        # numeric_columns = df.select_dtypes(include=np.number).columns
        # df_result["outlier_size"] = np.nan
        # df_result.loc[numeric_columns, "outlier_size"] = (
        #     tk.preprocessing.SafeRobustScaler(clip_range=None)
        #     .fit_transform(df.loc[:, numeric_columns])
        #     .fillna(0)
        #     .abs()
        #     .max()
        #     .round(decimals=1)
        # )
        return df_result
    else:
        raise NotImplementedError()
Example No. 11
def kurtosis(str,list):

    s=list
    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)

    h=len(w)


    print h
    t = frame.mean()

    d = frame.std()

    e = ((w - t) /d) ** 4

    g=e.sum()


    p1=h*(h+1)
    p2=float((h-1)*(h-2)*(h-3))
    p3=float(3*((h-1)**2))
    p4=(h-2)*(h-3)

    i=(((p1/p2)*g)-(p3/p4))

    print 'kurtosis=',i
Example No. 12
        def construct_portfolio(self):
            """

            :return:
            """
            pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
            pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
            tradedays_data = w.tdays(pre_date, self.date, "Period=M")
            tradedayslist = tradedays_data[0]
            tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
            # extract the factor data
            style_return = DataFrame()
            for f in self.factors:
                f_data = []
                for dt in tradedays:
                    stockcodes = StockPool(dt).select_stock()
                    f_data = f(dt, stockcodes).getdata()
                    f_ret = FactorProcess.get_alpha(stockcodes, dt,
                                                    -1)  # take one month of alpha
                    df = DataFrame(data=[f_data, f_ret],
                                   columns=[f.windLabel, 'ret'])
                    long_only, long_short = FactorStyle.compute_style_return_month(
                        df, f.windLabel)
                    f_data.append(long_only)
                style_return[f.windLabel] = f_data
            style_sigma = style_return.std()
            weight = style_sigma / self.target
            weight[weight > 1] = 1
            weight = weight / len(self.factors)

            return weight
Example No. 13
def summarize_he( analytical_sets ):

    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame( he )
    labels = list(he_df.columns)
    if len(labels) == 2:
        # use Mann-Whitney / Wilcoxon test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon( he_df[labels[0]], he_df[labels[1]])

    elif len(labels) > 2:
        # use Kruskal Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal( * [he_df[x] for x in labels])
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()
    #raise RuntimeError

    return results
Example No. 14
def kurtosis(str, list):

    s = list
    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)

    h = len(w)

    print h
    t = frame.mean()

    d = frame.std()

    e = ((w - t) / d)**4

    g = e.sum()

    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1)**2))
    p4 = (h - 2) * (h - 3)

    i = (((p1 / p2) * g) - (p3 / p4))

    print 'kurtosis=', i
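The formula above is the standard bias-corrected sample excess kurtosis, which pandas also implements, so a quick sanity check on toy data (column names invented) can compare the two:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
frame = pd.DataFrame(rng.normal(size=(500, 2)), columns=["a", "b"])

h = len(frame)
g = (((frame - frame.mean()) / frame.std()) ** 4).sum()
manual = (h * (h + 1)) / float((h - 1) * (h - 2) * (h - 3)) * g \
         - 3.0 * (h - 1) ** 2 / ((h - 2) * (h - 3))

print(manual)            # hand-rolled estimator, same formula as the function above
print(frame.kurtosis())  # pandas' built-in excess kurtosis, should agree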
Example No. 15
def perform_pca(returns_df: pd.DataFrame, n):
    '''
    Uses the sklearn library to perform principal component analysis on the stocks.
    '''
    stocks = returns_df.columns
    stdev_returns = returns_df.std(ddof=1, axis=0)
    # Standardise the data
    scaled = StandardScaler().fit_transform(returns_df)
    # Conduct principal component analysis and project onto the principal components
    pca = PCA(n_components=len(returns_df.columns))
    print(scaled.shape)
    transformed = pca.fit_transform(scaled)

    eigenvalues = pca.explained_variance_
    pc_df = pd.DataFrame(pca.components_,
                         columns=[
                             'PC{}'.format(i)
                             for i in range(1,
                                            len(returns_df.columns) + 1)
                         ],
                         index=returns_df.columns)
    # Divide rows by STDEV of each coin return to get eigen portfolio weights
    eigenportfolios = pc_df.div(stdev_returns, axis=0)
    # The columns of 'eigenportfolios' are the relevant eigenportfolios.
    # Let's add the returns of the first n eigenportfolios to the returns dataframe.
    for i in range(1, n + 1):
        pc = 'PC{}'.format(i)
        returns_df[pc] = np.sum(returns_df[stocks].multiply(
            eigenportfolios[pc].to_list()),
                                axis=1) / np.sqrt(eigenvalues[i - 1])
    return returns_df
Example No. 16
def skewness(str,list):
    s= list



    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)

    h=len(w)

    t = frame.mean()



    d = frame.std()

    e = ((w - t) /d) ** 3

    g=e.sum()

    i=(h*g)/((h-1)*(h-2))


    print 'skewness=',i
Example No. 17
def vars_gaussian(rets: pd.DataFrame, modified: bool = False) -> dict:
    """Returns a dictionary with the 4 parametric VaRs (95%, 97%,
    99% and 99.9%) for the returns dataframe 'rets'. If modified=True,
    the skewness and kurtosis of 'rets' are taken into account and the
    Cornish-Fisher correction is applied.

    Args:
        rets (pd.DataFrame): dataframe of returns.

    Returns:
        dict: {95: ..., 97: ...,
        99: ..., 99.9: ...}
    """
    lvls = (95, 97, 99, 99.9)

    # z-scores
    zs = [norm.ppf(1 - lvl / 100) for lvl in lvls]

    if modified:
        s, k = skew(rets), kurtosis(rets)
        zs = [cornish_fisher_z(z, s, k) for z in zs]

    vol = rets.std()
    var = {v[0]: (rets.mean() + v[1] * vol)[0] for v in zip(lvls, zs)}
    return var
Example No. 18
        def construct_portfolio(self):
            """

            :return:
            """
            pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
            pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
            tradedays_data = w.tdays(pre_date, self.date, "Period=M")
            tradedayslist = tradedays_data[0]
            tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
            # extract the factor data
            style_return = DataFrame()
            for f in self.factors:
                f_data = []
                for dt in tradedays:
                    stockcodes = StockPool(dt).select_stock()
                    f_data = f(dt, stockcodes).getdata()
                    f_ret = FactorProcess.get_alpha(stockcodes, dt,
                                                    -1)  # take one month of alpha
                    df = DataFrame(data=[f_data, f_ret],
                                   columns=[f.windLabel, 'ret'])
                    long_only, long_short = FactorStyle.compute_style_return_month(
                        df, f.windLabel)
                    f_data.append(long_only)
                style_return[f.windLabel] = f_data
            S = matrix(style_return.cov().values)
            pbar = matrix(np.zeros_like(style_return.std().values))
            n = len(self.factors)
            G = matrix(0.0, (n, n))
            G[::n + 1] = -1.0
            h = matrix(0.0, (n, 1))
            A = matrix(1.0, (1, n))
            b = matrix(1.0)
            portfolio_weight = qp(S, -pbar, G, h, A, b)['x']
Example No. 19
def bse(data: pd.DataFrame,
        weight_name: Optional[str]=None,
        ignore: List[str]=None) -> pd.DataFrame:
    """
    Calculate the Block Standard Error (BSE).

    Parameters
    ----------
    data : Dataframe with CV data over time and weights.
    weight_name : Name of the weight column.
    ignore : List of column names to ignore.

    Returns
    -------
    bse : Dataframe containing BSEs over all iterations.

    References
    ----------
    Flyvbjerg, H., Petersen, H. G. Error estimates on averages of correlated
    data. The Journal of Chemical Physics, 91(1), 461 (1989)

    """
    if ignore is None:
        ignore = []
    if 'time' not in ignore:
        ignore.append('time')

    # Prepare input, first element
    if weight_name is not None:
        weights = data[weight_name].values
        ignore.append(weight_name)

    length = data.shape[0]
    width = data.shape[1]
    index = data.T.index
    data = data.values
    blist = [data.std(axis=0) / np.sqrt(length)]
    length //= 2

    # Iteratively increase block size
    while length > 2:
        halved = np.empty((length, width))

        # Each iteration, we halve the dataset
        for i in range(0, length):
            if weight_name is not None:
                halved[i] = (1 / (weights[2 * i - 1] + weights[2 * i]) *
                             (data[2 * i - 1] * weights[2 * i - 1] +
                              data[2 * i] * weights[2 * i]))
            else:
                halved[i] = 0.5 * (data[2 * i - 1] + data[2 * i])

        # Calculate the BSE
        bse = halved.std(axis=0) / np.sqrt(length)
        blist.append(bse)
        length //= 2

    # Reconstruct Dataframe
    return pd.DataFrame(np.asarray(blist), columns=index).drop(ignore, axis=1)
Example No. 20
def calc_bse(data: pd.DataFrame,
             weight_name: Optional[str] = None,
             ignore: List[str] = None) -> pd.DataFrame:
    '''
    Calculate the Block Standard Error (BSE).

    Parameters
    ----------
    data : Dataframe with CV data over time and weights.
    weight_name : Name of the weight column.
    ignore : List of column names to ignore.

    Returns
    -------
    bse : Dataframe containing BSEs over all iterations.

    References
    ----------
    Flyvbjerg, H., Petersen, H. G. Error estimates on averages of correlated
    data. The Journal of Chemical Physics, 91(1), 461 (1989)

    '''
    if ignore is None:
        ignore = []
    if 'time' not in ignore:
        ignore.append('time')

    # Prepare input, first element
    if weight_name is not None:
        weights = data[weight_name].values
        ignore.append(weight_name)

    length = data.shape[0]
    width = data.shape[1]
    index = data.T.index
    data = data.values
    blist = [data.std(axis=0) / np.sqrt(length)]
    length = length // 2

    # Iteratively increase block size
    while length > 2:
        halved = np.empty((length, width))

        # Each iteration, we halve the dataset
        for i in range(0, length):
            if weight_name is not None:
                halved[i] = (1 / (weights[2 * i - 1] + weights[2 * i]) *
                             (data[2 * i - 1] * weights[2 * i - 1] +
                              data[2 * i] * weights[2 * i]))
            else:
                halved[i] = 0.5 * (data[2 * i - 1] + data[2 * i])

        # Calculate the BSE
        bse = halved.std(axis=0) / np.sqrt(length)
        blist.append(bse)
        length = length // 2

    # Reconstruct Dataframe
    return pd.DataFrame(np.asarray(blist), columns=index).drop(ignore, axis=1)
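A hedged usage sketch for the block-standard-error routine above on synthetic correlated data (the AR(1) construction and column names are only for illustration; assumes calc_bse and its imports are in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
n = 1024
noise = rng.normal(size=n)
signal = np.empty(n)
signal[0] = noise[0]
for i in range(1, n):  # simple AR(1) series, i.e. correlated samples
    signal[i] = 0.9 * signal[i - 1] + noise[i]

cv_data = pd.DataFrame({"time": np.arange(n, dtype=float), "cv1": signal})
print(calc_bse(cv_data))  # one BSE estimate per halving step, with the 'time' column dropped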
Example No. 21
    def structural_adj(self,
                       cov: pd.DataFrame,
                       spec_ret: pd.DataFrame,
                       fact_exp: pd.DataFrame,
                       liq_mv: pd.DataFrame,
                       liq_mv_name: PVN.LIQ_MV.value,
                       time_window: int = 120):
        """

        :param cov: Newey-West adjusted covariance matrix of stock-specific returns
        :param spec_ret: series of stock-specific returns
        :param fact_exp: factor exposures
        :param liq_mv: free-float market value
        :param liq_mv_name: name of the free-float market value column
        :param time_window: time window of the stock-specific returns (may later become the length of the specific-return series)
        :return:
        """
        # compute the blending parameters
        h_n = spec_ret.count()  # number of non-null observations
        V_n = (h_n - 20 / 4) / 20 * 2  # degree of data missingness (using 20 for testing)

        sigma_n = spec_ret.std().fillna(1)  # equal-weighted sample std (set to 1 when it cannot be computed)  TODO

        sigma_n_steady = (spec_ret.quantile(.75) -
                          spec_ret.quantile(0.25)) / 1.35  # robust estimate of the sample std

        Z_n = abs((sigma_n - sigma_n_steady) / sigma_n_steady)  # degree of fat tails in the data

        # replace infinite values with 0
        Z_n[np.isinf(Z_n)] = 0
        Z_n.fillna(0, inplace=True)

        left_, right_ = V_n.where(V_n > 0, 0), np.exp(1 - Z_n)

        left_, right_ = left_.where(left_ < 1, 1), right_.where(right_ < 1, 1)
        gam_n = left_ * right_  # per-stock blending parameter in [0, 1]

        reg_data = pd.concat([np.log(sigma_n), liq_mv, gam_n, fact_exp],
                             axis=1)
        reg_data.columns = ['sigma', liq_mv_name, 'gam_n'
                            ] + fact_exp.columns.tolist()

        ref_data_com = reg_data[reg_data['gam_n'] == 1]

        # weighted least squares (free-float market value weights) on high-quality stocks to estimate the factors' contribution to specific volatility
        model = sm.WLS(ref_data_com['sigma'],
                       ref_data_com[fact_exp.columns],
                       weights=ref_data_com['gam_n']).fit()

        # structural forecast of the per-stock specific volatility
        sigma_STR = pd.DataFrame(np.diag(
            np.exp(np.dot(fact_exp, model.params)) * 1.05),
                                 index=fact_exp.index,
                                 columns=fact_exp.index)

        # apply the structural adjustment to the specific return matrix
        F_STR = sigma_STR.mul((1 - gam_n), axis=0) + cov.mul(gam_n, axis=0)

        return F_STR
Example No. 22
def normalize_factor(factor: pd.DataFrame,
                     mean=None,
                     std=None) -> pd.DataFrame:
    if mean is None:
        mean = factor.mean()
    if std is None:
        std = factor.std()
    return (factor - mean) / std
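A minimal sketch of the intended pattern, fitting the mean/std on one sample and reusing them on another (factor names invented; assumes normalize_factor above is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
train_factor = pd.DataFrame(rng.normal(size=(250, 2)), columns=["mom", "value"])
live_factor = pd.DataFrame(rng.normal(size=(10, 2)), columns=["mom", "value"])

mu, sigma = train_factor.mean(), train_factor.std()
train_norm = normalize_factor(train_factor)                    # uses its own mean/std
live_norm = normalize_factor(live_factor, mean=mu, std=sigma)  # reuses the train statistics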
Example No. 23
def attributes_sanity_check(df: pd.DataFrame):
    """Utility function to check the suitability of the attributes for model training.
    
    This utility function can be used to check if any attribute has a standard deviation of zero. This would lead to 
    NaN's when normalizing the features and thus would lead to NaN's when training the model. It also checks if any
    attribute for any basin contains a NaN, which would also cause NaNs during model training.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame of catchment attributes as columns.

    Raises
    ------
    RuntimeError
        If one or more attributes have a standard deviation of zero or any attribute for any basin is NaN.
    """
    # Check for zero or NaN standard deviations of the attributes.
    attributes = []
    if any(df.std() == 0.0) or any(df.std().isnull()):
        for k, v in df.std().iteritems():
            if (v == 0) or (np.isnan(v)):
                attributes.append(k)
    if attributes:
        msg = [
            "The following attributes have a std of zero or NaN, which results in NaN's ",
            "when normalizing the features. Remove the attributes from the attribute feature list ",
            "and restart the run. \n", f"Attributes: {attributes}"
        ]
        raise RuntimeError("".join(msg))

    # Check for NaNs in any attribute of any basin
    nan_df = df[df.isnull().any(axis=1)]
    if len(nan_df) > 0:
        failure_cases = defaultdict(list)
        for basin, row in nan_df.iterrows():
            for feature, value in row.iteritems():
                if np.isnan(value):
                    failure_cases[basin].append(feature)
        # create verbose error message
        msg = [
            "The following basins/attributes are NaN, which can't be used as input:"
        ]
        for basin, features in failure_cases.items():
            msg.append(f"{basin}: {features}")
        raise RuntimeError("\n".join(msg))
Example No. 24
File: views.py Project: Mihkorz/AMD
 def form_valid(self, form):
     document = form.save(commit=False)
     project = form.cleaned_data['project']
     document.save()
     filename = settings.MEDIA_ROOT+"/"+document.document.name
     sniffer = csv.Sniffer()
     dialect = sniffer.sniff(open(filename, 'r').read(), delimiters='\t,;') # defining the separator of the csv file
     df = read_csv(filename, delimiter=dialect.delimiter)
     tumour_cols = [col for col in df.columns if 'Tumour' in col]
     norm_cols = [col for col in df.columns if 'Norm' in col]
     document.sample_num = len(tumour_cols)
     document.norm_num = len(norm_cols)
     document.row_num = len(df)
     document.save()
     
     """ Use PANDAS to preprocess input file(calculate Mean_norm CNR and STD) and save to process folder 
         Create ProcessDocument instance to store the file in database"""
         
     path = os.path.join('users', str(document.project.owner),
                                         str(document.project),'process', 'process_'+str(document.get_filename()))
     if not os.path.exists(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner),
                                         str(document.project),'process')):
         os.mkdir(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner),
                                         str(document.project),'process'))
     
     process_doc = ProcessDocument()
     process_doc.document = path
     process_doc.input_doc = document
     process_doc.created_by = self.request.user
     process_doc.save()
     
     new_file = settings.MEDIA_ROOT+"/"+path
             
     df = df.set_index('SYMBOL') #create index by SYMBOL column
       
     df = df.groupby(df.index, level=0).mean() #deal with duplicate genes by taking mean value
     
     mean_norm = df[[norm for norm in norm_cols]].mean(axis=1)
     from scipy.stats.mstats import gmean
     gmean_norm = df[[norm for norm in norm_cols]].apply(gmean, axis=1)
     
     df1 = DataFrame(df[[norm for norm in norm_cols]], index=df.index)
     
     df1 = df1.std(axis=1)
             
     df['Mean_norm'] = mean_norm
            
     df = df.div(df.Mean_norm, axis='index')
    
     df['Mean_norm'] = mean_norm
     df['gMean_norm'] = gmean_norm
     df['std'] = df1
             
     
     df.to_csv(new_file, sep='\t')
 
      
     return HttpResponseRedirect(self.success_url+project.name)
Example No. 25
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If none, a new figure is made.
    :param fname: If not none, saves the plot with this file name. 
    """
    if ax is None:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n',
                     fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()
    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std, y=fake_std, ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if fname is not None:
        plt.savefig(fname)

    if ax is None:
        plt.show()
Example No. 26
    def annualized_volatility(self, df: pd.DataFrame) -> float:
        """
        Calculates annualized volatility for a date-indexed pandas data frame
        It works for any interval of time and whether it is prices or returns.
        """
        years_past = self.get_years_past()
        entries_per_year = df.shape[0] / years_past

        return df.std() * np.sqrt(entries_per_year)
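For daily data the scaling reduces to the familiar sqrt(252) rule; a standalone sketch of the same computation without the class plumbing (the 252 trading-day assumption and the toy returns are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
daily_returns = pd.Series(rng.normal(0.0005, 0.01, size=756))  # ~3 years of daily returns

years_past = len(daily_returns) / 252.0
entries_per_year = len(daily_returns) / years_past             # == 252
annualized_vol = daily_returns.std() * np.sqrt(entries_per_year)
print(annualized_vol)                                          # roughly 0.01 * sqrt(252)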
Example No. 27
def _plot_stats_attribute(stats_list: Sequence[Stats], attribute: str, label, ax=None):
    """Plot a certain attribute of a collection of histories."""
    data = np.asarray([getattr(h, attribute) for h in stats_list])
    df = DataFrame(data.T)

    df_mean = df.mean(axis=1)
    df_std = df.std(axis=1)
    sns_ax = sns.lineplot(df_mean.index, df_mean, label=label, ax=ax)
    sns_ax.fill_between(df_mean.index, df_mean - df_std, df_mean + df_std, alpha=0.3)
Example No. 28
def summary_statistics(data_set: pd.DataFrame) -> pd.DataFrame:
    summary_data = dict()

    summary_data['mean'] = data_set.mean(numeric_only=True)
    summary_data['std'] = data_set.std(ddof=1, numeric_only=True)
    summary_data['min'] = data_set.min(numeric_only=True)
    summary_data['max'] = data_set.max(numeric_only=True)

    return pd.DataFrame(summary_data).T
Example No. 29
    def testZscore(self, df: pd.DataFrame, stdev_cutoff: float = 5.0):
        """
        Checks to make sure there are no outliers using z score cutoff.
        """
        z_scores = ((df - df.mean(axis=0, skipna=True)) /
                    df.std(axis=0, skipna=True)).abs()

        self.assertEqual(0, (z_scores > stdev_cutoff).to_numpy().sum(),
                         "There are outlier values!")
Example No. 30
 def drop_high_volatility(df: pd.DataFrame, threshold=2) -> pd.DataFrame:
     """
     Remove data whose volatility is too large
     :param df:
     :param threshold: number of standard deviations the volatility may exceed
     :return: the original data with the excessively volatile columns removed
     Contributed by Gu Chengyang
     """
     return df[df.columns[df.min() < df.mean() - threshold * df.std()]]
Example No. 31
 def get_portfolio_risk(cls, weights: list, ror: pd.DataFrame) -> float:
     """
     Computes the std of portfolio returns.
     """
     # cls.weights_sum_is_one(weights)
     if isinstance(ror, pd.Series):  # required for a single asset portfolio
         return ror.std()
     weights = np.array(weights)
     covmat = ror.cov()
     return math.sqrt(weights.T @ covmat @ weights)
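The quadratic form sqrt(w' C w) should agree with taking the std of the weighted return series directly; a small consistency check on invented data:

import math
import numpy as np
import pandas as pd

rng = np.random.default_rng(11)
ror = pd.DataFrame(rng.normal(0.001, 0.02, size=(500, 3)), columns=["A", "B", "C"])
weights = np.array([0.5, 0.3, 0.2])

covmat = ror.cov()
risk_quadratic = math.sqrt(weights.T @ covmat @ weights)
risk_direct = (ror @ weights).std()
print(risk_quadratic, risk_direct)  # equal up to floating-point rounding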
Example No. 32
        def mean_and_std(cls, df: DataFrame) -> DataFrame:
            """ Standard.Specification.

                Args:
                    df: The data to be standardized.
                Returns: The mean and std (standard deviation) of data.
            """
            scale = df.std()
            scale.name = 'std'
            return cls._stack_as_rows(cls._mean(df), scale)
Example No. 33
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
    """
    Removes all rows from the given DataFrame containing outliers in any of the columns.

    :param df: Input DataFrame.
    :param zscore: z-score to use when calculating outliers.
    :return: The DataFrame with all outliers removed.
    """
    scores = (df - df.mean()) / df.std(ddof=0).values
    return df[(np.abs(scores) < zscore).all(axis=1)]
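A quick sketch of the effect on data with one planted outlier (toy data; assumes remove_outliers above is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(9)
frame = pd.DataFrame({"x": rng.normal(size=1000), "y": rng.normal(size=1000)})
frame.loc[0, "x"] = 50.0  # plant one obvious outlier

cleaned = remove_outliers(frame, zscore=3)
print(len(frame), len(cleaned))  # the planted row (plus any other row with |z| >= 3) is gone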
Example No. 34
    def testWLS(self):
        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.ix[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example No. 35
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
Example No. 36
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example No. 37
def stndize(str,list):

    s=list
    w= pd.read_csv(str,usecols=s)
    frame = DataFrame(w)

    t=frame.mean()
    print t
    z=frame.std()
    print z
    print (w-t)/z

    return;
Example No. 38
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        if sm.version.version < '0.5.0':
            raise nose.SkipTest

        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.ix[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example No. 39
    def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                                  expected_vals):
        dr = date_range(start='2015-1-17', end='2015-2-2')
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
        prices = DataFrame(index=dr, columns=tickers,
                           data=[[r1**1, r2**1, r3**1, r4**1],
                                 [r1**2, r2**2, r3**2, r4**2],
                                 [r1**3, r2**3, r3**3, r4**3],
                                 [r1**4, r2**4, r3**4, r4**4],
                                 [r1**5, r2**5, r3**5, r4**5],
                                 [r1**6, r2**6, r3**6, r4**6],
                                 [r1**7, r2**7, r3**7, r4**7],
                                 [r1**8, r2**8, r3**8, r4**8],
                                 [r1**9, r2**9, r3**9, r4**9],
                                 [r1**10, r2**10, r3**10, r4**10],
                                 [r1**11, r2**11, r3**11, r4**11],
                                 [r1**12, r2**12, r3**12, r4**12],
                                 [r1**13, r2**13, r3**13, r4**13],
                                 [r1**14, r2**14, r3**14, r4**14],
                                 [r1**15, r2**15, r3**15, r4**15],
                                 [r1**16, r2**16, r3**16, r4**16],
                                 [r1**17, r2**17, r3**17, r4**17]])
        dr2 = date_range(start='2015-1-21', end='2015-1-29')
        factor = DataFrame(index=dr2, columns=tickers,
                           data=[[3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1]]).stack()
        factor.index = factor.index.set_names(['date', 'asset'])
        factor.name = 'factor'

        cmrt = common_start_returns(
            factor,
            prices,
            before,
            after,
            False,
            mean_by_date,
            factor if demeaned else None)
        cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
        expected = DataFrame(index=range(-before, after + 1),
                             columns=['mean', 'std'], data=expected_vals)
        assert_frame_equal(cmrt, expected)
Example No. 40
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        sm_version = sm.version.version
        if sm_version < LooseVersion("0.5.0"):
            raise nose.SkipTest("WLS centered SS not fixed in statsmodels" " version {0}".format(sm_version))

        X = DataFrame(np.random.randn(30, 4), columns=["A", "B", "C", "D"])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.ix[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example No. 41
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        if sm.version.version < '0.5.0':
            raise nose.SkipTest

        print( "Make sure you're using statsmodels 0.5.0.dev-cec4f26 or later.")

        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.ix[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example No. 42
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example No. 43
class GetGenes(object):

	def __init__(self, data):
		self.dataframe = DataFrame(data)

	# read a text file and return a data frame. Records should be separated by TAB
	# There should not be duplicate column names
	def import_file(self, filename):
		# this function use to convert string to float
		def convert(x):
			try:
				x = float(x)
			except ValueError:
				pass
			return(x)

		table = []
		for line in open(filename):
			if(line.strip()):	# If not empty line
				line = line.rstrip('\n').split('\t')
				line = list(map(convert, line))
				table.append(line)
		self.dataframe = DataFrame(table[1:],columns=table[0])
		return

	def houseKeepingGenes(self, geneNum):
		# compute the CV of data
		std = array(self.dataframe.std(axis = 1))
		mean = array(self.dataframe.mean(axis = 1))
		CV = std/mean
		CV = list(map(abs, CV))		# convert to positive number

		# get the first N minimum values
		mins = nsmallest(geneNum, CV)
		print("The GOOD genes are:\n")
		for item in mins:
			print(self.dataframe.ix[CV.index(item)][0])
		return
Example No. 44
experiment_data_Raw = DataFrame({"Timestamp": quelle_timestampsRaws, "Raw key": quelle_raws, "Dataset": quelle_datasetR})
experiment_data_Raw = experiment_data_Raw.set_index("Timestamp")

final_data = concat([experiment_data_Qber,experiment_data_Raw])

final_data = final_data.sort_index()

# after preparing the data, time to plot it:

for new_counter in range(file_counter+1):
    #print new_counter
    Qbers = final_data[(final_data["Dataset"]==new_counter) & (final_data["Qber"] > 0) ]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    #preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]
    
    Raws = final_data[(final_data["Dataset"]==new_counter) & (final_data["Raw key"] > 0) ]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]
    
    Raws = Raws[Raws["Raw key"]<(x2_max - (x2_max/100)*20)]
    
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print x2_average
    #x2_std_dev = 3
Example No. 45
def test():
    # a : single adult
    # b : couple
    # c : child in a couple household
    # d : child in a single-adult household
    # e : teenager in a couple household
    # f : teenager in a single-adult household
    # g : child's bedroom

    # A: 2a,2e
    #  b + 2*c + g
    fa = [0, 1, 2, 0, 0, 0, 1]
    ma = 2754.74

    # B : 2a,2ea,supp:
    #  b + 2*e + 2*g
    fb = [0, 1, 0, 0, 2, 0, 2]
    mb = 3165.15

    # C : 1a,2e:
    #  a + 2*d + g
    fc = [1, 0, 0, 2, 0, 0, 1]
    mc = 2291.04

    # D: 2a, 2e, 2ea, 2*supp :
    #   b + 2*c + 2*e + 3*g
    fd = [0, 1, 2, 0, 2, 0, 3]
    md = 3969.81

    # E : 2a,1ea
    #    b + e + g
    fe = [0, 1, 0, 0, 1, 0, 1]
    me = 2549.17

    # F : 2a, 1e, 2ea
    #    b + c + 2*e + 2*g
    ff = [0, 1, 1, 0, 2, 0, 2]
    mf = 3514.12

    # G: 2a, 1e ,1ea, supp
    #   b + c + e + 2*g
    fg = [0, 1, 1, 0, 1, 0, 2]
    mg = 3042.39

    # H: 1a, 1ea
    #    a + f + g
    fh = [1, 0, 0, 0, 0, 1, 1]
    mh = 2103.91

    # solve f*x = m

    # A supplementary equation is needed because the system is inconsistent
    fsup = [1, -1 / 1.5, 0, 0, 0, 0, 0]
    msup = 0
    f = [fa, fb, fc, fd, fe, ff, fg, fh, fsup]
    m = [ma, mb, mc, md, me, mf, mg, mh, msup]

    results = DataFrame()

    for i in range(8):
        selected_f1 = list(f)
        selected_m1 = list(m)
        selected_f1.pop(i)
        selected_m1.pop(i)
        for j in range(7):
            selected_f = list(selected_f1)
            selected_m = list(selected_m1)
            selected_f.pop(j)
            selected_m.pop(j)

            f_mat = np.array(selected_f)

            m_vec = np.array(selected_m)

            # print i, np.linalg.det(f_mat)
            try:
                x = DataFrame({str(i) + str(j): np.linalg.solve(f_mat, m_vec)}).T
            except:

                x = None

            from pandas import concat

            if x is not None:
                results = concat([results, x])

    print results
    print results.mean()
    print results.std()
    print results.std() / results.mean()
Example No. 46
def ExerciseCheckerAlmostCorrect(path):
    
    # Gather the solution
    solution_path = path + "Solutions/Week1.xlsx"
    solution = load_workbook(solution_path, read_only=True, use_iterators=False, 
                             keep_vba=False, guess_types=False, data_only=True)

    solution_rows = prepare_book(solution)
    num_solution_records = len(solution_rows)
    
    print "The number of solution records is: " + str(num_solution_records) + "\n"
    
    num_responses = 0
    
    all_accuracy_array = []
    almost_accuracy_array = []
    
    # Gather the response
    answer_path = path + "Response/Week_1/"
    files = os.listdir(answer_path)
    for file in files:
                        
        file_type_array = file.split(".")
        file_type = file_type_array[len(file_type_array)-1]
        
        if file_type not in ["xlsx", "xlsm", "xltx", "xltm"]:
            continue
        
        print file
        
        num_responses += 1
        
        num_check = 0
        num_contain = 0
         
        try:
                        
            answer = load_workbook(answer_path + file, read_only=True, use_iterators=False, 
                               keep_vba=False, guess_types=False, data_only=True)
        
            # Gather each sheet in the answer file
            for sheet in answer:
            
                answer_rows = {}
                       
                for row in sheet.rows:
                    full_address = str.lower(str.strip(str(row[0].value)))
                    remaining_elements = set()
                    for i in range(1, len(row)):
                        remaining_elements.add(str.lower(str.strip(str(row[i].value))))
                    answer_rows[full_address] = remaining_elements
            
                # Compare the answer and the solution
                for row in solution_rows:
                    full_address = str.lower(str.strip(str(row[0].value)))
                    if answer_rows.has_key(full_address):
                        for i in range(1, len(row)):
                            num_check += 1
                            row_element = str.lower(str.strip(str(row[i].value)))
                            
                            if row_element in answer_rows[full_address] or row_element[1:len(row_element)] in answer_rows[full_address]:
                                num_contain += 1
                    
        except Exception as e:
            print "False\t" + str(e)
                            
        if num_check > 0:            
            accuracy = float(num_contain) / num_check
            all_accuracy_array.append(accuracy)
            if accuracy < 1:
                almost_accuracy_array.append(accuracy)
        else:
            almost_accuracy_array.append(accuracy)
    
    print    
    df1 = DataFrame(all_accuracy_array)
    print "All: " + str(num_responses)
    print df1.mean()
    print df1.std()
    
    print 
    df2 = DataFrame(almost_accuracy_array)
    print "Almost: " + str(len(almost_accuracy_array))
    print df2.mean()
    print df2.std()
Example No. 47
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls' #parameter initialization
data = pd.read_excel(datafile, header = None) #read the data

(data - data.min())/(data.max() - data.min()) #min-max normalization
(data - data.mean())/data.std() #zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max())) #decimal scaling normalization


###Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})
Example No. 48
        float( len( timedeltas_above_double_average ) ) / len( delta_t ) * 100

    print "Timedeltas above double average",
    print len(timedeltas_above_double_average), 
    print timedeltas_above_double_average_percent

    last_timestamp = timestamps[-1]

    print "Last timestamp", timestamps[-1]
    print "Maximal timestamp", max( timestamps )
    print "Average frequency", float( len( timestamps ) ) / ( float( last_timestamp ) / 1000 ) 


    delta_t = DataFrame( delta_t )
    delta_t.plot()
    pyplot.show()

    print "Timedelta standard deviation", float( delta_t.std() )

    font = {
        'family': 'Consolas',
        'weight': 'x-small',
        'size': 11.0,
        'stretch': 0
    }

    
    
    pyplot.rc( 'font', **font )
    pyplot.show( block=True )    
Example No. 49
WEEKCOLS=[WEEKDF,updf,dwndf]
WEEKDF=pd.concat(WEEKCOLS,axis=1)
WEEKDF.columns=['PLUSMINUSWEEK','UP RATE','DOWN RATE']
'''
print(' ')
print (WEEKDF)
'''

#find the current high low of futures


POSITION=DataFrame([0])
VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0]


WEEKSTDUP=float((VALUE*FIVEPATTERNCHANGE.std())+VALUE)
WEEKSTDDOWN=float(VALUE-(VALUE*FIVEPATTERNCHANGE.std()))


print ("The last week through the last five weeks have done the following")
print (FINALDF)
print (' ')
VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0]
VALUE=DataFrame(VALUE)
print (' ')
print ('Value price based on pattern')
print ("%.2f" % VALUE.iloc[0,0])
print (' ')

print ('One Standard Deviation Up')
print ((VALUE*FIVEPATTERNCHANGE.std())+VALUE)
Example No. 50
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)

plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        linewidth=1.5)
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2,
             capsize=7.5,
             fmt='none')


plt.show()
Example No. 51
    def run(self,Model='ridge',kernel='linear', cross_validationMethod='KFold',FeatureSelection='PCA',n_features=20,scoringList=['specificity','sensitivity','precision','f1','accuracy','ss_mean'],isSaveCsv=None,isSavePickle=None, isSaveFig=None, isPerm=0,isBetweenSubjects=True,isConcatTwoLabels=False):       
        # -- TODO :
        # --  # Greedy selection on features + Other feature selection types...
        # --  # Make sure features are selected based only on train data!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add feature analysis by facial part (see excel)
        # --  # select best model (svm, otherwise ridge regression)
        # --  # compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test) 
        # --  # continue here - check regression results - make sure regression works (not so good); check what happens in svc for G7 (high train R, negative test R)

        ## init        
        FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)]
        self.FullResults=DF()
        self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']}
        print('\n------------Learning Details------------')
        print(DF.from_dict(self.Learningdetails,orient='index'))
        print('\n----' + cross_validationMethod + ' Cross validation Results:----')
           
        # Set learning params (cross validation method, and model for learning)
        isBoolLabel=self.LabelsObject.isBoolLabel
        isBoolScores=isBoolLabel
        model, isBoolModel, featureSelectionMethod,selectFeaturesFunction= learningUtils.setModel(Model,FeatureSelection,n_features)
        #define global variables over modules (to be used in myUtils)
        globalVars.transformMargins=0#lambda x:x         
        globalVars.isBoolLabel=isBoolLabel
        globalVars.isBoolModel=isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects 
        trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject)
        trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2)



        
        
        
        LabelingList=['N1']#trainLabels_all.columns
        self.ResultsDF=DF()
        self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO 
       
                                              
        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList.
        for label_ind, Labeling in enumerate(LabelingList):
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            #set subjects list according to labels and features
            X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling)
            X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1)
            
            #init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all)
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2)
            
            #make sure only labeled subjects are used for classification
            X=X.query('subject == '+ str(list(trainLabels.index)) ) 
            X.index.get_level_values(X.index.names[0]) 
            SubjectIndex=list(set(X.index.get_level_values('subject')))

            X2=X2.query('subject == '+ str(list(trainLabels2.index)) )  
            X2.index.get_level_values(X2.index.names[0]) 
            SubjectIndex2=list(set(X2.index.get_level_values('subject')))                       
            #init vars
            if isBetweenSubjects:
                cv_param=len(SubjectIndex)
                self.Learningdetails['CrossValSubjects']='between'
                isWithinSubjects=False
            else:
                isWithinSubjects=True
                X=X.swaplevel(0,1)
                PieceIndex=list(set(X.index.get_level_values('Piece_ind')))
                cv_param=len(PieceIndex)
                self.Learningdetails['CrossValSubjects']='within'
            
            self.Learningdetails['NumOfFeatures']=n_features
            
            print('\n**' + Labeling + '**')
            
            cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) 
            
            ## Learning - feature selection for different scoring types, with cross validation - 

            BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis
            cv_ind=0
            #used for transforming margins returned from the svm into continuous labels (e.g. PANSS)
            trainScores=DF()
            test_index=X.index
            testScores=concat([DF(index=test_index),DF(index=['std_train_err'])])
            testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) 
            #impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange=LabelRange

            ModelWeights1=DF(columns=range(len(cv)),index=X.columns)
            Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning
            ExplainedVar=DF(columns=range(len(cv)))
            ModelWeights2=DF(columns=range(len(cv)))
            for train, test in cv:

                if isBetweenSubjects:
                    #set X and Y
                    train_subjects=trainLabels.iloc[train].index
                    test_subjects=testLabels.iloc[test].index 
                    Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects)
                    Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects)

                    
                    if isConcatTwoLabels: #used when there is more than one doctor
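                        # Stack features and labels from both raters so training and testing use the pooled data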
                        Xtrain=concat([Xtrain,Xtrain2])
                        Xtest=concat([Xtest,Xtest2])
                        Ytrain=concat([Ytrain,Ytrain2])
                        YtrainTrue=concat([YtrainTrue,YtrainTrue2])
                        Ytest=concat([Ytest,Ytest2])
                        Xdropped=concat([Xdropped,Xdropped2])
                        SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test.
                    """else:
                        Xtrain=Xtrain1
                        Xtest=Xtest1
                        Xdropped=Xdropped1
                        Ytrain=Ytrain1
                        YtrainTrue=YtrainTrue1
                        Ytest=Ytest1"""

                    #select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar,decomposeFunc=learningUtils.selectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeaturesFunction)
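                    # When the selection method is a decomposition (e.g. PCA), components, their explained
                    # variance and decomposeFunc (which applies the same transform to new data) are also returned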
                    BestFeaturesForLabel.add(bestNfeatures) #todo - delete this??     

                    #train 1 
                    TrainModel=model
                    TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index())
                    try:
                        Components[cv_ind]=components.T
                        ExplainedVar[cv_ind]=explainedVar
                        isDecompose=True
                        if cv_ind==0:
                            ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures)))
                        ModelWeights1[cv_ind]=TrainModel.coef_.flatten()
                    except AttributeError:
                        isDecompose=False
                        ModelWeights1[cv_ind].loc[bestNfeatures]=TrainModel.coef_.flatten()
                    self.isDecompose=isDecompose                    
                    #train 2
                    if isBoolLabel:
                       PiecePrediction_train=DF(TrainModel.predict(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1})
                    else:
                       PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=linear_model.LinearRegression()

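                    # Second learning stage: the first model's per-piece predictions are passed to getX2Y2
                    # to build subject-level inputs for a second model (SVC for boolean labels, linear regression otherwise)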
                    Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel)                 
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind==0:
                        ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns)
                    ModelWeights2[cv_ind]=TrainModel2.coef_.flatten()         

                              
                    #test 1
                    if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped=DF(decomposeFunc(Xdropped).values,index=Xdropped.index)
                        else:
                            dXdropped=Xdropped
                        XtestDropped=dXdropped[bestNfeatures]
                        YtestDropped=Series(XtestDropped.copy().iloc[:,0]) #icol is deprecated; iloc selects the first column
                        #YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject]=testLabels_all[Labeling].loc[subject]
                            #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest=concat([Ytest,YtestDropped]).sort_index()
                        Xtest=concat([Xtest,XtestDropped]).sort_index()


                    if isPerm: #TODO- Check this!!
                        Ytest=y_perms.loc[Ytest.index]
                    Xtest=Xtest.fillna(0.)
                    
                    
                elif isWithinSubjects:
                    #train 1
                    train_pieces=[PieceIndex[i] for i in train] #PieceIndex is a plain list, so index it element-wise
                    test_pieces=[PieceIndex[i] for i in test] #TODO - make sure that if test/train > piece index, it is ignored and the process repeats
                    
                    XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces)))
                    Ytrain=Series(index=X.index)
                    Ytest=Series(index=X.index)
                    YtrainTrue=Series(index=X.index)
                    
                    for subject in PieceIndex: 
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject]=trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] 
                            Ytest.loc[piece].loc[subject]=testLabels[subject]   
                    Ytrain=Ytrain.dropna()
                    YtrainTrue=YtrainTrue.dropna() 
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject]=testLabels[subject]
                #train scores 1       
                if cv_ind==0:
                    trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    plt.figure(1)
                    if len(LabelingList)>1:
                        plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    if isBoolLabel:
                        testScores=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    else:
                        testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                        plt.title(Labeling,fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    trainScores=concat([trainScores,new_trainScores],axis=1)
                #test 1   
                    testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                
                #train2

                if isBoolLabel:
                    PiecePrediction_test=DF(TrainModel.predict(Xtest),index=Xtest.index,columns=['prediction'])
                else:
                    PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction'])
                Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test, isBoolLabel)
                
                if cv_ind==0:
                    trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    #plt.figure(1)
                    #if len(LabelingList)>1:
                        #plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                #test2
                    if isBoolLabel:
                        testScores2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    else:
                        testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    #plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    trainScores2=concat([trainScores2,new_trainScores2],axis=1)
                    testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)     
                cv_ind+=1

                #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. 
            fig2=plt.figure(2)
            if len(LabelingList)>1:
                plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
            #if isAddDroppedSubjects:
               # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
           # else:
               # testLabelsSummary=testLabels
            scoresSummary = learningUtils.getScoresSummary(trainScores2,testScores2,TrueLabels[Labeling])
            # reset global vars
            globalVars.fitYscale='notDefined'
            globalVars.beta=DF()

            plt.title(Labeling,fontsize=10)
            plt.xlabel('Ytrue',fontsize=8)
            plt.ylabel('Ypredicted',fontsize=8)
            plt.tick_params(labelsize=6)
            #print(crossValScores.T)    
            scores=scoresSummary.fillna(0.)
            
            #analyze feature weights

            WeightedFeatures1=DF([ModelWeights1.mean(axis=1),ModelWeights1.std(axis=1)],index=['mean','std']).T.fillna(0)
            if isDecompose==0:
                WeightedFeatures1FeatureType=WeightedFeatures1.mean(level='FeatureType')
                WeightedFeatures1FsSignal=WeightedFeatures1.mean(level='fs-signal')
                WeightedFeatures1=concat([DF(index=['-------(A) FeatureType-------']),WeightedFeatures1FeatureType,DF(index=['-------(B) faceshift signal-------']),WeightedFeatures1FsSignal])
            
            WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0)
            BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2])
            self.BestFeatures[Labeling]=BestFeatures['mean']

            #analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check!
                ExplainedVar_mean.index=['ExplainedVar_mean']
                ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check!
                ExplainedVar_std.index=['ExplainedVar_std']
                try:
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
                except AttributeError:
                    self.LabelComponents=dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

                        
            #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults=concat([DF(index=[Labeling]),scores]) 
  
            self.FullResults=concat([self.FullResults,LabelFullResults])            
            self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1)
#TODO: build the pseudo-inverse matrix mapping predicted to true labels - make sure columns and rows are aligned

            #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

            #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score...
        FullSubjectsList=YpredictedOverAllLabels[0].columns
        YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all')
        YdroppNans=YdroppNans.dropna(axis=1,how='all')
        YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all')
        notNans_cv_ind=YpredictedOverAllLabels.items
        notNans_trainSubjects=YpredictedOverAllLabels.minor_axis
        notNans_LabelsList=YpredictedOverAllLabels.major_axis
        notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind=0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train)>0 and len(test)>0: 
                    AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue=notNans_TrueLabels[train]
                    AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue=notNans_TrueLabels[test]

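                    # Least-squares fit of a labels-by-labels transformation T with T*Ytrue ~ Ypredicted:
                    # T = Ypredicted . pinv(Ytrue), where pinv is the Moore-Penrose pseudo-inverse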
                    pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns)
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!!
                TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix)
                testScores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3)
            cv_ind+=1

           
        self.ResultsDF=self.ResultsDF.fillna(0.)  
        
        ## Print and save results  
        print('\n')
        print(self.ResultsDF)
        print('\n')
        D=self.Learningdetails 
        savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+'_Features'+D['FeatureMethod']+ '_FS'+FeatureSelection+'_Kernel'+D['Kernel']+'_'+D['CrossValSubjects']+'Subjects_PieceSize'+D['PieceLength']
        if isPerm:
            savePath=savePath+'_PERMStest'
        saveName=savePath+'\\'+str(n_features)+'_features'        
        self.Learningdetails['saveDir']=savePath
        dir=os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle=int(raw_input('Save Results to pickle? '))
        if isSaveCsv is None:
            isSaveCsv=int(raw_input('save Results to csv? '))
        if isSaveFig is None:
            isSaveFig=int(raw_input('save Results to figure? '))

       
        if isSavePickle:        
            self.ResultsDF.to_pickle(saveName+'.pickle')
            self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle')
                
        if isSaveCsv:
            DetailsDF=DF.from_dict(self.Learningdetails,orient='index')
            ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures])
            ResultsCSV.to_csv(saveName+'.csv')

        if isSaveCsv or isSavePickle:
            print('successfully saved as:\n' + saveName)
        
        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + 'Train.png')
            plt.figure(2)
            plt.savefig(saveName + 'Test.png')
        plt.close()
        plt.close()
Ejemplo n.º 52
0
test.ix[ser_id2].value_counts(sort=False).plot(kind='bar')
test.ix[ser_id1].value_counts(sort=False).plot(kind='bar')

# Sampling from the overlapped rated movies to calculate the correlation
periods_test = DataFrame(np.zeros((20,7)),columns=[int(ser_max/100),int(ser_max/50),int(ser_max/20),int(ser_max/10),int(ser_max/5),int(ser_max/2),ser_max])
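# Rows are 20 sampling repetitions; columns are candidate sample sizes (fractions of ser_max)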
for i in periods_test.index:   # Sampling 20 times
    for j in periods_test.columns:
         sample = test.reindex(columns=np.random.permutation(test.columns)[:j])
         periods_test.ix[i,j] = sample.iloc[0].corr(sample.iloc[1])  # ix is for label index, iloc is for int index
print(periods_test[:5])
print(periods_test.describe())

threshold = 0.1
temp_std = 0
# Find the smallest sample size at which the sampled correlation becomes stable
# (its std across repetitions drops below the threshold)
for i, std in enumerate(periods_test.std()):
    if std < threshold and temp_std >= threshold:
        mini_period = periods_test.columns[i]
        break
    temp_std = std

# Decide the value of min_periods, using the std threshold defined above
# mini_period = 200
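# Possible next step (a sketch, not part of the original snippet): pass the chosen value
# as min_periods when computing user-user correlations, assuming `data` has users as rows
# and movies as columns, e.g.
#     user_corr = data.T.corr(min_periods=mini_period)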
check_size = int(len(data.index) * 0.2)   # 20% dataset for testing
check = {}
check_data = data.copy() # Avoid modifying the original data
check_data = check_data.ix[check_data.count(axis=1) > mini_period]    # Keep only users with more than mini_period ratings (count(axis=1) counts non-NaN entries per row)
for user in np.random.permutation(check_data.index):
    movie = np.random.permutation(check_data.ix[user].dropna().index)[0]
    check[(user,movie)] = check_data.ix[user,movie]
    check_data.ix[user,movie] = np.nan
Ejemplo n.º 53
0
def discretise_cnv(matrix, filter_sd=True, lower_bound=-1, upper_bound=1):
    matrix_discrete = DataFrame(0, index=matrix.axes[0], columns=matrix.axes[1])
    matrix_discrete[matrix <= lower_bound] = -1.2
    matrix_discrete[matrix >= upper_bound] = 1.2
    return matrix_discrete.loc[:, matrix_discrete.std() != 0] if filter_sd else matrix_discrete
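# A minimal usage sketch (hypothetical data, not part of the original snippet):
#     cnv = DataFrame([[-2.0, 0.5], [1.5, 0.3]], index=['s1', 's2'], columns=['geneA', 'geneB'])
#     discretise_cnv(cnv)   # geneA becomes [-1.2, 1.2]; geneB discretises to all zeros and is dropped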
Ejemplo n.º 54
0
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)

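# Bar heights are the column means; the error bars (yerr) show one standard deviation per column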
plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        yerr=df.std(),
        ecolor='black',
        capsize=5,
        linewidth=1)
plt.grid()

plt.show()