Code example #1
    def calculate_distances(self, similarity_type):
        # 1. determine the type of similarity that needs to be calculated:
        if similarity_type == "m":
            matrix = self.ratings_matrix
            similarity_matrix = pd.pivot(self.ratings_matrix.columns,
                                         self.ratings_matrix.columns,
                                         np.ones(self.movies_count))
        else:
            matrix = self.ratings_matrix.transpose()
            similarity_matrix = pd.pivot(self.ratings_matrix.index,
                                         self.ratings_matrix.index,
                                         np.ones(self.users_count))

        # 2. Calculate the cosine distance between each pair of movies/users, depending on the chosen type
        for x in matrix:
            for y in matrix:
                if x != y:
                    x_matrix = matrix[x]
                    y_matrix = matrix[y]
                    similarity_matrix.loc[y, x] = cosine(x_matrix, y_matrix)
            # Sort the rows by their distance to x, in descending order
            similarity_matrix = similarity_matrix.reindex(
                similarity_matrix[x].sort_values(ascending=False).index)
        # 3. Return similarity matrix
        return similarity_matrix
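For comparison, a minimal vectorized sketch of the same pairwise cosine computation (an alternative to the nested loops above; it assumes a ratings frame with one column per item and uses scipy's cdist):

import pandas as pd
from scipy.spatial.distance import cdist

def cosine_distance_matrix(matrix: pd.DataFrame) -> pd.DataFrame:
    # Pairwise cosine distances between the columns of `matrix`
    dists = cdist(matrix.T.values, matrix.T.values, metric='cosine')
    return pd.DataFrame(dists, index=matrix.columns, columns=matrix.columns)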
Code example #2
def submission():
    # Train the model
    model = train(seed=args.seed,
                  learning_rate=args.learning_rate,
                  max_depth=args.max_depth,
                  num_leaves=args.num_leaves,
                  min_child_weight=args.min_child_weight,
                  reg_alpha=args.reg_alpha,
                  subsample=args.subsample,
                  num_boost_round=args.num_boost_round,
                  early_stopping_rounds=args.early_stopping_rounds,
                  data_folder=args.data_folder)

    # Inference part
    df_val = pd.read_csv(args.data_folder + "/val.csv")
    df_eval = pd.read_csv(args.data_folder + '/eval.csv')

    drop_list = ['id', 'day', 'demand']

    val_submission = df_val[drop_list].copy()
    eval_submission = df_eval[drop_list].copy()

    x_val = df_val.drop(drop_list, axis=1)
    x_eval = df_eval.drop(drop_list, axis=1)

    # Output the predictions
    v_pred = model.predict(x_val)
    e_pred = model.predict(x_eval)

    # Convert the outputs to Series
    val_submission['demand'] = pd.Series(v_pred)
    eval_submission['demand'] = pd.Series(e_pred)

    # Un-melt: pivot back to wide format
    val_submission = pd.pivot(val_submission,
                              index='id',
                              columns='day',
                              values='demand').reset_index()
    eval_submission = pd.pivot(eval_submission,
                               index='id',
                               columns='day',
                               values='demand').reset_index()

    # Reshape to match the submission file format
    val_submission.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    eval_submission.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    try:
        sub = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
    except FileNotFoundError:
        sub = pd.read_csv(args.data_folder + '/sample_submission.csv')

    v_submission = pd.merge(sub.iloc[:, :1], val_submission, on='id')
    e_submission = pd.merge(sub.iloc[:, :1], eval_submission, on='id')

    submission = pd.concat([v_submission, e_submission], axis=0)

    submission.to_csv(args.output_folder + '/submission.csv', index=False)
Code example #3
def main():
    filename = "./input/npload.csv"
    # res = np.loadtxt(filename,dtype=str, delimiter="\t")
    # print type(res)
    # write_csv(res, "./input/npload.csv")
    df = pd.read_csv(filename, header=None, index_col=0, usecols=(1,2,3), skiprows=0)
    print(df.head(5))
    # Unfinished exploratory calls; both require arguments and would raise TypeError:
    # pd.read_excel()
    # pd.pivot()
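For reference, a minimal sketch of the keyword form that pd.pivot expects in current pandas (the frame and column names below are invented for illustration):

import pandas as pd

long_df = pd.DataFrame({
    'date': ['d1', 'd1', 'd2', 'd2'],
    'item': ['a', 'b', 'a', 'b'],
    'value': [1, 2, 3, 4],
})
# One row per 'date', one column per 'item'
wide_df = pd.pivot(long_df, index='date', columns='item', values='value')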
Code example #4
def events_calendar():
    calendar_df = pd.read_csv(
        CALENDAR,
        usecols=["event_name_1", "event_name_2", "event_type_1", "event_type_2"],
    )
    ev1_df = pd.pivot(calendar_df, columns="event_name_1", values="event_type_1")
    ev2_df = pd.pivot(calendar_df, columns="event_name_2", values="event_type_2")
    ev1_df = ev1_df.fillna(ev2_df)
    bool_events_df = ev1_df.notna()
    return bool_events_df.values, ev1_df.values
Code example #5
def update_output(list_of_contents, list_of_names, list_of_dates):
    '''
    param: list_of_contents - all of the files uploaded
    param: list_of_names - names of the uploaded files
    param: list_of_dates - dates on which the files were last modified
    return: irrelevant output; it is never printed and exists only to
    comply with needing an Output for every callback
    '''
    

    if not os.path.exists(config['outputs']): 
        os.makedirs(config['outputs'])
    if list_of_contents is not None:
        number_sheets = utils_app.parse_contents(list_of_contents[0], list_of_names[0], list_of_dates[0])
        if number_sheets == 4: 
            partner_solver_weights = pd.read_excel(config['outputs'] + config['partner-solver-inital-weights'], sheet_name= 'Partner Solver Weights')
            geo_weights_pivot = pd.pivot(partner_solver_weights[['Org_y', 'Org_x', 'geo_weights']], columns='Org_x', index='Org_y' )
            needs_weights_pivot = pd.pivot(partner_solver_weights[['Org_y', 'Org_x', 'needs_weights']], columns='Org_x', index='Org_y' )
            challenge_weights_pivot = pd.pivot(partner_solver_weights[['Org_y', 'Org_x', 'challenge_weights']], columns='Org_x', index='Org_y' )
            stage_weights_pivot = pd.pivot(partner_solver_weights[['Org_y', 'Org_x', 'stage_weights']], columns='Org_x', index='Org_y' )
            tech_weights_pivot = pd.pivot(partner_solver_weights[['Org_y', 'Org_x', 'tech_weights']], columns='Org_x', index='Org_y' )
            
            # List_of_uploaded_files is fully available here
            new_total_score = create_total_score_excel(config['outputs'],
                                                        geo_weights_pivot,
                                                        needs_weights_pivot,
                                                        challenge_weights_pivot, 
                                                        stage_weights_pivot, 
                                                        tech_weights_pivot )
            
            # new_total_score.insert(0, "Partners", Partners, True)
            children = "Generated outputs"
            solver_df =  pd.read_csv(config['solver_location'])
            partners_df = pd.read_csv(config['partner_location'])
            solver_options = solver_df['Org']
            solver_options = solver_options.to_frame(name='Solvers')
            matches = ['None' for x in range(0, solver_options.shape[0])]
            solver_options['matches'] = matches
            solver_options.to_excel(config['solver_options'], sheet_name='Solver Options', index=False)       
           
            with pd.ExcelWriter(config['output_weights'], mode='w') as writer: 
                solver_df.to_excel(writer, sheet_name='Solver Team Data', index=False)
                partners_df.to_excel(writer, sheet_name='Partner Data', index=False)
                partner_solver_weights.to_excel(writer, sheet_name='Partner Solver Weights', index=False)
               
            
        else:
            children = "Input file must be an Excel file with three sheets: Solver Team Data, Partner Data, Initial Weights"
    else:
        children = "Input file must be an Excel file with three sheets: Solver Team Data, Partner Data, Initial Weights"
    return children
Code example #6
def make_pivot(feature: str,
               index: str,
               column: str,
               data: pd.DataFrame,
               groupby_args: list = None):
    """Create two types of pivot matrices: count and mean

    Args:
        feature (str): Feature that is used as a value for the pivot tables. Needs to be numeric
        index (str): Name of rows of the pivot table
        column (str): Name of columns of the pivot table
        data (pd.DataFrame): Data frame containing the data
        groupby_args (list, optional): Arguments to pass to groupby. Defaults to None.

    Returns:
        (pd.DataFrame, pd.DataFrame): Count and mean pivot tables
    """

    groupby_args = groupby_args or [index, column]

    grouped = data.groupby(groupby_args)[feature].count().to_frame(
        name=f"count_{feature}")

    try:
        grouped[f"mean_{feature}"] = data.groupby(groupby_args)[feature].mean()

    except ValueError:
        if not np.issubdtype(data[feature].dtype, np.number):
            msg = f"Expected feature {feature} to be of a numeric data type. Got {data[feature].dtype}."
            raise TypeError(msg)

        raise

    grouped.reset_index(inplace=True)
    grouped.sort_values(by=[index, column], inplace=True, ascending=False)

    pivot_count = pd.pivot(grouped,
                           index=index,
                           columns=column,
                           values=f"count_{feature}")
    pivot_mean = pd.pivot(grouped,
                          index=index,
                          columns=column,
                          values=f"mean_{feature}")

    pivot_count.sort_index(inplace=True, ascending=False)
    pivot_mean.sort_index(inplace=True, ascending=False)

    return pivot_count, pivot_mean
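A hypothetical call, assuming a frame with a numeric 'price' column and categorical 'city' and 'month' columns (all names invented, not from the original project):

import pandas as pd

df = pd.DataFrame({
    'city': ['a', 'a', 'b', 'b'],
    'month': [1, 2, 1, 2],
    'price': [10.0, 12.0, 9.0, 11.0],
})
# pivot_count holds per-cell row counts, pivot_mean per-cell means
pivot_count, pivot_mean = make_pivot(feature='price',
                                     index='city',
                                     column='month',
                                     data=df)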
Code example #7
    def evaluate(self, postprocess=False):
        assert len(self.models) != 0, 'Model is not trained...'
        print('Evaluate...')

        pred_val = np.zeros(len(self.val_id))
        pred_eval = np.zeros(len(self.eval_id))

        for model in self.models:
            pred_val += model.predict(self.vals, num_iteration=model.best_iteration) / len(self.models)
            pred_eval += model.predict(self.evals, num_iteration=model.best_iteration) / len(self.models)

        res_val = pd.DataFrame({
            'id': self.val_id,
            'date': self.val_date,
            'demand': pred_val
        })

        res_val = pd.pivot(res_val, index='id', columns='date', values='demand').reset_index()

        res_eval = pd.DataFrame({
            'id': self.eval_id,
            'date': self.eval_date,
            'demand': pred_eval
        })

        res_eval = pd.pivot(res_eval, index='id', columns='date', values='demand').reset_index()

        F_list = [f'F{i + 1}' for i in range(28)]

        res_val.columns = ['id'] + F_list
        res_eval.columns = ['id'] + F_list

        res = pd.concat([res_val, res_eval], axis=0)

        if postprocess:
            alphas = [1.035, 1.03, 1.025]
            weights = [1 / len(alphas)] * len(alphas)
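            # With equal weights, the loop below scales each F column by mean(alphas) = 1.03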
            _res = res.copy()
            for f in F_list:
                _res[f] = 0

                for alpha, weight in zip(alphas, weights):
                    _res[f] += alpha * weight * res[f]

            return _res

        else:
            return res
Code example #8
def plot_heatmap(comp, test):
    test = test.drop_duplicates()
    # pivot the results into a table
    table = pd.pivot(test.loc[test.reject_null == True].sort_values(by="mrca_2"),
                     index="tf", columns="mrca_2", values='log2')
    table = table.dropna(thresh=2)  # keep rows with at least 2 non-NA values
    table = table.replace(-np.Inf, -2)
    table = table.replace(np.Inf, 2)

    if len(table) < 25:
        figsize = (5, 10)
    else:
        figsize = (5, 30)
    # plot
    sns.set("notebook")
    cm = sns.clustermap(table.fillna(0),
                        mask=(table == 0),
                        cmap="RdBu_r",
                        center=0,
                        col_cluster=False,
                        figsize=figsize)

    cm.fig.suptitle(comp)
    outf = f"{RE}{val}_{comp}_clustermap_core_mrca2.pdf"
    plt.savefig(outf, bbox_inches="tight", dpi=300)
Code example #9
    def _fit(self, X, y=None):
        total_answers, answers = self.get_answers_ratios(X)
        answers = answers.drop(columns=['is_correct'])

        # Merge with X
        X = pd.merge(X, answers, on=['content_id', 'user_answer'], how='left')
        tasks = X.groupby(['user_id', 'task_container_id'],
                          sort=False)['answer_ratio'].mean().reset_index()

        # Get context
        self.priors = total_answers.to_dict(orient='index')
        self.answers = pd.pivot(answers,
                                index='content_id',
                                columns='user_answer',
                                values='answer_ratio').to_dict(orient='index')

        self.features = ['answer_ratio']
        self.context = tasks[['user_id'] + self.features].drop_duplicates(
            'user_id',
            keep='last').set_index('user_id').to_dict(orient='index')

        tasks['prior_answer_ratio'] = tasks.groupby(
            'user_id')['answer_ratio'].shift()
        tasks = tasks.drop(columns=['answer_ratio'])

        X = pd.merge(X, tasks, on=['user_id', 'task_container_id'], how='left')
        X = X.drop(columns=['answer_ratio'])
        X['prior_answer_ratio'] = X['prior_answer_ratio'].astype(FLOAT_DTYPE)
        return X
Code example #10
def main(file, trainsize):
    data = pd.read_json(file)

    assert (data.loc[:, "Training Size"] == trainsize).any()
    # Drop all other training sizes
    data = data.loc[data["Training Size"] == trainsize, :]
    assert len(data) > 0

    data["f1_loose_l1"] = score(f1_score, data["c_Normal"], data["ic_Normal"],
                                data["c_loose_l1"], data["ic_loose_l1"])

    algorithms = data["Algorithm"].unique()
    p_values = pd.DataFrame(index=algorithms, columns=algorithms)

    for alg1 in algorithms:
        for alg2 in algorithms:
            if alg1 == alg2:
                continue
            selection = data.loc[(data["Algorithm"].isin([alg1, alg2])), :]
            pivoted = pd.pivot(selection,
                               values="f1_loose_l1",
                               columns="Algorithm",
                               index=["Time of Start"])

            p_values.at[alg1, alg2] = ttest_rel(pivoted[alg1],
                                                pivoted[alg2])[1]

    print(p_values)
Code example #11
def spread_dataset(type, data, sel_year):
    index_cols = list(data.columns)
    index_cols.remove('type')
    index_cols.remove('value')

    data = pd.pivot(data, index=index_cols, columns='type',
                    values='value').reset_index().fillna(0)

    if type == 'sm':
        data['Avg Hourly Wages'] = data.groupby(
            ['state', 'year'])['Avg Hourly Wages'].transform('max')

        # Calculate percentage change
        data['wage_inc'] = (
            data.groupby(['state', 'cbsa_area'
                          ])['Avg Hourly Wages'].transform('last') /
            data.groupby(['state', 'cbsa_area'
                          ])['Avg Hourly Wages'].transform('first')
        ).apply(lambda x: round(100 * x, 2))
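        # Note: as written, wage_inc is 100 * (last / first), an index relative
        # to the first period, not a percentage change (that would be
        # 100 * (last / first - 1)).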

        data = data.loc[data['cbsa_area'] != 'Statewide', ]

    elif type == 'cpi':
        # Calculate percentage change
        data[cpi_columns] = (
            data.groupby(['area'])[cpi_columns].transform('last') /
            data.groupby(['area'])[cpi_columns].transform('first')
        ).apply(lambda x: round(100 * x, 2))

    data.drop(data.loc[data['year'] != sel_year, ].index, inplace=True)

    return data
Code example #12
def preprocess(df):
    """
    step 1. 去重
    step 2. 旋转
    step 3. 格式化
    step 4. 指标排序、日期排序
    step 5. 检查空值
    step 6. 异常分析与缺失处理

    # :param inputFile:
    # :param outputFile:
    :return:
    """
    # df = pd.read_excel(inputFile, engine='openpyxl')
    df = df.drop_duplicates(subset=['proc_time', 'collect_name'])  # drop duplicates
    pivoted = pd.pivot(df, values='collect_value', columns='collect_name', index='proc_time')  # pivot
    pivoted.index = pd.to_datetime(pivoted.index)
    pivoted.sort_index(inplace=True)
    pivoted = pivoted[sortTable['name']]  # select the 255 metrics

    if not (pivoted.count() == pivoted.shape[0]).all():
        raise Exception("Missing values found!")

    pivoted = wash_data(pivoted)  # remove outliers using the box-plot rule
    # pivoted.to_excel(outputFile)

    return pivoted
Code example #13
def global_plot_confirmed():
    df = df_global_confirmed.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        var_name='Date',
        value_name='Confirmed')
    df['Date'] = pd.to_datetime(df['Date'])

    df_state = df.groupby(by=['Country/Region', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[(df_state['Confirmed'] > GLOBAL_CONFIRMED_THRESHOLD)]

    df_state.drop(columns=['Lat', 'Long'], inplace=True)
    df_state = pd.pivot(df_state,
                        index='Date',
                        columns='Country/Region',
                        values='Confirmed')
    ax = df_state.plot(title='Global confirmed cases time series',
                       grid=True,
                       lw=2,
                       colormap='jet',
                       markersize=10,
                       x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Confirmed')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
Code example #14
def us_plot_deaths():
    df = df_us_deaths.melt(id_vars=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population'
    ],
                           var_name='Date',
                           value_name='Deaths')
    df['Date'] = pd.to_datetime(df['Date'])

    df_state = df.groupby(by=['Province_State', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[(df_state['Deaths'] > US_STATES_DEATHS_THRESHOLD)]

    df_state.drop(
        columns=['UID', 'code3', 'FIPS', 'Lat', 'Long_', 'Population'],
        inplace=True)
    df_state = pd.pivot(df_state,
                        index='Date',
                        columns='Province_State',
                        values='Deaths')
    ax = df_state.plot(title='US States deaths time series',
                       grid=True,
                       lw=2,
                       colormap='jet',
                       markersize=10,
                       x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Deaths')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
Code example #15
def global_pct_change_deaths():
    df = df_global_deaths.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        var_name='Date',
        value_name='Deaths')
    df['Date'] = pd.to_datetime(df['Date'])

    df_state = df.groupby(by=['Country/Region', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[(df_state['Deaths'] > GLOBAL_DEATHS_THRESHOLD)]

    df_state.drop(columns=['Lat', 'Long'], inplace=True)
    df_state = pd.pivot(df_state,
                        index='Date',
                        columns='Country/Region',
                        values='Deaths')
    df_pct_change = df_state.pct_change()

    ax = df_pct_change.plot(title='Global day-to-day percent change in deaths',
                            grid=True,
                            lw=2,
                            colormap='jet',
                            markersize=10,
                            x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Percent Change')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
Code example #16
File: lgb.py Project: brett-gt/Kaggle-M5
    def make_submission(self):
        #TODO: This was at the end of train
        print("Predicting with model...")
        val_pred = self.model.predict(self.x_val[features])
        val_score = np.sqrt(metrics.mean_squared_error(val_pred, self.y_val))

        print(f'Our val rmse score is {val_score}')
        y_pred = self.model.predict(self.test[features])
        self.test['demand'] = y_pred
        #return test

        print("Preparing submission...")
        submission = pd.read_csv('Data/sample_submission.csv')

        predictions = self.test[['id', 'date', 'demand']]
        predictions = pd.pivot(predictions,
                               index='id',
                               columns='date',
                               values='demand').reset_index()
        predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

        evaluation_rows = [
            row for row in submission['id'] if 'evaluation' in row
        ]
        evaluation = submission[submission['id'].isin(evaluation_rows)]

        validation = submission[['id']].merge(predictions, on='id')
        final = pd.concat([validation, evaluation])
        final.to_csv('submission.csv', index=False)
        return final
Code example #17
def plot_per_dataset_delta_overlap(res):

    groupby_df = res.groupby(["desc",
                              "arch"])["count_overlap"].median().reset_index()

    table = pd.pivot(groupby_df,
                     index="desc",
                     columns="arch",
                     values="count_overlap")
    print(table.head())

    table[["simple", "complex"]] = table[["simple", "complex"]].astype(int)
    fig, ax = plt.subplots(figsize=(6, 6))

    x, y = "simple", "complex"
    data = table

    g = sns.pointplot(x=x, y=y, data=data, join=False)
    ax.legend(bbox_to_anchor=(1, 1)).remove()
    ax.yaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.set(xlim=(-1, 14),
           ylim=(-1, 14),
           xlabel="median simple",
           ylabel="median complex",
           title="simple v. complex overlap\n per tissue")
    sns.lineplot(x=[0, 14], y=[0, 14], ls="--")
Code example #18
    def calc_activity_consistency_multi_process(self):
        gc.collect()
        print(time.ctime(), 'Calculate activity and consistency...')
        df = pd.DataFrame()

        pool = mp.Pool()  # Defaults to one worker process per CPU.
        results = [
            pool.apply_async(self.process_samples, args=(x, ))
            for x in self.chunker_columns(20)
        ]
        for p in results:
            df = pd.concat([df, p.get()[0]])  # f.get(timeout=100)
            print('.', end="")
            sys.stdout.flush()

        #process_samples(self.udp)[0]
        df.drop(['molRole'], inplace=True, axis=1)
        df.drop_duplicates(inplace=True)
        #df['Activity'] = df.Activity #Consistency
        df = pd.pivot(df,
                      columns='sampleID',
                      index='path_name',
                      values='Activity')
        pool.close()
        df.to_csv('./data/output_activity.csv')
        self.activity = df
        print(time.ctime(), "Done.")
Code example #19
def count_top_scores(summary_df, complete_only=False, min_diff=None):
    summary_df = summary_df.sort_values("mean", ascending=False)

    # only consider queries where all systems completed...
    if complete_only:
        summary_df = filter_incomplete(summary_df)
    # top score by folder/dataset
    top_df = summary_df.groupby(["folder_num", "dataset"]).head(1)

    if min_diff is not None:
        diffs = summary_df.groupby(
            ["folder_num",
             "dataset"])["mean"].apply(lambda x: x.values[0] - x.values[1])
        diffs = diffs >= min_diff
        diffs = diffs.to_frame(name="sat_min_diff").reset_index()
        top_df_ext = pd.merge(top_df,
                              diffs,
                              how="left",
                              on=["folder_num", "dataset"])
        top_df = top_df_ext[top_df_ext["sat_min_diff"]].reset_index(drop=True)

    # count of system entries for that folder (i.e. # of datasets where it wins)
    top_df = top_df.groupby(["folder_num", "name"]).size().to_frame(name="ct")
    top_df = top_df.reset_index()
    top_df = pd.pivot(top_df, index="folder_num", columns="name", values="ct")
    top_df = top_df.fillna(0)
    return top_df
Code example #20
def melt_to_pivot(melt_df, target_col, encoder_dict):
    index_columns = [
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
    ]
    index_df = melt_df[index_columns].drop_duplicates().copy()
    demand_df = pd.pivot(melt_df,
                         index='id',
                         columns='date',
                         values=target_col).reset_index().copy()
    demand_df = pd.merge(index_df, demand_df, on='id', how='left')
    demand_df = demand_df.reset_index(drop=True)
    # print(demand_df)
    # print(demand_df.columns)
    demand_df.columns = index_columns + \
        [f'd_{i}' for i in range(
            1, len(demand_df.columns) - len(index_columns) + 1)]

    # print('decoding categorical columns...')
    for col in index_columns:
        # print(col)
        encoder = encoder_dict.get(col)
        if encoder is not None:
            # print(f'aa: {col}')
            demand_df[col] = encoder.inverse_transform(demand_df[col])
    return demand_df
Code example #21
def us_pct_change_confirmed():
    df = df_us_confirmed.melt(id_vars=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key'
    ],
                              var_name='Date',
                              value_name='Confirmed')
    df['Date'] = pd.to_datetime(df['Date'])

    df_state = df.groupby(by=['Province_State', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[(df_state['Confirmed'] >
                         US_STATES_CONFIRMED_THRESHOLD)]
    df_state.drop(columns=['UID', 'code3', 'FIPS', 'Lat', 'Long_'],
                  inplace=True)
    df_state = pd.pivot(df_state,
                        index='Date',
                        columns='Province_State',
                        values='Confirmed')
    df_pct_change = df_state.pct_change()
    ax = df_pct_change.plot(
        title='US States day-to-day percent change in confirmed cases',
        grid=True,
        lw=2,
        colormap='jet',
        markersize=10,
        x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Percent Change')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
Code example #22
    def _fit(self, X):
        _, answers_ratios = AnswersEncoder(
            smoothing_min=10, smoothing_value=2).get_answers_ratios(X)
        answers_ratios = pd.pivot(answers_ratios,
                                  index='content_id',
                                  columns='user_answer',
                                  values='answer_ratio').fillna(0)

        X = pd.merge(X,
                     answers_ratios,
                     left_on='content_id',
                     right_index=True,
                     how='left')
        results, users, contexts = compute_user_answers_ratio(X, self.decay)
        X = X.drop(columns=[0, 1, 2, 3])

        # Save context
        for user, context in zip(users, contexts):
            self.context[user] = list(map(float, context))

        # Transform ratios to dict
        answers_ratios = answers_ratios.to_dict(orient='index')
        self.answers_ratios = {
            content_id: list(ratios.values())
            for content_id, ratios in answers_ratios.items()
        }

        return X, results
Code example #23
def convert_distmat(data):
    '''
    reads a distance matrix in "long" format and converts it to wide format
    input format (id1, id2, dist):
        a   b   d
        a   c   d
        b   c   d
    '''
    mat = pd.read_table(data, sep=r"\s+", names=['id1', 'id2', 'dist'])
    mat = pd.pivot(index=mat['id1'],
                   columns=mat['id2'],
                   values=mat['dist'])

    mat.update(mat.transpose())
    if mat.index.all() != mat.columns.all():
        i = list(set(mat.index).difference(set(mat.columns)))[0]
        j = list(set(mat.columns).difference(set(mat.index)))[0]
        print(j)
        mat.loc[:,i] = mat.loc[i,:]
        mat.loc[j,:] = mat.loc[:,j]
    mat = mat.fillna(0.0)
    mat = mat.sort_index(axis=0)
    mat = mat.sort_index(axis=1)
    mat2 = mat.values
    ids = list(mat.index)
    return mat2, ids
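For comparison, a condensed modern sketch of the same long-to-wide symmetrization, using the keyword form of pd.pivot (assumes the same id1/id2/dist column layout as above):

import pandas as pd

def long_to_square(mat: pd.DataFrame) -> pd.DataFrame:
    # Pivot to wide form, make the matrix square, mirror across the
    # diagonal, and fill the remaining gaps (including the diagonal) with 0
    wide = pd.pivot(mat, index='id1', columns='id2', values='dist')
    ids = wide.index.union(wide.columns)
    wide = wide.reindex(index=ids, columns=ids)
    return wide.fillna(wide.T).fillna(0.0).sort_index(axis=0).sort_index(axis=1)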
Code example #24
def describeVariance(df, time='X0', od='Y'):
    '''
    df columns ['X0','X1',...,'Y']
    values of Xs except for X0 should be non-unique
    '''

    window = getValue('variance_smoothing_window')

    df = df.sort_values('Time')
    df.reset_index(drop=True, inplace=True)

    nX = len(df[time].drop_duplicates())
    nS = int(df.shape[0] / nX)

    sid = pd.DataFrame(np.ravel([np.arange(nS)] * nX), columns=['SID'])
    df = df.join(sid)

    tmp = pd.pivot(df, index=time, columns='SID', values=od)
    if window < 1: window = int(np.ceil(nX * window))

    var = np.var(tmp.values, 1)
    var = filters.gaussian_filter1d(var, window)

    df = df.sort_values(['SID', 'Time'])
    df.loc[:, 'error'] = np.ravel([var] * nS)

    return df
Code example #25
File: heatmap.py Project: firasmidani/amiga
def pivot(df,args,metric=None):

	if metric is None:

		return df

	else: 

		df = pd.pivot(data=df,columns=args.x_variable,index=args.y_variable,values=metric)

		rows_todrop = np.where(df.isna().any(axis=1))[0]
		rows_todrop = df.index.values[rows_todrop]

		cols_todrop = np.where(df.isna().any())[0]
		cols_todrop = df.keys().values[cols_todrop]

		if len(rows_todrop) > 0 or len(cols_todrop) > 0:
			msg = 'User Warning: The heatmap data is missing values. '
			msg += 'Please check the data for the following:\n\n'
			msg += 'Columns:\t'
			msg += ', '.join(cols_todrop) + '\n'
			msg += '\n'
			msg += 'Rows:\t'
			msg += ', '.join(rows_todrop) + '\n'
			msg += '\nThese variables will be dropped and not plotted unless you requested that '
			msg += 'they be kept with --keep-rows-missing-data or --keep-columns-missing-data.\n\n'
			smartPrint(msg,args.verbose)

		if not args.keep_rows_missing_data:
			df = df.drop(labels=rows_todrop,axis=0)

		if not args.keep_columns_missing_data:
			df = df.drop(labels=cols_todrop,axis=1)

	return df
Code example #26
File: links_object.py Project: csbramlett/CellOracle
def _link2mat(link, value="coef_abs", fillna=0):
    mat = pd.pivot(data=link,
                   values=[value],
                   index="target", columns="source")
    mat = mat.fillna(fillna)

    return mat
Code example #27
def plot_heatmap(val_to_replace, percentage_decline, path_dict):
    
    # surface plot
    x_grid, y_grid = np.meshgrid(val_to_replace['PrEPDuration'], val_to_replace['PrEPCoverage'])
    z_grid = np.array(percentage_decline).reshape(x_grid.shape)
    x = np.ravel(x_grid)
    y = np.ravel(y_grid)
    z = np.array(percentage_decline)
    sb_heatmap = pd.DataFrame()
    sb_heatmap['Time to max. uptake (months)'] = np.floor(x).astype(int)
    sb_heatmap['PrEP uptake (%)'] = np.floor(y * 100).astype(int)
    sb_heatmap['Percentage declination in incidence'] = z
    sb_heatmap = sb_heatmap.sort_values(by = 'Time to max. uptake (months)')
    plot_df = pd.pivot(data = sb_heatmap,
                       index = 'Time to max. uptake (months)',
                       columns = 'PrEP uptake (%)',
                       values = 'Percentage declination in incidence')
    
    
    # choose color theme
    #cmap = cm.get_cmap('RdYlGn')
    cmap = 'PuBu' #'Reds'
    #my_col_map = ["#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c"] # high point is dark blue
    #my_col_map_r = ["#08519c", "#3182bd", "#6baed6", "#bdd7e7", "#eff3ff"] # high point is white
    cmap = 'PuBu_r'
    #cmap = sb.color_palette(cmap)
    
    plt.figure(figsize=(10, 5))
    sb.set(font_scale=1.2)
    heatmap_plot = sb.heatmap(plot_df, annot = True, fmt = '0.1f', linewidths = 0.2, cmap = cmap, cbar_kws={'label': 'Percentage reduction in incidence\n due to only community benefit'})
    heatmap_plot.figure.axes[0].invert_yaxis()
    # if we need to rotate the axis ticks
    if False:
        heatmap_plot.set_yticklabels(heatmap_plot.get_yticklabels(), rotation = 45)
    heatmap_plot.figure.savefig(os.path.join(os.path.join(path_dict['output']['intervention'], ".."), 'Percentage declination in incidence.jpg'))
Code example #28
def sentiment_word_graph(docs_df: pd.DataFrame,
                         output_path: str,
                         doc_col: str = "doc") -> None:
    """
    This will create a word graph based on the counts per word in the docs

    :param docs_df: The data frame with the documents
    :param output_path: The output path
    :param doc_col: The column with our documents
    :return:
    """

    sentiment_counts: pd.DataFrame = \
        docs_df.groupby("label").apply(lambda x: pd.DataFrame.from_dict(
            data=x[doc_col].apply(count_freq_across_documents),
            columns=["n"],
            orient="index"
        ).reset_index().rename(columns={"index": "word"})).reset_index()

    sentiment_counts['log_counts'] = np.log(sentiment_counts['n'])

    pivoted_sentiments = pd.pivot(sentiment_counts,
                                  index=["word"],
                                  values="log_counts",
                                  columns=["label"]).reset_index()

    fig = px.scatter(data_frame=pivoted_sentiments,
                     x='positive',
                     y='negative',
                     text='word')

    fig.write_html(f"{output_path}/word_graph.html")
Code example #29
    def pivot(df: pd.DataFrame) -> pd.DataFrame:
        """Converts dataframe into unrolled form with each column
        corresponding a single feature entry.

        :param df: pd.DataFrame, raw data.
        Must have columns 'id_job' and 'features'. Column 'features' contains
        comma separated list of values. First value is a feature name and
        others are entries of this feature vector.
        :return: pd.DataFrame, data in unrolled form.
        """
        df = df.copy()
        df['features'] = df['features'].str.split(',')
        df['name'] = df['features'].apply(lambda x: x[0])
        df['features'] = df['features'].apply(lambda x: x[1:])

        length = df['features'].apply(len)
        if length.min() != length.max():
            raise ValueError(f'Feature vectors have different lengths '
                             f'(from {length.min()} to {length.max()}).')

        exploded = df.explode('features')
        indices = np.tile(np.arange(length.min()), df.shape[0]).astype(str)
        exploded['name'] = 'feature_' + exploded['name'] + '_' + indices

        return pd.pivot(data=exploded,
                        index='id_job',
                        columns='name',
                        values='features').astype('float64')
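A sketch of the input this method expects, with invented values (two jobs, one feature vector named '2' with two entries each):

import pandas as pd

raw = pd.DataFrame({
    'id_job': [1, 2],
    'features': ['2,10,20', '2,11,21'],
})
# pivot(raw) returns float columns feature_2_0 and feature_2_1, indexed by id_job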
Code example #30
def date_of_test():
    try:
        dot = pd.read_excel(
            'http://ldh.la.gov/assets/oph/Coronavirus/data/LA_COVID_TESTBYDAY_PARISH_PUBLICUSE.xlsx'
        )
        dot['Lab Collection Date'] = dot['Lab Collection Date'].apply(
            lambda x: x.strftime('%m/%d/%Y'))

        categories = [
            'Daily Test Count',
            'Daily Case Count',
            'Daily Negative Test Count',
            'Daily Positive Test Count',
        ]

        df = pd.DataFrame()

        for c in categories:
            cdf = pd.pivot(dot,
                           index='Parish',
                           columns='Lab Collection Date',
                           values=c)
            cdf.insert(0, 'Category', '')
            cdf['Category'] = c
            df = pd.concat([df, cdf])
        df.sort_values(by=['Parish', 'Category']).to_csv(
            f'{module_path}/data/cases_tests_dot.csv')
        logger.info('COMPLETE: Date of Test')
    except Exception as e:
        logger.error('Failed to download date of test data')
        logger.exception('Function date_of_test failed with exception')
        logger.error(str(e))
        sys.exit(1)
Code example #31
File: app.py Project: aeturrell/uk-economy-app
def plot_beveridge_curve():
    indices_dicts_lms = {
        "Vacancies": "AP2Y",
        "Unemployment": "MGSX",
        "Active": "LF2K"
    }
    df = pd.DataFrame()
    for key, value in indices_dicts_lms.items():
        xf, x_text = ons_qna_data("LMS", value)
        xf["Name"] = key
        df = pd.concat([df, xf], axis=0)
    df["value"] = df["value"].astype(np.double)
    df = pd.pivot(df, index="date", columns="Name")
    df.columns = df.columns.droplevel()
    df = df.dropna()
    df["Date"] = df.index
    df["Vacancies"] = 100 * df["Vacancies"].divide(df["Active"])
    max_u = df["Unemployment"].argmax()
    # Need to divide vacs by labour force size
    # Need to label most extremal u value
    fig, ax = plt.subplots()
    quivx = -df["Unemployment"].diff(-1)
    quivy = -df["Vacancies"].diff(-1)
    # This connects the points
    ax.quiver(
        df["Unemployment"],
        df["Vacancies"],
        quivx,
        quivy,
        scale_units="xy",
        angles="xy",
        scale=1,
        width=0.006,
        alpha=0.3,
    )
    ax.scatter(
        df["Unemployment"],
        df["Vacancies"],
        marker="o",
        s=35,
        edgecolor="black",
        linewidth=0.2,
        alpha=0.9,
    )
    for j in [0, max_u, -1]:
        ax.annotate(
            f'{df["Date"].iloc[j].year} Q{df["Date"].iloc[j].quarter}',
            xy=(df[["Unemployment", "Vacancies"]].iloc[j].tolist()),
            xycoords="data",
            xytext=(20, 20),
            textcoords="offset points",
            arrowprops=dict(arrowstyle="->",
                            connectionstyle="angle3,angleA=0,angleB=-90"),
        )
    ax.set_xlabel("Unemployment rate, %")
    ax.set_ylabel("Vacancy rate, %")
    ax.grid(which="major", axis="both", lw=0.2)
    plt.tight_layout()
    st.pyplot(fig)
Code example #32
File: test_panel.py Project: greeness/pandas
    def test_pivot(self):
        from pandas.core.reshape import _slow_pivot

        one, two, three = (np.array([1, 2, 3, 4, 5]), np.array(["a", "b", "c", "d", "e"]), np.array([1, 2, 3, 5, 4.0]))
        df = pivot(one, two, three)
        self.assertEqual(df["a"][1], 1)
        self.assertEqual(df["b"][2], 2)
        self.assertEqual(df["c"][3], 3)
        self.assertEqual(df["d"][4], 5)
        self.assertEqual(df["e"][5], 4)
        assert_frame_equal(df, _slow_pivot(one, two, three))

        # weird overlap, TODO: test?
        a, b, c = (np.array([1, 2, 3, 4, 4]), np.array(["a", "a", "a", "a", "a"]), np.array([1.0, 2.0, 3.0, 4.0, 5.0]))
        self.assertRaises(Exception, pivot, a, b, c)

        # corner case, empty
        df = pivot(np.array([]), np.array([]), np.array([]))
Code example #33
def make_CF_table(aData, needed_param):
    ''' build the user-item table for collaborative filtering and
        insert the rating values into the table
    '''
    user_id = needed_param['user_id']
    product_id = needed_param['product_id']
    ratings = needed_param['ratings']
    
    table_CF = pd.pivot(aData, index = product_id, columns = user_id, values = ratings)
    return table_CF
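A hypothetical usage sketch (the column names and needed_param keys below are invented for illustration):

import pandas as pd

ratings_long = pd.DataFrame({
    'user': [1, 1, 2],
    'product': ['a', 'b', 'a'],
    'rating': [5, 3, 4],
})
needed_param = {'user_id': 'user', 'product_id': 'product', 'ratings': 'rating'}
# Products as rows, users as columns, ratings as cell values
table_CF = make_CF_table(ratings_long, needed_param)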
Code example #34
File: test_panel.py Project: fperez/pandas
    def test_pivot(self):
        from pandas.core.reshape import _slow_pivot

        one, two, three = (np.array([1, 2, 3, 4, 5]),
                           np.array(['a', 'b', 'c', 'd', 'e']),
                           np.array([1, 2, 3, 5, 4.]))
        df = pivot(one, two, three)
        self.assertEqual(df['a'][1], 1)
        self.assertEqual(df['b'][2], 2)
        self.assertEqual(df['c'][3], 3)
        self.assertEqual(df['d'][4], 5)
        self.assertEqual(df['e'][5], 4)
        assert_frame_equal(df, _slow_pivot(one, two, three))

        # weird overlap, TODO: test?
        a, b, c = (np.array([1, 2, 3, 4, 4]),
                   np.array(['a', 'a', 'a', 'a', 'a']),
                   np.array([1., 2., 3., 4., 5.]))
        self.assertRaises(Exception, pivot, a, b, c)

        # corner case, empty
        df = pivot(np.array([]), np.array([]), np.array([]))
Code example #35
def train_reinforcement_learning_strategy(num_sims, model_class):
    # Initialize model
    model = Model({'class': model_class})
    banditAlgorithm = BanditAlgorithm(params=0.2)
    model.initialize()
    model.all_possible_decisions = ['hit', 'stay']

    for _ in range(num_sims):
        model.buffer += 1

        # Initialize game
        blackjack = BlackJack()
        blackjack.initiate_game()
        if blackjack.game_status != 'in process':
            continue

        all_observed_decision_states, reward = blackjack.complete_one_episode(banditAlgorithm, model)
        model = learn_Q_function(all_observed_decision_states, reward, model)

    return banditAlgorithm.policy, model


if __name__ == "__main__":
    #policy, model = train_reinforcement_learning_strategy(num_sims=500000, model_class='lookup_table')
    policy, model = train_reinforcement_learning_strategy(num_sims=50000, model_class='scikit')
    policy_df = pd.DataFrame(policy).T  # renamed to avoid shadowing the pandas alias
    policy_df.columns = ['player_value', 'dealer_value', 'decision', 'score']
    pt = policy_df.pivot(index='player_value', columns='dealer_value')['decision']
    print(pt)
    pt1 = policy_df.pivot(index='player_value', columns='dealer_value')['score']
    print(pt1)