def calculate_distances(self, similarity_type):
    # 1. Determine the type of similarity that needs to be calculated.
    if similarity_type == "m":
        matrix = self.ratings_matrix
        labels = self.ratings_matrix.columns
    else:
        matrix = self.ratings_matrix.transpose()
        labels = self.ratings_matrix.index
    # pd.pivot no longer accepts three raw arrays; build the square frame of ones directly.
    similarity_matrix = pd.DataFrame(np.ones((len(labels), len(labels))),
                                     index=labels, columns=labels)
    # 2. Calculate the cosine distance between every pair of movies/users,
    #    depending on the chosen type.
    for x in matrix:
        for y in matrix:
            if x != y:
                similarity_matrix.loc[y, x] = cosine(matrix[x], matrix[y])
    # 3. Return the similarity matrix.
    return similarity_matrix
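# A vectorized alternative to the pairwise loop above: a minimal sketch using
# scikit-learn's cosine_similarity (an assumed dependency; the original only
# imports scipy's `cosine`, which returns a distance, i.e. 1 - similarity).
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_matrix(ratings_matrix: pd.DataFrame) -> pd.DataFrame:
    # cosine_similarity compares rows of its input pairwise, so transpose
    # to compare the DataFrame's columns (movies or users).
    sims = cosine_similarity(ratings_matrix.T.values)
    return pd.DataFrame(sims, index=ratings_matrix.columns,
                        columns=ratings_matrix.columns)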
def submission():
    # Train the model
    model = train(seed=args.seed,
                  learning_rate=args.learning_rate,
                  max_depth=args.max_depth,
                  num_leaves=args.num_leaves,
                  min_child_weight=args.min_child_weight,
                  reg_alpha=args.reg_alpha,
                  subsample=args.subsample,
                  num_boost_round=args.num_boost_round,
                  early_stopping_rounds=args.early_stopping_rounds,
                  data_folder=args.data_folder)
    # Inference
    df_val = pd.read_csv(args.data_folder + '/val.csv')
    df_eval = pd.read_csv(args.data_folder + '/eval.csv')
    drop_list = ['id', 'day', 'demand']
    val_submission = df_val[drop_list]
    eval_submission = df_eval[drop_list]
    x_val = df_val.drop(drop_list, axis=1)
    x_eval = df_eval.drop(drop_list, axis=1)
    # Output predictions
    v_pred = model.predict(x_val)
    e_pred = model.predict(x_eval)
    # Store the predictions as Series
    val_submission['demand'] = pd.Series(v_pred)
    eval_submission['demand'] = pd.Series(e_pred)
    # Un-melt (inverse of the melt applied during preprocessing)
    val_submission = pd.pivot(val_submission, index='id', columns='day',
                              values='demand').reset_index()
    eval_submission = pd.pivot(eval_submission, index='id', columns='day',
                               values='demand').reset_index()
    # Rename columns to match the submission file format
    val_submission.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    eval_submission.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    try:
        sub = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
    except FileNotFoundError:
        sub = pd.read_csv(args.data_folder + '/sample_submission.csv')
    v_submission = pd.merge(sub.iloc[:, :1], val_submission, on='id')
    e_submission = pd.merge(sub.iloc[:, :1], eval_submission, on='id')
    submission = pd.concat([v_submission, e_submission], axis=0)
    submission.to_csv(args.output_folder + '/submission.csv', index=False)
def main():
    filename = "./input/npload.csv"
    # res = np.loadtxt(filename, dtype=str, delimiter="\t")
    # print(type(res))
    # write_csv(res, "./input/npload.csv")
    df = pd.read_csv(filename, header=None, index_col=0, usecols=(1, 2, 3),
                     skiprows=0)
    print(df.head(5))
    # pd.read_excel() and pd.pivot() require arguments; the original bare calls
    # would raise TypeError, so they are left commented out:
    # pd.read_excel()
    # pd.pivot()
def events_calendar():
    calendar_df = pd.read_csv(
        CALENDAR,
        usecols=["event_name_1", "event_name_2", "event_type_1", "event_type_2"],
    )
    ev1_df = pd.pivot(calendar_df, columns="event_name_1", values="event_type_1")
    ev2_df = pd.pivot(calendar_df, columns="event_name_2", values="event_type_2")
    ev1_df = ev1_df.fillna(ev2_df)
    bool_events_df = ev1_df.notna()
    return bool_events_df.values, ev1_df.values
def update_output(list_of_contents, list_of_names, list_of_dates):
    '''
    param: list_of_contents - all of the files uploaded
    param: list_of_names - names of the uploaded files
    param: list_of_dates - the dates on which the files were last modified
    return: irrelevant output; it is never printed and exists only to comply
            with needing an Output for every callback
    '''
    if not os.path.exists(config['outputs']):
        os.makedirs(config['outputs'])
    if list_of_contents is not None:
        number_sheets = utils_app.parse_contents(list_of_contents[0],
                                                 list_of_names[0],
                                                 list_of_dates[0])
        if number_sheets == 4:
            partner_solver_weights = pd.read_excel(
                config['outputs'] + config['partner-solver-inital-weights'],
                sheet_name='Partner Solver Weights')
            geo_weights_pivot = pd.pivot(
                partner_solver_weights[['Org_y', 'Org_x', 'geo_weights']],
                columns='Org_x', index='Org_y')
            needs_weights_pivot = pd.pivot(
                partner_solver_weights[['Org_y', 'Org_x', 'needs_weights']],
                columns='Org_x', index='Org_y')
            challenge_weights_pivot = pd.pivot(
                partner_solver_weights[['Org_y', 'Org_x', 'challenge_weights']],
                columns='Org_x', index='Org_y')
            stage_weights_pivot = pd.pivot(
                partner_solver_weights[['Org_y', 'Org_x', 'stage_weights']],
                columns='Org_x', index='Org_y')
            tech_weights_pivot = pd.pivot(
                partner_solver_weights[['Org_y', 'Org_x', 'tech_weights']],
                columns='Org_x', index='Org_y')
            # List_of_uploaded_files is fully available here
            new_total_score = create_total_score_excel(config['outputs'],
                                                       geo_weights_pivot,
                                                       needs_weights_pivot,
                                                       challenge_weights_pivot,
                                                       stage_weights_pivot,
                                                       tech_weights_pivot)
            # new_total_score.insert(0, "Partners", Partners, True)
            children = "Generated outputs"
            solver_df = pd.read_csv(config['solver_location'])
            partners_df = pd.read_csv(config['partner_location'])
            solver_options = solver_df['Org'].to_frame(name='Solvers')
            solver_options['matches'] = ['None'] * solver_options.shape[0]
            solver_options.to_excel(config['solver_options'],
                                    sheet_name='Solver Options', index=False)
            with pd.ExcelWriter(config['output_weights'], mode='w') as writer:
                solver_df.to_excel(writer, sheet_name='Solver Team Data',
                                   index=False)
                partners_df.to_excel(writer, sheet_name='Partner Data',
                                     index=False)
                partner_solver_weights.to_excel(
                    writer, sheet_name='Partner Solver Weights', index=False)
        else:
            children = ("Input file must be an Excel file with three sheets: "
                        "Solver Team Data, Partner Data, Initial Weights")
    else:
        children = ("Input file must be an Excel file with three sheets: "
                    "Solver Team Data, Partner Data, Initial Weights")
    return children
def make_pivot(feature: str, index: str, column: str, data: pd.DataFrame,
               groupby_args: list = None):
    """Create two types of pivot matrices: count and mean

    Args:
        feature (str): Feature used as the value for the pivot tables. Needs to be numeric.
        index (str): Name of the rows of the pivot table
        column (str): Name of the columns of the pivot table
        data (pd.DataFrame): Data frame containing the data
        groupby_args (list, optional): Arguments passed to groupby. Defaults to None.

    Returns:
        (pd.DataFrame, pd.DataFrame): Count and mean pivot tables
    """
    groupby_args = groupby_args or [index, column]
    grouped = data.groupby(groupby_args)[feature].count().to_frame(
        name=f"count_{feature}")
    try:
        grouped[f"mean_{feature}"] = data.groupby(groupby_args)[feature].mean()
    except ValueError:
        if not np.issubdtype(data[feature].dtype, np.number):
            msg = (f"Expected feature {feature} to be of numeric data type. "
                   f"Got {data[feature].dtype}.")
            raise ValueError(msg)
        raise
    grouped.reset_index(inplace=True)
    grouped.sort_values(by=[index, column], inplace=True, ascending=False)
    pivot_count = pd.pivot(grouped, index=index, columns=column,
                           values=f"count_{feature}")
    pivot_mean = pd.pivot(grouped, index=index, columns=column,
                          values=f"mean_{feature}")
    pivot_count.sort_index(inplace=True, ascending=False)
    pivot_mean.sort_index(inplace=True, ascending=False)
    return pivot_count, pivot_mean
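# Minimal usage sketch for make_pivot; the toy frame below is illustrative and
# not from the original source.
import pandas as pd

toy = pd.DataFrame({
    "city": ["A", "A", "B", "B", "B"],
    "year": [2020, 2021, 2020, 2020, 2021],
    "price": [10.0, 12.0, 8.0, 9.0, 11.0],
})
counts, means = make_pivot("price", index="city", column="year", data=toy)
# counts holds the number of observations per (city, year) cell,
# means the average price per cell.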
def evaluate(self, postprocess=False):
    assert len(self.models) != 0, 'Model is not trained...'
    print('Evaluate...')
    pred_val = np.zeros(len(self.val_id))
    pred_eval = np.zeros(len(self.eval_id))
    for model in self.models:
        pred_val += model.predict(
            self.vals, num_iteration=model.best_iteration) / len(self.models)
        pred_eval += model.predict(
            self.evals, num_iteration=model.best_iteration) / len(self.models)
    res_val = pd.DataFrame({
        'id': self.val_id,
        'date': self.val_date,
        'demand': pred_val
    })
    res_val = pd.pivot(res_val, index='id', columns='date',
                       values='demand').reset_index()
    res_eval = pd.DataFrame({
        'id': self.eval_id,
        'date': self.eval_date,
        'demand': pred_eval
    })
    res_eval = pd.pivot(res_eval, index='id', columns='date',
                        values='demand').reset_index()
    F_list = [f'F{i + 1}' for i in range(28)]
    res_val.columns = ['id'] + F_list
    res_eval.columns = ['id'] + F_list
    res = pd.concat([res_val, res_eval], axis=0)
    if postprocess:
        alphas = [1.035, 1.03, 1.025]
        weights = [1 / len(alphas)] * len(alphas)
        _res = res.copy()
        for f in F_list:
            _res[f] = 0
            for alpha, weight in zip(alphas, weights):
                _res[f] += alpha * weight * res[f]
        return _res
    else:
        return res
def plot_heatmap(comp, test):
    test = test.drop_duplicates()
    # pivot the results into a table
    table = pd.pivot(test.loc[test.reject_null == True].sort_values(by="mrca_2"),
                     index="tf", columns="mrca_2", values='log2')
    table = table.dropna(thresh=2)  # keep rows with at least 2 non-NA values
    table = table.replace(-np.Inf, -2)
    table = table.replace(np.Inf, 2)
    if len(table) < 25:
        figsize = (5, 10)
    else:
        figsize = (5, 30)
    # plot
    sns.set("notebook")
    cm = sns.clustermap(table.fillna(0), mask=(table == 0), cmap="RdBu_r",
                        center=0, col_cluster=False, figsize=figsize)
    cm.fig.suptitle(comp)
    outf = f"{RE}{val}_{comp}_clustermap_core_mrca2.pdf"
    plt.savefig(outf, bbox_inches="tight", dpi=300)
def _fit(self, X, y=None):
    total_answers, answers = self.get_answers_ratios(X)
    answers = answers.drop(columns=['is_correct'])
    # Merge with X
    X = pd.merge(X, answers, on=['content_id', 'user_answer'], how='left')
    tasks = X.groupby(['user_id', 'task_container_id'],
                      sort=False)['answer_ratio'].mean().reset_index()
    # Get context
    self.priors = total_answers.to_dict(orient='index')
    self.answers = pd.pivot(answers, index='content_id', columns='user_answer',
                            values='answer_ratio').to_dict(orient='index')
    self.features = ['answer_ratio']
    self.context = tasks[['user_id'] + self.features].drop_duplicates(
        'user_id', keep='last').set_index('user_id').to_dict(orient='index')
    tasks['prior_answer_ratio'] = tasks.groupby('user_id')['answer_ratio'].shift()
    tasks = tasks.drop(columns=['answer_ratio'])
    X = pd.merge(X, tasks, on=['user_id', 'task_container_id'], how='left')
    X = X.drop(columns=['answer_ratio'])
    X['prior_answer_ratio'] = X['prior_answer_ratio'].astype(FLOAT_DTYPE)
    return X
def main(file, trainsize):
    data = pd.read_json(file)
    assert (data.loc[:, "Training Size"] == trainsize).any()
    # Drop all other training sizes
    data = data.loc[data["Training Size"] == trainsize, :]
    assert len(data) > 0
    data["f1_loose_l1"] = score(f1_score, data["c_Normal"], data["ic_Normal"],
                                data["c_loose_l1"], data["ic_loose_l1"])
    algorithms = data["Algorithm"].unique()
    p_values = pd.DataFrame(index=algorithms, columns=algorithms)
    for alg1 in algorithms:
        for alg2 in algorithms:
            if alg1 == alg2:
                continue
            selection = data.loc[data["Algorithm"].isin([alg1, alg2]), :]
            pivoted = pd.pivot(selection, values="f1_loose_l1",
                               columns="Algorithm", index=["Time of Start"])
            p_values.at[alg1, alg2] = ttest_rel(pivoted[alg1], pivoted[alg2])[1]
    print(p_values)
def spread_dataset(type, data, sel_year):
    index_cols = list(data.columns)
    index_cols.remove('type')
    index_cols.remove('value')
    data = pd.pivot(data, index=index_cols, columns='type',
                    values='value').reset_index().fillna(0)
    if type == 'sm':
        data['Avg Hourly Wages'] = data.groupby(
            ['state', 'year'])['Avg Hourly Wages'].transform('max')
        # Calculate percentage change
        data['wage_inc'] = (
            data.groupby(['state', 'cbsa_area'])['Avg Hourly Wages'].transform('last') /
            data.groupby(['state', 'cbsa_area'])['Avg Hourly Wages'].transform('first')
        ).apply(lambda x: round(100 * x, 2))
        data = data.loc[data['cbsa_area'] != 'Statewide', ]
    elif type == 'cpi':
        # Calculate percentage change
        data[cpi_columns] = (
            data.groupby(['area'])[cpi_columns].transform('last') /
            data.groupby(['area'])[cpi_columns].transform('first')
        ).apply(lambda x: round(100 * x, 2))
        data.drop(data.loc[data['year'] != sel_year, ].index, inplace=True)
    return data
def preprocess(df):
    """
    step 1. Remove duplicates
    step 2. Pivot
    step 3. Format
    step 4. Sort metrics, sort dates
    step 5. Check for null values
    step 6. Outlier analysis and missing-value handling
    # :param inputFile:
    # :param outputFile:
    :return:
    """
    # df = pd.read_excel(inputFile, engine='openpyxl')
    df = df.drop_duplicates(subset=['proc_time', 'collect_name'])  # remove duplicates
    pivoted = pd.pivot(df, values='collect_value', columns='collect_name',
                       index='proc_time')  # pivot
    pivoted.index = pd.to_datetime(pivoted.index)
    pivoted.sort_index(inplace=True)
    pivoted = pivoted[sortTable['name']]  # extract the 255 metrics
    if not (pivoted.count() == pivoted.shape[0]).all():
        raise Exception("Null values present!!")
    pivoted = wash_data(pivoted)  # remove outliers via the boxplot rule
    # pivoted.to_excel(outputFile)
    return pivoted
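# `wash_data` is not shown above; a minimal sketch, assuming "boxplot outlier
# removal" means masking values outside the 1.5 * IQR whiskers per column and
# then interpolating the gaps (the interpolation step is an assumption):
import pandas as pd

def wash_data(pivoted: pd.DataFrame) -> pd.DataFrame:
    q1 = pivoted.quantile(0.25)
    q3 = pivoted.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Mask outliers to NaN, then fill them by time-based interpolation
    # (the index is a DatetimeIndex at this point).
    masked = pivoted.where((pivoted >= lower) & (pivoted <= upper))
    return masked.interpolate(method='time')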
def global_plot_confirmed():
    df = df_global_confirmed.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        var_name='Date', value_name='Confirmed')
    df['Date'] = pd.to_datetime(df['Date'])
    df_state = df.groupby(by=['Country/Region', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[df_state['Confirmed'] > GLOBAL_CONFIRMED_THRESHOLD]
    df_state.drop(columns=['Lat', 'Long'], inplace=True)
    df_state = pd.pivot(df_state, index='Date', columns='Country/Region',
                        values='Confirmed')
    ax = df_state.plot(title='Global confirmed cases time series', grid=True,
                       lw=2, colormap='jet', markersize=10, x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Confirmed')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)  # matplotlib expects a numeric linewidth
def us_plot_deaths():
    df = df_us_deaths.melt(id_vars=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population'
    ], var_name='Date', value_name='Deaths')
    df['Date'] = pd.to_datetime(df['Date'])
    df_state = df.groupby(by=['Province_State', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[df_state['Deaths'] > US_STATES_DEATHS_THRESHOLD]
    df_state.drop(
        columns=['UID', 'code3', 'FIPS', 'Lat', 'Long_', 'Population'],
        inplace=True)
    df_state = pd.pivot(df_state, index='Date', columns='Province_State',
                        values='Deaths')
    ax = df_state.plot(title='US States deaths time series', grid=True, lw=2,
                       colormap='jet', markersize=10, x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Deaths')  # was mislabeled 'Confirmed'
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
def global_pct_change_deaths():
    df = df_global_deaths.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        var_name='Date', value_name='Deaths')
    df['Date'] = pd.to_datetime(df['Date'])
    df_state = df.groupby(by=['Country/Region', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[df_state['Deaths'] > GLOBAL_DEATHS_THRESHOLD]
    df_state.drop(columns=['Lat', 'Long'], inplace=True)
    df_state = pd.pivot(df_state, index='Date', columns='Country/Region',
                        values='Deaths')
    df_pct_change = df_state.pct_change()
    ax = df_pct_change.plot(title='Global day-to-day percent change in deaths',
                            grid=True, lw=2, colormap='jet', markersize=10,
                            x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Percent Change')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
def make_submission(self):
    # TODO: This was at the end of train
    print("Predicting with model...")
    val_pred = self.model.predict(self.x_val[features])
    val_score = np.sqrt(metrics.mean_squared_error(val_pred, self.y_val))
    print(f'Our val rmse score is {val_score}')
    y_pred = self.model.predict(self.test[features])
    self.test['demand'] = y_pred  # return test
    print("Preparing submission...")
    submission = pd.read_csv('Data/sample_submission.csv')
    predictions = self.test[['id', 'date', 'demand']]
    predictions = pd.pivot(predictions, index='id', columns='date',
                           values='demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    evaluation_rows = [row for row in submission['id'] if 'evaluation' in row]
    evaluation = submission[submission['id'].isin(evaluation_rows)]
    validation = submission[['id']].merge(predictions, on='id')
    final = pd.concat([validation, evaluation])
    final.to_csv('submission.csv', index=False)
    return final
def plot_per_dataset_delta_overlap(res):
    groupby_df = res.groupby(["desc", "arch"])["count_overlap"].median().reset_index()
    table = pd.pivot(groupby_df, index="desc", columns="arch",
                     values="count_overlap")
    print(table.head())
    table[["simple", "complex"]] = table[["simple", "complex"]].astype(int)
    fig, ax = plt.subplots(figsize=(6, 6))
    x, y = "simple", "complex"
    data = table
    g = sns.pointplot(x=x, y=y, data=data, join=False)
    ax.legend(bbox_to_anchor=(1, 1)).remove()
    ax.yaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.set(xlim=(-1, 14), ylim=(-1, 14), xlabel="median simple",
           ylabel="median complex", title="simple v. complex overlap\nper tissue")
    sns.lineplot(x=[0, 14], y=[0, 14], ls="--")  # y = x reference line
def calc_activity_consistency_multi_process(self):
    gc.collect()
    print(time.ctime(), 'Calculate activity and consistency...')
    df = pd.DataFrame()
    pool = mp.Pool()  # Use one process per CPU.
    results = [pool.apply_async(self.process_samples, args=(x, ))
               for x in self.chunker_columns(20)]
    for p in results:
        df = pd.concat([df, p.get()[0]])  # p.get(timeout=100)
        print('.', end="")
        sys.stdout.flush()
    # process_samples(self.udp)[0]
    df.drop(['molRole'], inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    # df['Activity'] = df.Activity  # Consistency
    df = pd.pivot(df, columns='sampleID', index='path_name', values='Activity')
    pool.close()
    df.to_csv('./data/output_activity.csv')
    self.activity = df
    print(time.ctime(), "Done.")
def count_top_scores(summary_df, complete_only=False, min_diff=None):
    summary_df = summary_df.sort_values("mean", ascending=False)
    # only consider queries where all systems completed...
    if complete_only:
        summary_df = filter_incomplete(summary_df)
    # top score by folder/dataset
    top_df = summary_df.groupby(["folder_num", "dataset"]).head(1)
    if min_diff is not None:
        diffs = summary_df.groupby(["folder_num", "dataset"])["mean"].apply(
            lambda x: x.values[0] - x.values[1])
        diffs = diffs >= min_diff
        diffs = diffs.to_frame(name="sat_min_diff").reset_index()
        top_df_ext = pd.merge(top_df, diffs, how="left",
                              on=["folder_num", "dataset"])
        top_df = top_df_ext[top_df_ext["sat_min_diff"]].reset_index(drop=True)
    # count of system entries for that folder (i.e., the number of datasets where it wins)
    top_df = top_df.groupby(["folder_num", "name"]).size().to_frame(name="ct")
    top_df = top_df.reset_index()
    top_df = pd.pivot(top_df, index="folder_num", columns="name", values="ct")
    top_df = top_df.fillna(0)
    return top_df
def melt_to_pivot(melt_df, target_col, encoder_dict):
    index_columns = [
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
    ]
    index_df = melt_df[index_columns].drop_duplicates().copy()
    demand_df = pd.pivot(melt_df, index='id', columns='date',
                         values=target_col).reset_index().copy()
    demand_df = pd.merge(index_df, demand_df, on='id', how='left')
    demand_df = demand_df.reset_index(drop=True)
    # Rename the date columns to d_1 ... d_N.
    demand_df.columns = index_columns + \
        [f'd_{i}' for i in range(
            1, len(demand_df.columns) - len(index_columns) + 1)]
    # Decode the categorical columns back to their original labels.
    for col in index_columns:
        encoder = encoder_dict.get(col)
        if encoder is not None:
            demand_df[col] = encoder.inverse_transform(demand_df[col])
    return demand_df
def us_pct_change_confirmed():
    df = df_us_confirmed.melt(id_vars=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key'
    ], var_name='Date', value_name='Confirmed')
    df['Date'] = pd.to_datetime(df['Date'])
    df_state = df.groupby(by=['Province_State', 'Date']).agg('sum')
    df_state.reset_index(inplace=True)
    df_state = df_state[df_state['Confirmed'] > US_STATES_CONFIRMED_THRESHOLD]
    df_state.drop(columns=['UID', 'code3', 'FIPS', 'Lat', 'Long_'],
                  inplace=True)
    df_state = pd.pivot(df_state, index='Date', columns='Province_State',
                        values='Confirmed')
    df_pct_change = df_state.pct_change()
    ax = df_pct_change.plot(
        title='US States day-to-day percent change in confirmed cases',
        grid=True, lw=2, colormap='jet', markersize=10, x_compat=True)
    ax.legend(loc='upper left', frameon=True, fancybox=True, shadow=True)
    ax.set_xlabel('Date')
    ax.set_ylabel('Percent Change')
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(1)
def _fit(self, X):
    _, answers_ratios = AnswersEncoder(
        smoothing_min=10, smoothing_value=2).get_answers_ratios(X)
    answers_ratios = pd.pivot(answers_ratios, index='content_id',
                              columns='user_answer',
                              values='answer_ratio').fillna(0)
    X = pd.merge(X, answers_ratios, left_on='content_id', right_index=True,
                 how='left')
    results, users, contexts = compute_user_answers_ratio(X, self.decay)
    X = X.drop(columns=[0, 1, 2, 3])
    # Save context
    for user, context in zip(users, contexts):
        self.context[user] = list(map(float, context))
    # Transform ratios to dict
    answers_ratios = answers_ratios.to_dict(orient='index')
    self.answers_ratios = {
        content_id: list(ratios.values())
        for content_id, ratios in answers_ratios.items()
    }
    return X, results
def convert_distmat(data):
    '''
    Reads a distance matrix in "long" format and converts it to wide format.

    Input format:
        a b d
        a c d
        b c d
    '''
    mat = pd.read_table(data, sep=r"\s+", names=['id1', 'id2', 'dist'])
    mat = pd.pivot(mat, index='id1', columns='id2', values='dist')
    mat.update(mat.transpose())
    if not mat.index.equals(mat.columns):
        # Patch up labels that appear only on one axis.
        i = list(set(mat.index).difference(set(mat.columns)))[0]
        j = list(set(mat.columns).difference(set(mat.index)))[0]
        print(j)
        mat.loc[:, i] = mat.loc[i, :]
        mat.loc[j, :] = mat.loc[:, j]
    mat = mat.fillna(0.0)
    mat = mat.sort_index(axis=0)
    mat = mat.sort_index(axis=1)
    mat2 = mat.values
    ids = list(mat.index)
    return mat2, ids
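# Usage sketch for convert_distmat with an in-memory file (illustrative data,
# matching the long format described in the docstring):
from io import StringIO

long_format = StringIO("a b 1.0\na c 2.0\nb c 3.0\n")
dist_array, ids = convert_distmat(long_format)
# dist_array is a symmetric numpy array with zeros on the diagonal;
# ids gives the row/column label order.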
def describeVariance(df, time='X0', od='Y'):
    '''
    df columns ['X0', 'X1', ..., 'Y']
    values of Xs except for X0 should be non-unique
    '''
    window = getValue('variance_smoothing_window')
    df = df.sort_values('Time')
    df.reset_index(drop=True, inplace=True)
    nX = len(df[time].drop_duplicates())
    nS = int(df.shape[0] / nX)
    sid = pd.DataFrame(np.ravel([np.arange(nS)] * nX), columns=['SID'])
    df = df.join(sid)
    tmp = pd.pivot(df, index=time, columns='SID', values=od)
    if window < 1:
        window = int(np.ceil(nX * window))
    var = np.var(tmp.values, 1)
    var = filters.gaussian_filter1d(var, window)
    df = df.sort_values(['SID', 'Time'])
    df.loc[:, 'error'] = np.ravel([var] * nS)
    return df
def pivot(df, args, metric=None):
    if metric is None:
        return df
    df = pd.pivot(data=df, columns=args.x_variable, index=args.y_variable,
                  values=metric)
    rows_todrop = np.where(df.isna().any(axis=1))[0]
    rows_todrop = df.index.values[rows_todrop]
    cols_todrop = np.where(df.isna().any())[0]
    cols_todrop = df.keys().values[cols_todrop]
    if len(rows_todrop) > 0 or len(cols_todrop) > 0:
        msg = 'User Warning: The heatmap data is missing values. '
        msg += 'Please check the data for the following:\n\n'
        msg += 'Columns:\t'
        msg += ', '.join(cols_todrop) + '\n'
        msg += '\n'
        msg += 'Rows:\t'
        msg += ', '.join(rows_todrop) + '\n'
        msg += ('\nThese variables will be dropped and not plotted unless you '
                'requested that they be kept with --keep-rows-missing-data or '
                '--keep-columns-missing-data.\n\n')
        smartPrint(msg, args.verbose)
    if not args.keep_rows_missing_data:
        df = df.drop(labels=rows_todrop, axis=0)
    if not args.keep_columns_missing_data:
        df = df.drop(labels=cols_todrop, axis=1)
    return df
def _link2mat(link, value="coef_abs", fillna=0):
    mat = pd.pivot(data=link, values=[value], index="target", columns="source")
    mat = mat.fillna(fillna)
    return mat
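# Usage sketch for _link2mat with a toy edge list (illustrative data; note that
# passing `values` as a list yields a MultiIndex on the columns):
import pandas as pd

links = pd.DataFrame({
    "source": ["g1", "g1", "g2"],
    "target": ["g2", "g3", "g3"],
    "coef_abs": [0.8, 0.1, 0.5],
})
adj = _link2mat(links)
# adj.loc["g2", ("coef_abs", "g1")] == 0.8; missing edges are filled with 0.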
def plot_heatmap(val_to_replace, percentage_decline, path_dict):
    # surface plot
    x_grid, y_grid = np.meshgrid(val_to_replace['PrEPDuration'],
                                 val_to_replace['PrEPCoverage'])
    z_grid = np.array(percentage_decline).reshape(x_grid.shape)
    x = np.ravel(x_grid)
    y = np.ravel(y_grid)
    z = np.array(percentage_decline)
    sb_heatmap = pd.DataFrame()
    sb_heatmap['Time to max. uptake (months)'] = np.floor(x).astype(int)
    sb_heatmap['PrEP uptake (%)'] = np.floor(y * 100).astype(int)
    sb_heatmap['Percentage declination in incidence'] = z
    sb_heatmap = sb_heatmap.sort_values(by='Time to max. uptake (months)')
    plot_df = pd.pivot(data=sb_heatmap,
                       index='Time to max. uptake (months)',
                       columns='PrEP uptake (%)',
                       values='Percentage declination in incidence')
    # choose color theme
    # cmap = cm.get_cmap('RdYlGn')
    cmap = 'PuBu'  # 'Reds'
    # my_col_map = ["#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c"]  # high point is dark blue
    # my_col_map_r = ["#08519c", "#3182bd", "#6baed6", "#bdd7e7", "#eff3ff"]  # high point is white
    cmap = 'PuBu_r'
    # cmap = sb.color_palette(cmap)
    plt.figure(figsize=(10, 5))
    sb.set(font_scale=1.2)
    heatmap_plot = sb.heatmap(plot_df, annot=True, fmt='0.1f', linewidths=0.2,
                              cmap=cmap,
                              cbar_kws={'label': 'Percentage reduction in incidence\n'
                                                 'due to only community benefit'})
    heatmap_plot.figure.axes[0].invert_yaxis()
    # rotate the axis ticks if needed
    if False:
        heatmap_plot.set_yticklabels(heatmap_plot.get_yticklabels(), rotation=45)
    heatmap_plot.figure.savefig(
        os.path.join(os.path.join(path_dict['output']['intervention'], ".."),
                     'Percentage declination in incidence.jpg'))
def sentiment_word_graph(docs_df: pd.DataFrame, output_path: str,
                         doc_col: str = "doc") -> None:
    """
    This will create a word graph based on the counts per word in the docs.

    :param docs_df: The data frame with the documents
    :param output_path: The output path
    :param doc_col: The column with our documents
    :return:
    """
    sentiment_counts: pd.DataFrame = \
        docs_df.groupby("label").apply(lambda x: pd.DataFrame.from_dict(
            data=x[doc_col].apply(count_freq_across_documents),
            columns=["n"],
            orient="index"
        ).reset_index().rename(columns={"index": "word"})).reset_index()
    # The counts column is named "n" above; Series has no .log() method,
    # so use np.log instead.
    sentiment_counts['log_counts'] = np.log(sentiment_counts['n'])
    pivoted_sentiments = pd.pivot(sentiment_counts, index=["word"],
                                  values="log_counts",
                                  columns=["label"]).reset_index()
    fig = px.scatter(data_frame=pivoted_sentiments, x='positive', y='negative',
                     text='word')
    fig.write_html(f"{output_path}/word_graph.html")
def pivot(df: pd.DataFrame) -> pd.DataFrame:
    """Converts dataframe into unrolled form with each column corresponding
    to a single feature entry.

    :param df: pd.DataFrame, raw data. Must have columns 'id_job' and
        'features'. Column 'features' contains a comma-separated list of
        values. The first value is a feature name and the others are entries
        of this feature vector.
    :return: pd.DataFrame, data in unrolled form.
    """
    df = df.copy()
    df['features'] = df['features'].str.split(',')
    df['name'] = df['features'].apply(lambda x: x[0])
    df['features'] = df['features'].apply(lambda x: x[1:])
    length = df['features'].apply(len)
    if length.min() != length.max():
        raise ValueError(f'Feature vectors have different lengths '
                         f'(from {length.min()} to {length.max()}).')
    exploded = df.explode('features')
    indices = np.tile(np.arange(length.min()), df.shape[0]).astype(str)
    exploded['name'] = 'feature_' + exploded['name'] + '_' + indices
    return pd.pivot(data=exploded, index='id_job', columns='name',
                    values='features').astype('float64')
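# Usage sketch for the pivot helper above with a tiny toy frame (illustrative
# data): each 'features' string is "<feature name>,<value 1>,<value 2>,...".
import pandas as pd

jobs = pd.DataFrame({
    'id_job': [1, 2],
    'features': ['2,10,20,30', '2,11,21,31'],
})
wide = pivot(jobs)
# wide is indexed by id_job with float columns
# feature_2_0, feature_2_1, feature_2_2.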
def date_of_test():
    try:
        dot = pd.read_excel(
            'http://ldh.la.gov/assets/oph/Coronavirus/data/LA_COVID_TESTBYDAY_PARISH_PUBLICUSE.xlsx'
        )
        dot['Lab Collection Date'] = dot['Lab Collection Date'].apply(
            lambda x: x.strftime('%m/%d/%Y'))
        categories = [
            'Daily Test Count',
            'Daily Case Count',
            'Daily Negative Test Count',
            'Daily Positive Test Count',
        ]
        df = pd.DataFrame()
        for c in categories:
            cdf = pd.pivot(dot, index='Parish', columns='Lab Collection Date',
                           values=c)
            cdf.insert(0, 'Category', '')
            cdf['Category'] = c
            df = pd.concat([df, cdf])  # DataFrame.append was removed in pandas 2.0
        df.sort_values(by=['Parish', 'Category']).to_csv(
            f'{module_path}/data/cases_tests_dot.csv')
        logger.info('COMPLETE: Date of Test')
    except Exception as e:
        logger.error('Failed to download date of test data')
        logger.exception('Function date_of_test failed with exception')
        logger.error(str(e))
        sys.exit(1)
def plot_beveridge_curve():
    indices_dicts_lms = {
        "Vacancies": "AP2Y",
        "Unemployment": "MGSX",
        "Active": "LF2K"
    }
    df = pd.DataFrame()
    for key, value in indices_dicts_lms.items():
        xf, x_text = ons_qna_data("LMS", value)
        xf["Name"] = key
        df = pd.concat([df, xf], axis=0)
    df["value"] = df["value"].astype(np.double)
    df = pd.pivot(df, index="date", columns="Name")
    df.columns = df.columns.droplevel()
    df = df.dropna()
    df["Date"] = df.index
    # Divide vacancies by the size of the labour force.
    df["Vacancies"] = 100 * df["Vacancies"].divide(df["Active"])
    # Position of the most extreme unemployment value, labelled below.
    max_u = df["Unemployment"].argmax()
    fig, ax = plt.subplots()
    quivx = -df["Unemployment"].diff(-1)
    quivy = -df["Vacancies"].diff(-1)
    # The quiver arrows connect successive points.
    ax.quiver(
        df["Unemployment"],
        df["Vacancies"],
        quivx,
        quivy,
        scale_units="xy",
        angles="xy",
        scale=1,
        width=0.006,
        alpha=0.3,
    )
    ax.scatter(
        df["Unemployment"],
        df["Vacancies"],
        marker="o",
        s=35,
        edgecolor="black",
        linewidth=0.2,
        alpha=0.9,
    )
    for j in [0, max_u, -1]:
        ax.annotate(
            f'{df["Date"].iloc[j].year} Q{df["Date"].iloc[j].quarter}',
            xy=(df[["Unemployment", "Vacancies"]].iloc[j].tolist()),
            xycoords="data",
            xytext=(20, 20),
            textcoords="offset points",
            arrowprops=dict(arrowstyle="->",
                            connectionstyle="angle3,angleA=0,angleB=-90"),
        )
    ax.set_xlabel("Unemployment rate, %")
    ax.set_ylabel("Vacancy rate, %")
    ax.grid(which="major", axis="both", lw=0.2)
    plt.tight_layout()
    st.pyplot(fig)
def test_pivot(self):
    from pandas.core.reshape import _slow_pivot

    one, two, three = (np.array([1, 2, 3, 4, 5]),
                       np.array(["a", "b", "c", "d", "e"]),
                       np.array([1, 2, 3, 5, 4.0]))

    df = pivot(one, two, three)
    self.assertEqual(df["a"][1], 1)
    self.assertEqual(df["b"][2], 2)
    self.assertEqual(df["c"][3], 3)
    self.assertEqual(df["d"][4], 5)
    self.assertEqual(df["e"][5], 4)
    assert_frame_equal(df, _slow_pivot(one, two, three))

    # weird overlap, TODO: test?
    a, b, c = (np.array([1, 2, 3, 4, 4]),
               np.array(["a", "a", "a", "a", "a"]),
               np.array([1.0, 2.0, 3.0, 4.0, 5.0]))
    self.assertRaises(Exception, pivot, a, b, c)

    # corner case, empty
    df = pivot(np.array([]), np.array([]), np.array([]))
def make_CF_table(aData, needed_param):
    '''
    Make an appropriate table for collaborative filtering and insert the
    ratings values into the table.
    '''
    user_id = needed_param['user_id']
    product_id = needed_param['product_id']
    ratings = needed_param['ratings']
    table_CF = pd.pivot(aData, index=product_id, columns=user_id,
                        values=ratings)
    return table_CF
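# Usage sketch for make_CF_table (illustrative column names and data, not from
# the original source):
import pandas as pd

ratings_df = pd.DataFrame({
    'user': [1, 1, 2],
    'item': ['x', 'y', 'x'],
    'rating': [5, 3, 4],
})
params = {'user_id': 'user', 'product_id': 'item', 'ratings': 'rating'}
table = make_CF_table(ratings_df, params)
# table is items x users, with NaN where a user has not rated an item.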
def train_reinforcement_learning_strategy(num_sims, model_class):
    # Initialize model
    model = Model({'class': model_class})
    banditAlgorithm = BanditAlgorithm(params=0.2)
    model.initialize()
    model.all_possible_decisions = ['hit', 'stay']
    for _ in range(num_sims):
        model.buffer += 1
        # Initialize game
        blackjack = BlackJack()
        blackjack.initiate_game()
        if blackjack.game_status != 'in process':
            continue
        all_observed_decision_states, reward = blackjack.complete_one_episode(
            banditAlgorithm, model)
        model = learn_Q_function(all_observed_decision_states, reward, model)
    return banditAlgorithm.policy, model


if __name__ == "__main__":
    # policy, model = train_reinforcement_learning_strategy(num_sims=500000, model_class='lookup_table')
    policy, model = train_reinforcement_learning_strategy(num_sims=50000,
                                                          model_class='scikit')
    policy_df = pd.DataFrame(policy).T  # avoid shadowing the pandas module
    policy_df.columns = ['player_value', 'dealer_value', 'decision', 'score']
    pt = policy_df.pivot(index='player_value', columns='dealer_value')['decision']
    print(pt)
    pt1 = policy_df.pivot(index='player_value', columns='dealer_value')['score']
    print(pt1)