'F3': 'Counts', 'F6': 'Sequent Clicks', 'F7': 'Time Difference' }) else: what_to_buy = extract_what_to_buy(clicks_grouped_by_session_id) features_to_csv(file_what_to_buy_features, what_to_buy.values()) if os.path.isfile(file_buy_or_not_features): buy_or_not = np.load(file_buy_or_not_features) else: buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy) np.save(file_buy_or_not_features, buy_or_not) buys = read_buys(file_buys, usecols=effective_columns_names) _, buys_grouped_by_session_id_keys = df_group_by_session_id(buys) buys_result = extract_buys(clicks_grouped_by_session_id_keys, buys_grouped_by_session_id_keys) buy_or_not_train, buy_or_not_val, buys_result_train, buys_result_val = train_test_split( buy_or_not, buys_result, test_size=0.2) classifier = XGBClassifier(n_estimators=500, subsample=0.8, colsample_bytree=0.5, max_depth=4, min_child_weight=3) classifier.fit(buy_or_not_train, buys_result_train) predictions_val = classifier.predict(buy_or_not_val)
# Group the clicks; the helper returns a pair, bound here to the grouped
# clicks and to their keys.
clicks_grouped_by_session_id, clicks_grouped_by_session_id_keys = \
    df_group_by_session_id(clicks)

# "What to buy" features: reuse the CSV file when it already exists,
# otherwise extract the features and write them out for the next run.
if not os.path.isfile(file_what_to_buy_features):
    what_to_buy = extract_what_to_buy(clicks_grouped_by_session_id)
    features_to_csv(file_what_to_buy_features, what_to_buy.values())
else:
    what_to_buy = features_from_csv(
        file_what_to_buy_features,
        ['Session ID', 'Item ID'],
        {'F3': 'Counts', 'F6': 'Sequent Clicks', 'F7': 'Time Difference'})

# "Buy or not" features: same reuse-or-compute scheme, stored via NumPy.
if not os.path.isfile(file_buy_or_not_features):
    buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy)
    np.save(file_buy_or_not_features, buy_or_not)
else:
    buy_or_not = np.load(file_buy_or_not_features)

# Build the target values from the buys file; only the keys of the grouped
# buys are needed here.
buys = read_buys(file_buys, usecols=effective_columns_names)
_, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
buys_result = extract_buys(
    clicks_grouped_by_session_id_keys,
    buys_grouped_by_session_id_keys)

# Hold out 20% of the samples for validation.
(buy_or_not_train,
 buy_or_not_val,
 buys_result_train,
 buys_result_val) = train_test_split(buy_or_not, buys_result, test_size=0.2)

# Fit the gradient-boosted classifier on the training part and predict the
# validation part.
classifier = XGBClassifier(
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.5,
    max_depth=4,
    min_child_weight=3)
classifier.fit(buy_or_not_train, buys_result_train)
predictions_val = classifier.predict(buy_or_not_val)

# Score the validation predictions and write the scores out.
scores = metrics(buys_result_val, predictions_val)
write_metrics(file_scores, scores)

# Read the test clicks and group them the same way; the keys are not used.
test = read_clicks(file_test, usecols=effective_columns_names)
test_grouped_by_session_id, _ = df_group_by_session_id(test)
if __name__ == "__main__":
    # Command-line entry point: read the clicks and buys files from a data
    # directory, slice them with slice_data() and write the results next to
    # the originals as "<name>-<frac><ext>".
    #
    # usage: <script> /dir/with/data clicks_file buys_file frac
    #
    # Arguments are validated with explicit checks instead of `assert`:
    # assert statements are removed under `python -O`, which would let bad
    # input through (or surface as an unrelated IndexError). Failures still
    # terminate with a non-zero status and the same messages on stderr.
    if len(sys.argv) != 5:
        sys.exit(r"Incorrect argument list, names also shouldn't contain spaces" +
                 "\nusage: argv[0] /dir/with/data clicks_file buys_file frac")

    path = sys.argv[1]
    file_clicks_basename = sys.argv[2]
    file_buys_basename = sys.argv[3]
    # Keep the textual form as typed: it is embedded verbatim in the output
    # file names below.
    char_frac = sys.argv[4]

    try:
        float_frac = float(char_frac)
    except ValueError:
        sys.exit('Frac must be a positive number')
    # The chained comparison also rejects NaN (all comparisons are False) and
    # infinity (which int() below could not convert).
    if not 0 < float_frac < float('inf'):
        sys.exit('Frac must be a positive number')

    # Values up to 1 are passed on as floats, larger values are truncated to
    # an int. NOTE(review): presumably slice_data() treats a float as a
    # fraction and an int as an absolute count - confirm against slice_data.
    frac = float_frac if float_frac <= 1 else int(float_frac)
    if frac == 1:
        # Nothing is written when frac is exactly 1 (this includes inputs
        # such as "1.5" that truncate to 1). sys.exit() is used rather than
        # the `site`-provided exit() helper meant for interactive sessions.
        sys.exit(0)

    # Insert "-<frac>" between the base name and the extension.
    file_clicks_parts = os.path.splitext(file_clicks_basename)
    file_buys_parts = os.path.splitext(file_buys_basename)
    file_clicks_sliced_basename = file_clicks_parts[0] + '-' + char_frac + file_clicks_parts[1]
    file_buys_sliced_basename = file_buys_parts[0] + '-' + char_frac + file_buys_parts[1]

    file_clicks = os.path.join(path, file_clicks_basename)
    file_buys = os.path.join(path, file_buys_basename)
    file_clicks_sliced = os.path.join(path, file_clicks_sliced_basename)
    file_buys_sliced = os.path.join(path, file_buys_sliced_basename)

    clicks = read_clicks(file_clicks)
    buys = read_buys(file_buys)

    clicks, buys = slice_data(clicks, buys, frac=frac)

    write_df(file_clicks_sliced, clicks)
    write_df(file_buys_sliced, buys)
"\nusage: argv[0] /dir/with/data clicks_file buys_file frac" path = sys.argv[1] file_clicks_basename = sys.argv[2] file_buys_basename = sys.argv[3] char_frac = sys.argv[4] float_frac = float(char_frac) assert float_frac > 0, 'Frac must be a positive number' frac = float_frac if float_frac <= 1 else int(float_frac) if frac == 1: exit(0) file_clicks_parts = os.path.splitext(file_clicks_basename) file_buys_parts = os.path.splitext(file_buys_basename) file_clicks_sliced_basename = file_clicks_parts[ 0] + '-' + char_frac + file_clicks_parts[1] file_buys_sliced_basename = file_buys_parts[ 0] + '-' + char_frac + file_buys_parts[1] file_clicks = os.path.join(path, file_clicks_basename) file_buys = os.path.join(path, file_buys_basename) file_clicks_sliced = os.path.join(path, file_clicks_sliced_basename) file_buys_sliced = os.path.join(path, file_buys_sliced_basename) clicks = read_clicks(file_clicks) buys = read_buys(file_buys) clicks, buys = slice_data(clicks, buys, frac=frac) write_df(file_clicks_sliced, clicks) write_df(file_buys_sliced, buys)