def regressior_baseline(city="Phoenix"):
    """Print the RMSE of the lasso and "extra" leave-one-out baselines.

    Two feature sets are evaluated for ``city``: the one produced by
    ``gen_data_no_review`` and the one produced by
    ``gen_data_with_top_review``. For each feature set the lasso baseline
    and the "extra" baseline are run and the returned RMSE is printed.
    Each helper is handed its own ``result_path``.

    Args:
        city: City passed to ``get_data`` and ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    train_raw_x, test_raw_x = gen_data_no_review(bus_df, test_ids)

    print("baseline lasso no review")
    # Previously recorded scores (kept from the original notes):
    # 335 rmse: 0.792623
    # 410 rmse: 1.255996
    print("rmse: %f" % baseline_lasso_leave_one_out(
        train_raw_x, test_raw_x, test_ids, alpha=0.001,
        result_path="regression_lasso_no_review.txt"
    ))

    print("baseline extra no review")
    # 335 rmse: 0.818625
    print("rmse: %f" % baseline_extra_leave_one_out(
        train_raw_x, test_raw_x, test_ids, n=20, d=10,
        result_path="regression_extra_no_review.txt"
    ))

    train_raw_x, test_raw_x = gen_data_with_top_review(bus_df, test_ids, city)

    print("baseline lasso with 10 reviews")
    # 335 rmse: 0.808528
    # 410 rmse: 1.256638
    print("rmse: %f" % baseline_lasso_leave_one_out(
        train_raw_x, test_raw_x, test_ids, alpha=10e-4,  # 10e-4 == 0.001
        result_path="regression_with_review.txt"
    ))

    # This run uses the with-review features; the label used to say
    # "no review", which mislabelled the output.
    print("baseline extra with 10 reviews")
    print("rmse: %f" % baseline_extra_leave_one_out(
        train_raw_x, test_raw_x, test_ids, n=40, d=40,
        result_path="regression_extra_with_review.txt"
    ))
def _star_deviation_row(star_counts, total, baseline):
    """Return, per star value in ascending order, counted share minus baseline share."""
    return [
        (star_counts[star] / total) - baseline[star]
        for star in sorted(star_counts)
    ]


def build_feature(input_file, top_review_number=50):
    """Build star-distribution features for each source business in an edge list.

    Every non-blank line of ``input_file`` must hold three comma-separated
    fields, ``source,dest,<third field>``; any ``BUSINESS#`` text is removed
    first and the third field is ignored. Lines belonging to one source must
    be consecutive: a change of ``source`` starts a new group.

    For each group, the first ``top_review_number`` lines are tallied by the
    star rating of ``dest``. The feature row is, for each star value in
    ascending order, the share of tallied lines with that rating minus the
    share of that rating among all businesses returned by ``get_data()``.
    The target is the star rating of ``source``.

    Args:
        input_file: Path of the edge-list file.
        top_review_number: Maximum number of lines tallied per source. Must
            be positive; otherwise nothing is tallied and the share
            computation divides by zero.

    Returns:
        ``(x, y)`` as numpy arrays: one feature row and one target per group.
        An empty input yields two empty arrays (it used to yield one empty
        feature row with no matching target).

    Raises:
        KeyError: If a ``source`` or ``dest`` id is not among the businesses.
        ValueError: If a non-blank line does not have exactly three fields.
        ZeroDivisionError: If ``top_review_number`` is not positive.
    """
    bus_df = get_data()
    # Overall share of each star value; key order is irrelevant because rows
    # are always emitted in sorted-key order.
    baseline = bus_df.stars.value_counts(normalize=True).to_dict()
    stars_dict = dict(zip(bus_df.business_id, bus_df.stars))

    x = []
    y = []
    star_counts = {}
    current_id = None
    count = 0
    with open(input_file) as edges:
        for line in edges:
            record = line.replace("BUSINESS#", "").strip()
            if not record:
                # Tolerate blank lines instead of failing the unpack below.
                continue
            source, dest, _unused = record.split(',')
            if source != current_id:
                # Flush the finished group before starting the next one.
                if star_counts:
                    x.append(_star_deviation_row(star_counts, count, baseline))
                # Float zeros keep the share division a true division on
                # Python 2 as well.
                star_counts = dict.fromkeys(baseline, 0.0)
                current_id = source
                count = 0
                y.append(stars_dict[source])
            if count < top_review_number:
                star_counts[stars_dict[dest]] += 1
                count += 1

    # Flush the last group; skipped when the file had no records so that
    # x and y stay the same length.
    if star_counts:
        x.append(_star_deviation_row(star_counts, count, baseline))
    return np.array(x), np.array(y)
def baseline_city_average(city="Phoenix"):
    """Print the RMSE of predicting ``review_count`` with the city-wide mean.

    For every held-out business (ids listed in ``test_business_list`` that
    are present in the data for ``city``), the prediction is the mean
    ``review_count`` of all rows whose ``business_id`` differs from it.

    Args:
        city: City passed to ``get_data``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    x = bus_df[['business_id', 'review_count', 'categories']]
    test_x = x[x['business_id'].isin(test_ids)].reset_index()

    # Iterate in test_x row order (not id-file order) so that every
    # prediction is paired with its own target; ids missing from the data
    # are skipped instead of making the two sequences differ in length.
    predict = [
        x[x.business_id != test_id].review_count.mean()
        for test_id in test_x.business_id
    ]

    rmse = mean_squared_error(test_x.review_count.values, predict) ** 0.5
    print(rmse)
def search_n_d(city="Phoenix"):
    """Run ``tune_extra`` on the training rows of both feature sets.

    The feature sets come from ``gen_data_no_review`` and
    ``gen_data_with_top_review``. Rows whose ``business_id`` is listed in
    ``test_business_list`` are removed before tuning; ``stars`` is the
    target and the remaining columns (minus ``business_id``) the features.

    Args:
        city: City passed to ``get_data`` and ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    # Thunks keep the original order: print the label, then build the data.
    feature_sets = (
        ("no review",
         lambda: gen_data_no_review(bus_df, test_ids)),
        ("with top 10 review",
         lambda: gen_data_with_top_review(bus_df, test_ids, city)),
    )
    for label, make_features in feature_sets:
        print(label)
        train_raw_x, _ = make_features()
        train_rows = train_raw_x[~train_raw_x.business_id.isin(test_ids)]
        # .values and axis=1 work on old and new pandas alike
        # (.as_matrix() and positional axis do not).
        train_y = train_rows.stars.values
        train_x = train_rows.drop(["business_id", "stars"], axis=1).values
        tune_extra(train_x, train_y)
def baseline_city_average(city="Phoenix"):
    """Print the RMSE of predicting ``stars`` with the city-wide mean.

    For every held-out business (ids listed in ``test_business_list`` that
    are present in the data for ``city``), the prediction is the mean
    ``stars`` of all rows whose ``business_id`` differs from it. The
    actual/predicted pairs are written to ``city.csv`` sorted by actual
    stars, highest first.

    Args:
        city: City passed to ``get_data``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    x = bus_df[["business_id", "stars"]]
    test_x = x[x["business_id"].isin(test_ids)].reset_index()

    # Iterate in test_x row order (not id-file order) so that every
    # prediction is paired with its own target, both in the RMSE and in the
    # CSV; ids missing from the data are skipped.
    predict = [
        x[x.business_id != test_id].stars.mean()
        for test_id in test_x.business_id
    ]
    actual = test_x.stars.values
    rmse = mean_squared_error(actual, predict) ** 0.5

    result = pd.DataFrame(
        {"stars": actual, "predict": predict},
        columns=["stars", "predict"],
    )
    # DataFrame.sort no longer exists in newer pandas; use sort_values when
    # available and fall back to sort on old installations.
    if hasattr(result, "sort_values"):
        result = result.sort_values("stars", ascending=False)
    else:
        result = result.sort("stars", ascending=False)
    result.to_csv("city.csv", index=False)

    print(rmse)
def baseline_category_average(city="Phoenix"):
    """Print the RMSE of predicting ``review_count`` with a category mean.

    For every held-out business (ids listed in ``test_business_list`` that
    are present in the data for ``city``), the prediction is the mean
    ``review_count`` of the *other* businesses sharing at least one category
    with it. When the business has no categories, or no other business
    shares one, the mean over all other businesses is used instead.

    The held-out business is always excluded from its own prediction.
    Previously it was included in the category mean (but not in the
    fallback mean), which leaked the target into the prediction.

    Args:
        city: City passed to ``get_data``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    x = bus_df[['business_id', 'review_count', 'categories']]
    test_x = x[x['business_id'].isin(test_ids)].reset_index()

    predict = []
    # Walk test_x rows (not the id file) so predictions line up with the
    # targets and ids missing from the data cannot raise IndexError.
    for test_id, own_categories in zip(test_x.business_id, test_x.categories):
        categories = set(own_categories)
        others = x[x.business_id != test_id]
        peers = others
        if categories:
            shares_category = others.categories.apply(
                lambda cats: bool(categories.intersection(cats)))
            # Keep the city-wide fallback when nobody shares a category;
            # an empty selection would produce a NaN prediction.
            if shares_category.any():
                peers = others[shares_category]
        predict.append(peers.review_count.mean())

    rmse = mean_squared_error(test_x.review_count.values, predict) ** 0.5
    print(rmse)
def search_alpha(city="Phoenix"):
    """Run ``tune_lasso`` on both feature sets and print the best alpha.

    The feature sets come from ``gen_data_no_review`` and
    ``gen_data_with_top_review``. Rows whose ``business_id`` is listed in
    ``test_business_list`` are removed before tuning; ``stars`` is the
    target and the remaining columns (minus ``business_id``) the features.
    The reported alpha is the one paired with the lowest score returned by
    ``tune_lasso``.

    Args:
        city: City passed to ``get_data`` and ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    # Thunks keep the original order: print the label, then build the data.
    feature_sets = (
        ("no review",
         lambda: gen_data_no_review(bus_df, test_ids)),
        ("with top 10 review",
         lambda: gen_data_with_top_review(bus_df, test_ids, city)),
    )
    for label, make_features in feature_sets:
        print(label)
        train_raw_x, _ = make_features()
        train_rows = train_raw_x[~train_raw_x.business_id.isin(test_ids)]
        # .values and axis=1 work on old and new pandas alike
        # (.as_matrix() and positional axis do not).
        train_y = train_rows.stars.values
        train_x = train_rows.drop(["business_id", "stars"], axis=1).values
        _coefs, scores, alpha = tune_lasso(train_x, train_y)
        # First alpha that reaches the minimum score.
        best_alpha = alpha[scores.index(min(scores))]
        print("best alpha=%f" % best_alpha)
def baseline_category_average(city="Phoenix"):
    """Print the RMSE of predicting ``stars`` with a category mean.

    For every held-out business (ids listed in ``test_business_list`` that
    are present in the data for ``city``), the prediction is the mean
    ``stars`` of the *other* businesses sharing at least one category with
    it. When the business has no categories, or no other business shares
    one, the mean over all other businesses is used instead. The
    actual/predicted pairs are written to ``category.csv`` sorted by actual
    stars, highest first.

    The held-out business is always excluded from its own prediction.
    Previously it was included in the category mean (but not in the
    fallback mean), which leaked the target into the prediction.

    Args:
        city: City passed to ``get_data``.
    """
    bus_df = get_data(city)
    # Read the held-out ids inside a context manager so the handle is closed.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    x = bus_df[["business_id", "stars", "categories"]]
    test_x = x[x["business_id"].isin(test_ids)].reset_index()

    predict = []
    # Walk test_x rows (not the id file) so predictions line up with the
    # targets and ids missing from the data cannot raise IndexError.
    for test_id, own_categories in zip(test_x.business_id, test_x.categories):
        categories = set(own_categories)
        others = x[x.business_id != test_id]
        peers = others
        if categories:
            shares_category = others.categories.apply(
                lambda cats: bool(categories.intersection(cats)))
            # Keep the city-wide fallback when nobody shares a category;
            # an empty selection would produce a NaN prediction.
            if shares_category.any():
                peers = others[shares_category]
        predict.append(peers.stars.mean())

    actual = test_x.stars.values
    result = pd.DataFrame(
        {"stars": actual, "predict": predict},
        columns=["stars", "predict"],
    )
    # DataFrame.sort no longer exists in newer pandas; use sort_values when
    # available and fall back to sort on old installations.
    if hasattr(result, "sort_values"):
        result = result.sort_values("stars", ascending=False)
    else:
        result = result.sort("stars", ascending=False)
    result.to_csv("category.csv", index=False)

    rmse = mean_squared_error(actual, predict) ** 0.5
    print(rmse)