def main(argv):
    """Train a logistic-regression cache-selection model and report results.

    Reads a structured feature CSV, labels each query as "bad" (1) when the
    full index MRR beats the cache MRR, trains an LR model via train_lr, and
    compares ML routing against QL-score, random, and oracle baselines.

    Command line: filename, -s/--split (test ratio), -t/--threshold
    (decision boundary), -o/--output (save per-query result CSV).

    NOTE(review): argv is ignored; parse_args() reads sys.argv directly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # FIX: corrected help-text typo "boundry" -> "boundary".
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))
    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicate feature columns (transpose so drop_duplicates operates
    # column-wise). NOTE(review): this round-trip casts dtypes to object.
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # Label 1 = "bad" query: the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    print("bad queries ratio: %.2f" %
          (100 * np.sum(labels) / labels.shape[0]))
    X, X_test, y, y_test = train_test_split(df, labels, stratify=labels,
                                            test_size=test_size,
                                            random_state=1)
    # FIX: removed unused local `sample_weight = X['freq']` — it was never
    # passed to the trainer or used anywhere else in the function.
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    # Keep the metadata columns of the test rows before stripping them.
    test_queries = X_test['query']
    test_freq = X_test['freq']
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y.shape[0], np.sum(y), (100 * np.sum(y) / y.shape[0])))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y_test.shape[0], np.sum(y_test),
           (100 * np.sum(y_test) / y_test.shape[0])))
    # learn the model
    y_pred = train_lr(X, y, X_test, y_test, t, df.columns.values[2:-2])
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # QL baseline: route to the full index when its QL score beats the
    # cache's QL score.
    output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    # ML routing: use the trained model's prediction.
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and random baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv('%s%s_result.csv' %
                      ('../../data/cache_selection_structured/',
                       filename[:-4]), index=False)
def main(argv):
    """Frequency-weighted variant of the cache-selection experiment.

    Instead of a stratified row split, each query's view count ("freq") is
    partitioned between train and test via sample_viewcount so the test set
    carries ~test_size of the total traffic; statistics are weighted by
    view count accordingly.

    Command line: filename, -s/--split (test traffic ratio), -t/--threshold
    (decision boundary), -o/--output (save per-query result CSV).

    NOTE(review): argv is ignored; parse_args() reads sys.argv directly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # FIX: corrected help-text typo "boundry" -> "boundary".
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))
    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicate feature columns (transpose so drop_duplicates operates
    # column-wise). NOTE(review): this round-trip casts dtypes to object.
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # Label 1 = "bad" query: the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    viewcount = df['freq']
    print("bad queries ratio (with freq): %.2f" %
          (100 * np.sum(labels * viewcount) / viewcount.sum()))
    # Split each query's traffic between test and train.
    test_viewcount = sample_viewcount(viewcount,
                                      int(test_size * viewcount.sum()))
    train_viewcount = viewcount - test_viewcount
    # A query may appear in both sets if its traffic was split.
    X = df[train_viewcount > 0].copy()
    y = labels[train_viewcount > 0].copy()
    X_test = df[test_viewcount > 0].copy()
    y_test = labels[test_viewcount > 0].copy()
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    test_queries = X_test['query']
    # NOTE(review): test_viewcount spans all queries; pandas index
    # alignment trims it to the test rows when assigned into `output`
    # below — confirm this is intended.
    test_freq = test_viewcount
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    # Traffic-weighted set statistics.
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (train_viewcount.sum(), (labels * train_viewcount).sum(),
           (100 * (labels * train_viewcount).sum() / train_viewcount.sum())))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (test_viewcount.sum(), (labels * test_viewcount).sum(),
           (100 * (labels * test_viewcount).sum() / test_viewcount.sum())))
    # learn the model (no feature-name argument in this variant)
    y_pred = train_lr(X, y, X_test, y_test, t)
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # QL baseline: route to the full index when its QL score beats the
    # cache's QL score.
    output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    # ML routing: use the trained model's prediction.
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and random baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv('%s%s_result.csv' %
                      ('../../data/cache_selection_structured/',
                       filename[:-4]), index=False)
def main(argv):
    """Older cache-selection experiment using positional CLI arguments.

    argv[0] is the feature CSV name, argv[1] the decision threshold, and an
    optional truthy argv[2] saves the per-query result CSV. Trains a
    balanced logistic regression inline (MinMax-scaled features) and
    compares against QL, random, and oracle baselines.
    """
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1),
                                            labels, stratify=labels,
                                            test_size=size, random_state=1)
    # Strip metadata columns; '2' / '100' hold cache / full-index MRR.
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'],
                         axis=1)
    # QL baseline: route to the full index where its QL score wins.
    # NOTE(review): 'ql_c.1' is pandas' dedup name for a second 'ql_c'
    # column — presumably the rest-of-collection score; verify the CSV.
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0],
                                              np.sum(y_test)))
    # FIX: corrected message typos "onez"/"trian".
    print("ones ratio in train set = %.2f" %
          (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set = %.2f" %
          (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    #sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    # Show features sorted by learned coefficient.
    c = np.column_stack((df.columns.values[5:-1],
                         np.round(lr.coef_.flatten(), 2)))
    print(c[c[:, 1].argsort()])
    # Apply the custom decision threshold t to the positive-class
    # probability instead of the default 0.5.
    y_prob = lr.predict_proba(X_test)
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    output['ql_label'] = ql
    # ML routing: take the full-index MRR where the model predicts 1.
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    # Oracle upper bound (true labels decide the routing).
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    # Random baseline.
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = output['2'].copy()
    # FIX: use .loc instead of chained assignment
    # (output['rand'][r == 1] = ...), which triggers
    # SettingWithCopyWarning and may silently fail to write.
    output.loc[r == 1, 'rand'] = output['100'][r == 1]
    analyze(output, '2', '100', 'TestFreq')
    # FIX: write the computed per-query `output`, not the raw input `df`
    # (the file is named *_result.csv and all newer variants save
    # `output`); also guard argv[2] so a two-argument invocation no
    # longer raises IndexError.
    if len(argv) > 2 and argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/',
                                           filename[:-4]), index=False)
def main(argv):
    """Cache-selection experiment with an optional feature-diff mode.

    With -d/--diff, each paired feature column is replaced by
    (col - col_rest) and the rest column is dropped, so the QL baseline
    becomes a sign test on the diff. Otherwise identical to the standard
    variant: label queries where the full index beats the cache, train LR
    via train_lr, and compare routing strategies.

    Command line: filename, -s/--split (test ratio), -t/--threshold
    (decision boundary), -o/--output (save result CSV), -d/--diff.

    NOTE(review): argv is ignored; parse_args() reads sys.argv directly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # FIX: corrected help-text typo "boundry" -> "boundary".
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    # FIX: dropped redundant `default=None` — store_true already defaults
    # to False, and both are falsy in the `if write_output` check below.
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    parser.add_argument('-d', '--diff', action='store_true',
                        help="uses feature diffs")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))
    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    if args.diff:
        # Collapse each (col, col_rest) feature pair into a single diff
        # column. Columns 2..144 step 2 are assumed to be the paired
        # features; e.g. 'ql_0_0' pairs with 'ql_rest_0_0'.
        columns = df.columns
        for i in range(2, 146, 2):
            col = columns[i]
            if 'rest' in col:
                continue
            rest_col = col[:-4] + '_rest' + col[-4:]
            if rest_col not in columns:
                # FIX: replaced the unhelpful "wtf! %s" diagnostic.
                print('missing rest column for diff: %s' % rest_col)
            df[col] = df[col] - df[rest_col]
            df = df.drop([rest_col], axis=1)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicate feature columns (transpose so drop_duplicates operates
    # column-wise). NOTE(review): this round-trip casts dtypes to object.
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # Label 1 = "bad" query: the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    print("bad queries ratio: %.2f" %
          (100 * np.sum(labels) / labels.shape[0]))
    X, X_test, y, y_test = train_test_split(df, labels, stratify=labels,
                                            test_size=test_size,
                                            random_state=1)
    # FIX: removed unused local `sample_weight = X['freq']` — it was never
    # passed to the trainer or used anywhere else in the function.
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    test_queries = X_test['query']
    test_freq = X_test['freq']
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y.shape[0], np.sum(y), (100 * np.sum(y) / y.shape[0])))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y_test.shape[0], np.sum(y_test),
           (100 * np.sum(y_test) / y_test.shape[0])))
    # learn the model
    y_pred = train_lr(X, y, X_test, y_test, t, df.columns.values[2:-2])
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # QL baseline: in diff mode the rest column is gone, so test the sign
    # of the diff; otherwise compare the two QL scores directly.
    if args.diff:
        output['ql_label'] = X_test['ql_0_0'] < 0
    else:
        output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    # ML routing: use the trained model's prediction.
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and random baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv('%s%s_result.csv' %
                      ('../../data/cache_selection_structured/',
                       filename[:-4]), index=False)
def main(argv):
    """Older cache-selection experiment (positional CLI, coef print off).

    argv[0] is the feature CSV name, argv[1] the decision threshold, and an
    optional truthy argv[2] saves the per-query result CSV. Trains a
    balanced logistic regression inline (MinMax-scaled features) and
    compares against QL, random, and oracle baselines.
    """
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1),
                                            labels, stratify=labels,
                                            test_size=size, random_state=1)
    # Strip metadata columns; '2' / '100' hold cache / full-index MRR.
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'],
                         axis=1)
    # QL baseline: route to the full index where its QL score wins.
    # NOTE(review): 'ql_c.1' is pandas' dedup name for a second 'ql_c'
    # column — presumably the rest-of-collection score; verify the CSV.
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0],
                                              np.sum(y_test)))
    # FIX: corrected message typos "onez"/"trian".
    print("ones ratio in train set = %.2f" %
          (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set = %.2f" %
          (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    #sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    # Coefficient inspection intentionally disabled in this variant.
    #c = np.column_stack((df.columns.values[5:-1],
    #                     np.round(lr.coef_.flatten(), 2)))
    #print(c[c[:, 1].argsort()])
    # Apply the custom decision threshold t to the positive-class
    # probability instead of the default 0.5.
    y_prob = lr.predict_proba(X_test)
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    output['ql_label'] = ql
    # ML routing: take the full-index MRR where the model predicts 1.
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    # Oracle upper bound (true labels decide the routing).
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    # Random baseline.
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = output['2'].copy()
    # FIX: use .loc instead of chained assignment
    # (output['rand'][r == 1] = ...), which triggers
    # SettingWithCopyWarning and may silently fail to write.
    output.loc[r == 1, 'rand'] = output['100'][r == 1]
    analyze(output, '2', '100', 'TestFreq')
    # FIX: write the computed per-query `output`, not the raw input `df`
    # (the file is named *_result.csv and all newer variants save
    # `output`); also guard argv[2] so a two-argument invocation no
    # longer raises IndexError.
    if len(argv) > 2 and argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/',
                                           filename[:-4]), index=False)