コード例 #1
0
def main(argv):
    """Train a cache-selection classifier on a stratified query split.

    Loads the structured features CSV, labels a query 1 ("bad") when the
    full index MRR beats the cache MRR, trains via train_lr, and compares
    routing strategies (ql, ml, random, oracle best) with analyze().
    Optionally writes the per-query results CSV next to the input data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # typo fix: "boundry" -> "boundary" in the help text
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))

    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicated columns (transpose, dedupe rows, transpose back).
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # A query is "bad" (1) when the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    print("bad queries ratio: %.2f" % (100 * np.sum(labels) / labels.shape[0]))
    X, X_test, y, y_test = train_test_split(df, labels, stratify=labels,
                                            test_size=test_size,
                                            random_state=1)
    # Set metadata columns aside before dropping them from the features.
    # (The unused sample_weight local from the original was removed.)
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    test_queries = X_test['query']
    test_freq = X_test['freq']
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f"
          % (y.shape[0], np.sum(y), (100 * np.sum(y) / y.shape[0])))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f"
          % (y_test.shape[0], np.sum(y_test),
             (100 * np.sum(y_test) / y_test.shape[0])))
    # learn the model
    y_pred = train_lr(X, y, X_test, y_test, t, df.columns.values[2:-2])
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # ql baseline: route to the full index when the rest-QL score wins.
    output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and a random-routing baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv('%s%s_result.csv' %
                      ('../../data/cache_selection_structured/',
                       filename[:-4]), index=False)
コード例 #2
0
def main(argv):
    """Train a cache-selection classifier on a viewcount-weighted split.

    Unlike the stratified variant, the train/test split is drawn by
    sampling per-query viewcounts (sample_viewcount), so the reported
    set sizes and bad-query ratios are frequency-weighted. Trains via
    train_lr and compares routing strategies with analyze().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # typo fix: "boundry" -> "boundary" in the help text
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))
    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicated columns (transpose, dedupe rows, transpose back).
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # A query is "bad" (1) when the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    viewcount = df['freq']
    print("bad queries ratio (with freq): %.2f" %
          (100 * np.sum(labels * viewcount) / viewcount.sum()))
    # Sample a viewcount-proportional share into the test side; whatever
    # viewcount remains for a query keeps it in the training side.
    test_viewcount = sample_viewcount(viewcount,
                                      int(test_size * viewcount.sum()))
    train_viewcount = viewcount - test_viewcount
    X = df[train_viewcount > 0].copy()
    y = labels[train_viewcount > 0].copy()
    X_test = df[test_viewcount > 0].copy()
    y_test = labels[test_viewcount > 0].copy()
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    # Set metadata columns aside before dropping them from the features.
    test_queries = X_test['query']
    test_freq = test_viewcount
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (train_viewcount.sum(), (labels * train_viewcount).sum(),
           (100 * (labels * train_viewcount).sum() / train_viewcount.sum())))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (test_viewcount.sum(), (labels * test_viewcount).sum(),
           (100 * (labels * test_viewcount).sum() / test_viewcount.sum())))
    # learn the model (this variant calls train_lr without feature names)
    y_pred = train_lr(X, y, X_test, y_test, t)
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # ql baseline: route to the full index when the rest-QL score wins.
    output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and a random-routing baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv(
            '%s%s_result.csv' %
            ('../../data/cache_selection_structured/', filename[:-4]),
            index=False)
コード例 #3
0
def main(argv):
    """Train a balanced LR cache-selection model (legacy '2'/'100' format).

    argv: [filename, threshold, save_flag]. Loads the features CSV,
    splits stratified on its 'label' column, min-max scales the features,
    trains a class-balanced logistic regression, and compares
    ql/ml/random/best routing strategies with analyze().
    """
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1),
                                            labels, stratify=labels,
                                            test_size=size, random_state=1)
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    # Set metadata columns aside before dropping them from the features.
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'],
                         axis=1)
    # ql baseline: route to the full index when the rest-QL score wins.
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0],
                                              np.sum(y_test)))
    # typo fixes in messages: "onez"/"trian" -> "ones"/"train"
    print("ones ratio in train set =  %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set =  %.2f" %
          (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    # Show feature coefficients sorted by weight.
    c = np.column_stack((df.columns.values[5:-1],
                         np.round(lr.coef_.flatten(), 2)))
    print(c[c[:, 1].argsort()])
    # Apply the decision threshold t to the positive-class probability.
    y_prob = lr.predict_proba(X_test)
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    # BUG FIX: ql_label previously stored the ql MRR values; store the
    # boolean routing decision instead (matches the other variants).
    output['ql_label'] = ql_pred
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    # Oracle routing by the true label.
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    # Random-routing baseline; np.where avoids the chained-assignment
    # write (output['rand'][r == 1] = ...) that may modify a copy.
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = np.where(r == 1, output['100'], output['2'])
    analyze(output, '2', '100', 'TestFreq')
    # BUG FIX: save the computed results (output), not the raw input
    # frame, and guard against a missing save flag in argv.
    if len(argv) > 2 and argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/',
                                           filename[:-4]), index=False)
コード例 #4
0
def main(argv):
    """Train a cache-selection classifier, optionally on feature diffs.

    With --diff, each paired feature column is replaced in place by
    (col - col_rest) and the *_rest columns are dropped before training.
    Otherwise identical to the stratified variant: label queries "bad"
    when the full index beats the cache, train via train_lr, and compare
    routing strategies with analyze().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help="input features file")
    parser.add_argument('-s', '--split', type=float,
                        help="test split ratio", default=0.33)
    # typo fix: "boundry" -> "boundary" in the help text
    parser.add_argument('-t', '--threshold', type=float,
                        help="decision boundary", default=0.5)
    # store_true already defaults to False; the redundant default=None
    # was dropped (both are falsy, so behavior is unchanged).
    parser.add_argument('-o', '--output', action='store_true',
                        help="save the output")
    parser.add_argument('-d', '--diff', action='store_true',
                        help="uses feature diffs")
    args = parser.parse_args()
    filename = args.filename
    test_size = args.split
    t = args.threshold
    write_output = args.output
    print('running swiki with test size %.2f and threshold %.2f' %
          (test_size, t))

    df = pd.read_csv("../../data/cache_selection_structured/" + filename)
    if args.diff:
        columns = df.columns
        # Feature columns come in (col, col_rest) pairs within [2, 146).
        for i in range(2, 146, 2):
            col = columns[i]
            if 'rest' in col:
                continue
            rest_col = col[:-4] + '_rest' + col[-4:]
            if rest_col not in columns:
                # BUG FIX: previously fell through and raised KeyError on
                # df[rest_col]; warn and skip the unpaired column instead.
                print('wtf! %s' % rest_col)
                continue
            df[col] = df[col] - df[rest_col]
            df = df.drop([rest_col], axis=1)
    print("df size: " + str(df.shape))
    df = df.fillna(0)
    # Drop duplicated columns (transpose, dedupe rows, transpose back).
    df = df.T.drop_duplicates().T
    print("df size after dedup: " + str(df.shape))
    # A query is "bad" (1) when the full index outperforms the cache.
    labels = np.where(df['full'] > df['cache'], 1, 0)
    print("bad queries ratio: %.2f" % (100 * np.sum(labels) / labels.shape[0]))
    X, X_test, y, y_test = train_test_split(df,
                                            labels,
                                            stratify=labels,
                                            test_size=test_size,
                                            random_state=1)
    # Set metadata columns aside before dropping them from the features.
    # (The unused sample_weight local from the original was removed.)
    X = X.drop(['query', 'freq', 'cache', 'full'], axis=1)
    test_queries = X_test['query']
    test_freq = X_test['freq']
    subset_mrr = X_test['cache']
    db_mrr = X_test['full']
    X_test = X_test.drop(['query', 'freq', 'cache', 'full'], axis=1)
    print("train set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y.shape[0], np.sum(y), (100 * np.sum(y) / y.shape[0])))
    print("test set size, bad queries and bad query ratio: %d, %d, %.2f" %
          (y_test.shape[0], np.sum(y_test),
           (100 * np.sum(y_test) / y_test.shape[0])))
    # learn the model
    y_pred = train_lr(X, y, X_test, y_test, t, df.columns.values[2:-2])
    output = pd.DataFrame()
    output['query'] = test_queries
    output['TestFreq'] = test_freq
    output['cache'] = subset_mrr
    output['full'] = db_mrr
    output['Label'] = y_test
    # ql baseline: with --diff the rest score is folded into ql_0_0, so
    # the comparison is against zero; otherwise compare the two columns.
    if args.diff:
        output['ql_label'] = X_test['ql_0_0'] < 0
    else:
        output['ql_label'] = X_test['ql_0_0'] < X_test['ql_rest_0_0']
    output['ql'] = np.where(output['ql_label'] == 1, db_mrr, subset_mrr)
    output['ml_label'] = pd.Series(y_pred, index=output.index)
    output['ml'] = np.where(output['ml_label'] == 1, db_mrr, subset_mrr)
    # Oracle upper bound and a random-routing baseline.
    output['best'] = np.maximum(subset_mrr, db_mrr)
    r = np.random.randint(0, 2, output['cache'].size)
    output['rand'] = np.where(r == 1, output['full'], output['cache'])
    analyze(output, 'cache', 'full', 'TestFreq')
    if write_output:
        output.to_csv(
            '%s%s_result.csv' %
            ('../../data/cache_selection_structured/', filename[:-4]),
            index=False)
コード例 #5
0
def main(argv):
    """Train a balanced LR cache-selection model (legacy '2'/'100' format).

    argv: [filename, threshold, save_flag]. Same pipeline as the other
    legacy variant but with the coefficient dump disabled. Splits
    stratified on the 'label' column, min-max scales, trains a
    class-balanced logistic regression, and compares ql/ml/random/best
    routing strategies with analyze().
    """
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1),
                                            labels, stratify=labels,
                                            test_size=size, random_state=1)
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    # Set metadata columns aside before dropping them from the features.
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'],
                         axis=1)
    # ql baseline: route to the full index when the rest-QL score wins.
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0],
                                              np.sum(y_test)))
    # typo fixes in messages: "onez"/"trian" -> "ones"/"train"
    print("ones ratio in train set =  %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set =  %.2f" %
          (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    # Apply the decision threshold t to the positive-class probability.
    y_prob = lr.predict_proba(X_test)
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    # BUG FIX: ql_label previously stored the ql MRR values; store the
    # boolean routing decision instead (matches the other variants).
    output['ql_label'] = ql_pred
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    # Oracle routing by the true label.
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    # Random-routing baseline; np.where avoids the chained-assignment
    # write (output['rand'][r == 1] = ...) that may modify a copy.
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = np.where(r == 1, output['100'], output['2'])
    analyze(output, '2', '100', 'TestFreq')
    # BUG FIX: save the computed results (output), not the raw input
    # frame, and guard against a missing save flag in argv.
    if len(argv) > 2 and argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/',
                                           filename[:-4]), index=False)