コード例 #1
0
ファイル: ensemble.py プロジェクト: mtmiron/cdc-public
def try_list(datapath, modelargs, outfile="ensemble_list.txt"):
    """
    Score every ensemble in `modelargs` and append each result to `outfile`.

    `modelargs` is a collection of model descriptions (see predict()). It is
    consumed in place: entries are popped one at a time until it is empty.
    Each score is printed and written to `outfile` as "<score>: <ensemble>".
    """
    frame = preproc.preproc(pd.read_csv(datapath), lower=True)
    # The result is not used below; the call is kept so `outfile` is still
    # loaded up front.
    load_tried_set(outfile)

    # Ground truth is the raw CSV minus its header row.
    with open(datapath) as truth_file:
        ground_truth = truth_file.readlines()[1:]

    while modelargs:
        ensemble = modelargs.pop()

        # Drop repeated members so no model is counted twice in one ensemble.
        unique_members = set(ensemble)
        if len(unique_members) != len(ensemble):
            logger.warning("Truncating ensemble: {}".format(str(ensemble)))
            ensemble = list(unique_members)

        results = predict(list(ensemble), list(frame['text']), None)

        # Rebuild CSV-style rows: text, sex, age, predicted label.
        rows = [
            "{},{},{},{}".format(frame['text'][idx], frame['sex'][idx],
                                 frame['age'][idx], results[idx])
            for idx, _ in enumerate(results)
        ]

        en_score = score(ground_truth, rows)
        report = "{}: {}".format(en_score, str(ensemble))
        print(report)
        with open(outfile, "a") as log_file:
            log_file.write("{}: {}\n".format(en_score, str(ensemble)))
コード例 #2
0
ファイル: ensemble.py プロジェクト: mtmiron/cdc-public
def main(datapath: ("The CSV file to use as input", "option", "d",
                    str) = "valid5000.csv",
         permute: ("Permute the order of words in sentences randomly", "flag",
                   "p") = False,
         reverse: ("Reverse the order of words in sentences", "flag",
                   "r") = False,
         search:
         ("Do an exhaustive search to find the best ensemble of SEARCH models",
          "option", "s", int) = 0,
         models: ("Path to a file with a list of models", "option", "m") = ""):
    # Command-line entry point with three modes:
    #   * search > 0   -> score combinations via try_permutations(), then exit
    #   * models != "" -> score the ensembles listed in that file, then exit
    #   * otherwise    -> predict with the default ensemble, write the
    #                     solution and print its score.
    # Documented with comments rather than a docstring so that any help text
    # derived from this function stays unchanged.
    global prediction_cache_file, prediction_cache

    # The prediction cache lives next to the input file.
    prediction_cache_file = datapath + ".prediction_cache.pickle"
    if os.path.exists(prediction_cache_file):
        with open(prediction_cache_file, "rb") as cache_handle:
            prediction_cache = pickle.load(cache_handle)
        logger.warning("Loaded prediction cache from {}".format(
            prediction_cache_file))

    if search > 0:
        try_permutations(datapath, num_ensembles=search, outfile="tmp.txt")
        sys.exit()

    if models != "":
        # Only score ensembles that are not already recorded in tmp.txt.
        finished = truncate_ensembles(load_tried_set("tmp.txt"))
        requested = truncate_ensembles(load_tried_set(models))
        ensembles = requested - finished
        logger.info("List of {} ensembles to try: {}".format(
            len(ensembles), ensembles))
        try_list(datapath, ensembles, outfile="tmp.txt")
        sys.exit()

    ensemble = build_ensemble()

    # Two preprocessed views of the same CSV: the default one and one
    # preprocessed with nonumbers=True.
    frame = pd.read_csv(datapath)
    frame_nonums = pd.read_csv(datapath)
    frame = preproc.preproc(frame, lower=True)  #, spelling=True)
    frame_nonums = preproc.preproc(frame_nonums, lower=True, nonumbers=True)

    if permute:
        results = permute_predict(ensemble, frame)
    elif reverse:
        results = reverse_predict(ensemble, frame)
    else:
        results = predict(ensemble, list(frame['text']),
                          list(frame_nonums['text']))

    # CSV-style rows: text, sex, age, predicted label.
    lines = [
        "{},{},{},{}".format(frame['text'][idx], frame['sex'][idx],
                             frame['age'][idx], results[idx])
        for idx, _ in enumerate(results)
    ]
    write_solution(lines)

    # Score against the raw input file, skipping its header row.
    with open(datapath) as truth_file:
        ground_truth = truth_file.readlines()[1:]
    print("Score: {}".format(score(ground_truth, lines)))
コード例 #3
0
ファイル: fit_refactor.py プロジェクト: Ryan-M3/squeeze
def fit_score(word, pos, batch_pmass):
    """
    Call `score` for `word`/`pos` using the module-level weights.

    Forwards `pos` and `word` (in that order), the module-level
    `english_pmass`, the given `batch_pmass`, and six entries of the
    module-level `wts` mapping, so callers do not have to pass them all.
    """
    # Order matters: the weights are passed positionally to `score`.
    weight_names = (
        "english_freq",
        "batch_freq",
        "starts_word",
        "repetitions",
        "avoid_vowels",
        "consecutives",
    )
    return score(pos, word, english_pmass, batch_pmass,
                 *(wts[name] for name in weight_names))
コード例 #4
0
ファイル: tfpredict.py プロジェクト: mtmiron/cdc-public
def main(modelpath, datapath):
    # Load a saved model, predict over the CSV at `datapath`, and print the
    # score of the predictions against that same file.
    #
    # modelpath: location handed to Model.load_model().
    # datapath:  CSV file; read once with pandas for prediction and once as
    #            raw lines (header skipped) as the ground truth for scoring.
    #
    # Documented with comments rather than a docstring so that any help text
    # derived from this function stays unchanged.

    # Function-scope import: inside this function the name `main` refers to
    # the imported module, whose `score` is used below.
    import main
    logger.info("Loading model")
    model = Model.load_model(modelpath)
    df = pd.read_csv(datapath)
    logger.info("Predicting")
    ret, _ = model.predict_df(df)

    # The leftover pdb.set_trace() breakpoint was removed: it stopped every
    # run at an interactive prompt.

    # Each prediction is shifted by one before scoring. Index-based access is
    # kept because the concrete type of `ret` is not visible here.
    solution = [ret[i] + 1 for i in tqdm.tqdm(range(len(ret)))]

    with open(datapath) as f:
        gt = f.readlines()[1:]  # drop the CSV header row
    print("Score: {}".format(main.score(gt, solution)))
コード例 #5
0
def batch_fit():
    """
    Train a two-weight-layer classifier with mini-batches and report scores.

    Data comes from get_Data(). Parameters (W1, b1, W2, b2) are updated with
    a momentum-smoothed step scaled by a running average of squared
    gradients. The cost is sampled during training and plotted at the end,
    and the score on the prediction (test) split is printed. Returns None.

    The network itself (forward, derivative_*, cost, score, error_rate) is
    defined elsewhere; see those helpers for the exact model and loss.
    """

    # These placeholders are overwritten immediately by get_Data() below.
    X = []
    Y = []
    prediction_data = []
    prediction_labels = []

    X, Y, prediction_data, prediction_labels = get_Data()

    print('Train data length ={0} , Test data length ={1}'.format(
        len(X), len(prediction_data)))
    #print(X)
    # Scale inputs by 1/255 (presumably 8-bit pixel values; confirm against
    # get_Data()).
    X = np.array(X) / 255.0
    Y = np.array(Y)

    prediction_data = np.array(prediction_data) / 255.0
    prediction_labels = np.array(prediction_labels)

    # X must be 2-D: N samples with D features each.
    N, D = X.shape

    #print ('Dshape {0}'.format(D))
    M = 100  # width of the first weight layer (W1 is D x M)
    K = 7  # number of output columns / classes (W2 is M x K)

    # randomly initialize weights
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M + K)
    b2 = np.zeros(K)

    # NOTE: 10e-7 == 1e-6, so this evaluates to 5e-6 (not 5e-7).
    learning_rate = 5 * 10e-7
    costs = []
    best_validation_error = 1

    batch_sz = 500
    # Only full batches are visited; any N % batch_sz leftover samples are
    # skipped in each epoch.
    n_batches = int(N / batch_sz)

    # Momentum: running update terms, one per parameter.
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    reg = 0.01  # coefficient of the `reg * parameter` term added to each gradient

    # RMS-style caches: running averages of squared gradients, started at 1.
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10  # avoids division by zero in the scaled update

    #print ('{0} {1} {2}'.format(N,n_batches,D))

    # 1000 epochs; the data is reshuffled at the start of each one.
    for m in range(1000):
        tmpX, tmpY = shuffle(X, Y)
        for j in range(n_batches):

            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz)]

            n = len(x)  # actual number of rows in this batch

            # One-hot encode the batch labels into an (n, K) target matrix.
            T = np.zeros((n, K))
            for i in range(n):
                T[i, y[i]] = 1

            output, hidden = forward(x, W1, b1, W2, b2)

            # Gradients from the helper functions plus the reg term.
            gW2 = derivative_w2(hidden, T, output) + reg * W2
            gb1 = derivative_b1(T, output, W2, hidden) + reg * b1
            gW1 = derivative_w1(x, W2, hidden, T, output) + reg * W1
            gb2 = derivative_b2(T, output) + reg * b2

            # Momentum-smoothed steps.
            dW1 = dW1 * mu + learning_rate * gW1
            dW2 = dW2 * mu + learning_rate * gW2
            db1 = db1 * mu + learning_rate * gb1
            db2 = db2 * mu + learning_rate * gb2

            # Update the running averages of the squared gradients.
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1

            # The scaled step is ADDED to each parameter.
            # NOTE(review): this assumes the derivative_* helpers return the
            # ascent direction; confirm their sign convention, and note that
            # the reg term is then added rather than subtracted.
            W2 += (dW2 / (np.sqrt(cache_W2) + eps))
            b1 += (db1 / (np.sqrt(cache_b1) + eps))
            W1 += (dW1 / (np.sqrt(cache_W1) + eps))
            b2 += (db2 / (np.sqrt(cache_b2) + eps))

            #print('i value ={0}'.format(i))

            # `n_batches / 2` is true (float) division: this reports twice
            # per epoch when n_batches is even, but only at j == 0 when odd.
            if j % (n_batches / 2) == 0:
                c = cost(T, output)
                P = np.argmax(output, axis=1)
                r = score(y, P)
                # Despite the variable name below, this error is measured on
                # the current training mini-batch, not on a held-out set.
                e = error_rate(y, P)
                if e < best_validation_error:
                    best_validation_error = e
                print('Cost = {0} , Training Score = {1} ,Error ={2}'.format(
                    c, r, e))
                costs.append(c)

    print("best_validation_error:", best_validation_error)

    plt.plot(costs)
    plt.show()

    # Evaluate the trained parameters on the prediction (test) split.

    output, hidden = forward(prediction_data, W1, b1, W2, b2)
    P = np.argmax(output, axis=1)
    print('Test Score {0}'.format(score(prediction_labels, P)))  # Final output
コード例 #6
0
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str)
    args = parser.parse_args()
    n_books, n_libs, n_days, scores_of_books, libs = load_libraries(args.file)
    scores_of_books_dict = dict(enumerate(scores_of_books))

    # print(n_books)
    # print(n_libs)
    # print(n_days)
    # print(book_scores)
    # print(libs[:10])

    # lib = libs[0]
    # n = lib[0]
    # signup_days = lib[1]
    # bs_per_day = lib[2]
    # bs = lib[3]
    # score = score_library(bs, signup_days, bs_per_day, [], scores_of_books, n_days)
    # print(score)

    library_signup_times = [lib[1] for lib in libs]
    library_ship_capacities = [lib[2] for lib in libs]
    solution = order_libraries(libs, scores_of_books_dict, n_days)
    print(solution)
    print(scores_of_books)
    scores_of_books = np.asarray(scores_of_books, dtype=np.uint)
    s = score(solution, n_days, scores_of_books,
              library_signup_times,
              library_ship_capacities)
    print(s)
コード例 #7
0
def hello():
    """Return the result of calling `score` with no arguments."""
    outcome = score()
    return outcome
コード例 #8
0
with open("responses.json") as file:
    data = json.load(file)

d1 = dict(list(data.items())[:len(data) // 2])
d2 = dict(list(data.items())[len(data) // 2:])

propscrDict = {}
const1 = 0
const2 = 0
while len(d1) != 0 and len(d2) != 0:
    scoreDict = {}
    for men in d2:
        scoreDict[men] = []

        for women in d1:
            value = score(d2[men], d1[women])
            if value > 0:
                scoreDict[men].append((women, value))

        scoreDict[men].sort(key=lambda x: x[1], reverse=True)

    for proposer in scoreDict:
        if len(scoreDict[proposer]) == 0:
            continue
        if scoreDict[proposer][0][0] not in propscrDict.keys():
            propscrDict[scoreDict[proposer][0][0]] = (
                proposer, scoreDict[proposer][0][1])
        else:
            if scoreDict[proposer][0][1] > propscrDict[scoreDict[proposer][0]
                                                       [0]][1]:
                propscrDict[scoreDict[proposer][0][0]] = (
コード例 #9
0
ファイル: ensemble.py プロジェクト: mtmiron/cdc-public
def try_permutations(datapath, num_ensembles=8, outfile="tmp.txt"):
    """
    Score every combination of `num_ensembles` checkpoints in random order.

    Checkpoints are discovered with the hard-coded glob patterns below. Every
    combination is scored against the labels in `datapath`, and each result
    is appended to `outfile` as "<score>: <ensemble>". Combinations already
    reported by load_tried_set(outfile) are skipped.

    datapath:      CSV file; its last comma-separated field on each data row
                   is parsed as the integer label.
    num_ensembles: number of checkpoints per combination.
    outfile:       file that results are appended to.

    Returns None once every combination has been visited. Calls sys.exit()
    if a checkpoint path matches none of the known model kinds.
    """
    DIRS = [
        "cp/bert/*/pytorch_model.bin",
        "cp/roberta/*/pytorch_model.bin",
        "cp/xlnet/*/pytorch_model.bin",
        "cp/xlnet/09/checkpoint-21000",
        "cp/xlnet/07/checkpoint-42500",
        "cp/distilbert/*/pytorch_model.bin",
    ]
    # Checked in this order: "roberta" and "distilbert" both contain the
    # substring "bert", so they must be tested before plain "bert".
    MODEL_KINDS = ("roberta", "distilbert", "bert", "xlnet")

    # Keys are frozensets so that two ensembles with the same members always
    # compare equal. A tuple built from a set depends on set iteration order
    # and can let duplicates slip through.
    tried = load_tried_set(outfile)
    score_hash = {frozenset(models): 0.0 for models in tried}

    df = pd.read_csv(datapath)
    df = preproc.preproc(df, lower=True)
    with open(datapath) as f:
        # Skip the header row and blank lines. int() ignores surrounding
        # whitespace, so the newline needs no manual slicing. The previous
        # `[:-1]` slice chopped a digit off the final label when the file did
        # not end with a newline.
        gt = [
            int(row.rsplit(',', 1)[-1]) for row in f.readlines()[1:]
            if row.strip()
        ]

    full_list = []
    for pattern in DIRS:
        full_list.extend(
            match.replace("/pytorch_model.bin", "")
            for match in glob.glob(pattern))

    logger.warning("Models to permute: {}".format(str(full_list)))
    combo_list = list(itertools.combinations(full_list, num_ensembles))
    random.shuffle(combo_list)
    df_text_list = list(df['text'])
    already_done = sum(1 for models in tried if len(models) == num_ensembles)
    progbar = tqdm.tqdm(
        combo_list,
        total=len(combo_list) - already_done,
        desc="HiScore: 0.0")
    hiscore = 0.0

    for paths in progbar:
        members = []
        for path in paths:
            path = path.strip()
            kind = next((k for k in MODEL_KINDS if k in path), None)
            if kind is None:
                logger.error("Epic fail, programmer.")
                sys.exit()
            members.append((kind, path, ""))

        key = frozenset(members)
        if key in score_hash:
            continue  # this ensemble has already been scored

        results = predict(tuple(key), df_text_list, None)

        en_score = score(gt, results)
        score_hash[key] = en_score
        if en_score > hiscore:
            hiscore = en_score
            progbar.set_description("HiScore: {}".format(hiscore))

        # Append after every ensemble so an interrupted run keeps the results
        # written so far.
        with open(outfile, "a") as f:
            f.write("{}: {}\n".format(en_score, str(members)))
コード例 #10
0
 def test2(self):
     # main.score is expected to return 400 for this input.
     values = [4, 4, 4, 3, 3]
     expected = 400
     self.assertEqual(main.score(values), expected)
コード例 #11
0
 def test1(self):
     # main.score is expected to return 0 for this input.
     values = [2, 3, 4, 6, 2]
     expected = 0
     self.assertEqual(main.score(values), expected)
コード例 #12
0
 def test3(self):
     # main.score is expected to return 450 for this input.
     values = [2, 4, 4, 5, 4]
     expected = 450
     self.assertEqual(main.score(values), expected)
コード例 #13
0
ファイル: xtx_0.py プロジェクト: RuralCat/xtc
# Number of leading rows used for training; the remaining rows are the test split.
nb_train = 2000000

#%%
# Columns 0..59 are the model inputs and column 60 is the regression target.
x0 = data_array[:nb_train, 0:60]  # - data_array[:nb_train, 30:60]
train_y = data_array[:nb_train, 60]

x1 = data_array[nb_train:, 0:60]  # - data_array[nb_train:, 30:60]
test_y = data_array[nb_train:, 60]

# NOTE(review): if this is scikit-learn's LinearRegression, the `normalize`
# argument was deprecated in 1.0 and removed in 1.2. Confirm the pinned
# version before upgrading.
reg = LinearRegression(normalize=True).fit(x0, train_y)

#%%
# Predict on both splits and report each with the `score` helper.
train_pred = reg.predict(x0)
test_pred = reg.predict(x1)

print('train score: {:.5f}'.format(score(train_y, train_pred)))
print('test score: {:.5f}'.format(score(test_y, test_pred)))

#%% [markdown]
# ### linear model with normalized data

#%%
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('normed_data.pickle', 'rb') as f:
    normed_data = pickle.load(f)

#%%
# Same split and column layout as above, applied to the normalized array.
# `nb_train` and `train_y` are rebound here.
nb_train = 2000000
# train data
train_x = normed_data[:nb_train, :60]
train_y = normed_data[:nb_train, 60]
# test data