Example #1
def user_clustering():
    # Collect the distinct user feature vectors from the first log file.
    users = []
    with open(FILE_DIR + file_path[0]) as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    # Fit k-means on the collected user features and persist the model.
    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(users)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
Example #2
def user_clustering():
    # Same as Example #1, but reads a fixed CSV path instead of FILE_DIR.
    users = []
    with open('../analytics/stdev/usercluster200.csv') as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    # Fit k-means on the collected user features and persist the model.
    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(users)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
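
Both variants above assume a UserCluster class defined elsewhere in the repository. The sketch below is a minimal, hypothetical version built on scikit-learn's KMeans; only the class name, the constructor argument, and the fit / predict_cluster calls are taken from the surrounding examples, the internals are an assumption:

import joblib
from sklearn.cluster import KMeans

class UserCluster:
    # Thin wrapper that clusters user feature vectors with k-means.
    def __init__(self, n_clusters):
        self.model = KMeans(n_clusters=n_clusters)

    def fit(self, features):
        # features: a list of user feature vectors, one per distinct user.
        self.model.fit(features)

    def predict_cluster(self, feature):
        # KMeans.predict expects a 2D array-like (a list of vectors), which
        # is why callers take element [0] of the result.
        return self.model.predict(feature)
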
Example #3
def run_enviroment(algorithms, cluster_model):
    ite = 0
    for file_name in file_path:
        with open(FILE_DIR + file_name) as f:
            # Per-algorithm snapshots of cumulative reward/impressions, used
            # to compute the click-through rate over the most recent window.
            old = {'LinearedUCB': 0, 'OriginalUCB': 0}
            old_imp = {'LinearedUCB': 0, 'OriginalUCB': 0}
            ctr_list = {'LinearedUCB': [], 'OriginalUCB': []}
            click_count = 0
            click_eq_count = 0
            for line in f:
                _, click_article_id, click, user_data, article_pool = InputData.split_data(
                    line)
                click_count += click
                # Map the raw user feature vector to its cluster id.
                userID = cluster_model.predict_cluster(user_data)[0]
                for name, alg in algorithms.items():
                    decide_id = alg.decide(userID, user_data, article_pool)
                    # Replay evaluation: only log lines where the algorithm
                    # chose the served article count toward the statistics.
                    if evaluate(click_article_id, decide_id, click, name, ite,
                                line):
                        ctr_list[name].append((reward[name] - old[name]) /
                                              (count[name] - old_imp[name]))
                        alg.update(userID, user_data, article_pool[decide_id],
                                   click)
                    if count[name] % 2000 == 0:
                        # Report and reset the windowed statistics every 2000
                        # matched impressions.
                        print(name, np.mean(ctr_list[name]),
                              reward[name] - old[name],
                              count[name] - old_imp[name], reward[name],
                              count[name])
                        old[name] = reward[name]
                        old_imp[name] = count[name]
                        click_count = 0
                        click_eq_count = 0
                        ctr_list[name] = []
                        # Bump count so the report does not repeat until the
                        # next full window.
                        count[name] += 1
                ite += 1
    return
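
Example #3 depends on an evaluate helper and global reward / count dictionaries that this excerpt does not show. Below is a minimal sketch of the replay-style check the calls imply; only the signature and the "count only matched impressions" behaviour come from the example, the bookkeeping details are an assumption:

from collections import defaultdict

# Assumed globals: cumulative clicks and matched impressions per algorithm.
reward = defaultdict(int)
count = defaultdict(int)

def evaluate(click_article_id, decide_id, click, name, ite, line):
    # Replay evaluation: a logged event is usable only when the algorithm
    # chose the article that was actually displayed to the user.
    if decide_id != click_article_id:
        return False
    count[name] += 1
    reward[name] += click
    return True
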
Example #4
def run_enviroment(algorithms, cluster_model):
    iteration = 0
    # Per-algorithm bookkeeping: impression/click counters, windowed CTR
    # history, reward deltas, and (optional) per-algorithm log writers.
    ite = {}
    click_count = {}
    old = {}
    old2 = {}
    old_imp = {}
    old_imp2 = {}
    ctr = {}
    ctr_list = {}
    rew = {}
    writer = {}
    now = datetime.datetime.today()

    for name in algorithms.keys():
        old[name] = 0
        old_imp[name] = 0
        old2[name] = 0
        old_imp2[name] = 0
        ctr_list[name] = [1.]
        ctr[name] = []
        rew[name] = [0]
        ite[name] = 0
        click_count[name] = 1
        # writer[name] = open('../data/'+name+'_experiment_log_'+str(now)+'.csv', 'w')

    for file_name in file_path:
        with open(FILE_DIR + file_name) as f:
            for line in f:
                _, click_article_id, click, user_data, article_pool = InputData.split_data(
                    line)
                # Restrict the candidate pool, keeping the served article.
                article_pool = article_pool_10(article_pool, click_article_id)
                userID = cluster_model.predict_cluster(user_data)[0]
                for name, alg in algorithms.items():
                    ite[name] += 1
                    click_count[name] += click
                    decide_id = alg.decide_try(userID, user_data, article_pool)
                    # Warm-up for OriginalUCB: pick a random article with a
                    # probability that decays linearly from 1.0 to 0.5 over
                    # the first 150000 iterations.
                    if (name == 'OriginalUCB' and iteration < 150000 and
                            random.random() < (1. - iteration * 0.5 / 150000)):
                        decide_id = random.choice(list(article_pool.keys()))
                    random_ctr = click_count[name] / ite[name]
                    cumulated_ctr = reward[name] / count[name]
                    relative_ctr = cumulated_ctr / random_ctr

                    if iteration % 2000 == 0:
                        # Snapshot the windowed relative CTR, then reset the window.
                        ctr[name].append(np.mean(ctr_list[name]))
                        # writer[name].write(str(float('{:.5f}'.format(np.mean(ctr[name]))))+','+str(float('{:.5f}'.format(np.mean(ctr_list[name]))))+','+str(float('{:.5f}'.format(cumulated_ctr)))+'\n')
                        if iteration % 10000 == 0:
                            print(
                                iteration, name,
                                float('{:.5f}'.format(np.mean(ctr[name]))),
                                float('{:.5f}'.format(np.mean(
                                    ctr_list[name]))),
                                reward[name] - old2[name],
                                count[name] - old_imp2[name], reward[name],
                                count[name],
                                float('{:.5f}'.format(cumulated_ctr)),
                                float('{:.5f}'.format(relative_ctr)))
                            old2[name] = reward[name]
                            old_imp2[name] = count[name]
                        rew[name].append(reward[name] - old[name])
                        old[name] = reward[name]
                        old_imp[name] = count[name]
                        ctr_list[name] = [1.]
                        count[name] += 1

                    # Replay evaluation: update only when the algorithm chose
                    # the article that was actually served.
                    if evaluate(click_article_id, decide_id, click, name, ite,
                                line):
                        ctr_list[name].append(relative_ctr)
                        alg.update(userID, user_data, article_pool[decide_id],
                                   click, decide_id)
                        if name == 'CLUB':
                            n_components = alg.updateGraphClusters(
                                userID, 'False')

                iteration += 1
    gragh(ctr, rew)
    for name, alg in algorithms.items():
        print(name, np.mean(ctr[name]))
        # writer[name].close()
        alg.memory_item_num()
    return
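
Example #4 also calls article_pool_10, which is not shown. Judging from how the result is used (a dict of candidate articles that must still contain the served article), it plausibly subsamples the pool to 10 items; the implementation below is a guess, not the repository's code:

import random

def article_pool_10(article_pool, click_article_id):
    # Subsample the candidate pool to 10 articles, always keeping the
    # article that was actually served so replay evaluation stays possible.
    others = [aid for aid in article_pool if aid != click_article_id]
    keep = random.sample(others, min(9, len(others)))
    keep.append(click_article_id)
    return {aid: article_pool[aid] for aid in keep}
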
Example #5
    def predict_cluster(self, feature):
        # Return the k-means cluster id(s) for the given user feature(s).
        return self.model.predict(feature)

    def model_load(self, model_file):
        return joblib.load('/Users/chan-p/GitHub/ReserchBandit/system/model/' +
                           model_file)


if __name__ == '__main__':
    n = 1000
    km = UserCluster(n)
    users = []
    with open('../../../Desktop/R6/ydata-fp-td-clicks-v1_0.20090501') as f:
        for line in f:
            timestamp, click_article_id, click, user_data, article_pool = InputData.split_data(
                line)
            users.append(user_data)
            # Fit and checkpoint a model once 3,000,000 users have been read.
            if len(users) == 3000000:
                km.fit(users)
                joblib.dump(
                    km,
                    './model/model' + str(n) + '_' + str(len(users)) + '.pkl')
        # After the whole file is read, fit a second model on all users.
        am = UserCluster(n)
        am.fit(users)
        joblib.dump(am,
                    './model/model' + str(n) + '_' + str(len(users)) + '.pkl')
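
For completeness, a short hypothetical usage sketch: load one of the models dumped above and map a raw log line to a cluster id. The model file name is illustrative, and FILE_DIR, file_path, and InputData are the globals from the earlier examples:

import joblib

km = joblib.load('./model/model1000_3000000.pkl')
with open(FILE_DIR + file_path[0]) as f:
    for line in f:
        _, _, _, user_data, _ = InputData.split_data(line)
        # predict_cluster returns an array of ids; take the first element.
        print(km.predict_cluster(user_data)[0])
        break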