Example #1
def test_ranking(filename):
    vectors = load_vectors(filename)
    normalize(vectors)
    with codecs.open("data/rank.txt", "r", encoding="utf-8") as source:
        lines = [line.strip().split() for line in source.readlines()]
    actual = [line[1:] for line in lines]
    predicted = [[x[0] for x in nearest(vectors, line[0], normal=True)] for line in lines]
    print ml_metrics.mapk(actual, predicted, k=100)
 def test_mapk(self):
     self.assertAlmostEqual(metrics.mapk([range(1,5)],[range(1,5)],3), 1.0)
     self.assertAlmostEqual(metrics.mapk([[1,3,4],[1,2,4],[1,3]],
         [range(1,6),range(1,6),range(1,6)], 3), 0.685185185185185)
     self.assertAlmostEqual(metrics.mapk([range(1,6),range(1,6)],
         [[6,4,7,1,2],[1,1,1,1,1]], 5), 0.26)
     self.assertAlmostEqual(metrics.mapk([[1,3],[1,2,3],[1,2,3]],
         [range(1,6),[1,1,1],[1,2,1]], 3), 11.0/18)
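
Every example on this page ultimately calls ml_metrics.mapk(actual, predicted, k), where actual holds one list of relevant items per query and predicted holds one ranked list per query. A minimal self-contained sketch of that contract (the toy numbers below are ours, purely for illustration):

import ml_metrics

# one list of ground-truth items per query/user
actual = [[1, 2, 3], [4]]
# one ranked prediction list per query/user
predicted = [[1, 5, 2], [6, 4, 7]]

# AP@3 is (1/1 + 2/3) / 3 for the first query and 1/2 for the second,
# so MAP@3 is their mean (about 0.528)
print(ml_metrics.mapk(actual, predicted, k=3))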
Example #3
def test_ranking_2(filename):
    vectors = load_vectors(filename)
    normalize(vectors)
    with codecs.open("data/rank.txt", "r", encoding="utf-8") as source:
        lines = [line.strip().split() for line in source.readlines()]
    vocab = set([word for line in lines for word in line])
    vectors = {word: vector for word, vector in vectors.iteritems() if word in vocab}
    actual = [line[1:] for line in lines]
    predicted = [[x[0] for x in nearest(vectors, line[0], normal=True)] for line in lines]
    print ml_metrics.mapk(actual, predicted, k=100)
 def test_mapk(self):
     self.assertAlmostEqual(metrics.mapk([range(1, 5)], [range(1, 5)], 3),
                            1.0)
     print metrics.mapk(
         [[1, 3, 4], [1, 2, 4], [1, 3]],
         [range(1, 6), range(1, 6), range(1, 6)], 5)  #, 0.685185185185185)
     self.assertAlmostEqual(
         metrics.mapk([range(1, 6), range(1, 6)],
                      [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5), 0.26)
     self.assertAlmostEqual(
         metrics.mapk([[1, 3], [1, 2, 3], [1, 2, 3]],
                      [range(1, 6), [1, 1, 1], [1, 2, 1]], 3), 11.0 / 18)
Example #5
def load_paper_pair(path):
    paper_size = 60
    end = paper_size * len(candidate_ids)
    batch_x = np.load(path[0], allow_pickle=True)[:end]
    batch_ans = np.load(path[1], allow_pickle=True)[:end].tolist()
    batch_pred = make_prediction(batch_x, paper_size).tolist()
    print(metrics.mapk(batch_ans, batch_pred, 150))
Example #6
def map_brain(dataset, n_rows, labels=[], save_name=None):

    map_k = 3
    print('There are %d rows' % n_rows)

    data_coor_sigma = dataset[['x', 'y', 'accuracy']].values
    closest_3places_ids = []
    closest_3places_ids_str = np.zeros((dataset.shape[0],)).astype(object)
    for i, cur_coor in enumerate(data_coor_sigma[:n_rows]):
        if not i % 1000:
            print('row %d' % i)
        metric_results = (((places_x - cur_coor[0]) / places_x_sd) ** 2 +
                          ((places_y - cur_coor[1]) / places_y_sd) ** 2) / places_freq
        ranked_ids = argsort_short(metric_results, map_k)
        cur_places = []
        for place_id in ranked_ids:
            cur_places.append(places_ID[place_id])
        closest_3places_ids.append(cur_places)
        closest_3places_ids_str[i] = ' '.join(map(lambda x: str(x), cur_places))  # For submission

    if len(labels):
        print('The MAP3 score is %f' % mapk(labels, closest_3places_ids, map_k))
    if save_name:
        submission = pd.DataFrame.from_csv('sample_submission.csv')
        submission['place_id'] = closest_3places_ids_str
        submission.to_csv(save_name)
Example #7
def run_linear_regression(usage, thefts, theft_usage, day_num, apt_num, noise):
    daily_thefts = convert_theft_to_daily(thefts, apt_num, day_num)

    caught_total = 0
    caught_thefts_total = 0
    total_pvalues = []
    total_coeffs = []

    for day in xrange(day_num):
        caught, caught_theft, daily_pvalue, daily_coef = compute_precision_recall(
            daily_thefts[day], theft_usage[day], usage[day], noise=noise)
        caught_total += caught
        caught_thefts_total += caught_theft
        total_pvalues.extend(daily_pvalue)
        total_coeffs.extend(daily_coef)

    print "Summary"
    print "Theft {}. Caught {}. Correct {}".format(len(thefts), caught_total,
                                                   caught_thefts_total)

    print "map@k"
    indexes = [i for i, v in enumerate(total_pvalues) if v < 1]
    pred_thievesid = [i for i, v in enumerate(total_coeffs) if v > 0.1]
    sorted_houses = np.argsort(total_coeffs)
    refined_sorted_houses = [x for x in sorted_houses if x in set(indexes)]
    refined_sorted_houses = [
        x for x in refined_sorted_houses if x in set(pred_thievesid)
    ]

    for i in range(1, 11):
        mapk = metrics.mapk(
            convert_theft_to_daily(thefts, cfg.Apts, cfg.Lr_Days),
            convert_theft_to_daily(refined_sorted_houses, cfg.Apts,
                                   cfg.Lr_Days), i)
        print "{}: {}".format(i, mapk)
Example #8
def comparing_with_ground_truth(tops, txt_infos, k):
    utils.dump_pickle("result.pkl", tops)
    gt = utils.get_pickle("datasets/qst1_w4/gt_corresps.pkl")
    hypo = utils.get_pickle("result.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@ " + str(k) + " is " + str(mapAtK))

    bbs_gt = np.asarray(
        utils.get_groundtruth("datasets/qst1_w4/text_boxes.pkl")).squeeze()
    bbs_predicted = [[painting.boundingxy for painting in txt_info]
                     for txt_info in txt_infos]
    mean_iou = utils.get_mean_IoU(bbs_gt, bbs_predicted)
    print("Mean Intersection over Union: ", mean_iou)

    texts_gt = utils.get_gt_text("datasets/qst1_w4")
    texts_predicted = [[painting.text for painting in txt_info]
                       for txt_info in txt_infos]
    with open('results.txt', 'w') as f:
        for item in texts_predicted:
            f.write("%s\n" % item)
    mean_lev = utils.compute_lev(texts_gt, texts_predicted)
    print(texts_predicted)
    print("\n")
    print(texts_gt)
    print("Mean Levenshtein distance: ", mean_lev)
def validate(model, dataset, make_batch_fn=make_batch):
    """Validates the model
    
    Args:
        model (nn.Module): The model to be validated
        dataset (DrawingLoader): The validation dataset
        make_batch_fn (function, optional): Defaults to make_batch. The function for making batches.
    
    Returns:
        float: The MAP@3 validation score
    """

    model.eval()
    actual = []
    predicted = []
    log("Calculating MAP@3 for the validation dataset...")
    for batch in tqdm(dataset, desc="Validating", file=sys.stdout):
        inputs, targets, _, _ = make_batch_fn(batch)
        predicted.append(model.predict(inputs))
        actual.append(targets.data.cpu().numpy())
    actual = np.concatenate(actual)
    actual = [[x] for x in actual]
    predicted = np.concatenate(predicted, axis=0)
    map_3 = metrics.mapk(actual, predicted, 3)
    log("Validation MAP@3: {}".format(map_3))
    log("--------------------------------------------------------")
    return map_3
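
For validate() above, the shapes handed to metrics.mapk are what matter: each ground-truth label is wrapped in its own single-element list, while each prediction row is a top-3 list of class ids. A toy sketch of just that last step, with random numbers standing in for model outputs (assumed shapes, not the author's data):

import numpy as np
import ml_metrics as metrics

rng = np.random.default_rng(0)
scores = rng.random((4, 10))                    # 4 samples, 10 classes (stand-in for model.predict output)
predicted = np.argsort(-scores, axis=1)[:, :3]  # top-3 class ids per sample
actual = [[label] for label in [2, 7, 7, 0]]    # one true label per sample, wrapped in its own list
print(metrics.mapk(actual, predicted.tolist(), 3))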
Example #10
def main():
    #K parameter for map@k
    k = 1
    # Get images and denoise query set.
    print("Reading images...")
    qs = get_imgs("datasets/qsd1_w4")
    db = get_imgs("datasets/DDBB")
    """ Denoising methods
    "Gaussian"
    "Median"
    "bilateral"
    "FastNl"
    """
    print("Denoising images...")
    #qs_denoised = [utils.denoise_image(img, method="FastNl") for img in tqdm(qs)]

    #Separating paintings inside images into separate images
    qs_split = [background_remover.remove_background(img) for img in qs]

    print("\nComputing histograms...")
    hogs_qs = [[utils.get_hog_histogram(painting) for painting in img] for img in qs_split]
    hogs_ddbb = utils.get_hog_histograms(db)

    print("\nComputing distances")
    distances = []

    #Generating distances between qs images and db images
    for im in tqdm(hogs_qs):
        current_im = []
        for painting_hog in im:
            current_pt = []
            for db_hog in hogs_ddbb:
                current_pt.append(sum(np.abs(painting_hog - db_hog)))
            current_im.append(current_pt)
        distances.append(current_im)

    print("Done calculating hogs")

    #Generating predictions
    predictions = []

    for im in distances:
        current_im = []
        for painting_dst in im:
            current_im.append(utils.list_argsort(painting_dst)[:k])
        predictions.append(current_im)

    #Remove nesting of lists
    hypo = []
    for im in predictions:
        current_im = []
        for painting in im:
            for pred in painting:
                current_im.append(pred)
        hypo.append(current_im)

    #Generate map@k
    gt = utils.get_pickle("datasets/qsd1_w4/gt_corresps.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@ " + str(k) + " is " + str(mapAtK))
Example #11
 def test_data2x2(self):
     X, y = data2x2(100)
     estimator =  KNeighborsClassifier(n_neighbors=1)
     grid = grid2d.Grid2d([0, 0.5, 1.0], [0, 0.5, 1.0], estimator)
     grid.fit(X, y)
     pred = grid.predict(X)
     self.assertTrue(np.array_equal(pred, y))
     self.assertEqual(metrics.mapk(pred.reshape(-1, 1), np.array(y).reshape(-1, 1), 1), 1.0)
Example #12
def metric_bar(hot, ansK, recommend, method='MAP', t='train', kList=(1, 5, 10, 25, 50, 100, 150)):
    for k in kList:
        if method == 'MAP':
            hot_metrics = metrics.mapk(ansK.tolist(), [hot]*ansK.shape[0], k)
            rs_metrics = metrics.mapk(ansK.tolist(), recommend.T.tolist(), k)
            plt.bar('Hot', hot_metrics)
            plt.bar('RS', rs_metrics)
            # if t == 'train':
            #     plt.bar('GR', metrics.mapk(ansK.tolist(), graph_recommend_papers.T.tolist(), k))
        if method == 'Recall':
            plt.bar('Hot', mark(ansK, [hot]*ansK.shape[0], k))
            plt.bar('RS', mark(ansK, recommend.T, k))
            # if t == 'train':
            #     plt.bar('GR', mark(ansK.values, graph_recommend_papers.T, k))
        plt.ylabel('score')
        plt.title(t+' '+method+'@'+str(k))
        plt.show()
Example #13
def validate():
    train = pd.read_csv(os.path.join(HOME_DIR, "input/clicks_train.csv"))
    y = train[train.clicked == 1].ad_id.values
    y = [[_] for _ in y]

    predict = pd.read_csv(os.path.join(HOME_DIR, "output/sub_v2.csv"))
    p = [[int(x) for x in row["ad_id"].split()] for index, row in predict.iterrows()]
    print(mapk(y, p, k=12))
Example #14
def run_map_test(data, eventNames, users=None, primaryEvent=cfg.testing.primary_event,
                 consider_non_zero_scores=cfg.testing.consider_non_zero_scores_only,
                 num=200, K=cfg.testing.map_k, test=False, predictionio_url="http://0.0.0.0:8000"):
    N_TEST = 2000
    d = {}
    res_data = {}
    engine_client = predictionio.EngineClient(url=predictionio_url)

    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if (users is None) or (user in users):
                d.setdefault(user, []).append(item)

    if test:
        holdoutUsers = d.keys()[1:N_TEST]
    else:
        holdoutUsers = d.keys()

    prediction = []
    ground_truth = []
    user_items_cnt = 0.0
    users_cnt = 0
    for user in tqdm(holdoutUsers):
        q = {
            "user": user,
            "eventNames": eventNames,
            "num": num,
        }

        try:
            res = engine_client.send_query(q)
            # Sort by score then by item name
            tuples = sorted([(r["score"], r["item"]) for r in res["itemScores"]], reverse=True)
            scores = [score for score, item in tuples]
            items = [item for score, item in tuples]
            res_data[user] = {
                "items": items,
                "scores": scores,
            }
            # Consider only non-zero scores
            if consider_non_zero_scores:
                if len(scores) > 0 and scores[0] != 0.0:
                    prediction.append(items)
                    ground_truth.append(d.get(user, []))
                    user_items_cnt += len(d.get(user, []))
                    users_cnt += 1
            else:
                prediction.append(items)
                ground_truth.append(d.get(user, []))
                user_items_cnt += len(d.get(user, []))
                users_cnt += 1
        except predictionio.NotFoundError:
            print("Error with user: %s" % user)
    return ([metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)],
            res_data, user_items_cnt / (users_cnt + 0.00001))
Example #15
def run_map_test(data, eventNames, users=None, primaryEvent=cfg.testing.primary_event,
                 consider_non_zero_scores=cfg.testing.consider_non_zero_scores_only,
                 num=200, K=cfg.testing.map_k, test=False, predictionio_url="http://0.0.0.0:8000"):
    N_TEST = 2000
    d = {}
    res_data = {}
    engine_client = predictionio.EngineClient(url=predictionio_url)

    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if not users or user in users:
                d.setdefault(user, []).append(item)

    if test:
        holdoutUsers = [*d.keys()][1:N_TEST]
    else:
        holdoutUsers = [*d.keys()]

    prediction = []
    ground_truth = []
    user_items_cnt = 0.0
    users_cnt = 0
    for user in tqdm(holdoutUsers):
        q = {
            "user": user,
            "eventNames": eventNames,
            "num": num,
        }

        try:
            res = engine_client.send_query(q)
            # Sort by score then by item name
            tuples = sorted([(r["score"], r["item"]) for r in res["itemScores"]], reverse=True)
            scores = [score for score, item in tuples]
            items = [item for score, item in tuples]
            res_data[user] = {
                "items": items,
                "scores": scores,
            }
            # Consider only non-zero scores
            if consider_non_zero_scores:
                if len(scores) > 0 and scores[0] != 0.0:
                    prediction.append(items)
                    ground_truth.append(d.get(user, []))
                    user_items_cnt += len(d.get(user, []))
                    users_cnt += 1
            else:
                prediction.append(items)
                ground_truth.append(d.get(user, []))
                user_items_cnt += len(d.get(user, []))
                users_cnt += 1
        except predictionio.NotFoundError:
            print("Error with user: %s" % user)
    return ([metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)],
            res_data, user_items_cnt / (users_cnt + 0.00001))
Example #16
def map_5_scorer(estimator, X, y):
    prob = estimator.predict_proba(X)

    def top5(row):
        return sorted(range(len(row)), key=lambda k: row[k], reverse=True)

    y = map(lambda x: [x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5)
Example #17
def random_search(train,
                  user_hist,
                  correct: dict,
                  items_to_predict,
                  num_samples: int = 20,
                  num_threads: int = -1):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.
    Parameters
    ----------
    train: np.float32 coo_matrix
        Training data.
    correct: dict
        dict with items as keys and the corresponding max score as value
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.
    Returns
    ----------
    generator of (score, hyperparameter dict, fitted model, elapsed time) tuples
    """
    best_score = -1
    best_params = {}
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        start = datetime.now()
        print('hyperparams set:', hyperparams)
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)

        recoms = {}
        num_to_recom = 5
        for user in correct.keys():
            items_to_score = list(items_to_predict.difference(user_hist[user]))
            predict = model.predict(user,
                                    items_to_score,
                                    num_threads=num_threads)
            top_recoms_id = sorted(range(len(predict)),
                                   key=lambda i: predict[i])[-num_to_recom:]
            top_recoms_id.reverse()
            recoms[user_decode[user]] = [
                item_decode[items_to_score[i]] for i in top_recoms_id
            ]

        score = metrics.mapk(list(correct.values()), list(recoms.values()),
                             5)
        print(score)

        hyperparams["num_epochs"] = num_epochs

        end = datetime.now()

        yield (score, hyperparams, model, end - start)
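
random_search relies on a sample_hyperparameters() generator that is not shown in this snippet. A plausible sketch of such a generator (the ranges are illustrative guesses, not the author's values) that yields dicts accepted by the LightFM constructor plus the num_epochs key popped above:

import numpy as np

def sample_hyperparameters():
    """Yield random LightFM hyperparameter dicts indefinitely (illustrative ranges)."""
    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["warp", "bpr"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            # popped by random_search before LightFM(**hyperparams) is called
            "num_epochs": np.random.randint(5, 50),
        }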
Example #18
def old_df2mapk(df):
    df_clicked = df[df.clicked == '1'][['display_id', 'ad_id']]
    df_clicked_gb_display_id = df_clicked.groupby('display_id')['ad_id'].apply(
        list)
    df_result = df.sort_values(['display_id','pred'], inplace=False, ascending=False) \
                  .groupby('display_id')['ad_id'] \
                  .apply(list)

    sr_answer, sr_pred = df_clicked_gb_display_id.align(df_result,
                                                        join='inner')
    return mapk(sr_answer.tolist(), sr_pred.tolist(), k=12)
def eval_ranking(rel_test_topic, relevant_len, docs, model):
    # Precision, Recall
    precision, recall, docs_ids = precision_recall_rank(
        docs, rel_test_topic, relevant_len, model)
    relevant_retrieved_rank = [
        doc for doc in docs_ids if doc in rel_test_topic
    ]

    # MAP
    map_results_rank = []
    print("\nMAP:")
    for k in range(1, len(docs_ids)):
        map_results_rank.append(ml_metrics.mapk(rel_test_topic, docs_ids, k))
        print(ml_metrics.mapk(rel_test_topic, docs_ids, k))

    # BPREF
    bPref = bpref(docs_ids, rel_test_topic, relevant_len,
                  relevant_retrieved_rank)

    return precision, recall, map_results_rank, bPref
Example #20
def run_svm(theft_usage, theft_vectors):
    scaler = StandardScaler()

    training = []
    training_label = []
    for day in xrange(cfg.Svm_Train_Days):
        for apt in xrange(len(theft_usage[day])):
            training.append(theft_usage[day][apt])
            index = day * cfg.Apts + apt
            if index in theft_vectors:
                training_label.append(1)
            else:
                training_label.append(0)
    training = np.asarray(training)
    training = scaler.fit_transform(training)
    training_label = np.asarray(training_label)
    testing = []
    for i in xrange(cfg.Svm_Train_Days, cfg.Days):
        for apt in theft_usage[i]:
            testing.append(apt)
    testing = np.asarray(testing)
    testing = scaler.fit_transform(testing)

    clf = SVC(class_weight={1: 5}, random_state=0, probability=True)
    clf.fit(training, training_label)
    print "done fit"

    # For probabilities
    pred_probability = clf.predict_proba(testing)[:, 1]
    sorted_probas = pred_probability.argsort()[::-1]  # descending order

    # For theft ids
    pred_results = clf.predict(testing)
    print "done predict"

    pre_number = cfg.Svm_Train_Days * cfg.Apts
    pred_theft = [(i + pre_number) for i in range(testing.shape[0])
                  if pred_results[i] == 1]
    total_theft = [i for i in theft_vectors if i >= pre_number]
    print "total theft, ", len(total_theft)
    print "detect, ", len(pred_theft)
    print "detect theft, ", len(set(pred_theft).intersection(total_theft))

    # compute map@k
    pred_day_theft = convert_theft_to_daily(
        [(i + pre_number) for i in sorted_probas if pred_results[i] == 1],
        cfg.Apts, cfg.Days)
    total_day_theft = convert_theft_to_daily(total_theft, cfg.Apts, cfg.Days)

    # only take the last 70 days (those with theft), since the empty lists for days 1-349 would drag down the average
    for i in range(1, 11):
        mapk = metrics.mapk(total_day_theft[-70:], pred_day_theft[-70:], i)
        print "{}: {}".format(i, mapk)
def evaluate(y_true, y_pred):

    validate_predictions(y_true, y_pred)
    
    actual = []
    predicted = []

    for user_id in y_true.keys():
        actual.append(y_true[user_id])
        predicted.append(y_pred[user_id])

    return metrics.mapk(actual, predicted, k=100)
def main(k1, b, k3, feature, query_feature, r):
    file_list = open(model_dir + "/file-list")
    file = file_list.read().split("\n")
    file.remove("")
    num_doc = len(file)

    file_dic = dict()
    for i in range(num_doc):
        term = file[i].split("/")[-1].lower()
        file_dic[term] = i

    inf_file = open(model_dir + "/inverted-file")
    inf = inf_file.read()
    del inf_file
    inf = inf.split("\n")
    inf.remove("")

    dictionaries, que_dic, select_voc, num_term, que = build_term_dic(
        model_dir, query_dir, num, k3=k3, F=feature, QF=query_feature)
    D = build_doc_vector(num_doc, dictionaries, inf, select_voc, k1=k1, b=b)
    #pca = PCA(n_components=int(num_term*0.9), svd_solver='full')
    #D_=pca.fit_transform(D)
    D_ = D
    if if_train:
        df = pd.read_csv("queries/ans_train.csv")
        truth = []
        for i in range(num):
            app = []
            ret = df["retrieved_docs"][i].split(" ")
            for ele in ret:
                app.append(file_dic[ele])
            truth.append(app)
        w_file = open("score.txt", "a")
        for i in r:
            train_rank = output(
                que, num, num_doc, D_,
                r=i)  #,feature=["title","concepts","question"])

            for j in range(num):
                print(
                    ml_metrics.apk(truth[j],
                                   train_rank[j, :100].tolist(),
                                   k=100))
            score = ml_metrics.mapk(truth, train_rank[:, :100].tolist(), k=100)
            #w_file.write("s=%.3f,k1=%.1f, k3=%d, b=%.2f ,r=%d, f="%(score,k1,k3,b,i)+str(feature)+" q="+str(query_feature)+"\n")
            print(i, score)
        w_file.close()

    else:

        train_rank = output(que, num, num_doc, D_, r=r)[:, :100].tolist()
        return train_rank, file
Example #23
def event_type_map_eval_ml_metrics(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")

    print_and_log(
        logger, "event type MAP: {}".format(
            ml_metrics.mapk(result['gt_all_event_id'],
                            result['pred_all_event_id'])))

    k_list = []
    for i in range(len(result['chain_name'])):
        k_list.append(len(result['gt_all_event_type'][i]))
    k_list = sorted(list(set(k_list)))
    k_list.remove(0)
    print_and_log(logger, "all possible k: {}".format(k_list))

    for k in k_list:
        map_at_k = ml_metrics.mapk(result['gt_all_event_id'],
                                   result['pred_all_event_id'], k)
        print_and_log(logger, "event type MAP@{}: {}".format(int(k), map_at_k))

    return result
def map_5_scorer(estimator, X, y):
    if X.shape[0] == 0:
        return 1
    prob = estimator.predict_proba(X)
    labels = np.array(estimator.classes_)
    
    def top5(prob):
        indice = sorted(range(len(prob)), key=lambda k: prob[k], reverse=True)
        return labels[indice].tolist()
    
    y = map(lambda x:[x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5) 
Example #25
def map_5_scorer(estimator, X, y):
    if X.shape[0] == 0:
        return 1
    prob = estimator.predict_proba(X)
    labels = np.array(estimator.classes_)
    
    def top5(prob):
        indice = sorted(range(len(prob)), key=lambda k: prob[k], reverse=True)
        return labels[indice].tolist()
    
    y = map(lambda x:[x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5) 
Example #26
def run_map_test_dummy(data,
                       items=None,
                       probs=None,
                       uniform=True,
                       top=True,
                       users=None,
                       primaryEvent=cfg.testing.primary_event,
                       K=10,
                       no_progress=False):
    """Performs dummy test

    Args:
        data: list of event rows
        items: np.array or list of items sorted in descending popularity order
        probs: np.array or list of corresponding probabilities (needed for experiment #2)
        uniform: Boolean flag to use uniform sampling
        top: Boolean flag to use top items
        users: set of users to consider
        primaryEvent: str name of primary event
        K: int for MAP @ K
        no_progress: Boolean flag not to show the progress bar during calculations

    Returns:
        list of [MAP@1, MAP@2, ... MAP@K] evaluations
    """
    d = {}
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if (users is None) or (user in users):
                d.setdefault(user, []).append(item)

    holdoutUsers = d.keys()

    prediction = []
    ground_truth = []
    if no_progress:
        gen = holdoutUsers
    else:
        gen = tqdm(holdoutUsers)
    for user in gen:
        if top:
            test_items = items[0:K]
        elif uniform:
            test_items = np.random.choice(items, size=(K, ))
        else:
            test_items = np.random.choice(items, size=(K, ), p=probs)
        prediction.append(test_items)
        ground_truth.append(d.get(user, []))
    return [metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)]
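
The same popularity-baseline idea in a fully self-contained form (made-up interaction data, independent of the cfg/predictionio setup used above): recommend the K globally most popular items to every user and score with mapk.

import ml_metrics as metrics
from collections import Counter

# toy interaction history: user -> items they actually touched
history = {"u1": ["a", "b"], "u2": ["b", "c"], "u3": ["c"]}

# "dummy" recommender: everyone gets the K globally most popular items
K = 2
popularity = Counter(item for items in history.values() for item in items)
top_items = [item for item, _ in popularity.most_common(K)]

ground_truth = list(history.values())
prediction = [top_items for _ in history]
print([metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)])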
def train_test_partition(feature_list,labels_list):
  
  x = feature_list
  y = labels_list
  X_train, X_test,y_train,y_test = train_test_split(x,y,test_size=0.10)
  
  knn_cv = KNeighborsClassifier(n_neighbors=1)
  #svm_cf = svm.SVC(gamma=0.001)
  #cv_scores = cross_val_score(knn_cv, x,y, cv=2)
  #print(cv_scores)
  knn_cv.fit(X_train,y_train)
  expected = y_test
  predicted = knn_cv.predict(X_test)
  print(ml_metrics.mapk([[e] for e in expected], [[p] for p in predicted], 4))
def cal_map(pred_valid,cv,train_df,tr_data):
    df_pred = train_df[train_df['cv']==cv].copy()
    df_pred['pred'] = pred_valid
    df_pred = df_pred[['description_id','paper_id','pred']]
    sort_df_pred = df_pred.sort_values(['description_id', 'pred'], ascending=False)
    df_pred = df_pred[['description_id']].drop_duplicates().merge(sort_df_pred, on=['description_id'], how='left')
    df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
    df_pred = df_pred[df_pred['rank'] < 3]
    df_pred = df_pred.groupby(['description_id'])['paper_id'].apply(lambda s: ','.join(s)).reset_index()
    df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
    df_pred.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
    df_pred['paper_ids'] = df_pred['paper_ids'].apply(lambda s: s.split(','))
    df_pred['target_id'] = df_pred['target_id'].apply(lambda s: [s])
    return metrics.mapk(df_pred['target_id'].tolist(), df_pred['paper_ids'].tolist(), 3)
def show_results(query_path: str, method_names: List[str], matching_results, text_results):
    # if 'w5' in query_path:
    with open('./w5_query_devel.pkl', 'rb') as file:
        matching_dict = pickle.load(file)
    with open('./w5_text_bbox_list.pkl', 'rb') as file:
        text_dict = pickle.load(file)
        texts_sol = (seq(text_dict)
                     .map(lambda p: Rectangle(p[0:2], (p[2] - p[0]) + 1, (p[3] - p[1]) + 1))
                     .to_list())

    table = []
    for pos, method_name in enumerate(method_names):
        # Matching results
        matching = (
            seq(matching_results[pos])
                .map(lambda r: r[1])
                .map(lambda r: seq(r).map(lambda s: s.id).to_list())
                .map(replace_empty)
                .to_list()
        )
        matching_solution = seq(matching_results[pos]).map(lambda r: matching_dict[r[0].id][1]).to_list()

        # Text results
        text_iou = (seq(texts_sol)
                    .zip(text_results[pos])
                    .map(lambda pair: pair[0].ioi(pair[1]))
                    .average())

        table.append((method_name,
                      metrics.mapk(matching_solution, matching, k=10),
                      metrics.mapk(matching_solution, matching, k=5),
                      metrics.mapk(matching_solution, matching, k=1),
                      text_iou))

    data = pandas.DataFrame(table, columns=['Method', 'MAPK K=10', 'MAPK K=5', 'MAPK K=1', 'Text IoU'])

    print(data)
def ensemble_score(dfs, model_output_paths, df_valid, **cs):
    c = [cs["c%d" % m] for m in range(len(cs))]
    probas = defaultdict(lambda: defaultdict(float))
    for m, df_model in enumerate(dfs):
        #logging.info("scoring %d, %s" % (m, model_output_paths[m]))
        for i in range(len(df_model)):
            probas[df_model["row_id"][i]][df_model["place_id"][i]] += c[m] * df_model["proba"][i]
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(itemgetter(0),
                        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    df_merge = pd.merge(df, df_valid, how="left", on="row_id")
    valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                               df_merge.place_id.values, 3)
    return valid_score
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(itemgetter(0),
                        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        del df_valid
        del df_merge
        return valid_score
def main():
    data_provider = DataProvider(data_directory=Path('./data'))
    item_users = transform_to_item_user_csr_matrix(
        data_provider.get_purchases_train())

    # baseline model
    model = get_model()
    np.random.seed(42)
    model.fit(item_users=item_users)

    test_user_ids, test_purchases = get_purchases_by_customer(
        data_provider.get_purchases_test())
    recommendations = get_recommendations(model, test_user_ids, item_users)
    score = mapk(test_purchases, recommendations, k=10)
    return score
Example #33
 def cal_map_at_k(self):
     """Map @ top_k"""
     full, top_k = self._subjects, self._top_k
     users = list(dict.fromkeys(list(full['user'])))
     actual = [
         list(full[(full['user'] == user)
                   & (full['rank_true'] <= top_k)]['test_item'])
         for user in users
     ]
     predicted = [
         list(full[(full['user'] == user)
                   & (full['rank'] <= top_k)]['test_item'])
         for user in users
     ]
     return mapk(actual, predicted, k=top_k)
Example #34
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(itemgetter(0),
                        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        del df_valid
        del df_merge
        return valid_score
Example #35
def test(model, data, target):
	'''
	Predicts MAP score
	input : (model, data, target)
	output : MAP score
	'''
	(x_test, y_test) = data
	#predicting
	x_decode = model.predict(x_test)
	# Find out the top 5 hotel cluster predictions
	tmp = x_decode[:, 673:x_decode.shape[1]]
	predictions = [tmp[i].argsort()[-5:][::-1] for i in range(tmp.shape[0])]
	# Calculate the MAP score
	score = metrics.mapk(target, predictions, k=5)

	return score
Example #36
def mapk_score(s_hidden, recs_pred, k=10):
    """
    Computes the mean average precision at k (MAP@K) of recommendations.
    MAP@K = mean AP@K score over all users
    AP@K = (1 / min(m, k)) * sum from 1 to k of (precision at i * relevance of ith item)
    Where m is the number of items in a user's hidden set
    Where k is the number of items recommended to each user
    params:
        s_hidden: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, one ranked list per user
        k: number of recommendations to use in top set
    returns:
        float, range [0, 1]
    """
    check_list_of_sets(s_hidden, "s_hidden")
    return ml_metrics.mapk(s_hidden, recs_pred, k)
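
The AP@K formula quoted in the docstring can be checked by hand. A small sketch (toy data; the helper name apk_manual is ours) that mirrors the formula and compares it against ml_metrics.apk:

import ml_metrics

def apk_manual(actual, predicted, k=10):
    """AP@K as in the docstring above: (1 / min(m, k)) * sum of precision@i * relevance_i."""
    predicted = predicted[:k]
    hits, score = 0, 0.0
    for i, item in enumerate(predicted, start=1):
        if item in actual and item not in predicted[:i - 1]:
            hits += 1
            score += hits / i
    return score / min(len(actual), k) if actual else 0.0

actual = {3, 1, 7}             # hidden set for one user
recommended = [1, 2, 3, 4, 7]  # ranked recommendations
print(apk_manual(actual, recommended, k=5))          # (1/1 + 2/3 + 3/5) / 3
print(ml_metrics.apk(list(actual), recommended, 5))  # should agree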
Example #37
def ensemble_score(dfs, model_output_paths, df_valid, **cs):
    c = [cs["c%d" % m] for m in range(len(cs))]
    probas = defaultdict(lambda: defaultdict(float))
    for m, df_model in enumerate(dfs):
        #logging.info("scoring %d, %s" % (m, model_output_paths[m]))
        for i in range(len(df_model)):
            probas[df_model["row_id"][i]][df_model["place_id"]
                                          [i]] += c[m] * df_model["proba"][i]
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(
        itemgetter(0),
        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    df_merge = pd.merge(df, df_valid, how="left", on="row_id")
    valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                               df_merge.place_id.values, 3)
    return valid_score
Example #38
def run_map_test_dummy(data, items=None, probs=None, uniform=True, top=True,
                       users=None, primaryEvent=cfg.testing.primary_event, K=10, no_progress=False):
    """Performs dummy test

    Args:
        data: list of event rows
        items: np.array or list of items sorted in descending popularity order
        probs: np.array or list of corresponding probabilities (needed for experiment #2)
        uniform: Boolean flag to use uniform sampling
        top: Boolean flag to use top items
        users: set of users to consider
        primaryEvent: str name of primary event
        K: int for MAP @ K
        no_progress: Boolean flag not to show the progress bar during calculations

    Returns:
        list of [MAP@1, MAP@2, ... MAP@K] evaluations
    """
    d = {}
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if not users or user in users:
                d.setdefault(user, []).append(item)

    holdoutUsers = [*d.keys()]

    prediction = []
    ground_truth = []
    if no_progress:
        gen = holdoutUsers
    else:
        gen = tqdm(holdoutUsers)
    for user in gen:
        if top:
            test_items = items[0:K]
        elif uniform:
            test_items = np.random.choice(items, size=(K,))
        else:
            test_items = np.random.choice(items, size=(K,), p=probs)
        prediction.append(test_items)
        ground_truth.append(d.get(user, []))
    return [metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)]
def main():
    step = 0.25
    map_k = 3
    save_name = 'sub_30nn.csv'
    split_val_time = 73000

    train_inc_labels, test = readfiles()
    x_arange, y_arange = create_grid((0, 10), (0, 10), step)

    np.random.seed(2016)
    # train evaluating
    knn_result_list = []
    label_list = []
    for x_cell_min in x_arange:
        for y_cell_min in y_arange:
            print('Working on %f, %f cell' % (x_cell_min + step / 2, y_cell_min + step / 2))
            cur_train, cur_test, cur_labels = find_index_in_cell(train_inc_labels, test, x_cell_min, y_cell_min, step)
            print('Train size is %d, test size is %d' % (cur_train.shape[0], cur_test.shape[0]))
            for i, probe in enumerate(cur_train.values):
                knn_result_list.append(list(functions_py.knn(probe, cur_train.values, cur_labels.values,
                                                             self_test=True, mapk=map_k, k_nn=30)))
                label_list.append([cur_labels.values[i]])
            print('The MAP3 score is %f' % mapk(label_list, knn_result_list, map_k))
            print('***')

    np.random.seed(2016)
    # test predicting
    knn_ids_str = np.full((test.shape[0],), fill_value='0 1 2', dtype=object)
    for x_cell_min in x_arange:
        for y_cell_min in y_arange:
            print('Working on %f, %f cell' % (x_cell_min + step / 2, y_cell_min + step / 2))
            cur_train, cur_test, cur_labels = find_index_in_cell(train_inc_labels, test, x_cell_min, y_cell_min, step)
            print('Train size is %d, test size is %d' % (cur_train.shape[0], cur_test.shape[0]))
            test_index = cur_test.index.values
            for i, probe in enumerate(cur_test.values):
                knn_ids_str[test_index[i]] = ' '.join(list(functions_py.knn(probe, cur_train.values, cur_labels.values,
                                                                            self_test=False, mapk=map_k,
                                                                            k_nn=20).astype(str)))
                # print(test_index[i], knn_ids_str[test_index[i]])
    submission = pd.DataFrame.from_csv('sample_submission.csv')
    submission['place_id'] = knn_ids_str
    submission.to_csv(save_name)
Example #40
"""
MLing, CV
"""
print('CV')
X_train, X_test, y_train, y_test = train_test_split(train_samp.values, target.values, test_size=0.33, random_state=42)
classifier.fit(X_train, y_train)
train_predict_prob = np.zeros((X_test.shape[0], n_classes))
for batch_i in np.arange(0, X_test.shape[0], test_batch):
    if (batch_i + test_batch) < X_test.shape[0]:
        train_predict_prob[batch_i: batch_i + test_batch, :] = \
            classifier.predict_proba(X_test[batch_i: batch_i + test_batch, :])
    else:
        train_predict_prob[batch_i:, :] = classifier.predict_proba(X_test[batch_i:, :])
train_predict_map = percent2mapk(train_predict_prob, 5)
y_test_list = y2list(y_test)
print('The mean average precision is %.4f' % mapk(y_test_list, train_predict_map, k=5))
train_predict_str = list2str(train_predict_map, ' ')

"""
MLing
"""
print('Batch predicting test')
classifier.fit(train_samp.values, target.values)

# Freeing memory
del train_samp, target, X_train, X_test, y_train, y_test, train_predict_prob, train_predict_map

if merge:
    test = pd.merge(test, destinations, left_on=test.srch_destination_id.values.astype(int),
                    right_on=destinations.index.values, how='left')
    test = test.fillna(-10)
Example #41



# NB w sample_weight
clf = MultinomialNB(alpha = 0.07)
clf.fit(Xtrain, Ytrain) #sample_weight = 0.1 + 0.5*train.is_booking)

pred = clf.predict_proba(Xtest)
pred_rank = np.apply_along_axis(lambda x: np.argsort(-x)[:5], 1, pred)
print pred_rank.shape
# pred_rank_prob = np.apply_along_axis(lambda x: x[np.argsort(-x)[:4]], 1, pred)

# compute_map
if Ytest.shape[0]==pred_rank.shape[0]:
    map_pred = metrics.mapk([[l] for l in Ytest], pred_rank, k=5)
    print map_pred


# pred = clf.predict(Xtest)
# acc =  sum(pred==Ytest)/len(Ytest)
# print acc

# write output
import pickle
with open('test_id_lkp.pkl', 'rb') as f:
    test_id_lkp = pickle.load(f)
print len(test_id_lkp)

with open('featurized/Xtest_train_test_users_click_10.pkl', 'wb') as f:
    pickle.dump(Xtest, f)
def f5(test, predictions):
    target = [[l] for l in test['hotel_cluster']]
    return metrics.mapk(target, predictions, k=5)
def main():

    #open files
    print "opening files"
    #destinations = pd.read_csv("data/destinations.csv")
    train = pd.read_csv("../data/train.csv",
                        usecols=['srch_destination_id', 'hotel_market','hotel_cluster','user_location_country', 'user_location_region', 'user_location_city',
                                 'hotel_market', 'orig_destination_distance','date_time','user_id','is_booking'],
                        dtype={'srch_destination_id':np.uint32, 'hotel_market':np.uint32, 'hotel_cluster':np.uint32, 'user_location_country':np.uint32, 
                               'user_location_region':np.uint32, 'is_booking':np.bool})
    
    test= pd.read_csv("../data/test.csv",
                      usecols=['srch_destination_id', 'hotel_market','user_location_country', 'user_location_region', 'user_location_city',
                               'hotel_market', 'orig_destination_distance','date_time','user_id'],
                      dtype={'srch_destination_id':np.uint32, 'hotel_market':np.uint32, 'user_location_country':np.uint32,
                             'user_location_region':np.uint32})
    
    print "files opened"

    train.date_time =pd.to_datetime(train.date_time)
    train["year"] = train.date_time.dt.year
    train["month"] = train.date_time.dt.month

    #select all users
    unique_user = train.user_id.unique()
    print "unique users", len(unique_user)

    #sel_user_ids = [unique_user[i] for i in sorted(random.sample(range(len(unique_user)),50000))]
    #sel_train = train[train.user_id.isin(sel_user_ids)]

    #just train on a subset of the data (seasonality)
    t1 = train[((train.year == 2013) | ((train.year == 2014) & (train.month < 8)))]
    #t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]
    #t2 = t2[t2.is_booking==True]
    
    t2=test

    print "shape of t1 (train)", t1.shape
    print "shape of t2 (test)", t2.shape

    most_common_cl = list(train.hotel_cluster.value_counts().head().index)
    
    print "most common cluster prediction made", most_common_cl

    
    #clusters by destination id and type
    #match_cols = ["srch_destination_id", "srch_destination_type_id", "is_package", "hotel_market"]
    match_cols = ["srch_destination_id", "hotel_market"]
    cluster_cols = match_cols + ['hotel_cluster']

    groups = t1.groupby(cluster_cols)
    top_clusters = {}
    for name, group in groups:
        clicks = len(group.is_booking[group.is_booking == False])
        bookings = len(group.is_booking[group.is_booking == True])

        score = bookings + .15 * clicks
    
        clus_name = make_key(name[:len(match_cols)])
        
        if clus_name not in top_clusters:
            top_clusters[clus_name] = {}
        top_clusters[clus_name][name[-1]] = score

    cluster_dict = {}
    for n in top_clusters:
        tc = top_clusters[n]
        top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
        cluster_dict[n] = top



    preds = []
    for index, row in t2.iterrows():
        key = make_key([row[m] for m in match_cols])
        if key in cluster_dict:
            preds.append(cluster_dict[key])
        else:
            preds.append([])
    
    print "basic prediction made"

    match_cols = ['user_location_city', 'orig_destination_distance']
    
    groups = t1.groupby(match_cols)
    print "exact groups number: ", len(groups)

    exact_matches = []
    for i in range(t2.shape[0]):
        exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols, groups))
        if i%1000==0: print "read ", i, group
    print "exact matches prediction made"

    
    basic_preds = [f5(most_common_cl)[:5] for p in range(len(preds))]
    print "basic", metrics.mapk([[l] for l in t2["hotel_cluster"]], basic_preds, k=5)
    
    region_preds = [f5(preds[p] + most_common_cl)[:5] for p in range(len(preds))]
    print "regional", metrics.mapk([[l] for l in t2["hotel_cluster"]], region_preds, k=5)    
    
    full_preds = [f5(exact_matches[p] + preds[p] + most_common_cl)[:5] for p in range(len(preds))]
    print "full", metrics.mapk([[l] for l in t2["hotel_cluster"]], full_preds, k=5)


    #uncomment to write the file
    
    write_p = [" ".join([str(l) for l in p]) for p in preds]
    write_frame = ["{0},{1}".format(t2["id"][i], write_p[i]) for i in range(len(preds))]
    write_frame = ["id,hotel_cluster"] + write_frame
    with open("predictions_v2.csv", "w+") as f:
        f.write("\n".join(write_frame))
Example #44
sys.exit()

### Code to get the mapk value ###
print "Getting Eval Metric"
import pandas as pd
import numpy as np
from ml_metrics import mapk

preds_df = pd.read_csv("val_leak_preds.csv")
preds = np.array( preds_df["hotel_cluster"].apply(lambda x: str(x).split(" ")) )
#preds = [pred for pred in preds]
print preds[:10]
found_count= 0
total_count = 0
item_count = 0
for pred in preds:
        if pred != ['nan']:
                found_count+=1
                item_count += len(pred)
        total_count+=1
print "Item, Found and total : ", item_count,found_count, total_count
 

actuals = np.array( pd.read_csv("../../Data/val.csv", usecols = ["hotel_cluster"])).astype('str')
actuals = actuals.reshape(len(actuals),1)
#actuals = [list(actual) for actual in actuals]
print actuals[:10]

print mapk(actuals, preds, k=5)

Example #45
    images = []
    keys = []
    for s in samples:
        images.append(prepare_photo(load_im(s)))
        keys.append(s['photo'])
    input_images = np.array(images).astype('float32')
    vectors = func(input_images)
    for i in range(len(vectors)):
        feature_vectors[keys[i]] = vectors[i]
        

true_images = list()
top20_images = list()
for item in tqdm(filtered_test_ids):
    streets = test_ids[item][0]
    shops = test_ids[item][1]
    for test_case in streets:
        test_im = prepare_photo(load_im(test_case))
        feature_vec = func(test_im.reshape((1, 3, 224, 224)).astype('float32'))
        results = [(i[0], cosine(feature_vec, i[1])) for i in feature_vectors.items()]
        results.sort(key=lambda x: x[1])
        true_images.append([x['photo'] for x in shops])
        top20_images.append([x[0] for x in results[:20]])
        
from ml_metrics import mapk
print mapk(true_images, top20_images, k=20)
print mapk(true_images, top20_images, k=10)
print mapk(true_images, top20_images, k=5)


def process_one_cell(df_train, df_test, valid_mode_on,
                     gx_id, gy_id, x_border, y_border, th, model_list):
    """   
    Classification inside one grid cell.
    """
    #Working on df_train
    #filter out place_ids whose occurrence count is smaller than th
    #consider border of cell
    df_cell_train = df_train.loc[(df_train.grid_cell_x == gx_id) & (df_train.grid_cell_y == gy_id)]
    x_min = df_cell_train.x.min()
    x_max = df_cell_train.x.max()
    y_min = df_cell_train.y.min()
    y_max = df_cell_train.y.max()
    df_cell_train = df_train.loc[(df_train.x >= x_min - x_border) & (df_train.x <= x_max + x_border)
                                  & (df_train.y >= y_min - y_border) & (df_train.y <= y_max + y_border)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[(df_test.grid_cell_x == gx_id) & (df_test.grid_cell_y == gy_id)]
    row_ids = df_cell_test.row_id.values

    #Preparing data
    #remove columns and encode label
    le = LabelEncoder()
    y_train = le.fit_transform(df_cell_train.place_id.values)
    l_train = df_cell_train.shape[0]
    l_test = df_cell_test.shape[0]
    n_class = len(le.classes_)
    logging.info("number of class: %d" % n_class)
    if valid_mode_on:
        logging.info("validation mode")
        logging.info("train size: %d, validation size: %d" % (l_train, l_test))
        logging.info("%d labels in validation is not in train" 
                     % len(set(df_cell_test.place_id.values) - set(df_cell_train.place_id.values)))
    else:
        logging.info("prediction mode")
        logging.info("train size: %d, test size: %d" % (l_train, l_test))

    df_cell_train_feats = df_cell_train.drop(['place_id', 'grid_cell_x', 'grid_cell_y', 'row_id'], axis=1)
    feats = df_cell_train_feats.columns.values
    df_cell_test_feats = df_cell_test[feats]

    y_test_pred = np.zeros((df_cell_test_feats.shape[0], n_class))
    for clf in model_list:
        y_test_pred_model = clf(df_cell_train_feats, y_train, df_cell_test_feats)
        y_test_pred += y_test_pred_model

    if valid_mode_on:
        pred_labels = le.inverse_transform(np.argsort(y_test_pred, axis=1)[:, ::-1][:, :10])
        valid_score = metrics.mapk(df_cell_test.place_id.values[:, None], pred_labels, 3)
        logging.info("valid score = %6.6f" % valid_score)
    else:
        valid_score = None

    #return list of (row_id, place_id, proba)
    top10_label = le.inverse_transform(np.argsort(y_test_pred, axis=1)[:, ::-1][:, :10])
    top10_proba_raw = np.sort(y_test_pred, axis=1)[:, ::-1][:, :10]
    top10_proba = top10_proba_raw / np.sum(top10_proba_raw, axis=1)[:, None]
    probas = []
    for i, rid in enumerate(row_ids):
        if i == 0:
            probas = np.array([[rid] * 10, top10_label[i], top10_proba[i]]).T
        else:
            probas = np.vstack([probas, np.array([[rid] * 10, top10_label[i], top10_proba[i]]).T])
    return probas, valid_score, l_test
Example #47
	print (valid.shape, train.shape)

cnt = train[train.clicked==1].ad_id.value_counts()
cntall = train.ad_id.value_counts()
del train

def get_prob(k):
    if k not in cnt:
        return 0
    return cnt[k]/(float(cntall[k]) + reg)

def srt(x):
    ad_ids = map(int, x.split())
    ad_ids = sorted(ad_ids, key=get_prob, reverse=True)
    return " ".join(map(str,ad_ids))
   
if eval:
	from ml_metrics import mapk
	
	y = valid[valid.clicked==1].ad_id.values
	y = [[_] for _ in y]
	p = valid.groupby('display_id').ad_id.apply(list)
	p = [sorted(x, key=get_prob, reverse=True) for x in p]
	
	print (mapk(y, p, k=12))
else:
	subm = pd.read_csv("./data/sample_submission.csv") 
	subm['ad_id'] = subm.ad_id.apply(lambda x: srt(x))
	subm.to_csv("subm_reg_1.csv", index=False)

Example #48
"""

import csv
from ml_metrics import mapk
import predict
import settings


def __read_rows(file_name):
    with open(file_name, "r", encoding="utf8") as file:
        return {int(r["AuthorId"]): [int(x) for x in r["PaperIds"].split()] for r in csv.DictReader(file)}


if __name__ == "__main__":
    try:
        submission = __read_rows(settings.SUBMISSION_PATH)
    except FileNotFoundError:
        predict.submit_prediction()
        submission = __read_rows(settings.SUBMISSION_PATH)

    print("*** Calculate Mean Average Precision ***")
    print("\tbuilding valid solution")
    valid_solution = __read_rows(settings.VALID_SOLUTION_PATH)

    if sorted(valid_solution.keys()) != sorted(submission.keys()):
        print("The submission is incorrect: author ids are mismatched with the valid dataset")
    else:
        print("\tcalculating score")
        score = mapk([valid_solution[k] for k in valid_solution.keys()], [submission[k] for k in valid_solution.keys()])
        print("\n*** Mean average precision for solution file: {0} ***".format(score))
    print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60))

def model(x_ranges, y_ranges, x_end, y_end, train, test):   
    start_time = time.time()
    jobs = []
    mgr = Manager()
    preds_total = mgr.dict();
    for x_min, x_max in  x_ranges:
    
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \
                                                                 x_end, y_end, train, test, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == 1:
            for proc in jobs:
                proc.join();
            jobs = [];
        
    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))
    
    preds_total = pd.concat(preds_total.values(), axis=0);
    print preds_total.shape
    
    return preds_total.sort_index();


predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort_index();
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)
        if classifier == RF or classifier == GBM:
            print "Feature importance", classifier.feature_importances_

        print("Making predictions")
        predictions = classifier.predict_proba(test_features)[:,1]
        predictions = list(predictions)
    
        author_predictions = defaultdict(list)
        paper_predictions = {}
    
        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[str(a_id)].append((pred,str(p_id)))
    
        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
       
        predicted = paper_predictions.items()
        predicted.sort()
    
        print [x[0] for x in predicted[:5]]
        print [x[0] for x in test_ground_truth[:5]]
        mp.append(metrics.mapk([row[1] for row in test_ground_truth], [row[1] for row in predicted],10000))
        print mp[k]
    
    print numpy.mean(mp)
    print numpy.std(mp)
    
if __name__=="__main__":
    main()
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeatures")
    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeatures")
    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features("ValidDeletedFeatures")
    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features("ValidConfirmedFeatures")

    features = [x[2:] for x in features_deleted + features_conf] #+ valid_features_deleted + valid_features_conf
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] 
          #+ [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]

    print("Training the Classifier")
    RF = RandomForestClassifier(n_estimators=50, 
                                       verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        compute_importances=True,
                                        random_state=1)
    
    GBM = GradientBoostingClassifier(n_estimators=100, 
                                        verbose=2,
                                        min_samples_split=10,
                                        random_state=1)
    classifier = RF
    classifier.fit(features, target)

    # Validation
    author_paper_ids = [x[:2] for x in valid_features_conf+valid_features_deleted]
    features = [x[2:] for x in valid_features_conf+valid_features_deleted]

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[str(a_id)].append((pred,str(p_id)))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
   
    predicted = paper_predictions.items()
    predicted.sort()
    #Now I have sorted predictions for each author_id
    #Need to get the ground truth for the validation set:

    valid_confirmed_data = [row for row in csv.reader(open("ValidSolution.csv"))] #TrainConfirmed.csv
    valid_confirmed_papers = [(row[0],row[1].split()) for row in valid_confirmed_data[1:]]
    valid_confirmed_papers.sort()

    print predicted[0]
    print valid_confirmed_papers[0]
   
    import ml_metrics as metrics
    print metrics.mapk([row[1] for row in valid_confirmed_papers], [row[1] for row in predicted],10000)
Example #52
for index, row in te.iterrows():
    srch_destination_id = row["srch_destination_id"]
    is_booking = row["is_booking"]
    hotel_market = row["hotel_market"]
    hotel_country = row["hotel_country"]
    top_clusters = nlargest(
        5, best_srch_destination_id[srch_destination_id], key=best_srch_destination_id[srch_destination_id].get
    )
    if len(top_clusters) <= 5:
        item = nlargest(5, best_hotel_market[hotel_market], key=best_hotel_market[hotel_market].get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)

    if len(top_clusters) < 5:
        item = nlargest(5, best_hotel_country[hotel_country], key=best_hotel_country[hotel_country].get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)

    if len(top_clusters) < 5:
        item = nlargest(5, best_hotel, key=best_hotel.get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)

    prediction.append(top_clusters[:5])

ground_truth = [[l] for l in te["hotel_cluster"]]
print "the rule-based method has MAP5 %s" % mapk(ground_truth, prediction, k=5)  # 0.25024279168333224