Ejemplo n.º 1
0
def link_split_map (data, mdl, sample_model, train_plan, folds, model_dir = None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using mean average-precision.

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :return: the list of model files stored
    '''
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    for fold in range(folds):
        model = mdl.newModelFromExisting(sample_model)
        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)

        map = mean_average_prec (query_data.links, predicted_link_probs)
        print ("Fold %2d: Mean-Average-Precision %6.3f" % (fold, map))

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    return model_files
Ejemplo n.º 2
0
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features
    are set. Remove the first element from the feature list, query all documents
    with that feature set, and then evaluate link prediction. Repeat until
    feature-list is empty.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param feature_mask:  the list of features used to separate training from query
    This is a list of tuples, the left side is the feature label, the right side
    is the
    :param model_dir: if not none, the models are stored in this directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    while len(feature_mask) > 0:
        # try:
        # Prepare the training and query data
        feature_mask_indices = [i for _,i in feature_mask]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        (feat_label, feat_id) = feature_mask.pop(0)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops    = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links       = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
Ejemplo n.º 3
0
def insample_lro_style_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    For documents with > 5 links remove a portion. The portion is determined by
    the number of folds (e.g. five-fold implied remove one fifth of links, three
    fold implies remove a third, etc.)

    Train on all documents and all remaining links.

    Predict remaining links.

    Evaluate using [email protected], [email protected], mean reciprocal-rank and
    mean average-precision

    Average all results.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param target_folds: the number of folds to complete before finishing. Set
    to folds by default
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    fold_count = 0
    for fold in range(folds):
        # try:
        # Prepare the training and query data
        train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        print ("\n\nFold %d\n" % fold + ("-" * 80))

        # Train the model
        if model_uses_lda(sample_model):
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        min_link_probs       = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset)
        predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset)
        expected_links       = query_data.links[docSubset, :]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        fold_count += 1
        if fold_count == target_folds:
            break
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
Ejemplo n.º 4
0
def link_split_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using precision at m using as values of m 50, 100, 250, and 500,
    and additionally recall at m

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param target_folds: the number of folds to complete before finishing. Set
    to folds by default
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if ldaModel is not None:
        (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel
    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold in range(target_folds):
        model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \
                if sample_model.name == LRO_MODEL_NAME \
                else mdl.newModelFromExisting(sample_model)

        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)
        expected_links       = query_data.links

        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))

    return model_files
Ejemplo n.º 5
0
def cross_val_and_eval_hashtag_prec_at_m(data, mdl, sample_model, train_plan, word_dict, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Evaluate the precision at M for the top 50 hash-tags. In the held-out set, the hashtags
    are deleted. We train on all, both training and held-out, then evaluate the precision
    at M for the hashtags

    For values of M we use 10, 50, 100, 150, 250, 500


    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param word_dict the word dictionary, used to identify hashtags and print them
    out when the run is completed.
    :param num_folds:  the number of folds to cross validation
    :param fold_run_count: for debugging stop early after processing the number
    of the folds
    :param model_dir: if not none, the models are stored in this directory.
    :return: the list of model files stored
    '''
    MS = [10, 50, 100, 150, 200, 250, 1000, 1500, 3000, 5000, 10000]
    Precision, Recall = "precision", "recall"

    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds
    if num_folds <= 1:
        raise ValueError ("Number of folds must be greater than 1")

    hashtag_indices = popular_hashtag_indices (data, word_dict, 50)

    folds_finished = 0 # count of folds that finished successfully
    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_range, query_range = data.cross_valid_split_indices(fold, num_folds)

            segment_with_htags             = data.words[train_range, :]
            held_out_segment_with_htags    = data.words[query_range, :]
            held_out_segment_without_htags = data.words[query_range, :]
            held_out_segment_without_htags[:, hashtag_indices] = 0

            train_words = ssp.vstack((segment_with_htags, held_out_segment_without_htags))
            train_data  = data.copy_with_changes(words=train_words)

            # Train the model
            print ("Duplicating model template... ", end="")
            model      = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

            print ("Starting training")
            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            # Predict hashtags
            dist = rowwise_softmax(train_tops.means)

            # For each hash-tag, for each value of M, evaluate the precision
            results = {Recall : dict(), Precision : dict()}
            for hi in hashtag_indices:
                h_probs = dist[query_range,:].dot(model.vocab[:,hi])
                h_count = held_out_segment_with_htags[:, hi].sum()

                results[Recall][word_dict[hi]]    = { -1 : h_count }
                results[Precision][word_dict[hi]] = { -1 : h_count }
                for m in MS:
                    top_m = h_probs.argsort()[-m:][::-1]

                    true_pos = held_out_segment_with_htags[top_m, hi].sum()
                    rec_denom = min(m, h_count)
                    results[Precision][word_dict[hi]][m] = true_pos / m
                    results[Recall][word_dict[hi]][m]    = true_pos / rec_denom

            print ("%10s\t%20s\t%6s\t" % ("Metric", "Hashtag", "Count") + "\t".join("%5d" % m for m in MS))
            for htag, prec_results in results[Precision].items():
                print ("%10s\t%20s\t%6d\t%s" % ("Precision", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))
            for htag, prec_results in results[Recall].items():
                print ("%10s\t%20s\t%6d\t%s" % ("Recall", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))


            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, None, mdl)
        except Exception as e:
            traceback.print_exc()
            print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1


    return model_files
Ejemplo n.º 6
0
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Uses cross-validation go get the average perplexity. If folds == 1 a special path is
    triggered where perplexity is evaluated on the training data, and the results are
    not saved to disk, even if model_dir is not none

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param query_plan:  the query play (number of iterations etc.)
    :param num_folds:  the number of folds to cross validation
    :param fold_run_count: for debugging stop early after processing the number
    of the folds
    :param model_dir: if not none, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        query = mdl.newQueryState(data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = mdl.train(data, model, query, train_plan)
        likely = mdl.log_likelihood(data, model, train_tops)
        perp   = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        model_files = save_if_necessary(model_files, model_dir, model, data, 0, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        return model_files

    query_like_sum    = 0 # to calculate the overall likelihood and
    query_wcount_sum  = 0 # perplexity for the whole dataset
    train_like_sum    = 0
    train_wcount_sum  = 0
    folds_finished    = 0 # count of folds that finished successfully

    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_data, query_data = data.cross_valid_split(fold, num_folds)

            # Train the model
            print ("Duplicating model template... ", end="")
            model      = mdl.newModelFromExisting(sample_model)
            print ("Done.\nCreating query state...")
            train_tops = mdl.newQueryState(train_data, model)

            print ("Starting training")

            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            train_like       = mdl.log_likelihood (train_data, model, train_tops)
            train_word_count = train_data.word_count
            train_perp       = perplexity_from_like(train_like, train_word_count)

            print ("DEBUG Train perplexity is " + str(train_perp))

            # Query the model - if there are no features we need to split the text
            print ("Starting query.")
            query_estim, query_eval = query_data.doc_completion_split()
            query_tops              = mdl.newQueryState(query_estim, model)
            model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

            query_like       = mdl.log_likelihood(query_eval, model, query_tops)
            query_word_count = query_eval.word_count
            query_perp       = perplexity_from_like(query_like, query_word_count)

            # Keep a record of the cumulative likelihood and query-set word-count
            train_like_sum += train_like
            train_wcount_sum  += train_word_count
            query_like_sum += query_like
            query_wcount_sum  += query_word_count
            folds_finished  += 1

            # Write out the output
            print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
            print("")

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, query_tops, mdl)
        # except Exception as e:
        #     traceback.print_exc()
        #     print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    print ("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, perplexity_from_like(train_like_sum, train_wcount_sum)))
    print ("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, perplexity_from_like(query_like_sum, query_wcount_sum)))

    return model_files