Example #1
def link_split_map (data, mdl, sample_model, train_plan, folds, model_dir = None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using mean average-precision.

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds for cross-validation
    :param model_dir: if not None, and folds > 1, the models are stored in this
    directory.
    :return: the list of model files stored
    '''
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        # Symmetrise and binarise the link matrix when the model predicts
        # undirected links; otherwise return the data unchanged.
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    for fold in range(folds):
        model = mdl.newModelFromExisting(sample_model)
        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)

        map_score = mean_average_prec(query_data.links, predicted_link_probs)
        print("Fold %2d: Mean-Average-Precision %6.3f" % (fold, map_score))

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    return model_files
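
A note on the evaluation: mean_average_prec is not defined on this page, but the metric it reports can be pinned down with a short sketch. The version below is a hypothetical dense-array reading, assuming expected_links is a binary document-by-document matrix and predicted_link_probs holds higher-is-better scores of the same shape; the real function may well operate on sparse matrices instead.

import numpy as np

def mean_average_prec_sketch(expected_links, predicted_link_probs):
    # Hypothetical sketch of mean average-precision over documents (rows).
    ap_scores = []
    for row in range(expected_links.shape[0]):
        truth = expected_links[row]
        n_rel = truth.sum()
        if n_rel == 0:
            continue  # documents with no held-out links are skipped
        order = np.argsort(-predicted_link_probs[row])  # best score first
        hits, ap = 0, 0.0
        for rank, col in enumerate(order, start=1):
            if truth[col]:
                hits += 1
                ap += hits / rank  # precision at each relevant rank
        ap_scores.append(ap / n_rel)
    return float(np.mean(ap_scores))
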
Example #2
    def testMapOnRealData(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            dic = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.convert_to_undirected_graph()
        data.convert_to_binary_link_matrix()
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        trainData, testData = data.doc_completion_split()

        for pseudoNegCount in (5, 10, 25, 50, 100):
            rd.seed(0xC0FFEE)

            # Initialise the model
            K = TopicCount
            model      = rtm.newModelAtRandom(trainData, K, dtype=dtype, pseudoNegCount=data.doc_count * pseudoNegCount)
            queryState = rtm.newQueryState(trainData, model)
            trainPlan  = rtm.newTrainPlan(iterations=50, logFrequency=LogFreq, fastButInaccurate=False, debug=True)

            # Train the model, and then immediately save the result to a file for subsequent inspection
            model, topics, (bndItrs, bndVals, bndLikes) = rtm.train(trainData, model, queryState, trainPlan)
            # with open(newModelFileFromModel(model), "wb") as f:
            #     pkl.dump((model, queryState, (bndItrs, bndVals, bndLikes)), f)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, bndLikes, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            plt.show()

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            vocab = rtm.wordDists(model)
            topWordCount = 10
            kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)]

            like = rtm.log_likelihood(trainData, model, topics)
            perp = perplexity_from_like(like, trainData.word_count)

            # print ("Prior %s" % (str(model.topicPrior)))
            print ("Pseudo Neg-Count: %d " % pseudoNegCount)
            print ("\tTrain Perplexity: %f\n\n" % perp)

            # for k in range(model.K):
            #     print ("\nTopic %d\n=============================" % k)
            #     print ("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

            min_probs  = rtm.min_link_probs(model, topics, testData.links)
            link_probs = rtm.link_probs(model, topics, min_probs)
            try:
                map_score = mean_average_prec(testData.links, link_probs)
                print("\tThe Mean-Average-Precision is %.3f" % map_score)
            except Exception as e:
                print("Unexpected error: %s" % e)
Example #3
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features
    are set. Remove the first element from the feature list, query all documents
    with that feature set, and then evaluate link prediction. Repeat until
    the feature list is empty.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param feature_mask:  the list of features used to separate training from
    query data. This is a list of (label, index) tuples: the left side is the
    feature's label, the right side is its column index in the dataset.
    :param model_dir: if not None, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    while len(feature_mask) > 0:
        # try:
        # Prepare the training and query data
        feature_mask_indices = [i for _,i in feature_mask]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        (feat_label, feat_id) = feature_mask.pop(0)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops    = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links       = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map_score = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map_score)
        map_sum       += map_score * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
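
mean_prec_rec_at is also not shown here. A plausible per-document reading of precision@m and recall@m is sketched below, ignoring the groups argument (which appears to bucket documents by held-out link count); the function name and the dense-array assumption are mine.

import numpy as np

def prec_rec_at_m_sketch(expected_links, predicted_link_probs, m):
    # Precision@m and recall@m averaged over documents, dense-array sketch.
    precs, recs = [], []
    for row in range(expected_links.shape[0]):
        truth = expected_links[row]
        n_rel = truth.sum()
        if n_rel == 0:
            continue  # nothing to retrieve for this document
        top_m = np.argsort(-predicted_link_probs[row])[:m]
        hits = truth[top_m].sum()
        precs.append(hits / m)     # fraction of the top m that are true links
        recs.append(hits / n_rel)  # fraction of true links found in the top m
    return float(np.mean(precs)), float(np.mean(recs))
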
Example #4
def insample_lro_style_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    For documents with more than 5 links, remove a portion of the links. The
    portion is determined by the number of folds (e.g. five-fold implies
    removing one fifth of the links, three-fold implies removing a third, etc.)

    Train on all documents and all remaining links.

    Predict remaining links.

    Evaluate using precision@m, recall@m, mean reciprocal-rank and
    mean average-precision

    Average all results.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds for cross-validation
    :param target_folds: the number of folds to complete before finishing.
    Defaults to folds.
    :param model_dir: if not None, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    fold_count = 0
    for fold in range(folds):
        # try:
        # Prepare the training and query data
        train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        print ("\n\nFold %d\n" % fold + ("-" * 80))

        # Train the model
        if model_uses_lda(sample_model):
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        min_link_probs       = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset)
        predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset)
        expected_links       = query_data.links[docSubset, :]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map_score = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map_score)
        map_sum       += map_score * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        fold_count += 1
        if fold_count == target_folds:
            break
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
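
For completeness, mean_reciprocal_rank scores each document by the reciprocal rank of its first correctly predicted link. A dense-array sketch under the same assumptions as the earlier sketches (binary truth matrix, higher-is-better scores):

import numpy as np

def mean_reciprocal_rank_sketch(expected_links, predicted_link_probs):
    # Reciprocal rank of the first true link per document, averaged.
    rr = []
    for row in range(expected_links.shape[0]):
        truth = expected_links[row]
        if truth.sum() == 0:
            continue  # no held-out links for this document
        order = np.argsort(-predicted_link_probs[row])  # best score first
        first_hit_rank = np.flatnonzero(truth[order])[0] + 1  # 1-based rank
        rr.append(1.0 / first_hit_rank)
    return float(np.mean(rr))
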
Example #5
def link_split_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using precision@m and recall@m for several values of m, from 10
    up to 500 (the ms list below).

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds for cross-validation
    :param target_folds: the number of folds to complete before finishing.
    Defaults to folds.
    :param model_dir: if not None, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if ldaModel is not None:
        (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel
    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold in range(target_folds):
        model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \
                if sample_model.name == LRO_MODEL_NAME \
                else mdl.newModelFromExisting(sample_model)

        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)
        expected_links       = query_data.links

        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        map_score = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map_score)
        map_sum       += map_score * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))

    return model_files
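
Finally, combine_map is used in every fold loop to pool the per-fold precision/recall figures. Its exact data structures aren't visible here; the sketch below assumes each (averages, counts) pair is a dict keyed by link-count group, merged as a count-weighted mean, mirroring the mrr_sum / mrr_doc_count accumulation above. The None handling matches the None initialisers of combi_precs and combi_dcounts.

def combine_map_sketch(avgs_a, counts_a, avgs_b, counts_b):
    # Hypothetical merge of grouped averages via count-weighted means.
    if avgs_a is None:
        # First fold: nothing accumulated yet.
        return dict(avgs_b), dict(counts_b)
    merged_avgs, merged_counts = {}, {}
    for group in set(counts_a) | set(counts_b):
        n_a = counts_a.get(group, 0)
        n_b = counts_b.get(group, 0)
        total = n_a + n_b
        if total == 0:
            continue  # no documents in this group on either side
        merged_counts[group] = total
        merged_avgs[group] = (avgs_a.get(group, 0.0) * n_a
                              + avgs_b.get(group, 0.0) * n_b) / total
    return merged_avgs, merged_counts
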