def generate_random_forest(dataset, n_trees, max_depth=12, min_leaf_sample=4):
    """
    Build random forest
    1. Sample N random samples with replacement to create a subset of the data.
       The subset it about 66% of the whole dataset
    2. At each node:
       (1). For some number m, m feature variables are selected at random from all feature variables
       (2). The feature variable that provides the best split, according to some objective function
            (here we use max info gain), is used to do a binary split on the node.
       (3). At the next node, choose another m variables at random from all feature variables and
            do the same.
       The choice of m is generally 1/2 * sqrt(m), sqrt(m) and 2 * sqrt(m)
    """

    # build the trees in parallel, one worker per CPU core
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # map the same argument tuple n_trees times so each worker grows one tree
    result = pool.map(multi_run_wrapper,
                      itertools.repeat((prep_data(dataset), None, max_depth, min_leaf_sample), n_trees))
    pool.close()
    pool.join()

    # save the whole forest as a single variable
    utilites.saveVariableToFile(result, "Corel5K/forest.pkl")

    # also save each tree separately
    for i in range(len(result)):
        utilites.saveVariableToFile(result[i], "Corel5K/forest/Tree_" + str(i) + ".pkl")

    return result
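A minimal, self-contained sketch of steps 1 and 2(1) from the docstring above; the helper names bootstrap_sample and sample_feature_ids are illustrative, not part of the project:

import math
import random

def bootstrap_sample(dataset):
    # step 1: draw N rows with replacement; on average roughly 2/3 of the
    # distinct rows end up in the sample
    n = len(dataset)
    return [dataset[random.randrange(n)] for _ in range(n)]

def sample_feature_ids(n_features, factor=1.0):
    # step 2(1): pick m = factor * sqrt(M) feature indices at random,
    # with factor typically 0.5, 1, or 2
    m = max(1, min(n_features, int(factor * math.sqrt(n_features))))
    return random.sample(range(n_features), m)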
Example #2
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    clustering the tag/terms and return the cluster ids for each tag
    :param alpha: parameter to combine visual and textual similarity matrix
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # let's start spectral clustering
    # obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat,
                                      n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")
    # Create a word -> cluster dictionary, mapping each vocabulary
    # word to its cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # for each cluster
    for cluster in range(num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # find all of the words for that cluster number and print them out
        # (dict views are not indexable in Python 3, so iterate over items)
        r_words = [word for word, cid in word_centroid_map.items()
                   if cid == cluster]
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents,
                                "Corel5k/cluster_contents.pkl")

    return cluster_ids
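A quick usage sketch, assuming the Corel5k pickles loaded above are in place; alpha=0.5 weights the visual and textual similarity matrices equally:

concept_ids = perform_clustering(alpha=0.5, num_clusters=100)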
Example #4
def get_concept_anno():
    """
    build new concept annotation matrix upon old tag based annotation
    :param num_clusters: number of clusters/concepts
    :return: new concept based annotation matrix
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    # all tag ids, from 0 to len(cluster_ids) - 1
    word_ids = range(len(cluster_ids))
    # get number of clusters by counting unique cluster ids
    num_clusters = len(set(cluster_ids))
    # construct a map indicating which cluster a given word belongs to
    cluster_map = dict(zip(word_ids, cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile(
        "Corel5k/train_anno_filtered.pkl")

    # initialize a zero matrix as concept matrix
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)  # plain int; np.int is removed in recent NumPy

    # for every instance in the anno
    for i in range(len(word_anno)):
        print('This is instance ' + str(i) + '.')
        # for every tag in all tags
        for j in range(len(cluster_ids)):
            # if this tag appears in the original tag annotation matrix
            if word_anno[i][j] == 1:
                # find which concept this tag belongs to
                # and set the occurrence of that concept to 1
                anno[i][cluster_map[j]] = 1
                print("The word is " + words[j] + ", and the concept is " +
                      str(cluster_map[j]))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")

    return anno
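The double loop above can also be written in vectorized form; a sketch assuming word_anno is a 0/1 numpy array (illustrative, not the project's code):

import numpy as np

def concept_anno_vectorized(word_anno, cluster_ids, num_clusters):
    # a concept is marked present for an instance if any of that
    # concept's member tags appears in the original annotation row
    word_anno = np.asarray(word_anno)
    cluster_ids = np.asarray(cluster_ids)
    anno = np.zeros((word_anno.shape[0], num_clusters), dtype=int)
    for cid in range(num_clusters):
        member_tags = np.flatnonzero(cluster_ids == cid)
        anno[:, cid] = word_anno[:, member_tags].any(axis=1)
    return anno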
Example #6
"""
open the annotation text file and read content
build word vectors using Word2Vec and then extract
the term/vector pairs into a dictionary
"""
# get all filtered term(tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile(
    "Corel5k/terms_corel5k_filtered.pkl")
# get training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile(
    "Corel5k/train_anno_filtered.pkl")

# load the pre-trained Word2Vec model in binary format
# (old gensim API; newer gensim exposes this as
# gensim.models.KeyedVectors.load_word2vec_format)
word_model = gensim.models.Word2Vec.load_word2vec_format(utilites.getAbsPath(setup.lmodel_file_path), binary=True)
"""
Calculate similarity matrix using given vectors
We use pairwise distances to build the matrix
"""
print("Extracting word vectors...")
vecs = []
# index2word is a list that contains the names of the words in the model's vocabulary
for word in terms_corel5k_filtered:
    vecs.append(word_model[word])  # extract each word's vector from the model

print("Term vectors haven been created.")
d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric='cosine')
print("Similarity matrix has been built.")

utilites.saveVariableToFile(d_pairwise_vecs,
                            "Corel5k/tag_textual_similarity_matrix.pkl")
# return text vectors calculated using Word2Vec by gensim
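The expression 1 - pairwise_distances(..., metric='cosine') used above yields the cosine similarity matrix directly; a small self-contained check on random stand-in vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

toy_vecs = np.random.rand(5, 300)  # stand-in for the extracted word vectors
sim = 1 - pairwise_distances(toy_vecs, metric='cosine')
assert np.allclose(sim, cosine_similarity(toy_vecs))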
"""
open the annotation text file and read content
build word vectors using Word2Vec and then extract
the term/vector pairs into a dictionary
"""
# get all filtered term(tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
# get training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile("Corel5k/train_anno_filtered.pkl")

# initialize a model using parameters above
word_model = gensim.models.Word2Vec.load_word2vec_format(utilites.getAbsPath(setup.lmodel_file_path), binary=True)

"""
Calculate similarity matrix using given vectors
We use pairwise distances to build the matrix
"""
print("Extracting word vectors...")
vecs = []
# Index2word is a list that contains the names of the words in
for word in terms_corel5k_filtered:
    vecs.append(word_model[word])  # now we extract all word vectors from the model

print("Term vectors haven been created.")
d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric="cosine")
print("Similarity matrix has been built.")

utilites.saveVariableToFile(d_pairwise_vecs, "Corel5k/tag_textual_similarity_matrix.pkl")
Example #8
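# TicTocGenerator is not defined in this snippet; a minimal compatible
# definition (an assumption based on the common tic/toc generator pattern):
import time

def TicTocGenerator():
    # yields the elapsed time since the generator was last advanced
    ti = 0
    tf = time.time()
    while True:
        ti = tf
        tf = time.time()
        yield tf - ti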
TicToc = TicTocGenerator() # create an instance of the TicTocGenerator generator

# This will be the main function through which we define both tic() and toc()
def toc(tempBool=True):
    # Prints the time difference yielded by generator instance TicToc
    tempTimeInterval = next(TicToc)
    if tempBool:
        print( "Elapsed time: %f seconds.\n" %tempTimeInterval )

def tic():
    # Records a time in TicToc, marks the beginning of a time interval
    toc(False)

# we need to parse each test sample here
# get all terms from txt file
"""
test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt'))
test_file_list = test_file.readlines()
test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list]
utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl'))
"""
test_file_list = utilites.loadVariableFromFile('static/Corel5K/corel5k_test_list.pkl')
train_file_path = utilites.loadVariableFromFile("static/Corel5K/train_file_path.pkl")
test_anno = utilites.loadVariableFromFile('static/Corel5K/test_anno_filtered.pkl')
train_anno = utilites.loadVariableFromFile('static/Corel5K/train_anno_filtered.pkl')
train_anno_concept = utilites.loadVariableFromFile("static/Corel5K/train_anno_concept.pkl")
test_anno_concept = utilites.loadVariableFromFile("static/Corel5K/test_anno_concept.pkl")
all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl")
concepts = utilites.loadVariableFromFile("static/Corel5K/cluster_contents.pkl")
train_vectors = loadmat(utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
train_vectors = train_vectors['train_vectors']
        # append the median once similarity between the base tag and all other tags is calculated
        score_curr_tag.append(median)

    return score_curr_tag


# use parallel computing to calculate scores for all tags
base_tags = range(len(terms_corel5k_filtered))
pool = multiprocessing.Pool(multiprocessing.cpu_count())
tic()
scores_t_similarity = pool.map(cal_tag_similarity, base_tags)
toc()
pool.close()
pool.join()

utilites.saveVariableToFile(scores_t_similarity, "Corel5k/tag_affinity_matrix.pkl")
utilites.saveVariableToFile(terms_corel5k_filtered, "Corel5k/terms_corel5k_filtered.pkl")
utilites.saveVariableToFile(train_anno_filtered, "Corel5k/train_anno_filtered.pkl")


"""
Since similarity between tag1 & tag2 is not the same with tag2 & tag1
This is normal due to the different visual K nearest neighbour
Now we need to retain only value between the similarity pairs
Considering using mean or min, mean by default
That is to say, similarity = average(sim(tag1, tag2), sim(tag2, tag1))
"""
def adjust_and_norm_affinity(affinity_matrix, method='average'):
    """
    adjust values of the affinity matrix
    for pairwise values, we use average or min of them