def generate_random_forest(dataset, n_trees, max_depth=12, min_leaf_sample=4):
    """
    Build a random forest in parallel and persist it to disk.

    Outline of the random forest procedure (the sampling and splitting
    themselves are presumably carried out by ``prep_data`` and
    ``multi_run_wrapper``, which are defined elsewhere -- confirm there):

    1. Sample N random samples with replacement to create a subset of the
       data. The subset is about 66% of the whole dataset.
    2. At each node:
        (1). For some number m, m feature variables are selected at random
             from all feature variables.
        (2). The feature variable that provides the best split, according
             to some objective function (here we use max info gain), is
             used to do a binary split on the node.
        (3). At the next node, choose another m variables at random from
             all feature variables and do the same.

    The choice of m is generally 1/2 * sqrt(m), sqrt(m) and 2 * sqrt(m)
    (the m under the square root presumably means the total number of
    feature variables -- TODO confirm).

    :param dataset: data set handed to ``prep_data``
    :param n_trees: number of trees, i.e. number of tasks sent to the pool
    :param max_depth: forwarded unchanged to ``multi_run_wrapper``
    :param min_leaf_sample: forwarded unchanged to ``multi_run_wrapper``
    :return: list with one ``multi_run_wrapper`` result per tree
    """
    # use parallel computing to generate trees
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        # prep_data is evaluated once; the same argument tuple is handed to
        # every one of the n_trees tasks
        tree_args = (prep_data(dataset), None, max_depth, min_leaf_sample)
        result = pool.map(multi_run_wrapper,
                          itertools.repeat(tree_args, n_trees))
    finally:
        # always release the worker processes, even when a task raised
        pool.close()
        pool.join()

    # save the whole forest in a single file
    utilites.saveVariableToFile(result, "Corel5K/forest.pkl")
    # save every tree in its own file
    for i, tree in enumerate(result):
        utilites.saveVariableToFile(
            tree, "Corel5K/forest/Tree_" + str(i) + ".pkl")
    return result
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    The visual and textual similarity matrices are loaded from pickle
    files, the textual one is passed through ``adjust_and_norm_affinity``
    and the visual one through ``expit``; both are then blended and fed to
    ``spectral_clustering``. The cluster ids and the list of words per
    cluster are saved to "Corel5k/concepts_ids.pkl" and
    "Corel5k/cluster_contents.pkl".

    :param alpha: weight of the visual similarity matrix; the textual
                  matrix is weighted with (1 - alpha), so with the default
                  0.0 only the textual matrix contributes
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")
    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # spectral clustering: obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")

    # Create a word -> cluster id dictionary. A word that occurs more than
    # once in the term list keeps only its last cluster id.
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For every cluster
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Collect all words of that cluster. Iterating items() works on both
        # Python 2 and 3; indexing .values()/.keys() fails on Python 3
        # because dict views are not subscriptable.
        r_words = [word
                   for word, cluster_id in word_centroid_map.items()
                   if cluster_id == cluster]
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents,
                                "Corel5k/cluster_contents.pkl")
    return cluster_ids
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    The visual and textual similarity matrices are loaded from pickle
    files, the textual one is passed through ``adjust_and_norm_affinity``
    and the visual one through ``expit``; both are then blended and fed to
    ``spectral_clustering``. The cluster ids and the list of words per
    cluster are saved to "Corel5k/concepts_ids.pkl" and
    "Corel5k/cluster_contents.pkl".

    :param alpha: weight of the visual similarity matrix; the textual
                  matrix is weighted with (1 - alpha), so with the default
                  0.0 only the textual matrix contributes
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_textual_similarity_matrix.pkl")
    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # spectral clustering: obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters, eigen_solver='arpack')
    print("Done...")

    # Create a word -> cluster id dictionary. A word that occurs more than
    # once in the term list keeps only its last cluster id.
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For every cluster
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Collect all words of that cluster in one pass over the pairs.
        # dict views cannot be indexed on Python 3, so .values()[i] and
        # .keys()[i] are replaced by iterating items().
        r_words = []
        for word, cluster_id in word_centroid_map.items():
            if cluster_id == cluster:
                r_words.append(word)
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents, "Corel5k/cluster_contents.pkl")
    return cluster_ids
def get_concept_anno():
    """
    Build a concept annotation matrix from the tag based annotation.

    Reads the cluster id of every tag ("Corel5k/concepts_ids.pkl"), the tag
    names ("Corel5k/terms_corel5k_filtered.pkl") and the tag annotation
    matrix ("Corel5k/train_anno_filtered.pkl"). For every instance, each
    tag marked with 1 sets the entry of the concept (cluster) that tag
    belongs to. The result is saved to "Corel5k/train_anno_concept.pkl".

    :return: new concept based annotation matrix with one row per instance
             and one column per distinct cluster id
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags
    words = utilites.loadVariableFromFile(
        "Corel5k/terms_corel5k_filtered.pkl")
    # get number of clusters by counting unique cluster ids
    # NOTE(review): this assumes the ids are exactly 0..num_clusters-1; an
    # id >= num_clusters would index past the matrix below -- confirm.
    num_clusters = len(set(cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile(
        "Corel5k/train_anno_filtered.pkl")
    # initialize a zero matrix as concept matrix; the builtin int replaces
    # the np.int alias, which was removed from NumPy
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)

    # for every instance in the anno
    for i in range(len(word_anno)):
        print('This is instance ' + str(i) + '.')
        tag_row = word_anno[i]
        # for every tag (tag index j) and the concept it belongs to
        for j, concept in enumerate(cluster_ids):
            # if this tag appears in the original tag annotation matrix
            if tag_row[j] == 1:
                # mark the occurrence of the concept this tag belongs to
                anno[i][concept] = 1
                print("The words is " + words[j] +
                      ", and the concept is " + str(concept))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")
    return anno
def get_concept_anno():
    """
    Build a concept annotation matrix from the tag based annotation.

    Reads the cluster id of every tag ("Corel5k/concepts_ids.pkl"), the tag
    names ("Corel5k/terms_corel5k_filtered.pkl") and the tag annotation
    matrix ("Corel5k/train_anno_filtered.pkl"). For every instance, each
    tag marked with 1 sets the entry of the concept (cluster) that tag
    belongs to. The result is saved to "Corel5k/train_anno_concept.pkl".

    :return: new concept based annotation matrix with one row per instance
             and one column per distinct cluster id
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    # get number of clusters by counting unique cluster ids
    # NOTE(review): this assumes the ids are exactly 0..num_clusters-1; an
    # id >= num_clusters would index past the matrix below -- confirm.
    num_clusters = len(set(cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile("Corel5k/train_anno_filtered.pkl")
    # initialize a zero matrix as concept matrix; np.int was only an alias
    # of the builtin int and no longer exists in current NumPy releases
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)

    # for every instance in the anno
    for i in range(len(word_anno)):
        print('This is instance ' + str(i) + '.')
        # for every tag in all tags
        for j in range(len(cluster_ids)):
            # skip tags that do not appear in the original annotation
            if word_anno[i][j] != 1:
                continue
            # find which concept this tag belongs to and mark it as present
            concept = cluster_ids[j]
            anno[i][concept] = 1
            print("The words is " + words[j] + ", and the concept is " + str(concept))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")
    return anno
the term/vector pairs into a dictionary """ # get all filtered term(tag) names terms_corel5k_filtered = utilites.loadVariableFromFile( "Corel5k/terms_corel5k_filtered.pkl") # get training image annotations: lists of separate terms train_anno_filtered = utilites.loadVariableFromFile( "Corel5k/train_anno_filtered.pkl") # initialize a model using parameters above word_model = gensim.models.Word2Vec.load_word2vec_format(utilites.getAbsPath( setup.lmodel_file_path), binary=True) """ Calculate similarity matrix using given vectors We use pairwise distances to build the matrix """ print("Extracting word vectors...") vecs = [] # Index2word is a list that contains the names of the words in for word in terms_corel5k_filtered: vecs.append( word_model[word]) # now we extract all word vectors from the model print("Term vectors haven been created.") d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric='cosine') print("Similarity matrix has been built.") utilites.saveVariableToFile(d_pairwise_vecs, "Corel5k/tag_textual_similarity_matrix.pkl")
# return text vectors calculated using Word2Vec by gensim
"""
Load the filtered terms and the training annotations, load a binary
word2vec model through gensim, look up the vector of every term and store
the pairwise cosine similarity of those vectors.
"""
# all filtered term (tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile(
    "Corel5k/terms_corel5k_filtered.pkl"
)
# training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile(
    "Corel5k/train_anno_filtered.pkl"
)
# load the word2vec model from the path configured in setup
word_model = gensim.models.Word2Vec.load_word2vec_format(
    utilites.getAbsPath(setup.lmodel_file_path),
    binary=True,
)

# Similarity matrix: computed as 1 - pairwise cosine distance of the vectors
print("Extracting word vectors...")
# one vector per filtered term, in term order
vecs = [word_model[word] for word in terms_corel5k_filtered]
print("Term vectors haven been created.")

d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric="cosine")
print("Similarity matrix has been built.")

utilites.saveVariableToFile(
    d_pairwise_vecs,
    "Corel5k/tag_textual_similarity_matrix.pkl",
)
# Generator instance shared by tic() and toc()
TicToc = TicTocGenerator()


def toc(tempBool=True):
    """Advance TicToc and print the value it yields unless tempBool is false."""
    elapsed = next(TicToc)
    if not tempBool:
        return
    print("Elapsed time: %f seconds.\n" % elapsed)


def tic():
    """Mark the beginning of a time interval: advance TicToc without printing."""
    toc(False)


# we need to parse each test sample here
# get all terms from txt file
# (the string below is disabled code that built the test list pickle)
"""
test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt'))
test_file_list = test_file.readlines()
test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list]
utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl'))
"""

# file lists
test_file_list = utilites.loadVariableFromFile(
    'static/Corel5K/corel5k_test_list.pkl')
train_file_path = utilites.loadVariableFromFile(
    "static/Corel5K/train_file_path.pkl")

# tag based annotations
test_anno = utilites.loadVariableFromFile(
    'static/Corel5K/test_anno_filtered.pkl')
train_anno = utilites.loadVariableFromFile(
    'static/Corel5K/train_anno_filtered.pkl')

# concept based annotations
train_anno_concept = utilites.loadVariableFromFile(
    "static/Corel5K/train_anno_concept.pkl")
test_anno_concept = utilites.loadVariableFromFile(
    "static/Corel5K/test_anno_concept.pkl")

all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl")
concepts = utilites.loadVariableFromFile(
    "static/Corel5K/cluster_contents.pkl")

# keep only the 'train_vectors' entry of the loaded .mat content
train_vectors = loadmat(
    utilites.getAbsPath('static/Corel5K/train_vectors_original.mat')
)['train_vectors']
# when similarity between base tag and all other tags are calculated score_curr_tag.append(median) return score_curr_tag # use parallel computing to calculate scores for all tags base_tags = range(len(terms_corel5k_filtered)) pool = multiprocessing.Pool(multiprocessing.cpu_count()) tic() scores_t_similarity = pool.map(cal_tag_similarity, base_tags) toc() pool.close() pool.join() utilites.saveVariableToFile(scores_t_similarity, "Corel5k/tag_affinity_matrix.pkl") utilites.saveVariableToFile(terms_corel5k_filtered, "Corel5k/terms_corel5k_filtered.pkl") utilites.saveVariableToFile(train_anno_filtered, "Corel5k/train_anno_filtered.pkl") """ Since similarity between tag1 & tag2 is not the same with tag2 & tag1 This is normal due to the different visual K nearest neighbour Now we need to retain only value between the similarity pairs Considering using mean or min, mean by default That is to say, similarity = average(sim(tag1, tag2), sim(tag2, tag1)) """ def adjust_and_norm_affinity(affinity_matrix, method='average'): """ adjust values of the affinity matrix for pairwise values, we use average or min of them
# when similarity between base tag and all other tags are calculated score_curr_tag.append(median) return score_curr_tag # use parallel computing to calculate scores for all tags base_tags = range(len(terms_corel5k_filtered)) pool = multiprocessing.Pool(multiprocessing.cpu_count()) tic() scores_t_similarity = pool.map(cal_tag_similarity, base_tags) toc() pool.close() pool.join() utilites.saveVariableToFile(scores_t_similarity, "Corel5k/tag_affinity_matrix.pkl") utilites.saveVariableToFile(terms_corel5k_filtered, "Corel5k/terms_corel5k_filtered.pkl") utilites.saveVariableToFile(train_anno_filtered, "Corel5k/train_anno_filtered.pkl") """ Since similarity between tag1 & tag2 is not the same with tag2 & tag1 This is normal due to the different visual K nearest neighbour Now we need to retain only value between the similarity pairs Considering using mean or min, mean by default That is to say, similarity = average(sim(tag1, tag2), sim(tag2, tag1)) """ def adjust_and_norm_affinity(affinity_matrix, method='average'): """