def __init__(self):
    print("Loading forest model...")
    tic()
    self.forest = utilites.loadVariableFromFile(
        "static/Corel5K/forest/forest_128.pkl")
    print("Done.")
    toc()

    self.train_vectors = loadmat(
        utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
    self.train_vectors = self.train_vectors['train_vectors']
    self.train_file_path = utilites.loadVariableFromFile(
        "static/Corel5K/train_file_path.pkl")
    # load contents of concepts
    self.concepts = utilites.loadVariableFromFile(
        "static/Corel5K/cluster_contents.pkl")
    self.tag_scores = utilites.loadVariableFromFile(
        "static/Corel5K/all_tags_scores.pkl")

    self.train_vectors_classic = loadmat(
        utilites.getAbsPath(
            'static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'))
    self.train_vectors_classic = self.train_vectors_classic[
        'corel5k_train_feats_classic']
    self.test_vectors_classic = loadmat(
        utilites.getAbsPath(
            'static/Corel5K/baseline_features/corel5k_test_feats_classic.mat'))
    self.test_vectors_classic = self.test_vectors_classic[
        'corel5k_test_feats_classic']
    self.test_file_name = utilites.loadVariableFromFile(
        'static/Corel5K/corel5k_test_file_name.pkl')
    self.feat_dict_classic = dict(
        zip(self.test_file_name, self.test_vectors_classic))

    # start a matlab session for feature extraction
    self.matlab = matlab_wrapper.MatlabSession(
        matlab_root="/Applications/MATLAB_R2015b.app")  # start matlab
    self.matlab.eval('run MatConvNet/matlab/vl_setupnn')  # basic config
    self.matlab.eval('run vlfeat/toolbox/vl_setup')  # basic config
    self.matlab.eval("feature('DefaultCharacterSet', 'UTF8')")

    print("Loading cnn model...")
    tic()
    self.matlab.eval(
        "net = load('/Users/TONYSUN/Desktop/SIR_Corel5K_demo/static/cnnmodel/imagenet-matconvnet-vgg-verydeep-16.mat')")
    toc()
    print("Matlab session started.")
    print("Ready for work ^_^.")
def getTextVectors():
    """
    Open the annotation text file and read its content,
    build word vectors using Word2Vec, then extract the
    term/vector pairs into a dictionary.
    :return: the term/vector dictionary
    """
    raw_text_file = open(utilites.getAbsPath(setup.corpus_file_path))
    raw_text = raw_text_file.readlines()
    print("Corpus file " + raw_text_file.name + " was loaded.")

    # use re to split the raw text string and replace the original text
    # After this, every line is split into the format:
    # [0] filename, [1] order of annotation, [2] annotation text
    raw_text = [
        re.split('\t|#', singleLine.replace('\n', ''))
        for singleLine in raw_text
    ]
    # now we only need the annotations
    annotations = [line[2] for line in raw_text]
    # Prepare the sentences
    sentences = annotation_to_wordlists(annotations)

    # Set values for Word2Vec
    num_features = 300   # Use a 300-dimensional vector to represent a word
    min_word_count = 5   # Words appearing fewer than 5 times will be ignored
    num_workers = 4      # Number of threads to run in parallel
    context = 5          # Sample 5 words as input for each iteration

    # initialize a model using the parameters above
    word_model = gensim.models.Word2Vec(workers=num_workers,
                                        size=num_features,
                                        min_count=min_word_count,
                                        window=context)
    word_model.build_vocab(sentences)  # build vocabulary on the split sentences
    print("Language model established.")

    print("Loading pre-trained language model...")
    # initialize the network weights using the pre-trained model
    word_model.intersect_word2vec_format(
        utilites.getAbsPath(setup.lmodel_file_path), binary=True)
    print("Loaded weights from pre-trained Google News language model.")

    print("Training models...")
    # train the model to get word vectors
    word_model.train(sentences)
    print("Training completed.")

    return extractVecs(word_model)
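# A minimal usage sketch for getTextVectors() (not part of the original code):
# the returned object is the term -> vector dictionary produced by
# extractVecs(). The output path below is an assumption for illustration only.
word_vectors = getTextVectors()
utilites.saveVariableToFile(
    word_vectors,
    utilites.getAbsPath('Corel5K/word_vectors.pkl'))  # hypothetical path
print("Saved %d word vectors." % len(word_vectors))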
def generate_forest():
    train_original = loadmat(
        utilites.getAbsPath('Corel5K/train_vectors_original.mat'))
    train_original = train_original['train_vectors']
    train_label = utilites.loadVariableFromFile(
        utilites.getAbsPath("Corel5K/train_anno_concept.pkl"))

    # prepare data
    train_data = Data()
    train_data.samples = train_original
    train_data.labels = train_label
    train_data.orig_sample_indexes = np.array(range(len(train_original)))
    train_data.features = np.array(range(np.shape(train_original)[1]))

    tic()
    rand_forest = generate_random_forest(train_data, 400)
    toc()

    return rand_forest
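# A hedged usage sketch (not from the original code): build the forest once and
# persist it with the same utilites helpers used elsewhere in this project.
# The output path is an assumption; the demo later loads its forest from
# static/Corel5K/forest/forest_128.pkl.
forest = generate_forest()
utilites.saveVariableToFile(
    forest, utilites.getAbsPath('Corel5K/forest/forest_128.pkl'))  # assumed path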
def im2res(self, im_path, test_file_name):
    """
    Read an image file and give the final parsed result.
    :param im_path: image file path
    :return: top-5 concept label count and top-5 sample count
    """
    im_vec = self.extract_feature(utilites.getAbsPath(im_path))
    src, srs, im_near_path, rf_bl_time = self.parse_test_sample(
        im_vec, test_file_name)

    return src, srs, im_near_path, rf_bl_time
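# A hedged usage sketch for im2res() (not part of the original code). The class
# that owns this method is not named in this snippet, so ImageSearchDemo below
# is a hypothetical stand-in, and the query path is purely illustrative.
engine = ImageSearchDemo()  # hypothetical class name for the owning class
src, srs, im_near_path, rf_bl_time = engine.im2res(
    'static/uploads/query.jpeg',  # assumed upload location
    'query.jpeg')
print("Nearest training images:", im_near_path)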
def show_images(image_list, titles=None):
    """Display a list of images"""
    images = []
    for j in range(0, len(image_list)):
        images.append(io.imread(utilites.getAbsPath(image_list[j])))

    n_ims = len(images)
    if titles is None:
        titles = ['(%d)' % i for i in range(1, n_ims + 1)]

    fig = plt.figure()
    n = 1
    for image, title in zip(images, titles):
        a = fig.add_subplot(1, n_ims, n)  # make subplot
        if image.ndim == 2:  # is the image grayscale?
            plt.gray()  # only place in this blog you can't replace 'gray' with 'grey'
        plt.imshow(image)
        a.set_title(title)
        n += 1
    # fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show()
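# A minimal usage sketch for show_images() (not part of the original code);
# the file names below are placeholders. Titles default to '(1)', '(2)', ...
# when not supplied.
show_images(['static/Corel5K/1000.jpeg', 'static/Corel5K/1001.jpeg'],
            titles=['query', 'nearest neighbour'])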
from sklearn.metrics import pairwise

# return text vectors calculated using Word2Vec by gensim
"""
Load the filtered Corel5K terms and their training annotations,
then build term vectors from a pre-trained Word2Vec model and extract
the term/vector pairs.
"""
# get all filtered term (tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile(
    "Corel5k/terms_corel5k_filtered.pkl")
# get training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile(
    "Corel5k/train_anno_filtered.pkl")

# load the pre-trained language model
word_model = gensim.models.Word2Vec.load_word2vec_format(
    utilites.getAbsPath(setup.lmodel_file_path), binary=True)

"""
Calculate the similarity matrix using the given vectors.
We use pairwise distances to build the matrix.
"""
print("Extracting word vectors...")
vecs = []
# extract the vector of every filtered term from the model
for word in terms_corel5k_filtered:
    vecs.append(word_model[word])
print("Term vectors have been created.")

d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric='cosine')
print("Similarity matrix has been built.")
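# A hedged sketch (not from the original code): since d_pairwise_vecs holds
# cosine similarities between the filtered Corel5K terms, the most similar
# terms to any given term can be read straight off its row. 'water' is an
# example term assumed to be in the filtered vocabulary.
import numpy as np

query_idx = terms_corel5k_filtered.index('water')
neighbour_idx = np.argsort(d_pairwise_vecs[query_idx])[::-1][1:6]  # skip the term itself
print([terms_corel5k_filtered[i] for i in neighbour_idx])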
# get all terms from txt file
"""
test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt'))
test_file_list = test_file.readlines()
test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list]
utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl'))
"""
test_file_list = utilites.loadVariableFromFile('static/Corel5K/corel5k_test_list.pkl')
train_file_path = utilites.loadVariableFromFile("static/Corel5K/train_file_path.pkl")
test_anno = utilites.loadVariableFromFile('static/Corel5K/test_anno_filtered.pkl')
train_anno = utilites.loadVariableFromFile('static/Corel5K/train_anno_filtered.pkl')
train_anno_concept = utilites.loadVariableFromFile("static/Corel5K/train_anno_concept.pkl")
test_anno_concept = utilites.loadVariableFromFile("static/Corel5K/test_anno_concept.pkl")
all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl")
concepts = utilites.loadVariableFromFile("static/Corel5K/cluster_contents.pkl")

train_vectors = loadmat(utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
train_vectors = train_vectors['train_vectors']
test_vectors = loadmat(utilites.getAbsPath('static/Corel5K/test_vectors_original.mat'))
test_vectors = test_vectors['test_vectors']
train_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'))
train_vectors_classic = train_vectors_classic['corel5k_train_feats_classic']
test_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_test_feats_classic.mat'))
test_vectors_classic = test_vectors_classic['corel5k_test_feats_classic']

print("Loading forest model...")
tic()
forest = utilites.loadVariableFromFile("static/Corel5K/forest/forest_128.pkl")
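# A hedged sanity-check sketch (not from the original code): the loaded
# feature matrices and annotation lists are expected to line up one-to-one
# for the Corel5K train/test splits.
assert len(train_vectors) == len(train_anno) == len(train_anno_concept)
assert len(test_vectors) == len(test_anno) == len(test_anno_concept) == len(test_file_list)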
def toc(tempBool=True):
    # Prints the time difference yielded by generator instance TicToc
    tempTimeInterval = next(TicToc)
    if tempBool:
        print("Elapsed time: %f seconds.\n" % tempTimeInterval)


def tic():
    # Records a time in TicToc, marks the beginning of a time interval
    toc(False)


"""
Prepare data
"""
# START TO RUN HERE
# load datasets
train_original = loadmat(utilites.getAbsPath('Corel5K/train_vectors_original.mat'))
test_original = loadmat(utilites.getAbsPath('Corel5K/test_vectors_original.mat'))
train_original = train_original['train_vectors']
test_original = test_original['test_vectors']

# cast the feature vectors to float32
train_original_list = [tr.astype('float32') for tr in train_original]

# construct a pairwise distance matrix to look up (affinity matrix)
d_cos_train_vecs = 1 - pairwise.pairwise_distances(train_original_list, metric='cosine')
# d_eu_train_vecs = pairwise.pairwise_distances(train_pca_300_list, metric='euclidean')

# get all terms from txt file
terms_file = open(utilites.getAbsPath('Corel5K/corel5k_words.txt'))
terms_corel5k = terms_file.readlines()
print("Term file " + terms_file.name + " was loaded.")
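# The tic()/toc() helpers above pull their intervals from a generator instance
# named TicToc that is not shown in this snippet. A minimal sketch of such a
# generator, assuming the usual tic/toc pattern, is given here.
import time


def TicTocGenerator():
    """Yield the elapsed time since the previous call to next()."""
    tf = time.time()
    while True:
        ti = tf
        tf = time.time()
        yield tf - ti


TicToc = TicTocGenerator()  # the generator instance consumed by toc()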
def parse_forest(sample, forest):
    a_rc = []
    a_rs = []
    for tree in forest:
        rc, rs = parse_single_tree(sample, tree)
        a_rc.append(rc.label_count)
        a_rs = a_rs + list(rs)

    a_rc = np.asarray(a_rc)
    sum_a_rc = np.sum(a_rc, axis=0)  # get count in all trees for each concept
    sum_a_rs = Counter(a_rs)

    return sum_a_rc, sum_a_rs


# test code here
test_original = loadmat(utilites.getAbsPath('Corel5K/test_vectors_original.mat'))
test_original = test_original['test_vectors']
test_sample = test_original[0]
forest = load_forest()

# rc, rs = parse_single_tree(test_sample, forest[0])
# src: sum of concept count, srs: sum of retrieved sample count
src, srs = parse_forest(test_sample, forest)
label_name = range(100)
src_dict = dict(zip(label_name, src))
src_sorted = sorted(src_dict.items(), key=itemgetter(1))[::-1]
srs_sorted = sorted(srs.items(), key=itemgetter(1))[::-1]

src_top10 = src_sorted[0:10]  # top-10 concepts by count
srs_top10 = srs_sorted[0:10]
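# A hedged sketch (not from the original code): map the highest-scoring concept
# indices back to their member tags. `concepts` is the cluster-contents
# structure used elsewhere in this project (cluster_contents.pkl); the path and
# the assumption that it is indexable by concept id are illustrative only.
concepts = utilites.loadVariableFromFile("Corel5K/cluster_contents.pkl")  # assumed path
for concept_id, count in src_sorted[0:5]:
    print(concept_id, count, concepts[concept_id])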