Example #1
    def __init__(self):

        print("Loading forest model...")
        tic()
        self.forest = utilites.loadVariableFromFile(
            "static/Corel5K/forest/forest_128.pkl")
        print("Done.")
        toc()
        self.train_vectors = loadmat(
            utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
        self.train_vectors = self.train_vectors['train_vectors']
        self.train_file_path = utilites.loadVariableFromFile(
            "static/Corel5K/train_file_path.pkl")
        # load contents of concepts
        self.concepts = utilites.loadVariableFromFile(
            "static/Corel5K/cluster_contents.pkl")
        self.tag_scores = utilites.loadVariableFromFile(
            "static/Corel5K/all_tags_scores.pkl")

        self.train_vectors_classic = loadmat(
            utilites.getAbsPath(
                'static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'
            ))
        self.train_vectors_classic = self.train_vectors_classic[
            'corel5k_train_feats_classic']

        self.test_vectors_classic = loadmat(
            utilites.getAbsPath(
                'static/Corel5K/baseline_features/corel5k_test_feats_classic.mat'
            ))
        self.test_vectors_classic = self.test_vectors_classic[
            'corel5k_test_feats_classic']
        self.test_file_name = utilites.loadVariableFromFile(
            'static/Corel5K/corel5k_test_file_name.pkl')
        self.feat_dict_classic = dict(
            zip(self.test_file_name, self.test_vectors_classic))

        # start a matlab session for feature extraction
        self.matlab = matlab_wrapper.MatlabSession(
            matlab_root="/Applications/MATLAB_R2015b.app")  # start matlab
        self.matlab.eval('run MatConvNet/matlab/vl_setupnn')  # set up MatConvNet
        self.matlab.eval('run vlfeat/toolbox/vl_setup')  # set up VLFeat
        self.matlab.eval("feature('DefaultCharacterSet', 'UTF8')")
        print("Loading cnn model...")
        tic()
        self.matlab.eval(
            "net = load('/Users/TONYSUN/Desktop/SIR_Corel5K_demo/static/cnnmodel/imagenet-matconvnet-vgg-verydeep-16.mat')"
        )
        toc()
        print("Matlab session started.")
        print("Ready for work ^_^.")
def getTextVectors():
    """
    open the annotation text file and read content
    build word vectors using Word2Vec and then extract
    the term/vector pairs into a dictionary
    :return: the ultimate word vectors
    """
    raw_text_file = open(utilites.getAbsPath(setup.corpus_file_path))
    raw_text = raw_text_file.readlines()
    print("Corpus file " + raw_text_file.name + " was loaded.")
    # use re to split each raw line and replace the original text;
    # after this, every line is split into the format:
    # [0]filename, [1]order of annotation, [2]annotation text
    raw_text = [
        re.split('\t|#', singleLine.replace('\n', ''))
        for singleLine in raw_text
    ]

    # now we only need the annotations
    annotations = [line[2] for line in raw_text]

    # Prepare the sentences
    sentences = annotation_to_wordlists(annotations)

    # Set values for Word2Vec
    num_features = 300  # Use a 300-dimensional vector to represent each word
    min_word_count = 5  # Words appearing fewer than 5 times will be ignored
    num_workers = 4  # Number of threads to run in parallel
    context = 5  # Context window size: max distance between current and predicted word

    # initialize a model using parameters above
    word_model = gensim.models.Word2Vec(workers=num_workers,
                                        size=num_features,
                                        min_count=min_word_count,
                                        window=context)

    word_model.build_vocab(sentences)  # build vocabulary on the split sentences
    print("Language model established.")
    print("Loading pre-trained language model...")
    # initialize the network weights using pre-trained model
    word_model.intersect_word2vec_format(utilites.getAbsPath(
        setup.lmodel_file_path),
                                         binary=True)
    print("Loaded weights from pre-trained Google News language model.")
    print("Training models...")
    # train the model to get word vectors
    word_model.train(sentences)
    print("Training completed.")

    return extractVecs(word_model)
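extractVecs is called but not shown in this example. Given the docstring ("extract the term/vector pairs into a dictionary"), a plausible reconstruction under the old gensim API used here; treat the body as an assumption:

def extractVecs(word_model):
    # Hypothetical sketch: map every vocabulary word to its trained vector.
    # (index2word lists the vocabulary in old gensim Word2Vec models.)
    return dict((word, word_model[word]) for word in word_model.index2word)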
def generate_forest():

    train_original = loadmat(utilites.getAbsPath('Corel5K/train_vectors_original.mat'))
    train_original = train_original['train_vectors']
    train_label = utilites.loadVariableFromFile(utilites.getAbsPath("Corel5K/train_anno_concept.pkl"))

    # prepare data
    train_data = Data()
    train_data.samples = train_original
    train_data.labels = train_label
    train_data.orig_sample_indexes = np.array(range(len(train_original)))
    train_data.features = np.array(range(np.shape(train_original)[1]))

    tic()
    rand_forest = generate_random_forest(train_data, 400)
    toc()

    return rand_forest
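A possible driver for generate_forest; utilites.saveVariableToFile appears elsewhere in these examples and is assumed to be the counterpart of loadVariableFromFile, and the output path here is illustrative:

# Train a 400-tree forest and persist it for later examples to load.
forest = generate_forest()
utilites.saveVariableToFile(forest, utilites.getAbsPath("Corel5K/forest/forest_400.pkl"))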
Example #5
    def im2res(self, im_path, test_file_name):
        """
        read an image file and give the final parsed result
        :param im_path: image file path
        :return: top5 concept label count and top5 sample count
        """
        im_vec = self.extract_feature(utilites.getAbsPath(im_path))
        src, srs, im_near_path, rf_bl_time = self.parse_test_sample(im_vec, test_file_name)

        return src, srs, im_near_path, rf_bl_time
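A usage sketch; the instance name and the query path are hypothetical:

# Hypothetical call on an instance of the enclosing class.
src, srs, im_near_path, rf_bl_time = parser.im2res(
    'static/uploads/query.jpeg', 'query.jpeg')
# src/srs follow the convention used in Example #12: summed concept counts
# and summed retrieved-sample counts across the forest.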
Example #8
def show_images(image_list, titles=None):
    """Display a list of images side by side."""
    images = [io.imread(utilites.getAbsPath(path)) for path in image_list]

    n_ims = len(images)
    if titles is None:
        titles = ['(%d)' % i for i in range(1, n_ims + 1)]
    fig = plt.figure()
    n = 1
    for image, title in zip(images, titles):
        a = fig.add_subplot(1, n_ims, n)  # make one subplot per image
        if image.ndim == 2:  # is the image grayscale?
            plt.gray()  # use a grayscale colormap
        plt.imshow(image)
        a.set_title(title)
        n += 1
    # fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show()
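A usage sketch with illustrative file names (not actual dataset paths):

# Show two images side by side with custom titles.
show_images(['static/Corel5K/test/1001.jpeg', 'static/Corel5K/test/1002.jpeg'],
            titles=['query', 'nearest neighbour'])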
Example #9
from sklearn.metrics import pairwise

# return text vectors calculated using Word2Vec by gensim
"""
open the annotation text file and read content
build word vectors using Word2Vec and then extract
the term/vector pairs into a dictionary
"""
# get all filtered term(tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile(
    "Corel5k/terms_corel5k_filtered.pkl")
# get training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile(
    "Corel5k/train_anno_filtered.pkl")

# load the pre-trained language model (word2vec binary format)
word_model = gensim.models.Word2Vec.load_word2vec_format(
    utilites.getAbsPath(setup.lmodel_file_path), binary=True)
"""
Calculate similarity matrix using given vectors
We use pairwise distances to build the matrix
"""
print("Extracting word vectors...")
vecs = []
# index2word is a list that contains the names of the words in the model's vocabulary
for word in terms_corel5k_filtered:
    vecs.append(word_model[word])  # extract each term's vector from the model

print("Term vectors haven been created.")
d_pairwise_vecs = 1 - pairwise.pairwise_distances(vecs, metric='cosine')
print("Similarity matrix has been built.")
Example #11
# get all terms from txt file
"""
test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt'))
test_file_list = test_file.readlines()
test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list]
utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl'))
"""
test_file_list = utilites.loadVariableFromFile('static/Corel5K/corel5k_test_list.pkl')
train_file_path = utilites.loadVariableFromFile("static/Corel5K/train_file_path.pkl")
test_anno = utilites.loadVariableFromFile('static/Corel5K/test_anno_filtered.pkl')
train_anno = utilites.loadVariableFromFile('static/Corel5K/train_anno_filtered.pkl')
train_anno_concept = utilites.loadVariableFromFile("static/Corel5K/train_anno_concept.pkl")
test_anno_concept = utilites.loadVariableFromFile("static/Corel5K/test_anno_concept.pkl")
all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl")
concepts = utilites.loadVariableFromFile("static/Corel5K/cluster_contents.pkl")
train_vectors = loadmat(utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
train_vectors = train_vectors['train_vectors']
test_vectors = loadmat(utilites.getAbsPath('static/Corel5K/test_vectors_original.mat'))
test_vectors = test_vectors['test_vectors']

train_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'))
train_vectors_classic = train_vectors_classic['corel5k_train_feats_classic']

test_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_test_feats_classic.mat'))
test_vectors_classic = test_vectors_classic['corel5k_test_feats_classic']

print("Loading forest model...")
tic()
forest = utilites.loadVariableFromFile("static/Corel5K/forest/forest_128.pkl")
Example #12
def parse_forest(sample, forest):
    # parse a test sample down every tree and aggregate the per-tree results
    a_rc = []
    a_rs = []

    for tree in forest:
        rc, rs = parse_single_tree(sample, tree)
        a_rc.append(rc.label_count)
        a_rs = a_rs + list(rs)

    a_rc = np.asarray(a_rc)
    sum_a_rc = np.sum(a_rc, axis=0)  # get count in all trees for each concept
    sum_a_rs = Counter(a_rs)

    return sum_a_rc, sum_a_rs


# test code here
test_original = loadmat(
    utilites.getAbsPath('Corel5K/test_vectors_original.mat'))
test_original = test_original['test_vectors']
test_sample = test_original[0]

forest = load_forest()
# rc, rs = parse_single_tree(test_sample, forest[0])
# src: sum of concept count, srs: sum of retrieved sample count
src, srs = parse_forest(test_sample, forest)
label_name = range(100)
src_dict = dict(zip(label_name, src))

src_sorted = sorted(src_dict.items(), key=itemgetter(1))[::-1]
srs_sorted = sorted(srs.items(), key=itemgetter(1))[::-1]
src_top10 = src_sorted[0:10]
srs_top10 = srs_sorted[0:10]
def toc(tempBool=True):
    # Prints the time difference yielded by generator instance TicToc
    tempTimeInterval = next(TicToc)
    if tempBool:
        print( "Elapsed time: %f seconds.\n" %tempTimeInterval )

def tic():
    # Records a time in TicToc, marks the beginning of a time interval
    toc(False)

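These helpers advance a generator instance named TicToc that this example never defines. A minimal sketch of the assumed definition plus a usage example; this is the common tic/toc generator pattern, not code confirmed from this project:

import time

def TicTocGenerator():
    # yields the wall-clock time elapsed since the generator was last advanced
    tf = time.time()
    while True:
        ti, tf = tf, time.time()
        yield tf - ti

TicToc = TicTocGenerator()  # assumed generator instance used by tic()/toc()

# usage: bracket a block of work with tic()/toc()
tic()
_ = sum(x * x for x in range(10 ** 6))
toc()  # prints "Elapsed time: ... seconds."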
"""
Prepare data
"""
# START TO RUN HERE
# load datasets
train_original = loadmat(utilites.getAbsPath('Corel5K/train_vectors_original.mat'))
test_original = loadmat(utilites.getAbsPath('Corel5K/test_vectors_original.mat'))
train_original = train_original['train_vectors']
test_original = test_original['test_vectors']

# cast the feature vectors to float32 (no l2 normalization is actually applied here)
train_original_list = [tr.astype('float32') for tr in train_original]

# construct a pairwise cosine-similarity matrix to look up (affinity matrix)
d_cos_train_vecs = 1 - pairwise.pairwise_distances(train_original_list, metric='cosine')
# d_eu_train_vecs = pairwise.pairwise_distances(train_pca_300_list, metric='euclidean')

# get all terms from txt file
terms_file = open(utilites.getAbsPath('Corel5K/corel5k_words.txt'))
terms_corel5k = terms_file.readlines()
print("Term file " + terms_file.name + " was loaded.")