Example no. 1
    def create_img_document_topic_matrix(self):
        print("===============================================================")
        print("Creating teaser img_doc-topic matrix - loading visual features and applying LDA.\n")

        self.read_visual_image_ids()

        if FileIOManager.existFile(dictionary_prefix+img_matrix_filename):
            self.img_document_topics = FileIOManager.load_from_file(dictionary_prefix+img_matrix_filename)
            self.similarity_index = similarities.Similarity.load(dictionary_prefix+similarity_index_filename)
            return

        lines = FileIOManager.read_teaser_visual_file()
        for line in lines:
            img_doc = utils.generate_corpus_for_image(
                line, self.data_manager.textual_dictionary.features_names2id)
            topic_vector = self.lda.document_topics_inference(img_doc, min_probability=0.0001)
            # Add into the list.
            self.img_document_topics.append(topic_vector)

        # Build similarity index.
        self.similarity_index = similarities.Similarity(dictionary_prefix+index_filename, self.img_document_topics,
                                                        num_features=self.lda.num_topics)
        self.similarity_index.save(dictionary_prefix+similarity_index_filename)
        FileIOManager.save_to_file(dictionary_prefix+img_matrix_filename, self.img_document_topics)
        testing_img_count = len(self.img_document_topics)
        print("Testing img count is %d\n" % (testing_img_count))
Example no. 2
    def create_testing_textual_document_topic_matrix(self):
        print("===============================================================")
        print("Creating testing textual_doc-topic matrix - loading teaser1 textual data and applying LDA.\n")

        if FileIOManager.existFile(dictionary_prefix +
                                   testing_textual_matrix_filename):
            self.testing_textual_document_topics = FileIOManager.load_from_file(
                dictionary_prefix + testing_textual_matrix_filename)
            self.testing_textual_count = len(
                self.testing_textual_document_topics)
            print("Testing textual count is %d\n" %
                  (self.testing_textual_count))
            return

        number_of_lines = 0
        lines = FileIOManager.read_testing_textual_file()
        for line in lines:
            number_of_lines += 1
            corpus_line_dict = dict()
            line_words = line.split()
            number_of_features = int(line_words[1])
            line_words = line_words[2:]
            for j in range(0, number_of_features * 2, 2):
                word = self.data_manager.textual_dictionary.processWord(
                    line_words[j].decode('utf-8'))
                # Normalize weight
                weight = float(line_words[j + 1]) / 100000
                if word not in self.data_manager.textual_dictionary.word2id:
                    continue
                # Get word id
                word_id = self.data_manager.textual_dictionary.word2id[word]
                if word_id not in corpus_line_dict:
                    corpus_line_dict[word_id] = weight
                else:
                    corpus_line_dict[word_id] += weight
            # Create array of tuples (word_id, weight) from dictionary
            corpus_line = []
            for key, value in corpus_line_dict.items():
                corpus_line.append((key, value))

            # Normalize to one vector
            corpus_line = matutils.unitvec(corpus_line)

            # Add into the list.
            self.testing_textual_document_topics.append(
                self.lda.document_topics_inference(corpus_line,
                                                   min_probability=0.001))

            # Debug output
            if (number_of_lines % 10000 == 0):
                print("Read %d" % (number_of_lines))

        FileIOManager.save_to_file(
            dictionary_prefix + testing_textual_matrix_filename,
            self.testing_textual_document_topics)
        self.testing_textual_count = len(self.testing_textual_document_topics)
        print("Testing textual count is %d\n" % (self.testing_textual_count))
Example no. 3
    def create_gensim_dictionary(self):
        '''
        Create vocabulary using gensim dictionary.
        :return:
        '''
        print("===============================================================")
        print("Creating vocabulary by dictionary in gensim - loading textual features.\n")

        if FileIOManager.existFile(dictionary_prefix +
                                   gensim_dictionary_filename):
            self.load_gensim_dictionary()
            return

        # Generate visual feature names
        features = [self._generate_visual_feature_names()]

        corpus = []
        lines = FileIOManager.read_textual_file_words()
        number_of_lines = 0
        for line in lines:
            line = [self.processWord(word.decode('utf-8')) for word in line]
            corpus.append(line)
            number_of_lines += 1
            # Debug output
            if (number_of_lines % 10000 == 0):
                print("Read %d" % (number_of_lines))
        dictionary = corpora.Dictionary(corpus)
        print(dictionary)
        # remove stop words and words that appear only once
        stop_ids = [
            dictionary.token2id[stopword] for stopword in self.stop
            if stopword in dictionary.token2id
        ]
        once_ids = [
            tokenid for tokenid, docfreq in dictionary.dfs.items()
            if docfreq == 1
        ]
        dictionary.filter_tokens(stop_ids + once_ids)
        dictionary.compactify()  # remove gaps in id sequence after removal
        print(dictionary)
        dictionary.add_documents(features)
        print(dictionary)
        self._create_feature_name2id_list()
        dictionary.save(dictionary_prefix + gensim_dictionary_filename)
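
Once saved, the gensim dictionary maps tokens to integer ids and converts token lists into the sparse bag-of-words vectors that LDA training consumes. A quick illustration with gensim's standard API on toy data (not the project's corpus):

from gensim import corpora

docs = [["beach", "sea", "beach"], ["mountain", "snow"]]
dictionary = corpora.Dictionary(docs)
# doc2bow returns sparse (token_id, count) pairs; unknown tokens are dropped.
print(dictionary.doc2bow(["beach", "sea", "sea", "unknown"]))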
Example no. 4
    def yield_testing_textual_document_topics(self):
        number_of_lines = 0
        lines = FileIOManager.read_testing_textual_file()
        for line in lines:
            number_of_lines += 1
            corpus_line_dict = dict()
            line_words = line.split()
            number_of_features = int(line_words[1])
            line_words = line_words[2:]
            for j in range(0, number_of_features*2, 2):
                word = self.data_manager.textual_dictionary.processWord(line_words[j].decode('utf-8'))
                # Normalize weight
                weight = float(line_words[j + 1]) / 100000
                if word not in self.data_manager.textual_dictionary.word2id:
                    continue
                # Get word id
                word_id = self.data_manager.textual_dictionary.word2id[word]
                if word_id not in corpus_line_dict:
                    corpus_line_dict[word_id] = weight
                else:
                    corpus_line_dict[word_id] += weight
            # Create array of tuples (word_id, weight) from dictionary
            corpus_line = []
            for key, value in corpus_line_dict.items():
                corpus_line.append((key, value))

            # Normalize to one vector
            corpus_line = matutils.unitvec(corpus_line)

            # Infer the topic distribution for this document.
            corpus_line = self.lda.document_topics_inference(corpus_line, min_probability=0.001)

            yield corpus_line
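
The sparse vector is L2-normalized before inference. gensim's matutils.unitvec works directly on sparse (id, weight) lists; a quick illustration:

from gensim import matutils

# unitvec scales a sparse bag-of-words so its L2 norm is 1.
vec = [(0, 3.0), (2, 4.0)]
print(matutils.unitvec(vec))  # [(0, 0.6), (2, 0.8)]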
Example no. 5
    def load_testing_img_ids(self):
        '''
        Load testing image ids.
        :return:
        '''

        if FileIOManager.existFile(dictionary_prefix+test_image_ids_filename):
            self.textual_test_image_ids = FileIOManager.load_from_file(dictionary_prefix+test_image_ids_filename)
            return

        lines = FileIOManager.read_image_ids_from_filename(FileIOManager.testing_img_ids_filename)
        for line in lines:
            self.textual_test_image_ids.append(line)

        print(len(self.textual_test_image_ids), len(self.textual_train_image_ids))

        FileIOManager.save_to_file(dictionary_prefix+test_image_ids_filename, self.textual_test_image_ids)
Example no. 6
    def count_textual_training_img_ids(self):
        '''
        Count and store textual training img ids.
        :return:
        '''

        if FileIOManager.existFile(dictionary_prefix+train_image_ids_filename):
            self.textual_train_image_ids = FileIOManager.load_from_file(dictionary_prefix+train_image_ids_filename)
            return

        self.textual_train_image_ids = []
        lines = FileIOManager.read_textual_file()
        for line in lines:
            line_words = line.split()
            self.textual_train_image_ids.append(line_words[0])

        FileIOManager.save_to_file(dictionary_prefix+train_image_ids_filename, self.textual_train_image_ids)
Example no. 7
    def __iter__(self):
        textual_lines = FileIOManager.read_textual_file()
        visual_file = open(FileIOManager.images_features_path, 'r')
        visual_file.readline()
        number_of_lines = 0
        for textual_line in textual_lines:
            number_of_lines += 1
            if self.limited_length is not None and number_of_lines > self.limited_length:
                break

            corpus_line_dict = dict()
            line_words = textual_line.split()
            textual_img_id = line_words[0]
            number_of_features = int(line_words[1])
            line_words = line_words[2:]
            for j in range(0, number_of_features * 2, 2):
                word = self.dictionary.processWord(
                    line_words[j].decode('utf-8'))
                # Normalize weight
                weight = float(line_words[j + 1]) / 100000
                if word not in self.dictionary.word2id:
                    continue
                # Get word id
                word_id = self.dictionary.word2id[word]
                if word_id not in corpus_line_dict:
                    corpus_line_dict[word_id] = weight
                else:
                    corpus_line_dict[word_id] += weight

            # Create array of tuples (word_id, weight) from dictionary
            corpus_line = []
            for key, value in corpus_line_dict.items():
                corpus_line.append((key, value))

            # Normalize to unit vector
            corpus_line = matutils.unitvec(corpus_line)

            # Advance the visual file to the line matching this image id
            visual_line = visual_file.readline().split()
            image_id = visual_line[0]
            while image_id != textual_img_id:
                visual_line = visual_file.readline().split()
                image_id = visual_line[0]

            # Append visual features
            corpus_line = corpus_line + utils.generate_corpus_for_image(
                visual_line[1:], self.dictionary.features_names2id)

            # Normalize to unit vector
            corpus_line = matutils.unitvec(corpus_line)

            yield corpus_line
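
Defining __iter__ this way turns the class into a streamable gensim-style corpus: every pass re-reads the files and yields one combined textual+visual document at a time, so the full corpus never has to fit in memory. A usage sketch (the class name and constructor arguments are assumptions, since they are not shown):

# Hypothetical instantiation of the corpus class whose __iter__ is shown above.
corpus = MultimodalCorpus(dictionary=textual_dictionary, limited_length=None)
for document in corpus:
    pass  # each document is a unit-normalized [(feature_id, weight), ...] list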
Example no. 8
    def create_custom_vocabulary(self):
        '''
        Create custom vocabulary by searching through textual training file.
        :return:
        '''
        print("===============================================================")
        print("Creating vocabulary - loading textual features.\n")
        # Generate feature names
        self._generate_visual_feature_names()

        if FileIOManager.existFile(dictionary_prefix + id2word_filename):
            self.load_vocabulary()
            return

        number_of_lines = 0
        lines = FileIOManager.read_textual_file()
        for line in lines:
            number_of_lines += 1
            # Get only words
            line_words = line.split()[2::2]
            for word in line_words:
                word = self.processWord(word.decode('utf-8'))
                # Assign an id only the first time a non-stop word is seen.
                if word not in self.stop and word not in self.unique_words:
                    self.unique_words.add(word)
                    self.id2word[self.unique_words_counter] = word
                    self.word2id[word] = self.unique_words_counter
                    self.unique_words_counter += 1
            # Debug output
            if (number_of_lines % 10000 == 0):
                print("Read %d" % (number_of_lines))
        self.textual_docs_count = number_of_lines
        print("Number of text examples read:", number_of_lines)
        print("Number of unique words:", len(self.unique_words))
        # Delete unnecessary variables
        self.unique_words = None
        # Save vocabulary into file.
        self.save_vocabulary()
Example no. 9
    def train_lda(self):
        print("===============================================================")
        print("Training LDA.\n")
        if FileIOManager.existFile(lda_model_filename):
            self.lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_filename)
            self.num_topics = self.lda_model.num_topics
            return

        self.lda_model = gensim.models.ldamodel.LdaModel(
            corpus=self.corpus,
            id2word=dict((v, k) for k, v in self.data_manager.textual_dictionary.word2id.items()),
            num_topics=50, update_every=1, passes=1, chunksize=8000)
        # Save the model.
        self.lda_model.save(lda_model_filename)
        self.num_topics = self.lda_model.num_topics
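
After training (or loading) the model, a topic distribution can be inferred for any bag-of-words vector with gensim's standard API. A brief sketch, assuming "trainer" is the object holding the model above:

# Toy sparse (word_id, weight) document; real documents come from the corpus.
bow = [(0, 1.0), (5, 2.0)]
# Returns [(topic_id, probability), ...] above the probability cutoff.
print(trainer.lda_model.get_document_topics(bow, minimum_probability=0.001))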
Example no. 10
    def create_testing_img_document_topic_matrix(self):
        print("===============================================================")
        print("Creating testing img_doc-topic matrix - loading visual features and applying LDA.\n")

        if FileIOManager.existFile(dictionary_prefix+testing_img_matrix_filename):
            self.testing_img_document_topics = FileIOManager.load_from_file(dictionary_prefix+testing_img_matrix_filename)
            #self.all_img_ids = FileIOManager.load_from_file(dictionary_prefix+all_img_ids_filename)
            self.testing_img_ids = FileIOManager.load_from_file(dictionary_prefix+testing_img_ids_filename)
            self.similarity_index = similarities.Similarity.load(dictionary_prefix+similarity_index_filename)
            self.testing_img_count = len(self.testing_img_document_topics)
            print("Testing img count is %d\n" % (self.testing_img_count))
            return

        lines = FileIOManager.read_visual_file()
        textual_train_images_length = len(self.data_manager.textual_train_image_ids)
        train_index = images_index = 0
        for line in lines:
            image_id = line[0]
            #self.all_img_ids[image_id] = images_index
            if (train_index < textual_train_images_length and
                    image_id == self.data_manager.textual_train_image_ids[train_index]):
                train_index += 1
            else:
                self.testing_img_ids.append(image_id)
                img_doc = utils.generate_corpus_for_image(line[1:],
                                                          self.data_manager.textual_dictionary.features_names2id)
                topic_vector = self.lda.document_topics_inference(img_doc, min_probability=0.001)
                #topic_vector = self.lda.document_topic_inference_chunk([img_doc])
                # Add into the list.
                self.testing_img_document_topics.append(topic_vector)
            images_index += 1
            # Debug print
            if (images_index % 4000 == 0):
                print("Read visual features %d" % (images_index))

        # Build similarity index.
        self.similarity_index = similarities.Similarity(dictionary_prefix+index_filename,
                                                        self.testing_img_document_topics, num_features=self.lda.num_topics)
        self.similarity_index.save(dictionary_prefix+similarity_index_filename)

        FileIOManager.save_to_file(dictionary_prefix+testing_img_matrix_filename, self.testing_img_document_topics)
        #FileIOManager.save_to_file(dictionary_prefix+all_img_ids_filename, self.all_img_ids)
        FileIOManager.save_to_file(dictionary_prefix+testing_img_ids_filename, self.testing_img_ids)
        self.testing_img_count = len(self.testing_img_document_topics)
        assert self.testing_img_count == len(self.testing_img_ids)
        print("Testing img count is %d\n" % (self.testing_img_count))
Example no. 11
    def load_vocabulary(self):
        self.id2word = FileIOManager.load_from_file(dictionary_prefix + id2word_filename)
        self.word2id = FileIOManager.load_from_file(dictionary_prefix + word2id_filename)
Example no. 12
    def save_vocabulary(self):
        FileIOManager.save_to_file(dictionary_prefix + id2word_filename, self.id2word)
        FileIOManager.save_to_file(dictionary_prefix + word2id_filename, self.word2id)
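
FileIOManager.save_to_file / load_from_file / existFile are not shown in these examples; a minimal pickle-based sketch of what such helpers typically look like (an assumption, not the project's actual implementation):

import os
import pickle

def save_to_file(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

def load_from_file(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def existFile(filename):
    return os.path.isfile(filename)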
Example no. 13
import TestingDataSharePh1 as ds
import myStringLib as ms
import AssembleData as ad
import FileIOManager as fm

op = fm.OperateFiles()


class FileControl:
    def __init__(self, c):
        self.c = c

        self.sWindow = []

        self.sWinSend = {}
        self.sWinRecv = {}

        self.files = {}

    def getSetWindow(self, sWin):
        if sWin not in self.sWindow:
            self.sWindow.append(sWin)
            self.sWinSend[sWin] = []
            self.sWinRecv[sWin] = []

    def onSendStart(self, sWin, fileName):
        self.setSendFile(sWin, fileName)

    def onSendEnd(self, sWin, fileName):
        # Original body not included in this snippet; pass keeps the class valid.
        pass
Example no. 14
import time
import socket
import myStringLib as ms
import ControlUnit as cu
import os
from cryptography.fernet import Fernet
import sys
import numpy as np
import threading
import AssembleData as ad
import DataShare as ds
import dbquery2 as db
import FileIOManager as fim


opFile = fim.OperateFiles()

key = b'eQ5jxFcJNYII5Z4vhBtvT-mNiqx64yQEUln1SOoYEDA='
fernet = Fernet(key)


def prepSend(data):
    # Serialize a (possibly None) array-like payload: record its dtype and
    # shape as strings, then text-encode the raw bytes for transmission.
    if data is None:
        data = []
    data = np.array(data)
    types = str(data.dtype)
    shape = str(data.shape)
    data = bytes(data)
    data = ds.encb(data)
    data = data.decode()
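
The snippet ends mid-function, but the dtype and shape strings it records suggest the receiving side rebuilds the array from them. A hedged sketch of that inverse step (recoverData is a hypothetical name, and ds.encb is assumed to be a base64-style encode):

import ast
import base64

import numpy as np

def recoverData(types, shape, data):
    # Hypothetical inverse of prepSend: undo the text encoding, then
    # rebuild the array from its serialized dtype and shape strings.
    raw = base64.b64decode(data.encode())
    arr = np.frombuffer(raw, dtype=np.dtype(types))
    return arr.reshape(ast.literal_eval(shape))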