def create_img_document_topic_matrix(self):
    print("===============================================================")
    print("Creating teaser img_doc-topic matrix - loading visual features and applying LDA.\n")
    self.read_visual_image_ids()
    if FileIOManager.existFile(dictionary_prefix + img_matrix_filename):
        self.img_document_topics = FileIOManager.load_from_file(dictionary_prefix + img_matrix_filename)
        self.similarity_index = similarities.Similarity.load(dictionary_prefix + similarity_index_filename)
        return
    lines = FileIOManager.read_teaser_visual_file()
    for line in lines:
        img_doc = utils.generate_corpus_for_image(line, self.data_manager.textual_dictionary.features_names2id)
        topic_vector = self.lda.document_topics_inference(img_doc, min_probability=0.0001)
        # Add into the list.
        self.img_document_topics.append(topic_vector)
    # Build similarity index.
    self.similarity_index = similarities.Similarity(dictionary_prefix + index_filename,
                                                    self.img_document_topics,
                                                    num_features=self.lda.num_topics)
    self.similarity_index.save(dictionary_prefix + similarity_index_filename)
    FileIOManager.save_to_file(dictionary_prefix + img_matrix_filename, self.img_document_topics)
    testing_img_count = len(self.img_document_topics)
    print("Testing img count is %d\n" % testing_img_count)
def create_testing_textual_document_topic_matrix(self):
    print("===============================================================")
    print("Creating testing textual_doc-topic matrix - loading teaser1 textual data and applying LDA.\n")
    if FileIOManager.existFile(dictionary_prefix + testing_textual_matrix_filename):
        self.testing_textual_document_topics = FileIOManager.load_from_file(dictionary_prefix + testing_textual_matrix_filename)
        self.testing_textual_count = len(self.testing_textual_document_topics)
        print("Testing textual count is %d\n" % self.testing_textual_count)
        return
    number_of_lines = 0
    lines = FileIOManager.read_testing_textual_file()
    for line in lines:
        number_of_lines += 1
        corpus_line_dict = dict()
        line_words = line.split()
        number_of_features = int(line_words[1])
        line_words = line_words[2:]
        for j in range(0, number_of_features * 2, 2):
            word = self.data_manager.textual_dictionary.processWord(line_words[j].decode('utf-8'))
            # Normalize weight.
            weight = float(line_words[j + 1]) / 100000
            if word not in self.data_manager.textual_dictionary.word2id:
                continue
            # Get word id.
            word_id = self.data_manager.textual_dictionary.word2id[word]
            if word_id not in corpus_line_dict:
                corpus_line_dict[word_id] = weight
            else:
                corpus_line_dict[word_id] += weight
        # Create array of tuples (word_id, weight) from the dictionary.
        corpus_line = []
        for key, value in corpus_line_dict.iteritems():
            corpus_line.append((key, value))
        # Normalize to a unit vector.
        corpus_line = matutils.unitvec(corpus_line)
        # Add into the list.
        self.testing_textual_document_topics.append(
            self.lda.document_topics_inference(corpus_line, min_probability=0.001))
        # Debug output.
        if (number_of_lines + 1) % 10000 == 0:
            print("Read %d" % (number_of_lines + 1))
    FileIOManager.save_to_file(dictionary_prefix + testing_textual_matrix_filename,
                               self.testing_textual_document_topics)
    self.testing_textual_count = len(self.testing_textual_document_topics)
    print("Testing textual count is %d\n" % self.testing_textual_count)
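# Illustrative sketch (standalone, not part of the pipeline): how one testing
# line of the form "<img_id> <n_features> word1 weight1 word2 weight2 ..." is
# turned into the L2-normalized sparse bag-of-words fed to LDA above. The
# word2id argument stands in for self.data_manager.textual_dictionary.word2id.
from gensim import matutils

def _example_line_to_bow(line, word2id):
    words = line.split()
    n_features = int(words[1])
    words = words[2:]
    bow = {}
    for j in range(0, n_features * 2, 2):
        word, weight = words[j], float(words[j + 1]) / 100000
        if word in word2id:
            bow[word2id[word]] = bow.get(word2id[word], 0.0) + weight
    return matutils.unitvec(list(bow.items()))

# Two in-vocabulary words and one out-of-vocabulary word that gets skipped:
print(_example_line_to_bow("img42 3 cat 300000 dog 400000 xyzzy 100000",
                           {"cat": 0, "dog": 1}))
# -> [(0, 0.6), (1, 0.8)]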
def create_gensim_dictionary(self):
    '''
    Create vocabulary using gensim dictionary.
    :return:
    '''
    print("===============================================================")
    print("Creating vocabulary by dictionary in gensim - loading textual features.\n")
    if FileIOManager.existFile(dictionary_prefix + gensim_dictionary_filename):
        self.load_gensim_dictionary()
        return
    # Generate visual feature names.
    features = [self._generate_visual_feature_names()]
    corpus = []
    lines = FileIOManager.read_textual_file_words()
    number_of_lines = 0
    for line in lines:
        line = [self.processWord(word.decode('utf-8')) for word in line]
        corpus.append(line)
        number_of_lines += 1
        # Debug output.
        if (number_of_lines + 1) % 10000 == 0:
            print("Read %d" % (number_of_lines + 1))
    dictionary = corpora.Dictionary(corpus)
    print(dictionary)
    # Remove stop words and words that appear only once.
    stop_ids = [dictionary.token2id[stopword] for stopword in self.stop
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # remove gaps in the id sequence after removal
    print(dictionary)
    dictionary.add_documents(features)
    print(dictionary)
    self._create_feature_name2id_list()
    dictionary.save(dictionary_prefix + gensim_dictionary_filename)
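# Illustrative sketch (standalone): the gensim Dictionary pruning pattern used
# above, on a toy corpus. filter_tokens() drops the given token ids and
# compactify() reassigns the survivors to a gap-free 0..N-1 id range.
from gensim import corpora

toy_corpus = [["the", "red", "cat"], ["the", "blue", "dog"], ["red", "dog"]]
d = corpora.Dictionary(toy_corpus)
stop_ids = [d.token2id[w] for w in ["the"] if w in d.token2id]
once_ids = [tid for tid, df in d.dfs.items() if df == 1]  # "cat", "blue"
d.filter_tokens(stop_ids + once_ids)
d.compactify()
print(d.token2id)  # only "red" and "dog" remain, with compact ids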
def yield_testing_textual_document_topics(self):
    number_of_lines = 0
    lines = FileIOManager.read_testing_textual_file()
    for line in lines:
        number_of_lines += 1
        corpus_line_dict = dict()
        line_words = line.split()
        number_of_features = int(line_words[1])
        line_words = line_words[2:]
        for j in range(0, number_of_features * 2, 2):
            word = self.data_manager.textual_dictionary.processWord(line_words[j].decode('utf-8'))
            # Normalize weight.
            weight = float(line_words[j + 1]) / 100000
            if word not in self.data_manager.textual_dictionary.word2id:
                continue
            # Get word id.
            word_id = self.data_manager.textual_dictionary.word2id[word]
            if word_id not in corpus_line_dict:
                corpus_line_dict[word_id] = weight
            else:
                corpus_line_dict[word_id] += weight
        # Create array of tuples (word_id, weight) from the dictionary.
        corpus_line = []
        for key, value in corpus_line_dict.iteritems():
            corpus_line.append((key, value))
        # Normalize to a unit vector.
        corpus_line = matutils.unitvec(corpus_line)
        # Infer the topic distribution and yield one document at a time.
        corpus_line = self.lda.document_topics_inference(corpus_line, min_probability=0.001)
        yield corpus_line
def load_testing_img_ids(self):
    '''
    Load img testing ids.
    :return:
    '''
    if FileIOManager.existFile(dictionary_prefix + test_image_ids_filename):
        self.textual_test_image_ids = FileIOManager.load_from_file(dictionary_prefix + test_image_ids_filename)
        return
    lines = FileIOManager.read_image_ids_from_filename(FileIOManager.testing_img_ids_filename)
    for line in lines:
        self.textual_test_image_ids.append(line)
    print(len(self.textual_test_image_ids), len(self.textual_train_image_ids))
    FileIOManager.save_to_file(dictionary_prefix + test_image_ids_filename, self.textual_test_image_ids)
def count_textual_training_img_ids(self):
    '''
    Count and store textual training img ids.
    :return:
    '''
    if FileIOManager.existFile(dictionary_prefix + train_image_ids_filename):
        self.textual_train_image_ids = FileIOManager.load_from_file(dictionary_prefix + train_image_ids_filename)
        return
    self.textual_train_image_ids = []
    lines = FileIOManager.read_textual_file()
    for line in lines:
        line_words = line.split()
        self.textual_train_image_ids.append(line_words[0])
    FileIOManager.save_to_file(dictionary_prefix + train_image_ids_filename, self.textual_train_image_ids)
def __iter__(self):
    textual_lines = FileIOManager.read_textual_file()
    visual_file = open(FileIOManager.images_features_path, 'r')
    visual_file.readline()  # skip the header line
    number_of_lines = 0
    for textual_line in textual_lines:
        number_of_lines += 1
        if self.limited_length is not None and number_of_lines > self.limited_length:
            break
        corpus_line_dict = dict()
        line_words = textual_line.split()
        textual_img_id = line_words[0]
        number_of_features = int(line_words[1])
        line_words = line_words[2:]
        for j in range(0, number_of_features * 2, 2):
            word = self.dictionary.processWord(line_words[j].decode('utf-8'))
            # Normalize weight.
            weight = float(line_words[j + 1]) / 100000
            if word not in self.dictionary.word2id:
                continue
            # Get word id.
            word_id = self.dictionary.word2id[word]
            if word_id not in corpus_line_dict:
                corpus_line_dict[word_id] = weight
            else:
                corpus_line_dict[word_id] += weight
        # Create array of tuples (word_id, weight) from the dictionary.
        corpus_line = []
        for key, value in corpus_line_dict.iteritems():
            corpus_line.append((key, value))
        # Normalize to unit vector.
        corpus_line = matutils.unitvec(corpus_line)
        # Advance through the visual file until the line matching this image id;
        # this assumes both files list the training images in the same order.
        visual_line = visual_file.readline().split()
        image_id = visual_line[0]
        while image_id != textual_img_id:
            visual_line = visual_file.readline().split()
            image_id = visual_line[0]
        # Append visual features.
        corpus_line = corpus_line + utils.generate_corpus_for_image(
            visual_line[1:], self.dictionary.features_names2id)
        # Normalize to unit vector.
        corpus_line = matutils.unitvec(corpus_line)
        yield corpus_line
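# Illustrative sketch: a class exposing an __iter__ like the one above acts as a
# streaming gensim corpus, so it can be handed straight to LdaModel, which
# iterates it once per pass without materializing it in memory. The class name
# MultimodalCorpus and its constructor arguments are hypothetical here.
import gensim

corpus = MultimodalCorpus(dictionary=textual_dictionary, limited_length=None)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=textual_dictionary.id2word,
                                            num_topics=50,
                                            passes=1,
                                            chunksize=8000)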
def create_custom_vocabulary(self):
    '''
    Create custom vocabulary by searching through the textual training file.
    :return:
    '''
    print("===============================================================")
    print("Creating vocabulary - loading textual features.\n")
    # Generate feature names.
    self._generate_visual_feature_names()
    if FileIOManager.existFile(dictionary_prefix + id2word_filename):
        self.load_vocabulary()
        return
    number_of_lines = 0
    lines = FileIOManager.read_textual_file()
    for line in lines:
        number_of_lines += 1
        # Get only words (every other token, skipping the image id and count).
        line_words = line.split()[2::2]
        for word in line_words:
            word = self.processWord(word.decode('utf-8'))
            # Register only unseen, non-stop words so ids are not duplicated.
            if word not in self.stop and word not in self.unique_words:
                self.unique_words.add(word)
                self.id2word[self.unique_words_counter] = word
                self.word2id[word] = self.unique_words_counter
                self.unique_words_counter += 1
        # Debug output.
        if (number_of_lines + 1) % 10000 == 0:
            print("Read %d" % (number_of_lines + 1))
    self.textual_docs_count = number_of_lines
    print("Number of text examples read:", number_of_lines)
    print("Number of unique words:", len(self.unique_words))
    # Delete unnecessary variables.
    self.unique_words = None
    # Save vocabulary into file.
    self.save_vocabulary()
def train_lda(self):
    print("===============================================================")
    print("Training LDA.\n")
    if FileIOManager.existFile(lda_model_filename):
        self.lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_filename)
        self.num_topics = self.lda_model.num_topics
        return
    self.lda_model = gensim.models.ldamodel.LdaModel(
        corpus=self.corpus,
        id2word=dict((v, k) for k, v in self.data_manager.textual_dictionary.word2id.items()),
        num_topics=50,
        update_every=1,
        passes=1,
        chunksize=8000)
    # Save the model.
    self.lda_model.save(lda_model_filename)
    self.num_topics = self.lda_model.num_topics
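# Illustrative sketch: once trained (or reloaded via LdaModel.load as above),
# the model can be inspected and queried with the standard gensim API. The bow
# vector below is a made-up example document.
lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_filename)
print(lda_model.show_topics(num_topics=5, num_words=10))  # top words per topic
bow = [(0, 0.5), (3, 0.5)]  # sparse (word_id, weight) pairs
print(lda_model.get_document_topics(bow, minimum_probability=0.001))
# -> [(topic_id, probability), ...]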
def create_testing_img_document_topic_matrix(self):
    print("===============================================================")
    print("Creating testing img_doc-topic matrix - loading visual features and applying LDA.\n")
    if FileIOManager.existFile(dictionary_prefix + testing_img_matrix_filename):
        self.testing_img_document_topics = FileIOManager.load_from_file(dictionary_prefix + testing_img_matrix_filename)
        self.testing_img_ids = FileIOManager.load_from_file(dictionary_prefix + testing_img_ids_filename)
        self.similarity_index = similarities.Similarity.load(dictionary_prefix + similarity_index_filename)
        self.testing_img_count = len(self.testing_img_document_topics)
        print("Testing img count is %d\n" % self.testing_img_count)
        return
    lines = FileIOManager.read_visual_file()
    textual_train_images_length = len(self.data_manager.textual_train_image_ids)
    train_index = images_index = 0
    for line in lines:
        image_id = line[0]
        # Skip images that belong to the textual training set; the rest are testing images.
        if train_index < textual_train_images_length and \
                image_id == self.data_manager.textual_train_image_ids[train_index]:
            train_index += 1
        else:
            self.testing_img_ids.append(image_id)
            img_doc = utils.generate_corpus_for_image(line[1:], self.data_manager.textual_dictionary.features_names2id)
            topic_vector = self.lda.document_topics_inference(img_doc, min_probability=0.001)
            # Add into the list.
            self.testing_img_document_topics.append(topic_vector)
        images_index += 1
        # Debug print.
        if (images_index + 1) % 4000 == 0:
            print("Read visual features %d" % (images_index + 1))
    # Build similarity index.
    self.similarity_index = similarities.Similarity(dictionary_prefix + index_filename,
                                                    self.testing_img_document_topics,
                                                    num_features=self.lda.num_topics)
    self.similarity_index.save(dictionary_prefix + similarity_index_filename)
    FileIOManager.save_to_file(dictionary_prefix + testing_img_matrix_filename, self.testing_img_document_topics)
    FileIOManager.save_to_file(dictionary_prefix + testing_img_ids_filename, self.testing_img_ids)
    self.testing_img_count = len(self.testing_img_document_topics)
    assert self.testing_img_count == len(self.testing_img_ids)
    print("Testing img count is %d\n" % self.testing_img_count)
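# Illustrative sketch: querying the Similarity index built above. Indexing the
# object with a sparse LDA topic vector returns cosine similarities against
# every stored image vector; the top-scoring indices map back into
# testing_img_ids. query_topics is a made-up (topic_id, probability) vector.
query_topics = [(0, 0.7), (3, 0.3)]
sims = similarity_index[query_topics]  # numpy array, one score per image
best = sims.argsort()[::-1][:10]       # indices of the 10 most similar images
print([testing_img_ids[i] for i in best])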
def load_vocabulary(self):
    self.id2word = FileIOManager.load_from_file(dictionary_prefix + id2word_filename)
    self.word2id = FileIOManager.load_from_file(dictionary_prefix + word2id_filename)
def save_vocabulary(self):
    FileIOManager.save_to_file(dictionary_prefix + id2word_filename, self.id2word)
    FileIOManager.save_to_file(dictionary_prefix + word2id_filename, self.word2id)
import TestingDataSharePh1 as ds
import myStringLib as ms
import AssembleData as ad
import FileIOManager as fm

op = fm.OperateFiles()


class FileControl:
    def __init__(self, c):
        self.c = c
        self.sWindow = []
        self.sWinSend = {}
        self.sWinRecv = {}
        self.files = {}

    def getSetWindow(self, sWin):
        if sWin not in self.sWindow:
            self.sWindow.append(sWin)
            self.sWinSend[sWin] = []
            self.sWinRecv[sWin] = []

    def onSendStart(self, sWin, fileName):
        self.setSendFile(sWin, fileName)

    def onSendEnd(self, sWin, fileName):
import time
import socket
import myStringLib as ms
import ControlUnit as cu
import os
from cryptography.fernet import Fernet
import sys
import numpy as np
import threading
import AssembleData as ad
import DataShare as ds
import dbquery2 as db
import FileIOManager as fim

opFile = fim.OperateFiles()

key = b'eQ5jxFcJNYII5Z4vhBtvT-mNiqx64yQEUln1SOoYEDA='
fernet = Fernet(key)


def prepSend(data):
    # print(data)
    if data is None:
        data = []
    # Serialize: record dtype and shape, then encode the raw bytes for sending.
    data = np.array(data)
    types = str(data.dtype)
    shape = str(data.shape)
    data = bytes(data)
    data = ds.encb(data)
    data = data.decode()