import numpy as np
import torch
import torch.nn as nn


def __init__(self, target_vocab, vectors, config=None):
    super(SentenceEncoder, self).__init__()
    if config is not None:
        self.config = config

    self.embedding_weights_matrix = create_embedding_matrix(
        self.config, target_vocab, vectors)
    self.embedding_layer, self.num_embeddings, self.embedding_dim = create_emb_layer(
        self.embedding_weights_matrix)
    print("self.embedding_layer", self.embedding_layer)
    print("self.num_embeddings", self.num_embeddings)
    print("self.embedding_dim", self.embedding_dim)

    # Each entry of FILTER_SIZES is a (kernel_height, out_channels) pair;
    # the encoder output size is the total number of output channels.
    Ks = np.array(self.config.sentence_enc.FILTER_SIZES)
    self.embedding_size = np.sum(Ks[:, 1])

    # A (k, word_dim) kernel is two-dimensional, so these must be Conv2d
    # layers: Conv1d builds a 4-D weight from a 2-tuple kernel_size and
    # fails in the forward pass.
    self.convs = nn.ModuleList([
        nn.Conv2d(
            1, out_channels=out_c,
            kernel_size=(k, self.config.dataset_options.WORD_DIMENTIONS))
        for (k, out_c) in Ks
    ])
    self.max_pool = nn.MaxPool1d(
        kernel_size=self.config.dataset_options.MAX_SENTENCES_PER_DOCUMENT)
    self.highway_layer = Highway(size=self.embedding_size, num_layers=1,
                                 f=torch.nn.functional.relu)
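# The constructor above calls Highway(size=..., num_layers=1, f=relu), but the
# Highway class is not shown. Below is a minimal sketch in the standard
# highway-network formulation, y = g * f(W x) + (1 - g) * x with a sigmoid
# transform gate g; the actual class in this codebase may differ in detail.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    def __init__(self, size, num_layers=1, f=F.relu):
        super(Highway, self).__init__()
        self.num_layers = num_layers
        self.f = f
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)])

    def forward(self, x):
        for i in range(self.num_layers):
            g = torch.sigmoid(self.gate[i](x))   # transform gate in (0, 1)
            h = self.f(self.nonlinear[i](x))     # candidate transform
            x = g * h + (1.0 - g) * x            # gated mix with the identity path
        return x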
x, input_word_index = utils.tokenize_sequence(input_sentences, filters,
                                              config['encoder_num_tokens'],
                                              config['encoder_vocab'])
y, output_word_index = utils.tokenize_sequence(output_sentences, filters,
                                               config['decoder_num_tokens'],
                                               config['decoder_vocab'])

print('[INFO] Split data into train-validation-test sets')
dataset_sizes = [train_data.shape[0], val_data.shape[0], test_data.shape[0]]
x_train, y_train, x_val, y_val, x_test, y_test = utils.create_data_split(x, y, dataset_sizes)

encoder_embeddings_matrix = utils.create_embedding_matrix(
    input_word_index, config['embedding_size'], w2v_path)
decoder_embeddings_matrix = utils.create_embedding_matrix(
    output_word_index, config['embedding_size'], w2v_path)

# Re-calculate the vocab sizes based on the word_index dictionaries
config['encoder_vocab'] = len(input_word_index)
config['decoder_vocab'] = len(output_word_index)

#----------------------------------------------------------------#
# The constructor call is truncated in the source; passing the two word
# indices (mirroring the DetWAEModel call in the next snippet) is an assumption.
model = StochasticWEDModel(config, encoder_embeddings_matrix, decoder_embeddings_matrix,
                           input_word_index, output_word_index)
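# utils.create_data_split is not shown. Given that dataset_sizes carries the
# row counts of three pre-made splits, a plausible reading is a sequential
# slice of the tokenized arrays; this is a hedged sketch, and the real helper
# may shuffle or validate lengths.
def create_data_split(x, y, dataset_sizes):
    n_train, n_val, n_test = dataset_sizes
    x_train, y_train = x[:n_train], y[:n_train]
    x_val, y_val = x[n_train:n_train + n_val], y[n_train:n_train + n_val]
    x_test = x[n_train + n_val:n_train + n_val + n_test]
    y_test = y[n_train + n_val:n_train + n_val + n_test]
    return x_train, y_train, x_val, y_val, x_test, y_test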
np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences, filters,
                                        config['num_tokens'], config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
# 90% train, then a 50/50 split of the remainder into validation and test.
x_train, _x_val_test = train_test_split(x, test_size=0.1, random_state=10)
x_val, x_test = train_test_split(_x_val_test, test_size=0.5, random_state=10)

w2v = config['w2v_file']
embeddings_matrix = utils.create_embedding_matrix(word_index, config['embedding_size'], w2v)

# Re-calculate the vocab size based on the word_index dictionary
config['vocab_size'] = len(word_index)

#----------------------------------------------------------------#
model = DetWAEModel(config, embeddings_matrix, word_index)
#----------------------------------------------------------------#

checkpoint = config['ckpt']
with tf.Session() as sess:
    ...  # session body truncated in the source
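# utils.tokenize_sequence(sentences, filters, num_tokens, vocab_size) is not
# shown either. A minimal sketch using the Keras text utilities (consistent
# with the TF1-style tf.Session above); the actual helper may add special
# tokens or truncate differently.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def tokenize_sequence(sentences, filters, max_num_tokens, max_vocab_size):
    tokenizer = Tokenizer(num_words=max_vocab_size, filters=filters)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    # Pad/trim every sequence to a fixed length for batched training.
    padded = pad_sequences(sequences, maxlen=max_num_tokens, padding='post')
    return padded, tokenizer.word_index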
import numpy as np
from gensim.models import Word2Vec

##
# from shutil import copy
# for isbn in X_isbns:
#     source_path = 'D:/PythonOK/图书封面影响/covers/%s.jpg' % isbn
#     target_path = 'D:/PythonOK/图书封面影响/covers_subset'
#     copy(source_path, target_path)
##

# Load the word-vector model and build the embedding matrix:
wv_path = '../../wv/wikibaikeWV250/wikibaikewv250'
print("Loading word2vec model, may take a few minutes......")
if 'wvmodel' not in vars():  # avoid reloading the model on repeated runs
    wvmodel = Word2Vec.load(wv_path)
wvdim = 250
embedding_matrix = create_embedding_matrix(wvmodel, vocab_size, wvdim, freq_word_index)

# ==============================
# Shuffle the samples; every array is permuted with the same indices so the
# features stay aligned with the labels.
indexs = np.random.permutation(range(len(X_isbns)))
X_isbns = np.array(X_isbns)[indexs]
X_colors = np.array(X_colors)[indexs]
X_img = np.array(X_img)[indexs]
X_title = X_title[indexs]
Y = np.array(Y)[indexs]

# Gather additional features, e.g. price and publisher:
X_price = []
X_pub = []
for isbn in X_isbns:
    ...  # loop body truncated in the source
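# create_embedding_matrix here takes an already-loaded gensim Word2Vec model
# (unlike the path-based variant in the earlier snippets). A hedged sketch of
# what such a builder typically looks like: row 0 is reserved for padding and
# out-of-vocabulary words keep zero rows. The real helper may differ.
import numpy as np


def create_embedding_matrix(wvmodel, vocab_size, wvdim, freq_word_index):
    matrix = np.zeros((vocab_size + 1, wvdim), dtype=np.float32)  # row 0 = padding
    for word, idx in freq_word_index.items():
        if idx <= vocab_size and word in wvmodel.wv:
            matrix[idx] = wvmodel.wv[word]  # copy the pretrained vector
    return matrix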