def __init__(self, target_vocab, vectors, config=None):
        super(SentenceEncoder, self).__init__()

        if config is not None:
            self.config = config
        self.embedding_weights_matrix = create_embedding_matrix(
            self.config, target_vocab, vectors)
        self.embedding_layer, self.num_embeddings, self.embedding_dim = create_emb_layer(
            self.embedding_weights_matrix)

        print("self.embedding_layer", self.embedding_layer)
        print("self.num_embeddings", self.num_embeddings)
        print("self.embedding_dim", self.embedding_dim)

        # FILTER_SIZES is a list of (kernel_height, out_channels) pairs; the
        # sentence embedding size is the total number of feature maps.
        Ks = np.array(self.config.sentence_enc.FILTER_SIZES)
        self.embedding_size = np.sum(Ks[:, 1])
        # One convolution per filter size, each spanning the full word dimension
        # (nn.Conv2d rather than nn.Conv1d, since the kernel is two-dimensional).
        self.convs = nn.ModuleList([
            nn.Conv2d(
                in_channels=1,
                out_channels=out_c,
                kernel_size=(k, self.config.dataset_options.WORD_DIMENTIONS))
            for (k, out_c) in Ks
        ])
        self.max_pool = nn.MaxPool1d(
            kernel_size=self.config.dataset_options.MAX_SENTENCES_PER_DOCUMENT)
        self.highway_layer = Highway(size=self.embedding_size,
                                     num_layers=1,
                                     f=torch.nn.functional.relu)
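
The helper create_emb_layer is not shown in this example. Given how its three return values are unpacked above, a minimal sketch could look like the following; freezing the pretrained weights is an assumption, not something the snippet confirms.

import torch
import torch.nn as nn

def create_emb_layer_sketch(weights_matrix, trainable=False):
    # Hypothetical stand-in: wrap a pretrained (num_embeddings, embedding_dim)
    # weights matrix in an nn.Embedding and return it with its dimensions.
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight = nn.Parameter(torch.tensor(weights_matrix, dtype=torch.float))
    emb_layer.weight.requires_grad = trainable  # assumed frozen by default
    return emb_layer, num_embeddings, embedding_dim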
Example #2
x, input_word_index = utils.tokenize_sequence(input_sentences, 
                                                filters, 
                                                config['encoder_num_tokens'], 
                                                config['encoder_vocab'])

y, output_word_index = utils.tokenize_sequence(output_sentences, 
                                                filters, 
                                                config['decoder_num_tokens'], 
                                                config['decoder_vocab'])
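
# utils.tokenize_sequence is external to this snippet. Judging only from the
# call signature and the (sequences, word_index) return pair, a rough sketch
# using a Keras Tokenizer might look like this; the padding mode and the use
# of Keras at all are assumptions, not the actual utils implementation.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_sequence_sketch(sentences, filters, num_tokens, vocab_size):
    # Hypothetical stand-in for utils.tokenize_sequence
    tokenizer = Tokenizer(num_words=vocab_size, filters=filters)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=num_tokens, padding='post')
    return padded, tokenizer.word_index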

print('[INFO] Split data into train-validation-test sets')
dataset_sizes = [train_data.shape[0], val_data.shape[0], test_data.shape[0]]
x_train, y_train, x_val, y_val, x_test, y_test = utils.create_data_split(x, y, dataset_sizes)
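
# utils.create_data_split is likewise not defined here. Based on how its six
# return values are unpacked above, a plausible sketch (an assumption, not the
# actual utils code) simply slices the arrays by the given dataset sizes.
def create_data_split_sketch(x, y, dataset_sizes):
    n_train, n_val, n_test = dataset_sizes
    x_train, y_train = x[:n_train], y[:n_train]
    x_val, y_val = x[n_train:n_train + n_val], y[n_train:n_train + n_val]
    x_test = x[n_train + n_val:n_train + n_val + n_test]
    y_test = y[n_train + n_val:n_train + n_val + n_test]
    return x_train, y_train, x_val, y_val, x_test, y_test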

encoder_embeddings_matrix = utils.create_embedding_matrix(input_word_index, 
                                                               config['embedding_size'], 
                                                               w2v_path)

decoder_embeddings_matrix = utils.create_embedding_matrix(output_word_index, 
                                                               config['embedding_size'], 
                                                               w2v_path)
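
# utils.create_embedding_matrix is also external to this snippet. Given the
# (word_index, embedding_size, w2v_path) arguments, a minimal sketch could look
# like the following; the word2vec binary format and the random fallback for
# out-of-vocabulary words are assumptions.
import numpy as np
from gensim.models import KeyedVectors

def create_embedding_matrix_sketch(word_index, embedding_size, w2v_path):
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_size))
    for word, idx in word_index.items():
        if word in w2v:
            matrix[idx] = w2v[word]
    return matrix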

# Re-calculate the vocab size based on the word_idx dictionary
config['encoder_vocab'] = len(input_word_index)
config['decoder_vocab'] = len(output_word_index)

#----------------------------------------------------------------#

model = StochasticWEDModel(config, 
                   encoder_embeddings_matrix, 
                   decoder_embeddings_matrix, 
Example #3
np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences,
                                             filters,
                                             config['num_tokens'],
                                             config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
# 90% train; the remaining 10% is split evenly into validation and test sets
x_train, _x_val_test = train_test_split(x, test_size=0.1, random_state=10)
x_val, x_test = train_test_split(_x_val_test, test_size=0.5, random_state=10)

w2v = config['w2v_file']
embeddings_matrix = utils.create_embedding_matrix(word_index,
                                                  config['embedding_size'],
                                                  w2v)

# Re-calculate the vocab size based on the word_idx dictionary
config['vocab_size'] = len(word_index)

#----------------------------------------------------------------#

model = DetWAEModel(config,
                    embeddings_matrix,
                    word_index)
#----------------------------------------------------------------#

checkpoint = config['ckpt']

with tf.Session() as sess:
Example #4
##
# One-off preprocessing step (kept commented out): copy the cover images of the
# selected ISBNs into a local subset folder.
#from shutil import copy
#for isbn in X_isbns:
#    source_path = 'D:/PythonOK/图书封面影响/covers/%s.jpg'%isbn
#    target_path = 'D:/PythonOK/图书封面影响/covers_subset'
#    copy(source_path,target_path)
##

# Load the word-vector model and build the embedding matrix:
wv_path = '../../wv/wikibaikeWV250/wikibaikewv250'
print("Loading word2vec model, may take a few minutes......")
if 'wvmodel' not in vars():  # avoid reloading the model on repeated runs
    wvmodel = Word2Vec.load(wv_path)
wvdim = 250
embedding_matrix = create_embedding_matrix(wvmodel, vocab_size, wvdim, freq_word_index)
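
# create_embedding_matrix is defined elsewhere in this project. From the
# arguments used above (a gensim Word2Vec model, the vocabulary size, the
# vector dimension and a frequency-ranked word index), a rough sketch would
# be the following; zero vectors for out-of-vocabulary words are an assumption.
def create_embedding_matrix_sketch(wvmodel, vocab_size, wvdim, word_index):
    embedding_matrix = np.zeros((vocab_size + 1, wvdim))
    for word, idx in word_index.items():
        if idx <= vocab_size and word in wvmodel.wv:
            embedding_matrix[idx] = wvmodel.wv[word]
    return embedding_matrix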


# ==============================
# Shuffle the sample order (the same permutation keeps all feature arrays aligned)
indexs = np.random.permutation(range(len(X_isbns)))
X_isbns = np.array(X_isbns)[indexs]
X_colors = np.array(X_colors)[indexs]
X_img = np.array(X_img)[indexs]
X_title = X_title[indexs]
Y = np.array(Y)[indexs]

# Fetch the remaining metadata, e.g. price and publisher:
X_price = []
X_pub = []
for isbn in X_isbns: