def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble an ``EncoderDecoder`` model from hyperparameters.

    Args:
        src_vocab: Passed to the source-side ``Embedding`` as its first argument.
        tgt_vocab: Passed to the target-side ``Embedding`` and to ``Generator``.
        N: Passed to ``Encoder`` and ``Decoder`` together with one layer
            instance (presumably the number of stacked layers -- confirm
            against those classes).
        d_model: Model width handed to every sub-module constructor.
        d_ff: Second argument of ``PositionwiseFeedForward``.
        h: First argument of ``MultiHeadedAttention``.
        dropout: Dropout value forwarded to the attention, feed-forward,
            positional-encoding and layer constructors.

    Returns:
        The constructed ``EncoderDecoder`` with every parameter of more than
        one dimension re-initialised by Xavier/Glorot uniform initialisation.
    """
    # Prototype sub-modules; each consumer below receives its own ``copy(...)``.
    # NOTE(review): ``copy`` is resolved from the enclosing module. It needs to
    # be a deep copy for the layers not to share weights -- confirm the import.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    pe = PositionalEncoding(d_model, dropout)

    encoder_layer = EncoderLayer(d_model, copy(attn), copy(ff), dropout)
    encoder = Encoder(encoder_layer, N)

    decoder_layer = DecoderLayer(d_model, copy(attn), copy(attn), copy(ff), dropout)
    decoder = Decoder(decoder_layer, N)

    src_embed = nn.Sequential(Embedding(src_vocab, d_model), copy(pe))
    tgt_embed = nn.Sequential(Embedding(tgt_vocab, d_model), copy(pe))
    generator = Generator(d_model, tgt_vocab)

    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

    # Only tensors with more than one dimension are re-initialised; 1-D
    # parameters keep their constructor defaults.
    for p in model.parameters():
        if p.dim() > 1:
            # ``xavier_uniform_`` is the in-place, non-deprecated spelling of
            # the former ``xavier_uniform``.
            nn.init.xavier_uniform_(p)
    return model
def load_attr_embeddings(self):
    """Load each configured attribute embedding and register it by name.

    For every entry of ``self.attr_embedding_map`` an ``Embedding`` is built
    from the entry's ``"path"``, ``"cache_mod"`` and ``"read_func"`` values,
    its ``load()`` method is called, and the loaded object is stored in
    ``self.loaded_attr_embedding_map`` under the same attribute name.
    """
    for attr_name, spec in self.attr_embedding_map.items():
        print("load attr embedding for : {}".format(attr_name))
        loaded = Embedding(spec["path"], spec["cache_mod"], spec["read_func"])
        loaded.load()
        self.loaded_attr_embedding_map[attr_name] = loaded
def train(self, learning_rate=0.01, print_step=1000, stop_threshold=0):
    """Train for ``self.epochs`` passes over the CSV file ``self.filename``.

    Each CSV row supplies two integer columns that are handed to
    ``self._train_one_sample`` together with ``learning_rate``.

    Args:
        learning_rate: Forwarded unchanged to ``_train_one_sample``.
        print_step: Number of rows between progress reports; ``0`` means
            ``self.n_batches``. Every ``print_step * 10`` rows the word
            analogy score is recorded and the model is exported.
        stop_threshold: Accepted for interface compatibility; this
            implementation does not read it.

    Side effects:
        Prints progress, exports model snapshots through
        ``self.export_model`` and pickles the averaged losses and the word
        analogy scores under ``self.output_dictionary``.
    """
    if print_step == 0:
        print_step = self.n_batches

    pending_losses = []   # losses since the last progress report
    aver_losses = []      # one mean per progress report
    wa_scores = []        # 'all' word-analogy score per evaluation

    def score_word_analogy():
        # Snapshot the current embedding matrix and record its 'all' score.
        snapshot = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
        report = self.word_analogy.evaluate(snapshot, high_level_category=False,
                                            restrict_top_words=False)
        wa_scores.append(report['all'])
        return report

    for epoch in range(self.epochs):
        tick = time.time()
        with open(self.filename, 'r') as f:
            for step, row in enumerate(csv.reader(f), start=1):
                # Progress report
                if step % print_step == 0:
                    tock = time.time()
                    mean_loss = np.mean(pending_losses)
                    print("Epochs: {}".format(epoch),
                          "Iteration: {}".format(step),
                          "Avg. Training loss: {:.4f}".format(mean_loss),
                          "{:.4f} sec/ {} sample".format((tock - tick), self.batch_size * print_step))
                    aver_losses.append(mean_loss)
                    pending_losses = []
                    tick = time.time()

                # Periodic word analogy evaluation and snapshot
                if step % (print_step * 10) == 0:
                    score_word_analogy()
                    self.export_model(self.output_dictionary + 'step-{}/'.format(int(step)))

                pending_losses.append(
                    self._train_one_sample(int(row[0]), int(row[1]), learning_rate))

        # End-of-epoch evaluation
        report = score_word_analogy()
        print('Epochs: {}, WA score: {}'.format(epoch, report['all']))

        # Snapshot on every fifth epoch (including epoch 0)
        if epoch % 5 == 0:
            self.export_model(self.output_dictionary + 'step-{}/'.format(int(epoch)))

    # Persist the training curves
    utils.save_pkl(aver_losses, self.output_dictionary + config['TRAIN']['loss_file'])
    utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])
# accuracy acc = mean(result) # result print("Category: %-30s, accuracy: %f (all: %d)" % (cat, acc, len(X_cat))) predictions[cat] = acc # overrall total_count = 0 acc = 0 for cat in cat_list: cat_count = len(X[cat]) acc += cat_count * predictions.get(cat) total_count += cat_count predictions['all'] = acc / total_count print("All Category accuracy: %f" % (acc / total_count)) return predictions if __name__ == "__main__": word_analogy = WordAnalogy() word_analogy.set_top_words('../../data/processed data/top_30000_words.txt') embedding = Embedding.from_file( '../../output/50dim/embedding-e=50-n_sampled=200-epochs=35-batch_size=10000_1.txt' ) result = word_analogy.evaluate(embedding, high_level_category=False, restrict_top_words=False)
if __name__ == "__main__":
    # Evaluate one saved embedding per dimensionality on the word-analogy and
    # word-similarity benchmarks, then print the two score columns.
    wordsim = Wordsim()
    word_analogy = WordAnalogy()
    word_analogy.set_top_words('../../data/processed data/top_30000_words.txt')

    suffix = '_1'  # not referenced below
    dimension_list = [50, 100, 150, 200]
    path_template = '../../output/convergence_test/3000samples/31epochs/snml/{}dim/embedding-e={}-n_sampled=3000-epochs=31-batch_size=10000.txt'

    wa_list = []
    ws_list = []
    for dimension in dimension_list:
        filename = path_template.format(dimension, dimension)
        print('Reading: ', filename)
        embedding = Embedding.from_file(filename)

        wa_result = word_analogy.evaluate(
            embedding,
            high_level_category=False,
            restrict_top_words=False,
        )
        ws_result = wordsim.evaluate(embedding)

        wa_list.append(wa_result['all'])
        # Third element of the per-dataset tuple is the score.
        ws_list.append(ws_result['EN-WS-353-ALL'][2])

    print('Word analogy: ')
    for wa in wa_list:
        print(wa)

    print('Word sim: ')
    for ws in ws_list:
        print(ws)
import torch.autograd as autograd import torch.nn.functional as F import torch.optim as optim import time from utils.dataset import Dataset from utils.evaluation import Evaluation from utils.embedding import Embedding from utils.utils import stemming, vocabulary from utils.utils import removeStopwords import matplotlib matplotlib.use('TkAgg') import matplotlib.pyplot as plt torch.manual_seed(1) embedding = Embedding() data = Dataset() vocab = vocabulary(data.trainset) word_to_ix = {word: i for i, word in enumerate(vocab)} class RNN(nn.Module): def __init__(self, embedding_dim, hidden_size, vocab_size): super(RNN, self).__init__() self.embedding_dim = embedding_dim self.hidden_size = hidden_size self.vocab_size = vocab_size self.embedding = nn.Embedding(vocab_size + 1, embedding_dim) self.embedding.weight.data.copy_(self.loadEmbedding()) self.t_lstm = nn.LSTM(embedding_dim, hidden_size,
@staticmethod
def pprint(result):
    """Print ``result`` as a table with one row per dataset.

    ``result`` maps a dataset name to an indexable value whose first three
    items fill the "Found", "Not Found" and "Score (rho)" columns.
    """
    from prettytable import PrettyTable

    table = PrettyTable(["Dataset", "Found", "Not Found", "Score (rho)"])
    table.align["Dataset"] = "l"
    for name, stats in result.items():
        table.add_row([name, stats[0], stats[1], stats[2]])
    print(table)


def evaluate(self, embedding):
    """Score ``embedding`` against every dataset in ``self.dataset``.

    For each record, items 0 and 1 are looked up in the embedding; when both
    are in its vocabulary, the cosine of their vectors is paired with item 2
    of the record. Records with a missing item are only counted.

    Returns:
        dict mapping each dataset name to
        ``(found, notfound, umath.rho(labels, predictions) * 100)``.
    """
    result = {}
    for file_name, data in self.dataset.items():
        pred = []
        label = []
        notfound = 0
        for datum in data:
            if not (embedding.in_vocab(datum[0]) and embedding.in_vocab(datum[1])):
                notfound += 1
                continue
            pred.append(umath.cos(embedding.vector(datum[0]), embedding.vector(datum[1])))
            label.append(datum[2])
        # One prediction is appended per found pair, so the count is its length.
        found = len(pred)
        result[file_name] = (found, notfound, umath.rho(label, pred) * 100)
    return result


if __name__ == "__main__":
    wordsim = Wordsim()
    embedding = Embedding.from_file(
        '../../output/100dim/embedding-e=100-n_sampled=200-epochs=10-batch_size=10000.txt'
    )
    result = wordsim.evaluate(embedding)
    # wordsim.pprint(result)
    # Index 2 of the per-dataset tuple is the score.
    print(result['EN-WS-353-ALL'][2])
def train(self, print_step=1000, stop_threshold=0):
    """Run the training loop until the input is exhausted or the loss settles.

    Each iteration runs ``self.full_cost`` and ``self.full_optimizer`` in
    ``self.sess``. Every ``self.n_batches`` iterations count as one epoch; at
    each epoch boundary the mean epoch loss is printed, the current embedding
    is scored with ``self.word_analogy`` and the stop criterion is checked.

    Args:
        print_step: Number of iterations between progress reports. ``0``
            means ``self.n_batches``, i.e. one report per epoch.
        stop_threshold: Training stops once the absolute change in mean
            epoch loss between two consecutive epochs falls below this
            value. With the default ``0`` the criterion can never fire,
            because an absolute difference is never negative.

    Side effects:
        Updates ``self.embedding`` and ``self.softmax_w``; on an early stop
        also sets ``self.epochs`` and ``self.embedding_file``. Exports a
        model snapshot every tenth epoch and pickles the per-iteration
        losses and the word analogy scores under ``self.output_dictionary``.
    """
    iteration = 1
    loss = 0  # running sum since the last progress report
    losses = []  # every per-iteration loss, saved at the end
    epoch_sum_loss = 0.
    # Large sentinel so the first epoch's loss difference is far above any
    # reasonable threshold.
    last_epoch_loss = 999999.
    wa_scores = []

    if print_step == 0:
        print_step = self.n_batches

    try:
        start = time.time()
        # The loop ends either via ``break`` (stop criterion) or when the
        # session raises ``tf.errors.OutOfRangeError``.
        while True:
            train_loss, _ = self.sess.run(
                [self.full_cost, self.full_optimizer])
            # Alternative ops kept for reference:
            # train_loss, _ = self.sess.run([self.cost, self.optimizer])

            loss += train_loss
            epoch_sum_loss += train_loss
            losses.append(train_loss)

            # Progress report, then reset the running sum and the timer.
            if iteration % print_step == 0:
                end = time.time()
                print(
                    "Iteration: {}".format(iteration),
                    "Avg. Training loss: {:.4f}".format(loss / print_step),
                    "{:.4f} sec/ {} sample".format(
                        (end - start), self.batch_size * print_step))
                loss = 0
                start = time.time()

            # Epoch boundary.
            if iteration % self.n_batches == 0:
                # NOTE(review): ``/`` is true division on Python 3, so
                # ``epochs`` is a float there (printed as e.g. 1.0).
                epochs = iteration / self.n_batches
                epoch_loss = epoch_sum_loss / self.n_batches
                epoch_sum_loss = 0
                epoch_loss_diff = np.abs(epoch_loss - last_epoch_loss)
                print('Epochs {} loss: {}'.format(epochs, epoch_loss))

                # word analogy score on the current embedding matrix
                embedding = self.sess.run(self.embedding_g)
                eval = Embedding(embedding, self.int_to_vocab,
                                 self.vocab_to_int)
                wa_score = self.word_analogy.evaluate(
                    eval, high_level_category=False,
                    restrict_top_words=False)
                wa_scores.append(wa_score['all'])

                # stop criterion: epoch loss changed by less than the threshold
                if epoch_loss_diff < stop_threshold:
                    self.epochs = iteration / self.n_batches
                    # output file name reflects the number of epochs actually run
                    self.embedding_file = config['TRAIN'][
                        'embedding'].format(self.n_embedding, self.n_sampled,
                                            int(self.epochs), self.batch_size)
                    print('Loss diff: {}, stop training.'.format(
                        epoch_loss_diff))
                    print(self.output_dictionary + self.embedding_file)
                    break

                # Save step: snapshot the model every tenth epoch
                if epochs % 10 == 0:
                    self.embedding = self.sess.run(self.embedding_g)
                    self.softmax_w = self.sess.run(self.softmax_w_g)
                    self.export_model(self.output_dictionary +
                                      'step-{}/'.format(int(epochs)))

                last_epoch_loss = epoch_loss

            iteration += 1
    except tf.errors.OutOfRangeError:
        print("End of dataset")

    # export embedding matrix: fetch the final weights from the session
    self.embedding = self.sess.run(self.embedding_g)
    self.softmax_w = self.sess.run(self.softmax_w_g)

    # export losses
    utils.save_pkl(losses, self.output_dictionary + config['TRAIN']['loss_file'])
    utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])