def __init__(self, model_path, context_path, learning_rate=0.001, n_negative_sample=15):
    self.E = utils.load_pkl(model_path + 'embedding.pkl', local=True)
    self.E = np.array(self.E)
    self.F = utils.load_pkl(model_path + 'softmax_w.pkl', local=True)
    self.n_vocab = len(self.E)
    self.d = self.F.shape[1]
    self.n_context = self.F.shape[0]
    self.data_path = model_path
    self.n_negative_sample = n_negative_sample
    self.scope = 0

    # Context distribution
    self.context_distribution = utils.load_pkl(context_path + config['SNML']['context_dist'], local=True)
    self.context_distribution = self.context_distribution ** (3 / 4)
    self.context_distribution = self.context_distribution / sum(self.context_distribution)

    # Optimizer initialize
    self.lr = learning_rate

    # Context sample look up table
    table_size = 100000000  # Length of the unigram table
    table = np.zeros(table_size, dtype=np.uint32)
    p = 0  # Cumulative probability
    i = 0
    for j in range(self.n_context):
        p += self.context_distribution[j]
        while i < table_size and float(i) / table_size < p:
            table[i] = j
            i += 1
    self.table = table

    # Set random seed
    np.random.seed(int(config['OTHER']['random_seed']))
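# Sketch (not part of the original class, names are hypothetical): with a unigram
# table built as above, negative context ids are usually drawn by indexing the
# table with uniform random integers, so the 3/4-smoothed distribution is honored
# automatically without any further weighting.
import numpy as np

def sample_negative_contexts(table, n_negative_sample):
    # Uniform indices into the table map to context ids with probability
    # proportional to the smoothed unigram frequency of each context.
    idx = np.random.randint(0, len(table), size=n_negative_sample)
    return table[idx]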
def __init__(self, data_path, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08):
    self.E = utils.load_pkl(data_path + 'embedding.pkl')
    self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
    self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
    self.V = self.E.shape[0]
    self.K = self.E.shape[1]
    self.V_dash = self.C.shape[0]
    self.data_path = data_path

    # adam optimizer initialize
    self.t = 256040
    self.t_default = 256040
    self.beta1 = beta1
    self.beta2 = beta2
    self.lr = learning_rate
    self.epsilon = epsilon
    self.beta1_t = beta1 ** self.t
    self.beta2_t = beta2 ** self.t

    # initialize moment estimates
    self.mE_t = np.zeros((self.V, self.K))
    self.mC_t = np.zeros((self.V_dash, self.K))
    self.mb_t = np.zeros(self.V_dash)
    self.vE_t = np.zeros((self.V, self.K))
    self.vC_t = np.zeros((self.V_dash, self.K))
    self.vb_t = np.zeros(self.V_dash)
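# Sketch (assumption, not the original update code): a single standard Adam step
# for one parameter array, written against the state initialized above (first and
# second moment estimates plus the running beta1^t / beta2^t powers).
import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08):
    m = beta1 * m + (1 - beta1) * grad        # biased first-moment estimate
    v = beta2 * v + (1 - beta2) * grad ** 2   # biased second-moment estimate
    m_hat = m / (1 - beta1 ** t)              # bias correction
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return param, m, v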
def __init__(self, model_path, context_path, n_neg_sample=200, n_context_sample=1000,
             learning_rate=0.0004, random_seed=1234):
    # Load parameters
    self.embedding = utils.load_pkl(model_path + config['SNML']['embedding'])
    self.softmax_w = utils.load_pkl(model_path + config['SNML']['softmax_w'])
    self.n_vocab = self.embedding.shape[0]
    self.n_embedding = self.embedding.shape[1]
    self.n_context = self.softmax_w.shape[0]
    self.n_neg_sample = n_neg_sample
    self.context_path = context_path
    self.n_context_sample = n_context_sample
    self.scope = 0
    self.learning_rate = learning_rate

    # paths
    self.data_path = model_path

    # Set uniform distribution as default for context sampling
    self.contexts = []

    # set computation
    self._set_computation(random_seed)
def __init__(self, data_path, context_path, learning_rate=0.2, beta=0.9, n_context_sample=600):
    self.E = utils.load_pkl(data_path + 'embedding.pkl')
    self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
    self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
    self.V = self.E.shape[0]
    self.K = self.E.shape[1]
    self.V_dash = self.C.shape[0]
    self.data_path = data_path
    self.context_path = context_path
    self.n_context_sample = n_context_sample
    self.scope = 0

    # Load context distribution
    self.context_distribution = utils.load_pkl(context_path + 'context_distribution.pkl')

    # Check if sample context file exists
    self.sample_contexts_file_name = os.path.join(
        self.context_path, 'sample_contexts_{}.pkl'.format(n_context_sample))
    self.contexts = utils.load_pkl(self.sample_contexts_file_name)

    # Momentum hyperparameters
    self.learning_rate = learning_rate
    self.beta = beta
    self.vE = np.zeros((self.V, self.K))
    self.vC = np.zeros((self.V_dash, self.K))
    self.vb = np.zeros(self.V_dash)
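# Sketch (assumption): an exponential-moving-average momentum update matching the
# velocity buffers vE / vC / vb initialized above; the exact rule used by the
# original class is not shown here, so this is only an illustrative variant.
def momentum_step(param, grad, velocity, lr=0.2, beta=0.9):
    velocity = beta * velocity + (1 - beta) * grad  # running average of gradients
    return param - lr * velocity, velocity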
def __init__(self, model_path, data_file):
    # Load parameters
    self.embedding = utils.load_pkl(model_path + config['SNML']['embedding'])
    self.softmax_w = utils.load_pkl(model_path + config['SNML']['softmax_w'])
    self.n_vocab = self.embedding.shape[0]
    self.n_embedding = self.embedding.shape[1]
    self.n_context = self.softmax_w.shape[0]
    self.batch_size = 5000
    self.sum_log_likelihood = 0
    self.k = self.embedding.shape[0] * self.embedding.shape[1] + \
        self.softmax_w.shape[0] * self.softmax_w.shape[1]
    print('Matrix E: Vw * d: ', self.embedding.shape[0], self.embedding.shape[1])
    print('Matrix F: Bc * d: ', self.softmax_w.shape[0], self.softmax_w.shape[1])
    print('k: ', self.k)

    # paths
    self.model_path = model_path
    self.filename = data_file

    # set computation
    self._set_computation(self.filename, batch_size=self.batch_size, epochs=1)
def change_model(self, model_path):
    # Load parameters
    self.embedding = utils.load_pkl(model_path + config['SNML']['embedding'])
    self.softmax_w = utils.load_pkl(model_path + config['SNML']['softmax_w'])
    self.softmax_b = utils.load_pkl(model_path + config['SNML']['softmax_b'])
    self.n_vocab = self.embedding.shape[0]
    self.n_embedding = self.embedding.shape[1]
    self.n_context = self.softmax_w.shape[0]

    # paths
    self.data_path = model_path

    # set computation
    self._set_computation()
def __init__(self, input_path, output_path, n_embedding):
    self.n_embedding = n_embedding
    self.embedding = np.array([])
    self.data_path = input_path

    # create output directory
    self.output_dictionary = output_path + config['TRAIN']['output_dir'].format(n_embedding)
    if not os.path.exists(self.output_dictionary):
        os.makedirs(self.output_dictionary)

    # read dictionaries
    self.int_to_vocab = utils.load_pkl(input_path + config['TRAIN']['vocab_dict'])
    self.int_to_cont = utils.load_pkl(input_path + config['TRAIN']['context_dict'])
    self.n_vocab = len(self.int_to_vocab)
    self.n_context = len(self.int_to_cont)
def __init__(self, data_path, context_path, learning_rate=0.001, beta1=0.9, beta2=0.999,
             epsilon=1e-08, n_context_sample=600):
    self.E = utils.load_pkl(data_path + 'embedding.pkl')
    self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
    self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
    self.V = self.E.shape[0]
    self.K = self.E.shape[1]
    self.V_dash = self.C.shape[0]
    self.data_path = data_path
    self.context_path = context_path
    self.n_context_sample = n_context_sample
    self.scope = 0

    # Load context distribution
    self.context_distribution = utils.load_pkl(context_path + 'context_distribution.pkl')

    # Check if sample context file exists
    self.sample_contexts_file_name = os.path.join(
        self.context_path, 'sample_contexts_{}.pkl'.format(n_context_sample))
    self.contexts = utils.load_pkl(self.sample_contexts_file_name)

    # adam optimizer initialize
    self.t = 396893
    self.beta1 = beta1
    self.beta2 = beta2
    self.lr = learning_rate
    self.epsilon = epsilon
    self.beta1_t = beta1 ** self.t
    self.beta2_t = beta2 ** self.t

    # initialize moment estimates
    self.mE_t = np.zeros((self.V, self.K))
    self.mC_t = np.zeros((self.V_dash, self.K))
    self.mb_t = np.zeros(self.V_dash)
    self.vE_t = np.zeros((self.V, self.K))
    self.vC_t = np.zeros((self.V_dash, self.K))
    self.vb_t = np.zeros(self.V_dash)
def __init__(self, config, logger):
    # load parameters
    self.config = config
    self.logger = logger
    self.data_path = config['data_path']

    # load docs
    self.data_dict = load_pkl(self.data_path + '/texts.pkl')
    self.stop_words = load_stop_words(self.data_path + '/cn_stopwords.txt')
    docs = list(self.data_dict.values())

    # tokenize
    self.docs = self._tokenizer(docs)
def __init__(self, model_path, sample_path, output_path, context_distribution_file,
             n_train_sample=1000, n_neg_sample=200, n_context_sample=1000):
    # Load parameters
    self.embedding = utils.load_pkl(model_path + config['SNML']['embedding'])
    self.softmax_w = utils.load_pkl(model_path + config['SNML']['softmax_w'])
    self.softmax_b = utils.load_pkl(model_path + config['SNML']['softmax_b'])
    self.n_vocab = self.embedding.shape[0]
    self.n_embedding = self.embedding.shape[1]
    self.n_context = self.softmax_w.shape[0]
    self.n_neg_sample = n_neg_sample

    # paths
    self.data_path = model_path
    self.sample_path = sample_path
    self.output_path = output_path
    self.n_files = int(config['SNML']['n_files'])

    # sample data
    self.n_train_sample = n_train_sample
    self.words = []
    self.contexts = []
    self.epochs = 0
    self._set_training_sample(20)
    self.sample_contexts, self.sample_contexts_prob = utils.sample_contexts(
        context_distribution_file, n_context_sample)
    self.n_context_sample = n_context_sample

    # set computation
    self._set_computation()
def evaluate_model(dim):
    kl_list = []
    mae_list = []
    cos_list = []
    rho_list = []
    parent_dir = '../notebooks/output/100-context-500000-data-38-questions/'
    model = Model(parent_dir + '1/full/{}dim/'.format(dim), '../../data/text8/', learning_rate=0.1)
    context_distribution = utils.load_pkl(parent_dir + 'context_distribution.dict')

    for i in range(len(context_distribution)):
        true_dis = context_distribution[i]
        pred_dis = model.get_context_dis(i)
        kl_list.append(kl_divergence(pred_dis, true_dis))
        mae_list.append(mean_absolute_error(pred_dis, true_dis))
        cos_list.append(cos(pred_dis, true_dis))
        rho_list.append(rho(pred_dis, true_dis))

    return np.sum(rho_list)
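# Usage sketch (assumption, not in the original): score several embedding
# dimensions, provided trained model directories for these dims exist under
# parent_dir.
if __name__ == "__main__":
    for dim in [50, 100, 150, 200]:
        print(dim, evaluate_model(dim))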
from utils.tools import load_pkl


def export_embedding(embedding, filename):
    # write embedding result to file
    output = open(filename, 'w', encoding='utf-8')
    for i in range(embedding.shape[0]):
        text = int_to_vocab[i]
        for j in embedding[i]:
            text += ' %f' % j
        text += '\n'
        try:
            output.write(text)
        except:
            print(text)
    output.close()


if __name__ == "__main__":
    dims = [50, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 300]
    path = '../../output/wiki/20200126/1/train1/{}dim/step-90/'
    int_to_vocab = load_pkl('../../data/wiki/dict/int_to_vocab.dict', local=True)

    for dim in dims:
        embedding = load_pkl(path.format(dim) + 'embedding.pkl', local=True)
        export_embedding(embedding, path.format(dim) + 'embedding.txt')
from collections import Counter
from utils.tools import load_pkl, save_pkl
import numpy as np


if __name__ == "__main__":
    # Data file
    raw_data_path = '../data/raw data/test.txt'
    context_to_dict_path = 'data/text8/dict/cont_to_int.dict'
    output_path = 'data/text8/contexts/distribution_from_raw.pkl'
    int_to_cont = load_pkl('data/text8/dict/int_to_cont.dict', local=True)

    # Load data
    with open(raw_data_path, encoding='utf-8') as f:
        words = f.read().split()

    # Load dict
    context_to_dict = load_pkl(context_to_dict_path, local=True)

    # Convert vocab to int
    context = []
    for word in words:
        if word in context_to_dict:
            context.append(context_to_dict[word])

    context_counts = Counter(context)
    n_context = len(context_to_dict)
    n_data = sum(list(context_counts.values()))
    context_distribution = np.zeros(n_context)
    for c, count in context_counts.items():
        context_distribution[c] = count / n_data
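    # Completion sketch (assumption, not in the original): the script defines
    # output_path and imports save_pkl but never persists the result; assuming
    # save_pkl takes (object, filename) as in the other snippets, the final step
    # would be:
    save_pkl(context_distribution, output_path)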
def __init__(self, input_path, output_path, n_embedding, batch_size, epochs, n_sampled,
             snml=False, snml_dir='', random_seed=1234):
    self.n_embedding = n_embedding
    self.embedding = np.array([])
    self.data_path = input_path

    # create output directory
    self.output_dictionary = output_path + config['TRAIN']['output_dir'].format(n_embedding)
    if not os.path.exists(self.output_dictionary):
        os.makedirs(self.output_dictionary)
    if snml:
        self.snml_dir = snml_dir + config['TRAIN']['output_dir'].format(n_embedding)
        if not os.path.exists(self.snml_dir):
            os.makedirs(self.snml_dir)

    # sync with gcs
    utils.download_from_gcs(input_path + config['TRAIN']['int_to_vocab'])
    utils.download_from_gcs(input_path + config['TRAIN']['vocab_to_int'])
    utils.download_from_gcs(input_path + config['TRAIN']['int_to_cont'])
    utils.download_from_gcs(input_path + config['TRAIN']['train_data'])

    # read dictionaries
    self.int_to_vocab = utils.load_pkl(input_path + config['TRAIN']['int_to_vocab'])
    self.vocab_to_int = utils.load_pkl(input_path + config['TRAIN']['vocab_to_int'])
    self.int_to_cont = utils.load_pkl(input_path + config['TRAIN']['int_to_cont'])
    self.n_vocab = len(self.int_to_vocab)
    self.n_context = len(self.int_to_cont)

    # computation graph
    self.train_graph = tf.Graph()
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_sampled = n_sampled

    # construct computation graph
    if snml:
        filename = self.data_path + config['TRAIN']['train_data_snml']
    else:
        filename = self.data_path + config['TRAIN']['train_data']
    self.snml = snml
    self._set_training_file(filename)
    self._set_computation(filename, random_seed)

    # training data
    self.n_datums = utils.count_line(filename)
    self.n_batches = self.n_datums // self.batch_size

    # evaluator
    self.word_analogy = WordAnalogy(config['TRAIN']['question_file'])
    self.word_analogy.set_top_words(config['TRAIN']['top_word_file'])

    # output file
    self.embedding_file = config['TRAIN']['embedding'].format(self.n_embedding, n_sampled, epochs, batch_size)

    print('Initialize Model with: {} samples, trying to run {} batches each epoch.'.format(
        self.n_datums, self.n_batches))
def __init__(self, input_path, output_path, n_embedding, batch_size, epochs, n_sampled,
             snml=False, snml_dir=''):
    self.n_embedding = n_embedding
    self.embedding = np.array([])
    self.data_path = input_path

    # create output directory
    self.output_dictionary = output_path + config['TRAIN']['output_dir'].format(n_embedding)
    if not os.path.exists(self.output_dictionary):
        os.makedirs(self.output_dictionary)
    if snml:
        self.snml_dir = snml_dir + config['TRAIN']['output_dir'].format(n_embedding)
        if not os.path.exists(self.snml_dir):
            os.makedirs(self.snml_dir)

    # sync with gcs
    utils.download_from_gcs(input_path + config['TRAIN']['int_to_vocab'])
    utils.download_from_gcs(input_path + config['TRAIN']['vocab_to_int'])
    utils.download_from_gcs(input_path + config['TRAIN']['int_to_cont'])
    utils.download_from_gcs(input_path + config['TRAIN']['cont_to_int'])
    utils.download_from_gcs(input_path + config['TRAIN']['train_data'])

    # read dictionaries
    self.int_to_vocab = utils.load_pkl(input_path + config['TRAIN']['int_to_vocab'])
    self.vocab_to_int = utils.load_pkl(input_path + config['TRAIN']['vocab_to_int'])
    self.int_to_cont = utils.load_pkl(input_path + config['TRAIN']['int_to_cont'])
    self.cont_to_int = utils.load_pkl(input_path + config['TRAIN']['cont_to_int'])
    self.n_vocab = len(self.int_to_vocab)
    self.n_context = len(self.int_to_cont)

    # computation parameters
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_sampled = n_sampled

    random_seed = int(config['OTHER']['random_seed'])
    np.random.seed(random_seed)  # Set seed for weights
    self.E = [np.random.uniform(-0.8, 0.8, self.n_embedding) for _ in range(self.n_vocab)]
    self.F = np.random.uniform(-0.8, 0.8, (self.n_context, self.n_embedding))
    np.random.seed(random_seed)  # Set seed for anything else (negative samples)

    # construct computation graph
    if snml:
        filename = self.data_path + config['TRAIN']['train_data_snml']
    else:
        filename = self.data_path + config['TRAIN']['train_data']
    self.snml = snml
    self.filename = filename

    # training data
    self.n_datums = utils.count_line(filename)
    self.n_batches = self.n_datums // self.batch_size

    # Context distribution
    self.context_distribution = utils.load_pkl(self.data_path + config['TRAIN']['context_dist'])
    self.context_distribution = self.context_distribution ** (3 / 4)
    self.context_distribution = self.context_distribution / sum(self.context_distribution)

    # Context sample look up table
    table_size = 100000000  # Length of the unigram table
    table = np.zeros(table_size, dtype=np.uint32)
    p = 0  # Cumulative probability
    i = 0
    for j in range(self.n_context):
        p += self.context_distribution[j]
        while i < table_size and float(i) / table_size < p:
            table[i] = j
            i += 1
    self.table = table
    self.pos_neg = [1] + [0 for _ in range(self.n_sampled)]

    # evaluator
    self.word_analogy = WordAnalogy(config['TRAIN']['question_file'])
    # self.word_analogy.set_top_words(config['TRAIN']['top_word_file'])

    # output file
    self.embedding_file = config['TRAIN']['embedding'].format(self.n_embedding, n_sampled, epochs, batch_size)

    print('Initialize Model with: {} samples, trying to run {} batches each epoch.'.format(
        self.n_datums, self.n_batches))
# Set up parameters
if args.continue_scope == 0:
    args.continue_scope = args.scope

# read snml train file
utils.download_from_gcs(args.snml_train_file)
data = np.genfromtxt(args.snml_train_file, delimiter=',').astype(int)

# Initialize model
model = Model(args.model, args.context_path, n_context_sample=args.n_context_sample,
              learning_rate=args.learning_rate)

# Continue from previous
previous_file = args.model + '{}-step/scope-{}-snml_length.pkl'.format(args.continue_from, args.continue_scope)
if os.path.isfile(previous_file):
    snml_lengths = utils.load_pkl(previous_file)
else:
    snml_lengths = []

for i in range(args.continue_from):
    model.train_one_sample(data[i][0], data[i][1], epochs=args.epochs, update_weight=True)
print('Continue step: {}, from file: {}'.format(args.continue_from, previous_file))

# Run snml
print_step = 10
start = time.time()
for i in range(args.continue_from, args.scope):
    w = data[i][0]
    c = data[i][1]
    length = model.snml_length_sampling(w, c, epochs=args.epochs)
import os
import utils.tools as utils


if __name__ == "__main__":
    context_path = '../notebooks/output/50-context-500000-data-18-questions/contexts/'
    n_context_sample = 50
    scope = 5000

    file_name = os.path.join(context_path, 'sample_contexts_{}.pkl'.format(n_context_sample))
    context_distribution = utils.load_pkl(context_path + 'context_distribution.pkl')

    if os.path.exists(file_name):
        print('Load file')
        contexts = utils.load_pkl(file_name)
    else:
        contexts = []
    print('Current contexts: ', len(contexts))

    # Sample contexts
    if scope + 1 > len(contexts):
        for i in range(scope - len(contexts)):
            samples = utils.sample_context_uniform(len(context_distribution), n_context_sample)
            contexts.append(samples)

    # Save result back to pkl
    utils.save_pkl(contexts, file_name)
    print(len(contexts))