def train(self):
    rsl = []
    files = listdir(self.path)
    reader = InputData(self.dataset_type, self.path)
    corpus = plsa.Corpus()
    for filename in files:
        vectors = []
        question = reader.readFile(filename)
        id = question["id"]
        self.data[id] = [ref["text"] for ref in question["referenceAnswers"]]
        for r in question["referenceAnswers"]:
            rid = r["id"]
            references = [sr["text"] for sr in r["studentAnswers"]]
            # Earlier variant, kept for reference:
            # for ans in question["student_answers"]:
            #     if ans["id"] == rid:
            #         references.append(ans["text"])
            # references = [self.stemmer.stem(sr["text"]) for sr in r["studentAnswers"]]
            references.append(r["text"])
            corpus.addBaseline(references)
            # print(corpus.getVector())
            vectors.append(corpus.getVector())
            corpus.reset()
        self.model[id] = vectors
    return
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        # self.model.load_state_dict(torch.load("../results/skipgram_nge.pkl"))
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(5 * batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_postfix(loss=loss.data)
            process_bar.update()
        torch.save(self.model.state_dict(), "../results/skipgram_nge.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
             window_size=5, iteration=1, initial_lr=0.025, min_count=5):
    """Initialize class parameters.

    Args:
        input_file_name: Name of the text data file. Each line is a sentence split by spaces.
        output_file_name: Name of the final embedding file.
        emb_dimension: Embedding dimension, typically from 50 to 500.
        batch_size: Number of word pairs per forward pass.
        window_size: Max skip length between words.
        iteration: Number of training iterations.
        initial_lr: Initial learning rate.
        min_count: Minimal word frequency; words with lower frequency are filtered out.

    Returns:
        None.
    """
    self.data = InputData(input_file_name, min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
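# A minimal usage sketch for the constructor above (assumptions: the surrounding
# class is named Word2Vec and exposes a train() method, which is not shown in
# this snippet; 'corpus.txt' and 'embeddings.txt' are placeholder file names).
w2v = Word2Vec('corpus.txt', 'embeddings.txt',
               emb_dimension=100, window_size=5, min_count=5)
w2v.train()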
def main():
    # Define the shape of the input data
    x = nn.variable.Variable(
        [BATCH_SIZE, IMAGE_DEPTH * IMAGE_WIDTH * IMAGE_HEIGHT])
    # Define the shape of the labels
    t = nn.variable.Variable([BATCH_SIZE, LABEL_NUM])
    pred = convolution(x)
    loss_ = loss(pred, t)
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    data = InputData()
    for i in range(NUM_STEP):
        # Run evaluation on the test set every 100 steps
        if i % 100 == 0:
            l = 0
            a = 0
            for k, (t.d, x.d) in enumerate(data.test_data()):
                loss_.forward()
                l += loss_.d
                a += accuracy(pred, t)
            print("Step: %05d Test loss: %0.05f Test accuracy: %0.05f" %
                  (i, l / k, a / k))
        t.d, x.d = data.next_batch()
        loss_.forward()
        solver.zero_grad()
        loss_.backward()
        solver.weight_decay(DECAY_RATE)
        solver.update()
        if i % 10 == 0:
            print("Step: %05d Train loss: %0.05f Train accuracy: %0.05f" %
                  (i, loss_.d, accuracy(pred, t)))
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr,
                                             self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Run the training iterations.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.data, self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SparseAdam(self.model.parameters(), lr=self.lr)

    def train(self):
        start = time.perf_counter()
        max_accuracy = 0
        for epoch in range(5000):
            all_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(all_pairs)
            # "pos" pairs are the ones whose Huffman code is 1
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]  # inner nodes corresponding to code 1
            # "neg" pairs are the ones whose Huffman code is 0
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]  # inner nodes corresponding to code 0
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()  # gradient update
            # mid_end = time.perf_counter()
            # print('one time:%s seconds' % (mid_end - start))
            if epoch % 100 == 0:
                print("Epoch : %d, loss : %.02f" % (epoch, loss))
                ac = self.model.predict(all_pairs, self.data.huffman_tree)
                if ac > max_accuracy:
                    max_accuracy = ac
        end = time.perf_counter()
        print('time:%s seconds' % (end - start))
        print('accuracy:%.06f' % (max_accuracy))
        # self.model.save_embedding(self.data.id2word_dict, self.output_file_name)

        # t-SNE plot of the word vectors
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=500)
        embed_two = tsne.fit_transform(
            self.model.u_embeddings.weight.cpu().detach().numpy())
        labels = [self.data.id2word_dict[i] for i in range(200)]
        plt.figure(figsize=(15, 12))
        for i, label in enumerate(labels):
            x, y = embed_two[i, :]
            plt.scatter(x, y)
            plt.annotate(label, (x, y), ha='center', va='top')
        plt.savefig('HS.png')
def __init__(self, wikidump_filename, output_text_filename, emb_dimension, batch_size,
             window_size, iteration, initial_lr, min_count):
    self.data = InputData(wikidump_filename, min_count, output_text_filename)
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word,
                                                    self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
def evaluate():
    '''Run evaluation on the test data.'''
    data = InputData(test_data_path=FLAGS.test_data, train=False)
    input_ph = tf.placeholder(tf.int32, [None, data.max_len])
    training_ph = tf.placeholder(tf.bool, [])
    label_ph = tf.placeholder(tf.float32, [None, data.num_category])
    with tf.Session() as sess:
        output = convolution(input_ph, training_ph, data.num_chars, data.num_category)
        values, indices = tf.nn.top_k(output, k=10)
        saver = tf.train.Saver()
        load_checkpoint(sess, saver)
        with open(FLAGS.output_dir + '/evaluate.tsv', 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for test_labels, test_texts, unique_ids, item_names in data.next_batch_evaluation_data():
                values_, indices_ = sess.run([values, indices],
                                             feed_dict={input_ph: test_texts,
                                                        training_ph: False})
                for (value, index, test_label, unique_id, item_name) in zip(
                        values_, indices_, test_labels, unique_ids, item_names):
                    row = ([unique_id]
                           + [data.category_dict[np.argmax(test_label)]]
                           + [data.chars_to_unknown(item_name)]
                           + list(value)
                           + list(map(lambda x: data.category_dict[x], index))
                           + [index[0] == np.argmax(test_label)]
                           + [np.argmax(test_label) in index[0:3]])
                    writer.writerow(row)
    num_records = len(open(FLAGS.output_dir + '/evaluate.tsv', 'r').readlines())
    with open(FLAGS.output_dir + '/evaluate.tsv', 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        accuracy_count = [(line[23], line[24]) for line in reader]
        accuracy_top1 = len(list(filter(lambda x: x[0] == 'True', accuracy_count))) / num_records
        accuracy_top3 = len(list(filter(lambda x: x[1] == 'True', accuracy_count))) / num_records
    with open(FLAGS.output_dir + '/test_accuracy.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow([accuracy_top1, accuracy_top3])
def __call__(self, data: InputData) -> ResultData:
    (x, y) = data.initial
    path = [(x, y)]
    grad = (data.df_dx1(x, y), data.df_dx2(x, y))
    calls_count = 2
    while (grad[0] * grad[0] + grad[1] * grad[1]) >= data.eps * data.eps:
        (x, y) = (x - self.alpha * grad[0], y - self.alpha * grad[1])
        grad = (data.df_dx1(x, y), data.df_dx2(x, y))
        calls_count += 2
        path.append((x, y))
    return ResultData(self.name, (x, y), data.function(x, y), 0, calls_count, path)
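# A minimal driver sketch for the gradient-descent callable above (assumptions:
# the callable lives on an object named `descent` exposing `name` and `alpha`,
# and any object with the attributes the method reads — initial, function,
# df_dx1, df_dx2, eps — can stand in for InputData; SimpleNamespace is used here).
from types import SimpleNamespace

quadratic = SimpleNamespace(
    initial=(3.0, 4.0),
    function=lambda x, y: x * x + y * y,  # f(x, y) = x^2 + y^2
    df_dx1=lambda x, y: 2 * x,            # partial derivative with respect to x
    df_dx2=lambda x, y: 2 * y,            # partial derivative with respect to y
    eps=1e-6,
)
result = descent(quadratic)  # ResultData: name, argmin, f(argmin), f calls, gradient calls, path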
def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
             window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):
    self.data = InputData(input_file_name, min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.neg_num = neg_num
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
def __init__(self, args):
    # data class
    self.data = InputData(args.train, args.min_count, args.minn, args.maxn, args.thread)
    self.outfile = args.output
    self.save_model = args.save_model
    self.load_model = args.load_model
    self.emb_dim = args.size
    self.bs = args.batch_size
    self.win_size = args.window
    self.iters = args.iter
    self.lr = args.lr
    self.neg_n = args.negative
    self.sub_samp_th = args.sample
    # subsampling: probability of keeping a word
    self.sub_samp_probs = np.sqrt(self.sub_samp_th / self.data.idx2freq)
    self.thread = args.thread
    self.use_cuda = args.cuda

    print('Initializing model...')
    self.init_model(args)
    if self.use_cuda:
        self.model.cuda()
    self.model.share_memory()
def test(self, mode, inputdir, outputdir):
    head = ["id", "grade", "Accuracy", "Predicted"]
    self.mode = mode
    rsl = []
    files = listdir(inputdir)
    reader = InputData(self.dataset_type, inputdir)
    for filename in files:
        question = reader.readFile(filename)
        id = question["id"]
        stuAns = []
        for r in question["referenceAnswers"]:
            for sr in r["studentAnswers"]:
                stuAns.append(sr)
        for sr in question["otherStudentAnswers"]:
            stuAns.append(sr)
        for sr in stuAns:
            grade = ""
            if self.nonDomain.test(sr["text"]):
                if mode == 2 or mode == 3:
                    grade = "incorrect"
                if mode == 5:
                    grade = "non_domain"
                rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"],
                            "Predicted": grade, "grade": "NA"})
                print(rsl[-1])
                continue
            if self.contradictBigram.isContradictory(id, sr["text"]) or \
                    self.contradict.isContradictory(self.modeler.getReferences(id), sr["text"]):
                if mode == 2:
                    grade = "incorrect"
                if mode == 3 or mode == 5:
                    grade = "contradictory"
                rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"],
                            "Predicted": grade, "grade": "NA"})
                print(rsl[-1])
                continue
            score = self.modeler.grade(id, sr["text"])
            if self.datamode == "beetle":
                self.irr.build(self.modeler.getReferences(id))
                if self.irr.isIrrelevent(sr["text"]):
                    score = -1
            grade = self.predict(score)
            rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"],
                        "Predicted": grade, "grade": score})
            print(rsl[-1])
    output(outputdir, head, rsl)
def __call__(self, data: InputData) -> ResultData:
    (x, y) = data.initial
    path = [(x, y)]
    grad = (data.df_dx1(x, y), data.df_dx2(x, y))
    calls_count = 2
    f_calls = 0
    while (grad[0]**2 + grad[1]**2) >= data.eps**2:
        (t_x, t_y) = (x - self.alpha * grad[0], y - self.alpha * grad[1])
        f_calls += 2
        if data.function(t_x, t_y) > data.function(x, y) - \
                self.delta * self.alpha * (grad[0]**2 + grad[1]**2):
            self.alpha *= self.delta
            continue
        (x, y) = (t_x, t_y)
        grad = (data.df_dx1(x, y), data.df_dx2(x, y))
        calls_count += 2
        path.append((x, y))
    return ResultData(self.name, (x, y), data.function(x, y), f_calls, calls_count, path)
def __call__(self, data: InputData) -> ResultData:
    (x, y) = data.initial
    x0 = (x, y)
    x1 = (x + self.length, y)
    x2 = (x, y + self.length)
    xk = [x0, x1, x2]
    f_calls = 3
    path = [x0]
    fk = [data.function(xi[0], xi[1]) for xi in xk]
    while sum([(xk[i][0] - xk[0][0])**2 + (xk[i][1] - xk[0][1])**2
               for i in range(1, 3)]) / 2 > data.eps**2:
        min_i = min(range(3), key=lambda i: fk[i])
        max_i = max(range(3), key=lambda i: fk[i])
        c_x = 0
        c_y = 0
        for i in range(3):
            if i != max_i:
                c_x += xk[i][0]
                c_y += xk[i][1]
        c_x /= 2
        c_y /= 2
        u_k = (2 * c_x - xk[max_i][0], 2 * c_y - xk[max_i][1])
        f_u = data.function(u_k[0], u_k[1])
        if f_u < fk[max_i]:
            xk[max_i] = u_k
            fk[max_i] = f_u
            f_calls += 1
        else:
            for i in range(0, 3):
                xk[i] = ((xk[i][0] + xk[min_i][0]) / 2,
                         (xk[i][1] + xk[min_i][1]) / 2)
                fk[i] = data.function(xk[i][0], xk[i][1])
                f_calls += 2
        path.append(xk[0])
    return ResultData(self.name, xk[0], fk[0], f_calls, 0, path)
def __init__(self, input_user_file_name, input_links_file_name, output_file_name,
             emb_dimension=100, num_batch=30000, batch_size=100, initial_lr=0.025):
    """Initialize class parameters.

    Args:
        input_user_file_name: user data file.
        input_links_file_name: link (relationship) data file.
        output_file_name: file to save the result to.
        emb_dimension: embedding dimension.
        num_batch: number of batches to process.
        batch_size: batch size.
        initial_lr: initial learning rate.

    Returns:
        None.
    """
    # Load the data
    self.data = InputData(input_user_file_name, input_links_file_name)
    self.output_file_name = output_file_name
    # emb_size is the size of the embedding table, equal to the number of vertices
    self.emb_size = self.data.vertex_count
    self.emb_dimension = emb_dimension
    # batch_size is the amount of data used per update
    self.batch_size = batch_size
    self.initial_lr = initial_lr
    self.num_batch = num_batch
    # Build the model; +1 because vertex ids start at 1, so slot 0 is kept but unused
    self.NetModel = NetModel(self.emb_size + 1, self.emb_dimension)
    # Use CUDA acceleration if available
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.NetModel.cuda()
    # Update parameters with stochastic gradient descent
    self.optimizer = optim.SGD(self.NetModel.parameters(), lr=self.initial_lr)
def __init__(self, input_file_name, output_file_name):
    self.min_count = 5
    self.emb_dimension = 100
    self.batch_size = 64
    self.window_size = 5
    self.iteration = 1
    self.initial_lr = 0.001
    self.data = InputData(input_file_name, self.min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                         self.batch_size, self.window_size,
                                         self.iteration, self.initial_lr,
                                         self.min_count)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(
        self.skip_gram_model.parameters(), lr=self.initial_lr)
def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=100,
             window_size=5, iteration=5, initial_lr=0.025, min_count=5, using_hs=False,
             using_neg=False, context_size=2, hidden_size=128, cbow=None, skip_gram=None):
    print("\nInput File loading......\n")
    self.data = InputData(input_file_name, min_count)
    print("\nInput File loaded.\n")
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.context_size = context_size
    self.hidden_size = hidden_size
    self.using_hs = using_hs
    self.using_neg = using_neg
    self.cbow = cbow
    self.skip_gram = skip_gram
    if self.skip_gram is not None and self.skip_gram:
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        print("skip_gram_model", self.skip_gram_model)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
    if self.cbow is not None and self.cbow:
        self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
        print("CBOW_model", self.cbow_model)
        self.optimizer = optim.SGD(self.cbow_model.parameters(), lr=self.initial_lr)
def __call__(self, data: InputData) -> ResultData:
    (x, y) = data.initial
    h = self.h
    calls_count = 0
    path = [(x, y)]
    while h > self.delta:
        x1 = x
        y1 = y
        f = data.function(x, y)
        if data.function(x + h, y) < f:
            x1 += h
        if data.function(x - h, y) < f:
            x1 -= h
        if data.function(x, y + h) < f:
            y1 += h
        if data.function(x, y - h) < f:
            y1 -= h
        calls_count += 5
        if (x1, y1) == (x, y):
            h /= 2
            continue
        x += self.lambd * (x1 - x)
        y += self.lambd * (y1 - y)
        path.append((x, y))
    return ResultData(self.name, (x, y), data.function(x, y), calls_count, 0, path)
class Word2Vec: def __init__(self, input_file_name, output_file_name): self.output_file_name = output_file_name self.data = InputData(input_file_name, MIN_COUNT) self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda() self.lr = LR self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr) def train(self): print("CBOW Training......") pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE) print("pairs_count", pairs_count) batch_count = pairs_count / BATCH_SIZE print("batch_count", batch_count) process_bar = tqdm(range(int(batch_count))) loss = -1 for i in process_bar: pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE) pos_u = [pair[0] for pair in pos_pairs] pos_w = [int(pair[1]) for pair in pos_pairs] neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT) self.optimizer.zero_grad() loss_now = self.model.forward(pos_u, pos_w, neg_w) if loss == -1: loss = loss_now.data.item() else: loss = 0.95 * loss + 0.05 * loss_now.data.item() loss_now.backward() self.optimizer.step() if i * BATCH_SIZE % 100000 == 0: self.lr = self.lr * (1.0 - 1.0 * i / batch_count) for param_group in self.optimizer.param_groups: param_group['lr'] = self.lr process_bar.set_postfix(loss=loss) process_bar.update() self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
def user_clustering():
    users = []
    with open(FILE_DIR + file_path[0]) as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
                if len(users) % 10000 == 0:
                    print(len(users))
    print(len(users))
    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(features)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
def user_clustering():
    users = []
    with open('../analytics/stdev/usercluster200.csv') as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
                if len(users) % 10000 == 0:
                    print(len(users))
    print(len(users))
    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(features)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
def test_write(self):
    sents = [["a", "b", "c"], ["b", "c"], ["a"], []]
    sents_ids = [[1, 3, 5], [3, 5], [1], []]
    input_data = InputData(self.vocab, sents)
    with tempfile.NamedTemporaryFile() as fp:
        input_data.write(fp.name)
        filenames = [fp.name]
        dataset = tf.contrib.data.TFRecordDataset(filenames)
        dataset = dataset.map(input_data.get_single_example)
        iterator = dataset.make_initializable_iterator()
        sentence = iterator.get_next()
        with tf.Session() as sess:
            sentences = []
            sess.run(iterator.initializer)
            while True:
                try:
                    sentences.append(sess.run(sentence))
                except tf.errors.OutOfRangeError:
                    break
    self.assertEqual([s[1].tolist() for s in sentences], sents_ids)
def __init__(
    self,
    input_path,
    output_dir,
    wordsim_path,
    dimension=100,
    batch_size=batch_size,
    window_size=5,
    epoch_count=1,
    initial_lr=1e-6,
    min_count=5,
):
    self.data = InputData(input_path, min_count)
    self.output_dir = output_dir
    self.vocabulary_size = len(self.data.id_from_word)
    self.dimension = dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.epoch_count = epoch_count
    self.initial_lr = initial_lr
    self.model = SkipGramModel(self.vocabulary_size, self.dimension)
    if torch.cuda.is_available():
        self.device = torch.device('cuda')
    else:
        self.device = torch.device('cpu')
    self.model = nn.DataParallel(self.model.to(self.device))
    self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

    if wordsim_path:
        self.wordsim_verification_tuples = []
        with open(wordsim_path, 'r') as f:
            f.readline()  # Abandon header
            for line in f:
                word1, word2, actual_similarity = line.split(',')
                self.wordsim_verification_tuples.append(
                    (word1, word2, float(actual_similarity))
                )
    else:
        self.wordsim_verification_tuples = None
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        for _ in range(1, EPOCH + 1):
            print("CBOW Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = int(np.ceil(pairs_count / BATCH_SIZE))
            print("batch_count", batch_count)
            process_bar = tqdm(range(int(batch_count)))
            # for _ in range(1, EPOCH + 1):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_w = [int(pair[1]) for pair in pos_pairs]
                neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()
                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr
                process_bar.set_postfix(loss=loss.data)
                process_bar.update()
            print('\n')
        torch.save(self.model.state_dict(), "../results/url_with_location_cbow_neg.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
def __init__(self, infile, outfile, emb_dim=100, batch_size=128, window_size=5,
             epochs=5, initial_lr=1, min_count=5):
    self.data = InputData(infile, min_count)
    self.outfile = outfile
    self.emb_size = len(self.data.id2word)
    self.emb_dim = emb_dim
    self.batch_size = batch_size
    self.window_size = window_size
    self.epochs = epochs
    self.initial_lr = initial_lr
    self.wv_model = SkipgramModel(self.emb_size, self.emb_dim)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.wv_model.cuda()
    self.optimizer = optim.SGD(self.wv_model.parameters(), lr=self.initial_lr)
def __call__(self, data: InputData) -> ResultData:
    (x, y) = data.initial
    path = [(x, y)]
    dx = data.df_dx1(x, y)
    x1 = x - self.alpha * dx
    dy = data.df_dx2(x1, y)
    y1 = y - self.alpha * dy
    calls_count = 2
    f_calls = 0
    while ((x1 - x)**2 + (y1 - y)**2) >= data.eps**2:
        (x, y) = (x1, y1)
        dx = data.df_dx1(x, y)
        x1 = x - self.alpha * dx
        dy = data.df_dx2(x1, y)
        y1 = y - self.alpha * dy
        calls_count += 2
        path.append((x, y))
    path.append((x1, y1))
    return ResultData(self.name, (x1, y1), data.function(x, y), f_calls, calls_count, path)
class Word2Vec: def __init__(self, input_file_name, output_file_name): self.output_file_name = output_file_name self.data = InputData(input_file_name, MIN_COUNT) self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda() self.lr = LR self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr) def train(self): print("SkipGram Training......") pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE) print("pairs_count", pairs_count) batch_count = pairs_count / BATCH_SIZE print("batch_count", batch_count) process_bar = tqdm(range(int(batch_count))) for i in process_bar: pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE) pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs) pos_u = [pair[0] for pair in pos_pairs] pos_v = [int(pair[1]) for pair in pos_pairs] neg_u = [pair[0] for pair in neg_pairs] neg_v = [int(pair[1]) for pair in neg_pairs] self.optimizer.zero_grad() loss = self.model.forward(pos_u, pos_v, neg_u, neg_v) loss.backward() self.optimizer.step() if i * BATCH_SIZE % 100000 == 0: self.lr = self.lr * (1.0 - 1.0 * i / batch_count) for param_group in self.optimizer.param_groups: param_group['lr'] = self.lr process_bar.set_postfix(loss=loss.data.cpu().numpy()) process_bar.update() torch.save(self.model.state_dict(), "../results/skipgram_hs.pkl") self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
             window_size=5, iteration=1, initial_lr=0.025, min_count=5):
    self.data = InputData(input_file_name, min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.skip_gram_model = SGModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(
        self.skip_gram_model.parameters(), lr=self.initial_lr)
def run_enviroment(algorithms, cluster_model):
    ite = 0
    for file_name in file_path:
        with open(FILE_DIR + file_name) as f:
            old = {}
            old['LinearedUCB'] = 0
            old['OriginalUCB'] = 0
            old_imp = {}
            old_imp['LinearedUCB'] = 0
            old_imp['OriginalUCB'] = 0
            click_count = 0
            click_eq_count = 0
            ctr_list = {}
            ctr_list['LinearedUCB'] = []
            ctr_list['OriginalUCB'] = []
            for line in f:
                _, click_article_id, click, user_data, article_pool = InputData.split_data(line)
                click_count += click
                userID = cluster_model.predict_cluster(user_data)[0]
                for name, alg in algorithms.items():
                    decide_id = alg.decide(userID, user_data, article_pool)
                    if evaluate(click_article_id, decide_id, click, name, ite, line):
                        # if click == 1: print(click, algorithms['OriginalUCB'].get_prob_check(userID, article_pool[decide_id]))
                        ctr_list[name].append((reward[name] - old[name]) /
                                              (count[name] - old_imp[name]))
                        alg.update(userID, user_data, article_pool[decide_id], click)
                        if count[name] % 2000 == 0:
                            # print(ite, name, reward[name], count[name], reward[name]/count[name], reward[name]-old[name], count[name]-old_imp[name], (reward[name]-old[name])/(count[name]-old_imp[name]), click_count)
                            print(name, np.mean(ctr_list[name]), reward[name] - old[name],
                                  count[name] - old_imp[name], reward[name], count[name])
                            old[name] = reward[name]
                            old_imp[name] = count[name]
                            click_count = 0
                            click_eq_count = 0
                            ctr_list[name] = []
                        count[name] += 1
                    # alg.save_weight(name + '_weight_' + str(N_CLUSTERING) + '_3.csv')
                ite += 1
    return
def test_prepeare_one_hot_input(self):
    sents1 = [["a", "b", "c"], ["b", "c"], ["a"], []]
    sents2 = [["d", "e"], ["d"]]
    voc, (input_data_1, input_data_2) = InputData.prepeare_one_hot_input(
        sents1, sents2, min_word_count=0)
    self.assertEqual(set(voc.ids.keys()), {"a", "b", "c", "d", "e"})
    with tempfile.NamedTemporaryFile() as fp:
        input_data_1.write(fp.name)
        filenames = [fp.name]
        dataset = tf.contrib.data.TFRecordDataset(filenames)
        dataset = dataset.map(input_data_1.get_single_example)
        iterator = dataset.make_initializable_iterator()
        sentence = iterator.get_next()
        with tf.Session() as sess:
            sentences = []
            sess.run(iterator.initializer)
            while True:
                try:
                    sentences.append(sess.run(sentence))
                except tf.errors.OutOfRangeError:
                    break
    self.assertEqual([[voc.id2word(w) for w in s[1].tolist()] for s in sentences], sents1)
def train_dir(self, dataset, path):
    reader = InputData(dataset, path)
    dictList = reader.readDir()
    self.train_all(dictList)
config = get_config()
eval_config = get_config()
eval_config.batch_size = 1
eval_config.num_steps = 1

use_min = True  # renamed from `min` to avoid shadowing the builtin
suffix = '.txt'
if use_min:
    suffix = '_min.txt'
train_path = train_file + suffix
test_path = test_file + suffix

data = InputData(model_fields=model_fields, file_path=train_path)
test = InputData(model_fields=model_fields, file_path=test_path)
test_data = test
if config.valid_on_test_data:
    train_data = data
    valid_data = test_data
else:
    train_data, valid = data.random_pick(0.7)
    valid_data = valid
print("valid data count:" + str(len(valid_data.data())))

with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)