def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(
        graph.num_nodes,
        args.embed_size,
        args.neg_num,
        sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_steps = int(graph.num_nodes / args.batch_size) * args.epoch
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)
    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)
    paddle.save(model.state_dict(), "model.pdparams")
def main(args):
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(
        graph.num_nodes,
        args.embed_size,
        args.neg_num,
        sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    optim = Adam(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay)

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)
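# Both Paddle drivers above delegate the inner loop to a train() helper that
# is not shown in this collection. Below is a minimal sketch of what such a
# helper could look like. The (src, dsts) batch layout produced by
# BatchRandWalk and the model returning the loss tensor directly are
# assumptions, not the original implementation.
def train(model, data_loader, optim):
    model.train()
    total_loss = 0.0
    steps = 0
    for src, dsts in data_loader:  # assumed collate output: center nodes, contexts+negatives
        loss = model(src, dsts)    # assumed: SkipGramModel returns the scalar loss
        loss.backward()
        optim.step()
        optim.clear_grad()         # Paddle's counterpart of zero_grad()
        total_loss += float(loss)
        steps += 1
    return total_loss / max(steps, 1)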
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr,
                                             self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Run the full training loop.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_v)
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
class Metapath2VecTrainer:
    def __init__(self, args):
        if args.aminer:
            dataset = AminerDataset(args.path)
        else:
            dataset = CustomDataset(args.path)
        self.data = DataReader(dataset, args.min_count, args.care_type)
        dataset = Metapath2vecDataset(self.data, args.window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = args.output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
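# Nearly every trainer in this collection constructs SkipGramModel(emb_size,
# emb_dimension) and calls forward(pos_u, pos_v, neg_v), but the model itself
# is never shown. Below is a minimal sketch of that interface with the
# standard negative-sampling loss; sparse=True matches the SparseAdam usage
# above. This is a hypothetical reconstruction, not the original file.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModel(nn.Module):
    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        self.emb_dimension = emb_dimension
        # Center ("u") and context ("v") embedding tables; sparse gradients
        # so SparseAdam can be used.
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        initrange = 0.5 / emb_dimension
        nn.init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        nn.init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)    # (B, D) center words
        emb_v = self.v_embeddings(pos_v)    # (B, D) observed context words
        emb_neg = self.v_embeddings(neg_v)  # (B, K, D) sampled negatives
        pos_score = torch.sum(emb_u * emb_v, dim=1)                    # (B,)
        neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # (B, K)
        # Negative-sampling objective: maximize log sigma(u.v) for observed
        # pairs and log sigma(-u.v') for sampled negatives.
        loss = -F.logsigmoid(pos_score) \
               - torch.sum(F.logsigmoid(-neg_score), dim=1)
        return torch.mean(loss)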
def train(args):
    data = InputData(args.input, args.min_count, args.sample)
    output_file_name = args.output
    emb_size = len(data.word2id)
    emb_dimension = args.dim
    batch_size = args.mb
    window_size = args.window
    n_negs = args.n_negs
    iteration = args.iters
    initial_lr = args.lr
    use_cuda = args.cuda

    skip_gram_model = SkipGramModel(emb_size, emb_dimension)
    if use_cuda:
        skip_gram_model = skip_gram_model.cuda()
    optimizer = optim.SGD(skip_gram_model.parameters(), lr=initial_lr)

    pair_count = data.evaluate_pair_count(window_size)
    batch_count = iteration * pair_count / batch_size
    process_bar = tqdm(range(int(batch_count)))
    # skip_gram_model.save_embedding(
    #     data.id2word, 'begin_embedding.txt', use_cuda)
    for i in process_bar:
        pos_pairs = data.get_batch_pairs(batch_size, window_size)
        neg_v = data.get_neg_v_neg_sampling(pos_pairs, n_negs)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [pair[1] for pair in pos_pairs]

        pos_u = torch.LongTensor(pos_u)
        pos_v = torch.LongTensor(pos_v)
        neg_v = torch.LongTensor(neg_v)
        if use_cuda:
            pos_u = pos_u.cuda()
            pos_v = pos_v.cuda()
            neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = skip_gram_model(pos_u, pos_v, neg_v)
        loss.backward()
        optimizer.step()

        process_bar.set_description(
            "\rLoss: %0.8f, lr: %0.6f" %
            (loss.item(), optimizer.param_groups[0]['lr']))
        if i * batch_size % 100000 == 0:
            lr = initial_lr * (1.0 - 1.0 * i / batch_count)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    skip_gram_model.save_embedding(data.id2word, output_file_name, use_cuda)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=50, window_size=5, iteration=5, initial_lr=0.025,
                 neg_num=5, min_count=5):
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # Save an intermediate checkpoint three times over the run.
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.LongTensor(pos_u).cuda()
            pos_v = torch.LongTensor(pos_v).cuda()
            neg_v = torch.LongTensor(neg_v).cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(
                    self.data.id2word, self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name + 'final')
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    if args.edge_file:
        graph = load_from_file(args.edge_file)
    else:
        graph = load(args.dataset)

    # Rebuild the graph from a saved edge dump, adding reverse edges so the
    # graph is undirected (note: this overrides the graph loaded above).
    edges = np.load("./edges.npy")
    edges = np.concatenate([edges, edges[:, [1, 0]]])
    graph = pgl.Graph(edges)

    model = SkipGramModel(
        graph.num_nodes,
        args.embed_size,
        args.neg_num,
        sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_ds = ShardedDataset(graph.nodes, repeat=args.epoch)
    train_steps = int(len(train_ds) // args.batch_size)
    log.info("train_steps: %s" % train_steps)
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)
    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    train_loss = train(model, data_loader, optim)
    paddle.save(model.state_dict(), "model.pdparams")
def main(config, ip_list_file):
    ds = TrainPairDataset(config, ip_list_file)
    loader = Dataloader(
        ds,
        batch_size=config.batch_pair_size,
        num_workers=config.num_workers,
        stream_shuffle_size=config.pair_stream_shuffle_size,
        collate_fn=CollateFn())

    model = SkipGramModel(config)
    if config.warm_start_from:
        log.info("warm start from %s" % config.warm_start_from)
        model.set_state_dict(paddle.load(config.warm_start_from))

    optim = Adam(
        learning_rate=config.lr,
        parameters=model.parameters(),
        lazy_mode=config.lazy_mode)

    log.info("starting training...")
    train(config, model, loader, optim)
class Word2Vec:
    def __init__(self,
                 log_filename: str,
                 output_filename: str,
                 embedding_dimension: int = 100,
                 batch_size: int = 128,
                 iteration: int = 1,
                 initial_lr: float = 0.025,
                 min_count: int = 5,
                 sub_sampling_t: float = 1e-5,
                 neg_sampling_t: float = 0.75,
                 neg_sample_count: int = 5,
                 half_window_size: int = 2,
                 read_data_method: str = 'memory'):
        """Initialize the trainer and its data handler."""
        self.data = DataHanlder(log_filename=log_filename,
                                batch_size=batch_size,
                                min_count=min_count,
                                sub_sampling_t=sub_sampling_t,
                                neg_sampling_t=neg_sampling_t,
                                neg_sample_count=neg_sample_count,
                                half_window_size=half_window_size,
                                read_data_method=read_data_method)
        self.output_filename = output_filename
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.half_window_size = half_window_size
        self.iter = iteration
        self.initial_lr = initial_lr
        self.sg_model = SkipGramModel(len(self.data.vocab),
                                      self.embedding_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.sg_model.cuda()
        self.optimizer = optim.SGD(self.sg_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        i = 0
        # Approximate pair count: each word contributes about
        # 2 * half_window_size pairs, minus the truncated windows at sentence
        # boundaries. NOT an exact number, just an estimate used to schedule
        # the learning-rate decay.
        approx_pair = 2 * self.half_window_size * self.data.total_word_count - \
            (1 + self.half_window_size) * self.half_window_size * \
            self.data.sentence_len
        batch_count = self.iter * approx_pair / self.batch_size
        for pos_u, pos_v, neg_samples in self.data.gen_batch():
            i += 1
            if self.data.sentence_cursor > self.data.sentence_len * self.iter:
                # Reached the maximum number of iterations.
                break

            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_samples)
            if self.use_cuda:
                # Use a separate loop variable so the step counter `i` is not
                # overwritten by the tensors.
                pos_u, pos_v, neg_v = [t.cuda() for t in (pos_u, pos_v, neg_v)]

            self.optimizer.zero_grad()
            # The forward pass returns the loss directly.
            loss = self.sg_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 100 == 0:
                print("step: %d, Loss: %0.8f, lr: %0.6f" %
                      (i, loss.item(), self.optimizer.param_groups[0]['lr']))
            if i % (100000 // self.batch_size) == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.sg_model.save_embedding(self.data.id2word, self.output_filename,
                                     self.use_cuda)
class Word2Vec:
    def __init__(
            self,
            input_file_name,
            input_wvectors,
            input_cvectors,
            input_ps,
            input_ns,
            output_file_name,
            emb_dimension=100,
            batch_size=50,
            window_size=5,
            kn=20,
            iteration=1,
            initial_lr=0.001,
            clip=1.0,
            min_count=30,
            batch_num_to_valid=100000,
    ):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence
                split with spaces.
            input_wvectors: Pretrained word vectors.
            input_cvectors: Pretrained context vectors.
            input_ps: Pretrained positive samples.
            input_ns: Pretrained negative samples.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            clip: Gradient-norm clipping threshold.
            min_count: The minimal word frequency; words with lower frequency
                will be filtered out.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Run the full training loop with periodic evaluation.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        os.makedirs(tmp_emb_dir, exist_ok=True)
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))
        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(
                    pos_pairs, self.ps, self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(
                    pos_pairs, self.ns, self.kn)
                pair_u = [pair[0] for pair in pos_pairs]
                pair_v = [pair[1] for pair in pos_pairs]

                pair_u = torch.LongTensor(pair_u)
                pair_v = torch.LongTensor(pair_v)
                pos_u = torch.LongTensor(pos_u)
                mask_pos_u = torch.FloatTensor(mask_pos_u)
                neg_u = torch.LongTensor(neg_u)
                mask_neg_u = torch.FloatTensor(mask_neg_u)
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...' % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)
                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag:
                        emb_save_path = self.output_file_name + \
                            "_epoch%d_batch%d" % (epoch, i)
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s' %
                                     emb_save_path)
            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name,
                                                self.use_cuda)
            logging.info('final evaluating...')
            self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
            best_scores, save_flag = evaluation(tmp_emb_path,
                                                similarity_test_paths,
                                                synset_paths, analogy_paths,
                                                best_scores)
            if save_flag:
                emb_save_path = self.output_file_name + "_epoch%d" % epoch
                shutil.move(tmp_emb_path, emb_save_path)
                logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        As this class only records word ids, the map from id to word has to be
        passed in from outside.

        Args:
            id2word: map from word id to word.
            file_name: output file name.

        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu(
            ).data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        with open(file_name, 'w') as fout:
            fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(str, embedding[wid]))
                fout.write('%s %s\n' % (w, e))
class Metapath2Vec:
    def __init__(self, args, graph):
        # 1. generate walker
        walker = MetaPathWalker(args, graph)

        files = os.listdir(args.input_path)
        is_file = False
        for file in files:
            fullFilename = os.path.join(args.input_path, file)
            # If the metapath file already exists, load it.
            if file.startswith(args.idx_metapath):
                is_file = True
                print("\n !!! Found the file that you have specified...")
                self.inputFileName = "{}{}-metapath_{}-whichmeta_{}-num_walks_{}-len_metapath.txt".format(
                    args.input_path, args.idx_metapath, args.which_metapath,
                    args.num_walks, args.len_metapath)
                print("### Metapaths Loaded...", self.inputFileName)

        # If the file does not exist, create a new one.
        if not is_file:
            print("\n !!! There is no metapath file with the given parameters...")
            print("### Creating new Metapaths...")
            self.metapaths = walker.generate_metapaths(args)
            walker.create_metapath_walks(args, args.num_walks, self.metapaths)
            self.inputFileName = "{}{}-metapath_{}-whichmeta_{}-num_walks_{}-len_metapath.txt".format(
                args.input_path, args.idx_metapath, args.which_metapath,
                args.num_walks, args.len_metapath)
            print("### Metapaths Loaded...", self.inputFileName)

        # 2. read data
        print("\n\n##########################################################################")
        print("### Metapaths to DataLoader...", self.inputFileName)
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)

        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)

        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = "{}{}-embedding_{}-metapath_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count-_{}-isCSP_{}-CSPcoef.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count,
            args.CSP_train, args.CSP_coef)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.aux_mode = args.CSP_train
        self.aux_coef = args.CSP_coef

        if args.CSP_train:
            print("\n\n#####################################")
            print("### SkipGram with CSP")
            self.skip_gram_model = SkipGramModelAux(self.emb_size,
                                                    self.emb_dimension,
                                                    nodes=self.data.id2word,
                                                    aux_coef=self.aux_coef,
                                                    CSP_save=args.CSP_save)
        else:
            print("\n\n#####################################")
            print("### SkipGram Normal")
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            if self.aux_mode:
                # Optimize the embedding tables and the CSP encoder with
                # separate Adam optimizers.
                u = self.skip_gram_model.u_embeddings.weight
                v = self.skip_gram_model.v_embeddings.weight
                e = self.skip_gram_model.encoder.weight
                optimizer = optim.Adam([u, v], lr=self.initial_lr)
                aux_optimizer = optim.Adam([e], lr=0.001)
                aux_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    aux_optimizer, len(self.dataloader))
            else:
                optimizer = optim.SparseAdam(
                    self.skip_gram_model.parameters(), lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    if self.aux_mode:
                        aux_scheduler.step()
                        aux_optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    if self.aux_mode:
                        aux_optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    # if i > 0 and i % int(len(self.dataloader) / 3) == 0:
                    print(" Loss: " + str(running_loss))
                    if self.aux_mode:
                        print(" Auxiliary Loss: " +
                              str(self.skip_gram_model.aux_loss.item()))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Node2Vec:
    def __init__(self, args, graph):
        print("\nPerforming Node2vec...\n")

        # 1. generate walker
        walker = DeepWalker(args, graph)
        print("\nDoing deepwalks...\n")
        walker.create_features()
        self.inputFileName = "{}{}-deepwalk_{}-num_walks_{}-len_metapath.txt".format(
            args.input_path, args.idx_metapath, args.number_of_walks,
            args.walk_length)

        # 2. read data
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)

        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)

        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = "{}{}-embedding_{}-deepwalk_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    print(" Loss: " + str(running_loss))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Word2Vec: """ Word2Vec class module for extracting triples and training. """ def __init__(self, ifolder, ofolder, emb_dimension=400, batch_size=32, iteration=int(sys.argv[3]), initial_lr=0.025): self.ifolder = ifolder self.outfolder = ofolder+ifolder.rsplit('/',2)[1]+'/' try: os.makedirs(self.outfolder) except: print(self.outfolder+ " folder exists. Will be overwritten") self.emb_dimension = emb_dimension self.initial_lr = initial_lr self.iteration = iteration self.batch_size = batch_size self.fpos = 0 self.fneg = 0 self.id2word = dict() self.id2pair = dict() self.pair2id = dict() self.read_word_dict(ifolder+"Word2Id") self.read_pair_dict(ifolder+"Pair2Id") self.pair_count = self.evaluate_pair_count() self.positive_pairs = np.zeros((self.pair_count, 2), dtype=int) # Dummy values to ensure size does not change self.negative_pairs = np.zeros((self.pair_count, 5), dtype=int) print(" Size of :", sys.getsizeof(self.positive_pairs)) print(" Size of :", sys.getsizeof(self.negative_pairs)) #ipdb.set_trace() self.emb_size = len(self.id2word) self.pair_emb_size = len(self.id2pair) self.skip_gram_model = SkipGramModel(self.pair_emb_size,self.emb_size, self.emb_dimension) self.use_cuda = torch.cuda.is_available() if self.use_cuda: self.skip_gram_model.cuda() self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr) print("Start reading pairs") def read_word_dict(self, wdictfile ): with open(wdictfile) as inputFile: for item in inputFile: word,wid = item.split() self.id2word[int(wid)] = word print("\n Completed reading word dictionary.") def read_pair_dict(self, pdictfile ): with open(pdictfile) as inputFile: for item in inputFile: word1,word2,pid = item.split() self.id2pair[int(pid)] = word1+':::'+word2 self.pair2id[(word1,word2)] = int(pid) #print(self.id2pair[int(pid)],word1+':::'+word2) print("\n Completed reading pair dictionary.") self.cross_verification_BLESS() self.cross_verification_EVAL() def evaluate_pair_count(self): self.datasets = dict() dsfile = self.ifolder+"Statistics" with open(dsfile) as inputFile: for item in inputFile: if re.match("Dataset",item): i = item.split(':')[1] print("Total positive pair samples :",i) return int(i) def read_pairs(self, posFile, negFile): """ Read triples from file and update self.positive_pairs & self.negative_pairs """ posDsfile = self.ifolder+posFile index = 0 #ipdb.set_trace() with open(posDsfile) as inputFile: for line in inputFile: pid, wid = line.split() #self.positive_pairs.append([int(pid),int(wid)]) self.positive_pairs[index] = [int(pid),int(wid)] index += 1 print("Size of :", sys.getsizeof(self.positive_pairs)) negDsfile = self.ifolder+negFile index = 0 with open(negDsfile) as inputFile: for line in inputFile: temp = [int(i) for i in line.split()] self.negative_pairs[index] = temp index += 1 print(" Size of :", sys.getsizeof(self.negative_pairs)) def get_batch_pairs(self, batch_count): return self.positive_pairs[(batch_count)*self.batch_size:(batch_count+1)*self.batch_size] def get_neg_v(self, batch_count): return self.negative_pairs[(batch_count)*self.batch_size:(batch_count+1)*self.batch_size] def cross_verification_BLESS(self): """ Optional method To verify how many BLESS dataset elements are mapped with model pairs """ #Remove the file if it already exists try: os.remove(self.outfolder+"BlessSet.txt") except: pass #Remove the file if it already exists try: os.remove(self.outfolder+"BlessSet_Except.txt") except: pass blessExceptFile = open(self.outfolder+"BlessSet_Except.txt","w") blessFile = 
open(self.outfolder+"BlessSet.txt","w") self.Bless_id2pair = dict() with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/BLESS_UniqueTuples") as evalFile: testDataset = evalFile.readlines() for items in testDataset: nouns = items.split() search_key = (nouns[0],nouns[1]) rev_search_key = (nouns[1],nouns[0]) if (search_key in self.pair2id): temp_id = self.pair2id[search_key] self.Bless_id2pair[temp_id] = nouns[0]+':::'+nouns[1] blessFile.write(items) else: blessExceptFile.write(items) print("Completed cross validation with Blessset") blessExceptFile.close() blessFile.close() def cross_verification_EVAL(self): """ Optional method To verify how many EVAL dataset elements are mapped with model pairs """ #Remove the file if it already exists try: os.remove(self.outfolder+"EvalSet.txt") except: pass #Remove the file if it already exists try: os.remove(self.outfolder+"EvalSet_Except.txt") except: pass EVALExceptFile = open(self.outfolder+"EvalSet_Except.txt","w") EVALFile = open(self.outfolder+"EvalSet.txt","w") self.Eval_id2pair = dict() with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/EVAL_UniqueTuples") as evalFile: testDataset = evalFile.readlines() for items in testDataset: nouns = items.split() search_key = (nouns[0],nouns[1]) rev_search_key = (nouns[1],nouns[0]) if (search_key in self.pair2id): temp_id = self.pair2id[search_key] self.Eval_id2pair[temp_id] = nouns[0]+':::'+nouns[1] EVALFile.write(items) else: EVALExceptFile.write(items) print("Completed cross validation with Blessset") EVALExceptFile.close() EVALFile.close() def train(self): """Multiple training. Returns: None. """ batch_count = self.pair_count / self.batch_size for epoch in range(self.iteration): print("\n Epoch :", epoch) output_file_name = self.outfolder+"Epoch_"+str(epoch)+"_EMB_"+str(self.emb_dimension)+"_All.txt" Bless_output_file_name = self.outfolder+"Epoch_"+str(epoch)+"_EMB_"+str(self.emb_dimension)+"_Bless.txt" epochLoss = 0 process_bar = tqdm(range(int(batch_count))) for i in process_bar: pos_pairs = self.get_batch_pairs(i) neg_v = self.get_neg_v(i) pos_u = np.array([pair[0] for pair in pos_pairs]) #index to the pair of Nouns pos_v = np.array([pair[1] for pair in pos_pairs]) #a context word (for instance, inbetween word) #pos_u = Variable(torch.LongTensor(pos_u)) pos_u = Variable(torch.LongTensor(pos_u)) pos_v = Variable(torch.LongTensor(pos_v)) neg_v = Variable(torch.LongTensor(neg_v)) #a negative context word from unigram distribution if self.use_cuda: pos_u = pos_u.cuda() pos_v = pos_v.cuda() neg_v = neg_v.cuda() self.optimizer.zero_grad() loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v) loss.backward() self.optimizer.step() process_bar.set_description("Loss: %0.8f, lr: %0.6f" % (loss.data[0],self.optimizer.param_groups[0]['lr'])) epochLoss += loss.data[0] if i * self.batch_size % 100000 == 0: lr = self.initial_lr * (1.0 - 1.0 * i / batch_count) for param_group in self.optimizer.param_groups: param_group['lr'] = lr print("\n Average Epoch Loss: ", epochLoss/batch_count) self.skip_gram_model.save_embedding(self.id2pair, output_file_name, self.use_cuda)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Run the full skip-gram training loop.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(
            self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
class Word2VecTrainer:
    def __init__(self, input_file, antonym_file, output_file,
                 emb_dimension=100, batch_size=32, window_size=5,
                 iterations=3, initial_lr=0.001, min_count=12):
        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.antonym_file = open(antonym_file, 'r')
        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def calculate_antonym_loss(self):
        src_ids = []
        tgt_ids = []
        while len(src_ids) < self.batch_size:
            line = self.antonym_file.readline()
            if not line:
                # EOF reached; wrap around to the start of the file.
                self.antonym_file.seek(0)
            words = line.strip('\n').split()
            if len(words) < 2:
                continue
            src = words[0]
            tgt = random.choice(words[1:]).strip('\n')
            src_id = self.data.word2id.get(src, None)
            tgt_id = self.data.word2id.get(tgt, None)
            if src_id is None or tgt_id is None:
                continue
            src_ids.append(src_id)
            tgt_ids.append(tgt_id)

        input_src = torch.LongTensor(src_ids).to(self.device)
        input_tgt = torch.LongTensor(tgt_ids).to(self.device)
        src_embedding = torch.squeeze(self.skip_gram_model.embed(input_src))
        tgt_embedding = torch.squeeze(self.skip_gram_model.embed(input_tgt))
        # Penalize the absolute cosine similarity between each antonym pair.
        loss = torch.abs(
            torch.sum(torch.mul(src_embedding, tgt_embedding), dim=1))
        loss = loss / (torch.norm(src_embedding, dim=1) *
                       torch.norm(tgt_embedding, dim=1))
        return torch.mean(loss)

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            count = 0
            for i, sample_batched in enumerate(self.dataloader):
                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed" %
                          (iteration, count))
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    skip_gram_loss = self.skip_gram_model.forward(
                        pos_u, pos_v, neg_v)
                    antonym_loss = 100 * self.calculate_antonym_loss()
                    loss = skip_gram_loss + antonym_loss
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss) +
                              ' sk: ' + str(skip_gram_loss.data) +
                              ' ant: ' + str(antonym_loss.data))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
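# The antonym penalty above is the absolute cosine similarity |cos(src, tgt)|
# averaged over the batch, written out by hand. A compact equivalent using
# F.cosine_similarity is sketched below; antonym_penalty is a hypothetical
# helper, not part of the trainer.
import torch
import torch.nn.functional as F


def antonym_penalty(src_embedding, tgt_embedding):
    # Zero when antonym embeddings are orthogonal, which is the direction the
    # penalty pushes the model toward.
    return torch.mean(
        torch.abs(F.cosine_similarity(src_embedding, tgt_embedding, dim=1)))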
def init_sample_table():
    # Build the negative-sampling table: each word receives slots in
    # proportion to its frequency raised to the 3/4 power.
    sample_table = []
    sample_table_size = 1e8
    words = list(sub_graph_vocab.keys())
    pow_frequency = np.array(list(sub_graph_vocab.values())) ** 0.75
    words_pow = sum(pow_frequency)
    ratio = pow_frequency / words_pow
    count = np.round(ratio * sample_table_size)
    for wid, c in enumerate(count):
        sample_table += [sub_graph_to_id[words[wid]]] * int(c)
    return np.array(sample_table)


sample_table = init_sample_table()

neg_count = 2
epoch = 20
opt = optim.SparseAdam(model_1.parameters(), lr=0.0001)
model_1.train()

cuda = False
if torch.cuda.is_available():
    cuda = True
    model_1.cuda()

loss_g = {}
for i in range(epoch):
    for j in range(len(graph_enc)):
        opt.zero_grad()
        # doc_id = np.random.randint(1, len(graph_enc))
        doc_id = j
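# Negatives are typically drawn from the unigram^0.75 table built above by
# indexing it uniformly at random. The sampling helper itself is not shown in
# this fragment; sample_negatives below is a sketch of the usual approach.
import numpy as np


def sample_negatives(sample_table, batch_size, neg_count):
    idx = np.random.randint(0, len(sample_table),
                            size=(batch_size, neg_count))
    return sample_table[idx]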
class Word2Vec:
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):
        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_v)

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

    def calculate_probability(self, word1, word2):
        # Softmax over the dot products of word1's embedding with every
        # embedding in the vocabulary, evaluated at word2.
        embeddings = self.skip_gram_model.u_embeddings.weight.data.numpy()
        embedding_1 = embeddings[self.data.word2id[word1]]
        embedding_2 = embeddings[self.data.word2id[word2]]
        numerator = np.exp(np.sum(embedding_1 * embedding_2))
        denominator = np.sum(np.exp(
            np.sum(np.multiply(embedding_1, embeddings), axis=1)), axis=0)
        return numerator / denominator

    def wordsim353_spearman(self, input_filename):
        target_word = []
        context_word = []
        human_scores = []
        with open(input_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            ws353_pairs = -1
            for row in csv_reader:
                if ws353_pairs == -1:
                    # Skip the header row.
                    ws353_pairs += 1
                else:
                    target_word.append(row[0])
                    context_word.append(row[1])
                    human_scores.append(float(row[2]))
                    ws353_pairs += 1
        for pair in range(0, ws353_pairs):
            if target_word[pair] not in self.data.word2id:
                raise Exception('Target word not in model vocab: ',
                                target_word[pair])
            if context_word[pair] not in self.data.word2id:
                raise Exception('Context word not in model vocab: ',
                                context_word[pair])
        human_rankings = ss.rankdata(human_scores)
        machine_scores = []
        for pair in range(0, len(human_scores)):
            machine_scores.append(
                self.calculate_probability(target_word[pair],
                                           context_word[pair]))
        machine_rankings = ss.rankdata(machine_scores)
        human_scores_dict = dict()
        machine_scores_dict = dict()
        for pair in range(0, len(human_scores)):
            human_scores_dict[pair] = human_rankings[pair]
            machine_scores_dict[pair] = machine_rankings[pair]
        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
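# calculate_probability above exponentiates raw dot products, which can
# overflow for large embeddings. A numerically safer variant subtracts the
# maximum logit first; softmax is invariant to that shift, so the result is
# identical. This is a standalone sketch, not part of the original class.
import numpy as np


def calculate_probability_stable(embeddings, word2id, word1, word2):
    e1 = embeddings[word2id[word1]]
    logits = embeddings @ e1          # dot product of word1 with every word
    logits = logits - logits.max()    # stabilization shift
    probs = np.exp(logits)
    probs /= probs.sum()
    return probs[word2id[word2]]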
# cuda_gpu is assumed to be a module-level flag, e.g.
# cuda_gpu = torch.cuda.is_available(); its definition is not shown here.
class Word2Vec(object):
    def __init__(self, output_file_name, walks=[], emb_dimension=100,
                 batch_size=64, window_size=5, epochs=5, negative_num=5):
        print("Load data...")
        self.data = InputData(window_size, batch_size, walks)
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.epochs = epochs
        self.negative_num = negative_num
        self.batch_size = batch_size
        self.vocab_size = self.data.vocab_size
        self.model = SkipGramModel(self.vocab_size, self.emb_dimension)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1.0)
        if cuda_gpu:
            self.model = self.model.cuda()

    def train_model(self):
        for _ in tqdm(range(self.epochs)):
            step = 0
            avg_loss = 0
            for pos_pairs in self.data.data_iter:
                target_word = pos_pairs[0][:, 0]
                context_word = pos_pairs[0][:, 1]
                neg_word = self.data.get_negative_sample(pos_pairs[0], 3)

                target_word = torch.tensor(target_word, dtype=torch.long)
                context_word = torch.tensor(context_word, dtype=torch.long)
                neg_word = torch.tensor(neg_word, dtype=torch.long)
                if cuda_gpu:
                    target_word = target_word.cuda()
                    context_word = context_word.cuda()
                    neg_word = neg_word.cuda()
                loss = self.model(target_word, context_word, neg_word)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                avg_loss += loss.item()
                step += 1
                if step % 2000 == 0 and step > 0:
                    avg_loss /= 2000
                    print("Average loss at step ", step, ": ", avg_loss)
                    avg_loss = 0
        self.model.save_embedding(self.output_file_name)
        print("~ done.")
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence
                split with spaces.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency
                will be filtered out.
            using_hs: Whether to use hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Run the full skip-gram training loop.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(
                self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=300,
                 batch_size=64, window_size=5, iterations=5, initial_lr=1.0,
                 min_count=5):
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA FAIL")

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr)
            # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            #     optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    # scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.95 + loss.item() * 0.05
                    if i > 0 and i % 400 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(
                self.data.id2word, self.output_file_name.format(iteration))
            # Step-decay the learning rate between iterations.
            self.initial_lr *= 0.7
class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=100,
                 batch_size=32, window_size=5, iterations=3, initial_lr=0.001,
                 min_count=12):
        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            count = 0
            for i, sample_batched in enumerate(self.dataloader):
                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed" %
                          (iteration, count))
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
class Word2VecTrainer:
    def __init__(self, args):
        self.data, self.dataloader = self.load_train(args)

        if "train" in args.text:
            test_filename = args.text.replace("train", "test")
            if os.path.exists(test_filename):
                print("load test dataset: {}".format(test_filename))
                self.test = self.load_train(args, data=self.data,
                                            filename=test_filename,
                                            is_train=False)
            else:
                self.test = None
            dev_filename = args.text.replace("train", "dev")
            if os.path.exists(dev_filename):
                print("load dev dataset: {}".format(dev_filename))
                self.dev = self.load_train(args, data=self.data,
                                           filename=dev_filename,
                                           is_train=False)
            else:
                self.dev = None
        else:
            self.dev, self.test = None, None

        if args.use_time:
            self.output_file_name = "{}/{}".format(args.output, args.time_type)
            if args.add_phase_shift:
                self.output_file_name += "_shift"
        else:
            self.output_file_name = "{}/{}".format(args.output, "word2vec")
        if not os.path.exists(args.output):
            os.mkdir(args.output)
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)

        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.emb_dimension
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.lr = args.lr
        self.time_type = args.time_type
        self.weight_decay = args.weight_decay
        print(args)

        if args.use_time:
            self.skip_gram_model = TimestampedSkipGramModel(
                self.emb_size,
                self.emb_dimension,
                time_type=args.time_type,
                add_phase_shift=args.add_phase_shift)
        else:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("using cuda and GPU ....")
            self.skip_gram_model.cuda()

        if not args.from_scatch and os.path.exists(self.output_file_name):
            print("loading parameters ....")
            self.skip_gram_model.load_embeddings(self.data.id2word,
                                                 self.output_file_name)

    def load_train(self, args, data=None, filename=None, is_train=True):
        if data is None:
            assert is_train, "wrong to load data 1"
            data = DataReader(args.text, args.min_count)
            filename = args.text
        else:
            assert not is_train, "wrong to load test data 2"
            assert filename is not None, "wrong to load test data 3"
            assert data is not None, "wrong to load test data 4"
        if not args.use_time:
            dataset = Word2vecDataset(data, input_text=filename,
                                      window_size=args.window_size)
        else:
            dataset = TimestampledWord2vecDataset(
                data, input_text=filename, window_size=args.window_size,
                time_scale=args.time_scale)
        # Shuffle only when loading the training split.
        dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                shuffle=is_train, num_workers=0,
                                collate_fn=dataset.collate)
        if is_train:
            return data, dataloader
        else:
            return dataloader

    def evaluation_loss(self, logger=None):
        results = []
        self.skip_gram_model.eval()
        print("evaluating ...")
        for index, dataloader in enumerate([self.dev, self.test]):
            if dataloader is None:
                continue
            losses = []
            for i, sample_batched in enumerate(tqdm(dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)
                    if args.use_time:
                        time = sample_batched[3].to(self.device)
                        loss, pos, neg = self.skip_gram_model.forward(
                            pos_u, pos_v, neg_v, time)
                    else:
                        loss, pos, neg = self.skip_gram_model.forward(
                            pos_u, pos_v, neg_v)
                    losses.append(loss.item())
            mean_result = np.array(losses).mean()
            results.append(mean_result)
            print("test{} loss is {}".format(index, mean_result))
            logger.write("Loss in test{}: {} \n".format(index,
                                                        str(mean_result)))
            logger.flush()
        self.skip_gram_model.train()
        return results

    def train(self):
        print(os.path.join(self.output_file_name, "log.txt"))
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)
        optimizer = optim.Adam(self.skip_gram_model.parameters(), lr=self.lr,
                               weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, len(self.dataloader) * self.iterations)
        with open("{}/log.txt".format(self.output_file_name), "w") as f:
            for iteration in range(self.iterations):
                print("\nIteration: " + str(iteration + 1))
                f.write(str(args) + "\n")
                running_loss = 0.0
                for i, sample_batched in enumerate(tqdm(self.dataloader)):
                    if len(sample_batched[0]) > 1:
                        pos_u = sample_batched[0].to(self.device)
                        pos_v = sample_batched[1].to(self.device)
                        neg_v = sample_batched[2].to(self.device)

                        optimizer.zero_grad()
                        if args.use_time:
                            time = sample_batched[3].to(self.device)
                            loss, pos, neg = self.skip_gram_model.forward(
                                pos_u, pos_v, neg_v, time)
                        else:
                            loss, pos, neg = self.skip_gram_model.forward(
                                pos_u, pos_v, neg_v)
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                        loss, pos, neg = loss.item(), pos.item(), neg.item()
                        if i % args.log_step == 0:
                            f.write("Loss in {} steps: {} {}, {}\n".format(
                                i, str(loss), str(pos), str(neg)))
                            if (not torch.cuda.is_available()
                                    or i % (args.log_step * 10) == 0):
                                print("Loss in {} steps: {} {}, {}\n".format(
                                    i, str(loss), str(pos), str(neg)))
                self.evaluation_loss(logger=f)
                epoch_path = os.path.join(self.output_file_name,
                                          str(iteration))
                if not os.path.exists(epoch_path):
                    os.mkdir(epoch_path)
                torch.save(self.skip_gram_model,
                           os.path.join(epoch_path, "pytorch.bin"))
                self.skip_gram_model.save_embedding(
                    self.data.id2word,
                    os.path.join(self.output_file_name, str(iteration)))
                self.skip_gram_model.save_in_text_format(
                    self.data.id2word,
                    os.path.join(self.output_file_name, str(iteration)))
        self.skip_gram_model.save_in_text_format(self.data.id2word,
                                                 self.output_file_name)
        torch.save(self.skip_gram_model,
                   os.path.join(self.output_file_name, "pytorch.bin"))
        with open(os.path.join(self.output_file_name, "config.json"),
                  "wt") as f:
            json.dump(vars(args), f, indent=4)
        self.skip_gram_model.save_dict(self.data.id2word,
                                       self.output_file_name)
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=1):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence split by spaces.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency will be filtered out.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training iterations.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]
            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            # loss.data[0] only works on pre-0.4 PyTorch; loss.item() is the modern equivalent.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                # Linear learning-rate decay.
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
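Every trainer in this collection minimizes the same skip-gram negative-sampling objective, L = -log sigmoid(u.v) - sum_k log sigmoid(-u.n_k). For reference, a self-contained sketch of that loss in plain PyTorch, independent of any SkipGramModel above:

import torch
import torch.nn.functional as F

def sgns_loss(emb_u, emb_v, emb_neg):
    # emb_u:   (B, D) center-word embeddings
    # emb_v:   (B, D) context-word embeddings
    # emb_neg: (B, K, D) embeddings of K negative samples per pair
    pos_score = torch.sum(emb_u * emb_v, dim=1)                    # (B,)
    neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # (B, K)
    # -log sigmoid(u.v) - sum_k log sigmoid(-u.n_k), averaged over the batch
    return (-F.logsigmoid(pos_score) - F.logsigmoid(-neg_score).sum(dim=1)).mean()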
class Word2Vec:
    def __init__(
        self,
        input_path,
        output_dir,
        wordsim_path,
        dimension=100,
        batch_size=50,  # the original default ("batch_size=batch_size") was self-referential; 50 is a placeholder
        window_size=5,
        epoch_count=1,
        initial_lr=1e-6,
        min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)
        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # Discard the CSV header
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None

    def train(self):
        pair_count = self.data.get_pair_count(self.window_size)
        batch_count = self.epoch_count * pair_count / self.batch_size
        best_rho = float('-inf')
        for i in tqdm(range(int(batch_count)), total=batch_count):
            self.model.train()
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]
            pos_u = torch.tensor(pos_u, device=self.device)
            pos_v = torch.tensor(pos_v, device=self.device)
            neg_v = torch.tensor(neg_v, device=self.device)

            self.optimizer.zero_grad()
            loss = self.model(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 250 == 0:
                self.model.eval()
                rho = self.model.module.get_wordsim_rho(
                    self.wordsim_verification_tuples,
                    self.data.id_from_word,
                    self.data.word_from_id
                )
                print(
                    f'Loss: {loss.item()},'
                    f' lr: {self.optimizer.param_groups[0]["lr"]},'
                    f' rho: {rho}'
                )
                dump_embedding(
                    self.model.module.get_embedding(
                        self.data.id_from_word, self.data.word_from_id
                    ),
                    self.model.module.dimension,
                    self.data.word_from_id,
                    os.path.join(self.output_dir, 'latest.txt'),
                )
                if rho > best_rho:
                    dump_embedding(
                        self.model.module.get_embedding(
                            self.data.id_from_word, self.data.word_from_id
                        ),
                        self.model.module.dimension,
                        self.data.word_from_id,
                        os.path.join(self.output_dir, f'{i}_{rho}.txt')
                    )
                    best_rho = rho

            # Linear warm-up over the first 10000 batches, then stepwise decay.
            if i < 10000:
                lr = self.initial_lr * i / 10000
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            elif i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
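A usage sketch for the wordsim-validated variant above; the paths are placeholders. The word-similarity file is read as a CSV with a header row followed by word1,word2,similarity lines, which is what the constructor's parsing implies.

# Hypothetical paths for illustration only.
w2v = Word2Vec(input_path="data/corpus.txt",
               output_dir="embeddings",
               wordsim_path="wordsim353/combined.csv",
               dimension=100, batch_size=50, epoch_count=1)
w2v.train()  # dumps latest.txt every 250 batches and {step}_{rho}.txt on each new best rho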
class Word2VecTrainer:
    def __init__(self, inFile, outFile, prFile=None, emb_dimensions=100,
                 batch_size=512, window_size=5, iterations=50, initial_lr=0.003):
        self.data = DataReader(inFile, txtFile=prFile)
        dataset = Word2VecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0,
                                     collate_fn=dataset.collate)
        self.output_file_name = outFile
        self.emb_size = len(self.data.word2id)
        self.batch_size = batch_size
        self.emb_dimensions = emb_dimensions
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimensions)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        loss_history = []
        spear_history = []
        best_spearman = 0.0
        for itr in range(self.iterations):
            print("\nIteration: " + str(itr + 1))
            # A fresh SGD optimizer is created every iteration, as in the original.
            optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
            running_loss = 0.0
            for i, batch in enumerate(tqdm(self.dataloader)):
                pos_v = batch[0].to(self.device)
                pos_u = batch[1].to(self.device)
                neg_u = batch[2].to(self.device)

                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_v, pos_u, neg_u)
                loss.backward()
                optimizer.step()

                # Exponential moving average of the loss.
                running_loss = running_loss * 0.9 + loss.item() * 0.1
            print("Loss: " + str(running_loss))
            loss_history.append(running_loss)
            new_spearman = self.test(inFile="wordsim353/combined.csv")
            spear_history.append(new_spearman)
            if new_spearman > best_spearman:
                self.skip_gram_model.save_embedding(self.data.id2word,
                                                    self.output_file_name)
                best_spearman = new_spearman
        return loss_history, spear_history

    def test(self, inFile, embFile="emb_art_10.npy"):
        self.cos_dict = dict()
        self.cos_dict_id = dict()

        # 1. Import wordsim353 pairs whose words are in the vocabulary
        csv = pd.read_csv(inFile)
        csv = np.array(csv)
        idsim = dict()
        wordsim = dict()
        for (word_a, word_b, num) in csv:
            if word_a in self.data.word2id and word_b in self.data.word2id:
                idsim[(self.data.word2id[word_a], self.data.word2id[word_b])] = num
                wordsim[(word_a, word_b)] = num

        # 2. Load embeddings, from the model or from a saved .npy file
        # (the original truthiness test on the Embedding module was always False;
        # an explicit None check is what was presumably intended)
        if self.skip_gram_model.v_embeddings is None:
            self.embeddings = np.load(embFile, allow_pickle=True)
        else:
            self.embeddings = self.skip_gram_model.v_embeddings.weight.cpu().data.numpy()

        # 3. Compute cosine similarities
        for (id_a, id_b), value in idsim.items():
            embeddings_a = self.embeddings[id_a].reshape(1, -1)
            embeddings_b = self.embeddings[id_b].reshape(1, -1)
            # np.asscalar was removed in NumPy 1.23; index and cast instead
            similarity = float(cosine_similarity(embeddings_a, embeddings_b)[0, 0])
            self.cos_dict[(self.data.id2word[id_a], self.data.id2word[id_b])] = similarity
            self.cos_dict_id[id_a, id_b] = similarity

        # Collect paired arrays for the Spearman correlation
        a = []
        b = []
        for (id_a, id_b), value in idsim.items():
            a.append(value)
            b.append(self.cos_dict_id[(id_a, id_b)])
        print("Spearman Coefficient:", spearman_correlation(self.cos_dict_id, idsim))
        spear = spearmanr(a, b)
        print(spear)
        return spear[0]
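A usage sketch for this trainer; the file names are placeholders. train() returns the per-iteration loss and Spearman histories and only saves embeddings when the wordsim-353 correlation improves.

# Hypothetical paths for illustration only.
trainer = Word2VecTrainer(inFile="data/corpus.txt", outFile="out.vec",
                          iterations=50, initial_lr=0.003)
loss_history, spear_history = trainer.train()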
class Word2Vec:
    def __init__(self,
                 output_file_name,
                 output_sense_name,
                 emb_dimension=128,
                 K=5,
                 batch_size=1,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.1,
                 createClusterLambda=1.5,
                 min_count=0):
        """Initialize class parameters.

        Args:
            output_file_name: Name of the final embedding file.
            output_sense_name: Name of the file holding per-word sense assignments.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            K: Maximum number of senses per word.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            createClusterLambda: Distance threshold above which a new sense cluster is created.
            min_count: The minimal word frequency; words with lower frequency will be filtered out.

        Returns:
            None.
        """
        self.data = InputData(min_count)
        self.output_file_name = output_file_name
        self.output_sense_name = output_sense_name
        self.emb_size = len(self.data.node2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.K = K
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.createClusterLambda = createClusterLambda
        self.skip_gram_model = SkipGramModel(self.emb_size, self.K, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training iterations.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        total_pos_pairs = self.data.get_node_pairs(self.window_size)
        print("training\n")
        for t in process_bar:
            pos_pairs = total_pos_pairs[t]
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            # Average the context embeddings around the current word.
            cnt = 0
            curword = pos_u[cnt]
            contextwords = []
            contextwords_cuda = []
            while cnt < len(pos_u):
                contextwords.append(pos_v[cnt])
                contextwords_cuda.append(pos_v[cnt])
                cnt += 1
            contextembedding = torch.zeros(self.emb_dimension)
            contextwords_cuda = Variable(torch.LongTensor(contextwords_cuda))
            if self.use_cuda:
                contextwords_cuda = contextwords_cuda.cuda()
            emb_v = self.skip_gram_model.v_embeddings(contextwords_cuda)
            if self.use_cuda:
                emb_v_data = emb_v.cpu().data
            else:
                emb_v_data = emb_v.data
            for i in range(len(contextwords)):
                contextembedding += emb_v_data[i]
            emb_v_data_avg = contextembedding / len(contextwords)

            # Pick the sense whose cluster center has the smallest cosine
            # distance to the averaged context embedding.
            minDist = np.inf
            rightsense = 0
            mu = torch.Tensor(self.emb_dimension)
            if self.skip_gram_model.num_sense[curword] == self.K:
                nC = self.K
            else:
                nC = self.skip_gram_model.num_sense[curword] + 1
            prob = torch.Tensor(nC)
            for k in range(self.skip_gram_model.num_sense[curword]):
                torch.div(self.skip_gram_model.clusterCenter[curword, k, :],
                          self.skip_gram_model.clusterCount[curword][k],
                          out=mu)
                x_norm = torch.norm(emb_v_data_avg, p=2)
                y_norm = torch.norm(mu, p=2)
                summ = 0
                for p in range(self.emb_dimension):
                    summ += emb_v_data_avg[p] * mu[p]
                dist = 1 - summ / (x_norm * y_norm)
                prob[k] = dist
                if dist < minDist:
                    minDist = dist
                    rightsense = k

            # Create a new sense cluster if every existing one is too far away.
            if self.skip_gram_model.num_sense[curword] < self.K:
                if self.createClusterLambda < minDist:
                    prob[self.skip_gram_model.num_sense[curword]] = self.createClusterLambda
                    rightsense = self.skip_gram_model.num_sense[curword]
                    self.skip_gram_model.num_sense[curword] += 1

            # Update the chosen cluster's running center and count.
            for i in range(self.emb_dimension):
                self.skip_gram_model.clusterCenter[curword][rightsense][i] += emb_v_data_avg[i]
            self.skip_gram_model.clusterCount[curword][rightsense] += 1

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v,
                                                rightsense, self.use_cuda)
            loss.backward()
            self.optimizer.step()

            # loss.data[0] only works on pre-0.4 PyTorch; loss.item() is the modern equivalent.
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if t * self.batch_size % 100000 == 0:
                # Linear learning-rate decay.
                lr = self.initial_lr * (1.0 - 1.0 * t / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2node,
                                            self.output_file_name,
                                            self.output_sense_name,
                                            self.use_cuda)
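A usage sketch for the multi-sense variant; the file names are placeholders. Note that this InputData takes only min_count, so the corpus location is presumably resolved inside InputData itself.

# Hypothetical file names for illustration only.
w2v = Word2Vec(output_file_name="emb.txt", output_sense_name="senses.txt",
               emb_dimension=128, K=5, createClusterLambda=1.5)
w2v.train()  # writes embeddings plus per-word sense assignments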