def __init__(self, args):
    if args.aminer:
        dataset = AminerDataset(args.path)
    else:
        dataset = CustomDataset(args.path)
    self.data = DataReader(dataset, args.min_count, args.care_type)
    dataset = Metapath2vecDataset(self.data, args.window_size)
    self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                 shuffle=True, num_workers=args.num_workers,
                                 collate_fn=dataset.collate)

    self.output_file_name = args.output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = args.dim
    self.batch_size = args.batch_size
    self.iterations = args.iterations
    self.initial_lr = args.initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
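# The constructor above only wires up the dataloader, model, and device; it does
# not train. Below is a minimal training-loop sketch that such a class typically
# pairs with. It is an assumption modelled on the train() function that appears
# later in this collection (the one iterating over get_batch_pairs), not part of
# the original class: the (pos_u, pos_v, neg_v) batch layout, the SparseAdam
# optimizer, and the save_embedding signature are all assumed.
def train(self):
    optimizer = torch.optim.SparseAdam(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
    for iteration in range(self.iterations):
        for sample_batched in self.dataloader:
            if len(sample_batched[0]) == 0:
                continue
            # Assumed batch layout: (centers, contexts, negative samples).
            pos_u = sample_batched[0].to(self.device)
            pos_v = sample_batched[1].to(self.device)
            neg_v = sample_batched[2].to(self.device)

            optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            optimizer.step()
        # Mirrors the save_embedding call used elsewhere in this collection;
        # the exact DataReader attribute (id2word) is assumed.
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)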
def __init__(self, inFile, outFile, prFile=None, emb_dimensions=100,
             batch_size=512, window_size=5, iterations=50, initial_lr=0.003):
    self.data = DataReader(inFile, txtFile=prFile)
    dataset = Word2VecDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=0, collate_fn=dataset.collate)

    self.output_file_name = outFile
    self.emb_size = len(self.data.word2id)
    self.batch_size = batch_size
    self.emb_dimensions = emb_dimensions
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimensions)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')
    if self.use_cuda:
        self.skip_gram_model.cuda()
def __init__(self, input_file, output_file, emb_dimension=300, batch_size=64,
             window_size=5, iterations=5, initial_lr=1.0, min_count=5):
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=0, collate_fn=dataset.collate)

    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        print("USING CUDA")
        self.skip_gram_model.cuda()
    else:
        print("CUDA not available, running on CPU")
def main(args):
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=args.learning_rate,
                 parameters=model.parameters(),
                 weight_decay=args.weight_decay)

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)
def __init__(self, path="output", time_type="word_sin"):
    # for time_type in os.listdir(path):
    #     if ".DS_Store" in time_type:
    #         continue
    self.path = path
    subpath = os.path.join(path, time_type)
    if args.add_phase_shift:
        subpath += "_shift"
    if not os.path.exists(os.path.join(subpath, "vectors.txt")):
        print("cannot find vectors.txt in {}, try to find {}-th iteration".format(
            subpath, args.iterations))
        subpath = os.path.join(subpath, str(args.iterations - 1))
        if not os.path.exists(subpath):
            print("cannot load model from {}".format(subpath))
            return
    self.embedding_dict = read_embeddings_from_file(
        os.path.join(subpath, "vectors.txt"))
    if args.use_time and "word2vec" not in time_type:
        self.skip_gram_model = TimestampedSkipGramModel(
            len(self.embedding_dict),
            args.emb_dimension,
            time_type=time_type,
            add_phase_shift=args.add_phase_shift)
    else:
        self.skip_gram_model = SkipGramModel(len(self.embedding_dict),
                                             args.emb_dimension)
    self.id2word = pickle.load(open(os.path.join(subpath, "dict.pkl"), "rb"))
    self.skip_gram_model.load_embeddings(self.id2word, subpath)
    if torch.cuda.is_available():
        self.skip_gram_model.cuda()
def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32,
             window_size=5, iterations=3, initial_lr=0.001, min_count=12):
    print("Reading input file...")
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    print("Creating data batches")
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=0, collate_fn=dataset.collate)

    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
def __init__(self, input_file_name, output_file_name, emb_dimension=100,
             batch_size=50, window_size=5, iteration=1, initial_lr=0.025,
             min_count=5):
    """Initialize class parameters.

    Args:
        input_file_name: Name of the input text file. Each line is a sentence
            with words separated by spaces.
        output_file_name: Name of the final embedding file.
        emb_dimension: Embedding dimension, typically from 50 to 500.
        batch_size: The count of word pairs for one forward pass.
        window_size: Max skip length between words.
        iteration: Number of training iterations.
        initial_lr: Initial learning rate.
        min_count: The minimal word frequency; words with lower frequency
            will be filtered out.

    Returns:
        None.
    """
    self.data = InputData(input_file_name, min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_steps = int(graph.num_nodes / args.batch_size) * args.epoch
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)
    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)
    paddle.save(model.state_dict(), "model.pdparams")
def __init__(self, log_filename: str, output_filename: str,
             embedding_dimension: int = 100, batch_size: int = 128,
             iteration: int = 1, initial_lr: float = 0.025, min_count: int = 5,
             sub_sampling_t: float = 1e-5, neg_sampling_t: float = 0.75,
             neg_sample_count: int = 5, half_window_size: int = 2,
             read_data_method: str = 'memory'):
    """init func"""
    self.data = DataHanlder(log_filename=log_filename,
                            batch_size=batch_size,
                            min_count=min_count,
                            sub_sampling_t=sub_sampling_t,
                            neg_sampling_t=neg_sampling_t,
                            neg_sample_count=neg_sample_count,
                            half_window_size=half_window_size,
                            read_data_method=read_data_method)
    self.output_filename = output_filename
    self.embedding_dimension = embedding_dimension
    self.batch_size = batch_size
    self.half_window_size = half_window_size
    self.iter = iteration
    self.initial_lr = initial_lr
    self.sg_model = SkipGramModel(len(self.data.vocab), self.embedding_dimension)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.sg_model.cuda()
    self.optimizer = optim.SGD(self.sg_model.parameters(), lr=self.initial_lr)
def train(args):
    data = InputData(args.input, args.min_count, args.sample)
    output_file_name = args.output
    emb_size = len(data.word2id)
    emb_dimension = args.dim
    batch_size = args.mb
    window_size = args.window
    n_negs = args.n_negs
    iteration = args.iters
    initial_lr = args.lr
    use_cuda = args.cuda

    skip_gram_model = SkipGramModel(emb_size, emb_dimension)
    if use_cuda:
        skip_gram_model = skip_gram_model.cuda()
    optimizer = optim.SGD(skip_gram_model.parameters(), lr=initial_lr)

    pair_count = data.evaluate_pair_count(window_size)
    batch_count = iteration * pair_count / batch_size
    process_bar = tqdm(range(int(batch_count)))
    # skip_gram_model.save_embedding(
    #     data.id2word, 'begin_embedding.txt', use_cuda)

    for i in process_bar:
        pos_pairs = data.get_batch_pairs(batch_size, window_size)
        neg_v = data.get_neg_v_neg_sampling(pos_pairs, n_negs)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [pair[1] for pair in pos_pairs]

        pos_u = torch.LongTensor(pos_u)
        pos_v = torch.LongTensor(pos_v)
        neg_v = torch.LongTensor(neg_v)
        if use_cuda:
            pos_u = pos_u.cuda()
            pos_v = pos_v.cuda()
            neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = skip_gram_model(pos_u, pos_v, neg_v)
        loss.backward()
        optimizer.step()

        process_bar.set_description(
            "Loss: %0.8f, lr: %0.6f" %
            (loss.item(), optimizer.param_groups[0]['lr']))

        # Linearly decay the learning rate every 100k trained pairs.
        if i * batch_size % 100000 == 0:
            lr = initial_lr * (1.0 - 1.0 * i / batch_count)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    skip_gram_model.save_embedding(data.id2word, output_file_name, use_cuda)
def __init__(self, ifolder, ofolder, emb_dimension=400, batch_size=32,
             iteration=int(sys.argv[3]), initial_lr=0.025):
    self.ifolder = ifolder
    self.outfolder = ofolder + ifolder.rsplit('/', 2)[1] + '/'
    try:
        os.makedirs(self.outfolder)
    except OSError:
        print(self.outfolder + " folder exists. Will be overwritten")

    self.emb_dimension = emb_dimension
    self.initial_lr = initial_lr
    self.iteration = iteration
    self.batch_size = batch_size
    self.fpos = 0
    self.fneg = 0
    self.id2word = dict()
    self.id2pair = dict()
    self.pair2id = dict()
    self.read_word_dict(ifolder + "Word2Id")
    self.read_pair_dict(ifolder + "Pair2Id")
    self.pair_count = self.evaluate_pair_count()

    # Dummy values to ensure size does not change
    self.positive_pairs = np.zeros((self.pair_count, 2), dtype=int)
    self.negative_pairs = np.zeros((self.pair_count, 5), dtype=int)
    print(" Size of positive_pairs:", sys.getsizeof(self.positive_pairs))
    print(" Size of negative_pairs:", sys.getsizeof(self.negative_pairs))
    # ipdb.set_trace()

    self.emb_size = len(self.id2word)
    self.pair_emb_size = len(self.id2pair)
    self.skip_gram_model = SkipGramModel(self.pair_emb_size, self.emb_size,
                                         self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
    print("Start reading pairs")
def StaticSkipGramModel(num_nodes, neg_num, embed_size, num_emb_part=8,
                        shared_embedding=False):
    src = F.data("src", shape=[-1, 1], dtype="int64")
    dsts = F.data("dsts", shape=[-1, neg_num + 1], dtype="int64")
    model = SkipGramModel(num_nodes,
                          embed_size,
                          neg_num,
                          num_emb_part,
                          shared_embedding=shared_embedding)
    loss = model(src, dsts)
    return loss
def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
             batch_size, window_size, iteration, initial_lr, min_count):
    self.data = InputData(wikidump_filename, min_count, output_text_filename)
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
def __init__(self, input_file_name, output_file_name, emb_dimension=100,
             batch_size=50, window_size=5, iteration=5, initial_lr=0.025,
             neg_num=5, min_count=5):
    self.data = InputData(input_file_name, min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.neg_num = neg_num
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
def StaticSkipGramModel(num_nodes, neg_num, embed_size, sparse=False,
                        sparse_embedding=False):
    src = F.data("src", shape=[-1, 1], dtype="int64")
    dsts = F.data("dsts", shape=[-1, neg_num + 1], dtype="int64")
    py_reader = paddle.fluid.io.DataLoader.from_generator(
        capacity=64,
        feed_list=[src, dsts],
        iterable=False,
        use_double_buffer=False)
    model = SkipGramModel(num_nodes,
                          embed_size,
                          neg_num,
                          sparse=sparse,
                          sparse_embedding=sparse_embedding)
    loss = model(src, dsts)
    return py_reader, loss
def __init__(self, input_file_name, output_file_name, emb_dimension=500,
             batch_size=32, window_size=7, iteration=3, initial_lr=0.025,
             min_count=200, pair_min_count=100, k_value=6):
    """Initialize class parameters.

    Args:
        input_file_name: Name of the input text file. Each line is a sentence
            with words separated by spaces.
        output_file_name: Name of the final embedding file.
        emb_dimension: Embedding dimension, typically from 50 to 500.
        batch_size: The count of word pairs for one forward pass.
        window_size: Max skip length between words.
        iteration: Number of training iterations.
        initial_lr: Initial learning rate.
        min_count: The minimal word frequency; words with lower frequency
            will be filtered out.

    Returns:
        None.
    """
    self.data = InputData(input_file_name, min_count, pair_min_count, window_size)
    self.output_file_name = output_file_name
    # Number of entries in pair2id (pairs kept after frequency filtering).
    self.pair_emb_size = len(self.data.pair2id)
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.k_value = k_value
    self.skip_gram_model = SkipGramModel(self.pair_emb_size, self.emb_size,
                                         self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
def init_device_emb(self):
    """Set the device before training.

    Will be called once in fast_train_mp / fast_train.
    """
    choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
    assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
    choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
    assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

    # initializing embedding on CPU
    self.emb_model = SkipGramModel(
        emb_size=self.emb_size,
        emb_dimension=self.args.dim,
        walk_length=self.args.walk_length,
        window_size=self.args.window_size,
        batch_size=self.args.batch_size,
        only_cpu=self.args.only_cpu,
        only_gpu=self.args.only_gpu,
        mix=self.args.mix,
        neg_weight=self.args.neg_weight,
        negative=self.args.negative,
        lr=self.args.lr,
        lap_norm=self.args.lap_norm,
        adam=self.args.adam,
        sgd=self.args.sgd,
        avg_sgd=self.args.avg_sgd,
        fast_neg=self.args.fast_neg,
        record_loss=self.args.print_loss,
        norm=self.args.norm,
        use_context_weight=self.args.use_context_weight,
    )

    torch.set_num_threads(self.args.num_threads)
    if self.args.only_gpu:
        print("Run in 1 GPU")
        assert self.args.gpus[0] >= 0
        self.emb_model.all_to_device(self.args.gpus[0])
    elif self.args.mix:
        print("Mix CPU with %d GPU" % len(self.args.gpus))
        if len(self.args.gpus) == 1:
            assert self.args.gpus[0] >= 0, \
                'mix CPU with GPU should have an available GPU'
            self.emb_model.set_device(self.args.gpus[0])
    else:
        print("Run in CPU process")
        self.args.gpus = [torch.device('cpu')]
def __init__(self, input_file_name, output_file_name):
    self.min_count = 5
    self.emb_dimension = 100
    self.batch_size = 64
    self.window_size = 5
    self.iteration = 1
    self.initial_lr = 0.001
    self.data = InputData(input_file_name, self.min_count)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                         self.batch_size, self.window_size,
                                         self.iteration, self.initial_lr,
                                         self.min_count)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
def __init__(self, input_file_name, output_file_name, emb_dimension=100,
             batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
             min_count=5, using_hs=False, using_neg=False, context_size=2,
             hidden_size=128, cbow=None, skip_gram=None):
    print("\nInput File loading......\n")
    self.data = InputData(input_file_name, min_count)
    print("\nInput File loaded.\n")
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.context_size = context_size
    self.hidden_size = hidden_size
    self.using_hs = using_hs
    self.using_neg = using_neg
    self.cbow = cbow
    self.skip_gram = skip_gram

    if self.skip_gram:
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        print("skip_gram_model", self.skip_gram_model)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
    if self.cbow:
        self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
        print("CBOW_model", self.cbow_model)
        self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                   lr=self.initial_lr)
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    if args.edge_file:
        graph = load_from_file(args.edge_file)
    else:
        graph = load(args.dataset)

    # NOTE: the graph loaded above is replaced by the symmetrised edge list
    # from edges.npy.
    edges = np.load("./edges.npy")
    edges = np.concatenate([edges, edges[:, [1, 0]]])
    graph = pgl.Graph(edges)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_ds = ShardedDataset(graph.nodes, repeat=args.epoch)

    train_steps = int(len(train_ds) // args.batch_size)
    log.info("train_steps: %s" % train_steps)
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)
    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    train_loss = train(model, data_loader, optim)
    paddle.save(model.state_dict(), "model.pdparams")
def __init__(self, output_file_name, walks=[], emb_dimension=100, batch_size=64,
             window_size=5, epochs=5, negative_num=5):
    print("Load data...")
    self.data = InputData(window_size, batch_size, walks)
    self.output_file_name = output_file_name
    self.emb_dimension = emb_dimension
    self.epochs = epochs
    self.negative_num = negative_num
    self.batch_size = batch_size
    self.vocab_size = self.data.vocab_size
    self.model = SkipGramModel(self.vocab_size, self.emb_dimension)
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1.0)
    if cuda_gpu:
        self.model = self.model.cuda()
def load_old(self, path, time_type):
    self.path = path
    subpath = os.path.join(path, time_type)
    if args.add_phase_shift:
        subpath += "_shift"
    if not os.path.exists(os.path.join(subpath, "vectors.txt")):
        print("cannot find vectors.txt in {}, try to find {}-th iteration".format(
            subpath, args.iterations))
        subpath = os.path.join(subpath, str(args.iterations - 1))
        if not os.path.exists(subpath):
            print("cannot load model from {}".format(subpath))
            return
    self.embedding_dict = read_embeddings_from_file(
        os.path.join(subpath, "vectors.txt"))
    if args.use_time and "word2vec" not in time_type:
        skip_gram_model = TimestampedSkipGramModel(
            len(self.embedding_dict),
            args.emb_dimension,
            time_type=time_type,
            add_phase_shift=args.add_phase_shift)
    else:
        skip_gram_model = SkipGramModel(len(self.embedding_dict),
                                        args.emb_dimension)
    id2word = pickle.load(open(os.path.join(subpath, "dict.pkl"), "rb"))
    skip_gram_model.load_embeddings(id2word, subpath)
    return skip_gram_model, id2word
def init_device_emb(self):
    """Set the device before training.

    Will be called once in fast_train_mp / fast_train.
    """
    choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
    assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"

    # initializing embedding on CPU
    self.emb_model = SkipGramModel(
        emb_size=self.emb_size,
        emb_dimension=self.args.dim,
        batch_size=self.args.batch_size,
        only_cpu=self.args.only_cpu,
        only_gpu=self.args.only_gpu,
        only_fst=self.args.only_fst,
        only_snd=self.args.only_snd,
        mix=self.args.mix,
        neg_weight=self.args.neg_weight,
        negative=self.args.negative,
        lr=self.args.lr,
        lap_norm=self.args.lap_norm,
        fast_neg=self.args.fast_neg,
        record_loss=self.args.print_loss,
        async_update=self.args.async_update,
        num_threads=self.args.num_threads,
    )

    torch.set_num_threads(self.args.num_threads)
    if self.args.only_gpu:
        print("Run in 1 GPU")
        assert self.args.gpus[0] >= 0
        self.emb_model.all_to_device(self.args.gpus[0])
    elif self.args.mix:
        print("Mix CPU with %d GPU" % len(self.args.gpus))
        if len(self.args.gpus) == 1:
            assert self.args.gpus[0] >= 0, \
                'mix CPU with GPU should have an available GPU'
            self.emb_model.set_device(self.args.gpus[0])
    else:
        print("Run in CPU process")
def init_device_emb(self):
    """Set the device before training.

    Will be called once in fast_train_mp / fast_train.
    """
    choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
    assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
    assert self.args.num_procs >= 1, "The number of processes must be at least 1"
    choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
    assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

    # initializing embedding on CPU
    self.emb_model = SkipGramModel(
        emb_size=self.emb_size,
        emb_dimension=self.args.dim,
        walk_length=self.args.walk_length,
        window_size=self.args.window_size,
        batch_size=self.args.batch_size,
        only_cpu=self.args.only_cpu,
        only_gpu=self.args.only_gpu,
        mix=self.args.mix,
        neg_weight=self.args.neg_weight,
        negative=self.args.negative,
        lr=self.args.lr,
        lap_norm=self.args.lap_norm,
        adam=self.args.adam,
        sgd=self.args.sgd,
        avg_sgd=self.args.avg_sgd,
        fast_neg=self.args.fast_neg,
    )

    torch.set_num_threads(self.args.num_threads)
    if self.args.only_gpu:
        print("Run in 1 GPU")
        self.emb_model.all_to_device(0)
    elif self.args.mix:
        print("Mix CPU with %d GPU" % self.args.num_procs)
        if self.args.num_procs == 1:
            self.emb_model.set_device(0)
    else:
        print("Run in %d CPU processes" % self.args.num_procs)
def __init__(self, file, min_count, window_size, batch_size, output_file, dim,
             iterations, initial_lr):
    self.data = DataReader(file, min_count)
    dataset = Metapath2vecDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                 num_workers=4, collate_fn=dataset.collate)

    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = dim
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr  # learning rate
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
def __init__(self, args, graph):
    print("\nPerforming Node2vec...\n")

    # 1. generate walker
    walker = DeepWalker(args, graph)
    print("\nDoing deepwalks...\n")
    walker.create_features()
    self.inputFileName = "{}{}-deepwalk_{}-num_walks_{}-len_metapath.txt".format(
        args.input_path, args.idx_metapath, args.number_of_walks,
        args.walk_length)

    # 2. read data
    self.data = DataReader(args.min_count, args.care_type, self.inputFileName)

    # 3. make dataset for training
    dataset = DatasetLoader(self.data, args.window_size)

    # 4. initialize dataloader
    self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                 shuffle=True, num_workers=args.num_workers,
                                 collate_fn=dataset.collate)

    self.output_file_name = "{}{}-embedding_{}-deepwalk_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count.pickle".format(
        args.output_path, args.idx_embed, args.idx_metapath, args.dim,
        args.initial_lr, args.window_size, args.iterations, args.min_count)
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = args.dim
    self.batch_size = args.batch_size
    self.iterations = args.iterations
    self.initial_lr = args.initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
def __init__(
    self,
    input_path,
    output_dir,
    wordsim_path,
    dimension=100,
    batch_size=batch_size,
    window_size=5,
    epoch_count=1,
    initial_lr=1e-6,
    min_count=5,
):
    self.data = InputData(input_path, min_count)
    self.output_dir = output_dir
    self.vocabulary_size = len(self.data.id_from_word)
    self.dimension = dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.epoch_count = epoch_count
    self.initial_lr = initial_lr
    self.model = SkipGramModel(self.vocabulary_size, self.dimension)
    if torch.cuda.is_available():
        self.device = torch.device('cuda')
    else:
        self.device = torch.device('cpu')
    self.model = nn.DataParallel(self.model.to(self.device))
    self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

    if wordsim_path:
        self.wordsim_verification_tuples = []
        with open(wordsim_path, 'r') as f:
            f.readline()  # Abandon header
            for line in f:
                word1, word2, actual_similarity = line.split(',')
                self.wordsim_verification_tuples.append(
                    (word1, word2, float(actual_similarity)))
    else:
        self.wordsim_verification_tuples = None
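# The constructor above loads (word1, word2, similarity) tuples but does not
# show how they are scored. Below is a hedged sketch of how such tuples might
# be used to evaluate embeddings: compute cosine similarity per pair and
# correlate it with the human ratings via Spearman's rank correlation. The
# method name and the lookup through self.data.id_from_word are illustrative
# assumptions, not part of the original class.
def evaluate_wordsim(self, embeddings):
    import numpy as np
    from scipy.stats import spearmanr

    predicted, actual = [], []
    for word1, word2, actual_similarity in self.wordsim_verification_tuples:
        if word1 not in self.data.id_from_word or word2 not in self.data.id_from_word:
            continue  # skip pairs with out-of-vocabulary words
        v1 = embeddings[self.data.id_from_word[word1]]
        v2 = embeddings[self.data.id_from_word[word2]]
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        predicted.append(cosine)
        actual.append(actual_similarity)
    correlation, _ = spearmanr(predicted, actual)
    return correlation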
def main(config, ip_list_file):
    ds = TrainPairDataset(config, ip_list_file)
    loader = Dataloader(
        ds,
        batch_size=config.batch_pair_size,
        num_workers=config.num_workers,
        stream_shuffle_size=config.pair_stream_shuffle_size,
        collate_fn=CollateFn())

    model = SkipGramModel(config)
    if config.warm_start_from:
        log.info("warm start from %s" % config.warm_start_from)
        model.set_state_dict(paddle.load(config.warm_start_from))

    optim = Adam(
        learning_rate=config.lr,
        parameters=model.parameters(),
        lazy_mode=config.lazy_mode)

    log.info("starting training...")
    train(config, model, loader, optim)
def StaticSkipGramModel(num_nodes, neg_num, embed_size, sparse):
    src = F.data("src", shape=[-1, 1], dtype="int64")
    dsts = F.data("dsts", shape=[-1, neg_num + 1], dtype="int64")
    model = SkipGramModel(num_nodes, embed_size, neg_num, sparse)
    loss = model(src, dsts)
    return loss
parser = argparse.ArgumentParser(description='Word2vec')
parser.add_argument('-lr', type=float, default=0.025)
parser.add_argument('-epochs', type=int, default=5)
parser.add_argument('-window-size', type=int, default=5)
parser.add_argument('-min-count', type=int, default=5)
parser.add_argument('-neg-count', type=int, default=5)
parser.add_argument('-batch-size', type=int, default=100)
parser.add_argument('-emb-dim', type=int, default=100)
parser.add_argument('-using-hs', action='store_true', default=False)
parser.add_argument('-dir', type=str, default='./data')
parser.add_argument('-no-cuda', action='store_true')
parser.add_argument('-test', action='store_true', default=False)
args = parser.parse_args()

# data
data = InputData('zhihu.txt', args)
args.output_file_name = 'result2.txt'

# update args
args.emb_size = len(data.word2id)

# do
skip_gram_model = SkipGramModel(args)
mytrain.train(data, skip_gram_model, args)
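# Example invocation (hypothetical script name; 'zhihu.txt' must be present in
# the working directory because its path is hard-coded above):
#   python main.py -lr 0.025 -epochs 5 -emb-dim 100 -batch-size 100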