# Shared dependencies for the trainer snippets below; DataReader,
# Word2vecDataset, VCWEModel, and SkipGramModel are project-local classes.
import numpy as np
import torch
from torch.utils.data import DataLoader


def __init__(self, input_file, vocabulary_file, img_data_file, char2ix_file,
             output_dir, maxwordlength, emb_dimension, line_batch_size,
             sample_batch_size, neg_num, window_size, discard, epochs,
             initial_lr, seed):
    torch.manual_seed(seed)
    # Character image data used by the visual component of VCWE.
    self.img_data = np.load(img_data_file)
    self.data = DataReader(input_file, vocabulary_file, char2ix_file,
                           maxwordlength, discard, seed)
    dataset = Word2vecDataset(self.data, window_size, sample_batch_size, neg_num)
    self.dataloader = DataLoader(dataset, batch_size=line_batch_size,
                                 shuffle=True, num_workers=0,
                                 collate_fn=dataset.collate)
    self.output_dir = output_dir
    self.emb_size = len(self.data.word2id)
    self.char_size = len(self.data.char2id) + 1  # +1 for padding index (5031 here)
    self.emb_dimension = emb_dimension
    self.line_batch_size = line_batch_size
    self.epochs = epochs
    self.initial_lr = initial_lr
    self.VCWE_model = VCWEModel(self.emb_size, self.emb_dimension,
                                self.data.wordid2charid, self.char_size)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.num_train_steps = int(len(self.dataloader) * self.epochs)
    if self.use_cuda:
        self.VCWE_model.cuda()
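# A minimal instantiation sketch for the VCWE trainer above. The class name
# VCWETrainer and every path/hyperparameter value below are hypothetical
# placeholders, not values from the original project.
trainer = VCWETrainer(
    input_file="corpus.txt", vocabulary_file="vocab.txt",
    img_data_file="char_images.npy", char2ix_file="char2ix.pkl",
    output_dir="out/", maxwordlength=8, emb_dimension=100,
    line_batch_size=32, sample_batch_size=128, neg_num=5, window_size=5,
    discard=1e-4, epochs=5, initial_lr=0.001, seed=42,
)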
def __init__(self, input_file, output_file, emb_dimension=300, batch_size=64,
             window_size=5, iterations=5, initial_lr=1.0, min_count=5):
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=0, collate_fn=dataset.collate)
    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        print("USING CUDA")
        self.skip_gram_model.cuda()
    else:
        print("CUDA not available; training on CPU")
def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32,
             window_size=5, iterations=3, initial_lr=0.001, min_count=12):
    print("Reading input file...")
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    print("Creating data batches")
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=0, collate_fn=dataset.collate)
    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
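# A usage sketch covering the two skip-gram trainers above; the class name
# Word2VecTrainer is an assumption, and both file paths are placeholders.
w2v = Word2VecTrainer(input_file="corpus.txt", output_file="embeddings.vec",
                      emb_dimension=100, iterations=3)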
def load_train(self, args, data=None, filename=None, is_train=True):
    if data is None:
        # Training: build the vocabulary from the raw text.
        assert is_train, "data may only be omitted when loading training data"
        data = DataReader(args.text, args.min_count)
        filename = args.text
    else:
        # Evaluation: reuse the vocabulary built during training.
        assert not is_train, "an existing DataReader implies test data"
        assert filename is not None, "a filename is required for test data"
    if not args.use_time:
        dataset = Word2vecDataset(data, input_text=filename,
                                  window_size=args.window_size)
    else:
        dataset = TimestampledWord2vecDataset(data, input_text=filename,
                                              window_size=args.window_size,
                                              time_scale=args.time_scale)
    dataloader = DataLoader(dataset, batch_size=args.batch_size,
                            shuffle=is_train,  # shuffle only during training
                            num_workers=0, collate_fn=dataset.collate)
    if is_train:
        return data, dataloader
    return dataloader
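# A sketch of calling load_train for both training and evaluation. The
# `trainer` instance is assumed to expose the method above, and the
# argparse.Namespace fields mirror the attributes it reads; all concrete
# values are placeholders.
from argparse import Namespace

args = Namespace(text="corpus.txt", min_count=5, window_size=5,
                 batch_size=64, use_time=False, time_scale=1.0)
data, train_loader = trainer.load_train(args, is_train=True)   # builds vocab, shuffles
test_loader = trainer.load_train(args, data=data,
                                 filename="test.txt",
                                 is_train=False)                # reuses vocab, no shuffle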