コード例 #1
0
ファイル: trainer.py プロジェクト: lyDonnieLiu/VCWE
    def __init__(self, input_file, vocabulary_file, img_data_file,
                 char2ix_file, output_dir, maxwordlength, emb_dimension,
                 line_batch_size, sample_batch_size, neg_num, window_size,
                 discard, epochs, initial_lr, seed):
        """Prepare the data pipeline and the VCWE model for training.

        Args:
            input_file, vocabulary_file, char2ix_file, maxwordlength, discard:
                Forwarded unchanged to ``DataReader``.
            img_data_file: Path handed to ``np.load``; the result is kept on
                ``self.img_data``.
            output_dir: Stored on ``self.output_dir``.
            emb_dimension: Embedding dimension passed to ``VCWEModel``.
            line_batch_size: ``batch_size`` of the ``DataLoader``.
            sample_batch_size, neg_num, window_size: Forwarded to
                ``Word2vecDataset``.
            epochs: Number of passes; used to derive ``num_train_steps``.
            initial_lr: Stored on ``self.initial_lr``.
            seed: Seeds torch's RNG and is also forwarded to ``DataReader``.
        """
        # Seed torch before anything else is constructed.
        torch.manual_seed(seed)

        # Plain configuration values.
        self.output_dir = output_dir
        self.emb_dimension = emb_dimension
        self.line_batch_size = line_batch_size
        self.epochs = epochs
        self.initial_lr = initial_lr

        # Data: image array, corpus reader, dataset and shuffled batch loader.
        self.img_data = np.load(img_data_file)
        self.data = DataReader(
            input_file, vocabulary_file, char2ix_file,
            maxwordlength, discard, seed,
        )
        w2v_dataset = Word2vecDataset(
            self.data, window_size, sample_batch_size, neg_num,
        )
        self.dataloader = DataLoader(
            w2v_dataset,
            batch_size=line_batch_size,
            shuffle=True,
            num_workers=0,
            collate_fn=w2v_dataset.collate,
        )

        # Table sizes come from the reader's lookup maps.
        self.emb_size = len(self.data.word2id)
        # One more than the number of entries in char2id
        # (original author's note: 5031).
        self.char_size = len(self.data.char2id) + 1

        self.VCWE_model = VCWEModel(
            self.emb_size,
            self.emb_dimension,
            self.data.wordid2charid,
            self.char_size,
        )

        # Pick the device, then derive the total number of training steps
        # (batches per epoch times epochs).
        cuda_available = torch.cuda.is_available()
        self.use_cuda = cuda_available
        device_name = "cuda" if cuda_available else "cpu"
        self.device = torch.device(device_name)
        batches_per_epoch = len(self.dataloader)
        self.num_train_steps = int(batches_per_epoch * self.epochs)
        if cuda_available:
            self.VCWE_model.cuda()
コード例 #2
0
ファイル: trainer.py プロジェクト: ljjb/word2vec-pytorch
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=300,
                 batch_size=64,
                 window_size=5,
                 iterations=5,
                 initial_lr=1.0,
                 min_count=5):
        """Build the reader, batch loader and skip-gram model for training.

        Args:
            input_file: Passed to ``DataReader`` together with ``min_count``.
            output_file: Stored on ``self.output_file_name``.
            emb_dimension: Embedding dimension passed to ``SkipGramModel``.
            batch_size: ``batch_size`` of the ``DataLoader``.
            window_size: Passed to ``Word2vecDataset``.
            iterations: Stored on ``self.iterations``.
            initial_lr: Stored on ``self.initial_lr``.
            min_count: Passed to ``DataReader``.
        """
        # Plain configuration values.
        self.output_file_name = output_file
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr

        # Reader -> dataset -> unshuffled batch loader.
        self.data = DataReader(input_file, min_count)
        w2v_dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(
            w2v_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=w2v_dataset.collate,
        )

        # The embedding table has one row per entry in the reader's word2id.
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        # Report which device is used and move the model to it if possible.
        self.use_cuda = torch.cuda.is_available()
        if not self.use_cuda:
            self.device = torch.device("cpu")
            print("CUDA FAIL")
        else:
            self.device = torch.device("cuda")
            print("USING CUDA")
            self.skip_gram_model.cuda()
コード例 #3
0
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):
        """Read the corpus and set up the loader and skip-gram model.

        Args:
            input_file: Passed to ``DataReader`` together with ``min_count``.
            output_file: Stored on ``self.output_file_name``.
            emb_dimension: Embedding dimension passed to ``SkipGramModel``.
            batch_size: ``batch_size`` of the ``DataLoader``.
            window_size: Passed to ``Word2vecDataset``.
            iterations: Stored on ``self.iterations``.
            initial_lr: Stored on ``self.initial_lr``.
            min_count: Passed to ``DataReader``.
        """
        # Keep the caller's settings on the instance.
        self.output_file_name = output_file
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr

        print("Reading input file...")
        corpus = DataReader(input_file, min_count)
        self.data = corpus
        w2v_dataset = Word2vecDataset(corpus, window_size)

        print("Creating data batches")
        loader_options = {
            "batch_size": batch_size,
            "shuffle": False,
            "num_workers": 0,
            "collate_fn": w2v_dataset.collate,
        }
        self.dataloader = DataLoader(w2v_dataset, **loader_options)

        # The embedding table has one row per entry in the reader's word2id.
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        # Use the GPU when one is available.
        cuda_available = torch.cuda.is_available()
        self.use_cuda = cuda_available
        self.device = torch.device("cuda" if cuda_available else "cpu")
        if cuda_available:
            self.skip_gram_model.cuda()
コード例 #4
0
ファイル: trainer.py プロジェクト: wabyking/demo
    def load_train(self, args, data=None, filename=None, is_train=True):
        """Build a ``DataLoader`` for either training or test data.

        Two call shapes are accepted:

        * Training: ``data`` is ``None`` and ``is_train`` equals ``True``.
          A new ``DataReader`` is built from ``args.text`` and
          ``args.min_count``; any ``filename`` passed in is replaced by
          ``args.text``.
        * Test: ``data`` is an existing reader, ``is_train`` equals ``False``
          and ``filename`` names the text to load.

        Args:
            args: Options object; reads ``text``, ``min_count``, ``use_time``,
                ``window_size``, ``batch_size`` and, when ``use_time`` is
                truthy, ``time_scale``.
            data: Existing reader to reuse, or ``None`` to build one.
            filename: Text passed to the dataset as ``input_text``; required
                when ``data`` is supplied.
            is_train: Selects the mode, is passed as the loader's ``shuffle``
                flag, and decides the return shape.

        Returns:
            ``(data, dataloader)`` when ``is_train`` is truthy, otherwise only
            ``dataloader``.

        Raises:
            AssertionError: If ``data`` and ``is_train`` disagree, or if
                ``filename`` is missing in test mode.
        """
        # Explicit raises instead of ``assert`` statements so the checks are
        # not stripped when Python runs with -O. The exception type and the
        # messages are the same as before. Equality (rather than truthiness)
        # is kept so exactly the same ``is_train`` values are accepted.
        if data is None:
            if not (is_train == True):  # noqa: E712
                raise AssertionError("wrong to load data 1")
            data = DataReader(args.text, args.min_count)
            filename = args.text
        else:
            if not (is_train == False):  # noqa: E712
                raise AssertionError("wrong to load test data 2")
            if filename is None:
                raise AssertionError("wrong to load test data 3")

        if args.use_time:
            dataset = TimestampledWord2vecDataset(
                data,
                input_text=filename,
                window_size=args.window_size,
                time_scale=args.time_scale,
            )
        else:
            dataset = Word2vecDataset(
                data,
                input_text=filename,
                window_size=args.window_size,
            )

        # Batches are shuffled only in training mode.
        dataloader = DataLoader(
            dataset,
            batch_size=args.batch_size,
            shuffle=is_train,
            num_workers=0,
            collate_fn=dataset.collate,
        )
        if is_train:
            return data, dataloader
        return dataloader