def prepare_dataset(self):
    # Inference/evaluation path: build only the held-out test loader.
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    test_df = dat_obj.test_df2
    test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
    # batch_size=1 so each prediction can be inspected individually;
    # evaluation does not need shuffling.
    self.test_loader = DataLoader(test_dataset, batch_size=1, num_workers=0, shuffle=False)
def prepare_dataset(self):
    # Training path: build train / validation / test loaders from the pre-processed splits.
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    train_df = dat_obj.train_df
    val_df = dat_obj.val_df
    test_df = dat_obj.test_df1
    train_dataset = SentimentDataset(train_df, max_length=100)
    val_dataset = SentimentDataset(val_df, max_length=100)
    test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
    # Only the training split needs shuffling; keep evaluation order deterministic.
    self.train_loader = DataLoader(train_dataset, batch_size=32, num_workers=0, shuffle=True)
    self.val_loader = DataLoader(val_dataset, batch_size=32, num_workers=0, shuffle=False)
    self.test_loader = DataLoader(test_dataset, batch_size=32, num_workers=0, shuffle=False)
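# --- Context sketch (assumption, not taken from the original code) ----------
# The __getitem__/__len__ fragment below belongs to SentimentDataset and relies
# on attributes created in its __init__: self.df, self.tokenizer,
# self.max_length and self.mode. A minimal __init__ consistent with that usage
# might look like the sketch class here; the BertTokenizer / 'bert-base-uncased'
# choice and the class name SentimentDatasetSketch are hypothetical.
import pandas as pd
from torch.utils.data import Dataset
from transformers import BertTokenizer


class SentimentDatasetSketch(Dataset):
    def __init__(self, df: pd.DataFrame, max_length: int = 100, mode: str = 'train'):
        self.df = df                  # pre-processed dataframe with a 'pre_process' text column
        self.max_length = max_length  # sequences are trimmed to this length in __getitem__
        self.mode = mode              # 'test' additionally returns the raw text
        # Hypothetical tokenizer choice; swap in whichever pretrained tokenizer the project uses.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')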
def __getitem__(self, index):
    row = self.df.iloc[index]
    # 'pre_process' holds the cleaned text; the first column holds the raw sentiment label.
    text, label = row['pre_process'], row[0]
    # Collapse the raw sentiment codes into binary labels: 0 = negative, 1 = positive.
    if label != 0:
        label = 1
    out_dict = self.tokenizer.encode_plus(text=text,
                                          padding='max_length',
                                          truncation=True,
                                          max_length=200,
                                          return_tensors='pt')
    # Trim the padded sequence down to the requested max_length.
    input_ids = out_dict['input_ids'][:, :self.max_length]
    attention_mask = out_dict['attention_mask'][:, :self.max_length]
    if self.mode != 'test':
        return [(input_ids, attention_mask), label]
    # In test mode the raw text is returned as well, so predictions can be inspected.
    return [text, (input_ids, attention_mask), label]

def __len__(self):
    # Cap the epoch at 2000 samples for quick experiments, but never exceed the
    # dataframe size (the original hard-coded 2000 would break on smaller splits).
    return min(2000, int(self.df.shape[0]))


if __name__ == '__main__':
    from pre_process import PreProcess

    # Quick smoke test: build a loader over the training split and print a few batches.
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    train_df = dat_obj.train_df
    dataset = SentimentDataset(train_df, 200)
    train_loader = DataLoader(dataset, batch_size=5, num_workers=8)
    for i, batch in enumerate(train_loader, 0):
        print(i)
        print(batch[0][0])  # input_ids from the (input_ids, attention_mask) tuple
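# --- Usage sketch (assumption, not taken from the original code) ------------
# In test mode each item is [text, (input_ids, attention_mask), label], so an
# evaluation loop over the test_loader built in prepare_dataset() could unpack
# batches as below. 'model' and 'evaluate' are hypothetical; the model is
# assumed to return logits of shape (batch, 2).
import torch


def evaluate(model, test_loader, device='cpu'):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for text, (input_ids, attention_mask), label in test_loader:
            # DataLoader adds a batch dimension in front of the one kept by
            # return_tensors='pt', hence the squeeze.
            input_ids = input_ids.squeeze(1).to(device)
            attention_mask = attention_mask.squeeze(1).to(device)
            label = label.to(device)
            logits = model(input_ids, attention_mask)
            preds = logits.argmax(dim=-1)
            correct += (preds == label).sum().item()
            total += label.size(0)
    return correct / max(total, 1)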