def main():
    # Build the trial and training meme datasets from their CSV annotation
    # files and image directories.
    trial_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data1.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/semeval-2020_trialdata/Meme_images/'))
    train_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data_7000_new.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/memotion_analysis_training_data/data_7000/'))

    # Show the first four trial samples and print their label fields.
    fig = plt.figure()
    for i in range(len(trial_meme_dataset)):
        sample = trial_meme_dataset[i]

        print(i, np.array(sample['image']).shape, sample['image_name'])
        print(sample['humour_onehot'], sample['humour_int'])
        print(sample['offensive_onehot'], sample['offensive_int'])

        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        ax.set_title('Sample #{}'.format(i))
        ax.axis('off')
        plt.imshow(sample['image'])

        if i == 3:
            plt.show()
            break

def readData(datalabel, batch_size):
    # Resize, convert to tensors, and normalize with ImageNet statistics.
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
    ])

    if datalabel == 'trial':
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
            image_dir=os.path.join(
                os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
            transform=data_transform)
    else:
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
            image_dir=os.path.join(
                os.getcwd(),
                '../data/memotion_analysis_training_data/data_7000/'),
            transform=data_transform)

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=True, num_workers=0)
    # Materialize the loader once and reuse it for both the batch list and
    # the batch count.
    batches = list(dataloader)
    return batches, len(batches)

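# Illustrative usage sketch, not part of the original code: it assumes
# readData() above is in scope and that each batch is a dict of tensors
# keyed like the samples in the other snippets ('image', 'humour_int', ...).
if __name__ == '__main__':
    batches, num_batches = readData(datalabel='trial', batch_size=4)
    print('Loaded {} batches'.format(num_batches))
    first_batch = batches[0]
    # Batched images come out as (batch_size, channels, height, width).
    print(first_batch['image'].shape)
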
def val_data_loader(val_file):
    val_dataset = MemeDataset(filepath=val_file,
                              text_only=True,
                              text_padding=tokenizer_func)
    return data.DataLoader(val_dataset,
                           batch_size=config['batch_size'],
                           num_workers=config['num_workers'],
                           collate_fn=val_dataset.get_collate_fn())

def test_data_loader(test_file):
    test_dataset = MemeDataset(filepath=test_file,
                               text_only=True,
                               text_padding=tokenizer_func,
                               return_ids=True)
    return data.DataLoader(test_dataset,
                           batch_size=config['batch_size'],
                           num_workers=config['num_workers'],
                           collate_fn=test_dataset.get_collate_fn())

def val_data_loader(val_file):
    val_dataset = MemeDataset(
        filepath=val_file,
        feature_dir=config['feature_path'],
        preload_images=False,
        debug=True,
        text_padding=tokenizer_func,
        confidence_threshold=config['object_conf_thresh'])
    return data.DataLoader(val_dataset,
                           batch_size=config['batch_size'],
                           num_workers=config['num_workers'],
                           collate_fn=val_dataset.get_collate_fn())

def train_data_loader(train_file):
    # In debug mode, train on the small dev_seen split instead.
    if config['debug']:
        train_file = os.path.join(config["data_path"], "dev_seen.jsonl")
    train_dataset = MemeDataset(filepath=train_file,
                                text_only=True,
                                text_padding=tokenizer_func)
    return data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        num_workers=config['num_workers'],
        collate_fn=train_dataset.get_collate_fn(),
        pin_memory=True,
        # shuffle is mutually exclusive with sampler; the ConfounderSampler
        # already shuffles the data.
        sampler=ConfounderSampler(train_dataset,
                                  repeat_factor=config["confounder_repeat"]))

def main():
    # Compose the sample transforms: resize, convert to tensor, and
    # normalize with ImageNet statistics.
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
    ])

    fig = plt.figure()
    trial_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data1.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/semeval-2020_trialdata/Meme_images/'))

    # Apply the composed transforms to each sample and compare a pixel value
    # before and after normalization.
    for i in range(len(trial_meme_dataset)):
        sample = trial_meme_dataset[i]
        print(i, np.array(sample['image']).shape)
        print('np.array(sample[\'image\'])[128,128,0]: {}'.format(
            np.array(sample['image'])[128, 128, 0]))

        transformed_sample = data_transform(sample)
        print(i, np.array(transformed_sample['image']).shape)
        # print(transformed_sample['image'].numpy().max(axis=1))
        print('transformed_sample[\'image\'].numpy()[0,128,128]: {}'.format(
            transformed_sample['image'].numpy()[0, 128, 128]))

        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        ax.set_title('Sample #{}'.format(i))
        ax.axis('off')
        plt.imshow(transformed_sample['image'].numpy().transpose((1, 2, 0)))

        if i == 3:
            plt.show()
            break

def train_data_loader(train_file):
    train_dataset = MemeDataset(
        filepath=train_file,
        feature_dir=config['feature_path'],
        preload_images=False,
        debug=True,
        text_padding=tokenizer_func,
        confidence_threshold=config['object_conf_thresh'])
    return data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        num_workers=config['num_workers'],
        collate_fn=train_dataset.get_collate_fn(),
        pin_memory=True,
        # shuffle is mutually exclusive with sampler; the ConfounderSampler
        # already shuffles the data.
        sampler=ConfounderSampler(train_dataset,
                                  repeat_factor=config["confounder_repeat"]))

def readData(self, datalabel):
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    if datalabel == 'trial':
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
            image_dir=os.path.join(
                os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
            transform=data_transform)
    else:
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
            image_dir=os.path.join(
                os.getcwd(),
                '../data/memotion_analysis_training_data/data_7000/'),
            transform=data_transform)
    return dataset

def main():
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
    ])

    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(
            os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
        transform=data_transform)
    train_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
        image_dir=os.path.join(
            os.getcwd(),
            '../data/memotion_analysis_training_data/data_7000/'),
        transform=data_transform)

    evaluate_classification(
        meme_dataset_transformed=trial_meme_dataset_transformed)
    evaluate_classification(
        meme_dataset_transformed=train_meme_dataset_transformed)

def main():
    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(
            os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
        transform=transforms.Compose([
            ResizeSample(size=(256, 256)),
            ToTensorSample(),
            NormalizeSample(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
        ]))

    for i in range(len(trial_meme_dataset_transformed)):
        sample = trial_meme_dataset_transformed[i]
        print(i, sample['image'].size())
        if i == 3:
            break

    dataloader = DataLoader(dataset=trial_meme_dataset_transformed,
                            batch_size=4, shuffle=True, num_workers=4)

    for i_batch, sample_batched in enumerate(dataloader):
        print(i_batch, sample_batched['image'].size(),
              sample_batched['image'].numpy().shape)
        print('sample_batched[\'image_name\']:\n{}'.format(
            sample_batched['image_name']))
        print('sample_batched[\'humour_onehot\']:\n{}'.format(
            sample_batched['humour_onehot']))
        print('sample_batched[\'humour_int\']:\n{}'.format(
            sample_batched['humour_int']))
        print('sample_batched[\'offensive_onehot\']:\n{}'.format(
            sample_batched['offensive_onehot']))
        print('sample_batched[\'offensive_int\']:\n{}'.format(
            sample_batched['offensive_int']))
        print('sample_batched[\'ocr_extracted_text\']:\n{}'.format(
            sample_batched['ocr_extracted_text']))
        print('sample_batched[\'corrected_text\']:\n{}\n'.format(
            sample_batched['corrected_text']))

        # Observe the 4th batch and stop.
        if i_batch == 3:
            plt.figure()
            show_batch(sample_batched)
            plt.axis('off')
            plt.ioff()
            plt.show()
            break

def get_transformed_dataset(textEmb_path, data_path, img_path):
    '''Load the precomputed text embeddings (used as the text features) and
    build the transformed meme image dataset.'''
    imgname_textEmbs = MyDataLoader.read_text_embeddings_Idx(textEmb_path)

    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
    ])
    meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), data_path),
        image_dir=os.path.join(os.getcwd(), img_path),
        transform=data_transform)
    return imgname_textEmbs, meme_dataset_transformed

def get_dataloaders(data_path, img_path, batch_size, split_seq):
    # split_seq gives the train/validation fractions, e.g. [0.8, 0.2] uses
    # 80% of the data for training and the remaining 20% for validation.
    data_transform = transforms.Compose([
        ResizeSample(size=(299, 299)),
        # ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), data_path),
        image_dir=os.path.join(os.getcwd(), img_path),
        transform=data_transform)

    # Split the dataset into training and validation subsets.
    train_len = int(len(meme_dataset_transformed) * split_seq[0])
    # val_len = int(len(meme_dataset_transformed) * split_seq[1])
    val_len = len(meme_dataset_transformed) - train_len
    # meme_train, meme_val, meme_test = random_split(
    #     meme_dataset_transformed, [train_len, val_len, test_len])
    meme_train, meme_val = random_split(meme_dataset_transformed,
                                        [train_len, val_len])

    # The dataloaders for the training and validation datasets.
    train_dataloader = DataLoader(dataset=meme_train, batch_size=batch_size,
                                  shuffle=True, num_workers=4)
    val_dataloader = DataLoader(dataset=meme_val, batch_size=batch_size,
                                shuffle=True, num_workers=4)
    # test_dataloader = DataLoader(dataset=meme_test, batch_size=4,
    #                              shuffle=True, num_workers=4)
    # dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader,
    #                     'test': test_dataloader}
    dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader}
    return dataloaders_dict

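# Illustrative usage sketch, not part of the original code: assumes
# get_dataloaders() above is in scope; the CSV and image paths below are
# hypothetical placeholders for the real annotation file and image folder.
dataloaders_dict = get_dataloaders(data_path='data/data_7000_new.csv',
                                   img_path='data/data_7000/',
                                   batch_size=4,
                                   split_seq=[0.8, 0.2])
for phase in ['train', 'val']:
    for batch in dataloaders_dict[phase]:
        # Each batch is a dict of tensors produced by MemeDataset.
        print(phase, batch['image'].shape)
        break
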
print("Initializing Datasets and Dataloaders...") # trial_meme_dataset_transformed = MemeDataset( # csv_file=os.path.join(os.getcwd(), '../data/data1.csv'), # image_dir=os.path.join(os.getcwd(), # '../data/semeval-2020_trialdata/Meme_images/'), # transform=transforms.Compose( # [ResizeSample(size=(224, 224)), # ToTensorSample(), # NormalizeSample(mean=[0.485, 0.456, 0.406], # std=[0.229, 0.224, 0.225])])) trial_meme_dataset_transformed = MemeDataset( csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'), image_dir=os.path.join( os.getcwd(), '../data/memotion_analysis_training_data/data_7000/'), transform=transforms.Compose([ ResizeSample(size=(224, 224)), ToTensorSample(), NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) trial_meme_train, trial_meme_val, _ = random_split( dataset=trial_meme_dataset_transformed, lengths=[5988, 1000, 3]) # Create training and validation dataloaders sample_weights_train = make_weights_for_balanced_classes(trial_meme_train, num_classes=3) weighted_sampler_train = WeightedRandomSampler(sample_weights_train, len(sample_weights_train)) train_dataloader = DataLoader(dataset=trial_meme_train, batch_size=4,
set_seed(config['seed'])

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_func = partial(tokenizer,
                         max_length=config['max_txt_len'],
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt',
                         return_length=True)

# Prepare the datasets and dataloaders for training and evaluation
train_dataset = MemeDataset(
    filepath=os.path.join(config['data_path'], config['train_filename']),
    feature_dir=config['feature_path'],
    text_padding=tokenizer_func,
    filter_text=config["filter_text"],
    upsample_multiplier=config["upsample_multiplier"])
val_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                'dev_seen.jsonl'),
                          feature_dir=config['feature_path'],
                          text_padding=tokenizer_func,
                          filter_text=config["filter_text"])
test_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                 'test_seen.jsonl'),
                           feature_dir=config['feature_path'],
                           text_padding=tokenizer_func,
                           filter_text=config["filter_text"])

# Assumed arguments: the source cuts off after train_dataset; the ones below
# follow the other train loaders in these snippets.
config['train_loader'] = data.DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    num_workers=config['num_workers'],
    collate_fn=train_dataset.get_collate_fn(),
    pin_memory=True)

set_seed(config['seed'])

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_func = partial(tokenizer,
                         max_length=config['max_txt_len'],
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt',
                         return_length=True)

# Prepare the datasets and dataloaders for training and evaluation
train_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                  'train.jsonl'),
                            feature_dir=config['feature_path'],
                            debug=True,
                            text_padding=tokenizer_func)
val_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                'dev_seen.jsonl'),
                          feature_dir=config['feature_path'],
                          debug=True,
                          text_padding=tokenizer_func)
test_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                 'test_seen.jsonl'),
                           feature_dir=config['feature_path'],
                           return_ids=True,
                           debug=True,
                           text_padding=tokenizer_func)

# Assumed arguments: the source cuts off after the opening parenthesis; the
# ones below follow the parallel snippet above.
config['train_loader'] = data.DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    num_workers=config['num_workers'],
    collate_fn=train_dataset.get_collate_fn(),
    pin_memory=True)

def main():
    # Create training and validation datasets
    print("Initializing Datasets and Dataloaders...")
    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(
            os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
        transform=transforms.Compose([
            ResizeSample(size=(299, 299)),  # For Inception
            # ResizeSample(size=(224, 224)),  # For other pretrained models
            ToTensorSample(),
            NormalizeSample(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
        ]))
    trial_meme_train, trial_meme_val = random_split(
        dataset=trial_meme_dataset_transformed, lengths=[800, 200])

    # Create training and validation dataloaders
    # Balanced classes =======================================================
    # sample_weights_train = make_weights_for_balanced_classes(
    #     trial_meme_train, num_classes=3)
    # weighted_sampler_train = WeightedRandomSampler(
    #     sample_weights_train, len(sample_weights_train))
    # train_dataloader = DataLoader(dataset=trial_meme_train, batch_size=4,
    #                               sampler=weighted_sampler_train,
    #                               num_workers=4)
    # sample_weights_val = make_weights_for_balanced_classes(
    #     trial_meme_val, num_classes=3)
    # weighted_sampler_val = WeightedRandomSampler(
    #     sample_weights_val, len(sample_weights_val))
    # val_dataloader = DataLoader(dataset=trial_meme_val, batch_size=4,
    #                             sampler=weighted_sampler_val, num_workers=4)
    # ========================================================================

    # Imbalanced classes =====================================================
    train_dataloader = DataLoader(dataset=trial_meme_train, batch_size=4,
                                  shuffle=True, num_workers=4)
    val_dataloader = DataLoader(dataset=trial_meme_val, batch_size=4,
                                shuffle=True, num_workers=4)
    # ========================================================================

    dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader}

    # Detect if we have a GPU available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    deepsent_config = {
        'num_classes': 3,  # negative, positive, neutral
        'batch_size': 4,
        'vocab_size': 400000,
        'embedding_dim': 300
    }
    deepsent = DeepSentimentModel(**deepsent_config)
    # deepsent = DeepSentimentVanillaModel(**deepsent_config)
    # deepsent = ShallownetGloveModel(**deepsent_config)

    # Send the model to GPU
    deepsent = deepsent.to(device)

    # Gather the parameters to be optimized/updated in this run. If we are
    # finetuning, we update all parameters. If we are using the
    # feature-extract method, we only update the parameters that we have just
    # initialized, i.e. the parameters with requires_grad set to True.
    feature_extract = True
    params_to_update = deepsent.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in deepsent.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in deepsent.named_parameters():
            if param.requires_grad:
                print("\t", name)

    # All gathered parameters are optimized with SGD.
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    # Set up the loss function
    criterion = nn.CrossEntropyLoss()

    # Train and evaluate
    deepsent, hist = train_model(model=deepsent,
                                 dataloaders=dataloaders_dict,
                                 criterion=criterion,
                                 optimizer=optimizer_ft,
                                 num_epochs=10,
                                 is_inception=True,
                                 target_label='overall_sentiment_ternary_int')