class RickAndMortyDataset(BaseDataset):
    """
    Wrapper class to process and produce training samples
    """

    def __init__(
        self,
        data_dir,
        seq_length,
        vocab_size=None,
        vocab=None,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "rick_and_morty.txt"), "r",
                  encoding="utf-8") as f:
            self.text = f.read()
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)
        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        input_ids = [
            self.vocab[word]
            for word in self.tokens[idx:idx + self.seq_length]
        ]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]
        attention_mask = [1] * len(input_ids)
        segment_ids = [1] * len(input_ids)
        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)
        y = torch.LongTensor(y)
        return input_ids, attention_mask, segment_ids, y
class SpamData(Dataset):
    """
    Wrapper class to process and produce training samples
    """

    def __init__(self, data_dir, seq_length, vocab_size, vocab=None):
        # NOTE: the "mbcs" codec is Windows-only; use an equivalent such as
        # "cp1252"/"latin-1" on other platforms.
        self.df = pd.read_csv(os.path.join(data_dir, 'spam.csv'),
                              encoding="mbcs")
        self.vocab = Vocabulary()
        self.labels = []
        for x in self.df.v1:
            if x == 'ham':
                self.labels.append(0)
            else:
                self.labels.append(1)
        self.seq_length = seq_length
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(" ".join(self.df["v2"].values))
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)
        self.text = self.vocab.clean_text(" ".join(self.df["v2"].values))
        self.tokens = []
        for content in self.df["v2"].values:
            self.tokens.append(
                self.vocab.tokenize(self.vocab.clean_text(content)))

    def __len__(self):
        # One sample per message.
        return len(self.tokens)

    def __getitem__(self, idx):
        # Copy so that padding below does not mutate the cached token list.
        tokens_list = list(self.tokens[idx])
        if len(tokens_list) > self.seq_length:
            tokens_list = tokens_list[:self.seq_length]
        else:
            tokens_list.extend(['<pad>'] *
                               (self.seq_length - len(tokens_list)))
        x = [self.vocab[word] for word in tokens_list]
        y = [0, 0]
        y[int(self.labels[idx])] = 1
        x = torch.LongTensor(x)
        y = torch.FloatTensor([y])
        return x, y
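# Usage sketch (illustrative, not part of the original code): SpamData yields one
# (token-id sequence, one-hot label) pair per SMS message. The data_dir and the
# hyper-parameters below are assumptions for the example.
def _example_spam_loader(data_dir="data"):
    from torch.utils.data import DataLoader

    dataset = SpamData(data_dir=data_dir, seq_length=50, vocab_size=10000)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    x, y = next(iter(loader))
    # x: (batch_size, seq_length) LongTensor of padded/truncated token ids
    # y: (batch_size, 1, 2) FloatTensor one-hot labels (index 0 = ham, index 1 = spam)
    return x, y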
class SimpsonsDataset(Dataset):
    """
    Wrapper class to process and produce training samples
    """

    def __init__(self,
                 data_dir,
                 seq_length,
                 vocab_size=None,
                 vocab=None,
                 training=False):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "simpsons.txt"), "r",
                  encoding="utf-8") as f:
            self.text = f.read()
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)
        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        x = [
            self.vocab[word]
            for word in self.tokens[idx:idx + self.seq_length]
        ]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)
        return x, y
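# Usage sketch (illustrative, not part of the original code): the language-model
# datasets above slide a window over the token stream and predict the next word.
# RickAndMortyDataset works the same way but also returns attention_mask and
# segment_ids. The data_dir, window length, batch size and vocab size below are
# assumptions for the example.
def _example_simpsons_loader(data_dir="data"):
    from torch.utils.data import DataLoader

    dataset = SimpsonsDataset(data_dir=data_dir, seq_length=32, vocab_size=20000)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    x, y = next(iter(loader))
    # x: (batch_size, seq_length) LongTensor of token ids
    # y: (batch_size, 1) LongTensor with the id of the word that follows each window
    return x, y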
def test(config):
    config.config['data_loader']['args']['mode'] = 'test'
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)
    expert_dims, raw_input_dims = compute_dims(config)

    if config['experts']['text_feat'] == 'learnable':
        # vocab
        vocab = Vocabulary()
        vocab.load('dataset/captions/dict.all_200k_gan.json')
        vocab_size = len(vocab)

        # word2vec
        if config['experts']['text_feat_init'] == True:
            # word2vec, download file and move to we_root-path directory
            # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
            we_rootpath = '/home/yj/pretrained_model'
            w2v_data_path = os.path.join(we_rootpath, "word2vec/", 'flickr',
                                         'vec500flickr30m')
            we_parameter = get_we_parameter(vocab, w2v_data_path)
        else:
            we_parameter = None
    else:
        vocab = None
        vocab_size = None
        we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load('dataset/captions/dict.attr.json')
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(name='data_loader',
                               module=module_data,
                               raw_input_dims=raw_input_dims,
                               text_feat=config['experts']['text_feat'],
                               text_dim=config['experts']['text_dim'],
                               vocab=vocab,
                               attr_vocab=attr_vocab,
                               pretrain=config['trainer']['pretrain'])

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        we_parameter=we_parameter,
                        vocab_size=vocab_size,
                        attr_vocab_size=attr_vocab_size,
                        text_feat=config['experts']['text_feat'])

    ckpt_path = Path(config._args.resume)
    logger.info(f"Loading checkpoint: {ckpt_path} ...")
    checkpoint = torch.load(ckpt_path)
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")
    model = model.to(device)
    model.eval()

    categories = ['dress', 'shirt', 'toptee']
    modalities = data_loaders[categories[0]].dataset.ordered_experts
    metric = {'score': dict()}

    for i, category in enumerate(categories):
        val_experts = {expert: list() for expert in modalities}
        target_ind = {expert: list() for expert in modalities}
        data_asin = []

        for batch in data_loaders[category + '_trg']:
            for key, val in batch['candidate_experts'].items():
                batch['candidate_experts'][key] = val.to(device)
            data_asin.extend(
                [meta['candidate'] for meta in batch['meta_info']])
            for key, val in batch['candidate_ind'].items():
                target_ind[key].append(val)
            with torch.no_grad():
                experts, _, _ = model(batch['candidate_experts'],
                                      batch['candidate_ind'],
                                      target=True)
                for modality, val in experts.items():
                    val_experts[modality].append(val)

        for modality, val in val_experts.items():
            val_experts[modality] = torch.cat(val)
        for modality, val in target_ind.items():
            target_ind[modality] = torch.cat(val)

        scores = []
        meta_infos = []
        val_size = val_experts['resnet'].size(0)

        for batch in data_loaders[category]:
            for experts in ['candidate_experts']:
                for key, val in batch[experts].items():
                    batch[experts][key] = val.to(device)
            batch["text"] = batch["text"].to(device)
            batch_size = batch["text"].size(0)
            meta_infos.extend(list(batch['meta_info']))

            with torch.no_grad():
                # composition_feature, text, moe_weights = model(batch['candidate_experts'],
                #                                                batch['candidate_ind'],
                #                                                batch['text'],
                #                                                batch['text_bow'],
                #                                                batch['text_lengths'])
                # batch_target = dict()
                # for mod in modalities:
                #     tmp = []
                #     for k in range(batch_size):
                #         tmp.append(model.target_composition(val_experts[mod],
                #                                             text[mod][k].expand(val_size, -1)))
                #     batch_target[mod] = torch.stack(tmp)
                src_experts = model.image_encoder(batch['candidate_experts'],
                                                  batch['candidate_ind'])
                src_text, moe_weights = model.get_text_feature(
                    batch['text'], batch['candidate_ind'], batch['text_bow'],
                    batch['text_lengths'])
                src_feature = model.get_combined_feature(src_experts, src_text)

                trg_text, _ = model.get_text_feature(batch['text'],
                                                     batch['target_ind'],
                                                     batch['text_bow'],
                                                     batch['text_lengths'],
                                                     target=True)
                # trg_text, _ = self.model.text_encoder['trg'](batch['text_mean'].unsqueeze(1), batch['target_ind'])

                batch_target = dict()
                for h, mod in enumerate(modalities):
                    tmp = []
                    for k in range(batch_size):
                        tmp.append(
                            model.trg_normalization_layer(
                                model.target_composition[h](
                                    val_experts[mod],
                                    trg_text[mod][k].expand(val_size, -1))))
                    batch_target[mod] = torch.stack(tmp)

                cross_view_conf_matrix = sharded_cross_view_inner_product(
                    vid_embds=batch_target,
                    text_embds=src_feature,
                    text_weights=moe_weights,
                    subspaces=model.image_encoder.modalities,
                    l2renorm=True,
                    dist=True,
                    val=True)
                scores.append(cross_view_conf_matrix)

        scores = torch.cat(scores)
        val_ids = data_loaders[category + '_trg'].dataset.data
        assert val_ids == data_asin
        metric['score'][category] = {
            'ids': val_ids,
            'matrix': scores,
            'meta_info': meta_infos
        }

    save_fname = ckpt_path.parent / 'test_score.pt'
    tic = time.time()
    logger.info("Saving score matrix: {} ...".format(save_fname))
    torch.save(metric, save_fname)
    logger.info(f"Done in {time.time() - tic:.3f}s")
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config)
    seeds = [int(x) for x in config._args.seeds.split(',')]

    for seed in seeds:
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if config['experts']['text_feat'] == 'learnable':
            # vocab
            vocab = Vocabulary()
            vocab.load('dataset/captions/dict.all_200k_gan.json')
            vocab_size = len(vocab)

            if config['experts']['text_feat_init'] == True:
                # word2vec, download file and move to we_root-path directory
                # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
                we_rootpath = '/home/yj/pretrained_model'
                w2v_data_path = os.path.join(we_rootpath, "word2vec/",
                                             'flickr', 'vec500flickr30m')
                we_parameter = get_we_parameter(vocab, w2v_data_path)
            else:
                we_parameter = None
        else:
            vocab = None
            vocab_size = None
            we_parameter = None

        if "attr" in config['experts']['modalities']:
            attr_vocab = Vocabulary()
            attr_vocab.load('dataset/captions/dict.attr.json')
            attr_vocab_size = len(attr_vocab)
        else:
            attr_vocab = None
            attr_vocab_size = None

        data_loaders = config.init(name='data_loader',
                                   module=module_data,
                                   raw_input_dims=raw_input_dims,
                                   text_feat=config['experts']['text_feat'],
                                   text_dim=config['experts']['text_dim'],
                                   vocab=vocab,
                                   attr_vocab=attr_vocab,
                                   pretrain=config['trainer']['pretrain'])

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config['experts']['text_dim'],
            same_dim=config['experts']['ce_shared_dim'],
            we_parameter=we_parameter,
            vocab_size=vocab_size,
            attr_vocab_size=attr_vocab_size,
            text_feat=config['experts']['text_feat'],
        )
        # logger.info(model)

        loss = config.init(name='loss', module=module_loss)
        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())
        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                   optimizer)

        trainer = Trainer(
            model,
            loss,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
        )
        trainer.train()

        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        test_args = argparse.ArgumentParser()
        test_args.add_argument("--device", default=config._args.device)
        test_args.add_argument("--resume", default=best_ckpt_path)
        test_config = ConfigParser(test_args)
        test(test_config)
def test(config):
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)
    expert_dims = compute_dims(config)

    vocab = None
    vocab_size = None
    we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load(
            os.path.join(config['data_loader']['args']['data_dir'],
                         'attributes/dict.attr.json'))
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(
        name='data_loader',
        module=module_data,
        expert_dims=expert_dims,
        text_feat=config['experts']['text_feat'],
        text_dim=config['experts']['text_dim'],
    )

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        text_feat=config['experts']['text_feat'])

    trainer = TrainerJoint(
        model,
        loss=None,
        optimizer=None,
        config=config,
        data_loaders=data_loaders,
        lr_scheduler=None,
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    metric = trainer._valid_epoch(save_textatt=True)

    if config._args.mode == 'val':
        for key, value in metric.items():
            if key == 'recall_avg':
                logger.info(f'[Avg Recall] : {value}')
            elif key == 'recall_avg_corr':
                logger.info(f'[Avg Recall corr]: {value}')
            elif key == 'comb_avg':
                logger.info(f'[comb_avg] : {value}')
            elif key == 'recall':
                for i, category in zip(value, trainer.categories):
                    if len(i) == 2:
                        logger.info(f'[{category}] r@10, r@50: {i[0]}\t{i[1]}')
                    elif len(i) == 4:
                        logger.info(
                            f'[{category}] comp corr r@10, r@50: {i[0]}\t{i[1]}\t{i[2]}\t{i[3]}'
                        )
            elif key == 'comb':
                combstr = "comb:"
                for i, category in zip(value, trainer.categories):
                    combstr += f' {i[0]} {i[1]}'
                logger.info(combstr)
    else:
        save_fname = config.save_dir / 'test_score.pt'
        tic = time.time()
        logger.info("Saving score matrix: {} ...".format(save_fname))
        torch.save(metric, save_fname)
        logger.info(f"Done in {time.time() - tic:.3f}s")
class EmailSpamDataset(BaseDataset):
    """
    Wrapper class to process and produce training samples
    """

    def __init__(
        self,
        data_dir,
        vocab_size=None,
        vocab=None,
        seq_length=40,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):
        self.data_dir = data_dir
        self.vocab = Vocabulary(vocab_from_pretrained, do_lower_case)
        self.seq_length = seq_length

        data_all = pd.read_csv(os.path.join(self.data_dir,
                                            "combined-data.csv"),
                               sep=' ',
                               header=None,
                               encoding="cp1252")
        data_all[1] = data_all[1] + " " + data_all[2]
        data_all = data_all[[0, 1]]
        data_all.columns = ['label', 'text']
        data_all = data_all[['text', 'label']]
        data_all = data_all[~data_all.text.isna()]
        data_all.label = data_all.label.apply(lambda x: int(x[-1]))
        data_all.text = data_all.text.apply(lambda x: x.lower())
        data_all = data_all.sample(1000)

        self.train_df = data_all.copy()  # pd.DataFrame({"text": [], "label": []})
        self.val_df = pd.DataFrame({"text": [], "label": []})
        self.test_df = data_all.copy()  # pd.DataFrame({"text": [], "label": []}) #data_all.copy()
        del data_all

        if training:
            self.train()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                self.vocab.add_text(
                    " ".join(pd.concat([self.train_df, self.val_df],
                                       sort=False).text.values))
                self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.test()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                raise Exception("Vocab file is not specified in test mode!")

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

    def validation(self):
        self.text = self.val_df.text.values
        self.labels = self.val_df.label.values
        self.len = len(self.val_df)
        return True

    def train(self):
        self.text = self.train_df.text.values
        self.labels = self.train_df.label.values
        self.len = len(self.train_df)
        return True

    def test(self):
        self.text = self.test_df.text.values
        self.labels = self.test_df.label.values
        self.len = len(self.test_df)
        return True

    def __len__(self):
        return self.len - 1 if self.len else 0

    def __getitem__(self, idx):
        y = self.labels[idx]
        text = self.text[idx]
        text = self.vocab.clean_text(text)
        input_ids, attention_mask, segment_ids = self.format_in_text(text)
        y = torch.LongTensor([y])
        return input_ids, attention_mask, segment_ids, y

    def format_in_text(self, text):
        text = self.vocab.clean_text(text)
        tokens_a = self.vocab.tokenize(text)
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.seq_length - 2:
            tokens_a = tokens_a[:(self.seq_length - 2)]
        tokens = ([self.vocab.tokenizer.cls_token] + tokens_a +
                  [self.vocab.tokenizer.sep_token])
        segment_ids = [0] * len(tokens)
        # Use the BERT tokenizer to convert the tokens to their index numbers
        # in the BERT vocabulary.
        input_ids = [self.vocab[x] for x in tokens]
        # The mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        padding = [self.vocab.tokenizer.pad_token_id] * (self.seq_length -
                                                         len(input_ids))
        input_ids += padding
        # Padding positions get 0 in the attention mask and segment ids so the
        # model ignores them (pad_token_id happens to be 0 for bert-base-uncased,
        # but the mask should be zeros regardless of the tokenizer).
        attention_mask += [0] * len(padding)
        segment_ids += [0] * len(padding)

        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)

        return input_ids, attention_mask, segment_ids
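# Usage sketch (illustrative, not part of the original code): EmailSpamDataset
# produces BERT-style (input_ids, attention_mask, segment_ids, label) tuples and can
# therefore feed a BERT sequence-classification model directly. The data_dir and
# batch size below are assumptions for the example.
def _example_email_spam_loader(data_dir="data"):
    from torch.utils.data import DataLoader

    dataset = EmailSpamDataset(data_dir=data_dir, seq_length=40, training=True)
    loader = DataLoader(dataset, batch_size=16, shuffle=True)
    input_ids, attention_mask, segment_ids, y = next(iter(loader))
    # input_ids / attention_mask / segment_ids: (batch_size, seq_length) LongTensors
    # y: (batch_size, 1) LongTensor of 0/1 class labels
    return input_ids, attention_mask, segment_ids, y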