Example no. 1
class RickAndMortyDataset(BaseDataset):
    """ Wrapper class to process and produce training samples """
    def __init__(
        self,
        data_dir,
        seq_length,
        vocab_size=None,
        vocab=None,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "rick_and_morty.txt"),
                  "r",
                  encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        input_ids = [
            self.vocab[word] for word in self.tokens[idx:idx + self.seq_length]
        ]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]

        # 1 marks real tokens; segment ids are all 0 for a single-sequence input.
        attention_mask = [1] * len(input_ids)
        segment_ids = [0] * len(input_ids)

        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)
        y = torch.LongTensor(y)
        return input_ids, attention_mask, segment_ids, y
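
A minimal usage sketch for the next-word datasets above, assuming a local data directory containing rick_and_morty.txt and the project's Vocabulary class; the path, sequence length and batch size are placeholders, not taken from the source.

# Hypothetical usage sketch; "data" and the hyper-parameters below are assumptions.
from torch.utils.data import DataLoader

dataset = RickAndMortyDataset(data_dir="data", seq_length=32, vocab_size=10000)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

input_ids, attention_mask, segment_ids, y = next(iter(loader))
# input_ids, attention_mask, segment_ids: (64, 32); y: (64, 1) index of the next token.
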
class SpamData(Dataset):
    """ Wrapper class to process and produce training samples """
    def __init__(self, data_dir, seq_length, vocab_size, vocab=None):
        # "mbcs" is Windows-only; cp1252 is the usual encoding for this file and works cross-platform.
        self.df = pd.read_csv(os.path.join(data_dir, 'spam.csv'),
                              encoding="cp1252")
        self.vocab = Vocabulary()
        self.labels = []
        for x in self.df.v1:
            if x == 'ham':
                self.labels.append(0)
            else:
                self.labels.append(1)
        self.seq_length = seq_length
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(" ".join(self.df["v2"].values))
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(" ".join(self.df["v2"].values))
        self.tokens = []
        for content in self.df["v2"].values:
            self.tokens.append(
                self.vocab.tokenize(self.vocab.clean_text(content)))

    def __len__(self):
        # One sample per message; the dataset length is the number of tokenized messages.
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens_list = self.tokens[idx]
        if len(tokens_list) > self.seq_length:
            tokens_list = tokens_list[:self.seq_length]
        else:
            tokens_list.extend(['<pad>'] *
                               (self.seq_length - len(tokens_list)))
        x = [self.vocab[word] for word in tokens_list]
        y = [0, 0]
        y[int(self.labels[idx])] = 1
        x = torch.LongTensor(x)
        y = torch.FloatTensor(y)  # one-hot label, shape (2,); the extra list nesting gave shape (1, 2)
        return x, y
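
A hedged sketch of how the (x, y) pairs produced by SpamData could drive a binary classifier; the bag-of-embeddings model, path and hyper-parameters are illustrative placeholders, not part of the source.

# Hypothetical training-step sketch; the model below is a stand-in, not the project's architecture.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

dataset = SpamData(data_dir="data", seq_length=64, vocab_size=10000)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = nn.Sequential(nn.Embedding(10000, 128), nn.Flatten(), nn.Linear(64 * 128, 2))
criterion = nn.BCEWithLogitsLoss()  # matches the one-hot float labels from __getitem__
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for x, y in loader:
    optimizer.zero_grad()
    loss = criterion(model(x), y)  # model(x): (batch, 2) logits, y: (batch, 2) one-hot floats
    loss.backward()
    optimizer.step()
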
class SimpsonsDataset(Dataset):
    """ Wrapper class to process and produce training samples """
    def __init__(self,
                 data_dir,
                 seq_length,
                 vocab_size=None,
                 vocab=None,
                 training=False):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "simpsons.txt"),
                  "r",
                  encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        x = [
            self.vocab[word] for word in self.tokens[idx:idx + self.seq_length]
        ]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)
        return x, y
Example no. 4
def test(config):
    config.config['data_loader']['args']['mode'] = 'test'
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims, raw_input_dims = compute_dims(config)

    if config['experts']['text_feat'] == 'learnable':
        # vocab
        vocab = Vocabulary()
        vocab.load('dataset/captions/dict.all_200k_gan.json')
        vocab_size = len(vocab)

        # word2vec
        if config['experts']['text_feat_init']:
            # word2vec, download file and move to we_root-path directory
            # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
            we_rootpath = '/home/yj/pretrained_model'
            w2v_data_path = os.path.join(we_rootpath, "word2vec/", 'flickr',
                                         'vec500flickr30m')
            we_parameter = get_we_parameter(vocab, w2v_data_path)
        else:
            we_parameter = None
    else:
        vocab = None
        vocab_size = None
        we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load('dataset/captions/dict.attr.json')
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(name='data_loader',
                               module=module_data,
                               raw_input_dims=raw_input_dims,
                               text_feat=config['experts']['text_feat'],
                               text_dim=config['experts']['text_dim'],
                               vocab=vocab,
                               attr_vocab=attr_vocab,
                               pretrain=config['trainer']['pretrain'])

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        we_parameter=we_parameter,
                        vocab_size=vocab_size,
                        attr_vocab_size=attr_vocab_size,
                        text_feat=config['experts']['text_feat'])

    ckpt_path = Path(config._args.resume)
    logger.info(f"Loading checkpoint: {ckpt_path} ...")
    checkpoint = torch.load(ckpt_path)
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    model = model.to(device)
    model.eval()

    categories = ['dress', 'shirt', 'toptee']
    modalities = data_loaders[categories[0]].dataset.ordered_experts
    metric = {'score': dict()}

    for i, category in enumerate(categories):
        val_experts = {expert: list() for expert in modalities}
        target_ind = {expert: list() for expert in modalities}
        data_asin = []

        for batch in data_loaders[category + '_trg']:
            for key, val in batch['candidate_experts'].items():
                batch['candidate_experts'][key] = val.to(device)

            data_asin.extend(
                [meta['candidate'] for meta in batch['meta_info']])

            for key, val in batch['candidate_ind'].items():
                target_ind[key].append(val)

            with torch.no_grad():
                experts, _, _ = model(batch['candidate_experts'],
                                      batch['candidate_ind'],
                                      target=True)
                for modality, val in experts.items():
                    val_experts[modality].append(val)

        for modality, val in val_experts.items():
            val_experts[modality] = torch.cat(val)

        for modality, val in target_ind.items():
            target_ind[modality] = torch.cat(val)

        scores = []
        meta_infos = []
        val_size = val_experts['resnet'].size(0)

        for batch in data_loaders[category]:
            for experts in ['candidate_experts']:
                for key, val in batch[experts].items():
                    batch[experts][key] = val.to(device)
            batch["text"] = batch["text"].to(device)
            batch_size = batch["text"].size(0)

            meta_infos.extend(list(batch['meta_info']))

            with torch.no_grad():
                # composition_feature, text, moe_weights = model(batch['candidate_experts'],
                #                                                batch['candidate_ind'],
                #                                                batch['text'],
                #                                                batch['text_bow'],
                #                                                batch['text_lengths'])

                # batch_target = dict()
                # for mod in modalities:
                #     tmp = []
                #     for k in range(batch_size):
                #         tmp.append(model.target_composition(val_experts[mod], text[mod][k].expand(val_size, -1)))
                #     batch_target[mod] = torch.stack(tmp)

                src_experts = model.image_encoder(batch['candidate_experts'],
                                                  batch['candidate_ind'])
                src_text, moe_weights = model.get_text_feature(
                    batch['text'], batch['candidate_ind'], batch['text_bow'],
                    batch['text_lengths'])
                src_feature = model.get_combined_feature(src_experts, src_text)

                trg_text, _ = model.get_text_feature(batch['text'],
                                                     batch['target_ind'],
                                                     batch['text_bow'],
                                                     batch['text_lengths'],
                                                     target=True)
                # trg_text, _ = self.model.text_encoder['trg'](batch['text_mean'].unsqueeze(1), batch['target_ind'])

                batch_target = dict()
                for h, mod in enumerate(modalities):
                    tmp = []
                    for k in range(batch_size):
                        tmp.append(
                            model.trg_normalization_layer(
                                model.target_composition[h](
                                    val_experts[mod],
                                    trg_text[mod][k].expand(val_size, -1))))
                    batch_target[mod] = torch.stack(tmp)

                cross_view_conf_matrix = sharded_cross_view_inner_product(
                    vid_embds=batch_target,
                    text_embds=src_feature,
                    text_weights=moe_weights,
                    subspaces=model.image_encoder.modalities,
                    l2renorm=True,
                    dist=True,
                    val=True)

                scores.append(cross_view_conf_matrix)
        scores = torch.cat(scores)
        val_ids = data_loaders[category + '_trg'].dataset.data
        assert val_ids == data_asin
        metric['score'][category] = {
            'ids': val_ids,
            'matrix': scores,
            'meta_info': meta_infos
        }

    save_fname = ckpt_path.parent / 'test_score.pt'
    tic = time.time()
    logger.info("Saving score matrix: {} ...".format(save_fname))
    torch.save(metric, save_fname)
    logger.info(f"Done in {time.time() - tic:.3f}s")
Example no. 5
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config)
    seeds = [int(x) for x in config._args.seeds.split(',')]

    for seed in seeds:
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if config['experts']['text_feat'] == 'learnable':
            # vocab
            vocab = Vocabulary()
            vocab.load('dataset/captions/dict.all_200k_gan.json')
            vocab_size = len(vocab)
            if config['experts']['text_feat_init']:
                # word2vec, download file and move to we_root-path directory
                # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
                we_rootpath = '/home/yj/pretrained_model'
                w2v_data_path = os.path.join(we_rootpath, "word2vec/",
                                             'flickr', 'vec500flickr30m')
                we_parameter = get_we_parameter(vocab, w2v_data_path)
            else:
                we_parameter = None
        else:
            vocab = None
            vocab_size = None
            we_parameter = None

        if "attr" in config['experts']['modalities']:
            attr_vocab = Vocabulary()
            attr_vocab.load('dataset/captions/dict.attr.json')
            attr_vocab_size = len(attr_vocab)
        else:
            attr_vocab = None
            attr_vocab_size = None

        data_loaders = config.init(name='data_loader',
                                   module=module_data,
                                   raw_input_dims=raw_input_dims,
                                   text_feat=config['experts']['text_feat'],
                                   text_dim=config['experts']['text_dim'],
                                   vocab=vocab,
                                   attr_vocab=attr_vocab,
                                   pretrain=config['trainer']['pretrain'])

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config['experts']['text_dim'],
            same_dim=config['experts']['ce_shared_dim'],
            we_parameter=we_parameter,
            vocab_size=vocab_size,
            attr_vocab_size=attr_vocab_size,
            text_feat=config['experts']['text_feat'],
        )
        # logger.info(model)

        loss = config.init(name='loss', module=module_loss)

        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())
        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                   optimizer)

        trainer = Trainer(
            model,
            loss,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
        )

        trainer.train()
        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        test_args = argparse.ArgumentParser()
        test_args.add_argument("--device", default=config._args.device)
        test_args.add_argument("--resume", default=best_ckpt_path)
        test_config = ConfigParser(test_args)
        test(test_config)
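
The per-seed setup above seeds Python, NumPy and PyTorch on the CPU; a small helper along those lines, with CUDA seeding added as an extra step that is not in the source.

import random
import numpy as np
import torch

def set_seed(seed: int) -> None:
    """Seed Python, NumPy and PyTorch; also seed all CUDA devices when available."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # extra step, not in the original loop
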
Example no. 6
def test(config):
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims = compute_dims(config)

    vocab = None
    vocab_size = None
    we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load(
            os.path.join(config['data_loader']['args']['data_dir'],
                         'attributes/dict.attr.json'))
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(
        name='data_loader',
        module=module_data,
        expert_dims=expert_dims,
        text_feat=config['experts']['text_feat'],
        text_dim=config['experts']['text_dim'],
    )

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        text_feat=config['experts']['text_feat'])
    trainer = TrainerJoint(
        model,
        loss=None,
        optimizer=None,
        config=config,
        data_loaders=data_loaders,
        lr_scheduler=None,
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    metric = trainer._valid_epoch(save_textatt=True)

    if config._args.mode == 'val':
        for key, value in metric.items():
            if key == 'recall_avg':
                logger.info(f'[Avg Recall]     : {value}')
            elif key == 'recall_avg_corr':
                logger.info(f'[Avg Recall corr]: {value}')
            elif key == 'comb_avg':
                logger.info(f'[comb_avg]       : {value}')
            elif key == 'recall':
                for i, category in zip(value, trainer.categories):
                    if len(i) == 2:
                        logger.info(f'[{category}] r@10, r@50: {i[0]}\t{i[1]}')
                    elif len(i) == 4:
                        logger.info(
                            f'[{category}] comp corr r@10, r@50: {i[0]}\t{i[1]}\t{i[2]}\t{i[3]}'
                        )
            elif key == 'comb':
                combstr = "comb:"
                for i, category in zip(value, trainer.categories):
                    combstr += f' {i[0]} {i[1]}'
                logger.info(combstr)
    else:
        save_fname = config.save_dir / 'test_score.pt'
        tic = time.time()
        logger.info("Saving score matrix: {} ...".format(save_fname))
        torch.save(metric, save_fname)
        logger.info(f"Done in {time.time() - tic:.3f}s")
class EmailSpamDataset(BaseDataset):
    """ Wrapper class to process and produce training samples """

    def __init__(
        self,
        data_dir,
        vocab_size=None,
        vocab=None,
        seq_length=40,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):

        self.data_dir = data_dir
        self.vocab = Vocabulary(vocab_from_pretrained, do_lower_case)
        self.seq_length = seq_length

        data_all = pd.read_csv(os.path.join(self.data_dir, "combined-data.csv"),
                               sep=' ', header=None, encoding="cp1252")
        data_all[1] = data_all[1] + " " + data_all[2]
        data_all = data_all[[0, 1]]
        data_all.columns = ['label', 'text']
        data_all = data_all[['text', 'label']]
        data_all = data_all[~data_all.text.isna()]
        data_all.label = data_all.label.apply(lambda x: int(x[-1]))
        data_all.text = data_all.text.apply(lambda x: x.lower())

        data_all = data_all.sample(1000)
        
        self.train_df = data_all.copy() #pd.DataFrame({"text": [], "label": []})
        self.val_df = pd.DataFrame({"text": [], "label": []})
        self.test_df = data_all.copy() # pd.DataFrame({"text": [], "label": []}) #data_all.copy()

        del data_all

        if training:
            self.train()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                self.vocab.add_text(
                    " ".join(pd.concat([self.train_df, self.val_df], sort=False).text.values)
                )
                self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.test()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                raise Exception("Vocab file is not specified in test mode!")
        
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

    def validation(self):
        self.text = self.val_df.text.values
        self.labels = self.val_df.label.values
        self.len = len(self.val_df)
        return True

    def train(self):
        self.text = self.train_df.text.values
        self.labels = self.train_df.label.values
        self.len = len(self.train_df)
        return True

    def test(self):
        self.text = self.test_df.text.values
        self.labels = self.test_df.label.values
        self.len = len(self.test_df)
        return True

    def __len__(self):
        return self.len - 1 if self.len else 0

    def __getitem__(self, idx):
        y = self.labels[idx]
        text = self.text[idx]

        text = self.vocab.clean_text(text)
        input_ids, attention_mask, segment_ids = self.format_in_text(text)
        y = torch.LongTensor([y])

        return input_ids, attention_mask, segment_ids, y
    
    def format_in_text(self, text):
        text = self.vocab.clean_text(text)
        tokens_a = self.vocab.tokenize(text)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.seq_length - 2:
            tokens_a = tokens_a[: (self.seq_length - 2)]

        tokens = (
                [self.vocab.tokenizer.cls_token]
                + tokens_a
                + [self.vocab.tokenizer.sep_token]
        )
        segment_ids = [0] * len(tokens)
        # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
        input_ids = [self.vocab[x] for x in tokens]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length. The mask and segment ids are padded with 0
        # explicitly; padding them with pad_token_id only works because BERT's pad id happens to be 0.
        pad_len = self.seq_length - len(input_ids)
        input_ids += [self.vocab.tokenizer.pad_token_id] * pad_len
        attention_mask += [0] * pad_len
        segment_ids += [0] * pad_len

        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)
        return input_ids, attention_mask, segment_ids
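
For reference, format_in_text mirrors what the Hugging Face tokenizer can do in one call; a hedged equivalent, assuming the transformers package and the same bert-base-uncased checkpoint (this bypasses the project's Vocabulary wrapper and is not how the class above works internally).

# Hypothetical equivalent using the transformers tokenizer directly.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer("free entry claim your prize now",
                padding="max_length", truncation=True, max_length=40, return_tensors="pt")
input_ids = enc["input_ids"][0]            # (40,): [CLS] ... [SEP] then [PAD] ids
attention_mask = enc["attention_mask"][0]  # 1 for real tokens, 0 for padding
segment_ids = enc["token_type_ids"][0]     # all zeros for a single sentence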