Esempio n. 1
0
    def test_tfidf(self):
        """Train a model on TF-IDF encoded text and check the final ``val_acc``.

        The training arrays are passed to ``model.train`` twice (presumably
        as both the training and the validation inputs), and the last
        recorded ``val_acc`` is expected to be 1.0.
        """
        train_df, dev_df, test_df, meta = get_fake_dataset(with_text_col=True)

        # Text encoder settings: TF-IDF mode with ``max_words`` set to 20.
        tfidf_config = Mapping()
        tfidf_config.mode = 'tfidf'
        tfidf_config.max_words = 20

        enc = Encoder(meta, tfidf_config)
        y_fit, X_fit_struc, X_fit_text = enc.fit_transform(train_df)
        # The dev/test splits are only pushed through the fitted encoder;
        # their encoded outputs are not used further in this test.
        _y_dev, _X_dev_struc, _X_dev_text = enc.transform(dev_df)
        _y_test, _X_test_struc, _X_test_text = enc.transform(test_df)

        model_config = get_fake_modelconfig('./outputs_test')
        out_dir = os.path.join(model_config.output_dir, 'tfidf_text_only')
        model_config.output_dir = out_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        history = Model(tfidf_config, model_config).train(
            y_fit, X_fit_struc, X_fit_text,
            y_fit, X_fit_struc, X_fit_text).history

        expected_val_acc = 1.0
        final_val_acc = history['val_acc'][-1]
        self.assertTrue(np.isclose(expected_val_acc, final_val_acc))
Esempio n. 2
0
    def test_lstm(self):
        """Train a model on GloVe-encoded text and check the final ``val_acc``.

        Requires a pre-trained GloVe file on disk (see ``glove_path`` below).
        The training arrays are passed to ``model.train`` twice (presumably
        as both the training and the validation inputs), and the last
        recorded ``val_acc`` is expected to be 1.0.
        """
        train_df, dev_df, test_df, meta = get_fake_dataset(with_text_col=True)

        # NOTE: point this at wherever the pre-trained GloVe file is stored.
        glove_path = 'glove/glove.6B.50d.txt'

        glove_config = Mapping()
        glove_config.mode = 'glove'
        glove_config.max_words = 20
        glove_config.maxlen = 5
        glove_config.embedding_dim = 50
        glove_config.embeddings_index = open_glove(glove_path)

        enc = Encoder(meta, text_config=glove_config)
        y_fit, X_fit_struc, X_fit_text = enc.fit_transform(train_df)
        # The dev/test splits are only pushed through the fitted encoder;
        # their encoded outputs are not used further in this test.
        _y_dev, _X_dev_struc, _X_dev_text = enc.transform(dev_df)
        _y_test, _X_test_struc, _X_test_text = enc.transform(test_df)

        # The model reads the embedding matrix from the text config, so copy
        # it over from the encoder before building the model.
        glove_config.embedding_matrix = enc.embedding_matrix

        model_config = get_fake_modelconfig('./outputs_test')
        out_dir = os.path.join(model_config.output_dir, 'lstm')
        model_config.output_dir = out_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        history = Model(glove_config, model_config).train(
            y_fit, X_fit_struc, X_fit_text,
            y_fit, X_fit_struc, X_fit_text).history

        expected_val_acc = 1.0
        final_val_acc = history['val_acc'][-1]
        self.assertTrue(np.isclose(expected_val_acc, final_val_acc))
Esempio n. 3
0
    def test_strucdata_only(self):
        """Train a model on structured features only and check ``val_acc``.

        The dataset is built without a text column and no text config is
        given to the encoder or the model. The training arrays are passed to
        ``model.train`` twice (presumably as both the training and the
        validation inputs), and the last recorded ``val_acc`` is expected to
        be 1.0.
        """
        train_df, dev_df, test_df, meta = get_fake_dataset(with_text_col=False)

        enc = Encoder(meta, text_config=None)
        y_fit, X_fit_struc, X_fit_text = enc.fit_transform(train_df)
        _y_dev, _X_dev_struc, dev_text = enc.transform(dev_df)
        _y_test, _X_test_struc, test_text = enc.transform(test_df)

        # Show what the encoder returns for the text part when there is no
        # text column.
        print(X_fit_text, dev_text, test_text)

        model_config = get_fake_modelconfig('./outputs_test')
        out_dir = os.path.join(model_config.output_dir, 'dense_mlp')
        model_config.output_dir = out_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        history = Model(text_config=None, model_config=model_config).train(
            y_fit, X_fit_struc, X_fit_text,
            y_fit, X_fit_struc, X_fit_text).history

        expected_val_acc = 1.0
        final_val_acc = history['val_acc'][-1]
        self.assertTrue(np.isclose(expected_val_acc, final_val_acc))
def _load_npy_or_none(path):
    """Memory-map the ``.npy`` file at ``path`` read-only, or return None if it is absent."""
    if os.path.exists(path):
        return np.load(path, mmap_mode='r')
    return None


def main():
    """Command-line entry point: train ``--num_trials`` sampled model configs.

    Steps:
      1. Parse the command-line arguments.
      2. Load the encoded training and dev arrays (``y_*`` is mandatory, the
         ``X_*_struc`` / ``X_*_text`` arrays are optional) plus the optional
         ``text_config.json`` from the data directory.
      3. Build a default model config and load the user's search space.
      4. For each trial, sample a model config, train a model and write
         ``history.json`` and ``model_config.json`` into
         ``<output>/model_<i>``.

    Raises:
        argparse.ArgumentTypeError: if ``--encoded_data_dir`` is not given.
        ValueError: if ``y_train.npy`` / ``y_dev.npy`` is missing, if the
            GloVe embedding matrix referenced by the text config is missing,
            or if ``task_type`` / ``num_classes`` is not set.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--encoded_data_dir', type=str,
        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
        help=('directory to load the encoded data.'))

    # this is optional
    parser.add_argument('--data_name', type=str,
        # default='KICK',
        help=('which data will be used? (kickstarter Or indiegogo?)'))

    parser.add_argument('--search_space_filepath', type=str,
        # default='path/to/search_space.json',
        help=('where to load the search space file?'))

    parser.add_argument('--output_dir', type=str,
        # default='path/to/save/outputs',
        help=('directory to save the trained model and related model_config.'))

    parser.add_argument('--task_type', type=str,
        default='classification',
        help=('what is the type of this task? (classification or regression?)'))

    parser.add_argument('--num_classes', type=int,
        # default='classification',
        help=('what is the number of classes (classification) or outputs (regression)?'))

    parser.add_argument('--model_type', type=str,
        default='mlp',
        help=('what type of NN model you want to try? (mlp or skip_connections?)'))

    parser.add_argument('--num_trials', type=int,
        default= 1,
        help=('how many trials you want to run the model?'))

    args = parser.parse_args()

    if args.encoded_data_dir is None:
        # Format instead of concatenating: encoded_data_dir is None here (and
        # data_name may be too), so ``str + None`` would raise TypeError and
        # hide the intended error.
        raise argparse.ArgumentTypeError(
            "{} or {} can't be recognized.".format(args.data_name,
                                                   args.encoded_data_dir))

    if args.data_name is not None:
        # A named dataset lives in (and is saved to) its own sub-directory.
        path_to_data = os.path.join(args.encoded_data_dir, args.data_name)
        path_to_save = os.path.join(args.output_dir, args.data_name)
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
    else:
        path_to_data = args.encoded_data_dir
        path_to_save = args.output_dir

    ###########################################
    ## load encoded training set and dev set ##
    ###########################################

    # Targets are mandatory; the structured/text feature arrays are optional
    # and stay None when their file is absent.
    y_train = _load_npy_or_none(os.path.join(path_to_data, 'y_train.npy'))
    if y_train is None:
        raise ValueError('y_train is not found!')
    X_train_struc = _load_npy_or_none(
        os.path.join(path_to_data, 'X_train_struc.npy'))
    X_train_text = _load_npy_or_none(
        os.path.join(path_to_data, 'X_train_text.npy'))

    y_dev = _load_npy_or_none(os.path.join(path_to_data, 'y_dev.npy'))
    if y_dev is None:
        raise ValueError('y_dev is not found!')
    X_dev_struc = _load_npy_or_none(
        os.path.join(path_to_data, 'X_dev_struc.npy'))
    X_dev_text = _load_npy_or_none(
        os.path.join(path_to_data, 'X_dev_text.npy'))

    text_config_path = os.path.join(path_to_data, 'text_config.json')
    if os.path.exists(text_config_path):
        with open(text_config_path, 'r') as f:
            text_config = Mapping(json.load(f))
    else:
        text_config = None

    # In GloVe mode the embedding matrix is stored in a separate file whose
    # path is recorded in the text config; attach the loaded matrix to it.
    if text_config is not None and text_config.mode == 'glove':
        embedding_matrix_path = text_config.embedding_matrix_path
        if not os.path.exists(embedding_matrix_path):
            raise ValueError('embedding_matrix is not found!')
        text_config.embedding_matrix = np.load(embedding_matrix_path,
                                               mmap_mode='r')

    ###########################################
    ## sample model config from search space ##
    ###########################################

    if args.task_type is None or args.num_classes is None:
        raise ValueError('You are missing task_type or num_classes or both!')
    print('you are choosing ' + args.model_type + ' as the model type!')
    default_model_config = create_default_modelconfig(
        args.task_type, args.num_classes, args.model_type, path_to_save)

    ## load search space file which is provided by users ##
    with open(args.search_space_filepath, 'r') as f:
        search_space = Mapping(json.load(f))

    #######################################################################
    ## update default model_config based on search_space and train model ##
    #######################################################################

    for i in range(args.num_trials):
        model_config = sample_modelconfig(search_space, default_model_config)
        model_name = 'model_{}'.format(i)
        print('*' * 20)
        print('model_config: ' + model_config['output_dir'])

        model_config = Mapping(model_config)

        print('*' * 20)
        print('model_config: ' + model_config.output_dir)

        # Each trial gets its own sub-directory under the default output dir.
        model_config.output_dir = os.path.join(default_model_config.output_dir, model_name)
        if not os.path.exists(model_config.output_dir):
            os.makedirs(model_config.output_dir)
        model = Model(text_config, model_config)
        # Validate on the dev set loaded above; previously the training
        # arrays were passed a second time and the dev set was never used.
        hist = model.train(y_train, X_train_struc, X_train_text,
                           y_dev, X_dev_struc, X_dev_text)

        ## save hist.history and model_config ##
        history_path = os.path.join(model_config.output_dir, 'history.json')
        with open(history_path, 'w') as hf:
            json.dump(hist.history, hf)

        model_config_savepath = os.path.join(model_config.output_dir, 'model_config.json')
        with open(model_config_savepath, 'w') as mf:
            json.dump(model_config, mf)
Esempio n. 5
0
class Trainer(object):
    """Iteration-based training loop with TensorBoard logging and checkpoints.

    All settings are read from ``cfg.SOLVER``. Values produced by each step
    are collected in ``self.storage`` and written to TensorBoard by
    :meth:`after_step`; model weights are saved every ``save_period``
    iterations and once more when training ends.
    """

    def __init__(self, cfg):
        """Read the solver settings and build loader, model, optimizer, scheduler.

        Args:
            cfg: project config object. ``cfg.SOLVER.*`` is read here; ``cfg``
                itself is forwarded to ``build_train_data_loader``, ``Model``
                and ``build_LRscheduler``.
        """
        # Per-step values (losses, images, lr, ...) waiting to be logged.
        self.storage = {}
        self.device = cfg.SOLVER.DEVICE
        self.max_iter = cfg.SOLVER.MAX_ITERS
        self.log_dir = cfg.SOLVER.TENSORBOARD_WRITER.LOG_DIR
        self.base_lr = cfg.SOLVER.LR.BASE_LR
        optimizer_name = cfg.SOLVER.OPTIMIZER
        self.weight_decay = cfg.SOLVER.WEIGHT_DECAY
        # Path of a state_dict to load before training; '' means none.
        self.weights = cfg.SOLVER.WEIGHTS
        self.image_period = cfg.SOLVER.TENSORBOARD_WRITER.IMAGE_PERIOD
        self.scalar_period = cfg.SOLVER.TENSORBOARD_WRITER.SCALAR_PERIOD
        self.save_period = cfg.SOLVER.CHECKPOINT_PERIOD
        self.save_model_dir = cfg.SOLVER.SAVE_DIR
        self.model_name = cfg.SOLVER.CHECKPOINT_NAME

        data_loader = build_train_data_loader(cfg)
        self._data_loader_iter = iter(data_loader)
        self.model = Model(cfg, True).train().to(self.device)
        self.optimizer = self.build_optimizer(optimizer_name, self.model)
        self.lr_scheduler = build_LRscheduler(self.optimizer, cfg)
        self.iter = 0
        self.writer = None  # created in before_train()
        self.tic = 0
        self.toc = 0

    def build_optimizer(self, name: str,
                        model: torch.nn.Module) -> torch.optim.Optimizer:
        """Build the optimizer ``name`` with weight decay disabled for biases.

        No bias decay:
        Bag of Tricks for Image Classification with Convolutional Neural Networks
        (https://arxiv.org/pdf/1812.01187.pdf)

        Parameters whose name contains ``'bias'`` go into a group with
        ``weight_decay=0``; all others use ``self.weight_decay``.

        Args:
            name: one of ``'Adam'``, ``'SGD'`` or ``'SWA'``.
            model: module whose parameters are optimized.

        Returns:
            The optimizer, with learning rate ``self.base_lr``.

        Raises:
            ValueError: if ``name`` is not a supported optimizer.
        """
        weight_p, bias_p = [], []
        for p_name, p in model.named_parameters():
            if 'bias' in p_name:
                bias_p.append(p)
            else:
                weight_p.append(p)
        parameters = [{
            'params': weight_p,
            'weight_decay': self.weight_decay
        }, {
            'params': bias_p,
            'weight_decay': 0
        }]

        # Every branch uses the per-group settings above. Adam and SGD used
        # to receive model.parameters(), which silently dropped the
        # configured weight decay.
        if name == 'Adam':
            return torch.optim.Adam(parameters, lr=self.base_lr)
        if name == 'SGD':
            return torch.optim.SGD(parameters, lr=self.base_lr)
        if name == 'SWA':
            # Stochastic Weight Averaging:
            # Averaging Weights Leads to Wider Optima and Better Generalization
            # (https://arxiv.org/pdf/1803.05407.pdf)
            base_opt = torch.optim.SGD(parameters, lr=self.base_lr)
            return SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=self.base_lr)
        # Fail here rather than returning None and crashing later in the
        # scheduler construction.
        raise ValueError('Unsupported optimizer: {!r}'.format(name))

    def before_train(self):
        """Load initial weights (if configured), create the save dir and the writer."""
        if self.weights != '':
            # map_location lets a checkpoint saved on another device (e.g. a
            # GPU) be loaded on the configured one.
            checkpoint = torch.load(self.weights,
                                    map_location=torch.device(self.device))
            self.model.load_state_dict(checkpoint)
        os.makedirs(self.save_model_dir, exist_ok=True)
        self.writer = SummaryWriter(self.log_dir)
        self.model.train()

    def _save_checkpoint(self):
        """Save the model state_dict as ``<save_model_dir>/<model_name>_<iter>.pth``."""
        model_name = self.model_name + '_' + str(self.iter) + '.pth'
        torch.save(self.model.state_dict(),
                   os.path.join(self.save_model_dir, model_name))

    def after_train(self):
        """Save a final checkpoint named after the current iteration."""
        self._save_checkpoint()

    def before_step(self):
        """Record the start time of the step."""
        self.tic = time.time()

    def after_step(self):
        """Log everything in ``self.storage`` and save a periodic checkpoint."""
        # Measure the time spent on this iteration.
        self.toc = time.time()
        self.storage['iter_time'] = self.toc - self.tic

        # Write to TensorBoard; one level of nested dicts is flattened into
        # 'outer/inner' tags.
        for key, value in self.storage.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    self._write_tensorboard(key + '/' + sub_key, sub_value)
            else:
                self._write_tensorboard(key, value)

        # Save the model every ``save_period`` iterations (iteration 0
        # included, since 0 % n == 0).
        if self.iter % self.save_period == 0:
            self._save_checkpoint()

    def _write_tensorboard(self, key: str, value: Union[torch.Tensor, int,
                                                        float]):
        """Write ``value`` under tag ``key`` if the current iteration is due.

        4-D tensors are written with ``add_images`` every ``image_period``
        iterations (and skipped otherwise); anything else is written with
        ``add_scalar`` every ``scalar_period`` iterations.
        """
        if isinstance(value, torch.Tensor) and len(value.shape) == 4:
            if self.iter % self.image_period == 0:
                self.writer.add_images(key, value, self.iter)
        elif self.iter % self.scalar_period == 0:
            self.writer.add_scalar(key, value, self.iter)

    def train(self, start_iter=0):
        """Run iterations ``start_iter`` .. ``max_iter - 1``.

        ``after_train`` runs exactly once, from the ``finally`` clause, so a
        final checkpoint is attempted even when training is interrupted by
        an exception.

        Args:
            start_iter: iteration index to start from.
        """
        try:
            self.before_train()
            # self.iter is the loop variable so the hooks see the current
            # iteration index.
            for self.iter in range(start_iter, self.max_iter):
                self.before_step()
                self.run_step()
                self.after_step()
        finally:
            self.after_train()

    def run_step(self):
        """Run one optimization step and stash its outputs in ``self.storage``."""
        data = next(self._data_loader_iter)
        total_loss, losses, metries = self.model(data)

        self.storage['total_loss'] = total_loss
        self.storage['losses'] = losses
        self.storage['image'] = data['image']
        self.storage['training_mask'] = data['training_mask']
        self.storage['metries'] = metries

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        self.storage['lr'] = self.lr_scheduler.get_lr()[0]
        self.lr_scheduler.step()

        # Mean absolute gradient per parameter, for logging.
        self.storage['grads'] = {
            name: torch.mean(torch.abs(parm.grad))
            for name, parm in self.model.named_parameters()
            if parm.grad is not None
        }