Example #1
0
    def test_tfidf(self):
        """Train the tf-idf text model on the fake dataset and expect
        perfect validation accuracy on the (training) split."""
        df_train, df_dev, df_test, metadata = get_fake_dataset(
            with_text_col=True)

        # Encode the text column as tf-idf over a 20-word vocabulary.
        txt_cfg = Mapping()
        txt_cfg.mode = 'tfidf'
        txt_cfg.max_words = 20

        enc = Encoder(metadata, txt_cfg)
        y_train, X_train_struc, X_train_text = enc.fit_transform(df_train)
        y_dev, X_dev_struc, X_dev_text = enc.transform(df_dev)
        y_test, X_test_struc, X_test_text = enc.transform(df_test)

        mdl_cfg = get_fake_modelconfig('./outputs_test')
        mdl_cfg.output_dir = os.path.join(mdl_cfg.output_dir,
                                          'tfidf_text_only')
        if not os.path.exists(mdl_cfg.output_dir):
            os.makedirs(mdl_cfg.output_dir)

        # Validate on the training data itself; the toy dataset is
        # perfectly learnable, so val_acc should reach 1.0.
        model = Model(txt_cfg, mdl_cfg)
        hist = model.train(y_train, X_train_struc, X_train_text, y_train,
                           X_train_struc, X_train_text)

        self.assertTrue(np.isclose(1.0, hist.history['val_acc'][-1]))
Example #2
0
    def test_lstm(self):
        """Train the GloVe/LSTM model on the fake dataset and expect
        perfect validation accuracy on the (training) split."""
        df_train, df_dev, df_test, metadata = get_fake_dataset(
            with_text_col=True)

        # Path to the pre-trained GloVe vectors; change to your local copy.
        glove_file_path = 'glove/glove.6B.50d.txt'

        # Sequence encoding: 20-word vocabulary, sequences padded to
        # length 5, 50-dimensional GloVe embeddings.
        txt_cfg = Mapping()
        txt_cfg.mode = 'glove'
        txt_cfg.max_words = 20
        txt_cfg.maxlen = 5
        txt_cfg.embedding_dim = 50
        txt_cfg.embeddings_index = open_glove(glove_file_path)

        enc = Encoder(metadata, text_config=txt_cfg)
        y_train, X_train_struc, X_train_text = enc.fit_transform(df_train)
        y_dev, X_dev_struc, X_dev_text = enc.transform(df_dev)
        y_test, X_test_struc, X_test_text = enc.transform(df_test)

        # The embedding matrix is built by the encoder during fitting.
        txt_cfg.embedding_matrix = enc.embedding_matrix

        mdl_cfg = get_fake_modelconfig('./outputs_test')
        mdl_cfg.output_dir = os.path.join(mdl_cfg.output_dir, 'lstm')
        if not os.path.exists(mdl_cfg.output_dir):
            os.makedirs(mdl_cfg.output_dir)

        # Validate on the training data itself; the toy dataset is
        # perfectly learnable, so val_acc should reach 1.0.
        model = Model(txt_cfg, mdl_cfg)
        hist = model.train(y_train, X_train_struc, X_train_text, y_train,
                           X_train_struc, X_train_text)

        self.assertTrue(np.isclose(1.0, hist.history['val_acc'][-1]))
Example #3
0
    def test_word_embedding(self):
        """Verify GloVe word-index sequences and scaled structured
        features produced by the Encoder on each split."""
        df_train, df_dev, df_test, metadata = get_fake_dataset(
            with_text_col=True)

        # Path to the pre-trained GloVe vectors; change to your local copy.
        glove_file_path = 'glove.6B.50d.txt'

        text_config = Mapping()
        text_config.mode = 'glove'
        text_config.max_words = 20
        text_config.maxlen = 5
        text_config.embedding_dim = 50
        text_config.embeddings_index = open_glove(glove_file_path)

        encoder = Encoder(metadata, text_config=text_config)
        y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
        y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
        y_test, X_test_struc, X_test_text = encoder.transform(df_test)

        # (expected, actual) pairs, checked in the same order as before:
        # text sequences then structured features, per split.
        expectations = [
            (np.array([[9, 10, 11, 2, 3],
                       [15, 16, 17, 18, 19],
                       [1, 2, 1, 1, 3]]), X_train_text),
            (np.array([[-1.22474487, 1., 0., 0.],
                       [0., 0., 1., 0.],
                       [1.22474487, 0., 0., 1.]]), X_train_struc),
            (np.array([[1, 1, 1, 1, 1],
                       [1, 1, 1, 1, 0],
                       [1, 1, 1, 1, 1]]), X_dev_text),
            (np.array([[2.44948974, 0., 1., 0.],
                       [6.12372436, 0., 1., 0.],
                       [3.67423461, 0., 0., 1.]]), X_dev_struc),
            (np.array([[14, 4, 1, 1, 1],
                       [1, 1, 1, 1, 1],
                       [1, 1, 1, 1, 1]]), X_test_text),
            (np.array([[0., 1., 0., 0.],
                       [3.67423461, 0., 0., 1.],
                       [1.22474487, 0., 0., 1.]]), X_test_struc),
        ]
        for expected, actual in expectations:
            self.assertTrue(np.isclose(expected, actual).all())
def create_default_modelconfig(task_type, num_classes, model_type, output_dir):
    """Build the default model configuration as a Mapping.

    Args:
        task_type: 'classification' or 'regression'.
        num_classes: number of classes (classification) or outputs (regression).
        model_type: default is 'mlp'; can be 'skip_connections'.
        output_dir: directory where training artifacts are written.

    Returns:
        A Mapping populated with the default hyperparameters.
    """
    defaults = {
        'task_type': task_type,
        'num_classes': num_classes,
        'combine': 'concate',        # or 'attention'
        'model_type': model_type,
        'n_layers_dense': 2,
        'hidden_size_dense': 16,
        'n_layers_lstm': 2,
        'hidden_size_lstm': 32,
        'dropout_rate_lstm': 0.0,
        'n_layers_output': 2,
        'hidden_size_output': 32,
        'optimizer': 'adam',
        'learning_rate': 0.001,
        'clipnorm': 5.0,
        'patience': 20,
        'output_dir': output_dir,
        'n_epochs': 20,
        'batch_size': 1,
        'verbose': 0,
    }
    model_config = Mapping()
    for name, value in defaults.items():
        setattr(model_config, name, value)
    return model_config
Example #5
0
def get_fake_modelconfig(output_path):
    """Build a small, fast model configuration for unit tests.

    Args:
        output_path: directory where training artifacts are written.

    Returns:
        A Mapping with a tiny MLP classification setup (5 epochs,
        patience 2) suitable for the fake dataset.
    """
    settings = {
        'task_type': 'classification',   # 'classification' or 'regression'
        'num_classes': 3,                # number of classes or outputs
        'combine': 'concate',            # or 'attention'
        'model_type': 'mlp',             # default 'mlp'; can be 'skip_connections'
        'n_layers_dense': 2,
        'hidden_size_dense': 16,
        'n_layers_lstm': 2,
        'hidden_size_lstm': 32,
        'dropout_rate_lstm': 0.0,
        'n_layers_output': 2,
        'hidden_size_output': 32,
        'optimizer': 'adam',             # 'adam', 'sgd', 'rmsprop'
        'learning_rate': 0.001,
        'clipnorm': 5.0,
        'patience': 2,
        'output_dir': output_path,
        'n_epochs': 5,
        'batch_size': 1,
        'verbose': 0,
    }
    model_config = Mapping()
    for name, value in settings.items():
        setattr(model_config, name, value)
    return model_config
def _load_optional_array(path):
    """Return the memory-mapped array at *path*, or None if the file is absent."""
    if os.path.exists(path):
        return np.load(path, mmap_mode='r')
    return None


def _load_required_array(path, name):
    """Return the memory-mapped array at *path*; raise ValueError if missing."""
    if not os.path.exists(path):
        raise ValueError('{} is not found!'.format(name))
    return np.load(path, mmap_mode='r')


def main():
    """Load an encoded dataset, sample model configs from a search space,
    train one model per trial, and save each trial's history and config.

    Raises:
        argparse.ArgumentTypeError: if --encoded_data_dir is missing.
        ValueError: if required arrays/files (y_train, y_dev, embedding
            matrix) or required arguments (task_type, num_classes) are missing.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--encoded_data_dir', type=str,
        help=('directory to load the encoded data.'))

    # this is optional
    parser.add_argument('--data_name', type=str,
        help=('which data will be used? (kickstarter Or indiegogo?)'))

    parser.add_argument('--search_space_filepath', type=str,
        help=('where to load the search space file?'))

    parser.add_argument('--output_dir', type=str,
        help=('directory to save the trained model and related model_config.'))

    parser.add_argument('--task_type', type=str,
        default='classification',
        help=('what is the type of this task? (classification or regression?)'))

    parser.add_argument('--num_classes', type=int,
        help=('what is the number of classes (classification) or outputs (regression)?'))

    parser.add_argument('--model_type', type=str,
        default='mlp',
        help=('what type of NN model you want to try? (mlp or skip_connections?)'))

    parser.add_argument('--num_trials', type=int,
        default=1,
        help=('how many trials you want to run the model?'))

    args = parser.parse_args()

    # The original three-way branch only errored when encoded_data_dir was
    # None — and then crashed with a TypeError while concatenating None into
    # the error message. Use format() so the intended exception is raised.
    if args.encoded_data_dir is None:
        raise argparse.ArgumentTypeError(
            "{} or {} can't be recognized.".format(args.data_name,
                                                   args.encoded_data_dir))
    if args.data_name is not None:
        path_to_data = os.path.join(args.encoded_data_dir, args.data_name)
        path_to_save = os.path.join(args.output_dir, args.data_name)
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
    else:
        path_to_data = args.encoded_data_dir
        path_to_save = args.output_dir

    ###########################################
    ## load encoded training set and dev set ##
    ###########################################

    # Labels are required; feature arrays are optional (text-only or
    # structured-only datasets are both supported).
    y_train = _load_required_array(
        os.path.join(path_to_data, 'y_train.npy'), 'y_train')
    X_train_struc = _load_optional_array(
        os.path.join(path_to_data, 'X_train_struc.npy'))
    X_train_text = _load_optional_array(
        os.path.join(path_to_data, 'X_train_text.npy'))

    y_dev = _load_required_array(
        os.path.join(path_to_data, 'y_dev.npy'), 'y_dev')
    X_dev_struc = _load_optional_array(
        os.path.join(path_to_data, 'X_dev_struc.npy'))
    X_dev_text = _load_optional_array(
        os.path.join(path_to_data, 'X_dev_text.npy'))

    # Text encoding settings saved by the encoding step (may be absent).
    text_config_path = os.path.join(path_to_data, 'text_config.json')
    if os.path.exists(text_config_path):
        with open(text_config_path, 'r') as f:
            text_config = Mapping(json.load(f))
    else:
        text_config = None

    # GloVe mode needs the precomputed embedding matrix on disk.
    if text_config is not None and text_config.mode == 'glove':
        embedding_matrix_path = text_config.embedding_matrix_path
        if not os.path.exists(embedding_matrix_path):
            raise ValueError('embedding_matrix is not found!')
        text_config.embedding_matrix = np.load(embedding_matrix_path,
                                               mmap_mode='r')

    ###########################################
    ## sample model config from search space ##
    ###########################################

    if args.task_type is None or args.num_classes is None:
        raise ValueError('You are missing task_type or num_classes or both!')
    print('you are choosing ' + args.model_type + ' as the model type!')
    default_model_config = create_default_modelconfig(
        args.task_type, args.num_classes, args.model_type, path_to_save)

    ## load search space file which is provided by users ##
    with open(args.search_space_filepath, 'r') as f:
        search_space = Mapping(json.load(f))

    #######################################################################
    ## update default model_config based on search_space and train model ##
    #######################################################################

    for i in range(args.num_trials):
        model_config = Mapping(sample_modelconfig(search_space,
                                                  default_model_config))
        model_name = 'model_{}'.format(i)
        print('*' * 20)
        print('model_config: ' + model_config.output_dir)

        # Give each trial its own subdirectory under the default output dir.
        model_config.output_dir = os.path.join(default_model_config.output_dir,
                                               model_name)
        if not os.path.exists(model_config.output_dir):
            os.makedirs(model_config.output_dir)

        model = Model(text_config, model_config)
        # NOTE(review): validation data is the training data itself here —
        # mirrors the existing tests; confirm whether dev data was intended.
        hist = model.train(y_train, X_train_struc, X_train_text, y_train,
                           X_train_struc, X_train_text)

        ## save hist.history and model_config ##
        history_path = os.path.join(model_config.output_dir, 'history.json')
        with open(history_path, 'w') as hf:
            json.dump(hist.history, hf)

        model_config_savepath = os.path.join(model_config.output_dir,
                                             'model_config.json')
        with open(model_config_savepath, 'w') as mf:
            json.dump(model_config, mf)
Example #7
0
    def test_tfidf(self):
        """Check the tf-idf Encoder output matrices on the fake dataset.

        Fits the encoder on the training split, then verifies the exact
        tf-idf text matrices and scaled structured-feature matrices for
        the train, dev, and test splits against hand-computed values.
        """
        df_train, df_dev, df_test, metadata = get_fake_dataset(
            with_text_col=True)

        # tf-idf over a 20-word vocabulary: each text matrix below has
        # 20 columns, one per vocabulary word.
        text_config = Mapping()
        text_config.mode = 'tfidf'
        text_config.max_words = 20
        print('*' * 20)
        print(text_config.mode)

        # Fit on the training split only; dev/test reuse the fitted
        # vocabulary and feature scaling.
        encoder = Encoder(metadata, text_config=text_config)
        y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
        y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
        y_test, X_test_struc, X_test_text = encoder.transform(df_test)

        # Expected tf-idf weights for the three training documents.
        X_train_text_true = np.array([[
            0., 0.69314718, 0.69314718, 0., 0.91629073, 0.91629073, 0.91629073,
            0.91629073, 0.91629073, 0.91629073, 0.91629073, 0., 0., 0., 0., 0.,
            0., 0., 0., 0.
        ],
                                      [
                                          0., 0., 0., 1.55141507, 0., 0., 0.,
                                          0., 0., 0., 0., 0.91629073,
                                          0.91629073, 0.91629073, 0.91629073,
                                          0.91629073, 0.91629073, 0.91629073,
                                          0.91629073, 0.
                                      ],
                                      [
                                          0., 0.69314718, 0.69314718, 0., 0.,
                                          0., 0., 0., 0., 0., 0., 0., 0., 0.,
                                          0., 0., 0., 0., 0., 0.91629073
                                      ]])
        # Expected scaled structured features for the training split.
        X_train_struc_true = np.array([[-1.22474487, 1., 0., 0.],
                                       [0., 0., 1., 0.],
                                       [1.22474487, 0., 0., 1.]])
        self.assertTrue(np.isclose(X_train_text_true, X_train_text).all())
        self.assertTrue(np.isclose(X_train_struc_true, X_train_struc).all())
        # Expected tf-idf weights for the dev split (mostly out-of-vocab,
        # hence the all-zero rows).
        X_dev_text_true = np.array([[
            0., 0., 0., 0.91629073, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0.
        ],
                                    [
                                        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                                        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
                                    ],
                                    [
                                        0., 0., 0., 0., 0., 0., 0., 0.,
                                        0.91629073, 0.91629073, 0.91629073, 0.,
                                        0., 0., 0., 0., 0., 0., 0., 0.
                                    ]])
        X_dev_struc_true = np.array([[2.44948974, 0., 1., 0.],
                                     [6.12372436, 0., 1., 0.],
                                     [3.67423461, 0., 0., 1.]])
        self.assertTrue(np.isclose(X_dev_text_true, X_dev_text).all())
        self.assertTrue(np.isclose(X_dev_struc_true, X_dev_struc).all())

        # Expected tf-idf weights and structured features for the test split.
        X_test_text_true = np.array([[
            0., 0., 0., 1.55141507, 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0.91629073, 0., 0., 0., 0., 0., 0.
        ],
                                     [
                                         0., 0., 0., 0., 0., 0., 0., 0., 0.,
                                         0., 0., 0., 0., 0., 0., 0., 0., 0.,
                                         0., 0.
                                     ],
                                     [
                                         0., 0., 0., 0.91629073, 0., 0., 0.,
                                         0., 0., 0., 0., 0., 0., 0., 0., 0.,
                                         0., 0., 0., 0.
                                     ]])
        X_test_struc_true = np.array([[0., 1., 0., 0.],
                                      [3.67423461, 0., 0., 1.],
                                      [1.22474487, 0., 0., 1.]])
        self.assertTrue(np.isclose(X_test_text_true, X_test_text).all())
        self.assertTrue(np.isclose(X_test_struc_true, X_test_struc).all())