class Searcher(object):

    def __init__(self, experiment_dir, params_filename, learner,
                 learner_kwargs, grid_search_dirname='search_grid'):
        assert os.path.isdir(experiment_dir)
        assert os.path.isfile(os.path.join(experiment_dir, params_filename))
        assert hasattr(learner, '__call__')
        assert isinstance(learner_kwargs, dict)
        self.experiment_dir = experiment_dir
        self.experiment_params = Params(os.path.join(experiment_dir, 
                                                     params_filename))
        self.params_filename = params_filename
        self.learner = learner
        self.learner_kwargs = learner_kwargs
        self.grid_search_dirname = grid_search_dirname

    def search(self, hyperparam, parent_dir):
        assert isinstance(hyperparam, tuple)
        name, candidates = hyperparam
        params = Params(os.path.join(parent_dir, self.params_filename))
        experiment_dirs = []
        for candidate in candidates:
            experiment = '{}_{}'.format(name, candidate)
            experiment_dir = os.path.join(parent_dir, experiment)
            # create experiment directory
            if not os.path.isdir(experiment_dir):
                os.makedirs(experiment_dir)
            experiment_dirs.append(experiment_dir)
            # create params file for this experiment
            params.set(name, candidate)
            params.dump(os.path.join(experiment_dir, self.params_filename))
            # run subprocess to train model
            self.learner(**self.learner_kwargs)
        return experiment_dirs

    def run(self, hyperparams):
        assert isinstance(hyperparams, dict)
        # create grid search directory
        parent_dir = os.path.join(self.experiment_dir, 
                                  self.grid_search_dirname)
        if os.path.isdir(parent_dir):
            shutil.rmtree(parent_dir)
        os.makedirs(parent_dir)
        # create experiment params file
        self.experiment_params.dump(os.path.join(parent_dir, 
                                                 self.params_filename))
        # grid search
        pds = [parent_dir]
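        # `pds` holds the leaf directories of the grid built so far; each new
        # hyperparameter multiplies the number of leaves by its candidate count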
        for name, candidates in hyperparams.items():
            new_pds = []
            for pd in pds:
                experiment_dirs = self.search((name, candidates), pd)
                new_pds.extend(experiment_dirs)
            pds = new_pds
        return parent_dir
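
# A minimal usage sketch for Searcher (not part of the source): the learner
# `train_from_dir`, the directory layout and the params filename below are
# illustrative assumptions.
def train_from_dir(model_dir):
    # a real learner would read Params(os.path.join(model_dir, 'params.json'))
    # and train a model inside model_dir
    pass

searcher = Searcher(experiment_dir='experiments/base',
                    params_filename='params.json',
                    learner=train_from_dir,
                    learner_kwargs={'model_dir': 'experiments/base'})
# run() expands the full grid as nested subdirectories under
# experiments/base/search_grid and returns that parent directory
grid_dir = searcher.run({'learning_rate': [1e-3, 1e-4],
                         'dropout': [0.3, 0.5]})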
Example #4
    assert os.path.isfile(inputs_file), msg.format(inputs_file)

    logger = Logger.set(os.path.join(exp_cfg.experiment_dir(), 
                                     'predict.log'))

    checkpoint = Checkpoint(
        checkpoint_dir=exp_cfg.experiment_dir(),
        filename=exp_cfg.checkpoint_filename(),
        best_checkpoint=exp_cfg.best_checkpoint(),
        latest_checkpoint=exp_cfg.latest_checkpoint(),
        logger=logger)

    # load params
    word_vocab = Vocab(words_file)
    tag_vocab = Vocab(tags_file)

    params = Params(exp_cfg.params_file())
    params.update(Params(dataset_cfg.params_file()))
    params.set('cuda', torch.cuda.is_available())

    # restore model
    items = model_factory(params)
    model = items['model']
    checkpoint.restore(model, None, restore_checkpoint)
    
    # predict
    predict(model, word_vocab, tag_vocab, inputs_file, 
            outputs_file, params.unk_word, params.cuda, encoding)

    print("It's done! Please check the output file:")
    print(outputs_file)
Example #5
def main(model,
         auxiliary=True,
         model_label='rcnn',
         rnn_type='gru',
         padding='pre',
         reg='s',
         prefix="crawl",
         embedding_file_type="word2vec",
         train_fname="./data/train.csv",
         test_fname="./data/test.csv",
         embeds_fname="./data/GoogleNews-vectors-negative300.bin",
         logger_fname="./logs/log-aws",
         mode="all",
         wrong_words_fname="./data/correct_words.csv",
         format_embeds="binary",
         config="./config.json",
         output_dir="./out",
         norm_prob=False,
         norm_prob_koef=1,
         gpus=0,
         char_level=False,
         random_seed=2018,
         num_folds=5):

    embedding_type = prefix + "_" + embedding_file_type

    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    tokinizer = RegexpTokenizer(r'\S+')
    regexps = [
        re.compile("([a-zA-Z]+)([0-9]+)"),
        re.compile("([0-9]+)([a-zA-Z]+)")
    ]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    if model != 'mvcnn':
        embed_dim = 300
        embeds = Embeds(embeds_fname,
                        embedding_file_type,
                        format=format_embeds)

    if mode in ('preprocess', 'all'):
        logger.info('Generating indirect features...')
        # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
        # Word count in each comment:
        train_df['count_word'] = train_df["comment_text"].apply(
            lambda x: len(str(x).split()))
        test_df['count_word'] = test_df["comment_text"].apply(
            lambda x: len(str(x).split()))
        # Unique word count
        train_df['count_unique_word'] = train_df["comment_text"].apply(
            lambda x: len(set(str(x).split())))
        test_df['count_unique_word'] = test_df["comment_text"].apply(
            lambda x: len(set(str(x).split())))
        # Letter count
        train_df['count_letters'] = train_df["comment_text"].apply(
            lambda x: len(str(x)))
        test_df['count_letters'] = test_df["comment_text"].apply(
            lambda x: len(str(x)))
        # punctuation count
        train_df["count_punctuations"] = train_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        test_df["count_punctuations"] = test_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        # upper case words count
        train_df["count_words_upper"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        test_df["count_words_upper"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        # title case words count
        train_df["count_words_title"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        test_df["count_words_title"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        # Word count percent in each comment:
        train_df['word_unique_pct'] = train_df[
            'count_unique_word'] * 100 / train_df['count_word']
        test_df['word_unique_pct'] = test_df[
            'count_unique_word'] * 100 / test_df['count_word']
        # Punct percent in each comment:
        train_df['punct_pct'] = train_df[
            'count_punctuations'] * 100 / train_df['count_word']
        test_df['punct_pct'] = test_df['count_punctuations'] * 100 / test_df[
            'count_word']
        # Average length of the words
        train_df["mean_word_len"] = train_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        test_df["mean_word_len"] = test_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        # upper case words percentage
        train_df["words_upper_pct"] = train_df[
            "count_words_upper"] * 100 / train_df['count_word']
        test_df["words_upper_pct"] = test_df[
            "count_words_upper"] * 100 / test_df['count_word']
        # title case words count
        train_df["words_title_pct"] = train_df[
            "count_words_title"] * 100 / train_df['count_word']
        test_df["words_title_pct"] = test_df[
            "count_words_title"] * 100 / test_df['count_word']
        # remove columns
        columns_to_drop = ['count_word', 'count_unique_word', 'count_punctuations',
                           'count_words_upper', 'count_words_title']
        train_df = train_df.drop(columns=columns_to_drop)
        test_df = test_df.drop(columns=columns_to_drop)

        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    tokinizer,
                                                    wrong_words_dict,
                                                    regexps,
                                                    autocorrect=False)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   tokinizer,
                                                   wrong_words_dict,
                                                   regexps,
                                                   autocorrect=False)
        if reg == 'w':
            # remove all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear_w.csv'),
                            index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear_w.csv'),
                           index=False)
            train_df = pd.read_csv(
                os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            # split by S+ keep all punctuations
            train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                            index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'),
                           index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    if mode == 'preprocess':
        return

    if mode == 'processed':
        if reg == 'w':
            train_df = pd.read_csv(
                os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
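    # cap sequence length at mean + 3*std of the training text lengths, so only
    # the longest comments exceed the limit and padding stays bounded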
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000
    if char_level:
        max_seq_len = 1200

    train_df['comment_seq'], test_df['comment_seq'], word_index = \
        convert_text2seq(
            train_df['comment_text_clear'].tolist(),
            test_df['comment_text_clear'].tolist(),
            max_words,
            max_seq_len,
            embeds,
            lower=True,
            char_level=char_level,
            uniq=True,
            use_only_exists_words=True,
            position=padding)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    if model != 'mvcnn':
        embedding_matrix, words_not_found = get_embedding_matrix(
            embed_dim, embeds, max_words, word_index)

    logger.debug('Embedding matrix shape = {}'.format(
        np.shape(embedding_matrix)))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    # ====Train/test split data====
    # train/val
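    # stack the five hand-crafted statistics from preprocessing into a
    # (num_samples, 5) float32 matrix used as the auxiliary input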
    x_aux = np.matrix([
        train_df["word_unique_pct"].tolist(), train_df["punct_pct"].tolist(),
        train_df["mean_word_len"].tolist(),
        train_df["words_upper_pct"].tolist(),
        train_df["words_title_pct"].tolist()
    ],
                      dtype='float32').transpose((1, 0))
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, x_aux_train_nn, x_aux_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = \
        split_data(x, np.squeeze(np.asarray(x_aux)), y,
                   test_size=0.2, shuffle=True, random_state=2018)
    # test set
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    test_aux = np.matrix([
        test_df["word_unique_pct"].tolist(), test_df["punct_pct"].tolist(),
        test_df["mean_word_len"].tolist(),
        test_df["words_upper_pct"].tolist(),
        test_df["words_title_pct"].tolist()
    ],
                         dtype='float32').transpose((1, 0))
    test_df_seq_aux = np.squeeze(np.asarray(test_aux))
    y_nn = []
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====
    params = Params(config)
    if model_label is None:
        logger.warning('No model_label specified; choose a model to train')
        return

    if model_label == 'dense':
        model = dense(
            embedding_matrix,
            num_classes,
            max_seq_len,
            dense_dim=params.get('dense').get('dense_dim'),
            n_layers=params.get('dense').get('n_layers'),
            concat=params.get('dense').get('concat'),
            dropout_val=params.get('dense').get('dropout_val'),
            l2_weight_decay=params.get('dense').get('l2_weight_decay'),
            pool=params.get('dense').get('pool'),
            train_embeds=params.get('dense').get('train_embeds'),
            add_sigmoid=True,
            gpus=gpus)
    if model_label == 'cnn':
        model = cnn(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    num_filters=params.get('cnn').get('num_filters'),
                    l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
                    dropout_val=params.get('cnn').get('dropout_val'),
                    dense_dim=params.get('cnn').get('dense_dim'),
                    train_embeds=params.get('cnn').get('train_embeds'),
                    n_cnn_layers=params.get('cnn').get('n_cnn_layers'),
                    pool=params.get('cnn').get('pool'),
                    add_embeds=params.get('cnn').get('add_embeds'),
                    auxiliary=auxiliary,
                    add_sigmoid=True,
                    gpus=gpus)
    if model_label == 'cnn2d':
        model = cnn2d(
            embedding_matrix,
            num_classes,
            max_seq_len,
            num_filters=params.get('cnn2d').get('num_filters'),
            l2_weight_decay=params.get('cnn2d').get('l2_weight_decay'),
            dropout_val=params.get('cnn2d').get('dropout_val'),
            dense_dim=params.get('cnn2d').get('dense_dim'),
            train_embeds=params.get('cnn2d').get('train_embeds'),
            add_embeds=params.get('cnn2d').get('add_embeds'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus)

    if model_label == 'lstm':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
            rnn_dim=params.get('lstm').get('rnn_dim'),
            dropout_val=params.get('lstm').get('dropout_val'),
            dense_dim=params.get('lstm').get('dense_dim'),
            n_branches=params.get('lstm').get('n_branches'),
            n_rnn_layers=params.get('lstm').get('n_rnn_layers'),
            n_dense_layers=params.get('lstm').get('n_dense_layers'),
            train_embeds=params.get('lstm').get('train_embeds'),
            mask_zero=params.get('lstm').get('mask_zero'),
            kernel_regularizer=params.get('lstm').get('kernel_regularizer'),
            recurrent_regularizer=params.get('lstm').get(
                'recurrent_regularizer'),
            activity_regularizer=params.get('lstm').get(
                'activity_regularizer'),
            dropout=params.get('lstm').get('dropout'),
            recurrent_dropout=params.get('lstm').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='lstm')
    if model_label == 'gru':
        model = rnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            l2_weight_decay=params.get('gru').get('l2_weight_decay'),
            rnn_dim=params.get('gru').get('rnn_dim'),
            dropout_val=params.get('gru').get('dropout_val'),
            dense_dim=params.get('gru').get('dense_dim'),
            n_branches=params.get('gru').get('n_branches'),
            n_rnn_layers=params.get('gru').get('n_rnn_layers'),
            n_dense_layers=params.get('gru').get('n_dense_layers'),
            train_embeds=params.get('gru').get('train_embeds'),
            mask_zero=params.get('gru').get('mask_zero'),
            kernel_regularizer=params.get('gru').get('kernel_regularizer'),
            recurrent_regularizer=params.get('gru').get(
                'recurrent_regularizer'),
            activity_regularizer=params.get('gru').get('activity_regularizer'),
            dropout=params.get('gru').get('dropout'),
            recurrent_dropout=params.get('gru').get('recurrent_dropout'),
            auxiliary=auxiliary,
            add_sigmoid=True,
            gpus=gpus,
            rnn_type='gru')

    if model_label == 'charrnn':
        model = charrnn(
            len(word_index),
            num_classes,
            max_seq_len,
            rnn_dim=params.get('charrnn').get('rnn_dim'),
            dropout_val=params.get('charrnn').get('dropout_val'),
            auxiliary=auxiliary,
            dropout=params.get('charrnn').get('dropout'),
            recurrent_dropout=params.get('charrnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'cnn2rnn':
        model = cnn2rnn(embedding_matrix,
                        num_classes,
                        max_seq_len,
                        rnn_type=rnn_type)
    if model_label == 'dpcnn':
        model = dpcnn(embedding_matrix,
                      num_classes,
                      max_seq_len,
                      num_filters=params.get('dpcnn').get('num_filters'),
                      dense_dim=params.get('dpcnn').get('dense_dim'),
                      add_sigmoid=True,
                      gpus=gpus)

    if model_label == 'rcnn':
        model = rcnn(
            embedding_matrix,
            num_classes,
            max_seq_len,
            rnn_dim=params.get('rcnn').get('rnn_dim'),
            dropout_val=params.get('rcnn').get('dropout_val'),
            dense_dim=params.get('rcnn').get('dense_dim'),
            train_embeds=params.get('rcnn').get('train_embeds'),
            auxiliary=auxiliary,
            dropout=params.get('rcnn').get('dropout'),
            recurrent_dropout=params.get('rcnn').get('recurrent_dropout'),
            add_sigmoid=True,
            gpus=gpus,
            rnn_type=rnn_type)
    if model_label == 'capsule':
        model = capsule(
            embedding_matrix,
            num_classes,
            max_seq_len,
            auxiliary=auxiliary,
            Num_capsule=params.get('capsule').get('Num_capsule'),
            Routings=params.get('capsule').get('Routing'),
            add_sigmoid=params.get('capsule').get('add_sigmoid'),
            mask_zero=params.get('capsule').get('mask_zero'),
            gpus=gpus,
            rnn_type='gru')  # lstm may diverge but gru works better

    if model == 'mvcnn':
        embeds_fname1 = "./data/crawl-300d-2M.vec"  # alternative: word2vec-raw.txt
        embeds_fname2 = "./data/glove.840B.300d.txt"
        embeds_fname3 = "./data/GoogleNews-vectors-negative300.bin"
        embed_dim = 300
        embeds1 = Embeds(embeds_fname1, "glove", format='file')
        embeds2 = Embeds(embeds_fname2, "fasttext", format='file')
        embeds3 = Embeds(embeds_fname3, "word2vec", format='binary')
        embedding_matrix1, words_not_found1 = get_embedding_matrix(
            embed_dim, embeds1, max_words, word_index)
        embedding_matrix2, words_not_found2 = get_embedding_matrix(
            embed_dim, embeds2, max_words, word_index)
        #embedding_matrix3, words_not_found3 = get_embedding_matrix(embed_dim, embeds3, max_words, word_index)
        model = mvcnn(embedding_matrix1,
                      embedding_matrix2,
                      num_classes,
                      max_seq_len,
                      auxiliary=auxiliary,
                      gpus=gpus)

    # ====k-fold cross validations split data====
    logger.info('Run k-fold cross validation...')
    params = Params(config)
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    oof_train = np.zeros((x.shape[0], num_classes))
    oof_test_skf = []
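    # oof_train holds out-of-fold predictions on the training set (for stacking);
    # oof_test_skf collects each fold's test-set predictions, averaged after the loop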

    for i, (train_index, test_index) in enumerate(kf.split(x, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_aux_train, x_test, x_aux_test = x[train_index], x_aux[
            train_index], x[test_index], x_aux[test_index]
        y_train, y_test = y[train_index], y[test_index]
        logger.info('Start training {}-th fold'.format(i))
        if auxiliary:
            inputs = [x_train, x_aux_train]
            inputs_val = [x_test, x_aux_test]
            output = [test_df_seq, test_df_seq_aux]
        else:
            inputs = x_train
            inputs_val = x_test
            output = test_df_seq
        hist = train(
            x_train=inputs,  # [x_train, x_aux_train] when auxiliary input is allowed
            y_train=y_train,
            x_val=inputs_val,  # [x_test, x_aux_test],
            y_val=y_test,
            model=model,
            batch_size=params.get(model_label).get('batch_size'),
            num_epochs=params.get(model_label).get('num_epochs'),
            learning_rate=params.get(model_label).get('learning_rate'),
            early_stopping_delta=params.get(model_label).get(
                'early_stopping_delta'),
            early_stopping_epochs=params.get(model_label).get(
                'early_stopping_epochs'),
            use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
            lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
            epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
            model_checkpoint_dir=os.path.join('.', 'model_checkpoint', reg,
                                              model_label, embedding_type,
                                              padding, str(i)),
            logger=logger)

        model.load_weights(
            os.path.join('.', 'model_checkpoint', reg, model_label,
                         embedding_type, padding, str(i), 'weights.h5'))
        oof_train[test_index, :] = model.predict(
            inputs_val)  # model.predict([x_test, x_aux_test])
        proba = model.predict(
            output)  # model.predict([test_df_seq, test_df_seq_aux])
        oof_test_skf.append(proba)
        result = pd.read_csv("./data/sample_submission.csv")
        result[target_labels] = proba
        ithfold_path = "./cv/{}/{}/{}/{}/{}".format(reg, model_label,
                                                    embedding_type, padding, i)
        if not os.path.exists(ithfold_path):
            os.makedirs(ithfold_path)

        result.to_csv(os.path.join(ithfold_path, 'sub.csv'), index=False)
        # model.save(os.path.join(ithfold_path,'weights.h5'))

    # dump oof_test and oof_train for later stacking
    # oof_train:
    oof_train_path = "./cv/{}/{}/{}/{}/oof_train".format(
        reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_train_path):
        os.makedirs(oof_train_path)

    np.savetxt(os.path.join(oof_train_path, "oof_train.csv"),
               oof_train,
               fmt='%.24f',
               delimiter=' ')
    # oof_test: stacking version
    oof_test = np.array(oof_test_skf).mean(axis=0)
    oof_test_path = "./cv/{}/{}/{}/{}/oof_test".format(reg, model_label,
                                                       embedding_type, padding)
    if not os.path.exists(oof_test_path):
        os.makedirs(oof_test_path)

    np.savetxt(os.path.join(oof_test_path, "oof_test.csv"),
               oof_test,
               fmt='%.24f',
               delimiter=' ')
    # oof_test: submission version
    result[target_labels] = oof_test
    oof_test_bag_path = "./cv/{}/{}/{}/{}/bagged".format(
        reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_bag_path):
        os.makedirs(oof_test_bag_path)

    result.to_csv(os.path.join(oof_test_bag_path, "sub.csv"), index=False)
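
# Illustrative invocation (not from the source): it relies on the default
# ./data, ./logs and ./out layout from the signature above, which is an
# assumption about the local setup.
if __name__ == '__main__':
    main(model='rcnn',
         model_label='gru',   # train the GRU branch defined above
         rnn_type='gru',
         mode='all',          # preprocess, then train with k-fold CV
         gpus=1,
         num_folds=5)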
Example #6
 def dump(self,
          data_dir,
          params_file,
          sentences_filename='sentences.txt',
          labels_filename='labels.txt',
          words_filename='words.txt',
          tags_filename='tags.txt',
          encoding='utf8',
          shuffle=True,
          min_count_word=1,
          min_count_tag=1):
     # datasets params
     params = Params(
         data={
             'word_vocab_size': 0,
             'tag_vocab_size': 0,
             'pad_word': self.PAD_WORD,
             'unk_word': self.UNK_WORD,
             'pad_tag': self.PAD_TAG
         })
     # dataset and vocab
     datasets = self.build(shuffle=shuffle)
     tag_vocab = Counter([self.PAD_TAG])
     word_vocab = Counter([self.PAD_WORD, self.UNK_WORD])
     # save train/val/test dataset
     for dataset in datasets:
         name = dataset.name
         size = len(dataset)
         self.logger.info('Saving {} dataset...'.format(name))
         params.set('{}_size'.format(name), size)  # set dataset size
         dirpath = os.path.join(data_dir, name)
         if not os.path.isdir(dirpath):
             os.makedirs(dirpath)
         sentences_file = os.path.join(dirpath, sentences_filename)
         labels_file = os.path.join(dirpath, labels_filename)
         with open(sentences_file, 'w', encoding=encoding) as fs, \
             open(labels_file, 'w', encoding=encoding) as fl:
             for sample in dataset:
                 words, tags = sample.words, sample.tags
                 fs.write('{}\n'.format(' '.join(words)))
                 fl.write('{}\n'.format(' '.join(tags)))
                 tag_vocab.update(tags)
                 word_vocab.update(words)
         self.logger.info('- done!')
     # save word vocab
     self.logger.info('Saving word vocab...')
     word_vocab_file = os.path.join(data_dir, words_filename)
     with open(word_vocab_file, 'w', encoding=encoding) as f:
         for word in word_vocab.get(min_count=min_count_word):
             f.write('{}\n'.format(word))
     params.word_vocab_size = word_vocab.size(min_count=min_count_word)
     self.logger.info('- done!')
     # save tag vocab
     self.logger.info('Saving tag vocab...')
     tag_vocab_file = os.path.join(data_dir, tags_filename)
     with open(tag_vocab_file, 'w', encoding=encoding) as f:
         for tag in tag_vocab.get(min_count=min_count_tag):
             f.write('{}\n'.format(tag))
     params.tag_vocab_size = tag_vocab.size(min_count=min_count_tag)
     self.logger.info('- done!')
     # save datasets parameters
     self.logger.info('Saving datasets parameters...')
     params.dump(params_file, encoding=encoding)
     self.logger.info('- done!')
     # print dataset characteristics
     self.logger.info("Characteristics of the dataset:")
     for key, value in params:
         self.logger.info("- {}: {}".format(key, value))