Example #1
def infer(j, y_test, model_name, stamp, window, categories, k, models, text_P,
          subset_ratio, subset_seed, min_len, max_len, min_tokens,
          categories_mode, return_overall, max_words, vectorizer, test_size,
          test_random_state, data_len, test_len):
    category = categories[j]
    print('Predicting category `{}`...'.format(category))
    y_pred = np.zeros((len(y_test), ), dtype=np.int32)
    for i in range(len(y_test)):
        P = text_P[i]
        q_pred = base.predict_ordinal(models, P, k)
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    base_fname = '{}_{:d}_{:d}w'.format(stamp, j, window)
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, model_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('HYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nVectorization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('vectorizer={}\n'.format(vectorizer.__class__.__name__))
        fd.write('\nTraining\n')
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(data_len))
        fd.write('Test size: {:d}\n\n'.format(test_len))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, model_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)
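
`base.predict_ordinal` above, and the `fit_ordinal`/`predict_ordinal` pair in the next example, are not shown in this listing. A minimal sketch of the ordinal decomposition they appear to implement, assuming the standard reduction of k ordered classes to k - 1 binary problems (Frank & Hall):

import numpy as np

def fit_ordinal(create_func, X, y, k):
    # Train k - 1 binary classifiers; classifier i learns whether y > i.
    classifiers = []
    for i in range(k - 1):
        classifier = create_func()
        classifier.fit(X, (y > i).astype(np.int32))
        classifiers.append(classifier)
    return classifiers

def predict_ordinal(classifiers, X, k):
    # Sum the binary votes to recover a label in [0, k - 1]; `k` is implied
    # by `len(classifiers) + 1` but kept to match the shared signature.
    binary_preds = np.stack([c.predict(X) for c in classifiers], axis=1)
    return binary_preds.sum(axis=1)
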
Example #2
def train(skip_models=False):
    max_words = shared_parameters.TEXT_MAX_WORDS

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = .1  # Overrides shared_parameters.DATA_SUBSET_RATIO.
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Vectorizing text...')
    text_tokens = []
    for source_tokens in text_source_tokens:
        all_tokens = []
        for tokens in source_tokens:
            all_tokens.extend(tokens)
        text_tokens.append(all_tokens)
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words,
                                                  remove_stopwords,
                                                  text_tokens=text_tokens)
    X = vectorizer.transform(text_tokens)
    print('Vectorized text with {:d} unique words.'.format(
        len(vectorizer.get_feature_names())))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = train_test_split(
        X, Y_T, test_size=test_size, random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    create_funcs = [
        create_k_nearest_neighbors, create_logistic_regression,
        create_multi_layer_perceptron, create_multinomial_naive_bayes,
        create_random_forest, create_svm
    ]
    model_names = [
        'k_nearest_neighbors', 'logistic_regression', 'multi_layer_perceptron',
        'multinomial_naive_bayes', 'random_forest', 'svm'
    ]
    for m, create_func in enumerate(create_funcs):
        model_name = model_names[m]
        model_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, model_name))
        print('Training model `{}`...'.format(model_name))
        for j, category in enumerate(categories):
            print('Classifying category `{}`...'.format(category))
            y_train = Y_train[j]  # (n * (1 - b))
            k = len(category_levels[j])
            classifiers = fit_ordinal(create_func, X_train, y_train, k)
            y_pred = predict_ordinal(classifiers, X_test, k)  # (n * b)
            y_test = Y_test[j]

            base_fname = '{:d}_{:d}'.format(stamp, j)
            logs_path = folders.ensure(
                os.path.join(folders.LOGS_PATH, model_name))
            with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                      'w') as fd:
                fd.write('HYPERPARAMETERS\n')
                fd.write('\nText\n')
                fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
                fd.write('subset_seed={}\n'.format(str(subset_seed)))
                fd.write('min_len={:d}\n'.format(min_len))
                fd.write('max_len={:d}\n'.format(max_len))
                fd.write('min_tokens={:d}\n'.format(min_tokens))
                fd.write('\nLabels\n')
                fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
                fd.write('return_overall={}\n'.format(return_overall))
                fd.write('\nVectorization\n')
                fd.write('max_words={:d}\n'.format(max_words))
                fd.write('vectorizer={}\n'.format(
                    vectorizer.__class__.__name__))
                fd.write('\nTraining\n')
                fd.write('test_size={}\n'.format(str(test_size)))
                fd.write('test_random_state={:d}\n'.format(test_random_state))
                fd.write('\nRESULTS\n\n')
                fd.write('Data size: {:d}\n'.format(X.shape[0]))
                fd.write('Train size: {:d}\n'.format(X_train.shape[0]))
                fd.write('Test size: {:d}\n\n'.format(X_test.shape[0]))
                evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                       category)

            predictions_path = folders.ensure(
                os.path.join(folders.PREDICTIONS_PATH, model_name))
            with open(
                    os.path.join(predictions_path,
                                 '{}.txt'.format(base_fname)), 'w') as fd:
                evaluation.write_predictions(y_test, y_pred, fd, category)

            if not skip_models:
                models_path = folders.ensure(
                    os.path.join(model_path, base_fname))
                for i, classifier in enumerate(classifiers):
                    with open(
                            os.path.join(models_path,
                                         'model{:d}.pickle'.format(i)),
                            'wb') as fd:
                        pickle.dump(classifier,
                                    fd,
                                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Done.')
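
`folders.ensure` appears before every file write in these examples. A plausible sketch, assuming it only creates the directory on demand and returns the path:

import os

def ensure(path):
    # Create the directory (and any missing parents), then return the path
    # so the call can be nested inside os.path.join expressions.
    os.makedirs(path, exist_ok=True)
    return path
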
Example #3
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its text.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format('\n  '.join([
                            '{:d} {}'.format(j,
                                             bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--source_mode',
                        default='paragraph',
                        choices=['paragraph', 'sentence'],
                        help='The source of text. Default is `paragraph`.')
    parser.add_argument('--net_mode',
                        default='cnn',
                        choices=['rnn', 'cnn', 'rnncnn'],
                        help='The type of neural network. Default is `cnn`.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    parser.add_argument(
        '--agg_mode',
        default='maxavg',
        choices=['max', 'avg', 'maxavg', 'rnn'],
        help=
        'The way the network will aggregate paragraphs or sentences. Default is `maxavg`.'
    )
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                        'Default is `{}`.'.format(
                            shared_parameters.LABEL_MODE_ORDINAL))
    parser.add_argument(
        '--remove_classes',
        type=str,
        help=
        'Remove classes from the training data altogether. Useful when a minority class '
        'is extremely small. Format: `<class1>[,<class2>,...]`, e.g. `3` or `3,0`. '
        'Optional.')
    parser.add_argument(
        '--class_weight_p',
        default=2,
        type=int,
        help='Power with which to scale class weights. Default is 2.')
    parser.add_argument(
        '--embedding_trainable',
        action='store_true',
        help=
        'Flag to allow the model to optimize the word embeddings. Default is False.'
    )
    parser.add_argument(
        '--book_dense_units',
        default='128',
        help=
        'The comma-separated numbers of neurons in the final fully-connected layers. '
        'Default is `128`.')
    parser.add_argument(
        '--book_dropout',
        default=0.5,
        type=float,
        help=
        'Dropout probability before the final classification layer. Default is 0.5.'
    )
    parser.add_argument(
        '--plateau_patience',
        default=16,
        type=int,
        help=
        'Number of epochs to wait before dividing the learning rate by 2. Default is 16.'
    )
    parser.add_argument(
        '--early_stopping_patience',
        default=32,
        type=int,
        help=
        'Number of epochs without improvement to wait before stopping training early. '
        'Default is 32.')
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        help='The number of training epochs. Default is 1.')
    parser.add_argument(
        '--save_model',
        action='store_true',
        help='Save the model and its weights. Default is False.')
    parser.add_argument(
        '--note',
        help=
        'An optional note that will be appended to the names of generated files.'
    )
    args = parser.parse_args()

    classifier_name = '{}_{}_{}_{}'.format(args.source_mode, args.net_mode,
                                           args.agg_mode, args.label_mode)

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if args.note is not None:
        print('Note: {}'.format(args.note))
        base_fname = '{:d}_{}_{:d}'.format(stamp, args.note,
                                           args.category_index)
    else:
        base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    # Load data.
    print('Retrieving texts...')
    if args.source_mode == 'paragraph':
        source = 'paragraph_tokens'
        min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
        max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    else:  # args.source_mode == 'sentence':
        source = 'sentence_tokens'
        min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
        max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = category_levels[args.category_index]
    k = len(levels)
    k_train = k

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        args.source_mode,
        args.remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    if args.source_mode == 'paragraph':
        if not args.remove_stopwords:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
        else:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
    else:  # args.source_mode == 'sentence':
        n_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)

    # Create model.
    print('Creating model...')
    net_params = dict()
    if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
        net_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        net_params['rnn_units'] = 128
        net_params['rnn_l2'] = .001
        net_params['rnn_dense_units'] = 64
        net_params['rnn_dense_activation'] = 'elu'
        net_params['rnn_dense_l2'] = .001
        net_params['rnn_agg'] = 'attention'
    if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
        net_params['cnn_filters'] = 16
        net_params['cnn_filter_sizes'] = [1, 2, 3, 4]
        net_params['cnn_activation'] = 'elu'
        net_params['cnn_l2'] = .001
    agg_params = dict()
    if args.agg_mode == 'rnn':
        agg_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        agg_params['rnn_units'] = 64
        agg_params['rnn_l2'] = .001
    book_dense_units = [
        int(units) for units in args.book_dense_units.split(',')
    ]
    book_dense_activation = LeakyReLU(alpha=.1)
    book_dense_l2 = .001
    book_dropout = args.book_dropout
    model = create_model(n_tokens, embedding_matrix, args.embedding_trainable,
                         args.net_mode, net_params, args.agg_mode, agg_params,
                         book_dense_units, book_dense_activation,
                         book_dense_l2, book_dropout, k, category,
                         args.label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    model.compile(optimizer, loss=loss, metrics=[metric])

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=val_size, random_state=val_random_state)
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    y_test_transform = shared_parameters.transform_labels(
        y_test, k, args.label_mode)

    # Remove classes from training set, if specified.
    if args.remove_classes is not None:
        remove_classes = sorted(
            map(int, args.remove_classes.strip().split(',')), reverse=True)
        for class_ in remove_classes:
            y_train[y_train >= class_] -= 1
            k_train -= 1

    # Create generators.
    print('Creating data generators...')
    train_generator = TransformBalancedBatchGenerator(
        np.arange(len(X_train)).reshape((len(X_train), 1)),
        y_train,
        transform_X=transform_X,
        transform_y=transform_y,
        batch_size=1,
        X_data=[np.array([x]) for x in X_train],
        k=k,
        label_mode=args.label_mode)
    val_generator = SingleInstanceBatchGenerator(X_val,
                                                 y_val_transform,
                                                 shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  y_test_transform,
                                                  shuffle=False)

    # Get class weight.
    class_weight = shared_parameters.get_class_weight(k_train,
                                                      args.label_mode,
                                                      p=args.class_weight_p)

    # Train.
    print('Training for up to {:d} epoch{}...'.format(
        args.epochs, 's' if args.epochs != 1 else ''))
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = args.plateau_patience
    early_stopping_patience = args.early_stopping_patience
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    if args.save_model:
        models_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, classifier_name))
        model_path = os.path.join(models_path, '{}.h5'.format(base_fname))
        model_checkpoint = ModelCheckpoint(model_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           mode='min')
        callbacks.append(model_checkpoint)
    else:
        model_path = None
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  verbose=0,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  class_weight=class_weight)
    epochs_complete = len(history.history.get('val_loss'))

    # Save the history to visualize loss over time.
    print('Saving training history...')
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))

    # Predict test instances.
    print('Predicting test instances...')
    y_pred_transform = model.predict_generator(test_generator)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        y_pred = ordinal.from_multi_hot_ordinal(y_pred_transform, threshold=.5)
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        y_pred = np.argmax(y_pred_transform, axis=1)
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        y_pred = np.maximum(0, np.minimum(k - 1,
                                          np.round(y_pred_transform * k)))

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH,
                                            classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if args.note is not None:
            fd.write('{}\n\n'.format(args.note))
        fd.write('PARAMETERS\n\n')
        fd.write('category_index={:d}\n'.format(args.category_index))
        fd.write('epochs={:d}\n'.format(args.epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(args.remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        if args.remove_classes is not None:
            fd.write('remove_classes={}\n'.format(args.remove_classes))
        else:
            fd.write('No classes removed.\n')
        fd.write('class_weight_p={:d}\n'.format(args.class_weight_p))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(args.embedding_trainable))
        fd.write('\nModel\n')
        if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
            fd.write('rnn={}\n'.format(net_params['rnn'].__name__))
            fd.write('rnn_units={:d}\n'.format(net_params['rnn_units']))
            fd.write('rnn_l2={}\n'.format(str(net_params['rnn_l2'])))
            fd.write('rnn_dense_units={:d}\n'.format(
                net_params['rnn_dense_units']))
            fd.write('rnn_dense_activation=\'{}\'\n'.format(
                net_params['rnn_dense_activation']))
            fd.write('rnn_dense_l2={}\n'.format(str(
                net_params['rnn_dense_l2'])))
            fd.write('rnn_agg={}\n'.format(net_params['rnn_agg']))
        if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
            fd.write('cnn_filters={:d}\n'.format(net_params['cnn_filters']))
            fd.write('cnn_filter_sizes={}\n'.format(
                str(net_params['cnn_filter_sizes'])))
            fd.write('cnn_activation=\'{}\'\n'.format(
                net_params['cnn_activation']))
            fd.write('cnn_l2={}\n'.format(str(net_params['cnn_l2'])))
        if args.agg_mode == 'rnn':
            fd.write('agg_rnn={}\n'.format(agg_params['rnn'].__name__))
            fd.write('agg_rnn_units={:d}\n'.format(agg_params['rnn_units']))
            fd.write('agg_rnn_l2={}\n'.format(str(agg_params['rnn_l2'])))
        fd.write('book_dense_units={}\n'.format(args.book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={}\n'.format(str(book_dropout)))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={}\n'.format(str(val_size)))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write(
            'early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if model_path is not None:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Epochs completed: {:d}\n'.format(epochs_complete))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
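
In the ordinal label mode, `shared_parameters.transform_labels` and `ordinal.from_multi_hot_ordinal` round-trip labels through a cumulative multi-hot encoding. A sketch of that scheme, assuming the usual convention (y=2 with k=4 becomes [1, 1, 0]):

import numpy as np

def to_multi_hot_ordinal(y, k):
    # Encode each label as k - 1 cumulative indicators: entry i is 1 iff y > i.
    return np.array([[1. if label > i else 0. for i in range(k - 1)]
                     for label in y], dtype=np.float32)

def from_multi_hot_ordinal(Y_transform, threshold=.5):
    # Decode by counting leading outputs above the threshold: [.9, .7, .2] -> 2.
    y_pred = np.zeros(len(Y_transform), dtype=np.int32)
    for i, row in enumerate(Y_transform):
        label = 0
        for value in row:
            if value < threshold:
                break
            label += 1
        y_pred[i] = label
    return y_pred
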
Example #4
def main(argv):
    if len(argv) < 2 or len(argv) > 3:
        raise ValueError('Usage: <steps_per_epoch> <epochs> [note]')
    steps_per_epoch = int(argv[0])
    epochs = int(argv[1])
    note = None
    if len(argv) > 2:
        note = argv[2]

    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if note is not None:
        print('Note: {}'.format(note))
        base_fname = '{:d}_{}'.format(stamp, note)
    else:
        base_fname = '{:d}'.format(stamp)

    # Load data.
    print('Loading data...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
    max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    inputs, Y, categories, category_levels = \
        bookcave.get_data({'sentence_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode)
    text_sentence_tokens, text_section_ids, text_paragraph_ids = zip(
        *inputs['sentence_tokens'])
    print('Retrieved {:d} texts.'.format(len(text_sentence_tokens)))

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = Tokenizer(num_words=max_words, split=split)
    all_sentences = []
    for sentence_tokens in text_sentence_tokens:
        for tokens in sentence_tokens:
            all_sentences.append(split.join(tokens))
    tokenizer.fit_on_texts(all_sentences)
    print('Done.')

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_sentences = shared_parameters.TEXT_N_SENTENCES
    n_sentence_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    text_sentence_sequences = [
        pad_sequences(tokenizer.texts_to_sequences(
            [split.join(tokens) for tokens in sentence_tokens]),
                      maxlen=n_sentence_tokens,
                      padding=padding,
                      truncating=truncating)
        for sentence_tokens in text_sentence_tokens
    ]
    X = []
    for text_i, sentence_sequences in enumerate(text_sentence_sequences):
        section_ids = text_section_ids[text_i]
        paragraph_ids = text_paragraph_ids[text_i]
        n_paragraphs = len(
            np.unique(list(
                zip(text_section_ids[text_i], text_paragraph_ids[text_i])),
                      axis=0))
        x = np.zeros((n_paragraphs, n_sentences,
                      n_sentence_tokens))  # [paragraph_i][sentence_i][token_i]
        paragraph_i = 0
        sentence_i = 0
        last_section_paragraph_id = None
        for sequence_i, sentence_sequence in enumerate(sentence_sequences):
            section_paragraph_id = (section_ids[sequence_i],
                                    paragraph_ids[sequence_i])
            if last_section_paragraph_id is not None and section_paragraph_id != last_section_paragraph_id:
                paragraph_i += 1
                sentence_i = 0
            # Keep at most `n_sentences` sentences per paragraph; extra
            # sentences are dropped.
            if sentence_i < n_sentences:
                x[paragraph_i, sentence_i] = sentence_sequence
            sentence_i += 1
            last_section_paragraph_id = section_paragraph_id
        X.append(x)
    print('Done.')

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)
    print('Done.')

    # Create model.
    print('Creating model...')
    category_k = [len(levels) for levels in category_levels]
    embedding_trainable = False
    sent_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    sent_rnn_units = 128
    sent_rnn_l2 = .01
    sent_dense_units = 64
    sent_dense_activation = 'elu'
    sent_dense_l2 = .01
    para_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    para_rnn_units = 128
    para_rnn_l2 = .01
    para_dense_units = 64
    para_dense_activation = 'elu'
    para_dense_l2 = .01
    book_dense_units = 128
    book_dense_activation = tf.keras.layers.LeakyReLU(alpha=.1)
    book_dense_l2 = .01
    book_dropout = .5
    label_mode = shared_parameters.LABEL_MODE_ORDINAL
    sentence_encoder, paragraph_encoder, model = create_model(
        n_sentences, n_sentence_tokens, embedding_matrix, embedding_trainable,
        sent_rnn, sent_rnn_units, sent_rnn_l2, sent_dense_units,
        sent_dense_activation, sent_dense_l2, para_rnn, para_rnn_units,
        para_rnn_l2, para_dense_units, para_dense_activation, para_dense_l2,
        book_dense_units, book_dense_activation, book_dense_l2, book_dropout,
        category_k, categories, label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    model.compile(optimizer, loss=loss, metrics=[metric])
    print('Done.')

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = \
        train_test_split(X, Y_T, test_size=test_size, random_state=test_random_state)
    X_train, X_val, Y_train_T, Y_val_T = \
        train_test_split(X_train, Y_train_T, test_size=val_size, random_state=val_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b) * (1 - v))
    Y_val = Y_val_T.transpose()  # (c, n * (1 - b) * v)
    Y_test = Y_test_T.transpose()  # (c, n * b)

    # Transform labels based on the label mode.
    Y_train = shared_parameters.transform_labels(Y_train, category_k,
                                                 label_mode)
    Y_val = shared_parameters.transform_labels(Y_val, category_k, label_mode)

    # Calculate class weights.
    use_class_weights = True
    class_weight_f = 'inverse'
    if use_class_weights:
        category_class_weights = shared_parameters.get_category_class_weights(
            Y_train, label_mode, f=class_weight_f)
    else:
        category_class_weights = None

    # Create generators.
    shuffle = True
    train_generator = SingleInstanceBatchGenerator(X_train,
                                                   Y_train,
                                                   shuffle=shuffle)
    val_generator = SingleInstanceBatchGenerator(X_val, Y_val, shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  Y_test,
                                                  shuffle=False)

    # Train.
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    plateau_patience = 3
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    early_stopping_patience = 6
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch if steps_per_epoch > 0 else None,
        epochs=epochs,
        validation_data=val_generator,
        class_weight=category_class_weights,
        callbacks=callbacks)

    # Save the history to visualize loss over time.
    print('Saving training history...')
    if not os.path.exists(folders.HISTORY_PATH):
        os.mkdir(folders.HISTORY_PATH)
    history_path = os.path.join(folders.HISTORY_PATH, classifier_name)
    if not os.path.exists(history_path):
        os.mkdir(history_path)
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))
    print('Done.')

    # Predict test instances.
    print('Predicting test instances...')
    Y_pred = model.predict_generator(test_generator)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        Y_pred = [
            ordinal.from_multi_hot_ordinal(y, threshold=.5) for y in Y_pred
        ]
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        Y_pred = [np.argmax(y, axis=1) for y in Y_pred]
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        Y_pred = [
            np.maximum(0, np.minimum(k - 1, np.round(Y_pred[i] * k)))
            for i, k in enumerate(category_k)
        ]
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    print('Done.')

    # Save model.
    save_model = False
    if save_model:
        models_path = os.path.join(folders.MODELS_PATH, classifier_name)
        label_mode_path = os.path.join(models_path, label_mode)
        model_path = os.path.join(label_mode_path, '{}.h5'.format(base_fname))
        print('Saving model to `{}`...'.format(model_path))
        if not os.path.exists(folders.MODELS_PATH):
            os.mkdir(folders.MODELS_PATH)
        if not os.path.exists(models_path):
            os.mkdir(models_path)
        if not os.path.exists(label_mode_path):
            os.mkdir(label_mode_path)
        model.save(model_path)
        print('Done.')
    else:
        model_path = None

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')

    if not os.path.exists(folders.LOGS_PATH):
        os.mkdir(folders.LOGS_PATH)
    logs_path = os.path.join(folders.LOGS_PATH, classifier_name)
    if not os.path.exists(logs_path):
        os.mkdir(logs_path)
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if note is not None:
            fd.write('Note: {}\n\n'.format(note))
        fd.write('PARAMETERS\n\n')
        fd.write('steps_per_epoch={:d}\n'.format(steps_per_epoch))
        fd.write('epochs={:d}\n'.format(epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_sentences={:d}\n'.format(n_sentences))
        fd.write('n_sentence_tokens={:d}\n'.format(n_sentence_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(embedding_trainable))
        fd.write('\nModel\n')
        fd.write('sent_rnn={}\n'.format(sent_rnn.__name__))
        fd.write('sent_rnn_units={:d}\n'.format(sent_rnn_units))
        fd.write('sent_rnn_l2={}\n'.format(str(sent_rnn_l2)))
        fd.write('sent_dense_units={:d}\n'.format(sent_dense_units))
        fd.write(
            'sent_dense_activation=\'{}\'\n'.format(sent_dense_activation))
        fd.write('sent_dense_l2={}\n'.format(str(sent_dense_l2)))
        fd.write('para_rnn={}\n'.format(para_rnn.__name__))
        fd.write('para_rnn_units={:d}\n'.format(para_rnn_units))
        fd.write('para_rnn_l2={}\n'.format(str(para_rnn_l2)))
        fd.write('para_dense_units={:d}\n'.format(para_dense_units))
        fd.write(
            'para_dense_activation=\'{}\'\n'.format(para_dense_activation))
        fd.write('para_dense_l2={}\n'.format(str(para_dense_l2)))
        fd.write('book_dense_units={:d}\n'.format(book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={:.1f}\n'.format(book_dropout))
        fd.write('label_mode={}\n'.format(label_mode))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={:.2f}\n'.format(test_size))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={:.2f}\n'.format(val_size))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('use_class_weights={}\n'.format(use_class_weights))
        if use_class_weights:
            fd.write('class_weight_f={}\n'.format(class_weight_f))
        fd.write('shuffle={}\n'.format(shuffle))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write(
            'early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if save_model:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(Y_test, Y_pred, fd, categories)

    if not os.path.exists(folders.PREDICTIONS_PATH):
        os.mkdir(folders.PREDICTIONS_PATH)
    predictions_path = os.path.join(folders.PREDICTIONS_PATH, classifier_name)
    if not os.path.exists(predictions_path):
        os.mkdir(predictions_path)
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(Y_test, Y_pred, fd, categories)

    print('Done.')
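
`SingleInstanceBatchGenerator` works around books having different numbers of paragraphs by making every batch hold exactly one book. A sketch of the idea as a Keras `Sequence`, assuming the multi-output `Y` layout used in this example (one label array per category):

import numpy as np
from tensorflow.keras.utils import Sequence

class SingleInstanceBatchGenerator(Sequence):

    def __init__(self, X, Y, shuffle=True):
        self.X = X  # List of arrays whose first dimensions differ.
        self.Y = Y  # List of per-category label arrays.
        self.indices = np.arange(len(X))
        self.shuffle = shuffle

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        i = self.indices[index]
        x = np.expand_dims(self.X[i], axis=0)  # Batch dimension of 1.
        y = [np.expand_dims(y_j[i], axis=0) for y_j in self.Y]
        return x, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
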
Example #5
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('classifier_name', help='The name of the classifier.')
    parser.add_argument('model_file_name',
                        help='The file name of the model to load.')
    parser.add_argument('window', type=int, help='The paragraph window size.')
    args = parser.parse_args()
    source_mode = 'paragraph'
    remove_stopwords = False

    start_time = int(time.time())
    model_file_base_name = args.model_file_name[:args.model_file_name.
                                                rindex('.')]
    # The base name is expected to end with a single-digit category index.
    category_index = int(model_file_base_name[-1])
    base_fname = '{}_{:d}w'.format(model_file_base_name, args.window)

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[category_index]
    category = categories[category_index]

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        source_mode,
        remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load model.
    print('Loading model...')
    model_path = os.path.join(folders.MODELS_PATH, args.classifier_name,
                              args.model_file_name)
    if 'rnn' in args.classifier_name:
        # Since `keras` was used with the custom layer, we have to reload it with `keras`.
        # https://github.com/keras-team/keras/issues/10907
        custom_objects = {'AttentionWithContext': AttentionWithContext}
        model = keras.models.load_model(model_path,
                                        custom_objects=custom_objects)
    else:
        model = tf.keras.models.load_model(model_path)

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    _, X_test, _, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)

    # Predict instances.
    print('Predicting labels...')
    y_pred = np.zeros((len(X_test), ), dtype=np.int32)
    for i, x in enumerate(X_test):
        P = np.zeros((len(x) - args.window + 1, args.window, *x.shape[1:]))
        for w in range(len(P)):
            P[w] = x[w:w + args.window]
        q_pred_transform = model.predict(P)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_transform, threshold=.5)
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, args.classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('PARAMETERS\n\n')
        fd.write('classifier_name={}\n'.format(args.classifier_name))
        fd.write('model_file_name={}\n'.format(args.model_file_name))
        fd.write('window={:d}\n'.format(args.window))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, args.classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
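
The window construction above copies each span in a Python loop. With NumPy 1.20+, `sliding_window_view` builds the same tensor without the loop (a sketch, not from the source):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

x = np.arange(6 * 50).reshape(6, 50)        # 6 paragraphs, 50 tokens each.
window = 3
P = sliding_window_view(x, window, axis=0)  # (4, 50, 3): window axis is last.
P = np.moveaxis(P, -1, 1)                   # (4, 3, 50), matching the loop.
assert np.array_equal(P[0], x[0:3])
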
Example #6
def main():
    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time

    # Load data.
    print('Retrieving labels...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    _, Y, categories, category_levels = \
        bookcave.get_data({'paragraph_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    print('Retrieved {:d} labels.'.format(Y.shape[1]))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    Y_train_T, Y_test_T = train_test_split(Y_T,
                                           test_size=test_size,
                                           random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    for j, category in enumerate(categories):
        levels = category_levels[j]
        y_train = Y_train[j]
        y_test = Y_test[j]
        # Predict the most common class seen in the training data.
        majority_class = np.argmax(np.bincount(y_train, minlength=len(levels)))
        y_pred = [majority_class] * len(y_test)

        base_fname = '{:d}_{:d}'.format(stamp, j)
        logs_path = folders.ensure(
            os.path.join(folders.LOGS_PATH, classifier_name))
        with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            fd.write('HYPERPARAMETERS\n')
            fd.write('\nText\n')
            fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
            fd.write('subset_seed={}\n'.format(str(subset_seed)))
            fd.write('min_len={:d}\n'.format(min_len))
            fd.write('max_len={:d}\n'.format(max_len))
            fd.write('min_tokens={:d}\n'.format(min_tokens))
            fd.write('\nLabels\n')
            fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
            fd.write('return_overall={}\n'.format(return_overall))
            fd.write('\nTraining\n')
            fd.write('test_size={}\n'.format(str(test_size)))
            fd.write('test_random_state={:d}\n'.format(test_random_state))
            fd.write('\nRESULTS\n\n')
            fd.write('Data size: {:d}\n'.format(Y.shape[1]))
            fd.write('Train size: {:d}\n'.format(Y_train.shape[1]))
            fd.write('Test size: {:d}\n'.format(Y_test.shape[1]))
            fd.write('\n')
            evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                   category)

        predictions_path = folders.ensure(
            os.path.join(folders.PREDICTIONS_PATH, classifier_name))
        with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
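
For reference, the baseline's prediction rule in isolation, on a toy label vector:

import numpy as np

y_train = np.array([0, 2, 1, 1, 3, 1])
counts = np.bincount(y_train, minlength=4)  # [1, 3, 1, 1]
majority_class = np.argmax(counts)          # 1
y_pred = [majority_class] * 3               # Predicted for every test instance.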