Example #1
def infer(j, y_test, model_name, stamp, window, categories, k, models, text_P,
          subset_ratio, subset_seed, min_len, max_len, min_tokens,
          categories_mode, return_overall, max_words, vectorizer, test_size,
          test_random_state, data_len, test_len):
    category = categories[j]
    print('Predicting category `{}`...'.format(category))
    y_pred = np.zeros((len(y_test), ), dtype=np.int32)
    for i in range(len(y_test)):
        P = text_P[i]
        q_pred = base.predict_ordinal(models, P, k)
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    base_fname = '{}_{:d}_{:d}w'.format(stamp, j, window)
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, model_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('HYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nVectorization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('vectorizer={}\n'.format(vectorizer.__class__.__name__))
        fd.write('\nTraining\n')
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(data_len))
        fd.write('Test size: {:d}\n\n'.format(test_len))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, model_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)
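
`base.predict_ordinal` is not shown in this example. A minimal sketch of a common decoding for this kind of ordinal ensemble, assuming `models` holds k - 1 binary classifiers where classifier t predicts the indicator y > t (a Frank & Hall-style decomposition; the names here are illustrative, not the repository's code):

import numpy as np

def predict_ordinal_sketch(models, X, k):
    # One classifier per threshold t = 0, ..., k - 2.
    assert len(models) == k - 1
    # `decisions` has shape (n_samples, k - 1) with entries in {0, 1}.
    decisions = np.column_stack([clf.predict(X) for clf in models])
    # The predicted label is the number of thresholds exceeded,
    # a value in {0, ..., k - 1}.
    return decisions.sum(axis=1)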
Example #2
def train(skip_models=False):
    max_words = shared_parameters.TEXT_MAX_WORDS

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = .1  # overrides shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Vectorizing text...')
    text_tokens = []
    for source_tokens in text_source_tokens:
        all_tokens = []
        for tokens in source_tokens:
            all_tokens.extend(tokens)
        text_tokens.append(all_tokens)
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words,
                                                  remove_stopwords,
                                                  text_tokens=text_tokens)
    X = vectorizer.transform(text_tokens)
    print('Vectorized text with {:d} unique words.'.format(
        len(vectorizer.get_feature_names())))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = train_test_split(
        X, Y_T, test_size=test_size, random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    create_funcs = [
        create_k_nearest_neighbors, create_logistic_regression,
        create_multi_layer_perceptron, create_multinomial_naive_bayes,
        create_random_forest, create_svm
    ]
    model_names = [
        'k_nearest_neighbors', 'logistic_regression', 'multi_layer_perceptron',
        'multinomial_naive_bayes', 'random_forest', 'svm'
    ]
    for m, create_func in enumerate(create_funcs):
        model_name = model_names[m]
        model_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, model_name))
        print('Training model `{}`...'.format(model_name))
        for j, category in enumerate(categories):
            print('Classifying category `{}`...'.format(category))
            y_train = Y_train[j]  # (n * (1 - b))
            k = len(category_levels[j])
            classifiers = fit_ordinal(create_func, X_train, y_train, k)
            y_pred = predict_ordinal(classifiers, X_test, k)  # (n * b)
            y_test = Y_test[j]

            base_fname = '{:d}_{:d}'.format(stamp, j)
            logs_path = folders.ensure(
                os.path.join(folders.LOGS_PATH, model_name))
            with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                      'w') as fd:
                fd.write('HYPERPARAMETERS\n')
                fd.write('\nText\n')
                fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
                fd.write('subset_seed={}\n'.format(str(subset_seed)))
                fd.write('min_len={:d}\n'.format(min_len))
                fd.write('max_len={:d}\n'.format(max_len))
                fd.write('min_tokens={:d}\n'.format(min_tokens))
                fd.write('\nLabels\n')
                fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
                fd.write('return_overall={}\n'.format(return_overall))
                fd.write('\nVectorization\n')
                fd.write('max_words={:d}\n'.format(max_words))
                fd.write('vectorizer={}\n'.format(
                    vectorizer.__class__.__name__))
                fd.write('\nTraining\n')
                fd.write('test_size={}\n'.format(str(test_size)))
                fd.write('test_random_state={:d}\n'.format(test_random_state))
                fd.write('\nRESULTS\n\n')
                fd.write('Data size: {:d}\n'.format(X.shape[0]))
                fd.write('Train size: {:d}\n'.format(X_train.shape[0]))
                fd.write('Test size: {:d}\n\n'.format(X_test.shape[0]))
                evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                       category)

            predictions_path = folders.ensure(
                os.path.join(folders.PREDICTIONS_PATH, model_name))
            with open(
                    os.path.join(predictions_path,
                                 '{}.txt'.format(base_fname)), 'w') as fd:
                evaluation.write_predictions(y_test, y_pred, fd, category)

            if not skip_models:
                models_path = folders.ensure(
                    os.path.join(model_path, base_fname))
                for i, classifier in enumerate(classifiers):
                    with open(
                            os.path.join(models_path,
                                         'model{:d}.pickle'.format(i)),
                            'wb') as fd:
                        pickle.dump(classifier,
                                    fd,
                                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Done.')
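
`fit_ordinal` above decomposes the k-level ordinal target into k - 1 binary problems, which `predict_ordinal` later recombines. A minimal sketch of that decomposition, assuming `create_func` returns an unfitted scikit-learn-style estimator (the repository's own implementation is not shown):

def fit_ordinal_sketch(create_func, X_train, y_train, k):
    # Classifier t learns the indicator [y > t], for t = 0, ..., k - 2.
    classifiers = []
    for t in range(k - 1):
        clf = create_func()
        clf.fit(X_train, (y_train > t).astype(int))
        classifiers.append(clf)
    return classifiers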
Example #3
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its text.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format('\n  '.join([
                            '{:d} {}'.format(j,
                                             bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--source_mode',
                        default='paragraph',
                        choices=['paragraph', 'sentence'],
                        help='The source of text. Default is `paragraph`.')
    parser.add_argument('--net_mode',
                        default='cnn',
                        choices=['rnn', 'cnn', 'rnncnn'],
                        help='The type of neural network. Default is `cnn`.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    parser.add_argument(
        '--agg_mode',
        default='maxavg',
        choices=['max', 'avg', 'maxavg', 'rnn'],
        help=
        'The way the network will aggregate paragraphs or sentences. Default is `maxavg`.'
    )
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                        'Default is `{}`.'.format(
                            shared_parameters.LABEL_MODE_ORDINAL))
    parser.add_argument(
        '--remove_classes',
        type=str,
        help=
        'Remove classes altogether. Useful when a minority class is extremely small. '
        'Format: `<class1>[,<class2>,...]`, e.g. `3` or `3,0`. Optional.')
    parser.add_argument(
        '--class_weight_p',
        default=2,
        type=int,
        help='Power with which to scale class weights. Default is 2.')
    parser.add_argument(
        '--embedding_trainable',
        action='store_true',
        help=
        'Flag to allow the model to optimize the word embeddings. Default is False.'
    )
    parser.add_argument(
        '--book_dense_units',
        default='128',
        help=
        'The number of neurons in the final fully-connected layers, comma separated. '
        'Default is `128`.')
    parser.add_argument(
        '--book_dropout',
        default=0.5,
        type=float,
        help=
        'Dropout probability before final classification layer. Default is 0.5.'
    )
    parser.add_argument(
        '--plateau_patience',
        default=16,
        type=int,
        help=
        'Number of epochs to wait before dividing the learning rate by 2. Default is 16.'
    )
    parser.add_argument(
        '--early_stopping_patience',
        default=32,
        type=int,
        help=
        'Number of epochs without improvement to wait before stopping training early. '
        'Default is 32.'
    )
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        help='Epochs. Default is 1.')
    parser.add_argument(
        '--save_model',
        action='store_true',
        help='Save the model and its weights. Default is False.')
    parser.add_argument(
        '--note',
        help=
        'An optional note that will be appended to the names of generated files.'
    )
    args = parser.parse_args()

    classifier_name = '{}_{}_{}_{}'.format(args.source_mode, args.net_mode,
                                           args.agg_mode, args.label_mode)

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if args.note is not None:
        print('Note: {}'.format(args.note))
        base_fname = '{:d}_{}_{:d}'.format(stamp, args.note,
                                           args.category_index)
    else:
        base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    # Load data.
    print('Retrieving texts...')
    if args.source_mode == 'paragraph':
        source = 'paragraph_tokens'
        min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
        max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    else:  # args.source_mode == 'sentence':
        source = 'sentence_tokens'
        min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
        max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = category_levels[args.category_index]
    k = len(levels)
    k_train = k

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        args.source_mode,
        args.remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    if args.source_mode == 'paragraph':
        if not args.remove_stopwords:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
        else:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
    else:  # args.source_mode == 'sentence':
        n_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)

    # Create model.
    print('Creating model...')
    net_params = dict()
    if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
        net_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        net_params['rnn_units'] = 128
        net_params['rnn_l2'] = .001
        net_params['rnn_dense_units'] = 64
        net_params['rnn_dense_activation'] = 'elu'
        net_params['rnn_dense_l2'] = .001
        net_params['rnn_agg'] = 'attention'
    if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
        net_params['cnn_filters'] = 16
        net_params['cnn_filter_sizes'] = [1, 2, 3, 4]
        net_params['cnn_activation'] = 'elu'
        net_params['cnn_l2'] = .001
    agg_params = dict()
    if args.agg_mode == 'rnn':
        agg_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        agg_params['rnn_units'] = 64
        agg_params['rnn_l2'] = .001
    book_dense_units = [
        int(units) for units in args.book_dense_units.split(',')
    ]
    book_dense_activation = LeakyReLU(alpha=.1)
    book_dense_l2 = .001
    book_dropout = args.book_dropout
    model = create_model(n_tokens, embedding_matrix, args.embedding_trainable,
                         args.net_mode, net_params, args.agg_mode, agg_params,
                         book_dense_units, book_dense_activation,
                         book_dense_l2, book_dropout, k, category,
                         args.label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    model.compile(optimizer, loss=loss, metrics=[metric])

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=val_size, random_state=val_random_state)
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    y_test_transform = shared_parameters.transform_labels(
        y_test, k, args.label_mode)

    # Remove classes from training set, if specified.
    if args.remove_classes is not None:
        remove_classes = sorted(map(int,
                                    args.remove_classes.strip().split(',')),
                                reverse=True)
        for class_ in remove_classes:
            y_train[y_train >= class_] -= 1
            k_train -= 1

    # Create generators.
    print('Creating data generators...')
    train_generator = TransformBalancedBatchGenerator(
        np.arange(len(X_train)).reshape((len(X_train), 1)),
        y_train,
        transform_X=transform_X,
        transform_y=transform_y,
        batch_size=1,
        X_data=[np.array([x]) for x in X_train],
        k=k,
        label_mode=args.label_mode)
    val_generator = SingleInstanceBatchGenerator(X_val,
                                                 y_val_transform,
                                                 shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  y_test_transform,
                                                  shuffle=False)

    # Get class weight.
    class_weight = shared_parameters.get_class_weight(k_train,
                                                      args.label_mode,
                                                      p=args.class_weight_p)

    # Train.
    print('Training for up to {:d} epoch{}...'.format(
        args.epochs, 's' if args.epochs != 1 else ''))
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = args.plateau_patience
    early_stopping_patience = args.early_stopping_patience
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    if args.save_model:
        models_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, classifier_name))
        model_path = os.path.join(models_path, '{}.h5'.format(base_fname))
        model_checkpoint = ModelCheckpoint(model_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           mode='min')
        callbacks.append(model_checkpoint)
    else:
        model_path = None
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  verbose=0,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  class_weight=class_weight)
    epochs_complete = len(history.history.get('val_loss'))

    # Save the history to visualize loss over time.
    print('Saving training history...')
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))

    # Predict test instances.
    print('Predicting test instances...')
    y_pred_transform = model.predict_generator(test_generator)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        y_pred = ordinal.from_multi_hot_ordinal(y_pred_transform, threshold=.5)
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        y_pred = np.argmax(y_pred_transform, axis=1)
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        y_pred = np.maximum(0, np.minimum(k - 1,
                                          np.round(y_pred_transform * k)))

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH,
                                            classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if args.note is not None:
            fd.write('{}\n\n'.format(args.note))
        fd.write('PARAMETERS\n\n')
        fd.write('category_index={:d}\n'.format(args.category_index))
        fd.write('epochs={:d}\n'.format(args.epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(args.remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        if args.remove_classes is not None:
            fd.write('remove_classes={}\n'.format(args.remove_classes))
        else:
            fd.write('No classes removed.\n')
        fd.write('class_weight_p={:d}\n'.format(args.class_weight_p))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(args.embedding_trainable))
        fd.write('\nModel\n')
        if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
            fd.write('rnn={}\n'.format(net_params['rnn'].__name__))
            fd.write('rnn_units={:d}\n'.format(net_params['rnn_units']))
            fd.write('rnn_l2={}\n'.format(str(net_params['rnn_l2'])))
            fd.write('rnn_dense_units={:d}\n'.format(
                net_params['rnn_dense_units']))
            fd.write('rnn_dense_activation=\'{}\'\n'.format(
                net_params['rnn_dense_activation']))
            fd.write('rnn_dense_l2={}\n'.format(str(
                net_params['rnn_dense_l2'])))
            fd.write('rnn_agg={}\n'.format(net_params['rnn_agg']))
        if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
            fd.write('cnn_filters={:d}\n'.format(net_params['cnn_filters']))
            fd.write('cnn_filter_sizes={}\n'.format(
                str(net_params['cnn_filter_sizes'])))
            fd.write('cnn_activation=\'{}\'\n'.format(
                net_params['cnn_activation']))
            fd.write('cnn_l2={}\n'.format(str(net_params['cnn_l2'])))
        if args.agg_mode == 'rnn':
            fd.write('agg_rnn={}\n'.format(agg_params['rnn'].__name__))
            fd.write('agg_rnn_units={:d}\n'.format(agg_params['rnn_units']))
            fd.write('agg_rnn_l2={}\n'.format(str(agg_params['rnn_l2'])))
        fd.write('book_dense_units={}\n'.format(args.book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={}\n'.format(str(book_dropout)))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={}\n'.format(str(val_size)))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write(
            'early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if model_path is not None:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Epochs completed: {:d}\n'.format(epochs_complete))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
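
The ordinal label mode above depends on `shared_parameters.transform_labels` and `ordinal.from_multi_hot_ordinal`, neither of which is shown. A minimal sketch of the usual multi-hot ordinal encoding and its thresholded decoding (an assumption about their behavior, not the repository's code):

import numpy as np

def to_multi_hot_ordinal(y, k):
    # Encode a label y in {0, ..., k - 1} as k - 1 indicators [y > t]:
    # e.g. k=4, y=2 -> [1., 1., 0.].
    y = np.asarray(y)
    return (y[:, None] > np.arange(k - 1)[None, :]).astype(np.float32)

def from_multi_hot_ordinal_sketch(y_transform, threshold=.5):
    # Decode by counting the leading entries above the threshold.
    y_pred = []
    for row in y_transform:
        label = 0
        while label < len(row) and row[label] > threshold:
            label += 1
        y_pred.append(label)
    return np.array(y_pred)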
Example #4
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('classifier_name', help='The name of the classifier.')
    parser.add_argument('model_file_name',
                        help='The file name of the model to load.')
    parser.add_argument('window', type=int, help='The paragraph window size.')
    args = parser.parse_args()
    source_mode = 'paragraph'
    remove_stopwords = False

    start_time = int(time.time())
    model_file_base_name = \
        args.model_file_name[:args.model_file_name.rindex('.')]
    # The category index is assumed to be the last character of the model
    # file base name (e.g. `1234_3` -> category 3).
    category_index = int(model_file_base_name[-1])
    base_fname = '{}_{:d}w'.format(model_file_base_name, args.window)

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[category_index]
    category = categories[category_index]

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        source_mode,
        remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load model.
    print('Loading model...')
    model_path = os.path.join(folders.MODELS_PATH, args.classifier_name,
                              args.model_file_name)
    if 'rnn' in args.classifier_name:
        # Since `keras` was used with the custom layer, we have to reload it with `keras`.
        # https://github.com/keras-team/keras/issues/10907
        custom_objects = {'AttentionWithContext': AttentionWithContext}
        model = keras.models.load_model(model_path,
                                        custom_objects=custom_objects)
    else:
        model = tf.keras.models.load_model(model_path)

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    _, X_test, _, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)

    # Predict instances.
    print('Predicting labels...')
    y_pred = np.zeros((len(X_test), ), dtype=np.int32)
    for i, x in enumerate(X_test):
        P = np.zeros((len(x) - args.window + 1, args.window, *x.shape[1:]))
        for w in range(len(P)):
            P[w] = x[w:w + args.window]
        q_pred_transform = model.predict(P)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_transform, threshold=.5)
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, args.classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('PARAMETERS\n\n')
        fd.write('classifier_name={}\n'.format(args.classifier_name))
        fd.write('model_file_name={}\n'.format(args.model_file_name))
        fd.write('window={:d}\n'.format(args.window))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, args.classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
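
The prediction loop above slides a window of `args.window` consecutive paragraphs over each book and takes the maximum per-window label. A tiny self-contained demonstration of the window construction (values are illustrative only):

import numpy as np

x = np.arange(5 * 4).reshape(5, 4)  # 5 paragraphs, 4 token ids each
window = 3
P = np.stack([x[w:w + window] for w in range(len(x) - window + 1)])
print(P.shape)  # (3, 3, 4): 5 - 3 + 1 = 3 overlapping windows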
Example #5
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its cover.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format('\n  '.join([
                            '{:d} {}'.format(j,
                                             bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                        'Default is `{}`.'.format(
                            shared_parameters.LABEL_MODE_ORDINAL))
    args = parser.parse_args()

    classifier_name = 'cover_net'
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    images_size = (256, 256)

    # Here, `Y` has shape (m, n) where `m` is the number of maturity categories and `n` is the number of books.
    inputs, Y, categories, levels = \
        bookcave.get_data({'images'},
                          subset_ratio=1/4,  # overrides shared_parameters.DATA_SUBSET_RATIO
                          subset_seed=1,
                          image_size=images_size)
    image_paths = inputs['images']

    # Reduce the labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = levels[args.category_index]
    k = len(levels)

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    image_paths_train, image_paths_test, y_train, y_test = \
        train_test_split(image_paths, y, test_size=test_size, random_state=test_random_state)
    image_paths_train, image_paths_val, y_train, y_val = \
        train_test_split(image_paths_train, y_train, test_size=val_size, random_state=val_random_state)
    X_val = image_paths_to_tensors(image_paths_val)
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    X_test = image_paths_to_tensors(image_paths_test)

    # Train.
    optimizer = Adam(lr=2**-10)
    model = get_model(images_size, k, optimizer)
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = 10
    early_stopping_patience = 20
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    train_generator = TransformBalancedBatchGenerator(
        image_paths_train,
        y_train,
        transform_X=image_paths_to_tensors,
        transform_y=transform_y,
        batch_size=32,
        k=k,
        label_mode=shared_parameters.LABEL_MODE_ORDINAL)
    val_generator = SimpleBatchGenerator(X_val, y_val_transform, batch_size=32)
    history = model.fit_generator(train_generator,
                                  callbacks=callbacks,
                                  epochs=1000,
                                  validation_data=val_generator)

    y_pred_ordinal = model.predict(X_test)

    # Convert the ordinal one-hot encoding back to discrete labels.
    y_pred = ordinal.from_multi_hot_ordinal(y_pred_ordinal, threshold=0.5)

    print('`{}`:'.format(category))
    print('Accuracy: {:.3%}'.format(accuracy_score(y_test, y_pred)))
    confusion = confusion_matrix(y_test, y_pred)
    print(confusion)

    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))
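
`TransformBalancedBatchGenerator` is not shown here. A minimal sketch of the class-balancing idea its name suggests, assuming it samples classes uniformly and then instances within each class (an assumption, not the repository's code):

import numpy as np

def balanced_batch_indices(y, batch_size, rng=None):
    # Draw a batch whose class distribution is uniform regardless of
    # how skewed the labels in `y` are.
    rng = rng or np.random.default_rng()
    classes = np.unique(y)
    per_class = {c: np.flatnonzero(y == c) for c in classes}
    chosen = rng.choice(classes, size=batch_size)
    return np.array([rng.choice(per_class[c]) for c in chosen])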
Example #6
def main():
    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time

    # Load data.
    print('Retrieving labels...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    _, Y, categories, category_levels = \
        bookcave.get_data({'paragraph_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    print('Retrieved {:d} labels.'.format(Y.shape[1]))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    Y_train_T, Y_test_T = train_test_split(Y_T,
                                           test_size=test_size,
                                           random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    for j, category in enumerate(categories):
        levels = category_levels[j]
        y_train = Y_train[j]
        y_test = Y_test[j]
        # Predict the most common class seen in the training data.
        most_common = np.argmax(np.bincount(y_train, minlength=len(levels)))
        y_pred = [most_common] * len(y_test)

        base_fname = '{:d}_{:d}'.format(stamp, j)
        logs_path = folders.ensure(
            os.path.join(folders.LOGS_PATH, classifier_name))
        with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            fd.write('HYPERPARAMETERS\n')
            fd.write('\nText\n')
            fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
            fd.write('subset_seed={}\n'.format(str(subset_seed)))
            fd.write('min_len={:d}\n'.format(min_len))
            fd.write('max_len={:d}\n'.format(max_len))
            fd.write('min_tokens={:d}\n'.format(min_tokens))
            fd.write('\nLabels\n')
            fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
            fd.write('return_overall={}\n'.format(return_overall))
            fd.write('\nTraining\n')
            fd.write('test_size={}\n'.format(str(test_size)))
            fd.write('test_random_state={:d}\n'.format(test_random_state))
            fd.write('\nRESULTS\n\n')
            fd.write('Data size: {:d}\n'.format(Y.shape[1]))
            fd.write('Train size: {:d}\n'.format(Y_train.shape[1]))
            fd.write('Test size: {:d}\n'.format(Y_test.shape[1]))
            fd.write('\n')
            evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                   category)

        predictions_path = folders.ensure(
            os.path.join(folders.PREDICTIONS_PATH, classifier_name))
        with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
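
For reference, the per-category baseline above (always predict the most frequent training class) matches scikit-learn's most-frequent dummy classifier; the features are ignored, so placeholder inputs suffice:

import numpy as np
from sklearn.dummy import DummyClassifier

y_train = np.array([0, 0, 1, 2, 0])
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(np.zeros((len(y_train), 1)), y_train)
print(baseline.predict(np.zeros((3, 1))))  # [0 0 0]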