Ejemplo n.º 1
0
def main():
    """Resize the first cover image of every BookCave book to 256x256."""
    target_size = (256, 256)
    # Request raw image paths without any resizing on load.
    inputs, _, _, _ = bookcave.get_data({'images'}, image_size=None)
    for images in inputs['images']:
        # Only the first image path per book is resized.
        resize_image(images[0], target_size)
def main():
    """Split every book's paragraphs into sentences with a Stanford CoreNLP
    server and write the lowercased sentence tokens, grouped by section, to
    one file per ASIN.  Existing output files are kept unless `force` is set.
    """
    inputs, _, _, _, _, books_df, _, _, _ = bookcave.get_data({'paragraphs'},
                                                              return_meta=True)
    texts = inputs['paragraphs']
    print('All texts: {:d}'.format(len(texts)))

    asins = list(books_df['asin'])
    print('ASINs: {:d}'.format(len(asins)))

    # First, start the Core NLP server:
    # ```
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
    # ```
    nlp_parser = nltk.parse.corenlp.CoreNLPParser()

    if not os.path.exists(folders.AMAZON_KINDLE_SENTENCE_TOKENS_PATH):
        os.mkdir(folders.AMAZON_KINDLE_SENTENCE_TOKENS_PATH)
    force = False
    for text_i, text in enumerate(texts):
        path = os.path.join(folders.AMAZON_KINDLE_SENTENCE_TOKENS_PATH,
                            '{}.txt'.format(asins[text_i]))
        if os.path.exists(path):
            # Skip already-processed books unless forcing a re-run.
            if not force:
                continue
            os.remove(path)

        paragraphs, section_ids, sections = text
        # Tokenize each paragraph into sentences of tokens via the server.
        paragraph_sentence_tokens = []
        for paragraph in paragraphs:
            response = nlp_parser.api_call(
                paragraph.lower(), properties={'annotators': 'tokenize,ssplit'})
            sentence_tokens = []
            for sentence in response['sentences']:
                tokens = [token['originalText'] or token['word']
                          for token in sentence['tokens']]
                sentence_tokens.append(tokens)
            paragraph_sentence_tokens.append(sentence_tokens)
        # Group the tokenized paragraphs by their section.
        section_paragraph_sentence_tokens = [[] for _ in sections]
        for paragraph_i, sentence_tokens in enumerate(paragraph_sentence_tokens):
            section_paragraph_sentence_tokens[section_ids[paragraph_i]].append(
                sentence_tokens)
        paragraph_io.write_formatted_section_paragraph_sentence_tokens(
            section_paragraph_sentence_tokens, path)
def main():
    """Train a per-paragraph ordinal neural classifier on BookCave paragraph
    tokens, evaluate it on a held-out paragraph split, then aggregate the
    paragraph predictions (via max) into book-level labels and report metrics
    both for the books that supplied training paragraphs and for all books.

    NOTE(review): `verbose` and `epochs` are referenced but never defined in
    this function or visible in this file -- presumably module-level globals;
    confirm before running.  `categories_df` is unpacked but unused.
    """
    # Load data.
    if verbose:
        print('\nRetrieving texts...')
    min_len, max_len = 250, 7500
    inputs, Y, categories, category_levels, book_ids, books_df, _, _, categories_df =\
        bookcave.get_data({'paragraph_tokens'},
                          min_len=min_len,
                          max_len=max_len,
                          return_meta=True)
    text_paragraph_tokens, _ = zip(*inputs['paragraph_tokens'])
    if verbose:
        print('Retrieved {:d} texts.'.format(len(text_paragraph_tokens)))

    # Flatten tokens.
    if verbose:
        print('\nFlattening tokens...')
    # Parallel lists: (text_i, paragraph_i) location for every paragraph.
    all_locations = []
    all_tokens = []
    for text_i, paragraph_tokens in enumerate(text_paragraph_tokens):
        for paragraph_i, tokens in enumerate(paragraph_tokens):
            all_locations.append((text_i, paragraph_i))
            all_tokens.append(tokens)
    if verbose:
        print('Paragraphs: {:d}'.format(len(all_tokens)))

    # Tokenize.
    if verbose:
        print('\nTokenizing...')
    max_words = 8192
    tokenizer = Tokenizer(num_words=max_words, oov_token='__UNKNOWN__')
    tokenizer.fit_on_texts(all_tokens)
    if verbose:
        print('Done.')

    # Load embedding matrix.
    if verbose:
        print('\nLoading embedding...')
    embedding_matrix = load_embeddings.load_embedding(tokenizer, folders.EMBEDDING_GLOVE_100_PATH, max_words)
    if verbose:
        print('Done.')

    # Load paragraph labels.
    if verbose:
        print('\nLoading paragraph labels...')
    tokens_min_len = 3
    train_locations = []
    train_tokens = []
    train_paragraph_labels = []
    for text_i, paragraph_tokens in enumerate(text_paragraph_tokens):
        book_id = book_ids[text_i]
        asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
        category_labels = [bookcave.get_labels(asin, category) for category in categories]
        # Skip books that have no paragraph labels for some category.
        if any(labels is None for labels in category_labels):
            continue
        for paragraph_i, tokens in enumerate(paragraph_tokens):
            paragraph_labels = [labels[paragraph_i] for labels in category_labels]
            # -1 appears to mark an unlabeled paragraph -- skip it entirely.
            if any(label == -1 for label in paragraph_labels):
                continue
            # Skip very short paragraphs.
            if len(tokens) < tokens_min_len:
                continue
            train_locations.append((text_i, paragraph_i))
            train_tokens.append(tokens)
            train_paragraph_labels.append(paragraph_labels)
    # Indices of books that contributed at least one labeled paragraph.
    test_text_indices = list({text_i for text_i, _ in train_locations})
    if verbose:
        print('Finished loading labels for {:d} books.'.format(len(test_text_indices)))

    # Split data.
    if verbose:
        print('\nSplitting data into training and test sets...')
    n_tokens = 160  # t
    test_size = .25  # b
    random_state = 1
    train_sequences = tokenizer.texts_to_sequences(train_tokens)
    P = np.array([get_input_array(sequence, n_tokens) for sequence in train_sequences])  # (n, t)
    Q = np.array(train_paragraph_labels)  # (n, C)
    P_train, P_test, Q_train, Q_test = train_test_split(P, Q, test_size=test_size, random_state=random_state)
    if verbose:
        print('Training instances: {:d}'.format(len(P_train)))
        print('Test instances: {:d}'.format(len(P_test)))

    # Create a new model.
    if verbose:
        print('\nCreating model...')
    hidden_size = 64
    dense_size = 32
    embedding_trainable = True
    # Number of maturity levels per category.
    category_k = [len(levels) for levels in category_levels]
    model, weights_fname = create_model(category_k,
                                        n_tokens,
                                        embedding_matrix,
                                        hidden_size,
                                        dense_size,
                                        embedding_trainable=embedding_trainable)
    if verbose:
        print(model.summary())

    # Weight classes inversely proportional to their frequency.
    class_weights = []
    for j in range(Q_train.shape[1]):
        q_train = Q_train[:, j]
        bincount = np.bincount(q_train, minlength=category_k[j])
        # +1 smoothing avoids division by zero for empty classes.
        class_weight = {i: 1 / (count + 1) for i, count in enumerate(bincount)}
        class_weights.append(class_weight)

    # Train on paragraphs.
    batch_size = 32
    optimizer = Adam()
    model.compile(optimizer,
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy'])
    # Multi-hot ordinal encoding: label v becomes v leading 1s over k - 1 slots.
    category_Q_train_ordinal = [ordinal.to_multi_hot_ordinal(Q_train[:, j], k=k)
                                for j, k in enumerate(category_k)]  # (C, (1 - b)*n, k_c - 1)
    _ = model.fit(P_train,
                  category_Q_train_ordinal,
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=verbose,
                  class_weight=class_weights)

    # Evaluate paragraphs.
    category_Q_pred_ordinal = model.predict(P_test)  # (C, b*n, k_c - 1)
    for category_i, category in enumerate(categories):
        print('\nParagraph metrics for category `{}`:'.format(category))
        q_test = Q_test[:, category_i]
        q_pred_ordinal = category_Q_pred_ordinal[category_i]  # (b*n, k_c - 1)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_ordinal)  # (b*n,)
        evaluation.print_metrics(q_test, q_pred)

    # Evaluate books.
    # A book's label is the maximum of its paragraph labels.
    def get_label_from_paragraph_labels(q_pred_):
        return max(q_pred_)

    # Evaluate only books from which the training set of paragraphs came.
    if verbose:
        print('\nEvaluating training set...')
    test_locations = []
    test_tokens = []
    Y_test = Y[:, test_text_indices]
    for location_i, text_i in enumerate(test_text_indices):
        for paragraph_i, tokens in enumerate(text_paragraph_tokens[text_i]):
            test_locations.append((location_i, paragraph_i))
            test_tokens.append(tokens)
    test_sequences = tokenizer.texts_to_sequences(test_tokens)
    X_test = np.array([get_input_array(sequence, n_tokens) for sequence in test_sequences])
    Y_pred_test = predict_book_labels(model, X_test, test_locations, Y_test, get_label_from_paragraph_labels)
    for category_i, category in enumerate(categories):
        print('\nTraining set of books for category `{}`:'.format(category))
        y_test, y_pred_test = Y_test[category_i], Y_pred_test[category_i]
        evaluation.print_metrics(y_test, y_pred_test)

    # Evaluate all books.
    if verbose:
        print('\nEvaluating all books...')
    all_sequences = tokenizer.texts_to_sequences(all_tokens)
    X_all = np.array([get_input_array(sequence, n_tokens) for sequence in all_sequences])
    Y_pred_all = predict_book_labels(model, X_all, all_locations, Y, get_label_from_paragraph_labels)
    for category_i, category in enumerate(categories):
        print('\nAll books for category `{}`:'.format(category))
        y, y_pred_all = Y[category_i], Y_pred_all[category_i]
        evaluation.print_metrics(y, y_pred_all)
Ejemplo n.º 4
0
def train(skip_models=False):
    """Train an ordinal classifier ensemble per category for each of six
    scikit-learn model families on bag-of-words vectorized BookCave texts.

    For every (model, category) pair, writes a hyperparameter/metrics log and
    a predictions file, and (unless `skip_models`) pickles each fitted binary
    classifier of the ordinal ensemble.

    :param skip_models: When True, skip saving fitted classifiers to disk.
    """
    max_words = shared_parameters.TEXT_MAX_WORDS

    # Use the SLURM job ID as the file-name stamp when running under SLURM so
    # outputs are traceable to the job; otherwise fall back to the start time.
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = .1  # NOTE(review): hard-coded override of shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Vectorizing text...')
    # Concatenate each book's paragraph tokens into one flat token list.
    text_tokens = []
    for source_tokens in text_source_tokens:
        all_tokens = []
        for tokens in source_tokens:
            all_tokens.extend(tokens)
        text_tokens.append(all_tokens)
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words,
                                                  remove_stopwords,
                                                  text_tokens=text_tokens)
    X = vectorizer.transform(text_tokens)
    print('Vectorized text with {:d} unique words.'.format(
        len(vectorizer.get_feature_names())))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    # Transpose so rows are books for the split, then transpose back.
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = train_test_split(
        X, Y_T, test_size=test_size, random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    # Factories and names are parallel lists; keep them in the same order.
    create_funcs = [
        create_k_nearest_neighbors, create_logistic_regression,
        create_multi_layer_perceptron, create_multinomial_naive_bayes,
        create_random_forest, create_svm
    ]
    model_names = [
        'k_nearest_neighbors', 'logistic_regression', 'multi_layer_perceptron',
        'multinomial_naive_bayes', 'random_forest', 'svm'
    ]
    for m, create_func in enumerate(create_funcs):
        model_name = model_names[m]
        model_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, model_name))
        print('Training model `{}`...'.format(model_name))
        for j, category in enumerate(categories):
            print('Classifying category `{}`...'.format(category))
            y_train = Y_train[j]  # (n * (1 - b))
            k = len(category_levels[j])
            # Ordinal scheme: presumably k - 1 binary classifiers -- confirm
            # against fit_ordinal/predict_ordinal.
            classifiers = fit_ordinal(create_func, X_train, y_train, k)
            y_pred = predict_ordinal(classifiers, X_test, k)  # (n * b)
            y_test = Y_test[j]

            base_fname = '{:d}_{:d}'.format(stamp, j)
            logs_path = folders.ensure(
                os.path.join(folders.LOGS_PATH, model_name))
            with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                      'w') as fd:
                fd.write('HYPERPARAMETERS\n')
                fd.write('\nText\n')
                fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
                fd.write('subset_seed={}\n'.format(str(subset_seed)))
                fd.write('min_len={:d}\n'.format(min_len))
                fd.write('max_len={:d}\n'.format(max_len))
                fd.write('min_tokens={:d}\n'.format(min_tokens))
                fd.write('\nLabels\n')
                fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
                fd.write('return_overall={}\n'.format(return_overall))
                fd.write('\nVectorization\n')
                fd.write('max_words={:d}\n'.format(max_words))
                fd.write('vectorizer={}\n'.format(
                    vectorizer.__class__.__name__))
                fd.write('\nTraining\n')
                fd.write('test_size={}\n'.format(str(test_size)))
                fd.write('test_random_state={:d}\n'.format(test_random_state))
                fd.write('\nRESULTS\n\n')
                fd.write('Data size: {:d}\n'.format(X.shape[0]))
                fd.write('Train size: {:d}\n'.format(X_train.shape[0]))
                fd.write('Test size: {:d}\n\n'.format(X_test.shape[0]))
                evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                       category)

            predictions_path = folders.ensure(
                os.path.join(folders.PREDICTIONS_PATH, model_name))
            with open(
                    os.path.join(predictions_path,
                                 '{}.txt'.format(base_fname)), 'w') as fd:
                evaluation.write_predictions(y_test, y_pred, fd, category)

            if not skip_models:
                # One pickle file per binary classifier of the ensemble.
                models_path = folders.ensure(
                    os.path.join(model_path, base_fname))
                for i, classifier in enumerate(classifiers):
                    with open(
                            os.path.join(models_path,
                                         'model{:d}.pickle'.format(i)),
                            'wb') as fd:
                        pickle.dump(classifier,
                                    fd,
                                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Done.')
Ejemplo n.º 5
0
def main():
    """Command-line entry point: train a neural network (RNN/CNN/RNN-CNN) to
    classify one maturity category of a book from its paragraph or sentence
    tokens, then log hyperparameters, training history, metrics, and test-set
    predictions to per-classifier files.
    """
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its text.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format('\n  '.join([
                            '{:d} {}'.format(j,
                                             bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--source_mode',
                        default='paragraph',
                        choices=['paragraph', 'sentence'],
                        help='The source of text. Default is `paragraph`.')
    parser.add_argument('--net_mode',
                        default='cnn',
                        choices=['rnn', 'cnn', 'rnncnn'],
                        help='The type of neural network. Default is `cnn`.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    parser.add_argument(
        '--agg_mode',
        default='maxavg',
        choices=['max', 'avg', 'maxavg', 'rnn'],
        help=
        'The way the network will aggregate paragraphs or sentences. Default is `maxavg`.'
    )
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                        'Default is `{}`.'.format(
                            shared_parameters.LABEL_MODE_ORDINAL))
    parser.add_argument(
        '--remove_classes',
        type=str,
        help=
        'Remove classes altogether. Can be used when the minority class is severely tiny. '
        'Like `<class1>[,<class2>,...]` as in `3` or `3,0`. Optional.')
    parser.add_argument(
        '--class_weight_p',
        default=2,
        type=int,
        help='Power with which to scale class weights. Default is 2.')
    parser.add_argument(
        '--embedding_trainable',
        action='store_true',
        help=
        'Flag to allow the model to optimize the word embeddings. Default is False.'
    )
    parser.add_argument(
        '--book_dense_units',
        default='128',
        help=
        'The number of neurons in the final fully-connected layers, comma separated. '
        'Default is `128`.')
    parser.add_argument(
        '--book_dropout',
        default=0.5,
        type=float,
        help=
        'Dropout probability before final classification layer. Default is 0.5.'
    )
    parser.add_argument(
        '--plateau_patience',
        default=16,
        type=int,
        help=
        'Number of epochs to wait before dividing the learning rate by 2. Default is 16.'
    )
    # NOTE(review): the help text below looks copy-pasted from
    # --plateau_patience; this flag actually controls early stopping.
    parser.add_argument(
        '--early_stopping_patience',
        default=32,
        type=int,
        help=
        'Number of epochs to wait before dividing the learning rate by 2. Default is 32.'
    )
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        help='Epochs. Default is 1.')
    parser.add_argument(
        '--save_model',
        action='store_true',
        help='Save the model and its weights. Default is False.')
    parser.add_argument(
        '--note',
        help=
        'An optional note that will be appended to the names of generated files.'
    )
    args = parser.parse_args()

    # Names output folders after the full network configuration.
    classifier_name = '{}_{}_{}_{}'.format(args.source_mode, args.net_mode,
                                           args.agg_mode, args.label_mode)

    # SLURM job ID as file-name stamp when available; else the start time.
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if args.note is not None:
        print('Note: {}'.format(args.note))
        base_fname = '{:d}_{}_{:d}'.format(stamp, args.note,
                                           args.category_index)
    else:
        base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    # Load data.
    print('Retrieving texts...')
    if args.source_mode == 'paragraph':
        source = 'paragraph_tokens'
        min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
        max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    else:  # args.source_mode == 'sentence':
        source = 'sentence_tokens'
        min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
        max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = category_levels[args.category_index]
    k = len(levels)
    # k_train may shrink below if --remove_classes is given.
    k_train = k

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    # Tokens are re-joined on tabs before being fed to the tokenizer.
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        args.source_mode,
        args.remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    if args.source_mode == 'paragraph':
        if not args.remove_stopwords:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
        else:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
    else:  # args.source_mode == 'sentence':
        n_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    # One padded (paragraphs/sentences, n_tokens) array per book.
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)

    # Create model.
    print('Creating model...')
    net_params = dict()
    if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
        # Use the CuDNN-accelerated GRU only when a CUDA GPU is present.
        net_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        net_params['rnn_units'] = 128
        net_params['rnn_l2'] = .001
        net_params['rnn_dense_units'] = 64
        net_params['rnn_dense_activation'] = 'elu'
        net_params['rnn_dense_l2'] = .001
        net_params['rnn_agg'] = 'attention'
    if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
        net_params['cnn_filters'] = 16
        net_params['cnn_filter_sizes'] = [1, 2, 3, 4]
        net_params['cnn_activation'] = 'elu'
        net_params['cnn_l2'] = .001
    agg_params = dict()
    if args.agg_mode == 'rnn':
        agg_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        agg_params['rnn_units'] = 64
        agg_params['rnn_l2'] = .001
    book_dense_units = [
        int(units) for units in args.book_dense_units.split(',')
    ]
    book_dense_activation = LeakyReLU(alpha=.1)
    book_dense_l2 = .001
    book_dropout = args.book_dropout
    model = create_model(n_tokens, embedding_matrix, args.embedding_trainable,
                         args.net_mode, net_params, args.agg_mode, agg_params,
                         book_dense_units, book_dense_activation,
                         book_dense_l2, book_dropout, k, category,
                         args.label_mode)
    # Very small initial learning rate (~1.5e-5).
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    model.compile(optimizer, loss=loss, metrics=[metric])

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=val_size, random_state=val_random_state)
    # Encode labels per the chosen label mode (ordinal/categorical/regression).
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    y_test_transform = shared_parameters.transform_labels(
        y_test, k, args.label_mode)

    # Remove classes from training set, if specified.
    if args.remove_classes is not None:
        # Process in descending order so earlier shifts don't affect later ones.
        remove_classes = sorted(list(
            map(int,
                args.remove_classes.strip().split(','))),
                                reverse=True)
        for class_ in remove_classes:
            # NOTE(review): boolean-mask assignment -- assumes y_train is a
            # numpy array; confirm.
            y_train[y_train >= class_] -= 1
            k_train -= 1

    # Create generators.
    print('Creating data generators...')
    # Batch size 1: each book has a variable number of paragraphs/sentences.
    train_generator = TransformBalancedBatchGenerator(
        np.arange(len(X_train)).reshape((len(X_train), 1)),
        y_train,
        transform_X=transform_X,
        transform_y=transform_y,
        batch_size=1,
        X_data=[np.array([x]) for x in X_train],
        k=k,
        label_mode=args.label_mode)
    val_generator = SingleInstanceBatchGenerator(X_val,
                                                 y_val_transform,
                                                 shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  y_test_transform,
                                                  shuffle=False)

    # Get class weight.
    class_weight = shared_parameters.get_class_weight(k_train,
                                                      args.label_mode,
                                                      p=args.class_weight_p)

    # Train.
    print('Training for up to {:d} epoch{}...'.format(
        args.epochs, 's' if args.epochs != 1 else ''))
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = args.plateau_patience
    early_stopping_patience = args.early_stopping_patience
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    if args.save_model:
        models_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, classifier_name))
        model_path = os.path.join(models_path, '{}.h5'.format(base_fname))
        model_checkpoint = ModelCheckpoint(model_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           mode='min')
        callbacks.append(model_checkpoint)
    else:
        model_path = None
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  verbose=0,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  class_weight=class_weight)
    # NOTE(review): raises TypeError if 'val_loss' is missing from the
    # history (``.get`` would return None); assumes validation always ran.
    epochs_complete = len(history.history.get('val_loss'))

    # Save the history to visualize loss over time.
    print('Saving training history...')
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        # One line per metric: "<name> <v1> <v2> ...".
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))

    # Predict test instances.
    print('Predicting test instances...')
    y_pred_transform = model.predict_generator(test_generator)
    # Decode predictions back to class indices per the label mode.
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        y_pred = ordinal.from_multi_hot_ordinal(y_pred_transform, threshold=.5)
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        y_pred = np.argmax(y_pred_transform, axis=1)
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        y_pred = np.maximum(0, np.minimum(k - 1,
                                          np.round(y_pred_transform * k)))

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH,
                                            classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if args.note is not None:
            fd.write('{}\n\n'.format(args.note))
        fd.write('PARAMETERS\n\n')
        fd.write('category_index={:d}\n'.format(args.category_index))
        fd.write('epochs={:d}\n'.format(args.epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(args.remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        if args.remove_classes is not None:
            fd.write('remove_classes={}\n'.format(args.remove_classes))
        else:
            fd.write('No classes removed.\n')
        fd.write('class_weight_p={:d}\n'.format(args.class_weight_p))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(args.embedding_trainable))
        fd.write('\nModel\n')
        if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
            fd.write('rnn={}\n'.format(net_params['rnn'].__name__))
            fd.write('rnn_units={:d}\n'.format(net_params['rnn_units']))
            fd.write('rnn_l2={}\n'.format(str(net_params['rnn_l2'])))
            fd.write('rnn_dense_units={:d}\n'.format(
                net_params['rnn_dense_units']))
            fd.write('rnn_dense_activation=\'{}\'\n'.format(
                net_params['rnn_dense_activation']))
            fd.write('rnn_dense_l2={}\n'.format(str(
                net_params['rnn_dense_l2'])))
            fd.write('rnn_agg={}\n'.format(net_params['rnn_agg']))
        if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
            fd.write('cnn_filters={:d}\n'.format(net_params['cnn_filters']))
            fd.write('cnn_filter_sizes={}\n'.format(
                str(net_params['cnn_filter_sizes'])))
            fd.write('cnn_activation=\'{}\'\n'.format(
                net_params['cnn_activation']))
            fd.write('cnn_l2={}\n'.format(str(net_params['cnn_l2'])))
        if args.agg_mode == 'rnn':
            fd.write('agg_rnn={}\n'.format(agg_params['rnn'].__name__))
            fd.write('agg_rnn_units={:d}\n'.format(agg_params['rnn_units']))
            fd.write('agg_rnn_l2={}\n'.format(str(agg_params['rnn_l2'])))
        fd.write('book_dense_units={}\n'.format(args.book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={}\n'.format(str(book_dropout)))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={}\n'.format(str(val_size)))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write(
            'early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if model_path is not None:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Epochs completed: {:d}\n'.format(epochs_complete))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
Ejemplo n.º 6
0
def main(argv):
    """Train and evaluate a hierarchical sentence -> paragraph -> book RNN
    classifier for book maturity ratings, then write the training history,
    test predictions, and a log of all (hyper)parameters and results.

    Usage: <steps_per_epoch> <epochs> [note]

    Fix: the "unknown label mode" error messages referred to `1abel_mode`
    (digit one) instead of `label_mode`.
    """
    if len(argv) < 2 or len(argv) > 3:
        raise ValueError('Usage: <steps_per_epoch> <epochs> [note]')
    steps_per_epoch = int(argv[0])
    epochs = int(argv[1])
    note = None
    if len(argv) > 2:
        note = argv[2]

    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.index('.')]

    # Prefer the SLURM job id as the run stamp so cluster runs are traceable.
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if note is not None:
        print('Note: {}'.format(note))
        base_fname = '{:d}_{}'.format(stamp, note)
    else:
        base_fname = format(stamp, 'd')

    # Load data.
    print('Loading data...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
    max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    inputs, Y, categories, category_levels = \
        bookcave.get_data({'sentence_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode)
    text_sentence_tokens, text_section_ids, text_paragraph_ids = zip(
        *inputs['sentence_tokens'])
    print('Retrieved {:d} texts.'.format(len(text_sentence_tokens)))

    # Tokenize. Tokens are joined with `split` so the Tokenizer re-splits on
    # the same character and sees the pre-tokenized words unchanged.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = Tokenizer(num_words=max_words, split=split)
    all_sentences = []
    for sentence_tokens in text_sentence_tokens:
        for tokens in sentence_tokens:
            all_sentences.append(split.join(tokens))
    tokenizer.fit_on_texts(all_sentences)
    print('Done.')

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_sentences = shared_parameters.TEXT_N_SENTENCES
    n_sentence_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    text_sentence_sequences = [
        pad_sequences(tokenizer.texts_to_sequences(
            [split.join(tokens) for tokens in sentence_tokens]),
                      maxlen=n_sentence_tokens,
                      padding=padding,
                      truncating=truncating)
        for sentence_tokens in text_sentence_tokens
    ]
    # Regroup the flat per-text sentence sequences into fixed-size
    # (paragraph, sentence, token) tensors using the (section, paragraph) ids.
    X = []
    for text_i, sentence_sequences in enumerate(text_sentence_sequences):
        section_ids = text_section_ids[text_i]
        paragraph_ids = text_paragraph_ids[text_i]
        n_paragraphs = len(
            np.unique(list(
                zip(text_section_ids[text_i], text_paragraph_ids[text_i])),
                      axis=0))
        x = np.zeros((n_paragraphs, n_sentences,
                      n_sentence_tokens))  # [paragraph_i][sentence_i][token_i]
        paragraph_i = 0
        sentence_i = 0
        last_section_paragraph_id = None
        for sequence_i, sentence_sequence in enumerate(sentence_sequences):
            section_paragraph_id = (section_ids[sequence_i],
                                    paragraph_ids[sequence_i])
            if last_section_paragraph_id is not None and section_paragraph_id != last_section_paragraph_id:
                paragraph_i += 1
                sentence_i = 0
            # Sentences beyond `n_sentences` in a paragraph are dropped.
            if sentence_i < n_sentences:
                x[paragraph_i, sentence_i] = sentence_sequence
            sentence_i += 1
            last_section_paragraph_id = section_paragraph_id
        X.append(x)
    print('Done.')

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)
    print('Done.')

    # Create model.
    print('Creating model...')
    category_k = [len(levels) for levels in category_levels]
    embedding_trainable = False
    sent_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    sent_rnn_units = 128
    sent_rnn_l2 = .01
    sent_dense_units = 64
    sent_dense_activation = 'elu'
    sent_dense_l2 = .01
    para_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    para_rnn_units = 128
    para_rnn_l2 = .01
    para_dense_units = 64
    para_dense_activation = 'elu'
    para_dense_l2 = .01
    book_dense_units = 128
    book_dense_activation = tf.keras.layers.LeakyReLU(alpha=.1)
    book_dense_l2 = .01
    book_dropout = .5
    label_mode = shared_parameters.LABEL_MODE_ORDINAL
    sentence_encoder, paragraph_encoder, model = create_model(
        n_sentences, n_sentence_tokens, embedding_matrix, embedding_trainable,
        sent_rnn, sent_rnn_units, sent_rnn_l2, sent_dense_units,
        sent_dense_activation, sent_dense_l2, para_rnn, para_rnn_units,
        para_rnn_l2, para_dense_units, para_dense_activation, para_dense_l2,
        book_dense_units, book_dense_activation, book_dense_l2, book_dropout,
        category_k, categories, label_mode)
    lr = 2**-16  # ~1.5e-5
    optimizer = Adam(lr=lr)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    model.compile(optimizer, loss=loss, metrics=[metric])
    print('Done.')

    # Split data set. Labels are kept transposed to (n, c) during splitting so
    # rows stay aligned with X, then transposed back to (c, n).
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = \
        train_test_split(X, Y_T, test_size=test_size, random_state=test_random_state)
    X_train, X_val, Y_train_T, Y_val_T = \
        train_test_split(X_train, Y_train_T, test_size=val_size, random_state=val_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b) * (1 - v))
    Y_val = Y_val_T.transpose()  # (c, n * (1 - b) * v)
    Y_test = Y_test_T.transpose()  # (c, n * b)

    # Transform labels based on the label mode. Test labels are deliberately
    # left untransformed; predictions are mapped back before evaluation.
    Y_train = shared_parameters.transform_labels(Y_train, category_k,
                                                 label_mode)
    Y_val = shared_parameters.transform_labels(Y_val, category_k, label_mode)

    # Calculate class weights.
    use_class_weights = True
    class_weight_f = 'inverse'
    if use_class_weights:
        category_class_weights = shared_parameters.get_category_class_weights(
            Y_train, label_mode, f=class_weight_f)
    else:
        category_class_weights = None

    # Create generators.
    shuffle = True
    train_generator = SingleInstanceBatchGenerator(X_train,
                                                   Y_train,
                                                   shuffle=shuffle)
    val_generator = SingleInstanceBatchGenerator(X_val, Y_val, shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  Y_test,
                                                  shuffle=False)

    # Train.
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    plateau_patience = 3
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    early_stopping_patience = 6
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch if steps_per_epoch > 0 else None,
        epochs=epochs,
        validation_data=val_generator,
        class_weight=category_class_weights,
        callbacks=callbacks)

    # Save the history to visualize loss over time.
    print('Saving training history...')
    if not os.path.exists(folders.HISTORY_PATH):
        os.mkdir(folders.HISTORY_PATH)
    history_path = os.path.join(folders.HISTORY_PATH, classifier_name)
    if not os.path.exists(history_path):
        os.mkdir(history_path)
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))
    print('Done.')

    # Predict test instances and map the raw model outputs back to integer
    # class labels according to the label mode.
    print('Predicting test instances...')
    Y_pred = model.predict_generator(test_generator)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        Y_pred = [
            ordinal.from_multi_hot_ordinal(y, threshold=.5) for y in Y_pred
        ]
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        Y_pred = [np.argmax(y, axis=1) for y in Y_pred]
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        Y_pred = [
            np.maximum(0, np.minimum(k - 1, np.round(Y_pred[i] * k)))
            for i, k in enumerate(category_k)
        ]
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    print('Done.')

    # Save model.
    save_model = False
    if save_model:
        models_path = os.path.join(folders.MODELS_PATH, classifier_name)
        label_mode_path = os.path.join(models_path, label_mode)
        model_path = os.path.join(label_mode_path, '{}.h5'.format(base_fname))
        print('Saving model to `{}`...'.format(model_path))
        if not os.path.exists(folders.MODELS_PATH):
            os.mkdir(folders.MODELS_PATH)
        if not os.path.exists(models_path):
            os.mkdir(models_path)
        if not os.path.exists(label_mode_path):
            os.mkdir(label_mode_path)
        model.save(model_path)
        print('Done.')
    else:
        model_path = None

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')

    if not os.path.exists(folders.LOGS_PATH):
        os.mkdir(folders.LOGS_PATH)
    logs_path = os.path.join(folders.LOGS_PATH, classifier_name)
    if not os.path.exists(logs_path):
        os.mkdir(logs_path)
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if note is not None:
            fd.write('Note: {}\n\n'.format(note))
        fd.write('PARAMETERS\n\n')
        fd.write('steps_per_epoch={:d}\n'.format(steps_per_epoch))
        fd.write('epochs={:d}\n'.format(epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_sentences={:d}\n'.format(n_sentences))
        fd.write('n_sentence_tokens={:d}\n'.format(n_sentence_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(embedding_trainable))
        fd.write('\nModel\n')
        fd.write('sent_rnn={}\n'.format(sent_rnn.__name__))
        fd.write('sent_rnn_units={:d}\n'.format(sent_rnn_units))
        fd.write('sent_rnn_l2={}\n'.format(str(sent_rnn_l2)))
        fd.write('sent_dense_units={:d}\n'.format(sent_dense_units))
        fd.write(
            'sent_dense_activation=\'{}\'\n'.format(sent_dense_activation))
        fd.write('sent_dense_l2={}\n'.format(str(sent_dense_l2)))
        fd.write('para_rnn={}\n'.format(para_rnn.__name__))
        fd.write('para_rnn_units={:d}\n'.format(para_rnn_units))
        fd.write('para_rnn_l2={}\n'.format(str(para_rnn_l2)))
        fd.write('para_dense_units={:d}\n'.format(para_dense_units))
        fd.write(
            'para_dense_activation=\'{}\'\n'.format(para_dense_activation))
        fd.write('para_dense_l2={}\n'.format(str(para_dense_l2)))
        fd.write('book_dense_units={:d}\n'.format(book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={:.1f}\n'.format(book_dropout))
        fd.write('label_mode={}\n'.format(label_mode))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={:.2f}\n'.format(test_size))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={:.2f}\n'.format(val_size))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('use_class_weights={}\n'.format(use_class_weights))
        if use_class_weights:
            fd.write('class_weight_f={}\n'.format(class_weight_f))
        fd.write('shuffle={}\n'.format(shuffle))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write(
            'early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if save_model:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(Y_test, Y_pred, fd, categories)

    if not os.path.exists(folders.PREDICTIONS_PATH):
        os.mkdir(folders.PREDICTIONS_PATH)
    predictions_path = os.path.join(folders.PREDICTIONS_PATH, classifier_name)
    if not os.path.exists(predictions_path):
        os.mkdir(predictions_path)
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(Y_test, Y_pred, fd, categories)

    print('Done.')
def main(min_len=shared_parameters.DATA_PARAGRAPH_MIN_LEN,
         max_len=shared_parameters.DATA_PARAGRAPH_MAX_LEN,
         min_tokens=shared_parameters.DATA_MIN_TOKENS,
         categories_mode=shared_parameters.DATA_CATEGORIES_MODE,
         return_overall=shared_parameters.DATA_RETURN_OVERALL,
         min_gram=1,
         max_gram=1,
         max_features=8192,
         top_n=256,
         force=False):
    """Find the `top_n` terms most correlated with each maturity category
    (chi-squared test over TF-IDF features), write them to disk, and return
    the number of texts used.
    """
    # Get data and flatten each text's paragraphs into one token list.
    inputs, Y, categories, _ = \
        bookcave.get_data({'paragraph_tokens'},
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_all_tokens = [
        [token for tokens in paragraph_tokens for token in tokens]
        for paragraph_tokens, _ in inputs['paragraph_tokens']
    ]

    # Vectorize. The input is already tokenized, so both the preprocessor
    # and the tokenizer are no-ops.
    def passthrough(value):
        return value

    vectorizer = TfidfVectorizer(preprocessor=passthrough,
                                 tokenizer=passthrough,
                                 analyzer='word',
                                 token_pattern=None,
                                 ngram_range=(min_gram, max_gram),
                                 max_features=max_features,
                                 norm='l2',
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(text_all_tokens)
    features = vectorizer.get_feature_names()

    # See Multi Class Text Classification article:
    # https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
    category_term_scores = []
    for category_i in range(len(categories)):
        scores, _ = chi2(X, Y[category_i])
        ranked = np.argsort(scores)[::-1]
        category_term_scores.append([(features[index], scores[index])
                                     for index in ranked[:top_n]])

    # Save.
    for directory in (folders.LOGS_PATH, folders.CORRELATED_WORDS_PATH):
        if not os.path.exists(directory):
            os.mkdir(directory)
    size = len(text_all_tokens)
    for category, term_scores in zip(categories, category_term_scores):
        write_formatted_term_scores(category,
                                    term_scores,
                                    size,
                                    min_gram,
                                    max_gram,
                                    max_features,
                                    force=force)
    return size
def main(argv):
    """Train word vectors on book paragraphs and save them to disk.

    Usage: <model_name> <vector_size> <max_vocab_size> <epochs> <window> [min_count]

    Fix: previously the corpus was passed to the `Word2Vec` constructor,
    which immediately runs a full training pass with gensim's *default*
    epoch count; `model.train(...)` then trained the corpus a second time.
    The model is now created empty, the vocabulary is built explicitly, and
    training runs exactly once with the requested number of epochs.
    """
    if len(argv) < 5 or len(argv) > 6:
        raise ValueError(
            'Usage: <model_name> <vector_size> <max_vocab_size> <epochs> <window> [min_count]'
        )
    model_name = argv[0]
    vector_size = int(argv[1])
    max_vocab_size = int(argv[2])  # The maximum size of the vocabulary.
    epochs = int(argv[3])
    window = int(argv[4])
    min_count = 4  # Ignore tokens occurring fewer than `min_count` times.
    if len(argv) > 5:
        min_count = int(argv[5])

    # Load data.
    print('Retrieving texts...')
    subset_ratio = 1.
    subset_seed = 1
    min_len = 256
    max_len = 4096
    min_tokens = 6
    inputs, Y, categories, category_levels = \
        bookcave.get_data({'tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens)
    text_paragraph_tokens, _ = zip(*inputs['tokens'])
    print('Retrieved {:d} texts.'.format(len(text_paragraph_tokens)))

    # Flatten all texts into one list of token lists (one per paragraph).
    print('Tokenizing...')
    all_paragraph_tokens = []
    for paragraph_tokens in text_paragraph_tokens:
        all_paragraph_tokens.extend(paragraph_tokens)
    print('Done.')

    # Create the model *without* a corpus so that no implicit training
    # happens in the constructor; only build the vocabulary here.
    print('Creating model...')
    workers = 8
    if model_name == 'word2vec':
        model = Word2Vec(size=vector_size,
                         window=window,
                         min_count=min_count,
                         max_vocab_size=max_vocab_size,
                         workers=workers)
        model.build_vocab(all_paragraph_tokens)
    else:
        raise ValueError('Unknown model name: `{}`'.format(model_name))
    print('Done.')

    # Train word vectors -- exactly once, with the requested epoch count.
    print('Training model...')
    print_callback = PrintCallback(epochs)
    model.train(all_paragraph_tokens,
                total_examples=len(all_paragraph_tokens),
                epochs=epochs,
                callbacks=[print_callback])
    print('Done.')

    # Save.
    print('Saving vectors...')
    if not os.path.exists(folders.VECTORS_PATH):
        os.mkdir(folders.VECTORS_PATH)
    fname = '{}_{:d}_{:d}d_{:d}w_{:d}min_{:d}e.wv'.format(
        model_name, len(text_paragraph_tokens), vector_size, window, min_count,
        epochs)
    vectors_path = os.path.join(folders.VECTORS_PATH, fname)
    model.wv.save(vectors_path)
    print('Saved vectors to `{}`.'.format(vectors_path))
def main():
    """Predict paragraph-level maturity labels with a model trained on books
    and evaluate them against the human paragraph annotations.
    """
    category_lines = '\n  '.join(
        '{:d} {}'.format(j, bookcave.CATEGORY_NAMES[category])
        for j, category in enumerate(bookcave.CATEGORIES))
    parser = ArgumentParser(
        description='Use a model trained on books to predict the categorical maturity levels of paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format(category_lines))
    parser.add_argument('name',
                        help='Model base file name.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    args = parser.parse_args()

    # Load data.
    source = 'paragraph_tokens'
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, _, categories, category_levels, book_ids, books_df, _, _, _ = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall,
                          return_meta=True)
    text_source_tokens, *_ = zip(*inputs[source])

    # Collect the paragraphs that have complete human labels in every
    # (non-overall) category.
    predict_locations = []
    predict_tokens = []
    predict_source_labels = []
    for text_i, source_tokens in enumerate(text_source_tokens):
        book_id = book_ids[text_i]
        asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
        category_labels = [bookcave.get_labels(asin, category)
                           for category in categories[:bookcave.CATEGORY_INDEX_OVERALL]]
        if any(labels is None for labels in category_labels):
            continue
        for source_i, tokens in enumerate(source_tokens):
            source_labels = [labels[source_i] for labels in category_labels]
            if -1 in source_labels:
                continue
            predict_locations.append((text_i, source_i))
            predict_tokens.append(tokens)
            predict_source_labels.append(source_labels)
    Q_true = np.zeros((len(categories), len(predict_source_labels)), dtype=np.int32)
    for i, source_labels in enumerate(predict_source_labels):
        Q_true[:len(source_labels), i] = source_labels
    if return_overall:
        Q_true[bookcave.CATEGORY_INDEX_OVERALL] = bookcave.get_y_overall(Q_true, categories_mode=categories_mode)

    # Fit the tokenizer on every paragraph, labeled or not.
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = Tokenizer(num_words=max_words, split=split)
    all_locations = [(text_i, source_i)
                     for text_i, source_tokens in enumerate(text_source_tokens)
                     for source_i, _ in enumerate(source_tokens)]
    all_sources = [split.join(tokens)
                   for source_tokens in text_source_tokens
                   for tokens in source_tokens]
    tokenizer.fit_on_texts(all_sources)

    n_tokens = (shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
                if args.remove_stopwords
                else shared_parameters.TEXT_N_PARAGRAPH_TOKENS)
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    P_predict = np.array([
        get_input_sequence([tokens], tokenizer, n_tokens, padding, truncating)
        for tokens in predict_tokens
    ])

    # Evaluate.
    model_path = os.path.join(folders.MODELS_PATH, 'paragraph_maxavg_ordinal',
                              '{}.h5'.format(args.name))
    model = load_model(model_path)
    q_true = Q_true[args.category_index]
    evaluate_model(model,
                   P_predict,
                   q_true,
                   categories[args.category_index])
Ejemplo n.º 10
0
def main():
    """Classify whole books with a paragraph-level model by sliding a window
    of `window` paragraphs over each book, predicting a label per window, and
    taking the maximum windowed label as the book's label. Writes a log of
    parameters/metrics and the per-book predictions.
    """
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('classifier_name', help='The name of the classifier.')
    parser.add_argument('model_file_name',
                        help='The file name of the model to load.')
    parser.add_argument('window', type=int, help='The paragraph window size.')
    args = parser.parse_args()
    source_mode = 'paragraph'
    remove_stopwords = False

    start_time = int(time.time())
    # Strip the extension from the model file name; the category index is
    # taken from its last character.
    # NOTE(review): this assumes a single-digit category index encoded as the
    # final character of the base name -- confirm the file-naming convention.
    model_file_base_name = args.model_file_name[:args.model_file_name.
                                                rindex('.')]
    category_index = int(model_file_base_name[-1])
    base_fname = '{}_{:d}w'.format(model_file_base_name, args.window)

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    # Keep only the token lists (first element of each (tokens, ...) tuple).
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[category_index]
    category = categories[category_index]

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    # `split` is the delimiter used to re-join pre-tokenized words below.
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        source_mode,
        remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences: one padded (n_paragraphs, n_tokens) array per book.
    print('Converting texts to sequences...')
    n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load model.
    print('Loading model...')
    model_path = os.path.join(folders.MODELS_PATH, args.classifier_name,
                              args.model_file_name)
    if 'rnn' in args.classifier_name:
        # Since `keras` was used with the custom layer, we have to reload it with `keras`.
        # https://github.com/keras-team/keras/issues/10907
        custom_objects = {'AttentionWithContext': AttentionWithContext}
        model = keras.models.load_model(model_path,
                                        custom_objects=custom_objects)
    else:
        model = tf.keras.models.load_model(model_path)

    # Split data set. Only the test split is evaluated; the same seed as
    # training keeps the split consistent across scripts.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    _, X_test, _, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=test_random_state)

    # Predict instances: slide a window of `args.window` paragraphs over each
    # book, predict an ordinal label per window, and take the maximum.
    # NOTE(review): a book with fewer than `window` paragraphs makes the first
    # dimension of `P` negative and `np.zeros` will raise -- confirm all test
    # books are long enough.
    print('Predicting labels...')
    y_pred = np.zeros((len(X_test), ), dtype=np.int32)
    for i, x in enumerate(X_test):
        P = np.zeros((len(x) - args.window + 1, args.window, *x.shape[1:]))
        for w in range(len(P)):
            P[w] = x[w:w + args.window]
        q_pred_transform = model.predict(P)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_transform, threshold=.5)
        # The book's label is the most mature (maximum) windowed prediction.
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, args.classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('PARAMETERS\n\n')
        fd.write('classifier_name={}\n'.format(args.classifier_name))
        fd.write('model_file_name={}\n'.format(args.model_file_name))
        fd.write('window={:d}\n'.format(args.window))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, args.classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
Ejemplo n.º 11
0
def main():
    """Train and evaluate a CNN that predicts a book's maturity level from its cover image.

    Command-line arguments:
        category_index: Index of the maturity category to predict.
        --label_mode: How labels are encoded for training/evaluation
            (ordinal, categorical, or regression). Default is ordinal.
    """
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its cover.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n  {}'.format('\n  '.join([
                            '{:d} {}'.format(j,
                                             bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                        'Default is `{}`.'.format(
                            shared_parameters.LABEL_MODE_ORDINAL))
    args = parser.parse_args()

    classifier_name = 'cover_net'
    start_time = int(time.time())
    # Prefer the SLURM job ID as a stable run stamp when running on a cluster;
    # otherwise fall back to the start time.
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    images_size = (256, 256)

    # Here, `Y` has shape (n, m) where `n` is the number of books and `m` is the number of maturity categories.
    inputs, Y, categories, levels = \
        bookcave.get_data({'images'},
                          subset_ratio=1/4,  # shared_parameters.
                          subset_seed=1,
                          image_size=images_size)
    image_paths = inputs['images']

    # Reduce the labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = levels[args.category_index]
    k = len(levels)

    # Split data set: hold out a test set, then carve a validation set out of
    # the remaining training data.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    image_paths_train, image_paths_test, y_train, y_test = \
        train_test_split(image_paths, y, test_size=test_size, random_state=test_random_state)
    image_paths_train, image_paths_val, y_train, y_val = \
        train_test_split(image_paths_train, y_train, test_size=val_size, random_state=val_random_state)
    X_val = image_paths_to_tensors(image_paths_val)
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    X_test = image_paths_to_tensors(image_paths_test)

    # Train.
    optimizer = Adam(lr=2**-10)
    model = get_model(images_size, k, optimizer)
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = 10
    early_stopping_patience = 20
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    train_generator = TransformBalancedBatchGenerator(
        image_paths_train,
        y_train,
        transform_X=image_paths_to_tensors,
        transform_y=transform_y,
        batch_size=32,
        k=k,
        # Use the label mode chosen on the command line so training labels are
        # encoded the same way as the validation labels above (which already
        # use `args.label_mode`). Previously this was hard-coded to ordinal,
        # silently ignoring `--label_mode` during training.
        label_mode=args.label_mode)
    val_generator = SimpleBatchGenerator(X_val, y_val_transform, batch_size=32)
    history = model.fit_generator(train_generator,
                                  callbacks=callbacks,
                                  epochs=1000,
                                  validation_data=val_generator)

    y_pred_ordinal = model.predict(X_test)

    # Convert the ordinal one-hot encoding back to discrete labels.
    # NOTE(review): this decode assumes ordinal-encoded predictions; if
    # `--label_mode` is categorical or regression, a different decode would be
    # required — TODO confirm.
    y_pred = ordinal.from_multi_hot_ordinal(y_pred_ordinal, threshold=0.5)

    print('`{}`:'.format(category))
    print('Accuracy: {:.3%}'.format(accuracy_score(y_test, y_pred)))
    confusion = confusion_matrix(y_test, y_pred)
    print(confusion)

    # Dump the per-epoch training history, one metric per line.
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(key,
                                      ' '.join(str(value)
                                               for value in values)))
Ejemplo n.º 12
0
def main():
    """Run previously fitted baseline classifiers over sliding paragraph windows.

    Loads pickled per-level classifiers for each category, vectorizes every
    window of `window` consecutive paragraphs from the test-set books, and
    delegates per-category inference (and result writing) to `infer`.
    """
    parser = argparse.ArgumentParser(
        description='Run fitted baseline classifiers over paragraphs.')
    parser.add_argument('model_name', help='Name of the algorithm.')
    parser.add_argument('stamp', help='Time stamp of saved models.')
    parser.add_argument('window', type=int, help='The paragraph window size.')
    parser.add_argument('--j',
                        type=int,
                        help='Only operate on the specified category index.')
    args = parser.parse_args()

    max_words = shared_parameters.TEXT_MAX_WORDS

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Loading vectorizer...')
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words, remove_stopwords)

    # Split data set; only the test portion is needed for inference.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    _, text_source_tokens_test, _, Y_test_T = \
        train_test_split(text_source_tokens, Y_T, test_size=test_size, random_state=test_random_state)
    Y_test = Y_test_T.transpose()  # (c, n * b)

    # Separate books into paragraph windows: each window is the concatenation
    # of the tokens of `window` consecutive paragraphs.
    print('Creating paragraph windows...')
    text_P = list()
    for source_tokens in text_source_tokens_test:
        token_windows = [
            [token
             for tokens in source_tokens[start:start + args.window]
             for token in tokens]
            for start in range(len(source_tokens) - args.window + 1)
        ]
        text_P.append(vectorizer.transform(token_windows))

    # Load classifiers: one pickled binary model per ordinal threshold,
    # grouped by category.
    print('Loading classifiers...')
    category_classifiers = list()
    for j, levels in enumerate(category_levels):
        category_part = '{}_{:d}'.format(args.stamp, j)
        classifiers = list()
        for threshold_i in range(len(levels) - 1):
            model_path = os.path.join(
                folders.MODELS_PATH, args.model_name, category_part,
                'model{:d}.pickle'.format(threshold_i))
            with open(model_path, 'rb') as fd:
                classifiers.append(pickle.load(fd))
        category_classifiers.append(classifiers)

    # Infer from paragraphs — either every category, or just the one given
    # by `--j`.
    if args.j is None:
        for j, y_test in enumerate(Y_test):
            k = len(category_levels[j])
            models = category_classifiers[j]
            infer(j, y_test, args.model_name, args.stamp, args.window,
                  categories, k, models, text_P, subset_ratio, subset_seed,
                  min_len, max_len, min_tokens, categories_mode,
                  return_overall, max_words, vectorizer, test_size,
                  test_random_state, len(text_source_tokens),
                  len(text_source_tokens_test))
    else:
        k = len(category_levels[args.j])
        models = category_classifiers[args.j]
        infer(args.j, Y_test[args.j], args.model_name, args.stamp, args.window,
              categories, k, models, text_P, subset_ratio, subset_seed,
              min_len, max_len, min_tokens, categories_mode, return_overall,
              max_words, vectorizer, test_size, test_random_state,
              len(text_source_tokens), len(text_source_tokens_test))

    print('Done.')
Ejemplo n.º 13
0
def main():
    """Evaluate a majority-class baseline for every maturity category.

    For each category, predicts the most common training label for every test
    book, then writes metrics and predictions using the shared evaluation
    helpers.
    """
    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]

    start_time = int(time.time())
    # Prefer the SLURM job ID as a stable run stamp when available.
    stamp = int(os.environ['SLURM_JOB_ID']) \
        if 'SLURM_JOB_ID' in os.environ else start_time

    # Load data (labels only).
    print('Retrieving labels...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    _, Y, categories, category_levels = \
        bookcave.get_data({'paragraph_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    print('Retrieved {:d} labels.'.format(Y.shape[1]))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    Y_train_T, Y_test_T = train_test_split(Y_T,
                                           test_size=test_size,
                                           random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    for j, category in enumerate(categories):
        levels = category_levels[j]
        y_train = Y_train[j]
        y_test = Y_test[j]
        # Predict the most common class seen in the training data.
        majority_label = np.argmax(
            np.bincount(y_train, minlength=len(levels)))
        y_pred = [majority_label] * len(y_test)

        base_fname = '{:d}_{:d}'.format(stamp, j)

        # Write hyperparameters, sizes, and evaluation metrics.
        logs_path = folders.ensure(
            os.path.join(folders.LOGS_PATH, classifier_name))
        log_file = os.path.join(logs_path, '{}.txt'.format(base_fname))
        with open(log_file, 'w') as fd:
            fd.write('HYPERPARAMETERS\n')
            fd.write('\nText\n')
            fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
            fd.write('subset_seed={}\n'.format(str(subset_seed)))
            fd.write('min_len={:d}\n'.format(min_len))
            fd.write('max_len={:d}\n'.format(max_len))
            fd.write('min_tokens={:d}\n'.format(min_tokens))
            fd.write('\nLabels\n')
            fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
            fd.write('return_overall={}\n'.format(return_overall))
            fd.write('\nTraining\n')
            fd.write('test_size={}\n'.format(str(test_size)))
            fd.write('test_random_state={:d}\n'.format(test_random_state))
            fd.write('\nRESULTS\n\n')
            fd.write('Data size: {:d}\n'.format(Y.shape[1]))
            fd.write('Train size: {:d}\n'.format(Y_train.shape[1]))
            fd.write('Test size: {:d}\n'.format(Y_test.shape[1]))
            fd.write('\n')
            evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                   category)

        # Write per-book predictions.
        predictions_path = folders.ensure(
            os.path.join(folders.PREDICTIONS_PATH, classifier_name))
        pred_file = os.path.join(predictions_path,
                                 '{}.txt'.format(base_fname))
        with open(pred_file, 'w') as fd:
            evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')