def infer(j, y_test, model_name, stamp, window, categories, k, models,
          text_P, subset_ratio, subset_seed, min_len, max_len, min_tokens,
          categories_mode, return_overall, max_words, vectorizer, test_size,
          test_random_state, data_len, test_len):
    category = categories[j]
    print('Predicting category `{}`...'.format(category))
    y_pred = np.zeros((len(y_test),), dtype=np.int32)
    for i in range(len(y_test)):
        P = text_P[i]
        q_pred = base.predict_ordinal(models, P, k)
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    base_fname = '{}_{:d}_{:d}w'.format(stamp, j, window)
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, model_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('HYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nVectorization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('vectorizer={}\n'.format(vectorizer.__class__.__name__))
        fd.write('\nTraining\n')
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(data_len))
        fd.write('Test size: {:d}\n\n'.format(test_len))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, model_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)
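
# A small, self-contained illustration of the labeling rule in `infer` above:
# the book's label is the most mature (maximum) label predicted for any of its
# paragraph windows. The values below are made up for the example; in `infer`,
# `q_pred` comes from `base.predict_ordinal`.
def _demo_max_window_rule():
    q_pred = [0, 0, 2, 1]  # hypothetical per-window ordinal labels
    assert max(q_pred) == 2  # one explicit window is enough to label the book
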
def train(skip_models=False):
    max_words = shared_parameters.TEXT_MAX_WORDS

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = .1  # shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Vectorizing text...')
    text_tokens = []
    for source_tokens in text_source_tokens:
        all_tokens = []
        for tokens in source_tokens:
            all_tokens.extend(tokens)
        text_tokens.append(all_tokens)
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words,
                                                  remove_stopwords,
                                                  text_tokens=text_tokens)
    X = vectorizer.transform(text_tokens)
    print('Vectorized text with {:d} unique words.'.format(
        len(vectorizer.get_feature_names())))

    # Split the data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = train_test_split(
        X, Y_T, test_size=test_size, random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    create_funcs = [
        create_k_nearest_neighbors,
        create_logistic_regression,
        create_multi_layer_perceptron,
        create_multinomial_naive_bayes,
        create_random_forest,
        create_svm
    ]
    model_names = [
        'k_nearest_neighbors',
        'logistic_regression',
        'multi_layer_perceptron',
        'multinomial_naive_bayes',
        'random_forest',
        'svm'
    ]
    for m, create_func in enumerate(create_funcs):
        model_name = model_names[m]
        model_path = folders.ensure(os.path.join(folders.MODELS_PATH, model_name))
        print('Training model `{}`...'.format(model_name))
        for j, category in enumerate(categories):
            print('Classifying category `{}`...'.format(category))
            y_train = Y_train[j]  # (n * (1 - b))
            k = len(category_levels[j])
            # Train k - 1 binary classifiers, one per ordinal threshold.
            classifiers = fit_ordinal(create_func, X_train, y_train, k)
            y_pred = predict_ordinal(classifiers, X_test, k)  # (n * b)
            y_test = Y_test[j]

            base_fname = '{:d}_{:d}'.format(stamp, j)
            logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, model_name))
            with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
                fd.write('HYPERPARAMETERS\n')
                fd.write('\nText\n')
                fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
                fd.write('subset_seed={}\n'.format(str(subset_seed)))
                fd.write('min_len={:d}\n'.format(min_len))
                fd.write('max_len={:d}\n'.format(max_len))
                fd.write('min_tokens={:d}\n'.format(min_tokens))
                fd.write('\nLabels\n')
                fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
                fd.write('return_overall={}\n'.format(return_overall))
                fd.write('\nVectorization\n')
                fd.write('max_words={:d}\n'.format(max_words))
                fd.write('vectorizer={}\n'.format(vectorizer.__class__.__name__))
                fd.write('\nTraining\n')
                fd.write('test_size={}\n'.format(str(test_size)))
                fd.write('test_random_state={:d}\n'.format(test_random_state))
                fd.write('\nRESULTS\n\n')
                fd.write('Data size: {:d}\n'.format(X.shape[0]))
                fd.write('Train size: {:d}\n'.format(X_train.shape[0]))
                fd.write('Test size: {:d}\n\n'.format(X_test.shape[0]))
                evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

            predictions_path = folders.ensure(
                os.path.join(folders.PREDICTIONS_PATH, model_name))
            with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
                      'w') as fd:
                evaluation.write_predictions(y_test, y_pred, fd, category)

            if not skip_models:
                models_path = folders.ensure(os.path.join(model_path, base_fname))
                for i, classifier in enumerate(classifiers):
                    with open(os.path.join(models_path,
                                           'model{:d}.pickle'.format(i)), 'wb') as fd:
                        pickle.dump(classifier, fd,
                                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Done.')
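
# A hedged sketch of the `fit_ordinal`/`predict_ordinal` pair called in
# `train` above, assuming they implement the usual Frank & Hall ordinal
# decomposition: binary classifier t answers "is the label greater than t?",
# and summing the k - 1 votes recovers a label in [0, k - 1]. The names and
# signatures mirror the calls above, but these bodies are illustrative, not
# the project's actual implementation; in particular, `create_func()` is
# assumed to return a fresh scikit-learn-style estimator when called with no
# arguments.
import numpy as np


def _sketch_fit_ordinal(create_func, X_train, y_train, k):
    classifiers = []
    for t in range(k - 1):
        classifier = create_func()
        # Binary target: does this book's label exceed threshold t?
        classifier.fit(X_train, (y_train > t).astype(np.int32))
        classifiers.append(classifier)
    return classifiers


def _sketch_predict_ordinal(classifiers, X_test, k):
    # votes[:, t] is 1 where classifier t predicts label > t.
    votes = np.column_stack([c.predict(X_test) for c in classifiers])
    return votes.sum(axis=1)  # per-instance labels in [0, k - 1]
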
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its text.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n {}'.format('\n '.join([
                            '{:d} {}'.format(j, bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--source_mode',
                        default='paragraph',
                        choices=['paragraph', 'sentence'],
                        help='The source of text. Default is `paragraph`.')
    parser.add_argument('--net_mode',
                        default='cnn',
                        choices=['rnn', 'cnn', 'rnncnn'],
                        help='The type of neural network. Default is `cnn`.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    parser.add_argument('--agg_mode',
                        default='maxavg',
                        choices=['max', 'avg', 'maxavg', 'rnn'],
                        help='The way the network will aggregate paragraphs or '
                             'sentences. Default is `maxavg`.')
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                             'Default is `{}`.'.format(
                                 shared_parameters.LABEL_MODE_ORDINAL))
    parser.add_argument('--remove_classes',
                        type=str,
                        help='Remove classes altogether. Can be used when the '
                             'minority class is severely tiny. '
                             'Like `<class1>[,<class2>,...]` as in `3` or `3,0`. '
                             'Optional.')
    parser.add_argument('--class_weight_p',
                        default=2,
                        type=int,
                        help='Power with which to scale class weights. Default is 2.')
    parser.add_argument('--embedding_trainable',
                        action='store_true',
                        help='Flag to allow the model to optimize the word '
                             'embeddings. Default is False.')
    parser.add_argument('--book_dense_units',
                        default='128',
                        help='The number of neurons in the final fully-connected '
                             'layers, comma separated. Default is `128`.')
    parser.add_argument('--book_dropout',
                        default=0.5,
                        type=float,
                        help='Dropout probability before final classification '
                             'layer. Default is 0.5.')
    parser.add_argument('--plateau_patience',
                        default=16,
                        type=int,
                        help='Number of epochs to wait before dividing the '
                             'learning rate by 2. Default is 16.')
    parser.add_argument('--early_stopping_patience',
                        default=32,
                        type=int,
                        help='Number of epochs to wait before stopping training '
                             'early. Default is 32.')
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        help='Epochs. Default is 1.')
    parser.add_argument('--save_model',
                        action='store_true',
                        help='Save the model and its weights. Default is False.')
    parser.add_argument('--note',
                        help='An optional note that will be appended to the '
                             'names of generated files.')
    args = parser.parse_args()

    classifier_name = '{}_{}_{}_{}'.format(args.source_mode, args.net_mode,
                                           args.agg_mode, args.label_mode)

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    if args.note is not None:
        print('Note: {}'.format(args.note))
        base_fname = '{:d}_{}_{:d}'.format(stamp, args.note, args.category_index)
    else:
        base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    # Load data.
    print('Retrieving texts...')
    if args.source_mode == 'paragraph':
        source = 'paragraph_tokens'
        min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
        max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    else:  # args.source_mode == 'sentence':
        source = 'sentence_tokens'
        min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
        max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = category_levels[args.category_index]
    k = len(levels)
    k_train = k

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        args.source_mode,
        args.remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    if args.source_mode == 'paragraph':
        if not args.remove_stopwords:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
        else:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
    else:  # args.source_mode == 'sentence':
        n_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)

    # Create model.
    print('Creating model...')
    net_params = dict()
    if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
        # Use the CuDNN-accelerated GRU when a GPU is available.
        net_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        net_params['rnn_units'] = 128
        net_params['rnn_l2'] = .001
        net_params['rnn_dense_units'] = 64
        net_params['rnn_dense_activation'] = 'elu'
        net_params['rnn_dense_l2'] = .001
        net_params['rnn_agg'] = 'attention'
    if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
        net_params['cnn_filters'] = 16
        net_params['cnn_filter_sizes'] = [1, 2, 3, 4]
        net_params['cnn_activation'] = 'elu'
        net_params['cnn_l2'] = .001
    agg_params = dict()
    if args.agg_mode == 'rnn':
        agg_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        agg_params['rnn_units'] = 64
        agg_params['rnn_l2'] = .001
    book_dense_units = [
        int(units) for units in args.book_dense_units.split(',')
    ]
    book_dense_activation = LeakyReLU(alpha=.1)
    book_dense_l2 = .001
    book_dropout = args.book_dropout
    model = create_model(n_tokens, embedding_matrix, args.embedding_trainable,
                         args.net_mode, net_params, args.agg_mode, agg_params,
                         book_dense_units, book_dense_activation,
                         book_dense_l2, book_dropout, k, category,
                         args.label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    model.compile(optimizer, loss=loss, metrics=[metric])

    # Split the data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size,
                         random_state=test_random_state)
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=val_size,
                         random_state=val_random_state)
    y_val_transform = shared_parameters.transform_labels(y_val, k,
                                                         args.label_mode)
    y_test_transform = shared_parameters.transform_labels(y_test, k,
                                                          args.label_mode)

    # Remove classes from the training set, if specified.
    if args.remove_classes is not None:
        remove_classes = sorted(
            list(map(int, args.remove_classes.strip().split(','))),
            reverse=True)
        for class_ in remove_classes:
            y_train[y_train >= class_] -= 1
            k_train -= 1

    # Create generators.
    print('Creating data generators...')
    train_generator = TransformBalancedBatchGenerator(
        np.arange(len(X_train)).reshape((len(X_train), 1)),
        y_train,
        transform_X=transform_X,
        transform_y=transform_y,
        batch_size=1,
        X_data=[np.array([x]) for x in X_train],
        k=k,
        label_mode=args.label_mode)
    val_generator = SingleInstanceBatchGenerator(X_val,
                                                 y_val_transform,
                                                 shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  y_test_transform,
                                                  shuffle=False)

    # Get class weight.
    class_weight = shared_parameters.get_class_weight(k_train,
                                                      args.label_mode,
                                                      p=args.class_weight_p)

    # Train.
    print('Training for up to {:d} epoch{}...'.format(
        args.epochs, 's' if args.epochs != 1 else ''))
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = args.plateau_patience
    early_stopping_patience = args.early_stopping_patience
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    if args.save_model:
        models_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, classifier_name))
        model_path = os.path.join(models_path, '{}.h5'.format(base_fname))
        model_checkpoint = ModelCheckpoint(model_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           mode='min')
        callbacks.append(model_checkpoint)
    else:
        model_path = None
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  verbose=0,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  class_weight=class_weight)
    epochs_complete = len(history.history.get('val_loss'))

    # Save the history to visualize loss over time.
    print('Saving training history...')
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)), 'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(
                key, ' '.join(str(value) for value in values)))

    # Predict test instances.
    print('Predicting test instances...')
    y_pred_transform = model.predict_generator(test_generator)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        y_pred = ordinal.from_multi_hot_ordinal(y_pred_transform, threshold=.5)
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        y_pred = np.argmax(y_pred_transform, axis=1)
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        # Clamp rounded regression outputs to valid labels in [0, k - 1].
        y_pred = np.maximum(0, np.minimum(k - 1,
                                          np.round(y_pred_transform * k)))

    # Calculate the elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        if args.note is not None:
            fd.write('{}\n\n'.format(args.note))
        fd.write('PARAMETERS\n\n')
        fd.write('category_index={:d}\n'.format(args.category_index))
        fd.write('epochs={:d}\n'.format(args.epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(args.remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        if args.remove_classes is not None:
            fd.write('remove_classes={}\n'.format(args.remove_classes))
        else:
            fd.write('No classes removed.\n')
        fd.write('class_weight_p={:d}\n'.format(args.class_weight_p))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(args.embedding_trainable))
        fd.write('\nModel\n')
        if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
            fd.write('rnn={}\n'.format(net_params['rnn'].__name__))
            fd.write('rnn_units={:d}\n'.format(net_params['rnn_units']))
            fd.write('rnn_l2={}\n'.format(str(net_params['rnn_l2'])))
            fd.write('rnn_dense_units={:d}\n'.format(
                net_params['rnn_dense_units']))
            fd.write('rnn_dense_activation=\'{}\'\n'.format(
                net_params['rnn_dense_activation']))
            fd.write('rnn_dense_l2={}\n'.format(str(net_params['rnn_dense_l2'])))
            fd.write('rnn_agg={}\n'.format(net_params['rnn_agg']))
        if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
            fd.write('cnn_filters={:d}\n'.format(net_params['cnn_filters']))
            fd.write('cnn_filter_sizes={}\n'.format(
                str(net_params['cnn_filter_sizes'])))
            fd.write('cnn_activation=\'{}\'\n'.format(
                net_params['cnn_activation']))
            fd.write('cnn_l2={}\n'.format(str(net_params['cnn_l2'])))
        if args.agg_mode == 'rnn':
            fd.write('agg_rnn={}\n'.format(agg_params['rnn'].__name__))
            fd.write('agg_rnn_units={:d}\n'.format(agg_params['rnn_units']))
            fd.write('agg_rnn_l2={}\n'.format(str(agg_params['rnn_l2'])))
        fd.write('book_dense_units={}\n'.format(args.book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={}\n'.format(str(book_dropout)))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={}\n'.format(str(val_size)))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write('early_stopping_patience={:d}\n'.format(early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if model_path is not None:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Epochs completed: {:d}\n'.format(epochs_complete))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
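
# A hedged sketch of `ordinal.from_multi_hot_ordinal` as called above,
# assuming the standard inverse of the multi-hot ordinal encoding: position t
# of a prediction holds P(label > t), and the decoded label is the number of
# leading positions that clear the threshold. Illustrative only; the real
# function lives in the `ordinal` module.
def _sketch_from_multi_hot_ordinal(y_transform, threshold=.5):
    labels = []
    for probs in y_transform:
        label = 0
        for p in probs:
            if p < threshold:
                break  # stop at the first threshold that is not passed
            label += 1
        labels.append(label)
    return np.array(labels, dtype=np.int32)
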
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('classifier_name', help='The name of the classifier.')
    parser.add_argument('model_file_name',
                        help='The file name of the model to load.')
    parser.add_argument('window', type=int, help='The paragraph window size.')
    args = parser.parse_args()

    source_mode = 'paragraph'
    remove_stopwords = False
    start_time = int(time.time())
    model_file_base_name = args.model_file_name[:args.model_file_name.rindex('.')]
    # The category index is encoded as the last character of the model file name.
    category_index = int(model_file_base_name[-1])
    base_fname = '{}_{:d}w'.format(model_file_base_name, args.window)

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[category_index]
    category = categories[category_index]

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        source_mode,
        remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load the model.
    print('Loading model...')
    model_path = os.path.join(folders.MODELS_PATH, args.classifier_name,
                              args.model_file_name)
    if 'rnn' in args.classifier_name:
        # Since `keras` was used with the custom layer, we have to reload it
        # with `keras`. https://github.com/keras-team/keras/issues/10907
        custom_objects = {'AttentionWithContext': AttentionWithContext}
        model = keras.models.load_model(model_path,
                                        custom_objects=custom_objects)
    else:
        model = tf.keras.models.load_model(model_path)

    # Split the data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    _, X_test, _, y_test = \
        train_test_split(X, y, test_size=test_size,
                         random_state=test_random_state)

    # Predict instances.
    print('Predicting labels...')
    y_pred = np.zeros((len(X_test),), dtype=np.int32)
    for i, x in enumerate(X_test):
        # Stack every run of `window` consecutive paragraphs into one instance.
        P = np.zeros((len(x) - args.window + 1, args.window, *x.shape[1:]))
        for w in range(len(P)):
            P[w] = x[w:w + args.window]
        q_pred_transform = model.predict(P)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_transform, threshold=.5)
        # Label the book by its most mature window.
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    # Calculate the elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, args.classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('PARAMETERS\n\n')
        fd.write('classifier_name={}\n'.format(args.classifier_name))
        fd.write('model_file_name={}\n'.format(args.model_file_name))
        fd.write('window={:d}\n'.format(args.window))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, args.classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
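
# A quick shape check for the sliding-window construction in `main` above,
# using small hypothetical numbers: a book with 5 paragraphs, each padded to
# n_tokens=8, and window=3 yields 5 - 3 + 1 = 3 overlapping windows of 3
# consecutive paragraphs each.
def _demo_window_shapes():
    x = np.zeros((5, 8))  # (paragraphs, n_tokens)
    window = 3
    P = np.zeros((len(x) - window + 1, window, *x.shape[1:]))
    for w in range(len(P)):
        P[w] = x[w:w + window]
    assert P.shape == (3, 3, 8)
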
def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its cover.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n {}'.format('\n '.join([
                            '{:d} {}'.format(j, bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                             'Default is `{}`.'.format(
                                 shared_parameters.LABEL_MODE_ORDINAL))
    args = parser.parse_args()

    classifier_name = 'cover_net'
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    images_size = (256, 256)
    # Here, `Y` has one row per maturity category and one column per book,
    # so indexing by `args.category_index` below selects one category's labels.
    inputs, Y, categories, levels = \
        bookcave.get_data({'images'},
                          subset_ratio=1/4,  # shared_parameters.
                          subset_seed=1,
                          image_size=images_size)
    image_paths = inputs['images']

    # Reduce the labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = levels[args.category_index]
    k = len(levels)

    # Split the data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    image_paths_train, image_paths_test, y_train, y_test = \
        train_test_split(image_paths, y, test_size=test_size,
                         random_state=test_random_state)
    image_paths_train, image_paths_val, y_train, y_val = \
        train_test_split(image_paths_train, y_train, test_size=val_size,
                         random_state=val_random_state)
    X_val = image_paths_to_tensors(image_paths_val)
    y_val_transform = shared_parameters.transform_labels(y_val, k,
                                                         args.label_mode)
    X_test = image_paths_to_tensors(image_paths_test)

    # Train.
    optimizer = Adam(lr=2**-10)
    model = get_model(images_size, k, optimizer)
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = 10
    early_stopping_patience = 20
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    train_generator = TransformBalancedBatchGenerator(
        image_paths_train,
        y_train,
        transform_X=image_paths_to_tensors,
        transform_y=transform_y,
        batch_size=32,
        k=k,
        label_mode=shared_parameters.LABEL_MODE_ORDINAL)
    val_generator = SimpleBatchGenerator(X_val, y_val_transform, batch_size=32)
    history = model.fit_generator(train_generator,
                                  callbacks=callbacks,
                                  epochs=1000,
                                  validation_data=val_generator)
    y_pred_ordinal = model.predict(X_test)

    # Convert the ordinal one-hot encoding back to discrete labels.
    y_pred = ordinal.from_multi_hot_ordinal(y_pred_ordinal, threshold=0.5)

    print('`{}`:'.format(category))
    print('Accuracy: {:.3%}'.format(accuracy_score(y_test, y_pred)))
    confusion = confusion_matrix(y_test, y_pred)
    print(confusion)

    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)), 'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(
                key, ' '.join(str(value) for value in values)))
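
# A hedged sketch of the forward ordinal encoding that pairs with
# `from_multi_hot_ordinal` above, assuming the usual multi-hot scheme: a label
# y in [0, k - 1] becomes a (k - 1)-length vector whose first y positions are
# 1. Illustrative only; the project's actual encoder (presumably inside
# `shared_parameters.transform_labels` or `transform_y`) may differ in detail.
import numpy as np


def _sketch_to_multi_hot_ordinal(y, k):
    encoded = np.zeros((len(y), k - 1), dtype=np.float32)
    for i, label in enumerate(y):
        encoded[i, :label] = 1.  # e.g., y=2, k=4 -> [1, 1, 0]
    return encoded
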
def main():
    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]

    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time

    # Load data.
    print('Retrieving labels...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    _, Y, categories, category_levels = \
        bookcave.get_data({'paragraph_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    print('Retrieved {:d} labels.'.format(Y.shape[1]))

    # Split the data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    Y_train_T, Y_test_T = train_test_split(Y_T,
                                           test_size=test_size,
                                           random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    for j, category in enumerate(categories):
        levels = category_levels[j]
        y_train = Y_train[j]
        y_test = Y_test[j]
        # Predict the most common class seen in the training data.
        y_pred = [np.argmax(np.bincount(y_train, minlength=len(levels)))
                  ] * len(y_test)

        base_fname = '{:d}_{:d}'.format(stamp, j)
        logs_path = folders.ensure(
            os.path.join(folders.LOGS_PATH, classifier_name))
        with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
            fd.write('HYPERPARAMETERS\n')
            fd.write('\nText\n')
            fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
            fd.write('subset_seed={}\n'.format(str(subset_seed)))
            fd.write('min_len={:d}\n'.format(min_len))
            fd.write('max_len={:d}\n'.format(max_len))
            fd.write('min_tokens={:d}\n'.format(min_tokens))
            fd.write('\nLabels\n')
            fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
            fd.write('return_overall={}\n'.format(return_overall))
            fd.write('\nTraining\n')
            fd.write('test_size={}\n'.format(str(test_size)))
            fd.write('test_random_state={:d}\n'.format(test_random_state))
            fd.write('\nRESULTS\n\n')
            fd.write('Data size: {:d}\n'.format(Y.shape[1]))
            fd.write('Train size: {:d}\n'.format(Y_train.shape[1]))
            fd.write('Test size: {:d}\n'.format(Y_test.shape[1]))
            fd.write('\n')
            evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

        predictions_path = folders.ensure(
            os.path.join(folders.PREDICTIONS_PATH, classifier_name))
        with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
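
# A quick, self-contained check of the majority-class rule in `main` above,
# with made-up labels: `np.bincount` tallies each level, `np.argmax` picks the
# most frequent one, and that single label is predicted for every test
# instance.
def _demo_majority_baseline():
    y_train = np.array([0, 1, 1, 2, 1, 0])
    levels = [0, 1, 2, 3]
    majority = np.argmax(np.bincount(y_train, minlength=len(levels)))
    assert majority == 1  # label 1 appears most often in y_train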