def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)
    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")
    log().info("Reading data from {}...".format(args.data))
    X = read_raw(args.data)

    # ----------
    # Load model
    # ----------
    section_break("Loading model")
    model = joblib.load(args.model)
    if not args.quiet:
        print(model)

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    Y_pred = model.predict(X)
    Y_pred = pd.DataFrame(Y_pred, columns=model.topics)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Writing labels to {}".format(args.output))
    write_one_hot_labels(Y_pred, args.output)

    # timer
    stop_timer(ti)
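# Illustrative sketch (not the actual atnlp implementation): write_one_hot_labels and
# read_one_hot_labels are assumed here to round-trip a one-hot label DataFrame
# (one boolean column per topic, one row per document) via a whitespace-separated
# text file. The real format used by atnlp may differ.
import pandas as pd

def write_one_hot_labels_sketch(Y, path):
    """Write a boolean topic-label DataFrame as 0/1 columns with a header row."""
    Y.astype(int).to_csv(path, sep=' ', index=False)

def read_one_hot_labels_sketch(path):
    """Read the 0/1 table back into a boolean DataFrame keyed by topic name."""
    return pd.read_csv(path, sep=' ').astype(bool)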
def main():
    # parse args
    description = "Convert Reuters dataset to standard text format"
    parser = ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Set logging level to DEBUG")
    parser.add_argument('--min-samples', type=int, default=100,
                        help="Minimum number of samples per category [default: 100]")
    parser.add_argument('--topics', help="comma separated list of topics")
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (MxN data frame of bools: M categories, N documents)
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)
    log().info("Writing to labels_train.txt")
    write_one_hot_labels(Y_train, 'labels_train.txt')
    log().info("Writing to labels_test.txt")
    write_one_hot_labels(Y_test, 'labels_test.txt')

    # get data iterators
    # Note: we also use the test data because the model currently requires
    # the vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train, X_test) = get_data(topics)
    log().info("Writing to data_train.txt")
    write_raw(X_train, "data_train.txt")
    log().info("Writing to data_test.txt")
    write_raw(X_test, "data_test.txt")
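# Rough sketch of what get_topics / get_labels could look like on top of the NLTK
# Reuters corpus (hypothetical helpers, shown only to clarify the expected shapes;
# the atnlp implementations may differ).
import pandas as pd
from nltk.corpus import reuters

def get_topics_sketch(min_samples=100):
    """Return topics with at least `min_samples` documents."""
    return [t for t in reuters.categories() if len(reuters.fileids(t)) >= min_samples]

def get_labels_sketch(topics):
    """Return (Y_train, Y_test): boolean DataFrames, one row per doc, one column per topic."""
    def frame(ids):
        return pd.DataFrame(
            [[t in reuters.categories(i) for t in topics] for i in ids],
            index=ids, columns=topics)
    ids = reuters.fileids()
    train = [i for i in ids if i.startswith('training/')]
    test = [i for i in ids if i.startswith('test/')]
    return frame(train), frame(test)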
def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)
    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")
    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)
    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # -----------
    # Load models
    # -----------
    section_break("Loading models")
    names = [os.path.splitext(os.path.basename(m))[0] for m in args.models]
    models = [joblib.load(m) for m in args.models]

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    preds = [m.predict(X) for m in models]

    # --------
    # Evaluate
    # --------
    tables = multimodel_topic_labelling_summary_tables(Y, preds, names)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    html = Report()
    html.add_title("Topic modelling performance",
                   par="Here are some totally awesome results on topic modelling!")
    html.add_section("Topic-averaged performance")
    html.add_text("The precision, recall and f1 metrics use 'micro' averaging over topics")
    html.add_table(tables['summary'], cap='')
    html.add_section("Per-topic performance")
    topic_labelling_barchart(Y, preds, names)
    html.add_figure(cap="Comparison of per-topic metrics for each model")
    html.add_section("Per-topic performance (tables)")
    html.add_table(tables['precision'], cap='Precision scores per topic for each model')
    html.add_table(tables['recall'], cap='Recall scores per topic for each model')
    html.add_table(tables['f1'], cap='f1 scores per topic for each model')
    html.add_table(tables['fl'], cap='Number of false labels')
    html.add_table(tables['ml'], cap='Number of missed labels')

    # best model perf
    best_model = tables['summary']['model'].iloc[0]
    best_index = names.index(best_model)
    best_pred = preds[best_index]

    html.add_section("Correlations")
    topic_correlation_matrix(Y)
    html.add_figure(cap='True topic correlations')
    topic_migration_matrix(Y, best_pred)
    html.add_figure(cap='Topic migration matrix')
    false_labels_matrix(Y, best_pred)
    html.add_figure(cap="False labels matrix")

    html.write(args.output)
    # log().info("Writing labels to {}".format(args.output))

    # timer
    stop_timer(ti)
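# Minimal sketch of the kind of computation behind the 'summary' table above
# (hypothetical; multimodel_topic_labelling_summary_tables is an atnlp helper and
# its exact output format is assumed here): one row per model with micro-averaged
# precision/recall/f1, sorted best-first so that tables['summary']['model'].iloc[0]
# picks out the best model.
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

def summary_table_sketch(Y_true, preds, names):
    rows = []
    for name, Y_pred in zip(names, preds):
        p, r, f1, _ = precision_recall_fscore_support(Y_true, Y_pred, average='micro')
        rows.append({'model': name, 'precision': p, 'recall': r, 'f1': f1})
    return pd.DataFrame(rows).sort_values('f1', ascending=False).reset_index(drop=True)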
def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()
    if not args.output:
        model_name = os.path.splitext(os.path.basename(args.model))[0]
        if model_name:
            args.output = model_name + '.pkl'
        else:
            args.output = 'model.pkl'

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)
    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")
    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)
    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # ------------
    # Create model
    # ------------
    section_break("Creating model")
    # dynamically load model using yml config
    model = load_configured_model(args.model)
    # attach topics to the model so they are persisted with it
    model.topics = list(Y.columns)
    if not args.quiet:
        log().info("")
        for s in str(model).split('\n'):
            log().info(s)

    # ---------
    # Fit model
    # ---------
    section_break("Training model")
    model.fit(X, Y)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    joblib.dump(model, args.output)

    # timer
    stop_timer(ti)
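# Hypothetical sketch of dynamic model construction from a yml config, as hinted at
# by load_configured_model above (the real atnlp loader and its config schema are not
# shown here). Assumes a config along the lines of:
#   class: sklearn.linear_model.LogisticRegression   # dotted path to an estimator
#   params: {C: 1.0}                                 # constructor keyword arguments
import importlib
import yaml

def load_configured_model_sketch(path):
    """Instantiate an estimator from a 'class' dotted path and optional 'params' dict."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    module_name, class_name = cfg['class'].rsplit('.', 1)
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**cfg.get('params', {}))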
def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()
    assert args.lstm_depth >= 1, "Must configure at least one LSTM layer"
    print("\nExecuting train_reuters_rnn.py\n")

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)
    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (MxN data frame of bools: M categories, N documents)
    # TODO: could explicitly ignore Y_test here to show we don't need test labels
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)

    # get data iterators
    # Note: we also use the test data because the model currently requires
    # the vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train_raw, X_test_raw) = get_data(topics)

    # convert words to integers
    log().info("converting to integer representation...")
    word_to_id = build_vocab(list(X_train_raw) + list(X_test_raw),
                             max_size=args.max_vocab_size)
    X_train_ids = raw_to_ids(X_train_raw, word_to_id)
    X_test_ids = raw_to_ids(X_test_raw, word_to_id)

    # pad
    log().info("padding sequences...")
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    X_train_ids = pad_sequences(X_train_ids, maxlen=args.max_doc_length,
                                value=word_to_id[PAD_WORD],
                                padding='post', truncating='post')
    X_test_ids = pad_sequences(X_test_ids, maxlen=args.max_doc_length,
                               value=word_to_id[PAD_WORD],
                               padding='post', truncating='post')
    vocab_size = len(word_to_id)

    # split train into train + validation
    X_train_ids, X_val_ids, Y_train, Y_val = train_test_split(
        X_train_ids, Y_train, test_size=0.20, random_state=42)

    # dataset summary
    title_break("Data Summary")
    log().info("{} topics selected: {}".format(len(topics), topics))
    log().info("n train: {}".format(len(X_train_ids)))
    log().info("n val: {}".format(len(X_val_ids)))
    log().info("n test: {}".format(len(X_test_ids)))
    log().info("max doc length: {}".format(args.max_doc_length))
    log().info("vocab size: {}".format(vocab_size))

    # ------------
    # Create model
    # ------------
    section_break("Creating Model")

    # create embedding layer
    if args.learn_embeddings:
        embedding = Embedding(vocab_size, args.embedding_size)
    else:
        embedding = create_embedding_layer(load_glove(), word_to_id, args.max_doc_length)

    # create LSTM layers (when stacking, all but the last must return sequences)
    lstm_size = args.lstm_size or embedding.output_dim
    lstm_args = {'dropout': args.dropout, 'recurrent_dropout': args.recurrent_dropout}
    lstm_layers = [LSTM(lstm_size, return_sequences=(i < args.lstm_depth - 1), **lstm_args)
                   for i in range(args.lstm_depth)]
    if args.bidirectional:
        lstm_layers = [Bidirectional(l) for l in lstm_layers]

    # construct model
    model = Sequential()
    model.add(embedding)
    for l in lstm_layers:
        model.add(l)
    model.add(Dense(units=len(topics), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    # ---------
    # Fit model
    # ---------
    # create callbacks
    callbacks = [f1_metric]
    if not args.no_early_stopping:
        callbacks += [EarlyStopping(monitor='val_f1', mode='max', patience=2)]

    # fit
    section_break("Training Model")
    history = model.fit(X_train_ids, Y_train,
                        epochs=args.epochs, batch_size=args.batch_size,
                        verbose=1,
                        validation_data=(X_val_ids, Y_val),
                        callbacks=callbacks)
    # --------------
    # Evaluate model
    # --------------
    section_break("Evaluating Model")
    threshold = 0.5
    Y_train_pred = model.predict(X_train_ids) > threshold
    Y_val_pred = model.predict(X_val_ids) > threshold
    Y_test_pred = model.predict(X_test_ids) > threshold

    ave = 'micro'
    scores_train = precision_recall_fscore_support(Y_train, Y_train_pred, average=ave)
    scores_val = precision_recall_fscore_support(Y_val, Y_val_pred, average=ave)
    scores_test = precision_recall_fscore_support(Y_test, Y_test_pred, average=ave)

    title_break("Performance")
    log().info("{:<10s}{:>15s}{:>15s}{:>15s}".format("Sample", "Precision", "Recall", "F1"))
    log().info("-" * 55)
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Train", *scores_train[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Val", *scores_val[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Test", *scores_test[:3]))
    log().info("")

    # timer
    dt = stop_timer(ti)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    model.save(args.output)
    auxname = os.path.splitext(args.output)[0] + '.pickle'
    log().info("Saving aux info to {}".format(auxname))
    with open(auxname, 'wb') as f:
        data = {
            'topics': topics,
            'id_to_word': id_to_word,
            'history': history.history,
            'scores': {
                'train': scores_train,
                'val': scores_val,
                'test': scores_test,
            },
            'time': dt.total_seconds(),
            'args': vars(args),
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
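# The f1_metric callback used above in train_reuters_rnn.py is defined elsewhere in
# atnlp; the class below is only an illustrative sketch under the assumption that it
# computes a micro-averaged F1 on the validation set at the end of each epoch and
# writes it into logs['val_f1']. That is what lets EarlyStopping(monitor='val_f1',
# mode='max') track it, since it appears before EarlyStopping in the callbacks list.
from keras.callbacks import Callback
from sklearn.metrics import f1_score

class F1MetricSketch(Callback):
    def __init__(self, validation_data, threshold=0.5):
        super(F1MetricSketch, self).__init__()
        self.X_val, self.Y_val = validation_data
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        # threshold the sigmoid outputs and record micro-averaged validation F1
        logs = logs if logs is not None else {}
        Y_pred = self.model.predict(self.X_val) > self.threshold
        logs['val_f1'] = f1_score(self.Y_val, Y_pred, average='micro')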