def create_model(model_name):
    """Instantiate a model defined in a python configuration script.

    Candidate locations are tried in order: *model_name* as given, the
    package share dir, and the share dir with a ``.py`` extension. The
    first existing script that exposes a module-level ``model`` object wins.

    :param model_name: model script name or path
    :return: unfitted clone of the configured model
    :raises FileNotFoundError: if no candidate yields a ``model`` object
    """
    MODEL_DIR = pkg_resources.resource_filename('atnlp', 'share/models/')
    paths = [
        model_name,
        os.path.join(MODEL_DIR, model_name),
        os.path.join(MODEL_DIR, model_name + ".py"),
    ]
    m = None
    for path in paths:
        if not os.path.exists(path):
            continue
        try:
            mod = dynamic_import(path)
        except Exception:
            # narrowed from bare 'except: pass': log the failure instead of
            # silently swallowing it, then keep scanning candidate paths
            log().warning("failed to import model script: {}".format(path))
            continue
        if hasattr(mod, 'model'):
            m = mod.model
            break
        # script imported fine but lacks the required object ('warning'
        # replaces the deprecated 'warn' alias)
        log().warning(
            "Model configuration script doesn't contain 'model' object"
        )
    if not m:
        raise FileNotFoundError("couldn't load model")
    # return an unfitted copy so the shared template is never mutated
    return clone(m)
def load_configured_model(cfg_name):
    """Build a model from a YAML configuration file.

    The config is searched for as given, in the package share dir, and with
    a ``.yml`` extension. It must provide a ``model`` key (script name for
    :func:`create_model`) and a ``params`` key (kwargs for ``set_params``).

    :param cfg_name: config file name or path
    :return: configured model instance
    :raises FileNotFoundError: if no config could be parsed
    """
    CFG_DIR = pkg_resources.resource_filename('atnlp', 'share/config/')
    paths = [
        cfg_name,
        os.path.join(CFG_DIR, cfg_name),
        os.path.join(CFG_DIR, cfg_name + ".yml"),
    ]
    model_name = params = None
    for path in paths:
        if not os.path.exists(path):
            continue
        try:
            with open(path) as f:
                # safe_load: never execute arbitrary python from config files
                data = yaml.safe_load(f)
            model_name = data['model']
            params = data['params']
            log().info("loaded model: {}".format(path))
            if model_name:
                break
        except Exception:
            # BUG FIX: was 'log.warn(...)' (missing call parens), which raised
            # AttributeError inside the handler and masked the original error
            log().warning("failure parsing config file: {}".format(path))
    if not model_name:
        raise FileNotFoundError("couldn't load model configuration")
    model = create_model(model_name)
    if params:
        model.set_params(**params)
    return model
# EOF
def display_keywords(model, topic_names, vocab):
    """Print keywords for WordMatchClassifier instances in OneVsRestClassifier

    :param model: OneVsRestClassifier containing WordMatchClassifier instances
    :param topic_names: topic for each model instance in OneVsRest
    :param vocab: id-to-word dictionary for bag-of-words input data
    """
    title_break('Topic Keywords')
    # one estimator per topic; look up its keyword ids in the vocab
    for idx, topic in enumerate(topic_names):
        words = vocab[model.estimators_[idx]._keywords]
        log().info("{:20s}: {}".format(topic, words))
def fit_xgb_model(alg, X, y, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit xgboost model

    Optionally tunes ``n_estimators`` via xgboost cross-validation before the
    final fit, logs train-set accuracy/AUC, and plots AUC learning curves
    (cv bands plus the train/test evaluation history) on the current
    matplotlib axes.

    :param alg: XGBClassifier (sklearn api class)
    :param X: training data
    :param y: training labels
    :param X_test: testing data
    :param y_test: testing labels
    :param useTrainCV: use cross validation
    :param cv_folds: number of folds for cross-validation
    :param early_stopping_rounds: minimum number of rounds before early stopping
    """
    if useTrainCV:
        import xgboost as xgb
        dtrain = xgb.DMatrix(X, label=y)
        # cv stops early on AUC; the number of surviving rounds becomes the
        # tuned n_estimators for the final fit below
        cvresult = xgb.cv(alg.get_xgb_params(), dtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          early_stopping_rounds=early_stopping_rounds,
                          nfold=cv_folds, metrics='auc')
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data (tracking AUC on both train and test)
    eval_set = [(X, y), (X_test, y_test)]
    alg.fit(X, y, eval_metric='auc', eval_set=eval_set, verbose=False)

    # Predict training set:
    y_pred = alg.predict(X)
    # column 1 = probability of the positive class (binary problem assumed
    # here — TODO confirm for multiclass callers)
    y_prob = alg.predict_proba(X)[:, 1]

    # Print model report:
    title_break("Model Report")
    log().info("Accuracy : %.4g" % accuracy_score(y, y_pred))
    log().info("AUC Score (Train): %f" % roc_auc_score(y, y_prob))

    # evaluation history recorded during fit; validation_0 = train set,
    # validation_1 = test set (order of eval_set above)
    result = alg.evals_result()
    n = len(result['validation_0']['auc'])

    if useTrainCV:
        # shaded +/- 1 std bands around the cv mean curves
        x = np.arange(len(cvresult))
        (ytr, eytr) = (cvresult['train-auc-mean'], cvresult['train-auc-std'])
        (yte, eyte) = (cvresult['test-auc-mean'], cvresult['test-auc-std'])
        plt.fill_between(x, ytr - eytr, ytr + eytr, facecolor='r', alpha=0.25, label='train(cv) err')
        plt.fill_between(x, yte - eyte, yte + eyte, facecolor='b', alpha=0.25, label='test(cv) err')
        plt.plot(x, ytr, color='r', linestyle='--', label='train(cv)')
        plt.plot(x, yte, color='b', linestyle='--', label='test(cv)')
    plt.plot(np.arange(n), result['validation_0']['auc'], color='r', label='train')
    plt.plot(np.arange(n), result['validation_1']['auc'], color='b', linewidth=2, label='test')
    plt.legend()
# EOF
def load_glove(filename='glove.6B.300d.w2vformat.txt'):
    """Return glove word embedding model

    The embedding input can be specified with *filename*. The inputs are
    searched for in EMB_DIR. Check `scripts/install_glove.py` for installation.

    :param filename: glove input file name
    :return: word embedding model (gensim format)
    :raises FileNotFoundError: if the embedding file is not installed
    """
    filepath = os.path.join(EMB_DIR, filename)
    if not os.path.exists(filepath):
        log().error("failed to load glove embeddings, install with 'install_glove.py'")
        # include the missing path in the exception (was raised bare, which
        # made the traceback useless for locating the expected file)
        raise FileNotFoundError(filepath)
    return gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False)
def setup(
        log_level=None,
        suppress_warnings=None,
        tf_loglvl=2,
        batch_mode=False,
        ):
    """Global setup for atnlp framework

    :param log_level: logging output level
    :type log_level: logging.LEVEL (eg. DEBUG, INFO, WARNING...)
    :param suppress_warnings: list of warnings to suppress
    :type suppress_warnings: list of warning categories
    :param tf_loglvl: tensorflow log level
    :type tf_loglvl: int
    :param batch_mode: force a non-interactive matplotlib backend
    :type batch_mode: bool
    """
    # log level (fixed: identity check 'is None' rather than '== None')
    if log_level is None:
        log_level = logging.INFO
    log().setLevel(log_level)

    # get this show rolling
    log().info("Starting Job: %s" % (asctime(localtime())))

    # Suppress warnings (default: noisy sklearn / user warnings)
    if suppress_warnings is None:
        suppress_warnings = [
            UndefinedMetricWarning,
            UserWarning,
        ]
    log().info("Suppressing following warnings:")
    for warn in suppress_warnings:
        log().info("    {}".format(warn))
        warnings.filterwarnings("ignore", category=warn)

    # Set tensorflow log level
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(tf_loglvl)

    # set non-interactive backend if display not available (or forced);
    # switch_backend is used (rather than matplotlib.use) because pyplot
    # may already be imported at this point
    if batch_mode or (os.name == 'posix' and "DISPLAY" not in os.environ):
        import matplotlib.pyplot as plt
        plt.switch_backend('agg')
# EOF
def main():
    """Label raw input documents with a previously trained, pickled model."""
    timer = start_timer()

    # command line
    args = build_parser().parse_args()

    # framework setup (quiet wins over verbose)
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
        log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)
    section_break("Config summary")
    for key, val in vars(args).items():
        log().info("{:20s}: {}".format(key, val))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")
    log().info("Reading data from {}...".format(args.data))
    X = read_raw(args.data)

    # ----------
    # Load model
    # ----------
    section_break("Loading model")
    model = joblib.load(args.model)
    if not args.quiet:
        print(model)

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    Y_pred = pd.DataFrame(model.predict(X), columns=model.topics)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Writing labels to {}".format(args.output))
    write_one_hot_labels(Y_pred, args.output)

    # timer
    stop_timer(timer)
def main():
    """Convert the Reuters corpus to the framework's raw text / one-hot label files."""
    # command-line interface
    parser = ArgumentParser(description="Convert Reuters dataset to standard text format")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Set logging level to DEBUG")
    parser.add_argument('--min-samples', type=int, default=100,
                        help="Minimum number of samples per category [default: 100]")
    parser.add_argument('--topics', help="comma separated list of topics")
    args = parser.parse_args()

    # setup atnlp framework
    setup(log_level=logging.DEBUG if args.verbose else None)

    # topic selection: an explicit list wins over the min-samples threshold
    topics = (args.topics.split(',') if args.topics
              else get_topics(min_samples=args.min_samples))
    log().info("{} topics selected.".format(len(topics)))

    # topic labels (MxN data frame of bools: M categories, N documents)
    log().info("getting topic labels...")
    Y_train, Y_test = get_labels(topics)
    log().info("Writing to labels_train.txt")
    write_one_hot_labels(Y_train, 'labels_train.txt')
    log().info("Writing to labels_test.txt")
    write_one_hot_labels(Y_test, 'labels_test.txt')

    # raw documents; test data is included because the model currently
    # requires the vocab from all samples to make predictions
    log().info("getting topic data...")
    X_train, X_test = get_data(topics)
    log().info("Writing to data_train.txt")
    write_raw(X_train, "data_train.txt")
    log().info("Writing to data_test.txt")
    write_raw(X_test, "data_test.txt")
def main():
    """Train a topic-labelling model from a YAML config and persist it with joblib."""
    timer = start_timer()

    # command line
    args = build_parser().parse_args()

    # derive a default output name from the model config if not provided
    if not args.output:
        stem = os.path.splitext(os.path.basename(args.model))[0]
        args.output = (stem + '.pkl') if stem else 'model.pkl'

    # framework setup (quiet wins over verbose)
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
        log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)
    section_break("Config summary")
    for key, val in vars(args).items():
        log().info("{:20s}: {}".format(key, val))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")
    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)
    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # ------------
    # Create model
    # ------------
    section_break("Creating model")
    # dynamically load model using yml config
    model = load_configured_model(args.model)
    # attach topics to the model so they are persistified alongside it
    model.topics = list(Y.columns)
    if not args.quiet:
        log().info("")
        for line in str(model).split('\n'):
            log().info(line)

    # ---------
    # Fit model
    # ---------
    section_break("Training model")
    model.fit(X, Y)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    joblib.dump(model, args.output)

    # timer
    stop_timer(timer)
def stop_timer(ti):
    """Summarize job timing

    :param ti: start time as returned by ``start_timer`` (seconds since epoch)
    :return: elapsed wall-clock time as a ``timedelta``
    """
    elapsed = timedelta(seconds=time() - ti)
    log().info("Execution time: {0}".format(str(elapsed)))
    return elapsed
def main(): ti = start_timer() # parse command line args parser = build_parser() args = parser.parse_args() # setup atnlp framework log_level = logging.DEBUG if args.verbose else None if args.quiet: log_level = logging.WARN else: print("\nExecuting {}\n".format(os.path.basename(__file__))) setup(log_level=log_level) section_break("Config summary") for (k, v) in vars(args).items(): log().info("{:20s}: {}".format(k, v)) # ------------------ # Prepare input data # ------------------ section_break("Preparing input data") log().info("Reading training data from {}...".format(args.data)) X = read_raw(args.data) log().info("Reading training labels from {}...".format(args.labels)) Y = read_one_hot_labels(args.labels) # ----------- # Load models # ----------- section_break("Loading models") names = [os.path.splitext(os.path.basename(m))[0] for m in args.models] models = [joblib.load(m) for m in args.models] # ------- # Predict # ------- section_break("Predicting labels") preds = [m.predict(X) for m in models] # -------- # Evaluate # -------- tables = multimodel_topic_labelling_summary_tables(Y, preds, names) # -------- # Finalize # -------- section_break("Finalizing") html = Report() html.add_title("Topic modelling performance", par="Here are some totally awesome results on topic modelling!") html.add_section("Topic-averaged performance") html.add_text("The precision, recall and f1 metrics use 'micro' averaging over topics") html.add_table(tables['summary'], cap='') html.add_section("Per-topic performance") topic_labelling_barchart(Y, preds, names) html.add_figure(cap="Comparison of per-topic metrics for each model") html.add_section("Per-topic performance (tables)") html.add_table(tables['precision'], cap='Precision scores per topic for each model') html.add_table(tables['recall'], cap='Recall scores per topic for each model') html.add_table(tables['f1'], cap='f1 scores per topic for each model') html.add_table(tables['fl'], cap='Number of false labels') 
html.add_table(tables['ml'], cap='Number of missed labels') # best model perf best_model = tables['summary']['model'].iloc[0] best_index = names.index(best_model) best_pred = preds[best_index] html.add_section("Correlations") topic_correlation_matrix(Y) html.add_figure(cap='True topic correlations') topic_migration_matrix(Y, best_pred) html.add_figure(cap='Topic migration matrix') false_labels_matrix(Y, best_pred) html.add_figure(cap="False labels matrix") html.write(args.output) # log().info("Writing labels to {}".format(args.output)) # timer stop_timer(ti)
def main():
    """Train an LSTM topic classifier on the Reuters corpus end-to-end.

    Pipeline: select topics, build vocab and integer-encode/pad documents,
    construct an (optionally bidirectional, optionally pre-trained glove)
    embedding + LSTM Keras model, train with early stopping on val f1,
    report micro-averaged precision/recall/f1 on train/val/test, then save
    the model plus auxiliary metadata (vocab, history, scores, args).
    """
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()
    assert args.lstm_depth >= 1, "Must configure at least one LSTM layer"
    print("\nExecuting train_reuters_rnn.py\n")

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)
    section_break("Config summary")
    for (k,v) in vars(args).items():
        log().info("{:20s}: {}".format(k,v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (MxN data frame of bools: M categories, N documents)
    # TODO: could explicitly ignore Y_test here to show we don't need test labels
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)

    # get data iterators
    # Note: we also use test data because model currently requires
    # vocab from all samples to get be predictions
    log().info("getting topic data...")
    (X_train_raw, X_test_raw) = get_data(topics)

    # convert words to integers
    log().info("converting to integer representation...")
    word_to_id = build_vocab(list(X_train_raw) + list(X_test_raw),
                             max_size=args.max_vocab_size)
    X_train_ids = raw_to_ids(X_train_raw, word_to_id)
    X_test_ids = raw_to_ids(X_test_raw, word_to_id)

    # pad/truncate every document to a fixed length (post = at the end)
    log().info("padding sequences...")
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    X_train_ids = pad_sequences(X_train_ids, maxlen=args.max_doc_length,
                                value=word_to_id[PAD_WORD],
                                padding='post', truncating='post')
    X_test_ids = pad_sequences(X_test_ids, maxlen=args.max_doc_length,
                               value=word_to_id[PAD_WORD],
                               padding='post', truncating='post')
    vocab_size = len(word_to_id)

    # split train into train + validation (fixed seed for reproducibility)
    X_train_ids, X_val_ids, Y_train, Y_val = train_test_split(
        X_train_ids, Y_train, test_size=0.20, random_state=42)

    # dataset summary
    title_break("Data Summary")
    log().info("{} topics selected: {}".format(len(topics), topics))
    log().info("n train: {}".format(len(X_train_ids)))
    log().info("n val:   {}".format(len(X_val_ids)))
    log().info("n test:  {}".format(len(X_test_ids)))
    log().info("max doc length: {}".format(args.max_doc_length))
    log().info("vocab size: {}".format(vocab_size))

    # ------------
    # Create model
    # ------------
    section_break("Creating Model")

    # create embedding layer: either learned from scratch or frozen glove
    if args.learn_embeddings:
        embedding = Embedding(vocab_size, args.embedding_size)
    else:
        embedding = create_embedding_layer(load_glove(), word_to_id,
                                           args.max_doc_length)

    # create LSTM layers (size defaults to the embedding dimension)
    lstm_size = args.lstm_size or embedding.output_dim
    lstm_args = {'dropout':args.dropout, 'recurrent_dropout': args.recurrent_dropout}
    lstm_layers = [LSTM(lstm_size, **lstm_args) for _ in range(args.lstm_depth)]
    if args.bidirectional:
        lstm_layers = [Bidirectional(l) for l in lstm_layers]

    # construct model: sigmoid output per topic (multi-label, not softmax)
    model = Sequential()
    model.add(embedding)
    for l in lstm_layers:
        model.add(l)
    model.add(Dense(units=len(topics), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())

    # ---------
    # Fit model
    # ---------
    # create callbacks: f1_metric computes 'val_f1' which early stopping monitors
    callbacks = [f1_metric]
    if not args.no_early_stopping:
        callbacks += [EarlyStopping(monitor='val_f1', mode='max', patience=2)]

    # fit
    section_break("Training Model")
    history = model.fit(X_train_ids, Y_train,
                        epochs=args.epochs,
                        batch_size=args.batch_size,
                        verbose=1,
                        validation_data=(X_val_ids, Y_val),
                        callbacks=callbacks)

    # --------------
    # Evaluate model
    # --------------
    section_break("Evaluating Model")
    # binarize sigmoid outputs at a fixed 0.5 threshold
    threshold = 0.5
    Y_train_pred = model.predict(X_train_ids) > threshold
    Y_val_pred = model.predict(X_val_ids) > threshold
    Y_test_pred = model.predict(X_test_ids) > threshold
    ave = 'micro'
    scores_train = precision_recall_fscore_support(Y_train, Y_train_pred, average=ave)
    scores_val = precision_recall_fscore_support(Y_val, Y_val_pred, average=ave)
    scores_test = precision_recall_fscore_support(Y_test, Y_test_pred, average=ave)

    title_break("Performance")
    log().info("{:<10s}{:>15s}{:>15s}{:>15s}".format("Sample", "Precision", "Recall", "F1"))
    log().info("-"*55)
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Train", *scores_train[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Val", *scores_val[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Test", *scores_test[:3]))
    log().info("")

    # timer
    dt = stop_timer(ti)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    model.save(args.output)
    # aux info (vocab, history, scores, config) pickled next to the model
    auxname = os.path.splitext(args.output)[0] + '.pickle'
    log().info("Saving aux info to {}".format(auxname))
    with open(auxname, 'wb') as f:
        data = {
            'topics': topics,
            'id_to_word':id_to_word,
            'history': history.history,
            'scores':{
                'train': scores_train,
                'val': scores_val,
                'test': scores_test,
            },
            'time': dt.total_seconds(),
            'args': vars(args),
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)