Example 1
File: io.py  Project: wedavey/atnlp
def create_model(model_name):
    MODEL_DIR = pkg_resources.resource_filename('atnlp', 'share/models/')
    paths = [
        model_name,
        os.path.join(MODEL_DIR, model_name),
        os.path.join(MODEL_DIR, model_name + ".py"),
    ]

    m = None
    for path in paths:
        if os.path.exists(path):
            try:
                mod = dynamic_import(path)
                if hasattr(mod, 'model'):
                    m = mod.model
                else:
                    log().warn(
                        "Model configuration script doesn't contain 'model' object"
                    )
                if m: break
            except Exception:
                # skip candidates that fail to import and try the next path
                pass
    if not m:
        raise FileNotFoundError("couldn't load model")

    return clone(m)
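
For reference, a minimal sketch of a configuration script that create_model could load through dynamic_import; the module just needs to expose a module-level 'model' object (file name and estimator choice below are purely illustrative):

# my_model.py (hypothetical configuration script)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# create_model() looks for this attribute and returns clone(model)
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])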
Example 2
File: io.py  Project: wedavey/atnlp
def load_configured_model(cfg_name):

    CFG_DIR = pkg_resources.resource_filename('atnlp', 'share/config/')
    paths = [
        cfg_name,
        os.path.join(CFG_DIR, cfg_name),
        os.path.join(CFG_DIR, cfg_name + ".yml"),
    ]

    model_name = params = None
    for path in paths:
        if os.path.exists(path):
            try:
                with open(path) as f:
                    data = yaml.safe_load(f)  # safe_load: don't execute arbitrary YAML tags
                    model_name = data['model']
                    params = data.get('params')  # 'params' key is optional
                log().info("loaded model: {}".format(path))
                if model_name: break
            except Exception:
                log().warn("failure parsing config file: {}".format(path))

    if not model_name:
        raise FileNotFoundError("couldn't load model configuration")

    model = create_model(model_name)
    if params:
        model.set_params(**params)

    return model


# EOF
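
A sketch of the YAML layout load_configured_model expects: a 'model' key that create_model can resolve, plus an optional 'params' mapping forwarded to set_params (file name and parameter names below are hypothetical):

# my_model.yml (hypothetical)
model: my_model
params:
  tfidf__max_features: 20000
  clf__C: 10.0

# usage from Python:
#   model = load_configured_model('my_model')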
Example 3
def display_keywords(model, topic_names, vocab):
    """Print keywords for WordMatchClassifier instances in OneVsRestClassifier

    :param model: OneVsRestClassifier containing WordMatchClassifier instances
    :param topic_names: topic for each model instance in OneVsRest
    :param vocab: id-to-word dictionary for bag-of-words input data
    """
    title_break('Topic Keywords')
    for (i, t) in enumerate(topic_names):
        keywords = vocab[model.estimators_[i]._keywords]
        log().info("{:20s}: {}".format(t, keywords))
Example 4
def fit_xgb_model(alg, X, y, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit xgboost model

    :param alg: XGBClassifier (sklearn api class)
    :param X: training data
    :param y: training labels
    :param X_test: testing data
    :param y_test: testing labels
    :param useTrainCV: use cross validation
    :param cv_folds: number of folds for cross-validation
    :param early_stopping_rounds: stop xgb.cv early if the metric has not improved for this many rounds
    """
    if useTrainCV:
        import xgboost as xgb
        dtrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(alg.get_xgb_params(), dtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          early_stopping_rounds=early_stopping_rounds,
                          nfold=cv_folds, metrics='auc')
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    eval_set = [(X, y), (X_test, y_test)]
    alg.fit(X, y, eval_metric='auc', eval_set=eval_set, verbose=False)

    # Predict training set:
    y_pred = alg.predict(X)
    y_prob = alg.predict_proba(X)[:, 1]

    # Print model report:
    title_break("Model Report")
    log().info("Accuracy : %.4g" % accuracy_score(y, y_pred))
    log().info("AUC Score (Train): %f" % roc_auc_score(y, y_prob))

    result = alg.evals_result()
    n = len(result['validation_0']['auc'])

    if useTrainCV:
        x = np.arange(len(cvresult))
        (ytr, eytr) = (cvresult['train-auc-mean'], cvresult['train-auc-std'])
        (yte, eyte) = (cvresult['test-auc-mean'], cvresult['test-auc-std'])

        plt.fill_between(x, ytr - eytr, ytr + eytr, facecolor='r', alpha=0.25, label='train(cv) err')
        plt.fill_between(x, yte - eyte, yte + eyte, facecolor='b', alpha=0.25, label='test(cv) err')
        plt.plot(x, ytr, color='r', linestyle='--', label='train(cv)')
        plt.plot(x, yte, color='b', linestyle='--', label='test(cv)')

    plt.plot(np.arange(n), result['validation_0']['auc'], color='r', label='train')
    plt.plot(np.arange(n), result['validation_1']['auc'], color='b', linewidth=2, label='test')

    plt.legend()


# EOF
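
A usage sketch, assuming X_train, y_train, X_test, y_test are already prepared; the hyperparameters are illustrative only:

from xgboost import XGBClassifier

alg = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5)
# tunes n_estimators with xgb.cv, refits on the training set,
# logs accuracy/AUC and draws the AUC learning curves
fit_xgb_model(alg, X_train, y_train, X_test, y_test, useTrainCV=True)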
Example 5
def load_glove(filename='glove.6B.300d.w2vformat.txt'):
    """Return glove word embedding model

    The embedding input can be specified with *filename*.
    The inputs are searched for in EMB_DIR.
    Check `scripts/install_glove.py` for installation.

    :param filename: glove input file name
    :return: word embedding model (gensim format)
    """
    filepath = os.path.join(EMB_DIR, filename)
    if not os.path.exists(filepath):
        log().error("failed to load glove embeddings, install with 'install_glove.py'")
        raise FileNotFoundError
    return gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False)
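
A quick usage sketch (assumes the default GloVe file has been installed with install_glove.py):

glove = load_glove()                           # gensim KeyedVectors
vector = glove['computer']                     # 300-dimensional embedding
print(glove.most_similar('computer', topn=5))  # nearest neighbours by cosine similarity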
Example 6
def setup(
        log_level=None,
        suppress_warnings=None,
        tf_loglvl=2,
        batch_mode=False,
):
    """Global setup for atnlp framework

    :param log_level: logging output level
    :type log_level: int (e.g. logging.DEBUG, logging.INFO, logging.WARNING)
    :param suppress_warnings: list of warning categories to suppress
    :type suppress_warnings: list
    :param tf_loglvl: tensorflow log level
    :type tf_loglvl: int
    :param batch_mode: force a non-interactive matplotlib backend
    :type batch_mode: bool
    """
    # log level
    if log_level is None:
        log_level = logging.INFO
    log().setLevel(log_level)

    # get this show rolling
    log().info("Starting Job: %s" % (asctime(localtime())))

    # Suppress warnings
    if suppress_warnings is None:
        suppress_warnings = [
            UndefinedMetricWarning,
            UserWarning,
        ]
    log().info("Suppressing following warnings:")
    for warn in suppress_warnings:
        log().info("    {}".format(warn))
        warnings.filterwarnings("ignore", category=warn)

    # Set tensorflow log level
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(tf_loglvl)

    # set non-interactive backend if display not available
    if batch_mode or (os.name == 'posix' and "DISPLAY" not in os.environ):
        #import matplotlib
        #matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        plt.switch_backend('agg')

# EOF
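
A typical call from a script; the import path below is an assumption and may differ in atnlp:

import logging
from atnlp.core.setup import setup  # hypothetical import path

# verbose logging, default warning suppression, non-interactive matplotlib backend
setup(log_level=logging.DEBUG, batch_mode=True)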
Example 7
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading data from {}...".format(args.data))
    X = read_raw(args.data)

    # ----------
    # Load model
    # ----------
    section_break("Loading model")

    model = joblib.load(args.model)

    if not args.quiet: print(model)

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    Y_pred = model.predict(X)
    Y_pred = pd.DataFrame(Y_pred, columns=model.topics)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Writing labels to {}".format(args.output))
    write_one_hot_labels(Y_pred, args.output)

    # timer
    stop_timer(ti)
Example 8
def main():
    # parse args
    description = "Convert Reuters dataset to standard text format"
    parser = ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Set logging level to DEBUG")
    parser.add_argument('--min-samples', type=int, default=100,
                        help="Minimum number of samples per category [default: 100]")
    parser.add_argument('--topics', help="comma separated list of topics")
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (MxN data frame of bools: M categories, N documents)
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)
    log().info("Writing to labels_train.txt")
    write_one_hot_labels(Y_train, 'labels_train.txt')
    log().info("Writing to labels_test.txt")
    write_one_hot_labels(Y_test, 'labels_test.txt')

    # get data iterators
    # Note: we also use test data because model currently requires
    #       vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train, X_test) = get_data(topics)
    log().info("Writing to data_train.txt")
    write_raw(X_train, "data_train.txt")
    log().info("Writing to data_test.txt")
    write_raw(X_test, "data_test.txt")
Example 9
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()
    if not args.output:
        model_name = os.path.splitext(os.path.basename(args.model))[0]
        if model_name:
            args.output = model_name + '.pkl'
        else:
            args.output = 'model.pkl'

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)

    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # ------------
    # Create model
    # ------------
    section_break("Creating model")

    # dynamically load model using yml config
    model = load_configured_model(args.model)

    # attach topics to model so they are persistified
    model.topics = list(Y.columns)

    if not args.quiet:
        log().info("")
        for s in str(model).split('\n'):
            log().info(s)

    # ---------
    # Fit model
    # ---------
    section_break("Training model")
    model.fit(X, Y)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    joblib.dump(model, args.output)

    # timer
    stop_timer(ti)
Example 10
def stop_timer(ti):
    """Summarize job timing"""
    dt = timedelta(seconds=(time() - ti))
    time_str = str(dt)
    log().info("Execution time: {0}".format(time_str))
    return dt
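
stop_timer pairs with start_timer, which presumably just records time(); a minimal sketch of the pairing:

from time import time

def start_timer():
    """Record the job start time (sketch; the real helper may differ)."""
    return time()

ti = start_timer()
# ... run the job ...
dt = stop_timer(ti)  # logs "Execution time: H:MM:SS.ffffff" and returns the timedelta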
Example 11
def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)

    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # -----------
    # Load models
    # -----------
    section_break("Loading models")
    names = [os.path.splitext(os.path.basename(m))[0] for m in args.models]
    models = [joblib.load(m) for m in args.models]

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    preds = [m.predict(X) for m in models]

    # --------
    # Evaluate
    # --------
    tables = multimodel_topic_labelling_summary_tables(Y, preds, names)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")

    html = Report()
    html.add_title("Topic modelling performance",
                   par="Here are some totally awesome results on topic modelling!")

    html.add_section("Topic-averaged performance")
    html.add_text("The precision, recall and f1 metrics use 'micro' averaging over topics")
    html.add_table(tables['summary'], cap='')

    html.add_section("Per-topic performance")

    topic_labelling_barchart(Y, preds, names)
    html.add_figure(cap="Comparison of per-topic metrics for each model")

    html.add_section("Per-topic performance (tables)")

    html.add_table(tables['precision'], cap='Precision scores per topic for each model')
    html.add_table(tables['recall'], cap='Recall scores per topic for each model')
    html.add_table(tables['f1'], cap='f1 scores per topic for each model')
    html.add_table(tables['fl'], cap='Number of false labels')
    html.add_table(tables['ml'], cap='Number of missed labels')

    # best model perf
    best_model = tables['summary']['model'].iloc[0]
    best_index = names.index(best_model)
    best_pred = preds[best_index]

    html.add_section("Correlations")
    topic_correlation_matrix(Y)
    html.add_figure(cap='True topic correlations')

    topic_migration_matrix(Y, best_pred)
    html.add_figure(cap='Topic migration matrix')

    false_labels_matrix(Y, best_pred)
    html.add_figure(cap="False labels matrix")

    html.write(args.output)

    # log().info("Writing labels to {}".format(args.output))

    # timer
    stop_timer(ti)
Example 12
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    assert args.lstm_depth >= 1, "Must configure at least one LSTM layer"

    print("\nExecuting train_reuters_rnn.py\n")

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)

    section_break("Config summary")
    for (k,v) in vars(args).items():
        log().info("{:20s}: {}".format(k,v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (MxN data frame of bools: M categories, N documents)
    # TODO: could explicitly ignore Y_test here to show we don't need test labels
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)

    # get data iterators
    # Note: we also use test data because model currently requires
    #       vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train_raw, X_test_raw) = get_data(topics)

    # convert words to integers
    log().info("converting to integer representation...")
    word_to_id = build_vocab(list(X_train_raw) + list(X_test_raw), max_size=args.max_vocab_size)
    X_train_ids = raw_to_ids(X_train_raw, word_to_id)
    X_test_ids = raw_to_ids(X_test_raw, word_to_id)

    # pad
    log().info("padding sequences...")
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    X_train_ids = pad_sequences(X_train_ids, maxlen=args.max_doc_length, value=word_to_id[PAD_WORD],
                                padding='post', truncating='post')
    X_test_ids = pad_sequences(X_test_ids, maxlen=args.max_doc_length, value=word_to_id[PAD_WORD],
                               padding='post', truncating='post')
    vocab_size = len(word_to_id)

    # split train into train + validation
    X_train_ids, X_val_ids, Y_train, Y_val = train_test_split(
        X_train_ids, Y_train, test_size=0.20, random_state=42)

    # dataset summary
    title_break("Data Summary")
    log().info("{} topics selected: {}".format(len(topics), topics))
    log().info("n train: {}".format(len(X_train_ids)))
    log().info("n val:   {}".format(len(X_val_ids)))
    log().info("n test:  {}".format(len(X_test_ids)))
    log().info("max doc length: {}".format(args.max_doc_length))
    log().info("vocab size: {}".format(vocab_size))

    # ------------
    # Create model
    # ------------
    section_break("Creating Model")
    # create embedding layer
    if args.learn_embeddings:
        embedding = Embedding(vocab_size, args.embedding_size)
    else:
        embedding = create_embedding_layer(load_glove(), word_to_id, args.max_doc_length)

    # create LSTM layers
    lstm_size = args.lstm_size or embedding.output_dim
    lstm_args = {'dropout':args.dropout, 'recurrent_dropout': args.recurrent_dropout}
    lstm_layers = [LSTM(lstm_size, **lstm_args) for _ in range(args.lstm_depth)]
    if args.bidirectional:
        lstm_layers = [Bidirectional(l) for l in lstm_layers]

    # construct model
    model = Sequential()
    model.add(embedding)
    for l in lstm_layers: model.add(l)
    model.add(Dense(units=len(topics), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())

    # ---------
    # Fit model
    # ---------
    # create callbacks
    callbacks = [f1_metric]
    if not args.no_early_stopping:
        callbacks += [EarlyStopping(monitor='val_f1', mode='max', patience=2)]

    # fit
    section_break("Training Model")
    history = model.fit(X_train_ids, Y_train, epochs=args.epochs,
                       batch_size=args.batch_size, verbose=1,
                       validation_data=(X_val_ids, Y_val),
                       callbacks=callbacks)

    # --------------
    # Evaluate model
    # --------------
    section_break("Evaluating Model")
    threshold = 0.5
    Y_train_pred = model.predict(X_train_ids) > threshold
    Y_val_pred = model.predict(X_val_ids) > threshold
    Y_test_pred = model.predict(X_test_ids) > threshold
    ave = 'micro'
    scores_train = precision_recall_fscore_support(Y_train, Y_train_pred, average=ave)
    scores_val   = precision_recall_fscore_support(Y_val, Y_val_pred, average=ave)
    scores_test  = precision_recall_fscore_support(Y_test, Y_test_pred, average=ave)

    title_break("Performance")
    log().info("{:<10s}{:>15s}{:>15s}{:>15s}".format("Sample", "Precision", "Recall", "F1"))
    log().info("-"*55)
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Train", *scores_train[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Val",   *scores_val[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Test",  *scores_test[:3]))
    log().info("")

    # timer
    dt = stop_timer(ti)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    model.save(args.output)
    auxname = os.path.splitext(args.output)[0] + '.pickle'
    log().info("Saving aux info to {}".format(auxname))
    with open(auxname, 'wb') as f:
        data = {
            'topics': topics,
            'id_to_word':id_to_word,
            'history': history.history,
            'scores':{
                'train': scores_train,
                'val': scores_val,
                'test': scores_test,
            },
            'time': dt.total_seconds(),
            'args': vars(args),
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
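
A sketch of reading the persisted artifacts back for later analysis; the file names and the Keras import path are assumptions (they depend on --output and on the installed Keras/TensorFlow version):

import pickle
from keras.models import load_model  # or tensorflow.keras.models, depending on the install

model = load_model('rnn_model.h5')          # hypothetical --output value
with open('rnn_model.pickle', 'rb') as f:   # aux info written alongside the model
    aux = pickle.load(f)
print(aux['topics'])
print(aux['scores']['test'])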