Code Example #1
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading data from {}...".format(args.data))
    X = read_raw(args.data)

    # ----------
    # Load model
    # ----------
    section_break("Loading model")

    model = joblib.load(args.model)

    if not args.quiet:
        print(model)

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    Y_pred = model.predict(X)
    Y_pred = pd.DataFrame(Y_pred, columns=model.topics)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Writing labels to {}".format(args.output))
    write_one_hot_labels(Y_pred, args.output)

    # timer
    stop_timer(ti)
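
build_parser() is not shown in these excerpts. Below is a minimal sketch of a parser consistent with the attributes main() reads (data, model, output, verbose, quiet); the flag names, defaults and help strings are assumptions, not the project's actual definitions:

from argparse import ArgumentParser

def build_parser():
    """Hypothetical stand-in for the script's real build_parser()."""
    parser = ArgumentParser(description="Predict topic labels with a trained atnlp model")
    parser.add_argument('data', help="input documents in standard text format")
    parser.add_argument('model', help="trained model pickle to load with joblib")
    parser.add_argument('-o', '--output', default='labels.txt',
                        help="output file for predicted one-hot labels")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Set logging level to DEBUG")
    parser.add_argument('-q', '--quiet', action='store_true',
                        help="Set logging level to WARN")
    return parser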
Code Example #2
File: reuters_to_txt.py Project: wedavey/atnlp
def main():
    # parse args
    description = "Convert Reuters dataset to standard text format"
    parser = ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Set logging level to DEBUG")
    parser.add_argument('--min-samples', type=int, default=100,
                        help="Minimum number of samples per category [default: 100]")
    parser.add_argument('--topics', help="comma separated list of topics")
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (N x M data frame of bools: N documents as rows, M categories as columns)
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)
    log().info("Writing to labels_train.txt")
    write_one_hot_labels(Y_train, 'labels_train.txt')
    log().info("Writing to labels_test.txt")
    write_one_hot_labels(Y_test, 'labels_test.txt')

    # get data iterators
    # Note: we also use test data because model currently requires
    #       vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train, X_test) = get_data(topics)
    log().info("Writing to data_train.txt")
    write_raw(X_train, "data_train.txt")
    log().info("Writing to data_test.txt")
    write_raw(X_test, "data_test.txt")
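
The "one-hot labels" written here are boolean indicator frames: one row per document, one column per topic (Example #4 later recovers the topic names from the columns). A small illustration of the in-memory representation, using two well-known Reuters topics; the values are made up and the on-disk format produced by write_one_hot_labels is not shown in these excerpts:

import pandas as pd

# illustrative only: three documents labelled against two Reuters topics
Y = pd.DataFrame({'earn': [True, False, False],
                  'acq':  [False, True, False]})
print(Y)
# a document may carry zero, one, or several topic labels (multi-label setup)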
Code Example #3
def main():
    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)

    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # -----------
    # Load models
    # -----------
    section_break("Loading models")
    names = [os.path.splitext(os.path.basename(m))[0] for m in args.models]
    models = [joblib.load(m) for m in args.models]

    # -------
    # Predict
    # -------
    section_break("Predicting labels")
    preds = [m.predict(X) for m in models]

    # --------
    # Evaluate
    # --------
    tables = multimodel_topic_labelling_summary_tables(Y, preds, names)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")

    html = Report()
    html.add_title("Topic modelling performance",
                   par="Here are some totally awesome results on topic modelling!")

    html.add_section("Topic-averaged performance")
    html.add_text("The precision, recall and f1 metrics use 'micro' averaging over topics")
    html.add_table(tables['summary'], cap='')

    html.add_section("Per-topic performance")

    topic_labelling_barchart(Y, preds, names)
    html.add_figure(cap="Comparison of per-topic metrics for each model")

    html.add_section("Per-topic performance (tables)")

    html.add_table(tables['precision'], cap='Precision scores per topic for each model')
    html.add_table(tables['recall'], cap='Recall scores per topic for each model')
    html.add_table(tables['f1'], cap='f1 scores per topic for each model')
    html.add_table(tables['fl'], cap='Number of false labels')
    html.add_table(tables['ml'], cap='Number of missed labels')

    # best model perf
    best_model = tables['summary']['model'].iloc[0]
    best_index = names.index(best_model)
    best_pred = preds[best_index]

    html.add_section("Correlations")
    topic_correlation_matrix(Y)
    html.add_figure(cap='True topic correlations')

    topic_migration_matrix(Y, best_pred)
    html.add_figure(cap='Topic migration matrix')

    false_labels_matrix(Y, best_pred)
    html.add_figure(cap="False labels matrix")

    html.write(args.output)

    # log().info("Writing labels to {}".format(args.output))

    # timer
    stop_timer(ti)
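
The best-model lookup above (tables['summary']['model'].iloc[0]) relies on the summary table listing one row per model, ordered best-first. An illustration of that assumed shape, with hypothetical model names and made-up scores:

import pandas as pd

# illustrative only: the structure the report code assumes for tables['summary']
summary = pd.DataFrame({'model':     ['model_a', 'model_b'],
                        'precision': [0.92, 0.88],
                        'recall':    [0.90, 0.85],
                        'f1':        [0.91, 0.86]})
best_model = summary['model'].iloc[0]   # -> 'model_a'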
Code Example #4
File: train.py Project: wedavey/atnlp
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()
    if not args.output:
        model_name = os.path.splitext(os.path.basename(args.model))[0]
        if model_name:
            args.output = model_name + '.pkl'
        else:
            args.output = 'model.pkl'

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    if args.quiet:
        log_level = logging.WARN
    else:
        print("\nExecuting {}\n".format(os.path.basename(__file__)))
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    log().info("Reading training data from {}...".format(args.data))
    X = read_raw(args.data)

    log().info("Reading training labels from {}...".format(args.labels))
    Y = read_one_hot_labels(args.labels)

    # ------------
    # Create model
    # ------------
    section_break("Creating model")

    # dynamically load model using yml config
    model = load_configured_model(args.model)

    # attach topics to model so they are persistified
    model.topics = list(Y.columns)

    if not args.quiet:
        log().info("")
        for s in str(model).split('\n'):
            log().info(s)

    # ---------
    # Fit model
    # ---------
    section_break("Training model")
    model.fit(X, Y)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    joblib.dump(model, args.output)

    # timer
    stop_timer(ti)
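
Once training completes, the dumped pickle can be reloaded and used exactly as in Example #1. A minimal sketch; the filename and sample documents are placeholders, and it assumes the configured model is a pipeline that accepts raw text (as implied by Example #1 passing read_raw() output straight to predict):

import joblib
import pandas as pd

# reload the persisted model; model.topics was attached before dumping,
# so the topic names travel with the pickle
model = joblib.load('model.pkl')

# placeholder documents, for illustration only
docs = ["example document one", "example document two"]

# one-hot predictions, one column per topic (as in Example #1)
Y_pred = pd.DataFrame(model.predict(docs), columns=model.topics)
print(Y_pred)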
Code Example #5
def main():

    ti = start_timer()

    # parse command line args
    parser = build_parser()
    args = parser.parse_args()

    assert args.lstm_depth >= 1, "Must configure at least one LSTM layer"

    print("\nExecuting train_reuters_rnn.py\n")

    # setup atnlp framework
    log_level = logging.DEBUG if args.verbose else None
    setup(log_level=log_level)

    section_break("Config summary")
    for (k, v) in vars(args).items():
        log().info("{:20s}: {}".format(k, v))

    # ------------------
    # Prepare input data
    # ------------------
    section_break("Preparing input data")

    # select topics
    if args.topics:
        topics = args.topics.split(',')
    else:
        topics = get_topics(min_samples=args.min_samples)
    log().info("{} topics selected.".format(len(topics)))

    # get topic labels (N x M data frame of bools: N documents as rows, M categories as columns)
    # TODO: could explicitly ignore Y_test here to show we don't need test labels
    log().info("getting topic labels...")
    (Y_train, Y_test) = get_labels(topics)

    # get data iterators
    # Note: we also use test data because model currently requires
    #       vocab from all samples to make predictions
    log().info("getting topic data...")
    (X_train_raw, X_test_raw) = get_data(topics)

    # convert words to integers
    log().info("converting to integer representation...")
    word_to_id = build_vocab(list(X_train_raw) + list(X_test_raw), max_size=args.max_vocab_size)
    X_train_ids = raw_to_ids(X_train_raw, word_to_id)
    X_test_ids = raw_to_ids(X_test_raw, word_to_id)

    # pad
    log().info("padding sequences...")
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    X_train_ids = pad_sequences(X_train_ids, maxlen=args.max_doc_length, value=word_to_id[PAD_WORD],
                                padding='post', truncating='post')
    X_test_ids = pad_sequences(X_test_ids, maxlen=args.max_doc_length, value=word_to_id[PAD_WORD],
                               padding='post', truncating='post')
    vocab_size = len(word_to_id)

    # split train into train + validation
    X_train_ids, X_val_ids, Y_train, Y_val = train_test_split(
        X_train_ids, Y_train, test_size=0.20, random_state=42)

    # dataset summary
    title_break("Data Summary")
    log().info("{} topics selected: {}".format(len(topics), topics))
    log().info("n train: {}".format(len(X_train_ids)))
    log().info("n val:   {}".format(len(X_val_ids)))
    log().info("n test:  {}".format(len(X_test_ids)))
    log().info("max doc length: {}".format(args.max_doc_length))
    log().info("vocab size: {}".format(vocab_size))

    # ------------
    # Create model
    # ------------
    section_break("Creating Model")
    # create embedding layer
    if args.learn_embeddings:
        embedding = Embedding(vocab_size, args.embedding_size)
    else:
        embedding = create_embedding_layer(load_glove(), word_to_id, args.max_doc_length)

    # create LSTM layers
    lstm_size = args.lstm_size or embedding.output_dim
    lstm_args = {'dropout': args.dropout, 'recurrent_dropout': args.recurrent_dropout}
    # all but the last LSTM layer must return sequences so the layers can be stacked
    lstm_layers = [LSTM(lstm_size, return_sequences=(i < args.lstm_depth - 1), **lstm_args)
                   for i in range(args.lstm_depth)]
    if args.bidirectional:
        lstm_layers = [Bidirectional(l) for l in lstm_layers]

    # construct model
    model = Sequential()
    model.add(embedding)
    for layer in lstm_layers:
        model.add(layer)
    model.add(Dense(units=len(topics), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    # ---------
    # Fit model
    # ---------
    # create callbacks
    callbacks = [f1_metric]
    if not args.no_early_stopping:
        callbacks += [EarlyStopping(monitor='val_f1', mode='max', patience=2)]

    # fit
    section_break("Training Model")
    history = model.fit(X_train_ids, Y_train, epochs=args.epochs,
                        batch_size=args.batch_size, verbose=1,
                        validation_data=(X_val_ids, Y_val),
                        callbacks=callbacks)

    # --------------
    # Evaluate model
    # --------------
    section_break("Evaluating Model")
    threshold = 0.5
    Y_train_pred = model.predict(X_train_ids) > threshold
    Y_val_pred = model.predict(X_val_ids) > threshold
    Y_test_pred = model.predict(X_test_ids) > threshold
    ave = 'micro'
    scores_train = precision_recall_fscore_support(Y_train, Y_train_pred, average=ave)
    scores_val   = precision_recall_fscore_support(Y_val, Y_val_pred, average=ave)
    scores_test  = precision_recall_fscore_support(Y_test, Y_test_pred, average=ave)

    title_break("Performance")
    log().info("{:<10s}{:>15s}{:>15s}{:>15s}".format("Sample", "Precision", "Recall", "F1"))
    log().info("-"*55)
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Train", *scores_train[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Val",   *scores_val[:3]))
    log().info("{:<10s}{:15.3f}{:15.3f}{:15.3f}".format("Test",  *scores_test[:3]))
    log().info("")

    # timer
    dt = stop_timer(ti)

    # --------
    # Finalize
    # --------
    section_break("Finalizing")
    log().info("Saving model to {}".format(args.output))
    model.save(args.output)
    auxname = os.path.splitext(args.output)[0] + '.pickle'
    log().info("Saving aux info to {}".format(auxname))
    with open(auxname, 'wb') as f:
        data = {
            'topics': topics,
            'id_to_word': id_to_word,
            'history': history.history,
            'scores': {
                'train': scores_train,
                'val': scores_val,
                'test': scores_test,
            },
            'time': dt.total_seconds(),
            'args': vars(args),
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
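
For later inference, the saved network and the auxiliary pickle can be reloaded; a minimal sketch, assuming a Keras installation that provides keras.models.load_model and using placeholder filenames (args.output is not shown in this excerpt):

import pickle
from keras.models import load_model

# reload the trained network (no custom objects needed: the model was compiled
# with the standard binary_crossentropy loss and accuracy metric)
model = load_model('reuters_rnn.h5')          # placeholder for args.output

# the aux pickle carries what is needed to reproduce the preprocessing
with open('reuters_rnn.pickle', 'rb') as f:   # same stem, '.pickle' extension
    aux = pickle.load(f)

topics = aux['topics']            # the selected topic names
id_to_word = aux['id_to_word']    # vocabulary used to encode the documents

# new documents must be mapped to ids with the same vocabulary and padded or
# truncated to the same max_doc_length before calling model.predict(...)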