Example #1
def train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, verbose=1):
    # Figure out whether we're predicting categories or keywords
    if NO_OF_LABELS == 14:
        scaler_path = CATEGORY_SCALER
        w2v_path = CATEGORY_WORD2VEC
    else:
        scaler_path = KEYWORD_SCALER
        w2v_path = KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )

    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, model
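A hypothetical invocation of the function above; the directory paths are placeholders and NB_EPOCHS/BATCH_SIZE are assumed module-level constants, as in the snippet:

history, model = train(
    'data/train',          # placeholder path to the training documents
    test_dir='data/test',  # placeholder path to the held-out set
    nn='berger_cnn',
    nb_epochs=20,          # overrides the NB_EPOCHS default
    batch_size=64,
)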
Example #2
def build_model_for_corpus(corpus):
    """ Build an appropriate Keras NN model depending on the corpus """
    if corpus == 'keywords':
        keras_model = cnn(embedding_size=100, output_length=10000)
    elif corpus == 'categories':
        keras_model = cnn(embedding_size=100, output_length=14)
    elif corpus == 'experiments':
        keras_model = cnn(embedding_size=100, output_length=500)
    else:
        raise ValueError('The corpus is not valid')

    model_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(model_path)

    w2v_model = Word2Vec.load(WORD2VEC_PATH)
    scaler = load_from_disk(SCALER_PATH)
    labels = get_labels(keras_model.output_shape[1])

    model = MagpieModel(
        keras_model=keras_model,
        word2vec_model=w2v_model,
        scaler=scaler,
        labels=labels,
    )

    return model
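A sketch of using the built model, assuming MagpieModel exposes a predict_from_text method returning (label, confidence) pairs sorted by confidence, as the Magpie library's model class does; if your version differs, the method name is the assumption to check:

# Build the 14-label category model and rank labels for a text.
model = build_model_for_corpus('categories')
for label, confidence in model.predict_from_text('some input text')[:5]:
    print(label, confidence)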
Example #3
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS,
          verbose=1):
    model = MagpieModel(
        word2vec_model=Word2Vec.load(WORD2VEC_PATH),
        scaler=load_from_disk(SCALER_PATH),
    )

    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.train(
        train_dir,
        get_labels(no_of_labels),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history, model.keras_model, persist=persist)

    return history, model
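This variant of the train function from Example #1 loads the shared WORD2VEC_PATH/SCALER_PATH artefacts instead of switching on NO_OF_LABELS, takes the label count as a parameter, and forwards a persist flag together with the Keras model to finish_logging, presumably so the trained weights can be saved. A hedged call:

# Sketch: train with a fixed label count and persist the Keras model.
history, model = train('data/train', nn='cnn', persist=True, no_of_labels=14)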
Example #4
def build_model_for_corpus(corpus):
    """ Build an appropriate Keras NN model depending on the corpus """
    if corpus == 'keywords':
        keras_model = berger_cnn(embedding_size=100, output_length=1000)
        no_of_labels = 1000
    elif corpus == 'categories':
        keras_model = berger_cnn(embedding_size=50, output_length=14)
        no_of_labels = 14
    else:
        raise ValueError('The corpus is not valid')

    model_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(model_path)

    w2v_path = os.path.join(DATA_DIR, corpus, 'word2vec.pickle')
    w2v_model = Word2Vec.load(w2v_path)

    scaler_path = os.path.join(DATA_DIR, corpus, 'scaler.pickle')
    scaler = load_from_disk(scaler_path)

    labels = get_labels(no_of_labels)

    model = MagpieModel(
        keras_model=keras_model,
        word2vec_model=w2v_model,
        scaler=scaler,
        labels=labels,
    )

    return model
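Unlike Example #2, this variant loads the word2vec model and scaler from per-corpus paths rather than the shared WORD2VEC_PATH/SCALER_PATH, so it expects three pickled artefacts per corpus under DATA_DIR. A small sketch of that assumption (DATA_DIR is taken from the snippet's module scope):

import os

# Sketch: the per-corpus artefacts this variant expects on disk.
corpus = 'categories'
for artefact in ['model.pickle', 'word2vec.pickle', 'scaler.pickle']:
    path = os.path.join(DATA_DIR, corpus, artefact)
    assert os.path.exists(path), 'missing artefact: ' + path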
Example #5
def test(
    testset_path,
    ontology=ONTOLOGY_PATH,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path.
    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology
    :param model: path where the model is pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: dict mapping each metric name to its mean value over the test set
    """
    if isinstance(model, str):
        model = load_from_disk(model)

    if isinstance(ontology, str):
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    keywords = get_keywords()
    keyword_indices = {kw: i for i, kw in enumerate(keywords)}

    all_metrics = calculate_basic_metrics([range(5)]).keys()
    metrics_agg = {m: [] for m in all_metrics}

    for doc in get_documents(testset_path, as_generator=True):
        x, answers, kw_vector = build_test_matrices(
            [doc],
            model,
            testset_path,
            ontology,
        )

        y_true = build_y_true(answers, keyword_indices, doc.doc_id)

        # Predict
        # .values replaces the deprecated DataFrame.as_matrix()
        ranking = model.scale_and_predict(x.values)

        y_pred = y_true[0][ranking[::-1]]

        metrics = calculate_basic_metrics([y_pred])

        for k, v in metrics.items():
            metrics_agg[k].append(v)

    return {k: np.mean(v) for k, v in metrics_agg.items()}
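A hedged invocation; the paths are placeholders, and the returned dictionary maps metric names to their means over the test documents:

# Sketch with placeholder paths.
metrics = test('data/test', model='data/model.pickle')
for name, value in sorted(metrics.items()):
    print('{}: {:.3f}'.format(name, value))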
Example #6
def get_data_for_model(train_dir,
                       labels,
                       test_dir=None,
                       nn_model=None,
                       as_generator=False,
                       batch_size=BATCH_SIZE,
                       word2vec_model=None,
                       scaler=None):
    """
    Get data in the form of matrices or generators for both train and test sets.
    :param train_dir: directory with train files
    :param labels: an iterable of predefined labels (controlled vocabulary)
    :param test_dir: directory with test files
    :param nn_model: Keras model of the NN
    :param as_generator: flag whether to return a generator or in-memory matrix
    :param batch_size: integer, size of the batch
    :param word2vec_model: trained w2v gensim model
    :param scaler: scaling object for X matrix normalisation e.g. StandardScaler

    :return: tuple with 2 elements for train and test data. Each element can be
    either a pair of matrices (X, y) or their generator
    """

    kwargs = dict(
        label_indices={lab: i for i, lab in enumerate(labels)},
        word2vec_model=word2vec_model or Word2Vec.load(WORD2VEC_MODELPATH),
        scaler=scaler or load_from_disk(SCALER_PATH),
        nn_model=nn_model,
    )

    if as_generator:
        filename_it = FilenameIterator(train_dir, batch_size)
        train_data = iterate_over_batches(filename_it, **kwargs)
    else:
        # strip the 4-character extension (e.g. '.txt') to get document ids
        train_files = {filename[:-4] for filename in os.listdir(train_dir)}
        train_data = build_x_and_y(train_files, train_dir, **kwargs)

    test_data = None
    if test_dir:
        test_files = {filename[:-4] for filename in os.listdir(test_dir)}
        test_data = build_x_and_y(test_files, test_dir, **kwargs)

    return train_data, test_data
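A sketch of both call styles. With as_generator=True the train element is presumably a batch generator suitable for Keras' fit_generator-style APIs, while the test element (when test_dir is given) is always built in memory; depending on build_x_and_y, a concrete nn_model may be required, and it is omitted here for brevity:

# In-memory matrices for train and test:
(train_x, train_y), test_data = get_data_for_model(
    'data/train', get_labels(14), test_dir='data/test',
)

# Streaming batches instead; train_gen presumably yields (X, y) batches:
train_gen, _ = get_data_for_model(
    'data/train', get_labels(14), as_generator=True, batch_size=64,
)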
Example #7
def batch_train(train_dir,
                test_dir=None,
                nn='berger_cnn',
                nb_epochs=NB_EPOCHS,
                batch_size=BATCH_SIZE,
                verbose=1):

    # Figure out whether we're predicting categories or keywords
    if NO_OF_LABELS == 14:
        scaler_path = CATEGORY_SCALER
        w2v_path = CATEGORY_WORD2VEC
    else:
        scaler_path = KEYWORD_SCALER
        w2v_path = KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )

    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.batch_train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, model
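This mirrors the train function from Example #1; the only substantive difference is that it calls model.batch_train, which presumably streams the training data from disk in batches rather than loading it all into memory. A hedged call with placeholder paths:

# Sketch: identical interface to train(), but batched from disk.
history, model = batch_train('data/train', test_dir='data/test', batch_size=64)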
Example #8
    def load_word2vec_model(self, filepath):
        """ Load the word2vec model from a file """
        self.word2vec_model = load_from_disk(filepath)
Example #9
    def load_scaler(self, filepath):
        """ Load the scaler object from a file """
        self.scaler = load_from_disk(filepath)
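These two setters simply deserialize previously saved objects onto an existing instance. A sketch of restoring a model's components, assuming MagpieModel can be constructed without arguments and using placeholder paths:

# Sketch: restore a model's word2vec model and scaler from disk.
model = MagpieModel()
model.load_word2vec_model('data/word2vec.pickle')
model.load_scaler('data/scaler.pickle')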