Esempio n. 1
0
    def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
              nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE,
              nb_epochs=NB_EPOCHS, verbose=1):
        """
        Build a new Keras model and fit it on data loaded fully into memory.

        The method stores `vocabulary` in `self.labels` and replaces
        `self.keras_model` with a freshly created network before fitting.

        :param train_dir: directory with data files. Text files should end with
        '.txt' and corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels; its
        length is used as the output length of the network
        :param test_dir: directory with test files. They will be used to evaluate
        the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks;
        a falsy value is replaced by an empty list
        :param nn_model: string defining the NN architecture e.g. 'crnn'
        :param batch_size: size of one batch
        :param nb_epochs: number of epochs to train
        :param verbose: 0, 1 or 2. As in Keras.

        :return: History object, or None when the word2vec model or the scaler
        is missing (a message is printed instead of raising)
        """
        # Missing prerequisites are reported on stdout and the call returns
        # None rather than raising.
        if not self.word2vec_model:
            print('word2vec model is not trained. Run train_word2vec() first.')
            return None

        if not self.scaler:
            print('The scaler is not trained. Run fit_scaler() first.')
            return None

        if self.keras_model:
            print('WARNING! Overwriting already trained Keras model.')

        self.labels = vocabulary
        self.keras_model = get_nn_model(nn_model, output_length=len(vocabulary))

        training_pair, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=False,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )
        x_train, y_train = training_pair

        fit_options = dict(
            batch_size=batch_size,
            nb_epoch=nb_epochs,
            show_accuracy=True,
            validation_data=test_data,
            callbacks=callbacks or [],
            verbose=verbose,
        )
        return self.keras_model.fit(x_train, y_train, **fit_options)
Esempio n. 2
0
def extract(doc, model_path, **kwargs):
    """
    Extract keywords from a document using weights stored on disk.

    The architecture name is derived from the file name of `model_path`:
    the part of the base name before the first dot is handed to
    `get_nn_model()`, and the weights are then loaded from `model_path`.

    :param doc: Document object
    :param model_path: unicode with the trained model path
    :param kwargs: additional keyword arguments forwarded unchanged to
    `nn_extract()`

    :return: set of predicted keywords (the result of `nn_extract()`)
    """
    file_name = os.path.basename(model_path)
    architecture = file_name.partition('.')[0]

    network = get_nn_model(architecture)
    network.load_weights(model_path)

    return nn_extract(doc, network, **kwargs)
Esempio n. 3
0
def batch_train(nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, nn='berger_cnn',
                nb_worker=1, verbose=1):
    """
    Train a NN model out-of-core, feeding training data from a generator.

    Training data is read from HEP_TRAIN_PATH and validation data from
    HEP_TEST_PATH.

    :param nb_epochs: number of epochs
    :param batch_size: size of one batch
    :param nn: nn type, for supported ones look at `get_nn_model()`
    :param nb_worker: number of workers to read the data
    :param verbose: verbosity flag

    :return: tuple containing a history object and a trained keras model
    """
    network = get_nn_model(nn)
    batches, validation = get_data_for_model(
        network,
        as_generator=True,
        batch_size=batch_size,
        train_dir=HEP_TRAIN_PATH,
        test_dir=HEP_TEST_PATH,
    )
    x_test, y_test = validation

    # Callbacks: custom logging plus a checkpoint written into the log dir
    logger = CustomLogger(x_test, y_test, nn)
    checkpoint_path = os.path.join(logger.log_dir, 'keras_model')
    model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

    # File names with their last four characters removed; files that differ
    # only in that suffix are counted once.
    stems = set()
    for entry in os.listdir(HEP_TRAIN_PATH):
        stems.add(entry[:-4])

    history = network.fit_generator(
        batches,
        len(stems),
        nb_epochs,
        show_accuracy=True,
        validation_data=(x_test, y_test),
        callbacks=[logger, model_checkpoint],
        nb_worker=nb_worker,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, network
Esempio n. 4
0
def train(nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, nn='berger_cnn', verbose=1):
    """
    Train a NN model with all training and test data held in memory.

    Training data is read from HEP_TRAIN_PATH and validation data from
    HEP_TEST_PATH.

    :param nb_epochs: number of epochs
    :param batch_size: size of one batch
    :param nn: nn type, for supported ones look at `get_nn_model()`
    :param verbose: verbosity flag

    :return: tuple containing a history object and a trained keras model
    """
    network = get_nn_model(nn)
    training_pair, validation = get_data_for_model(
        network,
        as_generator=False,
        train_dir=HEP_TRAIN_PATH,
        test_dir=HEP_TEST_PATH,
    )
    x_train, y_train = training_pair
    x_test, y_test = validation

    # Callbacks: custom logging plus a checkpoint written into the log dir
    logger = CustomLogger(x_test, y_test, nn)
    checkpoint_path = os.path.join(logger.log_dir, 'keras_model')
    model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

    fit_options = dict(
        batch_size=batch_size,
        nb_epoch=nb_epochs,
        show_accuracy=True,
        validation_data=(x_test, y_test),
        callbacks=[logger, model_checkpoint],
        verbose=verbose,
    )
    history = network.fit(x_train, y_train, **fit_options)

    finish_logging(logger, history)

    return history, network
Esempio n. 5
0
    def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
              nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0,
              epochs=EPOCHS, verbose=1, logdir=None, optimizer='Adam'):
        """
        Build a new Keras model and fit it on data loaded fully into memory.

        The method stores `vocabulary` in `self.labels` and replaces
        `self.keras_model` with a freshly created network before fitting.

        :param train_dir: directory with data files. Text files should end with
        '.txt' and corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
        the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture e.g. 'crnn'
        :param batch_size: size of one batch
        :param test_ratio: the ratio of samples that will be withheld from training
        and used for testing. This can be overridden by test_dir.
        :param epochs: number of epochs to train
        :param verbose: 0, 1 or 2. As in Keras.
        :param logdir: path of a file; when truthy, the string form of the
        training history is written to it (the file is overwritten)
        :param optimizer: optimizer forwarded to `get_nn_model()`

        :return: History object
        :raises RuntimeError: if the word2vec model or the scaler is missing
        :raises ValueError: if `train_dir`, or a given `test_dir`, is not an
        existing directory
        """
        if not self.word2vec_model:
            raise RuntimeError(
                'word2vec model is not trained. '
                'Run train_word2vec() first.'
            )

        if not self.scaler:
            raise RuntimeError(
                'The scaler is not trained. '
                'Run fit_scaler() first.'
            )

        if not os.path.isdir(train_dir):
            raise ValueError(
                'The training directory ' + train_dir + ' does not exist'
            )

        if test_dir and not os.path.isdir(test_dir):
            raise ValueError(
                'The test directory ' + test_dir + ' does not exist'
            )

        if self.keras_model:
            print('WARNING! Overwriting already trained Keras model.',
                  file=sys.stderr)

        self.labels = vocabulary
        # The optimizer is exposed as an argument of train() and passed
        # through to the model factory.
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary),
            optimizer=optimizer,
        )

        training_pair, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=False,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )
        x_train, y_train = training_pair

        fit_options = dict(
            batch_size=batch_size,
            epochs=epochs,
            validation_data=test_data,
            validation_split=test_ratio,
            callbacks=callbacks or [],
            verbose=verbose,
        )
        history = self.keras_model.fit(x_train, y_train, **fit_options)

        # logdir is the location of the file the training log is saved to
        if logdir:
            with open(logdir, 'w') as log_file:
                log_file.write(str(history.history))

        return history
Esempio n. 6
0
    def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
                    nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE,
                    epochs=EPOCHS, verbose=1):
        """
        Build a new Keras model and fit it from a batch generator.

        The method stores `vocabulary` in `self.labels` and replaces
        `self.keras_model` with a freshly created network before fitting.
        The number of steps per epoch is the number of distinct file names in
        `train_dir` (ignoring their last four characters) divided by
        `batch_size`, rounded up.

        :param train_dir: directory with data files. Text files should end with
        '.txt' and corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
        the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture e.g. 'crnn'
        :param batch_size: size of one batch
        :param epochs: number of epochs to train
        :param verbose: 0, 1 or 2. As in Keras.

        :return: History object
        :raises RuntimeError: if the word2vec model or the scaler is missing
        :raises ValueError: if `train_dir`, or a given `test_dir`, is not an
        existing directory
        """
        if not self.word2vec_model:
            raise RuntimeError(
                'word2vec model is not trained. '
                'Run train_word2vec() first.'
            )

        if not self.scaler:
            raise RuntimeError(
                'The scaler is not trained. '
                'Run fit_scaler() first.'
            )

        if not os.path.isdir(train_dir):
            raise ValueError(
                'The training directory ' + train_dir + ' does not exist'
            )

        if test_dir and not os.path.isdir(test_dir):
            raise ValueError(
                'The test directory ' + test_dir + ' does not exist'
            )

        if self.keras_model:
            print('WARNING! Overwriting already trained Keras model.',
                  file=sys.stderr)

        self.labels = vocabulary
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary),
        )

        batches, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=True,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )

        # File names with their last four characters removed; files that
        # differ only in that suffix are counted once.
        stems = set()
        for entry in os.listdir(train_dir):
            stems.add(entry[:-4])
        steps_per_epoch = math.ceil(len(stems) / batch_size)

        generator_options = dict(
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=test_data,
            callbacks=callbacks or [],
            verbose=verbose,
        )
        return self.keras_model.fit_generator(batches, **generator_options)
Esempio n. 7
0
    def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
              nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0,
              epochs=EPOCHS, verbose=1):
        """
        Build a new Keras model and fit it on data loaded fully into memory.

        The method stores `vocabulary` in `self.labels` and replaces
        `self.keras_model` with a freshly created network before fitting.

        :param train_dir: directory with data files. Text files should end with
        '.txt' and corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
        the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture e.g. 'crnn'
        :param batch_size: size of one batch
        :param test_ratio: the ratio of samples that will be withheld from training
        and used for testing. This can be overridden by test_dir.
        :param epochs: number of epochs to train
        :param verbose: 0, 1 or 2. As in Keras.

        :return: History object
        :raises RuntimeError: if the word2vec model or the scaler is missing
        :raises ValueError: if `train_dir`, or a given `test_dir`, is not an
        existing directory
        """
        if not self.word2vec_model:
            raise RuntimeError(
                'word2vec model is not trained. '
                'Run train_word2vec() first.'
            )

        if not self.scaler:
            raise RuntimeError(
                'The scaler is not trained. '
                'Run fit_scaler() first.'
            )

        if not os.path.isdir(train_dir):
            raise ValueError(
                'The training directory ' + train_dir + ' does not exist'
            )

        if test_dir and not os.path.isdir(test_dir):
            raise ValueError(
                'The test directory ' + test_dir + ' does not exist'
            )

        if self.keras_model:
            print('WARNING! Overwriting already trained Keras model.',
                  file=sys.stderr)

        self.labels = vocabulary
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary),
        )

        training_pair, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=False,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )
        x_train, y_train = training_pair

        fit_options = dict(
            batch_size=batch_size,
            epochs=epochs,
            validation_data=test_data,
            validation_split=test_ratio,
            callbacks=callbacks or [],
            verbose=verbose,
        )
        return self.keras_model.fit(x_train, y_train, **fit_options)
Esempio n. 8
0
    def batch_train(self,
                    train_dir,
                    vocabulary,
                    test_dir=None,
                    callbacks=None,
                    nn_model=NN_ARCHITECTURE,
                    batch_size=BATCH_SIZE,
                    nb_epochs=NB_EPOCHS,
                    verbose=1):
        """
        Build a new Keras model and fit it from a batch generator.

        The method stores `vocabulary` in `self.labels` and replaces
        `self.keras_model` with a freshly created network before fitting.
        The second positional argument given to `fit_generator` is the number
        of distinct file names in `train_dir`, ignoring their last four
        characters.

        :param train_dir: directory with data files. Text files should end with
        '.txt' and corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
        the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture e.g. 'crnn'
        :param batch_size: size of one batch
        :param nb_epochs: number of epochs to train
        :param verbose: 0, 1 or 2. As in Keras.

        :return: History object, or None when the word2vec model or the scaler
        is missing (a message is printed instead of raising)
        """
        # Missing prerequisites are reported on stdout and the call returns
        # None rather than raising.
        if not self.word2vec_model:
            print('word2vec model is not trained. Run train_word2vec() first.')
            return None

        if not self.scaler:
            print('The scaler is not trained. Run fit_scaler() first.')
            return None

        if self.keras_model:
            print('WARNING! Overwriting already trained Keras model.')

        self.labels = vocabulary
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary),
        )

        batches, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=True,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )

        # Files that differ only in their last four characters count once.
        stems = set()
        for entry in os.listdir(train_dir):
            stems.add(entry[:-4])

        return self.keras_model.fit_generator(
            batches,
            len(stems),
            nb_epochs,
            validation_data=test_data,
            callbacks=callbacks or [],
            verbose=verbose,
        )
Esempio n. 9
0
  def train(
          self,
          train_data: DataList,
          test_data: DataList,
          labels,
          callbacks=None,
          nn_model=NN_ARCHITECTURE,
          batch_size=BATCH_SIZE,
          test_ratio=0.0,
          epochs=EPOCHS,
          verbose=1):
    """
    Build a new Keras model and fit it on the given in-memory examples.

    The method stores `labels` in `self.labels`, records the texts of the
    training examples in `self.training_set`, and replaces
    `self.keras_model` with a freshly created network before fitting.

    :param train_data: iterable of training examples; each example must
    support `example['text']`
    :param test_data: examples used as validation data after being converted
    by `get_data_for_model()`
    :param labels: collection of all considered labels; its length is used
    as the output length of the network
    :param callbacks: objects passed to the Keras fit function as callbacks;
    a falsy value is replaced by an empty list
    :param nn_model: string defining the NN architecture e.g. 'crnn'. The
    value 'cnn_regression' switches data preparation to regression mode.
    :param batch_size: size of one batch
    :param test_ratio: accepted for interface compatibility; this method
    does not use it (validation always comes from `test_data`)
    :param epochs: number of epochs to train
    :param verbose: 0, 1 or 2. As in Keras.

    :return: History object
    :raises RuntimeError: if the word2vec model or the scaler is missing
    """

    # NOTE(review): presumably configures TensorFlow GPU memory growth;
    # the helper is defined elsewhere - confirm.
    set_tf_growth()

    if not self.word2vec_model:
      raise RuntimeError('word2vec model is not trained. ' +
                         'Run train_word2vec() first.')

    if not self.scaler:
      raise RuntimeError('The scaler is not trained. ' +
                         'Run fit_scaler() first.')

    if self.keras_model:
      print('WARNING! Overwriting already trained Keras model.',
            file=sys.stderr)

    self.labels = labels
    self.keras_model = get_nn_model(
        nn_model,
        embedding=self.word2vec_model.vector_size,
        output_length=len(self.labels)
    )
    regression = nn_model == 'cnn_regression'  # TODO make this more general
    # Remember which texts were seen during training.
    self.training_set = {example['text'] for example in train_data}
    (x_train, y_train), test_data_matrix = get_data_for_model(
        train_data,
        test_data,
        self.labels,
        nn_model=self.keras_model,
        as_generator=False,
        batch_size=batch_size,
        word2vec_model=self.word2vec_model,
        scaler=self.scaler,
        regression=regression,
    )

    return self.keras_model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_data_matrix,
        # TODO make validation data optional for speedup
        callbacks=callbacks or [],
        verbose=verbose,
    )