Example #1
    def __init__(self, data, maximum_tokens=None, batch_size=2048):
        """Create a generator of embedded data batches.

        The data for each batch will be an array of size (batch size, maximum tokens, embeddings). If maximum tokens is
        not specified to the constructor, the number of tokens in the longest text in all the text pairs is used.

        :param data: data frame with text1, text2, and optional label columns
        :type data: pandas.DataFrame
        :param maximum_tokens: maximum number of tokens in an embedding
        :type maximum_tokens: int or None
        :param batch_size: number of samples per batch
        :type batch_size: int
        """
        self.data = data
        self.batch_size = batch_size
        self.batches_per_epoch = math.ceil(len(self) / self.batch_size)
        self._labeled = label in self.data.columns
        if self._labeled:
            self.data.loc[:, label] = self.data.loc[:, label].astype("category")
        if maximum_tokens is None:
            m1 = max(
                len(document) for document in parse_texts(self.data[text_1]))
            m2 = max(
                len(document) for document in parse_texts(self.data[text_2]))
            maximum_tokens = max(m1, m2)
        self.maximum_tokens = maximum_tokens
        logger.info(self)
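
A minimal usage sketch, assuming the module-level column-name constants text_1, text_2, and label referenced above resolve to "text1", "text2", and "label", and that calling the generator (as fit does below) yields embedded batch tuples:

import pandas as pd

pairs = pd.DataFrame({
    "text1": ["How old are you?", "What is the fare?"],
    "text2": ["What is your age?", "How much is a ticket?"],
    "label": ["duplicate", "different"],
})
generator = TextPairEmbeddingGenerator(pairs, maximum_tokens=40, batch_size=32)
batches = generator()  # called like a function, as fit_generator expects below
embeddings, labels = next(batches)  # assumed (inputs, targets) batch tuples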
Example #2
def main():
    parser = create_argument_parser()
    args = parser.parse_args()
    configure_logger(args.log.upper(),
                     "%(asctime)-15s %(levelname)-8s %(message)s")
    logger.info("Start")
    args.func(args)
    logger.info("Done")
Example #3
    def fit(self,
            training,
            epochs=1,
            validation_data=None,
            model_directory=None):
        """
        Fit the model to the training data.

        :param training: training data generator
        :type training: TextPairEmbeddingGenerator
        :param epochs: number of epochs to train
        :type epochs: int
        :param validation_data: data for validation at the end of each epoch
        :type validation_data: pandas.DataFrame or None
        :param model_directory: directory in which to serialize the model
        :type model_directory: str or None
        :return: training history
        :rtype: keras.callbacks.History
        """
        logger.info("Train model: %d samples, %d epochs, batch size %d" %
                    (len(training), epochs, training.batch_size))
        if validation_data is not None:
            g = TextPairEmbeddingGenerator(validation_data,
                                           maximum_tokens=self.maximum_tokens,
                                           batch_size=training.batch_size)
            validation_embeddings, validation_steps = g(), g.batches_per_epoch
        else:
            validation_embeddings = validation_steps = None
        verbose = {
            logging.INFO: 2,
            logging.DEBUG: 1
        }.get(logger.getEffectiveLevel(), 0)
        if model_directory is not None:
            if validation_data is not None:
                monitor = "val_loss"
            else:
                monitor = "loss"
            callbacks = [
                ModelCheckpoint(filepath=self._model_filename(model_directory),
                                monitor=monitor,
                                save_best_only=True,
                                verbose=verbose)
            ]
        else:
            callbacks = None
        logger.info("Start training")
        return self.model.fit_generator(
            generator=training(),
            steps_per_epoch=training.batches_per_epoch,
            epochs=epochs,
            validation_data=validation_embeddings,
            validation_steps=validation_steps,
            callbacks=callbacks,
            verbose=verbose)
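
A sketch of a typical call, where training_frame and validation_frame are data frames loaded with data_file (the names and epoch count are illustrative):

training = TextPairEmbeddingGenerator(training_frame, batch_size=1024)
history = model.fit(training,
                    epochs=10,
                    validation_data=validation_frame,
                    model_directory="model")
print(min(history.history["val_loss"]))  # best validation loss across epochs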
Example #4
def score(args):
    from bisemantic.classifier import TextPairClassifier

    test = data_file(args.test, args.n, args.index_name, args.text_1_name,
                     args.text_2_name, args.label_name, args.invalid_labels,
                     not args.not_comma_delimited)
    logger.info("Score predictions for %d pairs" % len(test))
    model = TextPairClassifier.load_from_model_directory(
        args.model_directory_name)
    scores = model.score(test, batch_size=args.batch_size)
    print(", ".join("%s=%0.5f" % s for s in scores))
Example #5
    @classmethod
    def _train(cls, epochs, model, model_directory, training, validation_data):
        logger.info(repr(model))
        start = time.time()
        history = model.fit(training,
                            epochs=epochs,
                            validation_data=validation_data,
                            model_directory=model_directory)
        training_time = str(timedelta(seconds=time.time() - start))
        training_history = cls._training_history(model_directory,
                                                 training_time, training,
                                                 history.history)
        return model, training_history
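
For reference, wrapping the elapsed seconds in timedelta is what turns the timing into a readable H:MM:SS string:

from datetime import timedelta

print(str(timedelta(seconds=3723.5)))  # -> 1:02:03.500000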
Example #6
def predict(args):
    from bisemantic.classifier import TextPairClassifier

    test = data_file(args.test, args.n, args.index_name, args.text_1_name,
                     args.text_2_name, args.label_name, args.invalid_labels,
                     not args.not_comma_delimited)
    logger.info("Predict labels for %d pairs" % len(test))
    model = TextPairClassifier.load_from_model_directory(
        args.model_directory_name)
    class_names = TextPairClassifier.class_names_from_model_directory(
        args.model_directory_name)
    predictions = model.predict(test,
                                batch_size=args.batch_size,
                                class_names=class_names)
    print(predictions.to_csv())
Example #7
def data_file(filename,
              n=None,
              index=None,
              text_1_name=None,
              text_2_name=None,
              label_name=None,
              invalid_labels=None,
              comma_delimited=True):
    """
    Load a data file and put it in the format expected by the classifier.

    A data file is a CSV file. Any rows with null values in the columns of interest or with optional invalid label
    values are dropped. The file may optionally be clipped to a specified length.

    Columns in the input data frame are renamed to the ones bisemantic expects, and unused columns are dropped. If an
    argument is not None, the corresponding column must already be present in the raw data.

    :param filename: name of data file
    :type filename: str
    :param n: number of samples to limit to or None to use the entire file
    :type n: int or None
    :param index: optional name of the index column
    :type index: str or None
    :param text_1_name: name of column in data that should be mapped to text1
    :type text_1_name: str or None
    :param text_2_name: name of column in data that should be mapped to text2
    :type text_2_name: str or None
    :param label_name: name of column in data that should be mapped to label
    :type label_name: str or None
    :param invalid_labels: disallowed label values
    :type invalid_labels: list of str
    :param comma_delimited: is the data file comma-delimited?
    :type comma_delimited: bool
    :return: data frame of the desired size containing just the needed columns
    :rtype: pandas.DataFrame
    """
    data = load_data_file(filename, index, comma_delimited).head(n)
    data = fix_columns(data, text_1_name, text_2_name, label_name)
    m = len(data)
    data = data.dropna()
    if invalid_labels:
        data = data[~data[label].isin(invalid_labels)]
    n = len(data)
    if m != n:
        logger.info("Dropped %d samples with null values from %s" %
                    (m - n, filename))
    return data
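
A usage sketch, assuming a Quora-question-pairs-style CSV (the file and raw column names are illustrative; fix_columns renames them to the text1/text2/label columns the classifier expects):

pairs = data_file("train.csv",
                  n=10000,
                  index="id",
                  text_1_name="question1",
                  text_2_name="question2",
                  label_name="is_duplicate")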
Example #8
def cross_validation_partitions(data, fraction, k):
    """
    Partition data into cross-validation sets.

    :param data: data set
    :type data: pandas.DataFrame
    :param fraction: fraction of the data to use for training in each split
    :type fraction: float
    :param k: number of cross-validation splits
    :type k: int
    :return: tuples of (training data, validation data) for each split
    :rtype: list(tuple(pandas.DataFrame, pandas.DataFrame))
    """
    logger.info("Cross validation %0.2f, %d partitions" % (fraction, k))
    n = int(fraction * len(data))
    partitions = []
    for i in range(k):
        data = data.sample(frac=1)
        train = data[:n]
        validate = data[n:]
        partitions.append((train, validate))
    return partitions
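
Because every split reshuffles the entire frame before cutting it at n, this is repeated random sub-sampling (Monte Carlo cross-validation) rather than disjoint k-fold partitioning: validation sets can overlap across the k splits. A usage sketch:

for i, (train, validate) in enumerate(cross_validation_partitions(pairs, 0.8, 5)):
    logger.info("Split %d: %d training and %d validation samples" %
                (i + 1, len(train), len(validate)))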
Example #9
def _load_text_parser():
    global text_parser
    if text_parser is None:
        text_parser = spacy.load("en", tagger=None, parser=None, entity=None)
        logger.info(_text_parser_description())
    return text_parser
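
This lazy-initialization pattern assumes a module-level text_parser = None that acts as a process-wide cache, so the slow spacy.load call runs at most once; the keyword arguments are the old spaCy 1.x way of skipping the tagger, parser, and entity recognizer when only token vectors are needed. A sketch of how a caller such as parse_texts might use it (the pipe call is an assumption):

text_parser = None  # module-level cache populated on first use

def parse_texts(texts):
    parser = _load_text_parser()  # loads the spaCy model once per process
    return list(parser.pipe(texts))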