Example #1
    def load_weights(self):
        """
        Load and initialise weights
        """

        if not self.model:
            # Nothing to load weights into: build_model must be called before
            # load_weights.
            logger.error(
                "No model. You must build the model first with build_model")
            raise ValueError(
                "No model. You must build the model first with build_model")

        logger.debug("Loading weights from %s", self.weights_path)

        with h5py.File(self.weights_path, mode='r') as f:
            saving.load_weights_from_hdf5_group(f['model_weights'],
                                                self.model.layers)
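
A minimal usage sketch for the method above. It assumes load_weights belongs to the DeepReferenceParser class used in the other examples and that the constructor sets weights_path; the paths and keyword values below are illustrative, not taken from the package.

# Sketch only: argument values are assumptions.
drp = DeepReferenceParser(output_path="data/model_output")
drp.load_data("data/model_output")   # restore the mapping dicts saved at training time
drp.build_model(
    output="crf",                    # assumed value
    word_embeddings="embeddings/glove.txt",
    pretrained_embedding=True,
    dropout=0.5,
    lstm_hidden=400,
    word_embedding_size=300,
    char_embedding_size=100,
)
drp.load_weights()                   # only works once the model has been built
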
Example #2
def train(config_file):

    # Load variables from config files. Config files are used instead of ENV
    # vars due to the relatively large number of hyperparameters, and the need
    # to load these configs in both the train and predict modules (a sample
    # config is sketched after this example).

    cfg = get_config(config_file)

    # Data config

    POLICY_TRAIN = cfg["data"]["policy_train"]
    POLICY_TEST = cfg["data"]["policy_test"]
    POLICY_VALID = cfg["data"]["policy_valid"]

    # Build config

    OUTPUT_PATH = cfg["build"]["output_path"]
    S3_SLUG = cfg["data"]["s3_slug"]

    # Check on the word embedding and download it if it does not exist

    WORD_EMBEDDINGS = cfg["build"]["word_embeddings"]

    with msg.loading(
            f"Could not find {WORD_EMBEDDINGS} locally, downloading..."):
        try:
            download_model_artefact(WORD_EMBEDDINGS, S3_SLUG)
            msg.good(f"Found {WORD_EMBEDDINGS}")
        except Exception:
            msg.fail(f"Could not download {WORD_EMBEDDINGS}")
            logger.exception("Could not download %s", WORD_EMBEDDINGS)

    OUTPUT = cfg["build"]["output"]
    PRETRAINED_EMBEDDING = cfg["build"]["pretrained_embedding"]
    DROPOUT = float(cfg["build"]["dropout"])
    LSTM_HIDDEN = int(cfg["build"]["lstm_hidden"])
    WORD_EMBEDDING_SIZE = int(cfg["build"]["word_embedding_size"])
    CHAR_EMBEDDING_SIZE = int(cfg["build"]["char_embedding_size"])
    MAX_LEN = int(cfg["data"]["line_limit"])

    # Train config

    EPOCHS = int(cfg["train"]["epochs"])
    BATCH_SIZE = int(cfg["train"]["batch_size"])
    EARLY_STOPPING_PATIENCE = int(cfg["train"]["early_stopping_patience"])
    METRIC = cfg["train"]["metric"]

    # Load policy data

    train_data = load_tsv(POLICY_TRAIN)
    test_data = load_tsv(POLICY_TEST)
    valid_data = load_tsv(POLICY_VALID)

    X_train, y_train = train_data[0], train_data[1:]
    X_test, y_test = test_data[0], test_data[1:]
    X_valid, y_valid = valid_data[0], valid_data[1:]

    import statistics

    logger.debug("Max token length %s", max([len(i) for i in X_train]))
    logger.debug("Min token length %s", min([len(i) for i in X_train]))
    logger.debug("Mean token length %s",
                 statistics.median([len(i) for i in X_train]))

    logger.debug("Max token length %s", max([len(i) for i in X_test]))
    logger.debug("Min token length %s", min([len(i) for i in X_test]))
    logger.debug("Mean token length %s",
                 statistics.median([len(i) for i in X_test]))

    logger.debug("Max token length %s", max([len(i) for i in X_valid]))
    logger.debug("Min token length %s", min([len(i) for i in X_valid]))
    logger.debug("Mean token length %s",
                 statistics.median([len(i) for i in X_valid]))

    logger.info("X_train, y_train examples: %s, %s", len(X_train),
                list(map(len, y_train)))
    logger.info("X_test, y_test examples: %s, %s", len(X_test),
                list(map(len, y_test)))
    logger.info("X_valid, y_valid examples: %s, %s", len(X_valid),
                list(map(len, y_valid)))

    drp = DeepReferenceParser(
        X_train=X_train,
        X_test=X_test,
        X_valid=X_valid,
        y_train=y_train,
        y_test=y_test,
        y_valid=y_valid,
        max_len=MAX_LEN,
        output_path=OUTPUT_PATH,
    )

    # Encode data and create required mapping dicts

    drp.prepare_data(save=True)

    # Build the model architecture

    drp.build_model(
        output=OUTPUT,
        word_embeddings=WORD_EMBEDDINGS,
        pretrained_embedding=PRETRAINED_EMBEDDING,
        dropout=DROPOUT,
        lstm_hidden=LSTM_HIDDEN,
        word_embedding_size=WORD_EMBEDDING_SIZE,
        char_embedding_size=CHAR_EMBEDDING_SIZE,
    )

    # Train the model. Not required if downloading weights from s3

    drp.train_model(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        metric=METRIC,
    )

    # Evaluate the model. Confusion matrices etc. will be stored in
    # data/model_output

    drp.evaluate(
        load_weights=True,
        test_set=True,
        validation_set=True,
        print_padding=False,
    )
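
The cfg lookups in train suggest a config file with data, build and train sections. Below is a hedged sketch of such a file, parsed with configparser for illustration; the real get_config helper is not shown in these examples and may behave differently, and every value is made up. Only the section and key names are taken from the code above.

import configparser

EXAMPLE_CONFIG = """
[data]
policy_train = data/processed/policy_train.tsv
policy_test = data/processed/policy_test.tsv
policy_valid = data/processed/policy_valid.tsv
s3_slug = s3://example-bucket/models/
line_limit = 250

[build]
output_path = data/model_output
output = crf
word_embeddings = embeddings/glove.txt
pretrained_embedding = True
dropout = 0.5
lstm_hidden = 400
word_embedding_size = 300
char_embedding_size = 100

[train]
epochs = 10
batch_size = 100
early_stopping_patience = 5
metric = f1
"""

cfg = configparser.ConfigParser()
cfg.read_string(EXAMPLE_CONFIG)

# configparser returns strings, which is why train() casts with int() and float().
assert cfg["train"]["epochs"] == "10"
assert float(cfg["build"]["dropout"]) == 0.5
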
Example #3
    def __init__(self, config_file):

        msg.info(f"Using config file: {config_file}")

        cfg = get_config(config_file)

        # Build config
        try:
            OUTPUT_PATH = cfg["build"]["output_path"]
            S3_SLUG = cfg["data"]["s3_slug"]
        except KeyError:
            config_dir, missing_config = os.path.split(config_file)
            other_configs = [
                f for f in os.listdir(config_dir)
                if os.path.isfile(os.path.join(config_dir, f))
            ]
            msg.fail(
                f"Could not find config {missing_config}, perhaps you meant one of {other_configs}"
            )
            # Re-raise so we don't continue with OUTPUT_PATH and S3_SLUG undefined.
            raise

        msg.info(
            f"Attempting to download model artefacts if they are not found locally in {cfg['build']['output_path']}. This may take some time..."
        )

        # Check whether the necessary artefacts exist locally and download them
        # if not (a possible shape for download_model_artefact is sketched
        # after this example).

        artefacts = [
            "indices.pickle",
            "weights.h5",
        ]

        for artefact in artefacts:
            with msg.loading(
                    f"Could not find {artefact} locally, downloading..."):
                try:
                    artefact = os.path.join(OUTPUT_PATH, artefact)
                    download_model_artefact(artefact, S3_SLUG)
                    msg.good(f"Found {artefact}")
                except Exception:
                    msg.fail(f"Could not download {S3_SLUG}{artefact}")
                    logger.exception("Could not download %s%s", S3_SLUG,
                                     artefact)

        # Check on the word embedding and download it if it does not exist

        WORD_EMBEDDINGS = cfg["build"]["word_embeddings"]

        with msg.loading(
                f"Could not find {WORD_EMBEDDINGS} locally, downloading..."):
            try:
                download_model_artefact(WORD_EMBEDDINGS, S3_SLUG)
                msg.good(f"Found {WORD_EMBEDDINGS}")
            except Exception:
                msg.fail(f"Could not download {S3_SLUG}{WORD_EMBEDDINGS}")
                logger.exception("Could not download %s", WORD_EMBEDDINGS)

        OUTPUT = cfg["build"]["output"]
        PRETRAINED_EMBEDDING = cfg["build"]["pretrained_embedding"]
        DROPOUT = float(cfg["build"]["dropout"])
        LSTM_HIDDEN = int(cfg["build"]["lstm_hidden"])
        WORD_EMBEDDING_SIZE = int(cfg["build"]["word_embedding_size"])
        CHAR_EMBEDDING_SIZE = int(cfg["build"]["char_embedding_size"])

        self.MAX_WORDS = int(cfg["data"]["line_limit"])

        # Evaluate config

        self.drp = DeepReferenceParser(output_path=OUTPUT_PATH)

        # Encode data and load required mapping dicts. Note that the max word and
        # max char lengths will be loaded in this step.

        self.drp.load_data(OUTPUT_PATH)

        # Build the model architecture

        self.drp.build_model(
            output=OUTPUT,
            word_embeddings=WORD_EMBEDDINGS,
            pretrained_embedding=PRETRAINED_EMBEDDING,
            dropout=DROPOUT,
            lstm_hidden=LSTM_HIDDEN,
            word_embedding_size=WORD_EMBEDDING_SIZE,
            char_embedding_size=CHAR_EMBEDDING_SIZE,
        )
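
All three examples lean on a download_model_artefact helper that is not shown. Below is a possible shape for it, assuming the S3 slug is a prefix of the form s3://bucket/prefix/, that boto3 handles the transfer, and that an artefact already present locally is left alone; none of this is confirmed by the examples above.

import os

import boto3


def download_model_artefact(artefact, s3_slug):
    """Fetch artefact from S3 unless it already exists locally (sketch only)."""

    if os.path.exists(artefact):
        return  # already downloaded; the callers above then report it as found

    # Split an assumed "s3://bucket/prefix/" slug into bucket and key prefix.
    bucket, _, prefix = s3_slug.replace("s3://", "", 1).partition("/")

    os.makedirs(os.path.dirname(artefact) or ".", exist_ok=True)
    boto3.client("s3").download_file(bucket, prefix + artefact, artefact)
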