Example #1
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(run_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(run_dir, 'y_tokenizer.json'))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    return args, model, X_tokenizer, y_tokenizer
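
A minimal usage sketch for the function above (the run directory name is hypothetical; get_run_components expects a directory containing config.json, both tokenizer files, and model.pt):

args, model, X_tokenizer, y_tokenizer = get_run_components(
    run_dir='experiments/1589323239')  # hypothetical run directory
model.eval()  # disable dropout layers before inference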
Example #2
def predict(experiment_id, text):
    """Predict the class for a text using
    a trained model from an experiment."""
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess
    texts = [text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0]*len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(
        embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer),
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    # Predict
    results = []
    y_prob, conv_outputs = predict_step(
        model=model, dataset=infer_set, filter_sizes=args.filter_sizes, device=device)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts([X_infer[index]])[0],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(tokens=preprocessed_texts[index].split(' '),
                                           conv_outputs={
                                               k: v[index] for k, v in conv_outputs.items()},
                                           filter_sizes=args.filter_sizes)})
    return results
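
A hedged example of calling predict (the experiment id is hypothetical and must name a directory under config.EXPERIMENTS_DIR produced by training):

results = predict(experiment_id='TextCNN_2020-05-01-00:00:00',  # hypothetical id
                  text='The Canadian minister signed the new federal trade law.')
print(results[0]['probabilities'])  # predicted class probabilities for the input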
Example #3
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(run_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(run_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
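
Since ConvOutputsModel reuses the embedding and convolutional weights of the trained TextCNN, a quick sanity check is to compare the copied tensors; a sketch, assuming the components returned above (the run directory is hypothetical):

import numpy as np

args, model, conv_outputs_model, X_tokenizer, y_tokenizer = get_run_components(
    run_dir='experiments/1589323239')  # hypothetical run directory
for layer_num in range(1, 1 + len(args.filter_sizes)):
    for src, dst in zip(model.layers[layer_num].get_weights(),
                        conv_outputs_model.layers[layer_num].get_weights()):
        assert np.array_equal(src, dst)  # conv weights copied correctly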
Example #4
        embeddings_file = os.path.join(config.EMBEDDINGS_DIR,
                                       f'glove.6B.{args.embedding_dim}d.txt')
        glove_embeddings = utils.load_glove_embeddings(
            embeddings_file=embeddings_file)
        embedding_matrix = utils.make_embeddings_matrix(
            embeddings=glove_embeddings,
            token_to_index=X_tokenizer.token_to_index,
            embedding_dim=args.embedding_dim)
        config.logger.info("→ GloVe Embeddings:\n" f"{embedding_matrix.shape}")

    # Initialize model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer),
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes),
                           pretrained_embeddings=embedding_matrix,
                           freeze_embeddings=args.freeze_embeddings)
    model = model.to(device)
    config.logger.info("→ Model:\n" f"  {model.named_parameters}")

    # Define optimizer & scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.1,
                                                           patience=3)

    # Model dir
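
ReduceLROnPlateau above only lowers the learning rate when it is stepped with the monitored metric; a minimal training-loop sketch of how the optimizer and scheduler would be driven (train_step, val_step, and num_epochs are assumptions, not part of this snippet):

for epoch in range(args.num_epochs):  # num_epochs assumed to be in args
    train_loss = train_step(model, train_set, optimizer, device)  # hypothetical helper
    val_loss = val_step(model, val_set, device)  # hypothetical helper
    scheduler.step(val_loss)  # LR x0.1 after 3 epochs without improvement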
Example #5
                "Embedding dim must be in (50, 100, 200, 300) is using GloVe.")
        embeddings_file = os.path.join(config.EMBEDDINGS_DIR,
                                       f'glove.6B.{args.embedding_dim}d.txt')
        glove_embeddings = utils.load_glove_embeddings(
            embeddings_file=embeddings_file)
        embedding_matrix = utils.make_embeddings_matrix(
            embeddings=glove_embeddings,
            token_to_index=X_tokenizer.word_index,
            embedding_dim=args.embedding_dim)
        config.logger.info("→ Embeddings:\n" f"{embedding_matrix.shape}")

    # Initialize model
    model = models.TextCNN(vocab_size=vocab_size,
                           embedding_dim=args.embedding_dim,
                           filter_sizes=args.filter_sizes,
                           num_filters=args.num_filters,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_),
                           freeze_embeddings=args.freeze_embeddings)
    model.summary(input_shape=(10, ))  # build it

    # Set GloVe embeddings
    if args.use_glove:
        model.layers[0].set_weights([embedding_matrix])

    # Model dir
    experiment_id = f'TextCNN_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}'
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    utilities.create_dirs(dirpath=experiment_dir)
    model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
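
utils.make_embeddings_matrix is called above but not shown; a plausible sketch of such a helper, inferred from the call site (the zero-row fallback for tokens missing from GloVe is an assumption):

import numpy as np

def make_embeddings_matrix(embeddings, token_to_index, embedding_dim):
    # Row i holds the pretrained vector for the token with index i;
    # index 0 is reserved for padding, hence the +1.
    embedding_matrix = np.zeros((len(token_to_index) + 1, embedding_dim))
    for token, index in token_to_index.items():
        vector = embeddings.get(token)  # embeddings: dict of token -> np vector
        if vector is not None:
            embedding_matrix[index] = vector
    return embedding_matrix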
Example #6
def predict(experiment_id, text):
    """Predict the class for a text using
    a trained model from an experiment."""
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Tokenizers
    texts = [text]
    with open(os.path.join(experiment_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(experiment_dir,
                                                'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Create dataset generator
    X_infer = np.array(X_tokenizer.texts_to_sequences(texts))
    preprocessed_texts = X_tokenizer.sequences_to_texts(X_infer)
    y_filler = np.array([0] * len(X_infer))
    inference_generator = data.DataGenerator(X=X_infer,
                                             y=y_filler,
                                             batch_size=args.batch_size,
                                             max_filter_size=max(
                                                 args.filter_sizes))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    # Predict
    results = []
    y_prob = model.predict(x=inference_generator, verbose=1)
    conv_outputs = conv_outputs_model.predict(x=inference_generator, verbose=1)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': preprocessed_texts[index],
            'probabilities': get_probability_distribution(y_prob[index],
                                                          y_tokenizer.classes_),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs=conv_outputs,
                filter_sizes=args.filter_sizes)})

    return results
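
get_probability_distribution is used throughout these examples but never defined; a plausible sketch based on its call sites (sorting classes by descending probability is an assumption):

def get_probability_distribution(y_prob, classes):
    # Map each class name to its predicted probability, highest first.
    results = {c: float(p) for c, p in zip(classes, y_prob)}
    return dict(sorted(results.items(), key=lambda item: item[1], reverse=True))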