Exemple #1
0
def predict(experiment_id, text):
    """Predict the class for a text using a trained model from an experiment.

    Args:
        experiment_id: name of the experiment directory under
            ``config.EXPERIMENTS_DIR`` holding ``config.json``, the fitted
            tokenizers and the model weights (``model.h5``).
        text: raw input string to classify.

    Returns:
        A list with one dict per input containing ``raw_input``,
        ``preprocessed_input``, ``probabilities`` and ``top_n_grams``.
    """
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Resolve the device up front so the checkpoint can be mapped onto it.
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')

    # Preprocess
    texts = [text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset (labels are dummy zeros; only inputs matter at inference)
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0] * len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(
        embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer),
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    # map_location prevents a crash when the checkpoint was saved on a GPU
    # machine but inference runs where CUDA is unavailable (or args.cuda is off).
    model.load_state_dict(torch.load(
        os.path.join(experiment_dir, 'model.h5'), map_location=device))
    model = model.to(device)

    # Predict
    y_prob, conv_outputs = predict_step(
        model=model, dataset=infer_set, filter_sizes=args.filter_sizes, device=device)
    results = []
    for index, raw_text in enumerate(texts):
        results.append({
            'raw_input': raw_text,
            'preprocessed_input': X_tokenizer.sequences_to_texts([X_infer[index]])[0],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(tokens=preprocessed_texts[index].split(' '),
                                           conv_outputs={
                                               k: v[index] for k, v in conv_outputs.items()},
                                           filter_sizes=args.filter_sizes)})
    return results
Exemple #2
0
def predict(inputs, args, model, X_tokenizer, y_tokenizer):
    """Predict the class for a text using
    a trained model from an experiment."""
    # Extract and preprocess the raw texts from the input payload
    raw_texts = [sample['text'] for sample in inputs]
    clean_texts = data.preprocess_texts(raw_texts,
                                        lower=args.lower,
                                        filters=args.filters)

    # Build an inference dataset; labels are dummy zeros since only the
    # inputs matter at prediction time.
    sequences = np.array(X_tokenizer.texts_to_sequences(clean_texts))
    dummy_labels = np.array([0] * len(sequences))
    dataset = data.Text_CNN_Dataset(X=sequences,
                                    y=dummy_labels,
                                    max_filter_size=max(args.filter_sizes))
    dataloader = dataset.create_dataloader(batch_size=args.batch_size)

    # Run inference on CPU
    y_prob, conv_outputs = predict_step(model=model,
                                        dataloader=dataloader,
                                        filter_sizes=args.filter_sizes,
                                        device='cpu')

    # Assemble one result dict per input sample
    results = []
    for i, seq in enumerate(sequences):
        sample_convs = {k: v[i] for k, v in conv_outputs.items()}
        results.append({
            'raw_input': raw_texts[i],
            'preprocessed_input': X_tokenizer.sequences_to_texts([seq])[0],
            'probabilities': get_probability_distribution(
                y_prob[i], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=clean_texts[i].split(' '),
                conv_outputs=sample_convs,
                filter_sizes=args.filter_sizes),
        })

    return results
Exemple #3
0
    # Load model config
    if args.experiment_id == 'latest':
        args.experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, args.experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**{**args.__dict__, **Namespace(**experiment_config).__dict__})
    config.logger.info(f"→ Using {args.experiment_id}")

    # Preprocess
    texts = [args.text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0]*len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(
        embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer),
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    wandb.log({"run_dir": wandb.run.dir})

    # Set seeds for reproducibility
    np.random.seed(args.seed)
    random.seed(args.seed)
    tf.random.set_seed(args.seed)

    # Load data
    X, y = data.load_data(url=args.data_url, data_size=args.data_size)
    config.logger.info(
        "Raw data:\n"
        f"  {X[0]} {y[0]}")

    # Preprocess (filtering is done later via tokenizer)
    original_X = X
    X = data.preprocess_texts(texts=X)
    config.logger.info(
        "Preprocessed data:\n"
        f"  {original_X[0]} → {X[0]}")

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
        X=X, y=y, val_size=args.val_size, test_size=args.test_size, shuffle=args.shuffle)
    config.logger.info(
        "Data splits:\n"
        f"\tX_train: {len(X_train)}, y_train: {len(y_train)}\n"
        f"\tX_val: {len(X_val)}, y_val: {len(y_val)}\n"
        f"\tX_test: {len(X_test)}, y_test: {len(y_test)}")

    # Tokenizer
    X_tokenizer = Tokenizer(
Exemple #5
0
def test_preprocess_texts(texts, preprocessed_texts):
    """A single raw text should preprocess to its expected form."""
    result = data.preprocess_texts(texts=[texts])
    assert result == [preprocessed_texts]
Exemple #6
0
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Set device
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')

    # Load data
    X, y = data.load_data(url=args.data_url, data_size=args.data_size)
    config.logger.info(
        "Raw data:\n"
        f"  {X[0]} → {y[0]}")

    # Preprocesss
    original_X = X
    X = data.preprocess_texts(texts=X, binary=args.binary, 
                              lower=args.lower, filters=args.filters)
    
    config.logger.info(
        "Preprocessed data:\n"
        f"  {original_X[0]} → {X[0]}")

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
        X=X, y=y, val_size=args.val_size, test_size=args.test_size, shuffle=args.shuffle)
    
    config.logger.info(
        "Data splits:\n"
        f"  X_train: {len(X_train)}, y_train: {len(y_train)}\n"
        f"  X_val: {len(X_val)}, y_val: {len(y_val)}\n"
        f"  X_test: {len(X_test)}, y_test: {len(y_test)}")