def predict(experiment_id, text):
    """Predict the class for a text using a trained model from an experiment."""
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess
    texts = [text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0]*len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(
        embedding_dim=args.embedding_dim,
        vocab_size=len(X_tokenizer),
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    # Predict
    results = []
    y_prob, conv_outputs = predict_step(
        model=model, dataset=infer_set,
        filter_sizes=args.filter_sizes, device=device)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts([X_infer[index]])[0],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs={k: v[index] for k, v in conv_outputs.items()},
                filter_sizes=args.filter_sizes)})

    return results
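# A minimal usage sketch for predict() above, assuming a trained experiment
# directory exists under config.EXPERIMENTS_DIR. The experiment ID and sample
# text here are hypothetical placeholders.
results = predict(experiment_id='1587874468', text='Transfer learning with BERT.')
print(results[0]['probabilities'])  # class → probability mapping
print(results[0]['top_n_grams'])    # most influential n-grams per filter size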
def predict(inputs, args, model, X_tokenizer, y_tokenizer):
    """Predict classes for a batch of input texts using a trained model."""
    # Preprocess
    texts = [sample['text'] for sample in inputs]
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset
    X = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0] * len(X))
    dataset = data.Text_CNN_Dataset(
        X=X, y=y_filler, max_filter_size=max(args.filter_sizes))
    dataloader = dataset.create_dataloader(batch_size=args.batch_size)

    # Predict
    results = []
    y_prob, conv_outputs = predict_step(
        model=model, dataloader=dataloader,
        filter_sizes=args.filter_sizes, device='cpu')
    for index in range(len(X)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts([X[index]])[0],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs={k: v[index] for k, v in conv_outputs.items()},
                filter_sizes=args.filter_sizes)})

    return results
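# With this signature, the expensive artifacts (model and tokenizers) are
# loaded once by the caller and reused across calls, instead of being re-read
# from disk on every prediction. A sketch of that pattern, mirroring the
# artifact-loading calls from the earlier version; the experiment ID and input
# text are hypothetical.
experiment_id = '1587874468'
experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
args = Namespace(**utilities.load_json(os.path.join(experiment_dir, 'config.json')))
X_tokenizer = data.Tokenizer.load(fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
y_tokenizer = data.LabelEncoder.load(fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
model = models.TextCNN(
    embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer),
    num_filters=args.num_filters, filter_sizes=args.filter_sizes,
    hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
    num_classes=len(y_tokenizer.classes))
model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
results = predict(inputs=[{'text': 'Transfer learning with BERT.'}], args=args,
                  model=model, X_tokenizer=X_tokenizer, y_tokenizer=y_tokenizer)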
# Load model config
if args.experiment_id == 'latest':
    args.experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
experiment_dir = os.path.join(config.EXPERIMENTS_DIR, args.experiment_id)
experiment_config = utilities.load_json(
    os.path.join(experiment_dir, 'config.json'))
args = Namespace(**{**args.__dict__, **Namespace(**experiment_config).__dict__})
config.logger.info(f"→ Using {args.experiment_id}")

# Preprocess
texts = [args.text]
X_tokenizer = data.Tokenizer.load(
    fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
y_tokenizer = data.LabelEncoder.load(
    fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
preprocessed_texts = data.preprocess_texts(
    texts, lower=args.lower, filters=args.filters)

# Create dataset
X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
y_filler = np.array([0]*len(X_infer))
infer_set = data.TextDataset(
    X=X_infer, y=y_filler, batch_size=args.batch_size,
    max_filter_size=max(args.filter_sizes))

# Load model
model = models.TextCNN(
    embedding_dim=args.embedding_dim,
    vocab_size=len(X_tokenizer),
    num_filters=args.num_filters,
    filter_sizes=args.filter_sizes,
    hidden_dim=args.hidden_dim,
    dropout_p=args.dropout_p,
    num_classes=len(y_tokenizer.classes))
model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
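# Note: resolving 'latest' with max() assumes experiment directories are named
# so that lexicographic order matches chronological order (e.g. fixed-width
# epoch timestamps). A quick illustration with made-up IDs:
experiment_ids = ['1587874468', '1587950980', '1588103471']
assert max(experiment_ids) == '1588103471'  # the most recent run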
wandb.log({"run_dir": wandb.run.dir})

# Set seeds for reproducibility
np.random.seed(args.seed)
random.seed(args.seed)
tf.random.set_seed(args.seed)

# Load data
X, y = data.load_data(url=args.data_url, data_size=args.data_size)
config.logger.info(
    "Raw data:\n"
    f"  {X[0]} {y[0]}")

# Preprocess (filtering is done later via tokenizer)
original_X = X
X = data.preprocess_texts(texts=X)
config.logger.info(
    "Preprocessed data:\n"
    f"  {original_X[0]} → {X[0]}")

# Split data
X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
    X=X, y=y, val_size=args.val_size, test_size=args.test_size,
    shuffle=args.shuffle)
config.logger.info(
    "Data splits:\n"
    f"\tX_train: {len(X_train)}, y_train: {len(y_train)}\n"
    f"\tX_val: {len(X_val)}, y_val: {len(y_val)}\n"
    f"\tX_test: {len(X_test)}, y_test: {len(y_test)}")

# Tokenizer
X_tokenizer = Tokenizer(
def test_preprocess_texts(texts, preprocessed_texts):
    assert data.preprocess_texts(texts=[texts]) == [preprocessed_texts]
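# The texts/preprocessed_texts pairs for the test above are presumably supplied
# by a fixture or parametrize decorator elsewhere in the suite; a sketch of how
# that could look (the example pair is made up and depends on the actual
# preprocess_texts behavior):
import pytest

@pytest.mark.parametrize(
    'texts, preprocessed_texts',
    [('Transfer learning with BERT!', 'transfer learning with bert')])
def test_preprocess_texts(texts, preprocessed_texts):
    assert data.preprocess_texts(texts=[texts]) == [preprocessed_texts]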
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Set device
device = torch.device('cuda' if (
    torch.cuda.is_available() and args.cuda) else 'cpu')

# Load data
X, y = data.load_data(url=args.data_url, data_size=args.data_size)
config.logger.info(
    "Raw data:\n"
    f"  {X[0]} → {y[0]}")

# Preprocess
original_X = X
X = data.preprocess_texts(texts=X, binary=args.binary,
                          lower=args.lower, filters=args.filters)
config.logger.info(
    "Preprocessed data:\n"
    f"  {original_X[0]} → {X[0]}")

# Split data
X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
    X=X, y=y, val_size=args.val_size, test_size=args.test_size,
    shuffle=args.shuffle)
config.logger.info(
    "Data splits:\n"
    f"  X_train: {len(X_train)}, y_train: {len(y_train)}\n"
    f"  X_val: {len(X_val)}, y_val: {len(y_val)}\n"
    f"  X_test: {len(X_test)}, y_test: {len(y_test)}")
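# For completeness, the seed calls that typically accompany the cuDNN flags
# above in a PyTorch reproducibility setup; a sketch, assuming args.seed exists:
np.random.seed(args.seed)
random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)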