def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load word_map (assumed to be saved with the run artifacts under this filename)
    word_map = utils.load_json(os.path.join(run_dir, 'word_map.json'))
    _, emb_size = load_word2vec_embeddings(args.word2vec_file, word_map)

    # Load model
    model = models.HierarchialAttentionNetwork(
        n_classes=args.n_classes,
        vocab_size=len(word_map),
        emb_size=emb_size,
        word_rnn_size=args.word_rnn_size,
        sentence_rnn_size=args.sentence_rnn_size,
        word_rnn_layers=args.word_rnn_layers,
        sentence_rnn_layers=args.sentence_rnn_layers,
        word_att_size=args.word_att_size,
        sentence_att_size=args.sentence_att_size,
        dropout=args.dropout)
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    return model, word_map
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(run_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(run_dir, 'y_tokenizer.json'))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    return args, model, X_tokenizer, y_tokenizer
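# Minimal usage sketch for get_run_components. The run directory below is
# hypothetical; any directory produced by training (containing config.json,
# X_tokenizer.json, y_tokenizer.json and model.pt) works.
run_dir = 'experiments/1589317200'  # hypothetical run id
args, model, X_tokenizer, y_tokenizer = get_run_components(run_dir=run_dir)
model.eval()  # switch to inference mode before predicting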
async def _experiment_details(experiment_id: str = Path(
        default='latest', title="ID of experiment")):
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    args = utils.load_json(
        filepath=os.path.join(experiment_dir, 'config.json'))
    classes = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json')).classes
    performance = utils.load_json(
        filepath=os.path.join(experiment_dir, 'performance.json'))
    response = {
        'message': HTTPStatus.OK.phrase,
        'status-code': HTTPStatus.OK,
        'data': {
            "classes": classes,
            "args": args,
            "performance": performance
        }
    }
    config.logger.info(json.dumps(response, indent=2))
    return response
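# Hedged sketch of exercising the endpoint with FastAPI's TestClient,
# assuming the handler above is registered on `app` at
# GET /experiments/{experiment_id} (the route registration isn't shown here).
from fastapi.testclient import TestClient

client = TestClient(app)
response = client.get("/experiments/latest")
assert response.status_code == 200
print(response.json()['data']['performance'])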
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(run_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(run_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
def predict(experiment_id, inputs):
    """Predict the class for a text using a trained model from an experiment."""
    # Get experiment config
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utils.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess
    texts = [sample['text'] for sample in inputs]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(texts,
                                               lower=args.lower,
                                               filters=args.filters)

    # Create dataset
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0] * len(X_infer))
    infer_set = data.TextDataset(X=X_infer,
                                 y=y_filler,
                                 batch_size=args.batch_size,
                                 max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer),
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    # Predict
    results = []
    y_prob, conv_outputs = predict_step(model=model,
                                        dataset=infer_set,
                                        filter_sizes=args.filter_sizes,
                                        device=device)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts(
                [X_infer[index]])[0],
            'probabilities': get_probability_distribution(
                y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs={k: v[index] for k, v in conv_outputs.items()},
                filter_sizes=args.filter_sizes)
        })

    return results
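# Usage sketch: `inputs` is a list of dicts with a 'text' key, matching how
# predict() unpacks them above. The sample text is made up.
inputs = [{'text': 'Transfer learning with transformers for text classification.'}]
results = predict(experiment_id='latest', inputs=inputs)
print(results[0]['probabilities'])  # class -> probability mapping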
elif page == 'Model details':
    st.header("All Experiments")
    st.write(
        f'[https://app.wandb.ai/{project}](https://app.wandb.ai/{project})')

    st.header("Best Run")

    # Run details
    st.write(
        f"**Name**: {best_run._attrs['displayName']} ({best_run._attrs['name']})"
    )
    st.write("**Timestamp**:", best_run._attrs['createdAt'])
    st.write(
        f"**Runtime**: {best_run._attrs['summaryMetrics']['_runtime']:.1f} seconds"
    )

    # Performance
    st.write("**Performance**:")
    performance = utils.load_json(
        os.path.join(best_run_dir, 'performance.json'))
    st.json(performance)

    # Confusion matrix
    st.image(os.path.join(best_run_dir, 'confusion_matrix.png'))

    # Config
    st.write("**Config**:")
    st.json(best_run._attrs['config'])
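# Hedged sketch of where `best_run` and `best_run_dir` could come from,
# assuming runs are pulled via the wandb public API and ranked by a
# validation metric logged during training (the metric name is assumed).
import wandb

api = wandb.Api()
runs = api.runs(project)  # `project` is the same "entity/project" string used above
best_run = sorted(runs,
                  key=lambda run: run.summary.get('val_loss', float('inf')))[0]
best_run_dir = os.path.join(config.EXPERIMENTS_DIR, best_run.id)  # assumed layout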
import os
import sys
sys.path.append(".")
import logging
import logging.config
from text_classification import utils

# Directories
BASE_DIR = os.getcwd()  # project root
APP_DIR = os.path.dirname(__file__)  # app root
LOGS_DIR = os.path.join(BASE_DIR, 'logs')
EMBEDDINGS_DIR = os.path.join(BASE_DIR, 'embeddings')

# Create dirs
utils.create_dirs(LOGS_DIR)
utils.create_dirs(EMBEDDINGS_DIR)

# Loggers
log_config = utils.load_json(
    filepath=os.path.join(BASE_DIR, 'logging.json'))
logging.config.dictConfig(log_config)
logger = logging.getLogger('logger')
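# logging.config.dictConfig expects the standard logging dict-config schema.
# A minimal sketch of what logging.json might contain (the actual file isn't
# shown in this section), expressed here as the equivalent Python dict:
example_log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'minimal': {'format': '%(message)s'},
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'minimal',
            'level': 'DEBUG',
        },
    },
    'loggers': {
        'logger': {'handlers': ['console'], 'level': 'INFO'},
    },
}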
def predict(experiment_id, inputs):
    """Predict the class for a text using a trained model from an experiment."""
    # Get experiment config
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utils.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Tokenizers
    texts = [sample['text'] for sample in inputs]
    with open(os.path.join(experiment_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(experiment_dir,
                                                'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Create dataset generator
    X_infer = np.array(X_tokenizer.texts_to_sequences(texts))
    preprocessed_texts = X_tokenizer.sequences_to_texts(X_infer)
    y_filler = np.array([0] * len(X_infer))
    inference_generator = data.DataGenerator(
        X=X_infer,
        y=y_filler,
        batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    # Predict
    results = []
    y_prob = model.predict(x=inference_generator, verbose=1)
    conv_outputs = conv_outputs_model.predict(x=inference_generator, verbose=1)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': preprocessed_texts[index],
            'probabilities': get_probability_distribution(
                y_prob[index], y_tokenizer.classes_),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                # Index the batch outputs for this sample (assumes predict
                # returns one array per filter size, each of shape
                # (num_samples, ...)); passing the whole batch here would
                # give every sample identical n-grams.
                conv_outputs=[v[index] for v in conv_outputs],
                filter_sizes=args.filter_sizes)
        })

    return results