def stylize_static_image(inference_config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    content_img_path = os.path.join(inference_config['content_images_path'], inference_config['content_img_name'])
    content_image = utils.prepare_img(content_img_path, inference_config['img_width'], device)

    # load the weights and set the model to evaluation mode
    stylization_model = TransformerNet().to(device)
    training_state = torch.load(os.path.join(inference_config["model_binaries_path"], inference_config["model_name"]))
    utils.print_model_metadata(training_state)
    state_dict = training_state["state_dict"]
    stylization_model.load_state_dict(state_dict, strict=True)
    stylization_model.eval()

    with torch.no_grad():
        stylized_img = stylization_model(content_image).to('cpu').numpy()[0]
        utils.save_and_maybe_display_image(inference_config, stylized_img, should_display=True)
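# Example call (a minimal sketch, not taken from the original script): the keys below mirror the
# lookups performed inside stylize_static_image above, while the concrete paths, file names and
# width are hypothetical placeholders. Note that utils.save_and_maybe_display_image may read
# additional config keys (e.g. an output path) not shown here.
example_single_image_config = {
    'content_images_path': os.path.join('data', 'content-images'),  # hypothetical directory
    'content_img_name': 'example.jpg',                              # hypothetical image name
    'img_width': 500,                                               # resize width used by utils.prepare_img
    'model_binaries_path': os.path.join('models', 'binaries'),      # hypothetical directory with checkpoints
    'model_name': 'style_model.pth',                                # hypothetical checkpoint name
}
stylize_static_image(example_single_image_config)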
def translate_a_single_sentence(translation_config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU

    # Step 1: Prepare the field processors (tokenizer, numericalizer)
    _, _, src_field_processor, trg_field_processor = get_datasets_and_vocabs(
        translation_config['dataset_path'],
        translation_config['language_direction'],
        translation_config['dataset_name'] == DatasetType.IWSLT.name)
    assert src_field_processor.vocab.stoi[PAD_TOKEN] == trg_field_processor.vocab.stoi[PAD_TOKEN]
    pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]  # needed for constructing masks

    # Step 2: Prepare the model
    baseline_transformer = Transformer(
        model_dimension=BASELINE_MODEL_DIMENSION,
        src_vocab_size=len(src_field_processor.vocab),
        trg_vocab_size=len(trg_field_processor.vocab),
        number_of_heads=BASELINE_MODEL_NUMBER_OF_HEADS,
        number_of_layers=BASELINE_MODEL_NUMBER_OF_LAYERS,
        dropout_probability=BASELINE_MODEL_DROPOUT_PROB,
        log_attention_weights=True
    ).to(device)

    model_path = os.path.join(BINARIES_PATH, translation_config['model_name'])
    if not os.path.exists(model_path):
        print(f'Model {model_path} does not exist, attempting to download.')
        model_path = download_models(translation_config)

    model_state = torch.load(model_path)
    print_model_metadata(model_state)
    baseline_transformer.load_state_dict(model_state["state_dict"], strict=True)
    baseline_transformer.eval()

    # Step 3: Prepare the input sentence
    source_sentence = translation_config['source_sentence']
    ex = Example.fromlist([source_sentence], fields=[('src', src_field_processor)])  # tokenize the sentence
    source_sentence_tokens = ex.src
    print(f'Source sentence tokens = {source_sentence_tokens}')

    # Numericalize and move to the chosen device (GPU if available)
    src_token_ids_batch = src_field_processor.process([source_sentence_tokens], device)

    with torch.no_grad():
        # Step 4: Optimization - compute the source token representations only once
        src_mask, _ = get_masks_and_count_tokens_src(src_token_ids_batch, pad_token_id)
        src_representations_batch = baseline_transformer.encode(src_token_ids_batch, src_mask)

        # Step 5: Decoding process
        if translation_config['decoding_method'] == DecodingMethod.GREEDY:
            target_sentence_tokens = greedy_decoding(baseline_transformer, src_representations_batch, src_mask, trg_field_processor)
        else:
            beam_decoding = get_beam_decoder(translation_config)
            target_sentence_tokens = beam_decoding(baseline_transformer, src_representations_batch, src_mask, trg_field_processor)
        print(f'Translation | Target sentence tokens = {target_sentence_tokens}')

        # Step 6: Potentially visualize the encoder/decoder attention weights
        if translation_config['visualize_attention']:
            visualize_attention(baseline_transformer, source_sentence_tokens, target_sentence_tokens)
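# Example call (a minimal sketch): the keys mirror the lookups inside translate_a_single_sentence;
# the dataset path, language-direction string and checkpoint name are hypothetical and depend on
# how your data and model binaries are laid out.
example_translation_config = {
    'dataset_path': os.path.join('data', 'iwslt'),        # hypothetical dataset location
    'language_direction': 'E2G',                          # hypothetical value - whatever get_datasets_and_vocabs expects
    'dataset_name': DatasetType.IWSLT.name,
    'model_name': 'transformer_000000.pth',               # hypothetical checkpoint name
    'source_sentence': 'How are you doing today?',
    'decoding_method': DecodingMethod.GREEDY,             # anything else falls through to beam search
    'visualize_attention': False,
}
translate_a_single_sentence(example_translation_config)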
def visualize_gat_properties(model_name=r'gat_000000.pth', dataset_name=DatasetType.CORA.name, visualization_type=VisualizationType.ATTENTION):
    """
    Using t-SNE to visualize GAT embeddings in 2D space.
    Check out this one for more intuition on how to tune t-SNE: https://distill.pub/2016/misread-tsne/

    If you think it'd be useful for me to implement t-SNE as well and explain how every single detail works
    open up an issue or DM me on social media! <3

    Note: I also tried using UMAP but it doesn't provide any more insight than t-SNE.
    (con: it has a lot of dependencies if you want to use their plotting functionality)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU, I hope so!

    config = {
        'dataset_name': dataset_name,
        'layer_type': LayerType.IMP3,
        'should_visualize': False  # don't visualize the dataset
    }

    # Step 1: Prepare the data
    node_features, node_labels, topology, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # Step 2: Prepare the model
    model_path = os.path.join(BINARIES_PATH, model_name)
    model_state = torch.load(model_path)

    gat = GAT(
        num_of_layers=model_state['num_of_layers'],
        num_heads_per_layer=model_state['num_heads_per_layer'],
        num_features_per_layer=model_state['num_features_per_layer'],
        add_skip_connection=model_state['add_skip_connection'],
        bias=model_state['bias'],
        dropout=model_state['dropout'],
        layer_type=name_to_layer_type(model_state['layer_type']),
        log_attention_weights=True
    ).to(device)

    print_model_metadata(model_state)
    gat.load_state_dict(model_state["state_dict"], strict=True)
    gat.eval()  # some layers like nn.Dropout behave differently in train vs eval mode so this part is important

    # Step 3: Calculate the things we'll need for the different visualization types (attention, scores, edge_index)
    # This context manager is important (and you'll often see it), otherwise PyTorch will eat much more memory.
    # It would be saving activations for backprop but we are not doing any training here, just prediction.
    with torch.no_grad():
        # Run predictions and collect the high dimensional data
        all_nodes_unnormalized_scores, _ = gat((node_features, topology))  # shape = (N, num of classes)
        all_nodes_unnormalized_scores = all_nodes_unnormalized_scores.cpu().numpy()

    # We'll need the edge index for multiple visualization types
    if config['layer_type'] == LayerType.IMP3:  # imp 3 works with edge index while others work with adjacency info
        edge_index = topology
    else:
        edge_index = convert_adj_to_edge_index(topology)

    # Step 4: Perform a specific visualization
    if visualization_type == VisualizationType.ATTENTION:
        # The number of nodes for which we want to visualize their attention over neighboring nodes
        # (2x this actually as we add the highest-degree nodes + random nodes)
        num_nodes_of_interest = 4  # an arbitrary number, feel free to play with it
        head_to_visualize = 0  # plot attention from this multi-head attention's head
        gat_layer_id = 1  # plot attention from this GAT layer

        # Build up the complete graph
        # node_features shape = (N, FIN), where N is the number of nodes and FIN the number of input features
        total_num_of_nodes = len(node_features)
        complete_graph = ig.Graph()
        complete_graph.add_vertices(total_num_of_nodes)  # igraph creates nodes with ids [0, total_num_of_nodes - 1]
        edge_index_tuples = list(zip(edge_index[0, :], edge_index[1, :]))  # igraph requires this format
        complete_graph.add_edges(edge_index_tuples)

        # Pick the target nodes to plot (nodes with the highest degree + random nodes)
        # Note: there could be an overlap between the random nodes and the highest-degree nodes - but it's highly unlikely
        nodes_of_interest_ids = np.argpartition(complete_graph.degree(), -num_nodes_of_interest)[-num_nodes_of_interest:]
        random_node_ids = np.random.randint(low=0, high=total_num_of_nodes, size=num_nodes_of_interest)
        nodes_of_interest_ids = np.append(nodes_of_interest_ids, random_node_ids)
        np.random.shuffle(nodes_of_interest_ids)

        target_node_ids = edge_index[1]
        source_nodes = edge_index[0]

        for target_node_id in nodes_of_interest_ids:
            # Step 1: Find the neighboring nodes of the target node
            # Note: self edges are included for CORA so the target node is its own neighbor
            src_nodes_indices = torch.eq(target_node_ids, target_node_id)
            source_node_ids = source_nodes[src_nodes_indices].cpu().numpy()
            size_of_neighborhood = len(source_node_ids)

            # Step 2: Fetch their labels
            labels = node_labels[source_node_ids].cpu().numpy()

            # Step 3: Fetch the attention weights for those edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim, it's superfluous
            all_attention_weights = gat.gat_net[gat_layer_id].attention_weights.squeeze(dim=-1)
            attention_weights = all_attention_weights[src_nodes_indices, head_to_visualize].cpu().numpy()
            # This part shows that for CORA what GAT learns is pretty much constant attention weights! Like in GCN!
            print(f'Max attention weight = {np.max(attention_weights)} and min = {np.min(attention_weights)}')
            attention_weights /= np.max(attention_weights)  # rescale the biggest weight to 1 for nicer plotting

            # Build up the neighborhood graph whose attention we want to visualize
            # igraph constraint - it works with a contiguous range of ids so we map e.g. node 497 to 0, 12 to 1, etc.
            id_to_igraph_id = dict(zip(source_node_ids, range(len(source_node_ids))))
            ig_graph = ig.Graph()
            ig_graph.add_vertices(size_of_neighborhood)
            ig_graph.add_edges([(id_to_igraph_id[neighbor], id_to_igraph_id[target_node_id]) for neighbor in source_node_ids])

            # Prepare the visualization settings dictionary and plot
            visual_style = {
                "edge_width": attention_weights,  # make edges as thick as the corresponding attention weight
                "layout": ig_graph.layout_reingold_tilford_circular()  # layout for tree-like graphs
            }
            # This is the only Cora-specific part as Cora has 7 labels
            if dataset_name.lower() == DatasetType.CORA.name.lower():
                visual_style["vertex_color"] = [cora_label_to_color_map[label] for label in labels]
            else:
                print('Add a custom color scheme for your specific dataset. Using igraph default coloring.')

            ig.plot(ig_graph, **visual_style)

    elif visualization_type == VisualizationType.EMBEDDINGS:  # visualize embeddings (using t-SNE)
        node_labels = node_labels.cpu().numpy()
        num_classes = len(set(node_labels))

        # Feel free to experiment with perplexity - it's arguably the most important parameter of t-SNE and it basically
        # controls the standard deviation of the Gaussians i.e. the size of the neighborhoods in the high dim (original) space.
        # Simply put, the goal of t-SNE is to minimize the KL-divergence between the joint Gaussian distribution fit over
        # the high dim points and the Student's t-distribution fit over the low dim points (the ones we're plotting).
        # Intuitively, by doing this, we preserve the similarities (relationships) between the high and low dim points.
        # This (probably) won't make much sense if you're not already familiar with t-SNE, God knows I've tried. :P
        t_sne_embeddings = TSNE(n_components=2, perplexity=30, method='barnes_hut').fit_transform(all_nodes_unnormalized_scores)

        for class_id in range(num_classes):
            # We extract the points whose true label equals class_id and we color them in the same way, hopefully
            # they'll be clustered together on the 2D chart - that would mean that GAT has learned good representations!
            plt.scatter(
                t_sne_embeddings[node_labels == class_id, 0],
                t_sne_embeddings[node_labels == class_id, 1],
                s=20,
                color=cora_label_to_color_map[class_id],
                edgecolors='black',
                linewidths=0.2)
        plt.show()

    # We want our local probability distributions (attention weights over the neighborhoods) to be
    # non-uniform because that means that GAT is learning a useful pattern. Entropy histograms help us visualize
    # how different those neighborhood distributions are from the uniform distribution (constant attention).
    # If GAT is learning constant attention we might as well be using GCN or some even simpler model.
    elif visualization_type == VisualizationType.ENTROPY:
        num_heads_per_layer = [layer.num_of_heads for layer in gat.gat_net]
        num_layers = len(num_heads_per_layer)
        num_of_nodes = len(node_features)

        target_node_ids = edge_index[1].cpu().numpy()

        # For every GAT layer and for every GAT attention head plot the entropy histogram
        for layer_id in range(num_layers):
            # Fetch the attention weights for the edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim, it's superfluous
            all_attention_weights = gat.gat_net[layer_id].attention_weights.squeeze(dim=-1).cpu().numpy()

            for head_id in range(num_heads_per_layer[layer_id]):
                uniform_dist_entropy_list = []  # save the ideal uniform histogram as the reference
                neighborhood_entropy_list = []

                for target_node_id in range(num_of_nodes):  # find the neighborhood for every node in the graph
                    # These attention weights sum up to 1 by GAT design so we can treat them as a probability distribution
                    neighborhood_attention = all_attention_weights[target_node_ids == target_node_id].flatten()
                    # Reference uniform distribution of the same length
                    ideal_uniform_attention = np.ones(len(neighborhood_attention)) / len(neighborhood_attention)

                    # Calculate the entropy, check out this video if you're not familiar with the concept:
                    # https://www.youtube.com/watch?v=ErfnhcEV1O8 (Aurélien Géron)
                    neighborhood_entropy_list.append(entropy(neighborhood_attention, base=2))
                    uniform_dist_entropy_list.append(entropy(ideal_uniform_attention, base=2))

                title = f'Cora entropy histogram layer={layer_id}, attention head={head_id}'
                draw_entropy_histogram(uniform_dist_entropy_list, title, color='orange', uniform_distribution=True)
                draw_entropy_histogram(neighborhood_entropy_list, title, color='dodgerblue')

                fig = plt.gcf()  # get the current figure
                plt.show()
                fig.savefig(os.path.join(DATA_DIR_PATH, f'layer_{layer_id}_head_{head_id}.jpg'))
                plt.close()

    else:
        raise Exception(f'Visualization type {visualization_type} not supported.')
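# Example calls (a minimal sketch): all arguments already have defaults in the signature above,
# so these just show how to switch between the visualization modes.
visualize_gat_properties(model_name=r'gat_000000.pth', visualization_type=VisualizationType.ATTENTION)
visualize_gat_properties(visualization_type=VisualizationType.EMBEDDINGS)  # t-SNE plot of the output scores
visualize_gat_properties(visualization_type=VisualizationType.ENTROPY)     # entropy histograms per layer/head

# Quick sanity check on the entropy intuition used above (standalone, assuming `entropy` is
# scipy.stats.entropy as suggested by the base=2 argument): uniform attention over a neighborhood
# has maximal entropy, while a peaked distribution has a much lower one.
print(entropy(np.ones(4) / 4, base=2))                      # 2.0 bits - uniform over 4 neighbors
print(entropy(np.array([0.97, 0.01, 0.01, 0.01]), base=2))  # ~0.24 bits - attention focused on one neighbor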
    shutil.rmtree(game_frames_dump_dir)
    os.makedirs(game_frames_dump_dir, exist_ok=True)

    # Step 1: Prepare environment, replay buffer and schedule
    env = utils.get_env_wrapper(env_id, record_video=should_record_video)
    replay_buffer = ReplayBuffer(buffer_size)
    const_schedule = utils.ConstSchedule(epsilon_eval)  # a lambda would also do - doing it like this for consistency

    # Step 2: Prepare the DQN model
    model_path = os.path.join(BINARIES_PATH, model_name)
    model_state = torch.load(model_path)
    assert model_state['env_id'] == env_id, \
        f"Model {model_name} was trained on {model_state['env_id']} but you're running it on {env_id}."
    utils.print_model_metadata(model_state)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dqn = DQN(env, number_of_actions=env.action_space.n, epsilon_schedule=const_schedule).to(device)
    dqn.load_state_dict(model_state["state_dict"], strict=True)
    dqn.eval()

    # Step 3: Evaluate the agent on a single episode
    print(f'{"*"*10} Starting the game. {"*"*10}')
    last_frame = env.reset()

    score = 0
    cnt = 0

    while True:
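# The evaluation episode loop is cut off above; the sketch below (not the repo's code) shows how
# such a loop typically looks, assuming a hypothetical `select_action(dqn, frame)` helper that
# picks an action epsilon-greedily from the Q-network, and the classic Gym step API.
def run_single_episode_sketch(env, dqn, select_action):
    last_frame = env.reset()
    score, cnt = 0, 0
    while True:
        action = select_action(dqn, last_frame)          # hypothetical helper: epsilon-greedy action
        last_frame, reward, done, _ = env.step(action)   # Gym API: (observation, reward, done, info)
        score += reward
        cnt += 1
        if done:
            print(f'Episode finished after {cnt} steps, score = {score}.')
            return score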
def stylize_static_image(inference_config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare the model - load the weights and put the model into evaluation mode
    stylization_model = TransformerNet().to(device)
    training_state = torch.load(os.path.join(inference_config["model_binaries_path"], inference_config["model_name"]))
    state_dict = training_state["state_dict"]
    stylization_model.load_state_dict(state_dict, strict=True)
    stylization_model.eval()

    if inference_config['verbose']:
        utils.print_model_metadata(training_state)

    with torch.no_grad():
        if os.path.isdir(inference_config['content_input']):  # do a batch stylization (every image in the directory)
            img_dataset = utils.SimpleDataset(inference_config['content_input'], inference_config['img_width'])
            img_loader = DataLoader(img_dataset, batch_size=inference_config['batch_size'])

            try:
                processed_imgs_cnt = 0
                for batch_id, img_batch in enumerate(img_loader):
                    processed_imgs_cnt += len(img_batch)
                    if inference_config['verbose']:
                        print(f'Processing batch {batch_id + 1} ({processed_imgs_cnt}/{len(img_dataset)} processed images).')

                    img_batch = img_batch.to(device)
                    stylized_imgs = stylization_model(img_batch).to('cpu').numpy()
                    for stylized_img in stylized_imgs:
                        utils.save_and_maybe_display_image(inference_config, stylized_img, should_display=False)
            except Exception as e:
                print(e)
                print(f'Consider making the batch_size (current = {inference_config["batch_size"]} images) or img_width (current = {inference_config["img_width"]} px) smaller')
                exit(1)
        else:  # do stylization for a single image
            content_img_path = os.path.join(inference_config['content_images_path'], inference_config['content_input'])
            content_image = utils.prepare_img(content_img_path, inference_config['img_width'], device)
            stylized_img = stylization_model(content_image).to('cpu').numpy()[0]
            utils.save_and_maybe_display_image(inference_config, stylized_img, should_display=inference_config['should_not_display'])
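# Example call (a minimal sketch): the keys mirror the lookups inside this extended
# stylize_static_image; the concrete paths and names are hypothetical placeholders, and
# utils.save_and_maybe_display_image may read additional keys (e.g. an output directory) not shown here.
example_batch_config = {
    'model_binaries_path': os.path.join('models', 'binaries'),      # hypothetical checkpoint directory
    'model_name': 'style_model.pth',                                # hypothetical checkpoint name
    'verbose': True,
    'content_input': os.path.join('data', 'content-images'),        # a directory triggers the batch branch
    'content_images_path': os.path.join('data', 'content-images'),  # only used by the single-image branch
    'img_width': 500,
    'batch_size': 4,
    'should_not_display': False,  # forwarded as should_display in the single-image branch
}
stylize_static_image(example_batch_config)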