def stylize_static_image(inference_config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    content_img_path = os.path.join(inference_config['content_images_path'], inference_config['content_img_name'])
    content_image = utils.prepare_img(content_img_path, inference_config['img_width'], device)

    # load the weights and set the model to evaluation mode
    stylization_model = TransformerNet().to(device)
    training_state = torch.load(os.path.join(inference_config["model_binaries_path"], inference_config["model_name"]))
    utils.print_model_metadata(training_state)
    state_dict = training_state["state_dict"]
    stylization_model.load_state_dict(state_dict, strict=True)
    stylization_model.eval()

    with torch.no_grad():
        stylized_img = stylization_model(content_image).to('cpu').numpy()[0]
        utils.save_and_maybe_display_image(inference_config, stylized_img, should_display=True)
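
A minimal usage sketch for the function above. Every key shown is one the function actually reads; the concrete paths and file names are placeholders rather than values from the original project:

inference_config = {
    'content_images_path': os.path.join('data', 'content-images'),  # placeholder directory with content images
    'content_img_name': 'example.jpg',                               # placeholder image file name
    'img_width': 500,                                                # width passed to utils.prepare_img
    'model_binaries_path': os.path.join('models', 'binaries'),       # placeholder directory with model checkpoints
    'model_name': 'style_model.pth',                                  # placeholder checkpoint file name
}
stylize_static_image(inference_config)
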
Example 2
def translate_a_single_sentence(translation_config):
    device = torch.device("cuda" if torch.cuda.is_available() else
                          "cpu")  # checking whether you have a GPU

    # Step 1: Prepare the field processor (tokenizer, numericalizer)
    _, _, src_field_processor, trg_field_processor = get_datasets_and_vocabs(
        translation_config['dataset_path'],
        translation_config['language_direction'],
        translation_config['dataset_name'] == DatasetType.IWSLT.name)
    assert src_field_processor.vocab.stoi[
        PAD_TOKEN] == trg_field_processor.vocab.stoi[PAD_TOKEN]
    pad_token_id = src_field_processor.vocab.stoi[
        PAD_TOKEN]  # needed for constructing masks

    # Step 2: Prepare the model
    baseline_transformer = Transformer(
        model_dimension=BASELINE_MODEL_DIMENSION,
        src_vocab_size=len(src_field_processor.vocab),
        trg_vocab_size=len(trg_field_processor.vocab),
        number_of_heads=BASELINE_MODEL_NUMBER_OF_HEADS,
        number_of_layers=BASELINE_MODEL_NUMBER_OF_LAYERS,
        dropout_probability=BASELINE_MODEL_DROPOUT_PROB,
        log_attention_weights=True).to(device)

    model_path = os.path.join(BINARIES_PATH, translation_config['model_name'])
    if not os.path.exists(model_path):
        print(f'Model {model_path} does not exist, attempting to download.')
        model_path = download_models(translation_config)

    model_state = torch.load(model_path)
    print_model_metadata(model_state)
    baseline_transformer.load_state_dict(model_state["state_dict"],
                                         strict=True)
    baseline_transformer.eval()

    # Step 3: Prepare the input sentence
    source_sentence = translation_config['source_sentence']
    ex = Example.fromlist([source_sentence],
                          fields=[('src', src_field_processor)
                                  ])  # tokenize the sentence

    source_sentence_tokens = ex.src
    print(f'Source sentence tokens = {source_sentence_tokens}')

    # Numericalize and convert to cuda tensor
    src_token_ids_batch = src_field_processor.process([source_sentence_tokens],
                                                      device)

    with torch.no_grad():
        # Step 4: Optimization - compute the source token representations only once
        src_mask, _ = get_masks_and_count_tokens_src(src_token_ids_batch,
                                                     pad_token_id)
        src_representations_batch = baseline_transformer.encode(
            src_token_ids_batch, src_mask)

        # Step 5: Decoding process
        if translation_config['decoding_method'] == DecodingMethod.GREEDY:
            target_sentence_tokens = greedy_decoding(
                baseline_transformer, src_representations_batch, src_mask,
                trg_field_processor)
        else:
            beam_decoding = get_beam_decoder(translation_config)
            target_sentence_tokens = beam_decoding(baseline_transformer,
                                                   src_representations_batch,
                                                   src_mask,
                                                   trg_field_processor)
        print(
            f'Translation | Target sentence tokens = {target_sentence_tokens}')

        # Step 6: Potentially visualize the encoder/decoder attention weights
        if translation_config['visualize_attention']:
            visualize_attention(baseline_transformer, source_sentence_tokens,
                                target_sentence_tokens)
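
A usage sketch for translate_a_single_sentence covering every config key it reads. The dataset path, checkpoint name and language-direction string are placeholders/assumptions, not values taken from the original project:

translation_config = {
    'dataset_path': os.path.join('data', 'iwslt'),        # placeholder dataset location
    'language_direction': 'G2E',                          # assumed German-to-English identifier
    'dataset_name': DatasetType.IWSLT.name,
    'model_name': 'transformer_000000.pth',               # placeholder checkpoint inside BINARIES_PATH
    'source_sentence': 'Ich bin ein guter Mensch, denke ich.',
    'decoding_method': DecodingMethod.GREEDY,             # beam search (get_beam_decoder) may need extra config keys
    'visualize_attention': False,
}
translate_a_single_sentence(translation_config)
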
Example 3
def visualize_gat_properties(model_name=r'gat_000000.pth',
                             dataset_name=DatasetType.CORA.name,
                             visualization_type=VisualizationType.ATTENTION):
    """
    Using t-SNE to visualize GAT embeddings in 2D space.
    Check out this one for more intuition on how to tune t-SNE: https://distill.pub/2016/misread-tsne/

    If you think it'd be useful for me to implement t-SNE as well and explain how every single detail works
    open up an issue or DM me on social media! <3

    Note: I also tried using UMAP but it doesn't provide any more insight than t-SNE.
    (con: it has a lot of dependencies if you want to use their plotting functionality)

    """
    device = torch.device("cuda" if torch.cuda.is_available() else
                          "cpu")  # checking whether you have a GPU, I hope so!

    config = {
        'dataset_name': dataset_name,
        'layer_type': LayerType.IMP3,
        'should_visualize': False  # don't visualize the dataset
    }

    # Step 1: Prepare the data
    node_features, node_labels, topology, train_indices, val_indices, test_indices = load_graph_data(
        config, device)

    # Step 2: Prepare the model
    model_path = os.path.join(BINARIES_PATH, model_name)
    model_state = torch.load(model_path)

    gat = GAT(num_of_layers=model_state['num_of_layers'],
              num_heads_per_layer=model_state['num_heads_per_layer'],
              num_features_per_layer=model_state['num_features_per_layer'],
              add_skip_connection=model_state['add_skip_connection'],
              bias=model_state['bias'],
              dropout=model_state['dropout'],
              layer_type=name_to_layer_type(model_state['layer_type']),
              log_attention_weights=True).to(device)

    print_model_metadata(model_state)
    gat.load_state_dict(model_state["state_dict"], strict=True)
    gat.eval()  # some layers like nn.Dropout behave differently in train vs eval mode so this part is important

    # Step 3: Calculate the things we'll need for different visualization types (attention, scores, edge_index)

    # This context manager is important (and you'll often see it), otherwise PyTorch will eat much more memory.
    # It would otherwise save activations for backprop, but we're not doing any training here - just prediction.
    with torch.no_grad():
        # Run predictions and collect the high-dimensional data
        all_nodes_unnormalized_scores, _ = gat(
            (node_features, topology))  # shape = (N, num of classes)
        all_nodes_unnormalized_scores = all_nodes_unnormalized_scores.cpu(
        ).numpy()

    # We'll need the edge index for multiple visualization types
    if config['layer_type'] == LayerType.IMP3:  # imp 3 works with edge index while others work with adjacency info
        edge_index = topology
    else:
        edge_index = convert_adj_to_edge_index(topology)

    # Step 4: Perform a specific visualization
    if visualization_type == VisualizationType.ATTENTION:
        # The number of nodes for which we want to visualize their attention over neighboring nodes
        # (2x this actually as we add nodes with highest degree + random nodes)
        num_nodes_of_interest = 4  # 4 is an arbitrary number, feel free to play with it
        head_to_visualize = 0  # plot attention from this multi-head attention's head
        gat_layer_id = 1  # plot attention from this GAT layer

        # Build up the complete graph
        # node_features shape = (N, FIN), where N is the number of nodes and FIN number of input features
        total_num_of_nodes = len(node_features)
        complete_graph = ig.Graph()
        complete_graph.add_vertices(
            total_num_of_nodes
        )  # igraph creates nodes with ids [0, total_num_of_nodes - 1]
        edge_index_tuples = list(zip(
            edge_index[0, :], edge_index[1, :]))  # igraph requires this format
        complete_graph.add_edges(edge_index_tuples)

        # Pick the target nodes to plot (nodes with highest degree + random nodes)
        # Note: there could be an overlap between random nodes and nodes with highest degree - but highly unlikely
        nodes_of_interest_ids = np.argpartition(
            complete_graph.degree(),
            -num_nodes_of_interest)[-num_nodes_of_interest:]
        random_node_ids = np.random.randint(low=0,
                                            high=total_num_of_nodes,
                                            size=num_nodes_of_interest)
        nodes_of_interest_ids = np.append(nodes_of_interest_ids,
                                          random_node_ids)
        np.random.shuffle(nodes_of_interest_ids)

        target_node_ids = edge_index[1]
        source_nodes = edge_index[0]

        for target_node_id in nodes_of_interest_ids:
            # Step 1: Find the neighboring nodes to the target node
            # Note: self edges are included for CORA so the target node is its own neighbor (Alexandro, I am your mother)
            src_nodes_indices = torch.eq(target_node_ids, target_node_id)
            source_node_ids = source_nodes[src_nodes_indices].cpu().numpy()
            size_of_neighborhood = len(source_node_ids)

            # Step 2: Fetch their labels
            labels = node_labels[source_node_ids].cpu().numpy()

            # Step 3: Fetch the attention weights for edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim since it's superfluous
            all_attention_weights = gat.gat_net[
                gat_layer_id].attention_weights.squeeze(dim=-1)
            attention_weights = all_attention_weights[
                src_nodes_indices, head_to_visualize].cpu().numpy()
            # This part shows that for CORA what GAT learns is pretty much constant attention weights! Like in GCN!
            print(
                f'Max attention weight = {np.max(attention_weights)} and min = {np.min(attention_weights)}'
            )
            attention_weights /= np.max(
                attention_weights
            )  # rescale the biggest weight to 1 for nicer plotting

            # Build up the neighborhood graph whose attention we want to visualize
            # igraph constraint - it works with a contiguous range of ids so we map e.g. node 497 to 0, 12 to 1, etc.
            id_to_igraph_id = dict(
                zip(source_node_ids, range(len(source_node_ids))))
            ig_graph = ig.Graph()
            ig_graph.add_vertices(size_of_neighborhood)
            ig_graph.add_edges([(id_to_igraph_id[neighbor],
                                 id_to_igraph_id[target_node_id])
                                for neighbor in source_node_ids])

            # Prepare the visualization settings dictionary and plot
            visual_style = {
                "edge_width":
                attention_weights,  # make edges as thick as the corresponding attention weight
                "layout": ig_graph.layout_reingold_tilford_circular(
                )  # layout for tree-like graphs
            }
            # This is the only part that's Cora specific as Cora has 7 labels
            if dataset_name.lower() == DatasetType.CORA.name.lower():
                visual_style["vertex_color"] = [
                    cora_label_to_color_map[label] for label in labels
                ]
            else:
                print(
                    'Add custom color scheme for your specific dataset. Using igraph default coloring.'
                )

            ig.plot(ig_graph, **visual_style)

    elif visualization_type == VisualizationType.EMBEDDINGS:  # visualize embeddings (using t-SNE)
        node_labels = node_labels.cpu().numpy()
        num_classes = len(set(node_labels))

        # Feel free to experiment with perplexity - it's arguably the most important parameter of t-SNE and it basically
        # controls the standard deviation of the Gaussians, i.e. the size of the neighborhoods in the high-dim (original) space.
        # Simply put, the goal of t-SNE is to minimize the KL-divergence between the joint Gaussian distribution fit over the
        # high-dim points and the Student's t-distribution fit over the low-dim points (the ones we're plotting).
        # Intuitively, by doing this, we preserve the similarities (relationships) between the high and low dim points.
        # This (probably) won't make much sense if you're not already familiar with t-SNE, God knows I've tried. :P
        t_sne_embeddings = TSNE(
            n_components=2, perplexity=30,
            method='barnes_hut').fit_transform(all_nodes_unnormalized_scores)

        for class_id in range(num_classes):
            # We extract the points whose true label equals class_id and we color them in the same way, hopefully
            # they'll be clustered together on the 2D chart - that would mean that GAT has learned good representations!
            plt.scatter(t_sne_embeddings[node_labels == class_id, 0],
                        t_sne_embeddings[node_labels == class_id, 1],
                        s=20,
                        color=cora_label_to_color_map[class_id],
                        edgecolors='black',
                        linewidths=0.2)
        plt.show()

    # We want our local probability distributions (attention weights over the neighborhoods) to be
    # non-uniform because that means that GAT is learning a useful pattern. Entropy histograms help us visualize
    # how different those neighborhood distributions are from the uniform distribution (constant attention).
    # If the GAT is learning const attention we could well be using GCN or some even simpler models.
    elif visualization_type == VisualizationType.ENTROPY:
        num_heads_per_layer = [layer.num_of_heads for layer in gat.gat_net]
        num_layers = len(num_heads_per_layer)

        num_of_nodes = len(node_features)
        target_node_ids = edge_index[1].cpu().numpy()

        # For every GAT layer and for every GAT attention head plot the entropy histogram
        for layer_id in range(num_layers):
            # Fetch the attention weights for edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim since it's superfluous
            all_attention_weights = gat.gat_net[
                layer_id].attention_weights.squeeze(dim=-1).cpu().numpy()

            for head_id in range(num_heads_per_layer[layer_id]):
                uniform_dist_entropy_list = []  # save the ideal uniform histogram as the reference
                neighborhood_entropy_list = []

                for target_node_id in range(num_of_nodes):  # find the neighborhood of every node in the graph
                    # These attention weights sum up to 1 by GAT design so we can treat them as a probability distribution
                    neighborhood_attention = all_attention_weights[
                        target_node_ids == target_node_id].flatten()
                    # Reference uniform distribution of the same length
                    ideal_uniform_attention = np.ones(
                        len(neighborhood_attention)) / len(
                            neighborhood_attention)

                    # Calculate the entropy, check out this video if you're not familiar with the concept:
                    # https://www.youtube.com/watch?v=ErfnhcEV1O8 (Aurélien Géron)
                    neighborhood_entropy_list.append(
                        entropy(neighborhood_attention, base=2))
                    uniform_dist_entropy_list.append(
                        entropy(ideal_uniform_attention, base=2))

                title = f'Cora entropy histogram layer={layer_id}, attention head={head_id}'
                draw_entropy_histogram(uniform_dist_entropy_list,
                                       title,
                                       color='orange',
                                       uniform_distribution=True)
                draw_entropy_histogram(neighborhood_entropy_list,
                                       title,
                                       color='dodgerblue')

                fig = plt.gcf()  # get current figure
                plt.show()
                fig.savefig(
                    os.path.join(DATA_DIR_PATH,
                                 f'layer_{layer_id}_head_{head_id}.jpg'))
                plt.close()
    else:
        raise Exception(
            f'Visualization type {visualization_type} not supported.')
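
Since visualize_gat_properties ships with defaults, a call only has to pick a checkpoint and a visualization type; the checkpoint below is the signature's default and is expected to live in BINARIES_PATH:

visualize_gat_properties(
    model_name=r'gat_000000.pth',
    dataset_name=DatasetType.CORA.name,
    visualization_type=VisualizationType.ENTROPY,  # or VisualizationType.ATTENTION / VisualizationType.EMBEDDINGS
)
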
Example 4

# NOTE: assumed function name and signature, inferred from how these variables are used in the body below
def evaluate_dqn_agent(env_id, model_name, buffer_size, epsilon_eval, game_frames_dump_dir, should_record_video=False):
    if os.path.exists(game_frames_dump_dir):  # start each evaluation run with an empty frame-dump directory
        shutil.rmtree(game_frames_dump_dir)
    os.makedirs(game_frames_dump_dir, exist_ok=True)

    # Step 1: Prepare environment, replay buffer and schedule
    env = utils.get_env_wrapper(env_id, record_video=should_record_video)
    replay_buffer = ReplayBuffer(buffer_size)
    const_schedule = utils.ConstSchedule(
        epsilon_eval
    )  # lambda would also do - doing it like this for consistency

    # Step 2: Prepare the DQN model
    model_path = os.path.join(BINARIES_PATH, model_name)
    model_state = torch.load(model_path)
    assert model_state['env_id'] == env_id, \
        f"Model {model_name} was trained on {model_state['env_id']} but you're running it on {env_id}."
    utils.print_model_metadata(model_state)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dqn = DQN(env,
              number_of_actions=env.action_space.n,
              epsilon_schedule=const_schedule).to(device)
    dqn.load_state_dict(model_state["state_dict"], strict=True)
    dqn.eval()

    # Step 3: Evaluate the agent on a single episode
    print(f'{"*"*10} Starting the game. {"*"*10}')
    last_frame = env.reset()

    score = 0
    cnt = 0
    while True:
        # NOTE: a minimal, assumed game loop (greedy action from the Q-network); observation preprocessing
        # and action selection may differ from the project's actual evaluation loop.
        with torch.no_grad():
            observation = torch.from_numpy(np.asarray(last_frame, dtype=np.float32)).unsqueeze(0).to(device)
            action = dqn(observation).argmax(dim=-1).item()
        last_frame, reward, done, _ = env.step(action)
        score += reward
        cnt += 1
        if done:
            print(f'Episode finished, score = {score} ({cnt} frames).')
            break


Example 5
def stylize_static_image(inference_config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare the model - load the weights and put the model into evaluation mode
    stylization_model = TransformerNet().to(device)
    training_state = torch.load(
        os.path.join(inference_config["model_binaries_path"],
                     inference_config["model_name"]))
    state_dict = training_state["state_dict"]
    stylization_model.load_state_dict(state_dict, strict=True)
    stylization_model.eval()

    if inference_config['verbose']:
        utils.print_model_metadata(training_state)

    with torch.no_grad():
        if os.path.isdir(
                inference_config['content_input']
        ):  # do a batch stylization (every image in the directory)
            img_dataset = utils.SimpleDataset(
                inference_config['content_input'],
                inference_config['img_width'])
            img_loader = DataLoader(img_dataset,
                                    batch_size=inference_config['batch_size'])

            try:
                processed_imgs_cnt = 0
                for batch_id, img_batch in enumerate(img_loader):
                    processed_imgs_cnt += len(img_batch)
                    if inference_config['verbose']:
                        print(
                            f'Processing batch {batch_id + 1} ({processed_imgs_cnt}/{len(img_dataset)} processed images).'
                        )

                    img_batch = img_batch.to(device)
                    stylized_imgs = stylization_model(img_batch).to(
                        'cpu').numpy()
                    for stylized_img in stylized_imgs:
                        utils.save_and_maybe_display_image(
                            inference_config,
                            stylized_img,
                            should_display=False)
            except Exception as e:
                print(e)
                print(
                    f'Consider making the batch_size (current = {inference_config["batch_size"]} images) or img_width (current = {inference_config["img_width"]} px) smaller'
                )
                exit(1)

        else:  # do stylization for a single image
            content_img_path = os.path.join(
                inference_config['content_images_path'],
                inference_config['content_input'])
            content_image = utils.prepare_img(content_img_path,
                                              inference_config['img_width'],
                                              device)
            stylized_img = stylization_model(content_image).to(
                'cpu').numpy()[0]
            utils.save_and_maybe_display_image(
                inference_config,
                stylized_img,
                should_display=inference_config['should_not_display'])
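
A usage sketch for the batch-capable variant above. Pointing content_input at a directory triggers batch stylization; a plain file name (resolved against content_images_path) stylizes a single image, and only that branch reads the additional 'should_not_display' key. All paths and names below are placeholders:

inference_config = {
    'model_binaries_path': os.path.join('models', 'binaries'),      # placeholder checkpoint directory
    'model_name': 'style_model.pth',                                  # placeholder checkpoint file name
    'content_input': os.path.join('data', 'content-images'),         # a directory -> batch stylization
    'content_images_path': os.path.join('data', 'content-images'),   # only used when content_input is a file name
    'img_width': 500,
    'batch_size': 4,
    'verbose': True,
}
stylize_static_image(inference_config)
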