def batchify(self, data_filepaths: List[str], ctx: mx.context.Context):
        """
        Load the given datapoint files and merge them into one batch.

        All graphs are combined into a single disconnected graph (block-diagonal
        adjacency per edge type), and per-datapoint label indices are offset so
        they index nodes in the combined graph.  Returns a Batch whose label is a
        1-hot float vector over all nodes (1 = node was labeled).
        """
        datapoints = [self.data_encoder.load_datapoint(fp) for fp in data_filepaths]

        # Number of nodes contributed by each constituent graph
        batch_sizes = nd.array([len(dp.node_names) for dp in datapoints], dtype='int32', ctx=ctx)

        node_types = tuple_of_tuples_to_padded_array(
            tuple(itertools.chain.from_iterable(dp.node_types for dp in datapoints)), ctx)
        node_names = tuple_of_tuples_to_padded_array(
            tuple(itertools.chain.from_iterable(dp.node_names for dp in datapoints)), ctx)

        # Stack the per-graph adjacency matrices block-diagonally: one big,
        # disconnected graph per edge type
        edges = OrderedDict()
        for edge_type in self.data_encoder.all_edge_types:
            stacked = sp.sparse.block_diag([dp.edges[edge_type] for dp in datapoints]).tocsr()
            edges[edge_type] = nd.sparse.csr_matrix(
                (stacked.data, stacked.indices, stacked.indptr),
                shape=stacked.shape, dtype='float32', ctx=ctx)

        # Shift each datapoint's label indices by the number of nodes that
        # precede it in the combined graph, then 1-hot encode over all nodes.
        # (Offsets accumulate len(node_types); assumed equal to len(node_names)
        # per datapoint — TODO confirm.)
        offset = 0
        shifted_labels = []
        for dp in datapoints:
            shifted_labels.extend(label + offset for label in dp.label)
            offset += len(dp.node_types)
        label_idxs = nd.array(shifted_labels, dtype='int32', ctx=ctx)
        one_hot_labels = nd.zeros(offset, dtype='float32', ctx=ctx)
        one_hot_labels[label_idxs] = 1

        batch_input = self.InputClass(edges, node_types, node_names, batch_sizes, ctx)
        return Batch(batch_input, one_hot_labels)
    def batchify(self, data_filepaths: List[str], ctx: mx.context.Context):
        """
        Returns combined graphs and labels.
        Labels are a (PaddedArray, Tuple[str]) tuple.  The PaddedArray is size (batch x max_name_length) containing integers
        The integer values correspond to the integers in this model's data encoder's all_node_name_subtokens dict
        (i.e. rows in the name_embedding matrix)
        """
        data = [self.data_encoder.load_datapoint(i) for i in data_filepaths]

        # Get the size of each graph
        batch_sizes = nd.array([len(dp.node_names) for dp in data], dtype='int32', ctx=ctx)

        combined_node_types = tuple(itertools.chain(*[dp.node_types for dp in data]))
        node_types = tuple_of_tuples_to_padded_array(combined_node_types, ctx)
        combined_node_names = tuple(itertools.chain(*[dp.node_names for dp in data]))
        # Locations (in the combined graph) of the variables to be named.  The
        # name-me flag is encoded as a single-subtoken name, hence the
        # comparison against the 1-tuple (target_location_idx,)
        target_location_idx = self.data_encoder.all_node_name_subtokens[self.data_encoder.name_me_flag]
        target_locations = [i for i, name in enumerate(combined_node_names) if name == (target_location_idx,)]
        node_names = tuple_of_tuples_to_padded_array(combined_node_names, ctx)

        # Combine all the adjacency matrices into one big, disconnected graph
        edges = OrderedDict()
        for edge_type in self.data_encoder.all_edge_types:
            adj_mat = sp.sparse.block_diag([dp.edges[edge_type] for dp in data]).tocsr()
            adj_mat = nd.sparse.csr_matrix((adj_mat.data, adj_mat.indices, adj_mat.indptr), shape=adj_mat.shape,
                                           dtype='float32', ctx=ctx)
            edges[edge_type] = adj_mat

        # Combine the (encoded) real names of variables-to-be-named, one row per
        # datapoint.  (Previously this was wrapped in itertools.chain without
        # unpacking, which is a no-op over a single iterable: it yielded each
        # dp.label unchanged while reading as if it flattened them.)
        combined_labels = tuple(dp.label for dp in data)
        labels = tuple_of_tuples_to_padded_array(combined_labels, ctx, pad_amount=self.max_name_length)
        # Combine the (actual) real names of variables-to-be-named
        real_names = tuple(dp.real_variable_name for dp in data)

        data = self.InputClass(edges, node_types, node_names, batch_sizes, ctx, target_locations=target_locations)
        return Batch(data, [labels, real_names])
    def batchify(self, data_filepaths: List[str], ctx: mx.context.Context):
        """
        Returns combined graphs and labels.
        Labels are a (PaddedArray, Tuple[str]) tuple.  The PaddedArray is size (batch x max_name_length) containing integers.
        The integer values correspond to the integers in this model's data encoder's all_node_name_subtokens dict,
           or if the integer value is greater than len(all_node_name_subtokens) it corresponds to which subtoken node
           in the graph represents the right subtoken
        """
        data = [self.data_encoder.load_datapoint(i) for i in data_filepaths]

        # Get the size of each graph
        batch_sizes = nd.array([len(dp.node_names) for dp in data],
                               dtype='int32',
                               ctx=ctx)

        combined_node_types = tuple(
            itertools.chain(*[dp.node_types for dp in data]))
        # Indices (in the combined graph) of nodes whose first type entry marks
        # them as subtoken ("graph vocab") nodes - candidates for the pointer
        # mechanism - plus their real names, kept per-datapoint.
        subtoken_node_type_idx = self.data_encoder.all_node_types[
            self.data_encoder.subtoken_flag]
        graph_vocab_node_locations = [
            i for i in range(len(combined_node_types))
            if combined_node_types[i][0] == subtoken_node_type_idx
        ]
        graph_vocab_node_locations = np.array(graph_vocab_node_locations)
        graph_vocab_node_real_names = [
            dp.graph_vocab_node_real_names for dp in data
        ]
        node_types = tuple_of_tuples_to_padded_array(combined_node_types, ctx)
        combined_node_names = tuple(
            itertools.chain(*[dp.node_names for dp in data]))
        # Here node names are raw strings (compared directly against the flag),
        # unlike the subtoken-encoded variant of batchify that compares against
        # a 1-tuple of the flag's vocab index.
        target_locations = [
            i for i, name in enumerate(combined_node_names)
            if name == self.data_encoder.name_me_flag
        ]
        # 1-hot-encode every node name character-wise, with special markings
        # for internal nodes and the to-be-named (name-me) nodes.
        node_names = []
        for name in combined_node_names:
            if name == self.data_encoder.internal_node_flag:
                node_names.append(
                    self.data_encoder.name_to_1_hot(
                        '',
                        embedding_size=self.data_encoder.
                        max_name_encoding_length,
                        mark_as_internal=True))
            elif name == self.data_encoder.name_me_flag:
                node_names.append(
                    self.data_encoder.name_to_1_hot(
                        '',
                        embedding_size=self.data_encoder.
                        max_name_encoding_length,
                        mark_as_special=True))
            else:
                node_names.append(
                    self.data_encoder.name_to_1_hot(
                        name,
                        embedding_size=self.data_encoder.
                        max_name_encoding_length))
        node_names = nd.array(np.stack(node_names), dtype='float32', ctx=ctx)

        # Combine all the adjacency matrices into one big, disconnected graph
        edges = OrderedDict()
        for edge_type in self.data_encoder.all_edge_types:
            adj_mat = sp.sparse.block_diag(
                [dp.edges[edge_type] for dp in data]).tocsr()
            adj_mat = nd.sparse.csr_matrix(
                (adj_mat.data, adj_mat.indices, adj_mat.indptr),
                shape=adj_mat.shape,
                dtype='float32',
                ctx=ctx)
            edges[edge_type] = adj_mat

        # Get the real names of the variables we're supposed to be naming
        # NOTE(review): chain(...) without * over a single list is a no-op
        # wrapper - this is equivalent to [dp.label[0] for dp in data]; each
        # dp.label[0] is presumably itself a tuple of vocab indices - confirm.
        combined_closed_vocab_labels = list(
            itertools.chain([dp.label[0] for dp in data]))
        # vocab labels are integers referring to indices in the model's data encoder's all_node_name_subtokens
        vocab_labels = tuple_of_tuples_to_padded_array(
            combined_closed_vocab_labels, ctx, pad_amount=self.max_name_length)
        # Map each label subtoken (dp.label[1]) to the position of the matching
        # subtoken node within this datapoint's own list of graph vocab nodes.
        combined_attn_labels = []
        for dp in data:
            graph_vocab_nodes_in_dp = [
                i for i in range(len(dp.node_types))
                if dp.node_types[i][0] == subtoken_node_type_idx
            ]
            combined_attn_labels.append(
                tuple([
                    graph_vocab_nodes_in_dp.index(i) + 1 if i >= 0 else -1
                    for i in dp.label[1]
                ]))
        # attn labels are integers referring to indices (+1 to avoid confusion with the padding value, which is 0) in the list of attn weights over graph vocab nodes the model will eventually output (or -1 if there's no appropriate node)
        attn_labels = tuple_of_tuples_to_padded_array(
            combined_attn_labels, ctx, pad_amount=self.max_name_length)
        attn_label = attn_labels.values
        # Positive entries mean the subtoken exists as a node in the graph
        # (padding is 0 and "no matching node" is -1, so neither qualifies)
        subtoken_in_graph = attn_label > 0
        # Shift attn labels past the closed vocab's index range, per the
        # docstring's convention: values >= len(all_node_name_subtokens) are
        # pointers into the graph vocab nodes rather than vocab rows.
        attn_label = len(
            self.data_encoder.all_node_name_subtokens
        ) + attn_label - 1  # -1 because we're done avoiding the padding value
        # If the correct subtoken was in the graph, then pointing to it is the correct output (it will always be in the vocab during training)
        joint_label = PaddedArray(values=nd.where(subtoken_in_graph,
                                                  attn_label,
                                                  vocab_labels.values),
                                  value_lengths=vocab_labels.value_lengths)

        # Combine the (actual) real names of variables-to-be-named
        real_names = tuple([dp.real_variable_name for dp in data])

        data = self.InputClass(
            edges,
            node_types,
            node_names,
            batch_sizes,
            ctx,
            target_locations=target_locations,
            graph_vocab_node_locations=graph_vocab_node_locations,
            graph_vocab_node_real_names=graph_vocab_node_real_names)
        return Batch(data, [joint_label, real_names])
 def test_recursive_move_to_context_moves_all_elements(self, input):
     """Moving a nested container to cpu(1) should leave nothing on cpu(0)."""
     # Wrap the fixture value so the recursive move operates on a container
     container = [input]
     self.assertNotIn('cpu(1)', str(container))  # Super hacky test...
     Batch.recurse_move_to_context(container, mx.cpu(1))
     self.assertNotIn('cpu(0)', str(container))  # Super hacky test...