def tensorise_token_sequence(
    vocab: Vocabulary,
    length: int,
    token_seq: Iterable[str],
) -> List[int]:
    """
    Tensorise a single example.

    Args:
        vocab: Vocabulary to use for mapping tokens to integer IDs
        length: Length to truncate/pad sequences to.
        token_seq: Sequence of tokens to tensorise.

    Returns:
        List with length elements that are integer IDs of tokens in our vocab.
    """
    #TODO 4# Insert your tensorisation code here
    # Materialise the (possibly lazy) iterable so that len() below is well-defined:
    token_seq = list(token_seq)

    token_ids = [vocab.get_id_or_unk(START_SYMBOL)]
    token_ids.extend(
        vocab.get_id_or_unk_multiple(token_seq, pad_to_size=length - 1))

    # END_SYMBOL must be the last real element in the tensorised sequence:
    end_position = min(1 + len(token_seq), length - 1)
    token_ids[end_position] = vocab.get_id_or_unk(END_SYMBOL)

    return token_ids
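
# Illustrative usage sketch (not part of the original example). It assumes the
# dpu_utils-style Vocabulary API used above and that START_SYMBOL / END_SYMBOL
# are module-level constants defined elsewhere in the original code.
def _demo_tensorise_token_sequence() -> None:
    toy_vocab = Vocabulary(add_unk=True, add_pad=True)
    toy_vocab.add_or_get_id(START_SYMBOL)
    toy_vocab.add_or_get_id(END_SYMBOL)
    for token in ("def", "foo", "(", ")", ":"):
        toy_vocab.add_or_get_id(token)

    # Expected layout: [START, def, foo, (, ), END, PAD, PAD, PAD, PAD]
    ids = tensorise_token_sequence(toy_vocab, 10, ["def", "foo", "(", ")"])
    assert len(ids) == 10
    assert ids[0] == toy_vocab.get_id_or_unk(START_SYMBOL)
    assert ids[5] == toy_vocab.get_id_or_unk(END_SYMBOL)
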
    def _tensorise_target_token_sequence(
        self, vocab: Vocabulary, length: int, token_seq: Iterable[str],
    ) -> List[int]:
        """
        Tensorise a single example.

        Args:
            vocab: Vocabulary to use for mapping tokens to integer IDs
            length: Length to truncate/pad sequences to.
            token_seq: Sequence of tokens to tensorise.

        Returns:
            List with length elements that are integer IDs of tokens in our vocab.
        """
        # Materialise the iterable so that len() and indexing below are well-defined:
        token_seq = list(token_seq)

        tensorised = []
        for i in range(length):
            if i == 0:
                tensorised.append(vocab.get_id_or_unk(START_SYMBOL))
            elif len(token_seq) >= i:
                # Positions 1..len(token_seq) hold the (possibly truncated) tokens:
                tensorised.append(vocab.get_id_or_unk(token_seq[i - 1]))
            elif i == len(token_seq) + 1:
                tensorised.append(vocab.get_id_or_unk(END_SYMBOL))
            else:
                # Any remaining positions are padding:
                tensorised.append(vocab.get_id_or_unk(vocab.get_pad()))

        return tensorised
    def _build_vocab(
        self, dataset, vocab_size: int, max_num_files: Optional[int] = None
    ) -> Vocabulary:
        """
        Compute model metadata such as a vocabulary.

        Args:
            dataset: Dataset of token sequences to build the vocabulary from.
            vocab_size: Maximal size of the vocabulary to create.
            max_num_files: Maximal number of files to load.
        """
        vocab = Vocabulary(add_unk=True, add_pad=True)
        # Make sure to include the START_SYMBOL in the vocabulary as well:
        vocab.add_or_get_id(START_SYMBOL)
        vocab.add_or_get_id(END_SYMBOL)
        cnt = collections.Counter()

        for token_seq in dataset:
            for token in token_seq:
                cnt[token] += 1

        for token, _ in cnt.most_common(vocab_size):
            vocab.add_or_get_id(token)

        return vocab
def build_vocab_from_data_dir(
        data_dir: str,
        vocab_size: int,
        max_num_files: Optional[int] = None) -> Vocabulary:
    """
    Compute model metadata such as a vocabulary.

    Args:
        data_dir: Directory containing data files.
        vocab_size: Maximal size of the vocabulary to create.
        max_num_files: Maximal number of files to load.
    """

    data_files = get_data_files_from_directory(data_dir, max_num_files)

    vocab = Vocabulary(add_unk=True, add_pad=True)
    # Make sure to include the START_SYMBOL in the vocabulary as well:
    vocab.add_or_get_id(START_SYMBOL)
    vocab.add_or_get_id(END_SYMBOL)

    #TODO 3# Insert your vocabulary-building code here
    counter = Counter()
    for file in data_files:  # for each file, count all tokens
        list_of_samples = load_data_file(file)

        for list_tokens in list_of_samples:
            for token in list_tokens:
                counter[token] += 1

    # Add the most common tokens to the vocabulary.
    # Use vocab_size - 2 because START_SYMBOL and END_SYMBOL already occupy two slots.
    for token, _ in counter.most_common(vocab_size - 2):
        vocab.add_or_get_id(token)

    return vocab
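
# Illustrative end-to-end sketch (not from the original code): the directory
# path is hypothetical, and get_data_files_from_directory / load_data_file are
# the same helpers already used above.
def _demo_build_and_tensorise(data_dir: str = "data/train") -> None:
    vocab = build_vocab_from_data_dir(data_dir, vocab_size=10000, max_num_files=10)
    print("Vocabulary size:", len(vocab.id_to_token))

    # Tensorise the first sample of the first file with the vocabulary we just built:
    first_file = next(iter(get_data_files_from_directory(data_dir, 1)))
    first_sample = next(iter(load_data_file(first_file)))
    print(tensorise_token_sequence(vocab, 50, first_sample))
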
    def _finalise_metadata(
            self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super()._finalise_metadata(raw_metadata_list)

        # First, merge all needed information:
        merged_edge_types = set()
        merged_node_label_counter = Counter()
        merged_node_type_counter = Counter()
        merged_edge_value_sizes = {}
        for raw_metadata in raw_metadata_list:
            merged_edge_types.update(raw_metadata["cg_edge_types"])
            merged_node_label_counter += raw_metadata['cg_node_label_counter']
            merged_node_type_counter += raw_metadata['cg_node_type_counter']

            for edge_type, edge_value_size in raw_metadata[
                    'cg_edge_value_sizes'].items():
                existing_edge_value_size = merged_edge_value_sizes.get(
                    edge_type)
                if existing_edge_value_size is not None:
                    assert existing_edge_value_size == edge_value_size
                merged_edge_value_sizes[edge_type] = edge_value_size
        # Store edges allowed in the context graph, and assign numerical IDs to them:
        all_used_cg_edges = list(
            merged_edge_types -
            set(self.hyperparameters['excluded_cg_edge_types']))
        if self.hyperparameters.get('cg_add_subtoken_nodes', False):
            all_used_cg_edges.append(USES_SUBTOKEN_EDGE_NAME)
        final_metadata['cg_edge_type_dict'] = {
            e: i
            for i, e in enumerate(all_used_cg_edges)
        }

        # Store token, type, and production vocabs:
        final_metadata['cg_node_label_vocab'] = \
            Vocabulary.create_vocabulary(
                merged_node_label_counter,
                max_size=self.hyperparameters['cg_node_label_vocab_size'])

        type_embedding_size = self.hyperparameters[
            'cg_node_type_embedding_size']
        if type_embedding_size > 0:
            final_metadata['cg_node_type_vocab'] = \
                LatticeVocabulary.get_vocabulary_for(
                    tokens=merged_node_type_counter,
                    max_size=self.hyperparameters['cg_node_type_vocab_size'] - 1,
                    lattice=final_metadata['type_lattice'])
            final_metadata['cg_node_type_vocab'].add_or_get_id(NO_TYPE)
            self.hyperparameters['cg_node_type_vocab_size'] = len(
                final_metadata['cg_node_type_vocab'])

        final_metadata['cg_edge_value_sizes'] = {}
        for edge_type, edge_feature_size in merged_edge_value_sizes.items():
            fwd_edge_type_idx = final_metadata['cg_edge_type_dict'][edge_type]
            final_metadata['cg_edge_value_sizes'][
                fwd_edge_type_idx] = edge_feature_size

        return final_metadata
    def _tensorise_node_features(
        self, vocab: Vocabulary, token_seq
    ) -> np.ndarray:
        # One feature per node: the (possibly UNK) vocabulary ID of its token.
        tensorised = [[vocab.get_id_or_unk(token)] for token in token_seq]
        return np.array(tensorised)
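
# Standalone sketch of the edge-type ID assignment used in _finalise_metadata
# above (illustrative only; the edge names and the excluded set are made up).
def _demo_edge_type_ids() -> None:
    merged_edge_types = {"Child", "NextToken", "LastUse", "LastWrite"}
    excluded_cg_edge_types = {"LastWrite"}
    all_used_cg_edges = sorted(merged_edge_types - excluded_cg_edge_types)
    cg_edge_type_dict = {e: i for i, e in enumerate(all_used_cg_edges)}
    print(cg_edge_type_dict)  # {'Child': 0, 'LastUse': 1, 'NextToken': 2}
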
Example #7
    def finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                          final_metadata: Dict[str, Any]) -> None:
        # First, merge all needed information:
        merged_token_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['decoder_token_counter']

        final_metadata['decoder_token_vocab'] = \
            Vocabulary.create_vocabulary(merged_token_counter,
                                         max_size=self.hyperparameters['decoder_vocab_size'] - 2)
        final_metadata['decoder_token_vocab'].add_or_get_id(START_TOKEN)
        final_metadata['decoder_token_vocab'].add_or_get_id(END_TOKEN)
    def finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                          final_metadata: Dict[str, Any]) -> None:
        # original code
        # First, merge all needed information:
        merged_token_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['decoder_token_counter']

        final_metadata['decoder_token_vocab'] = \
            Vocabulary.create_vocabulary(merged_token_counter,
                                         max_size=self.hyperparameters['decoder_vocab_size'] - 2, count_threshold=1)
        final_metadata['decoder_token_vocab'].add_or_get_id(START_TOKEN)
        final_metadata['decoder_token_vocab'].add_or_get_id(END_TOKEN)

        # Additional code to process variable occurrences:
        merged_var_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_var_counter += raw_metadata['var_name_counter']
        final_metadata['var_occurrence_vocab'] = \
            Vocabulary.create_vocabulary(
                merged_var_counter,
                max_size=self.hyperparameters['decoder_vocab_size'])
        print("TOTAL VARIABLE VOCAB SIZE: ",
              len(final_metadata['var_occurrence_vocab'].id_to_token))
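
# Standalone sketch of the counter-merge pattern above (illustrative only: the
# token counts are made up, and START_TOKEN / END_TOKEN come from the
# surrounding original code). Merging the per-example Counters first keeps the
# globally most frequent tokens when the vocabulary is created.
def _demo_merge_token_counters() -> None:
    per_example_counts = [
        Counter({"return": 5, "self": 3}),
        Counter({"return": 2, "value": 4}),
    ]
    merged_token_counter = Counter()
    for c in per_example_counts:
        merged_token_counter += c

    vocab = Vocabulary.create_vocabulary(
        merged_token_counter, max_size=100, count_threshold=1)
    vocab.add_or_get_id(START_TOKEN)
    vocab.add_or_get_id(END_TOKEN)
    print("Most common tokens:", merged_token_counter.most_common(3))
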
Example #9
    def load_metadata_from_dir(self, data_dir: str, max_num_files: Optional[int]=None) -> None:
        """
        Compute model metadata such as a vocabulary.

        Args:
            data_dir: Directory containing data files.
            max_num_files: Maximal number of files to load.

        Note: This populates the model.metadata dictionary
        """
        data_files = get_data_files_from_directory(data_dir, max_num_files)
        tokens = Counter(t for f in data_files for t, _ in self.load_data_file(f))
        print('Number of unique tokens in dataset: ', len(tokens))

        self.__metadata = {'token_vocab': Vocabulary.create_vocabulary(tokens, max_size=5000)}
Example #10
def large_lambda_1(s: tuple,
                   tokens: np.ndarray,
                   token_lens: np.ndarray,
                   predictions: np.ndarray,
                   probs: np.ndarray,
                   vocab: Vocabulary,
                   file_batch: int,
                   lambdas: Optional[np.ndarray] = None,
                   num_correct_ids: int = 0,
                   num_ids: int = 1):
    state, att_states, att_ids, alpha_states, att_counts, lambda_state = s
    tokens = np.transpose(tokens)
    att_states = np.squeeze(att_states, 2)
    alpha_states = np.squeeze(alpha_states, 2)
    max_atts = alpha_states.max(-1)
    unk_i = vocab.token_to_id[vocab.get_unk()]
    seq_mask = np.transpose(
        np.tile(np.arange(tokens.shape[0]), [tokens.shape[1], 1])) < np.tile(
            token_lens, [tokens.shape[0], 1])
    n_unk = (tokens == unk_i)[seq_mask].sum()
    n_non_pad = seq_mask.sum()
    unk_p = n_unk / n_non_pad
    print('File Batch: ', file_batch)
    print('some tokens: ', tokens[seq_mask][:10])
    if max_atts is not None:
        max_copy_prob = max_atts * lambda_state[:, 1]
        average_lambda = np.ma.array(lambda_state[:, 1],
                                     mask=tokens[-1] == 0).mean()
        print('average copy lambda: ', average_lambda)
        print('max copy lambda: ', lambda_state[:, 1].max())
        print('max copy token prob: ', np.amax(max_copy_prob))
        attention_stats = [average_lambda]
    else:
        attention_stats = []
    print_tokens_and_others((tokens, predictions), (probs, ), vocab)
    print('id acc: ', num_correct_ids / num_ids)
    am = np.unravel_index(np.argmin(predictions), predictions.shape)
    print(predictions[am], tokens[am[0]], tokens[am])
    return [
        np.min(predictions),
        np.mean(predictions), n_unk, n_non_pad, num_correct_ids, num_ids
    ] + attention_stats
Example #11
def build_vocab_from_data_dir(
    data_dir: str, vocab_size: int, max_num_files: Optional[int] = None
) -> Vocabulary:
    """
    Compute model metadata such as a vocabulary.

    Args:
        data_dir: Directory containing data files.
        vocab_size: Maximal size of the vocabulary to create.
        max_num_files: Maximal number of files to load.
    """

    data_files = get_data_files_from_directory(data_dir, max_num_files)

    vocab = Vocabulary(add_unk=True, add_pad=True)
    # Make sure to include the START_SYMBOL in the vocabulary as well:
    vocab.add_or_get_id(START_SYMBOL)
    vocab.add_or_get_id(END_SYMBOL)

    #TODO 3# Insert your vocabulary-building code here

    return vocab
def token_seq_equal(a: List[str], b: List[str]) -> bool:
    unk_tok = Vocabulary.get_unk()
    if unk_tok in a or unk_tok in b:
        return False
    return a == b
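
# Quick illustrative check (not from the original code): sequences containing
# the UNK token are never treated as equal, even when they match verbatim,
# because the out-of-vocabulary tokens they stand for may differ.
def _demo_token_seq_equal() -> None:
    unk = Vocabulary.get_unk()
    assert token_seq_equal(["get", "name"], ["get", "name"])
    assert not token_seq_equal(["get", "name"], ["get", "value"])
    assert not token_seq_equal(["get", unk], ["get", unk])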