Example #1
    def finalise_metadata(name: str, raw_metadata_list: List[Dict[str, Any]],
                          final_metadata: Dict[str, Any],
                          hyperparameters: Dict[str, Any]) -> None:
        label_embedding_style = hyperparameters[
            f'{name}_embedding_style'].lower()

        merged_node_label_counter = Counter()
        for raw_metadata in raw_metadata_list:
            if label_embedding_style == 'token':
                merged_node_label_counter += raw_metadata[f'{name}_counter']
            elif label_embedding_style == 'subtoken':
                merged_node_label_counter += raw_metadata[
                    f'{name}_subtoken_counter']

        def add_special_literals(vocab: Vocabulary) -> None:
            vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
            vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
            vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)

        if label_embedding_style == 'token':
            # Store token, type, and production vocabs:
            final_metadata[f'{name}_vocab'] = \
                Vocabulary.create_vocabulary(
                    merged_node_label_counter,
                    max_size=hyperparameters[f'{name}_vocab_size'])
            add_special_literals(final_metadata[f'{name}_vocab'])
        elif label_embedding_style == 'subtoken':
            final_metadata[f'{name}_subtoken_vocab'] = \
                Vocabulary.create_vocabulary(
                    merged_node_label_counter,
                    max_size=hyperparameters[f'{name}_vocab_size'])
            add_special_literals(final_metadata[f'{name}_subtoken_vocab'])
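
For readers new to the dpu_utils Vocabulary API used throughout these examples, here is a minimal, self-contained sketch of the pattern above (merge per-file counters, build a size-capped vocabulary, then register special tokens). The counter contents and the literal name are illustrative only, not taken from the example.

from collections import Counter
from dpu_utils.mlutils import Vocabulary

# Merge per-file token counters, as finalise_metadata does over raw_metadata_list.
merged_node_label_counter = Counter({'result': 12, 'index': 9, 'node': 3})
merged_node_label_counter += Counter({'result': 4, 'graph': 2})

vocab = Vocabulary.create_vocabulary(
    merged_node_label_counter, max_size=100, count_threshold=1)
vocab.add_or_get_id('__STRING_LITERAL__')  # stand-in for TokenEmbedder.STRING_LITERAL

print(len(vocab), vocab.get_id_or_unk('result'), vocab.get_id_or_unk('never_seen'))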
Example #2
    def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
                 code_threshold, code_embedding_size, code_token_counter,
                 dropout_rate, load_pretrained_embeddings=False):
        """Keeps track of the NL and code vocabularies and embeddings."""
        super(EmbeddingStore, self).__init__()
        edit_keywords = get_edit_keywords()
        self.__nl_vocabulary = Vocabulary.create_vocabulary(
            tokens=edit_keywords,
            max_size=MAX_VOCAB_SIZE,
            count_threshold=1,
            add_pad=True)
        self.__nl_vocabulary.update(nl_token_counter, MAX_VOCAB_SIZE, nl_threshold)
        self.__nl_embedding_layer = nn.Embedding(
            num_embeddings=len(self.__nl_vocabulary),
            embedding_dim=nl_embedding_size,
            padding_idx=self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
        self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

        self.__code_vocabulary = Vocabulary.create_vocabulary(
            tokens=edit_keywords,
            max_size=MAX_VOCAB_SIZE,
            count_threshold=1,
            add_pad=True)
        self.__code_vocabulary.update(code_token_counter, MAX_VOCAB_SIZE, code_threshold)
        self.__code_embedding_layer = nn.Embedding(
            num_embeddings=len(self.__code_vocabulary),
            embedding_dim=code_embedding_size,
            padding_idx=self.__code_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
        self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

        print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
        print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

        if load_pretrained_embeddings:
            self.initialize_embeddings()
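
As a side note, a small sketch of how the padding id obtained from the vocabulary lines up with nn.Embedding's padding_idx, as in the constructor above. The vocabulary contents, embedding size, and variable names below are illustrative assumptions, not the constants used by EmbeddingStore.

import torch
import torch.nn as nn
from dpu_utils.mlutils import Vocabulary

vocab = Vocabulary.create_vocabulary(
    ['foo', 'bar', 'foo'], max_size=100, count_threshold=1, add_pad=True)
pad_id = vocab.get_id_or_unk(Vocabulary.get_pad())
embedding_layer = nn.Embedding(
    num_embeddings=len(vocab), embedding_dim=8, padding_idx=pad_id)

# Padded positions map to the zero-initialised, never-updated padding row.
ids = vocab.get_id_or_unk_multiple(['foo', 'bar'], pad_to_size=5, padding_element=pad_id)
print(embedding_layer(torch.tensor([ids])).shape)  # torch.Size([1, 5, 8])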
Example #3
def _evaluate_f1(best_predictions: List[List[np.ndarray]],
                 best_predictions_probs: List[np.ndarray], vocab: Vocabulary,
                 true_labels: np.ndarray):
    true_labels = clean_target_from_padding(true_labels)
    result_accumulator = PointSuggestionEvaluator()
    unk_id = vocab.get_id_or_unk(vocab.get_unk())

    for x_pred, x_prob, y_target in zip(best_predictions,
                                        best_predictions_probs, true_labels):
        confidences = x_prob.tolist()
        is_exact_prediction = [np.all(pred == y_target) for pred in x_pred]
        precision_recall = [
            token_precision_recall(pred.T, y_target) for pred in x_pred
        ]
        is_unknown_word_predicted = [
            np.all(suggestion == unk_id) for suggestion in x_pred
        ]
        unk_word_accuracy = [
            unk_acc(suggestion.T, y_target, unk_id) for suggestion in x_pred
        ]
        result_accumulator.add_result(confidences, is_exact_prediction,
                                      is_unknown_word_predicted,
                                      precision_recall, unk_word_accuracy)

    return result_accumulator
Example #4
 def get_padded_nl_ids(self, nl_sequence, pad_length):
     return self.__nl_vocabulary.get_id_or_unk_multiple(
         nl_sequence,
         pad_to_size=pad_length,
         padding_element=self.__nl_vocabulary.get_id_or_unk(
             Vocabulary.get_pad()),
     )
Example #5
def get_dataset_from(
        data_dirs: List[RichPath],
        use_func_names: bool = False,
        max_files_per_dir: Optional[int] = None) -> List[Dict[str, Any]]:
    data_files = sorted(
        get_data_files_from_directory(data_dirs, max_files_per_dir))
    data = list(
        chain(*chain(
            list(
                data_pipeline.combined_samples_generator(
                    {data_pipeline.CODE_TOKENS_LABEL: f}))
            for f in data_files)))

    if use_func_names:
        # This task tries to match the function name to the code, by setting the function name as the query
        for sample in data:
            # Replace the query tokens with the function name, broken up into its sub-tokens:
            sample['docstring_tokens'] = split_identifier_into_parts(
                sample['func_name'])

            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            sample['code_tokens'] = [
                Vocabulary.get_unk() if token == sample['func_name'] else token
                for token in sample['code_tokens']
            ]
    return data
Example #6
    def tensorize(self, datapoint: str, return_str_rep: bool = False):
        if self.splitting_kind == "token":
            token_idxs = self.vocabulary.get_id_or_unk(datapoint)
            str_repr = datapoint
        elif self.splitting_kind == "subtoken":
            subtoks = split_identifier_into_parts(datapoint)
            if len(subtoks) == 0:
                subtoks = [Vocabulary.get_unk()]
            token_idxs = self.vocabulary.get_id_or_unk_multiple(subtoks)
            if return_str_rep:
                str_repr = subtoks  # avoid an unbound str_repr when a string form is requested
        elif self.splitting_kind == "bpe":
            if len(datapoint) == 0:
                datapoint = "<empty>"
            token_idxs = self.vocabulary.get_id_or_unk_for_text(datapoint)
            if return_str_rep:  # Only compute the string representation when requested, for efficiency
                str_repr = self.vocabulary.tokenize(datapoint)
        elif self.splitting_kind == "char":
            token_idxs = self.vocabulary.tensorize_str(datapoint)
            if return_str_rep:
                str_repr = datapoint[:self.vocabulary.max_char_length]
        else:
            raise ValueError(
                f'Unrecognized token splitting method "{self.splitting_kind}".'
            )

        if return_str_rep:
            return token_idxs, str_repr
        return token_idxs
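
To make the "subtoken" branch above concrete, a short sketch assuming dpu_utils' split_identifier_into_parts and a toy vocabulary; the identifier and the vocabulary contents are made up.

from dpu_utils.codeutils import split_identifier_into_parts
from dpu_utils.mlutils import Vocabulary

vocab = Vocabulary.create_vocabulary(
    ['get', 'user', 'name', 'id'], max_size=100, count_threshold=1)

subtoks = split_identifier_into_parts('get_user_name')  # splits snake_case and camelCase identifiers
token_idxs = vocab.get_id_or_unk_multiple(subtoks)
print(subtoks, token_idxs)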
Example #7
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        merged_token_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        return final_metadata
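
For the BPE branch, a rough usage sketch that reuses only the calls visible in these examples (fit on a merged counter, then transform to fixed-length id sequences). The import path of BpeVocabulary, the counter contents, and the hyperparameter values are assumptions and vary between the underlying projects.

from collections import Counter
# Import path as in CodeSearchNet; adjust to wherever BpeVocabulary lives in your project.
from utils.bpevocabulary import BpeVocabulary

merged_token_counter = Counter({'return': 50, 'self': 40, 'tokenize': 6, 'vocabulary': 3})

token_vocabulary = BpeVocabulary(vocab_size=1000, pct_bpe=0.5)
token_vocabulary.fit(merged_token_counter)

# transform() yields fixed-length id sequences, as used by convert_and_pad_token_sequence below.
ids = list(token_vocabulary.transform([['return', 'self', 'tokenize']], fixed_length=8))[0]
print(ids)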
Example #8
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        print("Finalising metadata")
        final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
        merged_token_counter = collections.Counter()
        merged_edge_types = set()
        token_counts = []
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']
            merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
            # token_counts.extend(raw_metadata['nodes_by_tokens'])

        if hyperparameters[f'{encoder_label}_token_use_bpe']:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe']
            )
            token_vocabulary.fit(merged_token_counter)
            print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
            print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
            print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

        final_metadata['token_vocab'] = token_vocabulary
        final_metadata['edge_type_mapping'] = {edge_type: i for i, edge_type in enumerate(merged_edge_types)}
        print('Edge type mapping:', final_metadata['edge_type_mapping'])
        # print("Percentiles:")
        # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
        #     print(p, np.percentile(token_counts, p))
        return final_metadata
Example #9
    def load_data_from_sample(cls,
                              encoder_label: str,
                              hyperparameters: Dict[str, Any],
                              metadata: Dict[str, Any],
                              data_to_load: Any,
                              function_name: Optional[str],
                              result_holder: Dict[str, Any],
                              is_test: bool = True) -> bool:
        """
        Saves two versions of both the code and the query: one using the docstring as the query and the other using the
        function-name as the query, and replacing the function name in the code with an out-of-vocab token.
        Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
        """
        # Save the two versions of the code and query:
        data_holder = {
            QueryType.DOCSTRING.value: data_to_load,
            QueryType.FUNCTION_NAME.value: None
        }
        # Skip samples where the function name is very short, because it probably has too little information
        # to be a good search query.
        if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
                len(function_name) >= hyperparameters['min_len_func_name_for_query']:
            if encoder_label == 'query':
                # Set the query tokens to the function name, broken up into its sub-tokens:
                data_holder[QueryType.FUNCTION_NAME.
                            value] = split_identifier_into_parts(function_name)
            elif encoder_label == 'code':
                # In the code, replace the function name with the out-of-vocab token everywhere it appears:
                data_holder[QueryType.FUNCTION_NAME.value] = [
                    Vocabulary.get_unk() if token == function_name else token
                    for token in data_to_load
                ]

        # Sub-tokenize, convert, and pad both versions:
        for key, data in data_holder.items():
            if not data:
                result_holder[f'{encoder_label}_tokens_{key}'] = None
                result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
                result_holder[f'{encoder_label}_tokens_length_{key}'] = None
                continue
            if hyperparameters[f'{encoder_label}_use_subtokens']:
                data = cls._to_subtoken_stream(
                    data,
                    mark_subtoken_end=hyperparameters[
                        f'{encoder_label}_mark_subtoken_end'])
            tokens, tokens_mask = \
                convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
                                               hyperparameters[f'{encoder_label}_max_num_tokens'])
            # Note that we share the result_holder with different encoders, and so we need to make our identifiers
            # unique-ish
            result_holder[f'{encoder_label}_tokens_{key}'] = tokens
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
            result_holder[f'{encoder_label}_tokens_length_{key}'] = int(
                np.sum(tokens_mask))

        if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
                int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
            return False

        return True
Example #10
 def get_vocab_extended_nl_token(self, token_id, inp_ids, inp_tokens):
     if token_id < len(self.__nl_vocabulary):
         return self.get_nl_token(token_id)
     elif token_id in inp_ids:
         copy_idx = inp_ids.index(token_id)
         return inp_tokens[copy_idx]
     else:
         return Vocabulary.get_unk()
Example #11
 def pad_length(self, sequence, target_length):
     if len(sequence) >= target_length:
         return sequence[:target_length]
     else:
         return sequence + [
             self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
             for _ in range(target_length - len(sequence))
         ]
Example #12
def __create_voc_from_tokens(all_sub_tokens):

    vocabulary = Vocabulary.create_vocabulary(all_sub_tokens,
                                              max_size=100000,
                                              count_threshold=1,
                                              add_unk=True,
                                              add_pad=True)

    return vocabulary
Example #13
 def finalize_metadata(self) -> None:
     self.__token_counter[self.START] = 1000000
     self.__token_counter[self.END] = 1000000
     self.__output_vocabulary = Vocabulary.create_vocabulary(
         self.__token_counter,
         max_size=self.vocabulary_max_size,
         count_threshold=self.vocabulary_count_threshold,
     )
     self.LOGGER.info("Output vocabulary Size %s",
                      len(self.__output_vocabulary))
     del self.__token_counter
Example #14
    def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                           final_metadata: Dict[str, Any]):
        # Merge counters
        merged_type_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_type_counter.update(raw_metadata["type_occurences_counter"])

        final_metadata['annotation_vocab'] = Vocabulary.create_vocabulary(
            merged_type_counter,
            max_size=self.__model.
            hyperparameters['max_type_annotation_vocab_size'])
        return final_metadata
Example #15
    def greedy_decode(self, initial_state, encoder_hidden_states, masks,
                      max_out_len, batch_data, device):
        """Greedily generates the output sequence."""
        # Derived from https://github.com/budzianowski/PyTorch-Beam-Search-Decoding/blob/9f6b66f43d2e05175dabcc024f79e1d37a667070/decode_beam.py#L163
        batch_size = initial_state.shape[0]
        decoder_state = initial_state
        decoder_input = torch.tensor(
            [[self.embedding_store.get_nl_id(START)]] * batch_size,
            device=device)

        decoded_batch = np.zeros([batch_size, max_out_len], dtype=np.int64)
        decoded_batch_scores = np.zeros([batch_size, max_out_len])

        for i in range(max_out_len):
            decoder_input_embeddings = self.embedding_store.get_nl_embeddings(
                decoder_input)
            decoder_attention_states, decoder_state, generation_logprobs, copy_logprobs = self.decode(
                decoder_state, decoder_input_embeddings, encoder_hidden_states,
                masks)

            generation_logprobs = generation_logprobs.squeeze(1)
            copy_logprobs = copy_logprobs.squeeze(1)

            prob_scores = torch.zeros([
                generation_logprobs.shape[0],
                generation_logprobs.shape[-1] + copy_logprobs.shape[-1]
            ],
                                      dtype=torch.float32,
                                      device=device)
            prob_scores[:, :generation_logprobs.shape[-1]] = torch.exp(
                generation_logprobs)
            for b in range(generation_logprobs.shape[0]):
                for c, inp_id in enumerate(batch_data.input_ids[b]):
                    prob_scores[b,
                                inp_id] = prob_scores[b, inp_id] + torch.exp(
                                    copy_logprobs[b, c])

            predicted_ids = torch.argmax(prob_scores, dim=-1)
            decoded_batch_scores[:, i] = prob_scores[
                torch.arange(prob_scores.shape[0]), predicted_ids]
            decoded_batch[:, i] = predicted_ids

            unks = torch.ones(predicted_ids.shape[0],
                              dtype=torch.int64,
                              device=device) * self.embedding_store.get_nl_id(
                                  Vocabulary.get_unk())
            decoder_input = torch.where(
                predicted_ids < len(self.embedding_store.nl_vocabulary),
                predicted_ids, unks).unsqueeze(1)
            decoder_state = decoder_state.squeeze(0)

        return decoded_batch, decoded_batch_scores
Example #16
def evaluate_f1(model: keras.Model,
                vocab: Vocabulary,
                input_method_body_subtokens: np.ndarray,
                target_method_names: np.ndarray,
                hyperparameters: Dict[str, any],
                visualise_prediction=True):
    padding_id = vocab.get_id_or_unk(vocab.get_pad())
    begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN)
    end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN)

    if input_method_body_subtokens.ndim != 3:
        # Model prediction expects 3 dimensions; a single input lacks the batch dimension, so add it manually.
        input_method_body_subtokens = np.expand_dims(
            input_method_body_subtokens, 0)

    predictions = model.predict(input_method_body_subtokens, batch_size=1)

    best_predictions, best_predictions_probs = beam_search(
        predictions,
        padding_id,
        begin_of_sentence_id,
        end_of_sentence_id,
        hyperparameters['beam_width'],
        hyperparameters['beam_top_paths'],
    )
    f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs,
                                 vocab, target_method_names)
    if visualise_prediction:
        max_results = 10
        visualised_input = visualise_beam_predictions_to_targets(
            vocab, best_predictions[:max_results],
            best_predictions_probs[:max_results],
            input_method_body_subtokens[:max_results],
            target_method_names[:max_results])

        # return best_predictions, best_predictions_probs
        return f1_evaluation, visualised_input
    return f1_evaluation
Example #17
    def finalise_metadata(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]],
                          language=None) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        merged_token_counter = Counter()

        print(encoder_label, language)
        if encoder_label == 'query':
            final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
        else:
            assert encoder_label == 'code' and language
            final_metadata_path = '_'.join(
                [encoder_label, language, 'final_metadata'])

        if os.path.isfile(final_metadata_path):
            with open(final_metadata_path, 'rb') as final_metadata_file:
                final_metadata = pickle.load(final_metadata_file)
        else:

            for raw_metadata in raw_metadata_list:
                merged_token_counter += raw_metadata['token_counter']

            if hyperparameters['%s_use_bpe' % encoder_label]:
                token_vocabulary = BpeVocabulary(
                    vocab_size=hyperparameters['%s_token_vocab_size' %
                                               encoder_label],
                    pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
                token_vocabulary.fit(merged_token_counter)
            else:
                token_vocabulary = Vocabulary.create_vocabulary(
                    tokens=merged_token_counter,
                    max_size=hyperparameters['%s_token_vocab_size' %
                                             encoder_label],
                    count_threshold=hyperparameters[
                        '%s_token_vocab_count_threshold' % encoder_label])

            final_metadata['token_vocab'] = token_vocabulary
            # Save the most common tokens for use in data augmentation:
            final_metadata['common_tokens'] = merged_token_counter.most_common(
                50)

            with open(final_metadata_path, 'wb') as final_metadata_file:
                pickle.dump(final_metadata, final_metadata_file)

        return final_metadata
Example #18
    def _finalise_metadata(
            self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super()._finalise_metadata(raw_metadata_list)

        TokenEmbedder.finalise_metadata('leaf_label', raw_metadata_list,
                                        final_metadata, self.hyperparameters)

        # First, merge all needed information:
        merged_non_terminals = set()
        for raw_metadata in raw_metadata_list:
            merged_non_terminals.update(raw_metadata["path_elements"])

        final_metadata['non_terminal_dict'] = Vocabulary.create_vocabulary(
            merged_non_terminals, max_size=10000, count_threshold=0)
        return final_metadata
Example #19
 def get_extended_padded_nl_ids(self, nl_sequence, pad_length, inp_ids, inp_tokens):
     # Derived from: https://github.com/microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py
     nl_ids = []
     for token in nl_sequence:
         nl_id = self.get_nl_id(token)
         if self.is_nl_unk(nl_id) and token in inp_tokens:
             copy_idx = inp_tokens.index(token)
             nl_id = inp_ids[copy_idx]
         nl_ids.append(nl_id)
     
     if len(nl_ids) > pad_length:
         return nl_ids[:pad_length]
     else:
         padding = [self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())] * (pad_length - len(nl_ids))
         return nl_ids + padding
Example #20
def convert_and_pad_token_sequence(token_vocab: Union[Vocabulary, BpeVocabulary],
                                   token_sequence: List[str],
                                   output_tensor_size: int,
                                   pad_from_left: bool = False) \
        -> Tuple[np.ndarray, np.ndarray]:
    """
    Tensorise a token sequence with padding, returning a mask for the used elements as well.

    Args:
        token_vocab: Vocabulary or BPE encoder to use. We assume that token_vocab[0] is the padding symbol.
        token_sequence: List of tokens in string form
        output_tensor_size: Size of the resulting tensor (i.e., the length up to which we pad / down to which we truncate).
        pad_from_left: Indicate if we are padding/truncating on the left side of the string. [Default: False]

    Returns:
        Pair of numpy arrays. First is the actual tensorised token sequence, the second is a masking tensor
        that is 1.0 for those token indices that are actually used.
    """
    print("token_vocab type is ", type(token_vocab))
    if isinstance(token_vocab, BpeVocabulary):
        print("token_vocab is type of BpeVocabulary")
        token_ids = np.array(list(token_vocab.transform([token_sequence], fixed_length=output_tensor_size))[0])
        token_mask = np.array([1 if token_ids[i] > 0 else 0 for i in range(len(token_ids))])
        print("token ids ", token_ids.shape())
        return token_ids, token_mask

    if pad_from_left:
        token_sequence = token_sequence[-output_tensor_size:]
    else:
        token_sequence = token_sequence[:output_tensor_size]

    sequence_length = len(token_sequence)
    if pad_from_left:
        start_idx = output_tensor_size - sequence_length
    else:
        start_idx = 0

    print("token_vocab type ", type(token_vocab))
    token_vocab = Vocabulary(token_vocab)
    print("token_vocab type  ", type(token_vocab))
    token_ids = np.zeros(output_tensor_size, dtype=np.int32)
    token_mask = np.zeros(output_tensor_size, dtype=np.float32)
    for i, token in enumerate(token_sequence, start=start_idx):
        token_ids[i] = token_vocab.get_id_or_unk(token)
        token_mask[i] = True

    return token_ids, token_mask
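
A brief usage sketch of the non-BPE path of convert_and_pad_token_sequence above, assuming the dpu_utils Vocabulary API; the vocabulary and token sequence are illustrative.

from dpu_utils.mlutils import Vocabulary

vocab = Vocabulary.create_vocabulary(
    ['def', 'return', 'self', 'x'], max_size=100, count_threshold=1, add_pad=True)

token_ids, token_mask = convert_and_pad_token_sequence(
    vocab, ['def', 'x', 'return'], output_tensor_size=6)
# The first three positions hold real token ids with mask 1.0; the remaining
# positions keep their initial zeros and a 0.0 mask.
print(token_ids, token_mask)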
Example #21
 def finalise_metadata(
         cls, encoder_label: str, hyperparameters: Dict[str, Any],
         raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     final_metadata = super().finalise_metadata(encoder_label,
                                                hyperparameters,
                                                raw_metadata_list)
     merged_type_counter = collections.Counter()
     for raw_metadata in raw_metadata_list:
         merged_type_counter += raw_metadata['type_counter']
     type_vocabulary = Vocabulary.create_vocabulary(
         tokens=merged_type_counter,
         max_size=hyperparameters[f'{encoder_label}_type_vocab_size'],
         count_threshold=hyperparameters[
             f'{encoder_label}_type_vocab_count_threshold'])
     final_metadata['type_vocab'] = type_vocabulary
     print('Total type vocabulary words:',
           len(final_metadata['type_vocab'].id_to_token))
     return final_metadata
Example #22
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        # JGD ****** leaf_nodes start ******
        merged_identifier_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_identifier_counter += raw_metadata['identifier_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            identifier_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            identifier_vocabulary.fit(merged_identifier_counter)
        else:
            identifier_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_identifier_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['identifier_vocab'] = identifier_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata[
            'common_identifiers'] = merged_identifier_counter.most_common(50)
        # JGD ****** leaf_nodes end ******
        # JGD ****** tree_paths start ******
        # merged_context_filenames = list()
        # merged_terminal_counter = Counter()
        # merged_nonterminal_counter = Counter()
        # for raw_metadata in raw_metadata_list:
        #     merged_context_filenames.extend(raw_metadata['context_filenames'])
        #     merged_terminal_counter += raw_metadata['terminal_counter']
        #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
        #
        # final_metadata['context_filenames'] = merged_context_filenames
        # final_metadata['terminal_counter'] = merged_terminal_counter
        # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
        # JGD ****** tree_paths end ******
        return final_metadata
Example #23
    def finalize_metadata(self) -> None:
        if self.splitting_kind in {"token", "subtoken"}:
            self.__vocabulary = Vocabulary.create_vocabulary(
                self.__tok_counter,
                max_size=self.max_vocabulary_size,
                count_threshold=self.min_freq_threshold,
                add_pad=True)
        elif self.splitting_kind == "bpe":
            self.__vocabulary = BpeVocabulary(self.max_vocabulary_size,
                                              unk_token=UNK_TOKEN,
                                              pad_token=PAD_TOKEN,
                                              eos_token=EOS_TOKEN,
                                              bos_token=INIT_TOKEN)
            self.__vocabulary.create_vocabulary(self.__tok_counter)
        else:
            raise ValueError(
                f'Unrecognized token splitting method "{self.splitting_kind}"')

        del self.__tok_counter
Example #24
    def finalize_metadata(self) -> None:
        if self.splitting_kind in {"token", "subtoken"}:
            self.__vocabulary = Vocabulary.create_vocabulary(
                self.__tok_counter,
                max_size=self.max_vocabulary_size,
                count_threshold=self.min_freq_threshold,
            )
        elif self.splitting_kind == "bpe":
            self.__vocabulary = BpeVocabulary(self.max_vocabulary_size)
            self.__vocabulary.create_vocabulary(self.__tok_counter)
        elif self.splitting_kind == "char":
            self.__vocabulary = CharTensorizer(
                max_num_chars=self.max_num_chars,
                lower_case_all=False,
                include_space=False)
        else:
            raise ValueError(
                f'Unrecognized token splitting method "{self.splitting_kind}"')

        del self.__tok_counter
Example #25
 def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                       raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
     merged_token_counter = collections.Counter()
     for raw_metadata in raw_metadata_list:
         merged_token_counter += raw_metadata['token_counter']
     if hyperparameters[f'{encoder_label}_token_use_bpe']:
         token_vocabulary = BpeVocabulary(
             vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
             pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
         token_vocabulary.fit(merged_token_counter)
         print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
         print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
     else:
         token_vocabulary = Vocabulary.create_vocabulary(
             tokens=merged_token_counter,
             max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
             count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
         print('Total token vocabulary words:', len(token_vocabulary.id_to_token))
     final_metadata['token_vocab'] = token_vocabulary
     return final_metadata
Example #26
    def load_vocabulary(self) -> Vocabulary:
        """ Return model vocabulary such as a vocabulary. """
        max_size = self.config['vocabulary_max_size']
        count_threshold = self.config['vocabulary_count_threshold']
        # Count occurrences of body and name tokens to build the vocabulary
        tokens_counter = Counter()

        for method_token in self.corpus_methods_token:
            for (name, body) in method_token:
                tokens_counter.update(body)
                tokens_counter.update(name)

        token_vocab = Vocabulary.create_vocabulary(
            tokens_counter,
            count_threshold=count_threshold,
            max_size=max_size,
            add_unk=True,
            add_pad=True)

        self.logger.info('{} Vocabulary created'.format(len(token_vocab)))
        return token_vocab
Example #27
    def __init__(self, train_dir, valid_dir, max_seq_length, max_vocab_size):

        # Dictionary which stores raw training data
        self.train_data = {
            METHOD_NAMES: load_data_file(train_dir + METHOD_NAME_FILE_NAME),
            METHOD_APIS: load_data_file(train_dir + METHOD_API_FILE_NAME),
            METHOD_TOKENS: load_data_file(train_dir + METHOD_TOKENS_FILE_NAME),
            JAVADOC: load_data_file(train_dir + JAVADOC_FILE_NAME)
        }

        # Dictionary which stores raw validation data
        self.valid_data = {
            METHOD_NAMES: load_data_file(valid_dir + METHOD_NAME_FILE_NAME),
            METHOD_APIS: load_data_file(valid_dir + METHOD_API_FILE_NAME),
            METHOD_TOKENS: load_data_file(valid_dir + METHOD_TOKENS_FILE_NAME),
            JAVADOC: load_data_file(valid_dir + JAVADOC_FILE_NAME)
        }

        # Token lists are flattened to prepare for vocabulary creation
        methods_list = [
            self.train_data[METHOD_NAMES], self.train_data[METHOD_APIS],
            self.train_data[METHOD_TOKENS]
        ]
        javadoc_list = [self.train_data[JAVADOC]]
        all_tokens = flatten(methods_list + javadoc_list)

        self.vocabulary = Vocabulary.create_vocabulary(all_tokens,
                                                       max_vocab_size,
                                                       count_threshold=1,
                                                       add_pad=True)

        self.max_seq_length = max_seq_length
        self.max_vocab_size = max_vocab_size

        # Create Training and Validation tensors
        self.train_tensors = self._tensorize_data(self.train_data)
        self.valid_tensors = self._tensorize_data(self.valid_data)
Example #28
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        hypers = cls.get_default_hyperparameters()
        resource = hypers['resource']
        vocabulary_path = f'resources/embeddings/{resource}/token_to_index.pickle'
        with open(vocabulary_path, 'rb') as fin:
            token_to_index = pickle.load(fin)
        # Synthetic counts, so that the ordering in the internal vocabulary matches the indices in the dict.
        token_to_count = {}
        for token, index in token_to_index.items():
            token_to_count[token] = len(token_to_index) - index
        token_counter = Counter(token_to_count)
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=token_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=0)
        print('token_to_index', token_to_index)
        print('token_vocabulary.id_to_token', token_vocabulary.id_to_token)

        final_metadata = {}
        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = token_counter.most_common(50)
        return final_metadata
Example #29
 def add_special_literals(vocab: Vocabulary) -> None:
     vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
     vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
     vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)
Example #30
 def unk_token(self) -> str:
     return Vocabulary.get_unk()