def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
                 code_threshold, code_embedding_size, code_token_counter,
                 dropout_rate, load_pretrained_embeddings=False):
        """Keeps track of the NL and code vocabularies and embeddings."""
        super(EmbeddingStore, self).__init__()
        edit_keywords = get_edit_keywords()
        self.__nl_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                         max_size=MAX_VOCAB_SIZE,
                                                         count_threshold=1,
                                                         add_pad=True)
        self.__nl_vocabulary.update(nl_token_counter, MAX_VOCAB_SIZE, nl_threshold)
        self.__nl_embedding_layer = nn.Embedding(num_embeddings=len(self.__nl_vocabulary),
                                        embedding_dim=nl_embedding_size,
                                        padding_idx=self.__nl_vocabulary.get_id_or_unk(
                                            Vocabulary.get_pad()))
        self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)
        

        self.__code_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                    max_size=MAX_VOCAB_SIZE,
                                                    count_threshold=1,
                                                    add_pad=True)
        self.__code_vocabulary.update(code_token_counter, MAX_VOCAB_SIZE, code_threshold)
        self.__code_embedding_layer = nn.Embedding(num_embeddings=len(self.__code_vocabulary),
                        embedding_dim=code_embedding_size,
                        padding_idx=self.__code_vocabulary.get_id_or_unk(
                        Vocabulary.get_pad()))
        self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

        print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
        print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

        if load_pretrained_embeddings:
            self.initialize_embeddings()
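
A minimal usage sketch (not from the original class): it shows how an embedding layer plus dropout of the kind built above is typically applied to a batch of token ids; the sizes and tensors below are illustrative placeholders.

import torch
import torch.nn as nn

# Illustrative sizes only, mirroring the layers constructed in the example above.
vocab_size, embedding_dim = 1000, 64
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
dropout_layer = nn.Dropout(p=0.1)

token_ids = torch.randint(0, vocab_size, (8, 32))      # (batch_size, sequence_length)
embedded = dropout_layer(embedding_layer(token_ids))   # (batch_size, sequence_length, embedding_dim)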
Example No. 2
    def finalise_metadata(name: str, raw_metadata_list: List[Dict[str, Any]],
                          final_metadata: Dict[str, Any],
                          hyperparameters: Dict[str, Any]) -> None:
        label_embedding_style = hyperparameters[
            f'{name}_embedding_style'].lower()

        merged_node_label_counter = Counter()
        for raw_metadata in raw_metadata_list:
            if label_embedding_style == 'token':
                merged_node_label_counter += raw_metadata[f'{name}_counter']
            elif label_embedding_style == 'subtoken':
                merged_node_label_counter += raw_metadata[
                    f'{name}_subtoken_counter']

        def add_special_literals(vocab: Vocabulary) -> None:
            vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
            vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
            vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)

        if label_embedding_style == 'token':
            # Store token, type, and production vocabs:
            final_metadata[f'{name}_vocab'] = \
                Vocabulary.create_vocabulary(
                    merged_node_label_counter,
                    max_size=hyperparameters[f'{name}_vocab_size'])
            add_special_literals(final_metadata[f'{name}_vocab'])
        elif label_embedding_style == 'subtoken':
            final_metadata[f'{name}_subtoken_vocab'] = \
                Vocabulary.create_vocabulary(
                    merged_node_label_counter,
                    max_size=hyperparameters[f'{name}_vocab_size'])
            add_special_literals(final_metadata[f'{name}_subtoken_vocab'])
Example No. 3
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        merged_token_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        return final_metadata
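
A sketch of the hyperparameters this branch reads; the key names are taken directly from the lookups in the example above, while the values are placeholders rather than recommended settings.

# Placeholder values only; the keys follow the '%s_...' lookups in the example above.
encoder_label = 'code'
hyperparameters = {
    '%s_use_bpe' % encoder_label: True,                     # switch between BpeVocabulary and Vocabulary
    '%s_token_vocab_size' % encoder_label: 10000,
    '%s_pct_bpe' % encoder_label: 0.5,                      # only used on the BPE branch
    '%s_token_vocab_count_threshold' % encoder_label: 10,   # only used on the plain-vocabulary branch
}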
Example No. 4
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        print("Finalising metadata")
        final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
        merged_token_counter = collections.Counter()
        merged_edge_types = set()
        token_counts = []
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']
            merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
            # token_counts.extend(raw_metadata['nodes_by_tokens'])

        if hyperparameters[f'{encoder_label}_token_use_bpe']:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe']
            )
            token_vocabulary.fit(merged_token_counter)
            print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
            print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
            print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

        final_metadata['token_vocab'] = token_vocabulary
        final_metadata['edge_type_mapping'] = {edge_type: i for i, edge_type in enumerate(merged_edge_types)}
        print('Edge type mapping:', final_metadata['edge_type_mapping'])
        # print("Percentiles:")
        # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
        #     print(p, np.percentile(token_counts, p))
        return final_metadata
Example No. 5
def __create_voc_from_tokens(all_sub_tokens):

    vocabulary = Vocabulary.create_vocabulary(all_sub_tokens,
                                              max_size=100000,
                                              count_threshold=1,
                                              add_unk=True,
                                              add_pad=True)

    return vocabulary
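
A minimal usage sketch with made-up sub-tokens; `get_id_or_unk` is the accessor already used by the other examples on this page.

# Made-up sub-tokens; in practice these come from splitting identifiers.
all_sub_tokens = ['get', 'value', 'set', 'value', 'to', 'string']
vocabulary = __create_voc_from_tokens(all_sub_tokens)
print(len(vocabulary), vocabulary.get_id_or_unk('value'))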
Example No. 6
 def finalize_metadata(self) -> None:
     self.__token_counter[self.START] = 1000000
     self.__token_counter[self.END] = 1000000
     self.__output_vocabulary = Vocabulary.create_vocabulary(
         self.__token_counter,
         max_size=self.vocabulary_max_size,
         count_threshold=self.vocabulary_count_threshold,
     )
     self.LOGGER.info("Output vocabulary Size %s",
                      len(self.__output_vocabulary))
     del self.__token_counter
Example No. 7
    def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                           final_metadata: Dict[str, Any]):
        # Merge counters
        merged_type_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_type_counter.update(raw_metadata["type_occurences_counter"])

        final_metadata['annotation_vocab'] = Vocabulary.create_vocabulary(
            merged_type_counter,
            max_size=self.__model.hyperparameters['max_type_annotation_vocab_size'])
        return final_metadata
Example No. 8
    def finalise_metadata(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]],
                          language=None) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        merged_token_counter = Counter()

        print(encoder_label, language)
        if encoder_label == 'query':
            final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
        else:
            assert encoder_label == 'code' and language
            final_metadata_path = '_'.join(
                [encoder_label, language, 'final_metadata'])

        if os.path.isfile(final_metadata_path):
            with open(final_metadata_path, 'rb') as final_metadata_file:
                final_metadata = pickle.load(final_metadata_file)
        else:

            for raw_metadata in raw_metadata_list:
                merged_token_counter += raw_metadata['token_counter']

            if hyperparameters['%s_use_bpe' % encoder_label]:
                token_vocabulary = BpeVocabulary(
                    vocab_size=hyperparameters['%s_token_vocab_size' %
                                               encoder_label],
                    pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
                token_vocabulary.fit(merged_token_counter)
            else:
                token_vocabulary = Vocabulary.create_vocabulary(
                    tokens=merged_token_counter,
                    max_size=hyperparameters['%s_token_vocab_size' %
                                             encoder_label],
                    count_threshold=hyperparameters[
                        '%s_token_vocab_count_threshold' % encoder_label])

            final_metadata['token_vocab'] = token_vocabulary
            # Save the most common tokens for use in data augmentation:
            final_metadata['common_tokens'] = merged_token_counter.most_common(
                50)

            with open(final_metadata_path, 'wb') as final_metadata_file:
                pickle.dump(final_metadata, final_metadata_file)

        return final_metadata
Example No. 9
    def _finalise_metadata(
            self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super()._finalise_metadata(raw_metadata_list)

        TokenEmbedder.finalise_metadata('leaf_label', raw_metadata_list,
                                        final_metadata, self.hyperparameters)

        # First, merge all needed information:
        merged_non_terminals = set()
        for raw_metadata in raw_metadata_list:
            merged_non_terminals.update(raw_metadata["path_elements"])

        final_metadata['non_terminal_dict'] = Vocabulary.create_vocabulary(
            merged_non_terminals, max_size=10000, count_threshold=0)
        return final_metadata
Example No. 10
 def finalise_metadata(
         cls, encoder_label: str, hyperparameters: Dict[str, Any],
         raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     final_metadata = super().finalise_metadata(encoder_label,
                                                hyperparameters,
                                                raw_metadata_list)
     merged_type_counter = collections.Counter()
     for raw_metadata in raw_metadata_list:
         merged_type_counter += raw_metadata['type_counter']
     type_vocabulary = Vocabulary.create_vocabulary(
         tokens=merged_type_counter,
         max_size=hyperparameters[f'{encoder_label}_type_vocab_size'],
         count_threshold=hyperparameters[
             f'{encoder_label}_type_vocab_count_threshold'])
     final_metadata['type_vocab'] = type_vocabulary
     print('Total type vocabulary words:',
           len(final_metadata['type_vocab'].id_to_token))
     return final_metadata
Example No. 11
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        # JGD ****** leaf_nodes start ******
        merged_identifier_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_identifier_counter += raw_metadata['identifier_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            identifier_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            identifier_vocabulary.fit(merged_identifier_counter)
        else:
            identifier_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_identifier_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['identifier_vocab'] = identifier_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_identifiers'] = merged_identifier_counter.most_common(50)
        # JGD ****** leaf_nodes end ******
        # JGD ****** tree_paths start ******
        # merged_context_filenames = list()
        # merged_terminal_counter = Counter()
        # merged_nonterminal_counter = Counter()
        # for raw_metadata in raw_metadata_list:
        #     merged_context_filenames.extend(raw_metadata['context_filenames'])
        #     merged_terminal_counter += raw_metadata['terminal_counter']
        #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
        #
        # final_metadata['context_filenames'] = merged_context_filenames
        # final_metadata['terminal_counter'] = merged_terminal_counter
        # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
        # JGD ****** tree_paths end ******
        return final_metadata
Example No. 12
    def finalize_metadata(self) -> None:
        if self.splitting_kind in {"token", "subtoken"}:
            self.__vocabulary = Vocabulary.create_vocabulary(
                self.__tok_counter,
                max_size=self.max_vocabulary_size,
                count_threshold=self.min_freq_threshold,
                add_pad=True)
        elif self.splitting_kind == "bpe":
            self.__vocabulary = BpeVocabulary(self.max_vocabulary_size,
                                              unk_token=UNK_TOKEN,
                                              pad_token=PAD_TOKEN,
                                              eos_token=EOS_TOKEN,
                                              bos_token=INIT_TOKEN)
            self.__vocabulary.create_vocabulary(self.__tok_counter)
        else:
            raise ValueError(
                f'Unrecognized token splitting method "{self.splitting_kind}"')

        del self.__tok_counter
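
A sketch of the state the method above assumes was prepared earlier, written as if it sat in the same class; the attribute names come from the method itself, while the default values and the Counter import are illustrative assumptions.

    def __init__(self, splitting_kind: str = "subtoken",
                 max_vocabulary_size: int = 25000,
                 min_freq_threshold: int = 5) -> None:
        # Illustrative defaults only; the real hyperparameters are configured elsewhere.
        from collections import Counter  # assumed import
        self.splitting_kind = splitting_kind
        self.max_vocabulary_size = max_vocabulary_size
        self.min_freq_threshold = min_freq_threshold
        self.__tok_counter = Counter()  # token counts collected while scanning the training data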
Example No. 13
    def finalize_metadata(self) -> None:
        if self.splitting_kind in {"token", "subtoken"}:
            self.__vocabulary = Vocabulary.create_vocabulary(
                self.__tok_counter,
                max_size=self.max_vocabulary_size,
                count_threshold=self.min_freq_threshold,
            )
        elif self.splitting_kind == "bpe":
            self.__vocabulary = BpeVocabulary(self.max_vocabulary_size)
            self.__vocabulary.create_vocabulary(self.__tok_counter)
        elif self.splitting_kind == "char":
            self.__vocabulary = CharTensorizer(
                max_num_chars=self.max_num_chars,
                lower_case_all=False,
                include_space=False)
        else:
            raise ValueError(
                f'Unrecognized token splitting method "{self.splitting_kind}"')

        del self.__tok_counter
Example No. 14
 def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                       raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
     merged_token_counter = collections.Counter()
     for raw_metadata in raw_metadata_list:
         merged_token_counter += raw_metadata['token_counter']
     if hyperparameters[f'{encoder_label}_token_use_bpe']:
         token_vocabulary = BpeVocabulary(
             vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
             pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
         token_vocabulary.fit(merged_token_counter)
         print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
         print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
     else:
         token_vocabulary = Vocabulary.create_vocabulary(
             tokens=merged_token_counter,
             max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
             count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
         print('Total token vocabulary words:', len(token_vocabulary.id_to_token))
     final_metadata['token_vocab'] = token_vocabulary
     return final_metadata
Example No. 15
    def load_vocabulary(self) -> Vocabulary:
        """ Return model vocabulary such as a vocabulary. """
        max_size = self.config['vocabulary_max_size']
        count_threshold = self.config['vocabulary_count_threshold']
        # Count occurrences of the body vocabulary
        tokens_counter = Counter()

        for method_token in self.corpus_methods_token:
            for (name, body) in method_token:
                tokens_counter.update(body)
                tokens_counter.update(name)

        token_vocab = Vocabulary.create_vocabulary(
            tokens_counter,
            count_threshold=count_threshold,
            max_size=max_size,
            add_unk=True,
            add_pad=True)

        self.logger.info('Vocabulary created with {} tokens'.format(len(token_vocab)))
        return token_vocab
Example No. 16
    def __init__(self, train_dir, valid_dir, max_seq_length, max_vocab_size):

        # Dictionary which stores raw training data
        self.train_data = {
            METHOD_NAMES: load_data_file(train_dir + METHOD_NAME_FILE_NAME),
            METHOD_APIS: load_data_file(train_dir + METHOD_API_FILE_NAME),
            METHOD_TOKENS: load_data_file(train_dir + METHOD_TOKENS_FILE_NAME),
            JAVADOC: load_data_file(train_dir + JAVADOC_FILE_NAME)
        }

        # Dictionary which stores raw validation data
        self.valid_data = {
            METHOD_NAMES: load_data_file(valid_dir + METHOD_NAME_FILE_NAME),
            METHOD_APIS: load_data_file(valid_dir + METHOD_API_FILE_NAME),
            METHOD_TOKENS: load_data_file(valid_dir + METHOD_TOKENS_FILE_NAME),
            JAVADOC: load_data_file(valid_dir + JAVADOC_FILE_NAME)
        }

        # Tokens lists are flattened to prepare for vocabulary creation
        methods_list = [
            self.train_data[METHOD_NAMES], self.train_data[METHOD_APIS],
            self.train_data[METHOD_TOKENS]
        ]
        javadoc_list = [self.train_data[JAVADOC]]
        all_tokens = flatten(methods_list + javadoc_list)

        self.vocabulary = Vocabulary.create_vocabulary(all_tokens,
                                                       max_vocab_size,
                                                       count_threshold=1,
                                                       add_pad=True)

        self.max_seq_length = max_seq_length
        self.max_vocab_size = max_vocab_size

        # Create Training and Validation tensors
        self.train_tensors = self._tensorize_data(self.train_data)
        self.valid_tensors = self._tensorize_data(self.valid_data)
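
A plausible sketch of the `flatten` helper assumed above; the real project may flatten nested lists more deeply, but vocabulary creation only needs a flat iterable of tokens.

from typing import Iterable, List

def flatten(list_of_lists: Iterable[List]) -> List:
    # Collapses one level of nesting: [[a, b], [c]] -> [a, b, c].
    return [item for sublist in list_of_lists for item in sublist]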
Example No. 17
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        hypers = cls.get_default_hyperparameters()
        resource = hypers['resource']
        vocabulary_path = f'resources/embeddings/{resource}/token_to_index.pickle'
        with open(vocabulary_path, 'rb') as fin:
            token_to_index = pickle.load(fin)
        # Fictive counts so that the ordering in the internal vocabulary will be the same as the indices in the dict.
        token_to_count = {}
        for token, index in token_to_index.items():
            token_to_count[token] = len(token_to_index) - index
        token_counter = Counter(token_to_count)
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=token_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=0)
        print('token_to_index', token_to_index)
        print('token_vocabulary.id_to_token', token_vocabulary.id_to_token)

        final_metadata = {}
        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = token_counter.most_common(50)
        return final_metadata
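
A toy check (not from the original code) of the fictive-count trick above: setting each count to `len(token_to_index) - index` makes higher counts correspond to lower indices, so a frequency-ordered vocabulary reproduces the pretrained ordering. The dictionary here is made up.

from collections import Counter

token_to_index = {'the': 0, 'def': 1, 'return': 2}   # made-up pretrained indices
token_to_count = {token: len(token_to_index) - index for token, index in token_to_index.items()}
token_counter = Counter(token_to_count)               # Counter({'the': 3, 'def': 2, 'return': 1})
print([token for token, _ in token_counter.most_common()])  # ['the', 'def', 'return']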
Example No. 18
 def finalize_metadata(self) -> None:
     self.__target_vocab = Vocabulary.create_vocabulary(
         self.__target_class_counter,
         max_size=self.max_num_classes + 1,
     )
     del self.__target_class_counter