def finalise_metadata(name: str, raw_metadata_list: List[Dict[str, Any]],
                      final_metadata: Dict[str, Any], hyperparameters: Dict[str, Any]) -> None:
    label_embedding_style = hyperparameters[f'{name}_embedding_style'].lower()

    merged_node_label_counter = Counter()
    for raw_metadata in raw_metadata_list:
        if label_embedding_style == 'token':
            merged_node_label_counter += raw_metadata[f'{name}_counter']
        elif label_embedding_style == 'subtoken':
            merged_node_label_counter += raw_metadata[f'{name}_subtoken_counter']

    def add_special_literals(vocab: Vocabulary) -> None:
        vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
        vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
        vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)

    if label_embedding_style == 'token':
        # Store token, type, and production vocabs:
        final_metadata[f'{name}_vocab'] = Vocabulary.create_vocabulary(
            merged_node_label_counter,
            max_size=hyperparameters[f'{name}_vocab_size'])
        add_special_literals(final_metadata[f'{name}_vocab'])
    elif label_embedding_style == 'subtoken':
        final_metadata[f'{name}_subtoken_vocab'] = Vocabulary.create_vocabulary(
            merged_node_label_counter,
            max_size=hyperparameters[f'{name}_vocab_size'])
        add_special_literals(final_metadata[f'{name}_subtoken_vocab'])
def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
             code_threshold, code_embedding_size, code_token_counter,
             dropout_rate, load_pretrained_embeddings=False):
    """Keeps track of the NL and code vocabularies and embeddings."""
    super(EmbeddingStore, self).__init__()
    edit_keywords = get_edit_keywords()

    self.__nl_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                        max_size=MAX_VOCAB_SIZE,
                                                        count_threshold=1,
                                                        add_pad=True)
    self.__nl_vocabulary.update(nl_token_counter, MAX_VOCAB_SIZE, nl_threshold)
    self.__nl_embedding_layer = nn.Embedding(
        num_embeddings=len(self.__nl_vocabulary),
        embedding_dim=nl_embedding_size,
        padding_idx=self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
    self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    self.__code_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                          max_size=MAX_VOCAB_SIZE,
                                                          count_threshold=1,
                                                          add_pad=True)
    self.__code_vocabulary.update(code_token_counter, MAX_VOCAB_SIZE, code_threshold)
    self.__code_embedding_layer = nn.Embedding(
        num_embeddings=len(self.__code_vocabulary),
        embedding_dim=code_embedding_size,
        padding_idx=self.__code_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
    self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
    print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

    if load_pretrained_embeddings:
        self.initialize_embeddings()
def _evaluate_f1(best_predictions: List[List[np.ndarray]],
                 best_predictions_probs: List[np.ndarray],
                 vocab: Vocabulary,
                 true_labels: np.ndarray):
    true_labels = clean_target_from_padding(true_labels)
    result_accumulator = PointSuggestionEvaluator()
    unk_id = vocab.get_id_or_unk(vocab.get_unk())

    for x_pred, x_prob, y_target in zip(best_predictions, best_predictions_probs, true_labels):
        confidences = x_prob.tolist()
        is_exact_prediction = [np.all(pred == y_target) for pred in x_pred]
        precision_recall = [token_precision_recall(pred.T, y_target) for pred in x_pred]
        is_unknown_word_predicted = [np.all(suggestion == unk_id) for suggestion in x_pred]
        unk_word_accuracy = [unk_acc(suggestion.T, y_target, unk_id) for suggestion in x_pred]
        result_accumulator.add_result(confidences, is_exact_prediction, is_unknown_word_predicted,
                                      precision_recall, unk_word_accuracy)

    return result_accumulator
def get_padded_nl_ids(self, nl_sequence, pad_length):
    return self.__nl_vocabulary.get_id_or_unk_multiple(
        nl_sequence,
        pad_to_size=pad_length,
        padding_element=self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad()),
    )
def get_dataset_from(data_dirs: List[RichPath],
                     use_func_names: bool = False,
                     max_files_per_dir: Optional[int] = None) -> List[Dict[str, Any]]:
    data_files = sorted(get_data_files_from_directory(data_dirs, max_files_per_dir))
    data = list(chain(*chain(
        list(data_pipeline.combined_samples_generator({data_pipeline.CODE_TOKENS_LABEL: f}))
        for f in data_files)))

    if use_func_names:
        # This task tries to match the function name to the code, by setting the function name as the query
        for sample in data:
            # Replace the query tokens with the function name, broken up into its sub-tokens:
            sample['docstring_tokens'] = split_identifier_into_parts(sample['func_name'])
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            sample['code_tokens'] = [Vocabulary.get_unk() if token == sample['func_name'] else token
                                     for token in sample['code_tokens']]

    return data
def tensorize(self, datapoint: str, return_str_rep: bool = False):
    if self.splitting_kind == "token":
        token_idxs = self.vocabulary.get_id_or_unk(datapoint)
        str_repr = datapoint
    elif self.splitting_kind == "subtoken":
        subtoks = split_identifier_into_parts(datapoint)
        if len(subtoks) == 0:
            subtoks = [Vocabulary.get_unk()]
        token_idxs = self.vocabulary.get_id_or_unk_multiple(subtoks)
    elif self.splitting_kind == "bpe":
        if len(datapoint) == 0:
            datapoint = "<empty>"
        token_idxs = self.vocabulary.get_id_or_unk_for_text(datapoint)
        if return_str_rep:  # Do _not_ compute for efficiency
            str_repr = self.vocabulary.tokenize(datapoint)
    elif self.splitting_kind == "char":
        token_idxs = self.vocabulary.tensorize_str(datapoint)
        if return_str_rep:
            str_repr = datapoint[:self.vocabulary.max_char_length]
    else:
        raise ValueError(f'Unrecognized token splitting method "{self.splitting_kind}".')

    if return_str_rep:
        return token_idxs, str_repr
    return token_idxs
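# A minimal sketch (not part of the original class) of what the "subtoken" branch above does,
# using the dpu_utils helpers directly; the counter contents and sizes are made up for illustration.
from collections import Counter
from dpu_utils.codeutils import split_identifier_into_parts
from dpu_utils.mlutils import Vocabulary

subtoken_vocab = Vocabulary.create_vocabulary(Counter({'get': 9, 'padded': 4, 'ids': 7}),
                                              max_size=100, count_threshold=1)
subtoks = split_identifier_into_parts('getPaddedIds')  # splits camelCase/snake_case, e.g. ['get', 'padded', 'ids']
subtok_ids = subtoken_vocab.get_id_or_unk_multiple(subtoks)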
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']

    if hyperparameters['%s_use_bpe' % encoder_label]:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        token_vocabulary.fit(merged_token_counter)
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])

    final_metadata['token_vocab'] = token_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_tokens'] = merged_token_counter.most_common(50)
    return final_metadata
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    print("Finalising metadata")
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = collections.Counter()
    merged_edge_types = set()
    token_counts = []
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']
        merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
        # token_counts.extend(raw_metadata['nodes_by_tokens'])

    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

    final_metadata['token_vocab'] = token_vocabulary
    final_metadata['edge_type_mapping'] = {edge_type: i for i, edge_type in enumerate(merged_edge_types)}
    print('Edge type mapping:', final_metadata['edge_type_mapping'])

    # print("Percentiles:")
    # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
    #     print(p, np.percentile(token_counts, p))

    return final_metadata
def load_data_from_sample(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any], data_to_load: Any,
                          function_name: Optional[str], result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    """
    Saves two versions of both the code and the query: one using the docstring as the query and
    the other using the function name as the query, replacing the function name in the code with
    an out-of-vocab token. Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load,
                   QueryType.FUNCTION_NAME.value: None}

    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
            len(function_name) >= hyperparameters['min_len_func_name_for_query']:
        if encoder_label == 'query':
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == 'code':
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                Vocabulary.get_unk() if token == function_name else token
                for token in data_to_load]

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if not data:
            result_holder[f'{encoder_label}_tokens_{key}'] = None
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
            result_holder[f'{encoder_label}_tokens_length_{key}'] = None
            continue
        if hyperparameters[f'{encoder_label}_use_subtokens']:
            data = cls._to_subtoken_stream(
                data, mark_subtoken_end=hyperparameters[f'{encoder_label}_mark_subtoken_end'])
        tokens, tokens_mask = convert_and_pad_token_sequence(
            metadata['token_vocab'], list(data),
            hyperparameters[f'{encoder_label}_max_num_tokens'])
        # Note that we share the result_holder with different encoders, and so we need to make
        # our identifiers unique-ish
        result_holder[f'{encoder_label}_tokens_{key}'] = tokens
        result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
        result_holder[f'{encoder_label}_tokens_length_{key}'] = int(np.sum(tokens_mask))

    if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
            int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
        return False
    return True
def get_vocab_extended_nl_token(self, token_id, inp_ids, inp_tokens):
    if token_id < len(self.__nl_vocabulary):
        return self.get_nl_token(token_id)
    elif token_id in inp_ids:
        copy_idx = inp_ids.index(token_id)
        return inp_tokens[copy_idx]
    else:
        return Vocabulary.get_unk()
def pad_length(self, sequence, target_length):
    if len(sequence) >= target_length:
        return sequence[:target_length]
    else:
        return sequence + [self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
                           for _ in range(target_length - len(sequence))]
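# A standalone sketch (hypothetical names, not from the original class) of the same pad/truncate
# rule as pad_length, using a plain dpu_utils Vocabulary: truncate long sequences, right-pad short
# ones with the PAD id.
from collections import Counter
from dpu_utils.mlutils import Vocabulary

_vocab = Vocabulary.create_vocabulary(Counter({'update': 4, 'comment': 6}), max_size=50,
                                      count_threshold=1, add_pad=True)
_pad_id = _vocab.get_id_or_unk(Vocabulary.get_pad())
_ids = _vocab.get_id_or_unk_multiple(['update', 'comment'])
_padded = _ids[:5] if len(_ids) >= 5 else _ids + [_pad_id] * (5 - len(_ids))
assert len(_padded) == 5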
def __create_voc_from_tokens(all_sub_tokens):
    vocabulary = Vocabulary.create_vocabulary(all_sub_tokens,
                                              max_size=100000,
                                              count_threshold=1,
                                              add_unk=True,
                                              add_pad=True)
    return vocabulary
def finalize_metadata(self) -> None:
    self.__token_counter[self.START] = 1000000
    self.__token_counter[self.END] = 1000000
    self.__output_vocabulary = Vocabulary.create_vocabulary(
        self.__token_counter,
        max_size=self.vocabulary_max_size,
        count_threshold=self.vocabulary_count_threshold,
    )
    self.LOGGER.info("Output vocabulary Size %s", len(self.__output_vocabulary))
    del self.__token_counter
def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]], final_metadata: Dict[str, Any]):
    # Merge counters
    merged_type_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_type_counter.update(raw_metadata["type_occurences_counter"])

    final_metadata['annotation_vocab'] = Vocabulary.create_vocabulary(
        merged_type_counter,
        max_size=self.__model.hyperparameters['max_type_annotation_vocab_size'])
    return final_metadata
def greedy_decode(self, initial_state, encoder_hidden_states, masks, max_out_len, batch_data, device):
    """Greedily generates the output sequence."""
    # Derived from https://github.com/budzianowski/PyTorch-Beam-Search-Decoding/blob/9f6b66f43d2e05175dabcc024f79e1d37a667070/decode_beam.py#L163
    batch_size = initial_state.shape[0]
    decoder_state = initial_state
    decoder_input = torch.tensor([[self.embedding_store.get_nl_id(START)]] * batch_size, device=device)

    decoded_batch = np.zeros([batch_size, max_out_len], dtype=np.int64)
    decoded_batch_scores = np.zeros([batch_size, max_out_len])

    for i in range(max_out_len):
        decoder_input_embeddings = self.embedding_store.get_nl_embeddings(decoder_input)
        decoder_attention_states, decoder_state, generation_logprobs, copy_logprobs = self.decode(
            decoder_state, decoder_input_embeddings, encoder_hidden_states, masks)

        generation_logprobs = generation_logprobs.squeeze(1)
        copy_logprobs = copy_logprobs.squeeze(1)

        prob_scores = torch.zeros([generation_logprobs.shape[0],
                                   generation_logprobs.shape[-1] + copy_logprobs.shape[-1]],
                                  dtype=torch.float32, device=device)
        prob_scores[:, :generation_logprobs.shape[-1]] = torch.exp(generation_logprobs)
        for b in range(generation_logprobs.shape[0]):
            for c, inp_id in enumerate(batch_data.input_ids[b]):
                prob_scores[b, inp_id] = prob_scores[b, inp_id] + torch.exp(copy_logprobs[b, c])

        predicted_ids = torch.argmax(prob_scores, dim=-1)
        decoded_batch_scores[:, i] = prob_scores[torch.arange(prob_scores.shape[0]), predicted_ids]
        decoded_batch[:, i] = predicted_ids

        unks = torch.ones(predicted_ids.shape[0], dtype=torch.int64,
                          device=device) * self.embedding_store.get_nl_id(Vocabulary.get_unk())
        decoder_input = torch.where(predicted_ids < len(self.embedding_store.nl_vocabulary),
                                    predicted_ids, unks).unsqueeze(1)
        decoder_state = decoder_state.squeeze(0)

    return decoded_batch, decoded_batch_scores
def evaluate_f1(model: keras.Model,
                vocab: Vocabulary,
                input_method_body_subtokens: np.ndarray,
                target_method_names: np.ndarray,
                hyperparameters: Dict[str, Any],
                visualise_prediction=True):
    padding_id = vocab.get_id_or_unk(vocab.get_pad())
    begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN)
    end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN)

    if input_method_body_subtokens.ndim != 3:
        # model.predict expects 3 dimensions; a single input lacks the batch dimension, so add it manually
        input_method_body_subtokens = np.expand_dims(input_method_body_subtokens, 0)

    predictions = model.predict(input_method_body_subtokens, batch_size=1)
    best_predictions, best_predictions_probs = beam_search(
        predictions,
        padding_id,
        begin_of_sentence_id,
        end_of_sentence_id,
        hyperparameters['beam_width'],
        hyperparameters['beam_top_paths'],
    )
    f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs, vocab, target_method_names)

    if visualise_prediction:
        max_results = 10
        visualised_input = visualise_beam_predictions_to_targets(
            vocab,
            best_predictions[:max_results],
            best_predictions_probs[:max_results],
            input_method_body_subtokens[:max_results],
            target_method_names[:max_results])
        # return best_predictions, best_predictions_probs
        return f1_evaluation, visualised_input
    return f1_evaluation
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]], language=None) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = Counter()
    print(encoder_label, language)

    if encoder_label == 'query':
        final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
    else:
        assert encoder_label == 'code' and language
        final_metadata_path = '_'.join([encoder_label, language, 'final_metadata'])

    if os.path.isfile(final_metadata_path):
        with open(final_metadata_path, 'rb') as final_metadata_file:
            final_metadata = pickle.load(final_metadata_file)
    else:
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        with open(final_metadata_path, 'wb') as final_metadata_file:
            pickle.dump(final_metadata, final_metadata_file)

    return final_metadata
def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super()._finalise_metadata(raw_metadata_list)
    TokenEmbedder.finalise_metadata('leaf_label', raw_metadata_list, final_metadata, self.hyperparameters)

    # First, merge all needed information:
    merged_non_terminals = set()
    for raw_metadata in raw_metadata_list:
        merged_non_terminals.update(raw_metadata["path_elements"])

    final_metadata['non_terminal_dict'] = Vocabulary.create_vocabulary(
        merged_non_terminals, max_size=10000, count_threshold=0)
    return final_metadata
def get_extended_padded_nl_ids(self, nl_sequence, pad_length, inp_ids, inp_tokens):
    # Derived from: https://github.com/microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py
    nl_ids = []
    for token in nl_sequence:
        nl_id = self.get_nl_id(token)
        if self.is_nl_unk(nl_id) and token in inp_tokens:
            copy_idx = inp_tokens.index(token)
            nl_id = inp_ids[copy_idx]
        nl_ids.append(nl_id)

    if len(nl_ids) > pad_length:
        return nl_ids[:pad_length]
    else:
        padding = [self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())] * (pad_length - len(nl_ids))
        return nl_ids + padding
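# A minimal standalone sketch (hypothetical helper name, not part of the original class) of the
# copy-extended id scheme used above: an NL token that maps to UNK but also appears in the input
# takes the id assigned to that input position, so a pointer/copy decoder can refer to it.
def extend_ids_with_copies(nl_tokens, vocab, inp_ids, inp_tokens, unk_id):
    extended = []
    for token in nl_tokens:
        token_id = vocab.get_id_or_unk(token)
        if token_id == unk_id and token in inp_tokens:
            token_id = inp_ids[inp_tokens.index(token)]
        extended.append(token_id)
    return extended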
def convert_and_pad_token_sequence(token_vocab: Union[Vocabulary, BpeVocabulary],
                                   token_sequence: List[str],
                                   output_tensor_size: int,
                                   pad_from_left: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    """
    Tensorise token sequence with padding; returning a mask for used elements as well.

    Args:
        token_vocab: Vocabulary or BPE encoder to use. We assume that token_vocab[0] is the padding symbol.
        token_sequence: List of tokens in string form.
        output_tensor_size: Size of the resulting tensor (i.e., length up to which we pad / down to which we truncate).
        pad_from_left: Indicate if we are padding/truncating on the left side of the string. [Default: False]

    Returns:
        Pair of numpy arrays. The first is the actual tensorised token sequence, the second is a
        masking tensor that is 1.0 for those token indices that are actually used.
    """
    if isinstance(token_vocab, BpeVocabulary):
        token_ids = np.array(list(token_vocab.transform([token_sequence], fixed_length=output_tensor_size))[0])
        token_mask = np.array([1 if token_ids[i] > 0 else 0 for i in range(len(token_ids))])
        return token_ids, token_mask

    if pad_from_left:
        token_sequence = token_sequence[-output_tensor_size:]
    else:
        token_sequence = token_sequence[:output_tensor_size]

    sequence_length = len(token_sequence)
    if pad_from_left:
        start_idx = output_tensor_size - sequence_length
    else:
        start_idx = 0

    token_ids = np.zeros(output_tensor_size, dtype=np.int32)
    token_mask = np.zeros(output_tensor_size, dtype=np.float32)
    for i, token in enumerate(token_sequence, start=start_idx):
        token_ids[i] = token_vocab.get_id_or_unk(token)
        token_mask[i] = True

    return token_ids, token_mask
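# Hedged usage sketch (not part of the original module): tensorise a short token sequence with a
# plain dpu_utils Vocabulary. The token names and sizes below are illustrative only.
from collections import Counter
from dpu_utils.mlutils import Vocabulary

example_vocab = Vocabulary.create_vocabulary(Counter({'def': 10, 'return': 8, 'x': 5}),
                                             max_size=100, count_threshold=1,
                                             add_unk=True, add_pad=True)
ids, mask = convert_and_pad_token_sequence(example_vocab, ['def', 'x', 'return'], output_tensor_size=6)
# ids has shape (6,); the trailing positions stay 0 (the padding id) and mask is 1.0 only for the
# three real tokens, so the mask sum equals the unpadded sequence length.
assert mask.sum() == 3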
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_type_counter = collections.Counter()
    for raw_metadata in raw_metadata_list:
        merged_type_counter += raw_metadata['type_counter']

    type_vocabulary = Vocabulary.create_vocabulary(
        tokens=merged_type_counter,
        max_size=hyperparameters[f'{encoder_label}_type_vocab_size'],
        count_threshold=hyperparameters[f'{encoder_label}_type_vocab_count_threshold'])

    final_metadata['type_vocab'] = type_vocabulary
    print('Total type vocabulary words:', len(final_metadata['type_vocab'].id_to_token))
    return final_metadata
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)

    # JGD ****** leaf_nodes start ******
    merged_identifier_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_identifier_counter += raw_metadata['identifier_counter']

    if hyperparameters['%s_use_bpe' % encoder_label]:
        identifier_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        identifier_vocabulary.fit(merged_identifier_counter)
    else:
        identifier_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_identifier_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])

    final_metadata['identifier_vocab'] = identifier_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_identifiers'] = merged_identifier_counter.most_common(50)
    # JGD ****** leaf_nodes end ******

    # JGD ****** tree_paths start ******
    # merged_context_filenames = list()
    # merged_terminal_counter = Counter()
    # merged_nonterminal_counter = Counter()
    # for raw_metadata in raw_metadata_list:
    #     merged_context_filenames.extend(raw_metadata['context_filenames'])
    #     merged_terminal_counter += raw_metadata['terminal_counter']
    #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
    #
    # final_metadata['context_filenames'] = merged_context_filenames
    # final_metadata['terminal_counter'] = merged_terminal_counter
    # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
    # JGD ****** tree_paths end ******
    return final_metadata
def finalize_metadata(self) -> None:
    if self.splitting_kind in {"token", "subtoken"}:
        self.__vocabulary = Vocabulary.create_vocabulary(
            self.__tok_counter,
            max_size=self.max_vocabulary_size,
            count_threshold=self.min_freq_threshold,
            add_pad=True)
    elif self.splitting_kind == "bpe":
        self.__vocabulary = BpeVocabulary(self.max_vocabulary_size,
                                          unk_token=UNK_TOKEN,
                                          pad_token=PAD_TOKEN,
                                          eos_token=EOS_TOKEN,
                                          bos_token=INIT_TOKEN)
        self.__vocabulary.create_vocabulary(self.__tok_counter)
    else:
        raise ValueError(f'Unrecognized token splitting method "{self.splitting_kind}"')
    del self.__tok_counter
def finalize_metadata(self) -> None:
    if self.splitting_kind in {"token", "subtoken"}:
        self.__vocabulary = Vocabulary.create_vocabulary(
            self.__tok_counter,
            max_size=self.max_vocabulary_size,
            count_threshold=self.min_freq_threshold,
        )
    elif self.splitting_kind == "bpe":
        self.__vocabulary = BpeVocabulary(self.max_vocabulary_size)
        self.__vocabulary.create_vocabulary(self.__tok_counter)
    elif self.splitting_kind == "char":
        self.__vocabulary = CharTensorizer(max_num_chars=self.max_num_chars,
                                           lower_case_all=False,
                                           include_space=False)
    else:
        raise ValueError(f'Unrecognized token splitting method "{self.splitting_kind}"')
    del self.__tok_counter
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = collections.Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']

    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

    final_metadata['token_vocab'] = token_vocabulary
    return final_metadata
def load_vocabulary(self) -> Vocabulary:
    """Return the model vocabulary, built from the corpus token counts."""
    max_size = self.config['vocabulary_max_size']
    count_threshold = self.config['vocabulary_count_threshold']

    # Count occurrences of the body vocabulary
    tokens_counter = Counter()
    for method_token in self.corpus_methods_token:
        for (name, body) in method_token:
            tokens_counter.update(body)
            tokens_counter.update(name)

    token_vocab = Vocabulary.create_vocabulary(tokens_counter,
                                               count_threshold=count_threshold,
                                               max_size=max_size,
                                               add_unk=True,
                                               add_pad=True)
    self.logger.info('{} Vocabulary created'.format(len(token_vocab)))
    return token_vocab
def __init__(self, train_dir, valid_dir, max_seq_length, max_vocab_size):
    # Dictionary which stores raw training data
    self.train_data = {
        METHOD_NAMES: load_data_file(train_dir + METHOD_NAME_FILE_NAME),
        METHOD_APIS: load_data_file(train_dir + METHOD_API_FILE_NAME),
        METHOD_TOKENS: load_data_file(train_dir + METHOD_TOKENS_FILE_NAME),
        JAVADOC: load_data_file(train_dir + JAVADOC_FILE_NAME),
    }

    # Dictionary which stores raw validation data
    self.valid_data = {
        METHOD_NAMES: load_data_file(valid_dir + METHOD_NAME_FILE_NAME),
        METHOD_APIS: load_data_file(valid_dir + METHOD_API_FILE_NAME),
        METHOD_TOKENS: load_data_file(valid_dir + METHOD_TOKENS_FILE_NAME),
        JAVADOC: load_data_file(valid_dir + JAVADOC_FILE_NAME),
    }

    # Token lists are flattened to prepare for vocabulary creation
    methods_list = [
        self.train_data[METHOD_NAMES],
        self.train_data[METHOD_APIS],
        self.train_data[METHOD_TOKENS],
    ]
    javadoc_list = [self.train_data[JAVADOC]]
    all_tokens = flatten(methods_list + javadoc_list)
    self.vocabulary = Vocabulary.create_vocabulary(all_tokens,
                                                   max_vocab_size,
                                                   count_threshold=1,
                                                   add_pad=True)

    self.max_seq_length = max_seq_length
    self.max_vocab_size = max_vocab_size

    # Create training and validation tensors
    self.train_tensors = self._tensorize_data(self.train_data)
    self.valid_tensors = self._tensorize_data(self.valid_data)
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    hypers = cls.get_default_hyperparameters()
    resource = hypers['resource']
    vocabulary_path = f'resources/embeddings/{resource}/token_to_index.pickle'
    with open(vocabulary_path, 'rb') as fin:
        token_to_index = pickle.load(fin)

    # Fictive counts so that the ordering in the internal vocabulary will be the same as the indices in the dict.
    token_to_count = {}
    for token, index in token_to_index.items():
        token_to_count[token] = len(token_to_index) - index
    token_counter = Counter(token_to_count)

    token_vocabulary = Vocabulary.create_vocabulary(
        tokens=token_counter,
        max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
        count_threshold=0)
    print('token_to_index', token_to_index)
    print('token_vocabulary.id_to_token', token_vocabulary.id_to_token)

    final_metadata = {}
    final_metadata['token_vocab'] = token_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_tokens'] = token_counter.most_common(50)
    return final_metadata
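# Illustrative worked example (hypothetical data, not from the original module) of the
# "fictive counts" trick above: giving the token at index i the count len(token_to_index) - i
# makes the counts strictly decrease with index, so a frequency-ordered vocabulary keeps the
# tokens in their original relative order.
from collections import Counter

example_index = {'a': 0, 'b': 1, 'c': 2}
example_counter = Counter({tok: len(example_index) - idx for tok, idx in example_index.items()})
# Counter({'a': 3, 'b': 2, 'c': 1}) -> most_common() yields 'a', 'b', 'c', preserving the index order.
assert [tok for tok, _ in example_counter.most_common()] == ['a', 'b', 'c']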
def add_special_literals(vocab: Vocabulary) -> None:
    vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
    vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
    vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)
def unk_token(self) -> str:
    return Vocabulary.get_unk()