def tensorise_token_sequence(
    vocab: Vocabulary,
    length: int,
    token_seq: Iterable[str],
) -> List[int]:
    """
    Tensorise a single example.

    Args:
        vocab: Vocabulary to use for mapping tokens to integer IDs
        length: Length to truncate/pad sequences to.
        token_seq: Sequence of tokens to tensorise.

    Returns:
        List with length elements that are integer IDs of tokens in our vocab.
    """
    #TODO 4# Insert your tensorisation code here
    tokens_ids = []
    tokens_ids.append(vocab.get_id_or_unk(START_SYMBOL))
    tokens_ids.extend(
        vocab.get_id_or_unk_multiple(token_seq, pad_to_size=length - 1))
    # END_SYMBOL must be the last element in the tensorised sequence
    end_position = min(1 + len(token_seq), length - 1)
    tokens_ids[end_position] = vocab.get_id_or_unk(END_SYMBOL)
    return tokens_ids

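# A minimal usage sketch for tensorise_token_sequence, assuming the
# dpu_utils-style Vocabulary used throughout this file; START_SYMBOL and
# END_SYMBOL are assumed to be module-level constants, and the wrapper
# function name and token list below are purely illustrative.
def _example_tensorise_token_sequence() -> None:
    vocab = Vocabulary(add_unk=True, add_pad=True)
    for tok in [START_SYMBOL, END_SYMBOL, 'def', 'foo', '(', ')', ':']:
        vocab.add_or_get_id(tok)

    ids = tensorise_token_sequence(vocab, length=8,
                                   token_seq=['def', 'foo', '(', ')', ':'])
    # The result always has exactly `length` entries:
    # [START, 'def', 'foo', '(', ')', ':', END, PAD] as integer IDs.
    assert len(ids) == 8
    assert ids[0] == vocab.get_id_or_unk(START_SYMBOL)
    assert ids[6] == vocab.get_id_or_unk(END_SYMBOL)
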
def _tensorise_target_token_sequence(
    self,
    vocab: Vocabulary,
    length: int,
    token_seq: Iterable[str],
) -> List[int]:
    """
    Tensorise a single example.

    Args:
        vocab: Vocabulary to use for mapping tokens to integer IDs
        length: Length to truncate/pad sequences to.
        token_seq: Sequence of tokens to tensorise.

    Returns:
        List with length elements that are integer IDs of tokens in our vocab.
    """
    tensorised = []
    for i in range(length):
        if i == 0:
            # Position 0 always holds the start-of-sequence marker.
            tensorised.append(vocab.get_id_or_unk(START_SYMBOL))
        elif len(token_seq) >= i:
            # Positions 1..len(token_seq) hold the actual tokens (truncated at `length`).
            tensorised.append(vocab.get_id_or_unk(token_seq[i - 1]))
        elif i == len(token_seq) + 1:
            # The first position after the tokens holds the end-of-sequence marker
            # (it is dropped if the sequence was truncated).
            tensorised.append(vocab.get_id_or_unk(END_SYMBOL))
        else:
            # Remaining positions are padded.
            tensorised.append(vocab.get_id_or_unk(vocab.get_pad()))
    return tensorised

def _build_vocab(
    self,
    dataset,
    vocab_size: int,
    max_num_files: Optional[int] = None
) -> Vocabulary:
    """
    Compute model metadata such as a vocabulary.

    Args:
        dataset: Dataset of method code and comments.
        vocab_size: Maximal size of the vocabulary to create.
        max_num_files: Maximal number of files to load.
    """
    vocab = Vocabulary(add_unk=True, add_pad=True)
    # Make sure to include the START_SYMBOL in the vocabulary as well:
    vocab.add_or_get_id(START_SYMBOL)
    vocab.add_or_get_id(END_SYMBOL)

    cnt = collections.Counter()
    for token_seq in dataset:
        for token in token_seq:
            cnt[token] += 1
    for token, _ in cnt.most_common(vocab_size):
        vocab.add_or_get_id(token)
    return vocab

def build_vocab_from_data_dir(
    data_dir: str,
    vocab_size: int,
    max_num_files: Optional[int] = None) -> Vocabulary:
    """
    Compute model metadata such as a vocabulary.

    Args:
        data_dir: Directory containing data files.
        vocab_size: Maximal size of the vocabulary to create.
        max_num_files: Maximal number of files to load.
    """
    data_files = get_data_files_from_directory(data_dir, max_num_files)

    vocab = Vocabulary(add_unk=True, add_pad=True)
    # Make sure to include the START_SYMBOL in the vocabulary as well:
    vocab.add_or_get_id(START_SYMBOL)
    vocab.add_or_get_id(END_SYMBOL)

    #TODO 3# Insert your vocabulary-building code here
    counter = Counter()
    for file in data_files:
        # For each file, count all tokens.
        list_of_samples = load_data_file(file)
        for list_tokens in list_of_samples:
            for token in list_tokens:
                counter[token] += 1

    # Add the most common tokens to the vocabulary.
    # Take vocab_size - 2 because we also need to store START_SYMBOL and END_SYMBOL.
    for elem, cnt in counter.most_common(vocab_size - 2):
        vocab.add_or_get_id(elem)

    return vocab

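# An end-to-end sketch of how the vocabulary above feeds into tensorisation,
# assuming the module-level helpers referenced in this file
# (get_data_files_from_directory / load_data_file) and tensorise_token_sequence
# from above; the wrapper name, directory path, and token sequence are
# purely illustrative.
def _example_build_and_tensorise() -> None:
    vocab = build_vocab_from_data_dir('data/train', vocab_size=10000,
                                      max_num_files=100)
    # The vocabulary holds at most vocab_size content tokens (counting
    # START_SYMBOL and END_SYMBOL) plus the UNK/PAD entries added by the class.
    print('Vocabulary size: ', len(vocab.id_to_token))

    sample = ['public', 'int', 'getValue', '(', ')', '{', 'return', 'value', ';', '}']
    print(tensorise_token_sequence(vocab, length=32, token_seq=sample))
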
def _finalise_metadata(
        self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super()._finalise_metadata(raw_metadata_list)

    # First, merge all needed information:
    merged_edge_types = set()
    merged_node_label_counter = Counter()
    merged_node_type_counter = Counter()
    merged_edge_value_sizes = {}
    for raw_metadata in raw_metadata_list:
        merged_edge_types.update(raw_metadata["cg_edge_types"])
        merged_node_label_counter += raw_metadata['cg_node_label_counter']
        merged_node_type_counter += raw_metadata['cg_node_type_counter']
        for edge_type, edge_value_size in raw_metadata[
                'cg_edge_value_sizes'].items():
            existing_edge_value_size = merged_edge_value_sizes.get(edge_type)
            if existing_edge_value_size is not None:
                assert existing_edge_value_size == edge_value_size
            merged_edge_value_sizes[edge_type] = edge_value_size

    # Store edges allowed in the context graph, and assign numerical IDs to them:
    print(merged_edge_types)
    all_used_cg_edges = list(
        merged_edge_types
        - set(self.hyperparameters['excluded_cg_edge_types']))
    print(all_used_cg_edges)
    if self.hyperparameters.get('cg_add_subtoken_nodes', False):
        all_used_cg_edges.append(USES_SUBTOKEN_EDGE_NAME)
    final_metadata['cg_edge_type_dict'] = {
        e: i for i, e in enumerate(all_used_cg_edges)
    }

    # Store token, type, and production vocabs:
    final_metadata['cg_node_label_vocab'] = \
        Vocabulary.create_vocabulary(
            merged_node_label_counter,
            max_size=self.hyperparameters['cg_node_label_vocab_size'])

    type_embedding_size = self.hyperparameters['cg_node_type_embedding_size']
    if type_embedding_size > 0:
        final_metadata['cg_node_type_vocab'] = \
            LatticeVocabulary.get_vocabulary_for(
                tokens=merged_node_type_counter,
                max_size=self.hyperparameters['cg_node_type_vocab_size'] - 1,
                lattice=final_metadata['type_lattice'])
        final_metadata['cg_node_type_vocab'].add_or_get_id(NO_TYPE)
        self.hyperparameters['cg_node_type_vocab_size'] = len(
            final_metadata['cg_node_type_vocab'])

    final_metadata['cg_edge_value_sizes'] = {}
    for edge_type, edge_feature_size in merged_edge_value_sizes.items():
        fwd_edge_type_idx = final_metadata['cg_edge_type_dict'][edge_type]
        final_metadata['cg_edge_value_sizes'][fwd_edge_type_idx] = edge_feature_size

    return final_metadata

def _tensorise_node_features(
        self, vocab: Vocabulary, token_seq
) -> np.ndarray:
    # Map each token to a single-element feature vector holding its vocabulary ID.
    tensorised = []
    for token in token_seq:
        tensorised.append([vocab.get_id_or_unk(token)])
    # Convert to an array so the return value matches the declared np.ndarray type.
    return np.array(tensorised)

def finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                      final_metadata: Dict[str, Any]) -> None:
    # First, merge all needed information:
    merged_token_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['decoder_token_counter']

    # Reserve two vocabulary slots for the START and END tokens added below.
    final_metadata['decoder_token_vocab'] = \
        Vocabulary.create_vocabulary(
            merged_token_counter,
            max_size=self.hyperparameters['decoder_vocab_size'] - 2)
    final_metadata['decoder_token_vocab'].add_or_get_id(START_TOKEN)
    final_metadata['decoder_token_vocab'].add_or_get_id(END_TOKEN)

def finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                      final_metadata: Dict[str, Any]) -> None:
    # Original code:
    # First, merge all needed information:
    merged_token_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['decoder_token_counter']
    final_metadata['decoder_token_vocab'] = \
        Vocabulary.create_vocabulary(
            merged_token_counter,
            max_size=self.hyperparameters['decoder_vocab_size'] - 2,
            count_threshold=1)
    final_metadata['decoder_token_vocab'].add_or_get_id(START_TOKEN)
    final_metadata['decoder_token_vocab'].add_or_get_id(END_TOKEN)

    # Additional code to process variable occurrences:
    merged_var_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_var_counter += raw_metadata['var_name_counter']
    final_metadata['var_occurrence_vocab'] = \
        Vocabulary.create_vocabulary(
            merged_var_counter,
            max_size=self.hyperparameters['decoder_vocab_size'])
    print("TOTAL VARIABLE VOCAB SIZE: ",
          len(final_metadata['var_occurrence_vocab'].id_to_token))

def load_metadata_from_dir(self, data_dir: str,
                           max_num_files: Optional[int] = None) -> None:
    """
    Compute model metadata such as a vocabulary.

    Args:
        data_dir: Directory containing data files.
        max_num_files: Maximal number of files to load.

    Note:
        This populates the model.metadata dictionary.
    """
    data_files = get_data_files_from_directory(data_dir, max_num_files)
    tokens = Counter(t for f in data_files for t, _ in self.load_data_file(f))
    print('Number of unique tokens in dataset: ', len(tokens.keys()))
    self.__metadata = {
        'token_vocab': Vocabulary.create_vocabulary(tokens, max_size=5000)
    }

def large_lambda_1(s: tuple, tokens: np.ndarray, token_lens: np.ndarray,
                   predictions: np.ndarray, probs: np.ndarray,
                   vocab: Vocabulary, file_batch: int,
                   lambdas: np.ndarray = None,
                   num_correct_ids: int = 0, num_ids: int = 1):
    state, att_states, att_ids, alpha_states, att_counts, lambda_state = s
    tokens = np.transpose(tokens)
    att_states = np.squeeze(att_states, 2)
    alpha_states = np.squeeze(alpha_states, 2)
    max_atts = alpha_states.max(-1)

    unk_i = vocab.token_to_id[vocab.get_unk()]
    seq_mask = np.transpose(
        np.tile(np.arange(tokens.shape[0]), [tokens.shape[1], 1])) < np.tile(
            token_lens, [tokens.shape[0], 1])
    n_unk = (tokens == unk_i)[seq_mask].sum()
    n_non_pad = seq_mask.sum()
    unk_p = n_unk / n_non_pad

    print('File Batch: ', file_batch)
    print('some tokens: ', tokens[seq_mask][:10])
    if max_atts is not None:
        max_copy_prob = max_atts * lambda_state[:, 1]
        average_lambda = np.ma.array(lambda_state[:, 1],
                                     mask=tokens[-1] == 0).mean()
        print('average copy lambda: ', average_lambda)
        print('max copy lambda: ', lambda_state[:, 1].max())
        print('max copy token prob: ', np.amax(max_copy_prob))
        attention_stats = [average_lambda]
    else:
        attention_stats = []

    print_tokens_and_others((tokens, predictions), (probs,), vocab)
    print('id acc: ', num_correct_ids / num_ids)
    am = np.unravel_index(np.argmin(predictions), predictions.shape)
    print(predictions[am], tokens[am[0]], tokens[am])
    return [
        np.min(predictions), np.mean(predictions), n_unk, n_non_pad,
        num_correct_ids, num_ids
    ] + attention_stats

def build_vocab_from_data_dir(
    data_dir: str,
    vocab_size: int,
    max_num_files: Optional[int] = None
) -> Vocabulary:
    """
    Compute model metadata such as a vocabulary.

    Args:
        data_dir: Directory containing data files.
        vocab_size: Maximal size of the vocabulary to create.
        max_num_files: Maximal number of files to load.
    """
    data_files = get_data_files_from_directory(data_dir, max_num_files)

    vocab = Vocabulary(add_unk=True, add_pad=True)
    # Make sure to include the START_SYMBOL in the vocabulary as well:
    vocab.add_or_get_id(START_SYMBOL)
    vocab.add_or_get_id(END_SYMBOL)

    #TODO 3# Insert your vocabulary-building code here

    return vocab

def token_seq_equal(a: List[str], b: List[str]) -> bool:
    unk_tok = Vocabulary.get_unk()
    if unk_tok in a or unk_tok in b:
        # Sequences containing UNK tokens cannot be compared reliably.
        return False
    else:
        return a == b

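# A minimal sketch of the UNK guard in token_seq_equal, assuming the
# dpu_utils-style Vocabulary whose get_unk() returns the literal UNK token
# string; the wrapper name and token lists are purely illustrative.
def _example_token_seq_equal() -> None:
    unk = Vocabulary.get_unk()
    assert token_seq_equal(['return', 'x'], ['return', 'x'])      # identical, no UNKs
    assert not token_seq_equal(['return', unk], ['return', unk])  # UNK hides the real token
    assert not token_seq_equal(['return', 'x'], ['return', 'y'])  # plain mismatch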