def create_token_mask(self, s):
    """Build a vocabulary-sized mask from the token id set ``s``.

    An empty set means no constraint, so the whole vocabulary is allowed.
    """
    if len(s) == 0:
        # print('token set mask is empty')
        # info('token set mask is empty')
        return generate_mask([(0, self.vocab.vocabulary_size - 1)],
                             size=self.vocab.vocabulary_size)
    # mask = create_token_mask_by_token_set(s, vocab_len=self.vocab.vocabulary_size)
    mask = generate_mask(s, size=self.vocab.vocabulary_size)
    return mask
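# Hedged usage sketch (the helper name `apply_token_mask` and the -inf masking step
# are illustrative assumptions, not part of this repo): a mask produced by
# `create_token_mask` is a 0/1 sequence of length `vocabulary_size` (elsewhere in
# this code it is wrapped with `torch.FloatTensor`), so it can be used to suppress
# disallowed tokens in a logit vector before softmax.
import torch

def apply_token_mask(logits, mask):
    # Positions where the mask is 0 are pushed to -inf so softmax assigns them
    # (numerically) zero probability.
    mask_tensor = torch.FloatTensor(list(mask)).to(logits.device)
    return logits.masked_fill(mask_tensor == 0, float("-inf"))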
def __call__(self, sample):
    """
    :param sample: a list of nodes
    :return: a dict of lists {'to_parse_token', 'terminal_mask'}
    """
    production_vocabulary = self._production_vocabulary

    get_token_id = production_vocabulary.get_token_id
    get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
    vocabulary_size = production_vocabulary.token_num()
    generate_mask_fn = generate_mask(size=vocabulary_size)
    get_node_right_id = lambda x: x.right_id

    # Simulate a top-down parse: the stack holds the grammar symbols that
    # still have to be expanded or matched.
    stack = [get_token_id(production_vocabulary.EMPTY), sample[0].left_id]
    to_parse_token_id = [sample[0].left_id]
    for node in sample:
        type_id = stack.pop()
        if isinstance(node, LeafToken):
            # print("Terminal token:{}".format(node.value))
            to_parse_token_id.append(stack[-1])
        else:
            assert type_id == node.left_id
            for right_id in reversed(get_node_right_id(node)):
                stack.append(right_id)

    terminal_mask = [
        generate_mask_fn(get_matched_terminal_index(token))
        for token in to_parse_token_id
    ]

    return {
        "to_parse_token": to_parse_token_id,
        "terminal_mask": terminal_mask,
    }
def _generate_terminal_mask(self, terminal_label_index):
    """Map grammar terminal labels to a keyword-level mask.

    Returns the flipped mask (so a non-zero entry marks a disallowed keyword)
    together with a flag that is 1 when identifiers are *not* allowed here.
    """
    size = self._keyword_num
    token_index_set = set()
    keyword_map = pre_defined_c_tokens_map
    has_identifier = 0
    for t in terminal_label_index:
        token_str = self._label_vocabulary.get_label_by_id(t)
        if token_str == "ID" or token_str == "TYPEID":
            has_identifier = 1
        elif token_str == "IMAGINARY_":
            pass
        elif token_str == "END_OF_SLK_INPUT":
            token_index_set.add(self._token_vocabulary.word_to_id(END))
        elif token_str == "CONSTANT" or token_str == "STRING_LITERAL":
            token_index_set.add(self._token_vocabulary.word_to_id(token_str))
        else:
            token_index_set.add(
                self._token_vocabulary.word_to_id(keyword_map[token_str]))
    return generate_mask(token_index_set, size).flip(), 1 - has_identifier
def __init__(self, vocab_size, type_num, embedding_dim, hidden_state_size,
             rnn_num_layers, identifier_index, keyword_index,
             terminal_token_index, batch_size):
    super().__init__()
    self._batch_size = batch_size
    self._rnn_num_layers = rnn_num_layers
    self._hidden_state_size = hidden_state_size
    self.token_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
    self.type_embedding = nn.Embedding(type_num, embedding_dim,
                                       sparse=True).cuda(GPU_INDEX)
    self.rnn = nn.LSTM(input_size=embedding_dim,
                       hidden_size=hidden_state_size,
                       num_layers=rnn_num_layers).cuda(GPU_INDEX)
    self.token_prob_mlp = nn.Sequential(
        nn.Linear(hidden_state_size + embedding_dim, hidden_state_size),
        nn.ReLU(),
        nn.Linear(hidden_state_size, type_num)).cuda(GPU_INDEX)
    # self.type_feature_mlp = nn.Sequential(
    #     nn.Linear(embedding_dim, hidden_state_size),
    #     nn.ReLU(),
    #     nn.Linear(hidden_state_size, vocab_size)
    # ).cuda(GPU_INDEX)
    self._initial_state = self.initial_state()
    # self._all_type_index = torch.range(0, type_num-1).type(torch.LongTensor).cuda(GPU_INDEX)
    self._identifier_index = torch.LongTensor([identifier_index]).cuda(GPU_INDEX)
    self._terminal_token_without_identifier_index = torch.LongTensor(
        sorted(set(range(type_num)) - {identifier_index})).cuda(GPU_INDEX)
    self._keyword_index = torch.LongTensor(keyword_index).cuda(GPU_INDEX)
    self._identifier_token_mask = torch.FloatTensor(
        generate_mask(set(range(vocab_size)) - set(keyword_index),
                      vocab_size)).cuda(GPU_INDEX)
    self.stable_log_fn = create_stable_log_fn(1e-7)
    self._terminal_token_index = torch.LongTensor(
        sorted(set(terminal_token_index) - {identifier_index})).cuda(GPU_INDEX)
    self.rnn_feature_mlp = nn.Sequential(
        nn.Linear(hidden_state_size, hidden_state_size),
        nn.ReLU(),
        nn.Linear(hidden_state_size,
                  vocab_size - len(self._terminal_token_index)),
    ).cuda(GPU_INDEX)
def __call__(self, sample):
    """
    :param sample: a list of nodes
    :return: a dict of lists {'to_parse_token', 'terminal_mask'}
    """
    # print()
    production_vocabulary = self._production_vocabulary

    get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
    vocabulary_size = production_vocabulary.token_num()
    generate_mask_fn = generate_mask(size=vocabulary_size)
    get_node_right_id = lambda x: x.right_id

    stack = [production_vocabulary.EMPTY_id, sample[0].left_id]
    string_stack = ["EMPTY", sample[0].left]
    to_parse_token_id = [sample[0].left_id]
    now_id = 0
    peeked_max_id = -1

    # Drop leaf tokens whose type is not a grammar token.
    sample = list(
        filter(
            lambda x: not (isinstance(x, LeafToken)
                           and not production_vocabulary.is_token(x.type_id)),
            sample))

    tokens = []
    for node in sample:
        if isinstance(node, LeafToken) and production_vocabulary.is_token(node.type_id):
            tokens.append(node.type_id)

    peeked_compact_dict = {}
    for node in sample:
        # print(node)
        type_id = stack.pop()
        type_string = string_stack.pop()
        # print("The stack popped token is:{}, string:{}".format(type_id, type_string))
        if isinstance(node, LeafToken):
            # print("Terminal token:{}".format(node.value))
            if production_vocabulary.is_token(node.type_id):
                now_id += 1
                to_parse_token_id.append(stack[-1])
        else:
            assert type_id == node.left_id, \
                "type string is {}, now left is {}".format(type_string, node.left)
            if now_id < len(tokens) and production_vocabulary.need_peek(type_id, tokens[now_id]):
                # print("need peek")
                level = 1
                entry = production_vocabulary.get_parse_entry(type_id, tokens[now_id])
                peeked_id = now_id + level
                if peeked_id not in peeked_compact_dict:
                    peeked_compact_dict[peeked_id] = \
                        production_vocabulary.get_conflict_matched_terminal_node(entry)
                while production_vocabulary.need_peek(entry, tokens[peeked_id], True):
                    entry = production_vocabulary.get_conflict_entry(entry, tokens[peeked_id])
                    peeked_id += 1
                    if peeked_id not in peeked_compact_dict:
                        peeked_compact_dict[peeked_id] = \
                            production_vocabulary.get_conflict_matched_terminal_node(entry)
                peeked_max_id = max(peeked_max_id, peeked_id)
            for i, right_id in reversed(list(enumerate(get_node_right_id(node)))):
                if production_vocabulary.is_token(right_id):
                    stack.append(right_id)
                    string_stack.append(node.right[i])
                else:
                    # print("{} with id {} is not a token".format(node.right[i], right_id))
                    pass

    terminal_mask = []
    for i, token in enumerate(to_parse_token_id):
        if i in peeked_compact_dict:
            # print("peek", peeked_compact_dict[i])
            terminal_mask.append(generate_mask_fn(peeked_compact_dict[i]))
        else:
            # print("terminal", get_matched_terminal_index(token))
            terminal_mask.append(generate_mask_fn(get_matched_terminal_index(token)))

    return {
        "to_parse_token": to_parse_token_id,
        "terminal_mask": terminal_mask,
    }
def setUp(self):
    self._mask_list = generate_mask([1, (4, 8), 9], 10)
    self._list = [0, 1, 0, 0, 1, 1, 1, 1, 1, 1]
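# A minimal reference sketch (not the project's actual generate_mask) of the
# index-spec semantics the test fixture above relies on: plain ints select single
# positions and (start, end) tuples select inclusive ranges. The function name
# `expand_index_spec` is an assumption used only for illustration.
def expand_index_spec(spec, size):
    mask = [0] * size
    for item in spec:
        if isinstance(item, tuple):
            start, end = item
            for i in range(start, end + 1):  # the range end is inclusive
                mask[i] = 1
        else:
            mask[item] = 1
    return mask

assert expand_index_spec([1, (4, 8), 9], 10) == [0, 1, 0, 0, 1, 1, 1, 1, 1, 1]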
def __call__(self, sample):
    # One one-hot style mask (only position t set) per token id in the sample.
    return [generate_mask([t], self._size) for t in sample]
def __call__(self, sample):
    """
    :param sample: a dict {"tree": a list of nodes, "target": a list of token ids}
    :return: a dict of lists {'terminal_mask', 'target', 'has_identifier'}
    """
    # print()
    # consistent_identifier = sample["consistent_identifier"]
    # consistent_typename = sample["consistent_typename"]
    target = sample['target']
    target_string = [self._token_vocabulary.id_to_word(t) for t in target]
    sample = sample["tree"]
    production_vocabulary = self._production_vocabulary

    get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
    vocabulary_size = production_vocabulary.token_num()
    generate_mask_fn = generate_mask(size=vocabulary_size)
    get_node_right_id = lambda x: x.right_id

    stack = [production_vocabulary.EMPTY_id, sample[0].left_id]
    string_stack = ["EMPTY", sample[0].left]
    to_parse_token_id = [sample[0].left_id]
    now_id = 0
    peeked_max_id = -1

    # Drop leaf tokens whose type is not a grammar token.
    sample = list(
        filter(
            lambda x: not (isinstance(x, LeafToken)
                           and not production_vocabulary.is_token(x.type_id)),
            sample))

    tokens = []
    for node in sample:
        if isinstance(node, LeafToken) and production_vocabulary.is_token(node.type_id):
            tokens.append(node.type_id)

    peeked_compact_dict = {}
    for node in sample:
        # print(node)
        type_id = stack.pop()
        type_string = string_stack.pop()
        # print("The stack popped token is:{}, string:{}".format(type_id, type_string))
        if isinstance(node, LeafToken):
            # print("Terminal token:{}".format(node.value))
            if production_vocabulary.is_token(node.type_id):
                now_id += 1
                to_parse_token_id.append(stack[-1])
        else:
            assert type_id == node.left_id, \
                "type string is {}, now left is {}".format(type_string, node.left)
            # print(node.left)
            if now_id < len(tokens) and production_vocabulary.need_peek(type_id, tokens[now_id]):
                # print("need peek")
                level = 1
                entry = production_vocabulary.get_parse_entry(type_id, tokens[now_id])
                # print("entry is:{}".format(entry))
                peeked_id = now_id + level
                if peeked_id not in peeked_compact_dict:
                    # print("token {} need peek after token {} saw".format(target_string[peeked_id], target_string[now_id]))
                    peeked_compact_dict[peeked_id] = \
                        production_vocabulary.get_conflict_matched_terminal_node(entry)
                    # print("token {} in peeked_dict? {}".format(target_string[peeked_id], tokens[peeked_id] in peeked_compact_dict[peeked_id]))
                while production_vocabulary.need_peek(entry, tokens[peeked_id], True):
                    entry = production_vocabulary.get_conflict_entry(entry, tokens[peeked_id])
                    # print("now entry:{}".format(entry))
                    peeked_id += 1
                    # print("token {} need peek after token {} saw".format(target_string[peeked_id], target_string[now_id]))
                    if peeked_id not in peeked_compact_dict:
                        peeked_compact_dict[peeked_id] = \
                            production_vocabulary.get_conflict_matched_terminal_node(entry)
                peeked_max_id = max(peeked_max_id, peeked_id)
            for i, right_id in reversed(list(enumerate(get_node_right_id(node)))):
                if production_vocabulary.is_token(right_id):
                    stack.append(right_id)
                    string_stack.append(node.right[i])
                else:
                    # print("{} with id {} is not a token".format(node.right[i], right_id))
                    pass

    # For each position, use the conflict (peeked) terminal set if the parse had
    # to look ahead there, otherwise the plain matched terminal set.
    terminal_mask_index = []
    for (i, token), t in zip(enumerate(to_parse_token_id), target):
        if i in peeked_compact_dict:
            # print("peek", peeked_compact_dict[i])
            # print("target {} use peek".format(self._token_vocabulary.id_to_word(t)))
            terminal_mask_index.append(peeked_compact_dict[i])
        else:
            # print("terminal", get_matched_terminal_index(token))
            # print("target {} use get matched".format(self._token_vocabulary.id_to_word(t)))
            terminal_mask_index.append(get_matched_terminal_index(token))

    terminal_mask = [
        self._generate_terminal_mask(index) for index in terminal_mask_index
    ]
    from toolz.sandbox import unzip
    terminal_mask, has_identifier = unzip(terminal_mask)
    terminal_mask = list(terminal_mask)
    for t in terminal_mask:
        assert len(t) == self._keyword_num
    has_identifier = list(has_identifier)

    # Sanity check: every target token must be permitted by the mask at its position.
    prev_tokens = []
    for t, mask, index, h_i in zip(target, terminal_mask, terminal_mask_index,
                                   has_identifier):
        if t < self._keyword_num:
            if mask[t] != 0:
                # print("The code before: {}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in prev_tokens])))
                # print("all code:{}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in target])))
                msg = "target {} not in the mask".format(
                    self._token_vocabulary.id_to_word(t))
                raise ValueError(msg)
        elif h_i != 0:
            # print("The code before: {}".format(
            #     " ".join([self._token_vocabulary.id_to_word(to) for to in prev_tokens])))
            # print("all code:{}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in target])))
            msg = "target {} not in the mask".format(
                self._token_vocabulary.id_to_word(t))
            raise ValueError(msg)
        else:
            prev_tokens.append(t)

    return {
        "terminal_mask": terminal_mask,
        "target": target,
        "has_identifier": has_identifier
    }
def _g_map(self, sample: int):
    """Return a prefix mask with the first ``sample`` positions set."""
    if sample > self._size:
        raise ValueError(
            "Range mask out of range: size is {} but sample is {}".format(
                self._size, sample))
    return generate_mask(range(sample), self._size)
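# Hedged illustration of the prefix-mask behaviour above. `prefix_mask` is a
# hypothetical stand-in for generate_mask(range(sample), size), assuming the mask
# behaves like a 0/1 list; it is not part of the original code.
def prefix_mask(sample, size):
    return [1 if i < sample else 0 for i in range(size)]

assert prefix_mask(3, 5) == [1, 1, 1, 0, 0]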