def load_example_code_for_(stack_size):
    """Build a tiny example ``CCodeDataSet`` from one code snippet read from file.

    Parses the snippet with the monitored SLK parser (duplicated into a
    two-row dataframe), attaches all seven parse-result columns, and wraps
    the dataframe in a dataset using the transforms for *stack_size*.

    Returns (dataset, keyword_num, vocabulary, code_series).
    """
    code = read_code_from_file()
    identifier_set, type_set = extract_fake_c_header_identifier()
    # Lexer callbacks are no-ops: we only need tokenization, not diagnostics.
    clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                        on_lbrace_func=lambda: None,
                        on_rbrace_func=lambda: None,
                        type_lookup_func=lambda typ: None)
    clex.build()
    vocabulary = load_vocabulary(get_token_vocabulary,
                                 get_vocabulary_id_map_with_keyword,
                                 [BEGIN], [END], UNK)
    print("the size of predefined_identifer:{}".format(len(identifier_set)))
    print("the size of typeset:{}".format(len(type_set)))
    parse_fn = monitored_slk_parse(clex=clex,
                                   predefined_identifer=identifier_set,
                                   predefined_typename=type_set,
                                   vocabulary=vocabulary)
    code_df = pd.DataFrame({"code": [code, code]})
    # Failed parses yield a 7-tuple of Nones so the unzip below stays aligned.
    parsed = unzip(show_process_map(parse_fn, code_df['code'],
                                    error_default_value=(None,) * 7))
    result_columns = ('parse_tree', 'tokens', 'consistent_identifier',
                      'identifier_scope_index', 'is_identifier',
                      'max_scope_list', 'consistent_typename')
    for position, column in enumerate(result_columns):
        code_df[column] = list(parsed[position])
    keyword_num, vocabulary, transforms_fn = get_transform(stack_size)
    sample = CCodeDataSet(code_df, vocabulary, stack_size, transforms_fn)
    return sample, keyword_num, vocabulary, code_df['code']
def parse_df(df):
    """Parse every program in ``df['code']`` with the monitored SLK parser.

    Adds seven result columns (parse tree, tokens, and identifier/typename
    bookkeeping) to *df*; rows that fail to parse receive None in each
    column. Returns the mutated dataframe.
    """
    identifier_set, type_set = extract_fake_c_header_identifier()
    # Lexer callbacks are no-ops: only tokenization is needed here.
    clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                        on_lbrace_func=lambda: None,
                        on_rbrace_func=lambda: None,
                        type_lookup_func=lambda typ: None)
    clex.build()
    BEGIN, END, UNK = "<BEGIN>", "<END>", "<UNK>"
    from embedding.wordembedding import load_vocabulary
    vocabulary = load_vocabulary(get_token_vocabulary,
                                 get_vocabulary_id_map_with_keyword,
                                 [BEGIN], [END], UNK)
    print("the size of predefined_identifer:{}".format(len(identifier_set)))
    print("the size of typeset:{}".format(len(type_set)))
    parse_fn = monitored_slk_parse(clex=clex,
                                   predefined_identifer=identifier_set,
                                   predefined_typename=type_set,
                                   vocabulary=vocabulary)
    # Failed parses yield a 7-tuple of Nones so the unzip stays aligned.
    parsed = unzip(show_process_map(parse_fn, df['code'],
                                    error_default_value=(None,) * 7))
    result_columns = ('parse_tree', 'tokens', 'consistent_identifier',
                      'identifier_scope_index', 'is_identifier',
                      'max_scope_list', 'consistent_typename')
    for position, column in enumerate(result_columns):
        df[column] = list(parsed[position])
    return df
def parse_df(df):
    """Parse ``df['code']`` with a MonitoredParser, attaching tree, AST and tokens.

    Rows that fail to parse get (None, None, None). Returns the mutated
    dataframe.
    """
    monitor = MonitoredParser()
    parsed = unzip(show_process_map(
        monitor.parse_get_production_list_and_token_list,
        df['code'],
        error_default_value=(None, None, None)))
    df['parse_tree'] = list(parsed[0])
    df['ast'] = list(parsed[1])
    df['tokens'] = list(parsed[2])
    return df
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, transform=None):
    """Keep rows whose tokens parsed and are shorter than MAX_LENGTH, then build samples.

    If *transform* is given, it is mapped over the raw samples (with a
    progress display) after construction.
    """
    kept = data_df[data_df['tokens'].map(lambda toks: toks is not None)]
    kept = kept[kept['tokens'].map(lambda toks: len(toks) < MAX_LENGTH)]
    self.data_df = kept
    self.transform = transform
    self.vocabulary = vocabulary
    self._samples = [self._get_raw_sample(idx) for idx in range(len(kept))]
    if self.transform:
        self._samples = show_process_map(self.transform, self._samples)
def parse_df(df):
    """Run the C99 SLK parser over ``df['code']``, storing tree and tokens per row.

    Rows that fail to parse get (None, None). Returns the mutated dataframe.
    """
    # Lexer callbacks are no-ops: only tokenization is needed here.
    clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                        on_lbrace_func=lambda: None,
                        on_rbrace_func=lambda: None,
                        type_lookup_func=lambda typ: None)
    clex.build()
    parse_fn = c99_slk_parse(clex=clex)
    parsed = unzip(show_process_map(parse_fn, df['code'],
                                    error_default_value=(None, None)))
    df['parse_tree'] = list(parsed[0])
    df['tokens'] = list(parsed[1])
    return df
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, transform=None):
    """Keep rows whose tokens parsed and are shorter than MAX_LENGTH, then build samples.

    If *transform* is given, it is mapped over the raw samples (with a
    progress display) and samples it maps to None are dropped.
    """
    # Single pass: drop rows whose tokenization failed (None) or is too long.
    # Short-circuit keeps len() off the None rows, same as the original
    # two-step filter.
    self.data_df = data_df[data_df['tokens'].map(
        lambda x: x is not None and len(x) < MAX_LENGTH)]
    # Guard: max() raises ValueError on an empty sequence, which the original
    # code hit whenever every row was filtered out.
    if len(self.data_df) > 0:
        print("The max length:{}".format(
            max(self.data_df['tokens'].map(lambda x: len(x)))))
    self.transform = transform
    self.vocabulary = vocabulary
    self._samples = [self._get_raw_sample(i) for i in range(len(self.data_df))]
    if self.transform:
        self._samples = list(filter(lambda x: x is not None,
                                    show_process_map(self.transform, self._samples)))
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, set_type: str,
             transformer_vocab_slk=None, no_filter=False, do_flatten=False,
             MAX_LENGTH=500, use_ast=False, do_multi_step_sample=False,
             id_to_program_dict=None, no_id_to_program_dict=False):
    """Dataset of per-program records wrapped as FlattenRandomIterateRecords.

    When *data_df* is given it is optionally filtered via ``self.filter_df``,
    each row is wrapped in a FlattenRandomIterateRecords, and a program-id ->
    position lookup is built.

    ``id_to_program_dict`` and ``no_id_to_program_dict`` are accepted for
    caller compatibility but are not used in this constructor.
    (Removed: a commented-out debug loop over ``self.data_df``.)
    """
    self.set_type = set_type
    self.vocabulary = vocabulary
    self.transformer = transformer_vocab_slk
    self.is_flatten = do_flatten
    self.max_length = MAX_LENGTH
    self.use_ast = use_ast
    # Transform pipeline is hard-disabled for this dataset variant, so the
    # trailing show_process_map never runs; kept for parity with siblings.
    self.transform = False
    self.do_multi_step_sample = do_multi_step_sample
    if data_df is not None:
        if not no_filter:
            self.data_df = self.filter_df(data_df)
        else:
            self.data_df = data_df
        self.only_first = do_multi_step_sample
        from experiment.experiment_dataset import FlattenRandomIterateRecords
        # NOTE(review): source indentation was lost; sample construction is
        # placed inside the data_df guard since it reads self.data_df —
        # confirm against callers that pass data_df=None.
        self._samples = [
            FlattenRandomIterateRecords(row, is_flatten=do_flatten,
                                        only_first=do_multi_step_sample)
            for i, row in self.data_df.iterrows()
        ]
        # Map program id -> positional index within the filtered dataframe.
        self.program_to_position_dict = {
            row['id']: i
            for i, (index, row) in enumerate(self.data_df.iterrows())
        }
    if self.transform:
        self._samples = show_process_map(self.transform, self._samples)
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, set_type: str, transform=None):
    """Keep rows whose 'error_code_word_id' parsed and is shorter than MAX_LENGTH.

    Raw samples are materialized eagerly; if *transform* is given it is
    mapped over them with a progress display.
    """
    usable = data_df[data_df['error_code_word_id'].map(
        lambda ids: ids is not None)]
    usable = usable[usable['error_code_word_id'].map(
        lambda ids: len(ids) < MAX_LENGTH)]
    self.data_df = usable
    self.set_type = set_type
    self.transform = transform
    self.vocabulary = vocabulary
    self._samples = [self._get_raw_sample(idx) for idx in range(len(usable))]
    if self.transform:
        self._samples = show_process_map(self.transform, self._samples)
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, set_type: str,
             transformer_vocab_slk=None, no_filter=False, do_flatten=False,
             MAX_LENGTH=500, use_ast=False, do_multi_step_sample=False,
             only_smaple=False):
    """Dataset over per-program rows, with an additional p2-output filter.

    When *data_df* is given it is optionally filtered via ``self.filter_df``,
    then passed through ``self._filter_p2_out`` (size printed before/after),
    and each surviving row becomes one sample.

    ``only_smaple`` keeps its historical (misspelled) name because callers
    pass it by keyword; it is stored as ``self.only_sample``.
    (Removed: commented-out set_type/do_sample logic and dead debug code.)
    """
    self.set_type = set_type
    self.vocabulary = vocabulary
    self.transformer = transformer_vocab_slk
    self.is_flatten = do_flatten
    self.max_length = MAX_LENGTH
    self.use_ast = use_ast
    # Transform pipeline is hard-disabled for this dataset variant, so the
    # trailing show_process_map never runs; kept for parity with siblings.
    self.transform = False
    self.only_sample = only_smaple
    self.do_multi_step_sample = do_multi_step_sample
    if data_df is not None:
        if not no_filter:
            self.data_df = self.filter_df(data_df)
        else:
            self.data_df = data_df
        print("before filter p2 out, dataset size is:{}".format(
            len(self.data_df)))
        self.data_df = self._filter_p2_out(self.data_df)
        print("after filter p2 out, dataset size is:{}".format(
            len(self.data_df)))
        # NOTE(review): source indentation was lost; sample construction is
        # placed inside the data_df guard since it reads self.data_df —
        # confirm against callers that pass data_df=None.
        self._samples = [row for i, row in self.data_df.iterrows()]
    if self.transform:
        self._samples = show_process_map(self.transform, self._samples)
def __init__(self, data_df: pd.DataFrame, vocabulary: Vocabulary, name: str,
             do_sample: bool, no_filter=False, MAX_LENGTH=500,):
    """Wrap an (optionally filtered) dataframe of programs as a named dataset.

    Each surviving dataframe row becomes one sample.
    """
    self.do_sample = do_sample
    self.name = name
    self.vocabulary = vocabulary
    self.max_length = MAX_LENGTH
    # The transform hook is permanently off here; the trailing branch is
    # kept for structural parity with the sibling dataset classes.
    self.transform = False
    if data_df is not None:
        if no_filter:
            self.data_df = data_df
        else:
            self.data_df = self.filter_df(data_df)
        # NOTE(review): source indentation was lost; samples are built only
        # when a dataframe is supplied since this reads self.data_df.
        self._samples = [record for _, record in self.data_df.iterrows()]
    if self.transform:
        self._samples = show_process_map(self.transform, self._samples)
def __init__(self, data_list, transform=None):
    # Purpose: hold a plain list of samples. When `transform` is supplied it
    # is mapped over the list via show_process_map (progress display), and
    # any element the transform maps to None is dropped.
    self._data = data_list
    if transform:
        self._data = show_process_map(transform, self._data)
        # NOTE(review): the source was collapsed onto one line, so the
        # original indentation of this filter is unrecoverable; it is placed
        # inside the `if transform` branch on the assumption that only the
        # transform produces None entries — confirm against the original.
        self._data = list(filter(lambda x: x is not None, self._data))
def parse_df(df):
    """Replace each parse tree with its top-down production sequence.

    Also removes the now-unneeded 'ast' column. Returns the mutated
    dataframe.
    """
    top_down = show_process_map(parse_tree_to_top_down_process, df['parse_tree'])
    df['parse_tree'] = top_down
    del df['ast']
    return df