def load_example_code_for_(stack_size):
    """Load the example code, parse it with the monitored SLK parser and
    wrap the result in a CCodeDataSet.

    :param stack_size: forwarded to get_transform and CCodeDataSet
    :return: (sample dataset, keyword_num, vocabulary, the 'code' column)
    """
    code = read_code_from_file()
    identifier_set, type_set = extract_fake_c_header_identifier()
    # Lexer with all callbacks stubbed out; only tokenization is needed here.
    lexer = BufferedCLex(error_func=lambda self, msg, line, column: None,
                         on_lbrace_func=lambda: None,
                         on_rbrace_func=lambda: None,
                         type_lookup_func=lambda typ: None)
    lexer.build()
    vocabulary = load_vocabulary(get_token_vocabulary,
                                 get_vocabulary_id_map_with_keyword,
                                 [BEGIN], [END], UNK)
    print("the size of predefined_identifer:{}".format(len(identifier_set)))
    print("the size of typeset:{}".format(len(type_set)))
    parse_fn = monitored_slk_parse(clex=lexer,
                                   predefined_identifer=identifier_set,
                                   predefined_typename=type_set,
                                   vocabulary=vocabulary)
    # The single snippet is duplicated so the DataFrame has two rows.
    code_df = pd.DataFrame({"code": [code, code]})
    parsed = unzip(show_process_map(parse_fn, code_df['code'],
                                    error_default_value=(None,) * 7))
    # One output column per component of the 7-tuple the parser yields.
    result_columns = ("parse_tree", "tokens", "consistent_identifier",
                      "identifier_scope_index", "is_identifier",
                      "max_scope_list", "consistent_typename")
    for column, values in zip(result_columns, parsed):
        code_df[column] = list(values)
    keyword_num, vocabulary, transforms_fn = get_transform(stack_size)
    sample = CCodeDataSet(code_df, vocabulary, stack_size, transforms_fn)
    return sample, keyword_num, vocabulary, code_df['code']
 def parse_df(df):
     """Parse every program in df['code'] with the monitored SLK parser
     and attach the seven parse results as new DataFrame columns.

     :param df: DataFrame with a 'code' column of C source strings
     :return: the same DataFrame, with parse-result columns added
     """
     identifier_set, type_set = extract_fake_c_header_identifier()
     # Lexer with all callbacks stubbed out; only tokenization matters here.
     lexer = BufferedCLex(error_func=lambda self, msg, line, column: None,
                          on_lbrace_func=lambda: None,
                          on_rbrace_func=lambda: None,
                          type_lookup_func=lambda typ: None)
     lexer.build()
     BEGIN, END, UNK = ["<BEGIN>", "<END>", "<UNK>"]
     from embedding.wordembedding import load_vocabulary
     vocabulary = load_vocabulary(get_token_vocabulary,
                                  get_vocabulary_id_map_with_keyword,
                                  [BEGIN], [END], UNK)
     print("the size of predefined_identifer:{}".format(len(identifier_set)))
     print("the size of typeset:{}".format(len(type_set)))
     parse_fn = monitored_slk_parse(clex=lexer,
                                    predefined_identifer=identifier_set,
                                    predefined_typename=type_set,
                                    vocabulary=vocabulary)
     parsed = unzip(show_process_map(parse_fn, df['code'],
                                     error_default_value=(None,) * 7))
     # One output column per component of the 7-tuple the parser yields.
     result_columns = ("parse_tree", "tokens", "consistent_identifier",
                       "identifier_scope_index", "is_identifier",
                       "max_scope_list", "consistent_typename")
     for column, values in zip(result_columns, parsed):
         df[column] = list(values)
     return df
 def parse_df(df):
     """Parse df['code'] with a MonitoredParser and store the production
     list, AST and token list in three new columns.

     :param df: DataFrame with a 'code' column
     :return: the same DataFrame with 'parse_tree', 'ast', 'tokens' added
     """
     monitor = MonitoredParser()
     results = unzip(show_process_map(
         monitor.parse_get_production_list_and_token_list, df['code'],
         error_default_value=(None, None, None)))
     for column, values in zip(("parse_tree", "ast", "tokens"), results):
         df[column] = list(values)
     return df
# Esempio n. 4 (example-separator artifact from the scraped source)
# 0
    def __init__(self,
                 data_df: pd.DataFrame,
                 vocabulary: Vocabulary,
                 transform=None):
        """Keep only rows whose token list exists and is shorter than
        MAX_LENGTH, then materialize (and optionally transform) samples.

        :param data_df: DataFrame with a 'tokens' column
        :param vocabulary: token vocabulary used by the samples
        :param transform: optional callable mapped over every raw sample
        """
        kept = data_df[data_df['tokens'].map(lambda toks: toks is not None)]
        kept = kept[kept['tokens'].map(lambda toks: len(toks) < MAX_LENGTH)]
        self.data_df = kept
        self.transform = transform
        self.vocabulary = vocabulary

        self._samples = [self._get_raw_sample(index)
                         for index in range(len(self.data_df))]
        if self.transform:
            self._samples = show_process_map(self.transform, self._samples)
 def parse_df(df):
     """Run the C99 SLK parser over df['code'] and attach the resulting
     parse trees and token lists as columns.

     :param df: DataFrame with a 'code' column of C source strings
     :return: the same DataFrame with 'parse_tree' and 'tokens' added
     """
     # Lexer with all callbacks stubbed out; only tokenization matters here.
     lexer = BufferedCLex(error_func=lambda self, msg, line, column: None,
                          on_lbrace_func=lambda: None,
                          on_rbrace_func=lambda: None,
                          type_lookup_func=lambda typ: None)
     lexer.build()
     parse_fn = c99_slk_parse(clex=lexer)
     trees, token_lists = unzip(show_process_map(
         parse_fn, df['code'], error_default_value=(None, None)))
     df['parse_tree'] = list(trees)
     df['tokens'] = list(token_lists)
     return df
# Esempio n. 6 (example-separator artifact from the scraped source)
# 0
    def __init__(self,
                 data_df: pd.DataFrame,
                 vocabulary: Vocabulary,
                 transform=None):
        """Filter out rows lacking tokens or with token lists at or above
        MAX_LENGTH, report the longest surviving token list, then build
        (and optionally transform and None-filter) the samples.

        :param data_df: DataFrame with a 'tokens' column
        :param vocabulary: token vocabulary used by the samples
        :param transform: optional callable mapped over every raw sample;
            samples transformed to None are dropped
        """
        kept = data_df[data_df['tokens'].map(lambda toks: toks is not None)]
        kept = kept[kept['tokens'].map(lambda toks: len(toks) < MAX_LENGTH)]
        self.data_df = kept
        print("The max length:{}".format(
            max(self.data_df['tokens'].map(lambda toks: len(toks)))))
        self.transform = transform
        self.vocabulary = vocabulary

        self._samples = [self._get_raw_sample(index)
                         for index in range(len(self.data_df))]
        if self.transform:
            transformed = show_process_map(self.transform, self._samples)
            self._samples = [s for s in transformed if s is not None]
# Esempio n. 7 (example-separator artifact from the scraped source)
# 0
    def __init__(self,
                 data_df: pd.DataFrame,
                 vocabulary: Vocabulary,
                 set_type: str,
                 transformer_vocab_slk=None,
                 no_filter=False,
                 do_flatten=False,
                 MAX_LENGTH=500,
                 use_ast=False,
                 do_multi_step_sample=False,
                 id_to_program_dict=None,
                 no_id_to_program_dict=False):
        """Dataset over parsed records, each wrapped in a
        FlattenRandomIterateRecords view, plus an id -> position index.

        :param data_df: source DataFrame (must contain an 'id' column);
            when None, only the scalar attributes are initialized
        :param vocabulary: token vocabulary kept on the instance
        :param set_type: dataset split label kept on the instance
        :param transformer_vocab_slk: stored as self.transformer
        :param no_filter: skip filter_df when True
        :param do_flatten: forwarded to FlattenRandomIterateRecords
        :param MAX_LENGTH: stored as self.max_length
        :param use_ast: stored flag
        :param do_multi_step_sample: also used as the 'only_first' flag
        :param id_to_program_dict: unused here (kept for caller compat)
        :param no_id_to_program_dict: unused here (kept for caller compat)
        """
        self.set_type = set_type
        self.vocabulary = vocabulary
        self.transformer = transformer_vocab_slk
        self.is_flatten = do_flatten
        self.max_length = MAX_LENGTH
        self.use_ast = use_ast
        # Literal False: the transform step at the end can never run.
        self.transform = False
        self.do_multi_step_sample = do_multi_step_sample
        if data_df is None:
            return
        self.data_df = data_df if no_filter else self.filter_df(data_df)

        self.only_first = do_multi_step_sample
        from experiment.experiment_dataset import FlattenRandomIterateRecords
        self._samples = [
            FlattenRandomIterateRecords(record,
                                        is_flatten=do_flatten,
                                        only_first=do_multi_step_sample)
            for _, record in self.data_df.iterrows()
        ]
        # Map each program id to its positional index within data_df.
        self.program_to_position_dict = {
            record['id']: position
            for position, (_, record) in enumerate(self.data_df.iterrows())
        }

        if self.transform:
            self._samples = show_process_map(self.transform, self._samples)
# Esempio n. 8 (example-separator artifact from the scraped source)
# 0
    def __init__(self,
                 data_df: pd.DataFrame,
                 vocabulary: Vocabulary,
                 set_type: str,
                 transform=None):
        """Keep rows whose 'error_code_word_id' is present and shorter
        than MAX_LENGTH, then build (and optionally transform) samples.

        :param data_df: DataFrame with an 'error_code_word_id' column
        :param vocabulary: token vocabulary kept on the instance
        :param set_type: dataset split label kept on the instance
        :param transform: optional callable mapped over every raw sample
        """
        kept = data_df[data_df['error_code_word_id'].map(
            lambda ids: ids is not None)]
        kept = kept[kept['error_code_word_id'].map(
            lambda ids: len(ids) < MAX_LENGTH)]
        self.data_df = kept
        self.set_type = set_type
        self.transform = transform
        self.vocabulary = vocabulary

        self._samples = [self._get_raw_sample(index)
                         for index in range(len(self.data_df))]
        if self.transform:
            self._samples = show_process_map(self.transform, self._samples)
# Esempio n. 9 (example-separator artifact from the scraped source)
# 0
 def __init__(self,
              data_df: pd.DataFrame,
              vocabulary: Vocabulary,
              set_type: str,
              transformer_vocab_slk=None,
              no_filter=False,
              do_flatten=False,
              MAX_LENGTH=500,
              use_ast=False,
              do_multi_step_sample=False,
              only_smaple=False):
      """Dataset that optionally filters data_df, always drops p2-out
      rows, and keeps each remaining DataFrame row as one sample.

      :param data_df: source DataFrame; when None, only scalar
          attributes are initialized
      :param vocabulary: token vocabulary kept on the instance
      :param set_type: dataset split label kept on the instance
      :param transformer_vocab_slk: stored as self.transformer
      :param no_filter: skip filter_df when True
      :param do_flatten: stored as self.is_flatten
      :param MAX_LENGTH: stored as self.max_length
      :param use_ast: stored flag
      :param do_multi_step_sample: stored flag
      :param only_smaple: stored as self.only_sample (the misspelled
          parameter name is kept for caller compatibility)
      """
      self.set_type = set_type
      self.vocabulary = vocabulary
      self.transformer = transformer_vocab_slk
      self.is_flatten = do_flatten
      self.max_length = MAX_LENGTH
      self.use_ast = use_ast
      # Literal False: the transform step at the end can never run.
      self.transform = False
      self.only_sample = only_smaple
      self.do_multi_step_sample = do_multi_step_sample
      if data_df is None:
          return
      self.data_df = data_df if no_filter else self.filter_df(data_df)
      print("before filter p2 out, dataset size is:{}".format(
          len(self.data_df)))
      self.data_df = self._filter_p2_out(self.data_df)
      print("after filter p2 out, dataset size is:{}".format(
          len(self.data_df)))
      self._samples = [record for _, record in self.data_df.iterrows()]
      if self.transform:
          self._samples = show_process_map(self.transform, self._samples)
    def __init__(self,
                 data_df: pd.DataFrame,
                 vocabulary: Vocabulary,
                 name:str,
                 do_sample: bool,
                 no_filter=False,
                 MAX_LENGTH=500,):
        """Hold the rows of data_df as samples, optionally filtered first.

        :param data_df: source DataFrame; when None, only scalar
            attributes are initialized
        :param vocabulary: token vocabulary kept on the instance
        :param name: dataset name kept on the instance
        :param do_sample: stored flag
        :param no_filter: skip filter_df when True
        :param MAX_LENGTH: stored as self.max_length
        """
        self.do_sample = do_sample
        self.name = name
        self.vocabulary = vocabulary
        self.max_length = MAX_LENGTH
        # Literal False: the transform step at the end can never run.
        self.transform = False
        if data_df is None:
            return
        self.data_df = data_df if no_filter else self.filter_df(data_df)

        self._samples = [record for _, record in self.data_df.iterrows()]

        if self.transform:
            self._samples = show_process_map(self.transform, self._samples)
# Esempio n. 11 (example-separator artifact from the scraped source)
# 0
 def __init__(self, data_list, transform=None):
     """Store data_list; when a transform is given, map it over the data
     and drop every entry that transformed to None.

     :param data_list: the raw samples to hold
     :param transform: optional callable applied to each sample
     """
     self._data = data_list
     if transform:
         mapped = show_process_map(transform, self._data)
         self._data = [item for item in mapped if item is not None]
# Esempio n. 12 (example-separator artifact from the scraped source)
# 0
 def parse_df(df):
     """Rewrite the 'parse_tree' column into top-down process order and
     drop the no-longer-needed 'ast' column.

     :param df: DataFrame with 'parse_tree' and 'ast' columns
     :return: the same DataFrame, modified in place
     """
     df['parse_tree'] = show_process_map(parse_tree_to_top_down_process,
                                         df['parse_tree'])
     del df['ast']
     return df