def read_monitored_parsed_c99_slk_top_down_code(debug=False):
    """Parse every dataset split with the monitored C99 SLK top-down parser.

    Each returned DataFrame gains seven columns with the per-row parse
    results. With ``debug=True`` only the first 100 rows of each split
    are parsed.
    """
    result_columns = (
        'parse_tree',
        'tokens',
        'consistent_identifier',
        'identifier_scope_index',
        'is_identifier',
        'max_scope_list',
        'consistent_typename',
    )

    def parse_df(df):
        identifier_set, type_set = extract_fake_c_header_identifier()
        clex = BufferedCLex(
            error_func=lambda self, msg, line, column: None,
            on_lbrace_func=lambda: None,
            on_rbrace_func=lambda: None,
            type_lookup_func=lambda typ: None,
        )
        clex.build()
        BEGIN, END, UNK = ["<BEGIN>", "<END>", "<UNK>"]
        from embedding.wordembedding import load_vocabulary
        vocabulary = load_vocabulary(
            get_token_vocabulary,
            get_vocabulary_id_map_with_keyword,
            [BEGIN],
            [END],
            UNK,
        )
        print("the size of predefined_identifer:{}".format(len(identifier_set)))
        print("the size of typeset:{}".format(len(type_set)))
        parse_fn = monitored_slk_parse(
            clex=clex,
            predefined_identifer=identifier_set,
            predefined_typename=type_set,
            vocabulary=vocabulary,
        )
        # A failed parse yields seven Nones so unzip stays rectangular.
        parsed = unzip(
            show_process_map(parse_fn, df['code'],
                             error_default_value=tuple([None, ] * 7))
        )
        for column, values in zip(result_columns, parsed):
            df[column] = list(values)
        return df

    splits = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    if debug:
        return [parse_df(df.head(100)) for df in splits]
    return [parse_df(df) for df in splits]
def read_parsed_tree_code(debug=False):
    """Parse every dataset split with ``MonitoredParser``.

    Adds 'parse_tree', 'ast' and 'tokens' columns to each split's
    DataFrame. With ``debug=True`` only the first 100 rows of each
    split are parsed.
    """
    def parse_df(df):
        monitor = MonitoredParser()
        # Failed rows map to (None, None, None) so unzip stays rectangular.
        parsed = unzip(
            show_process_map(
                monitor.parse_get_production_list_and_token_list,
                df['code'],
                error_default_value=(None, None, None),
            )
        )
        for column, values in zip(('parse_tree', 'ast', 'tokens'), parsed):
            df[column] = list(values)
        return df

    splits = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    if debug:
        return [parse_df(df.head(100)) for df in splits]
    return [parse_df(df) for df in splits]
def read_antlr_parse_records_train_set():
    """Parse the training split with the ANTLR pipeline and drop failures.

    Rows whose 'tokens' column is None (parse failures) are removed.
    """
    train_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[0]
    parsed = process_df_multiple(train_df)
    print('finish multiple process')
    return parsed[parsed['tokens'].map(lambda t: t is not None)]
def read_parsed_c99_slk_top_down_code(debug=False):
    """Parse every dataset split with the plain C99 SLK top-down parser.

    Adds 'parse_tree' and 'tokens' columns to each split's DataFrame.
    With ``debug=True`` only the first 100 rows of each split are parsed.
    """
    def parse_df(df):
        clex = BufferedCLex(
            error_func=lambda self, msg, line, column: None,
            on_lbrace_func=lambda: None,
            on_rbrace_func=lambda: None,
            type_lookup_func=lambda typ: None,
        )
        clex.build()
        parse_fn = c99_slk_parse(clex=clex)
        # Failed rows map to (None, None) so unzip stays rectangular.
        parsed = unzip(
            show_process_map(parse_fn, df['code'],
                             error_default_value=(None, None))
        )
        for column, values in zip(('parse_tree', 'tokens'), parsed):
            df[column] = list(values)
        return df

    splits = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    if debug:
        return [parse_df(df.head(100)) for df in splits]
    return [parse_df(df) for df in splits]
def read_parser_train_dfa():
    """Warm up the ANTLR DFA cache on the training split and return it.

    Parsing every training-set program via ``collect_dfa_do_parse``
    populates the shared decision DFAs; a fresh parser built afterwards
    exposes them through ``decisionsToDFA``.
    """
    train_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[0]
    train_df['tokens'] = train_df['code'].apply(
        collect_dfa_do_parse, total=len(train_df))
    tmp_code = '''int main(){ return 0; }'''
    _, _, _, parser = create_monitor_parser(tmp_code)
    return parser.decisionsToDFA
def read_antlr_parse_records_dataset():
    """Parse all splits (train/valid/test) with the ANTLR pipeline.

    Rows whose 'tokens' column is None (parse failures) are dropped
    from every split. Prints the resulting split sizes and returns the
    list of filtered DataFrames.
    """
    datasets = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    # Fix: the original accumulated a `total` row count that was never
    # used anywhere — dead code removed.
    datasets = [process_df_multiple(df) for df in datasets]
    datasets = [
        df[df['tokens'].map(lambda x: x is not None)] for df in datasets
    ]
    print('train: {}, valid: {}, test: {}'.format(
        len(datasets[0]), len(datasets[1]), len(datasets[2])))
    return datasets
def read_antlr_parse_train_records_part(i):
    """Parse one 10000-row slice of the training split, in parallel.

    Args:
        i: zero-based index of the 10000-row slice to process.

    Returns:
        A single DataFrame with the parse records for that slice,
        row indices reset by the concatenation.
    """
    train_df, _, _ = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    size = 10000
    part_df = train_df.iloc[i * size:(i + 1) * size]
    print('len df: ', len(part_df))
    # 1000-row chunks fan out across 10 parallel workers.
    chunks = split_df_to_part(part_df, 1000)
    for chunk in chunks:
        print('len parallel df: ', len(chunk))
    res = list(parallel_map(10, read_antlr_parse_records_df, chunks))
    # Fix: the original loop clobbered the parameter `i` with its loop
    # index and re-appended one frame at a time (quadratic copying);
    # a single append of the remaining frames is equivalent and O(n).
    merged = res[0]
    if len(res) > 1:
        merged = merged.append(res[1:], ignore_index=True)
    return merged
def read_antlr_parse_records_valid_set():
    """Parse the validation split with the ANTLR pipeline and drop failures.

    Rows whose 'tokens' column is None (parse failures) are removed.
    """
    valid_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[1]
    parsed = process_df_multiple(valid_df)
    return parsed[parsed['tokens'].map(lambda t: t is not None)]
def read_filtered_without_include_code_tokens():
    """Tokenize every split of the filtered (no-#include) C99 dataset."""
    splits = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    return list(map(parse_c99_code_to_token, splits))