def save_c_submit_code(data_df_list):
    create_table(COMPILE_SUCCESS_DATA_DBPATH, C_COMPILE_SUCCESS_RECORDS)

    result_list = [
        data_df['gcc_compile_result'].map(lambda x: 1 if x else 0)
        for data_df in data_df_list
    ]
    count_list = [len(data_df) for data_df in data_df_list]
    # Sum per-dataframe success counts explicitly; np.sum over a list of
    # Series can misbehave when the dataframes have different lengths.
    success_res = sum(int(res.sum()) for res in result_list)
    count_res = sum(count_list)
    print('compile success total: {}, total records: {}'.format(success_res, count_res))

    def trans(error_df, reverse_verdict, reverse_langdict):
        res = [
            transform_data(row, reverse_verdict, reverse_langdict)
            for index, row in error_df.iterrows()
        ]
        return res

    reverse_verdict = reverse_dict(verdict)
    reverse_langdict = reverse_dict(langdict)

    data_items_list = [
        trans(data_df, reverse_verdict, reverse_langdict)
        for data_df in data_df_list
    ]
    for data_items in data_items_list:
        insert_items(COMPILE_SUCCESS_DATA_DBPATH, C_COMPILE_SUCCESS_RECORDS,
                     data_items)
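reverse_dict is not defined in this listing; a minimal sketch, assuming verdict and langdict are plain one-to-one name-to-id dictionaries:

def reverse_dict(d):
    # Invert a one-to-one mapping, e.g. {'OK': 1} -> {1: 'OK'}.
    return {value: key for key, value in d.items()}
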
def resave_database_main(to_db_path, to_table_name, params_string: dict = {}, params_number: dict = {}):
    # Build the optional WHERE clause from the string-valued and
    # number-valued filters.
    params_s = add_params(params_string, need_quot=True)
    params_n = add_params(params_number, need_quot=False)

    if params_s != '' and params_n != '':
        params = params_s + ' and ' + params_n
    else:
        params = params_s if params_s != '' else params_n
    params = ' where ' + params if params != '' else params

    conn = sqlite3.connect(scrapyOJ_path)
    print('start read sql')
    df = pd.read_sql('select * from {}{}'.format('submit', params), conn)
    print('total df length: {}'.format(len(df)))
    df = df[df['code'].map(lambda x: x != '')]
    print('no empty df length: {}'.format(len(df)))
    df_dict = df.to_dict(orient='list')
    del df

    print('finish filter')
    create_table(to_db_path, to_table_name)
    header_list = ['id', 'submit_url', 'submit_time', 'user_id', 'user_name', 'problem_id', 'problem_url', 'problem_name', 'problem_full_name', 'language', 'status', 'error_test_id', 'time', 'memory', 'code']
    total_list = [df_dict[key] for key in header_list]
    total_list = list(zip(*total_list))
    print('start save')
    insert_items(to_db_path, to_table_name, total_list)
    print('end save')
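A possible invocation of resave_database_main; the database path, table name, and filter values below are placeholders, not taken from the original project:

if __name__ == '__main__':
    # Re-save only accepted GNU C11 submissions, assuming add_params turns
    # each key/value pair into a SQL condition joined with ' and '.
    resave_database_main('/tmp/c_submissions.db', 'c_submit_records',
                         params_string={'language': 'GNU C11', 'status': 'OK'})
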
Example #3
def save_train_data(error_df_list, ac_df_list, db_path, table_name, transform_fn):
    create_table(db_path, table_name)

    def trans(error_df):
        res = [transform_fn(row) for index, row in error_df.iterrows()]
        return res

    error_items_list = [trans(error_df) for error_df in error_df_list]
    for error_items in error_items_list:
        insert_items(db_path, table_name, error_items)
Example #4
def filter_program_id_main(db_path, table_name, new_table_name):
    df = read_experiment_result_df(db_path, table_name)
    grouped = df.groupby('id')
    print('group length: ', len(grouped))
    save_list = []
    for name, group in grouped:
        one = select_best_records(group)
        save_list += [one]

    print('save list length: ', len(save_list))
    create_table(db_path, DATA_RECORDS_DEEPFIX, replace_table_name=new_table_name)
    run_sql_statment(db_path, DATA_RECORDS_DEEPFIX, 'insert_ignore', save_list, replace_table_name=new_table_name)
def preprocess():
    # initLogging()
    preprocess_logger.info("Start Read Code Data")
    code_df = read_distinct_problem_user_ac_c_records_filter_error_code()
    preprocess_logger.info("Code Data Read Finish. Total: {}".format(
        code_df.shape[0]))
    que_read = mp.Queue()
    que_write = mp.Queue()

    create_table(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                 table_name=current_table_name)
    pros = []
    for i in range(6):
        pro = mp.Process(target=make_fake_code, args=(que_read, que_write, i))
        pro.start()
        pros.append(pro)
    save_pro = mp.Process(target=save_fake_code,
                          args=(que_write, code_df.shape[0]))
    save_pro.start()

    count = 0
    ids = []
    items = []
    for index, row in code_df.iterrows():
        count += 1

        item = {'try_count': 0}
        item['id'] = row['id']
        item['submit_url'] = row['submit_url']
        item['problem_id'] = row['problem_id']
        item['user_id'] = row['user_id']
        item['problem_user_id'] = row['problem_user_id']
        item['originalcode'] = row['code'].replace('\ufeff',
                                                   '').replace('\u3000', ' ')
        items.append(item)

        ids.append(item['problem_user_id'])

        if len(ids) == 10000:
            push_code_to_queue(que_read, ids, items)
            preprocess_logger.info('Total Preprocess {}'.format(count))
            ids = []
            items = []

    push_code_to_queue(que_read, ids, items)
    preprocess_logger.info('Total Preprocess {}'.format(count))

    for p in pros:
        p.join()
    save_pro.join()
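push_code_to_queue is not shown in this listing; a minimal sketch, assuming it simply hands the buffered chunk to the make_fake_code worker processes (the real helper may split or reorder the chunk):

def push_code_to_queue(que, ids, items):
    # Enqueue the current chunk of (problem_user_ids, item dicts) so a worker
    # can generate fake compile-error code for each item.
    que.put((ids, items))
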
def save_fake_code(que: mp.Queue, all_data_count):
    create_table(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                 table_name=current_table_name)
    preprocess_logger.info(
        'Start Save Fake Code Process. all data count: {}'.format(
            all_data_count))
    count = 0
    error_count = 0
    param = []
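    # Consumer loop: drain the write queue until all_data_count items have
    # been received, buffering rows and flushing them to the database in
    # batches of roughly 1000.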
    while True:
        if not que.empty() and count < all_data_count:
            try:
                preprocess_logger.info('before get item: {}'.format(count))
                item = que.get()
                preprocess_logger.info('after get item: {}'.format(count))
            except TypeError:
                preprocess_logger.info('Save process got a TypeError while reading from the queue')
                error_count += 1
                continue
            count += 1
            if count % 1000 == 0:
                preprocess_logger.info(
                    'Total receive records: {}'.format(count))
            if not item:
                continue
            param.append(item)
            preprocess_logger.info(
                'received count: {}. buffered count: {}, Wait item: {}, Que is Empty: {}'
                .format(count, len(param), que.qsize(), que.empty()))
            if len(param) > 1000:
                preprocess_logger.info(
                    'Save {} records. Total records: {}. error count: {}. Wait item: {}'
                    .format(len(param), count, error_count, que.qsize()))
                insert_items(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                             table_name=current_table_name,
                             params=dict_to_list(param))
                param = []
        elif que.empty() and count >= all_data_count:
            break
        elif que.qsize() <= 0:
            time.sleep(1)
    preprocess_logger.info(
        'Save {} records. Total records: {}. error count: {}. Wait item: {}'.
        format(len(param), count, error_count, que.qsize()))
    insert_items(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                 table_name=current_table_name,
                 params=dict_to_list(param))
    preprocess_logger.info('End Save Fake Code Process')
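dict_to_list is also not shown here; a minimal sketch, assuming it flattens the buffered item dicts into value lists in a fixed column order (the column names below are illustrative, not the real table definition):

def dict_to_list(items, keys=('id', 'submit_url', 'problem_id', 'user_id',
                              'problem_user_id', 'originalcode')):
    # insert_items is assumed to expect one value list per row, ordered to
    # match the table's columns.
    return [[item.get(k) for k in keys] for item in items]
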
def __init__(self,
             vocabulary,
             db_path,
             table_name,
             replace_table_name,
             ignore_token=None,
             end_id=None):
    self.vocabulary = vocabulary
    self.db_path = db_path
    self.table_name = table_name
    self.replace_table_name = replace_table_name
    create_table(self.db_path, self.table_name, self.replace_table_name)
    self.ignore_token = ignore_token
    self.end_id = end_id
    self.total_count = 0
def resave_python_code_main():
    conn = sqlite3.connect(scrapyOJ_path)
    print('start read sql')
    df = pd.read_sql('select * from {} where language="Python 3"'.format('submit'), conn)
    print('total df length: {}'.format(len(df)))
    df = df[df['code'].map(lambda x: x != '')]
    print('no empty df length: {}'.format(len(df)))
    df_dict = df.to_dict(orient='list')
    del df

    print('finish filter')
    create_table(python_db_path, PYTHON_SUBMIT_TABLE)
    header_list = ['id', 'submit_url', 'submit_time', 'user_id', 'user_name', 'problem_id', 'problem_url', 'problem_name', 'problem_full_name', 'language', 'status', 'error_test_id', 'time', 'memory', 'code']
    total_list = [df_dict[key] for key in header_list]
    total_list = list(zip(*total_list))
    print('start save')
    insert_items(python_db_path, PYTHON_SUBMIT_TABLE, total_list)
    print('end save')
def preprocess():
    # initLogging()
    preprocess_logger.info("Start Read Code Data")
    code_df = read_deepfix_ac_data()
    preprocess_logger.info("Code Data Read Finish. Total: {}".format(
        code_df.shape[0]))
    que_read = mp.Queue()
    que_write = mp.Queue()

    create_table(db_full_path=db_name, table_name=current_table_name)
    pros = []
    for i in range(6):
        pro = mp.Process(target=make_fake_code, args=(que_read, que_write, i))
        pro.start()
        pros.append(pro)
    save_pro = mp.Process(target=save_fake_code,
                          args=(que_write, code_df.shape[0]))
    save_pro.start()

    count = 0
    ids = []
    items = []
    for index, row in code_df.iterrows():
        count += 1
        # item = create_codeforce_item(row)
        item = create_deepfix_item(row)
        items.append(item)

        ids.append(item['problem_user_id'])

        if len(ids) == 10000:
            push_code_to_queue(que_read, ids, items)
            preprocess_logger.info('Total Preprocess {}'.format(count))
            ids = []
            items = []

    push_code_to_queue(que_read, ids, items)
    preprocess_logger.info('Total Preprocess {}'.format(count))

    for p in pros:
        p.join()
    save_pro.join()
Example #10
def sample_and_save(model,
                    dataset,
                    batch_size,
                    loss_function,
                    parse_input_batch_data_fn,
                    parse_target_batch_data_fn,
                    do_sample=False,
                    print_output=False,
                    create_output_ids_fn=None,
                    evaluate_obj_list=[],
                    expand_output_and_target_fn=None,
                    add_data_record_fn=None,
                    db_path='',
                    table_name=''):
    # total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    saved_count = 0
    steps = 1
    for o in evaluate_obj_list:
        o.clear_result()
    model.eval()

    total_saved_list = []

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset,
                                          batch_size=batch_size,
                                          drop_last=True):
                model.zero_grad()

                # model_input = parse_input_batch_data(batch_data)
                model_input = parse_input_batch_data_fn(batch_data,
                                                        do_sample=do_sample)
                # model_output = model.forward(*model_input, test=do_sample)
                if do_sample:
                    model_output = model.forward(*model_input, do_sample=True)

                    model_target = parse_target_batch_data_fn(batch_data)

                    model_output, model_target = expand_output_and_target_fn(
                        model_output, model_target)
                else:
                    model_output = model.forward(*model_input)
                    model_target = parse_target_batch_data_fn(batch_data)

                # loss = loss_function(*model_output, *model_target)

                output_ids = create_output_ids_fn(model_output, model_input)
                # total_loss += loss.data
                total_batch += batch_size

                # step_output = 'in evaluate step {}  loss: {}, '.format(steps, loss.data.item())
                step_output = 'in evaluate step {} '.format(steps)
                for evaluator in evaluate_obj_list:
                    res = evaluator.add_result(output_ids,
                                               model_output,
                                               model_target,
                                               model_input,
                                               batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                saved_list = add_data_record_fn(output_ids, model_output,
                                                batch_data)
                total_saved_list += saved_list

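                # Periodically flush the accumulated records so a long
                # sampling run does not keep everything in memory.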
                if steps % 100 == 0:
                    create_table(db_path, table_name)
                    insert_items(db_path, table_name, total_saved_list)
                    saved_count += len(total_saved_list)
                    print('saved {} records in total {}. '.format(
                        saved_count, total_batch.item()))
                    total_saved_list = []

                if print_output and steps % 100 == 0:
                    pass
                    # output_ids = output_ids.tolist()
                    # target_ids = batch_data['ac_tokens']
                    # is_copy = (is_copy > 0.5).tolist()
                    # target_is_copy = target_is_copy.tolist()
                    # value_output = torch.squeeze(torch.topk(F.softmax(value_output, dim=-1), k=1, dim=-1)[1], dim=-1)
                    # value_output = value_output.tolist()
                    # target_ac_tokens = target_ac_tokens.tolist()
                    # pointer_output = torch.squeeze(torch.topk(F.softmax(pointer_output, dim=-1), k=1, dim=-1)[1], dim=-1)
                    # pointer_output = pointer_output.tolist()
                    # target_pointer_output = target_pointer_output.tolist()
                    # target_length = torch.sum(output_mask, dim=-1)
                    # target_length = target_length.tolist()
                    # for out, tar, cop, tar_cop, val, tar_val, poi, tar_poi, tar_len in zip(output_ids, target_ids, is_copy,
                    #                                                               target_is_copy, value_output,
                    #                                                               target_ac_tokens,
                    #                                                               pointer_output,
                    #                                                               target_pointer_output, target_length):
                    # # for out, tar,  in zip(output_ids, target_ids):
                    #     out_code, end_pos = convert_one_token_ids_to_code(out, id_to_word_fn=vocab.id_to_word, start=start_id,
                    #                                          end=end_id, unk=unk_id)
                    #     tar_code, tar_end_pos = convert_one_token_ids_to_code(tar[1:], id_to_word_fn=vocab.id_to_word, start=start_id,
                    #                                          end=end_id, unk=unk_id)
                    #     info('-------------- step {} ------------------------'.format(steps))
                    #     info('output: {}'.format(out_code))
                    #     info('target: {}'.format(tar_code))
                    #     cop = [str(c) for c in cop]
                    #     tar_cop = [str(int(c)) for c in tar_cop]
                    #     poi = [str(c) for c in poi]
                    #     tar_poi = [str(c) for c in tar_poi]
                    #     info('copy output: {}'.format(' '.join(cop[:tar_len])))
                    #     info('copy target: {}'.format(' '.join(tar_cop[:tar_len])))
                    #     info('pointer output: {}'.format(' '.join(poi[:tar_len])))
                    #     info('pointer target: {}'.format(' '.join(tar_poi[:tar_len])))
                    #
                    #     value_list = []
                    #     target_list = []
                    #     for c, v, t in zip(tar_cop, val, tar_val):
                    #         if c == '1':
                    #             value_list += ['<COPY>']
                    #             target_list += ['<COPY>']
                    #         else:
                    #             value_list += [vocab.id_to_word(int(v))]
                    #             target_list += [vocab.id_to_word(int(t))]
                    #     info('value output: {}'.format(' '.join(value_list[:tar_len])))
                    #     info('value target: {}'.format(' '.join(target_list[:tar_len])))

                steps += 1
                pbar.update(batch_size)

    create_table(db_path, table_name)
    insert_items(db_path, table_name, total_saved_list)
    saved_count += len(total_saved_list)
    print('saved {} records in total {}. '.format(saved_count,
                                                  total_batch.item()))

    return evaluate_obj_list
Example #11
def multi_step_evaluate(model,
                        dataset,
                        batch_size,
                        parse_input_batch_data_fn,
                        parse_target_batch_data_fn,
                        do_sample=False,
                        print_output=False,
                        create_output_ids_fn=None,
                        evaluate_obj_list=[],
                        expand_output_and_target_fn=None,
                        max_step_times=0,
                        vocabulary=None,
                        file_path='',
                        create_multi_step_next_input_batch_fn=None,
                        extract_includes_fn=lambda x: x['includes'],
                        print_output_fn=None,
                        do_beam_search=False,
                        target_file_path='main.out',
                        log_file_path='main.log',
                        do_save_data=False,
                        max_save_distance=None,
                        save_records_to_database=False,
                        db_path='',
                        table_name='',
                        change_output_records_to_batch_fn=None,
                        create_save_database_records_fn=None,
                        error_stop_type='normal'):
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:
        o.clear_result()

    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    save_data_dict = {}
    save_records_list = []

    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset,
                                          batch_size=batch_size,
                                          drop_last=False):
                model.zero_grad()

                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]
                error_count_list = batch_data['error_count']

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data,
                                                            do_sample=True)

                    model_output = model.forward(*model_input,
                                                 do_sample=True,
                                                 do_beam_search=do_beam_search)

                    input_data, final_output, output_records, final_output_name_list, continue_list = create_multi_step_next_input_batch_fn(
                        input_data, model_input, model_output, continue_list,
                        do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    continue_list, result_list, cur_error_count_list = compile_code_ids_list(
                        final_output_name_list,
                        continue_list,
                        result_list,
                        vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data),
                        file_path=file_path,
                        target_file_path=target_file_path,
                        log_file_path=log_file_path,
                        do_compile_pool=True,
                        need_transform=False)

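                    # 'oracle' stopping rejects a step whose repaired code
                    # produces more compile errors than before; 'normal'
                    # stopping never rejects a step.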
                    if error_stop_type == 'oracle':
                        reject_list = [
                            True if c and n > o else False
                            for c, o, n in zip(continue_list, error_count_list,
                                               cur_error_count_list)
                        ]
                    elif error_stop_type == 'normal':
                        reject_list = [False for _ in range(batch_size)]
                    else:
                        raise ValueError(
                            'unknown error_stop_type: {}'.format(error_stop_type))
                    error_count_list = [
                        n if n < o and n >= 0 else o
                        for o, n in zip(error_count_list, cur_error_count_list)
                    ]
                    for i_f, rej in enumerate(reject_list):
                        if rej:
                            # use last output
                            final_output_name_list[i_f] = input_data[
                                'last_input_seq_name'][i_f]
                            continue_list[i_f] = False

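                    # Record the step at which each sample stopped: samples
                    # that finished normally stop at step i + 1, while rejected
                    # samples roll back to the previous step i.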
                    sample_steps = [
                        i + 1 if s == -1 and not c and not r else s for s, c, r
                        in zip(sample_steps, continue_list, reject_list)
                    ]
                    sample_steps = [
                        i if s == -1 and not c and r else s for s, c, r in zip(
                            sample_steps, continue_list, reject_list)
                    ]

                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break
                sample_steps = [
                    max_step_times if s == -1 else s for s in sample_steps
                ]

                if do_save_data:
                    batch_data['input_seq_name'] = batch_data[
                        'final_output_name']
                    save_res_dict = save_addition_data(
                        original_states=batch_data,
                        states=input_data,
                        tokenize_fn=tokenize_fn,
                        batch_size=batch_size,
                        file_path=file_path,
                        target_file_path=target_file_path,
                        vocabulary=vocabulary,
                        max_distande=max_save_distance,
                        only_error=True)
                    for k, v in save_res_dict.items():
                        save_data_dict[k] = save_data_dict.get(k, []) + v

                if save_records_to_database:
                    batch_output_records = change_output_records_to_batch_fn(
                        output_records_list, sample_steps)
                    records_list = create_save_database_records_fn(
                        batch_data, sample_steps, final_output_name_list,
                        result_list, batch_output_records, input_data)
                    save_records_list += records_list

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # custom evaluator interface
                    res = evaluator.add_result(result_list,
                                               batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list,
                                    final_output=final_output_list,
                                    batch_data=batch_data,
                                    step_i=steps,
                                    vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)
    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list

    if save_records_to_database:
        create_table(db_path,
                     DATA_RECORDS_DEEPFIX,
                     replace_table_name=table_name)
        run_sql_statment(db_path,
                         DATA_RECORDS_DEEPFIX,
                         'insert_ignore',
                         save_records_list,
                         replace_table_name=table_name)

    if steps == 0:
        t_loss = 0
    else:
        t_loss = (total_loss / steps).item()
    return evaluate_obj_list, t_loss, save_data_dict