def process_sentences(file_path, pdf_name):
    """Parse a tab-separated sentence file produced by PDFNLT.

    Each data row is expected to look like:
        sent_id <TAB> sect_name <TAB> box_name <TAB> text... <TAB> word_ids
    (the first line is a header and is skipped; everything is lower-cased).

    Returns a tuple ``(sent_list, sent_obj, error_sents)``:
      - sent_list: ordered list of parsed sentence dicts,
      - sent_obj:  mapping of sent_id -> sentence dict,
      - error_sents: rows that were malformed or whose word count disagreed
        with their word-id count.  NOTE: ``error_sents`` is a module-level
        list, so entries accumulate across calls -- TODO confirm intended.
    """
    # Fix: use a context manager so the file handle is always closed
    # (the original leaked the handle from a bare open()).
    with open(file_path, 'r') as fh:
        sent_list_raw = [line.rstrip('\n') for line in fh]
    sent_list_raw.pop(0)  # Remove header column

    sent_list = []
    sent_obj = {}
    for sent in sent_list_raw:
        sent_split = sent.lower().split("\t")
        # Malformed row: fewer tab-separated fields than the format requires.
        if len(sent_split) < 4:
            error_sents.append(sent_split)
            continue
        sent_id = sent_split.pop(0)
        sect_name = sent_split.pop(0)
        box_name = sent_split.pop(0)
        word_ids = sent_split.pop().split(",")
        # word_split_pattern is a module-level regex pattern; drop the empty /
        # whitespace fragments re.split leaves behind.
        word_array_spaces = re.split(word_split_pattern, " ".join(sent_split))
        word_array = [x for x in word_array_spaces if x not in [" ", "\t", ""]]
        sent_obj_obj = {
            'sent_id': sent_id,
            'sect_name': sect_name,
            'box_name': box_name,
            'text': " ".join(sent_split),
            'word_array': word_array,
            'word_ids': word_ids
        }
        # Filter out & ignore incorrectly split sentences by PDFNLT
        # (incorrect word displayed in XHTML).
        if len(word_array) != len(word_ids):
            error_sents.append(sent_obj_obj)
        else:
            # Add extra metadata to word-array and add sentence to list and
            # object.  (Comprehension replaces the manual append loop; lengths
            # are equal here, so zip pairs every word with its id.)
            sent_obj_obj['word_array_info'] = [
                {'text': word, 'word_id': word_id}
                for word, word_id in zip(word_array, word_ids)
            ]
            sent_list.append(sent_obj_obj)
            sent_obj[sent_id] = sent_obj_obj

    # Add some sentence split processing stats.
    statistics.init()
    statistics.log_stat(f'{pdf_name.lower()}: # sentences incorrectly split by PDFNLT: {len(error_sents)}/{len(sent_list)}')
    statistics.log_stat(f'{pdf_name.lower()}: # entities rejected because entity.number_words > max_entity_words ({max_entity_words}): {number_entities_rejected}')
    return sent_list, sent_obj, error_sents
# Mirror the per-run CLI switches into the shared config dict.  Driving the
# copies from a single name tuple keeps the option list in one place.
for _opt in ('init_uninit_vars',
             'redundant_test',
             'verbose',
             'build_before_instr',
             'instr_printf',
             'mute_build_message',
             'mute_test_message',
             'mute_warning',
             'localize_only',
             'invalid_localization'):
    config[_opt] = getattr(args, _opt)

# In verbose mode, echo every effective option for the run log.
if args.verbose:
    for key, value in config.items():
        logger.info('option {} = {}'.format(key, value))

statistics.init(working_dir)

# An explicit ignore-lines request overrides any line selection.
if args.ignore_lines:
    args.lines = None

tool = Angelix(working_dir,
               src=args.src,
               buggy=args.buggy,
               oracle=abspath(args.oracle),
               tests=args.tests,
               golden=args.golden,
               asserts=asserts,
               lines=args.lines,
               build=args.build,
               configure=args.configure,
               config=config)
# Mirror this variant's CLI switches into the shared config dict; a single
# name tuple keeps the option list in one place (insertion order preserved,
# so the verbose dump below prints in the same order as before).
for _opt in ('build_before_instr',
             'mute_build_message',
             'mute_test_message',
             'mute_warning',
             'build_validation_only',
             'build_golden_only',
             'build_backend_only',
             'localize_only',
             'invalid_localization',
             'term_when_syn_crashes'):
    config[_opt] = getattr(args, _opt)

# In verbose mode, echo every effective option for the run log.
if args.verbose:
    for key, value in config.items():
        logger.info('option {} = {}'.format(key, value))

statistics.init(working_dir)

# An explicit ignore-lines request overrides any line selection.
if args.ignore_lines:
    args.lines = None

tool = Angelix(working_dir,
               src=args.src,
               buggy=args.buggy,
               oracle=abspath(args.oracle),
               tests=args.tests,
               golden=args.golden,
               asserts=asserts,
               lines=args.lines,
               build=args.build,
               configure=args.configure,
               config=config)
# NOTE(review): this fragment begins mid-`if` -- the condition selecting
# verbose subprocess output sits above the visible region -- and ends on a
# dangling `if code is not None:` whose body is below it.  Kept verbatim.
subproc_output = sys.stderr
else:
    # Non-verbose path: discard subprocess output entirely.
    subproc_output = subprocess.DEVNULL

# Optionally seed the working directory with previously collected fuzz results.
if args.fuzz_results_dir is not None:
    copy(args.fuzz_results_dir, join(poracle_workdir, 'fuzz-results'))

# Load the JSON experiment description; optional keys fall back to defaults.
with open(args.config_file, "r") as read_file:
    data = json.load(read_file)
target = data['target'] if 'target' in data else None
delta_allowed = float(
    data['delta_allowed']) if 'delta_allowed' in data else 0
# Write the normalized (float) tolerance back into the config payload.
data['delta_allowed'] = delta_allowed
threadName = data['threadName'] if 'threadName' in data else 'main'

statistics.init(poracle_workdir, args.config_file, poracle_config, data)

try:
    poracle = Poracle(poracle_config, poracle_workdir, deltas_dir,
                      data['project'], data['bug_id'], data['ID'], target,
                      delta_allowed, data['correctness'], threadName)
    # NOTE(review): if the Poracle constructor above raises, `code` is never
    # bound and the trailing `if code is not None:` will hit a NameError --
    # consider moving `code = None` before the try.
    code = None
    code = poracle()
except PoracleException as e:
    logger.error(e.get_msg())

# Record total wall-clock time for the run in the persisted statistics.
end_time = time.time()
elapsed_time = end_time - start_time
statistics.data['total_time'] = elapsed_time
statistics.save()

if code is not None: