def load_domain():
    """Return the parsed form of every paper fed by ``pdf_feeder.ArxivFeeder``.

    Each item yielded by the feeder's ``feed()`` is passed to
    ``pdf_parser.parse_pdf``; results that are ``None`` are dropped and the
    remaining ones are returned as a list, in feed order.
    """
    feeder = pdf_feeder.ArxivFeeder()
    # Lazy generator: papers are still parsed one at a time, in feed order.
    parse_results = (pdf_parser.parse_pdf(item) for item in feeder.feed())
    return [result for result in parse_results if result is not None]
def testload_codomain():
    """Return the parsed form of every paper fed by a ``LocalFeeder`` on ``./test``.

    The feeder is built as ``LocalFeeder('./test', 'pdf')`` (presumably a
    directory and a file extension -- confirm against ``pdf_feeder``). Each
    fed item goes through ``pdf_parser.parse_pdf``; ``None`` results are
    skipped and the rest are returned as a list, in feed order.
    """
    feeder = pdf_feeder.LocalFeeder('./test', 'pdf')
    # Lazy generator: papers are still parsed one at a time, in feed order.
    parse_results = (pdf_parser.parse_pdf(item) for item in feeder.feed())
    return [result for result in parse_results if result is not None]
def testload_domain():
    """Return the parsed form of every paper fed by a ``LocalFeeder`` on the DTRA papers path.

    The feeder is built as
    ``LocalFeeder('/data/dtriac/dtra-covid/Papers', 'pdf')`` (presumably a
    directory and a file extension -- confirm against ``pdf_feeder``). Each
    fed item goes through ``pdf_parser.parse_pdf``; ``None`` results are
    skipped and the rest are returned as a list, in feed order.
    """
    feeder = pdf_feeder.LocalFeeder('/data/dtriac/dtra-covid/Papers', 'pdf')
    # Lazy generator: papers are still parsed one at a time, in feed order.
    parse_results = (pdf_parser.parse_pdf(item) for item in feeder.feed())
    return [result for result in parse_results if result is not None]
def main():
    """Parse the PDFs under ``PDF_FILES`` and write annotations for the training split.

    Command-line options come from ``main_args()``: ``annotations`` is
    forwarded to ``parse_pdf`` and ``csv_file_path`` is the annotations CSV,
    which is deleted first if it already exists.

    The first 90% of the matched ``*.pdf`` files (in ``glob`` order) are
    flagged as training files, the rest as test files. A PDF is only parsed
    when its ``<file_id>_tex_files`` path exists under ``TEX_FILES``. Objects
    detected in training files are passed to ``generate_csv_annotations``.
    When the CSV exists at the end, ``obtain_txt_train_images_file`` is called
    to produce ``frcnn/annotated_train_images.txt``.
    """
    args = main_args()
    is_annotation = args.annotations
    csv_file_path = args.csv_file_path

    # Base the 90% threshold on the PDFs that are actually iterated below, so
    # non-PDF files sitting in the directory cannot inflate the training split.
    pdf_file_paths = glob.glob(os.path.join(PDF_FILES, '*.pdf'))
    num_train_images = len(pdf_file_paths) / 100 * 90
    files_processed = 0

    # Drop the CSV left over from any earlier run before writing new rows.
    if os.path.exists(csv_file_path):
        os.remove(csv_file_path)

    for pdf_file_path in pdf_file_paths:
        # files_processed only advances for training files, so once the
        # threshold is reached every remaining PDF is treated as a test file.
        if files_processed < num_train_images:
            is_train = True
            files_processed += 1
        else:
            is_train = False

        file_id = str(os.path.basename(pdf_file_path).split('.pdf')[0])
        tex_file_path = TEX_FILES + file_id + '_tex_files'
        if not os.path.exists(tex_file_path):
            continue

        print('\nParsing ' + pdf_file_path + '...')
        try:
            detected_objects = parse_pdf(pdf_file_path, tex_file_path,
                                         is_annotation, is_train, True)
        except Exception as exc:
            # Best effort: one bad paper must not abort the run. Catching
            # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
            print('Parse failed for ' + pdf_file_path + ': ' + repr(exc))
            detected_objects = []
            # Remove the annotated-images directory of the failed paper.
            annotated_dir = PNG_FILES_TRAIN + file_id + '_annotated_images'
            if os.path.exists(annotated_dir):
                shutil.rmtree(annotated_dir)

        if detected_objects and is_train:
            print('Save annotations...')
            generate_csv_annotations(csv_file_path, file_id, detected_objects)
        elif not is_train:
            print('Added test images.')
        elif not detected_objects:
            print('ERROR IN PARSING FILES, GO ON.')
        print('Num paper processed: ', files_processed)

    if os.path.exists(csv_file_path):
        txt_path = 'frcnn/annotated_train_images.txt'
        obtain_txt_train_images_file(csv_file_path, txt_path,
                                     '../png_files/train_images/')
def parse_csvs():
    """Download one PDF per (year+semester, college) pair and write a CSV per PDF.

    ``years()`` and ``colleges()`` supply the values to combine with the
    ``SPRING``/``SUMMER``/``FALL`` constants. Every ``(year_semester,
    college)`` tuple is handed to ``download_pdf`` through a process pool.
    Each ``.pdf`` path that ``get_files_in_dir`` reports for
    ``PDF_DOWNLOAD_DIR`` is then parsed with ``pdf_parser.parse_pdf`` and
    written next to the PDF as a ``.csv`` file: a header row followed by one
    row per parsed entry.
    """
    import os  # local import: only needed for the suffix swap below

    yrs = years()
    print(yrs)
    college_abbrevs = colleges()

    arguments = [
        (str(year) + semester, college)
        for year in yrs
        for semester in [SPRING, SUMMER, FALL]
        for college in college_abbrevs
    ]

    # Leaving the ``with`` block waits for all downloads to finish.
    # NOTE(review): the iterator returned by map() is never consumed, so an
    # exception raised inside download_pdf is not surfaced here.
    with ProcessPoolExecutor() as executor:
        executor.map(download_pdf, arguments)

    pdf_paths = get_files_in_dir(PDF_DOWNLOAD_DIR, ".pdf")
    for pdf_path in pdf_paths:
        if not pdf_path:
            continue

        distribution_fields = pdf_parser.parse_pdf(pdf_path)

        # Swap only the file extension; str.replace would also rewrite any
        # ".pdf" that appears earlier in the path.
        csv_path = os.path.splitext(pdf_path)[0] + ".csv"

        # newline="" is required by the csv module; without it the writer
        # emits blank lines between rows on platforms that translate "\n".
        with open(csv_path, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                "A",
                "B",
                "C",
                "D",
                "F",
                "I",
                "S",
                "U",
                "Q",
                "X",
                "DEPT",
                "COURSE_NUM",
                "SECTION_NUM",
                "INSTRUCTOR_NAME",
            ])
            for dist in distribution_fields:
                # Each entry is (grades, (dept, course_num, section_num,
                # instructor_name)); grades is spread across the leading columns.
                (
                    grades,
                    (dept, course_num, section_num, instructor_name),
                ) = dist
                writer.writerow([
                    *grades, dept, course_num, section_num, instructor_name
                ])
def read_bill_summary_pdf(self, filename, template_dir):
    """Delegate to ``parse_pdf`` and return its result unchanged.

    ``filename`` and ``template_dir`` are forwarded positionally, in that
    order; no instance state is read.
    """
    parsed_summary = parse_pdf(filename, template_dir)
    return parsed_summary
tex_files = './tex_files/' csv_file_path = 'test_images_annotations.csv' txt_path = 'frcnn/annotated_test_images.txt' list_subfolders = [f.path for f in os.scandir(test_imgs_path) if f.is_dir()] file_processed = 0 if os.path.exists('./frcnn/parse_error_test_files.txt'): os.remove('./frcnn/parse_error_test_files.txt') errors = open('./frcnn/parse_error_test_files.txt', 'a+') for subdir in list_subfolders: file_id = os.path.basename(subdir).split('_annotated_images')[0] print(file_id) pdf_file_path = pdf_files + os.path.basename(subdir).split( '_annotated_images')[0] + '.pdf' tex_file_path = tex_files + file_id + '_tex_files' try: detected_objects = parse_pdf(pdf_file_path, tex_file_path, 'no', True, False) except: detected_objects = [] errors.write('Error parsing ' + str(file_id) + '\n') print('Error in processing ' + file_id) file_processed += 1 if detected_objects: generate_csv_annotations(csv_file_path, file_id, detected_objects) else: errors.write('Error parsing ' + str(file_id) + '\n') print('Error in processing ' + file_id) print(file_processed) errors.close() obtain_txt_train_images_file(csv_file_path, txt_path,