Python parse_pdf Examples, pdf_parser.parse_pdf Python Examples

Example #1

0

Show file

File: relation.py Project: brandeis-llc/dtra-covid-citationgraph

def load_domain():
    """Return the parsed form of every paper supplied by the arXiv feeder.

    Each paper yielded by ``pdf_feeder.ArxivFeeder().feed()`` is handed to
    ``pdf_parser.parse_pdf``; results that come back as ``None`` are left
    out of the returned list.
    """
    feeder = pdf_feeder.ArxivFeeder()
    # Lazy generator: each paper is parsed as it is pulled from the feed.
    candidates = (pdf_parser.parse_pdf(item) for item in feeder.feed())
    return [document for document in candidates if document is not None]

Example #2

0

Show file

File: relation.py Project: brandeis-llc/dtra-covid-citationgraph

def testload_codomain():
    """Parse the ``pdf`` files fed from ``./test`` and return the results.

    Papers for which ``pdf_parser.parse_pdf`` returns ``None`` are skipped.
    """
    feeder = pdf_feeder.LocalFeeder('./test', 'pdf')
    results = []
    for candidate in feeder.feed():
        document = pdf_parser.parse_pdf(candidate)
        if document is None:
            # Nothing usable came back for this paper; move on.
            continue
        results.append(document)
    return results

Example #3

0

Show file

File: relation.py Project: brandeis-llc/dtra-covid-citationgraph

def testload_domain():
    """Parse the ``pdf`` files fed from the DTRA papers directory.

    Returns the list of ``pdf_parser.parse_pdf`` results, omitting every
    result that is ``None``.
    """
    papers_dir = '/data/dtriac/dtra-covid/Papers'
    feeder = pdf_feeder.LocalFeeder(papers_dir, 'pdf')
    # Parse everything in feed order first, then drop the misses.
    outcomes = [pdf_parser.parse_pdf(pdf) for pdf in feeder.feed()]
    return [outcome for outcome in outcomes if outcome is not None]

Example #4

0

Show file

def main():
    """Parse the PDFs under ``PDF_FILES`` and save training annotations.

    Reads ``annotations`` and ``csv_file_path`` from ``main_args()``. Any
    CSV already present at ``csv_file_path`` is deleted first. The PDFs
    matched by ``PDF_FILES/*.pdf`` are then visited in ``glob`` order: the
    first ones (up to 90% of the number of regular files in ``PDF_FILES``)
    are flagged as training files, the remainder as test files. A PDF is
    only parsed when its ``<file_id>_tex_files`` path exists under
    ``TEX_FILES``. Detected objects of training files are appended to the
    CSV through ``generate_csv_annotations``. When a CSV exists at the end,
    ``obtain_txt_train_images_file`` is called on it.
    """
    args = main_args()
    is_annotation = args.annotations
    csv_file_path = args.csv_file_path
    # NOTE(review): this counts every regular file in PDF_FILES, not only
    # the '*.pdf' ones iterated below -- confirm the directory holds PDFs
    # only, otherwise the 90/10 split is skewed.
    num_files = len([
        f for f in os.listdir(PDF_FILES)
        if os.path.isfile(os.path.join(PDF_FILES, f))
    ])
    num_train_images = num_files / 100 * 90
    # Only training files advance this counter (see the branch below).
    files_processed = 0
    if os.path.exists(csv_file_path):
        os.remove(csv_file_path)
    for pdf_file_path in glob.glob(os.path.join(PDF_FILES, '*.pdf')):
        if files_processed < num_train_images:
            is_train = True
            files_processed += 1
        else:
            is_train = False
        file_id = str(os.path.basename(pdf_file_path).split('.pdf')[0])
        tex_file_path = TEX_FILES + file_id + '_tex_files'
        if not os.path.exists(tex_file_path):
            # No TeX sources for this paper: nothing to parse.
            continue

        print('\nParsing ' + pdf_file_path + '...')
        try:
            detected_objects = parse_pdf(pdf_file_path, tex_file_path,
                                         is_annotation, is_train, True)
        except Exception as err:
            # Best effort: one broken paper must not abort the whole run,
            # but Ctrl-C / SystemExit are no longer swallowed and the cause
            # of the failure is reported.
            print('Parsing failed: ' + repr(err))
            detected_objects = []
            # Remove the annotated images left behind for this paper.
            annotated_dir = PNG_FILES_TRAIN + file_id + '_annotated_images'
            if os.path.exists(annotated_dir):
                shutil.rmtree(annotated_dir)

        if detected_objects and is_train:
            print('Save annotations...')
            generate_csv_annotations(csv_file_path, file_id,
                                     detected_objects)
        elif not is_train:
            print('Added test images.')
        elif not detected_objects:
            print('ERROR IN PARSING FILES, GO ON.')

        print('Num paper processed: ', files_processed)

    if os.path.exists(csv_file_path):
        txt_path = 'frcnn/annotated_train_images.txt'
        obtain_txt_train_images_file(csv_file_path, txt_path,
                                     '../png_files/train_images/')

Example #5

0

Show file

def parse_csvs():
    """Download the grade PDFs and write one CSV file next to each of them.

    One ``(year_semester, college)`` tuple is built for every combination
    of ``years()``, the three semesters and ``colleges()``; each tuple is
    passed as the single argument of ``download_pdf`` in a process pool.
    After the pool has finished, every ``.pdf`` found in
    ``PDF_DOWNLOAD_DIR`` is parsed with ``pdf_parser.parse_pdf`` and its
    rows are written to a file with the same path but a ``.csv`` suffix.
    """
    yrs = years()
    print(yrs)
    college_abbrevs = colleges()
    arguments = [
        (str(year) + semester, college)
        for year in yrs
        for semester in [SPRING, SUMMER, FALL]
        for college in college_abbrevs
    ]

    # Leaving the ``with`` block waits for every submitted download, so the
    # directory listing below no longer races with unfinished downloads.
    # The results are deliberately not consumed: a failed download stays
    # best-effort instead of aborting the run.
    with ProcessPoolExecutor() as executor:
        executor.map(download_pdf, arguments)

    header = [
        "A",
        "B",
        "C",
        "D",
        "F",
        "I",
        "S",
        "U",
        "Q",
        "X",
        "DEPT",
        "COURSE_NUM",
        "SECTION_NUM",
        "INSTRUCTOR_NAME",
    ]
    for pdf_path in get_files_in_dir(PDF_DOWNLOAD_DIR, ".pdf"):
        if not pdf_path:
            continue
        distribution_fields = pdf_parser.parse_pdf(pdf_path)
        # newline="" is required by the csv module; without it extra blank
        # rows appear on platforms that translate line endings.
        with open(pdf_path.replace(".pdf", ".csv"), "w",
                  newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            for dist in distribution_fields:
                (
                    grades,
                    (dept, course_num, section_num, instructor_name),
                ) = dist
                writer.writerow([
                    *grades, dept, course_num, section_num, instructor_name
                ])

Example #6

0

Show file

File: data_loader.py Project: kylecombes/ting-analytics

 def read_bill_summary_pdf(self, filename, template_dir):
     """Return the result of ``parse_pdf(filename, template_dir)``.

     Thin delegating wrapper: both arguments are forwarded unchanged and
     no instance state is read.
     """
     return parse_pdf(filename, template_dir)

Example #7

0

Show file

File: test_images_annotator.py Project: pisalore/FRCNN_teX-annotator

# Input location of the TeX sources and output files of this script.
tex_files = './tex_files/'
csv_file_path = 'test_images_annotations.csv'
txt_path = 'frcnn/annotated_test_images.txt'
# One sub-folder per paper, named '<file_id>_annotated_images'.
list_subfolders = [f.path for f in os.scandir(test_imgs_path) if f.is_dir()]
file_processed = 0
# Mode 'w' starts from an empty error log on every run, and the ``with``
# block closes the file even if the loop raises.
with open('./frcnn/parse_error_test_files.txt', 'w') as errors:
    for subdir in list_subfolders:
        file_id = os.path.basename(subdir).split('_annotated_images')[0]
        print(file_id)
        pdf_file_path = pdf_files + file_id + '.pdf'
        tex_file_path = tex_files + file_id + '_tex_files'
        try:
            detected_objects = parse_pdf(pdf_file_path, tex_file_path, 'no',
                                         True, False)
        except Exception:
            # Handled by the ``else`` branch below, so a failure is logged
            # exactly once instead of twice.
            detected_objects = []

        file_processed += 1
        if detected_objects:
            generate_csv_annotations(csv_file_path, file_id, detected_objects)
        else:
            errors.write('Error parsing ' + str(file_id) + '\n')
            print('Error in processing ' + file_id)
        print(file_processed)
obtain_txt_train_images_file(csv_file_path, txt_path,