Python PDFParserの例、pdf_parser.PDFParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: work_importer.py プロジェクト: kcl-ddh/chopin-online

    def _import_heading(self, heading_filename):
        """Return the text content of the Work's heading PDF.

        The file name is logged at debug level, a new ``PDFParser`` is built
        for it, and the result of ``get_text_content()`` is returned as is.
        """
        self.logger.debug('_import_heading {0}'.format(heading_filename))

        # one throw-away parser per heading file; only its text is needed
        return PDFParser(heading_filename).get_text_content()

コード例 #2

0

ファイルを表示

ファイル: work_importer.py プロジェクト: kingsdigitallab/chopin-django

    def _import_heading(self, heading_filename):
        """Read the heading PDF of a Work and hand back its text.

        Logs the file name at debug level before parsing. The returned value
        is whatever ``PDFParser.get_text_content()`` produces for the file.
        """
        self.logger.debug('_import_heading {0}'.format(heading_filename))

        heading_parser = PDFParser(heading_filename)
        return heading_parser.get_text_content()

コード例 #3

0

ファイルを表示

ファイル: Sample2.py プロジェクト: samuelkaeser/EDPR-Application

def count_words(
        run_once=False,
        txt_directory='/Users/samuelkaeser/Documents/University/Classes/EE_460J/Homework/Lab5/txts'):
    """Feed every file of a directory, line by line, to a ``PDFParser``.

    Args:
        run_once: When true, stop after the first file has been processed.
        txt_directory: Directory whose files are read. The default is the
            path that used to be hard-coded in the body, so existing callers
            keep their behaviour.

    Returns:
        A ``(word_counts, total_words)`` tuple taken from the parser's
        attributes after all lines were passed to ``parser.parse``.
    """
    parser = PDFParser()
    # NOTE: os.listdir returns entries in arbitrary order, so with
    # run_once=True the file that gets processed is not deterministic.
    for name in os.listdir(txt_directory):
        txt_path = os.path.join(txt_directory, name)
        if not os.path.isfile(txt_path):
            # sub-directories cannot be opened as text; skip them instead of
            # aborting the whole count
            continue
        with open(txt_path, 'r') as f:
            for line in f:
                parser.parse(line)
        if run_once:
            break
    return parser.word_counts, parser.total_words

コード例 #4

0

ファイルを表示

ファイル: work_importer.py プロジェクト: kcl-ddh/chopin-online

    def _import_impression(self, work, publishers_page, f_path):
        """Create an Impression from the PDF at ``f_path`` and add it to ``work``.

        Nothing is imported when the parser yields no impression code.
        Otherwise the PDF is stored as a ``Document`` tagged 'impression', an
        ``Impression`` is filled from the parsed data, its copies are imported,
        and it is attached to ``work``. The publisher is looked up by the last
        '-'-separated part of the code and created under ``publishers_page``
        when no match exists.
        """
        self.logger.debug('Parsing {}'.format(f_path))
        pdf = PDFParser(f_path)
        code = pdf.get_impression_code()
        if not code:
            return

        self.logger.debug('Impression: ' + code)

        # keep the source PDF alongside the imported data
        pdf_document = Document(title=code)
        with open(f_path, 'rb') as handle:
            wrapped = File(handle)
            pdf_document.file.save(os.path.basename(f_path), wrapped)
        pdf_document.tags.add('impression')

        new_impression = Impression()
        new_impression.title = code
        new_impression.impression_title = pdf.get_title()
        new_impression.content = pdf.get_text_content()
        new_impression.pdf = pdf_document

        # codes absent from the configured ordering are logged and sorted last
        try:
            position = self._order_of_impressions.index(code.lower())
        except Exception:
            message = u'{0} missing from order of impressions, which consists of: {1}'.format(
                code, ', '.join(self._order_of_impressions))
            self.logger.error(message)
            position = 999
        new_impression.sort_order = position

        new_impression.slug = safe_slugify(new_impression.title, Impression)
        new_impression.comments = pdf.get_comments()
        self._import_copies(new_impression, pdf, code)

        # the publisher code is the last dash-separated segment of the title
        publisher_code = new_impression.title.split('-')[-1]
        matches = Publisher.objects.filter(title=publisher_code)
        publisher = matches.first()
        if not publisher:
            publisher = Publisher(title=publisher_code)
            publisher.slug = slugify(publisher_code)
            publishers_page.add_child(instance=publisher)
        new_impression.publisher = publisher

        work.add_child(instance=new_impression)

コード例 #5

0

ファイルを表示

ファイル: parser.py プロジェクト: boxorange/scientific-literature-mining

def update_uid_list():
    """Run ``update_uid_list()`` on the RSC parser.

    One parser per source is still instantiated, but only the ``RSCParser``
    instance is actually used; the update calls for the other sources are
    disabled and kept below for reference.
    """
    elsevier_parser = ElsevierParser()
    springer_parser = SpringerParser()
    pmc_parser = PMCParser()
    pdf_parser = PDFParser()
    rsc_parser = RSCParser()

    # Disabled updates:
    #   elsevier_parser.update_uid_list()
    #   springer_parser.update_uid_list()
    #   pmc_parser.update_uid_list()
    #   pdf_parser.update_uid_list('APS')
    #   pdf_parser.update_uid_list('ACS')
    #   pdf_parser.update_uid_list('Wiley')
    #   pdf_parser.update_uid_list('IUCr')
    #   pdf_parser.update_uid_list('RSC')
    #   pdf_parser.update_uid_list('IOP_JSON')
    rsc_parser.update_uid_list()

コード例 #6

0

ファイルを表示

ファイル: work_importer.py プロジェクト: kingsdigitallab/chopin-django

    def _import_impression(self, work, publishers_page, f_path):
        """Import one impression PDF and attach it to ``work``.

        The PDF is parsed for an impression code; without one the method
        returns without importing anything. With a code, the file is saved as
        a ``Document`` tagged 'impression', an ``Impression`` is populated
        from the parser (title, content, comments, copies), given a sort
        order and slug, linked to its publisher and added as a child of
        ``work``. A missing publisher is created under ``publishers_page``.
        """
        self.logger.debug('Parsing {}'.format(f_path))
        impression_parser = PDFParser(f_path)
        code = impression_parser.get_impression_code()
        if not code:
            # no impression code in this PDF: nothing to import
            return

        self.logger.debug('Impression: ' + code)

        # Create an Impression PDF Document.
        stored_pdf = Document(title=code)
        with open(f_path, 'rb') as source:
            django_file = File(source)
            stored_pdf.file.save(os.path.basename(f_path), django_file)
        stored_pdf.tags.add('impression')

        # populate the new impression from the parsed PDF
        record = Impression()
        record.title = code
        record.impression_title = impression_parser.get_title()
        record.content = impression_parser.get_text_content()
        record.pdf = stored_pdf

        try:
            rank = self._order_of_impressions.index(code.lower())
        except Exception:
            # unknown codes are reported and pushed to the end of the order
            known_codes = ', '.join(self._order_of_impressions)
            self.logger.error(
                u'{0} missing from order of impressions, which consists of: {1}'
                .format(code, known_codes))
            rank = 999
        record.sort_order = rank

        record.slug = safe_slugify(record.title, Impression)
        record.comments = impression_parser.get_comments()
        self._import_copies(record, impression_parser, code)

        # last dash-separated segment of the title identifies the publisher
        publisher_code = record.title.split('-')[-1]
        publisher = Publisher.objects.filter(title=publisher_code).first()
        if not publisher:
            publisher = Publisher(title=publisher_code)
            publisher.slug = slugify(publisher_code)
            publishers_page.add_child(instance=publisher)
        record.publisher = publisher

        work.add_child(instance=record)

コード例 #7

0

ファイルを表示

# Usage walk-through for PDFParser: list the available forms, inspect one
# form's questions, and prepare a set of answers for it.

# STEP 1: import parser class
from pdf_parser import PDFParser

# STEP 2: instantiate class (no constructor arguments are passed here)
parser = PDFParser()

# STEP 3: get available forms (i9, etc.)
forms = parser.available_forms()
print(forms)

# STEP 4: get form details (i.e. array of fields/questions object)
# 'dmv44' is the form name looked up in this example.
details = parser.form_details('dmv44')
print(details)

# STEP 5: fill form with form_name and answers dict, returns bytes
# Only the answers dict is built in this excerpt; the call that hands it to
# the parser is not shown here.
# NOTE(review): keys presumably correspond to the field ids reported by
# form_details('dmv44'), and the integer values ('applying_for', 'purpose',
# 'organ') look like option indexes -- confirm against the form definition.
dummy_answers = {
    'ssn': '123456789',
    'first_name': 'lil\'',  # escaped quote: the value ends with an apostrophe
    'last_name': 'pea',
    'middle_name': '',
    'address': '21 Pea Rd',
    'apt_number': '1c',
    'city': 'New York',
    'state': 'NY',
    'date_of_birth': '02022019',
    'telephone_number': '917-PEA-PEA',
    'applying_for': 1,
    'purpose': 4,
    'organ': 1
}

コード例 #8

0

ファイルを表示

ファイル: parser.py プロジェクト: boxorange/scientific-literature-mining

def parse_PDF():
    """List the IOP articles that mention a search term, then exit.

    Every ``.json`` file directly inside the IOP_JSON archive directory is
    loaded. An article matches when any sentence of its ``body_text``
    contains, as a whitespace-separated token (case-insensitive), one of the
    first three search terms; the fourth term is a multi-word phrase and is
    left out of this token test. For each match a line
    ``<name>.pdf -> https://doi.org/<uid>`` is appended to
    ``iop_filtered_list.txt``.

    The function then calls ``sys.exit()``. The PDF-parsing walk that follows
    is therefore currently unreachable and is kept only for when the
    debugging section is removed.
    """
    parser = PDFParser()

    keyword_hits = 0
    search_terms = [
        term.lower()
        for term in ['EXAFS', 'XANES', 'NEXAFS', 'pair distribution function']
    ]
    processed = 0

    archive_dir = "/home/gpark/corpus_web/tdm/archive/IOP_JSON"

    # debugging
    single_word_terms = set(search_terms[:3])
    doi_by_pdf = {}
    for entry in os.listdir(archive_dir):
        if not entry.endswith(".json"):
            continue
        with open(os.path.join(archive_dir, entry), "r") as handle:
            article = json.load(handle)

            # stop at the first sentence sharing a token with the terms
            mentions_term = any(
                not single_word_terms.isdisjoint(
                    [token.lower() for token in sentence['sent'].split()])
                for sentence in article['body_text'])

            if mentions_term:
                doi_by_pdf[entry.replace('.json', '.pdf')] = article['uid']

    with open("iop_filtered_list.txt", 'a') as out_file:
        out_file.writelines(
            pdf_name + ' -> https://doi.org/' + doi + '\n'
            for pdf_name, doi in doi_by_pdf.items())

    sys.exit()
    # debugging

    # ---- unreachable while the sys.exit() above is in place ----
    for root, dirs, files in os.walk(archive_dir):
        # sorting in place makes os.walk visit the subdirectories in reverse
        # lexicographic order of their names
        dirs.sort(reverse=True)
        for entry in files:
            if not entry.endswith(".pdf"):
                continue

            iop_meta_file = os.path.join(root, '.article')
            if not os.path.exists(iop_meta_file):
                continue

            pdf_path = os.path.join(root, entry)
            parser.parse(pdf_path, search_terms, iop_meta_file)

            processed += 1

            print('>> file: ', pdf_path, ' / num_of_files: ', processed)

    print(keyword_hits)

コード例 #9

0

ファイルを表示

def _format_question_text(question_object):
    """Return the prompt for one form question with its answer hint appended.

    "bool" questions get a "(Yes / No)" suffix; "option" questions get their
    choices listed one per line, each prefixed by its index.
    """
    question_type = question_object["type"]
    question_text = question_object["question"]
    added_string = ""
    if question_type == "bool":
        added_string = "(Yes / No)"
    elif question_type == "option":
        added_string = "\n" + "".join(
            "\n{}) {}".format(index, option)
            for index, option in enumerate(question_object["options"]))
    return "{} {}".format(question_text, added_string)


def start_questions(recipient_id, payload, txt=None):
    """Advance the form-filling conversation for one recipient.

    Depending on the state stored in ``user_data[recipient_id]``:

    * form already done -- if ``txt`` is all digits, the public URL of the
      filled form is sent by SMS to ``'+1' + txt``; otherwise a notice is
      printed. Returns ``""``.
    * form in progress -- ``txt`` (translated to English) is stored as the
      answer to the current question. When every question is answered the
      form is filled, saved, uploaded to the public bucket and its URL is
      sent to the recipient; otherwise the next question is sent.
    * no form started -- the conversation state is initialised, an intro
      message is sent, followed by the first question.

    Args:
        recipient_id: Key into ``user_data`` and address for ``bot`` messages.
        payload: Name of the form; when falsy the recipient's stored
            ``current_form`` is used instead.
        txt: The recipient's latest message, or None.

    Returns:
        ``""`` when the form was already completed, otherwise None.
    """
    import os
    import time

    if not payload:
        payload = user_data[recipient_id]["current_form"]

    if (recipient_id in user_data
            and 'done' in user_data[recipient_id]
            and user_data[recipient_id]['done']):
        # txt may legitimately be None here; guard before calling isdigit()
        if txt is not None and txt.isdigit():
            # SECURITY NOTE(review): these credentials were committed in
            # source. They can now be overridden through the environment;
            # the literals remain only as a backward-compatible fallback and
            # should be rotated and removed.
            account_sid = os.environ.get(
                'TWILIO_ACCOUNT_SID', 'AC67c8a0b6b16986da80dc1ac0fdb26808')
            auth_token = os.environ.get(
                'TWILIO_AUTH_TOKEN', '1db9a6a11f38faafa562d1e72607ba39')
            client = Client(account_sid, auth_token)

            client.messages.create(
                body=user_data[recipient_id]["public_url"],
                from_='+12153911286',
                to='+1' + txt)
        else:
            print("form is already filled. please reset")
        return ""

    # normalise the form name to lowercase letters and digits only
    payload_correct = ''.join(
        i.lower() for i in payload if i.isalpha() or i.isdigit())
    print(payload)
    print(payload_correct)
    parser = PDFParser()
    details = parser.form_details(payload_correct)
    print(details)

    # answers are stored in English regardless of the recipient's language
    txt_trans = ""
    if txt is not None:
        translate_client = translate.Client()
        txt_trans = translate_client.translate(txt, target_language='en')
        txt_trans = txt_trans['translatedText']

    if recipient_id in user_data and 'in_progress' in user_data[recipient_id]:

        print("in progress")
        answers = user_data[recipient_id]["answers"]
        # the number of stored answers is the index of the current question
        current_key = details[len(answers)]['id']
        answers[current_key] = txt_trans
        if len(answers) == len(details):
            print("done!")
            user_data[recipient_id]['done'] = True
            filled_form = parser.fill_form(
                user_data[recipient_id]["current_form"],
                user_data[recipient_id]["answers"])

            timestamp = int(time.time())
            save(filled_form, 'files/filled_test-{}.pdf'.format(timestamp))

            user_info = bot.get_user_info(recipient_id)
            fname = user_info["first_name"]
            lname = user_info["last_name"]
            pdf_form = user_data[recipient_id]["current_form"]

            translate_client = translate.Client()
            target_lang = user_data[recipient_id]['lang']
            translated_text = translate_client.translate(
                "You are done! Here is your file", target_language=target_lang)
            translated_text = translated_text['translatedText']
            bot.send_text_message(recipient_id, translated_text)

            storage_client = storage.Client()
            bucket_name = 'ezpz-files-public'
            bucket = storage_client.get_bucket(bucket_name)
            # NOTE(review): the file is saved to a relative path above but
            # uploaded from this absolute one; presumably they coincide only
            # when the process runs from that project directory -- confirm.
            source_file_name = "/Users/tomeraharoni/Documents/Projects/devfest/files/filled_test-{}.pdf".format(
                timestamp)

            destination_blob_name = "{}-{}-{}-{}-filled.pdf".format(
                pdf_form, fname, lname, timestamp)
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_filename(source_file_name)
            blob.make_public()
            bot.send_text_message(recipient_id, blob.public_url)
            user_data[recipient_id]['public_url'] = blob.public_url
            translated_text = translate_client.translate(
                "If you want the file sent to your phone, please type in your number",
                target_language=target_lang)
            translated_text = translated_text['translatedText']
            bot.send_text_message(recipient_id, translated_text)
            return

        # same formatting as the first question, so "option" questions list
        # their choices here too
        text_to_send = _format_question_text(details[len(answers)])
        translate_client = translate.Client()
        target_lang = user_data[recipient_id]['lang']
        if target_lang == 'en':
            translated_text = text_to_send
        else:
            translated_text = translate_client.translate(
                text_to_send, target_language=target_lang)
            translated_text = translated_text['translatedText']
        bot.send_text_message(recipient_id, translated_text)
    else:
        print("first question!")
        user_data[recipient_id]["in_progress"] = True
        user_data[recipient_id]["answers"] = {}
        user_data[recipient_id]["current_form"] = payload_correct

        translate_client = translate.Client()
        target_lang = user_data[recipient_id]['lang']

        filling_intro_text = 'Sure! I can help you with your {} form'.format(
            payload)
        filling_intro_text_tra = translate_client.translate(
            filling_intro_text, target_language=target_lang)
        filling_intro_text_tra = filling_intro_text_tra['translatedText']
        bot.send_text_message(recipient_id, filling_intro_text_tra)

        text_to_send = _format_question_text(details[0])
        if target_lang == 'en':
            translated_text = text_to_send
        else:
            translated_text = translate_client.translate(
                text_to_send, target_language=target_lang)
            translated_text = translated_text['translatedText']
        bot.send_text_message(recipient_id, translated_text)