Example #1
0
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from OCR page text (original-format issues).

    Parses the table of contents on the first page for titles and authors,
    then scans the remaining pages for article headings to record the start
    and end PDF page of each article.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        pagetext: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, start_page, start_pdf_page, end_pdf_page, author) -- five
        parallel lists, one entry per article. start_page entries are empty
        strings because original page numbers are not recoverable from this
        OCR format.
    """
    # Parallel lists of per-article metadata for CSV export.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Get titles and authors from the first page's table of contents.
    # Regex groups: 1 = roman numeral, 2 = title text, 4 = author name(s).
    toc = re.findall(
        r'([IVXL]{1,4}\.?)\s([A-Za-z0-9., ]+)\s(\(([A-Za-z ]{5,})\))?',
        pagetext[0], re.DOTALL)
    for r, entry in enumerate(toc):
        if 0 < debug < 3:
            print(f'Record {r}, {entry}')
        # Normalize whitespace and capitalization of the title.
        temp_title = entry[1].strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        title.append(temp_title)
        # Split the author field on ' and '. Keyword maxsplit avoids the
        # positional-argument deprecation (Python 3.13+).
        find_author = re.sub(r' {2,}', ' ', entry[3])
        find_author = re.split(r' and ', find_author, maxsplit=2)
        if find_author:
            # Always record exactly four author slots; slots past the last
            # real author are filled with empty name tuples.
            author_list = []
            for count in range(4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(
                        find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
        # Original page numbers did not survive this OCR; placeholder only.
        start_page.append('')
        if 1 < debug < 5:
            print(f'{title[r]}, {author[r]}')

    # Scan the remaining pages for article headings to locate the start and
    # end PDF page of each article.
    if verbose:
        print('Processing PDF pages')

    for page_number in range(1, len(pagetext)):

        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # Look for an article heading (roman numeral + title) on this page.
        title_parts = re.search(
            r'(?<=\n)([XIVHLlixv]{1,4}\.)\s([A-Za-z0-9.,*\- ]*)\s(?=\n)',
            pagetext[page_number],
            flags=0)
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # A heading marks an article start; every start page after the first
        # also ends the previous article. NOTE(review): this attaches a
        # stray page to articles that begin mid-page -- there is no reliable
        # way to detect that from this OCR.
        if title_parts:
            start_pdf_page.append(page_number)
            if page_number > 1:
                end_pdf_page.append(page_number)

        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # The final article ends on the last page processed.
    end_pdf_page.append(page_number)

    # Pad the page lists so every title has start/end PDF page entries.
    if len(start_pdf_page) < len(title):
        start_pdf_page.extend([0] * (len(title) - len(start_pdf_page)))
        print('WARNING! Missing Start PDF Page(s)')
    if len(end_pdf_page) < len(title):
        end_pdf_page.extend([0] * (len(title) - len(end_pdf_page)))
        print('WARNING! Missing End PDF Page(s)')

    # Debug output: raw list dump at levels 2 & 4; per-record lines at 6.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(
                f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Example #2
0
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from OCR page text.

    Scans every page for a title (a "....  By ..." pattern), the printed
    page number, and author lines, recording the PDF page numbers where
    each article starts and ends.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        pagetext: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, start_page, start_pdf_page, end_pdf_page, author) -- five
        parallel lists, one entry per article.
    """
    # Parallel lists of per-article metadata for CSV export.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0  # kept after the loop for the final end-page fixup

    if verbose:
        print('Processing PDF pages')
    for page_number in range(len(pagetext)):
        if 1 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # Find candidate title fragments: a sentence on its own line ending
        # just before "By", or the "By <authors>." line itself.
        title_parts = re.findall(r'(?<=\n)[A-Z][A-Za-z0-9 .,():"\'\-]{3,}\.(?=\s+By)|'
                                 r'By\s{1,2}[A-Za-z \-,&.]+\.',
                                 pagetext[page_number])
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # Join the fragments and normalize whitespace and capitalization.
        # Known quirk: a word with an apostrophe comes out with a space
        # before the apostrophe and the next letter capitalized.
        temp_title = " ".join(title_parts)
        temp_title = re.sub(r'\n', ' ', temp_title)
        temp_title = temp_title.strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        # Print processed title at debug levels 1-4.
        if 0 < debug < 5 and temp_title:
            print('TITLE: %s' % temp_title)

        # Keep titles longer than five characters; shorter matches are
        # assumed to be OCR garbage. Record the printed page number if one
        # appears alone on a line, else a placeholder, plus the PDF page.
        if len(temp_title) > 5:
            title.append(temp_title)
            original_page_number = re.search(r'^[\d]{1,4}$', pagetext[page_number], re.MULTILINE)
            if original_page_number:
                start_page.append(original_page_number[0])
            else:
                start_page.append(" ")
            start_pdf_page.append(page_number)
            if 0 < debug < 5:
                if original_page_number:
                    print('Start page in PDF text: %s' % original_page_number[0])
                else:
                    print('No start page found in PDF text')

        # Look for author lines (capitalized name forms alone on a line,
        # optionally followed by a footnote asterisk). Finding one also
        # marks the end PDF page of the most recently started article.
        find_author = re.findall(
            r'(?<=\n)[A-Z][A-Za-z]*\.? +[A-Z][A-Za-z]*\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]+ +[A-Z][a-z]+[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]*\.? *[A-Z][a-z]*\.? +[A-Za-z]+\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)',
            pagetext[page_number])
        if find_author:
            # Always record exactly four author slots; slots past the last
            # real author are filled with empty name tuples.
            author_list = []
            for count in range(4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
            end_pdf_page.append(page_number)
        if 0 < debug < 5:
            print('Author: %s' % find_author)
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')
    # If the last article never got an end page, close it on the final page.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)

    # Pad whichever group of lists came up short so all five stay parallel:
    # title/start_page/start_pdf_page are filled together when a title is
    # found; author/end_pdf_page are filled together when authors are found.
    if len(title) > len(author):
        print('WARNING! Missing authors and ending PDF pages')
        for _ in range(len(title) - len(author)):
            author.append([('', '', '', ''), ('', '', '', ''), ('', '', '', ''), ('', '', '', '')])
            end_pdf_page.append(0)
    elif len(author) > len(title):
        print('WARNING! Missing titles, start pages, and starting PDF pages')
        for _ in range(len(author) - len(title)):
            title.append('')
            start_page.append('')
            start_pdf_page.append(0)

    # Debug output: raw list dump at levels 2 & 4; per-record lines at 6.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                  f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Example #3
0
def processpdfnew(verbose, debug, page_text):
    """Extract metadata for the lead article of a Buffalo Law Review issue.

    Walks the first page line by line, collecting the issue header (volume,
    number, month, year), the article title, the document type, and the
    author name(s). The authors are always the last piece of metadata, so
    the walk stops once they (or the end of the page) are reached. If the
    volume/year were not on the front page, the first lines of the next two
    pages are checked for them.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        page_text: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, volume, start_page, issue_number, month, year, doc_type,
        author). Values not found keep their zero/empty defaults; author is
        a list of (first, middle, last, suffix) tuples.
    """
    if verbose:
        print(f'Processing PDF pages')
    title = ''
    volume = 0
    start_page = 0
    issue_number = 0
    year = ''
    month = ''
    author = []
    doc_type = ''

    for page_number in range(0, 3):
        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)
        if page_number == 0:
            # First page: split into lines and walk them looking for the
            # issue header, title lines, and author lines.
            page_lines = page_text[page_number].splitlines()
            if 2 < debug < 5:
                print(f'{page_lines}')
            line = 0
            author_flag = 0   # 0 = no author yet, 1 = found, 2 = done
            header_flag = 0   # set once the journal-name line is seen
            # NOTE(review): line is incremented before first use, so line 0
            # is never examined -- presumably part of the header; confirm.
            while author_flag < 2:
                line += 1
                # Bounds check BEFORE any access (the original debug print
                # below used to run first and raise IndexError at debug
                # levels 2-4). End of page means no author was recognized:
                # record an empty author, warn, and stop.
                if line == len(page_lines):
                    print(
                        'WARNING! No author found. Title probably incorrect (and very long).'
                    )
                    author_temp = '', '', '', ''
                    author.append(author_temp)
                    author_flag = 2
                    continue
                if 1 < debug < 5:
                    print(f'Line: {line}')
                    print(page_lines[line])
                # Skip blank lines. A blank line after at least one author
                # means all authors were found; stop before an all-caps
                # heading gets mistaken for another author.
                if page_lines[line] == '' or page_lines[line] == ' ':
                    if author_flag == 0:
                        continue
                    elif author_flag == 1:
                        author_flag = 2
                # Inside the issue header: pick up volume, issue number,
                # and month/year, each consuming its line when found.
                if header_flag == 1:
                    if volume == 0:
                        volume_test = re.search(r'(?<=VOLUME )\d{1,3}',
                                                page_lines[line])
                        if volume_test is not None:
                            volume = volume_test[0]
                            continue
                    if issue_number == 0:
                        issue_test = re.search(r'(?<=NUMBER )\d',
                                               page_lines[line])
                        if issue_test is not None:
                            issue_number = issue_test[0]
                            continue
                    if year == '':
                        date_parts = re.search(r'([A-Z]+) (\d{4})',
                                               page_lines[line])
                        if date_parts is not None:
                            month = date_parts[1]
                            year = date_parts[2]
                            continue
                # The journal-name line marks the start of the issue header.
                if re.match(r'Buffalo Law Review|BUFFALO LAW REVIEW',
                            page_lines[line]):
                    header_flag = 1
                    continue
                # Document-type headers are recorded but kept out of the
                # title.
                if re.match(r'ESSAY ?', page_lines[line]):
                    doc_type = 'essay'
                    continue
                if re.match(r'COMMENT ?', page_lines[line]):
                    doc_type = 'comment'
                    continue
                # An author line is in all caps, possibly ending with one or
                # more symbols marking a biographical footnote.
                author_search = re.search(
                    r'([A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+ [A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+\.? ?[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*,? ?'
                    r'[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*\.?)\W* ?$', page_lines[line])
                if author_search is None:
                    # Not an author: part of the title (capped at 254 chars
                    # before this line is appended) -- unless authors were
                    # already found, in which case the paper body started.
                    if author_flag == 0:
                        if len(title) < 255:
                            title = title + page_lines[line]
                    elif author_flag == 1:
                        author_flag = 2
                else:
                    author_temp = author_search[1].title()
                    f_name, m_name, l_name, suffix = splitname(author_temp)
                    author_temp = f_name, m_name, l_name, suffix
                    author.append(author_temp)
                    author_flag = 1
            # Post-processing: tidy the title, default the document type,
            # and pull the printed first-page number if one is on its own
            # line.
            title = title.strip()
            if doc_type == '':
                doc_type = 'article'
            original_page_number = re.search(r'^([\d]{1,4}) ?$',
                                             page_text[page_number],
                                             re.MULTILINE)
            if original_page_number:
                start_page = original_page_number[1]
        else:
            # Pages 1-2: only consulted for a volume/year that was missing
            # from the front page.
            try:
                page_lines = page_text[page_number].splitlines()
            except IndexError:
                continue
            if 2 < debug < 5:
                print(f'{page_lines}')
            # Clamp to the page length so pages shorter than 10 lines no
            # longer raise IndexError.
            for line in range(0, min(10, len(page_lines))):
                if 1 < debug < 3:
                    print(f'Line: {line}')
                    print(f'{page_lines[line]}')
                if volume == 0:
                    volume_test = re.search(r'(?<=Vol.) +([\dXVILC]{1,3})',
                                            page_lines[line])
                    if volume_test is not None:
                        # Group 1 is the clean value; match [0] would
                        # include the matched run of spaces.
                        volume = volume_test[1]
                if year == '':
                    year_test = re.search(r'^[\d]{4}[\-—–][\d]{4}|^[\d]{4}',
                                          page_lines[line])
                    if year_test is not None:
                        year = year_test[0]
                        if 1 < debug < 3:
                            print(f'Year: {year}, Line: {line}')

    if 0 < debug < 5:
        print(
            f'{volume}, {month}, {year}, {issue_number}, {author}, {title}, {start_page}, {doc_type}'
        )

    return title, volume, start_page, issue_number, month, year, doc_type, author