def processpdfnew(verbose, debug, pagetext):
    """Collect article metadata from the OCR text of an original-format issue.

    ``pagetext[0]`` is read as the table of contents: every match of
    ``<roman numeral>. <title> (<authors>)`` contributes one title and one
    author list. The remaining pages are scanned for a heading line (a roman
    numeral, a period and a title on a line of its own) to record the PDF page
    on which each article starts and ends.

    Args:
        verbose: When truthy, print a progress message before scanning pages.
        debug: Integer debug level selecting which diagnostics are printed
            (see the individual ``debug`` range checks below).
        pagetext: Sequence of page text strings, indexed by PDF page.

    Returns:
        A tuple ``(title, start_page, start_pdf_page, end_pdf_page, author)``
        of lists. ``title``, ``start_page`` and ``author`` hold one item per
        table-of-contents entry; ``start_page`` items are always ``''``
        because original page numbers are not looked up for this format.
        Each ``author`` item is a list of four
        ``(f_name, m_name, l_name, suffix)`` tuples, blank-filled for unused
        slots. ``start_pdf_page`` / ``end_pdf_page`` hold one item per heading
        found and are padded with ``0`` up to ``len(title)``.
    """
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Get titles and authors from the first page. An empty document simply
    # yields no table-of-contents entries instead of raising IndexError.
    toc_text = pagetext[0] if pagetext else ''
    toc = re.findall(
        r'([IVXL]{1,4}\.?)\s([A-Za-z0-9., ]+)\s(\(([A-Za-z ]{5,})\))?',
        toc_text, re.DOTALL)

    for r, entry in enumerate(toc):
        if 0 < debug < 3:
            print(f'Record {r}, {entry}')

        # Clean up the title: trim, collapse runs of spaces, title-case.
        temp_title = re.sub(r' {2,}', ' ', entry[1].strip()).title()
        title.append(journaltools.capitalize_title(temp_title))

        # entry[3] is the text inside the parentheses, or '' when the optional
        # author group did not match. Splitting on ' and ' (at most twice)
        # gives up to three names; re.split always returns at least one item,
        # so an author list is recorded for every title.
        find_author = re.split(r' and ', re.sub(r' {2,}', ' ', entry[3]), 2)
        author_list = []
        for count in range(4):
            try:
                f_name, m_name, l_name, suffix = journaltools.splitname(
                    find_author[count])
                author_temp = f_name, m_name, l_name, suffix
            except IndexError:
                # No author in this slot: keep the record four entries wide.
                author_temp = '', '', '', ''
            author_list.append(author_temp)
        author.append(author_list)

        # Original page numbers did not survive OCR for this format.
        start_page.append('')
        if 1 < debug < 5:
            print(f'{title[r]}, {author[r]}')

    # Process each page after the table of contents and look for headings.
    if verbose:
        print('Processing PDF pages')

    heading = re.compile(
        r'(?<=\n)([XIVHLlixv]{1,4}\.)\s([A-Za-z0-9.,*\- ]*)\s(?=\n)')

    for page_number in range(1, len(pagetext)):
        if 0 < debug < 6:
            print(f'Processing PDF page number {page_number}')

        title_parts = heading.search(pagetext[page_number])
        if 1 < debug < 5 and title_parts:
            print(f'title parts: {title_parts}')

        if title_parts:
            # A new heading closes the previous article on this same page.
            # This adds a garbage page to articles whose successor starts at
            # the top of a page, but the OCR text gives no reliable way to
            # tell. Only close an article that was actually opened, so the
            # start and end lists stay index-aligned even when the first
            # heading is not on page 1.
            if len(start_pdf_page) > len(end_pdf_page):
                end_pdf_page.append(page_number)
            start_pdf_page.append(page_number)

        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # The last article runs to the final page of the document.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)

    # Pad the page lists with 0 so they are at least as long as the title list.
    if len(start_pdf_page) < len(title):
        start_pdf_page.extend([0] * (len(title) - len(start_pdf_page)))
        print('WARNING! Missing Start PDF Page(s)')
    if len(end_pdf_page) < len(title):
        end_pdf_page.extend([0] * (len(title) - len(end_pdf_page)))
        print('WARNING! Missing End PDF Page(s)')

    # Print all of the lists; debug levels 2 & 4
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    # Step through each record and print all contents; debug level 6
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(
                f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Exemple #2
0
def processpdfnew(verbose, debug, pagetext):
    """Collect article metadata from OCR text where titles are followed by "By <author>.".

    Every page is scanned twice: once for title text (a line ending in a
    period that is followed by "By", plus any "By ..." phrase) and once for
    lines that look like a personal name. A page with a usable title opens an
    article; a page with name-like lines contributes an author list and an
    end page.

    Args:
        verbose: When truthy, print a progress message before scanning pages.
        debug: Integer debug level selecting which diagnostics are printed
            (see the individual ``debug`` range checks below).
        pagetext: Sequence of page text strings, indexed by PDF page.

    Returns:
        A tuple ``(title, start_page, start_pdf_page, end_pdf_page, author)``
        of lists, padded so that all five have the same length. ``start_page``
        holds the first line on the title page consisting only of one to four
        digits, or ``' '`` when there is none. Each ``author`` item is a list
        of four ``(f_name, m_name, l_name, suffix)`` tuples, blank-filled for
        unused slots.
    """
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Compile the patterns once instead of on every page.
    # Title text: a line starting with a capital and ending in a period that
    # is followed by "By", or the "By <names>." phrase itself.
    title_pattern = re.compile(
        r'(?<=\n)[A-Z][A-Za-z0-9 .,():"\'\-]{3,}\.(?=\s+By)|'
        r'By\s{1,2}[A-Za-z \-,&.]+\.')
    # Original page number: a line made up of one to four digits only.
    original_page_pattern = re.compile(r'^[\d]{1,4}$', re.MULTILINE)
    # Author: a whole line shaped like two to four capitalized name parts,
    # optionally followed by a short suffix and an asterisk.
    author_pattern = re.compile(
        r'(?<=\n)[A-Z][A-Za-z]*\.? +[A-Z][A-Za-z]*\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)|'
        r'(?<=\n)[A-Z][A-Za-z]+ +[A-Z][a-z]+[,. A-Za-z]{0,6}\*?(?=\n)|'
        r'(?<=\n)[A-Z][A-Za-z]*\.? *[A-Z][a-z]*\.? +[A-Za-z]+\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)')
    blank_author = ('', '', '', '')

    if verbose:
        print('Processing PDF pages')
    for page_number, text in enumerate(pagetext):
        if 1 < debug < 6:
            print(f'Processing PDF page number {page_number}')

        title_parts = title_pattern.findall(text)
        if 1 < debug < 5 and title_parts:
            print(f'title parts: {title_parts}')

        # Join all matched pieces, flatten newlines, collapse runs of spaces
        # and title-case the result. Any word with an apostrophe comes out
        # with the letter after the apostrophe capitalized. Fix in a future
        # version. Pages without a match skip the clean-up entirely.
        temp_title = ''
        if title_parts:
            temp_title = re.sub(r'\n', ' ', ' '.join(title_parts)).strip()
            temp_title = re.sub(r' {2,}', ' ', temp_title).title()
            temp_title = journaltools.capitalize_title(temp_title)
        # Print processed title at debug levels 1-4.
        if 0 < debug < 5 and temp_title:
            print(f'TITLE: {temp_title}')

        # Keep only titles longer than five characters; this drops garbage
        # lines while keeping short titles. Record the original page number if
        # one is found in the OCR text (a single space otherwise) and the PDF
        # page on which the article starts.
        if len(temp_title) > 5:
            title.append(temp_title)
            original_page_number = original_page_pattern.search(text)
            start_page.append(original_page_number[0] if original_page_number else ' ')
            start_pdf_page.append(page_number)
            if 0 < debug < 5:
                if original_page_number:
                    print(f'Start page in PDF text: {original_page_number[0]}')
                else:
                    print('No start page found in PDF text')

        # Find authors. When any name-like line is found, store up to four
        # parsed names and record this PDF page as an article end page.
        find_author = author_pattern.findall(text)
        if find_author:
            author_list = []
            for count in range(4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    # Fewer than four names on the page: blank-fill the slot.
                    author_temp = blank_author
                author_list.append(author_temp)
            author.append(author_list)
            end_pdf_page.append(page_number)
        if 0 < debug < 5:
            print(f'Author: {find_author}')
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # An article that was opened but never closed ends on the last page.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)

    # Pad the short lists with empty values and warn. Values come in two
    # groups: those added when a title is found (title, start_page,
    # start_pdf_page) and those added when an author is found (author,
    # end_pdf_page).
    if len(title) > len(author):
        print('WARNING! Missing authors and ending PDF pages')
        author.extend([blank_author] * 4 for _ in range(len(title) - len(author)))
        # end_pdf_page may already be one longer than author because of the
        # closing page appended above, so pad it to the title count on its
        # own instead of once per missing author.
        end_pdf_page.extend([0] * (len(title) - len(end_pdf_page)))
    elif len(author) > len(title):
        print('WARNING! Missing titles, start pages, and starting PDF pages')
        missing = len(author) - len(title)
        title.extend([''] * missing)
        start_page.extend([''] * missing)
        start_pdf_page.extend([0] * missing)

    # Print all of the lists; debug levels 2 & 4
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    # Step through each record and print all contents; debug level 6
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                  f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Exemple #3
0
def importxl(import_file):
    """Read article metadata for one issue from the active sheet of an Excel workbook.

    Row 1 is read as the header row. Recognized header names (``section``,
    ``title``, ``page``, ``start_pdf_page``, ``end_pdf_page``,
    ``author_first``/``_middle``/``_last``/``_suffix`` and the matching
    ``author2_*`` names) override the default column positions; every later
    row becomes one record.

    Args:
        import_file: Workbook location, passed to ``load_workbook`` as
            ``filename``.

    Returns:
        A tuple ``(title, page, pdf_start_page, pdf_end_page, author, section)``
        of lists with one item per data row. Titles and sections are
        title-cased and passed through ``capitalize_title``; an empty title
        cell gives ``''`` and an empty section cell is stored as read. Empty
        page cells give ``''``. Each ``author`` item is a list of one or two
        ``(first, middle, last, suffix)`` tuples of raw cell values.
    """
    title = []
    page = []
    pdf_start_page = []
    pdf_end_page = []
    author = []
    section = []

    # Default 1-based column positions, overwritten by matching headers.
    # 0 marks an optional column that is absent unless a header names it.
    columns = {
        'section': 1,
        'title': 2,
        'page': 3,
        'start_pdf_page': 4,
        'end_pdf_page': 5,
        'author_first': 6,
        'author_middle': 7,
        'author_last': 8,
        'author_suffix': 9,
        'author2_first': 0,
        'author2_middle': 0,
        'author2_last': 0,
        'author2_suffix': 0,
    }
    first_author_fields = ('author_first', 'author_middle', 'author_last', 'author_suffix')
    second_author_fields = ('author2_first', 'author2_middle', 'author2_last', 'author2_suffix')

    wb = load_workbook(filename=import_file, data_only=True)
    ws = wb.active

    # Read the first row and remap any recognized header to its column.
    # If a header name appears more than once, the last occurrence wins.
    for c in range(1, ws.max_column + 1):
        header = ws.cell(row=1, column=c).internal_value
        if header in columns:
            columns[header] = c

    def cell_value(row, field):
        # Absent optional columns (index 0) read as None; asking the
        # worksheet for column 0 would raise ValueError.
        col = columns[field]
        if not col:
            return None
        return ws.cell(row=row, column=col).internal_value

    # Iterate through all data rows, reading values into the result lists.
    for i in range(2, ws.max_row + 1):
        section_temp = cell_value(i, 'section')
        if section_temp:
            section_temp = capitalize_title(section_temp.title())
        section.append(section_temp)

        # Guard the title the same way as the section: an empty cell must not
        # crash the import. '' marks a missing title.
        temp_title = cell_value(i, 'title')
        if temp_title:
            temp_title = capitalize_title(temp_title.title())
        else:
            temp_title = ''
        title.append(temp_title)

        page.append(cell_value(i, 'page') or '')
        pdf_start_page.append(cell_value(i, 'start_pdf_page'))
        pdf_end_page.append(cell_value(i, 'end_pdf_page'))

        author_list = [tuple(cell_value(i, field) for field in first_author_fields)]
        # Only look for a second author when the file has an author2_first
        # column and this row has a value in it.
        if columns['author2_first'] and cell_value(i, 'author2_first'):
            author_list.append(tuple(cell_value(i, field) for field in second_author_fields))
        author.append(author_list)

    return title, page, pdf_start_page, pdf_end_page, author, section