Example #1
0
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from OCR page text (original-format issues).

    Parses the table of contents on the first page for titles and authors,
    then scans the remaining pages for article headings to record the start
    and end PDF page of each article.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        pagetext: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, start_page, start_pdf_page, end_pdf_page, author) -- five
        parallel lists, one entry per article. start_page entries are empty
        strings because original page numbers are not recoverable from this
        OCR format.
    """
    # Parallel lists of per-article metadata for CSV export.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Get titles and authors from the first page's table of contents.
    # Regex groups: 1 = roman numeral, 2 = title text, 4 = author name(s).
    toc = re.findall(
        r'([IVXL]{1,4}\.?)\s([A-Za-z0-9., ]+)\s(\(([A-Za-z ]{5,})\))?',
        pagetext[0], re.DOTALL)
    for r, entry in enumerate(toc):
        if 0 < debug < 3:
            print(f'Record {r}, {entry}')
        # Normalize whitespace and capitalization of the title.
        temp_title = entry[1].strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        title.append(temp_title)
        # Split the author field on ' and '. Keyword maxsplit avoids the
        # positional-argument deprecation (Python 3.13+).
        find_author = re.sub(r' {2,}', ' ', entry[3])
        find_author = re.split(r' and ', find_author, maxsplit=2)
        if find_author:
            # Always record exactly four author slots; slots past the last
            # real author are filled with empty name tuples.
            author_list = []
            for count in range(4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(
                        find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
        # Original page numbers did not survive this OCR; placeholder only.
        start_page.append('')
        if 1 < debug < 5:
            print(f'{title[r]}, {author[r]}')

    # Scan the remaining pages for article headings to locate the start and
    # end PDF page of each article.
    if verbose:
        print('Processing PDF pages')

    for page_number in range(1, len(pagetext)):

        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # Look for an article heading (roman numeral + title) on this page.
        title_parts = re.search(
            r'(?<=\n)([XIVHLlixv]{1,4}\.)\s([A-Za-z0-9.,*\- ]*)\s(?=\n)',
            pagetext[page_number],
            flags=0)
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # A heading marks an article start; every start page after the first
        # also ends the previous article. NOTE(review): this attaches a
        # stray page to articles that begin mid-page -- there is no reliable
        # way to detect that from this OCR.
        if title_parts:
            start_pdf_page.append(page_number)
            if page_number > 1:
                end_pdf_page.append(page_number)

        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # The final article ends on the last page processed.
    end_pdf_page.append(page_number)

    # Pad the page lists so every title has start/end PDF page entries.
    if len(start_pdf_page) < len(title):
        start_pdf_page.extend([0] * (len(title) - len(start_pdf_page)))
        print('WARNING! Missing Start PDF Page(s)')
    if len(end_pdf_page) < len(title):
        end_pdf_page.extend([0] * (len(title) - len(end_pdf_page)))
        print('WARNING! Missing End PDF Page(s)')

    # Debug output: raw list dump at levels 2 & 4; per-record lines at 6.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(
                f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Example #2
0
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from OCR page text.

    Scans every page for a title (a "....  By ..." pattern), the printed
    page number, and author lines, recording the PDF page numbers where
    each article starts and ends.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        pagetext: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, start_page, start_pdf_page, end_pdf_page, author) -- five
        parallel lists, one entry per article.
    """
    # Parallel lists of per-article metadata for CSV export.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0  # kept after the loop for the final end-page fixup

    if verbose:
        print('Processing PDF pages')
    for page_number in range(len(pagetext)):
        if 1 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # Find candidate title fragments: a sentence on its own line ending
        # just before "By", or the "By <authors>." line itself.
        title_parts = re.findall(r'(?<=\n)[A-Z][A-Za-z0-9 .,():"\'\-]{3,}\.(?=\s+By)|'
                                 r'By\s{1,2}[A-Za-z \-,&.]+\.',
                                 pagetext[page_number])
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # Join the fragments and normalize whitespace and capitalization.
        # Known quirk: a word with an apostrophe comes out with a space
        # before the apostrophe and the next letter capitalized.
        temp_title = " ".join(title_parts)
        temp_title = re.sub(r'\n', ' ', temp_title)
        temp_title = temp_title.strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        # Print processed title at debug levels 1-4.
        if 0 < debug < 5 and temp_title:
            print('TITLE: %s' % temp_title)

        # Keep titles longer than five characters; shorter matches are
        # assumed to be OCR garbage. Record the printed page number if one
        # appears alone on a line, else a placeholder, plus the PDF page.
        if len(temp_title) > 5:
            title.append(temp_title)
            original_page_number = re.search(r'^[\d]{1,4}$', pagetext[page_number], re.MULTILINE)
            if original_page_number:
                start_page.append(original_page_number[0])
            else:
                start_page.append(" ")
            start_pdf_page.append(page_number)
            if 0 < debug < 5:
                if original_page_number:
                    print('Start page in PDF text: %s' % original_page_number[0])
                else:
                    print('No start page found in PDF text')

        # Look for author lines (capitalized name forms alone on a line,
        # optionally followed by a footnote asterisk). Finding one also
        # marks the end PDF page of the most recently started article.
        find_author = re.findall(
            r'(?<=\n)[A-Z][A-Za-z]*\.? +[A-Z][A-Za-z]*\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]+ +[A-Z][a-z]+[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]*\.? *[A-Z][a-z]*\.? +[A-Za-z]+\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)',
            pagetext[page_number])
        if find_author:
            # Always record exactly four author slots; slots past the last
            # real author are filled with empty name tuples.
            author_list = []
            for count in range(4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
            end_pdf_page.append(page_number)
        if 0 < debug < 5:
            print('Author: %s' % find_author)
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')
    # If the last article never got an end page, close it on the final page.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)

    # Pad whichever group of lists came up short so all five stay parallel:
    # title/start_page/start_pdf_page are filled together when a title is
    # found; author/end_pdf_page are filled together when authors are found.
    if len(title) > len(author):
        print('WARNING! Missing authors and ending PDF pages')
        for _ in range(len(title) - len(author)):
            author.append([('', '', '', ''), ('', '', '', ''), ('', '', '', ''), ('', '', '', '')])
            end_pdf_page.append(0)
    elif len(author) > len(title):
        print('WARNING! Missing titles, start pages, and starting PDF pages')
        for _ in range(len(author) - len(title)):
            title.append('')
            start_page.append('')
            start_pdf_page.append(0)

    # Debug output: raw list dump at levels 2 & 4; per-record lines at 6.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                  f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
Example #3
0
def processpdfnew(verbose, debug, page_text):
    """Extract metadata for the lead article of a Buffalo Law Review issue.

    Walks the first page line by line, collecting the issue header (volume,
    number, month, year), the article title, the document type, and the
    author name(s). The authors are always the last piece of metadata, so
    the walk stops once they (or the end of the page) are reached. If the
    volume/year were not on the front page, the first lines of the next two
    pages are checked for them.

    Parameters:
        verbose: truthy to print progress messages.
        debug: int debug level (0-6) controlling diagnostic output.
        page_text: list of str, one OCR'd text blob per PDF page.

    Returns:
        (title, volume, start_page, issue_number, month, year, doc_type,
        author). Values not found keep their zero/empty defaults; author is
        a list of (first, middle, last, suffix) tuples.
    """
    if verbose:
        print(f'Processing PDF pages')
    title = ''
    volume = 0
    start_page = 0
    issue_number = 0
    year = ''
    month = ''
    author = []
    doc_type = ''

    for page_number in range(0, 3):
        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)
        if page_number == 0:
            # First page: split into lines and walk them looking for the
            # issue header, title lines, and author lines.
            page_lines = page_text[page_number].splitlines()
            if 2 < debug < 5:
                print(f'{page_lines}')
            line = 0
            author_flag = 0   # 0 = no author yet, 1 = found, 2 = done
            header_flag = 0   # set once the journal-name line is seen
            # NOTE(review): line is incremented before first use, so line 0
            # is never examined -- presumably part of the header; confirm.
            while author_flag < 2:
                line += 1
                # Bounds check BEFORE any access (the original debug print
                # below used to run first and raise IndexError at debug
                # levels 2-4). End of page means no author was recognized:
                # record an empty author, warn, and stop.
                if line == len(page_lines):
                    print(
                        'WARNING! No author found. Title probably incorrect (and very long).'
                    )
                    author_temp = '', '', '', ''
                    author.append(author_temp)
                    author_flag = 2
                    continue
                if 1 < debug < 5:
                    print(f'Line: {line}')
                    print(page_lines[line])
                # Skip blank lines. A blank line after at least one author
                # means all authors were found; stop before an all-caps
                # heading gets mistaken for another author.
                if page_lines[line] == '' or page_lines[line] == ' ':
                    if author_flag == 0:
                        continue
                    elif author_flag == 1:
                        author_flag = 2
                # Inside the issue header: pick up volume, issue number,
                # and month/year, each consuming its line when found.
                if header_flag == 1:
                    if volume == 0:
                        volume_test = re.search(r'(?<=VOLUME )\d{1,3}',
                                                page_lines[line])
                        if volume_test is not None:
                            volume = volume_test[0]
                            continue
                    if issue_number == 0:
                        issue_test = re.search(r'(?<=NUMBER )\d',
                                               page_lines[line])
                        if issue_test is not None:
                            issue_number = issue_test[0]
                            continue
                    if year == '':
                        date_parts = re.search(r'([A-Z]+) (\d{4})',
                                               page_lines[line])
                        if date_parts is not None:
                            month = date_parts[1]
                            year = date_parts[2]
                            continue
                # The journal-name line marks the start of the issue header.
                if re.match(r'Buffalo Law Review|BUFFALO LAW REVIEW',
                            page_lines[line]):
                    header_flag = 1
                    continue
                # Document-type headers are recorded but kept out of the
                # title.
                if re.match(r'ESSAY ?', page_lines[line]):
                    doc_type = 'essay'
                    continue
                if re.match(r'COMMENT ?', page_lines[line]):
                    doc_type = 'comment'
                    continue
                # An author line is in all caps, possibly ending with one or
                # more symbols marking a biographical footnote.
                author_search = re.search(
                    r'([A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+ [A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+\.? ?[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*,? ?'
                    r'[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*\.?)\W* ?$', page_lines[line])
                if author_search is None:
                    # Not an author: part of the title (capped at 254 chars
                    # before this line is appended) -- unless authors were
                    # already found, in which case the paper body started.
                    if author_flag == 0:
                        if len(title) < 255:
                            title = title + page_lines[line]
                    elif author_flag == 1:
                        author_flag = 2
                else:
                    author_temp = author_search[1].title()
                    f_name, m_name, l_name, suffix = splitname(author_temp)
                    author_temp = f_name, m_name, l_name, suffix
                    author.append(author_temp)
                    author_flag = 1
            # Post-processing: tidy the title, default the document type,
            # and pull the printed first-page number if one is on its own
            # line.
            title = title.strip()
            if doc_type == '':
                doc_type = 'article'
            original_page_number = re.search(r'^([\d]{1,4}) ?$',
                                             page_text[page_number],
                                             re.MULTILINE)
            if original_page_number:
                start_page = original_page_number[1]
        else:
            # Pages 1-2: only consulted for a volume/year that was missing
            # from the front page.
            try:
                page_lines = page_text[page_number].splitlines()
            except IndexError:
                continue
            if 2 < debug < 5:
                print(f'{page_lines}')
            # Clamp to the page length so pages shorter than 10 lines no
            # longer raise IndexError.
            for line in range(0, min(10, len(page_lines))):
                if 1 < debug < 3:
                    print(f'Line: {line}')
                    print(f'{page_lines[line]}')
                if volume == 0:
                    volume_test = re.search(r'(?<=Vol.) +([\dXVILC]{1,3})',
                                            page_lines[line])
                    if volume_test is not None:
                        # Group 1 is the clean value; match [0] would
                        # include the matched run of spaces.
                        volume = volume_test[1]
                if year == '':
                    year_test = re.search(r'^[\d]{4}[\-—–][\d]{4}|^[\d]{4}',
                                          page_lines[line])
                    if year_test is not None:
                        year = year_test[0]
                        if 1 < debug < 3:
                            print(f'Year: {year}, Line: {line}')

    if 0 < debug < 5:
        print(
            f'{volume}, {month}, {year}, {issue_number}, {author}, {title}, {start_page}, {doc_type}'
        )

    return title, volume, start_page, issue_number, month, year, doc_type, author