Exemple #1
0
def get_sections(sorted_list, temp, final_subs, dict_count):
    """
    Split the document into sections using the section titles only (no
    separate "body" section).

    Walk through every group of every page. A group counts as a section
    title when the title currently being searched for
    (``final_subs[dict_count]``) occurs in its first line or in the text of
    its first three lines. Groups seen before the first title are ignored;
    groups seen after a title are stored under that title (lower-cased and
    stripped of surrounding spaces) in ``temp``.

    Once the last title of ``final_subs`` has been found the search stops and
    every remaining group belongs to that last section. Without this, a later
    repetition of the last title would overwrite the previous section.

    :param sorted_list: document to be processed
    :type sorted_list: list(list(dict))
    :param temp: dictionary that holds the sections found so far; updated in
        place and returned
    :type temp: dict
    :param final_subs: list of section titles (must not be empty)
    :type final_subs: list(str)
    :param dict_count: index in ``final_subs`` of the first title to look for
    :type dict_count: int
    :return: dictionary of final sections
    :rtype: dict
    """
    last_index = len(final_subs) - 1
    collecting = False          # True once the first title has been found
    titles_exhausted = False    # True once the last title has been found
    section_groups = []
    for page in sorted_list:
        for group in page:
            lines = group['lines']
            head_text = ""
            for line in lines[:3]:
                head_text = h.add_string(head_text, line['text'])

            wanted = final_subs[dict_count]
            is_title = not titles_exhausted and (
                lines[0]["text"].find(wanted) > -1
                or head_text.find(wanted) > -1)

            if is_title:
                if collecting:
                    # Close the section opened by the previous title.
                    previous = final_subs[dict_count - 1]
                    temp[previous.lower().strip(" ")] = section_groups
                    section_groups = []
                collecting = True
                if dict_count < last_index:
                    dict_count += 1
                else:
                    titles_exhausted = True
            elif collecting:
                section_groups.append(group)
    temp[final_subs[dict_count].lower().strip(" ")] = section_groups
    return temp
Exemple #2
0
def clean_section_subs(list_groups):
    """
    Return deep copies of the groups that still look like section titles.

    A group is judged on its text: the single line's text when the group has
    exactly one line, otherwise all of its lines joined with
    ``h.add_string``. The input list is not modified.

    :param list_groups: candidate title groups, each with a ``'lines'`` list
    :type list_groups: list(dict)
    :return: copies of the groups that passed every check, in input order
    :rtype: list(dict)
    """

    def _is_rejected(group):
        # Checks run in a fixed order and stop at the first hit.
        lines = group['lines']
        single_line = len(lines) == 1
        if single_line:
            text = lines[0]['text']
        else:
            text = ""
            for line in lines:
                text = h.add_string(text, line['text'])

        if single_line and len(text) > 50:
            return True
        if len(text) < 4:
            return True
        if text.count(".") > 2:
            return True
        if text.count('(') > 0:
            return True
        if h.test_long_subs(text):
            return True
        if text.count("=") > 0:
            return True
        if h.contains_word(text, "hopg"):
            return True
        if text.replace(' ', "").lower() == "absorbance":
            return True
        if text.replace(' ', "").lower() == "langmuir":
            return True
        if not h.string_has_characters(text):
            return True
        return not h.is_title(text)

    duplicates = copy.deepcopy(list_groups)
    return [
        duplicate
        for original, duplicate in zip(list_groups, duplicates)
        if not _is_rejected(original)
    ]
Exemple #3
0
def clean_subs(semi_list):
    """
    Clean the subtitle candidates and return the surviving ones as text.

    First the font types and sizes of short candidates (fewer than 26
    characters) that ``h.contains`` matches against a list of common section
    names are collected. Every candidate is then checked against a series of
    text rules and against ``h.test_fonts`` with the collected fonts;
    candidates for which a rule fires are dropped.

    :param semi_list: subtitle candidates; each entry is either a dict with a
        ``'text'`` key or a sequence whose first two items are such dicts
    :type semi_list: list
    :return: text of the kept candidates (for a sequence entry the two texts
        joined with ``h.add_string``)
    :rtype: list(str)
    """
    known_sections = ['abstract', 'introduction', 'results', 'references', 'conclusion', 'acknowledgements',
                      'experimental', 'methodology']
    duplicates = copy.deepcopy(semi_list)

    # Pass 1: remember which font sizes occur for each font type among the
    # candidates matched against the common section names.
    sizes_by_font = {}
    for entry in semi_list:
        if not isinstance(entry, dict):
            continue
        if h.contains(entry['text'], known_sections) and len(entry['text']) < 26:
            sizes = sizes_by_font.setdefault(h.get_font_type(entry), [])
            size = h.get_font_size(entry)
            if size not in sizes:
                sizes.append(size)

    def _is_rejected(entry):
        if not isinstance(entry, dict):
            return bool(h.test_fonts(entry[0], sizes_by_font))
        text = entry['text']
        return bool(
            len(text) > 50
            or len(text) < 4
            or text.count(".") > 1
            or text.count('(') > 0
            or (text[-1] == "." and h.test_fonts(entry, sizes_by_font))
            or h.test_long_subs(text)
            or text.count("=") > 0
            or h.contains_word(text, "hopg")
            or h.test_fonts(entry, sizes_by_font)
            or text.replace(' ', "").lower() == "absorbance"
            or text.replace(' ', "").lower() == "langmuir"
            or not h.string_has_characters(text)
        )

    # Pass 2: keep the copies of the entries that no rule rejects.
    kept = [
        duplicate
        for original, duplicate in zip(semi_list, duplicates)
        if not _is_rejected(original)
    ]

    # Pass 3: reduce every kept entry to its text.
    return [
        entry['text'] if isinstance(entry, dict)
        else h.add_string(entry[0]['text'], entry[1]['text'])
        for entry in kept
    ]
Exemple #4
0
def re_get_subtitles(dynamic_list, final_subs):
    """
    Collect the text of every group that ``h.check_subs`` accepts for the
    given subtitles and pass the result through ``final_clean_subtitles``.

    :param dynamic_list: pages of the document, each a list of text groups
        with a ``'lines'`` list
    :type dynamic_list: list(list(dict))
    :param final_subs: the subtitles extracted so far
    :type final_subs: list(str)
    :return: whatever ``final_clean_subtitles`` returns for the matched texts
    :rtype: list(str)
    """
    matched_texts = []
    every_group = (group for page in dynamic_list for group in page)
    for group in every_group:
        joined = ""
        for line in group['lines']:
            joined = h.add_string(joined, line['text'])
        if h.check_subs(group, final_subs):
            matched_texts.append(joined)
    return final_clean_subtitles(matched_texts)
def _append_lines(text, lines, prefer_html=False):
    """Append every line of *lines* to *text* with ``h.add_string``.

    With ``prefer_html`` a line's ``'html'`` value is used when the line has
    one; otherwise its ``'text'`` value is used.
    """
    for line in lines:
        if prefer_html and "html" in line:
            text = h.add_string(text, line['html'])
        else:
            text = h.add_string(text, line['text'])
    return text


def _groups_text(groups, prefer_html=False):
    """Join the lines of all *groups* into one string (one running text)."""
    text = ""
    for group in groups:
        text = _append_lines(text, group['lines'], prefer_html)
    return text


def handle_output_type(extracted, output):
    """
    Convert the extracted sections into the requested output format.

    * ``'json'``: a dict with ``'title'`` and ``'authors'`` as text and a
      deep copy of ``extracted['contents']`` in which every list of groups
      is replaced by its joined text. Dict-valued sections get their
      ``'free'`` groups joined and ``'subsections'`` turned into
      ``[title_text, body_text]`` pairs.
    * ``'html'``: one HTML document string. Lines that carry an ``'html'``
      value are rendered with it, all others with their ``'text'`` value.
      List-valued sections are rendered as a heading plus one paragraph;
      dict-valued sections additionally render their subsections.
    * anything else: ``extracted`` is returned unchanged.

    :param extracted: result of the section extraction with the keys
        ``'title'``, ``'authors'`` and ``'contents'``
    :type extracted: dict
    :param output: requested format
    :type output: str
    :return: converted document
    :rtype: dict or str
    """
    if output == 'json':
        product = {
            'title': "",
            'authors': "",
            "contents": deepcopy(extracted['contents'])
        }
        product['title'] = h.get_text_from_group(extracted['title'])
        product['authors'] = h.get_text_from_group(extracted['authors'])
        for key, section in extracted["contents"].items():
            if isinstance(section, list):
                product["contents"][key] = _groups_text(section)
            else:
                rendered = product['contents'][key]
                rendered["free"] = _groups_text(section['free'])
                rendered['subsections'] = [
                    [_append_lines("", sub[0]['lines']), _groups_text(sub[1])]
                    for sub in section['subsections']
                ]
        return product

    if output == "html":
        # NOTE(review): text is inserted without HTML escaping, as before;
        # confirm whether the upstream text can contain markup characters.
        paragraph = "\n<p>\n{}\n</p>"
        parts = [
            "<!DOCTYPE html>\n<html>\n<head>\n<style>\nsup {\nvertical-align: super;\nfont-size: small;\n}\n</style>\n</head>\n<body>"
        ]
        parts.append("\n<h1>\n{}\n</h1>".format(
            _append_lines("", extracted['title']['lines'], True)))
        parts.append(paragraph.format(
            _append_lines("", extracted['authors']['lines'], True)))
        for key, section in extracted["contents"].items():
            is_abstract = str(key) == 'abstract'
            if is_abstract or isinstance(section, list):
                # Flat section: a plain list of groups. Previously only the
                # abstract was handled this way and any other list-valued
                # section raised a TypeError.
                heading = "Abstract" if is_abstract else key
                parts.append("\n<h2>{}</h2>".format(heading))
                parts.append(paragraph.format(_groups_text(section, True)))
            else:
                parts.append("\n<h2>{}</h2>".format(key))
                parts.append(paragraph.format(
                    _groups_text(section['free'], True)))
                for sub in section['subsections']:
                    parts.append("\n<h4>{}</h4>".format(
                        _append_lines("", sub[0]['lines'], True)))
                    parts.append(paragraph.format(_groups_text(sub[1], True)))
        parts.append("\n</body>\n</html>")
        return "".join(parts)

    return extracted
Exemple #6
0
def _groups_after_location(sorted_list, location):
    """Return every group that comes after *location* = (page, group) index."""
    trailing = []
    reached = False
    for page_index, page in enumerate(sorted_list):
        for group_index, group in enumerate(page):
            if (page_index, group_index) == location:
                reached = True
            elif reached:
                trailing.append(group)
    return trailing


def section_extraction(sorted_list, final_subs):
    """
    Given a document and a list of section titles that are sorted correctly
    extract the sections of a document:
    - Find title and authors
    - If ``h.find_title_on_second_page`` says so, drop the first page and
      find title and authors again
    - Remove every section title that equals the authors text
    - Get Abstract and location of abstract
    - If there are fewer than 4 section titles and no introduction among
      them, get sections with body, else get sections without body
    - If no section titles are left, put all groups after the abstract
      location into one section called body

    Both arguments may be modified in place: the first page can be deleted
    from ``sorted_list`` and entries can be removed from ``final_subs``.

    :param sorted_list: document to be processed
    :type sorted_list: list(list(dict))
    :param final_subs: list of section titles
    :type final_subs: list(str)
    :return: Final dictionary of sections with the keys ``'title'``,
        ``'authors'`` and ``'contents'``
    :rtype: dict
    """
    sections = {}

    # Find title and authors (again on the second page if needed).
    title, authors = get_title_and_authors(sorted_list)
    if h.find_title_on_second_page(sorted_list, title):
        del sorted_list[0]
        title, authors = get_title_and_authors(sorted_list)
    sections['title'] = title
    sections['authors'] = authors

    # Drop section titles that are really the authors line. The list is
    # rebuilt in one go: removing items while iterating over the same list
    # skips the element that follows each removal.
    if final_subs:
        authors_text = ""
        for line in authors['lines']:
            authors_text = h.add_string(authors_text, line['text'])
        authors_key = authors_text.replace(" ", "")
        final_subs[:] = [
            sub for sub in final_subs if sub.replace(" ", "") != authors_key
        ]

    dict_count = 0
    if final_subs:
        # Get Abstract
        temp, location = get_abstract(sorted_list, final_subs)
        if temp:
            # Remove the (last) plain "abstract" title from the titles.
            abstract_index = None
            for position, sub in enumerate(final_subs):
                if sub.strip(" ").lower() == 'abstract':
                    abstract_index = position
            if abstract_index is not None:
                del final_subs[abstract_index]

        if not final_subs:
            # The abstract heading was the only title: nothing is left to
            # split on, so everything after the abstract becomes the body.
            temp['body'] = _groups_after_location(sorted_list, location)
            sections['contents'] = temp
        else:
            has_intro = h.find_intro_in_subs(final_subs)
            if has_intro and not h.contains_word(final_subs[dict_count],
                                                 "introduction"):
                dict_count += 1

            if len(final_subs) < 4 and not has_intro:
                # Few titles: keep a body section.
                if location == (0, 0):
                    contents = get_sections_with_body(sorted_list, temp,
                                                      final_subs, dict_count,
                                                      location, authors)
                else:
                    contents = get_sections_with_body(sorted_list, temp,
                                                      final_subs, dict_count,
                                                      location)
                sections['contents'] = contents
                # NOTE(review): 'body' is a list of groups here, so this
                # drops any body with fewer than 500 groups; the threshold
                # looks like it was meant for a character count - confirm.
                try:
                    if len(sections['contents']['body']) < 500:
                        del sections['contents']['body']
                except KeyError:
                    pass
            else:
                # Enough titles: sections without body.
                sections["contents"] = get_sections(sorted_list, temp,
                                                    final_subs, dict_count)
    else:
        temp, location = get_abstract(sorted_list)
        temp['body'] = _groups_after_location(sorted_list, location)
        sections['contents'] = temp

    # TODO: Keyword section handling with new output
    try:
        sections['contents']['keywords'] = sections['contents'][
            'keywords'][:70]
    except KeyError:
        pass
    return sections
Exemple #7
0
def get_title_and_authors(sorted_list):
    """
    Given a document list, find the title group and the authors group.

    Works on a deep copy of the document whose pages are ordered by the top
    coordinate (``bbox[1]``) and then, stably, by descending font size. The
    first group that passes the title checks becomes the title; the first
    later group that passes the author checks is returned with it. Two word
    lists are used to ignore groups.

    :param sorted_list: document to be processed
    :type sorted_list: list(list(dict))
    :return: title and authors. Both are normally group dicts with a
        ``'lines'`` list; when nothing is found placeholder dicts with the
        text ``'no title'`` / ``'no authors'`` are used.
    :rtype: tuple
    """
    ignore_list = [
        "author", "copyright", "journal", "published", "downloaded", "article",
        "licensed", "download", "available", "doi", "ip", "please",
        "institute", "university", "laboratory", "department", "letters",
        "rsc", "research", "iop", "school", "american", "keyword", "abstract",
        'introduction', 'appl', "angewandte", 'nih', 'adv', 'crystengcomm',
        'title'
    ]
    ignore_authors = [
        "communication", "vol.", "no.", "cite", "published", "licensed",
        "article", "factor", "citation", "institute", "paper", "mechanistic",
        "molecule", "issn", 'introduction', 'material', 'experimental', 'nano',
        'vol', 'wiley-vch', 'e-mail', 'supplementary', 'www', 'com',
        'angewandte', 'abstract', 'information', 'keyword', 'manuscript',
        'review', '(dsscs)', 'content', 'please', 'epitaxy', 'title', 'table',
        'fortran', 'zuschriften', 'nanoparticle', 'december', 'germany'
    ]

    def _no_authors():
        # A fresh placeholder for every return.
        return {'lines': [{'text': 'no authors'}]}

    def _top_edge(group):
        return group['bbox'][1]

    def _font_size(group):
        return int(group['size'])

    def _passes_title_checks(group, text, already_found):
        # Checks run in a fixed order and stop at the first failure.
        if h.contains(text, ignore_list):
            return False
        if not group["bbox"][0] < 50:
            return False
        if already_found:
            return False
        if not len(text) > 9:
            return False
        if text.lower().replace(" ", "") == "shortcommunication":
            return False
        if not len(text) < 200:
            return False
        if not group['bbox'][3] > 5:
            return False
        if not group['bbox'][1] > 2:
            return False
        if text.lower() == ' nanoscale':
            return False
        if text.lower() == ' comptes rendus chimie':
            return False
        if text == ' Materials Chemistry':
            return False
        return group['bbox'][3] < 90

    # Largest font first; ties stay ordered from top to bottom.
    ranked_pages = [
        sorted(sorted(page, key=_top_edge), key=_font_size, reverse=True)
        for page in copy.deepcopy(sorted_list)
    ]

    title = ""
    have_title = False
    for page_index, page in enumerate(ranked_pages):
        for group in page:
            text = ""
            for line in group['lines']:
                text = h.add_string(text, line['text'])

            if _passes_title_checks(group, text, have_title):
                # Short candidates must not contain the word 'information'.
                if len(text) >= 25 or not h.contains_word(text, 'information'):
                    title = group
                    have_title = True
            elif h.prove_author(text, ignore_authors) and have_title and \
                    7 < len(text) < 500 and group['bbox'][0] < 75:
                if text.lower().strip(" ") == 'a b s t r a c t':
                    return title, _no_authors()
                return title, group
            elif have_title and page_index > 0 and h.contains_word(
                    text, 'introduction'):
                return title, _no_authors()

    if title:
        return title, _no_authors()
    # NOTE(review): this fallback returns the first group's 'text' value
    # rather than a group dict like every other path - confirm that callers
    # cope with that.
    try:
        return sorted_list[0][0]['text'], _no_authors()
    except IndexError:
        return {'lines': [{'text': 'no title'}]}, _no_authors()
Exemple #8
0
def _strip_abstract_label(text):
    """Rebuild *text* word by word, dropping a leading 'abstract' label."""
    words = text.strip(" ").split(" ")
    label = words[0].replace(" ", "").replace(":", "").replace(
        ".", "").replace(",", "").lower()
    if label == 'abstract':
        del words[0]
    rebuilt = ''
    for word in words:
        rebuilt = h.add_string(rebuilt, word)
    return rebuilt


def get_abstract(sorted_list, final_subs=None):
    """
    Given document and a list of section titles, extract the abstract.

    The abstract starts at the first group whose first line contains
    'abstract'. With section titles, following groups are added while they
    keep the abstract's font and do not contain the next section title; the
    first group that breaks this ends the abstract and its position is
    returned. If the abstract is not closed that way, the first group on the
    first page in column 1 with more than three lines is used, then the
    first such group in column 2 that ``h.check_bold`` or ``h.check_italic``
    accepts.

    The first line of a multi-line abstract group is rewritten in place
    without its leading 'abstract' label.

    :param sorted_list: document to be processed
    :type sorted_list: list(list(dict))
    :param final_subs: list of section titles
    :type final_subs: list(str)
    :return: dictionary that holds the abstract groups under ``'abstract'``
        (empty when none was found) and a ``(page, group)`` location, which
        is ``(0, 0)`` when nothing was found
    :rtype: tuple(dict, tuple(int, int))
    """
    temp = OrderedDict()
    in_abstract = False

    # When the first section title is the abstract heading itself, the
    # abstract text starts in the group after it and the title that ends
    # the abstract is the second one.
    heading_in_subs = bool(final_subs) and \
        final_subs[0].lower().find("abstract") > -1
    next_title_index = 1 if heading_in_subs else 0
    # None when there is no further title (the abstract heading is the only
    # one); indexing final_subs directly would raise IndexError then.
    next_title = None
    if final_subs and next_title_index < len(final_subs):
        next_title = final_subs[next_title_index]

    for page_counter, page in enumerate(sorted_list):
        # Font and collected groups start afresh on every page.
        abstract_font = (0, "type")
        abstract_group = []
        for group_counter, group in enumerate(page):
            first_line_text = group["lines"][0]['text']
            if first_line_text.lower().find('abstract') > -1:
                in_abstract = True
                if not heading_in_subs and len(group['lines']) > 1:
                    # The abstract's reference font is taken from the
                    # group's last line.
                    last_line = group['lines'][-1]
                    abstract_font = (h.get_font_size(last_line),
                                     h.get_font_type(last_line))
                    group['lines'][0]['text'] = _strip_abstract_label(
                        first_line_text)
                    abstract_group.append(group)
            elif in_abstract:
                if heading_in_subs:
                    # First group after the abstract heading.
                    abstract_font = (h.get_font_size(group),
                                     h.get_font_type(group))
                    heading_in_subs = False
                    abstract_group.append(group)
                elif final_subs:
                    hits_next_title = next_title is not None and \
                        first_line_text.find(next_title) > -1
                    if hits_next_title or abstract_font != (
                            h.get_font_size(group), h.get_font_type(group)):
                        temp["abstract"] = abstract_group
                        return temp, (page_counter, group_counter)
                    abstract_group.append(group)

    if len(sorted_list) > 0:
        first_page = sorted_list[0]
        for group_counter, group in enumerate(first_page):
            if group['column'] == 1 and len(group['lines']) > 3:
                temp['abstract'] = [group]
                return temp, (0, group_counter)
        for group_counter, group in enumerate(first_page):
            if group['column'] == 2 and len(group['lines']) > 3 and (
                    h.check_bold(group) or h.check_italic(group)):
                temp['abstract'] = [group]
                return temp, (0, group_counter)
    return temp, (0, 0)
Exemple #9
0
def get_sections_with_body(sorted_list,
                           temp,
                           final_subs,
                           dict_count,
                           location,
                           authors=None):
    """
    Get Sections with body because there are not many section subtitles so a
    body section might be needed.

    Groups are skipped until the start marker is passed. The marker is the
    group at ``location``; when ``location`` is ``(0, 0)`` and ``authors``
    is given, it is instead the first group whose joined text equals the
    authors text. Groups after the marker and before the first section title
    are stored under ``'body'``; groups after a title are stored under that
    title (lower-cased, stripped of surrounding spaces).

    A group is a title when its first line contains the title currently
    searched for (``final_subs[dict_count]``). Once the last title has been
    found the search stops, so a later repetition of that title no longer
    overwrites the previous section.

    :param sorted_list: document to be processed
    :type sorted_list: list(list(dict))
    :param temp: dictionary that holds the sections found so far; updated in
        place and returned
    :type temp: dict
    :param final_subs: list of section titles (must not be empty)
    :type final_subs: list(str)
    :param dict_count: index in ``final_subs`` of the first title to look for
    :type dict_count: int
    :param location: abstract location: page, group
    :type location: tuple(int, int)
    :param authors: group whose text marks the start when ``location`` is
        ``(0, 0)``; when omitted the position ``location`` itself is used
    :type authors: dict
    :return: dictionary of final sections
    :rtype: dict
    """
    page_number = location[0]
    group_number = location[1]
    # Matching on the authors text only makes sense when it was supplied;
    # without it fall back to the plain position check.
    match_authors = page_number == 0 and group_number == 0 and \
        authors is not None
    authors_text = None         # joined lazily on first use

    last_index = len(final_subs) - 1
    past_start = False          # True once the start marker was passed
    in_section = False          # True once the first title was found
    titles_exhausted = False    # True once the last title was found
    collected = []
    for page_index, page in enumerate(sorted_list):
        for group_index, group in enumerate(page):
            if past_start:
                is_title = not titles_exhausted and \
                    group["lines"][0]['text'].find(
                        final_subs[dict_count]) > -1
                if not is_title:
                    collected.append(group)
                    continue
                if in_section:
                    # Close the section opened by the previous title.
                    previous = final_subs[dict_count - 1]
                    temp[previous.lower().strip(" ")] = collected
                else:
                    # Everything before the first title is the body.
                    temp['body'] = collected
                    in_section = True
                collected = []
                if dict_count < last_index:
                    dict_count += 1
                else:
                    titles_exhausted = True
            elif match_authors:
                if authors_text is None:
                    authors_text = ""
                    for line in authors['lines']:
                        authors_text = h.add_string(authors_text,
                                                    line['text'])
                group_text = ""
                for line in group['lines']:
                    group_text = h.add_string(group_text, line['text'])
                if group_text == authors_text:
                    past_start = True
            elif page_index == page_number and group_index == group_number:
                past_start = True
    temp[final_subs[dict_count].lower().strip(" ")] = collected
    return temp