Example #1
    def _parse(parser_obj):
        obj, parser = parser_obj

        abstract_section = next(x for x in parser.soup.find_all(
            'div', attrs={'class': 'section abstract'}))
        obj['Sections'] = extract_paragraphs_recursive(abstract_section)
        abstract_section.extract()

        return obj, parser
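
Each of these rules receives an `(obj, parser)` tuple and returns it with the extracted sections attached. For orientation, here is a sketch of the nested structure `extract_paragraphs_recursive` is assumed to produce; the 'type'/'name'/'content' keys are taken from the other examples below, while the concrete values are invented:

    # Illustration only: shape inferred from the keys used in these examples,
    # the values are made up.
    example_obj = {
        'Sections': [
            {
                'type': 'section_h2',
                'name': 'Abstract',
                'content': [
                    'First paragraph of the abstract ...',
                    {
                        'type': 'section_h3',
                        'name': 'Background',
                        'content': ['Paragraph of a nested subsection ...'],
                    },
                ],
            },
        ],
    }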
Example #2
    def _parse(parser_obj):
        obj, parser = parser_obj

        # Parse abstract
        abstract_body = parser.soup.find(**{'name': 'div', 'class': 'hlFld-Abstract'})
        abstract = extract_paragraphs_recursive(abstract_body)

        for each in abstract:
            each['type'] = 'abstract'
        
        # Full text
        full_text_body = parser.soup.find(**{'name': 'div', 'class': 'hlFld-Fulltext'})
        if full_text_body is not None:
            full_text = extract_paragraphs_recursive(full_text_body)
        else:
            full_text = []

        # remove indexes
        data = abstract + list(full_text)
        for i, sec in enumerate(data):
            # for sections that have no title
            if isinstance(sec, str):
                data[i] = {
                    'type': '',
                    'name': '',
                    'content': sec
                }

        def remove_indexes(sections):
            """
            remove indexes in section header
            """
            # strip leading numbering such as "1." or "IV."
            # (Arabic numerals, Roman numerals, and letters)
            indexes_pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')
            for sec in sections:
                if isinstance(sec, dict):
                    sec['name'] = re.sub(indexes_pattern, '', sec['name'])
                    remove_indexes(sec['content'])

        remove_indexes(data)

        obj.update({'Sections': data})

        return obj
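
The index-stripping pattern removes leading numbering such as "1." or "IV." from section names. A standalone check with invented header strings:

    import re

    # Same pattern as in _parse above; the header strings are invented.
    indexes_pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')

    for name in ['1. Introduction', 'IV. Results', 'Discussion']:
        print(re.sub(indexes_pattern, '', name))
    # -> Introduction, Results, Discussion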
Example #3
    def _parse(parser):
        """
        Collect metadata and sections from cleaned-up Paper structure.

        :type parser: LimeSoup.parser.parser_paper.ParserPaper
        :return:
        """
        # Collect information from the paper using ParserPaper
        keywords = parser.get_keywords(rules=[{'name': 'li', 'class': 'kwd'}])

        doi = parser.extract_first_meta('DC.Identifier')
        if doi is None:
            a_element = next(
                x for x in parser.soup.find_all('a', attrs={'title': 'Link to landing page via DOI'})
            )
            doi_text = a_element.get_text().strip()
            if len(doi_text) > 0:
                doi = doi_text

        journal_name = parser.extract_first_meta('citation_journal_title')
        if journal_name is None:
            a_element = next(
                x for x in parser.soup.find_all('a', attrs={'title': 'Link to journal home page'})
            )
            journal_text = a_element.get_text().strip()
            if len(journal_text) > 0:
                journal_name = journal_text

        title_element = next(
            x for x in parser.soup.find_all(attrs={'class': 'title_heading'})
        )
        title = get_tag_text(title_element).strip('*†‡§‖¶')
        # title = parser.extract_first_meta('citation_title')

        # Create tag from selection function in ParserPaper
        data = list()

        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        for item in parser.soup.find_all('section_h1'):
            for tag in item.find_all(**{'name': re.compile('^section_h[1-6]'), 'recursive': False}):
                data.extend(extract_paragraphs_recursive(
                    tag,
                    exclude_section_rules=exclude_sections
                ))

        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keywords,
            'Journal': journal_name,
            'Sections': data
        }
        return obj
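
The exclusion rules are plain compiled regexes; how LimeSoup applies them inside `extract_paragraphs_recursive` is not shown here, but matching them against a few invented section names illustrates what gets dropped:

    import re

    exclude_sections = [
        re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
        re.compile(r'.*?reference.*?', re.IGNORECASE),
    ]

    for name in ['Acknowledgements', 'References', 'Results and discussion']:
        dropped = any(rule.match(name) for rule in exclude_sections)
        print(name, '->', 'dropped' if dropped else 'kept')
    # Acknowledgements -> dropped, References -> dropped,
    # Results and discussion -> kept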
Example #4
    def _parse(parser_obj):
        obj, parser = parser_obj

        ending_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
            re.compile(r'.*?author\s*information.*?', re.IGNORECASE),
            re.compile(r'.*?related\s*links.*?', re.IGNORECASE),
            re.compile(r'.*?about\s*this\s*article.*?', re.IGNORECASE),
        ]

        # A mutable dict rather than a bare flag, so the nested function can
        # flip it across recursive calls without `nonlocal`.
        section_status = {'should_trim': False}

        def trim_sections(sections):
            """
            Remove anything after "ending_sections"
            """
            if isinstance(sections, dict):
                for rule in ending_sections:
                    if not section_status['should_trim']:
                        if rule.match(sections['name']):
                            section_status['should_trim'] = True
                            break

                should_include, secs = trim_sections(sections['content'])
                sections['content'] = secs

                return should_include, sections
            elif isinstance(sections, list):
                final_secs = []
                for sub_sec in sections:
                    should_include, sub_sec = trim_sections(sub_sec)
                    if should_include:
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                return not section_status['should_trim'], sections

        raw_sections = extract_paragraphs_recursive(parser.soup)

        should_include, trimmed_sections = trim_sections(raw_sections)

        # Fix abstract, if the first element is just a plain text.
        if len(trimmed_sections) > 1 and \
                isinstance(trimmed_sections[0], str) and \
                isinstance(trimmed_sections[1], dict):
            trimmed_sections[0] = {
                'type': 'section_abstract_heuristics',
                'name': 'Abstract',
                'content': [trimmed_sections[0]],
            }
        obj['Sections'] = trimmed_sections

        return obj
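
`trim_sections` is sticky: once any heading matches one of the `ending_sections` patterns, everything that follows is dropped, matching or not. A simplified flat illustration with invented section names:

    import re

    ending_sections = [
        re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
        re.compile(r'.*?reference.*?', re.IGNORECASE),
    ]

    names = ['Introduction', 'Results', 'Acknowledgements', 'Supplementary data']
    should_trim = False
    kept = []
    for name in names:
        if not should_trim and any(rule.match(name) for rule in ending_sections):
            should_trim = True
        if not should_trim:
            kept.append(name)

    print(kept)  # -> ['Introduction', 'Results']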
Example #5
    def _parse(parser_obj):
        obj, parser = parser_obj

        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        obj['Sections'].extend(
            extract_paragraphs_recursive(
                parser.soup, exclude_section_rules=exclude_sections))
        return obj
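
Because this rule extends `obj['Sections']` instead of replacing it, whatever an earlier rule attached (an abstract, say) stays in front of the newly extracted body sections. A small illustration with made-up section dicts:

    # Made-up data: one section attached by an earlier rule, plus new ones.
    obj = {'Sections': [{'type': 'abstract', 'name': '', 'content': ['...']}]}
    new_sections = [
        {'type': 'section_h2', 'name': 'Introduction', 'content': ['...']},
    ]
    obj['Sections'].extend(new_sections)
    print([sec['type'] for sec in obj['Sections']])
    # -> ['abstract', 'section_h2']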
Example #6
    def _parse(soup):
        obj = {
            'Journal': None,
            'DOI': None,
            'Title': None,
            'Keywords': None,
            'Sections': None
        }

        h1_tag = soup.find('h1')
        if h1_tag is not None:
            obj['Title'] = get_tag_text(h1_tag)
            h1_tag.extract()

        raw_sections = extract_paragraphs_recursive(soup)

        iterate_status = {'content_begins': False, 'content_ends': False}

        def iterate_sections(sec):
            """
            Use simple heuristics to remove garbage
            before abstract and after conclusions.
            :param sec:
            :return:
            """
            if isinstance(sec, dict):
                sec_name = re.sub(r'^[0-9.\s]+', '', sec['name'])

                if re.match(r'keywords?', sec_name, re.IGNORECASE) and \
                        all(isinstance(x, str) for x in sec['content']):
                    obj['Keywords'] = [x.strip(';') for x in sec['content']]
                    return False, sec

                if not iterate_status['content_begins']:
                    if re.match(r'.*?abstract.*?', sec_name, re.IGNORECASE) or \
                            (len(sec['content']) > 0 and
                             isinstance(sec['content'][0], str) and  # Typical abstract has more than 100 words
                             sec['content'][0].count(' ') > 100):
                        iterate_status['content_begins'] = True

                if not iterate_status['content_ends']:
                    if re.match(r'.*?(?:acknowledge?ment|reference).*?',
                                sec_name, re.IGNORECASE):
                        iterate_status['content_ends'] = True

                should_include, sub_sections = iterate_sections(sec['content'])
                sec['content'] = sub_sections
                sec['name'] = sec_name
                return should_include, sec
            elif isinstance(sec, list):
                final_secs = []
                for sub_sec in sec:
                    should_include, sub_sec = iterate_sections(sub_sec)
                    if should_include:
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                # This is the key heuristics
                should_include = iterate_status['content_begins'] \
                                 and not iterate_status['content_ends']
                if should_include:
                    return True, sec
                else:
                    return False, sec

        success, sections = iterate_sections(raw_sections)
        obj['Sections'] = sections

        return obj
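
Two of the heuristics above are easy to check in isolation: the keyword branch strips trailing semicolons from a list of plain strings, and a first paragraph with more than 100 spaces (roughly 100 words) is taken as the start of the abstract. Both shown on invented data:

    # Invented sample data for illustration.
    keyword_content = ['perovskite;', 'thin film;', 'solar cells']
    print([x.strip(';') for x in keyword_content])
    # -> ['perovskite', 'thin film', 'solar cells']

    first_paragraph = 'word ' * 120
    print(first_paragraph.count(' ') > 100)  # -> True: treated as abstract text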