Beispiel #1
0
                should_include, sub_sections = iterate_sections(sec['content'])
                sec['content'] = sub_sections
                sec['name'] = sec_name
                return should_include, sec
            elif isinstance(sec, list):
                final_secs = []
                for sub_sec in sec:
                    should_include, sub_sec = iterate_sections(sub_sec)
                    if should_include:
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                # This is the key heuristics
                should_include = iterate_status['content_begins'] \
                                 and not iterate_status['content_ends']
                if should_include:
                    return True, sec
                else:
                    return False, sec

        success, sections = iterate_sections(raw_sections)
        obj['Sections'] = sections

        return obj


# Elsevier HTML pipeline: strip unwanted markup, then collect the article
# content into the output object.
ElsevierHTMLSoup = Soup(parser_version=__version__)
for _ingredient in (ElsevierRemoveTrash(), ElsevierCollect()):
    ElsevierHTMLSoup.add_ingredient(_ingredient)
Beispiel #2
0
            parser.soup.front.decompose()
            parser.soup.back.decompose()
            body = parser.soup.find_all('p')
            for paras in body:
                p = re.sub('\n*\s+\n*', ' ', paras.text.strip())
                p = re.sub('\s,\s', ', ', p)
                p = re.sub('\s.\s', '. ', p)
                if p[-1] == '.' and p[-2] == ' ':
                    p = p[:-2] + '.'
                data.append(
                    parser.create_section(name='',
                                          type_section='section_h2',
                                          content=[p]))

        obj = {
            'DOI': doi[0],
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name[0],
            'Sections': data
        }
        return obj


# APS pipeline: reformat the raw markup, drop trash tags, normalize section
# tags, then collect the article content into the output object.
APSSoup = Soup(parser_version=__version__)
for _ingredient in (
        APSReformat(),
        APSRemoveTrash(),
        # APSCreateTags() is intentionally disabled for this pipeline.
        APSReplaceSectionTag(),
        APSCollect(),
):
    APSSoup.add_ingredient(_ingredient)
Beispiel #3
0
                        if not skip:
                            text = parser.format_text(p.text)
                            # text = ''.join(filter(lambda x: x in string.printable, text)) Can be useful for formating but can remove characters
                            if text[-1] != '.':
                                index = text.rfind('.')
                                text = text[:index + 1]
                            if text == data[-1]['content'][0]:
                                continue
                            obj = {
                                'type': 'section_h2',
                                'name': '',
                                'content': [text]
                            }
                            data.insert(-1 * index2, obj)
        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keys,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


# Wiley pipeline: clean small/sub tags and trash, create section tags,
# replace div wrappers, then collect the article content.
WileySoup = Soup(parser_version=__version__)
for _ingredient in (
        WileyRemoveTagsSmallSub(),
        WileyRemoveTrash(),
        WileyCreateTags(),
        # WileyCreateTagAbstract() is intentionally disabled for this pipeline.
        WileyReplaceDivTag(),
        WileyCollect(),
):
    WileySoup.add_ingredient(_ingredient)
Beispiel #4
0
        doi = parser.get(rules=[{'name': 'doi'}])
        parser.deal_with_sections()
        data = parser.data_sections
        parser.create_abstract(rule={'name': 'abstract'})

        obj = {
            'DOI': "".join(doi),
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


# IOP pipeline (variant 1): reformat, remove trash tags, create and normalize
# section tags, then collect the article content.
IOPSoup1 = Soup(parser_version=__version__)
for _ingredient in (
        IOPReformat1(),
        IOPRemoveTrash1(),
        IOPCreateTags1(),
        IOPReplaceSectionTag1(),
        IOPCollect1(),
):
    IOPSoup1.add_ingredient(_ingredient)


class IOPRemoveTrash2(RuleIngredient):
    @staticmethod
    def _parse(xml_str):
        # Tags to be removed from the xml paper
        list_remove = [
            {
                'name': 'ref-list'
            },
Beispiel #5
0
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                return not section_status['should_trim'], sections

        raw_sections = extract_paragraphs_recursive(parser.soup)

        should_include, trimmed_sections = trim_sections(raw_sections)

        # Fix abstract, if the first element is just a plain text.
        if len(trimmed_sections) > 1 and \
                isinstance(trimmed_sections[0], str) and \
                isinstance(trimmed_sections[1], dict):
            trimmed_sections[0] = {
                'type': 'section_abstract_heuristics',
                'name': 'Abstract',
                'content': [trimmed_sections[0]],
            }
        obj['Sections'] = trimmed_sections

        return obj


# Nature pipeline: clean small/sub tags and trash, gather metadata, extract
# the article body, then collect the article content.
NatureSoup = Soup(parser_version=__version__)
for _ingredient in (
        NatureRemoveTagsSmallSub(),
        NatureRemoveTrash(),
        NatureCollectMetadata(),
        NatureExtractArticleBody(),
        NatureCollect(),
):
    NatureSoup.add_ingredient(_ingredient)
Beispiel #6
0
        ])
        # Create tag from selection function in ParserPaper
        data = list()

        parser.deal_with_sections()
        data = parser.data_sections

        obj = {
            'DOI': '',
            'Title': parser.title,
            'Keywords': parser.keywords,
            'Journal': ParserPaper.journal_name,
            'Sections': data
        }

        return obj


"""
Error where the paper has paragraphs (content) that is not inside of a tag,
problem to recover these paragraphs. 
"""
# Springer pipeline: clean small/sub tags, resolve the journal name, build the
# abstract tag, drop trash tags, create section tags, replace div/paragraph
# wrappers, then collect the article content.
SpringerSoup = Soup(parser_version=__version__)
for _ingredient in (
        SpringerRemoveTagsSmallSub(),
        SpringerFindJournalName(),
        SpringerCreateTagAbstract(),
        SpringerRemoveTrash(),
        SpringerCreateTags(),
        SpringerReplaceDivTagPara(),
        SpringerCollect(),
):
    SpringerSoup.add_ingredient(_ingredient)