Example #1
__all__ = ['ElsevierSoup']


def classify_code_type(raw_string):
    """
    A very simple function to detect HTML/XML.
    """
    search_for_words = [
        '</div>',
        '</p>',
    ]
    for word in search_for_words:
        if word not in raw_string:
            return 'XML'
    return 'HTML'
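
# Quick demonstration of the heuristic above: a string counts as HTML only
# when every marker tag is present (illustrative, not part of the pipeline):
#
# >>> classify_code_type('<div>text</div><p>text</p>')
# 'HTML'
# >>> classify_code_type('<ce:para>text</ce:para>')
# 'XML'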


class ElsevierChooseParser(RuleIngredient):
    @staticmethod
    def _parse(raw_string):
        code_type = classify_code_type(raw_string)

        if code_type == 'XML':
            return ElsevierXMLSoup.parse(raw_string)
        elif code_type == 'HTML':
            return ElsevierHTMLSoup.parse(raw_string)


ElsevierSoup = Soup(parser_version=__version__)
ElsevierSoup.add_ingredient(ElsevierChooseParser())
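
# Minimal usage sketch: `parse` is the same entry point that
# `ElsevierChooseParser` calls on the XML/HTML soups above; the file name
# here is hypothetical.
if __name__ == '__main__':
    with open('elsevier_article.html', encoding='utf-8') as f:
        parsed_paper = ElsevierSoup.parse(f.read())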
Example #2
                should_include, sub_sections = iterate_sections(sec['content'])
                sec['content'] = sub_sections
                sec['name'] = sec_name
                return should_include, sec
            elif isinstance(sec, list):
                final_secs = []
                for sub_sec in sec:
                    should_include, sub_sec = iterate_sections(sub_sec)
                    if should_include:
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                # The key heuristic: keep a paragraph only after real
                # content has begun and before it has ended.
                should_include = iterate_status['content_begins'] \
                                 and not iterate_status['content_ends']
                return should_include, sec

        success, sections = iterate_sections(raw_sections)
        obj['Sections'] = sections

        return obj


ElsevierHTMLSoup = Soup(parser_version=__version__)
ElsevierHTMLSoup.add_ingredient(ElsevierRemoveTrash())
ElsevierHTMLSoup.add_ingredient(ElsevierCollect())
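
# The recursion in `iterate_sections` above threads a shared status dict
# through nested sections and keeps only paragraphs between the points
# where real content begins and ends. A self-contained sketch of that
# pattern (names and the BEGIN/END markers are illustrative):
def _keep_between_markers(sections):
    status = {'content_begins': False, 'content_ends': False}

    def walk(sec):
        if isinstance(sec, list):
            kept = []
            for sub in sec:
                include, sub = walk(sub)
                if include:
                    kept.append(sub)
            return len(kept) > 0, kept
        if sec == 'BEGIN':
            status['content_begins'] = True
            return False, sec
        if sec == 'END':
            status['content_ends'] = True
            return False, sec
        # Same shape as the key heuristic above.
        return status['content_begins'] and not status['content_ends'], sec

    return walk(sections)

# >>> _keep_between_markers(['noise', 'BEGIN', 'para 1', 'para 2', 'END'])
# (True, ['para 1', 'para 2'])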
Example #3
                        if not skip:
                            text = parser.format_text(p.text)
                            # Keeping only printable characters can help with
                            # formatting but may also drop legitimate ones:
                            # text = ''.join(filter(lambda x: x in string.printable, text))
                            if text and text[-1] != '.':
                                # Trim any dangling fragment after the last period.
                                index = text.rfind('.')
                                text = text[:index + 1]
                            if text == data[-1]['content'][0]:
                                # Skip a paragraph that was already collected.
                                continue
                            obj = {
                                'type': 'section_h2',
                                'name': '',
                                'content': [text]
                            }
                            data.insert(-1 * index2, obj)
        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keys,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


WileySoup = Soup(parser_version=__version__)
WileySoup.add_ingredient(WileyRemoveTagsSmallSub())
WileySoup.add_ingredient(WileyRemoveTrash())
WileySoup.add_ingredient(WileyCreateTags())
# WileySoup.add_ingredient(WileyCreateTagAbstract())
WileySoup.add_ingredient(WileyReplaceDivTag())
WileySoup.add_ingredient(WileyCollect())
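
# The collector above truncates each paragraph at its last full stop via
# `str.rfind`. The same step as a standalone helper, with a guard for text
# that contains no period at all (the helper name is ours):
def trim_to_last_sentence(text):
    if text and text[-1] != '.':
        index = text.rfind('.')
        if index != -1:
            text = text[:index + 1]
    return text

# >>> trim_to_last_sentence('First sentence. Dangling frag')
# 'First sentence.'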
Example #4
            parser.soup.front.decompose()
            parser.soup.back.decompose()
            body = parser.soup.find_all('p')
            for para in body:
                # Normalize whitespace and fix spacing around punctuation.
                p = re.sub(r'\s+', ' ', para.text.strip())
                p = re.sub(r'\s,\s', ', ', p)
                p = re.sub(r'\s\.\s', '. ', p)
                if len(p) >= 2 and p[-1] == '.' and p[-2] == ' ':
                    p = p[:-2] + '.'
                data.append(
                    parser.create_section(name='',
                                          type_section='section_h2',
                                          content=[p]))

        obj = {
            'DOI': doi[0],
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name[0],
            'Sections': data
        }
        return obj


APSSoup = Soup(parser_version=__version__)
APSSoup.add_ingredient(APSReformat())
APSSoup.add_ingredient(APSRemoveTrash())
# APSSoup.add_ingredient(APSCreateTags())
APSSoup.add_ingredient(APSReplaceSectionTag())
APSSoup.add_ingredient(APSCollect())
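
# The cleanup above, refactored into a reusable helper to show the intent
# of each substitution (sketch; the function name is ours):
import re

def clean_paragraph(raw):
    p = re.sub(r'\s+', ' ', raw.strip())   # collapse runs of whitespace
    p = re.sub(r'\s,\s', ', ', p)          # no space before a comma
    p = re.sub(r'\s\.\s', '. ', p)         # no space before a period
    if len(p) >= 2 and p[-1] == '.' and p[-2] == ' ':
        p = p[:-2] + '.'                   # and none before the final period
    return p

# >>> clean_paragraph('Results  agree ,\n as expected .')
# 'Results agree, as expected.'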
Example #5
                        final_secs.append(sub_sec)

                return len(final_secs) > 0, final_secs
            else:
                return not section_status['should_trim'], sections

        raw_sections = extract_paragraphs_recursive(parser.soup)

        should_include, trimmed_sections = trim_sections(raw_sections)

        # Fix the abstract if the first element is just plain text.
        if len(trimmed_sections) > 1 and \
                isinstance(trimmed_sections[0], str) and \
                isinstance(trimmed_sections[1], dict):
            trimmed_sections[0] = {
                'type': 'section_abstract_heuristics',
                'name': 'Abstract',
                'content': [trimmed_sections[0]],
            }
        obj['Sections'] = trimmed_sections

        return obj


NatureSoup = Soup(parser_version=__version__)
NatureSoup.add_ingredient(NatureRemoveTagsSmallSub())
NatureSoup.add_ingredient(NatureRemoveTrash())
NatureSoup.add_ingredient(NatureCollectMetadata())
NatureSoup.add_ingredient(NatureExtractArticleBody())
NatureSoup.add_ingredient(NatureCollect())
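
# Before/after shape of the abstract fix above (second element is
# illustrative):
#
# before: ['Plain abstract text...',
#          {'type': 'section_h1', 'name': 'Introduction', 'content': [...]}]
# after:  [{'type': 'section_abstract_heuristics',
#           'name': 'Abstract',
#           'content': ['Plain abstract text...']},
#          {'type': 'section_h1', 'name': 'Introduction', 'content': [...]}]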
Example #6
        doi = parser.get(rules=[{'name': 'doi'}])
        parser.deal_with_sections()
        data = parser.data_sections
        parser.create_abstract(rule={'name': 'abstract'})

        obj = {
            'DOI': "".join(doi),
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


IOPSoup1 = Soup(parser_version=__version__)
IOPSoup1.add_ingredient(IOPReformat1())
IOPSoup1.add_ingredient(IOPRemoveTrash1())
IOPSoup1.add_ingredient(IOPCreateTags1())
IOPSoup1.add_ingredient(IOPReplaceSectionTag1())
IOPSoup1.add_ingredient(IOPCollect1())


class IOPRemoveTrash2(RuleIngredient):
    @staticmethod
    def _parse(xml_str):
        # Tags to be removed from the XML paper.
        list_remove = [
            {
                'name': 'ref-list'
            },
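
# Rules like {'name': 'ref-list'} are keyword filters for tag lookup. A
# minimal sketch of applying such a rule list with BeautifulSoup
# (illustrative; not necessarily how LimeSoup applies them internally):
from bs4 import BeautifulSoup

def remove_tags(soup, rules):
    for rule in rules:
        for tag in soup.find_all(**rule):
            tag.decompose()  # drop the tag and everything inside it

# >>> soup = BeautifulSoup('<a><ref-list>refs</ref-list><p>keep</p></a>', 'xml')
# >>> remove_tags(soup, [{'name': 'ref-list'}])
# >>> soup.find('ref-list') is None
# True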
Example #7
        # Collect paragraphs from each top-level section, skipping excluded ones.
        data = list()

        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        for item in parser.soup.find_all('section_h1'):
            for tag in item.find_all(name=re.compile(r'^section_h[1-6]'), recursive=False):
                data.extend(extract_paragraphs_recursive(
                    tag,
                    exclude_section_rules=exclude_sections
                ))

        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keywords,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


RSCSoup = Soup(parser_version=__version__)
RSCSoup.add_ingredient(RSCParseHTML())
RSCSoup.add_ingredient(RSCRemoveTrash())
RSCSoup.add_ingredient(RSCCreateTags())
RSCSoup.add_ingredient(RSCCreateTagAbstract())
RSCSoup.add_ingredient(RSCCollect())
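
# The exclusion patterns above are matched against section names; the
# optional 'e' in `acknowledge?ment` covers both common spellings:
#
# >>> bool(re.match(r'.*?acknowledge?ment.*?', 'Acknowledgements', re.IGNORECASE))
# True
# >>> bool(re.match(r'.*?reference.*?', 'References', re.IGNORECASE))
# True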
Example #8
        obj, parser = parser_obj

        abstract_section = next(x for x in parser.soup.find_all(
            'div', attrs={'class': 'section abstract'}))
        obj['Sections'] = extract_paragraphs_recursive(abstract_section)
        abstract_section.extract()

        return obj, parser


class ECSCollect(RuleIngredient):
    @staticmethod
    def _parse(parser_obj):
        obj, parser = parser_obj

        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        obj['Sections'].extend(
            extract_paragraphs_recursive(
                parser.soup, exclude_section_rules=exclude_sections))
        return obj


ECSSoup = Soup(parser_version=__version__)
ECSSoup.add_ingredient(ECSRemoveTrash())
ECSSoup.add_ingredient(ECSCollectTitleKeywords())
ECSSoup.add_ingredient(ECSCollectAbstract())
ECSSoup.add_ingredient(ECSCollect())
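
# The abstract extraction above relies on bs4's `Tag.extract()`: it removes
# the abstract <div> from the tree, so the later full-document pass in
# `ECSCollect` cannot collect the same paragraphs twice. The bs4 behaviour:
#
# >>> from bs4 import BeautifulSoup
# >>> soup = BeautifulSoup('<div class="section abstract"><p>a</p></div><p>b</p>', 'html.parser')
# >>> _ = soup.find('div', attrs={'class': 'section abstract'}).extract()
# >>> [p.text for p in soup.find_all('p')]
# ['b']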
Example #9
class ElsevierCollect(RuleIngredient):
    @staticmethod
    def _parse(args):
        soup, obj = args

        paragraphs = []

        # Collect the abstract(s) first, then the body sections below.
        for node in soup.find_all('ce:abstract'):
            abstract_paragraph = extract_ce_abstract(node)
            normalized_name = re.sub(r'[^\w]', '', abstract_paragraph['name'])
            if re.match(r'abstracts?', normalized_name, re.IGNORECASE):
                paragraphs.append(abstract_paragraph)

        sections = soup.find('ce:sections')
        if sections is not None:
            for node in find_non_empty_children(sections):
                if node_named(node, 'ce:para'):
                    paragraphs.extend(extract_ce_para(node).split('\n'))
                elif node_named(node, 'ce:section'):
                    paragraphs.append(extract_ce_section(node))

        obj['Sections'] = paragraphs
        return obj


ElsevierXMLSoup = Soup(parser_version=__version__)
ElsevierXMLSoup.add_ingredient(ElsevierParseXML())
ElsevierXMLSoup.add_ingredient(ElsevierReadMetaData())
ElsevierXMLSoup.add_ingredient(ElsevierCollect())
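
# The name normalization above strips non-word characters before testing
# for "abstract"/"abstracts", so spaced or punctuated headings still match:
#
# >>> normalized = re.sub(r'[^\w]', '', 'A b s t r a c t')
# >>> bool(re.match(r'abstracts?', normalized, re.IGNORECASE))
# True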
Example #10
        ])
        # Collect the section data assembled by ParserPaper.
        parser.deal_with_sections()
        data = parser.data_sections

        obj = {
            'DOI': '',
            'Title': parser.title,
            'Keywords': parser.keywords,
            'Journal': ParserPaper.journal_name,
            'Sections': data
        }

        return obj


"""
Error where the paper has paragraphs (content) that is not inside of a tag,
problem to recover these paragraphs. 
"""
SpringerSoup = Soup(parser_version=__version__)
SpringerSoup.add_ingredient(SpringerRemoveTagsSmallSub())
SpringerSoup.add_ingredient(SpringerFindJournalName())
SpringerSoup.add_ingredient(SpringerCreateTagAbstract())
SpringerSoup.add_ingredient(SpringerRemoveTrash())
SpringerSoup.add_ingredient(SpringerCreateTags())
SpringerSoup.add_ingredient(SpringerReplaceDivTagPara())
SpringerSoup.add_ingredient(SpringerCollect())
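
# One bs4-level way to spot such stray, untagged text is to look at a
# node's direct string children (a possible diagnostic, not part of this
# parser):
#
# >>> from bs4 import BeautifulSoup
# >>> soup = BeautifulSoup('<body>loose text<p>tagged</p></body>', 'html.parser')
# >>> [s.strip() for s in soup.body.find_all(string=True, recursive=False) if s.strip()]
# ['loose text']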
Example #11
            if isinstance(sec, str):
                data[i] = {
                    'type': '',
                    'name': '',
                    'content': sec
                }

        def remove_indexes(sections):
            """
            Remove index prefixes such as '1. ', 'II. ', or 'A. ' from
            section headers.
            """
            # An index may be a number, a Roman numeral, or a capital letter.
            indexes_pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')
            for sec in sections:
                if isinstance(sec, dict):
                    sec['name'] = re.sub(indexes_pattern, '', sec['name'])
                    remove_indexes(sec['content'])

        remove_indexes(data)

        obj.update({'Sections': data})

        return obj


AIPSoup = Soup(parser_version=__version__)
AIPSoup.add_ingredient(AIPRemoveTrash())
AIPSoup.add_ingredient(AIPCollectMetadata())
AIPSoup.add_ingredient(AIPCleanArticleBody())
AIPSoup.add_ingredient(AIPCollect())
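
# Effect of `remove_indexes` on typical numbered headers (header strings
# are illustrative):
#
# >>> pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')
# >>> re.sub(pattern, '', '1. Introduction')
# 'Introduction'
# >>> re.sub(pattern, '', 'II. Experimental methods')
# 'Experimental methods'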