__all__ = ['ElsevierSoup']


def classify_code_type(raw_string):
    """
    A very simple heuristic to tell HTML from XML: treat the document as
    HTML only if it contains the closing tags common to Elsevier's HTML
    pages; otherwise assume it is XML.
    """
    search_for_words = [
        '</div>',
        '</p>',
    ]
    for word in search_for_words:
        if word not in raw_string:
            return 'XML'
    return 'HTML'


class ElsevierChooseParser(RuleIngredient):
    @staticmethod
    def _parse(raw_string):
        code_type = classify_code_type(raw_string)
        if code_type == 'XML':
            return ElsevierXMLSoup.parse(raw_string)
        elif code_type == 'HTML':
            return ElsevierHTMLSoup.parse(raw_string)


ElsevierSoup = Soup(parser_version=__version__)
ElsevierSoup.add_ingredient(ElsevierChooseParser())
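# A quick sanity check of classify_code_type (the snippets below are made
# up for illustration): a document counts as HTML only when both closing
# tags are present, so Elsevier's namespaced XML falls through to 'XML'.
if __name__ == '__main__':
    print(classify_code_type('<div><p>Hello</p></div>'))   # -> 'HTML'
    print(classify_code_type('<ce:para>Hello</ce:para>'))  # -> 'XML'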
                should_include, sub_sections = iterate_sections(sec['content'])
                sec['content'] = sub_sections
                sec['name'] = sec_name
                return should_include, sec
            elif isinstance(sec, list):
                final_secs = []
                for sub_sec in sec:
                    should_include, sub_sec = iterate_sections(sub_sec)
                    if should_include:
                        final_secs.append(sub_sec)
                return len(final_secs) > 0, final_secs
            else:
                # This is the key heuristic: keep a leaf paragraph only
                # after the main content has begun and before it has ended.
                should_include = iterate_status['content_begins'] \
                    and not iterate_status['content_ends']
                if should_include:
                    return True, sec
                else:
                    return False, sec

        success, sections = iterate_sections(raw_sections)
        obj['Sections'] = sections
        return obj


ElsevierHTMLSoup = Soup(parser_version=__version__)
ElsevierHTMLSoup.add_ingredient(ElsevierRemoveTrash())
ElsevierHTMLSoup.add_ingredient(ElsevierCollect())
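# For reference, the trimmer above walks a nested structure of roughly
# this shape (a hand-written illustration, not real parser output):
#
#     [
#         'a loose paragraph',                          # str leaf
#         {'type': 'section_h2', 'name': 'Results',
#          'content': ['paragraph 1', 'paragraph 2']},  # dict node
#     ]
#
# String leaves survive only while 'content_begins' is set and
# 'content_ends' is not, i.e. between the first real section and the
# trailing references/acknowledgement material.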
            if not skip:
                text = parser.format_text(p.text)
                # text = ''.join(filter(lambda x: x in string.printable, text))
                # (can be useful for formatting, but may strip characters)
                if text[-1] != '.':
                    index = text.rfind('.')
                    text = text[:index + 1]
                if text == data[-1]['content'][0]:
                    continue
                obj = {
                    'type': 'section_h2',
                    'name': '',
                    'content': [text]
                }
                data.insert(-1 * index2, obj)

        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keys,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


WileySoup = Soup(parser_version=__version__)
WileySoup.add_ingredient(WileyRemoveTagsSmallSub())
WileySoup.add_ingredient(WileyRemoveTrash())
WileySoup.add_ingredient(WileyCreateTags())
# WileySoup.add_ingredient(WileyCreateTagAbstract())
WileySoup.add_ingredient(WileyReplaceDivTag())
WileySoup.add_ingredient(WileyCollect())
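# The sentence-trimming step above in isolation (sample text is made up):
# a paragraph that does not end with a period is cut back to its last
# complete sentence via rfind.
if __name__ == '__main__':
    text = 'A complete sentence. A dangling fragm'
    if text[-1] != '.':
        text = text[:text.rfind('.') + 1]
    print(text)  # -> 'A complete sentence.'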
        parser.soup.front.decompose()
        parser.soup.back.decompose()
        body = parser.soup.find_all('p')
        for paras in body:
            # Collapse whitespace runs, then re-attach commas and periods
            # that were left floating between spaces.
            p = re.sub(r'\n*\s+\n*', ' ', paras.text.strip())
            p = re.sub(r'\s,\s', ', ', p)
            p = re.sub(r'\s\.\s', '. ', p)
            if p[-1] == '.' and p[-2] == ' ':
                p = p[:-2] + '.'
            data.append(
                parser.create_section(
                    name='',
                    type_section='section_h2',
                    content=[p]))

        obj = {
            'DOI': doi[0],
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name[0],
            'Sections': data
        }
        return obj


APSSoup = Soup(parser_version=__version__)
APSSoup.add_ingredient(APSReformat())
APSSoup.add_ingredient(APSRemoveTrash())
# APSSoup.add_ingredient(APSCreateTags())
APSSoup.add_ingredient(APSReplaceSectionTag())
APSSoup.add_ingredient(APSCollect())
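# The whitespace normalization above in isolation (sample text is made
# up): collapse whitespace runs, then re-attach commas and periods left
# floating between spaces.
if __name__ == '__main__':
    import re
    p = re.sub(r'\n*\s+\n*', ' ', 'Films ,\n grown at 300 K . Done.'.strip())
    p = re.sub(r'\s,\s', ', ', p)
    p = re.sub(r'\s\.\s', '. ', p)
    print(p)  # -> 'Films, grown at 300 K. Done.'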
                        final_secs.append(sub_sec)
                return len(final_secs) > 0, final_secs
            else:
                return not section_status['should_trim'], sections

        raw_sections = extract_paragraphs_recursive(parser.soup)
        should_include, trimmed_sections = trim_sections(raw_sections)

        # Fix the abstract when the first element is just a plain string.
        if len(trimmed_sections) > 1 and \
                isinstance(trimmed_sections[0], str) and \
                isinstance(trimmed_sections[1], dict):
            trimmed_sections[0] = {
                'type': 'section_abstract_heuristics',
                'name': 'Abstract',
                'content': [trimmed_sections[0]],
            }

        obj['Sections'] = trimmed_sections
        return obj


NatureSoup = Soup(parser_version=__version__)
NatureSoup.add_ingredient(NatureRemoveTagsSmallSub())
NatureSoup.add_ingredient(NatureRemoveTrash())
NatureSoup.add_ingredient(NatureCollectMetadata())
NatureSoup.add_ingredient(NatureExtractArticleBody())
NatureSoup.add_ingredient(NatureCollect())
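# The abstract fix above promotes a bare leading string into a section
# dict (hand-written before/after, not real parser output):
#
#     ['In this work ...', {'type': 'section_h2', ...}]
#
# becomes
#
#     [{'type': 'section_abstract_heuristics', 'name': 'Abstract',
#       'content': ['In this work ...']},
#      {'type': 'section_h2', ...}]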
        doi = parser.get(rules=[{'name': 'doi'}])
        parser.deal_with_sections()
        data = parser.data_sections
        parser.create_abstract(rule={'name': 'abstract'})
        obj = {
            'DOI': ''.join(doi),
            'Keywords': [],
            'Title': parser.title,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


IOPSoup1 = Soup(parser_version=__version__)
IOPSoup1.add_ingredient(IOPReformat1())
IOPSoup1.add_ingredient(IOPRemoveTrash1())
IOPSoup1.add_ingredient(IOPCreateTags1())
IOPSoup1.add_ingredient(IOPReplaceSectionTag1())
IOPSoup1.add_ingredient(IOPCollect1())


class IOPRemoveTrash2(RuleIngredient):
    @staticmethod
    def _parse(xml_str):
        # Tags to be removed from the XML paper.
        list_remove = [
            {
                'name': 'ref-list'
            },
        # Create tags from the selection function in ParserPaper.
        data = list()
        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        for item in parser.soup.find_all('section_h1'):
            for tag in item.find_all(name=re.compile(r'^section_h[1-6]'),
                                     recursive=False):
                data.extend(extract_paragraphs_recursive(
                    tag,
                    exclude_section_rules=exclude_sections
                ))

        obj = {
            'DOI': doi,
            'Title': title,
            'Keywords': keywords,
            'Journal': journal_name,
            'Sections': data
        }
        return obj


RSCSoup = Soup(parser_version=__version__)
RSCSoup.add_ingredient(RSCParseHTML())
RSCSoup.add_ingredient(RSCRemoveTrash())
RSCSoup.add_ingredient(RSCCreateTags())
RSCSoup.add_ingredient(RSCCreateTagAbstract())
RSCSoup.add_ingredient(RSCCollect())
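# The exclusion rules above in isolation (section names are made up):
# 'acknowledge?ment' covers both the American and British spellings.
if __name__ == '__main__':
    import re
    rules = [
        re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
        re.compile(r'.*?reference.*?', re.IGNORECASE),
    ]
    for name in ('Acknowledgments', 'Acknowledgements', 'References', 'Results'):
        print(name, any(r.match(name) for r in rules))
    # Only 'Results' prints False, i.e. only it is kept.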
        obj, parser = parser_obj
        abstract_section = next(x for x in parser.soup.find_all(
            'div', attrs={'class': 'section abstract'}))
        obj['Sections'] = extract_paragraphs_recursive(abstract_section)
        abstract_section.extract()
        return obj, parser


class ECSCollect(RuleIngredient):
    @staticmethod
    def _parse(parser_obj):
        obj, parser = parser_obj
        exclude_sections = [
            re.compile(r'.*?acknowledge?ment.*?', re.IGNORECASE),
            re.compile(r'.*?reference.*?', re.IGNORECASE),
        ]
        obj['Sections'].extend(
            extract_paragraphs_recursive(
                parser.soup,
                exclude_section_rules=exclude_sections))
        return obj


ECSSoup = Soup(parser_version=__version__)
ECSSoup.add_ingredient(ECSRemoveTrash())
ECSSoup.add_ingredient(ECSCollectTitleKeywords())
ECSSoup.add_ingredient(ECSCollectAbstract())
ECSSoup.add_ingredient(ECSCollect())
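# The abstract lookup above on a minimal document (the HTML is made up):
# BeautifulSoup matches the exact multi-class string 'section abstract',
# and next() raises StopIteration if no such div exists.
if __name__ == '__main__':
    from bs4 import BeautifulSoup
    demo = BeautifulSoup(
        '<div class="section abstract"><p>We report ...</p></div>',
        'html.parser')
    div = next(x for x in demo.find_all(
        'div', attrs={'class': 'section abstract'}))
    print(div.p.text)  # -> 'We report ...'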
class ElsevierCollect(RuleIngredient):
    @staticmethod
    def _parse(args):
        soup, obj = args

        paragraphs = []

        # Collect abstract sections whose normalized name reads 'abstract(s)'.
        for node in soup.find_all('ce:abstract'):
            abstract_paragraph = extract_ce_abstract(node)
            normalized_name = re.sub(r'[^\w]', '', abstract_paragraph['name'])
            if re.match(r'abstracts?', normalized_name, re.IGNORECASE):
                paragraphs.append(abstract_paragraph)

        # Walk the top-level children of <ce:sections>.
        sections = soup.find('ce:sections')
        if sections is not None:
            for node in find_non_empty_children(sections):
                if node_named(node, 'ce:para'):
                    paragraphs.extend(extract_ce_para(node).split('\n'))
                elif node_named(node, 'ce:section'):
                    paragraphs.append(extract_ce_section(node))

        obj['Sections'] = paragraphs
        return obj


ElsevierXMLSoup = Soup(parser_version=__version__)
ElsevierXMLSoup.add_ingredient(ElsevierParseXML())
ElsevierXMLSoup.add_ingredient(ElsevierReadMetaData())
ElsevierXMLSoup.add_ingredient(ElsevierCollect())
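# The abstract-name normalization above in isolation (names are made up):
# non-word characters are stripped before the anchored, case-insensitive
# 'abstracts?' match.
if __name__ == '__main__':
    import re
    for name in ('Abstract', 'A b s t r a c t:', 'Graphical abstract'):
        normalized = re.sub(r'[^\w]', '', name)
        print(name, bool(re.match(r'abstracts?', normalized, re.IGNORECASE)))
    # 'Graphical abstract' fails because re.match anchors at the start.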
        ])
        # Create tags from the selection function in ParserPaper.
        parser.deal_with_sections()
        data = parser.data_sections
        obj = {
            'DOI': '',
            'Title': parser.title,
            'Keywords': parser.keywords,
            'Journal': ParserPaper.journal_name,
            'Sections': data
        }
        return obj


"""
Known issue: when a paper has paragraph content that is not wrapped in any
tag, those paragraphs cannot be recovered.
"""

SpringerSoup = Soup(parser_version=__version__)
SpringerSoup.add_ingredient(SpringerRemoveTagsSmallSub())
SpringerSoup.add_ingredient(SpringerFindJournalName())
SpringerSoup.add_ingredient(SpringerCreateTagAbstract())
SpringerSoup.add_ingredient(SpringerRemoveTrash())
SpringerSoup.add_ingredient(SpringerCreateTags())
SpringerSoup.add_ingredient(SpringerReplaceDivTagPara())
SpringerSoup.add_ingredient(SpringerCollect())
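# The limitation noted above, demonstrated on a made-up snippet: text that
# sits directly under a tag, without a tag of its own, is invisible to
# tag-based selection.
if __name__ == '__main__':
    from bs4 import BeautifulSoup
    demo = BeautifulSoup(
        '<div><p>tagged paragraph</p>untagged text</div>', 'html.parser')
    print([p.text for p in demo.find_all('p')])  # ['tagged paragraph']
    # 'untagged text' is a bare NavigableString and is never returned.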
            if isinstance(sec, str):
                data[i] = {
                    'type': '',
                    'name': '',
                    'content': sec
                }

        def remove_indexes(sections):
            """Remove numbering indexes from section headers."""
            # Matches a leading index token (Arabic numbers, Roman
            # numerals, or capital letters) followed by a dot or space.
            indexes_pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')
            for sec in sections:
                if isinstance(sec, dict):
                    sec['name'] = re.sub(indexes_pattern, '', sec['name'])
                    remove_indexes(sec['content'])

        remove_indexes(data)
        obj.update({'Sections': data})
        return obj


AIPSoup = Soup(parser_version=__version__)
AIPSoup.add_ingredient(AIPRemoveTrash())
AIPSoup.add_ingredient(AIPCollectMetadata())
AIPSoup.add_ingredient(AIPCleanArticleBody())
AIPSoup.add_ingredient(AIPCollect())
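# The header-index pattern above in isolation (header strings are made
# up): a leading alphanumeric token followed by a dot or space plus
# further whitespace is stripped.
if __name__ == '__main__':
    import re
    pattern = re.compile(r'^([A-Za-z0-9]+)(\.|\s)(\s)+')
    for name in ('1.  Introduction', 'II.  Methods', 'B   Results'):
        print(re.sub(pattern, '', name))
    # -> 'Introduction', 'Methods', 'Results'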