def write(cls, record, fp=sys.stdout):
    for field in cls.fieldDict:
        content = record.get(field)
        if field == 'bibcode' and content is None:
            continue
        elif not content:
            continue
        d = cls.fieldDict.get(field)
        fmt = d.get('fmt')
        if fmt:
            content = fmt(content)
        jc = d.get('join', '')
        if isinstance(content, list):
            content = jc.join(content)
        elif isinstance(content, dict):
            content = jc.join(
                [u"{0}: {1}".format(k, v) for k, v in content.items()])
        try:
            fp.write('%{0} {1}\n'.format(d.get('tag'), named_entities(content)))
        except Exception:
            logging.error(
                "error writing content for tag {0}: {1}\n".format(
                    d.get('tag'), named_entities(content)))
            raise
    fp.write('\n')
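# A minimal sketch of the class attributes write() expects, plus a sample
# call. The 'tag', 'fmt', and 'join' keys come from the lookups above; the
# class name, field names, and record values here are hypothetical.
import sys
import logging
from namedentities import named_entities

class TaggedWriter(object):
    fieldDict = {
        'bibcode': {'tag': 'R'},
        'authors': {'tag': 'A', 'join': '; '},
        'title':   {'tag': 'T'},
    }
    write = classmethod(write)  # reuse the function defined above

TaggedWriter.write({'bibcode': '2020Test.....1A',
                    'authors': ['Smith, J.', 'Jones, B.'],
                    'title': u'An em\u2014dash'})
# expected output:
# %R 2020Test.....1A
# %A Smith, J.; Jones, B.
# %T An em&mdash;dash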
def transform_inline_css(full_html_content, **kwargs):
    # Assume we care about these, set sane defaults.
    # Pop because we don't want to pass in two keyword
    # arguments with the same names.
    exclude_pseudoclasses = kwargs.pop('exclude_pseudoclasses', True)
    keep_style_tags = kwargs.pop('keep_style_tags', True)
    remove_classes = kwargs.pop('remove_classes', False)
    disable_basic_attributes = kwargs.pop('disable_basic_attributes',
                                          ['width', 'height', 'align'])
    strip_important = kwargs.pop('strip_important', False)
    # Remove the html value; it has been passed in separately as
    # full_html_content
    kwargs.pop('html', False)

    # Attempt to inline the CSS
    from premailer import Premailer
    import namedentities
    p = Premailer(html=full_html_content,
                  exclude_pseudoclasses=exclude_pseudoclasses,
                  keep_style_tags=keep_style_tags,
                  remove_classes=remove_classes,
                  disable_basic_attributes=disable_basic_attributes,
                  strip_important=strip_important,
                  **kwargs)
    # ascii output encoding means unicode is escaped
    new_content = p.transform(encoding='ascii')

    # Need to fix the helpful replacements premailer made
    new_content = new_content.replace('%7B', '{')
    new_content = new_content.replace('%7D', '}')

    # Replace unicode or numeric escaped unicode with named entities
    # (where possible)
    new_content = namedentities.named_entities(new_content)
    return new_content
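# Hypothetical usage of the helper above: a tiny document whose <style>
# rule premailer can inline. The sample HTML is invented for illustration.
sample = (u'<html><head><style>p { color: red; }</style></head>'
          u'<body><p>caf\u00e9</p></body></html>')
result = transform_inline_css(sample)
# The <p> should now carry style="color: red", and the accented
# character should come back as the named entity &eacute;.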
def parse(self, authors_str, normalize=False, delimiter=u';',
          default_to_last_name=True, collaborations_params={}):
    """
    Receives an authors string with individual author names separated by
    a delimiter and returns a re-formatted authors string where all
    author names follow the structure: last name, first name

    It also verifies whether an author name string contains a
    collaboration string. The collaboration extraction can be controlled
    by the dictionary 'collaborations_params', which can have the
    following keys:

    - keywords [list of strings]: Keywords that appear in strings that
      should be identified as collaboration strings.
      Default: 'group', 'team', 'collaboration'
    - remove_the [boolean]: Remove the article 'The' from collaboration
      strings (e.g., 'The collaboration'). Default: False.
    - first_author_delimiter [string]: Some collaboration strings include
      the first author separated by a delimiter (e.g., 'The
      collaboration: First author'); the delimiter can be specified in
      this variable. None or False values can be provided to avoid
      trying to extract first authors from collaboration strings.
      Default: ':'
    - fix_arXiv_mixed_collaboration_string [boolean]: Some arXiv entries
      mix the author name with the collaboration string
      (e.g., 'collaboration, Gaia'). Default: False
    """
    default_collaborations_params = self.default_collaborations_params.copy()
    default_collaborations_params.update(collaborations_params)
    collaborations_params = default_collaborations_params

    # Split and convert unicode characters and numerical HTML
    # (e.g., u'both em\u2014and–dashes…' -> 'both em—and–dashes…')
    if sys.version_info > (3,):
        str_type = str
    else:
        str_type = unicode
    authors_list = [str_type(named_entities(n.strip()))
                    for n in authors_str.split(delimiter)]

    corrected_authors_list = []
    for author_str in authors_list:
        author_str = self._clean_author_name(author_str)
        # Check for collaboration strings
        is_collaboration, collaboration_str = self._extract_collaboration(
            author_str, default_to_last_name, delimiter,
            collaborations_params)
        if is_collaboration:
            # Collaboration strings can contain the first author, which
            # we need to split off
            for corrected_author_str in collaboration_str.split(delimiter):
                corrected_authors_list.append(corrected_author_str.strip())
        else:
            corrected_authors_list.append(
                self._reorder_author_name(author_str, default_to_last_name))
    corrected_authors_str = (delimiter + u' ').join(corrected_authors_list)

    # Last minute global corrections due to manually detected problems in
    # our processing
    # corrected_authors_str = corrected_authors_str.replace(' ,', ',').replace('  ', ' ').replace('. -', '.-')
    corrected_authors_str = corrected_authors_str.replace(u', , ', u', ')
    corrected_authors_str = corrected_authors_str.replace(u' -', u'-').replace(u' ~', u'~')

    if normalize:
        return self._normalize(corrected_authors_str, delimiter=delimiter,
                               collaborations_params=collaborations_params)
    else:
        return corrected_authors_str
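# Hypothetical call, assuming AuthorNames is the class that defines
# parse() above. Per the docstring, a collaboration string is detected
# and kept whole rather than reordered into 'last, first' form.
parser = AuthorNames()  # assumed class name
print(parser.parse(u'John Smith; Gaia Collaboration'))
# likely output, per the docstring: u'Smith, John; Gaia Collaboration'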
def convert(self):
    # Normalize unicode and numeric escapes to named entities first
    o = named_entities(self.input_text)
    # Collect the unique entities present, preserving first-seen order
    oents = list(dict.fromkeys(re.findall(re_ents, o)))
    for e in oents:
        try:
            enew = self.ent_dict[e]
        except KeyError:
            # No project-specific replacement for this entity; keep it
            pass
        else:
            o = re.sub(e, enew, o)
    self.output_text = o
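# A runnable sketch of the names convert() relies on, plus a round trip
# through the converter. The real re_ents pattern and ent_dict mapping
# live elsewhere in the source; these values are illustrative assumptions.
import re
from namedentities import named_entities

re_ents = r'&[A-Za-z0-9#]+;'  # assumed: matches one entity token

class EntityConverter(object):
    # assumed: project-specific rewrites applied after named_entities()
    ent_dict = {'&mdash;': '---'}
    convert = convert  # reuse the method defined above

ec = EntityConverter()
ec.input_text = u'both em\u2014dashes'
ec.convert()
print(ec.output_text)  # 'both em---dashes' under the sample ent_dict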
# Python 2:
from namedentities import named_entities
u = u'both em\u2014and–dashes…'
print named_entities(u)
def parse(self, input_data, **kwargs):
    output_metadata = {}
    document = self.resource_dict(input_data, **kwargs)
    r = document.front

    try:
        article_meta = r.find('article-meta')
        journal_meta = r.find('journal-meta')
    except Exception as err:
        return {}

    back_meta = document.back

    base_metadata = {}

    # Title:
    title_xref_list = []
    title_fn_list = []
    titledoi = None
    try:
        title = article_meta.find('title-group').find('article-title')
    except Exception as err:
        pass
    else:
        try:
            for dx in title.find_all('ext-link'):
                titledoi = dx['xlink:href']
        except Exception as err:
            pass
        try:
            for dx in title.find_all('xref'):
                title_xref_list.append(
                    self._detag(dx, JATS_TAGSET['abstract']).strip())
                dx.decompose()
                # title.xref.decompose()
                # title.xref.extract()
        except Exception as err:
            pass
        try:
            for df in title.find_all('fn'):
                title_fn_list.append(
                    self._detag(df, JATS_TAGSET['abstract']).strip())
                df.decompose()
                # title.fn.decompose()
                # title.fn.extract()
        except Exception as err:
            pass
        base_metadata['title'] = (
            self._detag(title, JATS_TAGSET['title']).strip())

    # Abstract:
    try:
        abstract = article_meta.abstract.p
    except Exception as err:
        pass
    else:
        abstract = (self._detag(abstract, JATS_TAGSET['abstract']))
        base_metadata['abstract'] = abstract
        if title_fn_list:
            base_metadata['abstract'] += ' ' + ' '.join(title_fn_list)

    # Authors and their Affiliations:
    try:
        auth_affil = JATSContribs(soup=article_meta)
        auth_affil.parse()
        aa_output = auth_affil.output
    except Exception as err:
        pass
    else:
        base_metadata['authors'] = '; '.join(aa_output['authors'])
        base_metadata['affiliations'] = aa_output['affiliations']

    # Copyright:
    try:
        copyright = article_meta.find('copyright-statement')
    except Exception as err:
        pass
    else:
        base_metadata['copyright'] = self._detag(copyright, [])

    # Keywords:
    isErratum = False
    try:
        keys_uat = []
        keys_misc = []
        keys_aas = []
        uatkeys = []
        keywords = []
        keyword_groups = article_meta.find_all('kwd-group')
        for kg in keyword_groups:
            # Check for UAT first:
            if kg['kwd-group-type'] == 'author':
                keys_uat_test = kg.find_all('compound-kwd-part')
                for kk in keys_uat_test:
                    if kk['content-type'] == 'uat-code':
                        keys_uat.append(
                            self._detag(kk, JATS_TAGSET['keywords']))
                if not keys_uat:
                    keys_misc_test = kg.find_all('kwd')
                    for kk in keys_misc_test:
                        keys_misc.append(
                            self._detag(kk, JATS_TAGSET['keywords']))
            # Then check for AAS:
            elif kg['kwd-group-type'] == 'AAS':
                keys_aas_test = kg.find_all('kwd')
                for kk in keys_aas_test:
                    keys_aas.append(
                        self._detag(kk, JATS_TAGSET['keywords']))
            # If all else fails, just search for 'kwd'
            else:
                keys_misc_test = kg.find_all('kwd')
                for kk in keys_misc_test:
                    keys_misc.append(
                        self._detag(kk, JATS_TAGSET['keywords']))
        if keys_uat:
            uatkeys = keys_uat
        if uatkeys:
            base_metadata['uatkeys'] = ', '.join(uatkeys)
        if keys_aas:
            keywords = keys_aas
        if keys_misc:
            if keywords:
                keywords.extend(keys_misc)
            else:
                keywords = keys_misc
        if keywords:
            base_metadata['keywords'] = ', '.join(keywords)
    except Exception as err:
        pass

    if 'keywords' not in base_metadata:
        try:
            keywords = article_meta.find('article-categories').find_all(
                'subj-group')
        except Exception as err:
            keywords = []
        for c in keywords:
            try:
                if c['subj-group-type'] == 'toc-minor':
                    klist = []
                    for k in c.find_all('subject'):
                        klist.append(
                            self._detag(k, JATS_TAGSET['keywords']))
                    base_metadata['keywords'] = ', '.join(klist)
                else:
                    for k in c.find_all('subject'):
                        if k.string == 'Errata' or k.string == 'Corrigendum':
                            isErratum = True
            except Exception as err:
                pass

    # No longer required:
    # Now convert any UAT keywords to their URI:
    # try:
    #     uat_cnv = UATURIConverter()
    #     base_metadata['uatkeys'] = uat_cnv.convert_to_uri(base_metadata['uatkeys'])
    # except Exception as err:
    #     pass

    # Volume:
    volume = article_meta.volume
    base_metadata['volume'] = self._detag(volume, [])

    # Issue:
    issue = article_meta.issue
    base_metadata['issue'] = self._detag(issue, [])

    # Journal name:
    try:
        journal = journal_meta.find('journal-title-group').find(
            'journal-title')
        base_metadata['publication'] = self._detag(journal, [])
    except Exception as err:
        try:
            journal = journal_meta.find('journal-title')
            base_metadata['publication'] = self._detag(journal, [])
        except Exception as err:
            pass

    try:
        jid = journal_meta.find('journal-id',
                                {'journal-id-type': 'publisher-id'})
        if jid:
            base_metadata['pub-id'] = self._detag(jid, [])
        else:
            try:
                jid = journal_meta.find('journal-id',
                                        {'journal-id-type': 'coden'})
                if jid:
                    base_metadata['coden'] = self._detag(jid, [])
            except Exception as err:
                pass
    except Exception as err:
        pass

    # Related article data, especially corrections and errata
    relateddoi = None
    try:
        related = article_meta.find_all('related-article')
        for r in related:
            if r['related-article-type'] == 'corrected-article':
                isErratum = True
                relateddoi = r['xlink:href']
    except Exception as err:
        pass

    # links: DOI and arxiv preprints

    # DOI
    base_metadata['properties'] = {}
    if isErratum:
        try:
            doiurl_pat = r'(.*?)(doi.org\/)'
            if titledoi:
                base_metadata['properties']['ERRATUM'] = re.sub(
                    doiurl_pat, '', titledoi)
            elif relateddoi:
                base_metadata['properties']['ERRATUM'] = re.sub(
                    doiurl_pat, '', relateddoi)
            else:
                print('warning, no doi for erratum!')
                # pass
        except Exception as err:
            print('warning, problem making erratum: %s' % err)
            # pass

    try:
        ids = article_meta.find_all('article-id')
    except Exception as err:
        ids = []
    for d in ids:
        if d['pub-id-type'] == 'doi':
            base_metadata['properties']['DOI'] = self._detag(d, [])

    # Arxiv Preprint
    try:
        arxiv = article_meta.find_all('custom-meta')
    except Exception as err:
        pass
    else:
        ax_pref = 'https://arxiv.org/abs/'
        for ax in arxiv:
            try:
                x_name = self._detag(ax.find('meta-name'), [])
                x_value = self._detag(ax.find('meta-value'), [])
                if x_name == 'arxivppt':
                    base_metadata['properties']['HTML'] = ax_pref + x_value
            except Exception as err:
                pass

    # Pubdate:
    try:
        pub_dates = article_meta.find_all('pub-date')
    except Exception as err:
        pub_dates = []
    for d in pub_dates:
        try:
            a = d['publication-format']
        except KeyError:
            a = ''
        try:
            b = d['pub-type']
        except KeyError:
            b = ''
        try:
            pubdate = "/" + self._detag(d.year, [])
            try:
                d.month
            except Exception as err:
                pubdate = "00" + pubdate
            else:
                try:
                    int(self._detag(d.month, []))
                except Exception as errrr:
                    month_name = self._detag(d.month, [])[0:3].lower()
                    month = MONTH_TO_NUMBER[month_name]
                else:
                    month = self._detag(d.month, [])
                if int(month) < 10:
                    month = "0" + str(int(month))
                else:
                    month = str(month)
                pubdate = month + pubdate
        except Exception as errrr:
            pass
        else:
            if a == 'print' or b == 'ppub' or b == 'cover':
                base_metadata['pubdate'] = pubdate
            elif a == 'electronic' or b == 'epub':
                try:
                    base_metadata['pubdate']
                except Exception as err:
                    base_metadata['pubdate'] = pubdate
        try:
            if b == 'open-access':
                base_metadata.setdefault('properties', {}).setdefault(
                    'OPEN', 1)
        except Exception as err:
            pass

    # Check for open-access / "Permissions" field
    try:
        permissions = article_meta.find('permissions').find_all('license')
    except Exception as err:
        pass
    else:
        for p in permissions:
            try:
                if p['license-type'] == 'open':
                    base_metadata.setdefault('properties', {}).setdefault(
                        'OPEN', 1)
            except Exception as err:
                pass

    # Pages:
    fpage = article_meta.fpage
    if fpage is None:
        fpage = article_meta.find('elocation-id')
        if fpage is None:
            fpage = article_meta.pageStart
            if fpage is None:
                del fpage
    try:
        fpage
    except NameError:
        pass
    else:
        lpage = article_meta.lpage
        if lpage is None:
            lpage = article_meta.pageEnd
            if lpage is None:
                del lpage
        else:
            if lpage == fpage:
                del lpage
        try:
            lpage
        except NameError:
            base_metadata['page'] = self._detag(fpage, [])
        else:
            base_metadata['page'] = self._detag(fpage, []) + "-" + (
                self._detag(lpage, []))

    # Number of Pages:
    try:
        counts = article_meta.counts
        pagecount = counts.find('page-count')
        base_metadata['numpages'] = (
            '<NUMPAGES>' + pagecount['count'] + '</NUMPAGES>')
    except Exception as err:
        pass

    # References (now using back_meta):
    if back_meta is not None:
        ref_list_text = []
        try:
            ref_results = back_meta.find('ref-list').find_all('ref')
            for r in ref_results:
                # str_type is assumed to be defined at module level
                # (str on Python 3, unicode on Python 2)
                s = str_type(r.extract()).replace('\n', '')
                s = re.sub(r'\s+', r' ', s)
                s = namedentities.named_entities(s)
                ref_list_text.append(s)
        except Exception as err:
            pass
        else:
            base_metadata['refhandler_list'] = ref_list_text

    # Entity Conversions:
    econv = EntityConverter()
    for k, v in base_metadata.items():
        if isinstance(v, str_type):
            econv.input_text = v
            econv.convert()
            v = econv.output_text
        elif isinstance(v, list):
            newv = []
            for l in v:
                if isinstance(l, str_type):
                    econv.input_text = l
                    econv.convert()
                    l = econv.output_text
                newv.append(l)
            v = newv
        else:
            pass
        output_metadata[k] = v

    return output_metadata
def run(self, text):
    return named_entities(smartyPants(text))
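# Hypothetical usage: smartyPants() educates straight quotes and '--'
# into typographic entities, and named_entities() rewrites the numeric
# escapes it emits as named ones. The import path assumes the older
# smartypants 1.x API that exposes a smartyPants() function.
from smartypants import smartyPants
from namedentities import named_entities

print(named_entities(smartyPants('"hello" -- world')))
# expected: &ldquo;hello&rdquo; &mdash; world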
def clean(self, value):
    if isinstance(value, str):
        value = named_entities(value)
    return super(HTMLField, self).clean(value)
def htmlentities(s):
    replaced_entities = named_entities(
        escape(s).encode("ascii", "xmlcharrefreplace").decode("utf8"))
    return mark_safe(replaced_entities)
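# Hypothetical usage, with the Django helpers the function relies on
# imported explicitly. Markup is escaped first, then any non-ASCII
# characters come back as named entities.
from django.utils.html import escape
from django.utils.safestring import mark_safe
from namedentities import named_entities

print(htmlentities(u'<b>caf\u00e9</b>'))
# expected: &lt;b&gt;caf&eacute;&lt;/b&gt;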
from namedentities import named_entities
u = 'both em\u2014and–dashes…'
print(named_entities(u))
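# expected: both em&mdash;and&ndash;dashes&hellip;
# The package also ships numeric and hex flavors; a sketch assuming the
# same namedentities API:
from namedentities import numeric_entities
print(numeric_entities('both em\u2014and–dashes…'))
# expected: both em&#8212;and&#8211;dashes&#8230;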
def parse(self, fp, **kwargs):
    output_metadata = {}
    document = self.resource_dict(fp, **kwargs)
    r = document.front

    try:
        article_meta = r.find('article-meta')
        journal_meta = r.find('journal-meta')
    except Exception as err:
        return {}

    back_meta = document.back

    base_metadata = {}

    # Title:
    title_xref_list = []
    title_fn_list = []
    try:
        title = article_meta.find('title-group').find('article-title')
    except Exception as err:
        pass
    else:
        try:
            for dx in title.find_all('xref'):
                title_xref_list.append(
                    self._detag(dx, JATS_TAGSET['abstract']).strip())
                dx.decompose()
                # title.xref.decompose()
                # title.xref.extract()
        except Exception as err:
            pass
        try:
            for df in title.find_all('fn'):
                title_fn_list.append(
                    self._detag(df, JATS_TAGSET['abstract']).strip())
                df.decompose()
                # title.fn.decompose()
                # title.fn.extract()
        except Exception as err:
            pass
        base_metadata['title'] = (self._detag(
            title, JATS_TAGSET['title']).strip())

    # Abstract:
    try:
        abstract = article_meta.abstract.p
    except Exception as err:
        pass
    else:
        try:
            # CData is assumed to be imported from bs4 at module level
            for element in abstract(
                    text=lambda text: isinstance(text, CData)):
                element.extract()
        except Exception as err:
            pass
        else:
            abstract = (self._detag(abstract, JATS_TAGSET['abstract']))
            base_metadata['abstract'] = abstract
            if title_fn_list:
                base_metadata['abstract'] += ' ' + ' '.join(title_fn_list)

    # Authors and Affiliations:
    # Set up affils storage (OrderedDict assumed from collections)
    affils = OrderedDict()

    # Author notes/note ids
    try:
        notes = article_meta.find('author-notes').find_all('fn')
    except Exception as err:
        pass
    else:
        for n in notes:
            try:
                n.label.decompose()
            except Exception as err:
                pass
            else:
                try:
                    n['id']
                except Exception as err:
                    pass
                    # print "I'm failing on author notes!",err
                else:
                    key = n['id']
                    note_text = self._detag(n, JATS_TAGSET['affiliations'])
                    affils[key] = note_text.strip()

    # Affils/affil ids
    l_need_affils = False
    try:
        affil = article_meta.find('contrib-group').find_all('aff')
        if len(affil) == 0:
            try:
                affil = article_meta.find_all('aff')
            except Exception as err:
                pass
    except Exception as err:
        pass
    else:
        for a in affil:
            try:
                a.label.decompose()
            except Exception as err:
                pass
            try:
                a['id']
            except Exception as err:
                # print "I'm failing in the affils loop!",err
                l_need_affils = True
            else:
                key = a['id']
                ekey = ''
                try:
                    email_array = []
                    email_a = a.find_all('ext-link')
                    for em in email_a:
                        if em['ext-link-type'] == 'email':
                            address = self._detag(
                                em, (JATS_TAGSET['affiliations']))
                            address_new = "<EMAIL>" + address + "</EMAIL>"
                            ekey = em['id']
                            if ekey != '':
                                affils[ekey] = address_new
                    while a.find('ext-link') is not None:
                        a.find('ext-link').extract()
                except Exception as err:
                    pass
                aff_text = self._detag(a, JATS_TAGSET['affiliations'])
                affils[key] = aff_text.strip()
                # if ekey != '':
                #     affils[ekey] = address_new

    # Author name and affil/note lists:
    try:
        authors = article_meta.find('contrib-group').find_all('contrib')
    except Exception as err:
        pass
    else:
        base_metadata['authors'] = []
        base_metadata['affiliations'] = []
        for a in authors:
            # ORCIDs
            orcid_out = None
            try:
                # orcids = a.find_all('ext-link')
                orcids = a.find('ext-link')
                try:
                    if orcids['ext-link-type'] == 'orcid':
                        o = self._detag(orcids, [])
                        orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
                except Exception as err:
                    pass
            except Exception as err:
                pass

            if orcid_out is None:
                try:
                    if a.find('contrib-id') is not None:
                        auth_id = a.find('contrib-id')
                        if auth_id['contrib-id-type'] == 'orcid':
                            o = self._detag(auth_id, [])
                            o = o.split('/')[-1]
                            orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
                except Exception as err:
                    pass

            # If you didn't get affiliations above, l_need_affils == True,
            # so do this...
            if l_need_affils:
                try:
                    # MT, 2021-Jan-19 MNRAS fix:
                    # note that a may have multiple aff tags, so use
                    # find_all here
                    # if a.find('aff') is not None:
                    #     aff_id = a.find('aff')
                    #     aff_text = self._detag(aff_id, JATS_TAGSET['affiliations'])
                    if a.find_all('aff') is not None:
                        aff_text_arr = list()
                        for ax in a.find_all('aff'):
                            aff_text_arr.append(self._detag(
                                ax, JATS_TAGSET['affiliations']).strip())
                        aff_text = "; ".join(aff_text_arr)
                except Exception as err:
                    pass

            # Author names
            if a.find('collab') is not None:
                base_metadata['authors'].append(self._detag(a.collab, []))
            else:
                if a.find('surname') is not None:
                    surname = self._detag(a.surname, [])
                else:
                    surname = ''
                if a.find('prefix') is not None:
                    prefix = self._detag(a.prefix, []) + ' '
                else:
                    prefix = ''
                if a.find('suffix') is not None:
                    suffix = ' ' + self._detag(a.suffix, [])
                else:
                    suffix = ''
                if a.find('given-names') is not None:
                    given = self._detag(a.find('given-names'), [])
                else:
                    given = ''
                forename = prefix + given + suffix
                if forename == '':
                    if surname != '':
                        base_metadata['authors'].append(surname)
                    # else:
                    #     base_metadata['authors'].append('ANONYMOUS')
                    # check instead whether author array is empty, and
                    # pass an empty array to serializer
                else:
                    if surname != '':
                        base_metadata['authors'].append(
                            surname + ', ' + forename)
                    else:
                        base_metadata['authors'].append(forename)

            # EMAIL in contrib-group (e.g. OUP)
            email = None
            if a.find('email') is not None:
                email = self._detag(a.email, [])
                email = '<EMAIL>' + email + '</EMAIL>'

            # Author affil/note ids
            try:
                aid = a.find_all('xref')
            except Exception as err:
                pass
            else:
                aid_arr = []
                if len(aid) > 0:
                    try:
                        aid_str = ' '.join([x['rid'] for x in aid])
                    except Exception as err:
                        print("jats.py: Failure in affil parsing: %s" % err)
                    else:
                        aid_arr = aid_str.split()
                try:
                    new_aid_arr = []
                    for af in affils.keys():
                        if af in aid_arr:
                            new_aid_arr.append(af)
                    aid_arr = new_aid_arr
                    # check whether or not you got affil data in one way
                    # or the other...
                    if not l_need_affils:
                        aff_text = '; '.join(affils[x] for x in aid_arr)
                        aff_text = aff_text.replace(';;', ';').rstrip(';')
                        aff_text = aff_text.replace('; ,', '').rstrip()
                    # Got ORCID?
                    if orcid_out is not None:
                        aff_text = aff_text + '; ' + orcid_out
                    if email is not None:
                        aff_text = aff_text + ' ' + email
                    base_metadata['affiliations'].append(aff_text)
                except Exception as errrror:
                    if orcid_out is not None:
                        base_metadata['affiliations'].append(orcid_out)
                    else:
                        base_metadata['affiliations'].append('')

        affnew = []
        for affx in base_metadata['affiliations']:
            affnew.append(AffiliationParser(affx).parse())
        base_metadata['affiliations'] = affnew

        if len(base_metadata['authors']) > 0:
            base_metadata['authors'] = "; ".join(base_metadata['authors'])
        else:
            del base_metadata['authors']

    # Copyright:
    try:
        copyright = article_meta.find('copyright-statement')
    except Exception as err:
        pass
    else:
        base_metadata['copyright'] = self._detag(copyright, [])

    # Keywords:
    try:
        keys_uat = []
        keys_misc = []
        keys_aas = []
        uatkeys = []
        keywords = []
        keyword_groups = article_meta.find_all('kwd-group')
        for kg in keyword_groups:
            # Check for UAT first:
            if kg['kwd-group-type'] == 'author':
                keys_uat_test = kg.find_all('compound-kwd-part')
                for kk in keys_uat_test:
                    if kk['content-type'] == 'uat-code':
                        keys_uat.append(
                            self._detag(kk, JATS_TAGSET['keywords']))
                if not keys_uat:
                    keys_misc_test = kg.find_all('kwd')
                    for kk in keys_misc_test:
                        keys_misc.append(
                            self._detag(kk, JATS_TAGSET['keywords']))
            # Then check for AAS:
            elif kg['kwd-group-type'] == 'AAS':
                keys_aas_test = kg.find_all('kwd')
                for kk in keys_aas_test:
                    keys_aas.append(
                        self._detag(kk, JATS_TAGSET['keywords']))
            # If all else fails, just search for 'kwd'
            else:
                keys_misc_test = kg.find_all('kwd')
                for kk in keys_misc_test:
                    keys_misc.append(
                        self._detag(kk, JATS_TAGSET['keywords']))
        if keys_uat:
            uatkeys = keys_uat
        if uatkeys:
            base_metadata['uatkeys'] = ', '.join(uatkeys)
        if keys_aas:
            keywords = keys_aas
        if keys_misc:
            if keywords:
                keywords.extend(keys_misc)
            else:
                keywords = keys_misc
        if keywords:
            base_metadata['keywords'] = ', '.join(keywords)
    except Exception as err:
        pass

    if 'keywords' not in base_metadata:
        try:
            keywords = article_meta.find('article-categories').find_all(
                'subj-group')
        except Exception as err:
            keywords = []
        for c in keywords:
            try:
                if c['subj-group-type'] == 'toc-minor':
                    klist = []
                    for k in c.find_all('subject'):
                        klist.append(
                            self._detag(k, JATS_TAGSET['keywords']))
                    base_metadata['keywords'] = ', '.join(klist)
            except Exception as err:
                pass

    # No longer required:
    # Now convert any UAT keywords to their URI:
    # try:
    #     uat_cnv = UATURIConverter()
    #     base_metadata['uatkeys'] = uat_cnv.convert_to_uri(base_metadata['uatkeys'])
    # except Exception as err:
    #     pass

    # Volume:
    volume = article_meta.volume
    base_metadata['volume'] = self._detag(volume, [])

    # Issue:
    issue = article_meta.issue
    base_metadata['issue'] = self._detag(issue, [])

    # Journal name:
    try:
        journal = journal_meta.find('journal-title-group').find(
            'journal-title')
        base_metadata['publication'] = self._detag(journal, [])
    except Exception as err:
        try:
            journal = journal_meta.find('journal-title')
            base_metadata['publication'] = self._detag(journal, [])
        except Exception as err:
            pass

    try:
        jid = journal_meta.find('journal-id',
                                {'journal-id-type': 'publisher-id'})
        if jid:
            base_metadata['pub-id'] = self._detag(jid, [])
        else:
            try:
                jid = journal_meta.find('journal-id',
                                        {'journal-id-type': 'coden'})
                if jid:
                    base_metadata['coden'] = self._detag(jid, [])
            except Exception as err:
                pass
    except Exception as err:
        pass

    # links: DOI and arxiv preprints

    # DOI
    base_metadata['properties'] = {}
    try:
        ids = article_meta.find_all('article-id')
    except Exception as err:
        ids = []
    for d in ids:
        if d['pub-id-type'] == 'doi':
            base_metadata['properties']['DOI'] = self._detag(d, [])

    # Arxiv Preprint
    try:
        arxiv = article_meta.find_all('custom-meta')
    except Exception as err:
        pass
    else:
        ax_pref = 'https://arxiv.org/abs/'
        for ax in arxiv:
            try:
                x_name = self._detag(ax.find('meta-name'), [])
                x_value = self._detag(ax.find('meta-value'), [])
                if x_name == 'arxivppt':
                    base_metadata['properties']['HTML'] = ax_pref + x_value
            except Exception as err:
                pass

    # Pubdate:
    try:
        pub_dates = article_meta.find_all('pub-date')
    except Exception as err:
        pub_dates = []
    for d in pub_dates:
        try:
            a = d['publication-format']
        except KeyError:
            a = ''
        try:
            b = d['pub-type']
        except KeyError:
            b = ''
        try:
            pubdate = "/" + self._detag(d.year, [])
            try:
                d.month
            except Exception as err:
                pubdate = "00" + pubdate
            else:
                try:
                    int(self._detag(d.month, []))
                except Exception as errrr:
                    month_name = self._detag(d.month, [])[0:3].lower()
                    month = MONTH_TO_NUMBER[month_name]
                else:
                    month = self._detag(d.month, [])
                if int(month) < 10:
                    month = "0" + str(int(month))
                else:
                    month = str(month)
                pubdate = month + pubdate
        except Exception as errrr:
            pass
        else:
            if a == 'print' or b == 'ppub' or b == 'cover':
                base_metadata['pubdate'] = pubdate
            elif a == 'electronic' or b == 'epub':
                try:
                    base_metadata['pubdate']
                except Exception as err:
                    base_metadata['pubdate'] = pubdate
        try:
            if b == 'open-access':
                base_metadata.setdefault('properties', {}).setdefault(
                    'OPEN', 1)
        except Exception as err:
            pass

    # Pages:
    fpage = article_meta.fpage
    if fpage is None:
        fpage = article_meta.find('elocation-id')
        if fpage is None:
            fpage = article_meta.pageStart
            if fpage is None:
                del fpage
    try:
        fpage
    except NameError:
        pass
    else:
        lpage = article_meta.lpage
        if lpage is None:
            lpage = article_meta.pageEnd
            if lpage is None:
                del lpage
        else:
            if lpage == fpage:
                del lpage
        try:
            lpage
        except NameError:
            base_metadata['page'] = self._detag(fpage, [])
        else:
            base_metadata['page'] = self._detag(
                fpage, []) + "-" + (self._detag(lpage, []))

    # Number of Pages:
    try:
        counts = article_meta.counts
        pagecount = counts.find('page-count')
        base_metadata['numpages'] = (
            '<NUMPAGES>' + pagecount['count'] + '</NUMPAGES>')
    except Exception as err:
        pass

    # References (now using back_meta):
    if back_meta is not None:
        ref_list_text = []
        try:
            ref_results = back_meta.find('ref-list').find_all('ref')
            for r in ref_results:
                # str_type is assumed to be defined at module level
                # (str on Python 3, unicode on Python 2)
                s = str_type(r.extract()).replace('\n', '')
                s = re.sub(r'\s+', r' ', s)
                s = namedentities.named_entities(s)
                ref_list_text.append(s)
        except Exception as err:
            pass
        else:
            base_metadata['refhandler_list'] = ref_list_text

    output_metadata = base_metadata

    # Last step: entity conversion
    ec_fields = ['authors', 'abstract', 'title']
    econv = EntityConverter()
    for ecf in ec_fields:
        try:
            econv.input_text = output_metadata[ecf]
            econv.convert()
            output_metadata[ecf] = econv.output_text
        except Exception as err:
            pass

    return output_metadata