def write(cls, record, fp=sys.stdout):
    """Serialize ``record`` to ``fp`` as tagged lines followed by a blank line.

    For every field in ``cls.fieldDict`` (in the mapping's iteration order)
    whose value in ``record`` is truthy, one line of the form
    ``%<tag> <content>`` is written.  The per-field descriptor stored in
    ``cls.fieldDict`` may provide:

    - ``fmt``: a callable applied to the content before anything else,
    - ``join``: the separator used to flatten list or dict content
      (defaults to the empty string),
    - ``tag``: the tag emitted after the ``%`` sign.

    Dict content is flattened to ``"key: value"`` items.  The final content
    is passed through ``named_entities`` before being written.

    :param record: mapping of field name to content (read with ``.get``).
    :param fp: writable text stream; defaults to ``sys.stdout``.
    :raises Exception: any error raised while writing a field is logged
        and then re-raised unchanged.
    """
    for field, d in cls.fieldDict.items():
        content = record.get(field)
        # Missing or empty values (None, '', [], {}) never produce a line.
        # This also covers the former "bibcode is None" special case, which
        # compared the field name by identity against a string literal.
        if not content:
            continue
        fmt = d.get('fmt')
        if fmt:
            content = fmt(content)
        jc = d.get('join', '')
        # isinstance (rather than an exact type match) also accepts
        # subclasses such as OrderedDict.
        if isinstance(content, list):
            content = jc.join(content)
        elif isinstance(content, dict):
            content = jc.join(
                [u"{0}: {1}".format(k, v) for k, v in content.items()])
        try:
            fp.write('%{0} {1}\n'.format(d.get('tag'),
                                         named_entities(content)))
        except Exception:
            # Log which tag failed, then let the caller see the original
            # exception.
            logging.error(
                "error writing content for tag {0}: {1}\n".format(
                    d.get('tag'), named_entities(content)))
            raise
    # A blank line terminates the record.
    fp.write('\n')
Exemple #2
0
def transform_inline_css(full_html_content, **kwargs):
    """Inline the CSS of ``full_html_content`` and return the resulting HTML.

    The document is run through ``premailer.Premailer`` and the output is
    then passed through ``namedentities.named_entities``.

    Keyword arguments are forwarded to ``Premailer``.  The following options
    get a default when the caller does not supply them:

    - ``exclude_pseudoclasses``: True
    - ``keep_style_tags``: True
    - ``remove_classes``: False
    - ``disable_basic_attributes``: ['width', 'height', 'align']
    - ``strip_important``: False

    A caller-supplied ``html`` keyword is ignored; the document is always
    taken from ``full_html_content``.

    :returns: the transformed HTML produced by the steps above.
    """
    # Imported lazily so the dependencies are only needed when this
    # function is actually called.
    from premailer import Premailer
    import namedentities

    # Apply our defaults without overriding anything the caller passed in.
    options = dict(kwargs)
    options.setdefault('exclude_pseudoclasses', True)
    options.setdefault('keep_style_tags', True)
    options.setdefault('remove_classes', False)
    options.setdefault('disable_basic_attributes', ['width', 'height', 'align'])
    options.setdefault('strip_important', False)
    # The html value is passed in separately as full_html_content, so any
    # 'html' keyword is replaced rather than passed twice.
    options['html'] = full_html_content

    # Attempt to inline the CSS.
    p = Premailer(**options)

    # ASCII output encoding means non-ASCII characters come back escaped.
    new_content = p.transform(encoding='ascii')
    # Undo the percent-encoding of curly braces that appears in the output.
    new_content = new_content.replace('%7B', '{')
    new_content = new_content.replace('%7D', '}')
    # Replace Unicode or numeric escapes with named entities (where possible).
    new_content = namedentities.named_entities(new_content)

    return new_content
    def parse(self, authors_str, normalize=False, delimiter=u';', default_to_last_name=True, collaborations_params=None):
        """
        Receives an authors string with individual author names separated by a
        delimiter and returns a re-formatted authors string where all author
        names follow the structure: last name, first name

        It also verifies if an author name string contains a collaboration
        string.  The collaboration extraction can be controlled by the
        dictionary 'collaborations_params' which can have the following keys:

        - keywords [list of strings]: Keywords that appear in strings that
          should be identified as collaboration strings. Default: 'group',
          'team', 'collaboration'
        - remove_the [boolean]: Remove the article 'The' from collaboration
          strings (e.g., 'The collaboration'). Default: False.
        - first_author_delimiter [string]: Some collaboration strings include
          the first author separated by a delimiter (e.g., The collaboration:
          First author), the delimiter can be specified in this variable,
          otherwise None or False values can be provided to avoid trying to
          extract first authors from collaboration strings. Default: ':'
        - fix_arXiv_mixed_collaboration_string [boolean]: Some arXiv entries
          present the collaboration string in a mixed-up form
          (e.g. 'collaboration, Gaia'). Default: False

        Keys that are not supplied fall back to the values in
        ``self.default_collaborations_params``.  Passing None (the default)
        or an empty dict uses those defaults unchanged.

        If 'normalize' is true, the re-formatted string is additionally
        passed through ``self._normalize`` before being returned.
        """
        # Merge the caller's overrides on top of a copy of the defaults so
        # neither the instance defaults nor the caller's dict is mutated.
        merged_params = self.default_collaborations_params.copy()
        if collaborations_params:
            merged_params.update(collaborations_params)
        collaborations_params = merged_params

        # Split on the delimiter and run each name through named_entities(),
        # coercing the result to the text type of the running interpreter.
        if sys.version_info > (3,):
            str_type = str
        else:
            str_type = unicode
        authors_list = [str_type(named_entities(n.strip())) for n in authors_str.split(delimiter)]

        corrected_authors_list = []
        for author_str in authors_list:
            author_str = self._clean_author_name(author_str)
            # Check for collaboration strings
            is_collaboration, collaboration_str = self._extract_collaboration(author_str, default_to_last_name, delimiter, collaborations_params)
            if is_collaboration:
                # Collaboration strings can contain the first author, which we need to split
                for corrected_author_str in collaboration_str.split(delimiter):
                    corrected_authors_list.append(corrected_author_str.strip())
            else:
                corrected_authors_list.append(self._reorder_author_name(author_str, default_to_last_name))
        corrected_authors_str = (delimiter + u' ').join(corrected_authors_list)

        # Last minute global corrections due to manually detected problems in
        # our processing.  A broader variant was tried previously:
        # corrected_authors_str.replace(' ,', ',').replace('  ', ' ').
        # replace('. -', '.-')
        corrected_authors_str = corrected_authors_str.replace(u', , ', u', ')
        corrected_authors_str = corrected_authors_str.replace(u' -', u'-').replace(u' ~', u'~')
        if normalize:
            return self._normalize(corrected_authors_str, delimiter=delimiter, collaborations_params=collaborations_params)
        else:
            return corrected_authors_str
    def convert(self):
        """
        Convert ``self.input_text`` and store the result in
        ``self.output_text``.

        The input is first passed through ``named_entities``.  Every distinct
        match of ``re_ents`` in that text which has an entry in
        ``self.ent_dict`` is then replaced by the mapped value.  Matches
        without an entry are left untouched.
        """
        o = named_entities(self.input_text)
        # dict.fromkeys de-duplicates the matches while keeping first-seen order.
        for entity in dict.fromkeys(re.findall(re_ents, o)):
            try:
                replacement = self.ent_dict[entity]
            except KeyError:
                # No mapping for this entity: keep it as it is.
                continue
            # Literal replacement: the matched entity must not be interpreted
            # as a regex pattern, nor the mapped value as a regex template.
            o = o.replace(entity, replacement)
        self.output_text = o
Exemple #5
0
 def write(cls, record, fp=sys.stdout):
     """Serialize ``record`` to ``fp`` as tagged lines followed by a blank line.

     For every field in ``cls.fieldDict`` (in the mapping's iteration order)
     whose value in ``record`` is truthy, one line of the form
     ``%<tag> <content>`` is written.  The per-field descriptor stored in
     ``cls.fieldDict`` may provide:

     - ``fmt``: a callable applied to the content before anything else,
     - ``join``: the separator used to flatten list or dict content
       (defaults to the empty string),
     - ``tag``: the tag emitted after the ``%`` sign.

     Dict content is flattened to ``"key: value"`` items.  The final content
     is passed through ``named_entities`` before being written.

     :param record: mapping of field name to content (read with ``.get``).
     :param fp: writable text stream; defaults to ``sys.stdout``.
     :raises Exception: any error raised while writing a field is logged
         and then re-raised unchanged.
     """
     for field, d in cls.fieldDict.items():
         content = record.get(field)
         # Missing or empty values (None, '', [], {}) never produce a line.
         # This also covers the former "bibcode is None" special case, which
         # compared the field name by identity against a string literal.
         if not content:
             continue
         fmt = d.get('fmt')
         if fmt:
             content = fmt(content)
         jc = d.get('join', '')
         # isinstance (rather than an exact type match) also accepts
         # subclasses such as OrderedDict.
         if isinstance(content, list):
             content = jc.join(content)
         elif isinstance(content, dict):
             content = jc.join([u"{0}: {1}".format(k, v) for k, v in content.items()])
         try:
             fp.write('%{0} {1}\n'.format(d.get('tag'), named_entities(content)))
         except Exception:
             # Log which tag failed, then let the caller see the original
             # exception.
             logging.error("error writing content for tag {0}: {1}\n".format(d.get('tag'), named_entities(content)))
             raise
     # A blank line terminates the record.
     fp.write('\n')
Exemple #6
0
from namedentities import named_entities
 
# Sample text: a \u-escaped em dash plus a literal en dash and ellipsis.
u = u'both em\u2014and–dashes…'
# print is called as a function so the script parses on Python 3 as well as
# on Python 2 (where a single parenthesized argument prints the same value).
print(named_entities(u))
    def parse(self, input_data, **kwargs):
        """
        Extract bibliographic metadata from a JATS-style document.

        ``input_data`` and ``kwargs`` are handed to ``self.resource_dict``;
        the ``front`` and ``back`` attributes of the object it returns are
        then searched for the individual metadata elements (presumably a
        BeautifulSoup tree -- TODO confirm against ``resource_dict``).

        Returns a dict.  'volume', 'issue' and 'properties' are always set;
        the following keys are set only when the corresponding data is found:
        'title', 'abstract', 'authors', 'affiliations', 'copyright',
        'uatkeys', 'keywords', 'publication', 'pub-id' or 'coden', 'pubdate',
        'page', 'numpages' and 'refhandler_list'.  'properties' is itself a
        dict that may hold 'ERRATUM', 'DOI', 'HTML' and 'OPEN'.

        An empty dict is returned when looking up 'article-meta' /
        'journal-meta' on ``document.front`` raises.

        Most sections are best-effort: exceptions raised while extracting a
        given field are swallowed and that field is simply left out.
        """

        output_metadata = {}

        document = self.resource_dict(input_data, **kwargs)

        r = document.front

        try:
            article_meta = r.find('article-meta')
            journal_meta = r.find('journal-meta')
        except Exception as err:
            return {}

        back_meta = document.back

        base_metadata = {}

        # Title:
        title_xref_list = []
        title_fn_list = []
        titledoi = None
        try:
            title = article_meta.find('title-group').find('article-title')
        except Exception as err:
            pass
        else:
            # If the title holds several ext-link elements, the last one wins.
            try:
                for dx in title.find_all('ext-link'):
                    titledoi = dx['xlink:href']
            except Exception as err:
                pass
            # xref and fn elements are recorded and then removed from the
            # title so their text does not end up in the title string.
            try:
                for dx in title.find_all('xref'):
                    title_xref_list.append(self._detag(dx, JATS_TAGSET['abstract']).strip())
                    dx.decompose()
                # title.xref.decompose()
                # title.xref.extract()
            except Exception as err:
                pass
            try:
                for df in title.find_all('fn'):
                    title_fn_list.append(self._detag(df, JATS_TAGSET['abstract']).strip())
                    df.decompose()
                # title.fn.decompose()
                # title.fn.extract()
            except Exception as err:
                pass
            base_metadata['title'] = (
                self._detag(title, JATS_TAGSET['title']).strip())

        # Abstract:
        try:
            abstract = article_meta.abstract.p
        except Exception as err:
            pass
        else:
            abstract = (self._detag(abstract, JATS_TAGSET['abstract']))
            base_metadata['abstract'] = abstract
            # Footnotes removed from the title are appended to the abstract.
            if title_fn_list:
                base_metadata['abstract'] += '  ' + ' '.join(title_fn_list)

        # Authors and their Affiliations:
        try:
            auth_affil = JATSContribs(soup=article_meta)
            auth_affil.parse()
            aa_output = auth_affil.output
        except Exception as err:
            pass
        else:
            base_metadata['authors'] = '; '.join(aa_output['authors'])
            base_metadata['affiliations'] = aa_output['affiliations']

        # Copyright:
        # NOTE: the local name 'copyright' shadows the builtin of the same
        # name for the remainder of this method.
        try:
            copyright = article_meta.find('copyright-statement')
        except Exception as err:
            pass
        else:
            base_metadata['copyright'] = self._detag(copyright, [])

        # Keywords:
        isErratum = False
        try:
            keys_uat = []
            keys_misc = []
            keys_aas = []
            uatkeys = []
            keywords = []
            keyword_groups = article_meta.find_all('kwd-group')
            for kg in keyword_groups:
                # Check for UAT first:
                if kg['kwd-group-type'] == 'author':
                    keys_uat_test = kg.find_all('compound-kwd-part')
                    for kk in keys_uat_test:
                        if kk['content-type'] == 'uat-code':
                            keys_uat.append(self._detag(kk, JATS_TAGSET['keywords']))
                    if not keys_uat:
                        keys_misc_test = kg.find_all('kwd')
                        for kk in keys_misc_test:
                            keys_misc.append(self._detag(kk, JATS_TAGSET['keywords']))
                # Then check for AAS:
                elif kg['kwd-group-type'] == 'AAS':
                    keys_aas_test = kg.find_all('kwd')
                    for kk in keys_aas_test:
                        keys_aas.append(self._detag(kk, JATS_TAGSET['keywords']))
                # If all else fails, just search for 'kwd'
                else:
                    keys_misc_test = kg.find_all('kwd')
                    for kk in keys_misc_test:
                        keys_misc.append(self._detag(kk, JATS_TAGSET['keywords']))

            if keys_uat:
                uatkeys = keys_uat
            if uatkeys:
                base_metadata['uatkeys'] = ', '.join(uatkeys)

            # AAS keywords come first, followed by the miscellaneous ones.
            if keys_aas:
                keywords = keys_aas
            if keys_misc:
                if keywords:
                    keywords.extend(keys_misc)
                else:
                    keywords = keys_misc
            if keywords:
                base_metadata['keywords'] = ', '.join(keywords)
        except Exception as err:
            pass

        # Fallback when no keywords were found above: use the subjects of
        # 'toc-minor' subject groups.  Other subject groups are only
        # inspected for an 'Errata' / 'Corrigendum' subject.
        if 'keywords' not in base_metadata:
            try:
                keywords = article_meta.find('article-categories').find_all('subj-group')
            except Exception as err:
                keywords = []
            for c in keywords:
                try:
                    if c['subj-group-type'] == 'toc-minor':
                        klist = []
                        for k in c.find_all('subject'):
                            klist.append(self._detag(k, JATS_TAGSET['keywords']))
                        base_metadata['keywords'] = ', '.join(klist)
                    else:
                        for k in c.find_all('subject'):
                            if k.string == 'Errata' or k.string == 'Corrigendum':
                                isErratum = True
                except Exception as err:
                    pass

        # No longer required:
        # Now convert any UAT keywords to their URI:
        # try:
            # uat_cnv = UATURIConverter()
            # base_metadata['uatkeys'] = uat_cnv.convert_to_uri(base_metadata['uatkeys'])
        # except Exception as err:
            # pass


        # Volume:
        # NOTE(review): unlike the sections above, the volume and issue
        # lookups are not wrapped in try/except.
        volume = article_meta.volume
        base_metadata['volume'] = self._detag(volume, [])

        # Issue:
        issue = article_meta.issue
        base_metadata['issue'] = self._detag(issue, [])

        # Journal name:
        # Try journal-title-group/journal-title first, then a bare
        # journal-title directly under journal-meta.
        try:
            journal = journal_meta.find('journal-title-group').find('journal-title')
            base_metadata['publication'] = self._detag(journal, [])
        except Exception as err:
            try:
                journal = journal_meta.find('journal-title')
                base_metadata['publication'] = self._detag(journal, [])
            except Exception as err:
                pass

        # Journal id: the publisher-id is preferred; the coden is only
        # looked up when no publisher-id is present.
        try:
            jid = journal_meta.find('journal-id', {'journal-id-type': 'publisher-id'})
            if jid:
                base_metadata['pub-id'] = self._detag(jid, [])
            else:
                try:
                    jid = journal_meta.find('journal-id', {'journal-id-type': 'coden'})
                    if jid:
                        base_metadata['coden'] = self._detag(jid, [])
                except Exception as err:
                    pass
        except Exception as err:
            pass

        # Related article data, especially corrections and errata
        relateddoi = None
        try:
            related = article_meta.find_all('related-article')
            # NOTE: 'r' is rebound here and no longer refers to document.front.
            for r in related:
                if r['related-article-type'] == 'corrected-article':
                    isErratum = True
                    relateddoi = r['xlink:href']
        except Exception as err:
            pass

        # links: DOI and arxiv preprints
        # DOI
        base_metadata['properties'] = {}
        if isErratum:
            try:
                # Strips everything up to and including "doi.org/" so only
                # the bare DOI remains.  The DOI found in the title takes
                # precedence over the one from related-article.
                doiurl_pat = r'(.*?)(doi.org\/)'
                if titledoi:
                    base_metadata['properties']['ERRATUM'] = re.sub(doiurl_pat, '', titledoi)
                elif relateddoi:
                    base_metadata['properties']['ERRATUM'] = re.sub(doiurl_pat, '', relateddoi)
                else:
                    print('warning, no doi for erratum!')
                    # pass
            except Exception as err:
                print('warning, problem making erratum: %s' % err)
                # pass
        try:
            ids = article_meta.find_all('article-id')
        except Exception as err:
            ids = []
        # NOTE(review): the d['pub-id-type'] lookup below is not inside a
        # try block -- confirm every article-id carries that attribute.
        for d in ids:
            if d['pub-id-type'] == 'doi':
                base_metadata['properties']['DOI'] = self._detag(d, [])
        # Arxiv Preprint
        try:
            arxiv = article_meta.find_all('custom-meta')
        except Exception as err:
            pass
        else:
            ax_pref = 'https://arxiv.org/abs/'
            for ax in arxiv:
                try:
                    x_name = self._detag(ax.find('meta-name'), [])
                    x_value = self._detag(ax.find('meta-value'), [])
                    if x_name == 'arxivppt':
                        base_metadata['properties']['HTML'] = ax_pref + x_value
                except Exception as err:
                    pass

        # Pubdate:
        # Each pub-date is rendered as "<month>/<year>".  A date flagged as
        # print / ppub / cover always overwrites 'pubdate'; an electronic /
        # epub date is only used when no 'pubdate' has been stored yet.
        try:
            pub_dates = article_meta.find_all('pub-date')
        except Exception as err:
            pub_dates = []
        for d in pub_dates:
            try:
                a = d['publication-format']
            except KeyError:
                a = ''
            try:
                b = d['pub-type']
            except KeyError:
                b = ''
            try:
                pubdate = "/" + self._detag(d.year, [])
                try:
                    d.month
                except Exception as err:
                    pubdate = "00" + pubdate
                else:
                    # A month that does not parse as an integer is looked
                    # up in MONTH_TO_NUMBER by its first three letters,
                    # lower-cased.
                    try:
                        int(self._detag(d.month, []))
                    except Exception as errrr:
                        month_name = self._detag(d.month, [])[0:3].lower()
                        month = MONTH_TO_NUMBER[month_name]
                    else:
                        month = self._detag(d.month, [])
                    # Zero-pad single-digit months.
                    if int(month) < 10:
                        month = "0" + str(int(month))
                    else:
                        month = str(month)
                    pubdate = month + pubdate
            except Exception as errrr:
                pass
            else:
                if a == 'print' or b == 'ppub' or b == 'cover':
                    base_metadata['pubdate'] = pubdate
                elif a == 'electronic' or b == 'epub':
                    # The bare lookup raises KeyError when no pubdate has
                    # been stored yet; only then is this date used.
                    try:
                        base_metadata['pubdate']
                    except Exception as err:
                        base_metadata['pubdate'] = pubdate
            try:
                if b == 'open-access':
                    base_metadata.setdefault('properties', {}).setdefault('OPEN', 1)
            except Exception as err:
                pass

        # Check for open-access / "Permissions" field
        try:
            permissions = article_meta.find('permissions').find_all('license')
        except Exception as err:
            pass
        else:
            for p in permissions:
                try:
                    if p['license-type'] == 'open':
                        base_metadata.setdefault('properties', {}).setdefault('OPEN', 1)
                except Exception as err:
                    pass


        # Pages:
        # The first page falls back from fpage to elocation-id to pageStart.
        # When none is found the name is deleted, so the bare reference
        # below raises NameError and the whole page section is skipped.

        fpage = article_meta.fpage
        if fpage is None:
            fpage = article_meta.find('elocation-id')
            if fpage is None:
                fpage = article_meta.pageStart
                if fpage is None:
                    del fpage
        try:
            fpage
        except NameError:
            pass
        else:
            # Same del / NameError technique for the last page: it is
            # dropped when missing or when it equals the first page, in
            # which case 'page' holds the first page only.
            lpage = article_meta.lpage
            if lpage is None:
                lpage = article_meta.pageEnd
                if lpage is None:
                    del lpage
            else:
                if lpage == fpage:
                    del lpage
            try:
                lpage
            except NameError:
                base_metadata['page'] = self._detag(fpage, [])
            else:
                base_metadata['page'] = self._detag(fpage, []) + "-" + (
                    self._detag(lpage, []))

        # Number of Pages:
        try:
            counts = article_meta.counts
            pagecount = counts.find('page-count')
            base_metadata['numpages'] = '<NUMPAGES>' + pagecount['count'] + '</NUMPAGES>'
        except Exception as err:
            pass

        # References (now using back_meta):
        if back_meta is not None:

            ref_list_text = []
            try:
                ref_results = back_meta.find('ref-list').find_all('ref')
                # NOTE(review): 'str_type' is not defined in this method;
                # presumably a module-level name -- confirm.  'r' is rebound
                # again here.
                for r in ref_results:
                    # Each ref is serialized, stripped of newlines, has
                    # whitespace runs collapsed to one space, and is then
                    # passed through named_entities.
                    s = str_type(r.extract()).replace('\n', '')
                    s = re.sub(r'\s+', r' ', s)
                    s = namedentities.named_entities(s)
                    ref_list_text.append(s)
            except Exception as err:
                pass
            else:
                base_metadata['refhandler_list'] = ref_list_text

        # Entity Conversions:
        # String values, and string items inside list values, are run
        # through EntityConverter; everything else is copied as is.
        econv = EntityConverter()
        for k, v in base_metadata.items():
            if isinstance(v,str_type):
                econv.input_text = v
                econv.convert()
                v = econv.output_text
            elif isinstance(v, list):
                newv = []
                for l in v:
                    if isinstance(l,str_type):
                        econv.input_text = l
                        econv.convert()
                        l = econv.output_text
                    newv.append(l)
                v = newv
            else:
                pass
            output_metadata[k] = v


        return output_metadata
 def run(self, text):
     """Return ``text`` processed by smartyPants and then by named_entities."""
     educated = smartyPants(text)
     return named_entities(educated)
Exemple #9
0
 def clean(self, value):
     """Run ``str`` values through named_entities, then defer to the parent clean().

     Values that are not ``str`` instances are handed to the parent class
     unchanged.
     """
     cleaned = named_entities(value) if isinstance(value, str) else value
     return super(HTMLField, self).clean(cleaned)
Exemple #10
0
def htmlentities(s):
    """Escape ``s``, express non-ASCII characters as entities and mark it safe.

    The escaped text is encoded to ASCII with XML character references for
    anything outside ASCII, decoded back to text, passed through
    ``named_entities`` and finally wrapped with ``mark_safe``.
    """
    escaped = escape(s)
    ascii_bytes = escaped.encode("ascii", "xmlcharrefreplace")
    ascii_text = ascii_bytes.decode("utf8")
    return mark_safe(named_entities(ascii_text))
Exemple #11
0
from namedentities import named_entities

# Sample text mixing a \u-escaped em dash, a numeric character reference
# and a named entity.
u = 'both em\u2014and&#x2013;dashes&hellip;'
# Convert first, then print the converted text.
converted = named_entities(u)
print(converted)
Exemple #12
0
 def run(self, text):
     """Return ``text`` processed by smartyPants and then by named_entities."""
     educated = smartyPants(text)
     return named_entities(educated)
Exemple #13
0
    def parse(self, fp, **kwargs):

        output_metadata = {}

        document = self.resource_dict(fp, **kwargs)
        r = document.front

        try:
            article_meta = r.find('article-meta')
            journal_meta = r.find('journal-meta')
        except Exception as err:
            return {}

        back_meta = document.back

        base_metadata = {}

        # Title:
        title_xref_list = []
        title_fn_list = []
        try:
            title = article_meta.find('title-group').find('article-title')
        except Exception as err:
            pass
        else:
            try:
                for dx in title.find_all('xref'):
                    title_xref_list.append(
                        self._detag(dx, JATS_TAGSET['abstract']).strip())
                    dx.decompose()
                # title.xref.decompose()
                # title.xref.extract()
            except Exception as err:
                pass
            try:
                for df in title.find_all('fn'):
                    title_fn_list.append(
                        self._detag(df, JATS_TAGSET['abstract']).strip())
                    df.decompose()
                # title.fn.decompose()
                # title.fn.extract()
            except Exception as err:
                pass
            base_metadata['title'] = (self._detag(
                title, JATS_TAGSET['title']).strip())

        # Abstract:
        try:
            abstract = article_meta.abstract.p
        except Exception as err:
            pass
        else:
            try:
                for element in abstract(
                        text=lambda text: isinstance(text, CData)):
                    element.extract()
            except Exception as err:
                pass
            else:
                abstract = (self._detag(abstract, JATS_TAGSET['abstract']))
            base_metadata['abstract'] = abstract
            if title_fn_list:
                base_metadata['abstract'] += '  ' + ' '.join(title_fn_list)

        # Authors and Affiliations:
        # Set up affils storage
        affils = OrderedDict()

        # Author notes/note ids
        try:
            notes = article_meta.find('author-notes').find_all('fn')
        except Exception as err:
            pass
        else:
            for n in notes:
                try:
                    n.label.decompose()
                except Exception as err:
                    pass
                else:
                    try:
                        n['id']
                    except Exception as err:
                        pass
                        # print "I'm failing on author notes!",err
                    else:
                        key = n['id']
                        note_text = self._detag(n, JATS_TAGSET['affiliations'])
                        affils[key] = note_text.strip()

        # Affils/affil ids
        l_need_affils = False
        try:
            affil = article_meta.find('contrib-group').find_all('aff')
            if len(affil) == 0:
                try:
                    affil = article_meta.find_all('aff')
                except Exception as err:
                    pass
        except Exception as err:
            pass
        else:
            for a in affil:
                try:
                    a.label.decompose()
                except Exception as err:
                    pass
                try:
                    a['id']
                except Exception as err:
                    # print "I'm failing in the affils loop!",err
                    l_need_affils = True
                else:
                    key = a['id']
                    ekey = ''
                    try:
                        email_array = []
                        email_a = a.find_all('ext-link')
                        for em in email_a:
                            if em['ext-link-type'] == 'email':
                                address = self._detag(
                                    em, (JATS_TAGSET['affiliations']))
                                address_new = "<EMAIL>" + address + "</EMAIL>"
                                ekey = em['id']
                                if ekey is not '':
                                    affils[ekey] = address_new
                        while a.find('ext-link') is not None:
                            a.find('ext-link').extract()
                    except Exception as err:
                        pass

                    aff_text = self._detag(a, JATS_TAGSET['affiliations'])
                    affils[key] = aff_text.strip()
                    # if ekey is not '':
                    #     affils[ekey] = address_new

        # Author name and affil/note lists:
        try:
            authors = article_meta.find('contrib-group').find_all('contrib')
        except Exception as err:
            pass
        else:
            base_metadata['authors'] = []
            base_metadata['affiliations'] = []
            for a in authors:

                # ORCIDs
                orcid_out = None
                try:
                    # orcids = a.find_all('ext-link')
                    orcids = a.find('ext-link')
                    try:
                        if orcids['ext-link-type'] == 'orcid':
                            o = self._detag(orcids, [])
                            orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
                    except Exception as err:
                        pass
                except Exception as err:
                    pass
                if orcid_out is None:
                    try:
                        if a.find('contrib-id') is not None:
                            auth_id = a.find('contrib-id')
                            if auth_id['contrib-id-type'] == 'orcid':
                                o = self._detag(auth_id, [])
                                o = o.split('/')[-1]
                                orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
                    except Exception as err:
                        pass

                # If you didn't get affiliations above, l_need_affils == True, so do this...
                if l_need_affils:
                    try:
                        # MT, 2021-Jan-19 MNRAS fix:
                        # note that a may have multiple aff.affiliations tags, so use find_all here
                        # if a .find('aff') is not None:
                        # aff_id = a.find('aff')
                        # aff_text = self._detag(aff_id, JATS_TAGSET['affiliations'])
                        if a.find_all('aff') is not None:
                            aff_text_arr = list()
                            for ax in a.find_all('aff'):
                                aff_text_arr.append(
                                    self._detag(
                                        ax,
                                        JATS_TAGSET['affiliations']).strip())
                            aff_text = "; ".join(aff_text_arr)
                    except Exception as err:
                        pass

                # Author names
                if a.find('collab') is not None:
                    base_metadata['authors'].append(self._detag(a.collab, []))
                else:
                    if a.find('surname') is not None:
                        surname = self._detag(a.surname, [])
                    else:
                        surname = ''
                    if a.find('prefix') is not None:
                        prefix = self._detag(a.prefix, []) + ' '
                    else:
                        prefix = ''
                    if a.find('suffix') is not None:
                        suffix = ' ' + self._detag(a.suffix, [])
                    else:
                        suffix = ''
                    if a.find('given-names') is not None:
                        given = self._detag(a.find('given-names'), [])
                    else:
                        given = ''
                    forename = prefix + given + suffix
                    if forename == '':
                        if surname != '':
                            base_metadata['authors'].append(surname)
                        # else:
                        # base_metadata['authors'].append('ANONYMOUS')
                        # check instead whether author array is empty, and
                        # pass an empty array to serializer
                    else:
                        if surname != '':
                            base_metadata['authors'].append(surname + ', ' +
                                                            forename)
                        else:
                            base_metadata['authors'].append(forename)

                    # EMAIL in contrib-group (e.g. OUP)
                    email = None
                    if a.find('email') is not None:
                        email = self._detag(a.email, [])
                        email = '<EMAIL>' + email + '</EMAIL>'

                # Author affil/note ids
                try:
                    aid = a.find_all('xref')
                except Exception as err:
                    pass
                else:
                    aid_arr = []
                    if len(aid) > 0:
                        try:
                            aid_str = ' '.join([x['rid'] for x in aid])
                        except Exception as err:
                            print("jats.py: Failure in affil parsing: %s" %
                                  err)
                        else:
                            aid_arr = aid_str.split()

                try:
                    new_aid_arr = []
                    for af in affils.keys():
                        if af in aid_arr:
                            new_aid_arr.append(af)
                    aid_arr = new_aid_arr

                    # check whether or not you got affil data in one way or the other...
                    if not l_need_affils:
                        aff_text = '; '.join(affils[x] for x in aid_arr)
                    aff_text = aff_text.replace(';;', ';').rstrip(';')
                    aff_text = aff_text.replace('; ,', '').rstrip()

                    # Got ORCID?
                    if orcid_out is not None:
                        aff_text = aff_text + '; ' + orcid_out
                    if email is not None:
                        aff_text = aff_text + ' ' + email
                    base_metadata['affiliations'].append(aff_text)
                except Exception as errrror:
                    if orcid_out is not None:
                        base_metadata['affiliations'].append(orcid_out)
                    else:
                        base_metadata['affiliations'].append('')
                affnew = []
                for affx in base_metadata['affiliations']:
                    affnew.append(AffiliationParser(affx).parse())
                base_metadata['affiliations'] = affnew

            if len(base_metadata['authors']) > 0:
                base_metadata['authors'] = "; ".join(base_metadata['authors'])
            else:
                del base_metadata['authors']

        # Copyright:
        try:
            copyright = article_meta.find('copyright-statement')
        except Exception as err:
            pass
        else:
            base_metadata['copyright'] = self._detag(copyright, [])

        # Keywords:
        try:
            keys_uat = []
            keys_misc = []
            keys_aas = []
            uatkeys = []
            keywords = []
            keyword_groups = article_meta.find_all('kwd-group')
            for kg in keyword_groups:
                # Check for UAT first:
                if kg['kwd-group-type'] == 'author':
                    keys_uat_test = kg.find_all('compound-kwd-part')
                    for kk in keys_uat_test:
                        if kk['content-type'] == 'uat-code':
                            keys_uat.append(
                                self._detag(kk, JATS_TAGSET['keywords']))
                    if not keys_uat:
                        keys_misc_test = kg.find_all('kwd')
                        for kk in keys_misc_test:
                            keys_misc.append(
                                self._detag(kk, JATS_TAGSET['keywords']))
                # Then check for AAS:
                elif kg['kwd-group-type'] == 'AAS':
                    keys_aas_test = kg.find_all('kwd')
                    for kk in keys_aas_test:
                        keys_aas.append(
                            self._detag(kk, JATS_TAGSET['keywords']))
                # If all else fails, just search for 'kwd'
                else:
                    keys_misc_test = kg.find_all('kwd')
                    for kk in keys_misc_test:
                        keys_misc.append(
                            self._detag(kk, JATS_TAGSET['keywords']))

            if keys_uat:
                uatkeys = keys_uat
            if uatkeys:
                base_metadata['uatkeys'] = ', '.join(uatkeys)

            if keys_aas:
                keywords = keys_aas
            if keys_misc:
                if keywords:
                    keywords.extend(keys_misc)
                else:
                    keywords = keys_misc
            if keywords:
                base_metadata['keywords'] = ', '.join(keywords)
        except Exception as err:
            pass
        if 'keywords' not in base_metadata:
            try:
                keywords = article_meta.find('article-categories').find_all(
                    'subj-group')
            except Exception as err:
                keywords = []
            for c in keywords:
                try:
                    if c['subj-group-type'] == 'toc-minor':
                        klist = []
                        for k in c.find_all('subject'):
                            klist.append(
                                self._detag(k, JATS_TAGSET['keywords']))
                        base_metadata['keywords'] = ', '.join(klist)
                except Exception as err:
                    pass

        # No longer required: converting any UAT keywords to their URI.
        # The disabled code for that step was:
        # try:
        #     uat_cnv = UATURIConverter()
        #     base_metadata['uatkeys'] = uat_cnv.convert_to_uri(base_metadata['uatkeys'])
        # except Exception as err:
        #     pass

        # Volume:
        volume = article_meta.volume
        base_metadata['volume'] = self._detag(volume, [])

        # Issue:
        issue = article_meta.issue
        base_metadata['issue'] = self._detag(issue, [])

        # Journal name:
        try:
            journal = journal_meta.find('journal-title-group').find(
                'journal-title')
            base_metadata['publication'] = self._detag(journal, [])
        except Exception as err:
            try:
                journal = journal_meta.find('journal-title')
                base_metadata['publication'] = self._detag(journal, [])
            except Exception as err:
                pass

        try:
            jid = journal_meta.find('journal-id',
                                    {'journal-id-type': 'publisher-id'})
            if jid:
                base_metadata['pub-id'] = self._detag(jid, [])
            else:
                try:
                    jid = journal_meta.find('journal-id',
                                            {'journal-id-type': 'coden'})
                    if jid:
                        base_metadata['coden'] = self._detag(jid, [])
                except Exception as err:
                    pass
        except Exception as err:
            pass

        # links: DOI and arxiv preprints
        # DOI
        base_metadata['properties'] = {}
        try:
            ids = article_meta.find_all('article-id')
        except Exception as err:
            ids = []
        for d in ids:
            if d['pub-id-type'] == 'doi':
                base_metadata['properties']['DOI'] = self._detag(d, [])
        # Arxiv Preprint
        try:
            arxiv = article_meta.find_all('custom-meta')
        except Exception as err:
            pass
        else:
            ax_pref = 'https://arxiv.org/abs/'
            for ax in arxiv:
                try:
                    x_name = self._detag(ax.find('meta-name'), [])
                    x_value = self._detag(ax.find('meta-value'), [])
                    if x_name == 'arxivppt':
                        base_metadata['properties']['HTML'] = ax_pref + x_value
                except Exception as err:
                    pass

        # Pubdate:
        try:
            pub_dates = article_meta.find_all('pub-date')
        except Exception as err:
            pub_dates = []
        for d in pub_dates:
            try:
                a = d['publication-format']
            except KeyError:
                a = ''
            try:
                b = d['pub-type']
            except KeyError:
                b = ''
            try:
                pubdate = "/" + self._detag(d.year, [])
                try:
                    d.month
                except Exception as err:
                    pubdate = "00" + pubdate
                else:
                    try:
                        int(self._detag(d.month, []))
                    except Exception as errrr:
                        month_name = self._detag(d.month, [])[0:3].lower()
                        month = MONTH_TO_NUMBER[month_name]
                    else:
                        month = self._detag(d.month, [])
                    if int(month) < 10:
                        month = "0" + str(int(month))
                    else:
                        month = str(month)
                    pubdate = month + pubdate
            except Exception as errrr:
                pass
            else:
                if a == 'print' or b == 'ppub' or b == 'cover':
                    base_metadata['pubdate'] = pubdate
                elif a == 'electronic' or b == 'epub':
                    try:
                        base_metadata['pubdate']
                    except Exception as err:
                        base_metadata['pubdate'] = pubdate
            try:
                if b == 'open-access':
                    base_metadata.setdefault('properties',
                                             {}).setdefault('OPEN', 1)
            except Exception as err:
                pass

        # Pages:

        fpage = article_meta.fpage
        if fpage is None:
            fpage = article_meta.find('elocation-id')
            if fpage is None:
                fpage = article_meta.pageStart
                if fpage is None:
                    del fpage
        try:
            fpage
        except NameError:
            pass
        else:
            lpage = article_meta.lpage
            if lpage is None:
                lpage = article_meta.pageEnd
                if lpage is None:
                    del lpage
            else:
                if lpage == fpage:
                    del lpage
            try:
                lpage
            except NameError:
                base_metadata['page'] = self._detag(fpage, [])
            else:
                base_metadata['page'] = self._detag(
                    fpage, []) + "-" + (self._detag(lpage, []))

        # Number of Pages:
        try:
            counts = article_meta.counts
            pagecount = counts.find('page-count')
            base_metadata[
                'numpages'] = '<NUMPAGES>' + pagecount['count'] + '</NUMPAGES>'
        except Exception as err:
            pass

        # References (now using back_meta):
        if back_meta is not None:

            ref_list_text = []
            try:
                ref_results = back_meta.find('ref-list').find_all('ref')
                for r in ref_results:
                    s = str_type(r.extract()).replace('\n', '')
                    s = re.sub(r'\s+', r' ', s)
                    s = namedentities.named_entities(s)
                    ref_list_text.append(s)
            except Exception as err:
                pass
            else:
                base_metadata['refhandler_list'] = ref_list_text

        output_metadata = base_metadata

        # Last step: entity conversion
        ec_fields = ['authors', 'abstract', 'title']
        econv = EntityConverter()
        for ecf in ec_fields:
            try:
                econv.input_text = output_metadata[ecf]
                econv.convert()
                output_metadata[ecf] = econv.output_text
            except Exception as err:
                pass

        return output_metadata