Example #1
    def test_03_languages(self):
        """ Use language information from our datasets """
        assert datasets.name_for_lang('en') == 'English'
        assert datasets.name_for_lang('eng') == 'English'
        assert datasets.language_for('English').name == 'English'

        assert datasets.language_for('german').bibliographic == 'ger'
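
A minimal sketch of the lookups this test exercises, assuming the portality package is importable and its language tables load from file (as the inline-import comments in the examples below note). The assertions suggest that both two-letter and three-letter codes resolve to the same name, and that name lookups are case-insensitive:

    from portality import datasets

    # two- and three-letter codes resolve to the same English name
    print(datasets.name_for_lang('en'))    # 'English'
    print(datasets.name_for_lang('eng'))   # 'English'

    # lookup by name is case-insensitive; the record carries code attributes
    print(datasets.language_for('german').bibliographic)   # 'ger'
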
Example #2
def language_name(self):
    # copy the languages and convert them to their english forms
    from portality import datasets  # delayed import, as it loads some stuff from file
    # guard against an unset field, which would otherwise leave langs unbound
    langs = self.language if self.language is not None else []
    langs = [datasets.name_for_lang(l) for l in langs]
    uc = dataobj.to_unicode()
    langs = [uc(l) for l in langs]
    return list(set(langs))
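
A hedged usage sketch for this method; the record object and its language values are hypothetical, assumed only to carry a language field of ISO codes as the method expects:

    # hypothetical data object, for illustration only
    record.language = ['en', 'fre']
    print(record.language_name())   # e.g. ['English', 'French'] (set dedup, so order may vary)
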
Example #3
def languages(vals):
    """ Keep only values that are recognised language codes or names, rendered as names """
    keep = []
    codes = [c.lower() for c, _ in datasets.language_options]
    names = [n.lower() for _, n in datasets.language_options]
    for v in vals:
        if v.lower() in codes:
            # a recognised code: convert it to the language name
            keep.append(datasets.name_for_lang(v))
        elif v.lower() in names:
            # already a recognised name: keep it verbatim
            keep.append(v)
    return ", ".join(keep)
Example #4
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        classification = []
        langs = []
        country = None
        licenses = []
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        from portality import datasets  # delayed import, as it loads some stuff from file
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = xwalk.get_country_name(cbib.journal_country)

        # get the title of the license
        lic = cbib.get_journal_license()
        if lic is not None:
            licenses.append(lic.get("title"))

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        licenses = list(set(licenses))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title
                                  if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it as-is.
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"][
                "date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(licenses) > 0:
            self.data["index"]["license"] = licenses
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = unpunctitle
        # has_seal is always "Yes" or "No", so record it unconditionally
        self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
Example #5
def language_name(self):
    # copy the languages and convert them to their english forms
    from portality import datasets  # delayed import, as it loads some stuff from file
    langs = [datasets.name_for_lang(l) for l in self.language]
    uc = dataobj.to_unicode()
    langs = [uc(l) for l in langs]
    return list(set(langs))