def test_03_languages(self):
    """ Use language information from our datasets """
    assert datasets.name_for_lang('en') == 'English'
    assert datasets.name_for_lang('eng') == 'English'
    assert datasets.language_for('English').name == 'English'
    assert datasets.language_for('german').bibliographic == 'ger'
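# Illustrative sketch only: this approximates the lookup semantics exercised by
# the test above using pycountry instead of portality.datasets, which actually
# loads its own tables from file. The fallback behaviour for unknown values is
# an assumption, not the real implementation.
import pycountry

def name_for_lang_sketch(code_or_name):
    # accept 2-letter codes, 3-letter codes, or English names (case-insensitive)
    try:
        return pycountry.languages.lookup(code_or_name).name
    except LookupError:
        # assumption: unknown values fall through unchanged
        return code_or_name

print(name_for_lang_sketch("en"))      # English
print(name_for_lang_sketch("eng"))     # English
print(name_for_lang_sketch("german"))  # German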
def language_name(self):
    # copy the languages and convert them to their english forms
    from portality import datasets  # delayed import, as it loads some stuff from file
    if self.language is not None:
        langs = self.language
        langs = [datasets.name_for_lang(l) for l in langs]
        uc = dataobj.to_unicode()
        langs = [uc(l) for l in langs]
        return list(set(langs))
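# Hypothetical usage sketch (the object and its language values are invented
# for illustration): codes are converted to their English names and de-duplicated.
#
#   bibjson.language = ["fr", "fra", "en"]
#   bibjson.language_name()   # -> ["French", "English"] (set-based, so order not guaranteed)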
def languages(vals):
    keep = []
    codes = [c.lower() for c, _ in datasets.language_options]
    names = [n.lower() for _, n in datasets.language_options]
    for v in vals:
        if v.lower() in codes:
            keep.append(datasets.name_for_lang(v))
        elif v.lower() in names:
            keep.append(v)
    return ", ".join(keep)
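# Usage sketch, assuming datasets.language_options is the usual list of
# (ISO code, English name) pairs and includes ("en", "English") and ("de", "German"):
# codes are mapped to names, names pass through as given, unrecognised values are dropped.
#
#   languages(["en", "German", "Klingon"])   # -> "English, German"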
def _generate_index(self):
    # the index fields we are going to generate
    issns = []
    subjects = []
    schema_subjects = []
    schema_codes = []
    classification = []
    langs = []
    country = None
    licenses = []
    publisher = []
    classification_paths = []
    unpunctitle = None
    asciiunpunctitle = None
    doi = None
    fulltext = None

    # the places we're going to get those fields from
    cbib = self.bibjson()
    jindex = self.data.get('index', {})
    hist = self.history()

    # get the issns out of the current bibjson
    issns += cbib.get_identifiers(cbib.P_ISSN)
    issns += cbib.get_identifiers(cbib.E_ISSN)

    # get the issn from the journal bibjson
    if isinstance(cbib.journal_issns, list):
        issns += cbib.journal_issns

    # de-duplicate the issns
    issns = list(set(issns))

    # now get the issns out of the historic records
    for date, hbib in hist:
        issns += hbib.get_identifiers(hbib.P_ISSN)
        issns += hbib.get_identifiers(hbib.E_ISSN)

    # get the subjects and concatenate them with their schemes from the current bibjson
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        subjects.append(term)
        schema_subjects.append(scheme + ":" + term)
        classification.append(term)
        if "code" in subs:
            schema_codes.append(scheme + ":" + subs.get("code"))

    # copy the languages
    from portality import datasets  # delayed import, as it loads some stuff from file
    if len(cbib.journal_language) > 0:
        langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

    # copy the country
    if jindex.get('country'):
        country = jindex.get('country')
    elif cbib.journal_country:
        country = xwalk.get_country_name(cbib.journal_country)

    # get the title of the license
    lic = cbib.get_journal_license()
    if lic is not None:
        licenses.append(lic.get("title"))

    # copy the publisher/provider
    if cbib.publisher:
        publisher.append(cbib.publisher)

    # deduplicate the lists
    issns = list(set(issns))
    subjects = list(set(subjects))
    schema_subjects = list(set(schema_subjects))
    classification = list(set(classification))
    licenses = list(set(licenses))
    publisher = list(set(publisher))
    langs = list(set(langs))
    schema_codes = list(set(schema_codes))

    # work out what the date of publication is
    date = cbib.get_publication_date()

    # calculate the classification paths
    from portality.lcc import lcc  # inline import since this hits the database
    for subs in cbib.subjects():
        scheme = subs.get("scheme")
        term = subs.get("term")
        if scheme == "LCC":
            path = lcc.pathify(term)
            if path is not None:
                classification_paths.append(path)

    # normalise the classification paths, so we only store the longest ones
    classification_paths = lcc.longest(classification_paths)

    # create an unpunctitle
    if cbib.title is not None:
        throwlist = string.punctuation + '\n\t'
        unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
        try:
            asciiunpunctitle = unidecode(unpunctitle)
        except Exception:
            asciiunpunctitle = unpunctitle

    # determine if the seal is applied
    has_seal = "Yes" if self.has_seal() else "No"

    # create a normalised version of the DOI for deduplication
    source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
    try:
        doi = normalise.normalise_doi(source_doi)
    except ValueError:
        # if we can't normalise the DOI, just store it as-is
        doi = source_doi

    # create a normalised version of the fulltext URL for deduplication
    fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
    if len(fulltexts) > 0:
        source_fulltext = fulltexts[0]
        try:
            fulltext = normalise.normalise_url(source_fulltext)
        except ValueError:
            # if we can't normalise the fulltext URL, store it as-is
            fulltext = source_fulltext

    # build the index part of the object
    self.data["index"] = {}
    if len(issns) > 0:
        self.data["index"]["issn"] = issns
    if date != "":
        self.data["index"]["date"] = date
        self.data["index"]["date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
    if len(subjects) > 0:
        self.data["index"]["subject"] = subjects
    if len(schema_subjects) > 0:
        self.data["index"]["schema_subject"] = schema_subjects
    if len(classification) > 0:
        self.data["index"]["classification"] = classification
    if len(publisher) > 0:
        self.data["index"]["publisher"] = publisher
    if len(licenses) > 0:
        self.data["index"]["license"] = licenses
    if len(langs) > 0:
        self.data["index"]["language"] = langs
    if country is not None:
        self.data["index"]["country"] = country
    if len(schema_codes) > 0:
        self.data["index"]["schema_code"] = schema_codes
    if len(classification_paths) > 0:
        self.data["index"]["classification_paths"] = classification_paths
    if unpunctitle is not None:
        self.data["index"]["unpunctitle"] = unpunctitle
    if asciiunpunctitle is not None:
        self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
    if has_seal:
        self.data["index"]["has_seal"] = has_seal
    if doi is not None:
        self.data["index"]["doi"] = doi
    if fulltext is not None:
        self.data["index"]["fulltext"] = fulltext
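# A sketch of the "index" section that _generate_index builds: the keys come from
# the assignments above, while the values shown here are invented for illustration.
#
#   {
#       "issn": ["1234-5678", "2345-678X"],
#       "date": "2013-11-01T00:00:00Z",
#       "date_toc_fv_month": "2013-11-01T00:00:00Z",
#       "subject": ["Medicine"],
#       "schema_subject": ["LCC:Medicine"],
#       "classification": ["Medicine"],
#       "classification_paths": ["Medicine"],
#       "language": ["English"],
#       "country": "United Kingdom",
#       "license": ["CC BY"],
#       "publisher": ["Example Press"],
#       "has_seal": "No",
#       "unpunctitle": "An example article title",
#       "asciiunpunctitle": "An example article title",
#       "doi": "10.1234/example.doi",
#       "fulltext": "http://example.com/article/1"
#   }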
def language_name(self):
    # copy the languages and convert them to their english forms
    langs = [datasets.name_for_lang(l) for l in self.language]
    uc = dataobj.to_unicode()
    langs = [uc(l) for l in langs]
    return list(set(langs))