def parse_record(self, row) -> Optional[DirectoryInfo]:
    if not row:
        return None
    row = json.loads(row)
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issne=row.get("issne"),
        issnp=row.get("issnp"),
        custom_id=row["ezb_id"],
        name=clean_str(row["title"]),
        publisher=clean_str(row.get("publisher")),
    )

    info.extra = dict()
    for k in (
        "ezb_color",
        "subjects",
        "keywords",
        "zdb_id",
        "first_volume",
        "first_issue",
        "first_year",
        "appearance",
        "costs",
    ):
        if row.get(k):
            info.extra[k] = row[k]

    url = HomepageUrl.from_url(row.get("url"))
    if url:
        info.homepage_urls.append(url)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    if not row:
        return None
    row = json.loads(row)
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issne=row.get("issne"),
        issnp=row.get("issnp"),
        raw_issn=row.get("issn"),
        name=clean_str(row["title"]),
        publisher=clean_str(row.get("ed")),
    )

    info.extra["as_of"] = self.config.szczepanski.date
    if row.get("extra"):
        info.extra["notes"] = row.get("extra")
    for k in ("other_titles", "year_spans", "ed"):
        if row.get(k):
            info.extra[k] = row[k]

    url = HomepageUrl.from_url(row.get("url"))
    if url:
        info.homepage_urls.append(url)
    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issne=record["eissn"],
        issnp=record["pissn"],
        custom_id=record.get("doi").strip() or None,
        name=clean_str(record.get("JournalTitle")),
        publisher=clean_str(record.get("Publisher")),
    )

    if record["additionalIssns"]:
        info.raw_issn = record["additionalIssns"][0]

    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    """
    Most of this metadata comes from chocula/fatcat; we are only interested
    in the homepage URLs. The "corrected titles" have been manually entered
    into fatcat directly.

    CSV columns:
    - issnl
    - issnp
    - issne
    - name
    - corrected title
    - publisher
    - country
    - lang
    - release_count
    - Homepage URL
    - Inactive
    """
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issnl=record["issnl"],
    )
    url = record["Homepage URL"]
    if url is None or url.lower() == "unknown" or len(url) < 4:
        return None
    homepage = HomepageUrl.from_url(url)
    if homepage is None:
        return None
    info.homepage_urls.append(homepage)
    return info

def parse_record(self, line) -> Optional[DirectoryInfo]:
    record = json.loads(line)
    extra = dict(
        status=clean_str(record.get("current_status")),
        first_year=record.get("first_year"),
        collection=record.get("collection_acronym"),
    )
    for k in list(extra.keys()):
        if extra[k] is None:
            extra.pop(k)

    country: Optional[str] = None
    if record["publisher_country"] and len(record["publisher_country"][0]) == 2:
        country = record["publisher_country"][0].lower()

    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issne=clean_issn(record.get("electronic_issn") or ""),
        issnp=clean_issn(record.get("print_issn") or ""),
        custom_id=clean_str(record.get("scielo_issn")),
        name=clean_str(record.get("fulltitle")),
        publisher=clean_str((record.get("publisher_name") or [""])[0]),
        abbrev=clean_str(record["abbreviated_iso_title"]),
        platform="scielo",
        langs=[lang for lang in [parse_lang(s) for s in record["languages"]] if lang],
        country=country,
        extra=extra,
    )
    if record["url"]:
        homepage = HomepageUrl.from_url(record["url"])
        if homepage:
            info.homepage_urls.append(homepage)
    return info

def index_file(self, db) -> Counter:
    """
    Transforms a KBART file into a dict of dicts; but basically a list of
    JSON objects, one per journal. KBART files can have multiple rows per
    journal (eg, different year spans), which is why this pass is needed.
    """
    print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr)
    counts: Counter = Counter()
    kbart_dict: Dict[str, KbartRecord] = dict()
    for row in self.open_file():
        counts["total"] += 1

        record = self.parse_record(row, db.issn_db)
        if record is None:
            counts["skip-parse"] += 1
            continue
        elif not record.issnl:
            counts["skip-issnl"] += 1
            continue
        elif record.start_year is None or record.end_year is None:
            counts["partial-missing-years"] += 1
        counts["parsed"] += 1

        existing = kbart_dict.get(record.issnl, record)
        if record.start_year and record.end_year:
            old_spans = existing.year_spans or []
            if not record.start_year <= record.end_year:
                new_spans = [[record.end_year, record.start_year]]
            else:
                new_spans = [[record.start_year, record.end_year]]
            record.year_spans = merge_spans(old_spans, new_spans)
        elif record.year_spans:
            old_spans = existing.year_spans or []
            record.year_spans = merge_spans(old_spans, record.year_spans)
        kbart_dict[record.issnl] = record

    counts["unique-issnl"] = len(kbart_dict)

    cur = db.db.cursor()
    for issnl, record in kbart_dict.items():
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnl=record.issnl,
            issne=record.issne,
            issnp=record.issnp,
            name=record.title,
            publisher=record.publisher,
            homepage_urls=[],
            extra=dict(year_spans=record.year_spans),
        )
        if record.url:
            info.homepage_urls.append(record.url)
        status = db.insert_directory(info, cur=cur)
        counts[status] += 1
    cur.close()
    db.db.commit()
    return counts

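# Illustrative sketch (not part of the loader): how the year-span merge pass
# above folds multiple KBART rows for one journal into a single span list.
# It assumes merge_spans(old, new) takes two lists of [start, end] pairs and
# returns them sorted with overlapping or adjacent spans collapsed; that
# behavior is inferred from its use here, and the row values are made up.
example_rows = [
    (1990, 1995),
    (1994, 2000),  # overlaps the first span, so the two collapse into one
    (2005, 2010),  # disjoint, stays a separate span
]
spans: List[List[int]] = []
for start, end in example_rows:
    spans = merge_spans(spans, [[start, end]])
# Expected result under those assumptions: [[1990, 2000], [2005, 2010]]
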
def parse_record(self, row) -> Optional[DirectoryInfo]:
    # super mangled :(
    row.update(self.sherpa_policies[row["RoMEO Record ID"]])

    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issnp=row["ISSN"],
        issne=row["ESSN"],
        name=clean_str(row["Journal Title"]),
        publisher=clean_str(row["Publisher"]),
        country=parse_country(row["Country"]),
        custom_id=row["RoMEO Record ID"],
    )

    if row["RoMEO colour"]:
        info.extra["color"] = row["RoMEO colour"]
    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    if not (record.get("ISSN (Online)") or record.get("ISSN (Print)")):
        return None
    return DirectoryInfo(
        directory_slug=self.source_slug,
        issne=record.get("ISSN (Online)"),
        issnp=record.get("ISSN (Print)"),
        custom_id=record.get("NlmId").strip() or None,
        name=clean_str(record.get("JournalTitle")),
        abbrev=clean_str(record["IsoAbbr"]),
    )

def parse_record(self, row) -> Optional[DirectoryInfo]:
    if not row.get("issn"):
        return None
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issne=row["issn_electronic"],
        issnp=row["issn_print"],
        raw_issn=row["issn_l"] or row["issn"],
        name=clean_str(row["journal_full_title"]),
        publisher=clean_str(row["publisher"]),
    )
    info.extra["is_hybrid"] = bool(row["is_hybrid"])

    homepage = HomepageUrl.from_url(row["url"])
    if homepage:
        info.homepage_urls.append(homepage)
    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=clean_issn(record["Issn"]),
        custom_id=record["JournalId"],
        name=clean_str(record["DisplayName"]),
        publisher=clean_str(record["Publisher"]),
    )
    homepage = HomepageUrl.from_url(record["Webpage"] or "")
    if homepage:
        info.homepage_urls.append(homepage)
    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    if not record.strip():
        return None
    record = json.loads(record)
    info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issn"])
    homepage = HomepageUrl.from_url(record["homepage"])
    if homepage:
        info.homepage_urls.append(homepage)
    else:
        return None
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    if not (row.get("issn") and row.get("title")):
        return None
    wikidata_qid = row["item"].strip().split("/")[-1]
    publisher = row["publisher_name"]
    if (
        (publisher.startswith("Q") and publisher[1].isdigit())
        or publisher.startswith("t1")
        or not publisher
    ):
        publisher = None
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=row["issn"],
        custom_id=wikidata_qid,
        name=clean_str(row["title"]),
        publisher=clean_str(publisher),
    )
    if row.get("start_year"):
        info.extra["start_year"] = row["start_year"]

    url = HomepageUrl.from_url(row.get("websiteurl"))
    if url:
        info.homepage_urls.append(url)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=row["ISSN 1"],
        custom_id=clean_str(row["ERA Journal Id"]),
        name=clean_str(row.get("Title")),
        original_name=clean_str(row.get("Foreign Title")),
        extra=dict(
            australian_era=dict(
                era_id=clean_str(row["ERA Journal Id"]),
                field=clean_str(row["FoR 1 Name"]),
                field_code=clean_str(row["FoR 1"]),
            )
        ),
    )
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issnp=row["Print ISSN"],
        issne=row["Online ISSN"],
        custom_id=clean_str(row["NSD tidsskrift_id"]),
        publisher=clean_str(row["Publisher"]),
        country=parse_country(row["Country of publication"]),
        name=clean_str(row.get("International title")),
        langs=[lang for lang in [parse_lang(row["Language"])] if lang],
    )

    info.extra["as_of"] = self.config.norwegian.date
    if row["Level 2019"]:
        info.extra["level"] = int(row["Level 2019"])

    if row["Original title"] != row["International title"]:
        info.original_name = clean_str(row["Original title"])

    url = HomepageUrl.from_url(row["URL"])
    if url:
        info.homepage_urls.append(url)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    row = json.loads(row)
    info = DirectoryInfo(
        directory_slug=self.source_slug,
    )

    # format is an array of metadata elements
    for el in row:
        if "label" in el and el["@id"].startswith(
            "http://id.loc.gov/vocabulary/countries"
        ):
            value = el["label"]
            if "(State)" in value:
                value = ""
            if value == "Russia (Federation)":
                value = "Russia"
            info.country = parse_country(value)
        if "@type" not in el:
            continue
        if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
            info.issnl = clean_issn(el["value"])
        if "mainTitle" in el:
            if isinstance(el["mainTitle"], list):
                info.name = clean_str(el["mainTitle"][0])
            else:
                info.name = clean_str(el["mainTitle"])
        if el.get("format") == "vocabularies/medium#Print":
            info.issnp = clean_issn(el["issn"])
        elif el.get("format") == "vocabularies/medium#Electronic":
            info.issne = clean_issn(el["issn"])
        urls = el.get("url", [])
        if isinstance(urls, str):
            urls = [urls]
        for url in urls:
            homepage = HomepageUrl.from_url(url)
            if homepage:
                info.homepage_urls.append(homepage)

    return info

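# Illustrative sketch (not part of the loader): a minimal metadata array of
# the shape the parser above consumes. The key names and vocabulary URLs are
# taken from the checks in parse_record; the concrete values are hypothetical.
example_row = json.dumps([
    {
        "@id": "http://id.loc.gov/vocabulary/countries/ne",
        "label": "Netherlands",
    },
    {
        "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
        "value": "1234-5678",
    },
    {
        "@type": "http://id.loc.gov/ontologies/bibframe/Instance",
        "mainTitle": "Example Journal of Examples",
        "format": "vocabularies/medium#Electronic",
        "issn": "1234-5678",
        "url": "https://example.com/journal",
    },
])
# Feeding example_row to parse_record would populate country, issnl, name,
# issne, and one homepage URL on the resulting DirectoryInfo.
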
def parse_record(self, record) -> Optional[DirectoryInfo]:
    # HACK: work around a UTF-8 BOM prefixed to the first CSV column name
    if "\ufeffTitle" in record:
        record["Title"] = record["\ufeffTitle"]
    if not record["Title"]:
        return None
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=clean_issn(record["ISSN"]),
        issne=clean_issn(record["EISSN"]),
        name=clean_str(record["Title"]),
    )
    homepage = HomepageUrl.from_url(record["URL"])
    if homepage:
        info.homepage_urls.append(homepage)
    return info

def parse_record(self, record) -> Optional[DirectoryInfo]:
    if not record["Journal Name"]:
        return None
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=clean_issn(record["ISSN"]),
        issne=clean_issn(record["E-ISSN"]),
        name=clean_str(record["Journal Name"]),
        publisher=clean_str(record["Publisher"]),
        langs=[lang for lang in [parse_lang(record["Language(s)"])] if lang],
        country=parse_country(record["Country"]),
    )
    homepage = HomepageUrl.from_url(record["Internet Archive Link"])
    if homepage:
        info.homepage_urls.append(homepage)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    # TODO: Subjects, Permanent article identifiers, work_level stuff
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        issnp=row["Journal ISSN (print version)"],
        issne=row["Journal EISSN (online version)"],
        name=clean_str(row["Journal title"]),
        publisher=clean_str(row["Publisher"]),
        country=parse_country(row["Country of publisher"]),
    )

    lang = parse_lang(row["Languages in which the journal accepts manuscripts"])
    if lang:
        info.langs.append(lang)

    info.extra["as_of"] = self.config.snapshot.date
    if row["DOAJ Seal"]:
        info.extra["seal"] = {"no": False, "yes": True}[row["DOAJ Seal"].lower()]

    if row["Preservation Services"]:
        info.extra["archive"] = [
            a.strip() for a in row["Preservation Services"].split(",") if a.strip()
        ]
    elif row["Preservation Service: national library"]:
        info.extra["archive"] = ["national-library"]

    default_license = row["Journal license"]
    if default_license and default_license.startswith("CC"):
        info.extra["default_license"] = default_license.replace("CC ", "CC-").strip()

    url = row["Journal URL"]
    if url:
        homepage = HomepageUrl.from_url(url)
        if homepage:
            info.homepage_urls.append(homepage)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=row["ISSN-L"],
        name=clean_str(row["Short Title"]),
        publisher=clean_str(row["Publisher"]),
        langs=[
            lang
            for lang in [parse_lang(s) for s in (row["Lang1"], row["Lang2"])]
            if lang
        ],
    )

    # TODO: region mapping: "Europe and North America"
    # TODO: lang mapping: already alpha-3

    # homepages
    for url in [u for u in (row["URL1"], row["URL2"]) if u]:
        homepage = HomepageUrl.from_url(url)
        if homepage:
            info.homepage_urls.append(homepage)
    return info

def parse_record(self, row) -> Optional[DirectoryInfo]:
    if not (row.get("ISSN_L") and row.get("TITLE")):
        return None

    # TODO: also add for other non-direct indices
    # for ind in ('WOS', 'SCOPUS'):
    #     issnl, status = self.add_issn(
    #         ind.lower(),
    #         raw_issn=row['ISSN_L'],
    #         name=row['TITLE'],
    #     )

    extra = dict()
    for ind in ("DOAJ", "ROAD", "PMC", "OAPC", "WOS", "SCOPUS"):
        extra["in_" + ind.lower()] = bool(int(row["JOURNAL_IN_" + ind]))

    return DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=row["ISSN_L"],
        name=clean_str(row["TITLE"]),
        extra=extra,
    )

def parse_record(self, line) -> Optional[DirectoryInfo]:
    record = json.loads(line)
    issn_info = record.get("identifiers", {}).get("issn", {})
    # sometimes is a list
    for k in ("generic", "electronic", "print"):
        if isinstance(issn_info.get(k), list):
            issn_info[k] = issn_info[k][0]
    info = DirectoryInfo(
        directory_slug=self.source_slug,
        raw_issn=clean_issn(issn_info.get("generic", "")),
        issne=clean_issn(issn_info.get("electronic", "")),
        issnp=clean_issn(issn_info.get("print", "")),
        name=clean_str(record.get("title")),
        langs=[lang for lang in [parse_lang(s) for s in record["languages"]] if lang],
    )
    if record["url"]:
        homepage = HomepageUrl.from_url(record["url"])
        if homepage:
            info.homepage_urls.append(homepage)
    return info