Example #1
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row:
            return None
        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row.get("issne"),
            issnp=row.get("issnp"),
            custom_id=row["ezb_id"],
            name=clean_str(row["title"]),
            publisher=clean_str(row.get("publisher")),
        )

        info.extra = dict()
        for k in (
                "ezb_color",
                "subjects",
                "keywords",
                "zdb_id",
                "first_volume",
                "first_issue",
                "first_year",
                "appearance",
                "costs",
        ):
            if row.get(k):
                info.extra[k] = row[k]

        url = HomepageUrl.from_url(row.get("url"))
        if url:
            info.homepage_urls.append(url)

        return info
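
All of these parse_record examples lean on a shared toolkit (DirectoryInfo, HomepageUrl.from_url, clean_str, clean_issn, parse_lang, parse_country) whose definitions never appear in the snippets. As a reading aid, here is a minimal sketch of the shapes the code assumes; the field names are inferred from the constructor calls in these examples, and the real implementations will differ:

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class HomepageUrl:
    url: str

    @classmethod
    def from_url(cls, url: Optional[str]) -> Optional["HomepageUrl"]:
        # returns None for missing or obviously invalid URLs, matching
        # the truthiness checks used throughout the examples
        if not url or not url.strip().startswith(("http://", "https://")):
            return None
        return cls(url=url.strip())

@dataclass
class DirectoryInfo:
    directory_slug: str
    issnl: Optional[str] = None
    issne: Optional[str] = None
    issnp: Optional[str] = None
    raw_issn: Optional[str] = None
    custom_id: Optional[str] = None
    name: Optional[str] = None
    original_name: Optional[str] = None
    abbrev: Optional[str] = None
    publisher: Optional[str] = None
    platform: Optional[str] = None
    country: Optional[str] = None
    langs: List[str] = field(default_factory=list)
    homepage_urls: List[HomepageUrl] = field(default_factory=list)
    extra: Dict[str, Any] = field(default_factory=dict)

def clean_str(s: Optional[str]) -> Optional[str]:
    # collapse empty or whitespace-only strings to None
    if not s or not s.strip():
        return None
    return s.strip()
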
Example #2
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row:
            return None

        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row.get("issne"),
            issnp=row.get("issnp"),
            raw_issn=row.get("issn"),
            name=clean_str(row["title"]),
            publisher=clean_str(row.get("ed")),
        )

        info.extra["as_of"] = self.config.szczepanski.date
        if row.get("extra"):
            info.extra["notes"] = row.get("extra")
        for k in ("other_titles", "year_spans", "ed"):
            if row.get(k):
                info.extra[k] = row[k]

        url = HomepageUrl.from_url(row.get("url"))
        if url:
            info.homepage_urls.append(url)

        return info
Example #3
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=record["eissn"],
            issnp=record["pissn"],
            custom_id=record.get("doi").strip() or None,
            name=clean_str(record.get("JournalTitle")),
            publisher=clean_str(record.get("Publisher")),
        )

        if record["additionalIssns"]:
            info.raw_issn = record["additionalIssns"][0]

        return info
Example #4
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Most of this metadata comes from chocula/fatcat; we are only interested
        in the homepage URLs.

        The "corrected titles" have been manually entered into fatcat directly.

        CSV columns:
        - issnl
        - issnp
        - issne
        - name
        - corrected title
        - publisher
        - country
        - lang
        - release_count
        - Homepage URL
        - Inactive
        """

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnl=record["issnl"],
        )
        url = record["Homepage URL"]
        if url is None or url.lower() == "unknown" or len(url) < 4:
            return None
        homepage = HomepageUrl.from_url(url)
        if homepage is None:
            return None
        info.homepage_urls.append(homepage)
        return info
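
Per the docstring, rows arrive as dicts keyed by CSV header. A minimal driver under that assumption (parse_file and its path argument are hypothetical, not part of the source):

    def parse_file(self, path: str):
        # hypothetical driver (assumes "import csv" at module top):
        # stream CSV rows into parse_record, dropping rejected rows
        import csv
        with open(path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                info = self.parse_record(row)
                if info is not None:
                    yield info
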
Example #5
    def parse_record(self, line) -> Optional[DirectoryInfo]:
        record = json.loads(line)
        extra = dict(
            status=clean_str(record.get("current_status")),
            first_year=record.get("first_year"),
            collection=record.get("collection_acronym"),
        )
        # drop keys whose values are missing
        for k in list(extra.keys()):
            if extra[k] is None:
                extra.pop(k)
        country: Optional[str] = None
        if record["publisher_country"] and len(
                record["publisher_country"][0]) == 2:
            country = record["publisher_country"][0].lower()
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=clean_issn(record.get("electronic_issn") or ""),
            issnp=clean_issn(record.get("print_issn") or ""),
            custom_id=clean_str(record.get("scielo_issn")),
            name=clean_str(record.get("fulltitle")),
            publisher=clean_str((record.get("publisher_name") or [""])[0]),
            abbrev=clean_str(record["abbreviated_iso_title"]),
            platform="scielo",
            langs=[
                lang for lang in [parse_lang(s) for s in record["languages"]]
                if lang
            ],
            country=country,
            extra=extra,
        )
        if record["url"]:
            homepage = HomepageUrl.from_url(record["url"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info
Example #6
    def index_file(self, db) -> Counter:
        """
        Transforms a KBART file into a dict of dicts; but basically a list of
        JSON objects, one per journal. KBART files can have multiple rows per
        journal (eg, different year spans), which is why this pass is needed.
        """
        print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr)
        counts: Counter = Counter()
        kbart_dict: Dict[str, KbartRecord] = dict()
        for row in self.open_file():
            counts["total"] += 1

            record = self.parse_record(row, db.issn_db)
            if record is None:
                counts["skip-parse"] += 1
                continue
            elif not record.issnl:
                counts["skip-issnl"] += 1
                continue
            elif record.start_year is None or record.end_year is None:
                counts["partial-missing-years"] += 1
            counts["parsed"] += 1

            # merge year spans into any previously-seen record for this ISSN-L
            existing = kbart_dict.get(record.issnl, record)
            if record.start_year and record.end_year:
                old_spans = existing.year_spans or []
                if record.start_year > record.end_year:
                    # some rows have the span reversed; normalize to [start, end]
                    new_spans = [[record.end_year, record.start_year]]
                else:
                    new_spans = [[record.start_year, record.end_year]]
                record.year_spans = merge_spans(old_spans, new_spans)
            elif record.year_spans:
                old_spans = existing.year_spans or []
                record.year_spans = merge_spans(old_spans, record.year_spans)
            kbart_dict[record.issnl] = record

        counts["unique-issnl"] = len(kbart_dict)
        cur = db.db.cursor()
        for issnl, record in kbart_dict.items():
            info = DirectoryInfo(
                directory_slug=self.source_slug,
                issnl=record.issnl,
                issne=record.issne,
                issnp=record.issnp,
                name=record.title,
                publisher=record.publisher,
                homepage_urls=[],
                extra=dict(year_spans=record.year_spans),
            )
            if record.url:
                info.homepage_urls.append(record.url)
            status = db.insert_directory(info, cur=cur)
            counts[status] += 1
        cur.close()
        db.db.commit()
        return counts
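
index_file leans on a merge_spans helper whose definition is not shown. A plausible sketch consistent with how it is called above (two lists of [start, end] year pairs in, one merged list out); the project's actual implementation may differ:

def merge_spans(old_spans, new_spans):
    # union overlapping or adjacent [start, end] year spans
    spans = sorted(old_spans + new_spans)
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1] + 1:
            # overlaps or directly follows the previous span; extend it
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged
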
Example #7
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        # super mangled :(
        # join in policy metadata from the second source file, keyed by record ID
        row.update(self.sherpa_policies[row["RoMEO Record ID"]])

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnp=row["ISSN"],
            issne=row["ESSN"],
            name=clean_str(row["Journal Title"]),
            publisher=clean_str(row["Publisher"]),
            country=parse_country(row["Country"]),
            custom_id=row["RoMEO Record ID"],
        )

        if row["RoMEO colour"]:
            info.extra["color"] = row["RoMEO colour"]

        return info
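
self.sherpa_policies is presumably the second source file indexed by RoMEO Record ID, so that the row.update() above can join the two. A sketch of that assumption (load_sherpa_policies is a hypothetical helper):

import csv

def load_sherpa_policies(path: str) -> dict:
    # index the policy CSV by its record ID so that parse_record
    # can join the two files with row.update(...)
    with open(path, newline="", encoding="utf-8") as f:
        return {row["RoMEO Record ID"]: row for row in csv.DictReader(f)}
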
Example #8
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        if not (record.get("ISSN (Online)") or record.get("ISSN (Print)")):
            return None
        return DirectoryInfo(
            directory_slug=self.source_slug,
            issne=record.get("ISSN (Online)"),
            issnp=record.get("ISSN (Print)"),
            custom_id=(record.get("NlmId") or "").strip() or None,
            name=clean_str(record.get("JournalTitle")),
            abbrev=clean_str(record["IsoAbbr"]),
        )
Example #9
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row.get("issn"):
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row["issn_electronic"],
            issnp=row["issn_print"],
            raw_issn=row["issn_l"] or row["issn"],
            name=clean_str(row["journal_full_title"]),
            publisher=clean_str(row["publisher"]),
        )

        info.extra["is_hybrid"] = bool(row["is_hybrid"])

        homepage = HomepageUrl.from_url(row["url"])
        if homepage:
            info.homepage_urls.append(homepage)

        return info
Example #10
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["Issn"]),
            custom_id=record["JournalId"],
            name=clean_str(record["DisplayName"]),
            publisher=clean_str(record["Publisher"]),
        )
        homepage = HomepageUrl.from_url(record["Webpage"] or "")
        if homepage:
            info.homepage_urls.append(homepage)

        return info
Example #11
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        if not record.strip():
            return None
        record = json.loads(record)

        info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issn"])

        homepage = HomepageUrl.from_url(record["homepage"])
        if not homepage:
            return None
        info.homepage_urls.append(homepage)
        return info
Example #12
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not (row.get("issn") and row.get("title")):
            return None
        wikidata_qid = row["item"].strip().split("/")[-1]
        publisher = row["publisher_name"]
        if ((publisher.startswith("Q") and publisher[1].isdigit())
                or publisher.startswith("t1") or not publisher):
            publisher = None
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["issn"],
            custom_id=wikidata_qid,
            name=clean_str(row["title"]),
            publisher=clean_str(publisher),
        )
        if row.get("start_year"):
            info.extra["start_year"] = row["start_year"]

        url = HomepageUrl.from_url(row.get("websiteurl"))
        if url:
            info.homepage_urls.append(url)

        return info
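
For reference, the wikidata_qid line assumes the SPARQL result exposes full entity URIs, and taking the last path segment yields the bare QID (illustrative value, not a real journal item):

    >>> "http://www.wikidata.org/entity/Q12345".strip().split("/")[-1]
    'Q12345'
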
Example #13
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["ISSN 1"],
            custom_id=clean_str(row["ERA Journal Id"]),
            name=clean_str(row.get("Title")),
            original_name=clean_str(row.get("Foreign Title")),
            extra=dict(australian_era=dict(
                era_id=clean_str(row["ERA Journal Id"]),
                field=clean_str(row["FoR 1 Name"]),
                field_code=clean_str(row["FoR 1"]),
            )),
        )

        return info
Example #14
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnp=row["Print ISSN"],
            issne=row["Online ISSN"],
            custom_id=clean_str(row["NSD tidsskrift_id"]),
            publisher=clean_str(row["Publisher"]),
            country=parse_country(row["Country of publication"]),
            name=clean_str(row.get("International title")),
            langs=[lang for lang in [parse_lang(row["Language"])] if lang],
        )

        info.extra["as_of"] = self.config.norwegian.date
        if row["Level 2019"]:
            info.extra["level"] = int(row["Level 2019"])

        if row["Original title"] != row["International title"]:
            info.original_name = clean_str(row["Original title"])

        url = HomepageUrl.from_url(row["URL"])
        if url:
            info.homepage_urls.append(url)

        return info
Example #15
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
        )
        # format is an array of metadata elements
        for el in row:
            if "label" in el and el["@id"].startswith(
                "http://id.loc.gov/vocabulary/countries"
            ):
                value = el["label"]
                if "(State)" in value:
                    value = ""
                if value == "Russia (Federation)":
                    value = "Russia"
                info.country = parse_country(el["label"])
            if not "@type" in el:
                continue
            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])
            if "mainTitle" in el:
                if type(el["mainTitle"]) == list:
                    info.name = clean_str(el["mainTitle"][0])
                else:
                    info.name = clean_str(el["mainTitle"])
                if el.get("format") == "vocabularies/medium#Print":
                    info.issnp = clean_issn(el["issn"])
                elif el.get("format") == "vocabularies/medium#Electronic":
                    info.issne = clean_issn(el["issn"])
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [
                    urls,
                ]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)

        return info
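
The element-by-element dispatch above is easier to follow against the input shape it expects. A hypothetical, heavily trimmed record consistent with the checks in the loop (the @type of the title element, the ISSN, the title, and the URL are placeholders, not real ISSN Portal data):

    row = [
        {"@id": "http://id.loc.gov/vocabulary/countries/fr",
         "label": "France"},
        {"@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
         "value": "1234-5678"},
        {"@type": "http://id.loc.gov/ontologies/bibframe/Issn",
         "mainTitle": "Hypothetical Journal",
         "format": "vocabularies/medium#Print",
         "issn": "1234-5678",
         "url": "https://example.com/journal"},
    ]
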
Example #16
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        # HACK: some CSV exports prefix the first column name with a UTF-8 BOM
        if "\ufeffTitle" in record:
            record["Title"] = record["\ufeffTitle"]
        if not record["Title"]:
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["EISSN"]),
            name=clean_str(record["Title"]),
        )

        homepage = HomepageUrl.from_url(record["URL"])
        if homepage:
            info.homepage_urls.append(homepage)
        return info
Example #17
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        if not record["Journal Name"]:
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["E-ISSN"]),
            name=clean_str(record["Journal Name"]),
            publisher=clean_str(record["Publisher"]),
            langs=[
                lang for lang in [parse_lang(record["Language(s)"])] if lang
            ],
            country=parse_country(record["Country"]),
        )
        homepage = HomepageUrl.from_url(record["Internet Archive Link"])
        if homepage:
            info.homepage_urls.append(homepage)
        return info
Example #18
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        # TODO: Subjects, Permanent article identifiers, work_level stuff

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnp=row["Journal ISSN (print version)"],
            issne=row["Journal EISSN (online version)"],
            name=clean_str(row["Journal title"]),
            publisher=clean_str(row["Publisher"]),
            country=parse_country(row["Country of publisher"]),
        )

        lang = parse_lang(
            row["Languages in which the journal accepts manuscripts"])
        if lang:
            info.langs.append(lang)

        info.extra["as_of"] = self.config.snapshot.date
        if row["DOAJ Seal"]:
            info.extra["seal"] = {
                "no": False,
                "yes": True
            }[row["DOAJ Seal"].lower()]

        if row["Preservation Services"]:
            info.extra["archive"] = [
                a.strip() for a in row["Preservation Services"].split(",")
                if a.strip()
            ]
        elif row["Preservation Service: national library"]:
            info.extra["archive"] = ["national-library"]

        default_license = row["Journal license"]
        if default_license and default_license.startswith("CC"):
            info.extra["default_license"] = default_license.replace(
                "CC ", "CC-").strip()

        url = row["Journal URL"]
        if url:
            homepage = HomepageUrl.from_url(row["Journal URL"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info
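
For reference, the license normalization only swaps the separator after the CC prefix, e.g.:

    >>> "CC BY-NC".replace("CC ", "CC-").strip()
    'CC-BY-NC'
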
Example #19
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["ISSN-L"],
            name=clean_str(row["Short Title"]),
            publisher=clean_str(row["Publisher"]),
            langs=[
                lang for lang in
                [parse_lang(s) for s in (row["Lang1"], row["Lang2"])] if lang
            ],
        )

        # TODO: region mapping: "Europe and North America"
        # TODO: lang mapping: already alpha-3

        # homepages
        for url in [u for u in (row["URL1"], row["URL2"]) if u]:
            homepage = HomepageUrl.from_url(url)
            if homepage:
                info.homepage_urls.append(homepage)

        return info
Example #20
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not (row.get("ISSN_L") and row.get("TITLE")):
            return None

        # TODO: also add for other non-direct indices
        # for ind in ('WOS', 'SCOPUS'):
        #    issnl, status = self.add_issn(
        #        ind.lower(),
        #        raw_issn=row['ISSN_L'],
        #        name=row['TITLE'],
        #    )

        extra = dict()
        for ind in ("DOAJ", "ROAD", "PMC", "OAPC", "WOS", "SCOPUS"):
            extra["in_" + ind.lower()] = bool(int(row["JOURNAL_IN_" + ind]))

        return DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["ISSN_L"],
            name=clean_str(row["TITLE"]),
            extra=extra,
        )
Example #21
    def parse_record(self, line) -> Optional[DirectoryInfo]:
        record = json.loads(line)

        issn_info = record.get("identifiers", {}).get("issn", {})
        # these fields are sometimes lists; take the first element
        for k in ("generic", "electronic", "print"):
            if isinstance(issn_info.get(k), list):
                issn_info[k] = issn_info[k][0]
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(issn_info.get("generic", "")),
            issne=clean_issn(issn_info.get("electronic", "")),
            issnp=clean_issn(issn_info.get("print", "")),
            name=clean_str(record.get("title")),
            langs=[
                lang for lang in [parse_lang(s) for s in record["languages"]]
                if lang
            ],
        )
        if record["url"]:
            homepage = HomepageUrl.from_url(record["url"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info
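
Several of these parsers (Examples #1, #2, #5, #11, #15, #21) take a raw line and call json.loads themselves, so the surrounding driver only has to stream lines and insert the results. A sketch modeled on the KBART index_file in Example #6, reusing its db.insert_directory interface; the exact counter keys here are assumptions:

    def index_file(self, db) -> Counter:
        # sketch: parse each line and insert the result directly,
        # without the year-span aggregation the KBART variant needs
        counts: Counter = Counter()
        cur = db.db.cursor()
        for row in self.open_file():
            counts["total"] += 1
            info = self.parse_record(row)
            if info is None:
                counts["skip-parse"] += 1
                continue
            status = db.insert_directory(info, cur=cur)
            counts[status] += 1
        cur.close()
        db.db.commit()
        return counts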