Code example #1
File: scielo.py Project: internetarchive/chocula
    def parse_record(self, line) -> Optional[DirectoryInfo]:
        record = json.loads(line)
        extra = dict(
            status=clean_str(record.get("current_status")),
            first_year=record.get("first_year"),
            collection=record.get("collection_acronym"),
        )
        for k in list(extra.keys()):
            if extra[k] is None:
                extra.pop(k)
        country: Optional[str] = None
        if record["publisher_country"] and len(
                record["publisher_country"][0]) == 2:
            country = record["publisher_country"][0].lower()
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=clean_issn(record.get("electronic_issn") or ""),
            issnp=clean_issn(record.get("print_issn") or ""),
            custom_id=clean_str(record.get("scielo_issn")),
            name=clean_str(record.get("fulltitle")),
            publisher=clean_str((record.get("publisher_name") or [""])[0]),
            abbrev=clean_str(record["abbreviated_iso_title"]),
            platform="scielo",
            langs=[
                lang for lang in [parse_lang(s) for s in record["languages"]]
                if lang
            ],
            country=country,
            extra=extra,
        )
        if record["url"]:
            homepage = HomepageUrl.from_url(record["url"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info
Code example #2
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Most of this metadata comes from chocula/fatcat; we are only interested
        in the homepage URLs.

        The "corrected titles" have been manually entered into fatcat directly.

        CSV columns:
        - issnl
        - issnp
        - issne
        - name
        - corrected title
        - publisher
        - country
        - lang
        - release_count
        - Homepage URL
        - Inactive
        """

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnl=record["issnl"],
        )
        url = record["Homepage URL"]
        if url is None or url.lower() == "unknown" or len(url) < 4:
            return None
        homepage = HomepageUrl.from_url(url)
        if homepage is None:
            return None
        info.homepage_urls.append(homepage)
        return info
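To make the CSV layout listed in the docstring above concrete, here is a minimal usage sketch. The row dict uses the documented column names but hypothetical values, and `source` stands in for an instance of whichever chocula source class defines this parse_record; both are assumptions for illustration, not part of the original code.

# Hypothetical CSV row using the column names from the docstring above;
# all values are illustrative only.
row = {
    "issnl": "1234-5678",
    "issnp": "1234-5678",
    "issne": "2345-678X",
    "name": "Example Journal",
    "corrected title": "",
    "publisher": "Example Press",
    "country": "us",
    "lang": "en",
    "release_count": "42",
    "Homepage URL": "https://journal.example.org/",
    "Inactive": "",
}

# `source` is assumed to be an instance of the directory class this method belongs to.
info = source.parse_record(row)
if info is not None:
    print(info.issnl, info.homepage_urls)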
Code example #3
File: ezb.py Project: internetarchive/chocula
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row:
            return None
        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row.get("issne"),
            issnp=row.get("issnp"),
            custom_id=row["ezb_id"],
            name=clean_str(row["title"]),
            publisher=clean_str(row.get("publisher")),
        )

        info.extra = dict()
        for k in (
                "ezb_color",
                "subjects",
                "keywords",
                "zdb_id",
                "first_volume",
                "first_issue",
                "first_year",
                "appearance",
                "costs",
        ):
            if row.get(k):
                info.extra[k] = row[k]

        url = HomepageUrl.from_url(row.get("url"))
        if url:
            info.homepage_urls.append(url)

        return info
Code example #4
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row:
            return None

        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row.get("issne"),
            issnp=row.get("issnp"),
            raw_issn=row.get("issn"),
            name=clean_str(row["title"]),
            publisher=clean_str(row.get("ed")),
        )

        info.extra["as_of"] = self.config.szczepanski.date
        if row.get("extra"):
            info.extra["notes"] = row.get("extra")
        for k in ("other_titles", "year_spans", "ed"):
            if row.get(k):
                info.extra[k] = row[k]

        url = HomepageUrl.from_url(row.get("url"))
        if url:
            info.homepage_urls.append(url)

        return info
Code example #5
    def parse_record(self, record) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["Issn"]),
            custom_id=record["JournalId"],
            name=clean_str(record["DisplayName"]),
            publisher=clean_str(record["Publisher"]),
        )
        homepage = HomepageUrl.from_url(record["Webpage"] or "")
        if homepage:
            info.homepage_urls.append(homepage)

        return info
Code example #6
File: zdb_fize.py Project: internetarchive/chocula
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        if not record.strip():
            return None
        record = json.loads(record)

        info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issn"])

        homepage = HomepageUrl.from_url(record["homepage"])
        if homepage:
            info.homepage_urls.append(homepage)
        else:
            return None
        return info
Code example #7
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        # HACK
        if "\ufeffTitle" in record:
            record["Title"] = record["\ufeffTitle"]
        if not record["Title"]:
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["EISSN"]),
            name=clean_str(record["Title"]),
        )

        homepage = HomepageUrl.from_url(record["URL"])
        if homepage:
            info.homepage_urls.append(homepage)
        return info
Code example #8
File: doaj.py Project: internetarchive/chocula
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        # TODO: Subjects, Permanent article identifiers, work_level stuff

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnp=row["Journal ISSN (print version)"],
            issne=row["Journal EISSN (online version)"],
            name=clean_str(row["Journal title"]),
            publisher=clean_str(row["Publisher"]),
            country=parse_country(row["Country of publisher"]),
        )

        lang = parse_lang(
            row["Languages in which the journal accepts manuscripts"])
        if lang:
            info.langs.append(lang)

        info.extra["as_of"] = self.config.snapshot.date
        if row["DOAJ Seal"]:
            info.extra["seal"] = {
                "no": False,
                "yes": True
            }[row["DOAJ Seal"].lower()]

        if row["Preservation Services"]:
            info.extra["archive"] = [
                a.strip() for a in row["Preservation Services"].split(",")
                if a.strip()
            ]
        elif row["Preservation Service: national library"]:
            info.extra["archive"] = ["national-library"]

        default_license = row["Journal license"]
        if default_license and default_license.startswith("CC"):
            info.extra["default_license"] = default_license.replace(
                "CC ", "CC-").strip()

        url = row["Journal URL"]
        if url:
            homepage = HomepageUrl.from_url(url)
            if homepage:
                info.homepage_urls.append(homepage)
        return info
Code example #9
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        row = json.loads(row)

        info = DirectoryInfo(
            directory_slug=self.source_slug,
        )
        # format is an array of metadata elements
        for el in row:
            if "label" in el and el["@id"].startswith(
                "http://id.loc.gov/vocabulary/countries"
            ):
                value = el["label"]
                if "(State)" in value:
                    value = ""
                if value == "Russia (Federation)":
                    value = "Russia"
                info.country = parse_country(value)
            if "@type" not in el:
                continue
            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])
            if "mainTitle" in el:
                if type(el["mainTitle"]) == list:
                    info.name = clean_str(el["mainTitle"][0])
                else:
                    info.name = clean_str(el["mainTitle"])
                if el.get("format") == "vocabularies/medium#Print":
                    info.issnp = clean_issn(el["issn"])
                elif el.get("format") == "vocabularies/medium#Electronic":
                    info.issne = clean_issn(el["issn"])
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)

        return info
Code example #10
    def parse_record(self, record) -> Optional[DirectoryInfo]:

        if not record["Journal Name"]:
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["E-ISSN"]),
            name=clean_str(record["Journal Name"]),
            publisher=clean_str(record["Publisher"]),
            langs=[
                lang for lang in [parse_lang(record["Language(s)"])] if lang
            ],
            country=parse_country(record["Country"]),
        )
        homepage = HomepageUrl.from_url(record["Internet Archive Link"])
        if homepage:
            info.homepage_urls.append(homepage)
        return info
Code example #11
File: common.py Project: internetarchive/chocula
    def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:

        raw_issn = clean_issn(row["ISSN"])
        issnl = issn_db.issn2issnl(raw_issn or "")
        start_year = int(row["Published"][:4])
        start_volume = clean_str(row["Vol"])
        record = KbartRecord(
            issnl=issnl,
            issne=None,
            issnp=None,
            embargo=None,
            title=clean_str(row["Title"]),
            publisher=clean_str(row["Publisher"]),
            url=HomepageUrl.from_url(row["Url"]),
            start_year=start_year,
            end_year=start_year,
            start_volume=start_volume,
            end_volume=start_volume,
            year_spans=[],
        )
        return record
Code example #12
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not row.get("issn"):
            return None

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issne=row["issn_electronic"],
            issnp=row["issn_print"],
            raw_issn=row["issn_l"] or row["issn"],
            name=clean_str(row["journal_full_title"]),
            publisher=clean_str(row["publisher"]),
        )

        info.extra["is_hybrid"] = bool(row["is_hybrid"])

        homepage = HomepageUrl.from_url(row["url"])
        if homepage:
            info.homepage_urls.append(homepage)

        return info
Code example #13
File: road.py Project: internetarchive/chocula
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["ISSN-L"],
            name=clean_str(row["Short Title"]),
            publisher=clean_str(row["Publisher"]),
            langs=[
                lang for lang in
                [parse_lang(s) for s in (row["Lang1"], row["Lang2"])] if lang
            ],
        )

        # TODO: region mapping: "Europe and North America"
        # TODO: lang mapping: already alpha-3

        # homepages
        for url in [u for u in (row["URL1"], row["URL2"]) if u]:
            homepage = HomepageUrl.from_url(url)
            if homepage:
                info.homepage_urls.append(homepage)

        return info
Code example #14
File: common.py Project: internetarchive/chocula
    def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:

        issne: Optional[str] = clean_issn(row["online_identifier"] or "")
        issnp: Optional[str] = clean_issn(row["print_identifier"] or "")
        issnl: Optional[str] = None
        if issne:
            issnl = issn_db.issn2issnl(issne)
        if issnp and not issnl:
            issnl = issn_db.issn2issnl(issnp)
        start_year: Optional[int] = None
        end_year: Optional[int] = None
        if row["date_first_issue_online"]:
            start_year = int(row["date_first_issue_online"][:4])
        if row["date_last_issue_online"]:
            end_year = int(row["date_last_issue_online"][:4])
        end_volume = row["num_last_vol_online"]
        # hack to handle open-ended preservation
        if end_year is None and end_volume and "(present)" in end_volume:
            end_year = THIS_YEAR
        record = KbartRecord(
            issnl=issnl,
            issnp=issnp,
            issne=issne,
            title=clean_str(row["publication_title"]),
            publisher=clean_str(row["publisher_name"]),
            url=HomepageUrl.from_url(row["title_url"]),
            embargo=clean_str(row["embargo_info"]),
            start_year=start_year,
            end_year=end_year,
            start_volume=clean_str(row["num_first_vol_online"]),
            end_volume=clean_str(row["num_last_vol_online"]),
            year_spans=[],
        )
        if record.start_volume == "null":
            record.start_volume = None
        if record.end_volume == "null":
            record.end_volume = None
        return record
Code example #15
File: norwegian.py Project: internetarchive/chocula
    def parse_record(self, row) -> Optional[DirectoryInfo]:
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            issnp=row["Print ISSN"],
            issne=row["Online ISSN"],
            custom_id=clean_str(row["NSD tidsskrift_id"]),
            publisher=clean_str(row["Publisher"]),
            country=parse_country(row["Country of publication"]),
            name=clean_str(row.get("International title")),
            langs=[lang for lang in [parse_lang(row["Language"])] if lang],
        )

        info.extra["as_of"] = self.config.norwegian.date
        if row["Level 2019"]:
            info.extra["level"] = int(row["Level 2019"])

        if row["Original title"] != row["International title"]:
            info.original_name = clean_str(row["Original title"])

        url = HomepageUrl.from_url(row["URL"])
        if url:
            info.homepage_urls.append(url)

        return info
Code example #16
File: awol.py Project: internetarchive/chocula
    def parse_record(self, line) -> Optional[DirectoryInfo]:
        record = json.loads(line)

        issn_info = record.get("identifiers", {}).get("issn", {})
        # sometimes is a list
        for k in "generic", "electronic", "print":
            if type(issn_info.get(k)) == list:
                issn_info[k] = issn_info[k][0]
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(issn_info.get("generic", "")),
            issne=clean_issn(issn_info.get("electronic", "")),
            issnp=clean_issn(issn_info.get("print", "")),
            name=clean_str(record.get("title")),
            langs=[
                lang for lang in [parse_lang(s) for s in record["languages"]]
                if lang
            ],
        )
        if record["url"]:
            homepage = HomepageUrl.from_url(record["url"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info
Code example #17
    def parse_record(self, row) -> Optional[DirectoryInfo]:

        if not (row.get("issn") and row.get("title")):
            return None
        wikidata_qid = row["item"].strip().split("/")[-1]
        publisher = row["publisher_name"]
        if ((publisher.startswith("Q") and publisher[1].isdigit())
                or publisher.startswith("t1") or not publisher):
            publisher = None
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["issn"],
            custom_id=wikidata_qid,
            name=clean_str(row["title"]),
            publisher=clean_str(publisher),
        )
        if row.get("start_year"):
            info.extra["start_year"] = row["start_year"]

        url = HomepageUrl.from_url(row.get("websiteurl"))
        if url:
            info.homepage_urls.append(url)

        return info