def barcode(self, key, value):
    """Translate barcode subfields into identifiers or volume barcodes.

    Subfields ``a``/``9`` are appended to ``self["identifiers"]`` with the
    ``report_number`` scheme (``9`` additionally marks the entry hidden).
    Subfields ``n``/``x`` attach a barcode to a volume in
    ``self["_migration"]``.

    :raises UnexpectedValue: ``a``/``9`` combined with ``n``/``x``, or both
        ``a`` and ``9`` present.
    :raises MissingRequiredField: ``n`` or ``x`` missing for a volume.
    :raises IgnoreKey: once the field has been handled.
    """
    migration = self["_migration"]
    for field in force_list(value):
        report_visible = clean_val("a", field, str)
        volume_label = clean_val("n", field, str)
        item_barcode = clean_val("x", field, str)
        report_hidden = clean_val("9", field, str)

        if report_visible or report_hidden:
            both_reports = report_visible and report_hidden
            if volume_label or item_barcode or both_reports:
                raise UnexpectedValue()
            report_number = {
                "scheme": "report_number",
                "value": report_visible or report_hidden,
            }
            if report_hidden:
                report_number["hidden"] = True
            known = self.get("identifiers", [])
            known.append(report_number)
            self["identifiers"] = known
            raise IgnoreKey("barcode")

        # From here on the field must describe a volume barcode.
        if not item_barcode:
            raise MissingRequiredField(
                subfield="x",
                message=" this record is missing a barcode number",
            )
        if not volume_label:
            raise MissingRequiredField(
                subfield="n", message=" this record is missing a volume number"
            )
        number = extract_volume_number(
            volume_label, raise_exception=True, subfield="n"
        )
        _insert_volume(migration, number, {"barcode": item_barcode})
    raise IgnoreKey("barcode")
def isbns(self, key, value):
    """Translate an ISBN field, routing it to the record or to a volume.

    Subfield ``u`` decides where the ISBN (subfield ``a``) goes:

    * volume info parsed from ``u``: the ISBN is stored on that volume
      together with the parsed description;
    * ``u`` contains ``(set)``: the ISBN belongs to the record and the text
      before ``(set)`` becomes ``physical_description``;
    * only a volume number in ``u``: the ISBN is stored on that volume;
    * otherwise ``u`` becomes ``physical_description`` and the ISBN belongs
      to the record.

    Without ``u`` the ISBN belongs to the record.

    :returns: ``{"scheme": "ISBN", "value": ...}`` or ``None`` when that
        entry is already in ``self["identifiers"]``.
    :raises IgnoreKey: when the ISBN was stored on a volume.
    :raises UnexpectedValue: when no ISBN is given, or a volume is detected
        in ``u`` but cannot be parsed.
    """
    _migration = self["_migration"]
    _identifiers = self.get("identifiers", [])

    val_u = clean_val("u", value, str)
    val_a = clean_val("a", value, str)
    val_b = clean_val("b", value, str)

    def record_isbn(physical_description=None):
        """Build the record-level ISBN, optionally setting the description."""
        if physical_description is not None:
            self["physical_description"] = physical_description
        isbn = {"scheme": "ISBN", "value": val_a}
        return isbn if isbn not in _identifiers else None

    if not val_u:
        if val_a:
            # no volume info, only the isbn
            return record_isbn()
        raise UnexpectedValue(subfield="a", message=" isbn not provided")

    volume_info = extract_volume_info(val_u)
    if volume_info:
        # volume info present: the ISBN belongs to that volume
        volume_obj = {
            "isbn": val_a,
            "physical_description": volume_info["description"].strip(),
            "is_electronic": val_b is not None,
        }
        _insert_volume(_migration, volume_info["volume"], volume_obj)
        raise IgnoreKey("identifiers")

    # "(set)" means the ISBN is for the whole multipart
    set_search = re.search(r"(.*?)\(set\.*\)", val_u)
    if set_search:
        return record_isbn(set_search.group(1).strip())

    # Try to find a bare volume number (volume without description)
    volume_number = extract_volume_number(val_u)
    if volume_number:
        volume_obj = {
            "isbn": val_a,
            "is_electronic": val_b is not None,
        }
        _insert_volume(_migration, volume_number, volume_obj)
        raise IgnoreKey("identifiers")
    if extract_volume_number(val_u, search=True):
        raise UnexpectedValue(
            subfield="u",
            message=" found volume but failed to parse description",
        )
    return record_isbn(val_u)
def wrapper(self, key, value, **kwargs):
    """Call ``f`` and strip empty entries from the list of dicts it returns.

    ``f`` comes from the enclosing scope (not shown here). Falsy elements
    are dropped, falsy values are removed from each remaining dict, and
    dicts left empty are dropped as well.

    :raises IgnoreKey: when ``f`` returns nothing or nothing survives.
    """
    result = f(self, key, value)
    if not result:
        raise IgnoreKey(key)

    pruned = []
    for item in result:
        if not item:
            continue
        kept = {name: val for name, val in item.items() if val}
        if kept:
            pruned.append(kept)

    if not pruned:
        raise IgnoreKey(key)
    return pruned
def proxy(self, key, value, **kwargs):
    """Call ``fn_decorated`` and strip whitespace from its string output.

    ``fn_decorated`` comes from the enclosing scope (not shown here).
    Strings are stripped; lists have falsy elements dropped and the rest
    stripped; any other truthy result is returned untouched.

    :raises IgnoreKey: when the result, or the cleaned list, is empty.
    """
    result = fn_decorated(self, key, value, **kwargs)
    if not result:
        raise IgnoreKey(key)

    if isinstance(result, str):
        # No re-check for an empty string after stripping: per the original
        # author's note, clean_val is expected to have produced None already.
        return result.strip()

    if not isinstance(result, list):
        return result

    stripped = [item.strip() for item in result if item]
    if stripped:
        return stripped
    raise IgnoreKey(key)
def related_records(self, key, value):
    """Translate a related-record field into ``_migration["related"]``.

    Appends one entry with the related record id (subfield ``w``,
    required), a relation type and an optional description, then flags
    ``has_related`` on the migration data.

    :raises IgnoreKey: always on success; the data is stored in
        ``self["_migration"]`` rather than returned.
    :raises ManualImportRequired: re-raised with ``subfield`` rewritten
        according to ``key``.
    """
    _migration = self["_migration"]
    _related = _migration["related"]
    # Default relation unless subfield "x" selects a more specific one.
    relation_type = OTHER_RELATION.name
    relation_description = None
    try:
        if key == "775__" and "b" in value:
            description = clean_val("b", value, str)
            relation_description = description
            relation_type_tag = clean_val("x", value, str)
            if relation_type_tag:
                # Tags other than "edition"/"language" keep the default type.
                if relation_type_tag.lower() == "edition":
                    relation_type = EDITION_RELATION.name
                elif relation_type_tag.lower() == "language":
                    relation_type = LANGUAGE_RELATION.name
        if key == "787__" and "i" in value:
            # Result is discarded; presumably manual=True makes clean_val
            # raise ManualImportRequired -- TODO confirm against clean_val.
            clean_val("i", value, str, manual=True)
        _related.append({
            "related_recid": clean_val("w", value, str, req=True),
            "relation_type": relation_type,
            "relation_description": relation_description,
        })
        _migration.update({"related": _related, "has_related": True})
        raise IgnoreKey("_migration")
    except ManualImportRequired as e:
        # Point the error at the subfield(s) relevant for this tag.
        if key == "775__":
            e.subfield = "b or c"
        else:
            e.subfield = "i"
        raise e
def isbns(self, key, value):
    """Translate ISBN fields into ``identifiers`` entries.

    The ISBN comes from subfield ``a`` or, failing that, ``z``. A volume
    marker found in subfield ``u`` is moved to ``self["volume"]``; what
    remains of ``u`` is mapped to a ``material`` value.

    :returns: the record's identifier list with new ISBNs appended.
    :raises IgnoreKey: when a field carries no ISBN value.
    :raises ManualImportRequired: when a volume is found but
        ``self["volume"]`` is already set.
    """
    collected = self.get("identifiers", [])
    for field in force_list(value):
        note = clean_val("u", field, str)
        number = clean_val("a", field, str) or clean_val("z", field, str)
        entry = {"value": number, "scheme": "ISBN"}
        if not number:
            raise IgnoreKey("identifiers")

        if note:
            match = re.search(r"(\(*v[.| ]*\d+.*\)*)", note)
            if match:
                volume_text = match.group(1)
                note = note.replace(volume_text, "").strip()
                if self.get("volume"):
                    raise ManualImportRequired(subfield="u")
                self["volume"] = volume_text
            # WARNING! vocabulary document_identifiers_materials
            material = mapping(IDENTIFIERS_MEDIUM_TYPES, note, subfield="u")
            if material:
                entry["material"] = material

        if entry not in collected:
            collected.append(entry)
    return collected
def arxiv_eprints(self, key, value):
    """Translates arxiv_eprints fields.

    For key ``037__`` each value's subfield ``a`` is stored as an arXiv
    alternative identifier (unless already present) and a valid subfield
    ``c`` is stored as an arXiv subject.

    output:
    {
        'alternative_identifiers': [{'scheme': 'arXiv', 'value': `037__a`}],
    }

    :raises UnexpectedValue: when subfield ``c`` is set but is not in
        ``ARXIV_CATEGORIES``.
    :raises IgnoreKey: after a ``037__`` field was processed; results are
        written to ``self`` directly.
    """

    def check_category(field, val):
        """Return the category in ``field`` if valid; ``None`` if absent."""
        category = clean_val(field, val, str)
        if category:
            if category in ARXIV_CATEGORIES:
                return category
            raise UnexpectedValue(subfield=field)

    if key == "037__":
        _alternative_identifiers = self.get("alternative_identifiers", [])
        for v in force_list(value):
            eprint_id = clean_val("a", v, str, req=True)
            # Existing arXiv identifiers with the same value (scheme is
            # compared case-insensitively).
            duplicated = [
                elem
                for i, elem in enumerate(_alternative_identifiers)
                if elem["value"] == eprint_id
                and elem["scheme"].lower() == "arxiv"
            ]
            category = check_category("c", v)
            if not duplicated:
                eprint = {"value": eprint_id, "scheme": "arXiv"}
                _alternative_identifiers.append(eprint)
                self["alternative_identifiers"] = _alternative_identifiers
            if category:
                _subjects = self.get("subjects", [])
                subject = {"scheme": "arXiv", "value": category}
                # Append only when the subject is not already listed.
                _subjects.append(subject) if subject not in _subjects else None
                self["subjects"] = _subjects
        raise IgnoreKey("subjects")
def isbns(self, key, value):
    """Translate ISBN fields into ``identifiers`` entries.

    The ISBN comes from subfield ``a`` or, failing that, ``z``. A volume
    marker found in subfield ``u`` is moved to ``self["volume"]``. The
    remainder of ``u`` is stored as ``medium`` when its upper-cased form is
    in ``MEDIUM_TYPES``, otherwise as ``description``.

    :returns: the record's identifier list with new ISBNs appended.
    :raises IgnoreKey: when a field carries no ISBN value.
    :raises ManualImportRequired: when a volume is found but
        ``self["volume"]`` is already set.
    """
    known_ids = self.get("identifiers", [])
    for item in force_list(value):
        qualifier = clean_val("u", item, str)
        isbn_value = clean_val("a", item, str) or clean_val("z", item, str)
        record = {"value": isbn_value, "scheme": "ISBN"}
        if not isbn_value:
            raise IgnoreKey("identifiers")

        if qualifier:
            found = re.search(r"(\(*v[.| ]*\d+.*\)*)", qualifier)
            if found:
                volume_part = found.group(1)
                qualifier = qualifier.replace(volume_part, "").strip()
                if self.get("volume"):
                    raise ManualImportRequired(subfield="u")
                self["volume"] = volume_part
            target = "medium" if qualifier.upper() in MEDIUM_TYPES else "description"
            record[target] = qualifier

        if record not in known_ids:
            known_ids.append(record)
    return known_ids
def open_access(self, key, value):
    """Flag the record's e-item as open access.

    Sets ``self["_eitem"]["open_access"] = True`` when subfield ``a``
    equals ``"open access"`` (case-insensitive).

    :raises IgnoreKey: always; the flag is written to ``self["_eitem"]``.
    """
    _open_access = clean_val("a", value, str)
    # Other rules in this file treat clean_val results as possibly None, so
    # guard before calling .lower() instead of failing with AttributeError.
    if _open_access and _open_access.lower() == "open access":
        _eitem = self.get("_eitem", {})
        _eitem["open_access"] = True
        self["_eitem"] = _eitem
    raise IgnoreKey("open_access")
def open_access(self, key, value):
    """Mark the record's e-items as open access.

    Sets ``self["_migration"]["eitems_open_access"]`` when subfield ``r``
    contains ``"open access"`` (case-insensitive).

    :raises IgnoreKey: always; the flag lives in ``self["_migration"]``.
    """
    access_note = clean_val("r", value, str)
    is_open = bool(access_note) and "open access" in access_note.lower()
    if is_open:
        self["_migration"]["eitems_open_access"] = True
    raise IgnoreKey("_migration")
def document_type(self, key, value):
    """Validate the document type field.

    Only the first value is inspected. It is accepted when subfield ``a``
    is ``"PERI"`` for keys ``980__``/``690C_``, or ``"31"`` for key
    ``960__``.

    :raises IgnoreKey: when the first value is accepted.
    :raises UnexpectedValue: when the first value is anything else.
    """
    for field in force_list(value):
        code = clean_val("a", field, str)
        matches_peri = key in ("980__", "690C_") and code == "PERI"
        matches_31 = key == "960__" and code == "31"
        if matches_peri or matches_31:
            raise IgnoreKey("document_type")
        raise UnexpectedValue(subfield="a")
def alternative_abstracts(self, key, value):
    """Translate abstract fields.

    The first abstract encountered becomes ``self["abstract"]``; later ones
    are returned as alternative abstracts.

    :returns: the abstract text, or ``None`` when it is already in
        ``self["alternative_abstracts"]``.
    :raises IgnoreKey: when the value was stored as the main abstract.
    """
    known_alternatives = self.get("alternative_abstracts", [])
    if not self.get("abstract", None):
        # takes first abstract as main
        self["abstract"] = clean_val("a", value, str, req=True)
        raise IgnoreKey("alternative_abstracts")

    candidate = clean_val("a", value, str, req=True)
    if candidate in known_alternatives:
        return None
    return candidate
def open_access(self, key, value):
    """Mark the record's e-items as open access.

    The presence of subfield ``r`` is enough to set
    ``self["_migration"]["eitems_open_access"]``; its content is not read.

    :raises IgnoreKey: always; the flag lives in ``self["_migration"]``.
    """
    if "r" in value:
        self["_migration"]["eitems_open_access"] = True
    raise IgnoreKey("_migration")
def children_records(self, key, value):
    """Collect child-record data into ``self["_migration"]``.

    For key ``362__`` subfield ``a`` is appended to ``electronic_items`` as
    a ``subscription`` entry.

    :raises IgnoreKey: always; the data is stored in ``_migration``.
    """
    migration = self["_migration"]
    items = migration.get("electronic_items", [])
    if key == "362__":
        subscription = clean_val("a", value, str)
        items.append({"subscription": subscription})
    migration.update(electronic_items=items)
    raise IgnoreKey("_children")
def subject_classification(self, key, value):
    """Translate subfield ``c`` (required) into an ``ICS`` subject.

    :returns: ``{"value": ..., "scheme": "ICS"}``.
    :raises IgnoreKey: when that subject is already in ``self["subjects"]``.
    """
    existing = self.get("subjects", [])
    classification = {
        "value": clean_val("c", value, str, req=True),
        "scheme": "ICS",
    }
    if classification in existing:
        raise IgnoreKey("subjects")
    return classification
def subject_classification(self, key, value):
    """Translate a subject classification, choosing the scheme from ``key``.

    ``080__`` -> UDC, ``082*`` -> Dewey, ``084__`` -> ICS, ``050*`` -> LoC;
    any other key yields an entry without a scheme. For ``084__`` whose
    subfield ``2`` (upper-cased) is in ``SUBJECT_CLASSIFICATION_EXCEPTIONS``
    the field is passed to ``keywords`` instead.

    :returns: the classification dict.
    :raises IgnoreKey: when the field was handed to ``keywords`` or the
        classification is already in ``self["subjects"]``.
    """
    existing = self.get("subjects", [])
    classification = {"value": clean_val("a", value, str, req=True)}

    scheme = None
    if key == "080__":
        scheme = "UDC"
    elif key.startswith("082"):
        scheme = "Dewey"
    elif key == "084__":
        source = clean_val("2", value, str)
        if source and source.upper() in SUBJECT_CLASSIFICATION_EXCEPTIONS:
            keywords(self, key, value)
            raise IgnoreKey("subjects")
        scheme = "ICS"
    elif key.startswith("050"):
        scheme = "LoC"

    if scheme is not None:
        classification["scheme"] = scheme

    if classification in existing:
        raise IgnoreKey("subjects")
    return classification
def barcodes(self, key, value):
    """Record an item barcode against its volume.

    Appends ``{"volume": ..., "barcode": ...}`` to
    ``self["_migration"]["volumes"]`` using subfields ``n`` (volume) and
    ``x`` (barcode).

    :raises IgnoreKey: always; the data is stored in ``_migration``.
    """
    volume_label = clean_val("n", value, str)
    item_barcode = clean_val("x", value, str)
    volumes = self["_migration"]["volumes"]
    volumes.append({
        "volume": extract_volume_number(volume_label),
        "barcode": item_barcode,
    })
    raise IgnoreKey("barcodes")
def created(self, key, value):
    """Translate an eight-digit date in subfield ``x`` to an ISO date.

    :returns: the date as ``YYYY-MM-DD``.
    :raises IgnoreKey: when subfield ``x`` yields no date.
    :raises ValueError: from ``datetime.date`` when the digits do not form
        a valid calendar date.
    """
    date_value = clean_val("x", value, int, regex_format=r"\d{8}$")
    if date_value:
        digits = str(date_value)
        # Digits are laid out as YYYYMMDD: the middle pair is the month
        # (it is passed as datetime.date's second argument), not a week.
        year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:8])
        return datetime.date(year, month, day).isoformat()
    raise IgnoreKey("_created")
def languages(self, key, value):
    """Translate subfield ``b`` into an upper-case alpha-3 language code.

    :returns: the language code when not yet in ``self["languages"]``.
    :raises IgnoreKey: when the language is already listed.
    :raises UnexpectedValue: when subfield ``b`` is missing or is not a
        language known to ``pycountry``.
    """
    lang = clean_val("b", value, str)
    _languages = self.get("languages", [])
    try:
        # lang may be None when the subfield is absent; the resulting
        # AttributeError is reported like any other unusable value.
        new_lang = pycountry.languages.lookup(lang.lower()).alpha_3.upper()
    except (KeyError, AttributeError, LookupError):
        # Report the subfield that was actually read.
        raise UnexpectedValue(subfield="b")
    if new_lang in _languages:
        raise IgnoreKey("languages")
    return new_lang
def book_series(self, key, value):
    """Record the serial a book belongs to in the migration data.

    Appends ``{"title": a, "volume": v, "issn": x}`` to
    ``self["_migration"]["serials"]`` and sets ``has_serial``.

    :raises IgnoreKey: always; the data is stored in ``_migration``.
    """
    val_x = clean_val("x", value, str)
    _migration = self["_migration"]
    _migration["serials"].append({
        "title": clean_val("a", value, str),
        "volume": clean_val("v", value, str),
        "issn": val_x,
    })
    _migration["has_serial"] = True
    raise IgnoreKey("book_series")
def number_of_pages(self, key, value):
    """Translates number_of_pages fields.

    Subfield ``x`` qualifies what subfield ``a`` holds. Without ``x``,
    ``a`` is parsed for a page count and an optional physical description.

    :returns: the number of pages as a string.
    :raises IgnoreKey: when the field is skipped or only carries a
        physical description.
    :raises UnexpectedValue: when ``a`` has extra content or no page count.
    """
    val_x = clean_val("x", value, str)
    val_a = clean_val("a", value, str)
    if val_x:
        if val_x == "volume":
            raise IgnoreKey("number_of_pages")
        elif val_x.lower() in ["phys.desc.", "phys.desc"]:
            # Subfield "a" is stored verbatim as the physical description.
            self["physical_description"] = val_a
            raise IgnoreKey("number_of_pages")
        # NOTE(review): any other "x" value falls through, returning None.
    else:
        if is_excluded(val_a):
            raise IgnoreKey("number_of_pages")

        parts = extract_parts(val_a)
        if parts["has_extra"]:
            raise UnexpectedValue(subfield="a")
        if parts["physical_description"]:
            self["physical_description"] = parts["physical_description"]
        if parts["number_of_pages"]:
            return str(parts["number_of_pages"])
        raise UnexpectedValue(subfield="a")
def number_of_volumes(self, key, value):
    """Translate subfield ``a`` into the number of volumes.

    The first run of digits in ``a`` is returned when ``a`` contains the
    letter ``v`` and ``extract_parts`` finds no page count in it.

    :returns: the first number found, as a string.
    :raises MissingRequiredField: when the record has no ``title`` yet.
    :raises IgnoreKey: when no number of volumes can be derived.
    """
    _series_title = self.get("title", None)
    if not _series_title:
        raise MissingRequiredField(
            subfield="a", message=" this record is missing a main title"
        )

    val_a = clean_val("a", value, str)
    if not val_a:
        # Nothing to parse; avoids a TypeError on the membership test below.
        raise IgnoreKey("number_of_volumes")

    parsed_a = extract_parts(val_a)
    # Checking for "v" also covers "vol".
    if not parsed_a["number_of_pages"] and "v" in val_a:
        first_number = re.search(r"\d+", val_a)
        if first_number:
            return first_number.group(0)
    raise IgnoreKey("number_of_volumes")
def number_of_pages(self, key, value):
    """Translate subfield ``a`` into a page count.

    A physical copy description found by ``extract_parts`` is stored in
    ``self["physical_copy_description"]`` as a side effect.

    :returns: the number of pages as a string.
    :raises IgnoreKey: when ``is_excluded`` accepts the raw value.
    :raises UnexpectedValue: when the value has extra content or no page
        count.
    """
    raw = clean_val("a", value, str)
    if is_excluded(raw):
        raise IgnoreKey("number_of_pages")

    parsed = extract_parts(raw)
    if parsed["has_extra"]:
        raise UnexpectedValue(subfield="a")

    description = parsed["physical_copy_description"]
    if description:
        self["physical_copy_description"] = description

    pages = parsed["number_of_pages"]
    if not pages:
        raise UnexpectedValue(subfield="a")
    return str(pages)
def created(self, key, value):
    """Translates created information to fields.

    ``916__``: subfield ``s`` is mapped through ``ACQUISITION_METHOD`` into
    ``self["created_by"]["type"]``; subfield ``w`` (six digits) is turned
    into a date via ``get_week_start`` and returned in ISO format.

    ``595__``: subfield ``a`` (three capitals followed by six digits)
    provides ``self["source"]`` and ``self["_created"]``; a value not
    matching that pattern is kept as an internal note instead.

    :returns: ISO date string for ``916__`` when subfield ``w`` is present.
    :raises IgnoreKey: in every other case.
    """
    _created_by = self.get("created_by", {})
    if key == "916__":
        if "s" in value:
            _created_by.update({
                "type": mapping(
                    ACQUISITION_METHOD,
                    clean_val("s", value, str, default="migration"),
                    raise_exception=True,
                )
            })
            self["created_by"] = _created_by
        date = clean_val("w", value, int, regex_format=r"\d{6}$")
        if date:
            # First four digits are the year, the remaining two the week.
            year, week = str(date)[:4], str(date)[4:]
            date = get_week_start(int(year), int(week))
            return date.isoformat()
    elif key == "595__":
        try:
            sub_a = clean_val("a", value, str, regex_format=r"[A-Z]{3}[0-9]{6}$")
            if sub_a:
                # Layout: 3-letter source, 4-digit year, 2-digit month.
                source = sub_a[:3]
                self["source"] = source
                year, month = int(sub_a[3:7]), int(sub_a[7:])
                # Day is not part of the value; the first of the month is used.
                self["_created"] = datetime.date(year, month, 1).isoformat()
                raise IgnoreKey("_created")
        except UnexpectedValue as e:
            # The value did not pass clean_val: keep it as an internal note.
            e.subfield = "a"
            self["internal_notes"] = internal_notes(self, key, value)
            raise IgnoreKey("_created")
    raise IgnoreKey("_created")
def multivolume_record_format(self, key, value):
    """Store the multivolume format flag from subfield ``a``.

    ``MULTIVOLUMES-1`` stores ``True`` and ``MULTIVOLUMES-X``/``-x`` store
    ``False`` in ``self["_migration"]["multivolume_record_format"]``.

    :raises Exception: for ``MULTIVOLUMES-MANUAL``.
    :raises UnexpectedValue: for any other tag.
    :raises IgnoreKey: after the flag has been stored.
    """
    tag = clean_val("a", value, str)
    migration = self["_migration"]

    if tag == "MULTIVOLUMES-MANUAL":
        raise Exception("This record should not be migrated!")

    if tag == "MULTIVOLUMES-1":
        parsed = True
    elif tag in ("MULTIVOLUMES-X", "MULTIVOLUMES-x"):
        parsed = False
    else:
        raise UnexpectedValue(
            subfield="a", message=" unrecognized migration multipart tag"
        )

    migration["multivolume_record_format"] = parsed
    raise IgnoreKey("multivolume_record_format")
def special_serials(self, key, value):
    """Translate serial fields into ``_migration["serials"]``.

    Each value whose subfield ``a`` maps through ``SERIAL`` adds a serial
    entry (once) and sets ``has_serial``. A value that does not map is
    passed to ``document_type`` instead.

    :returns: the updated ``_migration`` dict.
    :raises IgnoreKey: after ``document_type`` returned for an unmapped
        value (``document_type`` may itself raise first).
    """
    _migration = self["_migration"]
    _serials = _migration.get("serials", [])
    for v in force_list(value):
        result_a = mapping(SERIAL, clean_val("a", v, str))
        if not result_a:
            self["document_type"] = document_type(self, key, value)
            raise IgnoreKey("_migration")

        serial = {"title": result_a, "volume": None, "issn": None}
        # Compare the whole entry: _serials holds dicts, so testing the bare
        # title against it never matched and duplicates slipped through.
        if serial not in _serials:
            _serials.append(serial)
        _migration.update({"serials": _serials, "has_serial": True})
    return _migration
def collection(self, key, value):
    """Translate the collection field into migration tags.

    Subfields ``a`` and ``b`` are mapped through ``COLLECTION``; mapped
    values are added to ``_migration["tags"]`` (without duplicates) and
    ``has_tags`` is set. A value where neither maps is passed to
    ``document_type``.

    :returns: the updated ``_migration`` dict.
    :raises IgnoreKey: after ``document_type`` returned for an unmapped
        value (``document_type`` may itself raise first).
    """
    migration = self["_migration"]
    tags = migration["tags"]
    for field in force_list(value):
        tag_a = mapping(COLLECTION, clean_val("a", field, str))
        tag_b = mapping(COLLECTION, clean_val("b", field, str))

        for tag in (tag_a, tag_b):
            if not tag:
                continue
            if tag not in tags:
                tags.append(tag)
            migration["has_tags"] = True

        if not (tag_a or tag_b):
            self["document_type"] = document_type(self, key, value)
            raise IgnoreKey("_migration")
    return migration
def project_id(self, key, value):
    """Extract the project id and collect related links.

    Values carrying both ``p`` and ``u`` become ``{name, url}`` entries in
    ``self['related_links']``; for any other value subfield ``u`` is taken
    as the project id (the last such value wins).

    :returns: the project id.
    :raises IgnoreKey: when no project id was found.
    """
    found_id = None
    links = self.get('related_links', [])
    for entry in force_list(value):
        if 'p' in entry and 'u' in entry:
            links.append({'name': entry.get('p'), 'url': entry.get('u')})
        else:
            found_id = entry.get('u')

    if links:
        self['related_links'] = links
    if found_id:
        return found_id
    raise IgnoreKey('project_id')
def alternative_titles_doc(self, key, value):
    """Translate alternative title fields.

    ``242__`` extends the titles with the output of ``alternative_titles``.
    ``246__`` with ``n`` and ``p`` records a volume title in
    ``self["_migration"]``; without them, ``a`` and ``b`` become
    ``ALTERNATIVE_TITLE`` and ``SUBTITLE`` entries.

    :returns: the list of alternative titles.
    :raises MissingRequiredField: when only one of ``n``/``p`` is present.
    :raises IgnoreKey: when a volume title was recorded.
    """
    titles = self.get("alternative_titles", [])

    if key == "242__":
        titles += alternative_titles(self, key, value)
        return titles
    if key != "246__":
        return titles

    has_number = "n" in value
    has_part = "p" in value
    if has_number != has_part:
        raise MissingRequiredField(subfield="n or p")

    if has_part:
        migration = self.get("_migration", {})
        migration.setdefault("volumes", [])
        number_text = clean_val("n", value, str)
        migration["volumes"].append({
            "volume": extract_volume_number(number_text, raise_exception=True),
            "title": clean_val("p", value, str),
        })
        migration["is_multipart"] = True
        migration["record_type"] = "multipart"
        self["_migration"] = migration
        raise IgnoreKey("alternative_titles")

    for subfield, title_type in (("a", "ALTERNATIVE_TITLE"), ("b", "SUBTITLE")):
        if subfield in value:
            titles.append({
                "value": clean_val(subfield, value, str, req=True),
                "type": title_type,
            })
    return titles
def corporate_authors(self, key, value):
    """Translate corporate author fields into ``ORGANISATION`` authors.

    For key ``710__`` a value without subfield ``a`` is passed to
    ``collaborations``, whose result replaces ``self["authors"]``.

    :returns: the record's author list with the organisations appended.
    :raises IgnoreKey: when the field was handed to ``collaborations``.
    """
    authors = self.get("authors", [])
    for field in force_list(value):
        if key == "710__" and "a" not in field:
            self["authors"] = collaborations(self, key, value)
            raise IgnoreKey("corporate_authors")
        authors.append({
            "full_name": clean_val("a", field, str),
            "type": "ORGANISATION",
        })
    return authors