def marc21_to_part_of(self, key, value):
    """Get part_of.

    The ISSN found in subfield $x is used to look up a host document; the
    first pid returned for that ISSN becomes the ``document`` reference.
    Subfield $v, when not empty, is stored as ``numbering``. The resulting
    entry is appended to ``self['partOf']``; nothing is added when no
    document pid is found.
    """
    issn = not_repetitive(
        unimarc.bib_id, 'unimarc', key, value, 'x', default='').strip()
    # only the first pid yielded for the ISSN is kept
    host_pid = (
        next(iter(Document.get_document_pids_by_issn(issn)), None)
        if issn else None
    )
    if not host_pid:
        return

    link = {
        'document': {
            '$ref': f'https://ils.rero.ch/api/documents/{host_pid}'
        }
    }
    numbering = not_repetitive(
        unimarc.bib_id, 'unimarc', key, value, 'v', default='').strip()
    if numbering:
        link['numbering'] = numbering

    entries = self.get('partOf', [])
    self['partOf'] = entries
    entries.append(link)
def marc21_to_identifiedBy_from_field_028(self, key, value):
    """Get identifier from field 028.

    Subfield $a gives the value, the repetitive $q the qualifier and $b the
    source. The identifier type is selected by the first indicator
    (``key[3]``), with ``bf:Identifier`` as fallback.

    :returns: the ``identifiedBy`` list with the new identifier appended,
        or None when subfield $a is empty.
    """
    number = not_repetitive(
        marc21.bib_id, key, value, 'a', default='').strip()
    if not number:
        return None

    entry = {'value': number}
    qualifiers = value.get('q')
    if qualifiers:  # $q is repetitive
        entry['qualifier'] = ', '.join(utils.force_list(qualifiers))
    source = not_repetitive(
        marc21.bib_id, key, value, 'b', default='').strip()
    if source:
        entry['source'] = source

    # identifier type for each value of the first indicator
    ind1_types = {
        '0': 'bf:AudioIssueNumber',
        '1': 'bf:MatrixNumber',
        '2': 'bf:MusicPlate',
        '3': 'bf:MusicPublisherNumber',
        '4': 'bf:VideoRecordingNumber',
        '5': 'bf:PublisherNumber',
        '6': 'bf:MusicDistributorNumber'
    }
    entry['type'] = ind1_types.get(key[3], 'bf:Identifier')

    identified_by = self.get('identifiedBy', [])
    identified_by.append(entry)
    return identified_by or None
def populate_acquisitionTerms_note_qualifier(identifier):
    """Copy subfields $c, $d and $q of the current field into `identifier`.

    ``key`` and ``value`` are not parameters of this helper: they are read
    from the surrounding scope.

    :param identifier: dict updated in place with ``acquisitionTerms`` ($c),
        ``note`` ($d) and ``qualifier`` ($q, joined with ', ').
    """
    for code, target in (('c', 'acquisitionTerms'), ('d', 'note')):
        content = not_repetitive(
            marc21.bib_id, key, value, code, default='').strip()
        if content:
            identifier[target] = content

    qualifiers = value.get('q')
    if qualifiers:  # $q is repetitive
        identifier['qualifier'] = ', '.join(utils.force_list(qualifiers))
def build_identifier_from(subfield_data, status=None):
    """Build a ``bf:Isbn`` identifier and append it to ``identifiedBy``.

    ``key``, ``value`` and ``identifiedBy`` are not parameters of this
    helper: they are read from the surrounding scope.

    :param subfield_data: raw ISBN data; a trailing ``(...)`` part is moved
        to the qualifier.
    :param status: optional status stored on the identifier.
    """
    isbn = subfield_data.strip()
    entry = {'value': isbn}

    terms = not_repetitive(
        marc21.bib_id, marc21.rero_id, key, value, 'c', default='').strip()
    if terms:
        entry['acquisitionTerms'] = terms

    qualifiers = value.get('q')
    if qualifiers:  # $q is repetitive
        entry['qualifier'] = ', '.join(utils.force_list(qualifiers))

    found = re.search(r'^(.+?)\s*\((.+)\)$', isbn)
    if found:
        # the parentheses content goes in front of the $q qualifiers
        parts = [found.group(2), entry.get('qualifier', '')]
        entry['qualifier'] = ', '.join(part for part in parts if part)
        # value without parenthesis and parentheses content
        entry['value'] = found.group(1)

    if status:
        entry['status'] = status
    entry['type'] = 'bf:Isbn'
    identifiedBy.append(entry)
def marc21_to_is_part_of(self, key, value):
    """Get is_part_of.

    is_part_of: [773$t repetitive]

    Subfield $t is only read while the record has no truthy ``is_part_of``
    value yet; otherwise None is returned.
    """
    if self.get('is_part_of'):
        return None
    return not_repetitive(marc21.bib_id, key, value, 't')
def marc21_to_identifiedBy_from_field_020(self, key, value):
    """Get identifier from field 020.

    One ``bf:Isbn`` identifier is created for subfield $a and one, with the
    status 'invalid or cancelled', for each subfield $z.

    :returns: the ``identifiedBy`` list with the new identifiers appended,
        or None when that list is empty.
    """
    identifiers = self.get('identifiedBy', [])

    def add_isbn(raw_isbn, status=None):
        """Append the identifier built from `raw_isbn` to `identifiers`."""
        isbn = raw_isbn.strip()
        entry = {'value': isbn}

        terms = not_repetitive(
            marc21.bib_id, key, value, 'c', default='').strip()
        if terms:
            entry['acquisitionTerms'] = terms

        qualifiers = value.get('q')
        if qualifiers:  # $q is repetitive
            entry['qualifier'] = ', '.join(utils.force_list(qualifiers))

        found = re.search(r'^(.+?)\s*\((.+)\)$', isbn)
        if found:
            # the parentheses content goes in front of the $q qualifiers
            parts = [found.group(2), entry.get('qualifier', '')]
            entry['qualifier'] = ', '.join(part for part in parts if part)
            # value without parenthesis and parentheses content
            entry['value'] = found.group(1)

        if status:
            entry['status'] = status
        entry['type'] = 'bf:Isbn'
        identifiers.append(entry)

    valid_isbn = not_repetitive(marc21.bib_id, key, value, 'a')
    if valid_isbn:
        add_isbn(valid_isbn)

    cancelled_isbns = value.get('z')
    if cancelled_isbns:
        for cancelled_isbn in utils.force_list(cancelled_isbns):
            add_isbn(cancelled_isbn, status='invalid or cancelled')

    return identifiers or None
def marc21_to_notes(self, key, value):
    """Get notes.

    note: [500$a repetitive]

    The content of subfield $a is handed to ``add_note`` as a note of type
    'general'; the function itself always returns None.
    """
    label = not_repetitive(marc21.bib_id, marc21.rero_id, key, value, 'a')
    general_note = {'noteType': 'general', 'label': label}
    add_note(general_note, self)
    return None
def marc21_to_author(self, key, value):
    """Get author.

    authors: loop:
    authors.name: 100$a [+ 100$b if it exists] or
        [700$a (+$b if it exists) repetitive] or
        [710$a repetitive (+$b if it exists, repetitive)]
    authors.date: 100 $d or 700 $d (optional)
    authors.qualifier: 100 $c or 700 $c (optional)
    authors.type: if 100 or 700 then person, if 710 then organisation

    Fields whose second indicator (``key[4]``) is '2' are skipped and give
    None. When a link can be resolved from $0, the author only carries its
    type and the ``$ref``.
    """
    if key[4] == '2':
        return None

    author = {'type': 'person'}
    links = value.get('0')
    if links:
        for link in utils.force_list(links):
            resolved = get_person_link(marc21.bib_id, link, key, value)
            if resolved:
                author['$ref'] = resolved
    if author.get('$ref'):
        return author

    # we do not have a $ref: the author is described by the subfields
    name = ''
    if value.get('a'):
        name = remove_trailing_punctuation(
            not_repetitive(marc21.bib_id, key, value, 'a'))
    for sub_name in utils.force_list(value.get('b')) or ():
        name += ' ' + remove_trailing_punctuation(sub_name)
    author['name'] = name

    if key[:3] == '710':
        author['type'] = 'organisation'
        return author

    for code, target in (('c', 'qualifier'), ('d', 'date')):
        if value.get(code):
            content = not_repetitive(marc21.bib_id, key, value, code)
            author[target] = remove_trailing_punctuation(content)
    return author
def marc21_to_identifiedBy_from_field_035(self, key, value):
    """Get identifier from field 035.

    Subfield $a is stored as a ``bf:Local`` identifier whose source is
    'RERO'.

    :returns: the ``identifiedBy`` list with the new identifier appended,
        or None when subfield $a is empty.
    """
    local_number = not_repetitive(
        marc21.bib_id, key, value, 'a', default='').strip()
    if not local_number:
        return None

    identified_by = self.get('identifiedBy', [])
    identified_by.append({
        'value': local_number,
        'type': 'bf:Local',
        'source': 'RERO'
    })
    return identified_by or None
def marc21_to_identifiedBy_from_field_930(self, key, value):
    """Get identifier from field 930.

    A leading ``(...)`` part of subfield $a is stored as the source and the
    remainder as the value; without it the whole subfield is the value. The
    identifier type is always ``bf:Local``.

    :returns: the ``identifiedBy`` list with the new identifier appended,
        or None when subfield $a is empty.
    """
    raw_number = not_repetitive(
        marc21.bib_id, key, value, 'a', default='').strip()
    if not raw_number:
        return None

    found = re.search(r'^\((.+?)\)\s*(.*)$', raw_number)
    if found:
        # parentheses content, then the value that follows it
        source, number = found.groups()
        entry = {'source': source, 'value': number}
    else:
        entry = {'value': raw_number}
    entry['type'] = 'bf:Local'

    identified_by = self.get('identifiedBy', [])
    identified_by.append(entry)
    return identified_by or None
def marc21_to_identified_by_from_field_930(self, key, value):
    """Get identifier from field 930.

    When ``re_identified`` matches subfield $a, its first group is stored as
    the source and its second group as the value; otherwise the whole
    subfield is the value. The ``bf:Local`` identifier is appended to
    ``self['identifiedBy']``. Nothing is done when subfield $a is empty.
    """
    raw_number = not_repetitive(
        marc21.bib_id, marc21.rero_id, key, value, 'a', default='').strip()
    if not raw_number:
        return

    found = re_identified.match(raw_number)
    if found:
        entry = {
            # parentheses content
            'source': found.group(1),
            # value without parenthesis and parentheses content
            'value': found.group(2)
        }
    else:
        entry = {'value': raw_number}
    entry['type'] = 'bf:Local'

    identified_by = self.get('identifiedBy', [])
    identified_by.append(entry)
    self['identifiedBy'] = identified_by
def marc21_to_identifiedBy_from_field_024(self, key, value):
    """Get identifier from field 024.

    The identifier is built from subfield $a. Its type (and sometimes its
    source) is selected, in this order, from:

    - the content of $a itself: values containing ``permalink.snl.ch`` or
      ``bnf.fr/ark`` are typed ``uri``;
    - subfield $2, searched with the patterns of ``subfield_2_regexp``;
    - the first indicator (``key[3]``) when $2 is empty.

    When none of these sets a type, ``bf:Identifier`` is used.

    :param key: field key; ``key[3]`` is read as the first indicator.
    :param value: subfields of the field; $a, $c, $d, $q and $2 are read.
    :returns: the list obtained from ``self.get('identifiedBy', [])`` with
        the new identifier appended, or None when subfield $a is empty.
    """
    def populate_acquisitionTerms_note_qualifier(identifier):
        """Copy subfields $c, $d and $q of the field into `identifier`."""
        subfield_c = not_repetitive(marc21.bib_id, marc21.rero_id,
                                    key, value, 'c', default='').strip()
        if subfield_c:
            identifier['acquisitionTerms'] = subfield_c
        subfield_d = not_repetitive(marc21.bib_id, marc21.rero_id,
                                    key, value, 'd', default='').strip()
        if subfield_d:
            identifier['note'] = subfield_d
        if value.get('q'):  # $q is repetitive
            identifier['qualifier'] = \
                ', '.join(utils.force_list(value.get('q')))

    # Each key is used as a regular expression searched, ignoring case, in
    # subfield $2; the associated dict is merged into the identifier.
    subfield_2_regexp = {
        'doi': {
            'type': 'bf:Doi'
        },
        'urn': {
            'type': 'bf:Urn'
        },
        'nipo': {
            'type': 'bf:Local',
            'source': 'NIPO'
        },
        'danacode': {
            'type': 'bf:Local',
            'source': 'danacode'
        },
        'vd18': {
            'type': 'bf:Local',
            'source': 'vd18'
        },
        'gtin-14': {
            'type': 'bf:Gtin14Number'
        }
    }

    # Rules applied when $2 is empty, selected by the first indicator:
    # either a fixed 'type', or a 'pattern' that the data must match to get
    # the 'matching_type'.
    type_for_ind1 = {
        '0': {
            'type': 'bf:Isrc'
        },
        '1': {
            'type': 'bf:Upc'
        },
        '2': {
            'pattern': r'^(M|9790|979-0)',
            'matching_type': 'bf:Ismn'
        },
        '3': {
            'pattern': r'^97',
            'matching_type': 'bf:Ean'
        },
        '8': {
            # 33 chars example: 0000-0002-A3B1-0000-0-0000-0000-2
            'pattern': r'^(.{24}|.{26}|(.{4}-){4}.-(.{4}\-){2}.)$',
            'matching_type': 'bf:Isan'
        }
    }

    identifier = {}
    # stays None (and is returned as such) when subfield $a is empty
    identifiedBy = None
    subfield_a = not_repetitive(marc21.bib_id, marc21.rero_id,
                                key, value, 'a', default='').strip()
    subfield_2 = not_repetitive(marc21.bib_id, marc21.rero_id,
                                key, value, '2', default='').strip()
    if subfield_a:
        if re.search(r'permalink\.snl\.ch', subfield_a, re.IGNORECASE):
            identifier.update({
                'value': subfield_a,
                'type': 'uri',
                'source': 'SNL'
            })
        elif re.search(r'bnf\.fr/ark', subfield_a, re.IGNORECASE):
            identifier.update({
                'value': subfield_a,
                'type': 'uri',
                'source': 'BNF'
            })
        elif subfield_2:
            identifier['value'] = subfield_a
            populate_acquisitionTerms_note_qualifier(identifier)
            # every pattern is tried (no break): when several patterns match
            # $2, the attributes of the later ones override the earlier ones
            for pattern in subfield_2_regexp:
                if re.search(pattern, subfield_2, re.IGNORECASE):
                    identifier.update(subfield_2_regexp[pattern])
        else:  # without subfield $2
            ind1 = key[3]  # indicator 1
            if ind1 in ('0', '1', '2', '3', '8'):
                populate_acquisitionTerms_note_qualifier(identifier)
                # a trailing "(...)" in $a is moved to the qualifier, in
                # front of the qualifiers coming from $q
                match = re.search(r'^(.+?)\s*\((.*)\)$', subfield_a)
                if match:
                    # match.group(2) : parentheses content
                    identifier['qualifier'] = ', '.join(
                        filter(
                            None,
                            [match.group(2), identifier.get('qualifier', '')]))
                    # value without parenthesis and parentheses content
                    identifier['value'] = match.group(1)
                else:
                    identifier['value'] = subfield_a
                if 'type' in type_for_ind1[ind1]:  # ind1 0,1
                    identifier['type'] = type_for_ind1[ind1]['type']
                else:  # ind1 in (2, 3, 8)
                    # for ind1 2 and 3 the pattern is tested on the raw $a;
                    # for ind1 8 it is tested on the value stored above,
                    # i.e. without the parentheses part
                    data = subfield_a
                    if ind1 == '8':
                        data = identifier['value']
                    if re.search(type_for_ind1[ind1]['pattern'], data):
                        identifier['type'] = \
                            type_for_ind1[ind1]['matching_type']
                    else:
                        identifier['type'] = 'bf:Identifier'
            else:  # ind1 not in (0, 1, 2, 3, 8)
                identifier.update({
                    'value': subfield_a,
                    'type': 'bf:Identifier'
                })
        identifiedBy = self.get('identifiedBy', [])
        # fallback, e.g. when $2 is present but matches no known pattern
        if not identifier.get('type'):
            identifier['type'] = 'bf:Identifier'
        identifiedBy.append(identifier)
    return identifiedBy or None
def marc21_to_contribution(self, key, value):
    """Get contribution.

    Only fields 100, 700, 710 and 711 whose second indicator (``key[4]``)
    is not '2' are processed.

    The agent is either a link (``$ref``) resolved from one of the $0
    subfields or, when no link is found and $a exists, a description built
    from the subfields:

    - 100/700 (``bf:Person``): $a preferred_name, $b numeration,
      $c qualifier, $d date_of_birth[-date_of_death],
      $q fuller_form_of_name;
    - 710/711 (``bf:Organisation``): $a preferred_name, $b and $e
      subordinate_unit, $n conference_number, $d conference_date,
      $c conference_place; ``conference`` is True for 711 only.

    The roles come from the $4 subfields; without $4 the default role is
    'cre' for field 100, 'aut' for field 711 and 'ctb' otherwise.

    :param key: field key (tag followed by the two indicators).
    :param value: subfields of the field.
    :returns: ``{'agent': ..., 'role': [...]}`` or None when the field is
        skipped or no agent could be built.
    """
    tag = key[:3]
    if key[4] == '2' or tag not in ('100', '700', '710', '711'):
        return None

    agent = {}
    if value.get('0'):
        # every $0 is tried; the last one giving a link wins
        for ref in utils.force_list(value.get('0')):
            link = get_person_link(
                marc21.bib_id, marc21.rero_id, ref, key, value)
            if link:
                agent['$ref'] = link

    # we do not have a $ref
    if not agent.get('$ref') and value.get('a'):
        agent = {'type': 'bf:Person'}
        name = not_repetitive(
            marc21.bib_id, marc21.rero_id, key, value, 'a').rstrip('.')
        if name:
            agent['preferred_name'] = name

        # 100|700 Person
        if tag in ('100', '700'):
            if value.get('b'):
                numeration = not_repetitive(
                    marc21.bib_id, marc21.rero_id, key, value, 'b')
                numeration = remove_trailing_punctuation(numeration)
                if numeration:
                    agent['numeration'] = numeration
            if value.get('c'):
                qualifier = not_repetitive(
                    marc21.bib_id, marc21.rero_id, key, value, 'c')
                agent['qualifier'] = remove_trailing_punctuation(qualifier)
            if value.get('d'):
                date = not_repetitive(
                    marc21.bib_id, marc21.rero_id, key, value, 'd')
                date = date.rstrip(',')
                dates = remove_trailing_punctuation(date).split('-')
                # str.split always returns at least one item, so the birth
                # date can be read directly; the death date only exists
                # when the data contains a '-'
                date_of_birth = dates[0].strip()
                if date_of_birth:
                    agent['date_of_birth'] = date_of_birth
                if len(dates) > 1:
                    date_of_death = dates[1].strip()
                    if date_of_death:
                        agent['date_of_death'] = date_of_death
            if value.get('q'):
                fuller_form_of_name = not_repetitive(
                    marc21.bib_id, marc21.rero_id, key, value, 'q')
                fuller_form_of_name = remove_trailing_punctuation(
                    fuller_form_of_name).lstrip('(').rstrip(')')
                if fuller_form_of_name:
                    agent['fuller_form_of_name'] = fuller_form_of_name

        # 710|711 Organisation
        elif tag in ('710', '711'):
            agent['type'] = 'bf:Organisation'
            agent['conference'] = tag == '711'
            if value.get('b'):
                agent['subordinate_unit'] = [
                    unit.rstrip('.')
                    for unit in utils.force_list(value.get('b'))
                ]
            if value.get('e'):
                # the $e units are added after the $b ones
                subordinate_units = agent.get('subordinate_unit', [])
                subordinate_units.extend(
                    unit.rstrip('.')
                    for unit in utils.force_list(value.get('e')))
                agent['subordinate_unit'] = subordinate_units
            # same clean-up for the three conference subfields
            conference_subfields = (
                ('n', 'conference_number'),
                ('d', 'conference_date'),
                ('c', 'conference_place')
            )
            for code, target in conference_subfields:
                if value.get(code):
                    content = not_repetitive(
                        marc21.bib_id, marc21.rero_id, key, value, code)
                    content = remove_trailing_punctuation(
                        content).lstrip('(').rstrip(')')
                    if content:
                        agent[target] = content

    if value.get('4'):
        roles = []
        for role in utils.force_list(value.get('4')):
            if len(role) != 3:
                error_print('WARNING CONTRIBUTION ROLE LENGTH:',
                            marc21.bib_id, marc21.rero_id, role)
                role = role[:3]
            if role == 'sce':
                error_print('WARNING CONTRIBUTION ROLE SCE:',
                            marc21.bib_id, marc21.rero_id, 'sce --> aus')
                role = 'aus'
            role = role.lower()
            if role not in _CONTRIBUTION_ROLE:
                error_print('WARNING CONTRIBUTION ROLE DEFINITION:',
                            marc21.bib_id, marc21.rero_id, role)
                role = 'ctb'
            roles.append(role)
    elif tag == '100':
        roles = ['cre']
    elif tag == '711':
        roles = ['aut']
    else:
        roles = ['ctb']

    if agent:
        return {'agent': agent, 'role': list(set(roles))}
    return None
def marc21_to_titlesProper(self, key, value):
    """Test dojson marc21titlesProper.

    titleProper: 730$a

    :returns: whatever ``not_repetitive`` gives for subfield $a.
    """
    title_proper = not_repetitive(
        marc21.bib_id, marc21.rero_id, key, value, 'a')
    return title_proper
def marc21_to_part_of(self, key, value):
    r"""Get part_of.

    The 773 $g can follow several patterns; the most important point is to
    find the year (94% of $g start with the pattern '\d{4}'):
    - a/b/c/d > a=year, b=vol, c=issue, d=pages
      (if a != year pattern, then abandon data)
    - a/b/c > a=year, b=issue, c=pages
      (if a != year pattern, then put a in vol, b in issue and c in pages)
    - a/b > a=year, b=pages
      (if a != year pattern, then put a in vol and b in issue)
    - a > a=year
      (if a != year pattern, then put it in pages)
    For b, c, d: check that the values match the integer or pages patterns,
    otherwise abandon data.
    pages pattern: \d+(-\d+)?
    examples: 12-25, 837, 837-838

    When a field 773, 800 or 830 has no link specified, a seriesStatement
    must be generated instead of a partOf. But, in this case, a
    seriesStatement is not generated for a field 773 if a field 580 exists,
    nor for the fields 800 and 830 if a field 490 exists.

    :param key: field key; ``key[:3]`` is the tag (773 or, otherwise,
        800/830).
    :param value: subfields of the field; $w, $g and $v are read here.
    """
    class Numbering(object):
        """The purpose of this class is to build the `Numbering` data."""

        def __init__(self):
            """Constructor method."""
            self._numbering = {}
            self._year_regexp = re.compile(r'^\d{4}')
            self._integer_regexp = re.compile(r'^\d+$')
            self._pages_regexp = re.compile(r'^\d+(-\d+)?$')
            # pattern that a value must match for each numbering key
            self._pattern_per_key = {
                'year': self._year_regexp,
                'pages': self._pages_regexp,
                'issue': self._integer_regexp,
                'volume': self._integer_regexp
            }

        def add_numbering_value(self, key, value):
            """Add numbering `key: value` to `Numbering` data.

            The `Numbering` object is progressively built with the data
            collected by the successive calls of the method
            `add_numbering_value`.

            A value that does not match the pattern of its key marks the
            whole numbering as discarded, except for the key `year`, which
            is then simply not stored. `issue` and `volume` values are
            stored as integers, and only when greater than 0.

            :param key: key code of data to be added
            :type key: str
            :param value: value data to be associated the given `key`
            :type value: str
            """
            if self._pattern_per_key[key].search(value):
                if key in ('issue', 'volume'):
                    value = int(value)
                    if value > 0:
                        self._numbering[key] = value
                else:
                    self._numbering[key] = value
            elif key != 'year':
                self._numbering['discard'] = True

        def has_year(self):
            """Check if `year` key is present in `Numbering` data."""
            return 'year' in self._numbering

        def is_valid(self):
            """Check if `Numbering` data is valid.

            The data is valid when it is not empty and not marked as
            discarded; the result is only meant to be tested for truth.
            """
            return self._numbering and 'discard' not in self._numbering

        def get(self):
            """Get the `Numbering` data object."""
            return self._numbering

    def add_author_to_subfield_t(value):
        """Get the author from subfield a and add it to subfield t.

        The form 'lastname, firstname' of the author from subfield a is
        appended to the subfield t in the following form:
        ' / firstname lastname'

        Only the subfields a, t, g and v are considered; the returned data
        holds the t subfields first, then the g ones, then the v ones.
        """
        items = get_field_items(value)
        new_data = []
        author = None
        pending_g_values = []
        pending_v_values = []
        match = re.compile(r'\. -$')  # match the trailing '. -'
        subfield_selection = {'a', 't', 'g', 'v'}
        for blob_key, blob_value in items:
            if blob_key in subfield_selection:
                if blob_key == 'a':
                    # remove the trailing '. -'
                    author = match.sub('', blob_value)
                    # reverse first name and last name
                    author_parts = author.split(',')
                    author = ' '.join(reversed(author_parts)).strip()
                    # only the first subfield a is used: 'a' is removed
                    # from the selection so that the next ones are ignored
                    subfield_selection.remove('a')
                elif blob_key == 't':
                    subfield_t = blob_value
                    # the author is only known here when its subfield a
                    # precedes this subfield t
                    if author:
                        subfield_t += ' / ' + author
                    new_data.append(('t', subfield_t))
                elif blob_key == 'g':
                    pending_g_values.append(blob_value)
                elif blob_key == 'v':
                    pending_v_values.append(blob_value)
        for g_value in pending_g_values:
            new_data.append(('g', g_value))
        for v_value in pending_v_values:
            new_data.append(('v', v_value))
        return GroupableOrderedDict(tuple(new_data))

    part_of = {}
    numbering_list = []
    subfield_w = not_repetitive(marc21.bib_id, marc21.rero_id,
                                key, value, 'w', default='').strip()
    if subfield_w:
        # the pid is the content of $w without its 'REROILS:' prefix
        match = re.compile(r'^REROILS:')
        pid = match.sub('', subfield_w)
        part_of['document'] = {
            '$ref':
                'https://ils.rero.ch/api/documents/{pid}'.format(pid=pid)
        }
        if key[:3] == '773':
            # NOTE(review): this flag is set once before the loop and never
            # reset inside it, so a discarded $g also discards every $g that
            # follows it in the same field - confirm that this is intended.
            discard_numbering = False
            for subfield_g in utils.force_list(value.get('g', [])):
                numbering = Numbering()
                values = subfield_g.strip().split('/')
                # only the first 4 characters of the first part are tested
                # against the year pattern
                numbering.add_numbering_value('year', values[0][:4])
                if len(values) == 1 and not numbering.has_year():
                    if values[0]:
                        numbering.add_numbering_value('pages', values[0])
                elif len(values) == 2:
                    if numbering.has_year():
                        if values[1]:
                            numbering.add_numbering_value('pages', values[1])
                    else:
                        if values[0]:
                            numbering.add_numbering_value(
                                'volume', values[0])
                        if values[1]:
                            numbering.add_numbering_value('issue', values[1])
                elif len(values) == 3:
                    if not numbering.has_year() and values[0]:
                        numbering.add_numbering_value('volume', values[0])
                    if values[1]:
                        numbering.add_numbering_value('issue', values[1])
                    if values[2]:
                        numbering.add_numbering_value('pages', values[2])
                elif len(values) == 4:
                    if numbering.has_year():
                        if values[1]:
                            numbering.add_numbering_value(
                                'volume', values[1])
                        if values[2]:
                            numbering.add_numbering_value('issue', values[2])
                        if values[3]:
                            numbering.add_numbering_value('pages', values[3])
                    else:
                        discard_numbering = True
                if not discard_numbering and numbering.is_valid():
                    numbering_list.append(numbering.get())
        else:  # 800, 830
            for subfield_v in utils.force_list(value.get('v', [])):
                numbering = Numbering()
                if subfield_v:
                    numbering.add_numbering_value('volume', subfield_v)
                if numbering.is_valid():
                    numbering_list.append(numbering.get())
    if 'document' in part_of:
        if numbering_list:
            part_of['numbering'] = numbering_list
        self['partOf'] = self.get('partOf', [])
        self['partOf'].append(part_of)
    else:  # no link found
        if key[:3] == '773':
            if not marc21.has_field_580:
                # the author in subfield $a is appended to subfield $t
                value = add_author_to_subfield_t(value)
                # create a seriesStatement instead of a partOf
                marc21.extract_series_statement_from_marc_field(
                    key, value, self)
        else:  # 800, 830
            if not marc21.has_field_490:
                # create a seriesStatement instead of a partOf
                if key[:3] == '800':
                    # the author in subfield $a is appended to subfield $t
                    value = add_author_to_subfield_t(value)
                marc21.extract_series_statement_from_marc_field(
                    key, value, self)