def decode(self, raw, raw_encoding):
    """
    decode() accepts a MARC record in transmission format as a string
    argument and populates the object from the data found.

    raw is the complete record: leader, directory and field data.
    raw_encoding selects how field data is turned into text: the value
    'marc8' sends it through marc8_to_unicode(), any other value is
    passed to the data's decode() method as the codec name.

    Raises exc.RecordLeaderInvalid, exc.BaseAddressNotFound,
    exc.BaseAddressInvalid, exc.RecordDirectoryInvalid or
    exc.NoFieldsFound when the record structure cannot be used.
    """

    def to_text(chunk):
        # single place for the marc8-vs-codec decision shared by control
        # fields and subfields
        if raw_encoding == 'marc8':
            return marc8_to_unicode(chunk)
        return chunk.decode(raw_encoding)

    # extract record leader
    self._leader = array('c', raw[0:LEADER_LEN])
    if len(self._leader) != LEADER_LEN:
        raise exc.RecordLeaderInvalid

    # extract the byte offset where the record data starts
    base_address = int(raw[12:17])
    if base_address <= 0:
        raise exc.BaseAddressNotFound
    if base_address >= len(raw):
        raise exc.BaseAddressInvalid

    # extract directory, base_address - 1 is used since the
    # directory ends with an END_OF_FIELD byte
    directory = raw[LEADER_LEN:base_address - 1]

    # determine the number of fields in record; floor division keeps the
    # count an integer even when true division is in effect
    if len(directory) % DIRECTORY_ENTRY_LEN != 0:
        raise exc.RecordDirectoryInvalid
    field_total = len(directory) // DIRECTORY_ENTRY_LEN

    # add fields to our record using directory offsets
    for field_index in range(field_total):
        entry_start = field_index * DIRECTORY_ENTRY_LEN
        entry = directory[entry_start:entry_start + DIRECTORY_ENTRY_LEN]
        entry_tag = entry[0:3]
        entry_length = int(entry[3:7])
        entry_offset = int(entry[7:12])

        # entry_length - 1 leaves out the last byte of the field
        data_start = base_address + entry_offset
        entry_data = raw[data_start:data_start + entry_length - 1]

        # assume controlfields are numeric; replicates ruby-marc behavior
        if entry_tag < '010' and entry_tag.isdigit():
            field = ControlField(tag=entry_tag, data=to_text(entry_data))
        else:
            chunks = entry_data.split(SUBFIELD_INDICATOR)
            # the text before the first subfield carries the two indicators
            ind1 = chunks[0][0]
            ind2 = chunks[0][1]
            # empty chunks (consecutive subfield indicators) are skipped
            subfields = [
                Subfield(code=chunk[0], data=to_text(chunk[1:]))
                for chunk in chunks[1:]
                if chunk
            ]
            field = DataField(
                tag=entry_tag,
                ind1=ind1,
                ind2=ind2,
                subfields=subfields,
            )

        # fields are grouped per tag, in the order they appear
        if field.tag not in self._fields:
            self._fields[field.tag] = []
        self._fields[field.tag].append(field)

    if field_total == 0:
        raise exc.NoFieldsFound
def decode_marc(self, marc, to_unicode=True, force_utf8=False, encoding=None,
                hide_utf8_warnings=False, utf8_handling='strict'):
    """
    decode_marc() accepts a MARC record in transmission format as a
    string argument, and will populate the object based on the data found.

    marc is the complete record: leader, directory and field data; it is
    also stored unchanged on self.marc.

    to_unicode controls whether subfield data is decoded to text at all.
    encoding names the source encoding: 'marc-8' routes data through
    marc8_to_unicode(), anything else is used as a codec name for
    decode().  When encoding is None it is resolved to 'utf-8' if
    force_utf8 is set or leader position 9 is 'a', and to 'marc-8'
    otherwise.  hide_utf8_warnings is forwarded to marc8_to_unicode();
    utf8_handling is the error handler given to decode().

    Fields whose tag starts with '4' are parsed as linking fields: every
    subfield 1 opens an embedded field, which is stored as a Field object
    under subfield code '1'.

    Raises RecordLeaderInvalid, BaseAddressNotFound, BaseAddressInvalid,
    RecordDirectoryInvalid or NoFieldsFound when the record structure
    cannot be used.
    """

    def decode_text(chunk):
        # one decoding rule for ordinary and embedded subfields alike
        if not to_unicode:
            return chunk
        if encoding != 'marc-8':
            return chunk.decode(encoding, utf8_handling)
        # 'marc-8' is not a Python codec, so it needs the converter
        return marc8_to_unicode(chunk, hide_utf8_warnings)

    def parse_linked_subfields(chunks):
        # Turn the non-empty subfield chunks of a 4xx field into the flat
        # [code, value, code, value, ...] list expected by Field.
        parsed = []
        total = len(chunks)
        i = 0
        while i < total:
            head = chunks[i]
            i += 1
            if head[0] != '1':
                # a subfield that is not part of an embedded field is kept
                # as an ordinary subfield; always consuming it guarantees
                # the loop terminates
                parsed.append(head[0])
                parsed.append(decode_text(head[1:]))
                continue
            # subfield 1: read the embedded field tag number
            embedded_tag = head[1:4]
            if embedded_tag == '001':
                # embedded control field: everything after the tag is data
                parsed.append('1')
                parsed.append(Field(tag=embedded_tag, data=head[4:]))
                continue
            if len(head) < 6:
                # too short to hold a tag plus two indicators: skip it
                continue
            # collect the subfields up to the next subfield 1
            embedded_subfields = []
            while i < total and chunks[i][0] != '1':
                embedded_subfields.append(chunks[i][0])
                embedded_subfields.append(decode_text(chunks[i][1:]))
                i += 1
            parsed.append('1')
            parsed.append(Field(
                tag=embedded_tag,
                indicators=[head[4], head[5]],
                subfields=embedded_subfields,
            ))
        return parsed

    self.marc = marc

    # extract record leader
    self.leader = marc[0:LEADER_LEN]
    if len(self.leader) != LEADER_LEN:
        raise RecordLeaderInvalid

    # the default encoding=None cannot be passed to decode(); derive the
    # encoding from force_utf8 and the leader instead
    if encoding is None:
        if force_utf8 or self.leader[9:10] == 'a':
            encoding = 'utf-8'
        else:
            encoding = 'marc-8'

    # extract the byte offset where the record data starts
    base_address = int(marc[12:17])
    if base_address <= 0:
        raise BaseAddressNotFound
    if base_address >= len(marc):
        raise BaseAddressInvalid

    # extract directory, base_address - 1 is used since the
    # directory ends with an END_OF_FIELD byte
    directory = marc[LEADER_LEN:base_address - 1]

    # determine the number of fields in record; floor division keeps the
    # count an integer even when true division is in effect
    if len(directory) % DIRECTORY_ENTRY_LEN != 0:
        raise RecordDirectoryInvalid
    field_total = len(directory) // DIRECTORY_ENTRY_LEN

    # add fields to our record using directory offsets
    for field_index in range(field_total):
        entry_start = field_index * DIRECTORY_ENTRY_LEN
        entry = directory[entry_start:entry_start + DIRECTORY_ENTRY_LEN]
        entry_tag = entry[0:3]
        entry_length = int(entry[3:7])
        entry_offset = int(entry[7:12])

        # entry_length - 1 leaves out the last byte of the field
        data_start = base_address + entry_offset
        entry_data = marc[data_start:data_start + entry_length - 1]

        # assume controlfields are numeric; replicates ruby-marc behavior
        if entry_tag < '010' and entry_tag.isdigit():
            field = Field(tag=entry_tag, data=entry_data)
        else:
            chunks = entry_data.split(SUBFIELD_INDICATOR)
            # the text before the first subfield carries the two indicators
            first_indicator = chunks[0][0]
            second_indicator = chunks[0][1]
            # empty chunks (consecutive subfield indicators) are skipped
            field_data = [chunk for chunk in chunks[1:] if chunk]

            if entry_tag[0] == '4':
                subfields = parse_linked_subfields(field_data)
            else:
                subfields = []
                for chunk in field_data:
                    subfields.append(chunk[0])
                    subfields.append(decode_text(chunk[1:]))

            field = Field(
                tag=entry_tag,
                indicators=[first_indicator, second_indicator],
                subfields=subfields,
            )

        self.add_field(field)

    if field_total == 0:
        raise NoFieldsFound
def map_marc8_field(f):
    """
    Convert the content of field f with marc8_to_unicode().

    For a control field (f.is_control_field() is true) the data attribute
    is converted; for any other field every item of f.subfields is
    converted.  The field is modified in place and also returned.
    """
    if f.is_control_field():
        f.data = marc8_to_unicode(f.data)
    else:
        # build a real list: a bare map() is a one-shot iterator on
        # Python 3, which would break len(), indexing and re-iteration
        f.subfields = [marc8_to_unicode(item) for item in f.subfields]
    return f