Ejemplo n.º 1
0
    def decode_marc(
        self,
        marc,
        to_unicode=True,
        force_utf8=False,
        hide_utf8_warnings=False,
        utf8_handling="strict",
        encoding="iso8859-1",
    ):
        """Populate the object based on the ``marc`` record in transmission format.

        The Record constructor actually uses decode_marc() behind the scenes when you
        pass in a chunk of MARC data to it.

        Args:
            marc: bytes-like record in transmission format (slices of it are
                decoded and split with a bytes delimiter).
            to_unicode: when true, field data is decoded to text and stored in
                ``Field`` objects; otherwise the raw bytes are stored in
                ``RawField`` objects.
            force_utf8: treat the data as UTF-8 even when leader position 9 is
                not ``"a"``. The instance attribute ``self.force_utf8`` has the
                same effect.
            hide_utf8_warnings: passed through to ``marc8_to_unicode``.
            utf8_handling: error handler given to ``bytes.decode`` for UTF-8
                subfield data.
            encoding: codec used when the record is not UTF-8. The default
                ``"iso8859-1"`` sends subfield data through
                ``marc8_to_unicode`` instead of a plain decode.

        Raises:
            RecordLeaderInvalid: the leader is shorter than ``LEADER_LEN``.
            BaseAddressNotFound: the base address in the leader is <= 0.
            BaseAddressInvalid: the base address lies beyond the record.
            RecordDirectoryInvalid: the directory length is not a multiple of
                ``DIRECTORY_ENTRY_LEN``.
            NoFieldsFound: the directory contains no entries.
        """
        # extract record leader
        self.leader = marc[0:LEADER_LEN].decode("ascii")
        if len(self.leader) != LEADER_LEN:
            raise RecordLeaderInvalid

        # Decide once whether the record is UTF-8 so that control fields and
        # subfields always agree; previously the control-field path looked only
        # at self.force_utf8 and the subfield path only at the argument.
        use_utf8 = self.leader[9] == "a" or force_utf8 or self.force_utf8
        if use_utf8:
            encoding = "utf-8"

        # extract the byte offset where the record data starts
        base_address = int(marc[12:17])
        if base_address <= 0:
            raise BaseAddressNotFound
        if base_address >= len(marc):
            raise BaseAddressInvalid

        # extract directory, base_address-1 is used since the
        # directory ends with an END_OF_FIELD byte
        directory = marc[LEADER_LEN:base_address - 1].decode("ascii")

        # determine the number of fields in record
        if len(directory) % DIRECTORY_ENTRY_LEN != 0:
            raise RecordDirectoryInvalid
        # floor division keeps the count an int (the remainder is known to be 0)
        field_total = len(directory) // DIRECTORY_ENTRY_LEN
        if field_total == 0:
            raise NoFieldsFound

        subfield_delimiter = SUBFIELD_INDICATOR.encode("ascii")

        # add fields to our record using directory offsets
        for entry_start in range(0, len(directory), DIRECTORY_ENTRY_LEN):
            entry = directory[entry_start:entry_start + DIRECTORY_ENTRY_LEN]
            entry_tag = entry[0:3]
            entry_length = int(entry[3:7])
            entry_offset = int(entry[7:12])
            data_start = base_address + entry_offset
            # the final byte of each field is dropped by the "- 1"
            entry_data = marc[data_start:data_start + entry_length - 1]

            # assume controlfields are numeric; replicates ruby-marc behavior
            if entry_tag < "010" and entry_tag.isdigit():
                if to_unicode:
                    field = Field(tag=entry_tag,
                                  data=entry_data.decode(encoding))
                else:
                    field = RawField(tag=entry_tag, data=entry_data)
            else:
                subfields = []
                subs = entry_data.split(subfield_delimiter)

                # The MARC spec requires there to be two indicators in a
                # field. However experience in the wild has shown that
                # indicators are sometimes missing, and sometimes there
                # are too many. Rather than throwing an exception because
                # we can't find what we want and rejecting the field, or
                # barfing on the whole record we'll try to use what we can
                # find. This means missing indicators will be recorded as
                # blank spaces, and any more than 2 are dropped on the floor.
                indicators = subs[0].decode("ascii")
                if len(indicators) == 0:
                    logging.warning("missing indicators: %s", entry_data)
                elif len(indicators) == 1:
                    logging.warning("only 1 indicator found: %s", entry_data)
                elif len(indicators) > 2:
                    logging.warning("more than 2 indicators found: %s",
                                    entry_data)
                # pad with blanks to two characters, then drop any extras
                first_indicator, second_indicator = indicators.ljust(2)[:2]

                for subfield in subs[1:]:
                    if len(subfield) == 0:
                        continue
                    skip_bytes = 1
                    try:
                        code = subfield[0:1].decode("ascii")
                    except UnicodeDecodeError:
                        # non-ASCII subfield code: warn and let the helper
                        # decide the code and how many bytes it occupies
                        warnings.warn(BadSubfieldCodeWarning())
                        code, skip_bytes = normalize_subfield_code(subfield)
                    data = subfield[skip_bytes:]

                    if to_unicode:
                        if use_utf8:
                            data = data.decode("utf-8", utf8_handling)
                        elif encoding == "iso8859-1":
                            data = marc8_to_unicode(data, hide_utf8_warnings)
                        else:
                            data = data.decode(encoding)
                    subfields.extend((code, data))

                field_class = Field if to_unicode else RawField
                field = field_class(
                    tag=entry_tag,
                    indicators=[first_indicator, second_indicator],
                    subfields=subfields,
                )
            self.add_field(field)
Ejemplo n.º 2
0
    def decode_marc(self,
                    marc,
                    to_unicode=True,
                    force_utf8=False,
                    hide_utf8_warnings=False,
                    utf8_handling='ignore'):
        """Populate this object from ``marc``, a record in transmission format.

        This variant works on text: the tag is compared against the str
        ``'010'`` and subfield data is ``.encode()``d, so ``marc`` must be a
        ``str``, not ``bytes``.

        Args:
            marc: the record as a string.
            to_unicode: when true, subfield data is converted and fields are
                stored as ``Field`` objects; otherwise data is kept as-is in
                ``RawField`` objects.
            force_utf8: treat subfield data as UTF-8 even when leader
                position 9 is not ``'a'``. The instance attribute
                ``self.force_utf8`` has the same effect.
            hide_utf8_warnings: passed through to ``marc8_to_unicode``.
            utf8_handling: error handler given to ``bytes.decode`` for UTF-8
                subfield data.

        Returns:
            This object (``self``), after the fields have been added.

        Raises:
            RecordLeaderInvalid: the leader is shorter than ``LEADER_LEN``.
            BaseAddressNotFound: the base address in the leader is <= 0.
            BaseAddressInvalid: the base address lies beyond the record.
            RecordDirectoryInvalid: the directory length is not a multiple of
                ``DIRECTORY_ENTRY_LEN``.
            NoFieldsFound: the directory contains no entries.
        """
        self.leader = marc[0:LEADER_LEN]
        if len(self.leader) != LEADER_LEN:
            raise RecordLeaderInvalid

        # Decide once whether the record is UTF-8. The instance flag used to
        # feed only an unused local, so it never influenced decoding; it is
        # now honoured alongside the argument.
        use_utf8 = self.leader[9] == 'a' or force_utf8 or self.force_utf8

        # offset at which the field data starts
        base_address = int(marc[12:17])
        if base_address <= 0:
            raise BaseAddressNotFound
        if base_address >= len(marc):
            raise BaseAddressInvalid

        # the directory sits between the leader and the data; its last
        # character (at base_address - 1) is excluded
        directory = marc[LEADER_LEN:base_address - 1]

        if len(directory) % DIRECTORY_ENTRY_LEN != 0:
            raise RecordDirectoryInvalid
        # floor division keeps the count an int (the remainder is known to be 0)
        field_total = len(directory) // DIRECTORY_ENTRY_LEN
        if field_total == 0:
            raise NoFieldsFound

        for entry_start in range(0, len(directory), DIRECTORY_ENTRY_LEN):
            entry = directory[entry_start:entry_start + DIRECTORY_ENTRY_LEN]
            entry_tag = entry[0:3]
            entry_length = int(entry[3:7])
            entry_offset = int(entry[7:12])
            data_start = base_address + entry_offset
            # the final character of each field is dropped by the "- 1"
            entry_data = marc[data_start:data_start + entry_length - 1]

            # numeric tags below 010 are treated as control fields
            if entry_tag < '010' and entry_tag.isdigit():
                if to_unicode:
                    field = Field(tag=entry_tag, data=entry_data)
                else:
                    field = RawField(tag=entry_tag, data=entry_data)
            else:
                subfields = []
                subs = entry_data.split(SUBFIELD_INDICATOR)

                # Tolerate malformed indicators: warn, pad missing ones with
                # blanks and ignore anything beyond the second.
                indicators = subs[0]
                if len(indicators) == 0:
                    logging.warning("missing indicators: %s", entry_data)
                elif len(indicators) == 1:
                    logging.warning("only 1 indicator found: %s", entry_data)
                elif len(indicators) > 2:
                    logging.warning("more than 2 indicators found: %s",
                                    entry_data)
                first_indicator, second_indicator = indicators.ljust(2)[:2]

                for subfield in subs[1:]:
                    if len(subfield) == 0:
                        continue
                    code = subfield[0:1]
                    data = subfield[1:]
                    if to_unicode:
                        if use_utf8:
                            # round-trip through bytes so utf8_handling is
                            # applied on decode
                            data = data.encode().decode('utf-8', utf8_handling)
                        else:
                            data = marc8_to_unicode(data, hide_utf8_warnings)
                    subfields.extend((code, data))

                field_class = Field if to_unicode else RawField
                field = field_class(
                    tag=entry_tag,
                    indicators=[first_indicator, second_indicator],
                    subfields=subfields)
            self.add_field(field)

        return self