Example #1
0
def html_compare(file1, file2, outfile, include_missing=False, brief=True):
    """Compare 2 files of MARC records and write HTML diff.

    Records are walked in lockstep, matched by their 001 control field:
    when the current record from ``file2`` carries a different 001 than the
    record from ``file1``, the ``file1`` record is treated as missing.

    Parameters:
    file1 -- path of the reference MARC file
    file2 -- path of the MARC file being compared against it
    outfile -- path of the HTML report to write
    include_missing -- also report records absent from file2
    brief -- context-only diff tables; skip full dumps of missing records
    """
    differ = difflib.HtmlDiff(wrapcolumn=85)
    with open(file1,
              "rb") as f1, open(file2,
                                "rb") as f2, open(outfile,
                                                  "w",
                                                  encoding="utf-8") as out:
        out.write(FILE_START)
        r1 = pymarc.MARCReader(f1)
        r2 = pymarc.MARCReader(f2)

        # fix: an empty file2 used to raise an uncaught StopIteration here;
        # None marks "no record available".
        rec2 = next(r2, None)
        for rec1 in r1:
            rec_id = rec1["001"].data

            if rec2 is None or rec2["001"].data != rec_id:
                # rec1 has no counterpart in file2.
                if include_missing:
                    out.write(f"<strong>record {rec_id} missing</strong>")
                    if not brief:
                        out.write(
                            differ.make_table(str(rec1).splitlines(), [""]))
                    out.write("<hr />")
            else:
                out.write(
                    differ.make_table(str(rec1).splitlines(),
                                      str(rec2).splitlines(),
                                      context=brief))
                # fix: once file2 is exhausted, remaining file1 records are
                # reported as missing instead of being re-compared against
                # the stale last record.
                rec2 = next(r2, None)
        out.write(FILE_END)
Example #2
0
 def setUp(self):
     """Build fixtures: a reader over the sample file and a minimal record."""
     # NOTE(review): the file handle is never closed explicitly — confirm
     # a tearDown (or reader lifetime) takes care of it.
     self.reader = pymarc.MARCReader(open("test/test.dat", "rb"))
     self._record = pymarc.Record()
     # One field with tag 245, indicators "1"/"0", and subfields $a and $c.
     field = pymarc.Field(tag="245",
                          indicators=["1", "0"],
                          subfields=["a", "Python", "c", "Guido"])
     self._record.add_field(field)
Example #3
0
def iso2tables(master, entry_filename, file_format, rec_format, id_traitement):
    """Split an ISO 2709 export into single-record temp files and extract
    metadata from each record.

    Parameters:
    master -- UI root handed through to test_encoding_file for error popups
    entry_filename -- path of the input file
    file_format -- 1 means UTF-8 input, anything else ISO-8859-1
    rec_format -- record format forwarded to record2listemetas
    id_traitement -- processing id forwarded to record2listemetas

    Side effects: writes and removes "temp_record.txt"; updates the
    module-level ``stats`` counter.
    """
    encoding = "utf-8" if file_format == 1 else "iso-8859-1"
    (test_file, input_file) = test_encoding_file(master, entry_filename,
                                                 encoding, file_format)
    assert test_file

    # Re-append the record terminator (GS, U+001D) stripped by the split.
    temp_list = [el + '\u001D' for el in input_file]
    outputfilename = "temp_record.txt"  # hoisted: constant across iterations
    i = 0
    for rec in temp_list:
        i += 1
        # fix: context manager guarantees the temp file is flushed and
        # closed before it is re-opened for reading below.
        with open(outputfilename, "w", encoding="utf-8") as outputfile:
            outputfile.write(rec)
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            if file_format == 1:
                collection.force_utf8 = True
            (test, record) = detect_errors_encoding_iso(collection)
            if test:
                record2listemetas(id_traitement, record, rec_format)
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError as err:
        print(err)
    stats["Nombre total de notices traitées"] = i
Example #4
0
def iso2tables(master, entry_filename, rec_format, id_traitement):
    """Split a UTF-8 ISO 2709 export on the record terminator and extract
    metadata from each record, collecting the ids of records that fail to
    decode.

    Side effects: writes and removes "temp_record.txt"; appends problem
    record ids to the module-level ``liste_notices_pb_encodage``.
    """
    # fix: the input handle used to leak; split on GS (U+001D) and drop
    # the empty trailing chunk after the final terminator.
    with open(entry_filename, 'r', encoding="utf-8") as src:
        input_file = src.read().split('\u001D')[0:-1]
    temp_list = [el + '\u001D' for el in input_file]
    outputfilename = "temp_record.txt"
    for rec in temp_list:
        with open(outputfilename, "w", encoding="utf-8") as outputfile:
            outputfile.write(rec)
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            collection.force_utf8 = True
            # fix: `record` was unbound (NameError) in the handlers when
            # the very first record of a chunk failed to parse.
            record = None
            try:
                for record in collection:
                    print(record2meta(record, ["001"]))
                    record2listemetas(record, rec_format)
            # fix: the two handlers were identical copy-paste; the trailing
            # `pass` statements were dead code.
            except (mc.exceptions.RecordLengthInvalid, UnicodeDecodeError):
                if record is not None:
                    NumNot = record2meta(record, ["001"])
                    liste_notices_pb_encodage.append(NumNot)
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError:
        main.popup_errors(master, main.errors["format_fichier_en_entree"])
Example #5
0
def marc_recs_received(marcfile, hostenv):
    """Determines how many MARC records are to be delivered and
       update marc_records table in ETD db.

    Parameters:
    marcfile -- path to the binary MARC file delivered from ProQuest
    hostenv -- host environment forwarded to update_marc_table

    Returns the number of records read (0 if the file could not be parsed).
    """
    count = 0
    try:
        # fix: close the file via a context manager (the handle used to leak)
        with open(marcfile, 'rb') as fh:
            reader = pymarc.MARCReader(fh)
            for record in reader:
                count += 1
                # only records delivered from Proquest include an 020 field
                if record['020'] is not None:
                    isbn = record['020']['a']
                    fields856 = [mfield['u']
                                 for mfield in record.get_fields('856')]
                    # NOTE(review): assumes at least two 856$u links per
                    # record — an IndexError escapes otherwise; confirm.
                    # this updates the table for MARC records rec'd from PQ;
                    # the table is also update in create_marc_xml for MARC
                    # records generated from PQ XML metadata.
                    update_marc_table(isbn, fields856[0], fields856[1],
                                      hostenv)
    #pylint: disable=maybe-no-member
    except pymarc.exceptions.PymarcException as err:
        # fix: Exception.message does not exist on Python 3; log err itself
        logging.exception("ERROR opening MARC records %s: %s", marcfile, err)
    return count
Example #6
0
def break_up_record( start_record=0, end_record=0 ):
    """ Copies a slice of a big marc file into a smaller one.
        Records are streamed from settings.INPUT_FILEPATH; those whose
        ordinal position lies in [start_record, end_record] are written to
        settings.OUTPUT_FILEPATH. Handles the errant `rec_19.mrc` file. """
    log.debug( 'start_record, `{st}`; end_record, `{en}`'.format( st=start_record, en=end_record ) )
    src_path = settings.INPUT_FILEPATH
    dst_path = settings.OUTPUT_FILEPATH
    log.debug( 'processing file, ``{}```'.format(src_path) )
    log.debug( 'output file, ``{}```'.format(dst_path) )

    started = datetime.datetime.now()
    processed = 0

    with open( src_path, 'rb' ) as source_handle:
        # this flag combination is the one that copes with the bad bytes
        marc_reader = pymarc.MARCReader( source_handle, to_unicode=True, utf8_handling='ignore' )

        with open( dst_path, 'wb' ) as destination_handle:
            marc_writer = pymarc.MARCWriter( destination_handle )

            for marc_record in marc_reader:
                processed += 1
                if processed % 10000 == 0:
                    print( '`{}` records processed'.format(processed) )
                if processed >= start_record:
                    marc_writer.write( marc_record )
                    if processed >= end_record:
                        break

    finished = datetime.datetime.now()
    log.debug( 'records processed, `{}`'.format(processed) )
    log.debug( 'time_taken, `{}`'.format(finished-started) )
Example #7
0
def getMRCHeader(marc_file):
    """Build a sorted list of header labels describing every tag /
    indicator / subfield combination present in *marc_file*.

    Control fields (tag < 10) and heading fields contribute the bare tag;
    other fields contribute "TAG_I1I2$SUB" entries, with blank indicators
    rendered as '#' and indicators wildcarded to '*' for tags listed in
    fieldsNonFile1 / fieldsNonFile2.
    """
    out = set()
    marcrange = ["%03d" % i for i in range(999)]
    # Candidate subfield codes a-z plus 0-9, hoisted out of the loops.
    subfrange = list(string.ascii_lowercase) + list(string.digits)
    # fix: close the file via a context manager (the handle used to leak)
    with open(marc_file, 'rb') as fh:
        for record in pymarc.MARCReader(fh):
            out.add('leader')
            # fix: the inner subfield loop used to shadow this loop's
            # variable (`num`); renamed for clarity.
            for tag in marcrange:
                for field in record.get_fields(tag):
                    if int(field.tag) < 10:
                        out.add(field.tag)
                        continue
                    if field.tag in fieldsNonFile1:
                        ind1_norm = field.indicator1
                        ind2_norm = '*'
                    elif field.tag in fieldsNonFile2:
                        ind1_norm = '*'
                        ind2_norm = field.indicator2
                    else:
                        ind1_norm = field.indicator1.replace(' ', '#')
                        ind2_norm = field.indicator2.replace(' ', '#')
                    if field.tag in fieldsHeadings:
                        out.add(field.tag)
                    else:
                        for subfield in subfrange:
                            # fix: the add() was outside this check, which
                            # re-added a stale tag_concat (or raised
                            # NameError) for subfields not present.
                            if field.get_subfields(subfield):
                                out.add(field.tag + '_' + ind1_norm +
                                        ind2_norm + '$' + subfield)
    return sorted(out)
Example #8
0
def test_reader():
    """Smoke test: parse up to 100 RUSMARC records and dump each as XML."""
    # fix: `file()` is a deprecated builtin and text mode can corrupt
    # binary MARC data; open in 'rb' via a context manager instead.
    with open('rusmarc_ebsco.mrc', 'rb') as fh:
        reader = pymarc.MARCReader(fh,
                                   to_unicode=True,
                                   encoding='utf-8')
        for i, record in enumerate(reader):
            if i == 100:
                break
            # parenthesized form works on both Python 2 and 3
            print(record_to_xml(record))
Example #9
0
 def setUp(self):
     """Create the shared reader fixture and a one-field sample record."""
     self.reader = pymarc.MARCReader(open('test/test.dat', 'rb'))
     sample_record = pymarc.Record()
     title_field = pymarc.Field(tag='245',
                                indicators=['1', '0'],
                                subfields=['a', 'Python', 'c', 'Guido'])
     sample_record.add_field(title_field)
     self._record = sample_record
Example #10
0
 def disabled_test_codecs(self):
     """Disabled: feed MARCReader a codecs text stream instead of bytes.

     Presumably disabled because MARCReader expects a binary file object
     — confirm before re-enabling.
     """
     import codecs
     with codecs.open('test/test.dat', encoding='utf-8') as fh:
         reader = pymarc.MARCReader(fh)
         record = next(reader)
         self.assertEqual(record['245']['a'],
                          u'ActivePerl with ASP and ADO /')
def generateMARCXML(MARC21input, MARCXMLoutput):
    """Write MARCXML file for given MARC21 binary file.

    Parameters:
    MARC21input -- path of the binary MARC21 source file
    MARCXMLoutput -- path of the MARCXML file to create
    """
    # fix: the writer used to be re-opened in 'wb' mode (truncating the
    # output) for every record, so only the last record survived; open
    # both files once and close the writer after the loop.
    with open(MARC21input, 'rb') as marc_in:
        reader = pymarc.MARCReader(marc_in)
        writer = pymarc.XMLWriter(open(MARCXMLoutput, 'wb'))
        try:
            for record in reader:
                writer.write(record)
        finally:
            writer.close()
Example #12
0
 def load_training_marc(self, marc_filename, marc_labels):
     """Method loads a training set of MARC records for a Creative Work

     Parameters:
     marc_filename -- Full path to marc filename
     marc_labels -- A list of booleans, True is Good, False is Bad

     Raises WorkClassifierError when the label count does not match the
     record count or when a label is neither True nor False.
     """
     # fix: close the file when done (the handle used to leak) and drop
     # the unused words_re regex
     with open(marc_filename, 'rb') as marc_fh:
         self.records = list(pymarc.MARCReader(marc_fh))
     if len(self.records) != len(marc_labels):
         error_msg = "Number of records {0} must match MARC Labels {1}".format(
             len(self.records), len(marc_labels))
         raise WorkClassifierError(error_msg)
     good_tokens, bad_tokens = [], []
     for count, record in enumerate(self.records):
         tokens = self.__tokenize_marc21__(record)
         if marc_labels[count] is True:
             good_tokens.extend(tokens)
         elif marc_labels[count] is False:
             bad_tokens.extend(tokens)
         else:
             # fix: format index {3} was out of range, masking the real
             # error with an IndexError when this branch was reached
             raise WorkClassifierError(
                 "Unknown value for rec #{0} {1} {2}".format(
                     count, marc_labels[count], tokens))
     self.rb.train('good', ' '.join(good_tokens))
     self.rb.train('bad', ' '.join(bad_tokens))
Example #13
0
 def to_dict(cls, bin_marc):
     """Takes binary marc or marcxml and parses it into a human readable
     dict by first creating a pymarc object which enables easy
     access to the marc's fields and data.  Pymarc doesn't have a
     convenient way to cast itself to a dict (keyed) with
     meaningful values (e.g. title, isbn, etc) so this method
     converts pymarc's Record (pymarc.record.Record) to a human
     readable dict.
     """
     reader = pymarc.MARCReader(bin_marc,
                                hide_utf8_warnings=True,
                                force_utf8=True,
                                utf8_handling='ignore')
     # fix: use the builtin next() — reader.next() is Python-2-only
     record = next(reader)
     keyed_record = MARCRecord(record)
     data = {
         'identifiers': {},
         'authors': [keyed_record.author],
     }
     isbn = record.isbn()
     if isbn:
         # fix: reuse the isbn computed above instead of calling
         # record.isbn() two more times
         data['isbn_%s' % len(isbn)] = [isbn]
     data.update(keyed_record.publisher)
     data.update(keyed_record.title)
     return data
Example #14
0
def iso2tables_old(master, entry_filename, rec_format, id_traitement):
    """Legacy path: read an ISO 2709 file directly and extract metadata
    from every record, reporting encoding problems via console + popup.
    """
    with open(entry_filename, 'rb') as fh:
        collection = mc.MARCReader(fh)
        collection.force_utf8 = True
        try:
            for record in collection:
                record2listemetas(record, rec_format)
        # fix: the two handlers were byte-identical copy-paste; one tuple
        # handler keeps them from drifting apart
        except (mc.exceptions.RecordLengthInvalid, UnicodeDecodeError) as err:
            print("\n\n/*---------------------------------------------*\n\n")
            print(main.errors["pb_input_utf8"])
            print(err)
            print("\n\n*------------------------------------------------*/")
            main.popup_errors(
                master, main.errors["pb_input_utf8_marcEdit"],
                "Aide en ligne : conversion iso2709 > XML",
                "https://github.com/Transition-bibliographique/bibliostratus/wiki/1-%5BBleu%5D-Pr%C3%A9parer-ses-donn%C3%A9es-pour-l'alignement-%C3%A0-partir-d'un-export-catalogue#un-probl%C3%A8me-dencodage--passez-en-xml-avec-marcedit"
            )
def fixRecord(record="",
              record_id=0,
              validation=False,
              replaceMethod='decimal'):
    """Normalize the MARC delimiter characters in a raw record string.

    Parameters:
    record -- the raw MARC record text
    record_id -- id used only in error reporting
    validation -- when True, round-trip the fixed record through
        pymarc.MARCReader and return None (logging it) if it is invalid
    replaceMethod -- which escaped form to replace: 'decimal' ('#29;'
        style), 'unicode' (literal U+001D..1F), or 'hex' (already raw)

    Returns the fixed record string, or None when validation fails.
    """
    replaceMethods = {
        'decimal': (('#29;', '#30;', '#31;'), ("\x1D", "\x1E", "\x1F")),
        'unicode': (('\u001d', '\u001e', '\u001f'), ("\x1D", "\x1E", "\x1F")),
        'hex': (('\x1D', '\x1E', '\x1F'), ("\x1D", "\x1E", "\x1F"))
    }
    # fix: look the method up once (the old double .get() per iteration
    # also turned an unknown method name into an opaque TypeError; a
    # KeyError now names the bad key)
    olds, news = replaceMethods[replaceMethod]
    marcFullRecordFixed = record
    for old, new in zip(olds, news):
        marcFullRecordFixed = marcFullRecordFixed.replace(old, new)
    if validation:
        try:
            reader = pymarc.MARCReader(marcFullRecordFixed.encode('utf8'),
                                       utf8_handling='replace')
            next(reader)  # raises if the record is structurally invalid
        except (RecordLengthInvalid, RecordLeaderInvalid, BaseAddressNotFound,
                BaseAddressInvalid, RecordDirectoryInvalid, NoFieldsFound,
                UnicodeDecodeError) as e:
            eprint("record id {0}:".format(record_id) + str(e))
            with open('invalid_records.txt', 'a') as error:
                eprint(marcFullRecordFixed, file=error)
                return None
    return marcFullRecordFixed
Example #16
0
def break_up_record( start_record=0, end_record=0 ):
    """ Splits big marc file into smaller files.
        This can successfully re-write the whole errant `rec_19.mrc` file.
        Paths come from the PYMARC_EXP__* environment variables; records
        whose ordinal position is in [start_record, end_record] are copied;
        read errors are logged and the bad record skipped. """
    log.debug( 'start_record, `{st}`; end_record, `{en}`'.format( st=start_record, en=end_record ) )
    BIG_MARC_FILEPATH = os.environ['PYMARC_EXP__BIG_MARC_FILEPATH']
    SMALLER_OUTPUT_FILEPATH = os.environ['PYMARC_EXP__SMALLER_OUTPUT_MARC_FILEPATH']
    log.debug( 'processing file, ``{}```'.format(BIG_MARC_FILEPATH) )
    log.debug( 'output file, ``{}```'.format(SMALLER_OUTPUT_FILEPATH) )

    start_time = datetime.datetime.now()
    count = 0
    last_record = 'init'

    with open( BIG_MARC_FILEPATH, 'rb' ) as input_fh:
        # this combination of flags is the one that copes with the errant file
        reader = pymarc.MARCReader( input_fh, to_unicode=True, force_utf8=True, utf8_handling='ignore' )

        with open( SMALLER_OUTPUT_FILEPATH, 'wb' ) as output_fh:
            writer = pymarc.MARCWriter( output_fh )

            processing_flag = True
            while processing_flag is True:
                try:
                    record = next(reader)
                except Exception as e:
                    record = None
                    # fix: `unicode()` does not exist on Python 3 — repr() is enough
                    log.error( 'exception looping through records; ```{}```'.format( repr(e) ) )
                    log.error( 'e info, ```{}```'.format(e) )
                    e_type, e_value, e_traceback = sys.exc_info()  # <http://stackoverflow.com/a/15890953>
                    log.error( 'e_type, ```{}``'.format(e_type) )
                    log.error( 'e_value, ```{}``'.format(e_value) )
                    log.error( 'e_traceback, ```{}```'.format(e_traceback) )
                    log.error( 'traceback info, ```{}```'.format( traceback.format_exc() ) )
                    log.error( 'current count, `{}`'.format(count) )

                last_record = record
                count += 1
                if count % 10000 == 0:
                    print( '`{}` records processed'.format(count) )
                if count >= start_record:
                    log.debug( 'count, `{}`'.format(count) )
                    if record:
                        log.debug( 'count is, `{cnt}`, so will write record.as_json()[0:100], ```{rcd}```'.format( cnt=count, rcd=record.as_json()[0:100] ) )
                        writer.write( record )
                    if count >= end_record:
                        processing_flag = False

    end_time = datetime.datetime.now()
    log.debug( 'records processed, `{}`'.format(count) )
    log.debug( 'time_taken, `{}`'.format(end_time-start_time) )
Example #17
0
def iso2tables(master, entry_filename, file_format, rec_format, id_traitement):
    """Split an ISO 2709 export into single-record temp files and extract
    metadata from each record.

    Parameters:
    master -- UI root used by test_encoding_file for error popups
    entry_filename -- path of the input file
    file_format -- 1 means UTF-8 input, anything else ISO-8859-1
    rec_format -- record format forwarded to record2listemetas
    id_traitement -- processing id forwarded to record2listemetas

    Side effects: writes and removes "temp_record.txt"; updates the
    module-level ``stats`` counter.
    """
    encoding = "utf-8" if file_format == 1 else "iso-8859-1"
    (test_file, input_file) = test_encoding_file(master, entry_filename,
                                                 encoding)
    assert test_file

    # Re-append the record terminator (GS, U+001D) stripped by the split.
    temp_list = [el + '\u001D' for el in input_file]
    outputfilename = "temp_record.txt"  # hoisted: constant across iterations
    i = 0
    for rec in temp_list:
        i += 1
        # fix: context manager guarantees the temp file is flushed and
        # closed before it is re-opened for reading below.
        with open(outputfilename, "w", encoding="utf-8") as outputfile:
            outputfile.write(rec)
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            if file_format == 1:
                collection.force_utf8 = True
            (test, record) = detect_errors_encoding_iso(collection)
            if test:
                record2listemetas(id_traitement, record, rec_format)
    # fix: removed the large blocks of dead commented-out error handling
    # that used to live here.
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError as err:
        print(err)
    stats["Nombre total de notices traitées"] = i
Example #18
0
 def test_regression_45(self):
     """Regression check for edsu/pymarc issue #45: 752 subfields parse."""
     # https://github.com/edsu/pymarc/issues/45
     with open("test/regression45.dat", "rb") as fh:
         record = next(pymarc.MARCReader(fh))
         field = record["752"]
         expected = {"a": "Russian Federation",
                     "b": "Kostroma Oblast",
                     "d": "Kostroma"}
         for code, want in expected.items():
             self.assertEqual(field[code], want)
Example #19
0
    def disabled_test_codecs(self):
        """Disabled: feed MARCReader a codecs text stream instead of bytes.

        Presumably disabled because MARCReader expects a binary file
        object — confirm before re-enabling.
        """
        import codecs

        with codecs.open("test/test.dat", encoding="utf-8") as fh:
            reader = pymarc.MARCReader(fh)
            record = next(reader)
            self.assertEqual(record["245"]["a"],
                             "ActivePerl with ASP and ADO /")
Example #20
0
def main():
    """Print type/field information for each record in hello_marc.dat."""
    # fix: close the file via a context manager (the handle used to leak)
    with open('hello_marc.dat', 'rb') as marc_file:
        reader = pymarc.MARCReader(marc_file, force_utf8=True)
        for record in reader:
            print(type(record))
            print(type(record.fields))
            print(record.leader)
def mrc_to_mrk(path_in, path_out):
    """Convert a binary MARC (.mrc) file to mnemonic text (.mrk) format.

    Parameters:
    path_in -- path of the binary .mrc source file
    path_out -- path of the text .mrk file to write
    """
    # fix: close the input handle via a context manager (it used to leak)
    # and guarantee the writer is closed even if a record fails to write
    with open(path_in, 'rb') as fh:
        reader = pymarc.MARCReader(fh,
                                   to_unicode=True,
                                   force_utf8=True)
        writer = pymarc.TextWriter(io.open(path_out, 'wt', encoding="UTF-8"))
        try:
            for record in reader:
                writer.write(record)
        finally:
            writer.close()
Example #22
0
 def test_regression_45(self):
     """Regression check for pymarc issue #45: 752 $a/$b/$d must parse."""
     # https://github.com/edsu/pymarc/issues/45
     with open('test/regression45.dat', 'rb') as handle:
         marc_reader = pymarc.MARCReader(handle)
         rec = next(marc_reader)
         self.assertEqual(rec['752']['a'], 'Russian Federation')
         self.assertEqual(rec['752']['b'], 'Kostroma Oblast')
         self.assertEqual(rec['752']['d'], 'Kostroma')
Example #23
0
 def __init__(self, file_name, exceptions=False):
     """Open *file_name* and build the underlying MARC reader.

     Parameters:
     file_name -- path of the binary MARC file to read
     exceptions -- stored on self.exceptions for later use by callers
     """
     # fix: self.error used to be assigned None and then immediately
     # overwritten with '' — keep the single meaningful assignment
     self.error = ''
     # NOTE(review): the file handle is owned by the reader and never
     # explicitly closed — confirm the object's lifetime makes that okay
     self._marc_reader = pymarc.MARCReader(open(file_name, 'rb'),
                                           to_unicode=True,
                                           force_utf8=True,
                                           utf8_handling='ignore')
     self.exceptions = exceptions
Example #24
0
def write_csv(jangle_feed, csv_file_handle, ils=None):
    """
    Convert a MARC dump file to a CSV file.

    Python 2 only (``except Exception, e`` syntax, urlparse, StringIO).
    Parses *jangle_feed* as an Atom feed, base64-decodes each entry's
    content into binary MARC, and writes one CSV row per record to
    *csv_file_handle* via get_record/get_row.

    NOTE(review): ``continue_processing`` is never changed from "true",
    so the while loop appears to re-process the same entries forever —
    presumably pagination/fetch-next logic was intended here; confirm.
    NOTE(review): the outer ``try`` below has no matching except/finally
    in this excerpt — the function looks truncated; confirm upstream.
    """
    # This doctest commented out until field names are stable.
    #>>> write_csv('test/marc.dat', 'test/records.csv')
    #>>> csv_records = open('test/records.csv').read()
    #>>> csv_measure = open('test/measure.csv').read()
    #>>> csv_records == csv_measure
    #True
    #>>> os.remove('test/records.csv')
    import elementtree.ElementTree as ET
    import urlparse
    import StringIO
    feed = ET.fromstring(jangle_feed.read())
    continue_processing = "true"
    # Header row maps each field name to itself.
    fieldname_dict = {}
    for fieldname in FIELDNAMES:
        fieldname_dict[fieldname] = fieldname
    #for record in reader
    atom_ns = "http://www.w3.org/2005/Atom"
    count = 0
    try:
        writer = csv.DictWriter(csv_file_handle, FIELDNAMES)
        writer.writerow(fieldname_dict)
        content_record = ''
        while continue_processing == "true":
            for entry in feed.findall("./{%s}entry" % atom_ns):
                # Build a record id from the entry's URL path.
                try:
                    id = 'djo'+urlparse.urlparse(entry.find("{%s}id" % atom_ns).text).path.replace("/",":")
                except Exception, e: 
                    print(str(e))                    
                content = entry.find("{%s}content" % atom_ns)
                # Entry content is base64-encoded binary MARC.
                content_record = base64.b64decode(content.text)
                try:
                    reader = pymarc.MARCReader(StringIO.StringIO(content_record))
                except Exception, e: 
                    print(str(e))                    
                try:    
                    for marc_record in reader:
                        count += 1
                        try:
                            record = get_record(marc_record, id, ils=ils)
                            if record:  # skip when get_record returns None
                                row = get_row(record)
                                writer.writerow(row)
                        except:
                            sys.stderr.write("\nError in MARC record #%s (%s):\n" % (count, 
                             marc_record.title()))
                            raise
                        else:
                            # Progress: a dot per record, the count every 1000th.
                            if count % 1000:
                                sys.stderr.write(".")
                            else:
                                sys.stderr.write(str(count))
                except Exception, e: 
                    print(str(e))
def main(arguments):
    """Print one field (or the whole record) of the first MARC record.

    Parameters:
    arguments -- parsed args carrying .filename and an optional .field

    Returns the first pymarc record read from the file.
    """
    # fix: close the file via a context manager (the handle used to leak)
    with open(arguments.filename, 'rb') as marc_file:
        reader = pymarc.MARCReader(marc_file, force_utf8=True)
        record = next(reader)  # returns a record object
        if arguments.field:
            print(record[arguments.field])
        else:
            print(record)  # print the MARC-text representation of the record
    return record
Example #26
0
 def test_MARCRecord(self):
     """Line-mode MARC converts to binary and exposes an author name."""
     line_marc_path = os.path.join(EXAMPLES_PATH, 'line_marc.txt')
     with open(line_marc_path) as line_marc:
         raw_text = line_marc.read()
     bin_marc = MARC.convert(raw_text)
     reader = pymarc.MARCReader(bin_marc,
                                hide_utf8_warnings=True,
                                force_utf8=True,
                                utf8_handling='ignore')
     first_record = next(reader)
     keyed_record = MARCRecord(first_record)
     self.assertTrue(keyed_record.author.name,
                     "Failed to retrieve author name")
Example #27
0
 def load_training_marc(self, marc_filename):
     """Method loads a training set of MARC records for a Creative Work

     Parameters:
     marc_filename -- Full path to marc filename
     """
     # fix: close the file when done (the handle used to leak) and drop
     # the unused words_re regex
     with open(marc_filename, 'rb') as marc_fh:
         for record in pymarc.MARCReader(marc_fh):
             self.training_data.append(self.__tokenize_marc21__(record))
Example #28
0
def run_dedup_experiment(pp_filepath,
                         md_filepath,
                         cache_datastore=None,
                         search_index=None):
    """Runs experiment for de-duplicating BIBFRAME Person RDF graphs using
    MARC records from two samples representing Pride and Prejudice and
    Moby Dick records.

    Parameters:
    pp_filepath -- path of the Pride and Prejudice MARC sample
    md_filepath -- path of the Moby Dick MARC sample
    cache_datastore -- Redis client; a default StrictRedis is created
        when omitted
    search_index -- Elasticsearch client; a default client is created
        when omitted
    """
    # fix: the defaults used to be constructed at import time and shared
    # across every call; build them lazily per call instead
    if cache_datastore is None:
        cache_datastore = redis.StrictRedis()
    if search_index is None:
        search_index = Elasticsearch()
    # fix: close the sample files via context managers (they used to leak)
    with open(pp_filepath, "br+") as pp_fh:
        pride_prejudice_records = [
            check001(r)
            for r in pymarc.MARCReader(pp_fh, to_unicode=True)
        ]
    with open(md_filepath, "br+") as md_fh:
        moby_dick_records = [
            check001(r)
            for r in pymarc.MARCReader(md_fh, to_unicode=True)
        ]
    for recs in [pride_prejudice_records, moby_dick_records]:
        for record in recs:
            process_record(record, cache_datastore, search_index)
Example #29
0
 def make_record(self, raw_bytes):
     """ Not a test; preps record object. """
     # Decode with errors='ignore' to drop undecodable bytes, then
     # re-encode; 'replace' and 'backslashreplace' generate errors on
     # ```record = next( reader )```
     cleaned_text = raw_bytes.decode('utf-8', errors='ignore')
     cleaned_bytes = cleaned_text.encode('utf-8')
     byte_stream = io.BytesIO(cleaned_bytes)
     marc_reader = pymarc.MARCReader(byte_stream)
     return next(marc_reader)
Example #30
0
def extract_info():
    """ Prints/logs certain record elements.
        The ```utf8_handling='ignore'``` is required to avoid a unicode-error.
        For every record in settings.INPUT_FILEPATH, logs a dict holding the
        title plus bib_id (from field 907, first subfield 'a', truncated to
        9 chars) and item_id (from field 945, subfield code 'y') when present.
        """
    big_marc_filepath = settings.INPUT_FILEPATH
    log.debug( 'processing file, ``{}```'.format(big_marc_filepath) )
    with open( big_marc_filepath, 'rb' ) as fh:
        reader = pymarc.MARCReader( fh, force_utf8=True, utf8_handling='ignore' )  # w/o 'ignore', this line generates a unicode-error
        start = datetime.datetime.now()
        count = 0
        for record in reader:
            record.force_utf8 = True
            record_dct = record.as_dict()
            fields = record_dct['fields']
            ##
            title = record.title()
            ##
            # Fallback values used when the 907/945 lookups below fail.
            bib_id = 'not_available'
            item_id = 'not_available'
            record_dct_logged = False  # log the full record dict at most once
            for field_dct in fields:
                for (k, val_dct) in field_dct.items():
                    if k == '907':
                        try:
                            bib_id = val_dct['subfields'][0]['a'][0:9]
                        except Exception as e:
                            log.debug( 'exception getting bib_id, ``{}```'.format(e) )
                            log.debug( 'record_dct, ```{}```'.format( pprint.pformat(record_dct) ) )
                            record_dct_logged = True
                    if k == '945':
                        try:
                            subfields = val_dct['subfields']
                            for subfield_dct in subfields:
                                for (k2, val2) in subfield_dct.items():
                                    if k2 == 'y':
                                        item_id = val2
                        except Exception as f:
                            log.debug( 'exception getting item_id, ``{}```'.format(f) )
                            if record_dct_logged is False:
                                log.debug( 'record_dct, ```{}```'.format( pprint.pformat(record_dct) ) )
            basic_info = {
                'title': record.title(), 'bib_id': bib_id, 'item_id': item_id }
            # print( 'bas   ic_info, ```{}```'.format( pprint.pformat(basic_info) ) )
            log.info( 'basic_info, ```{}```'.format( pprint.pformat(basic_info) ) )
            try:
                count+=1
                if count % 10000 == 0:
                    print( '`{}` records processed'.format(count) )
                # if count > 100000:
                #     break
            except Exception as e:
                log.debug( 'exception on record ```{rec}```; error, ```{err}```'.format(rec=record, err=e) )
    end = datetime.datetime.now()
    log.info( 'count of records in file, `{}`'.format(count) )
    log.info( 'time_taken, `{}`'.format(end-start) )