Example #1
    def disabled_test_xml_quiet(self):
        """ Tests the 'quiet' parameter of the MARC8ToUnicode class,
            passed in via the pymarc.record_to_xml() method
        """
        outfile = 'test/dummy_stderr.txt'
        # truncate outfile in case someone's fiddled with it
        open(outfile, 'wb').close()
        # redirect stderr
        sys.stderr = open(outfile, 'wb')
        # reload pymarc so it picks up the new sys.stderr
        reload(pymarc)
        # get problematic record
        record = next(
            pymarc.reader.MARCReader(open('test/utf8_errors.dat', 'rb')))
        # record_to_xml() with quiet set to False should generate errors
        #   and write them to sys.stderr
        xml = pymarc.record_to_xml(record, quiet=False)
        # close dummy stderr so we can accurately get its size
        sys.stderr.close()
        # file size should be greater than 0
        self.assertNotEqual(getsize(outfile), 0)

        # truncate file again
        open(outfile, 'wb').close()
        # be sure it's truncated
        self.assertEqual(getsize(outfile), 0)
        # redirect stderr again
        sys.stderr = open(outfile, 'wb')
        reload(pymarc)
        # record_to_xml() with quiet set to True should not generate errors
        xml = pymarc.record_to_xml(record, quiet=True)
        # close dummy stderr
        sys.stderr.close()
        # no errors should have been written
        self.assertEqual(getsize(outfile), 0)
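The reload-and-redirect dance above can be avoided on Python 3 by swapping sys.stderr for an in-memory buffer. A minimal sketch, assuming a pymarc version whose record_to_xml() still accepts the quiet parameter and looks up sys.stderr at call time rather than at import time:

import contextlib
import io

import pymarc

def quiet_suppresses_stderr(record):
    """Return True if record_to_xml(quiet=True) writes nothing to stderr."""
    buf = io.StringIO()
    with contextlib.redirect_stderr(buf):
        pymarc.record_to_xml(record, quiet=True)
    return buf.getvalue() == ''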
Example #3
    def test_xml_namespaces(self):
        """ Tests the 'namespace' parameter of the record_to_xml() method
        """
        # get a test record
        record = pymarc.reader.MARCReader(open('test/test.dat')).next()
        # record_to_xml() with namespace set to False should omit the
        #   MARC21 xmlns declaration from the output
        xml = pymarc.record_to_xml(record, namespace=False)
        # look for the xmlns in the written xml, should be -1
        self.assertEqual(xml.find('xmlns="http://www.loc.gov/MARC21/slim"'), -1)

        # record_to_xml() with namespace set to True should include the xmlns declaration
        xml = pymarc.record_to_xml(record, namespace=True)
        # look for the xmlns in the written xml, should be >= 0
        self.assertNotEqual(xml.find('xmlns="http://www.loc.gov/MARC21/slim"'), -1)
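The namespace flag only toggles the xmlns declaration on the serialized <record> element. A quick sketch of the difference (on Python 3, where record_to_xml() returns bytes):

import pymarc

with open('test/test.dat', 'rb') as fh:
    record = next(pymarc.reader.MARCReader(fh))

bare = pymarc.record_to_xml(record, namespace=False)
qualified = pymarc.record_to_xml(record, namespace=True)
print(b'MARC21/slim' in bare)       # False
print(b'MARC21/slim' in qualified)  # True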
Example #4
 def to_html(self):
     """Return an HTML representation of any MARC records."""
     records = [pymarc.Record(data=r) for r in self.record_data]
     xmllist = [pymarc.record_to_xml(r) for r in records]
     xslt = 'marcxml-to-html.xsl'
     html_list = [self._transform(xml, xslt) for xml in xmllist]
     return "".join(html_list)
Example #5
    def test_xml(self):
        # read in xml to a record
        fh = gzip.open('test/batch.xml.gz','rb')
        record1 = pymarc.parse_xml_to_array(fh)[0]
        # generate xml
        xml = pymarc.record_to_xml(record1)
        # parse generated xml 
        record2 = pymarc.parse_xml_to_array(six.BytesIO(xml))[0]

        # compare original and resulting record
        self.assertEqual(record1.leader, record2.leader)

        field1 = record1.get_fields()
        field2 = record2.get_fields()
        self.assertEqual(len(field1), len(field2))

        pos = 0
        while pos < len(field1):
            self.assertEqual(field1[pos].tag, field2[pos].tag)
            if field1[pos].is_control_field():
                self.assertEqual(field1[pos].data, field2[pos].data)
            else:
                self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
                self.assertEqual(field1[pos].indicators, field2[pos].indicators)
            pos += 1
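The serialize-then-parse step at the top of this test can be factored into a small round-trip helper; a minimal sketch:

from io import BytesIO

import pymarc

def roundtrip(record):
    """Serialize a record to MARCXML and parse it straight back."""
    xml = pymarc.record_to_xml(record)
    return pymarc.parse_xml_to_array(BytesIO(xml))[0]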
Example #6
    def test_xml(self):
        # read in xml to a record
        record1 = pymarc.parse_xml_to_array("test/batch.xml")[0]
        # generate xml
        xml = pymarc.record_to_xml(record1)
        # parse generated xml
        record2 = pymarc.parse_xml_to_array(BytesIO(xml))[0]

        # compare original and resulting record
        self.assertEqual(record1.leader, record2.leader)

        field1 = record1.get_fields()
        field2 = record2.get_fields()
        self.assertEqual(len(field1), len(field2))

        pos = 0
        while pos < len(field1):
            self.assertEqual(field1[pos].tag, field2[pos].tag)
            if field1[pos].is_control_field():
                self.assertEqual(field1[pos].data, field2[pos].data)
            else:
                self.assertEqual(
                    field1[pos].get_subfields(), field2[pos].get_subfields()
                )
                self.assertEqual(field1[pos].indicators, field2[pos].indicators)
            pos += 1
Example #7
    def test_xml_sort(self):
        # read in xml to a record
        record1 = pymarc.parse_xml_to_array('test/order.xml')[0]
        # generate xml
        xml = pymarc.record_to_xml(record1)
        # parse generated xml 
        record1 = pymarc.parse_xml_to_array(StringIO(xml))[0]
        # parse xml in order
        record2 = pymarc.parse_xml_to_array('test/order_ok.xml')[0]

        # compare original and resulting record
        self.assertEqual(record1.leader, record2.leader)

        field1 = record1.get_fields()
        field2 = record2.get_fields()
        self.assertEqual(len(field1), len(field2))

        pos = 0
        while pos < len(field1):
            self.assertEqual(field1[pos].tag, field2[pos].tag)
            if field1[pos].is_control_field():
                self.assertEqual(field1[pos].data, field2[pos].data)
            else:
                self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
                self.assertEqual(field1[pos].indicators, field2[pos].indicators)
            pos += 1
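The field-by-field comparison loop is repeated verbatim in the last three tests; it could be hoisted into one assertion helper. A sketch (the method name is an invention for illustration):

    def assert_same_fields(self, record1, record2):
        """Assert two records carry the same fields in the same order."""
        fields1 = record1.get_fields()
        fields2 = record2.get_fields()
        self.assertEqual(len(fields1), len(fields2))
        for f1, f2 in zip(fields1, fields2):
            self.assertEqual(f1.tag, f2.tag)
            if f1.is_control_field():
                self.assertEqual(f1.data, f2.data)
            else:
                self.assertEqual(f1.get_subfields(), f2.get_subfields())
                self.assertEqual(f1.indicators, f2.indicators)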
Example #8
    def test_xml_namespaces(self):
        """Tests the 'namespace' parameter of the record_to_xml() method."""
        # get a test record
        fh = open("test/test.dat", "rb")
        record = next(pymarc.reader.MARCReader(fh))
        # record_to_xml() with namespace set to False should omit the
        #   MARC21 xmlns declaration from the output
        xml = pymarc.record_to_xml(record, namespace=False)
        # the xmlns declaration should not appear in the written xml
        self.assertFalse(b'xmlns="http://www.loc.gov/MARC21/slim"' in xml)

        # record_to_xml() with namespace set to True should include the xmlns declaration
        xml = pymarc.record_to_xml(record, namespace=True)
        # the xmlns declaration should appear in the written xml
        self.assertTrue(b'xmlns="http://www.loc.gov/MARC21/slim"' in xml)

        fh.close()
Example #9
    def test_xml_namespaces(self):
        """ Tests the 'namespace' parameter of the record_to_xml() method
        """
        # get a test record
        fh = open('test/test.dat', 'rb')
        record = next(pymarc.reader.MARCReader(fh))
        # record_to_xml() with namespace set to False should omit the
        #   MARC21 xmlns declaration from the output
        xml = pymarc.record_to_xml(record, namespace=False)
        # the xmlns declaration should not appear in the written xml
        self.assertNotIn(b'xmlns="http://www.loc.gov/MARC21/slim"', xml)

        # record_to_xml() with namespace set to True should include the xmlns declaration
        xml = pymarc.record_to_xml(record, namespace=True)
        # the xmlns declaration should appear in the written xml
        self.assertIn(b'xmlns="http://www.loc.gov/MARC21/slim"', xml)

        fh.close()
Example #10
 def to_marcxml(self):
     """Return a MARCXML representation of any MARC records."""
     records = [pymarc.Record(data=r) for r in self.record_data]
     xmllist = [pymarc.record_to_xml(r) for r in records]
     xmlstr = "".join(xmllist)
     xmldoc = """<?xml version="1.0" encoding="utf-8"?>
                 <collection xmlns="http://www.loc.gov/MARC21/slim">
                     {0}
                 </collection>""".format(xmlstr)
     return self._transform(xmldoc, 'format-xml.xsl')
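Note that on Python 3 record_to_xml() returns bytes, so the "".join() above only works once each record is decoded. A sketch of the same collection-building step with explicit decoding:

import pymarc

def records_to_collection(records):
    """Wrap serialized records in a single MARCXML <collection> document."""
    body = ''.join(pymarc.record_to_xml(r).decode('utf-8') for r in records)
    return ('<?xml version="1.0" encoding="utf-8"?>'
            '<collection xmlns="http://www.loc.gov/MARC21/slim">'
            + body +
            '</collection>')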
Example #11
def __rec2ld__(rec):
    marc_xml = pymarc.record_to_xml(rec, namespace=True)
    bf_rdf_xml = MARC2BF(lxml.etree.XML(marc_xml),
        baseuri='"http://catalog.coloradocollege.edu/"')
    bf_rdf = rdflib.Graph()
    bf_rdf.parse(data=lxml.etree.tostring(bf_rdf_xml),
        format='xml')
    bibcat.clean_uris(bf_rdf)
    try:
        raw_turtle = bf_rdf.serialize(format='turtle')
    except:
        raw_turtle = None
    return raw_turtle
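The bare except around the Turtle serialization swallows every failure, including programming errors. A narrower sketch that keeps the fall-back-to-None behavior while catching only ordinary exceptions:

def graph_to_turtle(graph):
    """Serialize an rdflib graph to Turtle, returning None on failure."""
    try:
        return graph.serialize(format='turtle')
    except Exception:
        return None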
Example #12
    def test_marc(self):
        record = marc.stub(self.pandata)
        open(TESTDATA_MARCFILENAME, "w+").write(pymarc.record_to_xml(record))
        for field in record.get_fields('650'):

            self.assertEqual(field.get_subfields('a')[0], 'Science fiction')
            break
        for field in record.get_fields('100'):
            self.assertEqual(field.get_subfields('a')[0], 'Piper, H. Beam')
            break
        for field in record.get_fields('700'):
            self.assertEqual(field.get_subfields('4')[0], 'ill')
            break
Example #14
def addLBD(config, oclcnumber, note):
    oauth_session = config.get('oauth-session')
    #create the LBD
    record = Record(leader='00000n   a2200000   4500')
    record.add_field(Field(tag='004', data=oclcnumber))
    record.add_field(
        Field(indicators=[' ', ' '], tag='500', subfields=['a', note]),
        Field(indicators=[' ', ' '],
              tag='935',
              subfields=['a', str(time.time())]),
        Field(indicators=[' ', ' '],
              tag='940',
              subfields=['a', config.get('oclcSymbol')]))
    input = pymarc.record_to_xml(record).decode("utf-8")

    try:
        r = oauth_session.post(
            config.get('metadata_service_url') + "/lbd/data",
            data=input,
            headers={
                "Accept":
                'application/atom+xml;content="application/vnd.oclc.marc21+xml"',
                "Content-Type": "application/vnd.oclc.marc21+xml"
            })
        r.raise_for_status()
        try:
            result = ElementTree.fromstring(r.content)
            ns = {
                'atom': 'http://www.w3.org/2005/Atom',
                'wc': 'http://worldcat.org/rb'
            }
            marcNode = result.findall('atom:content/wc:response',
                                      ns)[0].getchildren()[0]
            marcData = StringIO(
                ElementTree.tostring(marcNode,
                                     encoding='unicode',
                                     method='xml'))
            # need to get this XML section out as a string and into a file like object
            marcRecords = pymarc.parse_xml_to_array(marcData)
            # pull out the LBD accession number
            print(marcRecords)
            accessionNumber = marcRecords[0]['001'].value()
            status = "success"
        except ElementTree.ParseError as err:
            accessionNumber = ""
            status = "failed XML parsing issue"
            print(err)
    except requests.exceptions.HTTPError as err:
        status = "failed"
    return pd.Series([oclcnumber, accessionNumber, status])
Example #15
 def GET(self, query_string):
     try:
         marc = urllib2.urlopen('http://localhost:8080/marc/%s' % query_string).read()
         xml = pymarc.record_to_xml(pymarc.Record(data=marc))    
         if xml != '':
             xml = '<?xml version="1.0" encoding="UTF-8"?><collection xmlns="http://www.loc.gov/MARC21/slim" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">%s</collection>' % xml
             web.header('Content-Type', 'application/xml')
             print xml
         else:
             web.notfound()
     except urllib2.HTTPError, e:
         if e.code == 404:
             web.notfound()
         else:
             raise
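A Python 3 sketch of the same wrapping step, with urllib.request standing in for urllib2 (the URL and collection envelope are taken from the example above):

from urllib.request import urlopen

import pymarc

COLLECTION = ('<?xml version="1.0" encoding="UTF-8"?>'
              '<collection xmlns="http://www.loc.gov/MARC21/slim">%s</collection>')

def fetch_as_marcxml(query_string):
    """Fetch a raw MARC record over HTTP and wrap it as MARCXML."""
    marc = urlopen('http://localhost:8080/marc/%s' % query_string).read()
    xml = pymarc.record_to_xml(pymarc.Record(data=marc)).decode('utf-8')
    return COLLECTION % xml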
Example #16
def update_holding(holding, holding_id, full_holding_string, five_forty_one,
                   mms_id):
    holding.add_field(five_forty_one)
    print("Holding with new field: \n" + str(holding) + "\n\n\n")
    updated_holding = pym.record_to_xml(holding).decode('utf-8')

    full_holding_string = full_holding_string.decode('utf-8')

    full_updated_holding = re.sub(r'<record>(.+)</record>', updated_holding,
                                  full_holding_string)

    print("Updated XML Holding Record: \n" + full_updated_holding + "\n")

    full_updated_holding = full_updated_holding.replace(
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', '')

    response = requests.put(
        "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/" +
        str(mms_id) + "/holdings/" + str(holding_id) +
        "?apikey=l8xx6d87f17c700040dba6a300a2d59e98bc",
        data=full_updated_holding,
        headers=headers)
    time.sleep(2)
    print(response.content)
    if re.search('<errorsExist>true</errorsExist>', response.text):
        print("Couldn't write back to Alma for MMS ID: " + mms_id + "\n")
        error_file.write("Couldn't write back to Alma for MMS ID: " + mms_id +
                         "\n")
        success = False
    else:
        output_file.write("<MMS_ID_" + mms_id + ">" + full_updated_holding +
                          "</MMS_ID_" + mms_id + ">")

        success = True
    return success
Example #17
def writeChangesToRecord(record, correct_headings, changed_headings,
                         heading_type_settings):
    for heading_type in changed_headings:
        if len(correct_headings[heading_type]) > 0 or len(
                changed_headings[heading_type]) > 0:
            print heading_type
            print correct_headings[heading_type]
            print changed_headings[heading_type]
            print heading_type_settings
            record = writeHeadingsToRecord(
                record, correct_headings[heading_type] +
                changed_headings[heading_type],
                heading_type_settings[heading_type]['heading_type'],
                heading_type_settings[heading_type]['bib_fields'])

    if record is not None:
        return record_to_xml(record)
Example #18
total = 0
marcset = MARCReader(marcfile)
for record in marcset:
    # if we've reached the maximum records per transform, close this file
    # and start a new one
    if (recs != 0 and recs % int(config.get('system','max_records_per_transform')) == 0):
        current_batch_f.write(marcxmlfooter)
        current_batch_f.close()
        filecount += 1
        current_batch_f = open('batch_%05d.tmp' % filecount,'w')
        current_batch_f.write(marcxmlheader)
        batchfiles.append('batch_%05d.tmp' % filecount)

    if (filter.passStage(record) and not filter.rejectStage(record)):
        recs += 1
        current_batch_f.write(record_to_xml(record))
        if (output_marcxml_flag == 1):
            marc_xml_f.write(record_to_xml(record))
    total += 1


current_batch_f.write(marcxmlfooter)
current_batch_f.close()

# loop over batch files created, and run them through the XSLT parser
for f in batchfiles:
   print "transforming %s" % f
   utils.apply_stylesheets(f,config)
   olac_rec = utils.getstringfromfile(f,'<olac:olac','</olac:olac>')
   print 'olac_rec = ', olac_rec
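Recent pymarc releases also ship an XMLWriter that emits the collection header and footer itself, which would remove the manual header/footer bookkeeping in this batching loop; a minimal sketch:

import pymarc

def write_marcxml_batch(records, path):
    """Write records to a single MARCXML collection file."""
    writer = pymarc.XMLWriter(open(path, 'wb'))
    for record in records:
        writer.write(record)
    writer.close()  # also emits the closing </collection> tag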
Example #19
	def __init__(self):
		
		setup()
					
		# start by assuming something will go wrong:
		status = CLI.EX_SOMETHING_ELSE
		
		desc = "Adds id.loc.gov URIs to subject and " + \
				"name headings when established forms can be found. Works with EAD or MaRCXML files."
		
		# note: defaults in config file can be overridden by args on commandline
		# argparse...
		epi = """Exit statuses:
		 0 = All good
		 9 = Something unanticipated went wrong
		64 = The command was used incorrectly, e.g., with the wrong number of arguments, a bad flag, a bad syntax in a parameter, or whatever.
		65 = The input data was incorrect in some way.
		66 = Input file (not a system file) did not exist or was not readable.
		70 = An internal software (not OS) error has been detected.
		73 = User specified output file cannot be created.
		74 = An error occurred while doing I/O on some file.
		"""
		
		rHelp = "The input file."
		
		mHelp = "The input file is MaRCXML rather than EAD. Found URIs are put into $0."
	
		oHelp = "Path to the output file. Writes to stdout if no option " + \
			"is supplied."
		
		nHelp = "Try to find URIs for names."
		
		sHelp = "Try to find URIs for subjects."
	
		aHelp = "Annotate the record. When multiple matches are found XML " + \
			"comments containing the matches and their URIs will be added " + \
			"to the record."
			
		vHelp = "Print messages to stdout (one-hit headings) and stderr " + \
			"(zero or more than one hit headings)."
			
		cHelp = "Does just what it says.\n"	
		
		lHelp = "Log alternatives.\n"
		
		cfgHelp = "Specify the config file. Defaults can be overridden. " + \
			"At minimum, run e.g.: python addauths.py myfile.marc.xml"
					
		conf_parser = ArgumentParser(add_help=False, description=desc)
		conf_parser.add_argument("-c", "--conf_file", default=CONFIG, required=False, dest="conf_file", help=cfgHelp)
		args, remaining_argv = conf_parser.parse_known_args()
		defaults = {
			"marc" : False,
			"outpath": None,
			"names" : False,
			"subjects" : False,
			"annotate" : False,
			"verbose" : False,
			"ignore_cache" : False,
			"record": None,
			"log" : False
		}
		# if -c or --conf_file, override the defaults above
		if args.conf_file:
			config = ConfigParser.SafeConfigParser()
			config.read([args.conf_file])
			cfgdict = dict(config.items('Paths')) # Paths section of config file
			booldict = dict(config.items('Booleans')) # Booleans section of config file
			for k,v in booldict.iteritems():
				# need to get the booleans as booleans, not as 'strings'
				boo = config.getboolean('Booleans',k)
				cfgdict[k]=boo
			defaults = cfgdict

		parser = ArgumentParser(parents=[conf_parser],description=desc,formatter_class=RawDescriptionHelpFormatter,epilog=epi)
		parser.set_defaults(**defaults)
		parser.add_argument("-m", "--marc", required=False, dest="mrx", action="store_true", help=mHelp)
		parser.add_argument("-o", "--output", required=False, dest="outpath", help=oHelp)
		parser.add_argument("-n", "--names", required=False, dest="names", action="store_true", help=nHelp)
		parser.add_argument("-s", "--subjects", required=False, dest="subjects", action="store_true", help=sHelp)
		parser.add_argument("-a", "--annotate", required=False, dest="annotate", action="store_true", help=aHelp)
		parser.add_argument("-v", "--verbose", required=False, dest="verbose", action="store_true", help=vHelp)
		parser.add_argument("-C", "--ignore-cache",required=False, dest="ignore_cache", action="store_true", help=cHelp)
		parser.add_argument("-l", "--log",required=False, dest="log", action="store_true", help=lHelp)
		parser.add_argument("-f", "--file",required=True, dest="record", help=rHelp)
		args = parser.parse_args(remaining_argv)

		# TODO args to log (along with batch no.) -pmg		
		print(args)

		#=======================================================================
		# Checks on our args and options. We can exit before we do any work.
		#=======================================================================
		if args.record is None:
			os.sys.stderr.write("No input file supplied. See --help for usage\n")
			exit(CLI.EX_WRONG_USAGE)

		if not os.path.exists(args.record):
			os.sys.stderr.write("File " + args.record + " does not exist\n")
			exit(CLI.EX_NO_INPUT)
	
		if not args.names and not args.subjects:
			msg = "Supply -n and or -s to link headings. Use --help " + \
			"for more details.\n"
			os.sys.stderr.write(msg)
			exit(CLI.EX_WRONG_USAGE)
			
		if args.mrx == True:
			marc_path = args.record
			# a quick and dirty test...
			reader = pymarc.marcxml.parse_xml_to_array(marc_path)
			if not reader:
				msg = "-m flag used but input file isn't MaRCXML.\n"
				os.sys.stderr.write(msg)
				exit(CLI.EX_WRONG_USAGE)
	
		if args.outpath:
			outdir = os.path.dirname(args.outpath)
			if not os.path.exists(outdir):
				msg = "Directory " + outdir + " does not exist\n"
				os.sys.stderr.write(msg)
				exit(CLI.EX_CANT_CREATE)
			if not os.access(outdir, os.W_OK):
				msg = "Output directory " + outdir + " not writable\n"
				os.sys.stderr.write(msg) 
				exit(CLI.EX_CANT_CREATE)

		#=======================================================================
		# The work...
		#=======================================================================
		shelf = shelve.open(SHELF_FILE, protocol=pickle.HIGHEST_PROTOCOL)
		ctxt = None
		mrx_subs = []
		h = ""
		mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
		try:
			reader = pymarc.marcxml.parse_xml_to_array(args.record)
			#writer = codecs.open("test_out.marc.xml", 'w', 'utf-8')
			options = {'annotate':args.annotate, 'verbose':args.verbose, 'mrx':args.mrx, 'log':args.log, 'ignore_cache':args.ignore_cache}
			fh = open(OUTDIR+'tmp.xml', 'wb+')
			fh.write(mrxheader)
			for rec in reader:
				f001 = rec.get_fields('001')
				for b in f001:
					bbid = b.value()
				if args.names:
					#=======================
					# NAMES
					#=======================
					# get names data from these subfields
					namesubf = ['a','c','d','q']
					names = ['100','110','130','700','710','730']
					for n in rec.get_fields(*names):
						for s in n.get_subfields(*namesubf):
							s = s.encode('utf8')
							mrx_subs.append(s)
						h = "--".join(mrx_subs)
						tag = n.tag
						_update_headings(bbid, 'nam', h, n, shelf, tag, **options)
						mrx_subs = []
				if args.subjects:
					#=======================
					# SUBJECTS
					#=======================
					# get subjects data from these subfields (all but 0,2,3,6,8)
					subs = ['600','610','611','630','650','651']
					subsubf = ['a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 
					'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '4'] 
					for f in rec.get_fields(*subs):
						for s in f.get_subfields(*subsubf):
							s = s.encode('utf8')
							mrx_subs.append(s)
						h = "--".join(mrx_subs)
						tag = f.tag
						_update_headings(bbid, 'sub', h, f, shelf, tag, **options)
						mrx_subs = []
				out = "%s" % (pymarc.record_to_xml(rec))
				fh.write(out)
			fh.write("</collection>")
			fh.close()
			
			if args.outpath == None:
				os.sys.stdout.write(out)

			# if we got here...
			status = CLI.EX_OK

		#=======================================================================
		# Problems while doing "the work" are handled w/ Exceptions
		#=======================================================================
		except libxml2.parserError, e: # TODO: pymarc exceptions
			os.sys.stderr.write(str(e.message) + "\n")
			status = CLI.EX_DATA_ERR
Example #20
    def ingest(self, record, workspace=None):
        """Method runs entire tool-chain to ingest a single MARC record into
        Datastore.

        Args:
            record (pymarc.Record): MARC21 record
            workspace (str): Fedora4 workspace to ingest records into

        Returns:
            list: MongoID
        """
        self.graph_ids = {}
        # MARC record must have a 001 for BIBFRAME xquery to function properly
        if not record['001']:
            unique_id = uuid.uuid1()
            field001 = pymarc.Field('001')
            field001.data = str(unique_id).split("-")[0]
            record.add_field(field001)
        marc_xml = pymarc.record_to_xml(record, namespace=True)
        marc_uri = rdflib.URIRef(self.__process_marc__(record))
        derived_from = rdflib.URIRef('http://bibframe.org/vocab/derivedFrom')
        bibframe_graph = self.__xquery_chain__(marc_xml)
        for subject, obj in bibframe_graph.subject_objects(
                predicate=derived_from):
            bibframe_graph.set((subject, derived_from, marc_uri))
        all_graphs = self.__decompose_bf_graph__(bibframe_graph, workspace)
        for graph in all_graphs:
            self.__process_bibframe__(graph)


        bibframe_graph.close()
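The 001 guard at the top of ingest() stands on its own as a pattern: the BIBFRAME conversion needs a control number, so one is synthesized when missing. The same step as a standalone helper (the helper name is invented for illustration):

import uuid

import pymarc

def ensure_001(record):
    """Add a synthetic 001 control field when a record lacks one."""
    if not record['001']:
        record.add_field(
            pymarc.Field(tag='001', data=str(uuid.uuid1()).split('-')[0]))
    return record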
Example #21
    def load_bib(self, record):
        title = None

        # we must have an lccn, but it's not an error if we don't find one
        lccn_orig = _extract(record, "010", "a")
        lccn = _normal_lccn(lccn_orig)

        if not lccn:
            # LOGGER.info("###### LCCN in OCLC pull, \
            #              but not in database. Missing LCCN. ######")
            # LOGGER.info(record)
            self.missing_lccns += 1
            return

        # newer marc xml sets pulled from OCLC do not have the 005 control
        # field. 005 is the date and time of the last transaction.
        try:
            s = _extract(record, "005")
            parts = s.split(".")
            dt = datetime.datetime(*strptime(parts[0], "%Y%m%d%H%M%S")[0:6])
        except AttributeError:
            dt = datetime.datetime.now()

        # dt.replace(microsecond=int(parts[1]))

        # it's remotely possible that a title with the LCCN already exists
        try:
            title = models.Title.objects.get(lccn=lccn)
            LOGGER.debug("Found another record for lccn: %s", lccn)
            if title.version == dt:
                LOGGER.debug("    with the same timestamp: %s", title.version)
                return  # skip over this record with same timestamp
            elif title.version < dt:
                LOGGER.debug("    with newer timestamp: %s vs %s", title.version, dt)
                title.version = dt
                self.records_updated += 1
            elif title.version > dt:
                LOGGER.debug("    with older timestamp: %s vs %s", title.version, dt)
                return  # skip over older record
            else:
                raise ValueError("It should not be possible to have a version fail <, =, and > checks")
        except models.Title.DoesNotExist:
            self.records_created += 1
            title = models.Title(lccn=lccn)
            title.version = dt

        # clear m2m relationships
        # these will come from the extraction
        title.subjects.clear()
        title.languages.clear()
        title.places.clear()

        # delete fk relationships
        # these will come from the extraction
        title.publication_dates.all().delete()
        title.notes.all().delete()
        title.alt_titles.all().delete()
        title.succeeding_title_links.all().delete()
        title.preceeding_title_links.all().delete()
        title.related_title_links.all().delete()
        title.urls.all().delete()

        # update title fields
        self._set_name(record, title)

        title.lccn_orig = lccn_orig
        title.oclc = self._extract_oclc(record)
        title.edition = _extract(record, "250", "a")
        title.publisher = _extract(record, "260", "b")
        title.frequency = _extract(record, "310", "a")
        title.frequency_date = _extract(record, "310", "b")
        title.uri = _extract(record, "856", "u")

        # rda records use 265$a, fallback to 260$a
        title.place_of_publication = _extract(record, "264", "a")
        if not title.place_of_publication:
            title.place_of_publication = _extract(record, "260", "a")

        # rda records use 338$a, fallback to 245$h
        title.medium = _extract(record, "338", "a")
        if not title.medium:
            title.medium = _extract(record, "245", "h")

        title.issn = _extract(record, "022", "a")
        f008 = record["008"].data

        title.start_year = _normal_year(f008[7:11])
        title.end_year = _normal_year(f008[11:15])
        # check to make sure start and end years are not blank
        if not title.start_year:
            LOGGER.error("lccn %s title has blank start year! Defaulting to 0", title.lccn)
            title.start_year = "0"
        if not title.end_year:
            LOGGER.error("lccn %s title has blank end year! Defaulting to 9999", title.lccn)
            title.end_year = "9999"

        title.country = self._extract_country(record)
        title.save()

        # update fk relationships with new values
        self._extract_languages(record, title)
        self._extract_places(record, title)
        self._extract_publication_dates(record, title)
        self._extract_subjects(record, title)
        self._extract_notes(record, title)
        self._extract_preceeding_titles(record, title)
        self._extract_succeeding_titles(record, title)
        self._extract_related_titles(record, title)
        self._extract_alt_titles(record, title)
        self._extract_urls(record, title)
        title.save()

        marc, marc_created = models.MARC.objects.get_or_create(title=title)
        marc.xml = record_to_xml(record)
        marc.save()

        # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
        if _is_chronam_electronic_resource(title, record):
            LOGGER.info("deleting title record for chronam electronic resource: %s", title)
            title.delete()

        # this is for long running processes so the query cache
        # doesn't bloat memory
        reset_queries()

        return title
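The _extract() helper used throughout load_bib() is defined elsewhere in the loader. A plausible sketch of its contract — returning None for missing fields, which is what the AttributeError handling around the 005 field relies on; the implementation details are assumptions:

def _extract(record, tag, subfield=None):
    """Return the first value for tag (and optional subfield), or None."""
    field = record[tag]
    if field is None:
        return None
    if subfield is None:
        return field.value()
    values = field.get_subfields(subfield)
    return values[0] if values else None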
Example #22
def convert_marc_to_xml(hostenv):
    """Proquest delivers MARC-formatted files to CDL on behalf of campuses,
       generally 6-8 weeks after ETD was delivered. We transform these using
       campus XSLT customizations, adding both eScholarship and Proquest links
       in 856 fields. Proquest files have file extension '.UNX'"""
    marc_tmpfile = os.path.join(app_configs[hostenv]['tmp_dir'],
                                'marctmpfile.xml')
    xmlmarcnamespace = '<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' \
                        'xsi:schemaLocation="http://www.loc.gov/MARC21/slim ' \
                        'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">\n'
    xmlcloser = '</collection>'
    for marcfilename in os.listdir(app_configs[hostenv]['marc_dir']):
        campuscode = None
        # find *.UNX or .unx file in /marc subdirectory
        if marcfilename[-3:].upper() == 'UNX':
            # determine campus abbr
            campusabbr = re.search(r'UC\s(.*?)\sMARC.*?', marcfilename)
            if campusabbr is not None:
                campuscode = constants.PQ_CAMPUS_NAMES.get(campusabbr.group(1))
            # convert to XML
            marcpathname = os.path.join(app_configs[hostenv]['marc_dir'],
                                        marcfilename)
            try:
                reader = pymarc.MARCReader(open(marcpathname, 'rb'),
                                           to_unicode=True)


            # pylint: disable=maybe-no-member
            except pymarc.exceptions.PymarcException as err:
                logging.exception("ERROR opening PQ MARC file %s: %s",
                                  marcpathname, err.message)
            writer = codecs.open(marc_tmpfile, 'w', 'utf-8')
            writer.write(constants.XML_PROLOG)
            writer.write(xmlmarcnamespace)
            for record in reader:
                record.leader = record.leader[:9] + 'a' + record.leader[10:]
                writer.write(
                    pymarc.record_to_xml(record, namespace=False) + "\n")
            writer.write(xmlcloser)
            writer.close()
            # need to add namespaces using XSLT
            marc_tmpfilestr = open(marc_tmpfile, 'r')
            marc_tmpfileread = marc_tmpfilestr.read()
            namespace_xmlstr = xml_saxon_transform(marc_tmpfileread,
                                                   constants.NAMESPACE_XSLT)
            # test if all ISBNs are available
            test_str = xml_saxon_transform(namespace_xmlstr,
                                           constants.TEST_XSLT)
            # convert using campus customizations using XSLT
            if "ERROR" not in test_str:
                if campuscode is not None:
                    campus_stylesheet = os.path.join(
                        app_configs[hostenv]['xsl_dir'],
                        campus_configs[campuscode]['pqmarcxslt'])
                    campus_xml_str = xml_saxon_transform(
                        namespace_xmlstr, campus_stylesheet)
                    outfilename = campuscode + time.strftime(
                        "%Y%m%d") + 'PQ-orig.xml'
                    outfullpath = os.path.join(
                        app_configs[hostenv]['marc_dir'], outfilename)
                    campus_xml_file = codecs.open(outfullpath, 'wb')
                    campus_xml_file.write(campus_xml_str)
                    campus_xml_file.close()
                else:
                    logging.error("ERROR: campus code not found %s",
                                  marcfilename)
            else:
                logging.error("ERROR: UNX file %s not converted; missing %s",
                              marcfilename, test_str)
Example #23
        response = requests.get('http://id.loc.gov/authorities/names/label/%s' % subject)
        link = response.url
        pattern = re.compile(r'(.+)\.html$')
        value = pattern.findall(link)[0].encode('utf-8')
        return ('http://id.loc.gov/authorities/names', value)        
    except:
        return (False, False)

with codecs.open('G://Metadata Projects/sheet_music/sheet_music.bib', 'rb') as fh:
    regexpNS = 'http://exslt.org/regular-expressions'
    modsNS = 'http://www.loc.gov/mods/v3'
    nameDict = defaultdict(int)
    reader = pymarc.MARCReader(fh, to_unicode=True)
    for record in reader:
        root = etree.XML(pymarc.record_to_xml(record, namespace=True))
        xslt_root = etree.parse(open('G:/Metadata Projects/sheet_music/sheet.xsl','r'), parser)
        transform = etree.XSLT(xslt_root)
        root = transform(root)

        title = root.xpath('/mods:mods/mods:titleInfo[not(@*)]/mods:title', namespaces={'mods': modsNS})[0].text
        bib = root.xpath('/mods:mods/mods:recordInfo/mods:recordIdentifier', namespaces={'mods': modsNS})[0].text
        print "----------------------------------------\nTransformation started for '%s.'\n" % title

        try:

            ## Get name URIs

            print "... Retrieving URIs for names and titles ...\n"

            pattern = re.compile(r"\s[A-Za-z]{2,}\.$")
Example #24
import codecs
import pymarc

input = 'records_in.mrc'
output = 'records_out.xml'

reader = pymarc.MARCReader(open(input, 'rb'), to_unicode=True)
writer = codecs.open(output, 'w', 'utf-8')
for record in reader:
    record.leader = record.leader[:9] + 'a' + record.leader[10:]
    writer.write(pymarc.record_to_xml(record).decode('utf-8') + '\n')
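The leader tweak forces position 09 (the character coding scheme byte) to 'a', marking the record as UCS/Unicode so it matches the UTF-8 XML being written. The same loop as a reusable generator, a sketch:

import pymarc

def marc_file_to_xml_records(path):
    """Yield a MARCXML string for each record in a binary MARC file."""
    reader = pymarc.MARCReader(open(path, 'rb'), to_unicode=True)
    for record in reader:
        # leader/09 = 'a' marks the record as UCS/Unicode
        record.leader = record.leader[:9] + 'a' + record.leader[10:]
        yield pymarc.record_to_xml(record).decode('utf-8')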
Example #25
    def load_bib(self, record):
        title = None

        # we must have an lccn, but it's not an error if we don't find one
        lccn_orig = _extract(record, '010', 'a')
        lccn = _normal_lccn(lccn_orig)

        if not lccn:
            #_logger.info("###### LCCN in OCLC pull, \
            #              but not in database. Missing LCCN. ######")
            #_logger.info(record)
            self.missing_lccns += 1
            return

        # newer marc xml sets pulled from OCLC do not have the 005 control
        # field. 005 is the date and time of the last transaction.
        try:
            s = _extract(record, '005')
            parts = s.split(".")
            dt = datetime.datetime(*strptime(parts[0], '%Y%m%d%H%M%S')[0:6])
        except AttributeError:
            dt = datetime.datetime.now()

        #dt.replace(microsecond=int(parts[1]))

        # it's remotely possible that a title with the LCCN already exists
        try:
            title = models.Title.objects.get(lccn=lccn)
            _logger.debug("Found another record for lccn: %s" % lccn)
            if title.version == dt:
                _logger.debug("    with the same timestamp: %s" % title.version)
                return  # skip over this record with same timestamp
            elif title.version < dt:
                _logger.debug("    with newer timestamp: %s vs %s" % (title.version, dt))
                title.version = dt
                self.records_updated += 1
            elif title.version > dt:
                _logger.debug("    with older timestamp: %s vs %s" % (title.version, dt))
                return  # skip over older record
            else:
                _logger.error("Logic error... this should be unreachable.")
        except models.Title.DoesNotExist:
            self.records_created += 1
            title = models.Title(lccn=lccn)
            title.version = dt

        # clear m2m relationships
        # these will come from the extraction
        title.subjects.clear()
        title.languages.clear()
        title.places.clear()

        #TODO: Add a check to the title load that deletes all m2m in subjects and
        # places, to keep from lonely m2m records hanging out

        # delete fk relationships
        # these will come from the extraction
        title.publication_dates.all().delete()
        title.notes.all().delete()
        title.alt_titles.all().delete()
        title.succeeding_title_links.all().delete()
        title.preceeding_title_links.all().delete()
        title.related_title_links.all().delete()
        title.urls.all().delete()

        # update title fields
        self._set_name(record, title)

        title.lccn_orig = lccn_orig
        title.oclc = self._extract_oclc(record)
        title.edition = _extract(record, '250', 'a')
        title.place_of_publication = _extract(record, '260', 'a')
        title.publisher = _extract(record, '260', 'b')
        title.frequency = _extract(record, '310', 'a')
        title.frequency_date = _extract(record, '310', 'b')
        # the main purpose of this it to look for records
        # with 245 $h[microform] or [microfilm]
        # but we save everything
        title.medium = _extract(record, '245', 'h')
        title.issn = _extract(record, '022', 'a')
        f008 = record['008'].data
        title.start_year = _normal_year(f008[7:11])
        title.end_year = _normal_year(f008[11:15])
        title.country = self._extract_country(record)
        title.save()

        # update fk relationships with new values
        self._extract_languages(record, title)
        self._extract_places(record, title)
        self._extract_publication_dates(record, title)
        self._extract_subjects(record, title)
        self._extract_notes(record, title)
        self._extract_preceeding_titles(record, title)
        self._extract_succeeding_titles(record, title)
        self._extract_related_titles(record, title)
        self._extract_alt_titles(record, title)
        self._extract_urls(record, title)
        title.save()

        marc, marc_created = models.MARC.objects.get_or_create(title=title)
        marc.xml = record_to_xml(record)
        marc.save()

        # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
        if _is_chronam_electronic_resource(title, record):
            _logger.info("deleting title record for chronam electronic resource: %s" % title)
            title.delete()

        # this is for long running processes so the query cache
        # doesn't bloat memory
        reset_queries()

        return title
Example #26
from pymarc import Record, Field, record_to_xml
import MySQLdb
record=Record()
print dir(record)
dbLo=MySQLdb.connect("localhost","shailesh","123","shailesh")
dbKoha=MySQLdb.connect("localhost","root","","koha1")
curaKoha=dbKoha.cursor()
curaLocl = dbLo.cursor()
curaLocl.execute("select BookNo,BookName,authorName from book_info group by BookNo;")
dat=curaLocl.fetchall()
curaLocl.execute("select accession,BookNo,callNo from book_info;")
datIte=curaLocl.fetchall()
for i in dat:
		record=Record()
		record.add_field(Field(tag='040',indicators=['0','1'],subfields=['c','LIBRARY OF CONGRESS']))
		record.add_field(Field(tag='245',indicators=['0','1'],subfields=['a',i[1]]))	
		record.add_field(Field(tag='942',indicators=['0','1'],subfields=['2','book of parag','c','BOOK']))
		record.add_field(Field(tag='100',indicators=['0','1'],subfields=['a',i[2]]))
		record.add_field(Field(tag='999',indicators=['0','1'],subfields=['c','8','d','8']))
		marcI=record_to_xml(record)
		#print i[0],i[1],i[2]
		curaKoha.execute("insert into biblio(biblionumber,title,author) values(%s,%s,%s);",(i[0],i[1],i[2]))
		curaKoha.execute("insert into biblioitems(biblionumber,biblioitemnumber,marcxml) values(%s,%s,%s);",(i[0],i[0],marcI))

for i in datIte:
		barcode='1111'+str(i[0])
		curaKoha.execute("insert into items(itemnumber,biblionumber,biblioitemnumber,barcode,itemcallnumber) values(%s,%s,%s,%s,%s);",(i[0],i[1],i[1],barcode,i[2]))

dbKoha.commit()
dbKoha.close()		
Example #27
def convert2bibframe(record):
    return xquery_socket(pymarc.record_to_xml(record, namespace=True))
Example #28
def read_mrx(mrcrec,names,subjects):
	'''
	Read through a given MARCXML file and optionally copy it, inserting $0 as appropriate
	'''
	enhanced = [] # will be True or False, from check_heading()
	recs = [] # will be a pymarc object or None, from check_heading()
	mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
	<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
	try:
		reader = pymarc.marcxml.parse_xml_to_array(INDIR+mrcrec)
		
		for rec in reader:
			f001 = rec.get_fields('001')
			f035 = rec.get_fields('035')
			try:
				ocn = rec['035']['a']
			except:
				ocn = None
			for b in f001:
				bbid = b.value()
			if names:
				en,r = check_heading(bbid,rec,'nam')
				enhanced.append(en)
				if en == True:
					recs.append(r)
			if subjects: # if just searching subjects, or if a rec only has subjects, no names
				en,r = check_heading(bbid,rec,'sub')
				enhanced.append(en) 	
				if en == True and r not in recs:
					recs.append(r)
		if nomarc == False:
		outfile = mrcrec.replace('.xml','')
			fh = open(TMPDIR+outfile+'_tmp.xml', 'wb+') 
			fh.write(mrxheader)
			for record in recs:
				if record is not None:
					try:
						out = "%s" % (pymarc.record_to_xml(record))
						fh.write(out)
					except Exception as e:
						raise						
		if nomarc == False and ((enhanced_only == True and (True in enhanced)) or (enhanced_only == False)):
			fh.write("</collection>")
			fh.close()
			
	except AttributeError as e:
		return
	except:	
		if names:
			scheme = 'nam'
		elif subjects:
			scheme = 'sub'
		etype,evalue,etraceback = sys.exc_info()
		flag = "read_mrx problem: %s %s %s line %s" % (etype,evalue,etraceback,etraceback.tb_lineno)
		print(flag)

		if csvout or nomarc: # idea here is to report something out even when mrx has issues
			write_csv(bbid,flag,'',scheme,'','')

	if not nomarc and ((enhanced_only == True and (True in enhanced)) or (enhanced_only == False)):
		try:
			subprocess.Popen(['xmllint','--format','-o', OUTDIR+outfile, TMPDIR+outfile+'_tmp.xml']).wait()
			mrx2mrc(OUTDIR+outfile)
		except:
			etype,evalue,etraceback = sys.exc_info()
			print("xmllint problem: %s" % evalue)

	if (keep == False):
		os.remove(INDIR+mrcrec)
Example #29
    # write OAI header
    # TODO: handle exceptions for template variable substitution
    # is this exception handling working???
    if (count == 0): # first iteration
        oaiheader = Template(utils.file2string(config.get('system','oai_header_file')))
        uservars['sample_id'] = 'oai:' + \
        uservars['repository_id'] + ':' +record['001'].value()
        try:
            olac_xml_f.write(oaiheader.substitute(uservars))
        except KeyError:
            pass


    # construct a proper marcxml document
    xmlrec = record_to_xml(record) 
    if (output_marcxml_flag == 1):
        marc_xml_f.write(xmlrec)

    xmlrec = marcxmlheader  + xmlrec + marcxmlfooter 

    # write out xml rec to a temp file
    xml_input_f = open(xml_input,'w')
    xml_input_f.write(xmlrec)
    xml_input_f.close()

    # apply stylesheets
    print "start stylesheet"
    utils.apply_stylesheets(xml_input,config)
    print "end stylesheet"
Example #30
import utils

# get params from command line
try:
    input = sys.argv.pop(1)
    output = sys.argv.pop(1)
except:
    print "you need two arguments: input_file output_file"
    sys.exit(2)

f = open(output,'w')
ctr = 0
marcset = pymarc.MARCReader(open(input))
f.write('<?xml version="1.0" encoding="UTF-8" ?>\n<collection xmlns="http://www.loc.gov/MARC21/slim">')
for rec in marcset:
    xmlrec = pymarc.record_to_xml(rec)
    #xmlrec = libxml2.parseDoc(xmlrec)
    if (rec['695'] and (rec['695'].value().lower().find('language') != -1 or rec['695'].value().lower().find('music') != -1)):
        print rec['695']
        f.write(xmlrec + '\n')
        ctr += 1
    if ctr % 500 == 0: print "writing %sth record..." % ctr
    #if ctr == 100: break
f.write('</collection>')
f.close()

#print "formatting xml..."
#xmlrec = libxml2.parseDoc(utils.file2string(output))
#f = open(output,'w')
#f.write(xmlrec.serialize(None,1))
#f.write(xmlrec.serialize(None,2))
Example #31
	sleep(1)
	
	
if __name__ == "__main__":
	mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
	fh = open('out/owi_tmp.xml', 'w+')
	fh.write(mrxheader)
	reader = pymarc.marcxml.parse_xml_to_array(infile)
	for rec in reader:
		for n in rec.get_fields('035'):
			for s in n.get_subfields('a'):
				if 'OCoLC' in s:
					num = s.replace('(OCoLC)','')
					workid = check_shelf(str(num))
		if workid != None and workid != '':
			field = pymarc.Field(
				tag = '787', 
				indicators = ['0',' '],
				subfields = [
					'o', str(workid)
				])
			rec.add_field(field)
		workid = ""
		out = "%s" % (pymarc.record_to_xml(rec))
		fh.write(out)
	fh.write("</collection>")
	fh.close()
	# format output for readability
	subprocess.Popen(['xmllint','--format','-o', outfile, 'out/owi_tmp.xml']).wait()
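The flat subfields=['o', str(workid)] list is the pre-5.0 pymarc API; pymarc 5.x expects Subfield named tuples instead. A sketch of the same 787 field under that assumption:

from pymarc import Field, Subfield

field = Field(
    tag='787',
    indicators=['0', ' '],
    subfields=[Subfield(code='o', value=str(workid))],
)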
Example #32
    filename_xml = 'examplerecord_%s.xml' % i 
    filename_out = 'examplerecord_%s.out' % i 

    records = MARCReader(open(filename),
                         to_unicode=True,
                         force_utf8=True,
                         utf8_handling='ignore')

    writer_dat = MARCWriter(file(filename_out,'a'))
    writer_xml = open(filename_xml,'a')

    for marc in records:

        isbn_list = marc.get_fields('020')
        try:
            isbn_field = isbn_list[0]
        except Exception, e:
            j = i - 10
            marc.add_ordered_field(
                Field(
                    tag='020',
                    indicators=[' ', ' '],
                    subfields = ['a', isbns[j]]
                    ))

        writer_dat.write(marc)
        writer_xml.write(record_to_xml(marc) + "\n")

    writer_dat.close()
    writer_xml.close()
Example #33
    def load_bib(self, record):
        title = None

        # we must have an lccn, but it's not an error if we don't find one
        lccn_orig = _extract(record, '010', 'a')
        lccn = _normal_lccn(lccn_orig)

        if not lccn:
            #_logger.info("###### LCCN in OCLC pull, \
            #              but not in database. Missing LCCN. ######")
            #_logger.info(record)
            self.missing_lccns += 1
            return

        # newer marc xml sets pulled from OCLC do not have the 005 control
        # field. 005 is the date and time of the last transaction.
        try:
            s = _extract(record, '005')
            parts = s.split(".")
            dt = datetime.datetime(*strptime(parts[0], '%Y%m%d%H%M%S')[0:6])
        except AttributeError:
            dt = datetime.datetime.now()

        #dt.replace(microsecond=int(parts[1]))

        # it's remotely possible that a title with the LCCN already exists
        try:
            title = models.Title.objects.get(lccn=lccn)
            _logger.debug("Found another record for lccn: %s" % lccn)
            if title.version == dt:
                _logger.debug("    with the same timestamp: %s" %
                              title.version)
                return  # skip over this record with same timestamp
            elif title.version < dt:
                _logger.debug("    with newer timestamp: %s vs %s" %
                              (title.version, dt))
                title.version = dt
                self.records_updated += 1
            elif title.version > dt:
                _logger.debug("    with older timestamp: %s vs %s" %
                              (title.version, dt))
                return  # skip over older record
            else:
                _logger.error("Logic error... this should be unreachable.")
        except models.Title.DoesNotExist:
            self.records_created += 1
            title = models.Title(lccn=lccn)
            title.version = dt

        # clear m2m relationships
        # these will come from the extraction
        title.subjects.clear()
        title.languages.clear()
        title.places.clear()

        # delete fk relationships
        # these will come from the extraction
        title.publication_dates.all().delete()
        title.notes.all().delete()
        title.alt_titles.all().delete()
        title.succeeding_title_links.all().delete()
        title.preceeding_title_links.all().delete()
        title.related_title_links.all().delete()
        title.urls.all().delete()

        # update title fields
        self._set_name(record, title)

        title.lccn_orig = lccn_orig
        title.oclc = self._extract_oclc(record)
        title.edition = _extract(record, '250', 'a')
        title.publisher = _extract(record, '260', 'b')
        title.frequency = _extract(record, '310', 'a')
        title.frequency_date = _extract(record, '310', 'b')
        title.uri = _extract(record, '856', 'u')

        # rda records use 265$a, fallback to 260$a
        title.place_of_publication = _extract(record, '264', 'a')
        if not title.place_of_publication:
            title.place_of_publication = _extract(record, '260', 'a')

        # rda records use 338$a, fallback to 245$h
        title.medium = _extract(record, '338', 'a')
        if not title.medium:
            title.medium = _extract(record, '245', 'h')

        title.issn = _extract(record, '022', 'a')
        f008 = record['008'].data
        title.start_year = _normal_year(f008[7:11])
        title.end_year = _normal_year(f008[11:15])
        title.country = self._extract_country(record)
        title.save()

        # update fk relationships with new values
        self._extract_languages(record, title)
        self._extract_places(record, title)
        self._extract_publication_dates(record, title)
        self._extract_subjects(record, title)
        self._extract_notes(record, title)
        self._extract_preceeding_titles(record, title)
        self._extract_succeeding_titles(record, title)
        self._extract_related_titles(record, title)
        self._extract_alt_titles(record, title)
        self._extract_urls(record, title)
        title.save()

        marc, marc_created = models.MARC.objects.get_or_create(title=title)
        marc.xml = record_to_xml(record)
        marc.save()

        # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
        if _is_chronam_electronic_resource(title, record):
            _logger.info(
                "deleting title record for chronam electronic resource: %s" %
                title)
            title.delete()

        # this is for long running processes so the query cache
        # doesn't bloat memory
        reset_queries()

        return title
Example #35
def process_rec(rec, type):
	rec_orig = deepcopy(rec)
	dup_num = False
	no_880_rec = False
	missing_key_880_rec = False
	unlinked_880_rec = False
	indiv_rec_analysis_msg = ''			# string variable to collect individual analysis messages for each record
	
	rec_003_value = rec.get_fields('003')[0].value()	# either 'OCLC' or the partner's institution code from the 003 field
	rec_001_value = rec.get_fields('001')[0].value()	# either the OCLC number or the inst_id from the 001 field
	if type=='oclc':
		################################################
		# Check for duplicate OCLC record in batch
		if rec_001_value in aco_globals.oclc_nums_processed:
			dup_num = True
		else:
			aco_globals.oclc_nums_processed.add(rec_001_value)
		
	if type=='orig' or not dup_num:
		################################################
		# Add institutional ID and OCLC number to 999 field
		rec_orig, rec, oclc_id, inst_id, oclc_match, msg_1 = process_001_003_fields(rec_orig, rec, aco_globals.oclc_nums_bsns_all)
		indiv_rec_analysis_msg += msg_1
		if not oclc_match:
			aco_globals.recs_no_oclc_match_count += 1
		
		################################################
		# Check if record is missing any 880 script fields
		script_rec, missing_key_880s, msg_2 = check_for_missing_880s(rec)
		indiv_rec_analysis_msg += msg_2
		if not script_rec:
			no_880_rec = True
		else:
			aco_globals.recs_880s_count += 1
		
		if missing_key_880s:
			missing_key_880_rec = True
		
		################################################
		# Check if record has any unlinked 880 fields (having "00" in the 880 $6 numbering)
		unlinked_exist, msg_3 = check_for_unlinked_880s(rec)
		indiv_rec_analysis_msg += msg_3
		if unlinked_exist:
			unlinked_880_rec = True
		
		################################################
		# Check if record has any untraced 490 fields without corresponding 8XX fields
		msg_4 = check_series_hdgs(rec)
		indiv_rec_analysis_msg += msg_4
		
		################################################
		# Check if record contains RDA fields
		rda_rec, msg_5 = check_if_rda(rec)
		indiv_rec_analysis_msg += msg_5
		if rda_rec:
			aco_globals.recs_rda_count += 1	
		
		################################################
		# Check if record contains the Unicode replacement character (a question mark inside a black diamond),
		# which indicates badly encoded script data; it appears in Python source as u"\uFFFD"
		# (see: http://www.fileformat.info/info/unicode/char/0fffd/index.htm)
		repl_char, msg_6 = check_repl_char(rec)
		indiv_rec_analysis_msg += msg_6
		if repl_char:
			aco_globals.recs_repl_char_count += 1
		
		################################################
		# Add/Delete/Modify MARC fields in print record to convert to an e-resource record
		rec, msg_7 = convert_2_eres_rec(rec, rda_rec)
		indiv_rec_analysis_msg += msg_7
		
		################################################
		# Sort any $6 subfields that do not appear first in the field
		rec = sort_6_subs(rec)
		
		rec, msg_8 = second_sort_6_check(rec)
		indiv_rec_analysis_msg += msg_8
				
		################################################
		# Match the 001/003 fields and insert the corresponding URL handle in an 856 field
		rec, msg_9 = insert_url(rec, aco_globals.handles_lines)
		indiv_rec_analysis_msg += msg_9
		
		################################################
		# Match the BSNs and insert the corresponding SE (source entity) book IDs into the 999 field
		rec, msg_10 = insert_src_entities(rec, aco_globals.bsn_se_lines)
		indiv_rec_analysis_msg += msg_10
		
		################################################
		# Change LDR values
		ldr = list(rec.leader)
		ldr[5] = 'n'	# record status: new
		ldr[6] = 'a'	# type of record: language material
		ldr[7] = 'm'	# bibliographic level: monograph
		#ldr[9] = 'a'
		rec.leader = ''.join(ldr)
		
		################################################
		# Remove any existing 999 $e subfields and Add new 999 $e subfield with error type codes
		# --  NOTE: adding the field to the rec_orig (deep copy of rec) seems to also add to rec...??
		add_999e = False
		
		rec_orig_999s = rec_orig.get_fields('999')
		if len(rec_orig_999s) == 0:
			indiv_rec_analysis_msg += 'ERROR-MISC: The 999 field did not get added to the original record during processing\n'
		elif len(rec_orig_999s) > 1:
			indiv_rec_analysis_msg += 'ERROR-MISC: Original record contains multiple 999 fields\n'
		elif len(rec_orig_999s) == 1:
			if len(rec_orig.get_fields('999')[0].get_subfields('e')) > 0:
				for rec_orig_999e in rec_orig.get_fields('999')[0].get_subfields('e'):
					rec_orig.get_fields('999')[0].delete_subfield('e')
			add_999e = True
		
# 		rec_999s = rec.get_fields('999')
# 		if len(rec_999s) == 0:
# 			indiv_rec_analysis_msg += 'ERROR-MISC: The 999 field did not get added to the converted record during processing\n'
# 		elif len(rec_999s) > 1:
# 			indiv_rec_analysis_msg += 'ERROR-MISC: Converted record contains multiple 999 fields\n'
# 		elif len(rec_999s) == 1:
# 			if len(rec.get_fields('999')[0].get_subfields('e')) > 0:
# 				for rec_999e in rec.get_fields('999')[0].get_subfields('e'):
# 					rec.get_fields('999')[0].delete_subfield('e')
# 			add_999e = True
		
		if add_999e:
			error_types = ''
			if 'ERROR-880' in indiv_rec_analysis_msg:
				error_types += '(ERROR-880)'
			if 'ERROR-SERIES' in indiv_rec_analysis_msg:
				error_types += '(ERROR-SERIES)'
			if 'ERROR-MISC' in indiv_rec_analysis_msg:
				error_types += '(ERROR-MISC)'
			if not error_types == '':
				rec_orig.get_fields('999')[0].add_subfield('e', error_types)
		
		indiv_rec_analysis_msg += '---------------------------------------------------------------------\n'
		
		################################################
		# Write out ERROR message and MARC records depending on status
		if no_880_rec:
			aco_globals.recs_no_880s_count += 1
			aco_globals.marcRecsOut_no_880s.write(rec_orig)
			aco_globals.recs_no_880s_txt.write(indiv_rec_analysis_msg)
		if missing_key_880_rec:
			aco_globals.recs_missing_key_880s_count += 1
			aco_globals.marcRecsOut_missing_key_880s.write(rec_orig)
			aco_globals.recs_missing_key_880s_txt.write(indiv_rec_analysis_msg)
		if unlinked_880_rec:
			aco_globals.recs_unlinked_880s_count += 1
			aco_globals.marcRecsOut_unlinked_880s.write(rec_orig)
			aco_globals.recs_unlinked_880s_txt.write(indiv_rec_analysis_msg)
		if 'ERROR-SERIES' in indiv_rec_analysis_msg:
			aco_globals.recs_series_errors_count += 1
			aco_globals.marcRecsOut_series_errors.write(rec_orig)
			aco_globals.recs_series_errors_txt.write(indiv_rec_analysis_msg)
		if 'ERROR-MISC' in indiv_rec_analysis_msg:
			aco_globals.recs_misc_errors_count += 1
			aco_globals.marcRecsOut_misc_errors.write(rec_orig)
			aco_globals.recs_misc_errors_txt.write(indiv_rec_analysis_msg)
		
		if 'ERROR' in indiv_rec_analysis_msg:
			aco_globals.recs_errors_all_count += 1
			aco_globals.marcRecsOut_errors_all.write(rec_orig)
			aco_globals.recs_errors_all_txt.write(indiv_rec_analysis_msg)
		else:
			aco_globals.marcRecsOut_final_subset.write(rec)
			aco_globals.recs_final_this_subset_count += 1
		
		aco_globals.marcRecsOut_final_all.write(rec)
				
		################################################
		# Write out individual .mrc record
		try: os.makedirs(aco_globals.batch_folder+'/mrc_out')
		except OSError as exception:
			if exception.errno != errno.EEXIST:
				raise
		indiv_marcRecOut = pymarc.MARCWriter(open(aco_globals.batch_folder+'/mrc_out/'+inst_id+'_mrc.mrc', 'wb'))
		indiv_marcRecOut.write(rec)
		indiv_marcRecOut.close()
		
		################################################
		# Convert MARC to MARCXML and write out individual MARCXML record
		rec_xml = pymarc.record_to_xml(rec, namespace=True)					# creates marcxml but all tags in a single line with no line breaks
		pretty_rec_xml = xml.dom.minidom.parseString(rec_xml)				# parses the single line of xml
		pretty_rec_xml = pretty_rec_xml.toprettyxml(encoding='utf-8')		# creates the correct indentations and line breaks, but adds extra line breaks within content tags
		pretty_xml_re = re.compile(r'>\n\s+([^<>\s].*?)\n\s+</', re.DOTALL)	# regular expression for removing extra line breaks in content
		pretty_rec_xml = pretty_xml_re.sub(r'>\g<1></', pretty_rec_xml)		# apply the regex to remove the extra line breaks in content
		try: os.makedirs(aco_globals.batch_folder+'/marcxml_out')
		except OSError as exception:
			if exception.errno != errno.EEXIST:
				raise
		indiv_marcRecOut_xml = codecs.open(aco_globals.batch_folder+'/marcxml_out/'+inst_id+'_marcxml.xml', 'w')
		indiv_marcRecOut_xml.write(pretty_rec_xml)
		indiv_marcRecOut_xml.close()
				
		aco_globals.indiv_rec_analysis_msgs += indiv_rec_analysis_msg
	
	return dup_num
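The pretty-printing sequence above (record_to_xml, then minidom, then a regex cleanup) is reusable on its own. A minimal, self-contained sketch of just that step, assuming the same pymarc/minidom behavior as in the example:

import re
import xml.dom.minidom

import pymarc

def pretty_marcxml(rec):
    # serialize to a single line of MARCXML, then re-indent it;
    # toprettyxml() adds stray line breaks inside text nodes,
    # which the byte-pattern regex collapses again
    raw = pymarc.record_to_xml(rec, namespace=True)
    pretty = xml.dom.minidom.parseString(raw).toprettyxml(encoding='utf-8')
    return re.sub(br'>\n\s+([^<>\s].*?)\n\s+</', br'>\g<1></', pretty, flags=re.DOTALL)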
Example #36
0
def _xml(record):
    return pymarc.record_to_xml(record)
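Hypothetical usage of the _xml() helper above; the reader setup and file name are assumptions for illustration:

import pymarc

with open('records.mrc', 'rb') as fh:
    for record in pymarc.MARCReader(fh):
        print(_xml(record))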
Example #37
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import pymarc

input = 'records_in.mrc'
output = 'records_out.xml'

reader = pymarc.MARCReader(open(input, 'rb'), to_unicode=True)
writer = codecs.open(output, 'w', 'utf-8')
for record in reader:
    record.leader = record.leader[:9] + 'a' + record.leader[10:]  # set leader/09 to 'a' (Unicode encoding)
    writer.write(pymarc.record_to_xml(record) + "\n")
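An aside, not part of the original script: newer pymarc releases (3.x and later, an assumption about the installed version) also provide an XMLWriter that emits a well-formed <collection> document, which avoids hand-joining records with newlines. A sketch under that assumption:

import pymarc

with open('records_in.mrc', 'rb') as fh:
    reader = pymarc.MARCReader(fh)
    writer = pymarc.XMLWriter(open('records_out.xml', 'wb'))
    for record in reader:
        # leader/09 = 'a' marks the record as Unicode (UCS) encoded
        record.leader = record.leader[:9] + 'a' + record.leader[10:]
        writer.write(record)
    writer.close()  # writes the closing </collection> tag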
"""
Base script for DLF Forum 2014 Listening-Based Python workshop.

Modified from files at https://github.com/LibraryCodeYearIG/MARC-record-edit .
"""

import os
from pymarc import Field, MARCReader, MARCWriter, record_to_xml

records = MARCReader(open('../../exampledump.mrc', 'rb'),
                     to_unicode=True,
                     force_utf8=True,
                     utf8_handling='ignore')

index = 1

for marc in records:
    filename_dat = 'examplerecord_%s.dat' % index
    filename_xml = 'examplerecord_%s.xml' % index

    writer_dat = MARCWriter(open(filename_dat, 'ab'))
    writer_xml = open(filename_xml,'a')

    writer_dat.write(marc)
    writer_xml.write(record_to_xml(marc) + "\n")

    writer_dat.close()
    writer_xml.close()

    index += 1
Example #39
0
def extract(marc_filepath):
    """Takes a MARC21 file, iterates through each MARC record 
    and yields MARC XML""" 
    reader = pymarc.MARCReader(open(marc_filepath, "rb"), to_unicode=True)
    for record in reader:
        yield pymarc.record_to_xml(record, namespace=True)
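A hypothetical consumer of extract(); the file names and the collection wrapper are assumptions for illustration. Since record_to_xml() emits one <record> element per MARC record, a wrapper element is needed to form a single valid multi-record document:

with open('collection.xml', 'w') as out:
    out.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
    for record_xml in extract('records.mrc'):
        # assumes a pymarc version where record_to_xml() returns str
        out.write(record_xml + '\n')
    out.write('</collection>\n')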
Example #40
0
import xml.etree.ElementTree as et
from pymarc import record_to_xml

def convert_mrc_record_to_xml(my_marc):
	root = et.Element("collection")
	xml_record = record_to_xml(my_marc)	# serialize the MARC record to MARCXML
	root.append(et.fromstring(xml_record))	# attach the parsed record under the <collection> root