def load_file(self, location, skip=0):
    location = urlparse.urljoin("file:", location)
    t0 = time()
    times = []

    def load_record(record):
        try:
            # we test to see if it is a record, b/c not
            # all values returned from OCLC are records.
            # if it is not a record, we just want to skip it.
            if record:
                self.records_processed += 1
                if skip > self.records_processed:
                    LOGGER.info("skipped %i" % self.records_processed)
                elif record.leader[5] == 'd':
                    self.delete_bib(record)
                elif record.leader[6] == 'a':
                    self.load_bib(record)
        except Exception as e:
            LOGGER.error("unable to load: %s" % e)
            LOGGER.exception(e)
            self.errors += 1

        seconds = time() - t0
        times.append(seconds)
        if self.records_processed % 1000 == 0:
            LOGGER.info("processed %sk records in %.2f seconds" %
                        (self.records_processed / 1000, seconds))

    request = urllib2.Request(location,
                              headers={'User-Agent': 'chronam-title-loader'})
    map_xml(load_record, urllib2.urlopen(request))
def load_file(self, filename, skip=0):
    t0 = time()
    times = []

    def _process_time():
        seconds = time() - t0
        times.append(seconds)
        if self.records_processed % 1000 == 0:
            LOGGER.info("processed %sk records in %.2f seconds" %
                        (self.records_processed / 1000, seconds))

    def load_xml_record(record):
        try:
            self.records_processed += 1
            if skip > self.records_processed:
                LOGGER.info("skipped %i" % self.records_processed)
                return
            if record.leader[6] == 'y':
                self.load_xml_holding(record)
        except Exception as e:
            LOGGER.error("unable to load record %s: %s" %
                         (self.records_processed, e))
            LOGGER.exception(e)
            self.errors += 1
        _process_time()

    # open() rather than the Python 2-only file() builtin
    map_xml(load_xml_record, open(filename, "rb"))
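Both loaders above rely on the same contract: pymarc.map_xml() parses a MARCXML stream and calls the supplied function once per record, so per-run state (counters, skip logic) lives in a closure or enclosing scope around the callback. A minimal, self-contained sketch of that pattern follows; the input path records.xml is a hypothetical example.

import pymarc

processed = 0

def dispatch(record):
    # map_xml invokes this once per parsed record
    global processed
    processed += 1
    # leader byte 5 is the record status ('d' = deleted),
    # leader byte 6 is the record type ('a' = language material)
    if record.leader[5] == 'd':
        print('would delete record %d' % processed)
    elif record.leader[6] == 'a':
        print('would load record %d' % processed)

pymarc.map_xml(dispatch, 'records.xml')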
def test_map_xml(self):
    self.seen = 0

    def count(record):
        self.seen += 1

    fh = gzip.open('test/batch.xml.gz', 'rb')
    pymarc.map_xml(count, fh)
    self.assertEqual(2, self.seen)
def test_multi_map_xml(self):
    self.seen = 0

    def count(record):
        self.seen += 1

    fh1 = gzip.open('test/batch.xml.gz', 'rb')
    fh2 = gzip.open('test/batch.xml.gz', 'rb')
    pymarc.map_xml(count, fh1, fh2)
    self.assertEqual(4, self.seen)
def test_map_xml(self):
    self.seen = 0

    def count(record):
        self.seen += 1

    pymarc.map_xml(count, 'test/batch.xml')
    self.assertEqual(2, self.seen)
def test_multi_map_xml(self):
    self.seen = 0

    def count(record):
        self.seen += 1

    pymarc.map_xml(count, "test/batch.xml", "test/batch.xml")
    self.assertEqual(4, self.seen)
def marcxml2bioc(marcxmlFilename, biocFilename):
    with open(marcxmlFilename, 'rb') as inF, bioc.BioCXMLDocumentWriter(biocFilename) as writer:

        def marcxml2bioc_helper(record):
            writeMarcXMLRecordToBiocFile(record, writer)

        pymarc.map_xml(marcxml2bioc_helper, inF)
def test_read_utf8(self):
    self.field_count = 0

    def process_xml(record):
        for field in record.get_fields():
            self.field_count += 1

    pymarc.map_xml(process_xml, 'test/utf8.xml')
    self.assertEqual(self.field_count, 8)
def main():
    '''Main method. Magic starts here.'''
    # TODO: counters on everything!
    parser = argparse.ArgumentParser()
    parser.add_argument("records_file", help="path to marc records folder")
    parser.add_argument("result_path", help="path to Instance results file")
    parser.add_argument("okapi_url", help="OKAPI base url")
    parser.add_argument("tenant_id", help="id of the FOLIO tenant")
    parser.add_argument("username", help="the api user")
    parser.add_argument("password", help="the api user's password")
    parser.add_argument("-holdings_id_dict_path", "-ih", help="")
    parser.add_argument("-instance_id_dict_path", "-i", help="")
    parser.add_argument("-postgres_dump", "-p",
                        help=("results will be written out for Postgres "
                              "ingestion. Default is JSON"),
                        action="store_true")
    parser.add_argument("-marcxml", "-x", help="DATA is in MARCXML format",
                        action="store_true")
    parser.add_argument("-validate", "-v",
                        help="Validate JSON data against JSON Schema",
                        action="store_true")
    args = parser.parse_args()

    print('\tresults file:\t', args.result_path)
    print("\tOkapi URL:\t", args.okapi_url)
    print("\tTenant Id:\t", args.tenant_id)
    print("\tUsername: \t", args.username)
    print("\tPassword: \tSecret")
    print("\tinstance idMap will get stored at:\t", args.instance_id_dict_path)
    print("\thold idMap will get stored at:\t", args.holdings_id_dict_path)
    print("File to process: {}".format(args.records_file))

    folio_client = FolioClient(args.okapi_url, args.tenant_id,
                               args.username, args.password)

    instance_id_map = {}
    with open(args.instance_id_dict_path, 'r') as json_file:
        instance_id_map = json.load(json_file)
    print("Number of instances in ID map: {}".format(len(instance_id_map)))

    default_mapper = HoldingsDefaultMapper(folio_client, instance_id_map)
    print("Starting")
    print("Rec./s\t\tTot. recs\t\t")
    with open(args.result_path + '/folio_holdings.json', 'w+') as results_file:
        processor = HoldingsMarcProcessor(default_mapper, folio_client,
                                          results_file, args)
        if args.marcxml:
            pymarc.map_xml(processor.process_record, args.records_file)
        else:
            with open(args.records_file, 'rb') as marc_file:
                pymarc.map_records(processor.process_record, marc_file)

        # wrap up
        print("Done. Wrapping up...")
        processor.wrap_up()
    print("done")
def test_copy_utf8(self):
    writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
    new_record = pymarc.Record(to_unicode=True, force_utf8=True)

    def process_xml(record):
        new_record.leader = record.leader
        for field in record.get_fields():
            new_record.add_field(field)

    pymarc.map_xml(process_xml, 'test/utf8.xml')
    try:
        writer.write(new_record)
        writer.close()
    finally:
        # remove the test output file
        os.remove('test/write-utf8-test.dat')
def main():
    '''parses args pointing to record xml paths, specifies output paths,
    and applies "pull_arabic"'''
    logger = logging.getLogger(__name__)
    logger.info('collecting arabic records and extracting parallel '
                'Arabic/Romanized representations')
    parser = argparse.ArgumentParser()
    parser.add_argument('input_directory',
                        help='path to directory containing records')
    parser.add_argument('-f', '--sub_directory_filter',
                        help='select a particular subdirectory inside a '
                             'complex directory structure')
    parser.add_argument('-n', '--name',
                        help='optional source name, otherwise take directory name')
    args = parser.parse_args()

    if args.name:
        name = args.name
    else:
        name = args.input_directory.split('/')[-1]
    logger.info(f'source: {name}')

    record_paths = get_xml_paths(args.input_directory, args.sub_directory_filter)
    writer = pymarc.XMLWriter(open(f'data/arabic_records/{name}.xml', 'wb'))
    for path in record_paths:
        xmlname = path.split('/')[-1].replace('.xml', '')
        pymarc.map_xml(lambda record: pull_arabic(record, writer=writer), path)
    writer.close()

    global counter008
    global counter880
    logger.info(f'# of Arabic records ("ara" in language field 008): {counter008}')
def main():
    parser = argparse.ArgumentParser(description='Tool to summarize data from PubAg')
    parser.add_argument('--i', type=str, required=True, help="PubAg MarcXML file")
    parser.add_argument('--oTitles', type=str, required=True,
                        help="File containing titles")
    parser.add_argument('--oHasTitlesAndAbstracts', type=str, required=True,
                        help="File containing counts of titles and abstracts")
    parser.add_argument('--oPubYear', type=str, required=True,
                        help="File containing counts of publication years")
    parser.add_argument('--oJournals', type=str, required=True,
                        help="File containing counts of journals")
    args = parser.parse_args()

    fTitles = codecs.open(args.oTitles, 'w', 'utf-8')
    fHasTitlesAndAbstracts = codecs.open(args.oHasTitlesAndAbstracts, 'w', 'utf-8')
    fPubYear = codecs.open(args.oPubYear, 'w', 'utf-8')
    fJournals = codecs.open(args.oJournals, 'w', 'utf-8')

    def summarizeMarcXMLFile_helper(record):
        summarizeMarcXMLFile(record, fTitles, fHasTitlesAndAbstracts,
                             fPubYear, fJournals)

    with open(args.i, 'rb') as inF:
        pymarc.map_xml(summarizeMarcXMLFile_helper, inF)
class TitleLoader(object):

    def __init__(self):
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):
            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)
            except Exception, e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)
            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        if 'https' in location:
            lccn = location[40:location.find('/marc.xml')]
            c = httplib.HTTPSConnection('chroniclingamerica.loc.gov')
            c.request("GET", '/lccn/%s/marc.xml' % lccn)
            xml = c.getresponse()
        else:
            xml = urllib2.urlopen(location)

        map_xml(load_record, xml)
class HoldingLoader:
    """
    A loader for holdings data. Intended to be run after titles have been
    loaded with TitleLoader. This is necessary so that holdings records
    can be attached to the appropriate Title.
    """

    def __init__(self):
        self.records_processed = 0
        self.missing_title = 0
        self.errors = 0
        self.skipped = 0
        self.holding_created = 0
        self.no_oclc = 0
        self.files_processed = 0

    def load_file(self, filename, skip=0):
        t0 = time()
        times = []

        def _process_time():
            seconds = time() - t0
            times.append(seconds)
            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    _logger.info("skipped %i" % self.records_processed)
                    return
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)
            except Exception, e:
                _logger.error("unable to load record %s: %s" %
                              (self.records_processed, e))
                _logger.exception(e)
                self.errors += 1
            _process_time()

        map_xml(load_xml_record, file(filename, "rb"))
class TitleLoader(object):

    def __init__(self):
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):
            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)
            except Exception, e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)
            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        map_xml(load_record, urllib2.urlopen(location))
# Options and arguments
__version__ = '0.0.1'

p = optparse.OptionParser(description='MARCXML Record Counter',
                          usage='usage: %prog [[opts]] [file1] .. [fileN]',
                          version='%prog ' + __version__)
p.add_option('--verbose', '-v', action='store_true',
             help="verbose, show additional informational messages")
(opt, args) = p.parse_args()

seen = 0

def count(record):
    # callback for pymarc.map_xml: tally one per record parsed
    global seen
    seen += 1

# Loop over all files specified counting records in each
total = 0
fmt = "%-7d %s"
for arg in args:
    seen = 0
    fh = 0
    if re.search(r'\.gz$', arg):
        if opt.verbose:
            print "Reading %s as gzipped MARCXML" % (arg)
        fh = gzip.open(arg, 'rb')
    else:
        if opt.verbose:
            print "Reading %s as MARCXML" % (arg)
        fh = open(arg, 'rb')
    pymarc.map_xml(count, fh)
    print fmt % (seen, arg)
    total += seen

if len(args) > 1:
    print fmt % (total, 'TOTAL')
#!/usr/bin/python
import os
import pymarc

path = './'

def get_place_of_pub(record):
    try:
        # place of publication: MARC 21 field 260, subfield $a
        # (264 $a in newer RDA records)
        place_of_pub = record['260']['a']
        print(place_of_pub)
        with open('out.txt', 'a') as f:
            print(place_of_pub, file=f)
    except Exception as e:
        print(e)

for file in os.listdir(path):
    if file.endswith('.xml'):
        pymarc.map_xml(get_place_of_pub, path + file)
def xml_to_mrc(path_in, path_out):
    # map_xml returns None; it streams each record to writer.write as a side effect
    writer = pymarc.MARCWriter(open(path_out, 'wb'))
    pymarc.map_xml(writer.write, path_in)
    writer.close()
values = []

def parse_record(record, field=FIELD, subfield=SUBFIELD):
    value = extract(record, field, subfield)
    if value:
        rec_id = extract(record, '010', 'a')
        if not rec_id:
            rec_id = extract(record, '004')
        values.append((rec_id, value))

if __name__ == '__main__':
    if os.path.isdir(SOURCE):
        marc_xml_dir = os.listdir(SOURCE)
        for xml_file in marc_xml_dir:
            marc_file = os.path.join(SOURCE, xml_file)
            map_xml(parse_record, open(marc_file, 'r'))
    else:
        map_xml(parse_record, open(SOURCE, 'r'))

    # all values
    #for value in values:
    #    print str(value[0]), ',', value[1]

    total = len(values)

    # Get a sample of 50 random values for that field
    for i in range(50):
        try:
            random_value = choice(values)
            values.remove(random_value)
            print ','.join([random_value[0], random_value[1]])
        except IndexError:
            # stop early if fewer than 50 values were collected
            break
def xml_to_mrk(path_in, path_out):
    # map_xml returns None; it streams each record to writer.write as a side effect
    writer = pymarc.TextWriter(io.open(path_out, 'wt', encoding="utf-8"))
    pymarc.map_xml(writer.write, path_in)
    writer.close()
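xml_to_mrc and xml_to_mrk above are mirror images: map_xml() streams records out of MARCXML and hands each one straight to a writer method, so binary (.mrc) and text (.mrk) output differ only in the writer class. A hedged usage sketch, assuming a hypothetical records.xml exists alongside the script:

xml_to_mrc('records.xml', 'records.mrc')   # binary MARC21 transmission format
xml_to_mrk('records.xml', 'records.mrk')   # human-readable .mrk text format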
        pass
    except Exception as e:
        print e

# List of transcript filenames with everything but digits stripped from names
l = [strip_call_no(f) for f in os.listdir(path_to_transcripts) if f.startswith('Ms')]
# Set of duplicates found in the above list
s = set([x for x in l if l.count(x) > 1])
# Dictionary mapping digit-only transcript filename to full filename (if not a duplicate)
d = {strip_call_no(f): f
     for f in os.listdir(path_to_transcripts)
     if f.startswith('Ms') and strip_call_no(f) not in s}

# Dictionary mapping digit-only MARC Call No. to MARC filename.
fd = {}
tempcall = ''

print 'Looking for matching transcription files ...'
for file in os.listdir(path_to_marcs):
    if file.endswith('.xml'):
        pymarc.map_xml(get_call_no, path_to_marcs + file)
        fd[tempcall] = file
        pymarc.map_xml(check_call_no, path_to_marcs + file)

print 'There are ' + str(matches) + ' matching records.'
print 'There are ' + str(doubles) + ' records with potential matches.'
csv_file.close()
def readMARC(filename, parseFunction, estimated=1362493):
    entry = None
    entries = []
    thefield = None
    allKeys = {}
    allKeysExample = {}
    allKeysListCount = {}
    pbar = tqdm(total=estimated)
    with gzip.open(filename, "r") as fd:

        def print_title(r):
            nonlocal entry, entries, thefield, allKeys, allKeysExample, \
                allKeysListCount, pbar
            pbar.update(1)
            entry = r
            # print(r["961"].format_field())
            # fout.write(r.title()+"\n")
            keysSet = set()
            keysCount = {}
            keysContent = {}
            entryData = {}
            for field in r.fields:
                thefield = field
                tag = field.tag
                content = field.format_field()
                if tag not in keysSet:
                    keysSet.add(tag)
                    keysCount[tag] = 0
                    keysContent[tag] = content
                keysCount[tag] += 1
                # fout.write("\t%s: %s\n" % (tag, content))
                try:
                    subfields = {}
                    subfieldsLetter = {}
                    if len(field.subfields) > 0:
                        # subfields come as a flat [code, value, ...] list
                        for subfieldIndex in range(0, len(field.subfields), 2):
                            subfieldName = field.subfields[subfieldIndex]
                            subfieldValue = field.subfields[subfieldIndex + 1]
                            subTag = tag + "." + subfieldName
                            if subTag not in keysSet:
                                keysSet.add(subTag)
                                keysCount[subTag] = 0
                                keysContent[subTag] = subfieldValue
                            keysCount[subTag] += 1
                            subfields[subTag] = subfieldValue
                            subfieldsLetter[subfieldName] = subfieldValue
                            # fout.write("\t\t%s: %s\n" % (subfieldName, subfieldValue))
                        if tag in entryData:
                            if not isinstance(entryData[tag], list):
                                entryData[tag] = [entryData[tag]]
                            entryData[tag].append(subfieldsLetter)
                        else:
                            entryData[tag] = subfieldsLetter
                except AttributeError as error:
                    # control fields have no subfields attribute
                    pass
                if len(subfields) == 0:
                    if tag in entryData:
                        if not isinstance(entryData[tag], list):
                            entryData[tag] = [entryData[tag]]
                        entryData[tag].append(content)
                    else:
                        entryData[tag] = content
            # fout.write("-----\n")
            # fout.flush()
            # entries.append(entry)
            for key in keysSet:
                if key not in allKeys:
                    allKeys[key] = 0
                    allKeysExample[key] = keysContent[key]
                    allKeysListCount[key] = 0
                allKeysListCount[key] += keysCount[key]
                allKeys[key] += 1
            processedEntry = parseFunction(entryData)
            # processedEntry["raw"] = entryData
            entries.append(processedEntry)
            # raise ValueError()

        map_xml(print_title, fd)
    return (entries, allKeys, allKeysExample, allKeysListCount)
    else:
        logging.warning("Cannot tell file type, defaulting to MARCXML")
        is_xml = True

    if is_xml:
        if re.search(r'\.gz$', arg):
            logging.warning("#Reading %s as gzipped MARCXML" % (arg))
            if opt.verbose:
                print "#Reading %s as gzipped MARCXML" % (arg)
            fh = gzip.open(arg, 'rb')
        else:
            logging.warning("#Reading %s as MARCXML" % (arg))
            if opt.verbose:
                print "#Reading %s as MARCXML" % (arg)
            fh = open(arg, 'rb')
        # reader is assigned but unused here; map_xml consumes fh directly
        reader = pymarc.MARCReader(fh)
        pymarc.map_xml(mg.grep, fh)
    else:
        if re.search(r'\.gz$', arg):
            logging.warning("#Reading %s as gzipped MARC21" % (arg))
            if opt.verbose:
                print "#Reading %s as gzipped MARC21" % (arg)
            fh = gzip.open(arg, 'rb')
        else:
            logging.warning("#Reading %s as MARC21" % (arg))
            if opt.verbose:
                print "#Reading %s as MARC21" % (arg)
            fh = open(arg, 'rb')
        reader = pymarc.MARCReader(fh, to_unicode=True)
        pymarc.map_records(mg.grep, fh)
except Exception as e:
    # Catch any error, log it and move on to the next file.
    try:
        subjectHeading647a = record['647']['a']
        booksxml.write(subjectHeading647a + ',')
    except:
        booksxml.write(',')
    try:
        subjectHeading648a = record['648']['a']
        booksxml.write(subjectHeading648a + ',')
    except:
        booksxml.write(',')
    try:
        subjectHeading651a = record['651']['a']
        booksxml.write(subjectHeading651a + ',')
    except:
        booksxml.write(',')
    #print('</record>')
    booksxml.write('\n')

#print('<collection>')
booksxml.write(
    'title,title2,author,authorDates,isbn,LCCN,LCCN2,dewey,placePub,publisher,'
    'pubDate,extent,itemDetails,dimensions,generalNote,summary,'
    'subjectHeading650a,subjectHeading650ax,subjectHeading650ae,'
    'subjectHeading650av,subjectHeading650ay,subjectHeading650az,'
    'subjectHeading650ayz,subjectHeading650azx,subjectHeading650ayx,'
    'subjectHeading600a,subjectHeading600ad,subjectHeading610a,'
    'subjectHeading647a,subjectHeading648a,subjectHeading651a\n'
)
pymarc.map_xml(getMarcInfo, xml)
#print('</collection>')
booksxml.close()
def main():
    global inputfile, target, mapping

    parser = argparse.ArgumentParser(
        description='Process and map classification authority records for BCUR.')
    parser.add_argument('-i', '--inputfiles', type=str, nargs='+',
                        help='one or more file(s) to be processed',
                        required=True)
    parser.add_argument('-o', '--outputfile', type=str, nargs=1,
                        help='name of the output file', required=True)
    parser.add_argument('-m', '--map', type=str, nargs=1,
                        help='map target code', required=True,
                        choices=valid_targets)
    args = parser.parse_args()

    targetcode = args.map[0]

    # For musi and musg records, found records are copied as-is, no field mapping.
    if targetcode in ('musi', 'musg'):
        mapping = False

    outputfile = args.outputfile[0]

    # Open a new XML document in which target records will be stored
    global writer
    writer = XMLWriter(open(outputfile, 'wb'))

    # Record start processing time
    tstart = datetime.datetime.now()

    # Print header row in case log is opened as CSV
    print("Record,Field,Field content,Message")

    # Loop through the list of input files and call the mapping function
    for infile in args.inputfiles:
        inputfile = infile
        target = targetcode
        if mapping:
            print(f"----- Processing file {inputfile} with mapping {target} -----")
        else:
            print(f"----- Processing file {inputfile} without mapping -----")

        # This applies the mapping function to each record in inputfile
        map_xml(record_map, inputfile)

        if targetcode == 'vddoc':
            # For vddoc, also look for vddoc-la
            target = 'vddoc-la'
            map_xml(record_map, inputfile)

    # Calculate the total time elapsed
    tdiff = datetime.datetime.now() - tstart

    # Close the output document
    writer.close()

    print(f'Processing finished in {tdiff.total_seconds()} seconds. '
          f'Result saved in {outputfile}')
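The script above pairs map_xml() with pymarc's XMLWriter to filter records from one MARCXML document into another. The same combination also covers the simplest case, copying every record verbatim; a minimal sketch, with hypothetical in.xml and out.xml paths:

import pymarc
from pymarc import XMLWriter

writer = XMLWriter(open('out.xml', 'wb'))
# writer.write accepts one Record, which is exactly what map_xml supplies
pymarc.map_xml(writer.write, 'in.xml')
writer.close()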