def import_records(marcfile):
    """Load MARC XML records from *marcfile*, storing one per known edition.

    For every parsed record the ISBN is taken from the first ``a`` subfield
    of the first 020 field and looked up with ``Edition.get_by_isbn``. A
    ``MARCRecord`` is created only when an edition is found and no
    ``MARCRecord`` exists for it yet.

    Args:
        marcfile: MARC XML source, handed unchanged to ``pymarc.parse_xml``.

    Returns:
        The number of ``MARCRecord`` objects saved by this call.
    """

    class RecordLoader(pymarc.XmlHandler):
        """``pymarc.XmlHandler`` subclass that saves records as they arrive."""

        Edition = apps.get_model(*EDITION_MODEL.split('.'))
        num_loaded = 0

        def process_record(self, record):
            # Guard only the two indexing operations. The original wrapped
            # the whole body, so an IndexError raised by the edition lookup
            # or the save would have been mis-reported as "020 absent".
            try:
                isbn = record.get_fields('020')[0].get_subfields('a')[0]
            except IndexError:
                logger.info('020 absent')
                return

            edition = self.Edition.get_by_isbn(isbn)
            if not edition:
                logger.info('no edition for %s', isbn)
                return

            try:
                MARCRecord.objects.get(edition=edition)
            except MARCRecord.DoesNotExist:
                MARCRecord(edition=edition, the_record=record).save()
                self.num_loaded += 1
            else:
                logger.info('already have a record for %s', isbn)

    handler = RecordLoader()
    pymarc.parse_xml(marcfile, handler)
    return handler.num_loaded
def parse_xml_to_array_patched(xml_file, strict=False, normalize_form=None):
    """Parse ``xml_file`` and return the records gathered by the handler.

    Pass ``strict=True`` to have the parser explicitly check namespaces for
    the MARCSlim namespace.

    ``normalize_form`` may be 'NFC', 'NFKC', 'NFD' or 'NFKD'; see
    ``unicodedata.normalize`` for what each form means.
    """
    collector = XmlHandlerPatched(strict, normalize_form)
    parse_xml(xml_file, collector)
    return collector.records
def main(marc_file, catalog_output):
    """Build ``catalog_output`` from ``marc_file`` unless it already exists.

    Prints a notice and does nothing further when ``marc_file`` is not a
    file, or when ``catalog_output`` is already present. Otherwise the MARC
    file is parsed with an ``ExtractXmlHandler`` feeding a ``JSONCreate``
    writer opened on ``catalog_output``.
    """
    # The input is checked first, so a missing input is reported even when
    # the output file already exists.
    if not os.path.isfile(marc_file):
        print(f"You must first download {marc_file}")
        return

    if os.path.isfile(catalog_output):
        print(f"** {catalog_output} already exists")
        return

    print(f"Processing {marc_file} to create {catalog_output}")
    with open(catalog_output, "w") as jsonfile:
        writer = JSONCreate(jsonfile)
        extractor = ExtractXmlHandler(writer)
        parse_xml(marc_file, extractor)
        writer.close()
def pymarc_extract(xml):
    """worldcat.util.pymarc_extract: turn record elements into pymarc Records

    Requires pymarc >= 1.2.

    Every element returned by ``extract_elements(xml)`` is serialized again
    and wrapped in a StringIO, because the xml.sax.XMLReader parse machinery
    that pymarc.marcxml.parse_xml relies on wants a filename, a file-like
    object, or an InputSource object.
    """
    collected = []
    for element in extract_elements(xml):
        # NOTE(review): the standard-library ElementTree.tostring returns
        # bytes by default on Python 3 -- confirm which ET and StringIO this
        # module imports.
        serialized = ET.tostring(element)
        record_handler = pymarc.XmlHandler()
        pymarc.parse_xml(StringIO(serialized), record_handler)
        collected.extend(record_handler.records)
    return collected
def parse_xml_record(xml_file, strict=False, normalize_form=None):
    """Parse ``xml_file`` with a ``NightShiftXmlHandler``; return its records.

    ``strict`` and ``normalize_form`` are forwarded unchanged to the handler
    constructor.
    """
    record_handler = NightShiftXmlHandler(strict, normalize_form)
    parse_xml(xml_file, record_handler)
    return record_handler.records
# 2127216,650,1,7,a2,gtt # 3211132,650, ,7,ax2,ram # 3234100,651, ,7,a20,fast for subj in record.subjects(): d['tag'] = subj.tag d['i1'] = subj.indicators[0] d['i2'] = subj.indicators[1] d['subfields'] = ''.join(subj.subfields[::2]) if '2' in d['subfields']: idx = subj.subfields.index('2') d['sf2'] = subj.subfields[idx + 1] else: d['sf2'] = '' self._writer.writerow(d) def parse_args(): p = argparse.ArgumentParser() p.add_argument('input_file', help='path to input file') p.add_argument('output_file', help='path to output file') return p.parse_args() if __name__ == '__main__': args = parse_args() fieldnames = ['bibid', 'tag', 'i1', 'i2', 'subfields', 'sf2'] with codecs.open(args.output_file, 'wb', 'utf-8') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) # writer.writeheader() reader = parse_xml(args.input_file, ExtractXmlHandler(writer))
import requests
from pymarc import XmlHandler, parse_xml
from pymongo import MongoClient

# Command line: <mongo connection string> <path to a CSV file of LCCNs>.
mongo_connection_string = sys.argv[1]
path_to_csv_of_lccns = sys.argv[2]

c = MongoClient(mongo_connection_string)
db = c['catrecords']
mij = db.mij

# Every non-blank cell of every CSV row is treated as one LCCN.
lccns = []
# newline='' is what the csv module asks for when it is given a file object.
with open(path_to_csv_of_lccns, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        for cell in row:
            cell = cell.strip()
            if cell:
                lccns.append(cell)

for lccn in lccns:
    # A timeout stops one stalled request from hanging the whole run.
    r = requests.get("http://lccn.loc.gov/{0}/marcxml".format(lccn), timeout=30)
    if not r.ok:
        # An error page is not MARCXML; report it and move on to the next LCCN.
        sys.stderr.write("skipping {0}: HTTP {1}\n".format(lccn, r.status_code))
        continue

    # Use a fresh handler for each response. A single shared handler keeps
    # every record parsed so far in ``handler.records``, so earlier records
    # pile up in memory and can be inserted more than once.
    handler = XmlHandler()
    parse_xml(io.StringIO(r.text), handler)
    for rec in handler.records:
        mij.insert_one(json.loads(rec.as_json()))
# 2127216,650,1,7,a2,gtt # 3211132,650, ,7,ax2,ram # 3234100,651, ,7,a20,fast for subj in record.subjects(): d["tag"] = subj.tag d["i1"] = subj.indicators[0] d["i2"] = subj.indicators[1] d["subfields"] = "".join(subj.subfields[::2]) if "2" in d["subfields"]: idx = subj.subfields.index("2") d["sf2"] = subj.subfields[idx + 1] else: d["sf2"] = "" self._writer.writerow(d) def parse_args(): p = argparse.ArgumentParser() p.add_argument("input_file", help="path to input file") p.add_argument("output_file", help="path to output file") return p.parse_args() if __name__ == "__main__": args = parse_args() fieldnames = ["bibid", "tag", "i1", "i2", "subfields", "sf2"] with codecs.open(args.output_file, "wb", "utf-8") as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) # writer.writeheader() reader = parse_xml(args.input_file, ExtractXmlHandler(writer))
def _get_marc_record(oai_string):
    """Parse the first ``<record ...>`` element found in *oai_string*.

    The element is located with a non-greedy, DOTALL regular expression,
    parsed into the ``handler`` defined outside this function, and the last
    entry of ``handler.records`` is popped and returned.
    """
    match = re.search(r'<record .*?/record>', oai_string, re.DOTALL)
    record_xml = match.group()
    parse_xml(io.StringIO(record_xml), handler)
    return handler.records.pop()