Example #1
0
def import_records(marcfile):
    """Load MARCXML records from *marcfile*, attaching each to a known Edition.

    For every parsed record, the first 020$a ISBN is used to look up an
    Edition; when one exists and no MARCRecord is stored for it yet, a new
    MARCRecord is created and saved.

    Returns the number of MARCRecord rows actually saved.
    """
    class RecordLoader(pymarc.XmlHandler):
        # Resolved once at class-creation time; EDITION_MODEL is an
        # "app_label.ModelName" string as accepted by apps.get_model().
        Edition = apps.get_model(*EDITION_MODEL.split('.'))
        num_loaded = 0

        def process_record(self, record):
            # Keep the try narrow: only the 020/$a lookups can legitimately
            # raise IndexError; the original broad try would also have
            # swallowed IndexErrors from get_by_isbn() or the ORM calls.
            try:
                field020 = record.get_fields('020')[0]
                isbn = field020.get_subfields('a')[0]
            except IndexError:
                logger.info('020 absent')
                return
            edition = self.Edition.get_by_isbn(isbn)
            if not edition:
                # Lazy %-args: formatted only if the message is emitted.
                logger.info('no edition for %s', isbn)
                return
            try:
                MARCRecord.objects.get(edition=edition)
                logger.info('already have a record for %s', isbn)
            except MARCRecord.DoesNotExist:
                mr = MARCRecord(edition=edition, the_record=record)
                mr.save()
                self.num_loaded += 1

    handler = RecordLoader()
    pymarc.parse_xml(marcfile, handler)
    return handler.num_loaded
Example #2
0
def parse_xml_to_array_patched(xml_file, strict=False, normalize_form=None):
    """Parse *xml_file* and return its MARC records as a list.

    Pass ``strict=True`` to make the parser explicitly check for the
    MARCSlim namespace.  ``normalize_form`` may be one of 'NFC', 'NFKC',
    'NFD' or 'NFKD'; see unicodedata.normalize for details.
    """
    record_handler = XmlHandlerPatched(strict, normalize_form)
    parse_xml(xml_file, record_handler)
    return record_handler.records
def main(marc_file, catalog_output):
    """Convert *marc_file* to JSON at *catalog_output*.

    Skips with a message if the input file is missing or the output file
    already exists.
    """
    # Guard clauses replace the original nested if/else pyramid.
    if not os.path.isfile(marc_file):
        print(f"You must first download {marc_file}")
        return
    if os.path.isfile(catalog_output):
        print(f"** {catalog_output} already exists")
        return
    print(f"Processing {marc_file} to create {catalog_output}")
    with open(catalog_output, "w") as jsonfile:
        writer = JSONCreate(jsonfile)
        parse_xml(marc_file, ExtractXmlHandler(writer))
        writer.close()
Example #4
0
def pymarc_extract(xml):
    """worldcat.util.pymarc_extract: extract records to pymarc Record objects

    Requires pymarc >= 1.2. StringIO is used since xml.sax.XMLReader's
    parse objects (which pymarc.marcxml.parse_xml uses) expect a filename, a
    file-like object, or an InputSource object.
    """
    pymarc_records = []
    for record in extract_elements(xml):
        handler = pymarc.XmlHandler()
        # ET.tostring() returns bytes by default on Python 3, which
        # StringIO rejects; encoding="unicode" makes it return str.
        serialized = ET.tostring(record, encoding="unicode")
        pymarc.parse_xml(StringIO(serialized), handler)
        pymarc_records.extend(handler.records)
    return pymarc_records
Example #5
0
def parse_xml_record(xml_file, strict=False, normalize_form=None):
    """Parse *xml_file* and return the resulting list of MARC records."""
    xml_handler = NightShiftXmlHandler(strict, normalize_form)
    parse_xml(xml_file, xml_handler)
    return xml_handler.records
Example #6
0
        #   2127216,650,1,7,a2,gtt
        #   3211132,650, ,7,ax2,ram
        #   3234100,651, ,7,a20,fast
        for subj in record.subjects():
            d['tag'] = subj.tag
            d['i1'] = subj.indicators[0]
            d['i2'] = subj.indicators[1]
            d['subfields'] = ''.join(subj.subfields[::2])
            if '2' in d['subfields']:
                idx = subj.subfields.index('2')
                d['sf2'] = subj.subfields[idx + 1]
            else:
                d['sf2'] = ''
            self._writer.writerow(d)


def parse_args():
    """Build the CLI parser and return the parsed arguments
    (attributes: input_file, output_file)."""
    parser = argparse.ArgumentParser()
    for name, description in (('input_file', 'path to input file'),
                              ('output_file', 'path to output file')):
        parser.add_argument(name, help=description)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    # Column order for the CSV rows emitted by ExtractXmlHandler.
    fieldnames = ['bibid', 'tag', 'i1', 'i2', 'subfields', 'sf2']
    with codecs.open(args.output_file, 'wb', 'utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # writer.writeheader()
        # parse_xml() drives the SAX handler purely for its side effects
        # and returns None, so its result is deliberately not bound
        # (the original misleadingly assigned it to `reader`).
        parse_xml(args.input_file, ExtractXmlHandler(writer))
Example #7
0
import requests
from pymarc import XmlHandler, parse_xml
from pymongo import MongoClient

mongo_connection_string = sys.argv[1]
path_to_csv_of_lccns = sys.argv[2]

c = MongoClient(mongo_connection_string)
db = c['catrecords']
mij = db.mij

handler = XmlHandler()

# Flatten every cell of the CSV into a single list of LCCNs.
with open(path_to_csv_of_lccns) as csvfile:
    lccns = [cell
             for row in csv.reader(csvfile, delimiter=',')
             for cell in row]

# Fetch MARCXML for each LCCN; the shared handler accumulates all records.
for lccn in lccns:
    response = requests.get("http://lccn.loc.gov/{0}/marcxml".format(lccn))
    parse_xml(io.StringIO(response.text), handler)

# Store every parsed record in Mongo as a MARC-in-JSON document.
for rec in handler.records:
    mij.insert_one(json.loads(rec.as_json()))

Example #8
0
        #   2127216,650,1,7,a2,gtt
        #   3211132,650, ,7,ax2,ram
        #   3234100,651, ,7,a20,fast
        for subj in record.subjects():
            d["tag"] = subj.tag
            d["i1"] = subj.indicators[0]
            d["i2"] = subj.indicators[1]
            d["subfields"] = "".join(subj.subfields[::2])
            if "2" in d["subfields"]:
                idx = subj.subfields.index("2")
                d["sf2"] = subj.subfields[idx + 1]
            else:
                d["sf2"] = ""
            self._writer.writerow(d)


def parse_args():
    """Return the parsed command-line namespace: an input path and an
    output path."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("input_file", help="path to input file")
    arg_parser.add_argument("output_file", help="path to output file")
    return arg_parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    fieldnames = ["bibid", "tag", "i1", "i2", "subfields", "sf2"]
    with codecs.open(args.output_file, "wb", "utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # writer.writeheader()
        reader = parse_xml(args.input_file, ExtractXmlHandler(writer))
Example #9
0
def _get_marc_record(oai_string):
    """Extract the embedded MARCXML <record> element from an OAI response
    string and return it parsed via the module-level pymarc handler."""
    match = re.search(r'<record .*?/record>', oai_string, re.DOTALL)
    marc_xml = match.group()
    parse_xml(io.StringIO(marc_xml), handler)
    return handler.records.pop()