Example 1
    def load_file(self, location, skip=0):
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        LOGGER.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)

            except Exception as e:
                LOGGER.error("unable to load: %s" % e)
                LOGGER.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        request = urllib2.Request(location, headers={'User-Agent': 'chronam-title-loader'})
        map_xml(load_record, urllib2.urlopen(request))
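pymarc.map_xml is the engine in all of these snippets: it parses one or more MARCXML inputs and calls the supplied function once per parsed record, returning nothing itself. Example 1 guards with "if record:" because its OCLC feed can yield empty entries. A minimal, self-contained sketch of that contract ('records.xml' is a hypothetical input file):

import pymarc

seen = []

def on_record(record):
    # map_xml calls this once per record; the return value is ignored.
    if record is not None:  # some feeds yield empty entries (cf. Example 1)
        seen.append(record)

# A path or an open binary file object both work; 'records.xml' is a
# hypothetical file name.
pymarc.map_xml(on_record, 'records.xml')
print(len(seen), "records")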
Example 2

    def load_file(self, filename, skip=0):
        t0 = time()
        times = []

        def _process_time():
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    LOGGER.info("skipped %i" % self.records_processed)
                    return
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            except Exception as e:
                LOGGER.error("unable to load record %s: %s" %
                             (self.records_processed, e))
                LOGGER.exception(e)
                self.errors += 1

            _process_time()

        map_xml(load_xml_record, open(filename, "rb"))
Example 3
    def test_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        fh = gzip.open('test/batch.xml.gz', 'rb')
        pymarc.map_xml(count, fh)
        self.assertEqual(2, self.seen)
Example 4
    def load_file(self, filename, skip=0):
        t0 = time()
        times = []

        def _process_time():
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    LOGGER.info("skipped %i" % self.records_processed)
                    return
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            except Exception as e:
                LOGGER.error("unable to load record %s: %s" %
                             (self.records_processed, e))
                LOGGER.exception(e)
                self.errors += 1

            _process_time()

        map_xml(load_xml_record, open(filename, "rb"))
Example 5
    def test_multi_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        fh1 = gzip.open('test/batch.xml.gz', 'rb')
        fh2 = gzip.open('test/batch.xml.gz', 'rb')
        pymarc.map_xml(count, fh1, fh2)
        self.assertEqual(4, self.seen)
Example 6
    def test_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        pymarc.map_xml(count, 'test/batch.xml')
        self.assertEqual(2, self.seen)
Example 7
    def test_multi_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        pymarc.map_xml(count, "test/batch.xml", "test/batch.xml")
        self.assertEqual(4, self.seen)
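As Examples 3 through 7 show, map_xml accepts either a filename or an open file-like object (a gzip stream included), and any number of inputs in a single call, processed in order. A small sketch mixing both forms (the file names are assumptions):

import gzip
import pymarc

records = []

def collect(record):
    records.append(record)

# A plain path and a gzipped stream in one call; both names are
# hypothetical.
with gzip.open('batch.xml.gz', 'rb') as fh:
    pymarc.map_xml(collect, 'batch.xml', fh)
print(len(records), "records across both inputs")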
Example 8
def marcxml2bioc(marcxmlFilename, biocFilename):
    with open(marcxmlFilename,
              'rb') as inF, bioc.BioCXMLDocumentWriter(biocFilename) as writer:

        def marcxml2bioc_helper(record):
            writeMarcXMLRecordToBiocFile(record, writer)

        pymarc.map_xml(marcxml2bioc_helper, inF)
Example 9
    def test_read_utf8(self):
        self.field_count = 0

        def process_xml(record):
            for field in record.get_fields():
                self.field_count += 1

        pymarc.map_xml(process_xml, 'test/utf8.xml')
        self.assertEqual(self.field_count, 8)
Example 10
    def test_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        fh = gzip.open('test/batch.xml.gz', 'rb')
        pymarc.map_xml(count, fh)
        self.assertEqual(2, self.seen)
Example 11
    def test_read_utf8(self):
        self.field_count = 0

        def process_xml(record):
            for field in record.get_fields():
                self.field_count += 1

        pymarc.map_xml(process_xml, 'test/utf8.xml')
        self.assertEqual(self.field_count, 8)
Example 12
def main():
    '''Main method. Magic starts here.'''
    # TODO: counters on everything!
    parser = argparse.ArgumentParser()
    parser.add_argument("records_file", help="path to marc records folder")
    parser.add_argument("result_path", help="path to Instance results file")
    parser.add_argument("okapi_url", help=("OKAPI base url"))
    parser.add_argument("tenant_id", help=("id of the FOLIO tenant."))
    parser.add_argument("username", help=("the api user"))
    parser.add_argument("password", help=("the api users password"))
    parser.add_argument("-holdings_id_dict_path", "-ih", help=(""))
    parser.add_argument("-instance_id_dict_path", "-i", help=(""))
    parser.add_argument("-postgres_dump",
                        "-p",
                        help=("results will be written out for Postgres"
                              "ingestion. Default is JSON"),
                        action="store_true")
    parser.add_argument("-marcxml",
                        "-x",
                        help=("DATA is in MARCXML format"),
                        action="store_true")
    parser.add_argument("-validate",
                        "-v",
                        help=("Validate JSON data against JSON Schema"),
                        action="store_true")
    args = parser.parse_args()
    print('\tresults file:\t', args.result_path)
    print("\tOkapi URL:\t", args.okapi_url)
    print("\tTenanti Id:\t", args.tenant_id)
    print("\tUsername:   \t", args.username)
    print("\tPassword:   \tSecret")
    print("\tinstance idMap will get stored at:\t", args.instance_id_dict_path)
    print("\thold idMap will get stored at:\t", args.holdings_id_dict_path)

    print("File to process: {}".format(args.records_file))
    folio_client = FolioClient(args.okapi_url, args.tenant_id, args.username,
                               args.password)
    instance_id_map = {}
    with open(args.instance_id_dict_path, 'r') as json_file:
        instance_id_map = json.load(json_file)
    print("Number of instances in ID map: {}".format(len(instance_id_map)))
    default_mapper = HoldingsDefaultMapper(folio_client, instance_id_map)
    print("Starting")
    print("Rec./s\t\tTot. recs\t\t")

    with open(args.result_path + '/folio_holdings.json', 'w+') as results_file:
        processor = HoldingsMarcProcessor(default_mapper, folio_client,
                                          results_file, args)
        if args.marcxml:
            pymarc.map_xml(processor.process_record, args.records_file)
        else:
            with open(args.records_file, 'rb') as marc_file:
                pymarc.map_records(processor.process_record, marc_file)
    # wrap up
    print("Done. Wrapping up...")
    processor.wrap_up()
    print("done")
Example 13
    def test_multi_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        fh1 = gzip.open('test/batch.xml.gz', 'rb')
        fh2 = gzip.open('test/batch.xml.gz', 'rb')
        pymarc.map_xml(count, fh1, fh2)
        self.assertEqual(4, self.seen)
Example 14
    def test_copy_utf8(self):
        writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
        new_record = pymarc.Record(to_unicode=True, force_utf8=True)

        def process_xml(record):
            new_record.leader = record.leader

            for field in record.get_fields():
                new_record.add_field(field)

        pymarc.map_xml(process_xml, 'test/utf8.xml')

        try:
            writer.write(new_record)
            writer.close()

        finally:
            # remove it
            os.remove('test/write-utf8-test.dat')
Example 15

def main():
    '''parses args pointing to record xml paths, specifies output paths, and applies "pull_arabic"'''
    logger = logging.getLogger(__name__)
    logger.info(
        'collecting arabic records and extracting parallel Arabic/Romanized representations'
    )

    parser = argparse.ArgumentParser()

    parser.add_argument('input_directory',
                        help='path to directory containing records')
    parser.add_argument(
        '-f',
        '--sub_directory_filter',
        help='select a particular subdirectory inside a complex directory structure'
    )
    parser.add_argument(
        '-n',
        '--name',
        help='optional source name, otherwise take directory name')

    args = parser.parse_args()

    if args.name:
        name = args.name
    else:
        name = args.input_directory.split('/')[-1]
    logger.info(f'source: {name}')

    record_paths = get_xml_paths(args.input_directory,
                                 args.sub_directory_filter)

    writer = pymarc.XMLWriter(open(f'data/arabic_records/{name}.xml', 'wb'))

    for path in record_paths:
        xmlname = path.split('/')[-1].replace('.xml', '')
        pymarc.map_xml(lambda record: pull_arabic(record, writer=writer), path)
    writer.close()

    global counter008
    global counter880
    logger.info(
        f'# of Arabic records ("ara" in language field 008): {counter008}')
Example 16
    def test_copy_utf8(self):
        writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
        new_record = pymarc.Record(to_unicode=True, force_utf8=True)

        def process_xml(record):
            new_record.leader = record.leader

            for field in record.get_fields():
                new_record.add_field(field)

        pymarc.map_xml(process_xml, 'test/utf8.xml')

        try:
            writer.write(new_record)
            writer.close()

        finally:
            # remove it
            os.remove('test/write-utf8-test.dat')
Example 17

def main():
	parser = argparse.ArgumentParser(description='Tool to summarize data from PubAg')
	parser.add_argument('--i',type=str,required=True,help="PubAg MarcXML file")
	parser.add_argument('--oTitles',type=str,required=True,help="File containing titles")
	parser.add_argument('--oHasTitlesAndAbstracts',type=str,required=True,help="File containing counts of titles and abstracts")
	parser.add_argument('--oPubYear',type=str,required=True,help="File containing counts of publication years")
	parser.add_argument('--oJournals',type=str,required=True,help="File containing counts of journals")

	args = parser.parse_args()

	fTitles = codecs.open(args.oTitles,'w','utf-8')
	fHasTitlesAndAbstracts = codecs.open(args.oHasTitlesAndAbstracts,'w','utf-8')
	fPubYear = codecs.open(args.oPubYear,'w','utf-8')
	fJournals = codecs.open(args.oJournals,'w','utf-8')
	
	def summarizeMarcXMLFile_helper(record):
		summarizeMarcXMLFile(record,fTitles,fHasTitlesAndAbstracts,fPubYear,fJournals)

	with open(args.i,'rb') as inF:
		pymarc.map_xml(summarizeMarcXMLFile_helper,inF)
Example 18
class TitleLoader(object):
    def __init__(self):
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)

            except Exception, e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        if 'https' in location:
            lccn = location[40:location.find('/marc.xml')]
            c = httplib.HTTPSConnection('chroniclingamerica.loc.gov')
            c.request("GET", '/lccn/%s/marc.xml' % lccn)
            xml = c.getresponse()
        else:
            xml = urllib2.urlopen(location)

        map_xml(load_record, xml)
Example 19
class HoldingLoader:
    """
    A loader for holdings data. Intended to be run after titles have been
    loaded with TitleLoader. This is necessary so that holdings records
    can be attached to the appropriate Title.
    """
    def __init__(self):
        self.records_processed = 0
        self.missing_title = 0
        self.errors = 0
        self.skipped = 0

        self.holding_created = 0
        self.no_oclc = 0
        self.files_processed = 0

    def load_file(self, filename, skip=0):
        t0 = time()
        times = []

        def _process_time():
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    _logger.info("skipped %i" % self.records_processed)
                    return
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            except Exception, e:
                _logger.error("unable to load record %s: %s" %
                              (self.records_processed, e))
                _logger.exception(e)
                self.errors += 1

            _process_time()

        map_xml(load_xml_record, file(filename, "rb"))
Example 20
class TitleLoader(object):
    def __init__(self):
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)

            except Exception, e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        map_xml(load_record, urllib2.urlopen(location))
Example 21
def parse_record(record, field=FIELD, subfield=SUBFIELD):
    value = extract(record, field, subfield)
    if value:
        rec_id = extract(record, '010', 'a')
        if not rec_id:
            rec_id = extract(record, '004')
        values.append((rec_id, value))


if __name__ == '__main__':
    if os.path.isdir(SOURCE):
        marc_xml_dir = os.listdir(SOURCE)
        for xml_file in marc_xml_dir:
            marc_file = os.path.join(SOURCE, xml_file)
            map_xml(parse_record, open(marc_file, 'r'))
    else:
        map_xml(parse_record, open(SOURCE, 'r'))

    # all values
    #for value in values:
    #    print str(value[0]), ',',value[1]

    total = len(values)

    # Get a sample of 50 random values for that field
    for i in range(50):
        try:
            random_value = choice(values)
            values.remove(random_value)
            print ','.join([random_value[0], random_value[1]])
        except IndexError:
            # choice() raises IndexError once values is exhausted
            break
Example 22
# Options and arguments
__version__ = '0.0.1'
p = optparse.OptionParser(description='MARCXML Record Counter',
                          usage='usage: %prog [[opts]] [file1] .. [fileN]',
                          version='%prog ' + __version__)
p.add_option('--verbose',
             '-v',
             action='store_true',
             help="verbose, show additional informational messages")
(opt, args) = p.parse_args()

# Loop over all files specified counting records in each
total = 0
fmt = "%-7d %s"
for arg in args:
    seen = 0
    fh = 0
    if (re.search(r'\.gz$', arg)):
        if (opt.verbose):
            print "Reading %s as gzipped MARCXML" % (arg)
        fh = gzip.open(arg, 'rb')
    else:
        if (opt.verbose):
            print "Reading %s as MARCXML" % (arg)
        fh = open(arg, 'rb')
    pymarc.map_xml(count, fh)
    print fmt % (seen, arg)
    total += seen
if (len(args) > 1):
    print fmt % (total, 'TOTAL')
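The count callback this fragment (and Example 24) passes to map_xml is not shown; given how seen is reset per file and totalled at the end, it presumably bumps a module-level counter. A minimal sketch under that assumption:

seen = 0

def count(record):
    # Assumed implementation: increment the module-level counter that
    # the surrounding loop resets for each file and then totals.
    global seen
    seen += 1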
Example 23
#!/usr/bin/python

import os
import pymarc

path = './'


def get_place_of_pub(record):
    try:
        place_of_pub = record['']['']
        print(place_of_pub)
        with open('out.txt', 'a') as f:
            print(place_of_pub, file=f)
    except Exception as e:
        print(e)


for file in os.listdir(path):
    if file.endswith('.xml'):
        pymarc.map_xml(get_place_of_pub, path + file)
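The empty tag and subfield strings above are left for the reader to fill in. Indexing such as record['260']['a'] raises whenever the field or subfield is absent, which is what the broad except is catching; record.get_fields avoids the exception entirely. A sketch, assuming the place of publication is wanted from 260 (ISBD) or 264 (RDA):

def place_of_pub(record):
    # Return the first place-of-publication subfield found, or None.
    # 260 $a is the ISBD field; RDA records use 264 $a instead.
    for field in record.get_fields('260', '264'):
        values = field.get_subfields('a')
        if values:
            return values[0]
    return None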
Example 24
# Options and arguments
__version__ = "0.0.1"
p = optparse.OptionParser(
    description="MARCXML Record Counter",
    usage="usage: %prog [[opts]] [file1] .. [fileN]",
    version="%prog " + __version__,
)
p.add_option("--verbose", "-v", action="store_true", help="verbose, show additional informational messages")
(opt, args) = p.parse_args()

# Loop over all files specified counting records in each
total = 0
fmt = "%-7d %s"
for arg in args:
    seen = 0
    fh = 0
    if re.search(r"\.gz$", arg):
        if opt.verbose:
            print "Reading %s as gzipped MARCXML" % (arg)
        fh = gzip.open(arg, "rb")
    else:
        if opt.verbose:
            print "Reading %s as MARCXML" % (arg)
        fh = open(arg, "rb")
    pymarc.map_xml(count, fh)
    print fmt % (seen, arg)
    total += seen
if len(args) > 1:
    print fmt % (total, "TOTAL")
Example 25

def xml_to_mrc(path_in, path_out):
    writer = pymarc.MARCWriter(open(path_out, 'wb'))
    pymarc.map_xml(writer.write, path_in)  # map_xml returns None; one write per record
    writer.close()
Example 26
values = []

def parse_record(record, field=FIELD, subfield=SUBFIELD):
    value = extract(record, field, subfield)
    if value:
        rec_id = extract(record, '010', 'a')
        if not rec_id:
            rec_id = extract(record, '004')
        values.append((rec_id, value))

if __name__ == '__main__':
    if os.path.isdir(SOURCE):
        marc_xml_dir = os.listdir(SOURCE)
        for xml_file in marc_xml_dir:
            marc_file = os.path.join(SOURCE, xml_file)
            map_xml(parse_record, open(marc_file, 'r'))
    else:
        map_xml(parse_record, open(SOURCE, 'r'))

    # all values
    #for value in values:
    #    print str(value[0]), ',',value[1]

    total = len(values)

    # Get a sample of 50 random values for that field
    for i in range(50):
        try:
            random_value = choice(values)
            values.remove(random_value)
            print ','.join([random_value[0], random_value[1]])
        except IndexError:
            # choice() raises IndexError once values is exhausted
            break
Example 27
    def test_multi_map_xml(self):
        self.seen = 0

        def count(record):
            self.seen += 1

        pymarc.map_xml(count, 'test/batch.xml', 'test/batch.xml')
        self.assertEqual(4, self.seen)
Example 28

def xml_to_mrk(path_in, path_out):
    writer = pymarc.TextWriter(io.open(path_out, 'wt', encoding="utf-8"))
    pymarc.map_xml(writer.write, path_in)  # map_xml returns None; one write per record
    writer.close()
Example 29

           pass
    except Exception as e:
        print e

# List of transcript filenames with everything but digits stripped from names
l = [strip_call_no(f) for f in os.listdir(path_to_transcripts) if f.startswith('Ms')]

# Set of duplicates found in the above list
s = set([x for x in l if l.count(x) > 1])

# Dictionary mapping digit-only transcript filename to full filename (if not a duplicate)
d = {strip_call_no(f): f for f in os.listdir(path_to_transcripts) if f.startswith('Ms') and strip_call_no(f) not in s}

# Dictionary mapping digit-only MARC Call No. to MARC filename.
fd = {}

tempcall = ''

print 'Looking for matching transcription files ...'

for file in os.listdir(path_to_marcs):
    if file.endswith('.xml'):
        pymarc.map_xml(get_call_no, path_to_marcs + file)
        fd[tempcall] = file
        pymarc.map_xml(check_call_no, path_to_marcs + file)

print 'There are ' + str(matches) + ' matching records.'
print 'There are ' + str(doubles) + ' records with potential matches.'

csv_file.close()
Example 30
def readMARC(filename, parseFunction, estimated=1362493):
    entry = None
    entries = []
    thefield = None
    allKeys = {}
    allKeysExample = {}
    allKeysListCount = {}
    pbar = tqdm(total=estimated)
    with gzip.open(filename, "r") as fd:

        def print_title(r):
            nonlocal entry, entries, thefield, allKeys, allKeysExample, allKeysListCount, pbar
            pbar.update(1)
            entry = r
            # print(r["961"].format_field())
            # fout.write(r.title()+"\n")
            keysSet = set()
            keysCount = {}
            keysContent = {}
            entryData = {}

            for field in r.fields:
                thefield = field
                tag = field.tag
                content = field.format_field()
                if (tag not in keysSet):
                    keysSet.add(tag)
                    keysCount[tag] = 0
                    keysContent[tag] = content
                keysCount[tag] += 1
                #           fout.write("\t%s: %s\n"%(tag,content));
                try:
                    subfields = {}
                    subfieldsLetter = {}
                    if (len(field.subfields) > 0):
                        for subfieldIndex in range(0, len(field.subfields), 2):
                            subfieldName = field.subfields[subfieldIndex]
                            subfieldValue = field.subfields[subfieldIndex + 1]
                            subTag = tag + "." + subfieldName
                            if (subTag not in keysSet):
                                keysSet.add(subTag)
                                keysCount[subTag] = 0
                                keysContent[subTag] = subfieldValue
                            keysCount[subTag] += 1
                            subfields[subTag] = subfieldValue
                            subfieldsLetter[subfieldName] = subfieldValue
                        if (tag in entryData):
                            if (not isinstance(entryData[tag], list)):
                                entryData[tag] = [entryData[tag]]
                            entryData[tag].append(subfieldsLetter)
                        else:
                            entryData[tag] = subfieldsLetter
    #               fout.write("\t\t%s: %s\n"%(subfieldName,subfieldValue));
                except AttributeError as error:
                    pass

                if (len(subfields) == 0):
                    if (tag in entryData):
                        if (not isinstance(entryData[tag], list)):
                            entryData[tag] = [entryData[tag]]
                        entryData[tag].append(content)
                    else:
                        entryData[tag] = content
    #       fout.write("-----\n");

    #       fout.flush();
    #       entries.append(entry);
            for key in keysSet:
                if (key not in allKeys):
                    allKeys[key] = 0
                    allKeysExample[key] = keysContent[key]
                    allKeysListCount[key] = 0
                allKeysListCount[key] += keysCount[key]
                allKeys[key] += 1
            processedEntry = parseFunction(entryData)
            #       processedEntry["raw"] = entryData;
            entries.append(processedEntry)


#       raise ValueError()

        map_xml(print_title, fd)
    return (entries, allKeys, allKeysExample, allKeysListCount)
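The pairwise walk over field.subfields above matches the flat list layout pymarc used through version 4: ['a', 'value1', 'b', 'value2', ...]. In pymarc 5 the attribute became a list of Subfield(code, value) named tuples, so under newer versions the same traversal reads as below (a sketch for pymarc >= 5 only):

def dump_subfields(record):
    # pymarc >= 5: field.subfields is a list of Subfield named tuples
    # rather than the flat code/value list unpacked in Example 30.
    for field in record.fields:
        for sub in field.subfields:
            print(field.tag, sub.code, sub.value)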
Example 31
         else:
             logging.warning("Cannot tell file type, defaulting to MARCXML")
             is_xml = True
     if (is_xml):
         if (re.search(r'\.gz$',arg)):
             logging.warning("#Reading %s as gzipped MARCXML" % (arg))
             if (opt.verbose):
                 print "#Reading %s as gzipped MARCXML" % (arg)
             fh = gzip.open(arg,'rb')
         else:
             logging.warning("#Reading %s as MARCXML" % (arg))
             if (opt.verbose):
                 print "#Reading %s as MARCXML" % (arg)
             fh = open(arg,'rb')
         reader = pymarc.MARCReader(fh)
         pymarc.map_xml(mg.grep, fh)
     else:
         if (re.search(r'\.gz$',arg)):
             logging.warning("#Reading %s as gzipped MARC21" % (arg))
             if (opt.verbose):
                 print "#Reading %s as gzipped MARC21" % (arg)
             fh = gzip.open(arg,'rb')
         else:
             logging.warning("#Reading %s as MARC21" % (arg))
             if (opt.verbose):
                 print "#Reading %s as MARC21" % (arg)
             fh = open(arg,'rb')
         reader = pymarc.MARCReader(fh,to_unicode=True)
         pymarc.map_records(mg.grep, fh)
 except Exception as e:
     # Catch any error, log it and move on to the next file.
Example 32
    try:
        subjectHeading647a = record['647']['a']
        booksxml.write(subjectHeading647a + ',')
    except:

        booksxml.write(',')

    try:
        subjectHeading648a = record['648']['a']
        booksxml.write(subjectHeading648a + ',')
    except:
        booksxml.write(',')

    try:
        subjectHeading651a = record['651']['a']
        booksxml.write(subjectHeading651a + ',')
    except:
        booksxml.write(',')

    #print('</record>')
    booksxml.write('\n')


#print('<collection>')
booksxml.write(
    'title,title2,author,authorDates,isbn,LCCN,LCCN2,dewey,placePub,publisher,pubDate,extent,itemDetails,dimensions,generalNote,summary,subjectHeading650a,subjectHeading650ax,subjectHeading650ae,subjectHeading650av,subjectHeading650ay,subjectHeading650az,subjectHeading650ayz,subjectHeading650azx,subjectHeading650ayx,subjectHeading600a,subjectHeading600ad,subjectHeading610a,subjectHeading647a,subjectHeading648a,subjectHeading651a\n'
)
pymarc.map_xml(getMarcInfo, xml)
#print('</collection>')
booksxml.close()
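Example 32 builds each CSV row by concatenating strings and writes a bare comma when a heading is missing, so embedded commas in subfield values would corrupt the file. The standard library's csv module handles quoting; a sketch of the same idea (not the original script's code):

import csv

def subfield_or_blank(record, tag, code):
    # First matching subfield value, or '' when the field or subfield
    # is missing (the cases the bare excepts above catch).
    for field in record.get_fields(tag):
        values = field.get_subfields(code)
        if values:
            return values[0]
    return ''

def write_row(record, csv_writer):
    # csv.writer quotes embedded commas automatically.
    csv_writer.writerow([subfield_or_blank(record, tag, 'a')
                         for tag in ('647', '648', '651')])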
Example 33
def main():
    global inputfile, target, mapping
    parser = argparse.ArgumentParser(
        description='Process and map classification authority records for BCUR.'
    )
    parser.add_argument('-i',
                        '--inputfiles',
                        type=str,
                        nargs='+',
                        help='one or more file(s) to be processed',
                        required=True)
    parser.add_argument('-o',
                        '--outputfile',
                        type=str,
                        nargs=1,
                        help='name of the output file',
                        required=True)
    parser.add_argument('-m',
                        '--map',
                        type=str,
                        nargs=1,
                        help='map target code',
                        required=True,
                        choices=valid_targets)

    args = parser.parse_args()

    targetcode = args.map[0]

    # For musi and musg records, found records are copied as-is, no field mapping.
    if targetcode in ('musi', 'musg'):
        mapping = False

    outputfile = args.outputfile[0]

    # Open a new XML document in which target records will be stored
    global writer
    writer = XMLWriter(open(outputfile, 'wb'))

    # Record start processing time
    tstart = datetime.datetime.now()

    # Print header row in case log is opened as CSV
    print("Notice,Champ,Contenu du champ,Message")

    # Loop through the list of input files and call the mapping function
    for infile in args.inputfiles:
        inputfile = infile
        target = targetcode
        if mapping:
            print(
                f"----- Traitement du fichier {inputfile} avec mapping {target} -----"
            )
        else:
            print(
                f"----- Traitement du fichier {inputfile} sans mapping -----")

        # This applies the mapping function to each record in inputfile
        map_xml(record_map, inputfile)

        if targetcode == 'vddoc':
            # For vddoc, also look for vddoc-la
            target = 'vddoc-la'
            map_xml(record_map, inputfile)

    # Calculate the total time elapsed
    tdiff = datetime.datetime.now() - tstart

    # Close the output document
    writer.close()

    print(
        f'Routine terminée en {tdiff.total_seconds()} secondes. Résultat enregistré dans {outputfile}'
    )