コード例 #1
0
ファイル: title_loader.py プロジェクト: stephenndele/chronam
    def load_file(self, location, skip=0):
        """Load MARC title records from *location* (file path or URL).

        location: path or URL; bare paths are resolved to file: URLs.
        skip: number of leading records to log-and-skip before loading.
        """
        # urljoin leaves absolute URLs untouched; bare paths become file: URLs
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):
            # map_xml callback; invoked once per parsed record.

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        LOGGER.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        # leader byte 5 'd' marks a deleted record
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        # leader byte 6 'a' marks bibliographic material
                        self.load_bib(record)

            except Exception as e:
                LOGGER.error("unable to load: %s" % e)
                LOGGER.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        # NOTE(review): urllib2 is Python 2-only; this module presumably runs
        # under Python 2 -- confirm before porting.
        request = urllib2.Request(location, headers={'User-Agent': 'chronam-title-loader'})
        map_xml(load_record, urllib2.urlopen(request))
コード例 #2
0
    def load_file(self, filename, skip=0):
        """Load holdings records from the MARCXML file at *filename*.

        filename: path to the MARCXML file to read.
        skip: number of leading records to log-and-skip before loading.
        """
        t0 = time()
        times = []

        def _process_time():
            # Track elapsed time and report throughput every 1000 records.
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    LOGGER.info("skipped %i" % self.records_processed)
                    return
                # leader byte 6 'y' marks a holdings record
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            except Exception as e:
                LOGGER.error("unable to load record %s: %s" %
                             (self.records_processed, e))
                LOGGER.exception(e)
                self.errors += 1

            _process_time()

        # open() replaces the Python 2-only file() builtin, and the context
        # manager guarantees the handle is closed even if parsing fails.
        with open(filename, "rb") as fh:
            map_xml(load_xml_record, fh)
コード例 #3
0
ファイル: xml_test.py プロジェクト: digideskio/mx
 def test_map_xml(self):
     """map_xml should invoke the callback once per record (2 in the batch)."""
     self.seen = 0

     def count(record):
         self.seen += 1

     # close the gzip handle deterministically instead of leaking it
     with gzip.open('test/batch.xml.gz', 'rb') as fh:
         pymarc.map_xml(count, fh)
     self.assertEqual(2, self.seen)
コード例 #4
0
    def load_file(self, filename, skip=0):
        """Load holdings records from the MARCXML file at *filename*.

        filename: path to the MARCXML file to read.
        skip: number of leading records to log-and-skip before loading.
        """
        t0 = time()
        times = []

        def _process_time():
            # Track elapsed time and report throughput every 1000 records.
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                LOGGER.info("processed %sk records in %.2f seconds" %
                            (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    LOGGER.info("skipped %i" % self.records_processed)
                    return
                # leader byte 6 'y' marks a holdings record
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            except Exception as e:
                LOGGER.error("unable to load record %s: %s" %
                             (self.records_processed, e))
                LOGGER.exception(e)
                self.errors += 1

            _process_time()

        # open() replaces the Python 2-only file() builtin, and the context
        # manager guarantees the handle is closed even if parsing fails.
        with open(filename, "rb") as fh:
            map_xml(load_xml_record, fh)
コード例 #5
0
ファイル: xml_test.py プロジェクト: digideskio/mx
 def test_multi_map_xml(self):
     """map_xml should aggregate records across multiple input files."""
     self.seen = 0

     def count(record):
         self.seen += 1

     # context managers close both gzip handles instead of leaking them
     with gzip.open('test/batch.xml.gz', 'rb') as fh1, \
             gzip.open('test/batch.xml.gz', 'rb') as fh2:
         pymarc.map_xml(count, fh1, fh2)
     self.assertEqual(4, self.seen)
コード例 #6
0
ファイル: xml_test.py プロジェクト: vaneseltine/pymarc
    def test_map_xml(self):
        """Each record in test/batch.xml should trigger one callback call."""
        self.seen = 0

        def bump(record):
            self.seen += 1

        pymarc.map_xml(bump, 'test/batch.xml')
        self.assertEqual(2, self.seen)
コード例 #7
0
ファイル: test_xml.py プロジェクト: pejalptar/marc
    def test_multi_map_xml(self):
        """Passing two files to map_xml visits every record in both."""
        self.seen = 0

        def tally(record):
            self.seen += 1

        pymarc.map_xml(tally, "test/batch.xml", "test/batch.xml")
        self.assertEqual(4, self.seen)
コード例 #8
0
def marcxml2bioc(marcxmlFilename, biocFilename):
    """Convert a MARCXML file into a BioC XML document.

    Every MARC record is forwarded to writeMarcXMLRecordToBiocFile, which
    serializes it through the shared BioC writer.
    """
    with open(marcxmlFilename, 'rb') as inF, \
            bioc.BioCXMLDocumentWriter(biocFilename) as writer:

        def forward(record):
            writeMarcXMLRecordToBiocFile(record, writer)

        pymarc.map_xml(forward, inF)
コード例 #9
0
ファイル: utf8_test.py プロジェクト: symac/pymarc
    def test_read_utf8(self):
        """All 8 fields of the UTF-8 test record should be visible."""
        self.field_count = 0

        def process_xml(record):
            # count fields directly; the per-field loop variable was unused
            self.field_count += len(record.get_fields())

        pymarc.map_xml(process_xml, 'test/utf8.xml')
        self.assertEqual(self.field_count, 8)
コード例 #10
0
    def test_map_xml(self):
        """map_xml should invoke the callback once per record (2 in the batch)."""
        self.seen = 0

        def count(record):
            self.seen += 1

        # close the gzip handle deterministically instead of leaking it
        with gzip.open('test/batch.xml.gz', 'rb') as fh:
            pymarc.map_xml(count, fh)
        self.assertEqual(2, self.seen)
コード例 #11
0
ファイル: utf8_test.py プロジェクト: EdwardBetts/pymarc
    def test_read_utf8(self):
        """All 8 fields of the UTF-8 test record should be visible."""
        self.field_count = 0

        def process_xml(record):
            # count fields directly; the per-field loop variable was unused
            self.field_count += len(record.get_fields())

        pymarc.map_xml(process_xml, 'test/utf8.xml')
        self.assertEqual(self.field_count, 8)
コード例 #12
0
def main():
    '''Main method. Magic starts here.

    Parses CLI arguments, loads an optional instance-id map, then streams
    every MARC record (binary or MARCXML) through a HoldingsMarcProcessor
    whose results are written to <result_path>/folio_holdings.json.
    '''
    # TODO: räknare på allt!
    parser = argparse.ArgumentParser()
    parser.add_argument("records_file", help="path to marc records folder")
    parser.add_argument("result_path", help="path to Instance results file")
    parser.add_argument("okapi_url", help=("OKAPI base url"))
    parser.add_argument("tenant_id", help=("id of the FOLIO tenant."))
    parser.add_argument("username", help=("the api user"))
    parser.add_argument("password", help=("the api users password"))
    parser.add_argument("-holdings_id_dict_path", "-ih", help=(""))
    parser.add_argument("-instance_id_dict_path", "-i", help=(""))
    parser.add_argument("-postgres_dump",
                        "-p",
                        help=("results will be written out for Postgres"
                              "ingestion. Default is JSON"),
                        action="store_true")
    parser.add_argument("-marcxml",
                        "-x",
                        help=("DATA is in MARCXML format"),
                        action="store_true")
    parser.add_argument("-validate",
                        "-v",
                        help=("Validate JSON data against JSON Schema"),
                        action="store_true")
    args = parser.parse_args()
    print('\tresults file:\t', args.result_path)
    print("\tOkapi URL:\t", args.okapi_url)
    print("\tTenanti Id:\t", args.tenant_id)
    print("\tUsername:   \t", args.username)
    print("\tPassword:   \tSecret")
    print("\tinstance idMap will get stored at:\t", args.instance_id_dict_path)
    print("\thold idMap will get stored at:\t", args.holdings_id_dict_path)

    print("File to process: {}".format(args.records_file))
    folio_client = FolioClient(args.okapi_url, args.tenant_id, args.username,
                               args.password)
    instance_id_map = {}
    # -instance_id_dict_path is optional; the old code passed None straight
    # to open() and crashed when the flag was omitted.
    if args.instance_id_dict_path:
        with open(args.instance_id_dict_path, 'r') as json_file:
            instance_id_map = json.load(json_file)
    print("Number of instances in ID map: {}".format(len(instance_id_map)))
    default_mapper = HoldingsDefaultMapper(folio_client, instance_id_map)
    print("Starting")
    print("Rec./s\t\tTot. recs\t\t")

    with open(args.result_path + '/folio_holdings.json', 'w+') as results_file:
        processor = HoldingsMarcProcessor(default_mapper, folio_client,
                                          results_file, args)
        if args.marcxml:
            pymarc.map_xml(processor.process_record, args.records_file)
        else:
            with open(args.records_file, 'rb') as marc_file:
                pymarc.map_records(processor.process_record, marc_file)
    # wrap up
    print("Done. Wrapping up...")
    processor.wrap_up()
    print("done")
コード例 #13
0
    def test_multi_map_xml(self):
        """map_xml should aggregate records across multiple input files."""
        self.seen = 0

        def count(record):
            self.seen += 1

        # context managers close both gzip handles instead of leaking them
        with gzip.open('test/batch.xml.gz', 'rb') as fh1, \
                gzip.open('test/batch.xml.gz', 'rb') as fh2:
            pymarc.map_xml(count, fh1, fh2)
        self.assertEqual(4, self.seen)
コード例 #14
0
ファイル: utf8_test.py プロジェクト: EdwardBetts/pymarc
    def test_copy_utf8(self):
        """Copying the leader and fields of a UTF-8 record should write cleanly."""
        writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
        new_record = pymarc.Record(to_unicode=True, force_utf8=True)

        def process_xml(record):
            new_record.leader = record.leader

            for field in record.get_fields():
                new_record.add_field(field)

        pymarc.map_xml(process_xml, 'test/utf8.xml')

        try:
            writer.write(new_record)
        finally:
            # close before removing so the file is never deleted while still
            # open, even when write() raises (the old code skipped close())
            writer.close()
            os.remove('test/write-utf8-test.dat')
コード例 #15
0
def main():
    '''parses args pointing to record xml paths, specifies output paths, and applies "pull_arabic"'''
    logger = logging.getLogger(__name__)
    logger.info(
        'collecting arabic records and extracting parallel Arabic/Romanized representations'
    )

    parser = argparse.ArgumentParser()

    parser.add_argument('input_directory',
                        help='path to directory containing records')
    parser.add_argument(
        '-f',
        '--sub_directory_filter',
        help=
        'select a particular subdirectory inside a complex directory structure'
    )
    parser.add_argument(
        '-n',
        '--name',
        help='optional source name, otherwise take directory name')

    args = parser.parse_args()

    if args.name:
        name = args.name
    else:
        name = args.input_directory.split('/')[-1]
    logger.info(f'source: {name}')

    record_paths = get_xml_paths(args.input_directory,
                                 args.sub_directory_filter)

    writer = pymarc.XMLWriter(open(f'data/arabic_records/{name}.xml', 'wb'))
    try:
        for path in record_paths:
            # (the derived per-path xmlname local was unused and was dropped)
            pymarc.map_xml(lambda record: pull_arabic(record, writer=writer),
                           path)
    finally:
        # finalize the output document even if a record fails mid-run
        writer.close()

    global counter008
    global counter880
    logger.info(
        f'# of Arabic records ("ara" in language field 008): {counter008}')
コード例 #16
0
ファイル: utf8_test.py プロジェクト: symac/pymarc
    def test_copy_utf8(self):
        """Copying the leader and fields of a UTF-8 record should write cleanly."""
        writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
        new_record = pymarc.Record(to_unicode=True, force_utf8=True)

        def process_xml(record):
            new_record.leader = record.leader

            for field in record.get_fields():
                new_record.add_field(field)

        pymarc.map_xml(process_xml, 'test/utf8.xml')

        try:
            writer.write(new_record)
        finally:
            # close before removing so the file is never deleted while still
            # open, even when write() raises (the old code skipped close())
            writer.close()
            os.remove('test/write-utf8-test.dat')
コード例 #17
0
def main():
    """Summarize titles, abstracts, publication years and journals from a PubAg MarcXML dump."""
    parser = argparse.ArgumentParser(description='Tool to summarize data from PubAg')
    parser.add_argument('--i', type=str, required=True, help="PubAg MarcXML file")
    parser.add_argument('--oTitles', type=str, required=True, help="File containing titles")
    parser.add_argument('--oHasTitlesAndAbstracts', type=str, required=True, help="File containing counts of titles and abstracts")
    parser.add_argument('--oPubYear', type=str, required=True, help="File containing counts of publication years")
    parser.add_argument('--oJournals', type=str, required=True, help="File containing counts of journals")

    args = parser.parse_args()

    fTitles = codecs.open(args.oTitles, 'w', 'utf-8')
    fHasTitlesAndAbstracts = codecs.open(args.oHasTitlesAndAbstracts, 'w', 'utf-8')
    fPubYear = codecs.open(args.oPubYear, 'w', 'utf-8')
    fJournals = codecs.open(args.oJournals, 'w', 'utf-8')

    def summarizeMarcXMLFile_helper(record):
        # forward each record plus the four open output handles
        summarizeMarcXMLFile(record, fTitles, fHasTitlesAndAbstracts, fPubYear, fJournals)

    try:
        with open(args.i, 'rb') as inF:
            pymarc.map_xml(summarizeMarcXMLFile_helper, inF)
    finally:
        # the four codecs handles were previously leaked; close them so
        # buffered output is reliably flushed to disk
        for fh in (fTitles, fHasTitlesAndAbstracts, fPubYear, fJournals):
            fh.close()
コード例 #18
0
class TitleLoader(object):
    """Loads MARC title (bib) records, dispatching adds/deletes by leader byte."""

    def __init__(self):
        # run counters, readable by callers after load_file() completes
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        """Fetch MARCXML from *location* (path or URL) and load each record.

        skip: number of leading records to log-and-skip before loading.
        """
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)

            # "except Exception, e" is Python 2-only syntax; the "as" form
            # is accepted by Python 2.6+ and 3.x alike
            except Exception as e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        if 'https' in location:
            # carve the LCCN out of a chroniclingamerica.loc.gov URL and
            # fetch the MARCXML over HTTPS
            lccn = location[40:location.find('/marc.xml')]
            c = httplib.HTTPSConnection('chroniclingamerica.loc.gov')
            c.request("GET", '/lccn/%s/marc.xml' % lccn)
            xml = c.getresponse()
        else:
            xml = urllib2.urlopen(location)

        map_xml(load_record, xml)
コード例 #19
0
class HoldingLoader:
    """
    A loader for holdings data. Intended to be run after titles have been
    loaded with TitleLoader. This is necessary so that holdings records
    can be attached to the appropriate Title.
    """
    def __init__(self):
        # run counters, readable by callers after load_file() completes
        self.records_processed = 0
        self.missing_title = 0
        self.errors = 0
        self.skipped = 0

        self.holding_created = 0
        self.no_oclc = 0
        self.files_processed = 0

    def load_file(self, filename, skip=0):
        """Load holdings records from the MARCXML file at *filename*.

        skip: number of leading records to log-and-skip before loading.
        """
        t0 = time()
        times = []

        def _process_time():
            # record elapsed time and report throughput every 1000 records
            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        def load_xml_record(record):
            try:
                self.records_processed += 1
                if skip > self.records_processed:
                    _logger.info("skipped %i" % self.records_processed)
                    return
                # leader byte 6 'y' marks a holdings record
                if record.leader[6] == 'y':
                    self.load_xml_holding(record)

            # "except Exception, e" is Python 2-only syntax; the "as" form
            # is accepted by Python 2.6+ and 3.x alike
            except Exception as e:
                _logger.error("unable to load record %s: %s" %
                              (self.records_processed, e))
                _logger.exception(e)
                self.errors += 1

            _process_time()

        # open() replaces the Python 2-only file() builtin, and the context
        # manager closes the handle even if parsing fails
        with open(filename, "rb") as fh:
            map_xml(load_xml_record, fh)
コード例 #20
0
ファイル: title_loader.py プロジェクト: rugby110/chronam
class TitleLoader(object):
    """Loads MARC title (bib) records, dispatching adds/deletes by leader byte."""

    def __init__(self):
        # run counters, readable by callers after load_file() completes
        self.records_processed = 0
        self.records_created = 0
        self.records_updated = 0
        self.records_deleted = 0
        self.missing_lccns = 0
        self.errors = 0

    def load_file(self, location, skip=0):
        """Fetch MARCXML from *location* (path or URL) and load each record.

        skip: number of leading records to log-and-skip before loading.
        """
        location = urlparse.urljoin("file:", location)
        t0 = time()
        times = []

        def load_record(record):

            try:
                # we test to see if it is a record, b/c not
                # all values returned from OCLC are records.
                # if it is not a record, we just want to skip it.
                if record:
                    self.records_processed += 1
                    if skip > self.records_processed:
                        _logger.info("skipped %i" % self.records_processed)
                    elif record.leader[5] == 'd':
                        self.delete_bib(record)
                    elif record.leader[6] == 'a':
                        self.load_bib(record)

            # "except Exception, e" is Python 2-only syntax; the "as" form
            # is accepted by Python 2.6+ and 3.x alike
            except Exception as e:
                _logger.error("unable to load: %s" % e)
                _logger.exception(e)
                self.errors += 1

            seconds = time() - t0
            times.append(seconds)

            if self.records_processed % 1000 == 0:
                _logger.info("processed %sk records in %.2f seconds" %
                             (self.records_processed / 1000, seconds))

        map_xml(load_record, urllib2.urlopen(location))
コード例 #21
0
def parse_record(record, field=FIELD, subfield=SUBFIELD):
    """Collect (record id, value) pairs for the configured field/subfield.

    The id comes from 010$a, falling back to control field 004; results are
    appended to the module-level `values` list.
    """
    value = extract(record, field, subfield)
    if not value:
        return
    rec_id = extract(record, '010', 'a') or extract(record, '004')
    values.append((rec_id, value))


if __name__ == '__main__':
    # SOURCE may name a directory of MARCXML files or a single file
    if os.path.isdir(SOURCE):
        marc_xml_dir = os.listdir(SOURCE)
        for xml_file in marc_xml_dir:
            marc_file = os.path.join(SOURCE, xml_file)
            map_xml(parse_record, open(marc_file, 'r'))
    else:
        map_xml(parse_record, open(SOURCE, 'r'))

    # all values
    #for value in values:
    #    print str(value[0]), ',',value[1]

    total = len(values)

    #Get a sample of 50 random values for that field
    # NOTE(review): Python 2 print statement below, and this try block is
    # truncated in the snippet (no except/finally visible).
    for i in range(50):
        try:
            random_value = choice(values)
            values.remove(random_value)
            print ','.join([random_value[0], random_value[1]])
コード例 #22
0
# Options and arguments
__version__ = '0.0.1'
p = optparse.OptionParser(description='MARCXML Record Counter',
                          usage='usage: %prog [[opts]] [file1] .. [fileN]',
                          version='%prog ' + __version__)
p.add_option('--verbose',
             '-v',
             action='store_true',
             help="verbose, show additional informational messages")
(opt, args) = p.parse_args()

# Loop over all files specified counting records in each
# NOTE(review): Python 2 print statements below; `count` is presumably a
# map_xml callback defined elsewhere in this file that increments `seen`
# -- confirm, as it is not visible here.
total = 0
fmt = "%-7d %s"
for arg in args:
    seen = 0
    fh = 0
    if (re.search(r'\.gz$', arg)):
        if (opt.verbose):
            print "Reading %s as gzipped MARCXML" % (arg)
        fh = gzip.open(arg, 'rb')
    else:
        if (opt.verbose):
            print "Reading %s as MARCXML" % (arg)
        fh = open(arg, 'rb')
    pymarc.map_xml(count, fh)
    print fmt % (seen, arg)
    total += seen
if (len(args) > 1):
    print fmt % (total, 'TOTAL')
コード例 #23
0
#!/usr/bin/python

import os
import pymarc

# directory scanned for MARCXML files
path = './'


def get_place_of_pub(record):
    """Print a record's place-of-publication value and append it to out.txt."""
    try:
        # NOTE(review): both the field tag and subfield code are empty
        # strings here -- almost certainly placeholders for real values
        # (e.g. '260'/'a'); confirm the intended MARC field before use.
        place_of_pub = record['']['']
        print(place_of_pub)
        with open('out.txt', 'a') as f:
            print(place_of_pub, file=f)
    except Exception as e:
        print(e)


# apply the extractor to every .xml file in the working directory
for file in os.listdir(path):
    if file.endswith('.xml'):
        pymarc.map_xml(get_place_of_pub, path + file)
コード例 #24
0
ファイル: mx_count.py プロジェクト: zimeon/mx
# Options and arguments
__version__ = "0.0.1"
p = optparse.OptionParser(
    description="MARCXML Record Counter",
    usage="usage: %prog [[opts]] [file1] .. [fileN]",
    version="%prog " + __version__,
)
p.add_option("--verbose", "-v", action="store_true", help="verbose, show additional informational messages")
(opt, args) = p.parse_args()

# Loop over all files specified counting records in each
# NOTE(review): Python 2 print statements below; `count` is presumably a
# map_xml callback defined elsewhere in this file that increments `seen`
# -- confirm, as it is not visible here.
total = 0
fmt = "%-7d %s"
for arg in args:
    seen = 0
    fh = 0
    if re.search(r"\.gz$", arg):
        if opt.verbose:
            print "Reading %s as gzipped MARCXML" % (arg)
        fh = gzip.open(arg, "rb")
    else:
        if opt.verbose:
            print "Reading %s as MARCXML" % (arg)
        fh = open(arg, "rb")
    pymarc.map_xml(count, fh)
    print fmt % (seen, arg)
    total += seen
if len(args) > 1:
    print fmt % (total, "TOTAL")
コード例 #25
0
def xml_to_mrc(path_in, path_out):
    """Convert the MARCXML file at *path_in* to binary MARC at *path_out*."""
    writer = pymarc.MARCWriter(open(path_out, 'wb'))
    try:
        # map_xml returns None, so the old `records = ...` binding was dead
        pymarc.map_xml(writer.write, path_in)
    finally:
        # release the output handle even if parsing fails
        writer.close()
コード例 #26
0
ファイル: extract_marc_values.py プロジェクト: edsu/open-oni
# accumulator shared with the __main__ driver below
values = []


def parse_record(record, field=FIELD, subfield=SUBFIELD):
    """Collect (record id, value) pairs for the configured field/subfield.

    The id comes from 010$a, falling back to control field 004; results are
    appended to the module-level `values` list.
    """
    value = extract(record, field, subfield)
    if not value:
        return
    rec_id = extract(record, '010', 'a') or extract(record, '004')
    values.append((rec_id, value))

if __name__ == '__main__':
    # SOURCE may name a directory of MARCXML files or a single file
    if os.path.isdir(SOURCE):
        marc_xml_dir = os.listdir(SOURCE)
        for xml_file in marc_xml_dir:
            marc_file = os.path.join(SOURCE, xml_file)
            map_xml(parse_record, open(marc_file, 'r'))
    else:
        map_xml(parse_record, open(SOURCE, 'r'))

    # all values
    #for value in values:
    #    print str(value[0]), ',',value[1]

    total = len(values)

    #Get a sample of 50 random values for that field
    # NOTE(review): Python 2 print statement below, and this try block is
    # truncated in the snippet (no except/finally visible).
    for i in range(50):
        try:
            random_value = choice(values) 
            values.remove(random_value)
            print ','.join([random_value[0], random_value[1]])
コード例 #27
0
ファイル: xml_test.py プロジェクト: Gluejar/pymarc
 def test_multi_map_xml(self):
     """Two copies of batch.xml should yield twice the record count."""
     self.seen = 0

     def tally(record):
         self.seen += 1

     pymarc.map_xml(tally, 'test/batch.xml', 'test/batch.xml')
     self.assertEqual(4, self.seen)
コード例 #28
0
def xml_to_mrk(path_in, path_out):
    """Convert the MARCXML file at *path_in* to text-mode MARC (.mrk) at *path_out*."""
    writer = pymarc.TextWriter(io.open(path_out, 'wt', encoding="utf-8"))
    try:
        # map_xml returns None, so the old `records = ...` binding was dead
        pymarc.map_xml(writer.write, path_in)
    finally:
        # guarantee the writer is closed/flushed even if parsing fails
        writer.close()
コード例 #29
0
           pass
    except Exception as e:
        print e

# List of transcript filenames with everything but digits stripped from names
l = [strip_call_no(f) for f in os.listdir(path_to_transcripts) if f.startswith('Ms')]

# Set of duplicates found in the above list
s = set([x for x in l if l.count(x) > 1])

# Dictionary mapping digit-only transcript filename to full filename (if not a duplicate)
d = {strip_call_no(f): f for f in os.listdir(path_to_transcripts) if f.startswith('Ms') and strip_call_no(f) not in s}

# Dictionary mapping digit-only MARC Call No. to MARC filename.
fd = {}

# scratch slot presumably written by get_call_no (defined elsewhere) while
# map_xml runs -- confirm
tempcall = ''

# NOTE(review): Python 2 print statements; get_call_no, check_call_no,
# matches, doubles and csv_file are defined elsewhere in this file.
print 'Looking for matching transcription files ...'

for file in os.listdir(path_to_marcs):
    if file.endswith('.xml'):
        pymarc.map_xml(get_call_no, path_to_marcs + file)
        fd[tempcall] = file
        pymarc.map_xml(check_call_no, path_to_marcs + file)

print 'There are '  + str(matches) + ' matching records.'
print 'There are ' + str(doubles) + ' records with potential matches.'

csv_file.close()
コード例 #30
0
def readMARC(filename, parseFunction, estimated=1362493):
    """Stream a gzipped MARCXML file and parse every record.

    filename: path to a gzip-compressed MARCXML file.
    parseFunction: callable applied to each record's flattened
        {tag: content-or-subfield-dict} data.
    estimated: expected record count, used only to size the progress bar.

    Returns (entries, allKeys, allKeysExample, allKeysListCount): the list of
    parseFunction results plus per-tag statistics accumulated over the run.
    """
    entry = None
    entries = []
    thefield = None
    # allKeys: tag -> number of records containing the tag
    allKeys = {}
    # allKeysExample: tag -> first content value seen for the tag
    allKeysExample = {}
    # allKeysListCount: tag -> total occurrences across all records
    allKeysListCount = {}
    pbar = tqdm(total=estimated)
    with gzip.open(filename, "r") as fd:

        def print_title(r):
            # map_xml callback: flatten one record into entryData and fold
            # its per-tag counts into the outer accumulators.
            nonlocal entry, entries, thefield, allKeys, allKeysExample, allKeysListCount, pbar
            pbar.update(1)
            entry = r
            # print(r["961"].format_field())
            # fout.write(r.title()+"\n")
            keysSet = set()
            keysCount = {}
            keysContent = {}
            entryData = {}

            for field in r.fields:
                thefield = field
                tag = field.tag
                content = field.format_field()
                if (tag not in keysSet):
                    keysSet.add(tag)
                    keysCount[tag] = 0
                    keysContent[tag] = content
                keysCount[tag] += 1
                #           fout.write("\t%s: %s\n"%(tag,content));
                try:
                    subfields = {}
                    subfieldsLetter = {}
                    # subfields is a flat [code, value, code, value, ...]
                    # list, hence the stride-2 walk below
                    if (len(field.subfields) > 0):
                        for subfieldIndex in range(0, len(field.subfields), 2):
                            subfieldName = field.subfields[subfieldIndex]
                            subfieldValue = field.subfields[subfieldIndex + 1]
                            subTag = tag + "." + subfieldName
                            if (subTag not in keysSet):
                                keysSet.add(subTag)
                                keysCount[subTag] = 0
                                keysContent[subTag] = subfieldValue
                            keysCount[subTag] += 1
                            subfields[subTag] = subfieldValue
                            subfieldsLetter[subfieldName] = subfieldValue
                        # repeated tags are promoted to a list of values
                        if (tag in entryData):
                            if (not isinstance(entryData[tag], list)):
                                entryData[tag] = [entryData[tag]]
                            entryData[tag].append(subfieldsLetter)
                        else:
                            entryData[tag] = subfieldsLetter
    #               fout.write("\t\t%s: %s\n"%(subfieldName,subfieldValue));
                except AttributeError as error:
                    # presumably control fields without a .subfields
                    # attribute land here -- confirm
                    pass

                # no subfields: store the formatted content directly
                if (len(subfields) == 0):
                    if (tag in entryData):
                        if (not isinstance(entryData[tag], list)):
                            entryData[tag] = [entryData[tag]]
                        entryData[tag].append(content)
                    else:
                        entryData[tag] = content
    #       fout.write("-----\n");

    #       fout.flush();
    #       entries.append(entry);
            # merge this record's per-tag counts into the run totals
            for key in keysSet:
                if (key not in allKeys):
                    allKeys[key] = 0
                    allKeysExample[key] = keysContent[key]
                    allKeysListCount[key] = 0
                allKeysListCount[key] += keysCount[key]
                allKeys[key] += 1
            processedEntry = parseFunction(entryData)
            #       processedEntry["raw"] = entryData;
            entries.append(processedEntry)


#       raise ValueError()

        map_xml(print_title, fd)
    return (entries, allKeys, allKeysExample, allKeysListCount)
コード例 #31
0
ファイル: mx_grep_oclc.py プロジェクト: zimeon/mx
         else:
             logging.warning("Cannot tell file type, defaulting to MARCXML");
             is_xml = true
     if (is_xml):
         if (re.search(r'\.gz$',arg)):
             logging.warning("#Reading %s as gzipped MARCXML" % (arg))
             if (opt.verbose):
                 print "#Reading %s as gzipped MARCXML" % (arg)
             fh = gzip.open(arg,'rb')
         else:
             logging.warning("#Reading %s as MARCXML" % (arg))
             if (opt.verbose):
                 print "#Reading %s as MARCXML" % (arg)
             fh = open(arg,'rb')
         reader = pymarc.MARCReader(fh)
         pymarc.map_xml(mg.grep, fh)
     else:
         if (re.search(r'\.gz$',arg)):
             logging.warning("#Reading %s as gzipped MARC21" % (arg))
             if (opt.verbose):
                 print "#Reading %s as gzipped MARC21" % (arg)
             fh = gzip.open(arg,'rb')
         else:
             logging.warning("#Reading %s as MARC21" % (arg))
             if (opt.verbose):
                 print "#Reading %s as MARC21" % (arg)
             fh = open(arg,'rb')
         reader = pymarc.MARCReader(fh,to_unicode=True)
         pymarc.map_records(mg.grep, fh)
 except Exception as e:
     # Catch any error, log it and move on to the next file.
コード例 #32
0
    try:
        subjectHeading647a = record['647']['a']
        booksxml.write(subjectHeading647a + ',')
    except:

        booksxml.write(',')

    try:
        subjectHeading648a = record['648']['a']
        booksxml.write(subjectHeading648a + ',')
    except:
        booksxml.write(',')

    try:
        subjectHeading651a = record['651']['a']
        booksxml.write(subjectHeading651a + ',')
    except:
        booksxml.write(',')

    #print('</record>')
    booksxml.write('\n')


#print('<collection>')
# Emit the CSV header row, then stream every record in `xml` through
# getMarcInfo (defined elsewhere), which appends one CSV line per record
# to the booksxml output handle.
booksxml.write(
    'title,title2,author,authorDates,isbn,LCCN,LCCN2,dewey,placePub,publisher,pubDate,extent,itemDetails,dimensions,generalNote,summary,subjectHeading650a,subjectHeading650ax,subjectHeading650ae,subjectHeading650av,subjectHeading650ay,subjectHeading650az,subjectHeading650ayz,subjectHeading650azx,subjectHeading650ayx,subjectHeading600a,subjectHeading600ad,subjectHeading610a,subjectHeading647a,subjectHeading648a,subjectHeading651a\n'
)
pymarc.map_xml(getMarcInfo, xml)
#print('</collection>')
booksxml.close()
コード例 #33
0
def main():
    """Process and map classification authority records for BCUR.

    Parses CLI options (input files, output file, target map code), opens one
    XML output document, then applies record_map (defined elsewhere) to every
    record of every input file via map_xml. Communicates with record_map
    through the module-level globals inputfile, target, mapping and writer.
    """
    global inputfile, target, mapping
    parser = argparse.ArgumentParser(
        description='Process and map classification authority records for BCUR.'
    )
    parser.add_argument('-i',
                        '--inputfiles',
                        type=str,
                        nargs='+',
                        help='one or more file(s) to be processed',
                        required=True)
    parser.add_argument('-o',
                        '--outputfile',
                        type=str,
                        nargs=1,
                        help='name of the output file',
                        required=True)
    parser.add_argument('-m',
                        '--map',
                        type=str,
                        nargs=1,
                        help='map target code',
                        required=True,
                        choices=valid_targets)

    args = parser.parse_args()

    targetcode = args.map[0]

    # For musi and musg records, found records are copied as-is, no field mapping.
    # NOTE(review): for other targets `mapping` keeps whatever module-level
    # value it had -- presumably True; confirm the global's default.
    if targetcode in ('musi', 'musg'):
        mapping = False

    outputfile = args.outputfile[0]

    # Open a new XML document in which target records will be stored
    global writer
    writer = XMLWriter(open(outputfile, 'wb'))

    # Record start processing time
    tstart = datetime.datetime.now()

    # Print header row in case log is opened as CSV
    print("Notice,Champ,Contenu du champ,Message")

    # Loop through the list of input files and call the mapping function
    for infile in args.inputfiles:
        inputfile = infile
        target = targetcode
        if mapping:
            print(
                f"----- Traitement du fichier {inputfile} avec mapping {target} -----"
            )
        else:
            print(
                f"----- Traitement du fichier {inputfile} sans mapping -----")

        # This applies the mapping function to each record in inputfile
        map_xml(record_map, inputfile)

        if targetcode == 'vddoc':
            # For vddoc, also look for vddoc-la
            target = 'vddoc-la'
            map_xml(record_map, inputfile)

    # Calculate the total time elapsed
    tdiff = datetime.datetime.now() - tstart

    # Close the output document
    writer.close()

    print(
        f'Routine terminée en {tdiff.total_seconds()} secondes. Résultat enregistré dans {outputfile}'
    )