def test_xml(self):
    # read in xml to a record
    record1 = pymarc.parse_xml_to_array("test/batch.xml")[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record2 = pymarc.parse_xml_to_array(BytesIO(xml))[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(
                field1[pos].get_subfields(), field2[pos].get_subfields()
            )
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def test_xml(self):
    # read in xml to a record
    fh = gzip.open('test/batch.xml.gz', 'rb')
    record1 = pymarc.parse_xml_to_array(fh)[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record2 = pymarc.parse_xml_to_array(six.BytesIO(xml))[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def test_xml_sort(self):
    # read in xml to a record
    record1 = pymarc.parse_xml_to_array('test/order.xml')[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record1 = pymarc.parse_xml_to_array(StringIO(xml))[0]
    # parse xml in order
    record2 = pymarc.parse_xml_to_array('test/order_ok.xml')[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
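# The field-by-field comparison loop above is repeated verbatim in several
# of these tests; it could be factored into a shared helper. A minimal
# sketch -- assert_records_equal is a hypothetical name, not part of pymarc:

def assert_records_equal(testcase, record1, record2):
    """Assert that two pymarc.Record objects carry identical data."""
    testcase.assertEqual(record1.leader, record2.leader)
    fields1 = record1.get_fields()
    fields2 = record2.get_fields()
    testcase.assertEqual(len(fields1), len(fields2))
    for f1, f2 in zip(fields1, fields2):
        testcase.assertEqual(f1.tag, f2.tag)
        if f1.is_control_field():
            testcase.assertEqual(f1.data, f2.data)
        else:
            testcase.assertEqual(f1.get_subfields(), f2.get_subfields())
            testcase.assertEqual(f1.indicators, f2.indicators)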
def test_load_title_missing_language(self):
    filename = abs_filename('./test-data/title-missing-language.xml')
    marc = pymarc.parse_xml_to_array(filename)[0]
    loader = TitleLoader()
    title = Title.objects.get(lccn='sn83030846')
    self.assertRaises(core.models.Language.DoesNotExist,
                      loader._extract_languages, marc, title)
def raw(marcfile, edition):
    record = pymarc.parse_xml_to_array(marcfile)[0]
    # iterate over a copy of the field list: removing fields while
    # iterating over the record itself would skip entries
    for field in list(record.get_fields()):
        if field.tag in ('001', '003', '005', '006', '007', '856') or int(field.tag) > 900:
            record.remove_field(field)
    add_stuff(record)
    return record
def xml_to_binary(rec, writer):
    # parse records
    records = pymarc.parse_xml_to_array(rec)
    for r in records:
        # strip the tags
        r.remove_fields('938')
        # find subjects
        if not r.subjects():
            return False
        # count fields
        fields = r.get_fields()
        if len(fields) >= 9:
            # determine the format
            format = get_format(r)
            # adds the GMD
            if format:
                gmd = '[' + format + ']'
                if r['245']['h']:
                    r['245']['h'] = gmd
                else:
                    r['245'].add_subfield('h', gmd)
            # writes the record
            writer.write(r)
            return True
        else:
            return False
def generate_marcfiles(reverse_order=False):
    docfiles = sorted([x for x in os.listdir(DATAROOT) if x.startswith("19C_0")])
    if reverse_order:
        docfiles.reverse()
    for docfile in docfiles:
        docfilepath = os.path.join(DATAROOT, docfile)
        yield (docfilepath, pymarc.parse_xml_to_array(docfilepath))
def search(self, query, page=1, size=20):
    try:
        # The "sc" parameter (split by collection) is used to provide
        # search results consistent with the ones from the CDS website
        req = requests.get(
            self.baseURL + "/search",
            params={
                "p": query,
                "of": "xm",
                "rg": size,
                "jrec": int(size) * (int(page) - 1) + 1,
            },
        )
    except Exception:
        raise ServiceUnavailable("Cannot perform search")

    if not req.ok:
        raise ServiceUnavailable(f"Search failed with error code {req.status_code}")

    # Parse MARC XML
    records = pymarc.parse_xml_to_array(io.BytesIO(req.content))
    results = []
    for record in records:
        results.append(self.parse_record(record))

    if len(records) > 0:
        # Get total number of hits
        pattern = "<!-- Search-Engine-Total-Number-Of-Results:(.*?)-->"
        total_num_hits = int(re.search(pattern, req.text).group(1))
    else:
        total_num_hits = 0

    return {"total_num_hits": total_num_hits, "results": results}
def readMARCfromURL(url):
    # r = requests.get(url)
    # print(r.text[0:100])
    reader = parse_xml_to_array('sources.xml')
    record = reader[0]
    for field in record.get_fields('031'):
        print(field['d'], field['p'])
def test_marc_to_concept(self):
    r = parse_xml_to_array('test_data/record.xml')[0]
    c = create_concept(r)
    self.assertEqual(c.lccn, 'sh00000011')
    self.assertEqual(c.pref_label, 'ActionScript (Computer program language)')
    self.assertEqual(c.modified, datetime(2007, 10, 12, 7, 53, 10))
    self.assertEqual(c.created, date(2000, 9, 27))
    self.assertEqual(c.heading_tag, '150')
def seekBookbyISBN(isbn, library="helmet"):
    library = builcodes[library]
    result = urllib.request.urlopen(url + isbn + full + FLTR + library).read()
    result = json.loads(result)
    result = result.get('records')
    resultSet = result[0]
    xmlmarc = result[0]['fullRecord']

    # todo: unnecessary bit -- write the MARCXML out so pymarc can read it
    # back in (open in text mode with an explicit encoding; writing
    # encoded bytes to a "w"-mode file would fail on Python 3)
    with open("kirja.xml", "w", encoding="UTF-8") as text_file:
        text_file.write(xmlmarc)

    from pymarc import parse_xml_to_array
    reader = parse_xml_to_array("kirja.xml")

    details = {}
    isbn = ""
    isbns = []
    for record in reader:
        for f in record.get_fields('020'):
            isbns.append(f['a'].replace("-", ""))
        try:
            isbn = record['020']['a']
        except TypeError:
            pass
        if len(isbns) > 1:
            if len(isbns[0]) > len(isbns[1]):
                isbn = isbns[0]
            if len(isbns[1]) > len(isbns[0]):
                isbn = isbns[1]
        title = getMarcValue(record, 'title')
        author = getMarcValue(record, 'author')
        publisher = getMarcValue(record, 'publisher').replace(",", "")
        pubyear = getMarcValue(record, 'pubyear').replace(".", "")
    return (title, author, isbn, publisher, pubyear)
def marc(self):
    # use pymarc to read the marcxml to make fields available
    if os.path.exists(self.marc_path):
        # with codecs.open(self.marc_path, 'r', "utf-8") as marcdata:
        with open(self.marc_path, 'r') as marcdata:
            # reader = MARCReader(marcdata, utf8_handling='ignore')
            return pymarc.parse_xml_to_array(marcdata)[0]
    else:
        print "Check if file %s exists or your mount connection" % self.marc_path
def load(self, mrcfile: str = '', filetype: str = 'mrc') -> None:
    if filetype == "mrc":
        with open(mrcfile, 'rb') as fh:
            reader = MARCReader(fh)
            for record in reader:
                self.records.append(record)
    elif filetype == "xml":
        self.records = parse_xml_to_array(mrcfile)
    return None
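# MARCReader can yield None for records it fails to parse (depending on
# pymarc version and error-handling settings), so a defensive loader might
# filter those out. A minimal standalone sketch; the function name and
# file name are assumptions:
from pymarc import MARCReader

def load_mrc_records(path):
    """Read binary MARC and drop any records the reader could not parse."""
    with open(path, 'rb') as fh:
        return [rec for rec in MARCReader(fh) if rec is not None]

records = load_mrc_records("records.mrc")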
def _record(self):
    if self._the_record:
        the_record = self._the_record
    else:
        the_record = pymarc.parse_xml_to_array(StringIO(self.guts))[0]
        for field in the_record.get_fields('856'):
            the_record.remove_field(field)
        self._the_record = the_record
    return the_record
def get_marc(self, url):
    xml = self.get_page(url)
    f = tempfile.NamedTemporaryFile(delete=False)
    f.write(xml)
    f.close()
    records = pymarc.parse_xml_to_array(f.name)
    os.unlink(f.name)
    return records
def marcxml2array(marcxml):
    """Serialize marcxml into a pymarc record array.

    args:
        marcxml: xml

    returns:
        records: pymarc records array
    """
    records = BytesIO(ET.tostring(marcxml, encoding="utf-8"))
    return parse_xml_to_array(records)
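# A usage sketch for marcxml2array() above, assuming a MARCXML collection
# parsed with xml.etree.ElementTree (the file name is an assumption):
import xml.etree.ElementTree as ET

tree = ET.parse("records.xml")
for record in marcxml2array(tree.getroot()):
    print(record.title())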
def __init__(self, record_element, strip_ns=True):
    super(SickleMARCRecord, self).__init__(record_element, strip_ns=strip_ns)
    if not self.deleted:
        marc_file = tempfile.TemporaryFile()
        metadata = self.xml.find(".//" + self._oai_namespace + "metadata/")
        marc_file.write(etree.tostring(metadata, encoding='utf-8'))
        marc_file.seek(0)
        records = parse_xml_to_array(marc_file)
        self.metadata = records[0].as_dict()
def _record(self):
    if self._the_record:
        the_record = self._the_record
    else:
        the_record = pymarc.parse_xml_to_array(
            BytesIO(bytes(self.guts, 'utf-8')))[0]
        for field in the_record.get_fields('856'):
            the_record.remove_field(field)
        self._the_record = the_record
    return the_record
def set_search_strings(cls, f, *args):
    '''
    Function: set_search_strings

    Purpose: the function reads a file and generates a list of search
        strings.

    Parameters:
        f = the path of the file to open that contains the search strings.
            This can be a 'txt', 'csv' or marcxml 'xml' file. The file
            extension is used to determine how to process the file.
        t_a (optional) = the type of file string to create. This is most
            relevant for marcxml files. The file string can be constructed
            to include just the title or the title and author. Appropriate
            values include: 'title' and 'title_author'

    Example:
        f = '<path/to/file>'
        set_search_strings(f, 'title_author')
    '''
    import pymarc
    import marcx
    import pandas as pd
    import io

    if len(args) > 0:
        t_a = args[0]
    else:
        t_a = None

    if f[-3:] == 'csv':
        cls.search_strings = open(f).read().splitlines()
    if f[-3:] == 'txt':
        cls.search_strings = open(f).read().splitlines()
    if f[-3:] == 'xml':
        records = pymarc.parse_xml_to_array(io.open(f, mode='r', encoding='utf-8'))
        df_search = pd.DataFrame()
        for rec in records:
            d = {}
            rec = marcx.FatRecord.from_record(rec)
            try:
                d['author'] = rec['100']['a']
            except Exception as e:
                d['author'] = ''
            d['title'] = rec.title()
            d['mmsid'] = rec['001'].data
            d['title_author'] = rec.title()
            df_search = df_search.append(d, ignore_index=True)
        if t_a is None:
            t_a = 'title'
        if t_a == 'title_author':
            cls.search_strings = df_search['title_author']
        else:
            cls.search_strings = df_search['title']
def handle(self, **options):
    for title in Title.objects.filter(urls__value__icontains='chroniclingamerica'):
        record = pymarc.parse_xml_to_array(StringIO(title.marc.xml))[0]
        if record['245']['h'] == '[electronic resource].':
            if options['pretend']:
                print title
            else:
                LOGGER.info("deleting %s [%s] from solr index" % (title, title.lccn))
                index.delete_title(title)
                LOGGER.info("purging %s [%s]" % (title, title.lccn))
                title.delete()
    if not options['pretend']:
        index.commit()
def convert_xml_to_marc(hostenv):
    """Convert MARC XML to MARC formatted .mrc file"""
    for marcfilename in os.listdir(app_configs[hostenv]['marc_dir']):
        if marcfilename[-3:] == 'xml':
            newfilename = re.sub("-orig.xml", "-marc.mrc", marcfilename)
            logging.info("Converting to MARC %s", marcfilename)
            marc_recs_out = pymarc.MARCWriter(
                open(app_configs[hostenv]['marc_dir'] + "/" + newfilename, 'wb'))
            marc_xml_array = pymarc.parse_xml_to_array(
                app_configs[hostenv]['marc_dir'] + marcfilename)
            for rec in marc_xml_array:
                marc_recs_out.write(rec)
            marc_recs_out.close()
def handle(self, **options):
    for title in Title.objects.filter(urls__value__icontains='chroniclingamerica'):
        record = pymarc.parse_xml_to_array(StringIO(title.marc.xml))[0]
        if record['245']['h'] == '[electronic resource].':
            if options['pretend']:
                self.stdout.write(title)
            else:
                self.stdout.write("deleting %s [%s] from solr index" % (title, title.lccn))
                index.delete_title(title)
                self.stdout.write("purging %s [%s]" % (title, title.lccn))
                title.delete()
    if not options['pretend']:
        index.commit()
def read_marc(filename):
    """Read MARC record from filename.

    Takes just the first record if there are multiple ones.
    """
    logging.info("Reading %s" % (filename))
    records = parse_xml_to_array(filename)
    if len(records) == 0:
        logging.error("No records in %s, aborting" % (filename))
        raise Exception("No records in %s, aborting" % (filename))
    elif len(records) > 1:
        logging.info("Have taken first of %d records from %s" % (len(records), filename))
    return records[0]
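# Hypothetical call of read_marc() above; the file name is an assumption:
record = read_marc("record.xml")
print(record.leader)
print(record.title())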
def __get_records(self, acnr):
    """Get all records containing acnr and return a list of
    pymarc.Record objects."""
    # Namespaces for the responses from Alma
    ns = {'marc': 'http://www.loc.gov/MARC21/slim',
          'srw': 'http://www.loc.gov/zing/srw/'}

    # Template for MARC-XML
    marc_template = """<marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"/>"""
    xml_records = ET.fromstring(marc_template)

    # get the records from Alma
    offset = 1
    sru_request = "https://obv-at-obvsg.alma.exlibrisgroup.com/view/sru/43ACC_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query=other_system_number={acnr}&startRecord={offset}&maximumRecords=50"
    # sru_request = "https://obv-at-obvsg.alma.exlibrisgroup.com/view/sru/43ACC_UBG?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query=other_system_number={acnr}&startRecord={offset}&maximumRecords=50"

    # get the first 50 records
    res = requests.get(sru_request.format(acnr=acnr, offset=offset))

    # check how many records there are. If there are none, return None
    res_xml = ET.fromstring(res.text)
    numberOfRecords = int(res_xml.find("srw:numberOfRecords", ns).text)
    if numberOfRecords == 0:
        return None

    # add the records to the record list
    for record in res_xml.findall('.//marc:record', ns):
        xml_records.append(record)

    # repeat the request with increasing offset to get all records
    while offset < numberOfRecords - 50:
        offset += 50
        res = requests.get(sru_request.format(acnr=acnr, offset=offset))
        res_xml = ET.fromstring(res.text)
        # add the records to the record list
        for record in res_xml.findall('.//marc:record', ns):
            xml_records.append(record)

    # convert xml element to file-like object, so pymarc can parse it
    marcfile = io.StringIO(ET.tostring(xml_records, encoding="unicode"))

    # parse the xml to a pymarc.Reader and make a list of pymarc.Records
    with marcfile as marcfile:
        reader = pymarc.parse_xml_to_array(marcfile)
        pymarc_records = []
        for record in reader:
            pymarc_records.append(record)
    return pymarc_records
def default_map(self, file_name, xpath):
    ns = {'marc': 'http://www.loc.gov/MARC21/slim',
          'oai': 'http://www.openarchives.org/OAI/2.0/'}
    file_path = r'./tests/test_data/default/{}'.format(file_name)
    record = pymarc.parse_xml_to_array(file_path)[0]
    result = self.mapper.parse_bib(record, "source")
    if self.config.validate_json_schema:
        validate(result, self.instance_schema)
    root = etree.parse(file_path)
    data = str('')
    for element in root.xpath(xpath, namespaces=ns):
        data = ' '.join(
            [data, str(etree.tostring(element, pretty_print=True), 'utf-8')])
    return [result, data]
def addLBD(config, oclcnumber, note):
    oauth_session = config.get('oauth-session')
    # create the LBD
    record = Record(leader='00000n a2200000 4500')
    record.add_field(Field(tag='004', data=oclcnumber))
    record.add_field(
        Field(indicators=[' ', ' '], tag='500', subfields=['a', note]),
        Field(indicators=[' ', ' '], tag='935', subfields=['a', str(time.time())]),
        Field(indicators=[' ', ' '], tag='940', subfields=['a', config.get('oclcSymbol')]))
    input = pymarc.record_to_xml(record).decode("utf-8")

    try:
        r = oauth_session.post(
            config.get('metadata_service_url') + "/lbd/data",
            data=input,
            headers={
                "Accept": 'application/atom+xml;content="application/vnd.oclc.marc21+xml"',
                "Content-Type": "application/vnd.oclc.marc21+xml"
            })
        r.raise_for_status()
        try:
            result = ElementTree.fromstring(r.content)
            ns = {
                'atom': 'http://www.w3.org/2005/Atom',
                'wc': 'http://worldcat.org/rb'
            }
            marcNode = result.findall('atom:content/wc:response', ns)[0].getchildren()[0]
            # need to get this XML section out as a string and into a
            # file-like object
            marcData = StringIO(
                ElementTree.tostring(marcNode, encoding='unicode', method='xml'))
            marcRecords = pymarc.parse_xml_to_array(marcData)
            # pull out the LBD accession number
            print(marcRecords)
            accessionNumber = marcRecords[0]['001'].value()
            status = "success"
        except xml.etree.ElementTree.ParseError as err:
            accessionNumber = ""
            status = "failed XML parsing issue"
            print(err)
    except requests.exceptions.HTTPError as err:
        accessionNumber = ""
        status = "failed"
    return pd.Series([oclcnumber, accessionNumber, status])
def batch_to_list(infile):
    """Take a filename of a marc-file (binary or xml)
    and return a list of pymarc.Record objects."""
    with open(infile, "rb") as fh:
        # check if it's xml or binary
        firstline = fh.readline()
        # set the pointer back to the beginning
        fh.seek(0)
        if b"<?xml version" in firstline:
            reader = pymarc.parse_xml_to_array(fh)
        else:
            # default: utf8_handling="strict"
            reader = pymarc.MARCReader(fh)
        record_list = list(reader)
    return record_list
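# The sniffing above works because MARCXML files normally begin with an XML
# declaration, while binary MARC begins immediately with the record length.
# Files lacking an "<?xml version" declaration would fall through to
# MARCReader, so the check is a heuristic. A usage sketch (file names are
# assumptions):
records = batch_to_list("batch.xml")   # detected as MARCXML
records = batch_to_list("batch.mrc")   # read with pymarc.MARCReader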
def download_infoscience_labs():
    INFOSCIENCE_API_KEY = 'Token ' + read_api_key()
    labs = dict()
    search_url = "https://infoscience.epfl.ch/api/v1/search?p=&cc=Lab&c=Lab&format=files"
    # search_url = "https://infoscience.epfl.ch/api/v1/search?p=Shchutska&cc=People&c=People&format=files"
    headers = {'User-Agent': 'Custom FORCE_SCRIPT_NAME = None',
               'Authorization': INFOSCIENCE_API_KEY}
    r = requests.get(search_url, headers=headers, stream=True)
    data = r.content
    dump = open('dump.dat', 'wb')
    dump.write(data)
    dump.close()
    z = zipfile.ZipFile(io.BytesIO(data))
    for x in z.infolist():
        if x.filename.find('metadata.xml') > 0:
            recid = ""
            lab_code = ""
            lab_uid = ""
            liaison_librarian = "Unknown"
            infoscience_manager = ""
            metadata = z.read(x.filename)
            pseudofile = io.StringIO(metadata.decode('utf-8'))
            records = pymarc.parse_xml_to_array(pseudofile)
            for field in records[0].fields:
                if field.tag == '001':
                    recid = field.data
                if field.tag > '010':
                    codes = field.subfields[0:len(field.subfields):2]
                    values = field.subfields[1:len(field.subfields):2]
                    field_subfields = dict(zip(codes, values))
                    if field.tag == '195':
                        lab_code = field_subfields['a']
                    if field.tag == '371':
                        lab_uid = field_subfields['g']
                    if field.tag == '270':
                        try:
                            infoscience_manager = field_subfields['m']
                        except KeyError:
                            print('manager', field)
                    if field.tag == '271':
                        try:
                            liaison_librarian = field_subfields['p']
                        except KeyError:
                            print('liaison', field)
            labs[lab_code] = {'uid': lab_uid, 'recid': recid,
                              'manager': infoscience_manager,
                              'liaison': liaison_librarian}
    return labs
def extract_lines(record_xml):
    index = 0
    parsed_all = []
    sourcename = record_xml.split('/')[-1].replace('.xml', '')
    records = pymarc.parse_xml_to_array(record_xml)
    for r in records:
        index += 1
        parsed_record = parse_marc(r, source=sourcename, index=index)
        parsed_all.append(parsed_record)
        if index % 1000 == 0:
            print(f"# of extracted records: {index} ...")
    print(f"Extracted {index} records from {sourcename}.xml")
    logging.info(f"Extracted {index} records from {sourcename}.xml")
    return pd.concat(parsed_all)
def main(argv):
    if len(argv) != 2:
        usage(sys.stderr)
        sys.exit(1)

    # inputs
    inFile = argv[1]

    # filecheck inputs
    fileCheck(inFile)

    # output file
    outFile = 'wau.alma.archived.' + time.strftime("%Y%m%d") + '.mrc'

    # file streams
    writer = codecs.open(outFile, 'wb', 'utf-8')

    #------------------------------------------------------------------#
    # Read an MMS ID, make a REST Call, Store Both IDs
    #------------------------------------------------------------------#
    print('Reading MARCXML file...')
    records = pymarc.parse_xml_to_array(inFile)
    count = 0
    for rec in records:
        # force utf-8
        rec.force_utf8 = True
        # skip over any entries that have an empty 583 field
        if len(rec.get_fields('583')) == 0:
            print('Blank 583 field: skipping ' + rec['001'].value() + ' / ' + rec['004'].value())
            continue
        count = count + 1
        # get string representation of marc
        marc = rec.as_marc()
        # decode character set
        marc = marc.decode('utf-8')
        # output
        writer.write(marc)

    print('Finished. ' + unicode(count) + ' MARCXML records converted to MARC21 binary.')
def download_infoscience_authors():
    INFOSCIENCE_API_KEY = 'Token ' + read_api_key()
    authors = dict()
    search_url = "https://infoscience.epfl.ch/api/v1/search?p=&cc=People&c=People&format=files"
    # search_url = "https://infoscience.epfl.ch/api/v1/search?p=Shchutska&cc=People&c=People&format=files"
    headers = {'User-Agent': 'Custom FORCE_SCRIPT_NAME = None',
               'Authorization': INFOSCIENCE_API_KEY}
    r = requests.get(search_url, headers=headers, stream=True)
    data = r.content
    dump = open('dump.dat', 'wb')
    dump.write(data)
    dump.close()
    z = zipfile.ZipFile(io.BytesIO(data))
    for x in z.infolist():
        names = []
        labs = []
        if x.filename.find('metadata.xml') > 0:
            metadata = z.read(x.filename)
            pseudofile = io.StringIO(metadata.decode('utf-8'))
            records = pymarc.parse_xml_to_array(pseudofile)
            try:
                names.append(records[0]['100']['a'])
                sciper = records[0]['935']['a']
            except:
                print('Extracting names:', x.filename, records[0].as_json(indent=2))
                sciper = ''
            for field in records[0].fields:
                if field.tag == '001':
                    recid = field.data
                if field.tag == '400':
                    names.append(field.subfields[1])
                if field.tag == '790':
                    codes = field.subfields[0:len(field.subfields):2]
                    values = field.subfields[1:len(field.subfields):2]
                    labs_positions = getIndexPositions(codes, 'a')
                    labs = [values[k] for k in labs_positions]
            for name in names:
                if name not in authors:
                    authors[name] = [(sciper, recid, labs)]
                else:
                    if (sciper, recid) != authors[name][0:2]:
                        print(recid, name, 'already in database:', authors[name])
                        authors[name].append((sciper, recid, labs))
    return authors
def test_encoding(self):
    # Create a record
    record1 = pymarc.Record()
    # Add a field containing no diacritics
    record1.add_field(
        pymarc.Field(
            tag='245',
            indicators=[' ', ' '],
            subfields=[
                'a',
                'Report of the Committee on the Peaceful Uses of Outer Space'
            ]))
    # And a field containing diacritics
    record1.add_field(
        pymarc.Field(
            tag='246',
            indicators=[' ', ' '],
            subfields=[
                'a',
                "Rapport du Comité des utilisations pacifiques de l'espace extra-atmosphérique"
            ]))
    # Create XML with an encoding specified
    record_xml = pymarc.marcxml.record_to_xml(record1, encoding='utf-8')
    # Parse the generated XML
    record2 = pymarc.parse_xml_to_array(six.BytesIO(record_xml))[0]
    # Compare the two records. If the other tests above pass, and this one
    # passes, then the addition of an encoding parameter in the
    # marcxml.record_to_xml function didn't seem to break basic
    # functionality of the library.
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def splitMarcFile(marcFilename, saveLocation):
    records = parse_xml_to_array(marcFilename)
    for record in records:
        print "Saving record for " + record.title()
        if '856' in record:
            url = record['856']['u']
            if url.split('.')[-1] == "pdf":
                print "Downloading PDF"
                outputFilename = os.path.join(saveLocation, record['001'].format_field())
                try:
                    urllib.urlretrieve(record['856']['u'], outputFilename + '.pdf')
                except:
                    print "Error: Cannot download pdf file " + url
                try:
                    writeRecordToFile(record, outputFilename + ".xml")
                except:
                    print "Error: Cannot write record to " + outputFilename + ".xml"
    return len(records)
def search_by_id(self, recid):
    result = []
    try:
        req = requests.get(self.get_record_url(recid), params={"of": "xm"})
    except Exception:
        raise ServiceUnavailable("Cannot perform search")
    if req.ok:
        try:
            record = pymarc.parse_xml_to_array(io.BytesIO(req.content))[0]
            result.append(self.parse_record(record))
        except SAXParseException:
            # If authentication failed, a login page is returned instead
            # of MARCXML
            result = []
    return {"result": result}
def main():
    # with pymarc.MARCWriter(file(DATA_DIR+'ebooks.mrc','wb')) as writer:
    writer = pymarc.MARCWriter(codecs.open(DATA_DIR + 'ebooks.mrc', 'w', 'utf-8'))
    count = written = 0
    for line in codecs.open(DATA_DIR + 'openlib_url_list.tsv', encoding='utf-8'):
        count += 1
        if count < 2:
            continue  # skip header line
        # url = line.rstrip('\n').split('\t')[6].replace('https:','http:')
        url = line.rstrip('\n').replace('https:', 'http:')
        print ' ', url
        jsonurl = '/'.join(url.split('/')[0:5]) + '.json'
        json = get_json(jsonurl)
        if 'ocaid' in json:
            ia = json['ocaid']
            # This is the Internet Archive version of the MARC record for
            # the electronic version, e.g.
            # https://archive.org/download/myantonia00cathrich/myantonia00cathrich_archive_marc.xml
            # not the original library's MARC record for the paper book, e.g.
            # https://archive.org/download/myantonia00cathrich/myantonia00cathrich_marc.xml
            marcurl = 'http://archive.org/download/%s/%s_archive_marc.xml' % (ia, ia)
            records = None
            retries = 0
            while retries < 3 and records is None:
                retries += 1
                try:
                    records = pymarc.parse_xml_to_array(marcurl)
                except SAXParseException:
                    records = None
            record = update_marc_record(records[0], ia, url)
            # record.force_utf8 = True
            try:
                writer.write(record)
                written += 1
            except:
                print '** failed to write MARC record for ', marcurl
                print traceback.format_exc()
        else:
            print '** Unexpectedly missing ocaid for ', jsonurl
    writer.close()
    print 'Wrote %d of %d MARC records' % (written, count - 1)
def next(self):
    '''Return MARC records in sets to controller.
    Break when last record position == num_records
    '''
    if self.current_record >= self.num_records:
        raise StopIteration
    # get chunk from self.current_record to self.current_record + page_size
    tree = self.get_current_xml_tree()
    recs_xml = tree.findall('.//zs:record', self.ns)
    # advance current record to end of set
    self.current_record = int(recs_xml[-1].find(
        './/zs:recordPosition', self.ns).text)
    self.current_record += 1
    # translate to pymarc records & return
    marc_xml_file = tempfile.TemporaryFile()
    marc_xml_file.write(ET.tostring(tree))
    marc_xml_file.seek(0)
    recs = [rec.as_dict()
            for rec in pymarc.parse_xml_to_array(marc_xml_file)
            if rec is not None]
    return recs
def main():
    # with pymarc.MARCWriter(file(DATA_DIR+'SCCLclassics.mrc','wb')) as writer:
    writer = pymarc.MARCWriter(codecs.open(DATA_DIR + "SCCLclassics.mrc", "w", "utf-8"))
    count = written = 0
    for line in codecs.open(DATA_DIR + "SCCL classics candidates - v3 selected.tsv", encoding="utf-8"):
        count += 1
        if count < 2:
            continue  # skip header line
        url = line.rstrip("\n").split("\t")[6].replace("https:", "http:")
        print " ", url
        jsonurl = "/".join(url.split("/")[0:5]) + ".json"
        json = get_json(jsonurl)
        if "ocaid" in json:
            ia = json["ocaid"]
            # This is the Internet Archive version of the MARC record for
            # the electronic version, e.g.
            # https://archive.org/download/myantonia00cathrich/myantonia00cathrich_archive_marc.xml
            # not the original library's MARC record for the paper book, e.g.
            # https://archive.org/download/myantonia00cathrich/myantonia00cathrich_marc.xml
            marcurl = "http://archive.org/download/%s/%s_archive_marc.xml" % (ia, ia)
            records = None
            retries = 0
            while retries < 3 and records is None:
                retries += 1
                try:
                    records = pymarc.parse_xml_to_array(marcurl)
                except SAXParseException:
                    records = None
            record = update_marc_record(records[0], ia, url)
            # record.force_utf8 = True
            try:
                writer.write(record)
                written += 1
            except:
                print "** failed to write MARC record for ", marcurl
                print traceback.format_exc()
        else:
            print "** Unexpectedly missing ocaid for ", jsonurl
    writer.close()
    print "Wrote %d of %d MARC records" % (written, count - 1)
def hathi_record_yielder(filenames, sample_files=100, sample_records=100):
    """Returns a generator that cycles, one at a time, through all the records.

    We're reading the full DPLA dump out of the tarfile, and then defining
    a generator object from a yielding function. Each object returned by
    the generator is a record from the great pymarc utility. It has a
    native method for parsing multirecord marcxml, but that relies on
    reading the entire giant files into the DOM. That's a waste of time,
    so I just chunk the records out by hand in a non-elegant way.

    This method may not work on non-Hathi MARC files if they use different
    patterns of newlines.
    """
    # hathi_records = tarfile.open(tarfile_location)
    for file in filenames:
        if file.endswith(".xml") and random.random() <= (sample_files / float(100)):
            logging.info("Parsing new XML file " + file)
            buffer = ""
            in_record = False
            for line in open(file, "r"):  # hathi_records.extractfile(file)
                if "<record>" in line:
                    if random.random() <= (sample_records / float(100)):
                        in_record = True
                if in_record:
                    buffer += line
                if "</record>" in line and in_record:
                    in_record = False
                    records = pymarc.parse_xml_to_array(cStringIO.StringIO(buffer))
                    buffer = ""
                    for record in records:
                        record.__class__ = BRecord
                        yield record
def metadata(self):
    """Fetch additional information about a volume from the HathiTrust
    Bibliographic API.

    See: https://www.hathitrust.org/bib_api

    :return: A `pymarc` record. See pymarc's documentation for details
        on using it.
    """
    if not self._metadata:
        logging.debug("Looking up full metadata for {0}".format(self.id))
        data = requests.get(self.ht_bib_url).json()
        record_id = data['items'][0]['fromRecord']
        marc = data['records'][record_id]['marc-xml']
        # Pymarc only reads a file, so stream the text as if it were one
        xml_stream = StringIO(marc)
        xml_record = pymarc.parse_xml_to_array(xml_stream)[0]
        xml_stream.close()
        self._metadata = xml_record
    return self._metadata
def test_parse_to_array(self):
    records = pymarc.parse_xml_to_array('test/batch.xml')
    # should've got two records
    self.assertEqual(len(records), 2)
    self.assertEqual(type(records[0]), pymarc.Record)
    self.assertEqual(type(records[1]), pymarc.Record)
    # first record should have 18 fields
    record = records[0]
    self.assertEqual(len(record.get_fields()), 18)
    # check the content of a control field (a 40-character fixed field)
    self.assertEqual(record['008'].data, u'910926s1957    nyuuun              eng  ')
    # check a data field with subfields
    field = record['245']
    self.assertEqual(field.indicator1, '0')
    self.assertEqual(field.indicator2, '4')
    self.assertEqual(field['a'], u'The Great Ray Charles')
    self.assertEqual(field['h'], u'[sound recording].')
def metadata(self):
    """Fetch additional information about a volume from the HathiTrust
    Bibliographic API.

    See: https://www.hathitrust.org/bib_api

    return: A `pymarc` record. See pymarc's documentation for details
        on using it.
    """
    if not self._extra_metadata:
        logging.debug("Looking up full metadata for {0}".format(self.id))
        data = requests.get(self.ht_bib_url).json()
        record_id = data['items'][0]['fromRecord']
        marc = data['records'][record_id]['marc-xml']
        # Pymarc only reads a file, so stream the text as if it were one
        xml_stream = StringIO(marc)
        xml_record = pymarc.parse_xml_to_array(xml_stream)[0]
        xml_stream.close()
        self._extra_metadata = xml_record
    return self._extra_metadata
        continue
    cleaned_subfields.append(code)
    cleaned_subfields.append(value)
    return cleaned_subfields


inputfilename = "30_input.xml"
outputfilename = "30_output.mrc"

if len(sys.argv) >= 3:
    inputfilename, outputfilename = sys.argv[1:3]

inputfile = io.open(inputfilename, "rb")
outputfile = io.open(outputfilename, "wb")
reader = pymarc.parse_xml_to_array(inputfile)

for oldrecord in reader:
    newrecord = marcx.Record()
    newrecord.strict = False

    # check that a title is present
    if not oldrecord["245"]:
        continue

    # leader
    newrecord.leader = "     " + oldrecord.leader[5:]
    if len(newrecord.leader) < 9:
        logging.debug("too short %s: %s", len(newrecord.leader), newrecord.leader)
        continue
if os.path.exists(src_folder + src):
    shutil.move(src_folder + src, dest_folder + src)

# (Re)Process the records
# Convert the individual marcxml_in files to raw marc and write them all
# to a single .mrc file
# OUTPUT FILE
marcRecsOut_orig_recs = pymarc.MARCWriter(file(aco_globals.batch_folder + '/' + batch_name + '_0_orig_recs.mrc', 'w'))
marcxml_dir = aco_globals.batch_folder + '/marcxml_in'
for filename in os.listdir(marcxml_dir):
    file_path = os.path.join(marcxml_dir, filename)
    if os.path.isfile(file_path):
        if file_path[-3:] == 'xml':
            marc_xml_array = pymarc.parse_xml_to_array(file_path)
            for rec in marc_xml_array:
                rec = aco_functions.pad_008(rec)
                rec_001 = rec.get_fields('001')[0]
                print rec_001
                marcRecsOut_orig_recs.write(rec)
marcRecsOut_orig_recs.close()

# Extract the OCLC numbers from each record and write records to .mrc and
# .txt files depending if record contains OCLC number or not
# INPUT FILE
marcRecsIn_orig_recs = pymarc.MARCReader(file(aco_globals.batch_folder + '/' + batch_name + '_0_orig_recs.mrc'), to_unicode=True, force_utf8=True)
# OUTPUT FILES
try:
    os.makedirs(aco_globals.batch_folder + '/' + batch_name + '_1/')
except OSError as exception:
import pymarc
import time

writer = pymarc.MARCWriter(file('free_ebooks.marc', 'w'))
error = open('errors.txt', 'w')
count = 0

for line in open('marc_urls.txt'):
    try:
        marcrec = pymarc.parse_xml_to_array(line)
        writer.write(marcrec[0])
    except:
        error.write(line)
    count += 1
    time.sleep(0.3)  # try to be nice to IA servers
    if (count % 1000) == 0:
        time.sleep(600)  # try to be nice to IA servers

writer.close()
import pymarc
import os
import pandas as pd

data_dir = '/Volumes/Data/LibraryVis'
out_file = 'TOC_800s_subject.csv'

# For our 5.8 Gb MARCXML file, this takes a long time (but < 1.5 hrs)
# and 32 Gb RAM...
records = pymarc.parse_xml_to_array(os.path.join(data_dir, 'TOC_2000bis2016.mrc.xml'))

titles = [r.title() for r in records]
# Not going with int() because some have things like 'c 2005'
years = [r.pubyear() for r in records]
# Only really need part of the dewey field
deweys = [r.get_fields('082')[0].get_subfields('a')[0] if '082' in r else '' for r in records]
# Getting rid of extra author info don't need right now
authors = ["|".join(r['100'].get_subfields('a', 'd')) if '100' in r else '' for r in records]

# MARC records
# tag 082 - Dewey Decimal Classification Number
# tag 856 - Electronic Location and Access
# TOC URLs
# e.g.
# {'856': {'ind1': u'4',
#          'ind2': u'2',
#          'subfields': [{u'm': u'V:DE-605'},
#                        {u'q': u'application/pdf'},
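# The columns gathered above are presumably headed for out_file; a minimal
# sketch of that final step (the column names are assumptions):
df = pd.DataFrame({'title': titles, 'year': years,
                   'dewey': deweys, 'author': authors})
df.to_csv(os.path.join(data_dir, out_file), index=False)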
def setUp(self):
    self.tf1 = pymarc.parse_xml_to_array(TF1)
    self.collated_tf1 = collate(self.tf1[0])
    self.tf2 = pymarc.parse_xml_to_array(TF2)
    self.collated_tf2 = collate(self.tf2[0])
def test_strict(self):
    fh = gzip.open('test/batch.xml.gz', 'rb')
    a = pymarc.parse_xml_to_array(fh, strict=True)
    self.assertEqual(len(a), 2)
import sys

import marcx
import pymarc

copytags = ("100", "105", "120", "130", "150", "174", "200", "245", "246",
            "250", "260", "300", "335", "336", "337", "338", "351", "361",
            "400", "500", "520", "650", "689", "700", "710", "800")

inputfilename = "156_input.xml"
outputfilename = "156_output.mrc"

if len(sys.argv) == 3:
    inputfilename, outputfilename = sys.argv[1:]

inputfile = open(inputfilename, "rb")
outputfile = open(outputfilename, "wb")
oldrecords = pymarc.parse_xml_to_array(inputfile)

for i, oldrecord in enumerate(oldrecords, start=1):
    try:
        f245a = oldrecord["245"]["a"]
    except:
        continue

    newrecord = marcx.Record(force_utf8=True)

    # leader
    leader = "     " + oldrecord.leader[5:]
    newrecord.leader = leader

    # 001
def load(self, file_name):
    import pymarc
    from es_config import marc2json

    for record in pymarc.parse_xml_to_array(file_name):
        fields = record.as_dict().get("fields")
        self.append(marc2json(fields))
# <codecell>

import xml.etree.ElementTree as ET
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import pyddc
import pymarc
import operator
import math
import json

# <codecell>

print 'loading xml'
completedb = pymarc.parse_xml_to_array(open('CLASS_23eng_marc_webdewey_20131020.xml'))
print 'loaded xml'

# <markdowncell>

# Our rules:
#
# * only if leader byte 8 is 'a'
# * exclude spans, records that have either \$c or \$y in 153
# * 253 ind 0 \$a "see reference"
# * 253 ind 2 \$a "class elsewhere"
# * may be multiple 253s
# * 153 \$a to \$e is "notational hierarchy"
#
# Later on tables:
#
def test_strict(self):
    a = pymarc.parse_xml_to_array(open('test/batch.xml'), strict=True)
    self.assertEqual(len(a), 2)
def test_bad_tag(self):
    a = pymarc.parse_xml_to_array(open('test/bad_tag.xml'))
    self.assertEqual(len(a), 1)
def parse_xml_string_to_record(xmlstring):
    """Parse an xml string and return a pymarc.Record object."""
    xml_io = cStringIO.StringIO(xmlstring)
    record = parse_xml_to_array(xml_io)[0]
    return record
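# cStringIO is Python 2 only; an equivalent sketch for Python 3 uses
# io.StringIO instead (bytes output from record_to_xml would need
# io.BytesIO). The function name is a hypothetical variant:
import io
from pymarc import parse_xml_to_array

def parse_xml_string_to_record_py3(xmlstring):
    """Parse a MARCXML string and return the first pymarc.Record."""
    return parse_xml_to_array(io.StringIO(xmlstring))[0]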