def rero_get_record(id, verbose=False, debug=False):
    """Get a record from RERO data repo.

    RERO documentation:
    http://data.rero.ch/
    http://data.rero.ch/02-A000069866/marcxml
    """
    base_url = 'http://data.rero.ch/02-'
    query_id = '{id}'.format(id=id)
    fmt = '/marcxml'
    url = '{base_url}{query_id}{fmt}'.format(
        base_url=base_url, query_id=query_id, fmt=fmt)
    trans_record = None
    response = requests.get(url)
    if response.status_code == requests.codes.ok:
        try:
            records = parse_xml_to_array(BytesIO(response.content))
            if records:
                trans_record = Transformation(records[0]).json
                if verbose:
                    click.echo('API-rero get: {id}'.format(id=id))
        except Exception as err:
            if verbose:
                click.echo('ERROR get rero record: {err}'.format(err=err))
            if debug:
                raise Exception(err)
    return trans_record
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(
        file_name, strict=False, normalize_form=None)
    logger = Logger()
    trans = {
        'gnd': Transformation_gnd(
            marc=records[0], logger=logger, verbose=True, transform=False),
        'idref': Transformation_idref(
            marc=records[0], logger=logger, verbose=True, transform=False),
        'rero': Transformation_rero(
            marc=records[0], logger=logger, verbose=True, transform=False)
    }
    return trans.get(source)
def _fetch_metadata(url):
    """Fetch a UNDL URL and parse the response XML into pymarc records.

    :param url: UNDL URL given by the user.
    :raises: aborts with a 500 error if the URL cannot be retrieved.
    :returns: array of pymarc records parsed from the response.
    """
    try:
        r = req.Request(
            url,
            data=None,
            headers={
                'User-Agent':
                    "Mozilla/5.0 (X11; U; Linux i686) "
                    "Gecko/20071127 Firefox/2.0.0.11"
            })
        resp = req.urlopen(r, context=ssl._create_unverified_context())
    except (HTTPError, URLError) as e:
        logger.error("Error: {}".format(e))
        abort(500)
    if resp.status != 200:
        logger.error("Could not get {}, status: {}".format(url, resp.status))
        abort(500)
    else:
        raw_xml = resp.read()
        xml_doc = BytesIO(raw_xml)
        logger.info(xml_doc)
        collection = marcxml.parse_xml_to_array(xml_doc, False, 'NFC')
        return collection
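# A minimal usage sketch for _fetch_metadata, assuming a Flask request
# context (the helper abort()s on failure). The UNDL search URL, with
# `of=xm` requesting MARCXML output, is hypothetical.
def _demo_fetch_metadata():
    records = _fetch_metadata(
        'https://digitallibrary.un.org/search?p=A%2F72%2F1&of=xm')
    for record in records:
        logger.info(record.title())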
def getRawFilesLocs(metadata_filename):
    """Given a MARC21 metadata file, return an array of "Files" objects.

    Each object contains: `filename`, `uri`, `remote`, `hash`, `size`.
    """
    # Parse the XML as MARC21 and get the first record
    # (the result should be a single record anyway)
    record = marcxml.parse_xml_to_array(metadata_filename)[0]

    # Look for 856 entries
    # MARC21: 856 - Electronic Location and Access (R)
    files = []
    for f in record.get_fields("856"):
        obj = {}

        # Unknown size fallback
        obj["size"] = 0

        if f["u"]:
            obj["url"] = f["u"]
            obj["remote"] = "HTTP"
        elif f["d"]:
            obj["url"] = f["d"]
            obj["remote"] = "EOS"
        else:
            logging.debug(f'Skipped 856 entry "{f}", no u or d subfield.')
            continue

        # File checksum
        if f["w"]:
            p = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
            m = p.match(f["w"])
            alg = m.groups()[0].lower()
            checksum = m.groups()[1]
            obj["checksum"] = f"{alg}:{checksum}"

        # File size
        if f["s"]:
            obj["size"] = int(f["s"])

        # Get basename
        if obj["url"]:
            obj["filename"] = ntpath.basename(obj["url"])
            obj["path"] = obj["filename"]
        obj["metadata"] = False
        obj["downloaded"] = False

        if obj["filename"]:
            files.append(obj)
        else:
            logging.warning(
                f'Skipped entry "{f}", no basename found (probably a URL?)')
    return files
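# Standalone sketch of the 856 $w checksum parsing used above. The sample
# subfield value is made up to match the regex's "(<source>:<alg>)...;<digest>"
# shape; it is an assumption, not a documented MARC layout.
if __name__ == '__main__':
    import re

    pattern = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
    match = pattern.match("(CDS:md5);d41d8cd98f00b204e9800998ecf8427e")
    if match:
        alg, digest = match.groups()
        # prints "md5:d41d8cd98f00b204e9800998ecf8427e"
        print(f"{alg.lower()}:{digest}")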
def MarcEditXmlToMarc(self, x):
    mrcFileName = re.sub(r'\.xml$', '.mrc', x)
    print('\n<Converting from XML to MARC>\n')
    # Previous approach shelled out to MarcEdit:
    # subprocess.call([MonoBin, MarcEditBin, "-s", x, "-d", mrcFileName,
    #                  "-xmlmarc", "-marc8", "-mxslt",
    #                  "/opt/marcedit/xslt/MARC21XML2Mnemonic_plugin.xsl"])
    marcStr = ''
    with open(x, 'rb') as fh:
        recs = marcxml.parse_xml_to_array(fh)
        for rec in recs:
            marcStr += str(rec)
    return marcStr
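# Note that str(rec) above yields pymarc's human-readable rendering, not
# binary MARC. If actual binary output is wanted, pymarc records expose
# as_marc(); a minimal standalone sketch with hypothetical file names:
def xml_to_mrc(xml_path, mrc_path):
    from pymarc import marcxml
    with open(xml_path, 'rb') as src, open(mrc_path, 'wb') as dst:
        for rec in marcxml.parse_xml_to_array(src):
            dst.write(rec.as_marc())  # binary ISO 2709 record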
def __init__(self, url):
    resp = req.urlopen(url, context=ssl._create_unverified_context())
    if resp.status != 200:
        raise PageNotFoundException(
            "Could not get data from {}".format(url))
    self.xml_doc = BytesIO(resp.read())
    r = marcxml.parse_xml_to_array(self.xml_doc, False, 'NFC')
    if len(r) > 0:
        self.record = r[0]
    else:
        self.record = None
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(
        current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(
        file_name, strict=False, normalize_form=None)
    trans = {
        'concepts': Transformation_rero_concepts(
            marc=records[0], logger=None, verbose=False, transform=False)
    }
    return trans.get(source)
def get_resource(parameters, stats):
    stats['search_rows'] = 0
    for filenum in parameters['filenums']:
        filepath = parameters['files-path-template'].format(filenum=filenum)
        search_id = 'neaman{}'.format(filenum)
        with open(filepath) as f:
            for record_num, record in enumerate(parse_xml_to_array(f)):
                row = parse_record(record)
                migdar_id = '{}-{}'.format(search_id, record_num)
                row.update(migdar_id=migdar_id,
                           first_ccl_query='neaman{}.xml'.format(filenum),
                           last_query_datetime=datetime.datetime.now(),
                           json=json.loads(row['json']))
                stats['search_rows'] += 1
                yield row
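# Sketch of driving the get_resource generator; the parameter shapes are
# inferred from the function body and the file names are hypothetical.
def _demo_get_resource():
    stats = {}
    parameters = {
        'filenums': [1, 2],
        'files-path-template': 'data/neaman{filenum}.xml',
    }
    for row in get_resource(parameters, stats):
        print(row['migdar_id'])
    print('rows:', stats['search_rows'])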
def main(filename, out_filename=""):
    # check stdin
    if isinstance(filename, str):
        out_path = emx.get_out_filename(
            filename) if not out_filename else out_filename
    else:
        return 1
    with open(out_path, 'w', newline='') as fh:  # open output file
        csv.register_dialect(
            'marcxmltotsv',
            delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='',
            doublequote=False, escapechar=None)
        csv_writer = csv.DictWriter(fh, fieldnames=COLUMNS,
                                    dialect='marcxmltotsv')
        csv_writer.writeheader()

        # parse xml
        collection = marcxml.parse_xml_to_array(filename, strict=True)

        def get_curr_row(record):
            # for each record create a row in the tsv
            curr_row = {}
            for col in COLUMNS:
                curr_row[col] = emx.get_value(col, record)
            return curr_row

        for record in collection:
            if len(DATE_RANGE) < 2:
                csv_writer.writerow(get_curr_row(record))
                continue
            else:
                curr_date = emx.get_pymarc_field_value(
                    '008', record)[7:11].strip()
                if len(curr_date) == 4 and \
                        DATE_RANGE[0] <= int(curr_date) <= DATE_RANGE[1]:
                    csv_writer.writerow(get_curr_row(record))
    return 0
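# Hypothetical CLI-style invocation of the TSV export above; COLUMNS,
# DATE_RANGE and the emx helper module are module-level assumptions
# carried over from the function body.
if __name__ == '__main__':
    import sys
    sys.exit(main('records.xml', 'records.tsv'))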
def worldcat(num, num_type, url, key):
    # e.g., /oclc/34473395 /oclc/34474496
    url = url % (num, key)
    try:
        records = marcxml.parse_xml_to_array(urlopen(url))
        if not records:
            return None
        record = records[0]
    except Exception:
        return None
    bib = {}
    bib[num_type.upper()] = num
    bib['TITLE'] = record.uniformtitle()
    if not bib['TITLE']:
        bib['TITLE'] = record.title() if record.title() else ''
    bib['TITLE_ALL'] = bib['TITLE']
    bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
    bib['AUTHOR'] = record.author() if record.author() else ''
    if bib['AUTHOR']:
        bib['AUTHORS'].insert(0, bib['AUTHOR'])
    bib['PUBLISHER'] = record.publisher() if record.publisher() else ''
    try:
        bib['PUBLISH_DATE'] = record.pubyear()
    except Exception:
        bib['PUBLISH_DATE'] = ''
    bib['IMPRINT'] = '%s %s' % (bib['PUBLISHER'], bib['PUBLISH_DATE'])
    bib['BIB_FORMAT'] = 'as' if num_type == 'issn' else 'am'
    bib['ISBN'] = record.isbn()
    try:
        physical = [entry.format_field()
                    for entry in record.physicaldescription()]
        bib['DESC'] = physical[0]
    except Exception:
        bib['DESC'] = ''
    try:
        notes = [entry.format_field() for entry in record.notes()]
        bib['NOTES'] = notes[0]
    except Exception:
        bib['NOTES'] = ''
    try:
        subjects = [entry.format_field() for entry in record.subjects()]
        bib['SUBJECTS'] = subjects[0]
    except Exception:
        bib['SUBJECTS'] = ''
    # identify worldcat response for the item.html block
    bib['WORLDCAT_RESPONSE'] = num
    bib['WORLDCAT_MESSAGE'] = ('This page contains information from the '
                               'OCLC WorldCat catalog.')
    bib['WORLDCAT_SEARCH'] = ('<a href=http://www.worldcat.org/search'
                              '?q={{ bib.TITLE }}>Search OCLC Worldcat</a>')
    return bib
def rero_get_record(id, verbose=False, debug=False):
    """Get a record from RERO data repo.

    RERO documentation:
    http://data.rero.ch/
    http://data.rero.ch/02-A000069866/marcxml
    """
    url = f'http://data.rero.ch/02-{id}/marcxml'
    trans_record = None
    response = requests.get(url)
    if response.status_code == requests.codes.ok:
        try:
            records = parse_xml_to_array(BytesIO(response.content))
            if records:
                trans_record = Transformation(records[0]).json
                if verbose:
                    click.echo(f'API-rero get: {id}')
        except Exception as err:
            if verbose:
                click.echo(f'ERROR get RERO record: {err}')
            if debug:
                raise Exception(err)
    return trans_record
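# Hedged usage sketch for rero_get_record; the identifier comes from the
# documentation URL in the docstring, everything else is assumed context.
def _demo_rero_get_record():
    # rec is assumed to be a plain dict, based on Transformation(...).json
    rec = rero_get_record('A000069866', verbose=True)
    if rec:
        click.echo(rec.get('pid', 'no pid'))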
def oai_get_record(id, name, transformation, record_cls, access_token=None,
                   identifier=None, dbcommit=False, reindex=False,
                   test_md5=False, verbose=False, debug=False, **kwargs):
    """Get record from an OAI repo.

    :param identifier: identifier of record.
    """
    url, metadata_prefix, lastrun, setspecs = get_info_by_oai_name(name)

    request = Sickle(url)

    params = {}
    if access_token:
        params['accessToken'] = access_token
    params['metadataPrefix'] = metadata_prefix
    params['identifier'] = f'{identifier}{id}'
    try:
        record = request.GetRecord(**params)
    except Exception as err:
        if debug:
            raise Exception(err)
        return None
    records = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(records[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
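# Hypothetical call of oai_get_record; the source name 'idref' and the
# Transformation class mirror the other helpers in this section and are
# assumptions, not a documented configuration.
def _demo_oai_get_record():
    return oai_get_record(
        id='066924502',
        name='idref',
        transformation=Transformation,
        record_cls=None,
        verbose=True,
    )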
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(
        file_name, strict=False, normalize_form=None)
    trans = None
    if source == 'bnf':
        trans = Transformation_bnf(marc=records[0], logger=None,
                                   verbose=False, transform=False)
    elif source == 'gnd':
        trans = Transformation_gnd(marc=records[0], logger=None,
                                   verbose=False, transform=False)
    elif source == 'rero':
        trans = Transformation_rero(marc=records[0], logger=None,
                                    verbose=False, transform=False)
    return trans
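# Test-style usage sketch of trans_prep; the XML fragment is a made-up
# minimal 100 field, and the Transformation_* classes are assumed to be
# importable in the surrounding test module.
def test_trans_prep_gnd():
    trans = trans_prep(
        'gnd',
        '<datafield tag="100" ind1=" " ind2=" ">'
        '<subfield code="a">Arnoudt, Jos</subfield></datafield>')
    assert trans is not None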
def worldcat(num, num_type, url, key):
    url = url % (num_type, num, key)
    try:
        records = marcxml.parse_xml_to_array(urlopen(url))
        if not records:
            return None
        record = records[0]
    except Exception:
        return None
    bib = {}
    bib[num_type.upper()] = num
    bib['TITLE'] = record.uniformtitle()
    if not bib['TITLE']:
        bib['TITLE'] = record.title() if record.title() else ''
    bib['TITLE_ALL'] = bib['TITLE']
    bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
    bib['AUTHOR'] = record.author() if record.author() else ''
    if bib['AUTHOR']:
        bib['AUTHORS'].insert(0, bib['AUTHOR'])
    bib['PUBLISHER'] = record.publisher() if record.publisher() else ''
    bib['PUBLISHER_DATE'] = record.pubyear() if record.pubyear() else ''
    bib['IMPRINT'] = '%s %s' % (bib['PUBLISHER'], bib['PUBLISHER_DATE'])
    bib['BIB_FORMAT'] = 'as' if num_type == 'issn' else 'am'
    return bib
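# Hedged example call; a URL template with three %s slots (number type,
# number, key) matches the `url % (num_type, num, key)` line above, but the
# exact WorldCat endpoint and the key are assumptions.
def _demo_worldcat():
    return worldcat(
        '9780553283686',
        'isbn',
        'http://www.worldcat.org/webservices/catalog/content/%s/%s?wskey=%s',
        'MY_WSKEY',
    )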
def oai_process_records_from_dates(name, sickle, oai_item_iterator,
                                   transformation, record_cls, max_retries=0,
                                   access_token=None, days_spann=30,
                                   from_date=None, until_date=None,
                                   ignore_deleted=False, dbcommit=True,
                                   reindex=True, test_md5=True, online=False,
                                   verbose=False, debug=False, **kwargs):
    """Harvest multiple records from an OAI repo.

    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param from_date: The lower bound date for the harvesting (optional).
    :param until_date: The upper bound date for the harvesting (optional).
    """
    # data on IDREF servers starts on 2000-10-01
    if kwargs.get('kwargs', {}).get('online'):
        online = kwargs.get('kwargs', {}).get('online')
    last_run = None
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator, max_retries=max_retries)

    dates_initial = {
        'from': from_date or last_run,
        'until': until_date
    }
    update_last_run = from_date is None and until_date is None
    # Sanity check
    if dates_initial['until'] is not None \
            and dates_initial['from'] > dates_initial['until']:
        raise WrongDateCombination("'Until' date larger than 'from' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches, setspecs will be
    # a list with None, so we go into the retrieval loop without a set
    # definition.
    setspecs = setspecs.split() or [None]
    count = 0
    action_count = {}
    mef_action_count = {}
    viaf_online_count = 0
    for spec in setspecs:
        dates = dates_initial
        params = {
            'metadataPrefix': metadata_prefix,
            'ignore_deleted': ignore_deleted
        }
        if access_token:
            params['accessToken'] = access_token
        params.update(dates)
        if spec:
            params['set'] = spec

        my_from_date = parser.parse(dates['from'])
        my_until_date = last_run_date
        if dates['until']:
            my_until_date = parser.parse(dates['until'])
        # Harvest in windows of `days_spann` days.
        while my_from_date <= my_until_date:
            until_date = my_from_date + timedelta(days=days_spann)
            if until_date > my_until_date:
                until_date = my_until_date
            dates = {
                'from': my_from_date.strftime("%Y-%m-%d"),
                'until': until_date.strftime("%Y-%m-%d")
            }
            params.update(dates)

            try:
                for record in request.ListRecords(**params):
                    count += 1
                    records = parse_xml_to_array(StringIO(record.raw))
                    rec = None
                    try:
                        try:
                            updated = datetime.strptime(
                                records[0]['005'].data,
                                '%Y%m%d%H%M%S.%f'
                            )
                        except Exception:
                            updated = '????'
                        rec = transformation(records[0]).json
                        pid = rec.get('pid')
                        rec, action, m_record, m_action, v_record, v_online = \
                            record_cls.create_or_update_agent_mef_viaf(
                                data=rec,
                                dbcommit=True,
                                reindex=True,
                                online=online,
                                verbose=verbose
                            )
                        action_count.setdefault(action.name, 0)
                        action_count[action.name] += 1
                        mef_action_count.setdefault(m_action.name, 0)
                        mef_action_count[m_action.name] += 1
                        if v_online:
                            viaf_online_count += 1

                        if verbose:
                            m_pid = 'Non'
                            if m_record:
                                m_pid = m_record.pid
                            v_pid = 'Non'
                            if v_record:
                                v_pid = v_record.pid
                            click.echo(
                                (
                                    'OAI {name} spec({spec}): {pid}'
                                    ' updated: {updated} {action}'
                                    ' | mef: {m_pid} {m_action}'
                                    ' | viaf: {v_pid} online: {online}'
                                ).format(
                                    name=name,
                                    spec=spec,
                                    pid=pid,
                                    action=action.value,
                                    m_pid=m_pid,
                                    m_action=m_action.value,
                                    v_pid=v_pid,
                                    online=v_online,
                                    updated=updated
                                )
                            )
                    except Exception as err:
                        msg = 'ERROR creating {name} {count}: {err}'.format(
                            name=name,
                            count=count,
                            err=err
                        )
                        if rec:
                            msg += '\n{rec}'.format(rec=rec)
                        current_app.logger.error(msg)
                        if debug:
                            traceback.print_exc()
            except NoRecordsMatch:
                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                continue
            except Exception as err:
                current_app.logger.error(err)
                if debug:
                    traceback.print_exc()
                count = -1

            my_from_date = my_from_date + timedelta(days=days_spann + 1)
            if verbose:
                click.echo(
                    'OAI {name} {spec}: {from_d} .. +{days_spann}'.format(
                        name=name,
                        spec=spec,
                        from_d=my_from_date.strftime("%Y-%m-%d"),
                        days_spann=days_spann
                    )
                )

    if update_last_run:
        if verbose:
            click.echo('OAI {name}: update last run: {last_run}'.format(
                name=name,
                last_run=last_run_date
            ))
        oai_source = get_oaiharvest_object(name)
        oai_source.update_lastrun(last_run_date)
        oai_source.save()
        db.session.commit()
    return count, action_count, mef_action_count
def parse_metadata(self, metadata_filename):
    """Given a MARC21 metadata file, return an array of "Files" objects.

    Each object contains: `filename`, `uri`, `remote`, `hash`, `size`.
    """
    # Parse the XML as MARC21 and get the first record
    # (the result should be a single record anyway)
    logging.debug("Parsing metadata..")
    try:
        record = marcxml.parse_xml_to_array(metadata_filename)[0]
    except Exception:
        raise Exception(
            "Malformed metadata. Check if the record is public.")

    # Look for 856 entries
    # MARC21: 856 - Electronic Location and Access (R)
    files = []
    for f in record.get_fields("856"):
        obj = {}

        # Unknown size fallback
        obj["size"] = 0

        if f["u"]:
            obj["url"] = f["u"]
            obj["remote"] = "HTTP"
        elif f["d"]:
            obj["url"] = f["d"]
            obj["remote"] = "EOS"
        else:
            logging.debug(f'Skipped 856 entry "{f}". No `u` or `d` subfield.')
            continue

        # File checksum
        if f["w"]:
            p = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
            m = p.match(f["w"])
            alg = m.groups()[0].lower()
            checksum = m.groups()[1]
            obj["checksum"] = f"{alg}:{checksum}"

        # File size
        if f["s"]:
            obj["size"] = int(f["s"])

        # Get basename
        if obj["url"]:
            obj["filename"] = ntpath.basename(obj["url"])
            # We suppose no folder structure
            obj["path"] = obj["filename"]
            obj["localpath"] = f"data/content/{obj['path']}"
        obj["metadata"] = False
        obj["downloaded"] = False

        if obj["filename"]:
            files.append(obj)
        else:
            logging.warning(
                f'Skipped entry "{f}". No basename found (probably a URL?)'
            )

    logging.debug(f"Got {len(files)} files")

    # Append the metadata file itself as an already-downloaded entry
    meta_file_entry = {
        "filename": "metadata.xml",
        "path": "metadata.xml",
        "metadata": True,
        "downloaded": True,
        "localpath": "data/meta/metadata.xml",
        "url": self.metadata_url,
        "size": self.metadata_size,
    }
    files.append(meta_file_entry)

    return files
def oai_save_records_from_dates(name, file_name, sickle, oai_item_iterator,
                                max_retries=0, access_token=None,
                                days_spann=30, from_date=None,
                                until_date=None, verbose=False, **kwargs):
    """Harvest and save multiple records from an OAI repo.

    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param from_date: The lower bound date for the harvesting (optional).
    :param until_date: The upper bound date for the harvesting (optional).
    """
    # data on IDREF servers starts on 2000-10-01
    last_run = None
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator, max_retries=max_retries)

    dates_initial = {
        'from': from_date or last_run,
        'until': until_date
    }
    # Sanity check
    if dates_initial['until'] is not None \
            and dates_initial['from'] > dates_initial['until']:
        raise WrongDateCombination("'Until' date larger than 'from' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches, setspecs will be
    # a list with None, so we go into the retrieval loop without a set
    # definition.
    setspecs = setspecs.split() or [None]
    count = 0
    with open(file_name, 'bw') as output_file:
        for spec in setspecs:
            dates = dates_initial
            params = {
                'metadataPrefix': metadata_prefix,
                'ignore_deleted': False
            }
            if access_token:
                params['accessToken'] = access_token
            params.update(dates)
            if spec:
                params['set'] = spec

            my_from_date = parser.parse(dates['from'])
            my_until_date = last_run_date
            if dates['until']:
                my_until_date = parser.parse(dates['until'])
            # Harvest in windows of `days_spann` days.
            while my_from_date <= my_until_date:
                until_date = my_from_date + timedelta(days=days_spann)
                if until_date > my_until_date:
                    until_date = my_until_date
                dates = {
                    'from': my_from_date.strftime("%Y-%m-%d"),
                    'until': until_date.strftime("%Y-%m-%d")
                }
                params.update(dates)
                try:
                    for record in request.ListRecords(**params):
                        count += 1
                        records = parse_xml_to_array(StringIO(record.raw))
                        record_id = '???'
                        field_001 = records[0]['001']
                        if field_001:
                            record_id = field_001.data
                        if verbose:
                            click.echo(
                                'OAI {name} spec({spec}): {from_d} '
                                'count:{count:>10} = {id}'.format(
                                    name=name,
                                    spec=spec,
                                    from_d=my_from_date.strftime("%Y-%m-%d"),
                                    count=count,
                                    id=record_id
                                )
                            )
                        rec = records[0]
                        # Mark the leader as UTF-8 encoded (position 9 = 'a')
                        rec.leader = rec.leader[0:9] + 'a' + rec.leader[10:]
                        output_file.write(rec.as_marc())
                except NoRecordsMatch:
                    my_from_date = my_from_date + timedelta(
                        days=days_spann + 1)
                    continue
                except Exception as err:
                    current_app.logger.error(err)

                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                if verbose:
                    click.echo(
                        'OAI {name} spec({spec}): '
                        '{from_d} .. +{days_spann}'.format(
                            name=name,
                            spec=spec,
                            from_d=my_from_date.strftime("%Y-%m-%d"),
                            days_spann=days_spann
                        )
                    )
    if verbose:
        click.echo('OAI {name}: {count}'.format(name=name, count=count))
    return count
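# Hypothetical invocation mirroring the signature above; Sickle and
# OAIItemIterator are the usual `sickle` package classes this code appears
# to expect the caller to pass in.
def _demo_oai_save():
    from sickle import Sickle
    from sickle.iterator import OAIItemIterator

    return oai_save_records_from_dates(
        name='idref',
        file_name='idref.mrc',
        sickle=Sickle,
        oai_item_iterator=OAIItemIterator,
        from_date='2020-01-01',
        until_date='2020-03-31',
        verbose=True,
    )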
def main(filename, out_filename=""):
    def get_field_vol_values(field, codes, row):
        """Return a dictionary of values for the current volume-level field."""
        def get_subfield_values_list(field, code):
            values = []
            for subfield in field.get_subfields(code):
                values.append(subfield.replace('\\', '').strip())
            return values if values else ""

        for code in codes:
            row[code] = get_subfield_values_list(field, code)
        return row

    def field_to_tsv(field, collection):
        out_path = emx.get_out_filename(filename, field['desc'])
        with open(out_path, 'w', newline='') as fh:  # open output file
            columns = emx.COLUMNS_WORK + field['codes']
            csv.register_dialect(
                'marcxmltotsv',
                delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='',
                doublequote=False, escapechar=None)
            csv_writer = csv.DictWriter(fh, fieldnames=columns,
                                        dialect='marcxmltotsv')
            headers = {}
            headers_list = []
            headers_list += emx.COLUMNS_WORK
            if 'headers' in field:
                headers_list += field['headers']
            else:
                headers_list += field['codes']
            for i, column in enumerate(columns):
                headers[column] = headers_list[i]
            csv_writer.writerow(headers)

            def get_curr_row(curr_fields, curr_work, tag):
                curr_row = {}
                if curr_fields and tag == '989':
                    curr_row = emx.get_field_989_values(
                        curr_fields, field['codes'], curr_work)
                else:
                    for curr_field in curr_fields:
                        curr_row = curr_work
                        curr_row = get_field_vol_values(
                            curr_field, field['codes'], curr_row)
                return curr_row

            for record in collection:
                curr_work = emx.get_work_metadata(record)
                curr_fields = record.get_fields(field['tag'])
                # The commented block below is an alternate way of returning
                # the current row, including a relator conditional; it was
                # created for the epigraph-author data subset and is retained
                # for demonstration.
                # if len(DATE_RANGE) < 2:
                #     epi_authors = []
                #     for curr_field in curr_fields:
                #         if curr_field['4'] and 'author (epigraph)' in \
                #                 curr_field['4'].strip().lower():
                #             epi_authors.append(curr_field)
                #     curr_row = get_curr_row(epi_authors, curr_work,
                #                             field['tag'])
                #     if curr_row:
                #         csv_writer.writerow(curr_row)
                #     continue
                if len(DATE_RANGE) < 2:
                    curr_row = get_curr_row(curr_fields, curr_work,
                                            field['tag'])
                    if curr_row:
                        csv_writer.writerow(curr_row)
                    continue
                else:
                    curr_date = emx.get_pymarc_field_value(
                        '008', record)[7:11].strip()
                    if len(curr_date) == 4 and \
                            DATE_RANGE[0] <= int(curr_date) <= DATE_RANGE[1]:
                        curr_row = get_curr_row(curr_fields, curr_work,
                                                field['tag'])
                        if curr_row:
                            csv_writer.writerow(curr_row)
        return 0

    # parse xml
    collection = marcxml.parse_xml_to_array(filename, strict=True)
    for datafield in DATAFIELDS:
        field_to_tsv(datafield, collection)
def import_marc21xml(url, can_display_pending_publications):
    result = []
    o = urlparse(url)
    if '*' not in settings.ALLOWED_HOSTS and \
            o.netloc not in settings.ALLOWED_HOSTS:
        result.append(get_message('danger', _('The domain is not allowed')))
        return result
    if url.find(settings.SITE_DOMAIN + settings.SITE_PATH) != -1:
        # a self-referencing url is bad
        result.append(get_message('danger', _('The domain is not allowed')))
        return result
    try:
        url = ''.join(c for c in unicodedata.normalize('NFD', url)
                      if unicodedata.category(c) != 'Mn')
        reader = marcxml.parse_xml_to_array(urlopen(url))
    except IOError as e:
        result.append(get_message('danger', str(e)))
    except Exception as e:
        result.append(get_message('danger', str(e)))
    if result:
        return result

    for record in reader:
        dict_result = {}
        dict_record = parse_dict(record.as_dict())
        # only parse the good ones
        if 'publication_status' in dict_record and \
                dict_record['publication_status'] == 'DELETED':
            continue
        if 'control_number' not in dict_record:
            continue
        dict_result['Id'] = dict_record['control_number']
        dict_result['Infoscience_URL'] = "{}/record/{}".format(
            'https://infoscience.epfl.ch', dict_record['control_number'])
        dict_result['ELA_Icon'] = \
            dict_record['electronic_location_access']['icon']
        dict_result['ELA_URL'] = \
            dict_record['electronic_location_access']['fulltexts']
        dict_result['DOI'] = dict_record['other_standard_identification_doi']
        dict_result['Title'] = dict_record['title']
        dict_result['Authors'] = dict_record['added_entry_personal_name']
        dict_result['Authors_1'] = \
            dict_record['added_entry_uncontrolled_name_person_1']
        dict_result['Authors_3'] = \
            dict_record['added_entry_uncontrolled_name_person_3']
        dict_result['Directors'] = \
            dict_record['added_entry_uncontrolled_name_person_2']
        dict_result['Patents'] = dict_record['patent_control_information']
        dict_result['Publication_Location'] = \
            dict_record['publication_distribution'].get('a', '')
        dict_result['Publication_Institution'] = \
            dict_record['publication_distribution'].get('b', '')
        dict_result['Publication_Date'] = dict_record['date_of_publication']
        dict_result['Publication_Year'] = \
            set_year(dict_result['Publication_Date'])
        dict_result['Publication_Pages'] = \
            dict_record['physical_description_extent']
        dict_result['Publisher'] = dict_record['host_item_entry'].get('t', '')
        dict_result['Publisher_Volume'] = \
            dict_record['host_item_entry'].get('j', '')
        dict_result['Publisher_Volume_Number'] = \
            dict_record['host_item_entry'].get('k', '')
        dict_result['Publisher_Volume_Pages'] = \
            dict_record['host_item_entry'].get('q', '')
        dict_result['Local_Url_Link'] = \
            dict_record['local_added_entry_url_link']
        dict_result['Conference_Meeting_Name'] = \
            dict_record['added_entry_meeting'].get('a', '')
        dict_result['Conference_Meeting_Location'] = \
            dict_record['added_entry_meeting'].get('c', '')
        dict_result['Conference_Meeting_Date'] = \
            dict_record['added_entry_meeting'].get('d', '')
        dict_result['Corporate_Name'] = \
            dict_record['added_entry_corporate_name']
        dict_result['Company_Name'] = \
            dict_record['added_entry_uncontrolled_name_company']
        dict_result['Approved_Publications'] = \
            dict_record['approved_publications']
        dict_result['Pending_Publications'] = \
            dict_record['pending_publications']
        dict_result['Doc_Type'] = dict_record['source_of_acquisition']
        dict_result['ISBN'] = dict_record['isbn']
        dict_result['Summary'] = dict_record['summary']
        dict_result['Description'] = [
            entry.format_field() for entry in record.physicaldescription()]
        dict_result['Subjects'] = [
            entry.format_field() for entry in record.subjects()]

        is_pending = dict_result['Pending_Publications'] and \
            not dict_result['Approved_Publications']
        if not is_pending or can_display_pending_publications:
            result.append(dict_result)

    if len(result) == 0 and len(reader) > 0:
        result.append(get_message('info',
                                  _('There are only pending publications')))
    return result
                                          prefix + 'subfield')
        data_subfield.set('code', subfield[0])
        data_subfield.text = translate(subfield[1])
    return root


# open marcxml file
filename = 'full-041417.xml'
fileout = filename[:-4] + '-appended' + '.xml'
# fileout = filename[:-4] + '.tsv'
# fileout = filename[:-4] + '.json'

xmlhandle = NSMarcXml()
records = marcxml.parse_xml_to_array(filename, strict=True)

"""
sample code for writing out to json
"""
# jsonwriter = JSONWriter(open(fileout, 'wt'))
# for record in records:
#     jsonwriter.write(record)
# jsonwriter.close()

"""
reads in tsv and generates dictionary of cleaned values with id keys
"""
cleaned_dict = {}
cleaned_keys = []
with open('cleaned.tsv', 'r') as fh:
    cleaned_list = csv.reader(fh, delimiter="\t")
    for row in cleaned_list:
        # cleaned_dict[row[0].strip()] = [row[2], row[4]]
        cleaned_dict[row[0]] = [row[2].strip(), row[4].strip()]
def main(filename, out_filename=""):
    def field_to_tsv(field, collection):
        out_path = emx.get_out_filename(filename, field['desc'])
        with open(out_path, 'w', newline='') as fh:  # open output file
            columns = emx.COLUMNS_WORK + field['codes']
            csv.register_dialect(
                'marcxmltotsv',
                delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='',
                doublequote=False, escapechar=None)
            csv_writer = csv.DictWriter(fh, fieldnames=columns,
                                        dialect='marcxmltotsv')
            headers = {}
            headers_list = []
            headers_list += emx.COLUMNS_WORK
            if 'headers' in field:
                headers_list += field['headers']
            else:
                headers_list += field['codes']
            for i, column in enumerate(columns):
                headers[column] = headers_list[i]
            csv_writer.writerow(headers)

            def get_curr_row(curr_fields, curr_work, tag):
                curr_row = {}
                if curr_fields and tag == '989':
                    curr_row = emx.get_field_989_values(
                        curr_fields, field['codes'], curr_work)
                else:
                    for curr_field in curr_fields:
                        curr_row = curr_work
                        curr_row = emx.get_field_vol_values(
                            curr_field, field['codes'], curr_row)
                return curr_row

            for record in collection:
                curr_work = emx.get_work_metadata(record)
                curr_fields = record.get_fields(field['tag'])
                if len(DATE_RANGE) < 2:
                    curr_row = get_curr_row(curr_fields, curr_work,
                                            field['tag'])
                    if curr_row:
                        csv_writer.writerow(curr_row)
                    continue
                else:
                    curr_date = emx.get_pymarc_field_value(
                        '008', record)[7:11].strip()
                    if len(curr_date) == 4 and \
                            DATE_RANGE[0] <= int(curr_date) <= DATE_RANGE[1]:
                        curr_row = get_curr_row(curr_fields, curr_work,
                                                field['tag'])
                        if curr_row:
                            csv_writer.writerow(curr_row)
        return 0

    # parse xml
    collection = marcxml.parse_xml_to_array(filename, strict=True)
    for datafield in DATAFIELDS:
        field_to_tsv(datafield, collection)