def convert_marc_xml(self, db_update_obj):
    # check if it is a MARC file
    if self.source.endswith('.mrc') or self.source.endswith('.marc'):
        # for each .mrc file create a sub-folder based on timestamp to store converted MARC/XML files
        subfolder_name = '%s_%s' % (
            self.source.replace("Webapp/source/MARC/", "").replace(
                "Webapp/source/BIBFRAME/", "").split('.')[0],
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        db_update_obj.master_file = subfolder_name
        db_update_obj.save()
        subfolder = os.path.join(self.folder, subfolder_name)
        if not os.path.exists(subfolder):
            os.makedirs(subfolder)
        # create the BIBFRAME converter object
        BIBFRAME = XML_BIBFRAME(subfolder_name)
        output = os.path.join(subfolder, '')
        with open(self.source, "rb") as marc_file:
            reader = MARCReader(marc_file, to_unicode=True, force_utf8=False,
                                utf8_handling='ignore')
            for i, record in enumerate(reader):
                # print("converting record number %s to XML" % (str(i)))
                if record.title():
                    ti = record.title()
                    ti = ti.replace("/", "")
                    ti = ti.replace(" ", "_")
                    ti = ti[0:50]
                    writer = XMLWriter(open(output + ti + '.xml', 'wb'))
                    writer.write(record)
                    writer.close()
                else:
                    writer = XMLWriter(open(output + 'unknownTitle' + str(i) + '.xml', 'wb'))
                    writer.write(record)  # write and close so the record is not lost
                    writer.close()
        # convert MARC/XML to BIBFRAME
        db_update_obj.stage = "MARC-XML_to_BIBFRAME"
        db_update_obj.save()
        BIBFRAME.convert_to_BIBFRAME(i, db_update_obj)
        # merge the BIBFRAME files into one (per the master MARC file) for ease of processing
        merged = BIBFRAME.merger()
        return merged
def writeRecordToFile(record, filename):
    # Write a single record out as MARCXML; report (but do not raise) write failures
    try:
        writer = XMLWriter(open(filename, 'wb'))
        writer.write(record)
    except Exception:
        print("Error: Cannot write metadata to " + filename)
        return
    writer.close()  # Important!
def main():
    files = parser.parse_args().files
    for filename in files:
        print(filename)
        with open(filename, 'rb') as fp:
            reader = MARCReader(fp)
            writer = XMLWriter(open(filename.split('.')[0] + '.xml', 'wb'))
            for record in reader:
                writer.write(record)
            writer.close()  # Important!
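The main() above relies on a module-level argparse parser that is not defined in this excerpt. A minimal sketch of what that setup might look like (the description string and help text are assumptions):

import argparse

# Hypothetical module-level parser assumed by main() above.
parser = argparse.ArgumentParser(description='Convert binary MARC files to MARCXML.')
parser.add_argument('files', nargs='+', help='one or more .mrc files to convert')

if __name__ == '__main__':
    main()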
def upload_success(filename):
    if request.method == 'POST':
        file = "uploads/" + filename
        if not os.path.exists("processed"):
            os.makedirs("processed")
        with open(file, "rb") as infile:
            ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
            reader = MARCReader(infile, to_unicode=True, force_utf8=False,
                                utf8_handling='ignore')
            files = []
            for i, record in enumerate(reader):
                print("converting record number " + str(i) + " to XML")
                if record.title():
                    ti = record.title()
                    ti = ti.replace("/", "")
                    ti = ti.replace(" ", "_")
                    ti = ti[0:50]
                    writer = XMLWriter(open('processed/' + ti + '.xml', 'wb'))
                    writer.write(record)
                    writer.close()
                    files.append(ti)
                else:
                    writer = XMLWriter(open('processed/' + 'unknownTitle' + str(i) + '.xml', 'wb'))
                    writer.write(record)  # write and close so the file parses later
                    writer.close()
                    files.append('unknownTitle' + str(i))
        # print(file + " was successfully converted to XML")
        xslt = ET.parse("marc2bibframe2-master/xsl/marc2bibframe2.xsl")
        tf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        # print("walltime:", datetime.strptime(tf, '%H:%M:%S') - datetime.strptime(ts, '%H:%M:%S'))
        for i, file in enumerate(files):
            # print("start processing record number " + str(i))
            # print("starting BIBFRAME transformation")
            f = "processed/" + file + ".xml"
            dom = ET.parse(f)
            transform = ET.XSLT(xslt)
            newdom = transform(dom)
            if not os.path.exists("BIB"):
                os.makedirs("BIB")
            with open("BIB/" + file + ".xml", "w+") as oo:
                oo.write(str(newdom).replace('<?xml version="1.0"?>', ''))
            # print("starting enrichment process")
            main("BIB/" + file + ".xml")
        tf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        # print("walltime:", datetime.strptime(tf, '%H:%M:%S') - datetime.strptime(ts, '%H:%M:%S'))
        return redirect(url_for('process_success', files=file))
    return '''
def convert_marc_xml(self):
    # for each .marc file in the "marc" folder, convert MARC to MARC/XML and then to BIBFRAME
    for index, files in enumerate(os.listdir(self.source)):
        # check if it is a MARC file
        if files.endswith('.mrc') or files.endswith('.marc'):
            # for each .mrc file create a sub-folder based on timestamp to store converted MARC/XML files
            subfolder_name = '%s_%s' % (
                files.split('.')[0],
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            subfolder = os.path.join(self.folder, subfolder_name)
            if not os.path.exists(subfolder):
                os.makedirs(subfolder)
            # create the BIBFRAME converter object
            BIBFRAME = XML_BIBFRAME(subfolder_name)
            file = os.path.join(self.source, files)
            output = os.path.join(subfolder, '')
            with open(file, "rb") as marc_file:
                reader = MARCReader(marc_file, to_unicode=True, force_utf8=False,
                                    utf8_handling='ignore')
                for i, record in enumerate(reader):
                    print("converting record number %s of file number %s to XML" %
                          (str(i), str(index + 1)))
                    if record.title():
                        ti = record.title()
                        ti = ti.replace("/", "")
                        ti = ti.replace(" ", "_")
                        ti = ti[0:50]
                        writer = XMLWriter(open(output + ti + '.xml', 'wb'))
                        writer.write(record)
                        writer.close()
                    else:
                        writer = XMLWriter(open(output + 'unknownTitle' + str(i) + '.xml', 'wb'))
                        writer.write(record)  # write and close so the record is not lost
                        writer.close()
            # convert MARC/XML to BIBFRAME
            BIBFRAME.convert_to_BIBFRAME(i)
            # merge the BIBFRAME files into one (per the master MARC file) for ease of processing
            BIBFRAME.merger()
def convert(self, filename):
    """Convert file into zero or more MARC records."""
    g = Graph()
    with open(filename, 'r') as fh:
        g.parse(file=fh, format='n3')
    # Get JSON-LD object in PyLD form
    jld = pyld_jsonld_from_rdflib_graph(g)
    # print(json.dumps(jld, indent=2, sort_keys=True))
    # Manipulate the JSON in some way (say switch @context
    # to a reference) and output
    # -- no easy way to do this in rdflib
    with open('cache/biblioteko_context.json', 'r') as cfh:
        context = json.load(cfh)
    with open('cache/biblioteko_frame.json', 'r') as ffh:
        frame = json.load(ffh)
    comp = jsonld.compact(jld, ctx=context)
    comp = jsonld.compact(jsonld.frame(comp, frame), ctx=context,
                          options={'graph': True})
    if self.dump_json:
        sys.stderr.write("Framed and compacted JSON-LD:\n" +
                         json.dumps(comp, indent=2, sort_keys=True) + '\n')
    self.jsonld = comp
    memory = BytesIO()
    writer = XMLWriter(memory)
    for obj in self.jsonld['@graph']:
        if obj.get('type') == 'bf:Instance':
            writer.write(self.make_marc(obj))
        else:
            logging.info("Ignoring object %s type %s" %
                         (obj.get('id', 'NO-URI'), obj.get('type', 'NO-TYPE')))
    writer.close(close_fh=False)  # Important!
    xml = memory.getvalue().decode(encoding='UTF-8')
    # Dumb semi pretty print
    xml = re.sub(r'(</\w+>)', r'\1\n', xml)
    xml = re.sub(r'(\?>)', r'\1\n', xml)
    xml = re.sub(r'(<record>)', r'\n\1\n', xml)
    return xml
def marc_record_to_xml_string(record):
    b = BytesIO()
    writer = XMLWriter(b)
    writer.write(record)
    writer.close(close_fh=False)
    # Transform record from bytes to string.
    b.seek(0, 0)
    bytestr = b.read()
    marc = bytestr.decode('UTF-8')
    # Remove the XML declaration and collection stuff.
    marc = re.sub(r'^.*<collection[^>]*>', '', marc)
    marc = re.sub(r'</collection>$', '', marc)
    # Remove tab characters.
    marc = re.sub(r'\t', '', marc)
    # Verify cleaning worked:
    if not marc.startswith('<record'):
        print("Error: record failed to create clean XML.")
        return False
    else:
        return marc
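A minimal usage sketch for marc_record_to_xml_string(), assuming pymarc's Record and Field; the flat subfields list matches the older pymarc style used in the other examples here, and the title data is purely illustrative:

from pymarc import Record, Field

rec = Record()
rec.add_field(Field(tag='245', indicators=['0', '0'],
                    subfields=['a', 'Example title']))  # illustrative data
xml_fragment = marc_record_to_xml_string(rec)
if xml_fragment:
    print(xml_fragment)  # a bare <record>...</record> element, no collection wrapper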
def create_MARC_XML(self):
    """
    Creates a MARC XML format file from the given dataframe
    :return:
    """
    df = self.marc_data

    # MARCXML file
    output_file = self.data_path_processed / (
        self.collection_id + "_final_" + self.dt_now + ".xml")
    writer = XMLWriter(open(output_file, "wb"))

    # MarcEdit MRK file
    output_file_mrk = self.data_path_processed / (
        self.collection_id + "_finalMRK_" + self.dt_now + ".txt")
    mrk_file = open(output_file_mrk, "w", encoding="utf8")

    start_time = time.time()
    counter = 1
    for index, row in df.iterrows():
        record = Record()
        # add control field
        record.add_field(Field(tag="001", data=str(index)))
        for col in df:
            # if field is empty, skip
            if str(row[col]) == "":
                continue
            # leader
            elif col == "LDR":
                l = record.leader
                l.record_status = "c"  # c - Corrected or revised
                l.type_of_record = "p"  # p - Mixed materials
                # Bibliographic level
                if row["351"] in ("File Record", "Item Record"):
                    l.bibliographic_level = "c"
                else:
                    l.bibliographic_level = "d"
                l.coding_scheme = "a"  # flag saying this record is utf8
                l.cataloging_form = "a"
                continue
            # 008
            elif col == "008":
                field = Field(tag="008", data=row[col])
                record.add_field(field)
                continue

            # extract field name
            field = col[:3]

            # extract indicators
            if col.find("_") == -1:
                col_name = "{:<5}".format(col)
                ind = [col_name[3], col_name[4]]
            else:
                col_name = "{:<5}".format(col[:col.find("_")])
                ind = [col_name[3], col_name[4]]

            # extract sub-fields
            subfields_data = list()
            subfields_prep = list(filter(None, str(row[col]).split("$$")))
            for subfield in subfields_prep:
                if subfield == "":
                    continue
                subfields_data.append(subfield[0])
                subfields_data.append(subfield[1:])

            # print('field:', field)
            # if not ind:
            #     print('no indicators')
            # else:
            #     print('indicators:', ind)
            # print('subfields:', subfields_data)

            record.add_field(
                Field(tag=field, indicators=ind, subfields=subfields_data))
        counter += 1
        # mrk_file.write(record.as_marc())
        writer.write(record)

    writer.close()
    mrk_file.close()
    run_time = time.time() - start_time
    return counter, run_time
            subfields = ['w', wval, 'a', unicode(label)]
        else:
            subfields = ['a', unicode(label)]
        rec.add_field(
            Field(tag='550', indicators=[' ', ' '], subfields=subfields))

    # skos:note -> 680
    for note in sorted(g.objects(conc, SKOS.note), key=lambda v: v.language):
        rec.add_field(
            Field(tag='680', indicators=[' ', ' '],
                  subfields=['i', unicode(note),
                             '9', format_language(note.language)]))

    # TODO 750 links to ysa/allars/lcsh - need to look up labels for the URIs

    writer.write(rec)

writer.close()
def convert(cs, language, g):
    vocId = cs.get("vocabulary_code")
    # variable for a bit complicated constants and casting/converting them to appropriate types
    helper_variables = {
        "vocCode": (cs.get("vocabulary_code") + "/" + LANGUAGES[language]
                    if cs.getboolean("multilanguage", fallback=False)
                    else vocId),
        "groupingClasses": [URIRef(x) for x in cs.get(
            "groupingClasses", fallback=",".join(GROUPINGCLASSES)).split(",")],
        "groupingClassesDefault": [URIRef(x) for x in cs.parser.get(
            "DEFAULT", "groupingClasses", fallback=",".join(GROUPINGCLASSES)).split(",")],
        'modificationDates': cs.get("modificationDates", fallback=None),
        'keepModified': cs.get("keepModifiedAfter", fallback=None),
        'keepGroupingClasses': cs.getboolean("keepGroupingClasses", fallback=False),
        'defaultOutputFileName': "yso2marc-" + cs.name.lower() + "-" + language + ".mrcx"
    }

    if helper_variables['keepModified']:
        helper_variables['keepModifiedLimit'] = (
            False
            if cs.get("keepModifiedAfter", fallback=KEEPMODIFIEDAFTER).lower() == "all"
            else datetime.date(datetime.strptime(cs.get("keepModifiedAfter"), "%Y-%m-%d")))

    if cs.get("output", fallback=None):
        parts = cs.get("languages").split(",")
        if len(parts) > 1:
            output = cs.get("output")
            if len(output.split(".")) > 1:
                helper_variables["outputFileName"] = ".".join(
                    output.split(".")[:-1]) + "-" + language + "." + output.split(".")[-1]
            else:
                helper_variables["outputFileName"] = output + "-" + language

    if "outputFileName" not in helper_variables:
        helper_variables["outputFileName"] = cs.get(
            "output", fallback=helper_variables["defaultOutputFileName"])

    # modified_dates is a dict object keyed by record id, with a tuple of the record's
    # last modification date and the record contents as an MD5 hash as the value
    if helper_variables['modificationDates']:
        if os.path.isfile(helper_variables['modificationDates']):
            with open(helper_variables['modificationDates'], 'rb') as pickle_file:
                try:
                    modified_dates = pickle.load(pickle_file)
                except EOFError:
                    logging.error("The file %s for modification dates is empty " %
                                  helper_variables['modificationDates'])
                    sys.exit(2)
        else:
            modified_dates = {}

    logging.info("Processing vocabulary with vocabulary code '%s' in language '%s'" %
                 (vocId, language))

    incrementor = 0
    writer_records_counter = 0

    ET_namespaces = {"marcxml": "http://www.loc.gov/MARC21/slim"}

    handle = open(cs.get("output", fallback=helper_variables["defaultOutputFileName"]), "wb")
    writer = XMLWriter(handle)

    # list the prefLabels so that altLabels that are another concept's prefLabel can be filtered out
    pref_labels = set()
    for conc in g.subjects(RDF.type, SKOS.Concept):
        pref_label = g.preferredLabel(conc, lang=language)
        if pref_label:
            pref_labels.add(str(pref_label[0][1]))

    # only these MTS concept groups are included; no MARC21 record is made from the group name itself
    ids = {"occupations": ['m2332'], "titles": ['m121', 'm3764']}
    uris = {}
    for key in ids:
        uris[key] = set()
        for id in ids[key]:
            uris[key].add(MTS + id)

    for group in g.subjects(RDF.type, ISOTHES.ConceptGroup):
        for key in uris:
            if any(str(group).endswith(uri) for uri in uris[key]):
                get_member_groups(g, group, uris[key])

    concs = []
    if helper_variables['keepModified']:
        concs = []
        for uri in modified_dates:
            if modified_dates[uri][0] >= helper_variables['keepModifiedLimit']:
                concs.append(URIRef(uri))
    else:
        for conc in g.subjects(RDF.type, SKOS.Concept):
            concs.append(conc)

    # identifiers of the created concepts, used with the modification_dates list to detect deleted concepts
    created_concepts = set()

    for concept in concs:
        # MARC21 records are created only for occupations and titles
        if not (concept in uris['occupations'] or concept in uris['titles']):
            continue
        created_concepts.add(str(concept))
        incrementor += 1
        if incrementor % 1000 == 0:
            logging.info("Processing %sth concept" % (incrementor))
        # skip the grouping concepts
        if not helper_variables['keepGroupingClasses']:
            if any(conceptType in helper_variables["groupingClasses"]
                   for conceptType in g.objects(concept, RDF.type)):
                continue
        rec = Record()
        rec.leader = cs.get("leaderNew", fallback=LEADERNEW)
        # 024 other standard identifiers - the concept URI is stored here
        rec.add_field(
            Field(tag='024', indicators=['7', ' '],
                  subfields=['a', concept, '2', "uri"]))
        # 040 cataloguing agency
        rec.add_field(
            Field(tag='040', indicators=[' ', ' '],
                  subfields=['a', cs.get("creatorAgency", fallback=CREATOR_AGENCY),
                             'b', LANGUAGES[language],
                             'f', helper_variables["vocCode"]]))
        valueProps = sorted(getValues(g, concept, SKOS.prefLabel, language=language),
                            key=lambda o: o.value)
        if len(valueProps) == 0:
            logging.warning(
                "Could not find preflabel for concept %s in language %s. Skipping the whole concept." %
                (concept, language))
            continue
        elif len(valueProps) != 1:
            logging.warning(
                "Multiple prefLabels detected for concept %s in language %s. Choosing the first." %
                (concept, language))
        if concept in uris['occupations']:
            tag = "174"
            subfield_code = "a"
        elif concept in uris['titles']:
            tag = "168"
            subfield_code = "d"
        rec.add_field(
            Field(tag=tag, indicators=[' ', ' '],
                  subfields=[subfield_code,
                             decomposedÅÄÖtoUnicodeCharacters(
                                 unicodedata.normalize(NORMALIZATION_FORM,
                                                       str(valueProps[0].value)))]))
        # skos:altLabel -> 467, 474
        # 450 see reference
        # do not produce 45X fields that are 15X fields in another concept, except in the case of altLabels
        seen_values = set()
        for valueProp in sorted(getValues(g, concept, [SKOS.altLabel], language=language),
                                key=lambda o: str(o.value)):
            if valueProp.prop != SKOS.altLabel and str(valueProp.value) in pref_labels:
                continue
            if valueProp.prop == SKOS.hiddenLabel:
                if str(valueProp.value) in seen_values:
                    continue
                seen_values.add(str(valueProp.value))
            if concept in uris['occupations']:
                tag = "474"
                subfield_code = "a"
            elif concept in uris['titles']:
                tag = "468"
                subfield_code = "d"
            rec.add_field(
                Field(tag=tag, indicators=[' ', ' '],
                      subfields=[subfield_code,
                                 decomposedÅÄÖtoUnicodeCharacters(
                                     unicodedata.normalize(NORMALIZATION_FORM,
                                                           str(valueProp.value)))]))
        valueProps = getValues(g, concept, [
            SKOS.prefLabel, SKOS.exactMatch, SKOS.closeMatch,
            SKOS.broadMatch, SKOS.narrowMatch, SKOS.relatedMatch
        ])
        fields = list()  # collect the fields in this variable and sort them at the end
        for valueProp in valueProps:
            if valueProp.prop == SKOS.prefLabel:
                # filter out same-language labels, which already went into the 1XX fields
                # here valueProp.value exceptionally already contains the desired literal
                # (cf. the other cases, where it is a node)
                if valueProp.value.language == language:
                    continue
            else:
                # only take references to the same vocabulary
                continue
            if concept in uris['occupations']:
                tag = "774"
                subfield_code = "a"
            elif concept in uris['titles']:
                tag = "768"
                subfield_code = "d"
            sub2 = "mts" + "/" + LANGUAGES[valueProp.value.language]
            fields.append(
                Field(tag=tag, indicators=[' ', ' '],
                      subfields=[subfield_code,
                                 decomposedÅÄÖtoUnicodeCharacters(
                                     unicodedata.normalize(NORMALIZATION_FORM,
                                                           str(valueProp.value))),
                                 '4', 'EQ',
                                 '2', sub2,
                                 '0', concept]))
        # sort fields and add them
        for sorted_field in sorted(fields, key=lambda o: (o.tag, o.value().lower())):
            rec.add_field(sorted_field)
        writer_records_counter += 1
        writer.write(rec)
        if helper_variables['modificationDates']:
            md5 = hashlib.md5()
            md5.update(str.encode(str(rec)))
            hash = md5.hexdigest()
            if str(concept) in modified_dates:
                if not hash == modified_dates[str(concept)][1]:
                    modified_dates[str(concept)] = (date.today(), hash)
            else:
                modified_dates[str(concept)] = (date.today(), hash)

    # produce the deleted concepts when fetching modified concepts:
    # if a record is in the file given with the modification_dates parameter but not in the graph,
    # it is interpreted as a deleted record
    # MTS has no deprecation dates
    # if helper_variables['keepModified']:
    concs = []
    for conc in g.subjects(RDF.type, SKOS.Concept):
        if conc in uris['occupations'] or conc in uris['titles']:
            concs.append(str(conc))
    for conc in modified_dates:
        if conc not in concs:
            # if there is no hash (2nd value of the tuple), create a deprecated concept
            if modified_dates[conc][1]:
                rec = Record()
                rec.leader = cs.get("leaderDeleted0", fallback=LEADERDELETED0)
                rec.add_field(
                    Field(tag='024', indicators=['7', ' '],
                          subfields=['a', conc, '2', "uri"]))
                modified_dates[conc] = (date.today(), "")
                writer_records_counter += 1
                writer.write(rec)

    if handle is not sys.stdout:
        writer.close()

    if helper_variables['modificationDates']:
        with open(helper_variables['modificationDates'], 'wb') as output:
            pickle.dump(modified_dates, output, pickle.HIGHEST_PROTOCOL)

    # if all concepts are created, finally produce the concepts in pretty-printed XML form
    # if not helper_variables['keepModified']:
    parser = ET.XMLParser(remove_blank_text=True, strip_cdata=False)
    file_path = helper_variables["outputFileName"]
    tree = ET.parse(file_path, parser)
    e = tree.getroot()
    handle = open(cs.get("output", fallback=helper_variables["defaultOutputFileName"]), "wb")
    handle.write(ET.tostring(e, encoding='UTF-8', pretty_print=True, xml_declaration=True))
    if handle is not sys.stdout:
        handle.close()

    # log some information about the conversion
    logging.info("Processed %s concepts. Wrote %s MARCXML records." %
                 (incrementor, writer_records_counter))

    if cs.get("outputSpecified", fallback=None) is None:
        outputChannel = sys.stdout.buffer
        with open(cs.get("output", fallback=helper_variables['defaultOutputFileName']), "rb") as f:
            shutil.copyfileobj(f, outputChannel)
    if cs.get("outputSpecified", fallback=None) is None:
        os.remove(cs.get("output", fallback=helper_variables['defaultOutputFileName']))

    logging.info("Conversion completed: %s" %
                 datetime.now().replace(microsecond=0).isoformat())
def compare_records(args):
    """
    input_file_1: file name of the MARC records to compare against
    input_file_2: file name of the MARC records from which modified and new records are saved
    output_file: name of the MARCXML file built from the modified and new records
    pickle_file: stores the modification dates into a pickle file according to the date_1 and date_2 parameters
    date_1: original date
    date_2: modification date
    """
    # git rev-list -1 --before="2019-08-23 23:59" master
    # git log
    input_file_1 = args.first_input_file
    input_file_2 = args.second_input_file
    mrcx_file = args.output_mrcx
    pickle_file = args.output_pkl
    date_1 = args.original_date
    date_2 = args.modified_date
    modified_records = 0
    new_records = 0
    all_records = {}
    loglevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loglevel)

    if date_1:
        old_date = datetime.date(datetime.strptime(date_1, "%Y-%m-%d"))
    else:
        old_date = date.fromtimestamp(os.path.getmtime(input_file_1))
    if date_2:
        new_date = datetime.date(datetime.strptime(date_2, "%Y-%m-%d"))
    else:
        new_date = date.fromtimestamp(os.path.getmtime(input_file_2))

    writer = XMLWriter(open(mrcx_file, "wb"))

    records = parse_xml_to_array(input_file_1)
    old_records_dict = {}
    for record in records:
        md5 = hashlib.md5()
        md5.update(str.encode(str(record)))
        for field in record.get_fields('024'):
            old_records_dict.update({field['a']: md5.hexdigest()})

    records = parse_xml_to_array(input_file_2)
    for record in records:
        record_id = None
        modified = False
        modified_date = old_date
        for field in record.get_fields('024'):
            record_id = field['a']
        if record_id:
            # hash the record up front so new records also get a digest
            md5 = hashlib.md5()
            md5.update(str.encode(str(record)))
            hash = md5.hexdigest()
            if record_id in old_records_dict:
                old_hash = old_records_dict[record_id]
                if not old_hash == hash:
                    modified = True
                    modified_records += 1
            else:
                modified = True
                new_records += 1
        else:
            logging.warning("Record id missing")
        if modified:
            writer.write(record)
            modified_date = new_date
        all_records[record_id] = (modified_date, hash)

    logging.info("Number of modified records: %s" % modified_records)
    logging.info("Number of new records: %s" % new_records)

    if pickle_file:
        with open(pickle_file, 'wb') as output:
            pickle.dump(all_records, output, pickle.HIGHEST_PROTOCOL)

    writer.close()

    parser = ET.XMLParser(remove_blank_text=True, strip_cdata=False)
    tree = ET.parse(mrcx_file, parser)
    e = tree.getroot()
    handle = open(mrcx_file, "wb")
    handle.write(ET.tostring(e, encoding='UTF-8', pretty_print=True, xml_declaration=True))
    handle.close()
def writeMARCXML(record, filename):
    # Write out record in MARCXML format
    writer = XMLWriter(open(filename, 'wb'))
    writer.write(record)
    writer.close()  # Important!
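For context, a small round-trip sketch using writeMARCXML() above; the input path and output naming are assumptions:

from pymarc import MARCReader

# Read records from a binary MARC file and write each one to its own MARCXML file.
with open('records.mrc', 'rb') as fh:
    for i, record in enumerate(MARCReader(fh)):
        writeMARCXML(record, 'record_%d.xml' % i)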
def main():
    global inputfile, target, mapping

    parser = argparse.ArgumentParser(
        description='Process and map classification authority records for BCUR.')
    parser.add_argument('-i', '--inputfiles', type=str, nargs='+',
                        help='one or more file(s) to be processed', required=True)
    parser.add_argument('-o', '--outputfile', type=str, nargs=1,
                        help='name of the output file', required=True)
    parser.add_argument('-m', '--map', type=str, nargs=1,
                        help='map target code', required=True, choices=valid_targets)
    args = parser.parse_args()

    targetcode = args.map[0]

    # For musi and musg records, found records are copied as-is, no field mapping.
    if targetcode in ('musi', 'musg'):
        mapping = False

    outputfile = args.outputfile[0]

    # Open a new XML document in which target records will be stored
    global writer
    writer = XMLWriter(open(outputfile, 'wb'))

    # Record start processing time
    tstart = datetime.datetime.now()

    # Print header row in case log is opened as CSV
    print("Notice,Champ,Contenu du champ,Message")

    # Loop through the list of input files and call the mapping function
    for infile in args.inputfiles:
        inputfile = infile
        target = targetcode
        if mapping:
            print(f"----- Traitement du fichier {inputfile} avec mapping {target} -----")
        else:
            print(f"----- Traitement du fichier {inputfile} sans mapping -----")

        # This applies the mapping function to each record in inputfile
        map_xml(record_map, inputfile)

        if targetcode == 'vddoc':
            # For vddoc, also look for vddoc-la
            target = 'vddoc-la'
            map_xml(record_map, inputfile)

    # Calculate the total time elapsed
    tdiff = datetime.datetime.now() - tstart

    # Close the output document
    writer.close()

    print(f'Routine terminée en {tdiff.total_seconds()} secondes. Résultat enregistré dans {outputfile}')