def validate_corpus(corpus, error_file_path):
    """Validate every document of *corpus* and write the errors to a file.

    Each error line has the form ``corpus:rec_source:rec_id:error``.

    Args:
        corpus: name of the corpus to validate; falsy -> no-op.
        error_file_path: path of the file errors are written to; falsy -> no-op.

    Returns:
        The list of formatted error strings, or None when either argument
        is falsy (original behavior preserved).
    """
    if corpus and error_file_path:
        with open(error_file_path, "w") as error_file:
            # fetch
            document_list = repository_service.get_documents(corpus)
            # validate
            all_errors = []
            for document in document_list:
                rec_id = document["rec_id"]
                # BUG FIX: the fallback source label was misspelled "empy";
                # also use dict.get instead of the membership-test-then-index.
                rec_source = document.get("rec_source", "empty")
                errors = metajson_validation.validate_metajson_document(document)
                for error in errors:
                    formatted_error = "".join([corpus, ":", rec_source, ":", rec_id, ":", error, "\n"])
                    all_errors.append(formatted_error)
                    # error_file is always open (truthy) inside this with-block,
                    # so the original `if error_file:` guard was dead code.
                    error_file.write(formatted_error)
            return all_errors
def export_corpus(corpus, output_file_path, output_format, all_in_one_file, one_record_per_copy=False):
    """Export the documents of *corpus* to *output_file_path* in *output_format*.

    When *one_record_per_copy* is true, each physical copy is expanded into
    its own record before conversion.  When *all_in_one_file* is true the
    converted results go into a single file, otherwise one file per item.
    No-op when *corpus* or *output_file_path* is falsy.
    """
    if not (corpus and output_file_path):
        return
    # fetch the corpus documents
    documents = repository_service.get_documents(corpus)
    # optionally expand to one record per physical resource
    to_convert = export_one_record_per_copy(documents) if one_record_per_copy else documents
    # convert to the requested output format
    converted = crosswalks_service.convert_metajson_list(to_convert, output_format, all_in_one_file)
    # export: pick the writer matching the single-file/multi-file choice
    if all_in_one_file:
        io_service.write_items_in_one_file(corpus, corpus, converted, output_file_path, output_format)
    else:
        io_service.write_items(corpus, corpus, converted, output_file_path, output_format)
def format_corpus(corpus, output_title, output_file_path, output_style):
    """Render the documents of *corpus* as an HTML file at *output_file_path*.

    Uses *output_title* as the document title and *output_style* as the
    citation/formatting style.  No-op when *corpus* or *output_file_path*
    is falsy.
    """
    if not (corpus and output_file_path):
        return
    # fetch
    documents = repository_service.get_documents(corpus)
    # format as HTML
    io_service.write_html(corpus, output_title, documents, output_file_path, output_style)
def format_corpus(corpus, output_title, output_file_path, output_style):
    """Format the corpus documents into a styled HTML page.

    NOTE(review): this definition duplicates an identical ``format_corpus``
    earlier in the file; consider removing one copy.
    """
    if corpus and output_file_path:
        # fetch the documents, then hand them to the HTML writer
        metajson_list = repository_service.get_documents(corpus)
        io_service.write_html(corpus, output_title, metajson_list, output_file_path, output_style)
def export_corpus(corpus, output_file_path, output_format, all_in_one_file, one_record_per_copy=False):
    """Convert and export the documents of *corpus*.

    NOTE(review): this definition duplicates an identical ``export_corpus``
    earlier in the file; consider removing one copy.
    """
    if corpus and output_file_path:
        # fetch
        metajson_docs = repository_service.get_documents(corpus)
        # one record per physical resource, if requested
        if one_record_per_copy:
            metajson_docs = export_one_record_per_copy(metajson_docs)
        # convert
        results = crosswalks_service.convert_metajson_list(metajson_docs, output_format, all_in_one_file)
        # export — single combined file or one file per item
        if all_in_one_file:
            io_service.write_items_in_one_file(corpus, corpus, results, output_file_path, output_format)
        else:
            io_service.write_items(corpus, corpus, results, output_file_path, output_format)
def validate_corpus(corpus, error_file_path):
    """Validate all documents of *corpus*, logging each error to *error_file_path*.

    NOTE(review): duplicates an identical ``validate_corpus`` earlier in the
    file; consider removing one copy.

    Returns:
        List of formatted ``corpus:rec_source:rec_id:error`` lines, or None
        when either argument is falsy.
    """
    if corpus and error_file_path:
        with open(error_file_path, "w") as error_file:
            # fetch
            document_list = repository_service.get_documents(corpus)
            # validate
            all_errors = []
            for document in document_list:
                rec_id = document["rec_id"]
                # BUG FIX: the default source label was misspelled "empy".
                rec_source = document.get("rec_source", "empty")
                for error in metajson_validation.validate_metajson_document(document):
                    formatted_error = "".join([corpus, ":", rec_source, ":", rec_id, ":", error, "\n"])
                    all_errors.append(formatted_error)
                    # error_file is guaranteed open inside the with-block;
                    # the original `if error_file:` guard was redundant.
                    error_file.write(formatted_error)
            return all_errors
if __name__ == "__main__": date_begin = datetime.datetime.now() # conf params corpus = "perio" source = "Sciences Po | la bibliothèque" rec_id_prefix = "" input_file_path = os.path.join("data", "unimarc", "periouni.mrc") input_format = constants.FORMAT_UNIMARC csv_file_name = "".join(["validation-", corpus, ".csv"]) csv_file_path = os.path.join("data", "result", csv_file_name) # conf corpus corpus_service.clean_corpus(corpus) corpus_service.conf_corpus(corpus, "aime") date_clean = datetime.datetime.now() chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None) # import corpus_service.import_metadata_file(corpus, input_file_path, input_format, source, rec_id_prefix, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("Import corpus", date_clean, date_import, None) # Validate perio documents = repository_service.get_documents(corpus) validate_perios(documents, csv_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate perio", date_import, date_validate, None)
if __name__ == "__main__": date_begin = datetime.datetime.now() # conf params corpus = "perio" source = "Sciences Po | la bibliothèque" rec_id_prefix = "" input_file_path = os.path.join("data", "unimarc", "periouni.mrc") input_format = constants.FORMAT_UNIMARC csv_file_name = "".join(["validation-", corpus, ".csv"]) csv_file_path = os.path.join("data", "result", csv_file_name) # conf corpus corpus_service.clean_corpus(corpus) corpus_service.conf_corpus(corpus, "aime") date_clean = datetime.datetime.now() chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None) # import corpus_service.import_metadata_file(corpus, input_file_path, input_format, source, rec_id_prefix, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("Import corpus", date_clean, date_import, None) # Validate perio documents = repository_service.get_documents(corpus) validate_perios(documents, csv_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate perio", date_import, date_validate, None)