def validate_corpus(args):
    """Run corpus validation for the corpus selected on the command line.

    Args:
        args: Object exposing a ``corpus`` attribute. When that attribute
            is falsy, the module-level ``default_corpus`` is used instead.

    The error file path is built as ``<this module's dir>/../log/
    validation-<corpus>.txt`` and handed, together with the corpus name,
    to ``corpus_service.validate_corpus``.
    """
    # Fall back to the module default when no corpus was supplied.
    selected = args.corpus or default_corpus
    logging.info("corpus: {}".format(selected))

    # The log directory sits next to this module's parent directory.
    log_dir = os.path.join(os.path.dirname(__file__), os.pardir, "log")
    report_path = os.path.join(log_dir, "validation-" + selected + ".txt")
    logging.info("error_file_path: {}".format(report_path))

    corpus_service.validate_corpus(selected, report_path)
] harvest_by_ids(corpus, target, ids) #harvest_by_set(corpus, target, "SHS:STAT") # path data_result_dir = os.path.join(os.path.dirname(__file__), os.pardir, "data", "result") logging.info("data_result_dir: {}".format(data_result_dir)) error_file_path = os.path.join(data_result_dir, "result_validation_errors.txt") metajson_file_path = os.path.join(data_result_dir, "result_didl_metajson_spire.json") mods_file_path = os.path.join(data_result_dir, "result_didl_mods_spire.json") repec_file_path = os.path.join(data_result_dir, "result_repec.txt") date_path = datetime.datetime.now() # validate corpus_service.validate_corpus(corpus, error_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate corpus", date_path, date_validate, None) # export MetaJSON corpus_service.export_corpus(corpus, metajson_file_path, constants.FORMAT_METAJSON, True) date_export_metajson = datetime.datetime.now() chrono.chrono_trace("Export corpus as MetaJSON", date_validate, date_export_metajson, None) # export MODS corpus_service.export_corpus(corpus, mods_file_path, constants.FORMAT_MODS, True) date_export_mods = datetime.datetime.now() chrono.chrono_trace("Export corpus as MODS", date_export_metajson, date_export_mods, None) # export RePEc corpus_service.export_corpus(corpus, repec_file_path, constants.FORMAT_REPEC, True)
corpus_service.clean_corpus(corpus) corpus_service.conf_corpus(corpus, "aime") date_clean = datetime.datetime.now() chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None) # import input_file_paths = io_service.get_relevant_file_list_by_format( input_dir_path, input_format) results = corpus_service.import_metadata_files(corpus, input_file_paths, input_format, source, rec_id_prefix, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("Import corpus", date_clean, date_import, None) # Validate corpus_service.validate_corpus(corpus, error_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate corpus", date_import, date_validate, None) # Export mods corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_MODS, False, True) date_export_mods = datetime.datetime.now() chrono.chrono_trace("Export corpus mods", date_validate, date_export_mods, None) # Export oai_dc corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_OAI_DC, False, True) date_export_oai_dc = datetime.datetime.now() chrono.chrono_trace("Export corpus oai_dc", date_export_mods,