metajson_list.append(oaipmh_harvester.get_record(target, identifier)) date_harvest = datetime.datetime.now() chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(ids)) # import result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("import", date_harvest, date_import, len(result_import)) if __name__ == "__main__": date_begin = datetime.datetime.now() # conf corpus corpus = "spire" corpus_service.clean_corpus(corpus) date_clean = datetime.datetime.now() chrono.chrono_trace("Initialize corpus", date_begin, date_clean, None) target = Target() target['identifier'] = 'spire' target['title'] = 'Sciences Po Institutional Repository' target['type'] = 'oaipmh' target['url'] = 'http://spire.sciencespo.fr/dissemination/oaipmh2-no-prefix-publications.xml' target['metadata_prefix'] = 'didl' ids = [ "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m26c8c8o3", "oai:spire.sciencespo.fr:2441/eo6779thqgm5r489makgoai85", "oai:spire.sciencespo.fr:2441/5l6uh8ogmqildh09h6m8hj429", "oai:spire.sciencespo.fr:2441/3fm4jv3k2s99lms9jb5i5asil",
def clean_corpus(args):
    """Clean the corpus named by ``args.corpus``.

    When ``args.corpus`` is empty or otherwise falsy, the module-level
    ``default_corpus`` is used instead. The selected corpus name is logged
    at INFO level and then handed to ``corpus_service.clean_corpus``.
    """
    # Fall back to the default corpus when none was supplied.
    selected = args.corpus or default_corpus
    logging.info("corpus: {}".format(selected))
    corpus_service.clean_corpus(selected)
# conf params
corpus = "num"
source = "FNSP"
rec_id_prefix = "sc_"
input_dir_path = os.path.join("data", "num", "input")
input_format = constants.FORMAT_UNIMARC
output_dir_path = os.path.join("data", "num", "output")
# Use makedirs rather than mkdir: the output path is nested, and mkdir
# raises when an intermediate directory does not exist yet.
if not os.path.exists(output_dir_path):
    os.makedirs(output_dir_path)
# The validation report is written to <output_dir>/validation-<corpus>.txt
error_file_name = "".join(["validation-", corpus, ".txt"])
error_file_path = os.path.join(output_dir_path, error_file_name)
#logging.debug("error_file_path: {}".format(error_file_path))

# conf corpus: clean it, then configure it with the "aime" setting
corpus_service.clean_corpus(corpus)
corpus_service.conf_corpus(corpus, "aime")
date_clean = datetime.datetime.now()
# date_begin must have been set before this section runs
chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

# import every input file matching the configured format
input_file_paths = io_service.get_relevant_file_list_by_format(
    input_dir_path, input_format)
results = corpus_service.import_metadata_files(
    corpus, input_file_paths, input_format, source, rec_id_prefix, True, None)
date_import = datetime.datetime.now()
chrono.chrono_trace("Import corpus", date_clean, date_import, None)

# Validate the imported corpus; the results go to error_file_path
corpus_service.validate_corpus(corpus, error_file_path)