Beispiel #1
0
def _log_conf_results(results, id_key, header, empty_message, log_empty=None):
    """Log every imported configuration entry and return how many were logged.

    results: iterable of mappings holding ``id_key`` and ``"_id"``; may be
        None or empty.
    id_key: name of the identifying key to print ("type_id" or "rec_type").
    header: line logged (info level) before the entries.
    empty_message: line logged when ``results`` is falsy.
    log_empty: logging callable used for ``empty_message``; defaults to
        ``logging.info``.
    """
    logging.info(header)
    if not results:
        (log_empty or logging.info)(empty_message)
        return 0
    count = 0
    for entry in results:
        count += 1
        logging.info("{}: {}, _id: {}".format(
            id_key, entry[id_key], entry["_id"]))
    return count


def conf_corpus(corpus, corpus_conf_dir_name):
    """Load the type and field configuration of a corpus and log the result.

    corpus: name of the corpus to configure; when falsy an error is logged
        and nothing else happens.
    corpus_conf_dir_name: name passed to conf_types / conf_fields to select
        the corpus-specific configuration; falls back to ``corpus`` when
        falsy.

    The "common" configuration is always loaded in addition to the
    corpus-specific one. Returns None.
    """
    if not corpus:
        logging.error("Error: empty corpus")
        return

    logging.info("init corpus: {}".format(corpus))

    if not corpus_conf_dir_name:
        corpus_conf_dir_name = corpus

    date_begin = datetime.datetime.now()

    # types
    results_types_common = conf_types(corpus, "common")
    results_types_corpus = conf_types(corpus, corpus_conf_dir_name)
    date_types = datetime.datetime.now()
    total_count = _log_conf_results(
        results_types_common, "type_id", "# Import common types:",
        "Empty common types", logging.debug)
    total_count += _log_conf_results(
        results_types_corpus, "type_id",
        "# Import {} types:".format(corpus),
        "Empty {} types".format(corpus))
    chrono.chrono_trace("conf_types", date_begin, date_types, total_count)

    # datafields
    results_fields_common = conf_fields(corpus, "common")
    # Use the same configuration name as for the types: passing the corpus
    # name here would silently ignore the corpus_conf_dir_name argument.
    results_fields_corpus = conf_fields(corpus, corpus_conf_dir_name)
    date_fields = datetime.datetime.now()
    total_count = _log_conf_results(
        results_fields_common, "rec_type", "# Import common fields:",
        "Empty common fields")
    total_count += _log_conf_results(
        results_fields_corpus, "rec_type",
        "# Import {} fields:".format(corpus),
        "Empty {} fields".format(corpus))
    chrono.chrono_trace("conf_fields", date_types, date_fields,
                        total_count)
def _log_conf_results(results, id_key, header, empty_message, log_empty=None):
    """Log every imported configuration entry and return how many were logged.

    results: iterable of mappings holding ``id_key`` and ``"_id"``; may be
        None or empty.
    id_key: name of the identifying key to print ("type_id" or "rec_type").
    header: line logged (info level) before the entries.
    empty_message: line logged when ``results`` is falsy.
    log_empty: logging callable used for ``empty_message``; defaults to
        ``logging.info``.
    """
    logging.info(header)
    if not results:
        (log_empty or logging.info)(empty_message)
        return 0
    count = 0
    for entry in results:
        count += 1
        logging.info("{}: {}, _id: {}".format(id_key, entry[id_key], entry["_id"]))
    return count


def conf_corpus(corpus, corpus_conf_dir_name):
    """Load the type and field configuration of a corpus and log the result.

    corpus: name of the corpus to configure; when falsy an error is logged
        and nothing else happens.
    corpus_conf_dir_name: name passed to conf_types / conf_fields to select
        the corpus-specific configuration; falls back to ``corpus`` when
        falsy.

    The "common" configuration is always loaded in addition to the
    corpus-specific one. Returns None.
    """
    if not corpus:
        logging.error("Error: empty corpus")
        return

    logging.info("init corpus: {}".format(corpus))

    if not corpus_conf_dir_name:
        corpus_conf_dir_name = corpus

    date_begin = datetime.datetime.now()

    # types
    results_types_common = conf_types(corpus, "common")
    results_types_corpus = conf_types(corpus, corpus_conf_dir_name)
    date_types = datetime.datetime.now()
    total_count = _log_conf_results(
        results_types_common, "type_id", "# Import common types:",
        "Empty common types", logging.debug)
    total_count += _log_conf_results(
        results_types_corpus, "type_id", "# Import {} types:".format(corpus),
        "Empty {} types".format(corpus))
    chrono.chrono_trace("conf_types", date_begin, date_types, total_count)

    # datafields
    results_fields_common = conf_fields(corpus, "common")
    # Use the same configuration name as for the types: passing the corpus
    # name here would silently ignore the corpus_conf_dir_name argument.
    results_fields_corpus = conf_fields(corpus, corpus_conf_dir_name)
    date_fields = datetime.datetime.now()
    total_count = _log_conf_results(
        results_fields_common, "rec_type", "# Import common fields:",
        "Empty common fields")
    total_count += _log_conf_results(
        results_fields_corpus, "rec_type", "# Import {} fields:".format(corpus),
        "Empty {} fields".format(corpus))
    chrono.chrono_trace("conf_fields", date_types, date_fields, total_count)
def harvest_by_set(corpus, target, target_set):
    """Harvest all records of one set from *target* and import them.

    corpus: name of the corpus the harvested records are imported into.
    target: harvesting target handed to oaipmh_harvester.list_records.
    target_set: identifier of the set to harvest.

    Both phases (harvest, import) are timed with chrono.chrono_trace.
    Returns None.
    """
    logging.info("harvest_by_set: {}".format(target_set))
    date_begin = datetime.datetime.now()

    # harvest
    metajson_list = oaipmh_harvester.list_records(target, None, None, target_set)
    date_harvest = datetime.datetime.now()
    # Report the number of harvested records. This function has no `ids`
    # argument, so referring to it raised NameError (or picked up an
    # unrelated module-level list). Fall back to None, which chrono_trace is
    # also given elsewhere, when the result has no length (None, generator).
    harvested_count = len(metajson_list) if hasattr(metajson_list, "__len__") else None
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, harvested_count)

    # import
    result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    # NOTE(review): assumes result_import[0] is the sized collection of
    # imported records - confirm against import_metajson_list.
    chrono.chrono_trace("harvest spire, convert metadata and save to MongoDB", date_harvest, date_import, len(result_import[0]))
Beispiel #4
0
def clean_corpus(corpus):
    """Create, empty and re-index *corpus* through repository_service.

    A falsy corpus name only produces an error log entry. Otherwise the
    three repository calls are made in order and their total duration is
    reported with chrono.chrono_trace. Returns None.
    """
    if not corpus:
        logging.error("Error: empty corpus")
        return

    logging.info("clean corpus: {}".format(corpus))
    started_at = datetime.datetime.now()

    # Order matters: the corpus must exist before it is emptied and indexed.
    for step in (repository_service.create_corpus,
                 repository_service.empty_corpus,
                 repository_service.init_corpus_indexes):
        step(corpus)

    finished_at = datetime.datetime.now()
    chrono.chrono_trace("clean_corpus", started_at, finished_at, None)
def clean_corpus(corpus):
    """Create, empty and re-index *corpus* through repository_service.

    A falsy corpus name only produces an error log entry. Otherwise the
    three repository calls are made in order and their total duration is
    reported with chrono.chrono_trace. Returns None.
    """
    if not corpus:
        logging.error("Error: empty corpus")
        return

    logging.info("clean corpus: {}".format(corpus))
    started_at = datetime.datetime.now()

    # Order matters: the corpus must exist before it is emptied and indexed.
    for step in (repository_service.create_corpus,
                 repository_service.empty_corpus,
                 repository_service.init_corpus_indexes):
        step(corpus)

    finished_at = datetime.datetime.now()
    chrono.chrono_trace("clean_corpus", started_at, finished_at, None)
def harvest_by_ids(corpus, target, ids):
    """Fetch the records named in *ids* from *target* and import them.

    corpus: name of the corpus the records are imported into.
    target: harvesting target handed to oaipmh_harvester.get_record.
    ids: sized iterable of record identifiers, fetched one by one.

    The harvest and import phases are each timed with chrono.chrono_trace.
    Returns None.
    """
    logging.info("harvest_by_ids: {}".format(ids))
    started_at = datetime.datetime.now()

    # harvest: one get_record call per identifier, in the given order
    records = [oaipmh_harvester.get_record(target, record_id)
               for record_id in ids]
    harvested_at = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson",
                        started_at, harvested_at, len(ids))

    # import
    import_outcome = corpus_service.import_metajson_list(
        corpus, records, True, None)
    imported_at = datetime.datetime.now()
    chrono.chrono_trace("import", harvested_at, imported_at,
                        len(import_outcome))
Beispiel #7
0
def harvest_by_set(corpus, target, target_set):
    """Harvest all records of one set from *target* and import them.

    corpus: name of the corpus the harvested records are imported into.
    target: harvesting target handed to oaipmh_harvester.list_records.
    target_set: identifier of the set to harvest.

    Both phases (harvest, import) are timed with chrono.chrono_trace.
    Returns None.
    """
    logging.info("harvest_by_set: {}".format(target_set))
    date_begin = datetime.datetime.now()

    # harvest
    metajson_list = oaipmh_harvester.list_records(target, None, None,
                                                  target_set)
    date_harvest = datetime.datetime.now()
    # Report the number of harvested records. This function has no `ids`
    # argument, so referring to it raised NameError (or picked up an
    # unrelated module-level list). Fall back to None, which chrono_trace is
    # also given elsewhere, when the result has no length (None, generator).
    if hasattr(metajson_list, "__len__"):
        harvested_count = len(metajson_list)
    else:
        harvested_count = None
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin,
                        date_harvest, harvested_count)

    # import
    result_import = corpus_service.import_metajson_list(
        corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    # NOTE(review): assumes result_import[0] is the sized collection of
    # imported records - confirm against import_metajson_list.
    chrono.chrono_trace("harvest spire, convert metadata and save to MongoDB",
                        date_harvest, date_import, len(result_import[0]))
Beispiel #8
0
def harvest_by_ids(corpus, target, ids):
    """Fetch the records named in *ids* from *target* and import them once.

    corpus: name of the corpus the records are imported into.
    target: harvesting target handed to oaipmh_harvester.get_record.
    ids: sized iterable of record identifiers, fetched one by one.

    The harvest and import phases are each timed with chrono.chrono_trace.
    Returns None.
    """
    logging.info("harvest_by_ids: {}".format(ids))
    date_begin = datetime.datetime.now()

    # harvest
    metajson_list = [oaipmh_harvester.get_record(target, identifier)
                     for identifier in ids]
    date_harvest = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin,
                        date_harvest, len(ids))

    # import
    # The list is imported exactly once: a second, copy-pasted trace + import
    # sequence used to follow here and imported every record twice.
    result_import = corpus_service.import_metajson_list(
        corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("import", date_harvest, date_import,
                        len(result_import))


if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf corpus
    corpus = "spire"
    corpus_service.clean_corpus(corpus)
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Initialize corpus", date_begin, date_clean, None)

    target = Target()
    target['identifier'] = 'spire'
    target['title'] = 'Sciences Po Institutional Repository'
    target['type'] = 'oaipmh'
    target['url'] = 'http://spire.sciencespo.fr/dissemination/oaipmh2-no-prefix-publications.xml'
    target['metadata_prefix'] = 'didl'

    ids = [
        "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m26c8c8o3",
        "oai:spire.sciencespo.fr:2441/eo6779thqgm5r489makgoai85",
        "oai:spire.sciencespo.fr:2441/5l6uh8ogmqildh09h6m8hj429",
        "oai:spire.sciencespo.fr:2441/3fm4jv3k2s99lms9jb5i5asil",
        "oai:spire.sciencespo.fr:2441/f4rshpf3v1umfa09lb0joe5g5",
        "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m2h2og5ig",
Beispiel #10
0
    source = "FNSP"
    rec_id_prefix = "sc_"
    input_dir_path = os.path.join("data", "num", "input")
    input_format = constants.FORMAT_UNIMARC
    output_dir_path = os.path.join("data", "num", "output")
    if not os.path.exists(output_dir_path):
        os.mkdir(output_dir_path)
    error_file_name = "".join(["validation-", corpus, ".txt"])
    error_file_path = os.path.join(output_dir_path, error_file_name)
    #logging.debug("error_file_path: {}".format(error_file_path))

    # conf corpus
    corpus_service.clean_corpus(corpus)
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    input_file_paths = io_service.get_relevant_file_list_by_format(
        input_dir_path, input_format)
    results = corpus_service.import_metadata_files(corpus, input_file_paths,
                                                   input_format, source,
                                                   rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate
    corpus_service.validate_corpus(corpus, error_file_path)
    date_validate = datetime.datetime.now()
    chrono.chrono_trace("Validate corpus", date_import, date_validate, None)

# Script entry point: resets the "perio" corpus, imports one UNIMARC file
# into it and validates the imported documents, tracing the duration of each
# phase with chrono.chrono_trace.
if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf params
    corpus = "perio"
    source = "Sciences Po | la bibliothèque"
    rec_id_prefix = ""
    input_file_path = os.path.join("data", "unimarc", "periouni.mrc")
    input_format = constants.FORMAT_UNIMARC
    # Validation report path: data/result/validation-<corpus>.csv
    csv_file_name = "".join(["validation-", corpus, ".csv"])
    csv_file_path = os.path.join("data", "result", csv_file_name)

    # conf corpus
    corpus_service.clean_corpus(corpus)
    # The "perio" corpus is configured with the "aime" configuration name.
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    # NOTE(review): the meaning of the trailing True / None arguments is not
    # visible from this script - confirm against import_metadata_file.
    corpus_service.import_metadata_file(corpus, input_file_path, input_format, source, rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate perio
    documents = repository_service.get_documents(corpus)
    validate_perios(documents, csv_file_path)
    date_validate = datetime.datetime.now()
    chrono.chrono_trace("Validate perio", date_import, date_validate, None)
Beispiel #12
0
    source = "FNSP"
    rec_id_prefix = "sc"
    input_dir_path = os.path.join("data", "num", "input")
    input_format = constants.FORMAT_UNIMARC
    output_dir_path = os.path.join("data", "num", "output")
    if not os.path.exists(output_dir_path):
        os.mkdir(output_dir_path)
    error_file_name = "".join(["validation-", corpus, ".txt"])
    error_file_path = os.path.join(output_dir_path, error_file_name)
    #logging.debug("error_file_path: {}".format(error_file_path))

    # conf corpus
    corpus_service.clean_corpus(corpus)
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    input_file_paths = io_service.get_relevant_file_list_by_format(input_dir_path, input_format)
    results = corpus_service.import_metadata_files(corpus, input_file_paths, input_format, source, rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate
    corpus_service.validate_corpus(corpus, error_file_path)
    date_validate = datetime.datetime.now()
    chrono.chrono_trace("Validate corpus", date_import, date_validate, None)

    # Export mods
    corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_MODS, False, True)
    date_export_mods = datetime.datetime.now()
Beispiel #13
0
# Script entry point: resets the "perio" corpus, imports one UNIMARC file
# into it and validates the imported documents, tracing the duration of each
# phase with chrono.chrono_trace.
if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf params
    corpus = "perio"
    source = "Sciences Po | la bibliothèque"
    rec_id_prefix = ""
    input_file_path = os.path.join("data", "unimarc", "periouni.mrc")
    input_format = constants.FORMAT_UNIMARC
    # Validation report path: data/result/validation-<corpus>.csv
    csv_file_name = "".join(["validation-", corpus, ".csv"])
    csv_file_path = os.path.join("data", "result", csv_file_name)

    # conf corpus
    corpus_service.clean_corpus(corpus)
    # The "perio" corpus is configured with the "aime" configuration name.
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    # NOTE(review): the meaning of the trailing True / None arguments is not
    # visible from this script - confirm against import_metadata_file.
    corpus_service.import_metadata_file(corpus, input_file_path, input_format,
                                        source, rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate perio
    documents = repository_service.get_documents(corpus)
    validate_perios(documents, csv_file_path)
    date_validate = datetime.datetime.now()
    chrono.chrono_trace("Validate perio", date_import, date_validate, None)
Beispiel #14
0
    # import
    result_import = corpus_service.import_metajson_list(
        corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("import", date_harvest, date_import,
                        len(result_import))


if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf corpus
    corpus = "spire"
    corpus_service.clean_corpus(corpus)
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Initialize corpus", date_begin, date_clean, None)

    target = Target()
    target['identifier'] = 'spire'
    target['title'] = 'Sciences Po Institutional Repository'
    target['type'] = 'oaipmh'
    target[
        'url'] = 'http://spire.sciencespo.fr/dissemination/oaipmh2-no-prefix-publications.xml'
    target['metadata_prefix'] = 'didl'

    ids = [
        "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m26c8c8o3",
        "oai:spire.sciencespo.fr:2441/eo6779thqgm5r489makgoai85",
        "oai:spire.sciencespo.fr:2441/5l6uh8ogmqildh09h6m8hj429",
        "oai:spire.sciencespo.fr:2441/3fm4jv3k2s99lms9jb5i5asil",
        "oai:spire.sciencespo.fr:2441/f4rshpf3v1umfa09lb0joe5g5",