コード例 #1
0
        metajson_list.append(oaipmh_harvester.get_record(target, identifier))
    date_harvest = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(ids))

    # import
    result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("import", date_harvest, date_import, len(result_import))


if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf corpus
    corpus = "spire"
    corpus_service.clean_corpus(corpus)
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Initialize corpus", date_begin, date_clean, None)

    target = Target()
    target['identifier'] = 'spire'
    target['title'] = 'Sciences Po Institutional Repository'
    target['type'] = 'oaipmh'
    target['url'] = 'http://spire.sciencespo.fr/dissemination/oaipmh2-no-prefix-publications.xml'
    target['metadata_prefix'] = 'didl'

    ids = [
        "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m26c8c8o3",
        "oai:spire.sciencespo.fr:2441/eo6779thqgm5r489makgoai85",
        "oai:spire.sciencespo.fr:2441/5l6uh8ogmqildh09h6m8hj429",
        "oai:spire.sciencespo.fr:2441/3fm4jv3k2s99lms9jb5i5asil",
コード例 #2
0
def clean_corpus(args):
    """Clean the corpus selected by ``args``.

    Args:
        args: Object exposing a ``corpus`` attribute (presumably the parsed
            command-line namespace -- confirm against the caller).

    A falsy ``args.corpus`` falls back to the module-level ``default_corpus``.
    The chosen name is logged at INFO level and then passed to
    ``corpus_service.clean_corpus``.
    """
    # ``or`` short-circuits, so default_corpus is only read when needed.
    target_corpus = args.corpus or default_corpus
    logging.info("corpus: {}".format(target_corpus))
    corpus_service.clean_corpus(target_corpus)
コード例 #3
0
    # conf params
    corpus = "num"
    source = "FNSP"
    rec_id_prefix = "sc_"
    input_dir_path = os.path.join("data", "num", "input")
    input_format = constants.FORMAT_UNIMARC
    output_dir_path = os.path.join("data", "num", "output")
    if not os.path.exists(output_dir_path):
        os.mkdir(output_dir_path)
    error_file_name = "".join(["validation-", corpus, ".txt"])
    error_file_path = os.path.join(output_dir_path, error_file_name)
    #logging.debug("error_file_path: {}".format(error_file_path))

    # conf corpus
    corpus_service.clean_corpus(corpus)
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    input_file_paths = io_service.get_relevant_file_list_by_format(
        input_dir_path, input_format)
    results = corpus_service.import_metadata_files(corpus, input_file_paths,
                                                   input_format, source,
                                                   rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate
    corpus_service.validate_corpus(corpus, error_file_path)
コード例 #4
0
ファイル: __main__.py プロジェクト: dinosv/reference_manager
def clean_corpus(args):
    """Resolve the corpus name from ``args`` and clean that corpus.

    Args:
        args: Object exposing a ``corpus`` attribute (presumably the parsed
            command-line namespace -- confirm against the caller).

    When ``args.corpus`` is falsy, the module-level ``default_corpus`` is
    used instead. The resolved name is logged at INFO level before
    ``corpus_service.clean_corpus`` is called with it.
    """
    requested = args.corpus
    corpus_name = requested if requested else default_corpus
    message = "corpus: {}".format(corpus_name)
    logging.info(message)
    corpus_service.clean_corpus(corpus_name)