Esempio n. 1
0
def genrate_gold_gui_data(corpus_dir, doc_id, data_file):
    """Export per-document GUI data files from a JSON Lines corpus file.

    Every record read from ``data_file`` is wrapped in a ``Document`` and
    three JSON files are written for it, each named after the document's
    own ``doc_id``:

    * ``<corpus_dir>/span/<doc_id>.json``   -- ``doc.get_visualize_data()``
    * ``<corpus_dir>/detail/<doc_id>.json`` -- ``doc.get_surface_data()``
    * ``<corpus_dir>/coref/<doc_id>.json``  -- ``doc.get_cluster_data()``

    After all records are processed, the collected document ids are written
    to ``<corpus_dir>/doc_ids.json`` and ``update_corpus_ids(doc_id)`` is
    called.

    Args:
        corpus_dir: Output directory that receives the generated files.
        doc_id: Identifier passed to ``update_corpus_ids``. It is not used
            for the per-document file names. NOTE(review): despite the
            name this looks like a corpus-level id -- confirm with callers.
        data_file: Path of the JSON Lines input file.

    Raises:
        SystemExit: If ``check_duplicate_dir(corpus_dir)`` returns a truthy
            value. ``sys.exit()`` is called without arguments.
    """
    doc_ids = []

    # The reader is used as a context manager so the underlying file is
    # closed on every path: normal completion, the early sys.exit() below,
    # and any exception raised while processing a document.
    with jsonlines.open(data_file) as data_reader:
        # Abort when the duplicate check fires.
        # NOTE(review): presumably this detects an already-generated corpus
        # directory -- verify against check_duplicate_dir.
        if check_duplicate_dir(corpus_dir):
            sys.exit()

        for doc_dict in data_reader.iter():
            doc = Document(doc_dict)
            doc_ids.append(doc.doc_id)

            # doc data
            doc_data = doc.get_visualize_data()
            doc_data_file = "%s/span/%s.json" % (corpus_dir, doc.doc_id)
            save_to_json(doc_data_file, doc_data)

            # surface data
            surface_data = doc.get_surface_data()
            surface_data_file = "%s/detail/%s.json" % (corpus_dir, doc.doc_id)
            save_to_json(surface_data_file, surface_data)

            # cluster data
            cluster_data = doc.get_cluster_data()
            cluster_data_file = "%s/coref/%s.json" % (corpus_dir, doc.doc_id)
            save_to_json(cluster_data_file, cluster_data)

    # doc ids and corpus ids.
    doc_ids_file = "%s/doc_ids.json" % corpus_dir
    save_to_json(doc_ids_file, doc_ids)
    update_corpus_ids(doc_id)