Beispiel #1
0
def get_documents(instance_names, text_dir):
    """Build a ``Document`` for every file found under each instance directory.

    The layout walked is ``text_dir/<instance>/<sub_dir>/<file>``: for each
    instance, every immediate sub-directory is passed to ``get_files`` and
    each returned entry is joined onto that sub-directory to form the path.

    Args:
        instance_names: iterable of directory names located under ``text_dir``.
        text_dir: directory that holds one sub-directory per instance.

    Returns:
        dict mapping each instance name to a dict of
        ``{file_path: Document(file_path, file_path=file_path)}``. An instance
        whose directory is missing or has no sub-directories maps to ``{}``.
    """
    documents = {}
    for instance in instance_names:
        source_dir = os.path.join(text_dir, instance)

        # os.walk() yields nothing for a missing/unreadable directory, so a
        # bare next() would leak StopIteration to the caller. The default
        # makes such an instance simply have no sub-directories. The builtin
        # next() also works on Python 3, unlike the generator's .next().
        sub_dirs = next(os.walk(source_dir), (source_dir, [], []))[1]

        instance_docs = {}
        documents[instance] = instance_docs
        for a_dir in sub_dirs:
            date_dir = os.path.join(source_dir, a_dir)
            for file_name in get_files(date_dir):
                file_path = os.path.join(date_dir, file_name)
                # The path doubles as the first Document argument and the key.
                instance_docs[file_path] = Document(file_path,
                                                    file_path=file_path)
    return documents
Beispiel #2
0
def main():
    """Command-line entry point: extract windows for one judged query.

    Steps, in order:
      1. Parse the command line.
      2. Load the entity judgement JSON file (a list of dicts, each with a
         ``"query_string"`` key) and pick the entry at position ``run_id``
         (1-based).
      3. Create a ``Document`` for every file found under
         ``<top_dir>/clean_text/<disaster_name>/<query_string>/<sub_dir>/``.
      4. Compute text windows (``--using_text_window``) or sentence windows.
      5. Replace every window object with its ``.model`` attribute and write
         the result as JSON to ``<dest_dir>/<query_string>``.

    Exits through ``parser.error`` (status 2) when ``run_id`` does not refer
    to an entry of the judgement file.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("disaster_name")
    parser.add_argument(
        "--top_dir",
        '-tp',
        default=
        '/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data'
    )
    parser.add_argument("dest_dir")
    parser.add_argument("run_id", type=int)
    parser.add_argument("--using_text_window", "-u", action='store_true')
    parser.add_argument("--window_size", '-wz', type=int, default=3)
    parser.add_argument(
        "--entity_judgement_file",
        "-e",
        default=
        "/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data/src/new_judgement.json"
    )
    args = parser.parse_args()

    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with codecs.open(args.entity_judgement_file, "r", "utf-8") as f:
        entities_judgement_data = json.load(f)

    # run_id is 1-based. Without this check 0 or a negative value would
    # silently select an entry counted from the end of the list, and a value
    # that is too large would die with a bare IndexError.
    entry_count = len(entities_judgement_data)
    if not 1 <= args.run_id <= entry_count:
        parser.error("run_id must be between 1 and %d, got %d" %
                     (entry_count, args.run_id))

    # Only the selected entry is processed; it is keyed by its query string,
    # which is removed from the entry itself.
    single = entities_judgement_data[args.run_id - 1]
    q = single.pop("query_string")
    entities_judgement = {q: single}

    args.top_dir = os.path.abspath(args.top_dir)
    documents = {}
    for instance in entities_judgement:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("for %s" % instance)
        source_dir = os.path.join(args.top_dir, "clean_text",
                                  args.disaster_name, instance)
        # os.walk() yields nothing for a missing directory; the default
        # keeps that from surfacing as StopIteration and simply leaves the
        # instance without documents.
        sub_dirs = next(os.walk(source_dir), (source_dir, [], []))[1]
        instance_docs = {}
        documents[instance] = instance_docs
        for a_dir in sub_dirs:
            date_dir = os.path.join(source_dir, a_dir)
            print(date_dir)
            for file_name in get_files(date_dir):
                file_path = os.path.join(date_dir, file_name)
                instance_docs[file_path] = Document(
                    file_path, file_path=file_path)

    if args.using_text_window:
        windows = get_all_text_windows(documents, entities_judgement,
                                       args.window_size)
    else:
        windows = get_all_sentence_windows(documents, entities_judgement)

    # Swap each window object for its ``model`` attribute so the structure
    # can be passed to json.dumps (values are replaced, keys are untouched).
    for entity_type in windows:
        typed_windows = windows[entity_type]
        for w in typed_windows:
            typed_windows[w] = typed_windows[w].model

    with codecs.open(os.path.join(args.dest_dir, q), "w", "utf-8") as f:
        f.write(json.dumps(windows))
    print("finished")