def get_documents(instance_names, text_dir):
    """Load a ``Document`` for every file found under each instance directory.

    The layout walked is ``text_dir/<instance>/<sub_dir>/<file>``: the
    immediate sub-directories of each instance directory are listed, and
    ``get_files`` supplies the file names inside each of them.

    Args:
        instance_names: iterable of instance names; each one is used both as
            a directory name under ``text_dir`` and as a key of the result.
        text_dir: directory holding one sub-directory per instance.

    Returns:
        dict mapping instance name -> dict mapping full file path ->
        ``Document`` constructed from that path.

    Raises:
        StopIteration: if an instance directory cannot be listed (for
            example it does not exist). ``os.walk`` ignores listing errors
            by default and then yields nothing for ``next`` to return.
    """
    documents = {}
    for instance in instance_names:
        source_dir = os.path.join(text_dir, instance)
        # Only the first os.walk() entry is needed: its second element is
        # the list of immediate sub-directories. The builtin next() is used
        # instead of the generator's .next() method, which exists only on
        # Python 2.
        sub_dirs = next(os.walk(source_dir))[1]
        instance_documents = documents[instance] = {}
        for a_dir in sub_dirs:
            date_dir = os.path.join(source_dir, a_dir)
            for file_name in get_files(date_dir):
                file_path = os.path.join(date_dir, file_name)
                # The full path serves as the dict key and as both
                # arguments of the Document constructor.
                instance_documents[file_path] = Document(
                    file_path, file_path=file_path)
    return documents
def main():
    """Build the window models for one judged query and dump them as JSON.

    Command-line steps:

    1. Read the entity judgement JSON file and select the entry numbered
       ``run_id`` (1-based); its ``"query_string"`` value names the instance.
    2. Load a ``Document`` for every file under
       ``<top_dir>/clean_text/<disaster_name>/<query_string>/<sub_dir>/``.
    3. Build windows with ``get_all_text_windows`` (when
       ``--using_text_window`` is given) or ``get_all_sentence_windows``.
    4. Replace every window object by its ``model`` attribute and write the
       result as JSON to ``<dest_dir>/<query_string>``.

    Exits through ``parser.error`` (status 2) when ``run_id`` does not
    address an entry of the judgement file.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("disaster_name")
    parser.add_argument(
        "--top_dir", '-tp',
        default='/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data'
    )
    parser.add_argument("dest_dir")
    parser.add_argument("run_id", type=int)
    parser.add_argument("--using_text_window", "-u", action='store_true')
    parser.add_argument("--window_size", '-wz', type=int, default=3)
    parser.add_argument(
        "--entity_judgement_file", "-e",
        default="/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data/src/new_judgement.json"
    )
    args = parser.parse_args()

    with open(args.entity_judgement_file) as f:
        entities_judgement_data = json.load(f)

    # run_id is 1-based. Without this check run_id=0 would index [-1] and
    # silently process the last query, and too-large values would surface as
    # a bare IndexError.
    # NOTE(review): assumes the judgement file holds a JSON array -- confirm.
    entry_count = len(entities_judgement_data)
    if not 1 <= args.run_id <= entry_count:
        parser.error(
            "run_id must be between 1 and %d, got %d"
            % (entry_count, args.run_id))

    # Only the selected entry is processed; the query string is removed from
    # the entry and used as its key.
    single = entities_judgement_data[args.run_id - 1]
    q = single.pop("query_string")
    entities_judgement = {q: single}

    args.top_dir = os.path.abspath(args.top_dir)

    documents = {}
    for instance in entities_judgement:
        print("for %s" % instance)
        source_dir = os.path.join(args.top_dir, "clean_text",
                                  args.disaster_name, instance)
        # First os.walk() entry -> immediate sub-directories of source_dir.
        # The builtin next() works on both Python 2 and Python 3.
        sub_dirs = next(os.walk(source_dir))[1]
        instance_documents = documents[instance] = {}
        for a_dir in sub_dirs:
            date_dir = os.path.join(source_dir, a_dir)
            print(date_dir)
            for file_name in get_files(date_dir):
                file_path = os.path.join(date_dir, file_name)
                instance_documents[file_path] = Document(
                    file_path, file_path=file_path)

    if args.using_text_window:
        windows = get_all_text_windows(documents, entities_judgement,
                                       args.window_size)
    else:
        windows = get_all_sentence_windows(documents, entities_judgement)

    # Keep only each window's ``model`` attribute before serialising.
    # Existing keys are reassigned in place, so no dict changes size while
    # it is being iterated.
    for entity_type in windows:
        typed_windows = windows[entity_type]
        for w in typed_windows:
            typed_windows[w] = typed_windows[w].model

    with codecs.open(os.path.join(args.dest_dir, q), "w", "utf-8") as f:
        f.write(json.dumps(windows))

    print("finished")