run_name = Unicode(help="Run name.", default_value="default").tag(config=True) train_cache_dir = Unicode(help="Path to training dump.").tag( config=True) train_cache_size = Integer(help="Number of items per cache").tag( config=True) model_dir = Unicode(help="Model directory.").tag(config=True) log_dir = Unicode(help="Logging directory.").tag(config=True) cmd_log = Bool(help="Log on command prompt only.", default_value=False).tag(config=True) pre_val = Bool(help="Pre-validate on the dev set.", default_value=False).tag(config=True) do_training = Bool(help="Flag for conducting training.", default_value=False).tag(config=True) do_test = Bool(help="Flag for conducting testing.", default_value=False).tag(config=True) run_baselines = Bool(help="Run baseline.", default_value=False).tag(config=True) debug_mode = Bool(help="Debug mode", default_value=False).tag(config=True) test_factor_role = Unicode( help="The field name of the role that is used to " "determine the slot type.").tag(config=True) train_factor_role = Unicode( help="The field name of the role that is used to " "determine the slot type.").tag(config=True) conf = load_mixed_configs() main(conf)
def main():
    """Merge annotations from all configured corpus parsers and write them out.

    The configuration returned by ``load_mixed_configs()`` is inspected for
    one section per supported data source. Every section that is present
    gets a parser; parsers are ranked by their ``order`` setting. The parser
    ranked first supplies the documents, and every remaining parser adds its
    annotations to each of those documents.

    For every document four files are written: a JSON dump under
    ``out_dir``, the document text under ``text_dir``, and a brat ``.ann`` /
    ``.txt`` pair under ``<brat_dir>/data``. Afterwards each parser prints
    its statistics and the brat ``annotation.conf`` is written to
    ``brat_dir``. All files are written as UTF-8.

    Raises:
        IndexError: if the configuration contains no known parser section.
    """
    from event.util import basic_console_log
    from event.util import load_mixed_configs

    class OutputConf(Configurable):
        out_dir = Unicode(help="Output directory").tag(config=True)
        text_dir = Unicode(help="Text output directory").tag(config=True)
        brat_dir = Unicode(help="Brat visualization directory").tag(
            config=True)

    class DataConf(Configurable):
        # Default is just a large number to rank it late.
        order = Int(help="Order of this parser",
                    default_value=10000000).tag(config=True)

    class EreConf(DataConf):
        source = Unicode(help="Plain source input directory").tag(config=True)
        ere = Unicode(help="ERE input data").tag(config=True)
        src_ext = Unicode(help="Source file extension",
                          default_value=".xml").tag(config=True)
        ere_ext = Unicode(help="Ere file extension",
                          default_value=".rich_ere.xml").tag(config=True)
        ere_split = Bool(help="Whether split ere based on the file names").tag(
            config=True)
        # NOTE(review): the help text "model name" looks copy-pasted and does
        # not describe this flag - confirm the intended wording.
        ignore_quote = Bool(help="model name",
                            default_value=False).tag(config=True)
        format = Unicode(help="name for format",
                         default_value="ERE").tag(config=False)

    class FrameNetConf(DataConf):
        fn_path = Unicode(help="FrameNet dataset path.").tag(config=True)
        format = Unicode(help="name for format",
                         default_value="FrameNet").tag(config=True)

    class ConllConf(DataConf):
        in_dir = Unicode(help="Conll file input directory").tag(config=True)
        format = Unicode(help="name for format",
                         default_value="ConllConf").tag(config=True)

    class AceConf(DataConf):
        # NOTE(review): help text mentions Conll although this is the ACE
        # configuration - presumably copy-pasted; confirm.
        in_dir = Unicode(help="Conll file input directory").tag(config=True)
        out_dir = Unicode(help="Output directory").tag(config=True)
        text_dir = Unicode(help="Raw Text Output directory").tag(config=True)
        format = Unicode(help="name for format",
                         default_value="ACE").tag(config=True)

    class NomBankConfig(DataConf):
        nombank_path = Unicode(help="Nombank corpus.").tag(config=True)
        nomfile = Unicode(help="Nombank file.").tag(config=True)
        frame_file_pattern = Unicode(help="Frame file pattern.").tag(
            config=True)
        nombank_nouns_file = Unicode(help="Nomank nous.").tag(config=True)

        # PennTree Bank config.
        wsj_path = Unicode(help="PennTree Bank path.").tag(config=True)
        wsj_file_pattern = Unicode(help="File pattern to read PTD data").tag(
            config=True)

        implicit_path = Unicode(help="Implicit annotation xml path.").tag(
            config=True)
        gc_only = Bool(help="Only read docs that contains GC arguments.").tag(
            config=True)
        explicit_only = Bool(help="Do not add GC arguments.").tag(config=True)

        format = Unicode(help="name for format",
                         default_value="NomBank").tag(config=True)
        stat_dir = Unicode(help="Path for stats.").tag(config=True)

    class PropBankConfig(DataConf):
        root = Unicode(help="Propbank corpus.").tag(config=True)
        propfile = Unicode(help="Prop File.").tag(config=True)
        frame_files = Unicode(help="Frame file pattern.").tag(config=True)
        verbs_file = Unicode(help="Verbs.").tag(config=True)
        format = Unicode(help="name for format",
                         default_value="PropBank").tag(config=True)

        # PennTree Bank config.
        wsj_path = Unicode(help="PennTree Bank path.").tag(config=True)
        wsj_file_pattern = Unicode(help="File pattern to read PTD data").tag(
            config=True)

    class NegraConfig(DataConf):
        data_files = List(help="Input data path.",
                          trait=Unicode).tag(config=True)
        stat_out = Unicode(help="Output statistics").tag(config=True)
        format = Unicode(help="name for format",
                         default_value="Negra").tag(config=True)

    basic_console_log()

    config = load_mixed_configs()
    output_param = OutputConf(config=config)

    # Create paths for output. exist_ok avoids the check-then-create race.
    brat_data_path = os.path.join(output_param.brat_dir, "data")
    for directory in (output_param.out_dir, output_param.text_dir,
                      brat_data_path):
        os.makedirs(directory, exist_ok=True)

    corpus = Corpus()

    # Entries are (order, parser, parser configuration).
    order_parsers = []

    def register(conf_cls, parser_cls):
        """Build the parser for one data source and queue it with its order."""
        param = conf_cls(config=config)
        order = param.order
        # The third argument is True only for the parser with order 0
        # (the original note reads "with_doc = index == 0").
        order_parsers.append(
            (order, parser_cls(param, corpus, order == 0), param))

    def write_text(path, text):
        """Write ``text`` to ``path`` as UTF-8 after calling ``ensure_dir``."""
        ensure_dir(path)
        with open(path, "w", encoding="utf-8") as out:
            out.write(text)

    # NOTE(review): this section is detected under the key "RichERE" while
    # the configurable class is named EreConf; every other source uses the
    # class name as the key. Kept as in the original - confirm it is intended.
    if "RichERE" in config:
        register(EreConf, RichERE)

    if "FrameNetConf" in config:
        register(FrameNetConf, FrameNet)

    if "ConllConf" in config:
        register(ConllConf, Conll)

    if "AceConf" in config:
        register(AceConf, ACE)

    if "NomBankConfig" in config:
        register(NomBankConfig, NomBank)
        print("found nombank config")

    if "PropBankConfig" in config:
        register(PropBankConfig, PropBank)
        print("found propbank config")

    if "NegraConfig" in config:
        register(NegraConfig, NeGraXML)

    if not order_parsers:
        raise IndexError(
            "No parser section found in the configuration; "
            "nothing to convert.")

    # Sort on the order value only. Comparing whole tuples would fall through
    # to comparing parser objects whenever two orders are equal (e.g. two
    # parsers left at the default), which raises TypeError. The sort is
    # stable, so ties keep their registration order.
    order_parsers.sort(key=lambda entry: entry[0])

    # NOTE(review): the first parser after sorting supplies the documents
    # even when its order is not 0 - confirm that is acceptable.
    first_parser = order_parsers[0][1]

    # Use the documents created by the first parser.
    for doc in first_parser.get_doc():
        # Add annotations from each remaining parser.
        for _, parser, _ in order_parsers[1:]:
            parser.add_all_annotations(doc)

        write_text(
            os.path.join(output_param.out_dir, doc.docid + ".json"),
            doc.dump(indent=2))
        write_text(
            os.path.join(output_param.text_dir, doc.docid + ".txt"),
            doc.doc_text)

        source_text, ann_text = doc.to_brat()
        write_text(os.path.join(brat_data_path, doc.docid + ".ann"), ann_text)
        write_text(
            os.path.join(brat_data_path, doc.docid + ".txt"), source_text)

    for _, parser, _ in order_parsers:
        parser.print_stats()

    # Write brat configs.
    out_path = os.path.join(output_param.brat_dir, "annotation.conf")
    with open(out_path, "w", encoding="utf-8") as out:
        out.write(corpus.get_brat_config())
"FrameNet and Propbank", default_value="Propbank", ).tag(config=True) use_gold_frame = Bool( help="Use gold the gold frame produced by annotation", default_value=False).tag(config=True) strict_arg_count = Bool(help="Force lossless number of arguments", default_value=False).tag(config=True) if __name__ == "__main__": from event.util import load_mixed_configs, set_basic_log set_basic_log() hash_params = HashParam(config=load_mixed_configs()) stat_counters = { "predicate": Counter(), "implicit predicates": Counter(), "implicit slots": Counter(), } stat_keys = stat_counters.keys() hash_data() print("==========Implicit arguments Statistics===========") headline = "Predicate\t" + "\t".join(stat_keys) print(headline) preds = sorted(stat_counters["implicit predicates"].keys())