def process_wikiner(paths, dataset):
    """Convert a raw wikiner download into train/dev/test NER .json files.

    Expects exactly one aij-wikiner* file under <NERBASE>/<dataset>/raw.
    That file is preprocessed into a space separated file, split into
    train/dev/test .bio shards, and each shard is converted to the .json
    format used by the NER models.

    Raises FileNotFoundError if zero or multiple raw files are found, or
    if an expected shard is missing after the split.
    """
    short_name = treebank_to_short_name(dataset)
    base_input_path = os.path.join(paths["NERBASE"], dataset)
    base_output_path = paths["NER_DATA_DIR"]
    raw_input_path = os.path.join(base_input_path, "raw")

    input_files = glob.glob(os.path.join(raw_input_path, "aij-wikiner*"))
    if not input_files:
        raise FileNotFoundError("Could not find any raw wikiner files in %s" % raw_input_path)
    if len(input_files) > 1:
        raise FileNotFoundError("Found too many raw wikiner files in %s: %s" % (raw_input_path, ", ".join(input_files)))
    raw_file = input_files[0]

    csv_file = os.path.join(raw_input_path, "csv_" + short_name)
    print("Converting raw input %s to space separated file in %s" % (raw_file, csv_file))
    preprocess_wikiner(raw_file, csv_file)

    # this should create train.bio, dev.bio, and test.bio
    print("Splitting %s to %s" % (csv_file, base_input_path))
    split_wikiner(base_input_path, csv_file)

    for shard in SHARDS:
        shard_bio = os.path.join(base_input_path, '%s.bio' % shard)
        if not os.path.exists(shard_bio):
            raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, shard_bio))
        shard_json = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
        print("Converting %s to %s" % (shard_bio, shard_json))
        prepare_ner_file.process_dataset(shard_bio, shard_json)
def process_treebank(treebank, paths, output_dir):
    """Tokenize a UD treebank, then write segmenter train/test files.

    The tokenizer data is built in a temporary directory.  The train and
    dev sentences are merged into <short_name>.train.seg.txt and the test
    sentences become <short_name>.test.seg.txt in output_dir.
    """
    with tempfile.TemporaryDirectory() as tokenizer_dir:
        # shadow the caller's paths with a copy so we can redirect the
        # tokenizer output into the temporary directory
        paths = dict(paths)
        paths["TOKENIZE_DATA_DIR"] = tokenizer_dir

        short_name = treebank_to_short_name(treebank)

        # first we process the tokenization data
        args = argparse.Namespace()
        args.augment = False
        args.prepare_labels = False
        prepare_tokenizer_treebank.process_treebank(treebank, paths, args)

        # TODO: these names should be refactored
        sentences = {
            shard: prepare_tokenizer_treebank.read_sentences_from_conllu(
                f"{tokenizer_dir}/{short_name}.{shard}.gold.conllu")
            for shard in ("train", "dev", "test")
        }

        train_out = os.path.join(output_dir, f"{short_name}.train.seg.txt")
        test_out = os.path.join(output_dir, f"{short_name}.test.seg.txt")
        # dev data is folded into the segmenter's training file
        write_segmenter_file(train_out, sentences["train"] + sentences["dev"])
        write_segmenter_file(test_out, sentences["test"])
def process_fire_2013(paths, dataset):
    """
    Splits the FIRE 2013 dataset into train, dev, test

    The provided datasets are all mixed together at this point, so it is
    not possible to recreate the original test conditions used in the bakeoff

    Raises ValueError if the language is not one of the FIRE 2013 languages.
    """
    short_name = treebank_to_short_name(dataset)
    langcode, _ = short_name.split("_")
    if langcode not in ("hi", "en", "ta", "bn", "mal"):
        # bug fix: the language code was never substituted into the message
        raise ValueError("Language %s not one of the FIRE 2013 languages" % langcode)
    language = lcode2lang[langcode].lower()

    # for example, FIRE2013/hindi_train
    base_input_path = os.path.join(paths["NERBASE"], "FIRE2013", "%s_train" % language)
    base_output_path = paths["NER_DATA_DIR"]

    train_csv_file = os.path.join(base_output_path, "%s.train.csv" % short_name)
    dev_csv_file = os.path.join(base_output_path, "%s.dev.csv" % short_name)
    test_csv_file = os.path.join(base_output_path, "%s.test.csv" % short_name)

    # writes the three csv files, randomly splitting the mixed-together data
    convert_fire_2013(base_input_path, train_csv_file, dev_csv_file, test_csv_file)

    for csv_file, shard in zip((train_csv_file, dev_csv_file, test_csv_file), SHARDS):
        output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
        prepare_ner_file.process_dataset(csv_file, output_filename)
def main():
    """Generate the xpos_vocab_factory module for a list of treebanks.

    Usage: <script> list_of_tb_file output_factory_file

    Reads treebank names (long or short form) from list_of_tb_file, picks
    the cheapest XPOS vocab class for each, and writes a generated factory
    module to output_factory_file.
    """
    if len(sys.argv) != 3:
        print('Usage: {} list_of_tb_file output_factory_file'.format(sys.argv[0]))
        # bug fix: exit nonzero on incorrect usage so callers/scripts notice
        sys.exit(1)

    # Read list of all treebanks of concern
    list_of_tb_file, output_file = sys.argv[1:]

    shorthands = []
    fullnames = []
    with open(list_of_tb_file) as f:
        for line in f:
            treebank = line.strip()
            fullnames.append(treebank)
            if SHORTNAME_RE.match(treebank):
                shorthands.append(treebank)
            else:
                shorthands.append(treebank_to_short_name(treebank))

    # For each treebank, we would like to find the XPOS Vocab configuration that minimizes
    # the number of total classes needed to predict by all tagger classifiers. This is
    # achieved by enumerating different options of separators that different treebanks might
    # use, and comparing that to treating the XPOS tags as separate categories (using a
    # WordVocab).
    mapping = defaultdict(list)
    for sh, fn in zip(shorthands, fullnames):
        factory = get_factory(sh, fn)
        mapping[factory].append(sh)

    # Generate code. This takes the XPOS vocabulary classes selected above, and generates the
    # actual factory class as seen in models.pos.xpos_vocab_factory.
    first = True
    with open(output_file, 'w') as f:
        print('''# This is the XPOS factory method generated automatically from stanza.models.pos.build_xpos_vocab_factory.
# Please don't edit it!

from stanza.models.pos.vocab import WordVocab, XPOSVocab

def xpos_vocab_factory(data, shorthand):''', file=f)

        for key in mapping:
            print("    {} shorthand in [{}]:".format(
                'if' if first else 'elif',
                ', '.join(['"{}"'.format(x) for x in mapping[key]])), file=f)
            print("        return {}".format(key), file=f)
            first = False

        print('''    else:
        raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))''', file=f)

    print('Done!')
def project_to_short_name(treebank):
    """
    Project either a treebank or a short name to a short name

    TODO: see if treebank_to_short_name can incorporate this
    """
    # names already in short form pass through unchanged
    already_short = SHORTNAME_RE.match(treebank) is not None
    return treebank if already_short else treebank_to_short_name(treebank)
def test_treebank():
    """
    Test the entire treebank name conversion
    """
    cases = [
        # conversion of a UD_ name
        ("UD_Hindi-HDTB", "hi_hdtb"),
        # conversion of names without UD
        ("Hindi-fire2013", "hi_fire2013"),
        ("Hindi-Fire2013", "hi_fire2013"),
        ("Hindi-FIRE2013", "hi_fire2013"),
        # already short names are generally preserved
        ("hi-fire2013", "hi_fire2013"),
        ("hi_fire2013", "hi_fire2013"),
        # a special case
        ("UD_Chinese-PUD", "zh-hant_pud"),
        # a special case already converted once
        ("zh-hant_pud", "zh-hant_pud"),
        ("zh-hant-pud", "zh-hant_pud"),
        ("zh-hans_gsdsimp", "zh-hans_gsdsimp"),
    ]
    for raw_name, expected in cases:
        assert expected == treebank_to_short_name(raw_name)
def main(run_treebank, model_dir, model_name, add_specific_args=None):
    """Collect arguments and invoke run_treebank for each requested treebank.

    Existing trained models are skipped unless --force is given.
    'ud_all' / 'all_ud' expands to every UD treebank under UDBASE.
    """
    logger.info("Training program called with:\n" + " ".join(sys.argv))

    paths = default_paths.get_default_paths()

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)
    if '--extra_args' in sys.argv:
        idx = sys.argv.index('--extra_args')
        extra_args = sys.argv[idx + 1:]
        # bug fix: skip the program name sys.argv[0]; previously the script
        # path itself was handed to the parser as a positional argument
        command_args = parser.parse_args(sys.argv[1:idx])
    else:
        command_args, extra_args = parser.parse_known_args()

    mode = command_args.mode
    treebanks = []
    for treebank in command_args.treebanks:
        # this is a really annoying typo to make if you copy/paste a
        # UD directory name on the cluster and your job dies 30s after
        # being queued for an hour
        if treebank.endswith("/"):
            treebank = treebank[:-1]
        if treebank.lower() in ('ud_all', 'all_ud'):
            ud_treebanks = common.get_ud_treebanks(paths["UDBASE"])
            treebanks.extend(ud_treebanks)
        else:
            treebanks.append(treebank)

    for treebank in treebanks:
        if SHORTNAME_RE.match(treebank):
            short_name = treebank
        else:
            short_name = treebank_to_short_name(treebank)
        logger.debug("%s: %s" % (treebank, short_name))

        # don't overwrite an existing model unless --force was given
        if mode == Mode.TRAIN and not command_args.force and model_name != 'ete':
            model_path = "saved_models/%s/%s_%s.pt" % (model_dir, short_name, model_name)
            if os.path.exists(model_path):
                logger.info("%s: %s exists, skipping!" % (treebank, model_path))
                continue
            else:
                logger.info("%s: %s does not exist, training new model" % (treebank, model_path))

        if command_args.temp_output and model_name != 'ete':
            with tempfile.NamedTemporaryFile() as temp_output_file:
                run_treebank(mode, paths, treebank, short_name,
                             temp_output_file.name, command_args, extra_args)
        else:
            run_treebank(mode, paths, treebank, short_name,
                         None, command_args, extra_args)
def main(run_treebank, model_dir, model_name, add_specific_args=None):
    """
    A main program for each of the run_xyz scripts

    It collects the arguments and runs the main method for each dataset
    provided.  It also tries to look for an existing model and not
    overwrite it unless --force is provided
    """
    logger.info("Training program called with:\n" + " ".join(sys.argv))

    paths = default_paths.get_default_paths()

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)

    if '--extra_args' in sys.argv:
        # everything after --extra_args is passed straight to the model
        split_at = sys.argv.index('--extra_args')
        extra_args = sys.argv[split_at + 1:]
        command_args = parser.parse_args(sys.argv[1:split_at])
    else:
        command_args, extra_args = parser.parse_known_args()

    # Pass this through to the underlying model as well as use it here
    if command_args.save_dir:
        extra_args.extend(["--save_dir", command_args.save_dir])

    mode = command_args.mode

    treebanks = []
    for treebank in command_args.treebanks:
        # this is a really annoying typo to make if you copy/paste a
        # UD directory name on the cluster and your job dies 30s after
        # being queued for an hour
        treebank = treebank[:-1] if treebank.endswith("/") else treebank
        if treebank.lower() in ('ud_all', 'all_ud'):
            treebanks.extend(common.get_ud_treebanks(paths["UDBASE"]))
        else:
            treebanks.append(treebank)

    for treebank_idx, treebank in enumerate(treebanks):
        # visually separate consecutive treebank runs in the log
        if treebank_idx > 0:
            logger.info("=========================================")

        if SHORTNAME_RE.match(treebank):
            short_name = treebank
        else:
            short_name = treebank_to_short_name(treebank)
        logger.debug("%s: %s" % (treebank, short_name))

        # refuse to clobber an existing trained model unless --force is given
        if mode == Mode.TRAIN and not command_args.force and model_name != 'ete':
            if command_args.save_dir:
                model_path = "%s/%s_%s.pt" % (command_args.save_dir, short_name, model_name)
            else:
                model_path = "saved_models/%s/%s_%s.pt" % (model_dir, short_name, model_name)
            if os.path.exists(model_path):
                logger.info("%s: %s exists, skipping!" % (treebank, model_path))
                continue
            logger.info("%s: %s does not exist, training new model" % (treebank, model_path))

        if command_args.temp_output and model_name != 'ete':
            with tempfile.NamedTemporaryFile() as temp_output_file:
                run_treebank(mode, paths, treebank, short_name,
                             temp_output_file.name, command_args, extra_args)
        else:
            run_treebank(mode, paths, treebank, short_name,
                         None, command_args, extra_args)
# Autogenerate the short_name_to_treebank module: scan every UD_* directory
# under UDBASE and write a dict mapping short names to full treebank names,
# plus a lookup helper, next to this script.
paths = default_paths.get_default_paths()
udbase = paths["UDBASE"]
directories = glob.glob(udbase + "/UD_*")
# sort so the generated file is deterministic across runs
directories.sort()
# output goes into the same directory as this script
output_name = os.path.join(os.path.split(__file__)[0], "short_name_to_treebank.py")
with open(output_name, "w") as fout:
    fout.write("# This module is autogenerated by build_short_name_to_treebank.py\n")
    fout.write("# Please do not edit\n")
    fout.write("\n")
    fout.write("SHORT_NAMES = {\n")
    for ud_path in directories:
        ud_name = os.path.split(ud_path)[1]
        short_name = treebank_to_short_name(ud_name)
        fout.write("    '%s': '%s',\n" % (short_name, ud_name))
        # also register a zh-hans_* alias for zh_* short names so either
        # form of the simplified Chinese code resolves to the treebank
        if short_name.startswith("zh_"):
            short_name = "zh-hans_" + short_name[3:]
            fout.write("    '%s': '%s',\n" % (short_name, ud_name))
    fout.write("}\n")
    fout.write("""
def short_name_to_treebank(short_name):
    return SHORT_NAMES[short_name]
""")