Example 1
def process_wikiner(paths, dataset):
    short_name = treebank_to_short_name(dataset)

    base_input_path = os.path.join(paths["NERBASE"], dataset)
    base_output_path = paths["NER_DATA_DIR"]

    raw_input_path = os.path.join(base_input_path, "raw")
    input_files = glob.glob(os.path.join(raw_input_path, "aij-wikiner*"))
    if len(input_files) == 0:
        raise FileNotFoundError("Could not find any raw wikiner files in %s" %
                                raw_input_path)
    elif len(input_files) > 1:
        raise FileNotFoundError("Found too many raw wikiner files in %s: %s" %
                                (raw_input_path, ", ".join(input_files)))

    csv_file = os.path.join(raw_input_path, "csv_" + short_name)
    print("Converting raw input %s to space separated file in %s" %
          (input_files[0], csv_file))
    preprocess_wikiner(input_files[0], csv_file)

    # this should create train.bio, dev.bio, and test.bio
    print("Splitting %s to %s" % (csv_file, base_input_path))
    split_wikiner(base_input_path, csv_file)

    for shard in SHARDS:
        input_filename = os.path.join(base_input_path, '%s.bio' % shard)
        if not os.path.exists(input_filename):
            raise FileNotFoundError('Cannot find %s component of %s in %s' %
                                    (shard, short_name, input_filename))
        output_filename = os.path.join(base_output_path,
                                       '%s.%s.json' % (short_name, shard))
        print("Converting %s to %s" % (input_filename, output_filename))
        prepare_ner_file.process_dataset(input_filename, output_filename)
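
# A minimal invocation sketch. The paths and the dataset name below are
# hypothetical placeholders; the only requirement from the code above is a
# single raw file matching aij-wikiner* under <NERBASE>/<dataset>/raw.
paths = {
    "NERBASE": "/data/ner",                # expects /data/ner/French-WikiNER/raw/aij-wikiner*
    "NER_DATA_DIR": "/data/ner/processed",
}
# Writes <short_name>.train.json, <short_name>.dev.json and <short_name>.test.json
# into NER_DATA_DIR, assuming treebank_to_short_name maps "French-WikiNER" to "fr_wikiner".
process_wikiner(paths, "French-WikiNER")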
Example 2
def process_treebank(treebank, paths, output_dir):
    with tempfile.TemporaryDirectory() as tokenizer_dir:
        paths = dict(paths)
        paths["TOKENIZE_DATA_DIR"] = tokenizer_dir

        short_name = treebank_to_short_name(treebank)

        # first we process the tokenization data
        args = argparse.Namespace()
        args.augment = False
        args.prepare_labels = False
        prepare_tokenizer_treebank.process_treebank(treebank, paths, args)

        # TODO: these names should be refactored
        train_file = f"{tokenizer_dir}/{short_name}.train.gold.conllu"
        dev_file = f"{tokenizer_dir}/{short_name}.dev.gold.conllu"
        test_file = f"{tokenizer_dir}/{short_name}.test.gold.conllu"

        train_set = prepare_tokenizer_treebank.read_sentences_from_conllu(
            train_file)
        dev_set = prepare_tokenizer_treebank.read_sentences_from_conllu(
            dev_file)
        test_set = prepare_tokenizer_treebank.read_sentences_from_conllu(
            test_file)

        train_out = os.path.join(output_dir, f"{short_name}.train.seg.txt")
        test_out = os.path.join(output_dir, f"{short_name}.test.seg.txt")

        write_segmenter_file(train_out, train_set + dev_set)
        write_segmenter_file(test_out, test_set)
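
# A sketch of a call, with a hypothetical output directory. Note the split
# handling: train and dev sentences are concatenated into the train segmenter
# file, while test is written on its own.
# For UD_Italian-ISDT this would write it_isdt.train.seg.txt (train + dev)
# and it_isdt.test.seg.txt under output_dir.
paths = default_paths.get_default_paths()
process_treebank("UD_Italian-ISDT", paths, "/tmp/segmenter_data")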
Example 3
def process_fire_2013(paths, dataset):
    """
    Splits the FIRE 2013 dataset into train, dev, test

    The provided datasets are all mixed together at this point, so it
    is not possible to recreate the original test conditions used in
    the bakeoff
    """
    short_name = treebank_to_short_name(dataset)
    langcode, _ = short_name.split("_")
    if langcode not in ("hi", "en", "ta", "bn", "mal"):
        raise ValueError("Language %s not one of the FIRE 2013 languages" %
                         langcode)
    language = lcode2lang[langcode].lower()

    # for example, FIRE2013/hindi_train
    base_input_path = os.path.join(paths["NERBASE"], "FIRE2013",
                                   "%s_train" % language)
    base_output_path = paths["NER_DATA_DIR"]

    train_csv_file = os.path.join(base_output_path,
                                  "%s.train.csv" % short_name)
    dev_csv_file = os.path.join(base_output_path, "%s.dev.csv" % short_name)
    test_csv_file = os.path.join(base_output_path, "%s.test.csv" % short_name)

    convert_fire_2013(base_input_path, train_csv_file, dev_csv_file,
                      test_csv_file)

    for csv_file, shard in zip((train_csv_file, dev_csv_file, test_csv_file),
                               SHARDS):
        output_filename = os.path.join(base_output_path,
                                       '%s.%s.json' % (short_name, shard))
        prepare_ner_file.process_dataset(csv_file, output_filename)
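
# A hypothetical invocation, assuming the FIRE 2013 Hindi data has been
# unpacked in the expected place: <NERBASE>/FIRE2013/hindi_train.
paths = {"NERBASE": "/data/ner", "NER_DATA_DIR": "/data/ner/processed"}
# Produces hi_fire2013.train.csv, hi_fire2013.dev.csv, hi_fire2013.test.csv
# plus the corresponding .json shards in NER_DATA_DIR.
process_fire_2013(paths, "hi_fire2013")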
Example 4
def main():
    if len(sys.argv) != 3:
        print('Usage: {} list_of_tb_file output_factory_file'.format(
            sys.argv[0]))
        sys.exit(1)

    # Read list of all treebanks of concern
    list_of_tb_file, output_file = sys.argv[1:]

    shorthands = []
    fullnames = []
    with open(list_of_tb_file) as f:
        for line in f:
            treebank = line.strip()
            fullnames.append(treebank)
            if SHORTNAME_RE.match(treebank):
                shorthands.append(treebank)
            else:
                shorthands.append(treebank_to_short_name(treebank))

    # For each treebank, we would like to find the XPOS Vocab configuration that minimizes
    # the number of total classes needed to predict by all tagger classifiers. This is
    # achieved by enumerating different options of separators that different treebanks might
    # use, and comparing that to treating the XPOS tags as separate categories (using a
    # WordVocab).
    mapping = defaultdict(list)
    for sh, fn in zip(shorthands, fullnames):
        factory = get_factory(sh, fn)
        mapping[factory].append(sh)

    # Generate code. This takes the XPOS vocabulary classes selected above, and generates the
    # actual factory class as seen in models.pos.xpos_vocab_factory.
    first = True
    with open(output_file, 'w') as f:
        print(
            '''# This is the XPOS factory method generated automatically from stanza.models.pos.build_xpos_vocab_factory.
# Please don't edit it!

from stanza.models.pos.vocab import WordVocab, XPOSVocab

def xpos_vocab_factory(data, shorthand):''',
            file=f)

        for key in mapping:
            print("    {} shorthand in [{}]:".format(
                'if' if first else 'elif',
                ', '.join(['"{}"'.format(x) for x in mapping[key]])),
                  file=f)
            print("        return {}".format(key), file=f)

            first = False
        print('''    else:
        raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))''',
              file=f)

    print('Done!')
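
# For a single treebank, the generated file would look roughly like the sketch
# below. The exact constructor call on the return line comes from get_factory,
# so the WordVocab arguments shown here are an assumption:
#
# from stanza.models.pos.vocab import WordVocab, XPOSVocab
#
# def xpos_vocab_factory(data, shorthand):
#     if shorthand in ["en_ewt"]:
#         return WordVocab(data, shorthand, idx=2, ignore=["_"])
#     else:
#         raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))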
Example 5
def project_to_short_name(treebank):
    """
    Project either a treebank or a short name to a short name

    TODO: see if treebank_to_short_name can incorporate this
    """
    if SHORTNAME_RE.match(treebank):
        return treebank
    else:
        return treebank_to_short_name(treebank)
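
# Both branches in one sketch; the expected values follow from the conversion
# tests in the next example.
project_to_short_name("UD_Hindi-HDTB")  # -> "hi_hdtb" (full name converted)
project_to_short_name("hi_hdtb")        # -> "hi_hdtb" (already short, returned unchanged)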
Example 6
def test_treebank():
    """
    Test the entire treebank name conversion
    """
    # conversion of a UD_ name
    assert "hi_hdtb" == treebank_to_short_name("UD_Hindi-HDTB")
    # conversion of names without UD
    assert "hi_fire2013" == treebank_to_short_name("Hindi-fire2013")
    assert "hi_fire2013" == treebank_to_short_name("Hindi-Fire2013")
    assert "hi_fire2013" == treebank_to_short_name("Hindi-FIRE2013")
    # already short names are generally preserved
    assert "hi_fire2013" == treebank_to_short_name("hi-fire2013")
    assert "hi_fire2013" == treebank_to_short_name("hi_fire2013")
    # a special case
    assert "zh-hant_pud" == treebank_to_short_name("UD_Chinese-PUD")
    # a special case already converted once
    assert "zh-hant_pud" == treebank_to_short_name("zh-hant_pud")
    assert "zh-hant_pud" == treebank_to_short_name("zh-hant-pud")
    assert "zh-hans_gsdsimp" == treebank_to_short_name("zh-hans_gsdsimp")
Example 7
def main(run_treebank, model_dir, model_name, add_specific_args=None):
    logger.info("Training program called with:\n" + " ".join(sys.argv))

    paths = default_paths.get_default_paths()

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)
    if '--extra_args' in sys.argv:
        idx = sys.argv.index('--extra_args')
        extra_args = sys.argv[idx + 1:]
        # skip sys.argv[0] so the program name is not parsed as a treebank
        command_args = parser.parse_args(sys.argv[1:idx])
    else:
        command_args, extra_args = parser.parse_known_args()

    mode = command_args.mode
    treebanks = []

    for treebank in command_args.treebanks:
        # this is a really annoying typo to make if you copy/paste a
        # UD directory name on the cluster and your job dies 30s after
        # being queued for an hour
        if treebank.endswith("/"):
            treebank = treebank[:-1]
        if treebank.lower() in ('ud_all', 'all_ud'):
            ud_treebanks = common.get_ud_treebanks(paths["UDBASE"])
            treebanks.extend(ud_treebanks)
        else:
            treebanks.append(treebank)

    for treebank in treebanks:
        if SHORTNAME_RE.match(treebank):
            short_name = treebank
        else:
            short_name = treebank_to_short_name(treebank)
        logger.debug("%s: %s" % (treebank, short_name))

        if mode == Mode.TRAIN and not command_args.force and model_name != 'ete':
            model_path = "saved_models/%s/%s_%s.pt" % (model_dir, short_name,
                                                       model_name)
            if os.path.exists(model_path):
                logger.info("%s: %s exists, skipping!" %
                            (treebank, model_path))
                continue
            else:
                logger.info("%s: %s does not exist, training new model" %
                            (treebank, model_path))

        if command_args.temp_output and model_name != 'ete':
            with tempfile.NamedTemporaryFile() as temp_output_file:
                run_treebank(mode, paths, treebank, short_name,
                             temp_output_file.name, command_args, extra_args)
        else:
            run_treebank(mode, paths, treebank, short_name, None, command_args,
                         extra_args)
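
# This main is meant to be driven by a thin per-model wrapper. The sketch
# below is hypothetical; the run_treebank signature is taken from the calls
# above, and the model_dir/model_name values are illustrative.
def run_treebank(mode, paths, treebank, short_name,
                 temp_output_file, command_args, extra_args):
    # build and run the train/eval command for one treebank
    ...

if __name__ == '__main__':
    main(run_treebank, "pos", "tagger")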
Example 8
def main(run_treebank, model_dir, model_name, add_specific_args=None):
    """
    A main program for each of the run_xyz scripts

    It collects the arguments and runs the main method for each dataset provided.
    It also tries to look for an existing model and not overwrite it unless --force is provided
    """
    logger.info("Training program called with:\n" + " ".join(sys.argv))

    paths = default_paths.get_default_paths()

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)
    if '--extra_args' in sys.argv:
        idx = sys.argv.index('--extra_args')
        extra_args = sys.argv[idx + 1:]
        command_args = parser.parse_args(sys.argv[1:idx])
    else:
        command_args, extra_args = parser.parse_known_args()

    # Pass this through to the underlying model as well as use it here
    if command_args.save_dir:
        extra_args.extend(["--save_dir", command_args.save_dir])

    mode = command_args.mode
    treebanks = []

    for treebank in command_args.treebanks:
        # this is a really annoying typo to make if you copy/paste a
        # UD directory name on the cluster and your job dies 30s after
        # being queued for an hour
        if treebank.endswith("/"):
            treebank = treebank[:-1]
        if treebank.lower() in ('ud_all', 'all_ud'):
            ud_treebanks = common.get_ud_treebanks(paths["UDBASE"])
            treebanks.extend(ud_treebanks)
        else:
            treebanks.append(treebank)

    for treebank_idx, treebank in enumerate(treebanks):
        if treebank_idx > 0:
            logger.info("=========================================")

        if SHORTNAME_RE.match(treebank):
            short_name = treebank
        else:
            short_name = treebank_to_short_name(treebank)
        logger.debug("%s: %s" % (treebank, short_name))

        if mode == Mode.TRAIN and not command_args.force and model_name != 'ete':
            if command_args.save_dir:
                model_path = "%s/%s_%s.pt" % (command_args.save_dir,
                                              short_name, model_name)
            else:
                model_path = "saved_models/%s/%s_%s.pt" % (
                    model_dir, short_name, model_name)
            if os.path.exists(model_path):
                logger.info("%s: %s exists, skipping!" %
                            (treebank, model_path))
                continue
            else:
                logger.info("%s: %s does not exist, training new model" %
                            (treebank, model_path))

        if command_args.temp_output and model_name != 'ete':
            with tempfile.NamedTemporaryFile() as temp_output_file:
                run_treebank(mode, paths, treebank, short_name,
                             temp_output_file.name, command_args, extra_args)
        else:
            run_treebank(mode, paths, treebank, short_name, None, command_args,
                         extra_args)
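
# The --extra_args marker splits the command line in two: everything before it
# is parsed by this wrapper, everything after is handed to the underlying model
# untouched. A small self-contained illustration of the split (flag names hypothetical):
argv = ['run_pos.py', 'UD_English-EWT', '--save_dir', '/tmp/models',
        '--extra_args', '--batch_size', '32']
idx = argv.index('--extra_args')
assert argv[1:idx] == ['UD_English-EWT', '--save_dir', '/tmp/models']  # wrapper args
assert argv[idx + 1:] == ['--batch_size', '32']                        # passed through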
Example 9
paths = default_paths.get_default_paths()
udbase = paths["UDBASE"]

directories = glob.glob(udbase + "/UD_*")
directories.sort()

output_name = os.path.join(
    os.path.split(__file__)[0], "short_name_to_treebank.py")
with open(output_name, "w") as fout:
    fout.write(
        "# This module is autogenerated by build_short_name_to_treebank.py\n")
    fout.write("# Please do not edit\n")
    fout.write("\n")
    fout.write("SHORT_NAMES = {\n")
    for ud_path in directories:
        ud_name = os.path.split(ud_path)[1]
        short_name = treebank_to_short_name(ud_name)
        fout.write("    '%s': '%s',\n" % (short_name, ud_name))

        if short_name.startswith("zh_"):
            short_name = "zh-hans_" + short_name[3:]
            fout.write("    '%s': '%s',\n" % (short_name, ud_name))

    fout.write("}\n")

    fout.write("""

def short_name_to_treebank(short_name):
    return SHORT_NAMES[short_name]
""")