def process_treebank(treebank, paths, args):
    if args.tag_method is Tags.GOLD:
        prepare_tokenizer_treebank.copy_conllu_treebank(
            treebank, paths, paths["DEPPARSE_DATA_DIR"])
    elif args.tag_method is Tags.PREDICTED:
        short_name = common.project_to_short_name(treebank)
        short_language = short_name.split("_")[0]

        base_args = [
            "--wordvec_dir", paths["WORDVEC_DIR"], "--lang", short_language,
            "--shorthand", short_name, "--batch_size",
            pos_batch_size(short_name), "--mode", "predict"
        ]
        base_args = base_args + wordvec_args(short_language)

        def retag_dataset(tokenizer_dir, tokenizer_file, dest_dir, dest_file,
                          short_name):
            original = f"{tokenizer_dir}/{short_name}.{tokenizer_file}.conllu"
            retagged = f"{dest_dir}/{short_name}.{dest_file}.conllu"
            tagger_args = [
                "--eval_file", original, "--gold_file", original,
                "--output_file", retagged
            ]
            if args.wordvec_pretrain_file:
                tagger_args.extend(
                    ["--wordvec_pretrain_file", args.wordvec_pretrain_file])
            tagger_args = base_args + tagger_args
            logger.info("Running tagger to retag {} to {}\n  Args: {}".format(
                original, retagged, tagger_args))
            tagger.main(tagger_args)

        prepare_tokenizer_treebank.copy_conllu_treebank(
            treebank, paths, paths["DEPPARSE_DATA_DIR"], retag_dataset)
    else:
        raise ValueError("Unknown tags method: {}".format(args.tag_method))
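A minimal usage sketch for the dispatch above, assuming the Tags enum, paths dict, and process_treebank from the surrounding module; the treebank name and the driver_args variable are only illustrative:

import argparse

# Hypothetical driver: prepare depparse data for one treebank using gold tags.
# Switch to Tags.PREDICTED to retag the data with a trained POS model instead.
driver_args = argparse.Namespace()
driver_args.tag_method = Tags.GOLD
driver_args.wordvec_pretrain_file = None   # only consulted when retagging
process_treebank("UD_English-EWT", paths, driver_args)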
Example #2
def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None, augment=True):
    """
    This utility method copies only the conllu files to the given destination directory.

    Both POS and lemma annotators need this.
    """
    os.makedirs(dest_dir, exist_ok=True)

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    with tempfile.TemporaryDirectory() as tokenizer_dir:
        paths = dict(paths)
        paths["TOKENIZE_DATA_DIR"] = tokenizer_dir

        # first we process the tokenization data
        args = argparse.Namespace()
        args.augment = augment
        args.prepare_labels = False
        process_treebank(treebank, paths, args)

        os.makedirs(dest_dir, exist_ok=True)

        if postprocess is None:
            postprocess = copy_conllu_file

        # now we copy the processed conllu data files
        postprocess(tokenizer_dir, "train.gold", dest_dir, "train.in", short_name)
        postprocess(tokenizer_dir, "dev.gold", dest_dir, "dev.gold", short_name)
        copy_conllu_file(dest_dir, "dev.gold", dest_dir, "dev.in", short_name)
        postprocess(tokenizer_dir, "test.gold", dest_dir, "test.gold", short_name)
        copy_conllu_file(dest_dir, "test.gold", dest_dir, "test.in", short_name)
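A sketch of the postprocess hook's contract, using a hypothetical callback with the same five-argument signature that copy_conllu_treebank passes (source dir, source file label, dest dir, dest file label, shorthand); copy_conllu_file is the default:

def log_and_copy(src_dir, src_file, dest_dir, dest_file, short_name):
    # Hypothetical hook: report the copy, then fall back to the default behavior.
    print("copying %s.%s.conllu -> %s.%s.conllu" % (short_name, src_file, short_name, dest_file))
    copy_conllu_file(src_dir, src_file, dest_dir, dest_file, short_name)

copy_conllu_treebank("UD_English-EWT", paths, paths["DEPPARSE_DATA_DIR"], postprocess=log_and_copy)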
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir,
                               short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name,
                                   args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank,
                                                              udbase_dir,
                                                              "train",
                                                              "conllu",
                                                              fail=True)

        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(
                treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                        short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                short_name, short_language, args.augment)

    convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)
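For reference, the shorthand convention these helpers rely on (the concrete values are illustrative):

short_name = common.project_to_short_name("UD_English-EWT")   # e.g. "en_ewt"
short_language = short_name.split("_")[0]                      # e.g. "en"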
Example #4
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    Includes processing for a few external tokenization datasets:
      vi_vlsp, th_orchid, th_lst20, th_best

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name == "vi_vlsp":
        convert_vi_vlsp.convert_vi_vlsp(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_orchid":
        convert_th_orchid.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name == "th_lst20":
        convert_th_lst20.convert(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_best":
        convert_th_best.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)

        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)

    if short_name not in ('th_orchid', 'th_lst20'):
        convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    extern_dir = paths["EXTERN_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name,
                              args.prepare_labels)
    elif short_name.startswith("it_combined"):
        build_combined_italian(udbase_dir, tokenizer_dir, extern_dir,
                               short_name, args.prepare_labels)
    elif short_name.startswith("en_combined"):
        build_combined_english(udbase_dir, tokenizer_dir, extern_dir,
                               short_name, args.prepare_labels)
    else:
        train_txt_file = common.find_treebank_dataset_file(
            treebank, udbase_dir, "train", "txt")
        if not train_txt_file:
            raise ValueError("Cannot find train file for treebank %s" %
                             treebank)

        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev",
                                                 "txt"):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                        short_name, short_language,
                                        args.prepare_labels)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                short_name, short_language, args.augment,
                                args.prepare_labels)
def process_treebank(treebank, paths, args):
    short_name = common.project_to_short_name(treebank)

    mwt_dir = paths["MWT_DATA_DIR"]
    os.makedirs(mwt_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as tokenizer_dir:
        paths = dict(paths)
        paths["TOKENIZE_DATA_DIR"] = tokenizer_dir

        # first we process the tokenization data
        tokenizer_args = argparse.Namespace()
        tokenizer_args.augment = False
        tokenizer_args.prepare_labels = True
        prepare_tokenizer_treebank.process_treebank(treebank, paths,
                                                    tokenizer_args)

        copy_conllu(tokenizer_dir, mwt_dir, short_name, "train", "in")
        copy_conllu(tokenizer_dir, mwt_dir, short_name, "dev", "gold")
        copy_conllu(tokenizer_dir, mwt_dir, short_name, "test", "gold")

        shutil.copyfile(
            prepare_tokenizer_treebank.mwt_name(tokenizer_dir, short_name,
                                                "train"),
            prepare_tokenizer_treebank.mwt_name(mwt_dir, short_name, "train"))
        shutil.copyfile(
            prepare_tokenizer_treebank.mwt_name(tokenizer_dir, short_name,
                                                "dev"),
            prepare_tokenizer_treebank.mwt_name(mwt_dir, short_name, "dev"))
        shutil.copyfile(
            prepare_tokenizer_treebank.mwt_name(tokenizer_dir, short_name,
                                                "test"),
            prepare_tokenizer_treebank.mwt_name(mwt_dir, short_name, "test"))

        contract_mwt(f"{mwt_dir}/{short_name}.dev.gold.conllu",
                     f"{mwt_dir}/{short_name}.dev.in.conllu")
        contract_mwt(f"{mwt_dir}/{short_name}.test.gold.conllu",
                     f"{mwt_dir}/{short_name}.test.in.conllu")
Example #7
def run_ete(paths, dataset, short_name, command_args, extra_args):
    short_language = short_name.split("_")[0]

    tokenize_dir = paths["TOKENIZE_DATA_DIR"]
    mwt_dir = paths["MWT_DATA_DIR"]
    lemma_dir = paths["LEMMA_DATA_DIR"]
    ete_dir = paths["ETE_DATA_DIR"]
    wordvec_dir = paths["WORDVEC_DIR"]

    # run models in the following order:
    #   tokenize
    #   mwt, if exists
    #   pos
    #   lemma, if exists
    #   depparse
    # the output of each step is either kept or discarded based on the
    # value of command_args.save_output

    if command_args and command_args.test_data:
        test_short_name = project_to_short_name(command_args.test_data)
    else:
        test_short_name = short_name

    # TOKENIZE step
    # the raw data to process starts in tokenize_dir
    # retokenize it using the saved model
    tokenizer_type = "--txt_file"
    tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"

    tokenizer_output = f"{ete_dir}/{short_name}.{dataset}.tokenizer.conllu"

    tokenizer_args = [
        "--mode", "predict", tokenizer_type, tokenizer_file, "--lang",
        short_language, "--conll_file", tokenizer_output, "--shorthand",
        short_name
    ]
    tokenizer_args = tokenizer_args + extra_args
    logger.info("-----  TOKENIZER  ----------")
    logger.info("Running tokenizer step with args: {}".format(tokenizer_args))
    tokenizer.main(tokenizer_args)

    # If the data has any MWT in it, there should be an MWT model
    # trained, so run that.  Otherwise, we skip MWT
    mwt_train_file = f"{mwt_dir}/{short_name}.train.in.conllu"
    logger.info("-----  MWT        ----------")
    if check_mwt(mwt_train_file):
        mwt_output = f"{ete_dir}/{short_name}.{dataset}.mwt.conllu"
        mwt_args = [
            '--eval_file', tokenizer_output, '--output_file', mwt_output,
            '--lang', short_language, '--shorthand', short_name, '--mode',
            'predict'
        ]
        mwt_args = mwt_args + extra_args
        logger.info("Running mwt step with args: {}".format(mwt_args))
        mwt_expander.main(mwt_args)
    else:
        logger.info("No MWT in training data.  Skipping")
        mwt_output = tokenizer_output

    # Run the POS step
    # TODO: add batch args
    logger.info("-----  POS        ----------")
    pos_output = f"{ete_dir}/{short_name}.{dataset}.pos.conllu"
    pos_args = [
        '--wordvec_dir', wordvec_dir, '--eval_file', mwt_output,
        '--output_file', pos_output, '--lang', short_name, '--shorthand',
        short_name, '--mode', 'predict'
    ]
    pos_args = pos_args + wordvec_args(short_language) + extra_args
    logger.info("Running pos step with args: {}".format(pos_args))
    tagger.main(pos_args)

    # Run the LEMMA step.  If there are no lemmas in the training
    # data, use the identity lemmatizer.
    logger.info("-----  LEMMA      ----------")
    lemma_train_file = f"{lemma_dir}/{short_name}.train.in.conllu"
    lemma_output = f"{ete_dir}/{short_name}.{dataset}.lemma.conllu"
    lemma_args = [
        '--eval_file', pos_output, '--output_file', lemma_output, '--lang',
        short_name, '--mode', 'predict'
    ]
    lemma_args = lemma_args + extra_args
    if check_lemmas(lemma_train_file):
        logger.info("Running lemmatizer step with args: {}".format(lemma_args))
        lemmatizer.main(lemma_args)
    else:
        logger.info("No lemmas in training data")
        logger.info("Running identity lemmatizer step with args: {}".format(
            lemma_args))
        identity_lemmatizer.main(lemma_args)

    # Run the DEPPARSE step.  This is the last step
    # Note that we do NOT use the depparse directory's data.  That is
    # because it has either gold tags, or predicted tags based on
    # retagging using gold tokenization, and we aren't sure which at
    # this point in the process.
    # TODO: add batch args
    logger.info("-----  DEPPARSE   ----------")
    depparse_output = f"{ete_dir}/{short_name}.{dataset}.depparse.conllu"
    depparse_args = [
        '--wordvec_dir', wordvec_dir, '--eval_file', lemma_output,
        '--output_file', depparse_output, '--lang', short_name, '--shorthand',
        short_name, '--mode', 'predict'
    ]
    depparse_args = depparse_args + wordvec_args(short_language) + extra_args
    logger.info("Running depparse step with args: {}".format(depparse_args))
    parser.main(depparse_args)

    logger.info("-----  EVALUATION ----------")
    gold_file = f"{tokenize_dir}/{test_short_name}.{dataset}.gold.conllu"
    ete_file = depparse_output
    results = common.run_eval_script(gold_file, ete_file)
    logger.info("End to end results for {} models on {} {} data:\n{}".format(
        short_name, test_short_name, dataset, results))
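A hedged sketch of invoking this end-to-end runner for one split; command_args=None is accepted because the function only reads it when truthy, and extra_args is forwarded verbatim to each model's main():

# Hypothetical invocation: evaluate en_ewt models end to end on the test split.
run_ete(paths, "test", "en_ewt", command_args=None, extra_args=[])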