def build_combined_english_dataset(udbase_dir, tokenizer_dir, extern_dir,
                                   short_name, dataset, prepare_labels):
    output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if dataset == 'train':
        # TODO: include more UD treebanks, possibly with xpos removed
        #  UD_English-ParTUT, UD_English-Pronouns - xpos are different
        # also include "external" treebanks such as PTB
        treebanks = ["UD_English-EWT", "UD_English-GUM"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank,
                                                            udbase_dir,
                                                            dataset,
                                                            "conllu",
                                                            fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
    else:
        ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT",
                                                       udbase_dir, dataset,
                                                       "conllu")
        sents = read_sentences_from_conllu(ewt_conllu)

    sents = strip_mwt_from_sentences(sents)
    write_sentences_to_conllu(output_conllu, sents)
    convert_conllu_to_txt(output_conllu, output_txt)

    if prepare_labels:
        prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir,
                               short_name, "it", dataset)
def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    """
    en_combined is currently EWT, GUM, GUMReddit, PUD, and Pronouns

    TODO: use more of the handparsed data
    """
    check_gum_ready(udbase_dir)

    if dataset == 'train':
        # TODO: include more UD treebanks, possibly with xpos removed
        #  UD_English-ParTUT - xpos are different
        # also include "external" treebanks such as PTB
        # NOTE: in order to get the best results, make sure each of these treebanks have the latest edits applied
        train_treebanks = ["UD_English-EWT", "UD_English-GUM", "UD_English-GUMReddit"]
        test_treebanks = ["UD_English-PUD", "UD_English-Pronouns"]
        sents = []
        for treebank in train_treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
        for treebank in test_treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
    else:
        ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
        sents = read_sentences_from_conllu(ewt_conllu)

    sents = strip_mwt_from_sentences(sents)
    return sents
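strip_mwt_from_sentences is not shown in these snippets; presumably it drops the multi-word token range lines (IDs of the form 3-4) so the merged data contains only single-token lines. A hedged sketch under that assumption:

import re

MWT_ID_RE = re.compile(r"^[0-9]+-[0-9]+\t")

def strip_mwt_from_sentences_sketch(sents):
    # Drop token lines whose ID column is a range such as "3-4".
    return [[line for line in sentence if not MWT_ID_RE.match(line)]
            for sentence in sents]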
def build_combined_korean_dataset(udbase_dir,
                                  tokenizer_dir,
                                  short_name,
                                  dataset,
                                  output_txt,
                                  output_conllu,
                                  prepare_labels=True):
    """
    Builds a combined dataset out of multiple Korean datasets.

    Currently this uses GSD and Kaist.  If a segmenter-appropriate
    dataset was requested, spaces are removed.

    TODO: we need to handle the difference in xpos tags somehow.
    """
    gsd_conllu = common.find_treebank_dataset_file("UD_Korean-GSD", udbase_dir,
                                                   dataset, "conllu")
    kaist_conllu = common.find_treebank_dataset_file("UD_Korean-Kaist",
                                                     udbase_dir, dataset,
                                                     "conllu")
    sents = read_sentences_from_conllu(
        gsd_conllu) + read_sentences_from_conllu(kaist_conllu)

    segmenter = short_name.endswith("_seg")
    if segmenter:
        sents = remove_spaces_from_sentences(sents)

    write_sentences_to_conllu(output_conllu, sents)
    convert_conllu_to_txt(output_conllu, output_txt)

    if prepare_labels:
        prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir,
                               short_name, "ko", dataset)
def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    """
    es_combined is AnCora and GSD put together

    TODO: remove features which aren't shared between datasets
    TODO: consider mixing in PUD?
    """
    if dataset == 'train':
        treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
            new_sents = read_sentences_from_conllu(conllu_file)
            if treebank.endswith("GSD"):
                new_sents = replace_semicolons(new_sents)
            sents.extend(new_sents)

        extra_spanish = os.path.join(handparsed_dir, "spanish-mwt", "spanish.mwt")
        if not os.path.exists(extra_spanish):
            raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_spanish))
        extra_sents = read_sentences_from_conllu(extra_spanish)
        sents.extend(extra_sents)
    else:
        conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
        sents = read_sentences_from_conllu(conllu_file)

    return sents
def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    if dataset == 'train':
        # could maybe add ParTUT, but that dataset has a slightly different xpos set
        # (no DE or I)
        # and I didn't feel like sorting through the differences
        # Note: currently these each have small changes compared with
        # the UD2.7 release.  See the issues (possibly closed by now)
        # filed by AngledLuffa on each of the treebanks for more info.
        treebanks = ["UD_Italian-ISDT", "UD_Italian-VIT", "UD_Italian-TWITTIRO", "UD_Italian-PoSTWITA"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
        extra_italian = os.path.join(handparsed_dir, "italian-mwt", "italian.mwt")
        if not os.path.exists(extra_italian):
            raise FileNotFoundError("Cannot find the extra dataset 'italian.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
        extra_sents = read_sentences_from_conllu(extra_italian)
        for sentence in extra_sents:
            if not sentence[2].endswith("_") or not MWT_RE.match(sentence[2]):
                raise AssertionError("Unexpected format of the italian.mwt file.  Has it already been modified to have SpaceAfter=No everywhere?")
            sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
        sents = sents + extra_sents
    else:
        isdt_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
        sents = read_sentences_from_conllu(isdt_conllu)

    return sents
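process_treebank below sends it_combined, en_combined, and es_combined through a single build_combined_dataset entry point. Assuming the per-language builders above (the variants that return sentence lists), one plausible, hypothetical shape for that wrapper is:

COMBINED_BUILDERS = {
    "en_combined": build_combined_english_dataset,
    "es_combined": build_combined_spanish_dataset,
    "it_combined": build_combined_italian_dataset,
}

def build_combined_dataset_sketch(udbase_dir, tokenizer_dir, handparsed_dir,
                                  short_name, augment):
    # Hypothetical wrapper: run the language-specific builder per shard,
    # optionally augment the training split, then write the gold conllu.
    builder = COMBINED_BUILDERS[short_name]
    for dataset in ("train", "dev", "test"):
        sents = builder(udbase_dir, tokenizer_dir, handparsed_dir,
                        short_name, dataset)
        if dataset == "train" and augment:
            sents = augment_punct(sents)
        output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
        write_sentences_to_conllu(output_conllu, sents)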
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir,
                               short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name,
                                   args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank,
                                                              udbase_dir,
                                                              "train",
                                                              "conllu",
                                                              fail=True)

        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(
                treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                        short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                short_name, short_language, args.augment)

    convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)
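args needs at least augment and prepare_labels attributes here; a hypothetical command-line driver (flag names are assumptions, not the project's actual interface) might look like:

import argparse

def main_sketch(paths):
    parser = argparse.ArgumentParser()
    parser.add_argument("treebanks", nargs="+", help="e.g. UD_English-EWT")
    parser.add_argument("--no_augment", dest="augment", action="store_false",
                        help="skip data augmentation")
    parser.add_argument("--no_prepare_labels", dest="prepare_labels",
                        action="store_false", help="skip label preparation")
    args = parser.parse_args()
    for treebank in args.treebanks:
        process_treebank(treebank, paths, args)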
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    Includes processing for a few external tokenization datasets:
      vi_vlsp, th_orchid, th_best

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name == "vi_vlsp":
        convert_vi_vlsp.convert_vi_vlsp(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_orchid":
        convert_th_orchid.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name == "th_lst20":
        convert_th_lst20.convert(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_best":
        convert_th_best.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)

        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)

    if short_name not in ('th_orchid', 'th_lst20'):
        convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)
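common.project_to_short_name turns a treebank name such as UD_English-EWT into a short code like en_ewt. A toy, purely illustrative version (the real implementation relies on a full language-name-to-code table):

LANG_CODES_SKETCH = {"English": "en", "Italian": "it", "Spanish": "es", "Korean": "ko"}

def project_to_short_name_sketch(treebank):
    # Toy sketch: "UD_English-EWT" -> "en_ewt"
    language, corpus = treebank[len("UD_"):].split("-", 1)
    return "%s_%s" % (LANG_CODES_SKETCH[language], corpus.lower())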
def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language):
    """
    Process a UD treebank with only train/test splits

    For example, in UD 2.7:
      UD_Buryat-BDT
      UD_Galician-TreeGal
      UD_Indonesian-CSUI
      UD_Kazakh-KTB
      UD_Kurmanji-MG
      UD_Latin-Perseus
      UD_Livvi-KKPP
      UD_North_Sami-Giella
      UD_Old_Russian-RNC
      UD_Sanskrit-Vedic
      UD_Slovenian-SST
      UD_Upper_Sorbian-UFAL
      UD_Welsh-CCG
    """
    train_input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu")
    train_output_conllu = f"{tokenizer_dir}/{short_name}.train.gold.conllu"
    dev_output_conllu = f"{tokenizer_dir}/{short_name}.dev.gold.conllu"

    if not split_train_file(treebank=treebank,
                            train_input_conllu=train_input_conllu,
                            train_output_conllu=train_output_conllu,
                            dev_output_conllu=dev_output_conllu):
        return

    # the test set is already fine
    # currently we do not do any augmentation of these partial treebanks
    prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False)
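split_train_file is only referenced in these examples. A loose sketch of the idea, assuming it holds out a slice of the training data as dev and refuses to split treebanks that are too small:

import random

def split_train_file_sketch(treebank, train_input_conllu,
                            train_output_conllu, dev_output_conllu,
                            dev_fraction=0.1, min_sentences=20):
    # Hedged illustration; the real function may pick dev sentences differently.
    sents = read_sentences_from_conllu(train_input_conllu)
    if len(sents) < min_sentences:
        print("Not enough training data in %s to split off a dev set" % treebank)
        return False
    random.shuffle(sents)
    split = int(len(sents) * (1 - dev_fraction))
    write_sentences_to_conllu(train_output_conllu, sents[:split])
    write_sentences_to_conllu(dev_output_conllu, sents[split:])
    return True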
def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name,
                                       dataset, augment):
    """
    Build the GUM dataset by combining GUMReddit

    It checks to make sure GUMReddit is filled out using the included script
    """
    check_gum_ready(udbase_dir)
    random.seed(1234)

    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    treebanks = ["UD_English-GUM", "UD_English-GUMReddit"]
    sents = []
    for treebank in treebanks:
        conllu_file = common.find_treebank_dataset_file(treebank,
                                                        udbase_dir,
                                                        dataset,
                                                        "conllu",
                                                        fail=True)
        sents.extend(read_sentences_from_conllu(conllu_file))

    if dataset == 'train' and augment:
        sents = augment_punct(sents)

    write_sentences_to_conllu(output_conllu, sents)
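The build_combined_english_gum entry point called from process_treebank presumably just runs the per-shard builder above for each split; a hypothetical wrapper:

def build_combined_english_gum_sketch(udbase_dir, tokenizer_dir, short_name, augment):
    # Hypothetical wrapper around build_combined_english_gum_dataset.
    for dataset in ("train", "dev", "test"):
        build_combined_english_gum_dataset(udbase_dir, tokenizer_dir,
                                           short_name, dataset, augment)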
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    extern_dir = paths["EXTERN_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name,
                              args.prepare_labels)
    elif short_name.startswith("it_combined"):
        build_combined_italian(udbase_dir, tokenizer_dir, extern_dir,
                               short_name, args.prepare_labels)
    elif short_name.startswith("en_combined"):
        build_combined_english(udbase_dir, tokenizer_dir, extern_dir,
                               short_name, args.prepare_labels)
    else:
        train_txt_file = common.find_treebank_dataset_file(
            treebank, udbase_dir, "train", "txt")
        if not train_txt_file:
            raise ValueError("Cannot find train file for treebank %s" %
                             treebank)

        print("Preparing data for %s: %s, %s" %
              (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev",
                                                 "txt"):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                        short_name, short_language,
                                        args.prepare_labels)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir,
                                short_name, short_language, args.augment,
                                args.prepare_labels)
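Almost every example leans on common.find_treebank_dataset_file. A hedged sketch of such a lookup, assuming the standard UD naming scheme (xx_yyy-ud-train.conllu inside each treebank directory):

import glob
import os

def find_treebank_dataset_file_sketch(treebank, udbase_dir, dataset, extension, fail=False):
    # Illustrative only; the real helper lives in the common module.
    candidates = glob.glob(os.path.join(udbase_dir, treebank,
                                        "*-ud-%s.%s" % (dataset, extension)))
    if not candidates:
        if fail:
            raise FileNotFoundError("Cannot find %s %s file for %s in %s"
                                    % (dataset, extension, treebank, udbase_dir))
        return None
    if len(candidates) > 1:
        raise ValueError("Multiple %s %s files found for %s" % (dataset, extension, treebank))
    return candidates[0]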
def check_gum_ready(udbase_dir):
    gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit",
                                                   udbase_dir, "train",
                                                   "conllu")
    if common.mostly_underscores(gum_conllu):
        raise ValueError(
            "Cannot process UD_English-GUMReddit in its current form.  There should be a download script available in the directory which will help integrate the missing proprietary values.  Please run that script to update the data, then try again."
        )
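The UD release of GUMReddit ships with the word forms redacted to underscores for licensing reasons, which is what common.mostly_underscores presumably detects; a rough sketch:

def mostly_underscores_sketch(filename, threshold=0.5):
    # Hedged illustration: flag the file if most FORM values are "_".
    total = 0
    underscores = 0
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) < 2:
                continue
            total += 1
            if fields[1] == "_":
                underscores += 1
    return total > 0 and underscores / total > threshold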
def prepare_ud_dataset(treebank,
                       udbase_dir,
                       tokenizer_dir,
                       short_name,
                       short_language,
                       dataset,
                       augment=True,
                       prepare_labels=True):
    # TODO: do this higher up
    os.makedirs(tokenizer_dir, exist_ok=True)

    input_txt = common.find_treebank_dataset_file(treebank, udbase_dir,
                                                  dataset, "txt")
    input_txt_copy = f"{tokenizer_dir}/{short_name}.{dataset}.txt"

    input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir,
                                                     dataset, "conllu")
    input_conllu_copy = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if short_name == "sl_ssj":
        preprocess_ssj_data.process(input_txt, input_conllu, input_txt_copy,
                                    input_conllu_copy)
    elif short_name == "te_mtg" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, input_conllu_copy,
                                input_txt_copy, augment_telugu)
    elif short_name == "ar_padt" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, input_conllu_copy,
                                input_txt_copy, augment_arabic_padt)
    elif short_name.startswith("es_ancora") and dataset == 'train':
        # note that we always do this for AnCora, since this token is bizarre and confusing
        fix_spanish_ancora(input_conllu,
                           input_conllu_copy,
                           input_txt_copy,
                           augment=augment)
    elif short_name.startswith("ko_") and short_name.endswith("_seg"):
        remove_spaces(input_conllu, input_conllu_copy, input_txt_copy)
    else:
        shutil.copyfile(input_txt, input_txt_copy)
        shutil.copyfile(input_conllu, input_conllu_copy)

    if prepare_labels:
        prepare_dataset_labels(input_txt_copy, input_conllu_copy,
                               tokenizer_dir, short_name, short_language,
                               dataset)
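write_augmented_dataset is referenced but not shown. One plausible reading, sketched here as an assumption, is that it lets the augmentation function expand the sentence list and then writes the usual conllu/txt pair:

def write_augmented_dataset_sketch(input_conllu, output_conllu, output_txt, augment_function):
    # Hedged sketch matching the four-argument call sites in this variant.
    sents = read_sentences_from_conllu(input_conllu)
    sents = augment_function(sents)
    write_sentences_to_conllu(output_conllu, sents)
    convert_conllu_to_txt(output_conllu, output_txt)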
def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
    input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if short_name == "te_mtg" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
    elif short_name == "ar_padt" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
    elif short_name.startswith("ko_") and short_name.endswith("_seg"):
        remove_spaces(input_conllu, output_conllu)
    elif dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_punct)
    else:
        shutil.copyfile(input_conllu, output_conllu)
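process_ud_treebank, called from process_treebank, presumably just drives prepare_ud_dataset over the three standard shards; a hypothetical sketch:

def process_ud_treebank_sketch(treebank, udbase_dir, tokenizer_dir,
                               short_name, short_language, augment=True):
    # prepare_ud_dataset itself restricts augmentation to the train shard.
    for dataset in ("train", "dev", "test"):
        prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name,
                           short_language, dataset, augment=augment)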
def process_treebank(treebank, paths, args):
    if treebank.startswith("UD_"):
        udbase_dir = paths["UDBASE"]
        train_conllu = common.find_treebank_dataset_file(treebank,
                                                         udbase_dir,
                                                         "train",
                                                         "conllu",
                                                         fail=True)
        augment = check_lemmas(train_conllu)
        if not augment:
            print(
                "No lemma information found in %s.  Not augmenting the dataset"
                % train_conllu)
    else:
        # TODO: check the data to see if there are lemmas or not
        augment = True
    prepare_tokenizer_treebank.copy_conllu_treebank(treebank,
                                                    paths,
                                                    paths["LEMMA_DATA_DIR"],
                                                    augment=augment)
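check_lemmas decides whether lemma augmentation is worthwhile; a hedged guess is that it simply measures whether the LEMMA column carries real values rather than underscores:

def check_lemmas_sketch(conllu_file, threshold=0.5):
    # Rough sketch only: True if the treebank appears to have real lemmas.
    total = 0
    has_lemma = 0
    with open(conllu_file, encoding="utf-8") as fin:
        for line in fin:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) < 3 or "-" in fields[0] or "." in fields[0]:
                continue
            total += 1
            if fields[2] not in ("_", ""):
                has_lemma += 1
    return total > 0 and has_lemma / total > threshold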