Example #1
def build_vocab(args):
    n_sample = int(args.n_sample)
    l1_file = Path(f"{args.file1}")
    l2_file = Path(f"{args.file2}")

    # get suffixes from filenames
    path1, name1, suff1 = split_filename(args.file1)
    path2, name2, suff2 = split_filename(args.file2)

    # instantiate vocab objects
    voc1 = Vocab(suff1, args.min_freq)
    voc2 = Vocab(suff2, args.min_freq)

    with open(l1_file, "r", encoding="utf-8") as srcvoc, \
            open(l2_file, "r", encoding="utf-8") as tgtvoc:
        for i, (l1, l2) in enumerate(zip(srcvoc, tgtvoc)):
            # add line to vocabs
            voc1.add_sentence(l1.strip())
            voc2.add_sentence(l2.strip())

            print(f"Building vocabulary: line {i + 1:,}", end="\r")

            # stop once the requested number of sample lines is reached
            if n_sample != 0 and i + 1 >= n_sample:
                break

    voc1.save_voc(path1)
    voc2.save_voc(path2)

    print(" "*50, end="\r")
    print("Building vocabulary: complete")
Example #2
def split_single(filename, train_n, eval_n, test_n, verbose):
    path, name, suffix = split_filename(str(filename))
    to_write = ["train", "eval", "test"]
    outputfiles = list()
    for subpart in to_write:
        outputfiles.append(
            open(Path(f"{path}/{subpart}.{suffix}"), "w", encoding="utf-8"))

    # line counts per split; current_limit is the cumulative number of lines
    # to write before switching to the next output file
    limits = [train_n, eval_n, test_n]
    o_index = 0
    current_limit = limits[o_index]
    ofile = outputfiles[o_index]

    with open(filename, "r", encoding="utf-8") as infile:
        for i, line in enumerate(infile):
            ofile.write(line)

            if i == current_limit - 1:
                # current split is full: move on to the next output file
                o_index += 1

                if o_index == len(limits):
                    break

                current_limit += limits[o_index]
                ofile = outputfiles[o_index]

            if verbose is True:
                print(f"Splitting dataset: line {i:,}", end="\r")

    for out_file in outputfiles:
        out_file.close()
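
A hypothetical call of split_single, assuming a tokenized corpus file corpus.de; the first 80,000 lines would go to train.de, the next 5,000 to eval.de and the following 5,000 to test.de, all written next to the input file:

from pathlib import Path

# hypothetical split sizes; verbose=True enables the per-line progress output
split_single(Path("corpus.de"), 80_000, 5_000, 5_000, verbose=True)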
Example #3
def translate(args):
    inputfile = Path(args.file)
    beam_size = int(args.beam)

    # pick device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cpu = os.cpu_count()
    torch.set_num_threads(cpu)

    checkpoint = torch.load(Path(args.model), map_location=device)
    model = checkpoint["model"]

    # transfer model and set eval mode
    model.to(device)
    model.eval()

    # print model
    print(model)

    path, name, suffix = split_filename(str(inputfile))

    # start translating
    outputfile = Path(f"{path}/{name}.translated.{model.tgt_lang.name}")
    with open(inputfile, "r", encoding="utf-8") as infile, \
            open(outputfile, "w", encoding="utf-8") as outfile:
        for progress, line in enumerate(infile):
            line = line.strip()
            hypotheses = model.beam_search(line, beam_size, device, args.alpha)

            # if verbose, print all hypotheses
            if args.verbose:
                for hyp in hypotheses:
                    indices = hyp.get_indeces()
                    tokens = model.tgt_lang.idx2toks(indices.tolist())
                    print(tokens)

            # get indices of the best hypothesis
            indices = hypotheses[0].get_indeces()
            tokens = model.tgt_lang.idx2toks(indices.tolist())

            # remove SOS and EOS
            tokens = filter(lambda x: x not in {"<eos>", "<sos>"}, tokens)
            translated = " ".join(tokens)

            # write decoded sentence to output file
            outfile.write(f"{translated}\n")

            if args.verbose:
                print()
            else:
                print(f"Translating: line {progress + 1:,}", end="\r")

    print(" " * 50, end="\r")
    print("Translating: complete")
Example #4
def preprocess(args):
    if args.SP is None:
        pipe = Pipeline(args.file, args.language, args.bpe, args.replace_nums,
                        args.n)
        pipe.run()
    else:
        path, name, suffix = split_filename(args.file)
        modelname = f"{path}/model.sentencepiece.{args.SP}.{args.language}"
        # if model is already trained, load model
        if os.path.isfile(modelname):
            trained_model = False
            sp = spm.SentencePieceProcessor(model_file=modelname)

        else:
            # model needs to be trained
            trained_model = True
            sp_args = [
                f"--input={args.file}", f"--model_prefix={args.language}",
                f"--vocab_size={args.SP}", "--bos_id=-1", "--eos_id=-1"
            ]

            # limit the number of input sentences used for training
            if args.n != 0:
                sp_args.append(f"--input_sentence_size={args.n}")

            # train and load model
            spm.SentencePieceTrainer.train(" ".join(sp_args))
            sp = spm.SentencePieceProcessor(
                model_file=f"{args.language}.model")

        outputfile = Path(f"{path}/{name}.processed.{suffix}")

        # tokenize file
        with open(Path(args.file), "r", encoding="utf-8") as infile, \
                open(outputfile, "w", encoding="utf-8") as ofile:
            for i, line in enumerate(infile):
                encoded = sp.encode(line.strip(), out_type=str)
                ofile.write(f"{' '.join(encoded)}\n")
                print(f"Preprocessing: line {i:,}", end="\r")

        # if a new model was trained, move model and vocab to the directory
        # where the input file (and output file) are also saved
        if trained_model is True:
            os.rename(f"{args.language}.model", modelname)
            os.rename(f"{args.language}.vocab",
                      f"{path}/vocab.{args.language}")
    print(" " * 50, end="\r")
    print("Preprocessing: complete")
Example #5
def replace_numbers(args):
    reference = Path(args.reference)
    translation = Path(args.translation)

    # compile overly complicated number regex
    number = re.compile(r"(?<=\s)\d[\d,'.]*\b")

    path, name, suffix = split_filename(args.translation)
    ofile = open(Path(f"{path}/{name}.numbered.{suffix}"), "w", encoding="utf-8")

    with open(reference, "r", encoding="utf-8") as r_file, \
            open(translation, "r", encoding="utf-8") as t_file:
        for i, (line_r, line_t) in enumerate(zip(r_file, t_file)):
            # extract numbers from reference
            original_numbers = re.findall(number, line_r)

            # get <num> placeholders from translation
            place_holder = re.findall(r"<num>", line_t)

            if (len(original_numbers) > 0 and
                    len(original_numbers) == len(place_holder)):
                sen = list()
                num_idx = 0
                for token in line_t.split():
                    if "<num>" in token:
                        # replace <num> with the corresponding reference number
                        numbered = token.replace(
                            "<num>", original_numbers[num_idx]
                        )
                        sen.append(numbered)
                        num_idx += 1
                    else:
                        sen.append(token)

            else:
                # no numbers or numbers don't match
                # leave translation as it is
                sen = line_t.strip().split()

            line = " ".join(sen)

            ofile.write(f"{line}\n")
            print(f"Replacing numbers: line {i:,}", end="\r")

    ofile.close()
    print(" " * 50, end="\r")
    print("Replacing numbers: complete")
Example #6
    def run(self):
        to_train = list()

        # collect trained processors
        for processor in self.trainable:
            if processor.trained:
                self.pipe.append(processor)
            else:
                to_train.append(processor)

        pipe_string = " > ".join([str(i) for i in self.pipe])
        complete = " > ".join([str(i) for i in self.pipe + to_train])
        print(f"Pipe: {complete}\n")
        t_0 = time.time()

        print(f"Applying: {pipe_string}")
        # apply untrainable and already trained processors
        with open(Path(self.filename), "r", encoding="utf-8") as infile,\
                open(self.temp_file, "w", encoding="utf-8") as ofile:
            for i, line in enumerate(infile):

                for processor in self.pipe:
                    if not isinstance(processor, Truecaser):
                        line = processor(line)

                ofile.write(f"{line}\n")
                print(f"Preprocessing: line {i:,}", end="\r")

        print(" " * 50, end="\r")
        t_1 = time.time()
        ts = int(t_1 - t_0)
        print(f"Timestamp: {datetime.timedelta(seconds=ts)}\n")

        # train and apply untrained processors
        for processor in to_train:
            print(f"Applying: {processor}")
            self.apply_trainable(processor)

            t_1 = time.time()
            ts = int(t_1 - t_0)
            print(" " * 50, end="\r")
            print(f"Timestamp: {datetime.timedelta(seconds=ts)}\n")

        path, name, suffix = split_filename(self.filename)
        # rename last output file
        os.rename(self.temp_file, Path(f"{path}/{name}.processed.{suffix}"))
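
run() only assumes that every entry in self.pipe is a string-in, string-out callable with a readable __str__ (used for the "Pipe: ..." banner), and that trainable processors additionally carry a boolean trained flag. A hypothetical processor meeting that contract:

class ExampleProcessor:
    # hypothetical processor illustrating the assumed interface
    trained = True

    def __call__(self, line):
        return line.strip()

    def __str__(self):
        return "ExampleProcessor"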
Example #7
def clean(args):
    # get arguments
    if not 0 < len(args.file) < 3:
        raise InvalidArgument(
            "You can only pass either one or two files"
        )

    files = [Path(name) for name in args.file]

    dexmler = Dexmler()
    cleaner = Cleaner(
        min_len=args.min_len,
        max_len=args.max_len,
        ratio=args.ratio
    )

    # create output files
    output_files = list()
    for filepath in files:
        path, name, suffix = split_filename(str(filepath))
        output_files.append(
            open(Path(f"{path}/{name}.clean.{suffix}"), "w", encoding="utf-8")
        )

    # open read files
    input_files = list()
    for filepath in files:
        input_files.append(open(filepath, "r", encoding="utf-8"))

    for i, lines in enumerate(zip(*input_files)):
        # clean files
        dexmled = dexmler(*lines)
        cleaned = cleaner(*dexmled)

        if all(part != "" for part in cleaned):
            for line, ofile in zip(cleaned, output_files):
                ofile.write(f"{line}\n")

        print(f"Cleaning: line {i:,}", end="\r")

    # close all files
    for open_file in output_files + input_files:
        open_file.close()

    print(" "*50, end="\r")
    print("Cleaning: complete")
Example #8
def chunk(args):
    input_file = Path(args.file)
    max_len = args.n

    path, name, suffix = split_filename(str(input_file))
    output_file = Path(f"{path}/{name}.chunked.{suffix}")

    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as ofile:
        for i, line in enumerate(infile):
            chunks = slice_list(line.strip().split(), max_len)
            for sen in chunks:
                to_write = " ".join(sen)
                ofile.write(f"{to_write}\n")

            print(f"Chunking: line {i:,}", end="\r")

    print(" "*50, end="\r")
    print("Chunking: complete")
Example #9
def normalize(args):
    path, name, suffix = split_filename(args.file)

    output_file = Path(f"{path}/{name}.normalized.{suffix}")

    if args.sp_model is not None:
        sp = spm.SentencePieceProcessor(model_file=args.sp_model)
    else:
        truecaser = Truecaser(suffix, path)
        detok = Detokenizer(suffix)
        subword_regex = re.compile(r"@@( |$)")

    with open(Path(args.file), "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as ofile:
        for i, line in enumerate(infile):
            if args.sp_model is not None:
                # file was encoded with sentencepiece
                to_write = sp.decode(line.strip().split())
                to_write = to_write.replace("⁇", "<unk>")

            else:
                # undo subword splitting
                if args.subword is True:
                    line = re.sub(subword_regex, "", line)

                # truecase
                line = truecaser(line)

                # detokenize
                to_write = detok(line)

            if args.upper is True:
                to_write = to_write.capitalize()

            # write output
            ofile.write(f"{to_write}\n")

            print(f"Normalizing: line {i:,}", end="\r")

    print(" " * 50, end="\r")
    print("Normalizing: complete")
Example #10
    def __init__(self, filename, language, bpe, remove_nums, max_lines):
        self.filename = filename
        self.max_lines = max_lines
        path, name, suffix = split_filename(filename)
        self.path = path
        self.temp_file = Path(f"{path}/temp.{language}")
        self.language = language

        self.pipe = [PunctNormalizer(language), Tokenizer(language)]

        if remove_nums is True:
            self.pipe.append(NumReplacer())

        tr = Truecaser(language, path)
        lc = LowerCaser(tr.trained)

        self.trainable = [
            tr,
            lc  # must be applied after training Truecaser
        ]

        # add bpe splitter to trainable processors
        if bpe is not None:
            self.trainable.append(SubwordSplitter(language, bpe, path))
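
A hypothetical way to build and run the pipeline defined in Examples #6 and #10; the positional arguments mirror what preprocess passes in (filename, language, bpe, remove_nums, max_lines):

# hypothetical invocation: normalize, tokenize, truecase and BPE-split
# a German corpus, with 32000 as the (assumed) BPE merge/vocabulary size
# and no line limit
pipe = Pipeline("data/train.de", "de", 32000, True, 0)
pipe.run()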