def build_vocab(args):
    n_sample = int(args.n_sample)
    l1_file = Path(f"{args.file1}")
    l2_file = Path(f"{args.file2}")

    # get suffixes from filenames
    path1, name1, suff1 = split_filename(args.file1)
    path2, name2, suff2 = split_filename(args.file2)

    # instantiate vocab objects
    voc1 = Vocab(suff1, args.min_freq)
    voc2 = Vocab(suff2, args.min_freq)

    with open(l1_file, "r", encoding="utf-8") as srcvoc, \
            open(l2_file, "r", encoding="utf-8") as tgtvoc:
        for i, (l1, l2) in enumerate(zip(srcvoc, tgtvoc)):
            # add line to vocabs
            voc1.add_sentence(l1.strip())
            voc2.add_sentence(l2.strip())
            print(f"Building vocabulary: line {i + 1:,}", end="\r")

            if n_sample != 0:
                if i > n_sample:
                    break

    voc1.save_voc(path1)
    voc2.save_voc(path2)
    print(" "*50, end="\r")
    print("Building vocabulary: complete")
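
# Illustrative usage sketch, not part of the original module: build_vocab
# expects an argparse-style namespace. The attribute names below mirror the
# accesses in the function above; the file paths and values are hypothetical,
# and SimpleNamespace merely stands in for parsed CLI arguments.
def _example_build_vocab():
    from types import SimpleNamespace

    example_args = SimpleNamespace(
        file1="data/corpus.en",   # hypothetical source-language file
        file2="data/corpus.de",   # hypothetical target-language file
        min_freq=2,               # frequency threshold handed to Vocab
        n_sample=0,               # 0 means: read the whole corpus
    )
    build_vocab(example_args)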
def split_single(filename, train_n, eval_n, test_n, verbose):
    path, name, suffix = split_filename(str(filename))

    # open one output file per split
    to_write = ["train", "eval", "test"]
    outputfiles = list()
    for subpart in to_write:
        outputfiles.append(
            open(Path(f"{path}/{subpart}.{suffix}"), "w", encoding="utf-8"))

    limits = [train_n, eval_n, test_n]
    o_index = 0
    current_limit = limits[o_index]
    ofile = outputfiles[o_index]

    with open(filename, "r", encoding="utf-8") as infile:
        for i, line in enumerate(infile):
            ofile.write(line)
            if i == current_limit - 1:
                # switch to the next output file
                o_index += 1
                if o_index == len(limits):
                    break
                current_limit += limits[o_index]
                ofile = outputfiles[o_index]
            if verbose is True:
                print(f"Splitting dataset: line {i:,}", end="\r")

    for out_file in outputfiles:
        out_file.close()
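
# Illustrative usage sketch, not part of the original module: as written
# above, split_single copies the first train_n lines into train.<suffix>,
# the next eval_n lines into eval.<suffix> and the following test_n lines
# into test.<suffix>, all next to the input file. The path and counts below
# are hypothetical.
def _example_split_single():
    split_single(
        "data/corpus.clean.en",  # hypothetical input file
        train_n=800_000,
        eval_n=5_000,
        test_n=5_000,
        verbose=True,
    )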
def translate(args):
    inputfile = Path(args.file)
    beam_size = int(args.beam)

    # pick device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cpu = os.cpu_count()
    torch.set_num_threads(cpu)

    checkpoint = torch.load(Path(args.model), map_location=device)
    model = checkpoint["model"]

    # transfer model and set eval mode
    model.to(device)
    model.eval()

    # print model
    print(model)

    path, name, suffix = split_filename(str(inputfile))

    # start translating
    outputfile = Path(f"{path}/{name}.translated.{model.tgt_lang.name}")
    with open(inputfile, "r", encoding="utf-8") as infile, \
            open(outputfile, "w", encoding="utf-8") as outfile:
        for progress, line in enumerate(infile):
            line = line.strip()
            hypotheses = model.beam_search(
                line, beam_size, device, args.alpha)

            # if verbose print all hypotheses
            if args.verbose:
                for hyp in hypotheses:
                    indeces = hyp.get_indeces()
                    tokens = model.tgt_lang.idx2toks(indeces.tolist())
                    print(tokens)

            # get indeces of best hypothesis
            indeces = hypotheses[0].get_indeces()
            tokens = model.tgt_lang.idx2toks(indeces.tolist())

            # remove SOS and EOS
            tokens = filter(lambda x: x not in {"<eos>", "<sos>"}, tokens)
            translated = " ".join(tokens)

            # write decoded sentence to output file
            outfile.write(f"{translated}\n")

            if args.verbose:
                print()
            else:
                print(f"Translating: line {progress + 1:,}", end="\r")

    print(" " * 50, end="\r")
    print("Translating: complete")
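
# Assumption, not taken from the model code: args.alpha is handed to
# model.beam_search above. If it plays the usual role of a length-normalization
# exponent (as in GNMT-style beam search), hypothesis scores are divided by a
# penalty like the sketch below, so longer outputs are not punished simply for
# accumulating more log-probabilities. The helper name is hypothetical.
def _length_penalty_sketch(hyp_len, alpha):
    # alpha = 0 disables normalization; larger alpha favours longer outputs
    return ((5 + hyp_len) / 6) ** alpha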
def preprocess(args):
    if args.SP is None:
        pipe = Pipeline(
            args.file, args.language, args.bpe, args.replace_nums, args.n)
        pipe.run()
    else:
        path, name, suffix = split_filename(args.file)
        modelname = f"{path}/model.sentencepiece.{args.SP}.{args.language}"

        # if model is already trained, load model
        if os.path.isfile(modelname):
            trained_model = False
            sp = spm.SentencePieceProcessor(model_file=modelname)
        else:
            # model needs to be trained
            trained_model = True
            sp_args = [
                f"--input={args.file}",
                f"--model_prefix={args.language}",
                f"--vocab_size={args.SP}",
                "--bos_id=-1",
                "--eos_id=-1"
            ]
            # limit the number of training sentences
            if args.n != 0:
                sp_args.append(f"--input_sentence_size={args.n}")

            # train and load model
            spm.SentencePieceTrainer.train(" ".join(sp_args))
            sp = spm.SentencePieceProcessor(
                model_file=f"{args.language}.model")

        outputfile = Path(f"{path}/{name}.processed.{suffix}")

        # tokenize file
        with open(Path(args.file), "r", encoding="utf-8") as infile, \
                open(outputfile, "w", encoding="utf-8") as ofile:
            for i, line in enumerate(infile):
                encoded = sp.encode(line.strip(), out_type=str)
                ofile.write(f"{' '.join(encoded)}\n")
                print(f"Preprocessing: line {i:,}", end="\r")

        # if a new model was trained, move model and vocab to the directory
        # where the input file (and output file) are also saved
        if trained_model is True:
            os.rename(f"{args.language}.model", modelname)
            os.rename(
                f"{args.language}.vocab", f"{path}/vocab.{args.language}")

    print(" " * 50, end="\r")
    print("Preprocessing: complete")
def replace_numbers(args):
    reference = Path(args.reference)
    translation = Path(args.translation)

    # compile overly complicated number regex
    number = re.compile(r"(?<=\s)\d[\d,'.]*\b")

    path, name, suffix = split_filename(args.translation)
    ofile = open(
        Path(f"{path}/{name}.numbered.{suffix}"), "w", encoding="utf-8")

    with open(reference, "r", encoding="utf-8") as r_file, \
            open(translation, "r", encoding="utf-8") as t_file:
        for i, (line_r, line_t) in enumerate(zip(r_file, t_file)):
            # extract numbers from reference
            original_numbers = re.findall(number, line_r)
            # get <num> placeholders from translation
            place_holder = re.findall(r"<num>", line_t)

            if (len(original_numbers) > 0
                    and len(original_numbers) == len(place_holder)):
                sen = list()
                # use a separate counter for the extracted numbers so the
                # line counter i is not overwritten
                num_idx = 0
                for token in line_t.split():
                    if "<num>" in token:
                        # replace <num> with real number
                        numbered = token.replace(
                            "<num>", original_numbers[num_idx]
                        )
                        sen.append(numbered)
                        num_idx += 1
                    else:
                        sen.append(token)
            else:
                # no numbers or numbers don't match:
                # leave translation as it is
                sen = line_t.strip().split()

            line = " ".join(sen)
            ofile.write(f"{line}\n")
            print(f"Replacing numbers: line {i:,}", end="\r")

    ofile.close()
    print(" " * 50, end="\r")
    print("Replacing numbers: complete")
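
# Illustrative sketch, not part of the original module: a quick demonstration
# of what the "overly complicated number regex" above matches, namely a digit
# preceded by whitespace and continued by digits, commas, apostrophes or dots
# (e.g. Swiss-style 1'000 or 2,500.75).
def _example_number_regex():
    number = re.compile(r"(?<=\s)\d[\d,'.]*\b")
    sample = "the price rose from 1'000 to 2,500.75 dollars"
    return re.findall(number, sample)  # ["1'000", "2,500.75"]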
def run(self):
    to_train = list()
    # collect trained processors
    for processor in self.trainable:
        if processor.trained:
            self.pipe.append(processor)
        else:
            to_train.append(processor)

    pipe_string = " > ".join([str(i) for i in self.pipe])
    complete = " > ".join([str(i) for i in self.pipe + to_train])
    print(f"Pipe: {complete}\n")

    t_0 = time.time()
    print(f"Applying: {pipe_string}")

    # apply untrainable and already trained processors
    with open(Path(self.filename), "r", encoding="utf-8") as infile, \
            open(self.temp_file, "w", encoding="utf-8") as ofile:
        for i, line in enumerate(infile):
            for processor in self.pipe:
                if not isinstance(processor, Truecaser):
                    line = processor(line)
            ofile.write(f"{line}\n")
            print(f"Preprocessing: line {i:,}", end="\r")

    print(" " * 50, end="\r")
    t_1 = time.time()
    ts = int(t_1 - t_0)
    print(f"Timestamp: {datetime.timedelta(seconds=ts)}\n")

    # train and apply untrained processors
    for processor in to_train:
        print(f"Applying: {processor}")
        self.apply_trainable(processor)
        t_1 = time.time()
        ts = int(t_1 - t_0)
        print(" " * 50, end="\r")
        print(f"Timestamp: {datetime.timedelta(seconds=ts)}\n")

    path, name, suffix = split_filename(self.filename)
    # rename last output file
    os.rename(self.temp_file, Path(f"{path}/{name}.processed.{suffix}"))
def clean(args):
    # check number of input files
    if not 0 < len(args.file) < 3:
        raise InvalidArgument(
            "You can only pass either one or two files"
        )

    files = [Path(name) for name in args.file]
    dexmler = Dexmler()
    cleaner = Cleaner(
        min_len=args.min_len, max_len=args.max_len, ratio=args.ratio
    )

    # create output files
    output_files = list()
    for filepath in files:
        path, name, suffix = split_filename(str(filepath))
        output_files.append(
            open(Path(f"{path}/{name}.clean.{suffix}"), "w",
                 encoding="utf-8")
        )

    # open input files
    input_files = list()
    for filepath in files:
        input_files.append(open(filepath, "r", encoding="utf-8"))

    for i, lines in enumerate(zip(*input_files)):
        # clean files
        dexmled = dexmler(*lines)
        cleaned = cleaner(*dexmled)
        # only keep line pairs where no side came back empty
        if all(part != "" for part in cleaned):
            for line, ofile in zip(cleaned, output_files):
                ofile.write(f"{line}\n")
        print(f"Cleaning: line {i:,}", end="\r")

    # close all files
    for open_file in output_files + input_files:
        open_file.close()

    print(" "*50, end="\r")
    print("Cleaning: complete")
def chunk(args):
    input_file = Path(args.file)
    max_len = args.n
    path, name, suffix = split_filename(str(input_file))
    output_file = Path(f"{path}/{name}.chunked.{suffix}")

    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as ofile:
        for i, line in enumerate(infile):
            chunks = slice_list(line.strip().split(), max_len)
            for sen in chunks:
                to_write = " ".join(sen)
                ofile.write(f"{to_write}\n")
            print(f"Chunking: line {i:,}", end="\r")

    print(" "*50, end="\r")
    print("Chunking: complete")
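
# Assumption, not the project's actual helper: slice_list is not shown in this
# file. Judging from its use above, it cuts a token list into consecutive
# pieces of at most max_len tokens; a minimal version could look like this.
def _slice_list_sketch(tokens, max_len):
    return [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]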
def normalize(args):
    path, name, suffix = split_filename(args.file)
    outputfile = Path(f"{path}/{name}.normalized.{suffix}")

    if args.sp_model is not None:
        sp = spm.SentencePieceProcessor(model_file=args.sp_model)
    else:
        truecaser = Truecaser(suffix, path)
        detok = Detokenizer(suffix)
        subword_regex = re.compile(r"@@( |$)")

    with open(Path(args.file), "r", encoding="utf-8") as infile, \
            open(outputfile, "w", encoding="utf-8") as ofile:
        for i, line in enumerate(infile):
            if args.sp_model is not None:
                # file was encoded with sentencepiece
                to_write = sp.decode(line.strip().split())
                to_write = to_write.replace("⁇", "<unk>")
            else:
                # undo subword splitting
                if args.subword is True:
                    line = re.sub(subword_regex, "", line)
                # truecase
                line = truecaser(line)
                # detokenize
                to_write = detok(line)

            if args.upper is True:
                to_write = to_write.capitalize()

            # write output
            ofile.write(f"{to_write}\n")
            print(f"Normalizing: line {i:,}", end="\r")

    print(" " * 50, end="\r")
    print("Normalizing: complete")
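
# Illustrative sketch, not part of the original module: the subword regex used
# above strips the "@@ " continuation markers left behind by BPE splitting.
def _example_undo_bpe():
    line = "the new@@ est mod@@ els"
    return re.sub(r"@@( |$)", "", line)  # "the newest models"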
def __init__(self, filename, language, bpe, remove_nums, max_lines):
    self.filename = filename
    self.max_lines = max_lines

    path, name, suffix = split_filename(filename)
    self.path = path
    self.temp_file = Path(f"{path}/temp.{language}")
    self.language = language

    self.pipe = [PunctNormalizer(language), Tokenizer(language)]
    if remove_nums is True:
        self.pipe.append(NumReplacer())

    tr = Truecaser(language, path)
    lc = LowerCaser(tr.trained)
    self.trainable = [
        tr,
        lc  # must be applied after training Truecaser
    ]

    # add bpe splitter to trainable processors
    if bpe is not None:
        self.trainable.append(SubwordSplitter(language, bpe, path))
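
# Illustrative usage sketch, not part of the original module: with the
# constructor above, a hypothetical English corpus processed with number
# replacement and BPE ends up with
#   pipe      = [PunctNormalizer, Tokenizer, NumReplacer] and
#   trainable = [Truecaser, LowerCaser, SubwordSplitter].
# All values below are made up.
def _example_pipeline():
    pipeline = Pipeline(
        "data/corpus.en",   # hypothetical input file
        "en",               # language code
        bpe=32000,          # subword parameter handed to SubwordSplitter
        remove_nums=True,   # insert the NumReplacer step
        max_lines=0,        # 0 presumably means: no line limit
    )
    pipeline.run()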