def process_languages(
    self, lang_executor=None, tok_executor=None, split_executor=None
):
    # Tokenize every language's JSON shards and split them into
    # train/test/valid sets, recording the resulting train sizes.
    #
    # NOTE(review): reformatted from a collapsed source line — the exact
    # nesting of the split/size steps relative to the `lang_pair` branch
    # should be confirmed against the original file.
    # `split_executor` is accepted but never used here — presumably kept
    # for signature compatibility with a sibling implementation; verify.
    if lang_executor is None:
        lang_executor = LocalExecutor()
    if self.lang_pair is not None:
        # Paired-language dataset: the pair object drives its own
        # extraction/tokenization.  Falls back to `lang_executor` when
        # no dedicated tokenization executor was given.
        self.lang_pair.process_json_and_tok(
            self.keep_comments,
            self.extract_mode,
            lang_executor if tok_executor is None else tok_executor,
        )
    else:
        # One job per language: extract + tokenize the raw JSON dumps.
        jobs = [
            lang_executor.submit(
                lang.process_json_and_tok,
                self.keep_comments,
                self.extract_mode,
                tok_executor,
            )
            for lang in self.langs
        ]
        # Wait for every tokenization job before splitting.
        for job in jobs:
            job.result()
        # Split each language into train/test/valid and record the
        # per-language train sizes (jobs[i] corresponds to langs[i]).
        jobs = [
            lang_executor.submit(
                lang.split_train_test_valid, self.keep_comments, self.test_size
            )
            for lang in self.langs
        ]
        for i, lang in enumerate(self.langs):
            self.sizes[lang.l] = jobs[i].result()
def process(self, keep_comments, tok_executor=None, test_size=1000, split_executor=None):
    """Tokenize this language's raw JSON dumps, then split them into
    train/test/valid files, reusing an existing split when one is found.

    Returns a ``(nlines, size_in_bytes)`` tuple describing the train set.
    """
    cmt_suffix = '.with_comments' if keep_comments else ''
    print(f"{self.l}: process ...")
    self.process_json_and_tok(keep_comments, tok_executor)

    # The train set is sharded into 8 files; the split is reusable only
    # when every shard plus the test and valid files are already on disk.
    expected = [
        self.folder.joinpath(f'train{cmt_suffix}.{shard}.tok') for shard in range(8)
    ]
    expected.append(self.folder.joinpath(f'test{cmt_suffix}.tok'))
    expected.append(self.folder.joinpath(f'valid{cmt_suffix}.tok'))

    if all(path.is_file() for path in expected):
        print(f"{self.l}: train, test and valid for already exist. ")
        # Estimate totals from shard 0, assuming shards are equally sized.
        shard0 = self.folder.joinpath(f'train{cmt_suffix}.0.tok')
        nlines = 8 * get_nlines(shard0)
        size_gb = 8 * shard0.stat().st_size
    else:
        print(f"{self.l}: split train, test and valid ... ")
        if split_executor is None:
            split_executor = LocalExecutor()
        split_job = split_executor.submit(
            self.split_train_test_valid, keep_comments, test_size
        )
        nlines, size_gb = split_job.result()

    print(
        f"{self.l}: train for is {nlines} lines and {size_gb / (1024 ** 3)} Go. "
    )
    return nlines, size_gb
def extract_functions_and_apply_bpe(self, lang_executor=None, function_executor=None, bpe_executor=None):
    """Extract standalone/class functions for every language, then BPE-encode
    the resulting train/test/valid function files.

    Parameters mirror extract_functions, plus `bpe_executor` for the BPE
    jobs (LocalExecutor is used inside apply_bpe when it is None).
    """
    # The extraction + test/valid truncation steps were an exact duplicate
    # of extract_functions — delegate instead of repeating the body (DRY).
    self.extract_functions(lang_executor, function_executor)
    print("apply bpe on train ... ")
    # Train data is sharded into files 0-7, hence the [01234567] glob.
    self.apply_bpe(
        f'train{self.suffix}.[01234567].functions_*.tok',
        use_vocab=False,
        executor=bpe_executor,
    )
    print("apply bpe on test and valid ...")
    self.apply_bpe(
        f'test{self.suffix}.functions_*.tok', use_vocab=False, executor=bpe_executor
    )
    self.apply_bpe(
        f'valid{self.suffix}.functions_*.tok', use_vocab=False, executor=bpe_executor
    )
def binarize_for_XLM(self, files_regex, executor=None):
    """Binarize every per-language file matching `files_regex` into a
    sibling .pth file, skipping files whose output already exists.

    Jobs are submitted to `executor` (LocalExecutor by default) and all
    of them are waited on before returning.
    """
    print(f"binarize {files_regex} ...")
    executor = LocalExecutor() if executor is None else executor
    pending = []
    for lang in self.langs:
        pattern = f"{lang.l}.{files_regex}"
        for tok_file in self.folder.glob(pattern):
            # Skip work whose binarized output is already on disk.
            if Path(str(tok_file) + ".pth").is_file():
                continue
            print(f"binarizing {tok_file} ...")
            pending.append(
                executor.submit(binarize_for_XLM_file, tok_file, self.vocab)
            )
    for pending_job in pending:
        pending_job.result()
def process_languages(self, lang_executor=None, tok_executor=None, split_executor=None):
    """Run each language's `process` step in parallel on `lang_executor`
    (LocalExecutor by default) and record every language's resulting
    train size in self.sizes, keyed by language name.
    """
    if lang_executor is None:
        lang_executor = LocalExecutor()
    submitted = []
    for lang in self.langs:
        submitted.append(
            lang_executor.submit(
                lang.process,
                self.keep_comments,
                tok_executor,
                self.test_size,
                split_executor,
            )
        )
    # Collect results in submission order; jobs line up with self.langs.
    for lang, job in zip(self.langs, submitted):
        self.sizes[lang.l] = job.result()
def apply_bpe(self, files_regex, use_vocab=False, executor=None):
    """BPE-encode every file matching `files_regex` in each language's
    folder, writing <dataset folder>/<lang>.<name>.bpe and skipping
    outputs that already exist.  Waits on all submitted jobs.
    """
    # The vocab is only consulted when explicitly requested.
    vocab = self.vocab if use_vocab is not False else ""
    if executor is None:
        executor = LocalExecutor()
    pending = []
    for lang in self.langs:
        for tok_file in lang.folder.glob(files_regex):
            bpe_file = self.folder.joinpath(
                f"{lang.l}.{tok_file.name}"
            ).with_suffix(".bpe")
            if bpe_file.is_file():
                continue  # already encoded
            print(f"apply bpe on {tok_file} ...")
            pending.append(
                executor.submit(apply_bpe_file, tok_file, bpe_file, self.codes, vocab)
            )
    for job in pending:
        job.result()
def extract_functions(self, lang_executor=None, function_executor=None):
    """Extract functions for every language in parallel, then truncate the
    per-language test/valid function files to a common length so the eval
    sets stay aligned across languages.
    """
    print("extract functions ... ")
    if lang_executor is None:
        lang_executor = LocalExecutor()
    extraction_jobs = [
        lang_executor.submit(
            lang.extract_functions,
            self.keep_comments,
            self.test_size,
            function_executor,
        )
        for lang in self.langs
    ]
    # Block until every language has finished extracting.
    for extraction_job in extraction_jobs:
        extraction_job.result()
    # For each split/function-type pair, truncate the group of files
    # (one per language) to the same number of lines.
    for split in ('test', 'valid'):
        for f_type in ('functions_standalone', 'functions_class'):
            truncate_files(
                lang.folder.joinpath(f'{split}{self.suffix}.{f_type}.tok')
                for lang in self.langs
            )