Example #1
    def extract_functions_and_apply_bpe(self,
                                        lang_executor=None,
                                        function_executor=None,
                                        bpe_executor=None):
        print("extract functions ... ")
        if lang_executor is None:
            lang_executor = LocalExecutor()
        jobs = [
            lang_executor.submit(lang.extract_functions, self.keep_comments,
                                 self.test_size, function_executor)
            for lang in self.langs
        ]
        for job in jobs:
            job.result()

        for split in ['test', 'valid']:
            for f_type in ['functions_standalone', 'functions_class']:
                truncate_files(
                    l.folder.joinpath(f'{split}{self.suffix}.{f_type}.tok')
                    for l in self.langs)

        print("apply bpe on train ... ")
        self.apply_bpe(f'train{self.suffix}.[01234567].functions_*.tok',
                       use_vocab=False,
                       executor=bpe_executor)
        print("apply bpe on test and valid ...")
        self.apply_bpe(f'test{self.suffix}.functions_*.tok',
                       use_vocab=False,
                       executor=bpe_executor)
        self.apply_bpe(f'valid{self.suffix}.functions_*.tok',
                       use_vocab=False,
                       executor=bpe_executor)
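Every example on this page leans on the same executor protocol: submit(fn, *args) returns a job object whose result() blocks until the work is done, and some examples also use a map_array batch variant. A minimal synchronous stand-in, sketched here purely to illustrate that assumed interface (the real LocalExecutor is not shown on this page), runs everything inline:

    class _SyncJob:
        # Wraps an already-computed value behind the job interface.
        def __init__(self, value):
            self._value = value

        def result(self):
            return self._value

    class SyncExecutor:
        # Hypothetical drop-in for LocalExecutor: runs calls eagerly, inline.
        def submit(self, fn, *args, **kwargs):
            return _SyncJob(fn(*args, **kwargs))

        def map_array(self, fn, *iterables):
            # One job per aligned tuple of arguments, matching the
            # executor.map_array usage in Examples #3 and #10.
            return [_SyncJob(fn(*args)) for args in zip(*iterables)]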
Example #2
 def process(self,
             keep_comments,
             tok_executor=None,
             test_size=1000,
             split_executor=None):
     suffix = '.with_comments' if keep_comments else ''
     print(f"{self.l}: process ...")
     self.process_json_and_tok(keep_comments, tok_executor)
     if (all(
             self.folder.joinpath(f'train{suffix}.{n}.tok').is_file()
             for n in range(8))
             and self.folder.joinpath(f'test{suffix}.tok').is_file()
             and self.folder.joinpath(f'valid{suffix}.tok').is_file()):
         print(f"{self.l}: train, test and valid for already exist. ")
         nlines = 8 * \
                  get_nlines(self.folder.joinpath(f'train{suffix}.{0}.tok'))
         size_gb = 8 * \
                   self.folder.joinpath(f'train{suffix}.{0}.tok').stat().st_size
     else:
         print(f"{self.l}: split train, test and valid ... ")
         if split_executor is None:
             split_executor = LocalExecutor()
         job = split_executor.submit(self.split_train_test_valid,
                                     keep_comments, test_size)
         nlines, size_gb = job.result()
      print(
          f"{self.l}: train is {nlines} lines and {size_gb / (1024 ** 3):.2f} GB."
      )
     # nlines, size = self.split_train_test_valid(keep_comments, test_size)
     return nlines, size_gb
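Two details worth flagging: despite its name, size_gb holds bytes (st_size scaled up by the eight shards), and the division by 1024³ happens only at print time. A hedged usage sketch, assuming lang is an instance of the class this method belongs to:

    # Hypothetical call: tokenize, split, and report corpus size.
    nlines, size_bytes = lang.process(keep_comments=False, test_size=1000)
    print(f"train: {nlines} lines, {size_bytes / (1024 ** 3):.2f} GB")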
Example #3
 def extract_docstrings(self, keep_comments, test_size=1000, executor=None):
     if executor is None:
         executor = LocalExecutor()
     suffix = '.with_comments' if keep_comments else ''
     files = list(
         self.folder.glob(f'train{suffix}.[01234567].functions_class.tok'))
     files += list(
         self.folder.glob(
             f'train{suffix}.[01234567].functions_standalone.tok'))
     files.append(self.folder.joinpath(f'test{suffix}.functions_class.tok'))
     files.append(
         self.folder.joinpath(f'test{suffix}.functions_standalone.tok'))
     files.append(
         self.folder.joinpath(f'valid{suffix}.functions_class.tok'))
     files.append(
         self.folder.joinpath(f'valid{suffix}.functions_standalone.tok'))
     toks = [
         tok for tok in files
         if not (tok.with_suffix('.DS-f.ds.tok').is_file()
                 and tok.with_suffix('.DS-f.f.tok').is_file())
     ]
     if len(toks) > 0:
         jobs = executor.map_array(extract_docstrings, toks,
                                   itertools.repeat(self.l))
         for job in jobs:
             job.result()
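Note that the extract_docstrings handed to map_array is not a recursive call: inside the method body, the bare name resolves to a module-level function of the same name (the method itself is only reachable through self), so a top-level worker extract_docstrings(path, lang) is assumed to exist elsewhere in the module.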
Example #4
    def process_languages(
        self, lang_executor=None, tok_executor=None, split_executor=None
    ):
        if lang_executor is None:
            lang_executor = LocalExecutor()

        if self.lang_pair is not None:
            self.lang_pair.process_json_and_tok(
                self.keep_comments,
                self.extract_mode,
                lang_executor if tok_executor is None else tok_executor,
            )
        else:
            jobs = [
                lang_executor.submit(
                    lang.process_json_and_tok,
                    self.keep_comments,
                    self.extract_mode,
                    tok_executor,
                )
                for lang in self.langs
            ]
            for job in jobs:
                job.result()

        jobs = [
            lang_executor.submit(
                lang.split_train_test_valid, self.keep_comments, self.test_size
            )
            for lang in self.langs
        ]
        for i, lang in enumerate(self.langs):
            self.sizes[lang.l] = jobs[i].result()
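self.sizes is keyed by the language identifier lang.l and stores whatever split_train_test_valid returns; judging by Example #2, that is an (nlines, size) pair, so a caller might read it back as follows (object and key names hypothetical):

    # Hypothetical lookup after process_languages() has run:
    nlines, size_bytes = dataset.sizes["python"]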
Example #5
 def binarize_for_XLM(self, files_regex, executor=None):
     print(f"binarize {files_regex} ...")
     if executor is None:
         executor = LocalExecutor()
     jobs = []
     for l in self.langs:
         for f in self.folder.glob(f"{l.l}.{files_regex}"):
             if not Path(str(f) + ".pth").is_file():
                 print(f"binarizing {f} ...")
                 jobs.append(executor.submit(binarize_for_XLM_file, f, self.vocab))
     for job in jobs:
         job.result()
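A usage sketch for binarize_for_XLM. Since the method prefixes each pattern with the language name, a pattern like the one below would pick up the {lang}.{shard}.bpe files that Example #7 writes (the exact pattern is an assumption):

    # Hypothetical: binarize every BPE'd train shard for all languages.
    dataset.binarize_for_XLM("train.[01234567].functions_*.bpe")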
Example #6
 def process_languages(self,
                       lang_executor=None,
                       tok_executor=None,
                       split_executor=None):
     if lang_executor is None:
         lang_executor = LocalExecutor()
     jobs = [
         lang_executor.submit(lang.process, self.keep_comments,
                              tok_executor, self.test_size, split_executor)
         for lang in self.langs
     ]
     for i, lang in enumerate(self.langs):
         self.sizes[lang.l] = jobs[i].result()
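This variant does the same bookkeeping as Example #4 but delegates the whole tokenize-and-split pipeline to lang.process (Example #2), whose (nlines, size_gb) return value is what lands in self.sizes.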
Example #7
 def apply_bpe(self, files_regex, use_vocab=False, executor=None):
     vocab = "" if use_vocab is False else self.vocab
     if executor is None:
         executor = LocalExecutor()
     jobs = []
     for l in self.langs:
         for f in l.folder.glob(files_regex):
             out = self.folder.joinpath(f"{l.l}.{f.name}").with_suffix(".bpe")
             if not out.is_file():
                 print(f"apply bpe on {f} ...")
                 jobs.append(
                     executor.submit(apply_bpe_file, f, out, self.codes, vocab)
                 )
     for job in jobs:
         job.result()
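One naming detail: with_suffix(".bpe") swaps only the final extension, so the output keeps the rest of the shard name. A worked example with a hypothetical file name:

    from pathlib import Path

    out = Path("python.train.0.functions_class.tok").with_suffix(".bpe")
    # -> python.train.0.functions_class.bpe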
Example #8
    def extract_functions(self, lang_executor=None, function_executor=None):
        print("extract functions ... ")
        if lang_executor is None:
            lang_executor = LocalExecutor()
        jobs = [
            lang_executor.submit(lang.extract_functions, self.keep_comments,
                                 self.test_size, function_executor)
            for lang in self.langs
        ]
        for job in jobs:
            job.result()

        for split in ['test', 'valid']:
            for f_type in ['functions_standalone', 'functions_class']:
                truncate_files(
                    l.folder.joinpath(f'{split}{self.suffix}.{f_type}.tok')
                    for l in self.langs)
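As in Example #1, the closing loop hands the per-language test and valid files to truncate_files; that helper is not shown on this page, but presumably it trims the files to a common line count so the evaluation splits stay parallel across languages.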
Example #9
    def process_json_and_tok(self, keep_comments, extract_mode, executor=None):
        if executor is None:
            executor = LocalExecutor()
        suffix = ".with_comments" if keep_comments else ""
        jsons = list(self.folder.glob("*.[0-9][0-9][0-9].json.gz"))
        assert (
            len(jsons) > 0
        ), f"there is no *.[0-9][0-9][0-9].json.gz in {str(self.folder)}"

        jsons = [
            json
            for json in jsons
            if not json.with_suffix("").with_suffix(suffix + ".tok.json").is_file()
        ]
        print(f"{self.lang1}-{self.lang2}: processing {len(jsons)} json files ...")
        if len(jsons) > 0:
            jobs = map_array(
                executor,
                process_language_pair_json,
                jsons,
                itertools.repeat(self.lang1),
                itertools.repeat(self.lang2),
                itertools.repeat(keep_comments),
                itertools.repeat(extract_mode),
            )
            for job in jobs:
                job.result()

        # join
        all_tok = self.folder.joinpath(f"all{suffix}.tok.json")
        if not all_tok.is_file():
            command = (
                f"cd {self.folder}; cat *.[0-9][0-9][0-9]{suffix}.tok.json > {all_tok}"
            )
            proc = subprocess.run(
                command,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                executable="/bin/bash",
            )

            # shuf
            shuf_file(all_tok)

        # extract to language toks
        jobs = map_array(
            executor,
            select_toks_json,
            [self.lang1, self.lang2],
            itertools.repeat(all_tok),
            [
                self.folder_lang1.joinpath(f"all{suffix}.tok"),
                self.folder_lang2.joinpath(f"all{suffix}.tok"),
            ],
        )
        for job in jobs:
            job.result()
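Unlike Example #10, which calls executor.map_array directly, this example goes through a free map_array(executor, fn, *iterables) helper whose body is not shown here. A minimal sketch consistent with the call sites in Examples #9 and #11 would simply forward to the executor:

    def map_array(executor, fn, *iterables):
        # Assumed thin wrapper: delegate to the executor's batch API.
        return executor.map_array(fn, *iterables)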
Example #10
 def process_json_and_tok(self, keep_comments, executor=None):
     if executor is None:
         executor = LocalExecutor()
     suffix = '.with_comments' if keep_comments else ''
     assert len(list(self.folder.glob(
         '*.json.gz'))) > 0, f"there is no json in {str(self.folder)}"
     jsons = [
         json for json in self.folder.glob('*.json.gz')
         if not Path(str(json).replace('.json.gz', suffix +
                                       '.tok')).is_file()
     ]
     print(f"{self.l}: tokenizing {len(jsons)} json files ...")
     if len(jsons) > 0:
         jobs = executor.map_array(process_and_tokenize_json_file, jsons,
                                   itertools.repeat(self.l),
                                   itertools.repeat(keep_comments))
         for job in jobs:
             job.result()
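Example #11 below is a later revision of this method: it adds an extract_mode argument, routes submission through the free map_array helper, and appends a join-and-shuffle stage. The done-check here rewrites the extension by plain string replacement, so with comments kept, a shard x.000.json.gz is considered processed once x.000.with_comments.tok exists.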
Example #11
    def process_json_and_tok(self, keep_comments, extract_mode, executor=None):
        print(f"{self.l}: process ...")

        if executor is None:
            executor = LocalExecutor()
        suffix = ".with_comments" if keep_comments else ""
        assert (
            len(list(self.folder.glob("*.json.gz"))) > 0
        ), f"there is no json in {str(self.folder)}"
        jsons = [
            json
            for json in self.folder.glob("*.json.gz")
            if not Path(str(json).replace(".json.gz", suffix + ".tok")).is_file()
        ]
        print(f"{self.l}: tokenizing {len(jsons)} json files ...")
        if len(jsons) > 0:
            jobs = map_array(
                executor,
                process_and_tokenize_json_file,
                jsons,
                itertools.repeat(self.l),
                itertools.repeat(keep_comments),
                itertools.repeat(extract_mode),
            )
            for job in jobs:
                job.result()

        # join
        all_tok = self.folder.joinpath(f"all{suffix}.tok")
        if not all_tok.is_file():
            command = f"cd {self.folder}; cat *.[0-9][0-9][0-9]{suffix}.tok > {all_tok}"
            proc = subprocess.run(
                command,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                executable="/bin/bash",
            )

            # shuf
            shuf_file(all_tok)
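shuf_file is not defined on this page. Given the shell-heavy style of the surrounding code, a plausible sketch (an assumption, not necessarily the repo's actual helper) shuffles the file in place with GNU shuf, which reads its whole input before writing:

    import subprocess

    def shuf_file(path):
        # Assumed behavior: shuffle lines in place via GNU shuf.
        subprocess.run(
            f"shuf {path} -o {path}",
            shell=True,
            executable="/bin/bash",
            check=True,
        )

Note also that neither Example #9 nor #11 inspects proc.returncode after the cat, so a failed join would go unnoticed until the shuffle runs on an incomplete file.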