Esempio n. 1
0
    def split_train_test_valid(self, keep_comments, test_size):
        """Split the tokenized corpus into valid, test and 8 train shards.

        Reads ``all{suffix}.tok`` in ``self.folder`` and writes, for each
        output that does not exist yet:

        * ``valid{suffix}.tok`` -- the first ``test_size`` lines;
        * ``test{suffix}.tok`` -- the following ``test_size`` lines;
        * ``train{suffix}.{0..7}.tok`` -- 8 consecutive, equally sized slices
          of the lines located after the first ``2 * test_size`` lines. All 8
          shards are rewritten as soon as one of them is missing.

        Args:
            keep_comments: when true the ``.with_comments`` file-name suffix
                is used, otherwise no suffix.
            test_size: number of lines of the valid file and of the test file.

        Returns:
            ``(n_lines, size_gb)`` for train shard 0: its line count and its
            size. Despite its name, ``size_gb`` is a byte count; it is only
            converted (``/ 1024 ** 3``) for display.

        Raises:
            ValueError: if fewer than 8 lines are left for the train shards.
            subprocess.CalledProcessError: if a shell split command fails.
        """
        import shlex  # local import: only needed to quote paths for the shell

        suffix = ".with_comments" if keep_comments else ""
        all_tok = self.folder.joinpath(f"all{suffix}.tok")
        valid_file = self.folder.joinpath(f"valid{suffix}.tok")
        test_file = self.folder.joinpath(f"test{suffix}.tok")
        train_files = [
            self.folder.joinpath(f"train{suffix}.{n}.tok") for n in range(8)
        ]

        # valid and test always occupy the first 2 * test_size lines of
        # all_tok, whether they are written by this call or already existed.
        # The train shards must therefore always be sized from what remains
        # after that prefix; sizing them from "files written in this call"
        # makes the last shards run past the end of the file and duplicate
        # lines of the previous shards.
        n_reserved = 2 * test_size

        source = shlex.quote(str(all_tok))

        def extract(last_line, count, destination):
            # Keep the `count` lines of all_tok that end at line `last_line`.
            target = shlex.quote(str(destination))
            subprocess.run(
                f"cat {source} | head -n {last_line} | tail -n {count} > {target}",
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,  # do not leave a truncated split behind silently
            )

        if not valid_file.is_file():
            print(f"{self.l}: splitting valid ... ")
            extract(test_size, test_size, valid_file)

        if not test_file.is_file():
            print(f"{self.l}: splitting test ... ")
            extract(n_reserved, test_size, test_file)

        if not all(train_file.is_file() for train_file in train_files):
            n_lines = get_nlines(all_tok)
            split_len = (n_lines - n_reserved) // 8
            if split_len < 1:
                raise ValueError(
                    f"{all_tok} has {n_lines} lines: not enough to build 8 "
                    f"train shards after reserving {n_reserved} lines for "
                    f"valid and test"
                )
            print(f"{self.l}: splitting train ({n_lines}) to ({split_len}) ... ")
            for n, train_file in enumerate(train_files):
                extract(n_reserved + (n + 1) * split_len, split_len, train_file)

        first_shard = train_files[0]
        n_lines = get_nlines(first_shard)
        size_gb = first_shard.stat().st_size

        print(f"{self.l}: Finished splitting train, test and valid.")
        print(f"{self.l}: train 0 is {n_lines} lines and {size_gb / (1024 ** 3)} Go. ")
        return n_lines, size_gb
Esempio n. 2
0
 def process(self,
             keep_comments,
             tok_executor=None,
             test_size=1000,
             split_executor=None):
     """Run tokenization, then make sure the train/test/valid split exists.

     First calls ``self.process_json_and_tok(keep_comments, tok_executor)``.
     Then, if ``test{suffix}.tok`` and ``valid{suffix}.tok`` exist together
     with the train data, the split is reused; otherwise
     ``self.split_train_test_valid`` is submitted to ``split_executor``.

     Two train layouts are recognised as "already split":

     * the 8 shards ``train{suffix}.{0..7}.tok`` -- the totals are then
       extrapolated as 8 times shard 0;
     * the single file ``train{suffix}.tok`` -- the layout written by the
       ``split_train_test_valid`` of this file; its own line count and size
       are reported as is.

     Args:
         keep_comments: when true the ``.with_comments`` file-name suffix is
             used, otherwise no suffix.
         tok_executor: forwarded unchanged to ``process_json_and_tok``.
         test_size: forwarded to ``split_train_test_valid``.
         split_executor: object whose ``submit(fn, *args)`` returns a job
             exposing ``result()``; a ``LocalExecutor()`` is created when
             ``None``.

     Returns:
         ``(nlines, size_gb)``. Despite its name, ``size_gb`` is a byte
         count; it is only converted (``/ 1024 ** 3``) for display.
     """
     suffix = '.with_comments' if keep_comments else ''
     print(f"{self.l}: process ...")
     self.process_json_and_tok(keep_comments, tok_executor)

     folder = self.folder
     shards = [folder.joinpath(f'train{suffix}.{n}.tok') for n in range(8)]
     single_train = folder.joinpath(f'train{suffix}.tok')
     eval_files_exist = (folder.joinpath(f'test{suffix}.tok').is_file()
                         and folder.joinpath(f'valid{suffix}.tok').is_file())

     # Pick the train file to measure and the factor used to extrapolate
     # the totals from it; `cached_train` stays None when a split is needed.
     cached_train, n_parts = None, 0
     if eval_files_exist:
         if all(shard.is_file() for shard in shards):
             cached_train, n_parts = shards[0], 8
         elif single_train.is_file():
             cached_train, n_parts = single_train, 1

     if cached_train is not None:
         print(f"{self.l}: train, test and valid for already exist. ")
         nlines = n_parts * get_nlines(cached_train)
         size_gb = n_parts * cached_train.stat().st_size
     else:
         print(f"{self.l}: split train, test and valid ... ")
         if split_executor is None:
             split_executor = LocalExecutor()
         job = split_executor.submit(self.split_train_test_valid,
                                     keep_comments, test_size)
         nlines, size_gb = job.result()
     print(
         f"{self.l}: train for is {nlines} lines and {size_gb / (1024 ** 3)} Go. "
     )
     return nlines, size_gb
    def split_train_test_valid(self, keep_comments, test_size=1000):
        """Regroup, shuffle and split the tokenized files of ``self.folder``.

        Steps:

        1. concatenate every file of ``self.folder`` matching
           ``*[0-4][0-9][0-9]{suffix}.tok`` into ``all{suffix}.tok``;
        2. shuffle ``all{suffix}.tok`` with ``shuf_file``;
        3. write ``valid{suffix}.tok`` (first ``test_size`` lines),
           ``test{suffix}.tok`` (next ``test_size`` lines) and a single
           ``train{suffix}.tok`` holding all the remaining lines.

        Args:
            keep_comments: when true the ``.with_comments`` file-name suffix
                is used, otherwise no suffix.
            test_size: number of lines of the valid file and of the test file.

        Returns:
            ``(n_lines, size_gb)``: line count and size of the regrouped
            ``all{suffix}.tok``, measured before shuffling. Despite its name,
            ``size_gb`` is a byte count.

        Raises:
            ValueError: if no line is left for train once ``2 * test_size``
                lines are reserved for valid and test.
            subprocess.CalledProcessError: if a shell command fails (for
                instance when no input file matches the regroup pattern).
        """
        import shlex  # local import: only needed to quote paths for the shell

        suffix = '.with_comments' if keep_comments else ''
        all_tok = self.folder.joinpath(f'all{suffix}.tok')

        def run_shell(command, **options):
            # check=True: a failed step must not go unnoticed and leave
            # empty or partial split files behind.
            subprocess.run(command,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           check=True,
                           **options)

        # Regroup. Running inside the folder through `cwd` (rather than
        # `cd folder; cat ...`) guarantees that cat never runs from the wrong
        # directory, and redirecting to the bare file name keeps the command
        # valid when self.folder is a relative path.
        run_shell(
            f"cat *[0-4][0-9][0-9]{suffix}.tok > {shlex.quote(all_tok.name)}",
            cwd=str(self.folder),
            executable='/bin/bash')

        size_gb = all_tok.stat().st_size
        n_lines = get_nlines(all_tok)

        reserved = 2 * test_size
        train_len = n_lines - reserved
        if train_len < 1:
            raise ValueError(
                f"{all_tok} has {n_lines} lines: nothing left for train after "
                f"reserving {reserved} lines for valid and test")

        # shuf
        shuf_file(all_tok)

        source = shlex.quote(str(all_tok))

        def target(stem):
            return shlex.quote(str(self.folder.joinpath(f'{stem}{suffix}.tok')))

        # valid: lines [0, test_size); test: lines [test_size, 2 * test_size)
        run_shell(f"cat {source} | head -n {test_size} > {target('valid')}")
        run_shell(f"cat {source} | head -n {reserved} | tail -n {test_size}"
                  f" > {target('test')}")
        # train: everything after the reserved prefix, in one single file
        run_shell(f"cat {source} | head -n {n_lines} | tail -n {train_len}"
                  f" > {target('train')}")

        return n_lines, size_gb