Example 1
    def prepare_model(self, model: str, use_latest: bool = False, debug: bool = False, cross_proj: bool = False):
        if not use_latest:
            model_work_dir = self.work_dir / model
        else:
            model_work_dir = self.work_dir / f"{model}-latest"
        # end if
        if cross_proj:
            model_work_dir = Path(f"{model_work_dir}-cross-proj")
        if debug:
            model_work_dir = Path(f"{model_work_dir}-debug")

        IOUtils.mk_dir(model_work_dir)

        if model == "DeepCom":
            from csevo.ml.DeepComRunner import DeepComRunner
            runner = DeepComRunner(model_work_dir, use_latest)
        elif model == "Seq2seq":
            from csevo.ml.Seq2seqRunner import Seq2seqRunner
            runner = Seq2seqRunner(model_work_dir, use_latest)
        elif model == "Seq2seqAtt":
            from csevo.ml.Seq2seqAttRunner import Seq2seqAttRunner
            runner = Seq2seqAttRunner(model_work_dir, use_latest)
        elif model == "DeepCom-SBT":
            from csevo.ml.DeepComSBTRunner import DeepComSBTRunner
            runner = DeepComSBTRunner(model_work_dir, use_latest)
        elif model == "DeepCom-Preorder":
            from csevo.ml.DeepComPreorderRunner import DeepComPreorderRunner
            runner = DeepComPreorderRunner(model_work_dir, use_latest)
        elif model == "Code2Seq":
            from csevo.ml.Code2SeqRunner import Code2SeqRunner
            runner = Code2SeqRunner(model_work_dir, use_latest, debug, cross_proj)
        elif model == "Bi-LSTM":
            from csevo.ml.BiLSTMRunner import BiLSTMRunner
            runner = BiLSTMRunner(model_work_dir, use_latest, debug, cross_proj)
        elif model == "no-split-Bi-LSTM":
            from csevo.ml.NoSplitBiLSTMRunner import BiLSTMRunner
            runner = BiLSTMRunner(model_work_dir, use_latest, debug, cross_proj)
        elif model == "Transformer":
            from csevo.ml.TransformerRunner import TransformerRunner
            runner = TransformerRunner(model_work_dir, use_latest)
        else:
            raise ValueError(f"Model {model} not ready yet")
        # end if

        runner.prepare()
        return
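
The elif chain above (and its twin in Example 6) is a dispatch table written out by hand. Below is a minimal sketch of a registry-based alternative; the registry and helper are hypothetical, not part of csevo, and the differing constructor signatures (some runners also take debug/cross_proj) would still need per-model handling.

import importlib

_RUNNERS = {
    "DeepCom": ("csevo.ml.DeepComRunner", "DeepComRunner"),
    "Seq2seq": ("csevo.ml.Seq2seqRunner", "Seq2seqRunner"),
    "Code2Seq": ("csevo.ml.Code2SeqRunner", "Code2SeqRunner"),
    # ... the remaining models follow the same pattern
}

def get_runner_class(model: str):
    # Look up and import the runner class on demand, mirroring the
    # deferred imports in the elif chain above.
    if model not in _RUNNERS:
        raise ValueError(f"Model {model} not ready yet")
    module_name, class_name = _RUNNERS[model]
    return getattr(importlib.import_module(module_name), class_name)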
Example 2
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}")
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train", expected_return_code=0)
        BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid", expected_return_code=0)
        BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}", expected_return_code=0)
        BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}", expected_return_code=0)

        # Copy vocab
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/", expected_return_code=0)
        return
Example 3
    def prepare_configs_and_scripts(self, trials: List[int]):
        with open(self.base_config_file, "r") as f:
            base_config = yaml.load(f, Loader=yaml.FullLoader)
        exp_dir = self.work_dir
        for trial in trials:
            seed = random.randint(0, 9)
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            config = copy.copy(base_config)
            config["data"] = str(self.data_dir / "biLSTM")
            config["save_model"] = str(trial_dir / "bestLSTM")
            config_file = trial_dir / "config.yaml"
            with open(config_file, "w+") as f:
                yaml.dump(config, f)
            train_script_file = trial_dir / "train.sh"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"module load cuda/10.1 cudnn/7.6.2\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"cd {self.code_dir}\n" \
                           f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                           f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                           f"--seed {seed} &> {trial_dir}/train-log.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_script_file = trial_dir / f"{test_type}.sh"
                output_file = trial_dir / f"output_{test_type}.txt"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"module load cuda/10.1 cudnn/7.6.2\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"cd {self.code_dir}\n" \
                              f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                              f"python3 translate.py "\
                              f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "\
                              f"&> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py " \
                              f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

            # end for
        return
Example 4
    def require_special_repo(cls, directory: Path, branch: str):
        cls.logger.info(f"Updating {directory} to {branch} branch")
        if directory.exists():
            if not directory.is_dir() or not (directory / ".git").is_dir():
                LoggingUtils.log_and_raise(
                    cls.logger,
                    f"Path {directory} already exists but is not a proper git repository!",
                    Exception)
            # end if

            with IOUtils.cd(directory):
                BashUtils.run(f"git pull", expected_return_code=0)
            # end with
        else:
            IOUtils.mk_dir(directory)
            with IOUtils.cd(directory):
                BashUtils.run(
                    f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                    expected_return_code=0)
Example 5
    def process_data(
            self,
            data_dir: Path,
            output_processed_data_dir: Path,
            is_train: bool = False,
    ) -> None:
        """
        Processes the data to the intermediate format.
        """
        self.logger.info(self.logging_prefix + f"Processing data from {data_dir} to {output_processed_data_dir}")
        IOUtils.rm_dir(output_processed_data_dir)
        IOUtils.mk_dir(output_processed_data_dir)

        if is_train:
            # Preprocess with training data, if needed
            self.preprocess_with_train_data(data_dir, output_processed_data_dir)
        # end if

        self.process_data_impl(data_dir, output_processed_data_dir)
        return
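
process_data is the fixed half of a template-method pattern: preprocess_with_train_data and process_data_impl are supplied by subclasses. A self-contained sketch of that structure follows; the class name is illustrative, only the two hook names come from the code above.

from pathlib import Path

class BaseProcessor:
    # Fixed orchestration, mirroring process_data above
    def process_data(self, data_dir: Path, out_dir: Path, is_train: bool = False) -> None:
        if is_train:
            self.preprocess_with_train_data(data_dir, out_dir)
        self.process_data_impl(data_dir, out_dir)

    # Optional hook: only some models need train-time preprocessing
    def preprocess_with_train_data(self, data_dir: Path, out_dir: Path) -> None:
        pass

    # Required hook: each model defines its own intermediate format
    def process_data_impl(self, data_dir: Path, out_dir: Path) -> None:
        raise NotImplementedError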
Example 6
    def prepare_model(self, model: str, year: int, eval_setting: str, debug: bool = False):
        sub_dir_name = f"{model}"
        if debug:
            sub_dir_name = f"{sub_dir_name}-debug"
        model_work_dir = self.work_dir / sub_dir_name

        IOUtils.mk_dir(model_work_dir)

        if model == "DeepCom":
            from csevo.ml.DeepComRunner import DeepComRunner
            runner = DeepComRunner(model_work_dir, year, eval_setting)
        elif model == "Seq2seq":
            from csevo.ml.Seq2seqRunner import Seq2seqRunner
            runner = Seq2seqRunner(model_work_dir, year, eval_setting)
        elif model == "Seq2seqAtt":
            from csevo.ml.Seq2seqAttRunner import Seq2seqAttRunner
            runner = Seq2seqAttRunner(model_work_dir, year, eval_setting)
        elif model == "DeepCom-SBT":
            from csevo.ml.DeepComSBTRunner import DeepComSBTRunner
            runner = DeepComSBTRunner(model_work_dir, year, eval_setting)
        elif model == "DeepCom-Preorder":
            from csevo.ml.DeepComPreorderRunner import DeepComPreorderRunner
            runner = DeepComPreorderRunner(model_work_dir, year, eval_setting)
        elif model == "Code2Seq":
            from csevo.ml.Code2SeqRunner import Code2SeqRunner
            runner = Code2SeqRunner(model_work_dir, year, eval_setting)
        elif model == "Bi-LSTM":
            from csevo.ml.BiLSTMRunner import BiLSTMRunner
            runner = BiLSTMRunner(model_work_dir, year, eval_setting)
        elif model == "no-split-Bi-LSTM":
            from csevo.ml.NoSplitBiLSTMRunner import BiLSTMRunner
            runner = BiLSTMRunner(model_work_dir, year, eval_setting)
        elif model == "Transformer":
            from csevo.ml.TransformerRunner import TransformerRunner
            runner = TransformerRunner(model_work_dir, year, eval_setting)
        else:
            raise ValueError(f"Model {model} not ready yet")
        # end if

        runner.prepare()
        return
Example 7
    def __init__(self):
        self.plots_dir: Path = Macros.paper_dir / "figs"
        IOUtils.mk_dir(self.plots_dir)

        # Initialize seaborn
        sns.set()
        sns.set_palette("Dark2")
        sns.set_context("paper")
        mpl.rcParams["axes.titlesize"] = 24
        mpl.rcParams["axes.labelsize"] = 24
        mpl.rcParams["font.size"] = 18
        mpl.rcParams["xtick.labelsize"] = 24
        mpl.rcParams["xtick.major.size"] = 14
        mpl.rcParams["xtick.minor.size"] = 14
        mpl.rcParams["ytick.labelsize"] = 24
        mpl.rcParams["ytick.major.size"] = 14
        mpl.rcParams["ytick.minor.size"] = 14
        mpl.rcParams["legend.fontsize"] = 18
        mpl.rcParams["legend.title_fontsize"] = 18
        # print(mpl.rcParams)
        return
Example 8
    def train(
        self,
        train_processed_data_dir: Path,
        val_processed_data_dir: Path,
        output_model_dir: Path,
        force_retrain: bool = False,
    ) -> None:
        """
        Trains the model on the training data.

        The trained model should be saved to output_model_dir.
        On success, this function also saves a training-completed.txt file as proof that training finished.

        :param train_processed_data_dir: the directory containing the processed train data
        :param val_processed_data_dir: the directory containing the processed val data
        :param output_model_dir: the directory to save the output model
        :param force_retrain: if set to True, re-train the model even if it was already trained (will remove previously trained model)
        """
        if force_retrain or not self.is_training_completed(output_model_dir):
            self.logger.info(
                self.logging_prefix +
                f"Training model at {output_model_dir}; train: {train_processed_data_dir}, val: {val_processed_data_dir}"
            )
            IOUtils.rm_dir(output_model_dir)
            IOUtils.mk_dir(output_model_dir)

            # Save spec & configs of this model
            IOUtils.dump(output_model_dir / "config-dict.json",
                         IOUtils.jsonfy(self.config),
                         IOUtils.Format.jsonPretty)
            IOUtils.dump(output_model_dir / "spec.json",
                         IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)
            self.train_impl(train_processed_data_dir, val_processed_data_dir,
                            output_model_dir)
            IOUtils.dump(output_model_dir / self.TRAINING_COMPLETED_FILE_NAME,
                         str(time.time_ns()), IOUtils.Format.txt)
        # end if
        return
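
is_training_completed is not shown in these examples; given the marker file written at the end of train(), it plausibly reduces to an existence check. A sketch under that assumption (not the project's actual code):

from pathlib import Path

def is_training_completed(self, output_model_dir: Path) -> bool:
    # train() writes TRAINING_COMPLETED_FILE_NAME only after train_impl
    # finishes, so its presence marks a completed run.
    return (output_model_dir / self.TRAINING_COMPLETED_FILE_NAME).exists()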
Example 9
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        IOUtils.rm_dir(self.data_dir)
        IOUtils.mk_dir(self.data_dir)

        # build dataset used by Open-NMT
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/biLSTM* {self.data_dir}/",
                      expected_return_code=0)

        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt",
            expected_return_code=0)

        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt",
            expected_return_code=0)

        return
Example 10
    def prepare_configs_and_scripts(self, trials: List[int]):
        exp_dir = self.work_dir

        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            model_dir = trial_dir / "models"
            IOUtils.mk_dir(model_dir)
            log_dir = trial_dir / "logs"
            IOUtils.mk_dir(log_dir)
            data = str(exp_dir / "data/code2seq")
            val_data = data + ".val.c2s"
            train_log = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # Copy config file
            BashUtils.run(
                f"cp {self.base_config_file} {trial_dir}/config.yaml",
                expected_return_code=0)
            output_file = trial_dir / "output_tmp.txt"
            reference_file = trial_dir / "ref_tmp.txt"
            config_file = trial_dir / "config.yaml"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}\n" \
                           f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                           f"--pred_file {output_file} --ref_file {reference_file} "\
                           f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
                output_file = trial_dir / f"output_{test_type}.txt"
                reference_file = trial_dir / f"ref_{test_type}.txt"
                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}\n" \
                              f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                              f"--pred_file {output_file} --ref_file {reference_file} "\
                              f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
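
The generated scripts read the GPU id from `$1`; a sketch of how a driver could launch one trial (the path is illustrative):

import subprocess

# Run training for trial 0 on GPU 0; the script itself redirects
# stdout/stderr to train-log.txt, so only the exit status surfaces here.
subprocess.run(["bash", "trial-0/train.sh", "0"], check=True)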
Example 11
    def prepare_configs_and_scripts(self, trials: List[int]):
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)
        if not self.use_latest:
            exps = [f"{t}{t+1}-train" for t in range(13, 18)]
            for exp in exps:
                exp_dir = self.work_dir / exp
                for trial in trials:
                    trial_dir = exp_dir / f"trial-{trial}"
                    IOUtils.mk_dir(trial_dir)

                    output_file = trial_dir / "output.txt"

                    config = copy.copy(base_config)
                    config["data_dir"] = str(exp_dir)
                    config["model_dir"] = str(trial_dir / "model")
                    config["output"] = str(output_file)

                    config_file = trial_dir / "config.json"
                    IOUtils.dump(config_file, config,
                                 IOUtils.Format.jsonPretty)

                    train_script_file = trial_dir / "train.sh"
                    train_script = f"#!/bin/bash\n" \
                                   f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                   f"conda activate {self.CONDA_ENV}\n" \
                                   f"module load cuda/10.0 cudnn/7.6.2\n" \
                                   f"cd {self.code_dir}/translate\n" \
                                   f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
                    IOUtils.dump(train_script_file, train_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {train_script_file}",
                                  expected_return_code=0)

                    test_script_file = trial_dir / "test.sh"
                    test_script = f"#!/bin/bash\n" \
                                  f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                  f"conda activate {self.CONDA_ENV}\n" \
                                  f"module load cuda/10.0 cudnn/7.6.2\n" \
                                  f"cd {self.code_dir}/translate\n" \
                                  f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"
                    IOUtils.dump(test_script_file, test_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {test_script_file}",
                                  expected_return_code=0)

                    eval_script_file = trial_dir / "val.sh"
                    eval_script = f"#!/bin/bash\n" \
                                   f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                   f"conda activate {self.CONDA_ENV}\n" \
                                   f"module load cuda/10.0 cudnn/7.6.2\n" \
                                   f"cd {self.code_dir}/translate\n" \
                                   f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
                    IOUtils.dump(eval_script_file, eval_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {eval_script_file}",
                                  expected_return_code=0)
                # end for
            # end for
        else:
            exp_dir = self.work_dir / "latest"
            for trial in trials:
                trial_dir = exp_dir / f"trial-{trial}"
                IOUtils.mk_dir(trial_dir)

                output_file = trial_dir / "output.txt"

                config = copy.copy(base_config)
                config["data_dir"] = str(exp_dir)
                config["model_dir"] = str(trial_dir / "model")
                config["output"] = str(output_file)

                config_file = trial_dir / "config.json"
                IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

                train_script_file = trial_dir / "train.sh"
                train_script = f"#!/bin/bash\n" \
                               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                               f"conda activate {self.CONDA_ENV}\n" \
                               f"module load cuda/10.0 cudnn/7.6.2\n" \
                               f"cd {self.code_dir}/translate\n" \
                               f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
                IOUtils.dump(train_script_file, train_script,
                             IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {train_script_file}",
                              expected_return_code=0)

                test_script_file = trial_dir / "test.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

                eval_script_file = trial_dir / "val.sh"
                eval_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
                IOUtils.dump(eval_script_file, eval_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {eval_script_file}",
                              expected_return_code=0)
        return
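
The two branches above repeat the same per-trial body; computing the experiment list first collapses them into one loop. A quick check of the resulting directory names (the use_latest value is illustrative):

use_latest = False  # illustrative
exps = ["latest"] if use_latest else [f"{t}{t+1}-train" for t in range(13, 18)]
print(exps)  # ['1314-train', '1415-train', '1516-train', '1617-train', '1718-train']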
Example 12
    def extract_data_from_corpus(
        cls,
        corpus_path: Path,
        trainevals: List[str],
        groups: List[str],
        output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(
                f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(
                cls.logger,
                f"{output_path} already exists as a file. Aborting.",
                Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        assert all(
            [traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
        assert all([
            group in Macros.DS_GROUPS + [Macros.DS_GROUP_TA]
            for group in groups
        ])

        data_mgr = FilesManager(corpus_path)

        # 2. Load lemmas and definitions
        lemmas_filtered: List[Lemma] = data_mgr.load_data(
            [FilesManager.LEMMAS_FILTERED],
            IOUtils.Format.json,
            is_batched=True,
            clz=Lemma)
        definitions: List[Definition] = data_mgr.load_data(
            [FilesManager.DEFINITIONS, "definitions.json"],
            IOUtils.Format.json,
            clz=Definition)

        # 3. Output to output_path for each combination of traineval and group
        for traineval in trainevals:
            for group in groups:
                IOUtils.mk_dir(output_path / f"{group}-{traineval}")
                data_indexes = IOUtils.load(
                    Macros.project_dir / "training" /
                    f"{group}-{traineval}.json", IOUtils.Format.json)
                IOUtils.dump(
                    output_path / f"{group}-{traineval}/lemmas.json",
                    IOUtils.jsonfy([
                        l for l in lemmas_filtered
                        if l.data_index in data_indexes
                    ]), IOUtils.Format.json)
                IOUtils.dump(
                    output_path / f"{group}-{traineval}/definitions.json",
                    IOUtils.jsonfy([
                        d for d in definitions if d.data_index in data_indexes
                    ]), IOUtils.Format.json)
            # end for
        # end for
        return
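
data_indexes is loaded as a JSON list, so each `in data_indexes` test in the comprehensions above is a linear scan per lemma/definition. A set gives the same result with O(1) membership; a self-contained sketch, not the project's code:

data_indexes = [5, 2, 9]       # as loaded from the JSON file
index_set = set(data_indexes)  # one-time conversion
items = [{"data_index": i} for i in range(10)]
kept = [it for it in items if it["data_index"] in index_set]
print([it["data_index"] for it in kept])  # [2, 5, 9]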
Example 13
    def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(
                f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(
                cls.logger,
                f"{output_path} already exists as a file. Aborting.",
                Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        # 2. Extract documents, tok.sexp and ast.sexp
        coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
        ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
        tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

        with IOUtils.cd(project_path):
            coq_files: List[str] = BashUtils.run(
                "find . -name '*.v' -type f").stdout.split("\n")[:-1]
            coq_files = [coq_file[2:] for coq_file in coq_files]

            if files is not None:
                coq_files = [f for f in coq_files if f in files]
            # end if

            if exclude_files is not None:
                coq_files = [f for f in coq_files if f not in exclude_files]
            # end if

            if exclude_pattern is not None:
                re_exclude_pattern = re.compile(exclude_pattern)
                coq_files = [
                    f for f in coq_files if not re_exclude_pattern.fullmatch(f)
                ]
            # end if

            for i, coq_file in enumerate(tqdm(coq_files)):
                try:
                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Call SerAPI
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project_path.name

                    coq_documents[coq_file] = coq_document
                    ast_sexp_lists[coq_file] = ast_sexp_list
                    tok_sexp_lists[coq_file] = tok_sexp_list
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except Exception:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for

            # 3. Extract and save lemmas and definitions
            lemmas: List[Lemma] = list()
            definitions: List[Definition] = list()

            # Increase recursion limit because the backend sexps are CRAZZZZY deep
            sys.setrecursionlimit(10000)

            for file_path, doc in tqdm(coq_documents.items()):
                ast_sexp_list = ast_sexp_lists[file_path]
                lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list,
                                                    serapi_options)
                lemmas.extend(lemmas_doc)
                definitions_doc = cls.collect_definitions_doc(
                    doc, ast_sexp_list)
                definitions.extend(definitions_doc)
            # end for

            IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                         IOUtils.Format.json)
            IOUtils.dump(output_path / "definitions.json",
                         IOUtils.jsonfy(definitions), IOUtils.Format.json)
        # end with
        return
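
Note that exclude_pattern is applied with fullmatch, not search: the pattern must cover the entire relative file path. A quick check of that semantics:

import re

pat = re.compile(r"theories/.*_test\.v")
files = ["theories/foo_test.v", "src/theories/foo_test.v"]
# Only the file whose whole path matches the pattern is excluded
print([f for f in files if not pat.fullmatch(f)])  # ['src/theories/foo_test.v']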
Example 14
    def process_shared(self,
                       output_dir: Path,
                       years: List[str],
                       eval_settings: List[str],
                       task: str = "CG"):
        """
        Extracts the train/val/test method-data for all eval_setting/year.
        This is a shared step for the processing for all models, so do this first (and once).
        1. split the data into train/val/test
        2. extract for every setting
        """
        shared_data_dir = output_dir / f"{task}-shared"
        IOUtils.mk_dir(shared_data_dir)

        # Load project list
        projects = IOUtils.load(Macros.data_dir /
                                f"projects-github-{task}-100.json")

        # Load data
        projects_2_data_list: Dict[str, List] = dict()

        for proj in tqdm(projects):
            # Split the data within each project; creates the 19-20-methods-train.json and latest-methods-val.json files
            ds = DataSpliter()
            ds.project_data_split(proj, task)
            method_data_list = IOUtils.load(Macros.repos_results_dir / proj /
                                            "collector" / "method-data.json")
            projects_2_data_list[proj] = method_data_list

        # split data across projects
        num_proj = len(projects)
        random.seed(Environment.random_seed)
        random.Random(Environment.random_seed).shuffle(projects)
        train_index = round(num_proj * self.TRAIN_RATIO)
        valid_index = train_index + round(num_proj * self.VAL_RATIO)
        train_projs = projects[:train_index]
        valid_projs = projects[train_index:valid_index]
        test_projs = projects[valid_index:]
        project_split = {
            "train": train_projs,
            "val": valid_projs,
            "test": test_projs
        }
        #project_split = IOUtils.load(Macros.data_dir/f"projects-split-{task}-100.json")
        IOUtils.dump(Macros.data_dir / f"projects-split-{task}-100.json",
                     project_split, IOUtils.Format.jsonNoSort)
        assert len(project_split["test"]) > len(project_split["val"])

        data_type_2_project_list: Dict[str, List] = {
            Macros.train: project_split["train"],
            Macros.val: project_split["val"],
            Macros.test: project_split["test"],
        }

        for year in years:
            data_type_2_data_list: Dict[str, List] = dict()
            year = int(year)
            # test_common: D_test(P_test, year-1, year)
            data_type_2_data_list[f"{year}-{Macros.test_common}"] = list()
            for proj in tqdm(data_type_2_project_list[Macros.test]):
                filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"19-20-methods-{task}-test.json"
                filter_indexes = IOUtils.load(filter_indexes_file)
                data_type_2_data_list[f"{year}-{Macros.test_common}"] += [
                    projects_2_data_list[proj][i] for i in filter_indexes
                ]

            for eval_setting in eval_settings:
                if eval_setting == "evo":
                    # train: D(P, year-3, year-2)
                    # val: D(P, year-2, year-1)
                    # test_standard: D(P, year-1, year)
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = []
                    for proj in tqdm(projects):
                        all_filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"method-project-{task}-filtered.json"

                        all_filter_indexes = IOUtils.load(
                            all_filter_indexes_file)
                        train_filter_indexes = [
                            af["method_ids"] for af in all_filter_indexes
                            if af["time"] == f"{year-3}_Jan_1-{year-2}_Jan_1"
                        ][0]
                        train_filter_indexes += [
                            af["method_ids"] for af in all_filter_indexes
                            if af["time"] == f"{year-4}_Jan_1-{year-3}_Jan_1"
                        ][0]
                        val_filter_indexes = [
                            af["method_ids"] for af in all_filter_indexes
                            if af["time"] == f"{year-2}_Jan_1-{year-1}_Jan_1"
                        ][0]
                        test_standard_filter_indexes = [
                            af["method_ids"] for af in all_filter_indexes
                            if af["time"] == f"{year-1}_Jan_1-{year}_Jan_1"
                        ][0]

                        proj_data_list = projects_2_data_list[proj]

                        data_type_2_data_list[
                            f"{eval_setting}-{year}-{Macros.train}"] += [
                                proj_data_list[i] for i in train_filter_indexes
                            ]
                        data_type_2_data_list[
                            f"{eval_setting}-{year}-{Macros.val}"] += [
                                proj_data_list[i] for i in val_filter_indexes
                            ]
                        data_type_2_data_list[
                            f"{eval_setting}-{year}-{Macros.test_standard}"] += [
                                proj_data_list[i]
                                for i in test_standard_filter_indexes
                            ]
                elif eval_setting == "crossproj-evo":
                    # train: D(P_train, year-3, year-2)
                    # val: D(P_val, year-2, year-1)
                    # test: D(P_test, year-1, year)
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = []
                    for data_type_tvt, project_list in data_type_2_project_list.items():
                        if data_type_tvt == Macros.test:
                            data_type_2_data_list[
                                f"{eval_setting}-{year}-{Macros.test_standard}"] = data_type_2_data_list[
                                    f"{year}-{Macros.test_common}"]
                        else:
                            for proj in tqdm(project_list):
                                all_filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"method-project-{task}-filtered.json"
                                all_filter_indexes = IOUtils.load(
                                    all_filter_indexes_file)

                                proj_data_list = projects_2_data_list[proj]
                                if data_type_tvt == Macros.train:
                                    train_filter_indexes = [
                                        af["method_ids"]
                                        for af in all_filter_indexes
                                        if af["time"] ==
                                        f"{year - 3}_Jan_1-{year - 2}_Jan_1"
                                    ][0]
                                    train_filter_indexes += [
                                        af["method_ids"]
                                        for af in all_filter_indexes
                                        if af["time"] ==
                                        f"{year - 4}_Jan_1-{year - 3}_Jan_1"
                                    ][0]
                                    data_type_2_data_list[
                                        f"{eval_setting}-{year}-{Macros.train}"] += [
                                            proj_data_list[i]
                                            for i in train_filter_indexes
                                        ]
                                elif data_type_tvt == Macros.val:
                                    val_filter_indexes = [
                                        af["method_ids"]
                                        for af in all_filter_indexes
                                        if af["time"] ==
                                        f"{year - 2}_Jan_1-{year - 1}_Jan_1"
                                    ][0]
                                    data_type_2_data_list[
                                        f"{eval_setting}-{year}-{Macros.val}"] += [
                                            proj_data_list[i]
                                            for i in val_filter_indexes
                                        ]
                elif eval_setting == "crossproj":
                    # train: D(P_train, year)
                    # val: D(P_val, year)
                    # test_standard: D(P_test, year)
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = []

                    for data_type_tvt, project_list in data_type_2_project_list.items():
                        data_type = data_type_tvt if data_type_tvt != Macros.test else Macros.test_standard
                        for proj in tqdm(project_list):

                            latest_filter_indexes = list()
                            for t in [Macros.train, Macros.val, Macros.test]:
                                filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"latest-methods-{task}-{t}.json"
                                latest_filter_indexes += IOUtils.load(
                                    filter_indexes_file)

                            data_type_2_data_list[
                                f"{eval_setting}-{year}-{data_type}"] += [
                                    projects_2_data_list[proj][i]
                                    for i in latest_filter_indexes
                                ]
                elif eval_setting == "mixedproj":
                    # train: D_train(P, year)
                    # val: D_val(P, year)
                    # test_standard: D_test(P, year)
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = []
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = []

                    for proj in tqdm(projects):
                        proj_data_list = projects_2_data_list[proj]
                        for data_type_tvt, data_type in zip(
                            [Macros.train, Macros.val, Macros.test],
                            [Macros.train, Macros.val, Macros.test_standard]):
                            filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"latest-methods-{task}-{data_type_tvt}.json"
                            filter_indexes = IOUtils.load(filter_indexes_file)
                            data_type_2_data_list[
                                f"{eval_setting}-{year}-{data_type}"] += [
                                    proj_data_list[i] for i in filter_indexes
                                ]

            for dt, data_list in data_type_2_data_list.items():
                IOUtils.dump(shared_data_dir / f"{dt}.json", data_list,
                             IOUtils.Format.json)
        return
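
The cross-project split above shuffles the project list with a fixed seed and cuts it at ratio boundaries; the same arithmetic in isolation (the TRAIN_RATIO/VAL_RATIO values and seed here are assumptions, the real ones live on the class and in Environment):

import random

TRAIN_RATIO, VAL_RATIO = 0.8, 0.1  # assumed values
projects = [f"proj-{i}" for i in range(20)]
random.Random(42).shuffle(projects)  # dedicated RNG keeps the split reproducible

train_index = round(len(projects) * TRAIN_RATIO)              # 16
valid_index = train_index + round(len(projects) * VAL_RATIO)  # 18
project_split = {
    "train": projects[:train_index],
    "val": projects[train_index:valid_index],
    "test": projects[valid_index:],
}
print({k: len(v) for k, v in project_split.items()})  # {'train': 16, 'val': 2, 'test': 2}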
Example 15
    def process(self, model: str, output_dir: Path, task: str, year: int,
                eval_setting: str):
        """
        Main entry for processors of different models.
        :param model: the model name, one of {"DeepCom", "DeepCom-Preorder", "Bi-LSTM", "no-split-Bi-LSTM", "Transformer", "ASTAttendGRU", "Code2Seq"}
        :param output_dir: the output directory (usually data/models)
        :param task: the task name, either "CG" or "MN"
        :param year: the year that the testing data should be on
        :param eval_setting: the evaluation setting, one of {"evo", "crossproj", "mixedproj"}
        """
        assert year == self.EVO_YEARS[-1]  # TODO: only the latest year is supported for now
        assert task in self.TASKS.keys()

        model_data_dir = output_dir / model

        if model == "DeepCom":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "DeepCom-Preorder":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "no-split-Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "Transformer":
            from csevo.processor.TransformerProcessor import TransformerProcessor
            processor = TransformerProcessor()
            data_prefix = f"{eval_setting}-{year}"
            processor.process_data(model_data_dir, data_prefix)
            return
        elif model == "ASTAttendGRU":
            from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
            processor = ASTAttendGRUProcessor()
        elif model == "Code2Seq":
            from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
            processor = Code2SeqProcessor()
        else:
            raise ValueError(f"Illegal model {model}")
        # end if
        error_ids = None

        # Load dataset after split (from shared directory)
        shared_data_dir = output_dir / f"{task}-shared"
        self.logger.info(f"Loading dataset from {shared_data_dir}")
        data_type_2_data_list: Dict[str, List] = dict()
        data_type_2_data_list[Macros.test_common] = IOUtils.load(
            shared_data_dir / f"{year}-{Macros.test_common}.json",
            IOUtils.Format.json)
        for dt in [Macros.train, Macros.val, Macros.test_standard]:
            data_type_2_data_list[dt] = IOUtils.load(
                shared_data_dir / f"{eval_setting}-{year}-{dt}.json",
                IOUtils.Format.json)

        # Process each set
        for data_type, data_list in data_type_2_data_list.items():
            sub_dir_name = f"{eval_setting}-{year}-{data_type}"

            if data_type in [Macros.test_common, Macros.test_standard]:
                data_type_tvt = Macros.test
            else:
                data_type_tvt = data_type

            model_dt_output_dir = model_data_dir / sub_dir_name
            IOUtils.mk_dir(model_dt_output_dir)
            if model == "DeepCom":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir, "sbt")
            elif model == "DeepCom-Preorder":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir,
                                                   "Preorder")
            elif model == "Code2Seq":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir)
            elif model == "Bi-LSTM":
                processor.process_data(data_list, data_type_tvt,
                                       model_dt_output_dir)
            elif model == "no-split-Bi-LSTM":
                processor.process_data(data_list,
                                       data_type_tvt,
                                       model_dt_output_dir,
                                       split=False)
            if error_ids is not None:
                self.logger.warning(f"Error data count: {len(error_ids)}")
                IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json",
                             error_ids, IOUtils.Format.json)
        # extra step for Open-NMT data
        if model == "Bi-LSTM" or model == "no-split-Bi-LSTM":
            # build dataset used by Open-NMT
            BashUtils.run(
                f"onmt_preprocess -train_src {model_data_dir}/{eval_setting}-{year}-{Macros.train}/src-train.txt "
                f"-train_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.train}/tgt-train.txt "
                f"-valid_src {model_data_dir}/{eval_setting}-{year}-{Macros.val}/src-val.txt "
                f"-valid_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.val}/tgt-val.txt "
                f"-save_data {model_data_dir}/{eval_setting}-{year}-{Macros.train}/biLSTM --src_seq_length 200 --src_seq_"
                f"length_trunc 200",
                expected_return_code=0)

        return
Example 16
    def process_data(self, method_data_list, data_type, output_dir,
                     traversal) -> List[int]:
        self.logger.info(f"Start processing")

        # Use DeepCom's required names
        data_type = {
            Macros.train: "train",
            Macros.val: "valid",
            Macros.test: "test",
            "debug": "debug",
        }[data_type]

        # Initialize vocab, error_ids (shared between processes)
        manager = multiprocessing.Manager()
        code_vocab = manager.dict()
        nl_vocab = manager.dict()
        sbt_vocab = manager.dict()
        vocabs_lock = manager.Lock()
        error_ids = manager.list()
        error_ids_lock = manager.Lock()

        # Multi-processing, split the tasks evenly
        tasks_each_process = len(method_data_list) // Macros.multi_processing + 1
        processes = list()
        for pid in range(Macros.multi_processing):
            beg = pid * tasks_each_process
            method_data_list_p = method_data_list[beg:beg + tasks_each_process]
            output_dir_p = output_dir / str(pid)
            IOUtils.mk_dir(output_dir_p)
            process = multiprocessing.Process(
                target=self.process_data_mp,
                args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                      code_vocab, nl_vocab, sbt_vocab, vocabs_lock, error_ids,
                      error_ids_lock, traversal))
            process.start()
            processes.append(process)
        # end for

        for process in processes:
            process.join()
        # end for

        # Merge results
        code_file_name = data_type + ".token.code"
        nl_file_name = data_type + ".token.nl"
        sbt_file_name = data_type + ".token.sbt"
        data_type_output_dir = output_dir / data_type
        IOUtils.mk_dir(data_type_output_dir)
        for pid in range(Macros.multi_processing):
            for fname in [code_file_name, nl_file_name, sbt_file_name]:
                BashUtils.run(
                    f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}"
                )
            # end for
            IOUtils.rm_dir(output_dir / str(pid))
        # end for
        error_ids.sort()

        # Build vocab files (train split only)
        if data_type == "train":
            code_vocab_file = output_dir / "vocab.code"
            nl_vocab_file = output_dir / "vocab.nl"
            sbt_vocab_file = output_dir / "vocab.sbt"
            special_tokens = [
                '<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>',
                '<NONE>'
            ]

            # Keep the first MAX_VOCAB tokens of each vocab (in insertion
            # order; no frequency sorting is done), after the special tokens
            code_vocabs_list = special_tokens + list(code_vocab.keys())[:self.MAX_VOCAB]
            nl_vocabs_list = special_tokens + list(nl_vocab.keys())[:self.MAX_VOCAB]
            sbt_vocabs_list = special_tokens + list(sbt_vocab.keys())[:self.MAX_VOCAB]

            # Write each vocab to its file, one token per line
            for vocab_file, vocabs_list in [
                    (code_vocab_file, code_vocabs_list),
                    (nl_vocab_file, nl_vocabs_list),
                    (sbt_vocab_file, sbt_vocabs_list),
            ]:
                with open(vocab_file, "w") as f:
                    for v in vocabs_list:
                        f.write(v + "\n")
        # end if

        return list(error_ids)
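
process_data_mp is not shown here; the Manager dicts and lock passed to it above imply the workers do locked read-modify-write updates on the shared vocabs. A hypothetical sketch of that update (not the project's actual worker):

def update_shared_vocab(tokens, shared_vocab, vocabs_lock):
    # DictProxy get-then-set is not atomic, so the lock serializes the
    # read-modify-write across worker processes.
    with vocabs_lock:
        for tok in tokens:
            shared_vocab[tok] = shared_vocab.get(tok, 0) + 1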
Example 17
    def make_plot_draft_learning_curve(self,
            training_log_path: Path,
            output_name: str,
    ):
        special_plots_dir = self.plots_dir / "draft-learning-curve"
        IOUtils.mk_dir(special_plots_dir)

        fig: plt.Figure = plt.figure(figsize=(12, 9))

        # TODO: these metrics may be specific to Code2Seq only
        x_field = "batch"
        yl_field = "training_loss"
        yr_field = "eval F1"

        x_min = 0
        x_max = -np.inf
        yl_min = np.inf
        yl_max = -np.inf
        yr_min = np.inf
        yr_max = -np.inf

        # First, get ranges for all metrics (we want to use same ranges in all subplots)
        tvt_2_training_log = dict()
        tvt_2_x = dict()
        tvt_2_yl = dict()
        tvt_2_yr = dict()

        for tvt in [Macros.lat_lat, Macros.evo_lat, Macros.lat_evo, Macros.evo_evo]:
            # TODO: this path is hardcoded and work for Code2Seq 1 trial
            training_log = IOUtils.load(training_log_path / tvt / "trial-0" / "logs" / "train_log.json", IOUtils.Format.json)
            x = [d[x_field] for d in training_log]
            yl = [d[yl_field] for d in training_log]
            yr = [d[yr_field] for d in training_log]

            tvt_2_training_log[tvt] = training_log
            tvt_2_x[tvt] = x
            tvt_2_yl[tvt] = yl
            tvt_2_yr[tvt] = yr

            x_min = min(x_min, min(x))
            x_max = max(x_max, max(x))
            yl_min = min(yl_min, min(yl))
            yl_max = max(yl_max, max(yl))
            yr_min = min(yr_min, min(yr))
            yr_max = max(yr_max, max(yr))
        # end for

        x_lim = (x_min - (x_max - x_min) / 30, x_max + (x_max - x_min) / 30)
        yl_lim = (np.exp(np.log(yl_min) - (np.log(yl_max) - np.log(yl_min)) / 30), np.exp(np.log(yl_max) + (np.log(yl_max) - np.log(yl_min)) / 30))
        yr_lim = (yr_min - (yr_max - yr_min) / 30, yr_max + (yr_max - yr_min) / 30)

        for t_i, t in enumerate([Macros.lat, Macros.evo]):
            for vt_i, vt in enumerate([Macros.lat, Macros.evo]):
                tvt = f"{t}-{vt}"
                tvt_i = t_i * 2 + vt_i + 1

                x = tvt_2_x[tvt]
                yl = tvt_2_yl[tvt]
                yr = tvt_2_yr[tvt]

                axl: plt.Axes = fig.add_subplot(2, 2, tvt_i)
                axr = axl.twinx()

                colorl = "tab:red"
                colorr = "tab:blue"

                axl.plot(x, yl, color=colorl)
                axr.plot(x, yr, color=colorr)

                axl.set_xlabel(x_field)
                axl.set_xlim(x_lim[0], x_lim[1])

                axl.set_ylabel(yl_field, color=colorl)
                axl.set_yscale("log")
                axl.set_ylim(yl_lim[0], yl_lim[1])

                axr.set_ylabel(yr_field, color=colorr)
                axr.set_ylim(yr_lim[0], yr_lim[1])

                axl.set_title(tvt)
            # end for
        # end for

        fig.tight_layout()
        with IOUtils.cd(special_plots_dir):
            fig.savefig(f"{output_name}.eps")
        # end with
        return
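
The axis limits above pad 1/30 of the data range on each side; for the log-scaled left axis the padding is applied in log space so it looks symmetric on screen. A worked check of that arithmetic:

import numpy as np

yl_min, yl_max = 0.1, 10.0
pad = (np.log(yl_max) - np.log(yl_min)) / 30
yl_lim = (np.exp(np.log(yl_min) - pad), np.exp(np.log(yl_max) + pad))
print(yl_lim)  # approximately (0.0858, 11.66)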
Example 18
    def collect_project(self, project_name: str, project_url: str):
        Environment.require_collector()

        # 0. Download repo
        downloads_dir = self.repos_downloads_dir / project_name
        results_dir = self.repos_results_dir / project_name

        # Remove previous results if any
        IOUtils.rm_dir(results_dir)
        IOUtils.mk_dir(results_dir)

        # Clone the repo if it does not exist yet
        if not downloads_dir.exists():
            with IOUtils.cd(self.repos_downloads_dir):
                with TimeUtils.time_limit(300):
                    BashUtils.run(f"git clone {project_url} {project_name}",
                                  expected_return_code=0)
                # end with
            # end with
        # end if

        project_data = ProjectData.create()
        project_data.name = project_name
        project_data.url = project_url

        # 1. Get list of revisions
        with IOUtils.cd(downloads_dir):
            git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'",
                                        expected_return_code=0).stdout
            for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
                shas = line.split()
                project_data.revisions.append(shas[0])
                project_data.parent_revisions[shas[0]] = shas[1:]
            # end for
        # end with

        # 2. Get the revision at the start of each year
        with IOUtils.cd(downloads_dir):
            for year in self.YEARS:
                git_log_out = BashUtils.run(
                    f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                    expected_return_code=0).stdout
                project_data.year_revisions[str(year) +
                                            "_Jan_1"] = git_log_out.rstrip()
            # end for
        # end with

        project_data_file = results_dir / "project.json"
        IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data),
                     IOUtils.Format.jsonPretty)

        # 3. Start the Java collector
        # Prepare config
        log_file = results_dir / "collector-log.txt"
        output_dir = results_dir / "collector"

        config = {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year":
            True  # To indicate whether to collect all evo data or yearly data
        }
        config_file = results_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if

        # 4. In some cases, move the collected data to a dedicated location
        # TODO private info
        # On the luzhou server, for user pynie, move results to /home/disk2
        hostname = BashUtils.run("hostname").stdout.strip()
        user = BashUtils.run("echo $USER").stdout.strip()
        if hostname == "luzhou" and user == "pynie":
            alter_results_dir = Path(
                "/home/disk2/pynie/csevo-results") / project_name
            IOUtils.rm_dir(alter_results_dir)
            IOUtils.mk_dir(alter_results_dir.parent)
            BashUtils.run(f"mv {results_dir} {alter_results_dir}")
            self.logger.info(f"Results moved to {alter_results_dir}")
        # end if

        # 5. Clean up: remove the cloned repo
        IOUtils.rm_dir(downloads_dir)
        return
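The %H %P pretty-format prints each commit's own sha followed by its parents' shas, which is what the parsing above relies on. A minimal sketch of that parsing, on a hypothetical log line:

# Hypothetical output line for a merge commit: own sha, then two parent shas
line = "abc123 def456 789abc"
shas = line.split()
revision = shas[0]   # "abc123"
parents = shas[1:]   # ["def456", "789abc"]; empty list for a root commit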
Example n. 19
    def split_dataset(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
    ):
        # Load the assignments dataset, as a flattened list
        data_list = cls.load_data_list(assignments_path)

        # Shuffle the data before splitting
        data_list = cls.shuffle_data(data_list, seed)

        # Split the data with 8:1:1 ratio
        split_index = len(data_list) // 10
        val_data_list = data_list[:split_index]
        test_data_list = data_list[split_index:2 * split_index]
        train_data_list = data_list[2 * split_index:]

        # Remove from the test set any data that also appeared in train/val
        seen_data_in_train_val = set()
        for data in train_data_list + val_data_list:
            key = hash((tuple(data["l"]), tuple(data["r"]),
                        tuple([
                            tuple(data[f"pa{pa_i+1}"])
                            for pa_i in range(Macros.MAX_PA_IN_MODEL)
                        ])))
            seen_data_in_train_val.add(key)
        # end for

        test_duplicate_indexes = list()
        for i, data in enumerate(test_data_list):
            key = hash((tuple(data["l"]), tuple(data["r"]),
                        tuple([
                            tuple(data[f"pa{pa_i+1}"])
                            for pa_i in range(Macros.MAX_PA_IN_MODEL)
                        ])))
            if key in seen_data_in_train_val:
                test_duplicate_indexes.append(i)
            # end if
        # end for
        for i in reversed(test_duplicate_indexes):
            del test_data_list[i]
        # end for

        # Sub-tokenize; this is after recording duplicates, as we
        # detect duplicates on token level
        for data in train_data_list + val_data_list + test_data_list:
            cls.sub_tokenize_data(data, use_new_sub_tokenizer)
        # end for

        # Collect statistics
        statistics = {
            "num-data": len(data_list),
            "num-data-train": len(train_data_list),
            "num-data-val": len(val_data_list),
            "num-data-test": len(test_data_list),
            "num-test-duplicate": len(test_duplicate_indexes),
        }

        # Save dataset after splitting; the tokens for each field of
        # data are joined to a single string separated by space
        IOUtils.mk_dir(output_dir)
        cls.dump_data_list(output_dir / "train.json", train_data_list)
        cls.dump_data_list(output_dir / "val.json", val_data_list)
        cls.dump_data_list(output_dir / "test.json", test_data_list)
        IOUtils.dump(output_dir / "statistics.json", statistics,
                     IOUtils.Format.jsonNoSort)
        return
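The duplicate check keys each example on its token sequences. A minimal sketch of the same idea on toy data; the field names l, r, and pa1 follow the code above, but the values and the MAX_PA_IN_MODEL stand-in are made up:

MAX_PA_IN_MODEL = 1  # stand-in for Macros.MAX_PA_IN_MODEL

def dedup_key(data):
    # Hash the l, r, and pa* token sequences of one example
    return hash((tuple(data["l"]), tuple(data["r"]),
                 tuple(tuple(data[f"pa{i+1}"]) for i in range(MAX_PA_IN_MODEL))))

a = {"l": ["x"], "r": ["y", "+", "1"], "pa1": ["int", "y"]}
b = {"l": ["x"], "r": ["y", "+", "1"], "pa1": ["int", "y"]}
assert dedup_key(a) == dedup_key(b)  # token-identical examples collide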
Example n. 20
    def __init__(self):
        self.tables_dir: Path = Macros.paper_dir / "tables"
        IOUtils.mk_dir(self.tables_dir)

        self.metrics_dir: Path = Macros.results_dir / "metrics"
        return
Example n. 21
    def split_dataset_always_end(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
    ):
        # Load the assignments dataset, as a flattened list
        data_list = cls.load_data_list(assignments_path)

        # Shuffle the grouped file list, then flatten into a list of file shas
        file_list = cls.shuffle_data(cls.extract_file_list(data_list), seed)
        file_list = [item for sublist in file_list for item in sublist]

        val_data_list = list()
        test_data_list = list()
        train_data_list = list()

        # Map each file to its assignments; drop files with no assignments
        files_to_ass = dict()
        for fsha in file_list:
            assignments = cls.extract_assignments_from([fsha], data_list)
            if len(assignments) > 0:
                files_to_ass[fsha] = assignments

        # files_to_ix[fsha] is a negative index: the position (from the end) of
        # the next assignment to take from that file
        file_list = list(files_to_ass.keys())
        files_to_ix = dict.fromkeys(file_list, -1)
        bound = int(len(data_list) * 0.1)  # target size for val and test (10% each)

        # Fill the test set with the last (most recent) assignments of randomly
        # sampled files (np.random.randint excludes its high end)
        while len(test_data_list) < bound:
            fsha = file_list[np.random.randint(0, len(file_list))]
            ix = files_to_ix[fsha]
            assignments = files_to_ass[fsha]

            if len(assignments) >= -ix:
                test_data_list.append(assignments[ix])
                files_to_ix[fsha] = ix - 1

        # Fill the val set the same way
        while len(val_data_list) < bound:
            fsha = file_list[np.random.randint(0, len(file_list))]
            ix = files_to_ix[fsha]
            assignments = files_to_ass[fsha]

            if len(assignments) >= -ix:
                val_data_list.append(assignments[ix])
                files_to_ix[fsha] = ix - 1

        # All remaining (earlier) assignments of each file go to the train set
        for fsha in file_list:
            assignments = files_to_ass[fsha]
            ix = files_to_ix[fsha]
            if ix == -1:
                # Nothing was taken from this file; take all of its assignments
                ix = len(assignments)
            else:
                ix = ix + 1
            if len(assignments) >= -ix:
                train_data_list.extend(assignments[0:ix])

        statistics = {
            "num-data": len(data_list),
            "num-data-train": len(train_data_list),
            "num-data-val": len(val_data_list),
            "num-data-test": len(test_data_list),
            "num-files": len(file_list),
        }

        IOUtils.mk_dir(output_dir)
        cls.dump_data_list(output_dir / "train.json", train_data_list)
        cls.dump_data_list(output_dir / "val.json", val_data_list)
        cls.dump_data_list(output_dir / "test.json", test_data_list)
        IOUtils.dump(output_dir / "statistics.json", statistics,
                     IOUtils.Format.jsonNoSort)
        IOUtils.dump(output_dir / "files.json", file_list,
                     IOUtils.Format.jsonNoSort)
        return
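The negative indices send each file's last assignments to test/val and the earlier ones to train. A toy sketch of the index bookkeeping for a single file (values made up):

assignments = ["a0", "a1", "a2", "a3"]  # one file's assignments, oldest first
ix = -1
taken = []
while len(taken) < 2 and len(assignments) >= -ix:
    taken.append(assignments[ix])  # takes "a3", then "a2"
    ix -= 1
# After sampling, the remaining prefix goes to train
train_part = assignments[0:ix + 1]  # ["a0", "a1"]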
Example n. 22
    def split_dataset_cross_project(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
    ):
        # Load the assignments dataset, as a flattened list
        data_list = cls.load_data_list(assignments_path)

        # Load the mapping from project name to file cksums
        proj_2_cksums: Dict[str, List[str]] = cls.load_proj_2_cksums()

        # Each file can only be assigned to one project; shuffle the project list and assign in order
        projs_shuffle = cls.shuffle_data(sorted(list(proj_2_cksums.keys())),
                                         seed)
        seen_cksum = set()
        for proj in projs_shuffle:
            proj_2_cksums[proj] = [
                c for c in proj_2_cksums[proj] if c not in seen_cksum
            ]
            # Remove the project if all files in it have been seen
            if len(proj_2_cksums[proj]) == 0:
                del proj_2_cksums[proj]
            else:
                seen_cksum.update(proj_2_cksums[proj])
        # end for

        # Shuffle projects list once again as some projects may be removed due to no data
        projs_shuffle = cls.shuffle_data(sorted(list(proj_2_cksums.keys())),
                                         seed)

        # Split the data by project with roughly 8:1:1 ratio
        num_data = len(data_list)

        # First take test set until >= 10% data
        test_proj_list = list()
        test_data_list = list()
        while len(test_data_list) < 0.1 * num_data:
            proj = projs_shuffle.pop()
            test_proj_list.append(proj)
            test_data_list += [
                d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]
            ]
        # end while
        test_data_list = cls.shuffle_data(test_data_list, seed)

        # Then take train set until >= 80% data
        train_proj_list = list()
        train_data_list = list()
        while len(train_data_list) < 0.8 * num_data:
            proj = projs_shuffle.pop()
            train_proj_list.append(proj)
            train_data_list += [
                d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]
            ]
        # end while
        train_data_list = cls.shuffle_data(train_data_list, seed)

        # Remaining are assigned to val
        val_proj_list = projs_shuffle
        val_data_list = list()
        for proj in val_proj_list:
            val_data_list += [
                d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]
            ]
        # end for
        val_data_list = cls.shuffle_data(val_data_list, seed)

        # Remove from the test set any data that also appeared in train/val
        seen_data_in_train_val = set()
        for data in train_data_list + val_data_list:
            key = hash((tuple(data["l"]), tuple(data["r"]),
                        tuple([
                            tuple(data[f"pa{pa_i+1}"])
                            for pa_i in range(Macros.MAX_PA_IN_MODEL)
                        ])))
            seen_data_in_train_val.add(key)
        # end for

        test_duplicate_indexes = list()
        for i, data in enumerate(test_data_list):
            key = hash((tuple(data["l"]), tuple(data["r"]),
                        tuple([
                            tuple(data[f"pa{pa_i+1}"])
                            for pa_i in range(Macros.MAX_PA_IN_MODEL)
                        ])))
            if key in seen_data_in_train_val:
                test_duplicate_indexes.append(i)
            # end if
        # end for
        for i in reversed(test_duplicate_indexes):
            del test_data_list[i]
        # end for

        # Sub-tokenize; this is after recording duplicates, as we detect duplicates on token level
        for data in train_data_list + val_data_list + test_data_list:
            cls.sub_tokenize_data(data, use_new_sub_tokenizer)
        # end for

        # Collect statistics
        statistics = {
            "num-data": len(data_list),
            "num-data-train": len(train_data_list),
            "num-data-val": len(val_data_list),
            "num-data-test": len(test_data_list),
            "num-proj": len(train_proj_list) + len(val_proj_list) + len(test_proj_list),
            "num-proj-train": len(train_proj_list),
            "num-proj-val": len(val_proj_list),
            "num-proj-test": len(test_proj_list),
            "num-test-duplicate": len(test_duplicate_indexes),
        }

        # Save dataset after splitting; the tokens for each field of data are joined to a single string separated by space
        IOUtils.mk_dir(output_dir)
        cls.dump_data_list(output_dir / "train.json", train_data_list)
        cls.dump_data_list(output_dir / "val.json", val_data_list)
        cls.dump_data_list(output_dir / "test.json", test_data_list)
        IOUtils.dump(output_dir / "train-proj-list.json", train_proj_list,
                     IOUtils.Format.jsonPretty)
        IOUtils.dump(output_dir / "val-proj-list.json", val_proj_list,
                     IOUtils.Format.jsonPretty)
        IOUtils.dump(output_dir / "test-proj-list.json", test_proj_list,
                     IOUtils.Format.jsonPretty)
        IOUtils.dump(output_dir / "statistics.json", statistics,
                     IOUtils.Format.jsonNoSort)
        return
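Taking whole projects until a size threshold is reached is a simple greedy bucketing, which guarantees no project spans two splits. A minimal sketch with made-up project names and sizes:

projects = {"projA": 50, "projB": 30, "projC": 15, "projD": 5}  # project -> #examples
order = ["projD", "projC", "projA", "projB"]  # hypothetical shuffled order
num_data = sum(projects.values())

test_projs, taken = [], 0
while taken < 0.1 * num_data:  # keep taking whole projects until >= 10% of data
    proj = order.pop()
    test_projs.append(proj)
    taken += projects[proj]
# test_projs == ["projB"]; the remaining projects are split the same way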
Example n. 23
    def make_table_draft_model_results(
        self,
        results_path: Path,
        output_name: str,
    ):
        special_tables_dir = self.tables_dir / "draft-model-results"
        IOUtils.mk_dir(special_tables_dir)
        file = latex.File(special_tables_dir / f"{output_name}.tex")

        # Header
        file.append(r"\begin{table*}")
        file.append(r"\begin{small}")
        file.append(r"\begin{center}")
        file.append(r"\caption{Model Results (Draft) from " +
                    str(results_path).replace("_", r"\_") + "}")

        metrics = None
        for tvt in [
                Macros.lat_lat, Macros.evo_lat, Macros.lat_evo, Macros.evo_evo
        ]:
            results = IOUtils.load(results_path / tvt / "test_results.json")

            # Flatten Rouge scores
            if "Rouge" in results:
                if results["Rouge"] == 0:
                    results["Rouge1-F1"] = 0
                    results["Rouge2-F1"] = 0
                    results["RougeL-F1"] = 0
                else:
                    results["Rouge1-F1"] = results["Rouge"]["rouge-1"]["f"]
                    results["Rouge2-F1"] = results["Rouge"]["rouge-2"]["f"]
                    results["RougeL-F1"] = results["Rouge"]["rouge-l"]["f"]
                # end if
                del results["Rouge"]
            # end if

            if metrics is None:
                metrics = list(sorted(results.keys()))

                # Table header line
                file.append(r"\begin{tabular}{l | " + "r" * len(metrics) + "}")
                file.append(r"\toprule")
                file.append("Training-Testing & " + " & ".join(metrics) +
                            r"\\")
                file.append(r"\midrule")
            # end if

            file.append(tvt)
            for m in metrics:
                file.append(f"& {results[m]:.2f}")
            # end for
            file.append(r"\\")
        # end for

        # Footer
        file.append(r"\bottomrule")
        file.append(r"\end{tabular}")
        file.append(r"\end{center}")
        file.append(r"\end{small}")
        file.append(r"\end{table*}")

        file.save()
        return
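The Rouge flattening turns the nested per-variant scores into scalar columns so they can be sorted and printed like the other metrics. A minimal sketch of that transformation, with made-up numbers:

results = {"BLEU": 12.3,
           "Rouge": {"rouge-1": {"f": 0.41}, "rouge-2": {"f": 0.20}, "rouge-l": {"f": 0.39}}}

if "Rouge" in results:
    rouge = results.pop("Rouge")
    if rouge == 0:  # some runs store 0 instead of a dict of scores
        results["Rouge1-F1"] = results["Rouge2-F1"] = results["RougeL-F1"] = 0
    else:
        results["Rouge1-F1"] = rouge["rouge-1"]["f"]
        results["Rouge2-F1"] = rouge["rouge-2"]["f"]
        results["RougeL-F1"] = rouge["rouge-l"]["f"]
# results now maps every metric name to a scalar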
Example n. 24
    def submit_script(cls,
            cluster: str,
            name: str,
            log_path: Path,
            script: str,
            queue: str = None,
            timeout: str = None,
            require_conda: bool = True,
            conda_env: str = None,
            modules: List[str] = None,
    ) -> int:
        # Get default values
        if modules is None:
            modules = TACCRunnerConsts.modules[cluster]
        # end if
        if queue is None:
            queue = TACCRunnerConsts.queue[cluster]
        # end if
        if timeout is None:
            timeout = TACCRunnerConsts.timeout[cluster]
        # end if
        if conda_env is None:
            conda_env = TACCRunnerConsts.conda_env[cluster]
        # end if

        # Prepare submit script
        IOUtils.mk_dir(log_path)

        s = f"""#!/bin/bash
#SBATCH -J {name}               # Job name
#SBATCH -o {log_path}/%j.stdout # Name of stdout output file (%j expands to jobId)
#SBATCH -e {log_path}/%j.stderr # Name of stderr output file (%j expands to jobId)
#SBATCH -p {queue}              # Queue name
#SBATCH -N 1                    # Total number of nodes requested
#SBATCH -n 1                    # Total number of mpi tasks requested
#SBATCH -t {timeout}            # Max run time (hh:mm:ss)
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=ALL
# The next line is required if the user has more than one project
#SBATCH -A {TACCRunnerConsts.allocation}      # Allocation name to charge job against

module reset
module unload python2
"""
        for m in modules:
            s += f"module load {m}\n"
        # end for
        s += f"""
module list
echo "START: $(date)"

# Launch serial code...
# Do not use ibrun or any other MPI launcher
"""

        if require_conda:
            s += f"""
unset PYTHONPATH
source {TACCRunnerConsts.conda_init_path[cluster]}
conda activate {conda_env}
"""

        s += f"""
cd {Macros.python_dir}
{script}

echo "END: $(date)"
"""

        # Submit the script
        submit_script = BashUtils.get_temp_file()
        IOUtils.dump(submit_script, s, IOUtils.Format.txt)
        receipt = BashUtils.run(f"sbatch {submit_script}", expected_return_code=0).stdout

        # Get job id as the last number in output
        job_id = int(receipt.splitlines()[-1].split()[-1])

        # Save the script at log_path as well
        BashUtils.run(f"mv {submit_script} {log_path}/{job_id}.sh")

        return job_id
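A hypothetical call site, assuming a TACCRunner-style class that exposes this classmethod; the cluster name, job name, command, and paths are placeholders, not values from this codebase:

from pathlib import Path

job_id = TACCRunner.submit_script(
    cluster="maverick2",                        # placeholder cluster key
    name="train-model",
    log_path=Path("/tmp/tacc-logs"),
    script="python -m csevo.main train_model",  # placeholder command
    timeout="24:00:00",
)
print(f"Submitted SLURM job {job_id}; logs in /tmp/tacc-logs")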