Example #1
 def get_cur_cluster(cls) -> str:
     hostname = BashUtils.run(f"hostname").stdout.strip()
     if hostname.endswith("maverick2.tacc.utexas.edu"):
         return cls.maverick2
     elif hostname.endswith("stampede2.tacc.utexas.edu"):
         return cls.stampede2
     else:
         cls.logger.warning("Currently not on TACC")
         return cls.maverick2
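
All of the examples on this page go through a BashUtils.run helper, which executes a shell command and returns an object exposing return_code, stdout, and stderr, optionally raising when the return code differs from expected_return_code. The real implementation is not shown on this page; the following is only a minimal sketch of that assumed interface, built on subprocess.

import subprocess
from dataclasses import dataclass
from typing import Optional

@dataclass
class RunResult:
    return_code: int
    stdout: str
    stderr: str

def run(cmd: str, expected_return_code: Optional[int] = None) -> RunResult:
    # Run the command through bash so that shell features (pipes, $VARS, redirects) work.
    completed = subprocess.run(cmd, shell=True, executable="/bin/bash",
                               capture_output=True, text=True)
    result = RunResult(completed.returncode, completed.stdout, completed.stderr)
    if expected_return_code is not None and result.return_code != expected_return_code:
        raise RuntimeError(f"Command {cmd!r} returned {result.return_code}, "
                           f"expected {expected_return_code}; stderr:\n{result.stderr}")
    return result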
Example #2
 def test_propagate_env(self):
     del os.environ[self.TEST_ENV_A_KEY]
     self.assertTrue(self.TEST_ENV_A_KEY not in os.environ)
     self.assertEqual(
         self.TEST_ENV_A_VALUE,
         BashUtils.run(
             f"export {self.TEST_ENV_A_KEY}={self.TEST_ENV_A_VALUE}; echo -n ${self.TEST_ENV_A_KEY}",
             is_update_env=True).stdout)
     self.assertEqual(self.TEST_ENV_A_VALUE,
                      os.environ[self.TEST_ENV_A_KEY])
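
The is_update_env=True flag used above propagates environment changes made inside the subshell back into os.environ of the calling process. How BashUtils actually does this is not shown here; one way to achieve the same effect, sketched purely as an illustration:

import os
import subprocess

def run_and_update_env(cmd: str) -> str:
    # Hypothetical helper, not the actual BashUtils code: run the command, then dump
    # the subshell's environment (null-separated) and copy it back into os.environ.
    marker = "__ENV_DUMP_MARKER__"
    completed = subprocess.run(f"{cmd}; echo -n {marker}; env -0",
                               shell=True, executable="/bin/bash",
                               capture_output=True, text=True)
    stdout, _, env_dump = completed.stdout.partition(marker)
    for entry in env_dump.split("\0"):
        key, sep, value = entry.partition("=")
        if sep:
            os.environ[key] = value
    return stdout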
Example #3
    def require_special_repo(cls, directory: Path, branch: str):
        cls.logger.info(f"Updating {directory} to {branch} branch")
        if directory.exists():
            if not directory.is_dir() or not (directory / ".git").is_dir():
                LoggingUtils.log_and_raise(
                    cls.logger,
                    f"Path {directory} already exists but is not a proper git repository!",
                    Exception)
            # end if

            with IOUtils.cd(directory):
                BashUtils.run(f"git pull", expected_return_code=0)
            # end with
        else:
            IOUtils.mk_dir(directory)
            with IOUtils.cd(directory):
                BashUtils.run(
                    f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                    expected_return_code=0)
    def tacc_submit_jobs(cls,
                         submit_script: Path,
                         titles: List[str],
                         scripts: List[Path],
                         timeouts: List[str],
                         output_dir: Path,
                         submit_cd: int = 600,
                         max_jobs: int = 4):
        job_i = 0
        while job_i < len(scripts):
            if cls.tacc_get_num_jobs() >= max_jobs:
                cls.logger.warning(
                    f"Number of running jobs reached the limit {max_jobs}, will retry after {submit_cd} seconds at {time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.localtime(time.time()+submit_cd))}"
                )
                time.sleep(submit_cd)
                continue
            # end if

            title = titles[job_i]
            script = scripts[job_i]
            timeout = timeouts[job_i]
            cls.logger.info(f"Submitting script {script}")

            try:
                BashUtils.run(
                    f"{submit_script} \"{title}\" \"{output_dir}\" \"{script}\" \"{timeout}\"",
                    expected_return_code=0)
            except KeyboardInterrupt:
                cls.logger.warning(f"Keyboard interrupt!")
                break
            except:
                cls.logger.warning(
                    f"Failed to submit, will retry after {submit_cd} seconds at {time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.localtime(time.time()+submit_cd))}"
                )
                time.sleep(submit_cd)
                continue
            # end try

            # Submit successfully
            job_i += 1
        # end while
        return
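
A possible way to invoke tacc_submit_jobs is shown below; the runner class name, paths, and job titles are made up for illustration and do not come from the example above.

from pathlib import Path

# Hypothetical invocation: push three generated scripts through a shared sbatch
# wrapper, keeping at most 4 jobs in the queue and retrying every 10 minutes.
TACCRunner.tacc_submit_jobs(
    submit_script=Path("./submit.sh"),
    titles=[f"trial-{i}" for i in range(3)],
    scripts=[Path(f"./scripts/trial-{i}.sh") for i in range(3)],
    timeouts=["24:00:00"] * 3,
    output_dir=Path("./logs"),
    submit_cd=600,
    max_jobs=4,
)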
Example #5
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        IOUtils.rm_dir(self.data_dir)
        IOUtils.mk_dir(self.data_dir)

        # build dataset used by Open-NMT
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/biLSTM* {self.data_dir}/",
                      expected_return_code=0)

        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt",
            expected_return_code=0)

        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt",
            expected_return_code=0)

        return
Example #6
    def prepare_configs_and_scripts(self, trials: List[int]):
        exp_dir = self.work_dir

        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            model_dir = trial_dir / "models"
            IOUtils.mk_dir(model_dir)
            log_dir = trial_dir / "logs"
            IOUtils.mk_dir(log_dir)
            data = str(exp_dir / "data/code2seq")
            val_data = data + ".val.c2s"
            train_log = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # Copy config file
            BashUtils.run(
                f"cp {self.base_config_file} {trial_dir}/config.yaml",
                expected_return_code=0)
            output_file = trial_dir / "output_tmp.txt"
            reference_file = trial_dir / "ref_tmp.txt"
            config_file = trial_dir / "config.yaml"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}\n" \
                           f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                           f"--pred_file {output_file} --ref_file {reference_file} "\
                           f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
                output_file = trial_dir / f"output_{test_type}.txt"
                reference_file = trial_dir / f"ref_{test_type}.txt"
                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}\n" \
                              f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                              f"--pred_file {output_file} --ref_file {reference_file} "\
                              f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
Example #7
    def process_data(self,
                     method_data_list: List[MethodData],
                     data_type: str,
                     output_dir: Path,
                     split: bool = True):
        Environment.require_collector()

        log_file = output_dir / "collector-log.txt"
        data_file = output_dir / "method-data.json"
        IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list),
                     IOUtils.Format.json)

        config = {
            "transform": True,
            "model": "BiLSTM",
            "dataType": data_type,
            "dataFile": str(data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
        }
        config_file = output_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stdout:
            self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
        # end if
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if
        # build raw dataset
        if split:
            self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
        else:
            self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)

        error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
        print(f"Number of error id is: {len(error_ids)}")
        # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
        return error_ids
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        IOUtils.mk_dir(self.code_dir.parent)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
        # end with

        # copy eval code
        BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/")
        return
Example #9
    def process(self, model: str, output_dir: Path, task: str, year: int,
                eval_setting: str):
        """
        Main entry for processors of different models.
        :param model: the model name, one of {"DeepCom", "ast-attendgru"}
        :param output_dir: the output directory (usually data/models)
        :param task: the task name, either "CG" or "MN"
        :param year: the year that the testing data should be on
        :param eval_setting: the evaluation setting, one of {"evo", "crossproj", "mixedproj"}
        """
        assert year == self.EVO_YEARS[-1]  # TODO: Only support the latest year for now
        assert task in self.TASKS.keys()

        model_data_dir = output_dir / model

        if model == "DeepCom":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "DeepCom-Preorder":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "no-split-Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "Transformer":
            from csevo.processor.TransformerProcessor import TransformerProcessor
            processor = TransformerProcessor()
            data_prefix = f"{eval_setting}-{year}"
            processor.process_data(model_data_dir, data_prefix)
            return
        elif model == "ASTAttendGRU":
            from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
            processor = ASTAttendGRUProcessor()
        elif model == "Code2Seq":
            from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
            processor = Code2SeqProcessor()
        else:
            raise ValueError(f"Illegal model {model}")
        # end if
        error_ids = None

        # Load dataset after split (from shared directory)
        shared_data_dir = output_dir / f"{task}-shared"
        self.logger.info(f"Loading dataset from {shared_data_dir}")
        data_type_2_data_list: Dict[str, List] = dict()
        data_type_2_data_list[Macros.test_common] = IOUtils.load(
            shared_data_dir / f"{year}-{Macros.test_common}.json",
            IOUtils.Format.json)
        for dt in [Macros.train, Macros.val, Macros.test_standard]:
            data_type_2_data_list[dt] = IOUtils.load(
                shared_data_dir / f"{eval_setting}-{year}-{dt}.json",
                IOUtils.Format.json)

        # Process each set
        for data_type, data_list in data_type_2_data_list.items():
            sub_dir_name = f"{eval_setting}-{year}-{data_type}"

            if data_type in [Macros.test_common, Macros.test_standard]:
                data_type_tvt = Macros.test
            else:
                data_type_tvt = data_type

            model_dt_output_dir = model_data_dir / sub_dir_name
            IOUtils.mk_dir(model_dt_output_dir)
            if model == "DeepCom":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir, "sbt")
            elif model == "DeepCom-Preorder":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir,
                                                   "Preorder")
            elif model == "Code2Seq":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir)
            elif model == "Bi-LSTM":
                processor.process_data(data_list, data_type_tvt,
                                       model_dt_output_dir)
            elif model == "no-split-Bi-LSTM":
                processor.process_data(data_list,
                                       data_type_tvt,
                                       model_dt_output_dir,
                                       split=False)
            if error_ids is not None:
                self.logger.warning(f"Error data count: {len(error_ids)}")
                IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json",
                             error_ids, IOUtils.Format.json)
        # extra step for Open-NMT data
        if model == "Bi-LSTM" or model == "no-split-Bi-LSTM":
            # build dataset used by Open-NMT
            BashUtils.run(
                f"onmt_preprocess -train_src {model_data_dir}/{eval_setting}-{year}-{Macros.train}/src-train.txt "
                f"-train_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.train}/tgt-train.txt "
                f"-valid_src {model_data_dir}/{eval_setting}-{year}-{Macros.val}/src-val.txt "
                f"-valid_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.val}/tgt-val.txt "
                f"-save_data {model_data_dir}/{eval_setting}-{year}-{Macros.train}/biLSTM --src_seq_length 200 --src_seq_"
                f"length_trunc 200",
                expected_return_code=0)

        return
Example #10
    def prepare_data(self):
        if not self.use_latest:
            for t in range(13, 18):
                exp_dir = self.work_dir / f"{t}{t+1}-train"
                self.logger.info(
                    f"Preparing the data for {t}-{t+1} at {exp_dir}")
                IOUtils.rm_dir(exp_dir)
                IOUtils.mk_dir(exp_dir)

                # Copy train data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t}-20{t+1}-train/train {exp_dir}/",
                    expected_return_code=0)

                # Copy val test data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/",
                    expected_return_code=0)
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/",
                    expected_return_code=0)

                # Copy vocab
                BashUtils.run(
                    f"cp {self.model_data_dir}/20{t}-20{t+1}-train/vocab* {exp_dir}/",
                    expected_return_code=0)
            # end for
        else:
            exp_dir = self.work_dir / "latest"
            IOUtils.rm_dir(exp_dir)
            IOUtils.mk_dir(exp_dir)
            # Copy Train data
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/train {exp_dir}/",
                expected_return_code=0)

            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/valid {exp_dir}/",
                expected_return_code=0)
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/test {exp_dir}/",
                expected_return_code=0)

            # Copy vocab
            BashUtils.run(f"cp {self.model_data_dir}/latest/vocab* {exp_dir}/",
                          expected_return_code=0)
        return
Example #11
    def prepare_configs_and_scripts(self, trials: List[int]):
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)
        if not self.use_latest:
            exps = [f"{t}{t+1}-train" for t in range(13, 18)]
            for exp in exps:
                exp_dir = self.work_dir / exp
                for trial in trials:
                    trial_dir = exp_dir / f"trial-{trial}"
                    IOUtils.mk_dir(trial_dir)

                    output_file = trial_dir / "output.txt"

                    config = copy.copy(base_config)
                    config["data_dir"] = str(exp_dir)
                    config["model_dir"] = str(trial_dir / "model")
                    config["output"] = str(output_file)

                    config_file = trial_dir / "config.json"
                    IOUtils.dump(config_file, config,
                                 IOUtils.Format.jsonPretty)

                    train_script_file = trial_dir / "train.sh"
                    train_script = f"#!/bin/bash\n" \
                                   f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                   f"conda activate {self.CONDA_ENV}\n" \
                                   f"module load cuda/10.0 cudnn/7.6.2\n" \
                                   f"cd {self.code_dir}/translate\n" \
                                   f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
                    IOUtils.dump(train_script_file, train_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {train_script_file}",
                                  expected_return_code=0)

                    test_script_file = trial_dir / "test.sh"
                    test_script = f"#!/bin/bash\n" \
                                  f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                  f"conda activate {self.CONDA_ENV}\n" \
                                  f"module load cuda/10.0 cudnn/7.6.2\n" \
                                  f"cd {self.code_dir}/translate\n" \
                                  f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"
                    IOUtils.dump(test_script_file, test_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {test_script_file}",
                                  expected_return_code=0)

                    eval_script_file = trial_dir / "val.sh"
                    eval_script = f"#!/bin/bash\n" \
                                   f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                                   f"conda activate {self.CONDA_ENV}\n" \
                                   f"module load cuda/10.0 cudnn/7.6.2\n" \
                                   f"cd {self.code_dir}/translate\n" \
                                   f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
                    IOUtils.dump(eval_script_file, eval_script,
                                 IOUtils.Format.txt)
                    BashUtils.run(f"chmod +x {eval_script_file}",
                                  expected_return_code=0)
                # end for
            # end for
        else:
            exp_dir = self.work_dir / "latest"
            for trial in trials:
                trial_dir = exp_dir / f"trial-{trial}"
                IOUtils.mk_dir(trial_dir)

                output_file = trial_dir / "output.txt"

                config = copy.copy(base_config)
                config["data_dir"] = str(exp_dir)
                config["model_dir"] = str(trial_dir / "model")
                config["output"] = str(output_file)

                config_file = trial_dir / "config.json"
                IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

                train_script_file = trial_dir / "train.sh"
                train_script = f"#!/bin/bash\n" \
                               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                               f"conda activate {self.CONDA_ENV}\n" \
                               f"module load cuda/10.0 cudnn/7.6.2\n" \
                               f"cd {self.code_dir}/translate\n" \
                               f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
                IOUtils.dump(train_script_file, train_script,
                             IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {train_script_file}",
                              expected_return_code=0)

                test_script_file = trial_dir / "test.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

                eval_script_file = trial_dir / "val.sh"
                eval_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
                IOUtils.dump(eval_script_file, eval_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {eval_script_file}",
                              expected_return_code=0)
        return
Example #12
    def submit_script(cls,
            cluster: str,
            name: str,
            log_path: Path,
            script: str,
            queue: str = None,
            timeout: str = None,
            require_conda: bool = True,
            conda_env: str = None,
            modules: List[str] = None,
    ) -> int:
        # Get default values
        if modules is None:
            modules = TACCRunnerConsts.modules[cluster]
        # end if
        if queue is None:
            queue = TACCRunnerConsts.queue[cluster]
        # end if
        if timeout is None:
            timeout = TACCRunnerConsts.timeout[cluster]
        # end if
        if conda_env is None:
            conda_env = TACCRunnerConsts.conda_env[cluster]
        # end if

        # Prepare submit script
        IOUtils.mk_dir(log_path)

        s = f"""#!/bin/bash
#SBATCH -J {name}               # Job name
#SBATCH -o {log_path}/%j.stdout # Name of stdout output file(%j expands to jobId)
#SBATCH -e {log_path}/%j.stderr # Name of stderr output file(%j expands to jobId)
#SBATCH -p {queue}              # Queue name
#SBATCH -N 1                    # Total number of nodes requested
#SBATCH -n 1                    # Total number of mpi tasks requested
#SBATCH -t {timeout}            # Max run time (hh:mm:ss)
#SBATCH [email protected]
#SBATCH --mail-type=ALL
# The next line is required if the user has more than one project
#SBATCH -A {TACCRunnerConsts.allocation}      # Allocation name to charge job against

module reset
module unload python2
"""
        for m in modules:
            s += f"module load {m}\n"
        # end for
        s += f"""
module list
echo "START: $(date)"

# Launch serial code...
# Do not use ibrun or any other MPI launcher
"""

        if require_conda:
            s += f"""
unset PYTHONPATH
source {TACCRunnerConsts.conda_init_path[cluster]}
conda activate {conda_env}
"""

        s += f"""
cd {Macros.python_dir}
{script}

echo "END: $(date)"
"""

        # Submit the script
        submit_script = BashUtils.get_temp_file()
        IOUtils.dump(submit_script, s, IOUtils.Format.txt)
        receipt = BashUtils.run(f"sbatch {submit_script}", expected_return_code=0).stdout

        # Get job id as the last number in output
        job_id = int(receipt.splitlines()[-1].split()[-1])

        # Save the script at log_path as well
        BashUtils.run(f"mv {submit_script} {log_path}/{job_id}.sh")

        return job_id
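
A hypothetical call to submit_script follows; the cluster name, paths, and command are made up for illustration. The returned job id is the number parsed from the sbatch receipt above.

from pathlib import Path

# Hypothetical usage: submit one training command to the maverick2 queue and
# keep the generated sbatch script next to the job's stdout/stderr files.
job_id = TACCRunner.submit_script(
    cluster="maverick2",
    name="csevo-train",
    log_path=Path("./logs/train"),
    script="python -m csevo.main --train",
    timeout="24:00:00",
)
print(f"Submitted job {job_id}")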
Example #13
    def collect_lemmas_doc(
        cls,
        doc: CoqDocument,
        ast_sexp_list: List[SexpNode],
        serapi_options: str,
    ) -> List[Lemma]:
        lemmas_doc: List[Lemma] = list()
        data_index = doc.get_data_index()

        # Maintain a stack of module
        modules: List[str] = list()

        # Prepare qualified name prefix
        qprefix_this_doc = "./" + doc.file_name[:-2]  # Remove .v
        for m in cls.RE_PATH_TO_QUALIFIED_PREFIX.finditer(serapi_options):
            path = m.group("path")
            if path != ".": path = "./" + path
            qprefix = m.group("qprefix")

            if qprefix_this_doc.startswith(path):
                qprefix_this_doc = qprefix + qprefix_this_doc[len(path):]
                break
            # end if
        # end for
        if qprefix_this_doc.startswith("./"):
            qprefix_this_doc = qprefix_this_doc[len("./"):]
        qprefix_this_doc = qprefix_this_doc.replace("/", ".")

        for sent_i, sent in enumerate(doc.sentences):
            ast_sexp = ast_sexp_list[sent_i]
            vernac = SexpAnalyzer.analyze_vernac(ast_sexp)

            if vernac.vernac_type in cls.VTYPES_MODULE_BEG:
                # (VernacExpr()(VernacDefineModule()  (  (   v   ( Id <module name>)) ...
                #  0         1 2 20               21  22 220  2201    22011
                module_name = vernac.vernac_sexp[2][2][0][1][
                    1].content_no_quote
                modules.append(module_name)
            elif vernac.vernac_type in cls.VTYPES_MODULE_END:
                # (VernacExpr()(VernacEndSegment  (  (   v   ( Id <module name>)) ...
                #  0         1 2 20               21 210  2101    21011
                try:
                    module_name = vernac.vernac_sexp[2][1][0][1][
                        1].content_no_quote
                except:
                    print(vernac.vernac_sexp.pretty_format())
                    raise
                # end try
                if len(modules) > 0 and module_name == modules[-1]:
                    # EndModule and EndSection share the same vernac type
                    modules.pop()
            elif vernac.vernac_type in cls.VTYPES_LEMMA:
                # (VernacExpr()(VernacStartTheoremProof Lemma ( ( ( ( ( v (       Id <lemma name>))
                #  0         1 2 20                     21   22   2200000 2200001    22000011
                lemma = Lemma()
                lemma.data_index = data_index

                lemma.name = vernac.vernac_sexp[2][2][0][0][0][0][1][
                    1].content_no_quote
                lemma.qname = qprefix_this_doc + "." + ".".join(modules +
                                                                [lemma.name])

                # Find lemma content, after the first token matching the lemma name
                tok_i = 0
                for tok in sent.tokens:
                    if tok.content == lemma.name: break
                    tok_i += 1
                # end for
                if tok_i == len(sent.tokens):
                    LoggingUtils.log_and_raise(
                        cls.logger,
                        f"Lemma name {lemma.name} didn't appear in the source code {sent.str_with_space()}",
                        Exception)

                lemma.vernac_command = sent.tokens[:tok_i]
                lemma.statement = sent.tokens[tok_i + 1:]
                lemma.ast_sexp = vernac.vernac_sexp

                lemmas_doc.append(lemma)
            # end if
        # end for

        # Use sername to get the backend representations
        lemma_qnames: str = "".join([l.qname + "\n" for l in lemmas_doc])
        lemma_qnames_file = BashUtils.get_temp_file()
        IOUtils.dump(lemma_qnames_file, lemma_qnames, IOUtils.Format.txt)

        lemma_qnames_backend_sexps_str: str = BashUtils.run(
            f"sername {serapi_options} --require-lib={qprefix_this_doc} {lemma_qnames_file}",
            expected_return_code=0).stdout
        IOUtils.rm(lemma_qnames_file)
        for qname_backend_sexp_str in lemma_qnames_backend_sexps_str.splitlines():
            qname, backend_sexp_str = qname_backend_sexp_str.split(":", 1)
            backend_sexp = SexpParser.parse(backend_sexp_str)

            for lemma in lemmas_doc:
                if lemma.qname == qname:
                    lemma.backend_sexp = backend_sexp
                    break
                # end if
            # end for
        # end for

        lemmas_doc = [l for l in lemmas_doc if l.backend_sexp is not None]
        return lemmas_doc
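
RE_PATH_TO_QUALIFIED_PREFIX itself is not shown in this example. Assuming serapi_options maps physical paths to logical prefixes in the usual "-R path,prefix" / "-Q path,prefix" form, a regex along the following lines would provide the path and qprefix groups used above; this is a guess at its shape, not the project's actual pattern.

import re

# Hypothetical stand-in for cls.RE_PATH_TO_QUALIFIED_PREFIX: capture "path,qprefix"
# pairs from "-R path,qprefix" or "-Q path,qprefix" options in a SerAPI option string.
RE_PATH_TO_QUALIFIED_PREFIX = re.compile(r"-[RQ]\s+(?P<path>[^,\s]+),(?P<qprefix>\S+)")

matches = list(RE_PATH_TO_QUALIFIED_PREFIX.finditer("-R theories,MyProj -Q src,MyProj.Src"))
assert matches[0].group("path") == "theories" and matches[0].group("qprefix") == "MyProj"
assert matches[1].group("path") == "src" and matches[1].group("qprefix") == "MyProj.Src"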
Example #14
 def test_inherit_env(self):
     os.environ[self.TEST_ENV_A_KEY] = self.TEST_ENV_A_VALUE
     self.assertEqual(
         self.TEST_ENV_A_VALUE,
         BashUtils.run(f"echo -n ${self.TEST_ENV_A_KEY}").stdout)
     return
Example #15
    def collect_coq_documents_project(
        cls,
        data_mgr: FilesManager,
        project: Project,
        names_projects: Dict[str, Project],
        files: List[str] = None,
        is_verifying_tokenizer: bool = False,
    ) -> List[CoqDocument]:
        coq_documents: List[CoqDocument] = list()

        # Clone and checkout repo
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)

        # Build the project
        cls.install_coq_project(project, names_projects)

        # For each file, parse code to tokens
        with IOUtils.cd(project.checkout_dir):
            coq_files: List[str] = BashUtils.run(
                f"find -name '*.v' -type f").stdout.split("\n")[:-1]
            if files is not None:
                coq_files = [f for f in coq_files
                             if f[2:] in files]  # [2:] is to remove the ./
            # end if
            re_ignore_path = re.compile(
                project.data["ignore_path_regex"]
            ) if "ignore_path_regex" in project.data else None
            for i, coq_file in enumerate(coq_files):
                try:
                    coq_file = coq_file[2:]
                    cls.logger.debug(
                        f"File {i + 1}/{len(coq_files)}: {coq_file}")

                    # Check if file is ignored
                    if re_ignore_path is not None and re_ignore_path.fullmatch(
                            coq_file):
                        cls.logger.info(f"Ignoring file {coq_file}")
                        continue
                    # end if

                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Save original file to original_files
                    data_mgr.dump_data([
                        FilesManager.ORIGINAL_FILES, project.full_name,
                        coq_file
                    ], source_code, IOUtils.Format.txt)

                    # Call SerAPI
                    serapi_options = project.data.get("serapi_options", "")
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Save ast sexp to dataset (.ast.sexp)
                    data_mgr.dump_data([
                        FilesManager.RAW_FILES, project.full_name,
                        coq_file[:-2] + ".ast.sexp"
                    ], ast_sexp_str, IOUtils.Format.txt)

                    # Save tok sexp to dataset (.tok.sexp)
                    data_mgr.dump_data([
                        FilesManager.RAW_FILES, project.full_name,
                        coq_file[:-2] + ".tok.sexp"
                    ], tok_sexp_str, IOUtils.Format.txt)

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Verify the tokenizer if requested
                    if is_verifying_tokenizer:
                        if not cls.verify_tokenizer(tok_sexp_list, source_code,
                                                    unicode_offsets):
                            LoggingUtils.log_and_raise(
                                cls.logger,
                                "Tokenized content doesn't match original file!",
                                Exception)
                        # end if
                    # end if

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Save the parsed document (printed format) to raw_files
                    data_mgr.dump_data(
                        [FilesManager.RAW_FILES, project.full_name, coq_file],
                        coq_document.str_with_space(), IOUtils.Format.txt)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project.full_name
                    coq_document.revision = project.revision

                    coq_documents.append(coq_document)
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for
        # end with

        return coq_documents
Example #16
    def install_coq_project(cls, project: Project,
                            names_projects: Dict[str, Project]) -> None:
        """
        :requires: the project is cloned and checked-out to the desired version.
        """
        if not project.is_cloned:
            project.clone()
            project.checkout(project.data["sha"], is_forced=True)
        # end if

        # Check if the project is already compiled
        confirmation_file = "lpc-installed.txt"
        confirmation_content = project.revision + " " + BashUtils.run(
            "opam list coq -s", expected_return_code=0).stdout.strip()
        if (project.checkout_dir /
                confirmation_file).is_file() and IOUtils.load(
                    project.checkout_dir / confirmation_file,
                    "txt") == confirmation_content:
            cls.logger.debug(f"Project {project.full_name} already installed")
            return
        # end if

        project.clean()

        # Install dependencies
        for dependency in project.data.get("dependencies", []):
            dependency_project = names_projects.get(dependency)
            if dependency_project is None:
                raise Exception(f"Cannot find dependency {dependency}")
            cls.logger.info(
                f"For Project {project.full_name}, installing dependency {dependency}"
            )
            cls.install_coq_project(dependency_project, names_projects)
        # end for

        if "build_cmd" not in project.data:
            raise Exception(
                f"Project {project.full_name} does not have build_cmd")
        if "install_cmd" not in project.data:
            raise Exception(
                f"Project {project.full_name} does not have install_cmd")

        with IOUtils.cd(project.checkout_dir):
            # Build
            cls.logger.info(
                f"Project {project.full_name}: Building with {project.data['build_cmd']}"
            )
            r = BashUtils.run(project.data["build_cmd"])
            if r.return_code != 0:
                raise Exception(
                    f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}"
                )
            else:
                cls.logger.debug(
                    f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}"
                )
            # end if

            # Install
            cls.logger.info(
                f"Project {project.full_name}: Installing with {project.data['install_cmd']}"
            )
            r = BashUtils.run(project.data["install_cmd"])
            if r.return_code != 0:
                raise Exception(
                    f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}"
                )
            else:
                cls.logger.debug(
                    f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}"
                )
            # end if

            IOUtils.dump(project.checkout_dir / confirmation_file,
                         confirmation_content, "txt")
        # end with
        return
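
install_coq_project and collect_coq_documents_project read several keys from project.data; the entry below is a hypothetical illustration of that shape (only the keys appear in the examples, the values are made up).

# Hypothetical project.data entry consumed by the two functions above.
project_data = {
    "sha": "0123456789abcdef0123456789abcdef01234567",  # revision to check out
    "serapi_options": "-R theories,MyProj",              # passed to sercomp/sertok/sername
    "ignore_path_regex": r"tests/.*",                    # files skipped during collection
    "dependencies": ["my-coq-dependency"],               # other projects to install first
    "build_cmd": "make -j4",
    "install_cmd": "make install",
}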
Example #17
 def get_num_running_jobs(cls) -> int:
     # squeue prints a header line, hence the trailing "- 1"
     return int(BashUtils.run(f"squeue -u {TACCRunnerConsts.user} | wc -l", expected_return_code=0).stdout) - 1

 def tacc_get_num_jobs(cls) -> int:
     # Same idea, but for the current $USER and without checking the return code
     return int(
         BashUtils.run(f"squeue -u {os.getenv('USER')} | wc -l").stdout) - 1
Example #19
    def collect_project(self, project_name: str, project_url: str):
        Environment.require_collector()

        # 0. Download repo
        downloads_dir = self.repos_downloads_dir / project_name
        results_dir = self.repos_results_dir / project_name

        # Remove previous results if any
        IOUtils.rm_dir(results_dir)
        IOUtils.mk_dir(results_dir)

        # Clone the repo if not exists
        if not downloads_dir.exists():
            with IOUtils.cd(self.repos_downloads_dir):
                with TimeUtils.time_limit(300):
                    BashUtils.run(f"git clone {project_url} {project_name}",
                                  expected_return_code=0)
                # end with
            # end with
        # end if

        project_data = ProjectData.create()
        project_data.name = project_name
        project_data.url = project_url

        # 1. Get list of revisions
        with IOUtils.cd(downloads_dir):
            git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'",
                                        expected_return_code=0).stdout
            for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
                shas = line.split()
                project_data.revisions.append(shas[0])
                project_data.parent_revisions[shas[0]] = shas[1:]
            # end for
        # end with

        # 2. Get revisions in different year
        with IOUtils.cd(downloads_dir):
            for year in self.YEARS:
                git_log_out = BashUtils.run(
                    f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                    expected_return_code=0).stdout
                project_data.year_revisions[str(year) +
                                            "_Jan_1"] = git_log_out.rstrip()
            # end for
        # end with

        project_data_file = results_dir / "project.json"
        IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data),
                     IOUtils.Format.jsonPretty)

        # 3. Start the Java collector
        # Prepare config
        log_file = results_dir / "collector-log.txt"
        output_dir = results_dir / "collector"

        config = {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year":
            True  # To indicate whether to collect all evo data or yearly data
        }
        config_file = results_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if

        # 4. In some cases, save the collected data to an appropriate location or database
        # TODO private info
        # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
        if BashUtils.run(
                f"hostname").stdout.strip() == "luzhou" and BashUtils.run(
                    f"echo $USER").stdout.strip() == "pynie":
            alter_results_dir = Path(
                "/home/disk2/pynie/csevo-results") / project_name
            IOUtils.rm_dir(alter_results_dir)
            IOUtils.mk_dir(alter_results_dir.parent)
            BashUtils.run(f"mv {results_dir} {alter_results_dir}")
            self.logger.info(f"Results moved to {alter_results_dir}")
        # end if

        # -1. Remove repo
        IOUtils.rm_dir(downloads_dir)
        return
Example #20
    def process_data(self, method_data_list, data_type, output_dir,
                     traversal) -> List[int]:
        self.logger.info(f"Start processing")

        # Use DeepCom's required names
        data_type = {
            Macros.train: "train",
            Macros.val: "valid",
            Macros.test: "test",
            "debug": "debug",
        }[data_type]

        # Initialize vocab, error_ids (shared between processes)
        manager = multiprocessing.Manager()
        code_vocab = manager.dict()
        nl_vocab = manager.dict()
        sbt_vocab = manager.dict()
        vocabs_lock = manager.Lock()
        error_ids = manager.list()
        error_ids_lock = manager.Lock()

        # Multi-processing, split the tasks evenly
        tasks_each_process = len(
            method_data_list) // Macros.multi_processing + 1
        processes = list()
        for pid in range(Macros.multi_processing):
            beg = pid * tasks_each_process
            method_data_list_p = method_data_list[beg:beg + tasks_each_process]
            output_dir_p = output_dir / str(pid)
            IOUtils.mk_dir(output_dir_p)
            process = multiprocessing.Process(
                target=self.process_data_mp,
                args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                      code_vocab, nl_vocab, sbt_vocab, vocabs_lock, error_ids,
                      error_ids_lock, traversal))
            process.start()
            processes.append(process)
        # end for

        for process in processes:
            process.join()
        # end for

        # Merge results
        code_file_name = data_type + ".token.code"
        nl_file_name = data_type + ".token.nl"
        sbt_file_name = data_type + ".token.sbt"
        data_type_output_dir = output_dir / data_type
        IOUtils.mk_dir(data_type_output_dir)
        for pid in range(Macros.multi_processing):
            for fname in [code_file_name, nl_file_name, sbt_file_name]:
                BashUtils.run(
                    f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}"
                )
            # end for
            IOUtils.rm_dir(output_dir / str(pid))
        # end for
        error_ids.sort()

        # Build vocab
        if data_type == "train":
            code_vocab_file = output_dir / "vocab.code"
            nl_vocab_file = output_dir / "vocab.nl"
            sbt_vocab_file = output_dir / "vocab.sbt"
            fcv = open(code_vocab_file, "w+")
            fnv = open(nl_vocab_file, "w+")
            fsv = open(sbt_vocab_file, "w+")
            # write vocab to files
            special_tokens = [
                '<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>',
                '<NONE>'
            ]

            # Filter based on frequency, keep first MAX_VOCAB
            code_vocabs_list = special_tokens + list(
                code_vocab.keys())[:self.MAX_VOCAB]
            nl_vocabs_list = special_tokens + list(
                nl_vocab.keys())[:self.MAX_VOCAB]
            sbt_vocabs_list = special_tokens + list(
                sbt_vocab.keys())[:self.MAX_VOCAB]
            for v in code_vocabs_list:
                fcv.write(v + "\n")
            for v in nl_vocabs_list:
                fnv.write(v + "\n")
            for v in sbt_vocabs_list:
                fsv.write(v + "\n")
            fcv.close()
            fsv.close()
            fnv.close()
        # end if

        return list(error_ids)
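
The work split above gives each process a slice of tasks_each_process = len(method_data_list) // Macros.multi_processing + 1 items, so the last slice may be shorter than the rest. A small worked example with made-up sizes:

# With 10 methods and 4 processes: tasks_each_process = 10 // 4 + 1 = 3,
# giving slices [0:3], [3:6], [6:9], [9:12]; the last slice holds a single item.
method_data_list = list(range(10))
tasks_each_process = len(method_data_list) // 4 + 1
chunks = [method_data_list[pid * tasks_each_process:(pid + 1) * tasks_each_process]
          for pid in range(4)]
assert chunks == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]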
Example #21
 def get_git_url(cls):
     with IOUtils.cd(Macros.project_dir):
         return BashUtils.run(f"git config --get remote.origin.url",
                              expected_return_code=0).stdout.strip()
Example #22
    def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(
                f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(
                cls.logger,
                f"{output_path} already exists as a file. Aborting.",
                Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        # 2. Extract documents, tok.sexp and ast.sexp
        coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
        ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
        tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

        with IOUtils.cd(project_path):
            coq_files: List[str] = BashUtils.run(
                f"find -name '*.v' -type f").stdout.split("\n")[:-1]
            coq_files = [coq_file[2:] for coq_file in coq_files]

            if files is not None:
                coq_files = [f for f in coq_files if f in files]
            # end if

            if exclude_files is not None:
                coq_files = [f for f in coq_files if f not in exclude_files]
            # end if

            if exclude_pattern is not None:
                re_exclude_pattern = re.compile(exclude_pattern)
                coq_files = [
                    f for f in coq_files if not re_exclude_pattern.fullmatch(f)
                ]
            # end if

            for i, coq_file in enumerate(tqdm(coq_files)):
                try:
                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Call SerAPI
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project_path.name

                    coq_documents[coq_file] = coq_document
                    ast_sexp_lists[coq_file] = ast_sexp_list
                    tok_sexp_lists[coq_file] = tok_sexp_list
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for

            # 3. Extract and save lemmas and definitions
            lemmas: List[Lemma] = list()
            definitions: List[Definition] = list()

            # Increase recursion limit because the backend sexps are CRAZZZZY deep
            sys.setrecursionlimit(10000)

            for file_path, doc in tqdm(coq_documents.items()):
                ast_sexp_list = ast_sexp_lists[file_path]
                lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list,
                                                    serapi_options)
                lemmas.extend(lemmas_doc)
                definitions_doc = cls.collect_definitions_doc(
                    doc, ast_sexp_list)
                definitions.extend(definitions_doc)
            # end for

            IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                         IOUtils.Format.json)
            IOUtils.dump(output_path / "definitions.json",
                         IOUtils.jsonfy(definitions), IOUtils.Format.json)
        # end with
        return
Example #23
    def run_models_local(
            self,
            mode: str,
            models: List[str],
            exps: List[str],
            trials: List[int],
            timeout_hour: Optional[int],
            beg: int = 0,
            cnt: int = -1,
    ):
        """
        :param mode: the running mode, e.g., train
        :param models: the models to run, e.g., ["DeepCom"]
        :param exps: the experiment settings, e.g., ["evolution-evolution", ...]
        :param trials: the trial indexes, e.g., [0, 1, 2, ...]
        :param timeout_hour: the timeout of each job, in hours, e.g., 24
        """
        if mode not in [Macros.train, Macros.test_common, Macros.test_standard]:
            raise ValueError(f"mode has to be one of {Macros.train}, {Macros.test_common}, {Macros.test_standard}")
        # end if

        assert beg >= 0
        assert cnt >= -1

        # Sort the models, exps, and trials lists to ensure the traversal order is stable
        models.sort()
        exps.sort()
        trials.sort()

        user = os.getenv("USER")
        home = os.getenv("HOME")
        re_work_dir = re.compile(rf"/work/\d+/{user}/maverick2")

        # Assuming each model uses one GPU
        total_script_cnt = 0
        for model in models:
            model_work_dir = self.work_dir / model

            for exp in exps:
                for trial in trials:
                    # Only output the jobs whose indexes are in the interval [beg, beg+cnt)
                    if (total_script_cnt < beg) or (cnt > 0 and total_script_cnt >= beg + cnt):
                        total_script_cnt += 1
                        continue

                    trial_dir = model_work_dir/exp/f"trial-{trial}"

                    # Modify the script to remove TACC stuff
                    script = IOUtils.load(trial_dir/f"{mode}.sh", IOUtils.Format.txt)
                    script = script.replace("\nmodule", "\n# module")
                    script = re_work_dir.sub(home, script)

                    # Replace the paths in config files as well
                    orig_configs = dict()
                    for config_file in trial_dir.glob("config*.json"):
                        config_content = IOUtils.load(config_file, IOUtils.Format.txt)
                        orig_configs[config_file] = config_content
                        config_content = re_work_dir.sub(home, config_content)
                        IOUtils.dump(config_file, config_content, IOUtils.Format.txt)

                    # Try to execute the script
                    try:
                        self.logger.info(f"Executing: {script}")
                        fd, fname = tempfile.mkstemp(suffix=".sh")
                        IOUtils.dump(fname, script, IOUtils.Format.txt)
                        os.close(fd)
                        BashUtils.run(f"chmod +x {fname}", expected_return_code=0)
                        BashUtils.run(f"{fname} 0\n", expected_return_code=0)
                    except RuntimeError:
                        traceback.print_exc()

                    # Revert the config files
                    for config_file, config_content in orig_configs.items():
                        IOUtils.dump(config_file, config_content, IOUtils.Format.txt)

                    total_script_cnt += 1
        return
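
A hypothetical invocation of run_models_local; the runner object, model list, and experiment name are made up for illustration.

# Hypothetical usage: locally run the training scripts of two models for one
# experiment setting and a single trial, executing only the first matching job.
runner.run_models_local(
    mode=Macros.train,
    models=["DeepCom", "Code2Seq"],
    exps=["evo-2020"],
    trials=[0],
    timeout_hour=24,
    beg=0,
    cnt=1,
)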