Example 1
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/",
            expected_return_code=0)

        return

    def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
        """
        Processes a file to get its lemmas and runs the model to get predictions.
        """
        # Figure out which project we're at, and then load configs
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
        self.load_configs(prj_root)

        # Infer SerAPI options
        serapi_options = self.infer_serapi_options(prj_root)

        # If user provided compile_cmd, first compile the project
        if self.compile_cmd is not None:
            with IOUtils.cd(prj_root):
                BashUtils.run(self.compile_cmd, expected_return_code=0)

        # Parse file
        data = self.parse_file(file_path, prj_root, serapi_options)

        # Load model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Use the model to make predictions
        # Temp dirs for processed data and results
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )

        # Clean up temp dirs
        IOUtils.rm_dir(temp_data_dir)

        # Report predictions
        self.report_predictions(data, candidates_logprobs)
        return
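The temp-dir dance above (mkdtemp, dump raw data, process, eval, rm_dir) leaves the directory behind if anything in between raises. A self-contained sketch of the same lifecycle using only the standard library, with cleanup made unconditional; the placeholder payload is illustrative, not from the source:

    import json
    import shutil
    import tempfile
    from pathlib import Path

    # Create a scratch area, write raw inputs, then clean up no matter what.
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        raw_dir = temp_data_dir / "raw"
        raw_dir.mkdir()
        (raw_dir / "lemmas.json").write_text(json.dumps([]))  # placeholder payload
        # ... model-specific processing and evaluation would go here ...
    finally:
        shutil.rmtree(temp_data_dir, ignore_errors=True)

tempfile.TemporaryDirectory would give the same guarantee with less ceremony.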
Example 3
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {self.work_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {data_dir}/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {data_dir}/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {data_dir}/code2seq.{Macros.test_common}.c2s",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {data_dir}/code2seq.{Macros.test_standard}.c2s",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {data_dir}/",
            expected_return_code=0)
        return

    def download_global_model(self, force_yes: bool = False):
        """
        Downloads a global Roosterize model.
        """
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if global_model_dir.exists():
            if force_yes:
                ans = True
            else:
                ans = self.ask_for_confirmation(
                    f"A Roosterize model already exists at {global_model_dir}. "
                    f"Do you want to delete it and download again?")
            if not ans:
                return
            IOUtils.rm_dir(global_model_dir)

        self.show_message("Downloading Roosterize model...")

        # Download and unpack
        temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        urllib.request.urlretrieve(self.model_url,
                                   str(temp_model_dir / "model.tgz"))
        with IOUtils.cd(temp_model_dir):
            BashUtils.run("tar xzf model.tgz", expected_return_code=0)

            # Move the stuff to global model place
            shutil.move(str(Path.cwd() / "model"), global_model_dir)

        # Delete temp dir
        IOUtils.rm_dir(temp_model_dir)

        self.show_message("Finish downloading Roosterize model.")
Example 5
    def train(
            self,
            train_processed_data_dir: Path,
            val_processed_data_dir: Path,
            force_retrain: bool = False,
    ) -> None:
        """
        Trains the model on the training data.

        The trained model should be saved to output_dir.
        This function auto-saves a training-completed.txt as a proof of completion of training at the end.

        :param train_processed_data_dir: the directory containing the processed train data
        :param val_processed_data_dir: the directory containing the processed val data
        :param force_retrain: if set to True, re-train the model even if it was already trained (will remove previously trained model)
        """
        if force_retrain or not self.is_training_completed():
            self.logger.info(self.logging_prefix + f"Training model at {self.model_dir}; train: {train_processed_data_dir}, val: {val_processed_data_dir}")
            IOUtils.rm_dir(self.model_dir)
            IOUtils.mk_dir(self.model_dir)

            # Save spec & configs of this model
            IOUtils.dump(self.model_dir/"config-dict.json", IOUtils.jsonfy(self.config), IOUtils.Format.jsonPretty)
            IOUtils.dump(self.model_dir/"spec.json", IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)
            self.train_impl(train_processed_data_dir, val_processed_data_dir)
            IOUtils.dump(self.model_dir / self.TRAINING_COMPLETED_FILE_NAME, str(time.time_ns()), IOUtils.Format.txt)
        # end if
        return
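train() guards on is_training_completed(), which is not shown; given the marker file written after train_impl succeeds, it presumably reduces to an existence check, as in this sketch (assuming the model_dir and TRAINING_COMPLETED_FILE_NAME attributes used above):

    def is_training_completed(self) -> bool:
        # The marker file is only written once train_impl has finished.
        return (self.model_dir / self.TRAINING_COMPLETED_FILE_NAME).exists()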
Example 6
    def __init__(self, database: Database):
        self.database = database
        self.output_dir = Macros.data_dir / "split"
        IOUtils.rm_dir(self.output_dir)
        IOUtils.mk_dir(self.output_dir)

        self.statistics = dict()
        return
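The rm_dir/mk_dir pair recurs throughout these examples as an idiom for resetting a directory to an empty state. If IOUtils is unavailable, the same reset can be expressed with the standard library (a sketch; IOUtils may differ in details such as error handling):

    import shutil
    from pathlib import Path

    def reset_dir(path: Path) -> None:
        # Remove the directory if it exists, then recreate it empty.
        shutil.rmtree(path, ignore_errors=True)
        path.mkdir(parents=True, exist_ok=True)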
Example 7
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        IOUtils.mk_dir(self.code_dir.parent)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
        # end with
        return
Example 8
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        IOUtils.mk_dir(self.code_dir.parent)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
        # end with

        # Copy eval code
        BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/",
                      expected_return_code=0)
        return
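prepare_code pins the checkout to REPO_SHA so that evaluation always runs against the same revision of the repository. For reference, the same idiom with only the standard library (URL, SHA, and directory are placeholders):

    import subprocess
    from pathlib import Path

    REPO_URL = "https://example.com/some/repo.git"  # placeholder
    REPO_SHA = "deadbeef"                           # placeholder
    code_dir = Path("code")

    # Clone, then detach to the pinned revision; check=True raises on failure,
    # mirroring expected_return_code=0 above.
    subprocess.run(["git", "clone", REPO_URL, str(code_dir)], check=True)
    subprocess.run(["git", "checkout", REPO_SHA], cwd=code_dir, check=True)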
Example 9
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        IOUtils.rm_dir(self.data_dir)
        IOUtils.mk_dir(self.data_dir)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/transformer.* {self.data_dir}/",
                      expected_return_code=0)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt", expected_return_code=0)
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt", expected_return_code=0)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt", expected_return_code=0)
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt", expected_return_code=0)

        return
Example 10
    def prepare_data(self):
        if not self.use_latest:
            for t in range(13, 18):
                exp_dir = self.work_dir / f"{t}{t+1}-train"
                self.logger.info(
                    f"Preparing the data for {t}-{t+1} at {exp_dir}")
                IOUtils.rm_dir(exp_dir)
                IOUtils.mk_dir(exp_dir)

                # Copy train data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t}-20{t+1}-train/train {exp_dir}/",
                    expected_return_code=0)

                # Copy val and test data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/",
                    expected_return_code=0)
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/",
                    expected_return_code=0)

                # Copy vocab
                BashUtils.run(
                    f"cp {self.model_data_dir}/20{t}-20{t+1}-train/vocab* {exp_dir}/",
                    expected_return_code=0)
            # end for
        else:
            exp_dir = self.work_dir / "latest"
            IOUtils.rm_dir(exp_dir)
            IOUtils.mk_dir(exp_dir)
            # Copy train data
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/train {exp_dir}/",
                expected_return_code=0)

            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/valid {exp_dir}/",
                expected_return_code=0)
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/test {exp_dir}/",
                expected_return_code=0)

            # Copy vocab
            BashUtils.run(f"cp {self.model_data_dir}/latest/vocab* {exp_dir}/",
                          expected_return_code=0)
        return
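For reference, the non-latest branch builds five sliding windows over the years; the train/val/test ranges it assembles follow directly from the loop above:

    for t in range(13, 18):
        print(f"{t}{t+1}-train: train=20{t}-20{t+1}, "
              f"val=20{t+1}-20{t+2}, test=20{t+2}-20{t+3}")
    # 1314-train: train=2013-2014, val=2014-2015, test=2015-2016
    # ...
    # 1718-train: train=2017-2018, val=2018-2019, test=2019-2020
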
    def improve_project_model(self, prj_root: Optional[Path]):
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root()

        # Deactivate loaded model
        self.model = None

        # Delete existing local model
        local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
        if local_model_dir.exists():
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {local_model_dir}. "
                f"Do you want to delete it and train again?")
            if not ans:
                return
            IOUtils.rm_dir(local_model_dir)

        # Copy global model to local model, but remove "training complete" marker
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if not global_model_dir.exists():
            raise Exception(
                "Global Roosterize model not found! Please download model first."
            )
        shutil.copytree(global_model_dir, local_model_dir)

        # Load local model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Collect all lemmas in this project
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        DataMiner.extract_data_project(
            prj_root,
            files=None,
            exclude_files=self.exclude_files,
            exclude_pattern=self.exclude_pattern,
            serapi_options=self.infer_serapi_options(prj_root),
            output_path=temp_data_dir)

        # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)

        # TODO: Train model

        # Delete temp dir
        IOUtils.rm_dir(temp_data_dir)
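The first TODO above calls for splitting the mined lemmas into train and val sets before retraining. A minimal sketch of such a split (the 90/10 ratio, the seed, and the helper name are assumptions, not from the source):

    import random

    def split_train_val(items: list, val_ratio: float = 0.1, seed: int = 42):
        # Shuffle deterministically, then carve off the tail as validation.
        items = list(items)
        random.Random(seed).shuffle(items)
        n_val = max(1, int(len(items) * val_ratio))
        return items[n_val:], items[:n_val]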
Example 12
    def process_data(
            self,
            data_dir: Path,
            output_processed_data_dir: Path,
            is_train: bool = False,
    ) -> None:
        """
        Processes the data to the intermediate format.
        """
        self.logger.info(self.logging_prefix + f"Processing data from {data_dir} to {output_processed_data_dir}")
        IOUtils.rm_dir(output_processed_data_dir)
        IOUtils.mk_dir(output_processed_data_dir)

        if is_train:
            # Preprocess with training data, if needed
            self.preprocess_with_train_data(data_dir, output_processed_data_dir)
        # end if

        self.process_data_impl(data_dir, output_processed_data_dir)
        return
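Putting process_data and train together: the raw train data is processed with is_train=True so that train-only preprocessing (such as vocab building) runs, the val data is processed without it, and training then consumes the two processed directories. A usage sketch, where model and the directory names stand for whatever the caller has set up:

    model.process_data(raw_train_dir, processed_train_dir, is_train=True)
    model.process_data(raw_val_dir, processed_val_dir)
    model.train(processed_train_dir, processed_val_dir)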
Example 13
    def collect_project(self, project_name: str, project_url: str):
        Environment.require_collector()

        # 0. Download repo
        downloads_dir = self.repos_downloads_dir / project_name
        results_dir = self.repos_results_dir / project_name

        # Remove previous results if any
        IOUtils.rm_dir(results_dir)
        IOUtils.mk_dir(results_dir)

        # Clone the repo if not exists
        if not downloads_dir.exists():
            with IOUtils.cd(self.repos_downloads_dir):
                with TimeUtils.time_limit(300):
                    BashUtils.run(f"git clone {project_url} {project_name}",
                                  expected_return_code=0)
                # end with
            # end with
        # end if

        project_data = ProjectData.create()
        project_data.name = project_name
        project_data.url = project_url

        # 1. Get list of revisions
        with IOUtils.cd(downloads_dir):
            git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'",
                                        expected_return_code=0).stdout
            for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
                shas = line.split()
                project_data.revisions.append(shas[0])
                project_data.parent_revisions[shas[0]] = shas[1:]
            # end for
        # end with

        # 2. Get revisions in different years
        with IOUtils.cd(downloads_dir):
            for year in self.YEARS:
                git_log_out = BashUtils.run(
                    f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                    expected_return_code=0).stdout
                project_data.year_revisions[str(year) +
                                            "_Jan_1"] = git_log_out.rstrip()
            # end for
        # end with

        project_data_file = results_dir / "project.json"
        IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data),
                     IOUtils.Format.jsonPretty)

        # 3. Start the Java collector
        # Prepare config
        log_file = results_dir / "collector-log.txt"
        output_dir = results_dir / "collector"

        config = {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year":
            True  # To indicate whether to collect all evo data or yearly data
        }
        config_file = results_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if

        # 4. In some cases, save collected data to appropriate location or database
        # TODO private info
        # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
        if (BashUtils.run("hostname").stdout.strip() == "luzhou"
                and BashUtils.run("echo $USER").stdout.strip() == "pynie"):
            alter_results_dir = Path(
                "/home/disk2/pynie/csevo-results") / project_name
            IOUtils.rm_dir(alter_results_dir)
            IOUtils.mk_dir(alter_results_dir.parent)
            BashUtils.run(f"mv {results_dir} {alter_results_dir}")
            self.logger.info(f"Results moved to {alter_results_dir}")
        # end if

        # -1. Remove repo
        IOUtils.rm_dir(downloads_dir)
        return
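For reference, the `git log --pretty=format:'%H %P'` output parsed in step 1 has one line per commit: the commit hash followed by the hashes of its parents (none for a root commit, two or more for merges). A self-contained sketch of the same parse on sample output (hashes are made up):

    git_log_out = (
        "a1b2c3 d4e5f6\n"         # ordinary commit, one parent
        "d4e5f6 0a1b2c 9f8e7d\n"  # merge commit, two parents
        "9f8e7d"                  # root commit, no parents
    )
    for line in git_log_out.splitlines():
        shas = line.split()
        print("revision:", shas[0], "parents:", shas[1:])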
Example 14
    def process_data(self, method_data_list, data_type, output_dir,
                     traversal) -> List[int]:
        self.logger.info(f"Start processing")

        # Use DeepCom's required names
        data_type = {
            Macros.train: "train",
            Macros.val: "valid",
            Macros.test: "test",
            "debug": "debug",
        }[data_type]

        # Initialize vocab, error_ids (shared between processes)
        manager = multiprocessing.Manager()
        code_vocab = manager.dict()
        nl_vocab = manager.dict()
        sbt_vocab = manager.dict()
        vocabs_lock = manager.Lock()
        error_ids = manager.list()
        error_ids_lock = manager.Lock()

        # Multi-processing: split the tasks evenly
        tasks_each_process = len(method_data_list) // Macros.multi_processing + 1
        processes = list()
        for pid in range(Macros.multi_processing):
            beg = pid * tasks_each_process
            method_data_list_p = method_data_list[beg:beg + tasks_each_process]
            output_dir_p = output_dir / str(pid)
            IOUtils.mk_dir(output_dir_p)
            process = multiprocessing.Process(
                target=self.process_data_mp,
                args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                      code_vocab, nl_vocab, sbt_vocab, vocabs_lock, error_ids,
                      error_ids_lock, traversal))
            process.start()
            processes.append(process)
        # end for

        for process in processes:
            process.join()
        # end for

        # Merge results
        code_file_name = data_type + ".token.code"
        nl_file_name = data_type + ".token.nl"
        sbt_file_name = data_type + ".token.sbt"
        data_type_output_dir = output_dir / data_type
        IOUtils.mk_dir(data_type_output_dir)
        for pid in range(Macros.multi_processing):
            for fname in [code_file_name, nl_file_name, sbt_file_name]:
                BashUtils.run(
                    f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}"
                )
            # end for
            IOUtils.rm_dir(output_dir / str(pid))
        # end for
        error_ids.sort()

        # Build vocab
        if data_type == "train":
            special_tokens = [
                '<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>',
                '<NONE>'
            ]

            # Keep the first MAX_VOCAB entries of each vocab, prefixed by the
            # special tokens
            code_vocabs_list = special_tokens + list(code_vocab.keys())[:self.MAX_VOCAB]
            nl_vocabs_list = special_tokens + list(nl_vocab.keys())[:self.MAX_VOCAB]
            sbt_vocabs_list = special_tokens + list(sbt_vocab.keys())[:self.MAX_VOCAB]

            # Write vocabs to files
            with open(output_dir / "vocab.code", "w") as fcv:
                for v in code_vocabs_list:
                    fcv.write(v + "\n")
            with open(output_dir / "vocab.nl", "w") as fnv:
                for v in nl_vocabs_list:
                    fnv.write(v + "\n")
            with open(output_dir / "vocab.sbt", "w") as fsv:
                for v in sbt_vocabs_list:
                    fsv.write(v + "\n")
        # end if

        return list(error_ids)
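process_data_mp itself is not shown; given the manager dicts and locks handed to each worker, it presumably tallies tokens into the shared vocabs under vocabs_lock and records the ids of failing methods into error_ids under error_ids_lock. A sketch of the vocab update (the helper name and the token extraction are assumptions):

    def count_tokens(tokens, vocab, lock):
        # Tally each token into the shared vocab; the lock serializes
        # concurrent updates from the worker processes.
        with lock:
            for tok in tokens:
                vocab[tok] = vocab.get(tok, 0) + 1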