Esempio n. 1
0
def get_pytext_home():
    """Return the PyText home directory.

    The ``PYTEXT_HOME`` environment variable takes precedence when it is set.
    Otherwise a default is derived from the location of this file, using the
    ``tests`` folder as an anchor (it always lives in PYTEXT_HOME/tests).
    """
    # Candidate roots, probed in order: two levels up, then three levels up.
    candidate_roots = (
        os.path.realpath(os.path.join(__file__, "../../")),
        os.path.realpath(os.path.join(__file__, "../../../")),
    )
    for root in candidate_roots:
        if PathManager.exists(os.path.join(root, "tests")):
            fallback_home = root
            break
    else:
        # When PyText is used as a module and packed as part of a single
        # file X, __file__ is the path of X instead of path.py; in that case
        # PYTEXT_HOME is the parent folder of X.
        fallback_home = os.path.dirname(__file__)
    return os.environ.get("PYTEXT_HOME", fallback_home)
Esempio n. 2
0
 def __init__(
     self,
     pre_train_model_path,
     analyzed_sparsity,
     max_analysis_batches,
     max_skipped_weight,
     pre_analysis_path,
     sparsity,
 ):
     """Validate the arguments and store them on the instance.

     Args:
         pre_train_model_path: path that must exist according to
             ``PathManager.exists``.
         analyzed_sparsity: value required to lie in [0, 1].
         max_analysis_batches: stored as given.
         max_skipped_weight: stored as given.
         pre_analysis_path: stored as given.
         sparsity: value required to lie in [0, 1].

     Raises:
         AssertionError: if the model path does not exist or either
             sparsity value falls outside [0, 1].
     """
     # --- pre-trained model ---
     assert PathManager.exists(pre_train_model_path), (
         "The pre-trained model must be exist"
     )
     self.param_dict = None
     self.pre_train_model_path = pre_train_model_path

     # --- analysis settings ---
     assert 0.0 <= analyzed_sparsity <= 1.0, (
         "Analyzed sparsity need to be in the range of [0, 1]"
     )
     self.pre_analysis_path = pre_analysis_path
     self.require_mask_parameters = []
     self.max_skipped_weight = max_skipped_weight
     self.max_analysis_batches = max_analysis_batches
     self.analyzed_sparsity = analyzed_sparsity

     # --- pruning settings ---
     assert 0.0 <= sparsity <= 1.0, (
         "Pruning sparsity need to be in the range of [0, 1]"
     )
     self.analysis_state = State.OTHERS
     self._masks = None
     self.sparsity = sparsity
Esempio n. 3
0
    def process_squad_json(self, fname):
        """Yield the pieces ``_split_document`` produces for each question.

        The file is read as JSON and walked as
        ``data -> paragraphs -> qas``. Nothing is yielded when ``fname`` is
        empty/None or does not exist (a message is printed in the latter
        case).

        Every question gets a running integer id, shared by all pieces
        generated from that question.
        """
        if not fname:
            return
        if not PathManager.exists(fname):
            print(f"{fname} does not exist. Not unflattening.")
            return
        with PathManager.open(fname) as infile:
            squad = json.load(infile)

        question_idx = 0
        for article in squad["data"]:
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    answerable = not qa.get("is_impossible", False)
                    # Unanswerable questions carry "plausible_answers"
                    # instead of "answers".
                    if answerable:
                        answer_entries = qa["answers"]
                    else:
                        answer_entries = qa["plausible_answers"]
                    question_text = qa["question"]
                    texts = [entry["text"] for entry in answer_entries]
                    starts = [
                        int(entry["answer_start"]) for entry in answer_entries
                    ]
                    pieces = _split_document(
                        question_idx,
                        context,
                        question_text,
                        texts,
                        starts,
                        answerable,
                        self.ignore_impossible,
                        self.max_character_length,
                        self.min_overlap,
                    )
                    for piece in pieces:
                        yield piece
                    question_idx += 1
Esempio n. 4
0
def get_absolute_path(file_path: str) -> str:
    """Resolve ``file_path`` against ``PYTEXT_HOME`` when possible.

    An absolute ``file_path`` is returned untouched. A relative one is joined
    onto ``PYTEXT_HOME``; the joined path is returned only if it exists,
    otherwise the original ``file_path`` is returned unchanged.
    """
    if not os.path.isabs(file_path):
        candidate = os.path.realpath(os.path.join(PYTEXT_HOME, file_path))
        if PathManager.exists(candidate):
            return candidate
    return file_path
Esempio n. 5
0
    def process_file(self, fname, is_train):
        """Yield one example dict per row of the JSON file ``fname``.

        Args:
            fname: path to a JSON file holding a list of rows. Each row is
                read for "question", "positive_ctxs" and "negative_ctxs".
            is_train: when true, the negative contexts are shuffled; when
                false, the first ``self.num_negative_ctxs`` negatives are
                kept in file order so evaluation stays deterministic.

        Yields:
            Dicts with keys "question", "positive_ctx", "negative_ctxs",
            "label" and "num_negative_ctx".

        Nothing is yielded (a message is printed) when ``fname`` is
        empty/None or does not exist.
        """
        if not fname:
            print("File path is either empty or None. Not unflattening.")
            return
        if not PathManager.exists(fname):
            print(f"{fname} does not exist. Not unflattening.")
            return

        with PathManager.open(fname) as infile:
            dump = json.load(infile)

        # Code pointer: https://fburl.com/yv8osgvo
        for row in dump:
            question = row["question"]
            # Only the first positive context is used.
            positive_ctx = combine_title_text(
                row["positive_ctxs"][0], self.use_title
            )
            negative_ctxs = [
                combine_title_text(ctx, self.use_title)
                for ctx in row["negative_ctxs"]
            ]
            if is_train:
                # random.shuffle works in place and returns None, so its
                # return value must never be used as the shuffled list.
                random.shuffle(negative_ctxs)
            else:
                # Previously negative_ctxs was None here and len(None)
                # raised TypeError; keep a deterministic prefix instead.
                negative_ctxs = negative_ctxs[: self.num_negative_ctxs]

            num_negative_ctx = min(self.num_negative_ctxs, len(negative_ctxs))
            yield {
                "question": question,
                "positive_ctx": positive_ctx,
                "negative_ctxs": negative_ctxs,
                "label": "1",  # Make LabelTensorizer.initialize() happy.
                "num_negative_ctx": num_negative_ctx,
            }
Esempio n. 6
0
    def process_file(self, fname, is_train):
        """Yield one example dict per row returned by ``self.read_file``.

        Each yielded dict has the keys "question", "positive_ctx",
        "negative_ctxs", "label" and "num_negative_ctx". Nothing is yielded
        (a message is printed) when ``fname`` is empty/None or missing.

        Training runs shuffle the negatives and, when the question is a list,
        pick one entry at random. Other runs keep the first
        ``self.num_negative_ctxs`` negatives in their original order.
        """
        if not fname:
            print("File path is either empty or None. Not unflattening.")
            return
        if not PathManager.exists(fname):
            print(f"{fname} does not exist. Not unflattening.")
            return

        def _combine_all(contexts):
            # Apply the title/text/id combiner to every context in order.
            return [
                combine_title_text_id(context, self.use_title)
                for context in contexts
            ]

        for row in self.read_file(fname):
            question = row["question"]
            positive_ctx = combine_title_text_id(
                row["positive_ctxs"][0], self.use_title
            )

            negative_ctxs = _combine_all(row["negative_ctxs"])
            if not negative_ctxs and row.get("distant_negatives"):
                # No hard negatives: fall back to distant negatives, since
                # having at least one negative is better for training.
                negative_ctxs = _combine_all(row["distant_negatives"])

            if not is_train:
                # Evaluation and test sets stay deterministic: take the
                # leading num_negative_ctxs entries without shuffling.
                negative_ctxs = negative_ctxs[:self.num_negative_ctxs]
            else:
                random.shuffle(negative_ctxs)
                if isinstance(question, list):
                    # The training data may hold a list of paraphrased
                    # questions. One is sampled at random each time, so with
                    # enough epochs all of them should be tried.
                    picked = random.randint(0, len(question) - 1)
                    question = question[picked]

            yield {
                "question": question,
                "positive_ctx": positive_ctx,
                "negative_ctxs": negative_ctxs,
                "label": "1",  # Make LabelTensorizer.initialize() happy.
                "num_negative_ctx": min(
                    self.num_negative_ctxs, len(negative_ctxs)
                ),
            }
Esempio n. 7
0
    def process_file(self, fname, is_train):
        """Yield one example dict per line of ``fname``.

        Every line of the file is parsed as its own JSON document. Each
        yielded dict has the keys "question", "positive_ctx",
        "negative_ctxs", "label" and "num_negative_ctx". Nothing is yielded
        (a message is printed) when ``fname`` is empty/None or missing.

        Training runs shuffle the negatives; other runs keep the first
        ``self.num_negative_ctxs`` negatives in their original order.
        """
        if not fname:
            print("File path is either empty or None. Not unflattening.")
            return
        if not PathManager.exists(fname):
            print(f"{fname} does not exist. Not unflattening.")
            return

        def _combine_all(contexts):
            # Apply the title/text combiner to every context in order.
            return [
                combine_title_text(context, self.use_title)
                for context in contexts
            ]

        with PathManager.open(fname) as infile:
            # Code pointer: https://fburl.com/yv8osgvo
            for raw_line in infile:
                row = json.loads(raw_line)
                question = row["question"]
                positive_ctx = combine_title_text(
                    row["positive_ctxs"][0], self.use_title
                )

                negative_ctxs = _combine_all(row["negative_ctxs"])
                if not negative_ctxs and row.get("distant_negatives"):
                    # No hard negatives: fall back to distant negatives,
                    # since having at least one negative is better for
                    # training.
                    negative_ctxs = _combine_all(row["distant_negatives"])

                if not is_train:
                    # Evaluation and test sets stay deterministic: take the
                    # leading num_negative_ctxs entries without shuffling.
                    negative_ctxs = negative_ctxs[:self.num_negative_ctxs]
                else:
                    random.shuffle(negative_ctxs)

                yield {
                    "question": question,
                    "positive_ctx": positive_ctx,
                    "negative_ctxs": negative_ctxs,
                    "label": "1",  # Make LabelTensorizer.initialize() happy.
                    "num_negative_ctx": min(
                        self.num_negative_ctxs, len(negative_ctxs)
                    ),
                }
Esempio n. 8
0
 def __init__(
     self,
     pre_train_model_path,
     analyzed_sparsity,
     max_analysis_batches,
     max_skipped_weight,
 ):
     """Validate the arguments and store them on the instance.

     Args:
         pre_train_model_path: path that must exist according to
             ``PathManager.exists``.
         analyzed_sparsity: value required to lie in [0, 1].
         max_analysis_batches: stored as given.
         max_skipped_weight: stored as given.

     Raises:
         AssertionError: if the model path does not exist or
             ``analyzed_sparsity`` falls outside [0, 1].
     """
     # --- pre-trained model ---
     assert PathManager.exists(pre_train_model_path), (
         "The pre-trained model must be exist"
     )
     self.pre_train_model_path = pre_train_model_path

     # --- analysis settings ---
     assert 0.0 <= analyzed_sparsity <= 1.0, (
         "Analyzed sparsity need to be in the range of [0, 1]"
     )
     self.require_mask_parameters = []
     self.max_skipped_weight = max_skipped_weight
     self.max_analysis_batches = max_analysis_batches
     self.analyzed_sparsity = analyzed_sparsity
Esempio n. 9
0
    def __init__(
        self,
        pre_train_model_path,
        analyzed_sparsity,
        max_analysis_batches,
        max_skipped_weight,
        pre_analysis_path,
        sparsity,
        iterative_pruning,
        pruning_iterations,
        start_sparsity_ratio,
    ):
        """Validate the arguments and store them on the instance.

        Args:
            pre_train_model_path: path that must exist according to
                ``PathManager.exists``.
            analyzed_sparsity: value required to lie in [0, 1].
            max_analysis_batches: stored as given.
            max_skipped_weight: stored as given.
            pre_analysis_path: stored as given.
            sparsity: value required to lie in [0, 1].
            iterative_pruning: when truthy, the iterative-pruning members
                are initialised as well.
            pruning_iterations: must be greater than 1 when
                ``iterative_pruning`` is truthy; not read otherwise.
            start_sparsity_ratio: multiplied by ``sparsity`` to obtain
                ``start_sparsity``; only read when ``iterative_pruning`` is
                truthy.

        Raises:
            AssertionError: if any of the checks above fails.
        """
        # --- pre-trained model ---
        assert PathManager.exists(pre_train_model_path), (
            "The pre-trained model must be exist"
        )
        self.param_dict = None
        self.pre_train_model_path = pre_train_model_path

        # --- analysis settings ---
        assert 0.0 <= analyzed_sparsity <= 1.0, (
            "Analyzed sparsity need to be in the range of [0, 1]"
        )
        self.pre_analysis_path = pre_analysis_path
        self.require_mask_parameters = []
        self.max_skipped_weight = max_skipped_weight
        self.max_analysis_batches = max_analysis_batches
        self.analyzed_sparsity = analyzed_sparsity

        # --- pruning settings ---
        assert 0.0 <= sparsity <= 1.0, (
            "Pruning sparsity need to be in the range of [0, 1]"
        )
        self.iterative_pruning = iterative_pruning
        self.analysis_state = State.OTHERS
        self._masks = None
        self.sparsity = sparsity

        # The members below exist only when iterative pruning is enabled.
        if not self.iterative_pruning:
            return
        assert pruning_iterations > 1, (
            "iterative pruning should contains at least two pruning iterations"
        )
        self.pruning_iterations = pruning_iterations
        self.start_sparsity = start_sparsity_ratio * sparsity
        self.end_sparsity = self.sparsity
        self.sparsity_increment = 0.0
        self.epochs_per_iter = 0
Esempio n. 10
0
def get_latest_checkpoint_path(dir_path: Optional[str] = None) -> Optional[str]:
    """
    Get the latest checkpoint path.

    Args:
        dir_path: the dir to scan for existing checkpoint files. If None or
            empty, the latest checkpoint path held in memory by the
            checkpoint manager is returned instead.

    Returns:
        ``{dir_path}/checkpoint-{N}`` for the largest index N found among
        entries named ``checkpoint-<N>...``, or None when ``dir_path`` does
        not exist or contains no such entry.
    """
    if not dir_path:
        return _CHECKPOINT_MANAGER.get_latest_checkpoint_path()

    if not PathManager.exists(dir_path):
        return None

    checkpoint_indices = []
    for file_name in PathManager.ls(dir_path):
        if not file_name.startswith("checkpoint"):
            continue
        # Names are expected to look like "checkpoint-<index>". Entries
        # without a dash or with a non-numeric index (e.g. "checkpoint",
        # "checkpoint-best") are skipped instead of aborting the scan.
        try:
            checkpoint_indices.append(int(file_name.split("-")[1]))
        except (IndexError, ValueError):
            continue

    if not checkpoint_indices:
        return None

    latest_checkpoint_path = f"{dir_path}/checkpoint-{max(checkpoint_indices)}"
    logger.info("find the latest checkpoint: %s", latest_checkpoint_path)
    return latest_checkpoint_path
Esempio n. 11
0
    def save(
        self,
        config: PyTextConfig,
        model: Model,
        meta: Optional[CommonMetadata],
        tensorizers: Dict[str, Tensorizer],
        training_state: Optional[TrainingState] = None,
        identifier: str = None,
    ) -> str:
        """
        Write a checkpoint made of config, model, metadata, tensorizers and
        training_state, and return the path it was written to.

        With a truthy identifier this is a during-training checkpoint: the
        path comes from ``generate_checkpoint_path`` and is appended to
        ``self._saved_paths``. Without one it is the post-training snapshot:
        the path is ``config.save_snapshot_path`` and is recorded in
        ``self._post_training_snapshot_path``.
        """
        if not identifier:
            # No identifier given: post-training snapshot.
            target_path = config.save_snapshot_path
            print(f"Saving pytorch model to: {target_path}")
        else:
            # During-training checkpoint.
            target_path = self.generate_checkpoint_path(config, identifier)
            print("Saving checkpoint to ", target_path)

        # Make sure the destination folder exists before opening the file.
        parent_folder = os.path.dirname(target_path)
        if not PathManager.exists(parent_folder):
            PathManager.mkdirs(parent_folder)
            print(f"created {parent_folder}")

        with PathManager.open(target_path, "wb") as checkpoint_f:
            save_checkpoint(
                checkpoint_f, config, model, meta, tensorizers, training_state
            )
            # Record the path only after the payload was written.
            if not identifier:
                self._post_training_snapshot_path = target_path
            else:
                self._saved_paths.append(target_path)
        return target_path
Esempio n. 12
0
def save(
    config: PyTextConfig,
    model: Model,
    meta: Optional[CommonMetadata],
    tensorizers: Dict[str, Tensorizer],
    training_state: Optional[TrainingState] = None,
    identifier: Optional[str] = None,
) -> str:
    """
    Save all stateful information of a training task: the original config,
    the model state, the metadata, the tensorizers, and the training state
    if training is not completed.

    Args:
        config (PyTextConfig): contains all raw parameters/hyper-parameters
            for the training task
        model (Model): actual model in training
        meta (CommonMetadata): stored in the saved state as-is
        tensorizers (Dict[str, Tensorizer]): stored in the saved state as-is
        training_state (TrainingState): stateful information during training
        identifier (str): used to identify a checkpoint within a training
            job, used as a suffix for the save path

    Returns:
        saved_path (str): the path the state was saved to. If identifier is
        not specified (None or empty), this is config.save_snapshot_path, to
        be consistent with the post-training snapshot; if specified, it is
        the checkpoint path generated from config and identifier.
    """
    # Decide once whether this is a during-training checkpoint, so the path
    # selection and the manager call can never disagree (an empty-string
    # identifier used to pick the snapshot path but be saved as a checkpoint).
    is_checkpoint = bool(identifier)
    if is_checkpoint:
        # saving during-training checkpoints
        saved_path = generate_checkpoint_path(config, identifier)
    else:
        # saving post-training snapshot if no identifier given
        saved_path = config.save_snapshot_path
        print(f"Saving pytorch model to: {saved_path}")

    saved_folder = os.path.dirname(saved_path)
    if not PathManager.exists(saved_folder):
        PathManager.mkdirs(saved_folder)
        print(f"created {saved_folder}")

    # Currently torch.save() has error pickling certain models when not saving
    # by model.state_dict(), thus currently overriding the model in
    # training_state with None, and put back after saving
    # https://github.com/pytorch/pytorch/issues/15116
    model_in_training_state = None
    if training_state:
        model_in_training_state, training_state.model = training_state.model, None
    try:
        state = {
            DATA_STATE: meta,
            CONFIG_JSON: config_to_json(PyTextConfig, config),
            MODEL_STATE: model.state_dict(),
            SERIALIZE_VERSION_KEY: LATEST_SERIALIZE_VERSION,
            TENSORIZERS: tensorizers,
            TRAINING_STATE: training_state,
        }
        if is_checkpoint:
            _CHECKPOINT_MANAGER.save_checkpoint(state, saved_path)
        else:
            _CHECKPOINT_MANAGER.save_snapshot(state, saved_path)
    finally:
        # Always restore the model reference, even if saving failed.
        if training_state:
            training_state.model = model_in_training_state
    return saved_path