Beispiel #1
0
    def _build(self):
        """Fetch the CMU MovieSummaries archive into this dataset's data directory."""
        target_dir = self.data_dir()
        # Create the destination (including parents) if it isn't there yet
        target_dir.mkdir(exist_ok=True, parents=True)

        download_archive(
            "http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz",
            target_dir)
Beispiel #2
0
 def _build(self):
     """Download pre-trained fastText weights if configured and not yet present.

     On any download failure the weights directory is removed so that a
     later run never mistakes a partial download for a complete one.
     """
     # Nothing to do when no model is configured or weights already exist
     if self.fasttext_model is None or self.weights_dir.exists():
         return

     self.weights_dir.mkdir(parents=True)
     try:
         self.logger.info("Downloading pre-trained weights.")
         download_archive(FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model],
                          self.weights_dir)
         self.logger.info("Weights downloaded.")
     except Exception:
         # Don't leave the weights directory in a partially downloaded state
         if self.weights_dir.exists():
             shutil.rmtree(self.weights_dir)
         raise
Beispiel #3
0
    def _build(self):
        """Download fastText weights when needed, then build the custom docker image.

        Weights are extracted into a temporary staging directory and moved
        into place afterwards, so an interrupted download never leaves a
        partially populated weights directory behind.
        """
        needs_weights = (
            self.fasttext_model is not None
            and not (self.weights_dir / self.fasttext_model).exists())
        if needs_weights:
            with tempfile.TemporaryDirectory() as scratch:
                staging_dir = Path(scratch) / self.weights_dir.name
                staging_dir.mkdir()
                self.logger.info("Downloading pre-trained weights.")
                download_archive(FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model],
                                 staging_dir)
                shutil.move(staging_dir, self.weights_dir)
                self.logger.info("Weights downloaded.")

        # Build the custom docker image
        self.docker_client.images.build(
            path=str(FastText._BUILD_PATH),
            tag=self.image_tag,
            **self._base_docker_build_kwargs,
        )
Beispiel #4
0
    def _build(self):
        """Download USE weights if absent, then build the docker image.

        The archive is staged in a temporary directory and moved into the
        destination so failed downloads leave no partial state on disk.
        """
        if not self.weights_dir.exists():
            with tempfile.TemporaryDirectory() as scratch:
                staging_dir = Path(scratch) / self.weights_dir.name
                staging_dir.mkdir()
                self.logger.info("Downloading pre-trained weights.")
                download_archive(
                    USE_MODEL_ARCHIVES[self.use_model],
                    staging_dir,
                    filename=f"{self.use_model}.tar.gz",
                )
                shutil.move(staging_dir, self.weights_dir)
                self.logger.info("Weights downloaded.")

        # Build the docker image
        self.docker_client.images.build(
            path=str(USE._BUILD_PATH),
            tag=self.image_tag,
            **self._base_docker_build_kwargs,
        )
Beispiel #5
0
    def _build(self):
        """Fetch fastText weights when configured and missing, then build the image.

        If the download raises, the weights directory is deleted so a
        partially downloaded state is never left behind.
        """
        needs_weights = (self.fasttext_model is not None
                         and not self.weights_dir.exists())
        if needs_weights:
            self.weights_dir.mkdir(parents=True)
            try:
                self.logger.info("Downloading pre-trained weights.")
                download_archive(FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model],
                                 self.weights_dir)
                self.logger.info("Weights downloaded.")
            except Exception:
                # Don't leave the weights directory in a partially downloaded state
                if self.weights_dir.exists():
                    shutil.rmtree(self.weights_dir)
                raise

        # Build the custom docker image
        self.docker_client.images.build(
            path=str(FastText._BUILD_PATH),
            tag=self.image_tag,
            **self._base_docker_build_kwargs,
        )
Beispiel #6
0
    def _build(self):
        """Download BERT weights if missing, then build the docker image.

        Downloads go into a temporary directory first and are moved into
        the destination only on success, so interrupted or failed
        downloads cannot leave corrupted state behind.
        """
        if not self.weights_dir.exists():
            with tempfile.TemporaryDirectory() as scratch:
                staging_dir = Path(scratch) / self.weights_dir.name
                staging_dir.mkdir()
                self.logger.info("Downloading pre-trained weights.")
                download_archive(
                    BERT_MODEL_ARCHIVES[self.bert_model],
                    staging_dir,
                    junk_paths=True,
                )
                shutil.move(staging_dir, self.weights_dir)
                self.logger.info("Weights downloaded.")

        # Build the docker image
        self.docker_client.images.build(
            path=str(BERT._BUILD_PATH),
            tag=self.image_tag,
            **self._base_docker_build_kwargs,
        )
Beispiel #7
0
    def _build(self):
        """Download BERT weights if we don't already have them, then build the image.

        The weights directory is removed if the download fails so a later
        run never sees a half-populated directory.
        """
        if not self.weights_dir.exists():
            self.weights_dir.mkdir(parents=True)
            try:
                self.logger.info("Downloading pre-trained weights.")
                download_archive(
                    BERT_MODEL_ARCHIVES[self.bert_model],
                    self.weights_dir,
                    junk_paths=True,
                )
                self.logger.info("Weights downloaded.")
            except Exception:
                # Don't leave the weights directory in a partially downloaded state
                if self.weights_dir.exists():
                    shutil.rmtree(self.weights_dir)
                raise

        # Build the docker image
        self.docker_client.images.build(
            path=str(BERT._BUILD_PATH),
            tag=self.image_tag,
            **self._base_docker_build_kwargs,
        )
Beispiel #8
0
    def __init__(
        self,
        # Can't make this type more restrictive since gensim might not be
        # available, and we need to make the union include a gensim type
        model: Any,
        tokenizer: Union[str, TokenizeMethod,
                         Callable[[List[str]],
                                  List[List[str]]]] = TokenizeMethod.SPLIT,
        n_similar: int = 10,
        diversity: float = 0.8,
    ):
        """Initialize a word2vec-based augmenter.

        Args:
            model: Name of a pretrained model to download (a key of
                WORD2VEC_MODELS), a Path to a word2vec-format file on disk,
                or an already-loaded gensim Word2Vec/KeyedVectors instance.
            tokenizer: A TokenizeMethod name, a TokenizeMethod, or a
                callable mapping a list of documents to per-document token
                lists.
            n_similar: Number of similar words to consider; must be > 0.
            diversity: Sampling diversity; must be in (0, 1].

        Raises:
            ImportError: If gensim is not installed.
            ValueError: If n_similar or diversity is out of range.
            TypeError: If model or tokenizer has an unsupported type.
        """
        try:
            import gensim
            from gensim.scripts.glove2word2vec import glove2word2vec
        except ImportError as err:
            # Chain the original exception so the real import failure
            # (e.g. a broken gensim install) stays visible in the traceback
            raise ImportError(
                "word2vec-based data augmentation requires gensim to be installed."
            ) from err

        if isinstance(model, str):
            # Download and extract pretrained weights from a public source
            assert_in("word2vec model", model, set(WORD2VEC_MODELS.keys()))
            archive_name, filename = WORD2VEC_MODELS[model]
            archive_url = _WORD2VEC_MODEL_ARCHIVES[archive_name]

            LOGGER.debug(f"Downloading word2vec model '{model}'")
            # Some downloads aren't contained in archives
            if is_archive(Path(archive_url)):
                extract_dir = download_archive(archive_url,
                                               self.data_dir(),
                                               junk_paths=True)
                model_file = extract_dir / filename
            else:
                model_file = download_file(archive_url)

            if model.startswith("glove"):
                LOGGER.debug("Converting GloVe format to word2vec format")
                # Need to convert the downloaded file to word2vec format,
                # since GloVe vectors are formatted slightly differently.
                # Write the converted output to a path inside a temp
                # directory instead of an open NamedTemporaryFile: reopening
                # an already-open temp file for writing fails on Windows.
                with tempfile.TemporaryDirectory() as tmpdir:
                    converted_path = Path(tmpdir) / "word2vec_format.txt"
                    glove2word2vec(model_file, converted_path)
                    shutil.copy2(converted_path, model_file)

            LOGGER.debug(f"Loading word2vec model '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                model_file)
            LOGGER.debug("word2vec model loaded")
        elif isinstance(model, Path):
            LOGGER.debug(f"Loading word2vec model from path '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                str(model))
            LOGGER.debug("word2vec model loaded")
        elif isinstance(model,
                        (gensim.models.Word2Vec, gensim.models.KeyedVectors)):
            self._model = model
        else:
            raise TypeError(
                f"unsupported type for initializing word2vec model: '{type(model)}'"
            )

        assert_type("n_similar", n_similar, int)
        if n_similar <= 0:
            raise ValueError("n_similar must be > 0")
        self.n_similar = n_similar

        assert_type("diversity", diversity, float)
        if not 0 < diversity <= 1:
            raise ValueError("diversity must be > 0 and <= 1")
        self.diversity = diversity

        # Resolve a string name to the corresponding TokenizeMethod member
        if isinstance(tokenizer, str):
            tokenizer = TokenizeMethod[tokenizer]

        if isinstance(tokenizer, TokenizeMethod):
            # Avoid mypy error when passing a partially-applied function created by
            # functools.partial
            self.tokenizer = cast(
                Callable[[List[str]], List[List[str]]],
                functools.partial(tokenize, tokenizer),
            )
        elif callable(tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                f"unsupported type for tokenizer: '{type(tokenizer)}'")
Beispiel #9
0
 def download(self, data_dir: Path) -> Path:
     """Download and extract the ACL IMDB sentiment archive into ``data_dir``.

     Returns the directory the archive was extracted into.
     """
     source_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
     return download_archive(source_url, data_dir)
Beispiel #10
0
 def download(self, data_dir: Path) -> Path:
     """Download and extract the 20 Newsgroups (bydate) archive into ``data_dir``.

     The figshare URL has no meaningful filename, so an explicit one is
     supplied for the saved archive.  Returns the extraction directory.
     """
     source_url = "https://ndownloader.figshare.com/files/5975967"
     return download_archive(source_url,
                             data_dir,
                             filename="20news-bydate.tar.gz")