def _build(self):
    """Fetch the CMU Movie Summaries corpus into this dataset's data directory.

    Creates the directory (including parents) if it does not exist, then
    downloads and extracts the archive into it.
    """
    target_dir = self.data_dir()
    target_dir.mkdir(exist_ok=True, parents=True)
    download_archive(
        "http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz",
        target_dir,
    )
def _build(self):
    """Download pre-trained fastText weights if configured and not cached.

    Downloads into a temporary directory and moves the completed result into
    ``self.weights_dir``. The previous try/except + rmtree cleanup could not
    run if the process was killed outright (e.g. SIGKILL), leaving a partial
    ``weights_dir`` that the ``exists()`` check would wrongly treat as a
    complete download on the next run. Staging the download and moving it
    into place makes the final state all-or-nothing.
    """
    # Download data if we need it and don't already have it
    if self.fasttext_model is not None and not self.weights_dir.exists():
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_weights_dir = Path(tmpdir) / self.weights_dir.name
            tmp_weights_dir.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model],
                tmp_weights_dir,
            )
            # The original mkdir(parents=True) guaranteed the parent chain;
            # shutil.move needs the destination's parent to exist.
            self.weights_dir.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(tmp_weights_dir, self.weights_dir)
            self.logger.info("Weights downloaded.")
def _build(self):
    """Download pre-trained fastText vectors (if needed) and build the image."""
    # Download data if we need it and don't already have it
    needs_weights = (
        self.fasttext_model is not None
        and not (self.weights_dir / self.fasttext_model).exists()
    )
    if needs_weights:
        # Stage the download in a temporary directory and move it into place
        # only on success, so a failed download never leaves a partial
        # weights directory behind.
        with tempfile.TemporaryDirectory() as staging:
            staged_dir = Path(staging) / self.weights_dir.name
            staged_dir.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model], staged_dir
            )
            shutil.move(staged_dir, self.weights_dir)
            self.logger.info("Weights downloaded.")

    # Build the custom docker image
    self.docker_client.images.build(
        path=str(FastText._BUILD_PATH),
        tag=self.image_tag,
        **self._base_docker_build_kwargs,
    )
def _build(self):
    """Download pre-trained USE weights (if absent) and build the docker image."""
    # Download data if we don't already have it
    if not self.weights_dir.exists():
        # Download into a throwaway directory first; only a complete,
        # successful download gets moved into the final location.
        with tempfile.TemporaryDirectory() as scratch:
            staged = Path(scratch) / self.weights_dir.name
            staged.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                USE_MODEL_ARCHIVES[self.use_model],
                staged,
                filename=f"{self.use_model}.tar.gz",
            )
            shutil.move(staged, self.weights_dir)
            self.logger.info("Weights downloaded.")

    # Build the docker image
    self.docker_client.images.build(
        path=str(USE._BUILD_PATH),
        tag=self.image_tag,
        **self._base_docker_build_kwargs,
    )
def _build(self):
    """Download pre-trained fastText weights (if needed) and build the image.

    Weights are staged in a temporary directory and moved into
    ``self.weights_dir`` only once the download completes. The previous
    try/except + rmtree cleanup only ran for exceptions — a hard kill
    (e.g. SIGKILL) mid-download left ``weights_dir`` present but partial,
    and the ``exists()`` check would then skip the re-download forever.
    """
    # Download data if we need it and don't already have it
    if self.fasttext_model is not None and not self.weights_dir.exists():
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_weights_dir = Path(tmpdir) / self.weights_dir.name
            tmp_weights_dir.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                FASTTEXT_VECTOR_ARCHIVES[self.fasttext_model],
                tmp_weights_dir,
            )
            # shutil.move needs the destination parent; the original
            # mkdir(parents=True) provided this guarantee.
            self.weights_dir.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(tmp_weights_dir, self.weights_dir)
            self.logger.info("Weights downloaded.")

    # Build the custom docker image
    self.docker_client.images.build(
        path=str(FastText._BUILD_PATH),
        tag=self.image_tag,
        **self._base_docker_build_kwargs,
    )
def _build(self):
    """Fetch BERT weights if absent, then build the model's docker image."""
    # Download data if we don't already have it.
    # Partial downloads must never end up at weights_dir, so the archive is
    # extracted into a temporary location and moved over only once the
    # download has fully succeeded.
    if not self.weights_dir.exists():
        with tempfile.TemporaryDirectory() as workdir:
            pending = Path(workdir) / self.weights_dir.name
            pending.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                BERT_MODEL_ARCHIVES[self.bert_model],
                pending,
                junk_paths=True,
            )
            shutil.move(pending, self.weights_dir)
            self.logger.info("Weights downloaded.")

    # Build the docker image
    self.docker_client.images.build(
        path=str(BERT._BUILD_PATH),
        tag=self.image_tag,
        **self._base_docker_build_kwargs,
    )
def _build(self):
    """Download pre-trained BERT weights (if needed) and build the docker image.

    Weights are downloaded into a temporary directory and atomically moved
    into ``self.weights_dir``. The previous in-place download with
    try/except + rmtree could not clean up after a hard process kill
    (the handler never runs), leaving a partial directory that the
    ``exists()`` check would mistake for a complete download.
    """
    # Download data if we don't already have it
    if not self.weights_dir.exists():
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_weights_dir = Path(tmpdir) / self.weights_dir.name
            tmp_weights_dir.mkdir()
            self.logger.info("Downloading pre-trained weights.")
            download_archive(
                BERT_MODEL_ARCHIVES[self.bert_model],
                tmp_weights_dir,
                junk_paths=True,
            )
            # Ensure the destination's parent exists, matching the guarantee
            # the original mkdir(parents=True) provided.
            self.weights_dir.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(tmp_weights_dir, self.weights_dir)
            self.logger.info("Weights downloaded.")

    # Build the docker image
    self.docker_client.images.build(
        path=str(BERT._BUILD_PATH),
        tag=self.image_tag,
        **self._base_docker_build_kwargs,
    )
def __init__(
    self,
    # Can't make this type more restrictive since gensim might not be
    # available, and we need to make the union include a gensim type
    model: Any,
    tokenizer: Union[str, TokenizeMethod, Callable[[List[str]], List[List[str]]]] = TokenizeMethod.SPLIT,
    n_similar: int = 10,
    diversity: float = 0.8,
):
    """Initialize a word2vec-based augmenter.

    Args:
        model: One of — the name of a pretrained model listed in
            ``WORD2VEC_MODELS`` (downloaded automatically), a ``Path`` to a
            word2vec-format vector file on disk, or an already-loaded gensim
            ``Word2Vec``/``KeyedVectors`` object.
        tokenizer: A ``TokenizeMethod`` (or its string name), or a callable
            mapping a list of documents to lists of tokens.
        n_similar: Number of similar words to consider; must be > 0.
        diversity: Value in (0, 1] controlling variety of chosen words.

    Raises:
        ImportError: If gensim is not installed.
        ValueError: If ``n_similar`` or ``diversity`` is out of range.
        TypeError: If ``model`` or ``tokenizer`` has an unsupported type.
    """
    # gensim is an optional dependency; fail with a clear message if missing.
    try:
        import gensim
        from gensim.scripts.glove2word2vec import glove2word2vec
    except ImportError:
        raise ImportError(
            "word2vec-based data augmentation requires gensim to be installed."
        )

    if isinstance(model, str):
        # Download and extract pretrained weights from a public source
        assert_in("word2vec model", model, set(WORD2VEC_MODELS.keys()))
        archive_name, filename = WORD2VEC_MODELS[model]
        archive_url = _WORD2VEC_MODEL_ARCHIVES[archive_name]
        LOGGER.debug(f"Downloading word2vec model '{model}'")
        # Some downloads aren't contained in archives
        if is_archive(Path(archive_url)):
            extract_dir = download_archive(archive_url, self.data_dir(), junk_paths=True)
            model_file = extract_dir / filename
        else:
            model_file = download_file(archive_url)
        if model.startswith("glove"):
            LOGGER.debug(f"Converting GloVe format to word2vec format")
            # Need to convert the downloaded file to word2vec format,
            # since GloVe vectors are formatted slightly differently.
            # Convert via a temp file, then overwrite the original in place.
            with tempfile.NamedTemporaryFile() as f:
                tempfile_path = Path(f.name)
                glove2word2vec(model_file, tempfile_path)
                shutil.copy2(tempfile_path, model_file)
        LOGGER.debug(f"Loading word2vec model '{model}'")
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            model_file)
        LOGGER.debug(f"word2vec model loaded")
    elif isinstance(model, Path):
        # A local word2vec-format vector file.
        LOGGER.debug(f"Loading word2vec model from path '{model}'")
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            str(model))
        LOGGER.debug(f"word2vec model loaded")
    elif isinstance(model, (gensim.models.Word2Vec, gensim.models.KeyedVectors)):
        # Caller supplied an already-loaded gensim model; use as-is.
        self._model = model
    else:
        raise TypeError(
            f"unsupported type for initializing word2vec model: '{type(model)}'"
        )

    # Validate numeric hyperparameters.
    assert_type("n_similar", n_similar, int)
    if n_similar <= 0:
        raise ValueError("n_similar must be > 0")
    self.n_similar = n_similar
    assert_type("diversity", diversity, float)
    if not 0 < diversity <= 1:
        raise ValueError("diversity must be > 0 and <= 1")
    self.diversity = diversity

    # Normalize the tokenizer argument down to a callable.
    if isinstance(tokenizer, str):
        tokenizer = TokenizeMethod[tokenizer]
    if isinstance(tokenizer, TokenizeMethod):
        # Avoid mypy error when passing a partially-applied function created by
        # functools.partial
        self.tokenizer = cast(
            Callable[[List[str]], List[List[str]]],
            functools.partial(tokenize, tokenizer),
        )
    elif callable(tokenizer):
        self.tokenizer = tokenizer
    else:
        raise TypeError(
            f"unsupported type for tokenizer: '{type(tokenizer)}'")
def download(self, data_dir: Path) -> Path:
    """Download and extract the Large Movie Review (aclImdb) dataset.

    Returns the path produced by ``download_archive`` for the extracted data.
    """
    imdb_archive_url = (
        "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    )
    return download_archive(imdb_archive_url, data_dir)
def download(self, data_dir: Path) -> Path:
    """Download and extract the 20 Newsgroups (bydate) dataset.

    The figshare URL has no meaningful filename, so an explicit archive
    filename is supplied for the download.
    """
    news_archive_url = "https://ndownloader.figshare.com/files/5975967"
    return download_archive(
        news_archive_url,
        data_dir,
        filename="20news-bydate.tar.gz",
    )