def main():
    epochs = 10
    is_save_models = True
    train_batch_size = 64
    valid_batch_size = 1000
    use_cuda = True
    seed = 5
    train_log_interval = 10
    learning_rate = 0.01

    # get device
    is_cuda = use_cuda and torch.cuda.is_available()
    if not is_cuda:
        device = torch.device("cpu")
    else:
        device = torch.device("cuda:0")

    # data transform
    data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    # train loader
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(str(lab.get_data_path()),
                       train=True,
                       download=True,
                       transform=data_transform),
        batch_size=train_batch_size, shuffle=True)
    # valid loader
    valid_loader = torch.utils.data.DataLoader(
        datasets.MNIST(str(lab.get_data_path()),
                       train=False,
                       download=True,
                       transform=data_transform),
        batch_size=valid_batch_size, shuffle=False)

    # model
    model = Net().to(device)
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # set seeds
    torch.manual_seed(seed)

    # training loop
    for epoch in range(1, epochs + 1):
        train(epoch, model, optimizer, train_loader, device, train_log_interval)
        validate(epoch, model, valid_loader, device)

    if is_save_models:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def load_index(conf: Configs, n_probe: int = 8):
    """
    ## Load the index
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Load FAISS index
    with monit.section('Load index'):
        index = faiss.read_index(str(lab.get_data_path() / 'faiss.index'))
    # Set number of cells to probe
    index.nprobe = n_probe

    # Load memory mapped numpy arrays
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'), dtype=np.float32, mode='r',
                           shape=(n_keys, d_model))
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'), dtype=np.int64, mode='r',
                           shape=(n_keys, 1))

    return index, keys_store, vals_store
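# A minimal sketch (not part of the original code) of how the loaded index could be queried
# for the $k$ nearest keys of a context embedding $f(c)$. The random query vector and the
# choice of `k` are illustrative assumptions; `index`, `keys_store`, and `vals_store` come
# from `load_index` above.
def nearest_neighbors_example(conf: Configs, k: int = 10):
    index, keys_store, vals_store = load_index(conf)
    # A single query vector with the same dimensionality as $f(c_i)$
    query = np.random.randn(1, conf.transformer.d_model).astype(np.float32)
    # FAISS returns squared L2 distances and the ids of the nearest keys
    distances, ids = index.search(query, k)
    # Look up the target tokens $w_i$ stored for those keys
    neighbor_targets = vals_store[ids[0], 0]
    return distances, neighbor_targets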
def _download():
    """
    Download the dataset
    """
    if not (lab.get_data_path() / 'cora').exists():
        download.download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz',
                               lab.get_data_path() / 'cora.tgz')
        download.extract_tar(lab.get_data_path() / 'cora.tgz', lab.get_data_path())
def create_folders():
    path = Path(lab.get_data_path() / 'download')
    if not path.exists():
        path.mkdir(parents=True)

    source = Path(lab.get_data_path() / 'source')
    if not source.exists():
        source.mkdir(parents=True)
def build_index(conf: Configs, n_centroids: int = 2048, code_size: int = 64,
                n_probe: int = 8, n_train: int = 200_000):
    """
    ## Build FAISS index

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search), and
    [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Build an index with Voronoi cell based faster search and compression that
    # doesn't store full vectors.
    quantizer = faiss.IndexFlatL2(d_model)
    index = faiss.IndexIVFPQ(quantizer, d_model, n_centroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'), dtype=np.float32, mode='r',
                           shape=(n_keys, d_model))

    # Pick a random sample of keys to train the index with
    random_sample = np.random.choice(np.arange(n_keys), size=[min(n_train, n_keys)], replace=False)

    with monit.section('Train index'):
        # Train the index to store the keys
        index.train(keys_store[random_sample])

    # Add keys to the index; $\big(f(c_i), i\big)$
    for s in monit.iterate('Index', range(0, n_keys, 1024)):
        e = min(s + 1024, n_keys)
        # $f(c_i)$
        keys = keys_store[s:e]
        # $i$
        idx = np.arange(s, e)
        # Add to index
        index.add_with_ids(keys, idx)

    with monit.section('Save'):
        # Save the index
        faiss.write_index(index, str(lab.get_data_path() / 'faiss.index'))
def main():
    source_files = _GetPythonFiles().files

    np.random.shuffle(source_files)
    logger.inspect(source_files)

    train_valid_split = int(len(source_files) * 0.9)
    _load_code(lab.get_data_path() / 'train.py', source_files[:train_valid_split])
    _load_code(lab.get_data_path() / 'valid.py', source_files[train_valid_split:])
def __init__(self, include_edges: bool = True):
    """
    Load the dataset
    """
    # Whether to include edges.
    # This is to test how much accuracy is lost if we ignore the citation network.
    self.include_edges = include_edges

    # Download dataset
    self._download()

    # Read the paper ids, feature vectors, and labels
    with monit.section('Read content file'):
        content = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.content'), dtype=np.dtype(str))
    # Load the citations; it's a list of pairs of integers.
    with monit.section('Read citations file'):
        citations = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.cites'), dtype=np.int32)

    # Get the feature vectors
    features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
    # Normalize the feature vectors
    self.features = features / features.sum(dim=1, keepdim=True)

    # Get the class names and assign a unique integer to each of them
    self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
    # Get the labels as those integers
    self.labels = torch.tensor([self.classes[i] for i in content[:, -1]], dtype=torch.long)

    # Get the paper ids
    paper_ids = np.array(content[:, 0], dtype=np.int32)
    # Map of paper id to index
    ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

    # Empty adjacency matrix - an identity matrix
    self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

    # Mark the citations in the adjacency matrix
    if self.include_edges:
        for e in citations:
            # The pair of paper indexes
            e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
            # We build a symmetric graph, where if paper $i$ referenced
            # paper $j$ we place an edge from $i$ to $j$ as well as an edge
            # from $j$ to $i$.
            self.adj_mat[e1][e2] = True
            self.adj_mat[e2][e1] = True
def setup(self, stage=None):
    # Assign train/val datasets for use in dataloaders
    if stage == 'fit' or stage is None:
        mnist_full = MNIST(str(lab.get_data_path()), train=True, transform=self.transform)
        self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

    # Assign test dataset for use in dataloader(s)
    if stage == 'test' or stage is None:
        self.mnist_test = MNIST(str(lab.get_data_path()), train=False, transform=self.transform)
def __init__(self, validation_dates: int, skip_cache: bool = False):
    self.validation_dates = validation_dates

    dates_cache_path = lab.get_data_path() / 'dates.npy'
    packets_cache_path = lab.get_data_path() / 'packets.npy'

    if skip_cache or not dates_cache_path.exists() or not packets_cache_path.exists():
        with monit.section('Build cache'):
            build_cache()

    with monit.section("Cache"):
        self.dates = np.load(str(dates_cache_path))
        self.packets = torch.tensor(np.load(str(packets_cache_path)), dtype=torch.float)
def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()

            # Find `info.json` inside the archive
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if not info_member:
                raise RuntimeError("Corrupted bundle. Missing info.json")

            with tar.extractfile(info_member) as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']

            run_path = get_run_by_uuid(run_uuid)

            # If the run and checkpoint already exist, there is nothing to extract
            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            run_path = lab.get_experiments_path() / 'bundled' / run_uuid

            checkpoint_path = run_path / "checkpoints" / str(checkpoint)
            if not checkpoint_path.exists():
                checkpoint_path.mkdir(parents=True)

            data_path = lab.get_data_path()
            if not data_path.exists():
                data_path.mkdir(parents=True)

            # Extract run files, checkpoint files, and data files to their destinations
            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    p = f.name[len('checkpoint/'):]
                    p = checkpoint_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = f.name[len('data/'):]
                    p = data_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)

    return run_uuid, checkpoint
def save_bundle(path: Path, run_uuid: str, checkpoint: int = -1, *,
                data_files: List[str]):
    run_path = get_run_by_uuid(run_uuid)

    if run_path is None:
        raise RuntimeError(f"Couldn't find run {run_uuid}")

    checkpoint = _get_run_checkpoint(run_path, checkpoint)

    if checkpoint is None:
        raise RuntimeError(f"Couldn't find checkpoint {run_uuid}:{checkpoint}")

    info_path = path.parent / f'{path.stem}.info.json'
    info = {'uuid': run_uuid, 'checkpoint': checkpoint}
    with open(str(info_path), 'w') as f:
        f.write(json.dumps(info))

    checkpoint_path = run_path / "checkpoints" / str(checkpoint)

    with monit.section('Create bundle'):
        with tarfile.open(str(path), 'w:gz') as tar:
            tar.add(str(checkpoint_path), 'checkpoint')
            tar.add(str(run_path / 'run.yaml'), 'run.yaml')
            tar.add(str(run_path / 'configs.yaml'), 'configs.yaml')
            tar.add(str(info_path), 'info.json')
            for f in data_files:
                tar.add(str(lab.get_data_path() / f), f'data/{f}')

    info_path.unlink()
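# A hypothetical usage sketch (not in the original code) showing the round trip between
# `save_bundle` and `load_bundle`. The run uuid and file names below are placeholders.
def bundle_example():
    bundle_path = lab.get_data_path() / 'my_run.tar.gz'
    # Pack the latest checkpoint of the run along with `vocab.json` from the data folder
    save_bundle(bundle_path, run_uuid='my-run-uuid', data_files=['vocab.json'])
    # Later (possibly on another machine) restore the run, its checkpoint, and its data files
    run_uuid, checkpoint = load_bundle(bundle_path)
    return run_uuid, checkpoint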
def tiny_shakespeare(c: NLPAutoRegressionConfigs):
    return TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        c.tokenizer,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
def main():
    predictor = get_predictor()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Generate'):
        generate(predictor, 'import numpy as np\n', 1000)
def get_python_files():
    """
    Get list of python files and their paths inside `data/source` folder
    """
    source_path = Path(lab.get_data_path() / 'source')
    files: List[PythonFile] = []

    def _add_file(path: Path):
        """
        Add a file to the list of files
        """
        project = path.relative_to(source_path).parents
        relative_path = path.relative_to(source_path / project[len(project) - 3])

        files.append(PythonFile(relative_path=str(relative_path),
                                project=str(project[len(project) - 2]),
                                path=path))

    def _collect_python_files(path: Path):
        """
        Recursively collect files
        """
        for p in path.iterdir():
            if p.is_dir():
                _collect_python_files(p)
            else:
                _add_file(p)

    _collect_python_files(source_path)

    return files
def main():
    predictor = get_predictor()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Evaluate'):
        evaluate(predictor, sample)
def download_repo(org: str, repo: str, idx: Optional[int]):
    zip_file = Path(lab.get_data_path() / 'download' / f'{org}_{repo}.zip')

    if zip_file.exists():
        return zip_file

    if idx is not None:
        idx_str = f"{idx:03}: "
    else:
        idx_str = ""

    with monit.section(f"{idx_str} {org}/{repo}") as s:
        try:
            response = urllib.request.urlopen(f'https://github.com/{org}/{repo}/archive/master.zip')
        except urllib.error.HTTPError as e:
            print(e)
            return
        content = response.read()
        size = len(content) // 1024
        s.message = f"{size:,}KB"
        with open(str(zip_file), 'wb') as f:
            f.write(content)

    return zip_file
def gather_keys(conf: Configs):
    """
    ## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays

    *Note that these numpy arrays will take up a lot of space (even a few hundred gigabytes)
    depending on the size of your dataset*.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1
    # Numpy array for $f(c_i)$
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'), dtype=np.float32, mode='w+',
                           shape=(n_keys, d_model))
    # Numpy array for $w_i$
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'), dtype=np.int64, mode='w+',
                           shape=(n_keys, 1))

    # Number of keys $f(c_i)$ collected
    added = 0
    with torch.no_grad():
        # Loop through data
        for i, batch in monit.enum("Collect data", data_loader, is_children_silent=True):
            # $w_i$ the target labels
            vals = batch[1].view(-1, 1)
            # Input data moved to the device of the model
            data = batch[0].to(conf.device)
            # Run the model
            _ = conf.model(data)
            # Get $f(c_i)$
            keys = conf.model.ff_input.view(-1, d_model)
            # Save keys, $f(c_i)$ in the memory mapped numpy array
            keys_store[added:added + keys.shape[0]] = keys.cpu()
            # Save values, $w_i$ in the memory mapped numpy array
            vals_store[added:added + keys.shape[0]] = vals
            # Increment the number of collected keys
            added += keys.shape[0]
def get_awesome_pytorch_readme():
    md = urllib.request.urlopen(
        'https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md')
    content = md.read()
    with open(str(lab.get_data_path() / 'pytorch_awesome.md'), 'w') as f:
        # `content` is bytes; decode it before writing the markdown file
        f.write(content.decode('utf-8'))
def _test_tiny_shakespeare():
    from labml import lab
    _ = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        lambda x: list(x),
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
def main():
    try:
        batch()
    except KeyboardInterrupt:
        pass

    source_files = get_python_files()

    np.random.shuffle(source_files)
    logger.inspect(source_files)

    train_valid_split = int(len(source_files) * 0.9)
    concat_and_save(lab.get_data_path() / 'train.py', source_files[:train_valid_split])
    concat_and_save(lab.get_data_path() / 'valid.py', source_files[train_valid_split:])
def _dataset(c: SourceCodeDataConfigs):
    train, valid = SourceCodeDataset.get_train_valid(lab.get_data_path(), c.is_load_data)

    if c.truncate_data:
        train, valid = train[:c.truncate_data], valid[:c.truncate_data]

    if not c.tokenizer.is_trained:
        c.tokenizer.train(train + valid)

    return SourceCodeDataset(c.tokenizer, train, valid)
def main():
    conf = load_experiment()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Anomalies'):
        anomalies(conf.text.tokenizer, sample, conf.model, conf.state_updater,
                  conf.is_token_by_token)
def __init__(self, device: torch.device):
    self.device = device

    # Load the BERT tokenizer from [HuggingFace](https://huggingface.co/bert-base-uncased)
    with monit.section('Load BERT tokenizer'):
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased',
            cache_dir=str(lab.get_data_path() / 'cache' / 'bert-tokenizer'))

    # Load the BERT model from [HuggingFace](https://huggingface.co/bert-base-uncased)
    with monit.section('Load BERT model'):
        self.model = BertModel.from_pretrained(
            "bert-base-uncased",
            cache_dir=str(lab.get_data_path() / 'cache' / 'bert-model'))

        # Move the model to `device`
        self.model.to(device)
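# A hedged sketch (not part of the original class) of how the loaded tokenizer and model
# might be used to embed a piece of text. Mean-pooling the last hidden state into a single
# vector is an assumption made for illustration.
def _embed_example(self, text: str) -> torch.Tensor:
    # Tokenize the text and move the tensors to the model's device
    tokens = self.tokenizer(text, return_tensors='pt', truncation=True).to(self.device)
    with torch.no_grad():
        output = self.model(**tokens)
    # Average the token embeddings to get one vector for the whole text
    return output.last_hidden_state.mean(dim=1).squeeze(0)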
def tiny_shakespeare(c: Configs):
    """
    Initialize/load tiny shakespeare dataset

    This dataset is from Andrej Karpathy's [char-rnn](https://github.com/karpathy/char-rnn) project.
    """
    return TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        c.tokenizer,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
def batch(overwrite: bool = False):
    with monit.section('Get pytorch_awesome'):
        get_awesome_pytorch_readme()
        repos = get_repos_from_readme('pytorch_awesome.md')

    # Download zips
    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
        download_repo(r[0], r[1], i)

    # Extract downloads
    with monit.section('Extract zips'):
        download = Path(lab.get_data_path() / 'download')
        for repo in download.iterdir():
            extract_zip(repo, overwrite)

    with monit.section('Remove non python files'):
        remove_files(lab.get_data_path() / 'source', {'.py'})
def _data_loader(is_train, batch_size):
    return torch.utils.data.DataLoader(
        datasets.MNIST(str(lab.get_data_path()),
                       train=is_train,
                       download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
def __init__(self, image_size):
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(image_size),
        torchvision.transforms.ToTensor(),
    ])

    super().__init__(str(lab.get_data_path()), train=True, download=True, transform=transform)
def download():
    path = Path(lab.get_data_path() / 'download')
    if not path.exists():
        path.mkdir(parents=True)

    get_awesome_pytorch()
    repos = get_repos('pytorch_awesome.md')

    for i, r in monit.enum("Download", repos):
        download_repo(r[0], r[1], i)
def cache_set(name: str, value: Any, file_type: str = 'json') -> Any:
    cache_path = lab.get_data_path() / 'cache'
    if not cache_path.exists():
        cache_path.mkdir(parents=True)

    path = cache_path / f'{name}.{file_type}'
    with open(str(path), 'w') as f:
        if file_type == 'json':
            json.dump(value, f)
        else:
            raise ValueError(f'Unknown file type: {file_type}')
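# A minimal read-side counterpart to `cache_set` above, sketched here as an assumption;
# only the JSON file type written by `cache_set` is handled.
def cache_get(name: str, file_type: str = 'json') -> Any:
    path = lab.get_data_path() / 'cache' / f'{name}.{file_type}'
    # Return `None` if the value was never cached
    if not path.exists():
        return None

    with open(str(path), 'r') as f:
        if file_type == 'json':
            return json.load(f)
        else:
            raise ValueError(f'Unknown file type: {file_type}')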
def initialize(self):
    """
    ## Initialize models and data loaders
    """
    input_shape = (self.img_channels, self.img_height, self.img_width)

    # Create the models
    self.generator_xy = GeneratorResNet(self.img_channels, self.n_residual_blocks).to(self.device)
    self.generator_yx = GeneratorResNet(self.img_channels, self.n_residual_blocks).to(self.device)
    self.discriminator_x = Discriminator(input_shape).to(self.device)
    self.discriminator_y = Discriminator(input_shape).to(self.device)

    # Create the optimizers
    self.generator_optimizer = torch.optim.Adam(
        itertools.chain(self.generator_xy.parameters(), self.generator_yx.parameters()),
        lr=self.learning_rate, betas=self.adam_betas)
    self.discriminator_optimizer = torch.optim.Adam(
        itertools.chain(self.discriminator_x.parameters(), self.discriminator_y.parameters()),
        lr=self.learning_rate, betas=self.adam_betas)

    # Create the learning rate schedules.
    # The learning rate stays flat until `decay_start` epochs,
    # and then linearly decays to $0$ at the end of training.
    decay_epochs = self.epochs - self.decay_start
    self.generator_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        self.generator_optimizer,
        lr_lambda=lambda e: 1.0 - max(0, e - self.decay_start) / decay_epochs)
    self.discriminator_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        self.discriminator_optimizer,
        lr_lambda=lambda e: 1.0 - max(0, e - self.decay_start) / decay_epochs)

    # Location of the dataset
    images_path = lab.get_data_path() / 'cycle_gan' / self.dataset_name

    # Image transformations
    transforms_ = [
        transforms.Resize(int(self.img_height * 1.12), Image.BICUBIC),
        transforms.RandomCrop((self.img_height, self.img_width)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]

    # Training data loader
    self.dataloader = DataLoader(
        ImageDataset(images_path, transforms_, True, 'train'),
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=self.data_loader_workers,
    )
    # Validation data loader
    self.valid_dataloader = DataLoader(
        ImageDataset(images_path, transforms_, True, "test"),
        batch_size=5,
        shuffle=True,
        num_workers=self.data_loader_workers,
    )