Example #1
def main():
    epochs = 10

    is_save_models = True
    train_batch_size = 64
    valid_batch_size = 1000

    use_cuda = True
    seed = 5
    train_log_interval = 10

    learning_rate = 0.01

    # set seed for reproducibility before creating the model and data loaders
    torch.manual_seed(seed)

    # get device
    is_cuda = use_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if is_cuda else "cpu")

    # data transform
    data_transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    # train loader
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        str(lab.get_data_path()),
        train=True,
        download=True,
        transform=data_transform),
                                               batch_size=train_batch_size,
                                               shuffle=True)

    # valid loader
    valid_loader = torch.utils.data.DataLoader(datasets.MNIST(
        str(lab.get_data_path()),
        train=False,
        download=True,
        transform=data_transform),
                                               batch_size=valid_batch_size,
                                               shuffle=False)

    # model
    model = Net().to(device)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # training loop
    for epoch in range(1, epochs + 1):
        train(epoch, model, optimizer, train_loader, device,
              train_log_interval)
        validate(epoch, model, valid_loader, device)

    if is_save_models:
        torch.save(model.state_dict(), "mnist_cnn.pt")
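Example #1 assumes a `Net` model plus `train` and `validate` helpers defined elsewhere in the script. A minimal sketch of a compatible `Net`, assuming the usual two-convolution MNIST architecture (the layer sizes here are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """A small CNN for 28x28 single-channel MNIST images."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))   # 26x26
        x = F.relu(self.conv2(x))   # 24x24
        x = F.max_pool2d(x, 2)      # 12x12
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        # Log-probabilities over the ten digit classes
        return F.log_softmax(self.fc2(x), dim=-1)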
Example #2
def load_index(conf: Configs, n_probe: int = 8):
    """
    ## Load the index
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Load FAISS index
    with monit.section('Load index'):
        index = faiss.read_index(str(lab.get_data_path() / 'faiss.index'))
    # Set number of cells to probe
    index.nprobe = n_probe

    # Load memory mapped numpy arrays
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32,
                           mode='r',
                           shape=(n_keys, d_model))
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'),
                           dtype=np.int64,
                           mode='r',
                           shape=(n_keys, 1))

    return index, keys_store, vals_store
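A hedged usage sketch of the loaded index: `queries` below is a hypothetical float32 numpy array of shape `(n, d_model)`, and `conf` is the same `Configs` object passed to `load_index`. FAISS returns distances and key ids, and `vals_store` maps those ids back to the targets $w_i$:

# Sketch only: `queries` is an assumed float32 array of shape (n, d_model)
index, keys_store, vals_store = load_index(conf)
distances, key_idx = index.search(queries, 10)  # 10 nearest keys per query
neighbor_targets = vals_store[key_idx]          # the corresponding $w_i$ values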
Example #3
def _download():
    """
    Download the dataset
    """
    if not (lab.get_data_path() / 'cora').exists():
        download.download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz',
                               lab.get_data_path() / 'cora.tgz')
        download.extract_tar(lab.get_data_path() / 'cora.tgz', lab.get_data_path())
Example #4
def create_folders():
    path = Path(lab.get_data_path() / 'download')
    if not path.exists():
        path.mkdir(parents=True)
    source = Path(lab.get_data_path() / 'source')

    if not source.exists():
        source.mkdir(parents=True)
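The existence checks can be folded into `mkdir` itself; an equivalent, more compact sketch:

def create_folders():
    # `exist_ok=True` makes the calls idempotent, so no prior existence check is needed
    (lab.get_data_path() / 'download').mkdir(parents=True, exist_ok=True)
    (lab.get_data_path() / 'source').mkdir(parents=True, exist_ok=True)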
Example #5
def build_index(conf: Configs,
                n_centroids: int = 2048,
                code_size: int = 64,
                n_probe: int = 8,
                n_train: int = 200_000):
    """
    ## Build FAISS index

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search),
    and [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Build an index for Voronoi cell-based faster search with compression,
    # so that full vectors don't need to be stored.
    quantizer = faiss.IndexFlatL2(d_model)
    index = faiss.IndexIVFPQ(quantizer, d_model, n_centroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32,
                           mode='r',
                           shape=(n_keys, d_model))

    # Pick a random sample of keys to train the index with
    random_sample = np.random.choice(np.arange(n_keys),
                                     size=[min(n_train, n_keys)],
                                     replace=False)

    with monit.section('Train index'):
        # Train the index to store the keys
        index.train(keys_store[random_sample])

    # Add keys to the index; $\big(f(c_i), i\big)$
    for s in monit.iterate('Index', range(0, n_keys, 1024)):
        e = min(s + 1024, n_keys)
        # $f(c_i)$
        keys = keys_store[s:e]
        # $i$
        idx = np.arange(s, e)
        # Add to index
        index.add_with_ids(keys, idx)

    with monit.section('Save'):
        # Save the index
        faiss.write_index(index, str(lab.get_data_path() / 'faiss.index'))
Example #6
def main():
    source_files = _GetPythonFiles().files

    np.random.shuffle(source_files)

    logger.inspect(source_files)

    train_valid_split = int(len(source_files) * 0.9)
    _load_code(lab.get_data_path() / 'train.py',
               source_files[:train_valid_split])
    _load_code(lab.get_data_path() / 'valid.py',
               source_files[train_valid_split:])
Example #7
    def __init__(self, include_edges: bool = True):
        """
        Load the dataset
        """

        # Whether to include edges.
        # This is to test how much accuracy is lost if we ignore the citation network.
        self.include_edges = include_edges

        # Download dataset
        self._download()

        # Read the paper ids, feature vectors, and labels
        with monit.section('Read content file'):
            content = np.genfromtxt(str(lab.get_data_path() /
                                        'cora/cora.content'),
                                    dtype=np.dtype(str))
        # Load the citations; it's a list of pairs of integers.
        with monit.section('Read citations file'):
            citations = np.genfromtxt(str(lab.get_data_path() /
                                          'cora/cora.cites'),
                                      dtype=np.int32)

        # Get the feature vectors
        features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
        # Normalize the feature vectors
        self.features = features / features.sum(dim=1, keepdim=True)

        # Get the class names and assign a unique integer to each of them
        self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
        # Get the labels as those integers
        self.labels = torch.tensor([self.classes[i] for i in content[:, -1]],
                                   dtype=torch.long)

        # Get the paper ids
        paper_ids = np.array(content[:, 0], dtype=np.int32)
        # Map of paper id to index
        ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

        # Empty adjacency matrix - an identity matrix
        self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

        # Mark the citations in the adjacency matrix
        if self.include_edges:
            for e in citations:
                # The pair of paper indices
                e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
                # We build a symmetric graph: if paper $i$ cites
                # paper $j$, we place an edge from $i$ to $j$ as well as an edge
                # from $j$ to $i$.
                self.adj_mat[e1][e2] = True
                self.adj_mat[e2][e1] = True
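A hedged sketch of how a graph layer might consume this dataset; `dataset` is a hypothetical instance of the class above, and the row normalization is an illustrative choice rather than part of the loader:

# Sketch only: one hop of GCN-style feature propagation over the citation graph
adj = dataset.adj_mat.float()
adj = adj / adj.sum(dim=1, keepdim=True)  # row-normalize; self-loops come from the identity
propagated = adj @ dataset.features       # shape: (n_papers, n_features)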
Example #8
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            mnist_full = MNIST(str(lab.get_data_path()),
                               train=True,
                               transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(
                mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.mnist_test = MNIST(str(lab.get_data_path()),
                                    train=False,
                                    transform=self.transform)
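The `setup` hook is usually paired with dataloader hooks on the same `LightningDataModule`; a minimal sketch, assuming a `batch_size` attribute set in `__init__` and `DataLoader` imported from `torch.utils.data`:

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size)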
Example #9
    def __init__(self, validation_dates: int, skip_cache: bool = False):
        self.validation_dates = validation_dates

        dates_cache_path = lab.get_data_path() / 'dates.npy'
        packets_cache_path = lab.get_data_path() / 'packets.npy'

        if (skip_cache or not dates_cache_path.exists()
                or not packets_cache_path.exists()):
            with monit.section('Build cache'):
                build_cache()

        with monit.section("Cache"):
            self.dates = np.load(str(dates_cache_path))
            self.packets = torch.tensor(np.load(str(packets_cache_path)),
                                        dtype=torch.float)
Example #10
def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if not info_member:
                raise RuntimeError("Corrupted bundle. Missing info.json")

            with tar.extractfile(info_member) as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']
            run_path = get_run_by_uuid(run_uuid)

            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            run_path = lab.get_experiments_path() / 'bundled' / run_uuid

            checkpoint_path = run_path / "checkpoints" / str(checkpoint)
            if not checkpoint_path.exists():
                checkpoint_path.mkdir(parents=True)

            data_path = lab.get_data_path()
            if not data_path.exists():
                data_path.mkdir(parents=True)

            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    p = f.name[len('checkpoint/'):]
                    p = checkpoint_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = f.name[len('data/'):]
                    p = data_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)

            return run_uuid, checkpoint
Example #11
def save_bundle(path: Path,
                run_uuid: str,
                checkpoint: int = -1,
                *,
                data_files: List[str]):
    run_path = get_run_by_uuid(run_uuid)
    if run_path is None:
        raise RuntimeError(f"Couldn't find run {run_uuid}")

    checkpoint = _get_run_checkpoint(run_path, checkpoint)

    if checkpoint is None:
        raise RuntimeError(f"Couldn't find checkpoint {run_uuid}:{checkpoint}")

    info_path = path.parent / f'{path.stem}.info.json'
    info = {'uuid': run_uuid, 'checkpoint': checkpoint}
    with open(str(info_path), 'w') as f:
        f.write(json.dumps(info))

    checkpoint_path = run_path / "checkpoints" / str(checkpoint)

    with monit.section('Create bundle'):
        with tarfile.open(str(path), 'w:gz') as tar:
            tar.add(str(checkpoint_path), 'checkpoint')
            tar.add(str(run_path / 'run.yaml'), 'run.yaml')
            tar.add(str(run_path / 'configs.yaml'), 'configs.yaml')
            tar.add(str(info_path), 'info.json')
            for f in data_files:
                tar.add(str(lab.get_data_path() / f), f'data/{f}')

    info_path.unlink()
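A hedged round-trip sketch using this function together with `load_bundle` from Example #10; the run uuid and data file name below are placeholders:

# Sketch only: 'abc123' and 'itos.json' are placeholder values
bundle_path = Path('model_bundle.tar.gz')
save_bundle(bundle_path, 'abc123', data_files=['itos.json'])
run_uuid, checkpoint = load_bundle(bundle_path)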
Example #12
def tiny_shakespeare(c: NLPAutoRegressionConfigs):
    return TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        c.tokenizer,
        url=
        'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    )
Example #13
def main():
    predictor = get_predictor()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Generate'):
        generate(predictor, 'import numpy as np\n', 1000)
Example #14
def get_python_files():
    """
    Get list of python files and their paths inside `data/source` folder
    """

    source_path = Path(lab.get_data_path() / 'source')
    files: List[PythonFile] = []

    def _add_file(path: Path):
        """
        Add a file to the list of files
        """
        project = path.relative_to(source_path).parents
        relative_path = path.relative_to(source_path /
                                         project[len(project) - 3])

        files.append(
            PythonFile(relative_path=str(relative_path),
                       project=str(project[len(project) - 2]),
                       path=path))

    def _collect_python_files(path: Path):
        """
        Recursively collect files
        """
        for p in path.iterdir():
            if p.is_dir():
                _collect_python_files(p)
            else:
                _add_file(p)

    _collect_python_files(source_path)

    return files
Example #15
def main():
    predictor = get_predictor()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Evaluate'):
        evaluate(predictor, sample)
Example #16
def download_repo(org: str, repo: str, idx: Optional[int]):
    zip_file = Path(lab.get_data_path() / 'download' / f'{org}_{repo}.zip')

    if zip_file.exists():
        return zip_file

    if idx is not None:
        idx_str = f"{idx:03}: "
    else:
        idx_str = ""

    with monit.section(f"{idx_str}{org}/{repo}") as s:
        try:
            response = urllib.request.urlopen(
                f'https://github.com/{org}/{repo}/archive/master.zip')
        except urllib.error.HTTPError as e:
            print(e)
            return
        content = response.read()

        size = len(content) // 1024
        s.message = f"{size:,}KB"

        with open(str(zip_file), 'wb') as f:
            f.write(content)

    return zip_file
Example #17
def gather_keys(conf: Configs):
    """
    ## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays

    *Note that these numpy arrays will take up a lot of space (even a few hundred gigabytes),
    depending on the size of your dataset*.
    """

    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1
    # Numpy array for $f(c_i)$
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32,
                           mode='w+',
                           shape=(n_keys, d_model))
    # Numpy array for $w_i$
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'),
                           dtype=np.int64,
                           mode='w+',
                           shape=(n_keys, 1))

    # Number of keys $f(c_i)$ collected
    added = 0
    with torch.no_grad():
        # Loop through data
        for i, batch in monit.enum("Collect data",
                                   data_loader,
                                   is_children_silent=True):
            # $w_i$ the target labels
            vals = batch[1].view(-1, 1)
            # Input data moved to the device of the model
            data = batch[0].to(conf.device)
            # Run the model
            _ = conf.model(data)
            # Get $f(c_i)$
            keys = conf.model.ff_input.view(-1, d_model)
            # Save keys, $f(c_i)$ in the memory mapped numpy array
            keys_store[added:added + keys.shape[0]] = keys.cpu()
            # Save values, $w_i$ in the memory mapped numpy array
            vals_store[added:added + keys.shape[0]] = vals
            # Increment the number of collected keys
            added += keys.shape[0]
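Since both arrays are opened in `'w+'` mode, flushing them once the loop finishes makes sure the collected data is on disk before the index is built; a small follow-up sketch for the end of `gather_keys`:

    # Flush the memory mapped arrays so the collected keys and values hit disk
    keys_store.flush()
    vals_store.flush()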
Example #18
def get_awesome_pytorch_readme():
    md = urllib.request.urlopen(
        'https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md'
    )
    content = md.read()

    # `content` is bytes, so write in binary mode instead of writing its `str()` representation
    with open(str(lab.get_data_path() / 'pytorch_awesome.md'), 'wb') as f:
        f.write(content)
Example #19
def _test_tiny_shakespeare():
    from labml import lab
    _ = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        lambda x: list(x),
        url=
        'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    )
Example #20
def main():
    try:
        batch()
    except KeyboardInterrupt:
        pass

    source_files = get_python_files()

    np.random.shuffle(source_files)

    logger.inspect(source_files)

    train_valid_split = int(len(source_files) * 0.9)
    concat_and_save(lab.get_data_path() / 'train.py',
                    source_files[:train_valid_split])
    concat_and_save(lab.get_data_path() / 'valid.py',
                    source_files[train_valid_split:])
Example #21
def _dataset(c: SourceCodeDataConfigs):
    train, valid = SourceCodeDataset.get_train_valid(lab.get_data_path(),
                                                     c.is_load_data)
    if c.truncate_data:
        train, valid = train[:c.truncate_data], valid[:c.truncate_data]
    if not c.tokenizer.is_trained:
        c.tokenizer.train(train + valid)
    return SourceCodeDataset(c.tokenizer, train, valid)
Example #22
def main():
    conf = load_experiment()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Anomalies'):
        anomalies(conf.text.tokenizer, sample, conf.model, conf.state_updater,
                  conf.is_token_by_token)
Example #23
    def __init__(self, device: torch.device):
        self.device = device

        # Load the BERT tokenizer from [HuggingFace](https://huggingface.co/bert-base-uncased)
        with monit.section('Load BERT tokenizer'):
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased',
                cache_dir=str(lab.get_data_path() / 'cache' /
                              'bert-tokenizer'))

        # Load the BERT model from [HuggingFace](https://huggingface.co/bert-base-uncased)
        with monit.section('Load BERT model'):
            self.model = BertModel.from_pretrained(
                "bert-base-uncased",
                cache_dir=str(lab.get_data_path() / 'cache' / 'bert-model'))

            # Move the model to `device`
            self.model.to(device)
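A hedged sketch of using the loaded tokenizer and model to embed a sentence; `embedder` is a hypothetical instance of this class:

# Sketch only: `embedder` is assumed to be an instance of the class above
with torch.no_grad():
    tokens = embedder.tokenizer(['a sample sentence'], return_tensors='pt').to(embedder.device)
    output = embedder.model(**tokens)
    # One vector per token: shape (batch, seq_len, hidden_size)
    embeddings = output.last_hidden_state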
Example #24
def tiny_shakespeare(c: Configs):
    """
    Initialize/load tiny shakespeare dataset

    This dataset is from Andrej Karpathy's [char-rnn](https://github.com/karpathy/char-rnn) project.
    """
    return TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt', c.tokenizer,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
Example #25
def batch(overwrite: bool = False):
    with monit.section('Get pytorch_awesome'):
        get_awesome_pytorch_readme()
        repos = get_repos_from_readme('pytorch_awesome.md')

    # Download zips
    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
        download_repo(r[0], r[1], i)

    # Extract downloads
    with monit.section('Extract zips'):
        download = Path(lab.get_data_path() / 'download')

        for repo in download.iterdir():
            extract_zip(repo, overwrite)

    with monit.section('Remove non python files'):
        remove_files(lab.get_data_path() / 'source', {'.py'})
Example #26
def _data_loader(is_train, batch_size):
    return torch.utils.data.DataLoader(
        datasets.MNIST(str(lab.get_data_path()),
                       train=is_train,
                       download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
Example #27
    def __init__(self, image_size):
        transform = torchvision.transforms.Compose([
            torchvision.transforms.Resize(image_size),
            torchvision.transforms.ToTensor(),
        ])

        super().__init__(str(lab.get_data_path()),
                         train=True,
                         download=True,
                         transform=transform)
Example #28
def download():
    path = Path(lab.get_data_path() / 'download')
    if not path.exists():
        path.mkdir(parents=True)

    get_awesome_pytorch()
    repos = get_repos('pytorch_awesome.md')

    for i, r in monit.enum("Download", repos):
        download_repo(r[0], r[1], i)
Example #29
def cache_set(name: str, value: Any, file_type: str = 'json') -> Any:
    cache_path = lab.get_data_path() / 'cache'
    if not cache_path.exists():
        cache_path.mkdir(parents=True)
    path = cache_path / f'{name}.{file_type}'
    with open(str(path), 'w') as f:
        if file_type == 'json':
            json.dump(value, f)
        else:
            raise ValueError(f'Unknown file type: {file_type}')
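A hedged read-side counterpart; this `cache_get` helper mirrors `cache_set` above and is an illustration, not an API taken from the example:

def cache_get(name: str, file_type: str = 'json') -> Any:
    path = lab.get_data_path() / 'cache' / f'{name}.{file_type}'
    # Return None if the value was never cached
    if not path.exists():
        return None
    with open(str(path), 'r') as f:
        if file_type == 'json':
            return json.load(f)
        raise ValueError(f'Unknown file type: {file_type}')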
Example #30
    def initialize(self):
        """
        ## Initialize models and data loaders
        """
        input_shape = (self.img_channels, self.img_height, self.img_width)

        # Create the models
        self.generator_xy = GeneratorResNet(self.img_channels, self.n_residual_blocks).to(self.device)
        self.generator_yx = GeneratorResNet(self.img_channels, self.n_residual_blocks).to(self.device)
        self.discriminator_x = Discriminator(input_shape).to(self.device)
        self.discriminator_y = Discriminator(input_shape).to(self.device)

        # Create the optimizers
        self.generator_optimizer = torch.optim.Adam(
            itertools.chain(self.generator_xy.parameters(), self.generator_yx.parameters()),
            lr=self.learning_rate, betas=self.adam_betas)
        self.discriminator_optimizer = torch.optim.Adam(
            itertools.chain(self.discriminator_x.parameters(), self.discriminator_y.parameters()),
            lr=self.learning_rate, betas=self.adam_betas)

        # Create the learning rate schedules.
        # The learning rate stays flat for the first `decay_start` epochs,
        # and then decays linearly to $0$ at the end of training.
        decay_epochs = self.epochs - self.decay_start
        self.generator_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.generator_optimizer, lr_lambda=lambda e: 1.0 - max(0, e - self.decay_start) / decay_epochs)
        self.discriminator_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.discriminator_optimizer, lr_lambda=lambda e: 1.0 - max(0, e - self.decay_start) / decay_epochs)
        # Location of the dataset
        images_path = lab.get_data_path() / 'cycle_gan' / self.dataset_name

        # Image transformations
        transforms_ = [
            transforms.Resize(int(self.img_height * 1.12), Image.BICUBIC),
            transforms.RandomCrop((self.img_height, self.img_width)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]

        # Training data loader
        self.dataloader = DataLoader(
            ImageDataset(images_path, transforms_, True, 'train'),
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.data_loader_workers,
        )

        # Validation data loader
        self.valid_dataloader = DataLoader(
            ImageDataset(images_path, transforms_, True, "test"),
            batch_size=5,
            shuffle=True,
            num_workers=self.data_loader_workers,
        )
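As a quick check of the decay schedule above, with illustrative values `epochs = 200` and `decay_start = 100` the multiplier stays at 1.0 for the first 100 epochs and then falls linearly to 0:

# Sketch only: illustrative epochs/decay_start values, not taken from the configuration
epochs, decay_start = 200, 100
decay_epochs = epochs - decay_start
lr_lambda = lambda e: 1.0 - max(0, e - decay_start) / decay_epochs
print([lr_lambda(e) for e in (0, 50, 100, 150, 200)])  # [1.0, 1.0, 1.0, 0.5, 0.0]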