def preprocess_squad():
    """Download the SQuAD train/dev JSON files and preprocess them.

    Raw files are fetched into download/squad; the shuffled, processed
    datasets are saved under the prefixes given by config.
    """
    download_prefix = os.path.join("download", "squad")
    data_prefix = os.path.join("data", "squad")
    print("Downloading datasets into {}".format(download_prefix))
    print("Preprocessing datasets into {}".format(data_prefix))
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(download_prefix, exist_ok=True)
    os.makedirs(data_prefix, exist_ok=True)

    # The byte counts let maybe_download verify a complete download.
    train_filename = maybe_download(squad_base_url, config.SQUAD_TRAIN_FILENAME,
                                    download_prefix, 30288272)
    train_data = SquadData.load_raw(train_filename)
    train_data.shuffle()
    train_data.save(config.SQUAD_TRAIN_PREFIX)

    dev_filename = maybe_download(squad_base_url, config.SQUAD_DEV_FILENAME,
                                  download_prefix, 4854279)
    dev_data = SquadData.load_raw(dev_filename)
    dev_data.shuffle()
    dev_data.save(config.SQUAD_DEV_PREFIX)
def download(self):
    """Ensure the data directory exists and fetch any missing data files.

    Iterates self.data_files and downloads each file from self.base_url
    into self.path unless it is already present on disk.
    """
    data_dir = self.path
    if not os.path.exists(data_dir):
        # makedirs creates intermediate directories too; os.mkdir would
        # fail if the parent of self.path did not exist yet.
        os.makedirs(data_dir)
    for filename in self.data_files.values():
        # os.path.join is portable, unlike manual '/' concatenation.
        path = os.path.join(self.path, filename)
        if not os.path.exists(path):
            url = urljoin(self.base_url, filename)
            util.maybe_download(url, path)
def prepare_ptb(files=None):
    """Download the Penn Treebank train/valid/test splits.

    Args:
        files: optional mapping with 'train'/'test'/'valid' keys naming the
            local destination paths. Defaults to ./ptb/ptb.<split>.txt.
    """
    if files is None:
        # Built inside the function body: a mutable default dict would be
        # shared across calls. This also fixes the original default's
        # 'ptb.valid_txt' typo (underscore) so the valid-split filename
        # matches the train/test naming scheme.
        files = {'train': './ptb/ptb.train.txt',
                 'test': './ptb/ptb.test.txt',
                 'valid': './ptb/ptb.valid.txt'}
    train_url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'
    valid_url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.valid.txt'
    test_url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.test.txt'
    train_txt = maybe_download(files['train'], train_url)
    test_txt = maybe_download(files['test'], test_url)
    valid_txt = maybe_download(files['valid'], valid_url)
def download(self):
    """Ensure the data directory exists and fetch any missing data files.

    Iterates self.data_files and downloads each file from self.base_url
    into self.path unless it is already present on disk.
    """
    data_dir = self.path
    if not os.path.exists(data_dir):
        # makedirs creates intermediate directories too; os.mkdir would
        # fail if the parent of self.path did not exist yet.
        os.makedirs(data_dir)
    for filename in self.data_files.values():
        # os.path.join is portable, unlike manual '/' concatenation.
        path = os.path.join(self.path, filename)
        if not os.path.exists(path):
            url = urljoin(self.base_url, filename)
            util.maybe_download(url, path)
def get_code(buttons):
    """Walk the keypad grid *buttons* following the puzzle's U/D/L/R moves
    and return the code built from the key reached after each input line.

    Moves that would leave the grid or land on a ' ' cell are ignored.
    """
    height = len(buttons)
    width = len(buttons[0])
    # Locate the starting key, labeled '5'.
    x = None
    y = None
    for row in range(height):
        for col in range(width):
            if buttons[row][col] == '5':
                x = col
                y = row
                break
        if x is not None:
            break
    code = ''
    with maybe_download(2) as file:
        for line in file:
            for move in line.strip():
                if move == 'U' and y - 1 >= 0 and buttons[y - 1][x] != ' ':
                    y -= 1
                elif move == 'D' and y + 1 < height and buttons[y + 1][x] != ' ':
                    y += 1
                elif move == 'L' and x - 1 >= 0 and buttons[y][x - 1] != ' ':
                    x -= 1
                elif move == 'R' and x + 1 < width and buttons[y][x + 1] != ' ':
                    x += 1
            # One code character per input line: wherever we ended up.
            code += buttons[y][x]
    return code
def part1():
    """Count how many lines of the day-3 input describe valid triangles."""
    count = 0
    with maybe_download(3) as file:
        for line in file:
            a, b, c = [int(tok) for tok in line.split()]
            if is_triangle(a, b, c):
                count += 1
    print('part1:', count)
def part2():
    """Count the day-7 addresses that support SSL."""
    with maybe_download(7) as file:
        total = sum(1 for line in file if is_ssl(line.strip()))
    print('part2', total)
def part2():
    """Decrypt the real day-4 rooms and print any whose name mentions 'north'."""
    with maybe_download(4) as file:
        for line in map(str.rstrip, file):
            name, sector_id, checksum = read_line(line)
            # Only rooms whose stored checksum matches are real.
            if get_checksum(name) == checksum:
                real_name = decrypt(name, sector_id)
                if 'north' in real_name:
                    print(real_name, sector_id)
def part1():
    """Sum the sector ids of the real day-4 rooms (checksum matches)."""
    total = 0
    with maybe_download(4) as file:
        for line in map(str.rstrip, file):
            name, sector_id, checksum = read_line(line)
            if get_checksum(name) == checksum:
                total += sector_id
    print('part1', total)
def part1():
    """Apply every day-8 screen operation and count the lit pixels."""
    # 6 rows x 50 columns, all pixels initially off (0).
    display = [[0] * 50 for _ in range(6)]
    with maybe_download(8) as file:
        for line in file:
            apply_op(display, line.strip())
    lit = sum(sum(row) for row in display)
    print('part1', lit)
def part1():
    """Build the 8-character day-5 password from hash digit 5 of each
    qualifying hash produced by next_hash."""
    with maybe_download(5) as file:
        door_id = file.readline().strip()
    index = 0
    password = ''
    while len(password) < 8:
        digest, index = next_hash(door_id, index)
        password += digest[5]
        # Progress output: show the password as it is discovered.
        print('partial pass', password)
    print('part1', password)
def part1():
    """Follow the day-1 turn/step directions and print the final taxicab
    distance from the origin."""
    x, y = 0, 0
    heading = 'N'
    with maybe_download(1) as file:
        ops = [token.strip() for token in file.read().split(',')]
    for op in ops:
        x, y, heading = fast_forward(x, y, heading, op)
    print('part1:', abs(x) + abs(y))
def part1And2():
    # Feed every day-10 instruction line into the bot simulation, then
    # multiply together the chip values that ended up in outputs 0, 1, 2.
    bots = {}
    outputs = {}
    with maybe_download(10) as file:
        for s in file:
            s = s.strip()
            # apply_instruction mutates `bots`/`outputs` in place.
            apply_instruction(bots, outputs, s)
    n = 1
    for out_id in [0, 1, 2]:
        # NOTE(review): `.values` is read as an attribute (not called), so the
        # entries of `outputs` are presumably objects with a `values` sequence
        # populated by apply_instruction — confirm they are not plain dicts,
        # whose `.values` is a method and would not iterate this way.
        for x in outputs[out_id].values:
            n *= x
    # Only the part-2 answer is printed despite the function's name.
    print('part2', n)
def part2():
    """Build the positional day-5 password: for each qualifying hash,
    digit 5 names the slot (0-7) and digit 6 is the character."""
    with maybe_download(5) as file:
        door_id = file.readline().strip()
    index = 0
    password = [' '] * 8
    filled = 0
    while filled < 8:
        digest, index = next_hash(door_id, index)
        slot = digest[5]
        if slot.isdigit():
            slot = int(slot)
            # Only the first hash seen for a position counts.
            if slot < 8 and password[slot] == ' ':
                password[slot] = digest[6]
                filled += 1
                print('partial pass', ''.join(password))
    print('part2', ''.join(password))
def get_glove():
    """Download the GloVe embeddings archive and extract it into
    config.GLOVE_DIR, skipping extraction when the target txt already exists."""
    prefix = config.GLOVE_DIR
    print("Storing datasets in {}".format(prefix))
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(prefix, exist_ok=True)
    glove_zip = maybe_download(config.GLOVE_BASE_URL, config.GLOVE_FILENAME,
                               config.GLOVE_DIR, 862182613)
    if os.path.exists(os.path.join(prefix, 'glove.6B.{}d.txt'.format(config.GLOVE_DIM))):
        return
    print('Unzipping GloVe data')
    # Context manager guarantees the archive handle is closed even if
    # extractall raises (the original leaked it on error).
    with zipfile.ZipFile(os.path.join(config.GLOVE_DIR, config.GLOVE_FILENAME), 'r') as zf:
        zf.extractall(config.GLOVE_DIR)
def part2():
    """Walk the day-1 directions one step at a time and print the taxicab
    distance to the first location visited twice."""
    visited = set()
    x = 0
    y = 0
    dir = 'N'
    visited.add((x, y))
    with maybe_download(1) as file:
        data = [s.strip() for s in file.read().split(',')]
    for op in data:
        dir = next_dir(dir, op)
        n = int(op[1:])
        for i in range(n):
            x, y = next_pos(x, y, dir, 1)
            if (x, y) in visited:
                dist = abs(x) + abs(y)
                # Bug fix: this function answers part 2 but printed 'part1:'.
                print('part2:', dist)
                return
            else:
                visited.add((x, y))
def part2():
    """Count valid day-3 triangles when the numbers are read down the
    columns, three rows at a time."""
    col1, col2, col3 = [], [], []
    with maybe_download(3) as file:
        for line in file:
            a, b, c = [int(tok) for tok in line.split()]
            col1.append(a)
            col2.append(b)
            col3.append(c)
    valid = 0
    # Each column yields one candidate triangle per group of three rows.
    for column in (col1, col2, col3):
        for i in range(0, len(column), 3):
            if is_triangle(*column[i:i + 3]):
                valid += 1
    print('part2:', valid)
def _flatten_images(images):
    """Flatten (count, h, w) image arrays to (count, h*w) float32 scaled
    to [0.0, 1.0]; return (flattened, count)."""
    count = images.shape[0]
    flat = images.reshape(
        images.shape[0], images.shape[1] * images.shape[2]).astype(np.float32)
    # Convert from [0, 255] -> [0.0, 1.0].
    return np.multiply(flat, 1.0 / 255.0), count


def main():
    """Download MNIST, split off a validation set, normalize all images,
    and hand the resulting Datasets tuple to network()."""
    DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    DATA_DIR_PATH = './data/MNIST'
    VALIDATION_SIZE = 5000

    local_file = util.maybe_download(TRAIN_IMAGES, DATA_DIR_PATH,
                                     DEFAULT_SOURCE_URL + TRAIN_IMAGES)
    train_images = util.extract_images(local_file)
    local_file = util.maybe_download(TRAIN_LABELS, DATA_DIR_PATH,
                                     DEFAULT_SOURCE_URL + TRAIN_LABELS)
    train_labels = util.extract_labels(local_file)
    local_file = util.maybe_download(TEST_IMAGES, DATA_DIR_PATH,
                                     DEFAULT_SOURCE_URL + TEST_IMAGES)
    test_images = util.extract_images(local_file)
    local_file = util.maybe_download(TEST_LABELS, DATA_DIR_PATH,
                                     DEFAULT_SOURCE_URL + TEST_LABELS)
    test_labels = util.extract_labels(local_file)

    # Carve the validation split off the front of the training data.
    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    Dataset = collections.namedtuple('Dataset', ['images', 'labels', 'num_examples'])
    Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])

    # The identical flatten-and-normalize step was repeated inline for all
    # three splits; it is factored into _flatten_images above.
    train_images, num_train_images = _flatten_images(train_images)
    train = Dataset(train_images, train_labels, num_train_images)

    validation_images, num_validation_images = _flatten_images(validation_images)
    validation = Dataset(validation_images, validation_labels, num_validation_images)

    test_images, num_test_images = _flatten_images(test_images)
    test = Dataset(test_images, test_labels, num_test_images)

    mnist_data = Datasets(train=train, validation=validation, test=test)
    network(mnist_data)
def get_split(self, split_name, is_tritrain):
    """Download the requested .stagged split file and return its sentences."""
    stagged_path = util.maybe_download(
        "data",
        "http://lsz-gpu-01.cs.washington.edu/resources/",
        split_name + ".stagged")
    return self.get_sentences(stagged_path, is_tritrain)
def part2():
    """Decode the day-6 message from the least common letter per column."""
    with maybe_download(6) as file:
        column_counts = get_all_counts(file)
        # counts[-1][0] is the least frequent letter for that column.
        message = ''.join(counts[-1][0] for counts in column_counts)
    print('part2', message)
def part2():
    """Print the recursively-decompressed size of each day-9 input line."""
    with maybe_download(9) as file:
        for line in file:
            # True enables recursive expansion of nested markers.
            print('part2:', decompress_size(line.strip(), True))