def _download_weights_if_needed(self, download_region):
    """Checks for directories containing the ensemble weights, downloads them if necessary.

    The CnnEnsemble uses many GB of pretrained weights for prediction. When using the package
    for the first time, these weights (too heavy for pypi) need to be downloaded. We use the
    built-in keras utility get_file to handle the download and extraction. This function leaves
    some hidden files behind after extraction, which we manually remove.

    After download, the zamba/zamba/models/cnnensemble/ directory will have three new directories:

    1. input - contains training fold splits from the original data, and stores formatting
       information for output
    2. output - contains model weights for all models used in the ensemble
    3. data_fast - contains cached training image preprocessing results
    """
    download_regions = ['us', 'eu', 'asia']
    if download_region not in download_regions:
        raise ValueError(f"download_region must be one of:\t{download_regions}")
    else:
        region_urls = {'us': 'https://s3.amazonaws.com/drivendata-public-assets/',
                       'eu': 'https://s3.eu-central-1.amazonaws.com/drivendata-public-assets-eu/',
                       'asia': 'https://s3-ap-southeast-1.amazonaws.com/drivendata-public-assets-asia/'}
        region_url = region_urls[download_region]

    # file names, paths
    fnames = ["input.tar.gz", "output.tar.gz", "data_fast.zip"]
    cache_dir = Path(__file__).parent
    cache_subdir = Path("cnnensemble")
    paths_needed = [cache_dir / cache_subdir / "input",
                    cache_dir / cache_subdir / "output",
                    cache_dir / cache_subdir / "data_fast"]

    # download and extract if needed
    for path_needed, fname in zip(paths_needed, fnames):
        if not path_needed.exists():
            origin = region_url + fname
            get_file(fname=fname,
                     origin=origin,
                     cache_dir=cache_dir,
                     cache_subdir=cache_subdir,
                     extract=True)

            # remove the compressed file
            remove(cache_dir / cache_subdir / fname)

    # remove hidden files or dirs if present
    cnnpath = cache_dir / cache_subdir
    hidden_dirs = [pth for pth in cnnpath.glob("**/*")
                   if pth.parts[-1].startswith("._") and pth.is_dir()]
    if hidden_dirs:
        deque(map(rmtree, hidden_dirs))
    hidden_files = [pth for pth in cnnpath.glob("**/*")
                    if pth.parts[-1].startswith("._") and pth.is_file()]
    if hidden_files:
        deque(map(remove, hidden_files))
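# Hedged usage sketch (not part of the source): assuming the surrounding class is zamba's
# CnnEnsemble and that this method is invoked during model setup, a first-run download for a
# European user could look like the lines below; the constructor arguments and the call site
# are illustrative assumptions, not the package's documented API.
#
#     ensemble = CnnEnsemble(model_path=Path("."), download_region="eu")
#     ensemble._download_weights_if_needed("eu")  # no-op once input/, output/, data_fast/ exist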
def __init__(self, data_format=IMAGE_DATA_FORMAT, pre_trained=False, name=None,
             batch_norm=True):
    super(GSCSuperSparseCNN, self).__init__(
        linear_units=1500,
        linear_percent_on=0.067,
        linear_weight_sparsity=0.1,
        data_format=data_format,
        pre_trained=False,
        name=name,
        batch_norm=batch_norm,
    )
    if pre_trained:
        if not batch_norm:
            raise NotImplementedError(
                "Unable to load pre-trained models with no BatchNorm")
        model_url, model_hash = MODEL_URLS["gsc_super_sparse_cnn"]
        file_name = "gsc_super_sparse_cnn-{:.8}".format(model_hash)
        archive_path = get_file(
            fname="{}.tar.gz".format(file_name),
            origin=model_url,
            file_hash=model_hash,
            extract=True,
            cache_subdir="models",
        )
        cache_dir = os.path.dirname(archive_path)
        self.load_weights(
            os.path.join(cache_dir, "gsc_super_sparse_cnn.h5"))
def load_data(cls,
              subset_name: str = 'train',
              task_name: str = 'ner',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format (CoNLL-style files).

    Args:
        subset_name: {train, test, valid}
        task_name: {pos, chunking, ner}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if subset_name not in {'train', 'test', 'valid'}:
        raise ValueError("subset_name must be one of {'train', 'test', 'valid'}")

    file_path = os.path.join(corpus_path, f'{subset_name}.txt')

    if task_name not in {'pos', 'chunking', 'ner'}:
        raise ValueError("task_name must be one of {'pos', 'chunking', 'ner'}")

    data_index = ['pos', 'chunking', 'ner'].index(task_name) + 1

    x_data, y_data = DataReader.read_conll_format_file(file_path,
                                                       label_index=data_index)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data
def load_data(): """Loads the Fashion-MNIST dataset. # Returns Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. """ dirname = os.path.join('datasets', 'fashion-mnist') base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/' files = [ 'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz' ] paths = [] for file in files: paths.append(get_file(file, origin=base + file, cache_subdir=dirname)) with gzip.open(paths[0], 'rb') as lbpath: y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8) with gzip.open(paths[1], 'rb') as imgpath: x_train = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28) with gzip.open(paths[2], 'rb') as lbpath: y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8) with gzip.open(paths[3], 'rb') as imgpath: x_test = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28) return (x_train, y_train), (x_test, y_test)
def build_dataset():
    """Reads the 20 Newsgroups dataset and returns texts (list), labels (list)
    and a dict mapping label names to indices
    """
    print('Processing Dataset...')
    zipped = get_file('20news-18828.tar.gz', DATA_URL, extract=True)
    data_dir = zipped[:-7]  # remove .tar.gz

    # Text samples and their labels
    texts = []  # list of text samples
    label2idx = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids

    for folder in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, folder)
        if os.path.isdir(path):
            label = len(label2idx)
            label2idx[folder] = label
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    # A text file
                    fpath = os.path.join(path, fname)
                    with open(fpath, encoding='latin-1') as f:
                        t = f.read()
                        i = t.find('\n\n')  # skip header
                        if 0 < i:
                            t = t[i:]
                        texts.append(t)
                        labels.append(label)

    print('Found {} texts.'.format(len(texts)))
    return texts, labels, label2idx
def test_bert_embedding(self):
    text, label = ChineseDailyNerCorpus.load_data()
    is_bold = np.random.randint(1, 3, (len(text), 12))

    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)

    text_embedding = BERTEmbedding(bert_path,
                                   task=kashgari.LABELING,
                                   sequence_length=12)
    num_feature_embedding = NumericFeaturesEmbedding(2,
                                                     'is_bold',
                                                     sequence_length=12)

    stack_embedding = StackedEmbedding([text_embedding, num_feature_embedding])
    stack_embedding.analyze_corpus((text, is_bold), label)

    tensor = stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
    print(tensor[0][0].shape)
    print(tensor[0][1].shape)
    print(tensor[1].shape)
    print(stack_embedding.embed_model.input_shape)
    print(stack_embedding.embed_model.summary())

    r = stack_embedding.embed((text[:3], is_bold[:3]))
    assert r.shape == (3, 12, 24)
def load_weights(self):
    weights_path = get_file(
        'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
        WEIGHTS_PATH_NO_TOP,
        cache_subdir='models',
        file_hash='6d6bbae143d832006294945121d1f1fc')
    self.created_model.load_weights(weights_path, by_name=True)
def load_data():
    base = "file:///D:/fashionmnist/"
    files = [
        'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
    ]

    paths = []
    for fname in files:
        paths.append(get_file(fname, origin=base + fname))

    with gzip.open(paths[0], 'rb') as lbpath:
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        x_train = np.frombuffer(imgpath.read(), np.uint8,
                                offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(imgpath.read(), np.uint8,
                               offset=16).reshape(len(y_test), 28, 28)

    return (x_train, y_train), (x_test, y_test)
def create(self):
    """
    Creates the VGG16 network architecture and loads the pretrained weights.

    Args:   None
    Returns:   None
    """
    model = self.model = Sequential()
    # model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))
    model.add(Lambda(vgg_preprocess, input_shape=(224, 224, 3)))

    self.ConvBlock(2, 64)
    self.ConvBlock(2, 128)
    self.ConvBlock(3, 256)
    self.ConvBlock(3, 512)
    self.ConvBlock(3, 512)

    model.add(Flatten())
    self.FCBlock()
    self.FCBlock()
    model.add(Dense(1000, activation='softmax'))

    # fname = 'vgg16.h5'  # unused: superseded by the TF-ordering weight file below
    fname = 'vgg16_weights_tf_dim_ordering_tf_kernels.h5'
    model.load_weights(get_file(fname, self.FILE_PATH + fname, cache_subdir='models'))
def setUpClass(cls):
    cls.embedding_class = BERTEmbedding

    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)
    cls.config = {'model_folder': bert_path}
def get_classes(self): """ Downloads the Imagenet classes index file and loads it to self.classes. The file is downloaded only if it not already in the cache. """ fname = 'imagenet_class_index.json' fpath = get_file(fname, self.FILE_PATH+fname, cache_subdir='models') with open(fpath) as f: class_dict = json.load(f) self.classes = [class_dict[str(i)][1] for i in range(len(class_dict))]
def load_data():
    '''
    load data from MovieLens 100K Dataset
    http://grouplens.org/datasets/movielens/

    Note that this method uses ua.base and ua.test in the dataset.

    :return: train_users, train_x, test_users, test_x
    :rtype: list of int, numpy.array, list of int, numpy.array
    '''
    path = get_file(
        'ml-100k.zip',
        origin='http://files.grouplens.org/datasets/movielens/ml-100k.zip')

    with ZipFile(path, 'r') as ml_zip:
        max_item_id = -1
        train_history = {}
        with ml_zip.open('ml-100k/ua.base', 'r') as file:
            for line in file:
                user_id, item_id, rating, timestamp = line.decode(
                    'utf-8').rstrip().split('\t')
                if int(user_id) not in train_history:
                    train_history[int(user_id)] = [int(item_id)]
                else:
                    train_history[int(user_id)].append(int(item_id))

                if max_item_id < int(item_id):
                    max_item_id = int(item_id)

        test_history = {}
        with ml_zip.open('ml-100k/ua.test', 'r') as file:
            for line in file:
                user_id, item_id, rating, timestamp = line.decode(
                    'utf-8').rstrip().split('\t')
                if int(user_id) not in test_history:
                    test_history[int(user_id)] = [int(item_id)]
                else:
                    test_history[int(user_id)].append(int(item_id))

    max_item_id += 1  # item_id starts from 1

    train_users = list(train_history.keys())
    train_x = numpy.zeros((len(train_users), max_item_id), dtype=numpy.int32)
    print(train_x.shape)
    for i, hist in enumerate(train_history.values()):
        # print(hist)
        mat = to_categorical(hist, max_item_id)
        # print(mat.shape)
        train_x[i] = numpy.sum(mat, axis=0)
        # print(len(train_x[i]))

    test_users = list(test_history.keys())
    test_x = numpy.zeros((len(test_users), max_item_id), dtype=numpy.int32)
    for i, hist in enumerate(test_history.values()):
        mat = to_categorical(hist, max_item_id)
        test_x[i] = numpy.sum(mat, axis=0)

    return train_users, train_x, test_users, test_x
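# Usage sketch (not from the original module), assuming it lives next to the MovieLens loader
# above: ml-100k contains 943 users, and each row of train_x is a multi-hot vector over item
# ids, so a quick consistency check could look like this. The helper name is introduced here
# for illustration only.
def _check_movielens_shapes():
    train_users, train_x, test_users, test_x = load_data()
    assert train_x.shape[0] == len(train_users)
    assert test_x.shape[1] == train_x.shape[1]  # same padded item dimension for both splits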
def setUpClass(cls):
    sample_w2v_path = get_file('sample_w2v.txt',
                               "http://s3.bmio.net/kashgari/sample_w2v.txt",
                               cache_dir=DATA_PATH)
    cls.embedding_class = WordEmbedding

    cls.config = {
        'w2v_path': sample_w2v_path
    }
    cls.embedding_size = 100
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True,
              cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
    """
    Load dataset as sequence classification format, char level tokenized

    features: ``[['听', '新', '闻', '。'], ['电', '视', '台', '在', '播', '什', '么'], ...]``

    labels: ``['news', 'epg', ...]``

    Samples::

        train_x, train_y = SMP2018ECDTCorpus.load_data('train')
        test_x, test_y = SMP2018ECDTCorpus.load_data('test')

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.
        cutter: sentence cutter, {char, jieba}

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if cutter not in ['char', 'jieba', 'none']:
        raise ValueError('cutter error, please use one of the {char, jieba}')

    df_path = os.path.join(corpus_path, f'{subset_name}.csv')
    df = pd.read_csv(df_path)
    if cutter == 'jieba':
        try:
            import jieba
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "please install jieba, `$ pip install jieba`")
        x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
    else:
        # char-level cut (the original `elif 'char':` condition was always truthy)
        x_data = [list(item) for item in df['query'].to_list()]
    y_data = df['label'].to_list()

    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                  f"x[0]: {x_data[0]}\n"
                  f"y[0]: {y_data[0]}")
    return x_data, y_data
def CrossModalityXception(num_classes, pre_trained, cross_modality_pre_training,
                          input_shape, include_feature_fields=False):
    cross_modality_pre_training = cross_modality_pre_training and pre_trained

    # create the model
    model = Xception(classes=num_classes,
                     weights=None,
                     input_shape=input_shape,
                     include_top=True)
    channels = input_shape[2]

    # load weight file >>> downloads some file from github
    weights_path = get_file(
        'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',
        TF_WEIGHTS_PATH_NO_TOP,
        cache_subdir='models',
        file_hash='b0042744bf5b25fce3cb969f33bebb97')

    weight_values_ = get_named_layer_weights_from_h5py(weights_path)
    symbolic_weights_ = get_symbolic_filtered_layer_weights_from_model(
        model)[:len(weight_values_)]

    if cross_modality_pre_training:  # use a pretrained convolution weight
        # update it (name,[kernel,bias])
        # cross modality pre-training for kernel
        # leave bias as is of course
        weight_values_[0] = (
            "conv1_cross_modality",
            [
                cross_modality_init(
                    kernel=weight_values_[0][1][0],
                    in_channels=channels
                ),  # 0 = first layer , 1 = weight_value , 0 = kernel
                # Xception has no bias
            ])
    else:  # start the first convolution layer as random glorot
        symbolic_weights_ = symbolic_weights_[1:]
        weight_values_ = weight_values_[1:]

    if pre_trained:  # do weight loading
        load_layer_weights(weight_values=weight_values_,
                           symbolic_weights=symbolic_weights_)

    if include_feature_fields:
        return Model(model.inputs,
                     [layer.output for layer in model.layers[-2:]])
    else:
        return model
def resnet50_encoder(pretrained_weights='imagenet',
                     num_classes=20,
                     input_size=(256, 256, 1)):
    pretrained_url = "https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5"

    inp = Input(input_size)
    x = ZeroPadding2D((3, 3))(inp)
    x = Conv2D(64, (7, 7), strides=(2, 2))(x)
    feature_map_1 = x

    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((3, 3), strides=(2, 2))(x)

    x = conv_block(x, [64, 64, 256], strides=(1, 1))
    x = identity_block(x, [64, 64, 256])
    x = identity_block(x, [64, 64, 256])
    feature_map_2 = ZeroPadding2D((1, 1))(x)
    feature_map_2 = Lambda(lambda x: x[:, :-1, :-1, :])(feature_map_2)

    x = conv_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    feature_map_3 = x

    x = conv_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    feature_map_4 = x

    x = conv_block(x, [512, 512, 2048])
    x = identity_block(x, [512, 512, 2048])
    x = identity_block(x, [512, 512, 2048])
    feature_map_5 = x

    model = Model(inputs=inp, outputs=feature_map_5)
    if pretrained_weights == 'imagenet':
        weights_path = utils.get_file(pretrained_url.split("/")[-1], pretrained_url)
        model.load_weights(weights_path)

    return inp, [
        feature_map_1, feature_map_2, feature_map_3, feature_map_4, feature_map_5
    ]
def load_data(cls, model_name):
    """
    Download pretrained GPT-2 models

    Args:
        model_name: {117M, 345M}

    Returns:
        GPT-2 model folder
    """
    model_folder: pathlib.Path = pathlib.Path(
        os.path.join(macros.DATA_PATH, 'datasets', f'gpt2-{model_name}'))
    model_folder.mkdir(exist_ok=True, parents=True)

    for filename in [
            'checkpoint', 'encoder.json', 'hparams.json',
            'model.ckpt.data-00000-of-00001', 'model.ckpt.index',
            'model.ckpt.meta', 'vocab.bpe'
    ]:
        url = "https://storage.googleapis.com/gpt-2/models/" + model_name + "/" + filename
        get_file(os.path.join(f'gpt2-{model_name}', filename),
                 url,
                 cache_dir=macros.DATA_PATH)
    return str(model_folder)
def CrossModalityResNet50(num_classes, pre_trained, cross_modality_pre_training,
                          input_shape):
    """Pretrained Resnet50 model from keras which uses cross modality pretraining
    to obtain a convolution weight that suits the 20 channels needed by the motion stream"""
    cross_modality_pre_training = cross_modality_pre_training and pre_trained

    # create the model
    model = ResNet50(classes=num_classes,
                     weights=None,
                     input_shape=input_shape,
                     include_top=True)
    channels = input_shape[2]

    # load weight file >>> downloads some file from github
    weights_path = get_file(
        'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
        WEIGHTS_PATH_NO_TOP,
        cache_subdir='models',
        md5_hash='a268eb855778b3df3c7506639542a6af')

    # get the named weights of each layer
    weight_values_ = get_named_layer_weights_from_h5py(weights_path)
    # get the symbolic weights of each layer
    symbolic_weights_ = get_symbolic_filtered_layer_weights_from_model(
        model)[:len(weight_values_)]

    if cross_modality_pre_training:  # use a pretrained convolution weight
        # update it (name,[kernel,bias])
        # cross modality pre-training for kernel
        # leave bias as is of course
        weight_values_[0] = (
            "conv1_cross_modality",
            [
                cross_modality_init(
                    kernel=weight_values_[0][1][0],
                    in_channels=channels
                ),  # 0 = first layer , 1 = weight_value , 0 = kernel
                weight_values_[0][1][1]  # 0 = first layer , 1 = weight_value , 1 = bias
            ]
        )
    else:  # start the first convolution layer as random glorot
        symbolic_weights_ = symbolic_weights_[1:]
        weight_values_ = weight_values_[1:]

    if pre_trained:  # do weight loading
        load_layer_weights(weight_values=weight_values_,
                           symbolic_weights=symbolic_weights_)

    return model
def _load_data(self):
    dirname = 'cifar-100-python'
    origin = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    fpath = os.path.join(path, 'train')
    x_train, yf_train, yc_train = self._extract_data(fpath)

    fpath = os.path.join(path, 'test')
    x_test, yf_test, yc_test = self._extract_data(fpath)

    # Put channels last
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, yc_train, yf_train), (x_test, yc_test, yf_test)
def load_imdb(seed=123):
    """
    Loads the IMDb movie reviews sentiment analysis dataset.

    Parameters
    ----------
    seed: int
        Seed for randomizer.

    Returns
    -------
    train_texts, train_labels, test_texts, test_labels:
        A tuple of training and validation data.
    """
    zipped = get_file('aclImdb_v1.tar.gz', IMDB_URL, extract=True)
    data_path = zipped[:-10]  # strip the trailing '_v1.tar.gz'

    # Load the training data
    train_texts = []
    train_labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(data_path, 'train', category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    train_texts.append(f.read())
                train_labels.append(0 if category == 'neg' else 1)

    # Load the validation data.
    test_texts = []
    test_labels = []
    for category in ['pos', 'neg']:
        test_path = os.path.join(data_path, 'test', category)
        for fname in sorted(os.listdir(test_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    test_texts.append(f.read())
                test_labels.append(0 if category == 'neg' else 1)

    # Shuffle the training texts and labels with the same seed so they stay aligned.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return (train_texts, np.array(train_labels)), (test_texts, np.array(test_labels))
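# Usage sketch (not part of the source), assuming it runs alongside load_imdb above: the
# aclImdb archive has 25,000 labelled reviews in each split, and shuffling both lists with the
# same seed keeps texts and labels aligned, which this helper (a name introduced here) checks.
def _check_imdb_split_sizes(seed=123):
    (train_texts, train_labels), (test_texts, test_labels) = load_imdb(seed=seed)
    assert len(train_texts) == len(train_labels) == 25000
    assert len(test_texts) == len(test_labels) == 25000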
def _get_model(self, file_name="megadetector_v3.pb", model_url=None): if model_url is None: model_url = self.MODEL_URL cache_subdir = "megadetector" model_path = zamba.config.cache_dir / cache_subdir / file_name if not model_path.exists(): model_path = get_file( fname=file_name, origin=model_url, cache_dir=zamba.config.cache_dir, cache_subdir=cache_subdir, extract=True, ) return model_path
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format, char level tokenized

    features: ``[['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', ...], ...]``

    labels: ``[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', ...], ...]``

    Sample::

        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        test_x, test_y = ChineseDailyNerCorpus.load_data('test')

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=k.DATA_PATH,
                           untar=True)

    if subset_name == 'train':
        file_path = os.path.join(corpus_path, 'example.train')
    elif subset_name == 'test':
        file_path = os.path.join(corpus_path, 'example.test')
    else:
        file_path = os.path.join(corpus_path, 'example.dev')

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logging.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data
def _get_model(self, file_name="zamba-and-obj-rec-0.859.joblib", model_url=None): if model_url is None: model_url = self.MODEL_URL cache_subdir = "blanknonblank" model_path = zamba.config.cache_dir / cache_subdir / file_name if not model_path.exists(): model_path = get_file( fname=file_name, origin=model_url, cache_dir=zamba.config.cache_dir, cache_subdir=cache_subdir, extract=True, ) return model_path
def test_build_with_BERT_and_fit(self):
    from kashgari.embeddings import BERTEmbedding
    from tensorflow.python.keras.utils import get_file
    from kashgari.macros import DATA_PATH

    sample_bert_path = get_file('bert_sample_model',
                                "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                                cache_dir=DATA_PATH,
                                untar=True)

    processor = MultiOutputProcessor()
    embedding = BERTEmbedding(model_folder=sample_bert_path, processor=processor)

    m = MultiOutputModel(embedding=embedding)
    m.build_model(train_x, (output_1, output_2))
    m.fit(train_x, (output_1, output_2), epochs=2)

    res = m.predict(train_x[:10])
    assert len(res) == 2
    assert res[0].shape == (10, 3)
def generate_data():
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
    challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
    with tarfile.open(path) as tar:
        train = get_stories(tar.extractfile(challenge.format('train')))
        test = get_stories(tar.extractfile(challenge.format('test')))

    vocab = set()
    for story, q, answer in train + test:
        vocab |= set(story + q + [answer])
    vocab = sorted(vocab)  # 36 words

    vocab_size = len(vocab) + 1
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    story_maxlen = max(map(len, (x for x, _, _ in train + test)))
    query_maxlen = max(map(len, (x for _, x, _ in train + test)))

    # xs: 1000 * 552   xqs: 1000 * 5   ys: 1000 * 36
    train_sqa_vec = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
    test_sqa_vec = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
    return train_sqa_vec, test_sqa_vec, vocab_size, story_maxlen, query_maxlen
def get_classes(self):
    '''
    Downloads the ImageNet class index file and loads it into self.classes,
    unless it is already cached.

    Args:   None
    Returns:   None
    '''
    # Get the ImageNet class indexes and cache them
    fname = 'imagenet_class_index.json'
    fpath = get_file(fname, self.FILE_PATH + fname, cache_subdir='models')

    # Open file and parse json
    with open(fpath) as f:
        class_dict = json.load(f)
    self.classes = [class_dict[str(i)][1] for i in range(len(class_dict))]
def main(args):
    global cold
    was_cold = cold
    cold = False
    try:
        input_bucket = args.get("input_bucket", 1)
        object_key = args.get("object_key", 1)
        model_object_key = args.get("model_object_key", 1)  # example : squeezenet_weights_tf_dim_ordering_tf_kernels.h5
        model_bucket = args.get("model_bucket", 1)

        # download_path = tmp + '{}{}'.format(uuid.uuid4(), object_key)
        download_path = get_file(
            '{}{}'.format(uuid.uuid4(), object_key),
            "https://github.com/kmu-bigdata/serverless-faas-workbench/raw/master/dataset/image/animal-dog.jpg",
            cache_dir='/tmp/')
        # s3_client.download_file(input_bucket, object_key, download_path)

        # `tmp` is presumably a module-level temp-dir path in the original benchmark
        model_path = tmp + '{}{}'.format(uuid.uuid4(), model_object_key)
        # s3_client.download_file(model_bucket, model_object_key, model_path)

        latency, result = predict(download_path)
        _tmp_dic = {x[1]: {'N': str(x[2])} for x in result[0]}
        # NOTE: the original snippet returned an undefined `msg`; here we assume the
        # prediction summary dict is the intended message payload.
        msg = _tmp_dic
        return {"body": {"latency": latency, "msg": msg, "cold": was_cold}}
    except Exception as e:
        err = "whelp"
        try:
            err = traceback.format_exc()
        except Exception as fug:
            err = str(fug)
        # NOTE: `msg` may be undefined if the failure happened before prediction,
        # so report the exception string instead (assumed fix).
        return {"body": {"cust_error": str(e), "thing": err, "cold": was_cold}}
def __init__(self,
             cnn_out_channels=(64, 64),
             cnn_percent_on=(0.095, 0.125),
             linear_units=1000,
             linear_percent_on=0.1,
             linear_weight_sparsity=0.4,
             boost_strength=1.5,
             boost_strength_factor=0.9,
             k_inference_factor=1.5,
             duty_cycle_period=1000,
             data_format=IMAGE_DATA_FORMAT,
             pre_trained=False,
             name=None,
             batch_norm=True,
             **kwargs):
    super(GSCSparseCNN, self).__init__(name=name, **kwargs)

    if data_format == "channels_first":
        axis = 1
        input_shape = (1, 32, 32)
    else:
        axis = -1
        input_shape = (32, 32, 1)

    self.add(keras.layers.Conv2D(
        name="cnn1",
        data_format=data_format,
        input_shape=input_shape,
        filters=cnn_out_channels[0],
        kernel_size=5))
    if batch_norm:
        self.add(keras.layers.BatchNormalization(
            name="cnn1_batchnorm",
            axis=axis,
            epsilon=1e-05,
            momentum=0.9,
            center=False,
            scale=False))
    self.add(keras.layers.MaxPool2D(
        name="cnn1_maxpool",
        pool_size=2,
        padding="same",
        data_format=data_format))
    self.add(KWinners2d(
        name="cnn1_kwinner",
        data_format=data_format,
        percent_on=cnn_percent_on[0],
        k_inference_factor=k_inference_factor,
        boost_strength=boost_strength,
        boost_strength_factor=boost_strength_factor,
        duty_cycle_period=duty_cycle_period))
    self.add(keras.layers.Conv2D(
        name="cnn2",
        data_format=data_format,
        filters=cnn_out_channels[1],
        kernel_size=5))
    if batch_norm:
        self.add(keras.layers.BatchNormalization(
            name="cnn2_batchnorm",
            axis=axis,
            epsilon=1e-05,
            momentum=0.9,
            center=False,
            scale=False))
    self.add(keras.layers.MaxPool2D(
        name="cnn2_maxpool",
        pool_size=2,
        padding="same",
        data_format=data_format))
    self.add(KWinners2d(
        name="cnn2_kwinner",
        data_format=data_format,
        percent_on=cnn_percent_on[1],
        k_inference_factor=k_inference_factor,
        boost_strength=boost_strength,
        boost_strength_factor=boost_strength_factor,
        duty_cycle_period=duty_cycle_period))
    self.add(keras.layers.Flatten(name="flatten", data_format=data_format))
    self.add(keras.layers.Dense(
        name="linear",
        units=linear_units,
        kernel_constraint=SparseWeights(linear_weight_sparsity)))
    if batch_norm:
        self.add(keras.layers.BatchNormalization(
            name="linear_bn",
            epsilon=1e-05,
            momentum=0.9,
            center=False,
            scale=False))
    self.add(KWinners(
        name="linear_kwinner",
        percent_on=linear_percent_on,
        k_inference_factor=k_inference_factor,
        boost_strength=boost_strength,
        boost_strength_factor=boost_strength_factor,
        duty_cycle_period=duty_cycle_period))
    self.add(keras.layers.Dense(name="output", units=12))
    self.add(keras.layers.Softmax(axis=1))

    if pre_trained:
        if not batch_norm:
            raise NotImplementedError(
                "Unable to load pre-trained models with no BatchNorm")
        model_url, model_hash = MODEL_URLS["gsc_sparse_cnn"]
        file_name = "gsc_sparse_cnn-{:.8}".format(model_hash)
        archive_path = get_file(
            fname="{}.tar.gz".format(file_name),
            origin=model_url,
            file_hash=model_hash,
            extract=True,
            cache_subdir="models")
        cache_dir = os.path.dirname(archive_path)
        self.load_weights(os.path.join(cache_dir, "gsc_sparse_cnn.h5"))
def download(self, data_dir=None, dataset='all'):
    '''Cornell Grasping Dataset - about 5GB total size

    http://pr.cs.cornell.edu/grasping/rect_data/data.php

    Downloads to `~/.keras/datasets/cornell_grasping` by default.
    Includes grasp_listing.txt with all files in all datasets;
    the feature csv files which specify the dataset size,
    the features (data channels), and the number of grasps;
    and the tfrecord files which actually contain all the data.

    If `grasp_listing_hash.txt` is present, an additional hashing step
    will be completed to verify dataset integrity.
    `grasp_listing_hash.txt` will be generated automatically when
    downloading with `dataset='all'`.

    # Arguments

        dataset: The name of the dataset to download, downloads all by default
            with the '' parameter, 102 will download the 102 feature dataset
            found in grasp_listing.txt.

    # Returns

        list of paths to the downloaded files
    '''
    dataset = self._update_dataset_param(dataset)
    if data_dir is None:
        if self.data_dir is None:
            data_dir = FLAGS.data_dir
        else:
            data_dir = self.data_dir
    hypertree_utilities.mkdir_p(data_dir)
    print('Downloading datasets to: ', data_dir)

    url_prefix = ''
    # If a hashed version of the listing is available,
    # download the dataset and verify hashes to prevent data corruption.
    listing_hash = os.path.join(data_dir, 'grasp_listing_hash.txt')
    if os.path.isfile(listing_hash):
        files_and_hashes = np.genfromtxt(listing_hash, dtype='str', delimiter=' ')
        files = [get_file(fpath.split('/')[-1], url_prefix + fpath,
                          cache_subdir=data_dir, file_hash=hash_str, extract=True)
                 for fpath, hash_str in tqdm(files_and_hashes)
                 if '_' + str(dataset) in fpath]
    else:
        # If a hashed version of the listing is not available,
        # simply download the dataset normally.
        listing_url = 'https://raw.githubusercontent.com/ahundt/robot-grasp-detection/master/grasp_listing.txt'
        grasp_listing_path = get_file('grasp_listing.txt', listing_url, cache_subdir=data_dir)
        grasp_files = np.genfromtxt(grasp_listing_path, dtype=str)
        files = [get_file(fpath.split('/')[-1], url_prefix + fpath,
                          cache_subdir=data_dir, extract=True)
                 for fpath in tqdm(grasp_files)
                 if '_' + dataset in fpath]

        # If all files are downloaded, generate a hashed listing.
        if dataset == 'all' or dataset == '':
            print('Hashing all dataset files to prevent corruption...')
            hashes = []
            for i, f in enumerate(tqdm(files)):
                hashes.append(_hash_file(f))
            file_hash_np = np.column_stack([grasp_files, hashes])
            with open(listing_hash, 'wb') as hash_file:
                np.savetxt(hash_file, file_hash_np, fmt='%s', delimiter=' ',
                           header='file_path sha256')
            print('Hashing complete, {} contains each url plus hash, and will be used to '
                  'verify the dataset during future calls to download().'.format(listing_hash))
    return files
def load(path):
    """Ensures that a file is downloaded locally, then unzips and reads it."""
    return GzipFile(get_file(Path(path).name, path)).read()
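# Usage sketch (assumption: `path` is a URL or local path to a gzip-compressed file that
# keras get_file can fetch); the helper caches the download under ~/.keras/datasets and
# returns the decompressed bytes, e.g. for a hypothetical corpus URL:
#
#     raw_bytes = load("https://example.com/corpus.txt.gz")  # hypothetical URL
#     text = raw_bytes.decode("utf-8")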