Example #1
    def _download_weights_if_needed(self, download_region):
        """Checks for directories containing the ensemble weights, downloads them if neccessary.

        The CnnEnsemble uses many GB of pretrained weights for prediction. When using the package for the first
        time, these weights (too heavy for PyPI) need to be downloaded. We use the built-in Keras utility get_file to
        handle the download and extraction. This function leaves some hidden files behind after extraction,
        which we manually remove.

        After download, the zamba/zamba/models/cnnensemble/ directory will have three new directories:

        1. input - contains training fold splits from the original data, and stores formatting information for output
        2. output - contains model weights for all models used in the ensemble
        3. data_fast - contains cached training image preprocessing results
        """

        download_regions = ['us', 'eu', 'asia']
        if download_region not in download_regions:
            raise ValueError(f"download_region must be one of:\t{download_regions}")
        else:
            region_urls = {'us': 'https://s3.amazonaws.com/drivendata-public-assets/',
                           'eu': 'https://s3.eu-central-1.amazonaws.com/drivendata-public-assets-eu/',
                           'asia': 'https://s3-ap-southeast-1.amazonaws.com/drivendata-public-assets-asia/'}
            region_url = region_urls[download_region]

        # file names, paths
        fnames = ["input.tar.gz", "output.tar.gz", "data_fast.zip"]

        cache_dir = Path(__file__).parent
        cache_subdir = Path("cnnensemble")

        paths_needed = [cache_dir / cache_subdir / "input",
                        cache_dir / cache_subdir / "output",
                        cache_dir / cache_subdir / "data_fast"]

        # download and extract if needed
        for path_needed, fname in zip(paths_needed, fnames):
            if not path_needed.exists():
                origin = region_url + fname
                get_file(fname=fname, origin=origin, cache_dir=cache_dir, cache_subdir=cache_subdir,
                         extract=True)

                # remove the compressed file
                remove(cache_dir / cache_subdir / fname)

        # remove hidden files or dirs if present
        cnnpath = cache_dir / cache_subdir
        hidden_dirs = [pth for pth in cnnpath.glob("**/*") if pth.parts[-1].startswith("._") and pth.is_dir()]
        if hidden_dirs:
            deque(map(rmtree, hidden_dirs))
        hidden_files = [pth for pth in cnnpath.glob("**/*") if pth.parts[-1].startswith("._") and pth.is_file()]
        if hidden_files:
            deque(map(remove, hidden_files))
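
The download in Example #1, like most of the examples below, reduces to a single call to the Keras `get_file` utility. The following minimal sketch (the file name and origin URL are placeholders, not real assets) shows the pattern in isolation:

from tensorflow.keras.utils import get_file

# Minimal sketch of the download-and-extract pattern used above.
# The archive name and origin URL below are placeholders.
archive_path = get_file(
    fname="weights.tar.gz",
    origin="https://example.com/weights.tar.gz",
    cache_subdir="models",
    extract=True,  # unpack the archive next to the cached file
)
# get_file returns the local path of the cached archive,
# stored by default under ~/.keras/<cache_subdir>/.
print(archive_path)
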
Example #2
 def __init__(self,
              data_format=IMAGE_DATA_FORMAT,
              pre_trained=False,
              name=None,
              batch_norm=True):
     super(GSCSuperSparseCNN, self).__init__(
         linear_units=1500,
         linear_percent_on=0.067,
         linear_weight_sparsity=0.1,
         data_format=data_format,
         pre_trained=False,
         name=name,
         batch_norm=batch_norm,
     )
     if pre_trained:
         if not batch_norm:
             raise NotImplementedError(
                 "Unable to load pre-trained models with no BatchNorm")
         model_url, model_hash = MODEL_URLS["gsc_super_sparse_cnn"]
         file_name = "gsc_super_sparse_cnn-{:.8}".format(model_hash)
         archive_path = get_file(
             fname="{}.tar.gz".format(file_name),
             origin=model_url,
             file_hash=model_hash,
             extract=True,
             cache_subdir="models",
         )
         cache_dir = os.path.dirname(archive_path)
         self.load_weights(
             os.path.join(cache_dir, "gsc_super_sparse_cnn.h5"))
Example #3
    def load_data(
            cls,
            subset_name: str = 'train',
            task_name: str = 'ner',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """

        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if subset_name not in {'train', 'test', 'valid'}:
            raise ValueError("subset_name must be one of {'train', 'test', 'valid'}")

        file_path = os.path.join(corpus_path, f'{subset_name}.txt')

        if task_name not in {'pos', 'chunking', 'ner'}:
            raise ValueError("task_name must be one of {'pos', 'chunking', 'ner'}")

        data_index = ['pos', 'chunking', 'ner'].index(task_name) + 1

        x_data, y_data = DataReader.read_conll_format_file(
            file_path, label_index=data_index)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
Example #4
def load_data():
    """Loads the Fashion-MNIST dataset.
    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    dirname = os.path.join('datasets', 'fashion-mnist')
    base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    files = [
        'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
    ]

    paths = []
    for file in files:
        paths.append(get_file(file, origin=base + file, cache_subdir=dirname))

    with gzip.open(paths[0], 'rb') as lbpath:
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        x_train = np.frombuffer(imgpath.read(), np.uint8,
                                offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(imgpath.read(), np.uint8,
                               offset=16).reshape(len(y_test), 28, 28)

    return (x_train, y_train), (x_test, y_test)
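
As a quick sanity check, a usage sketch for the loader above; Fashion-MNIST ships 60,000 training and 10,000 test grayscale images of 28x28 pixels:

# Usage sketch for the Fashion-MNIST loader above.
(x_train, y_train), (x_test, y_test) = load_data()
print(x_train.shape, y_train.shape)  # expected: (60000, 28, 28) (60000,)
print(x_test.shape, y_test.shape)    # expected: (10000, 28, 28) (10000,)
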
Example #5
def build_dataset():
    """Reads 20 new dataset and returns text(list), labels(list) and a dict of labels to index """
    print('Processing Dataset...')
    zipped = get_file('20news-18828.tar.gz', DATA_URL, extract=True)
    data_dir = zipped[:-7]  # remove .tar.gz

    # Text samples and their labels
    texts = []  # list of text samples
    label2idx = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids

    for folder in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, folder)
        if os.path.isdir(path):
            label = len(label2idx)
            label2idx[folder] = label

            for fname in sorted(os.listdir(path)):
                if fname.isdigit():  # A text file
                    fpath = os.path.join(path, fname)

                    with open(fpath, encoding='latin-1') as f:
                        t = f.read()
                        i = t.find('\n\n')  # skip header
                        if 0 < i: t = t[i:]
                        texts.append(t)
                    labels.append(label)

    print('Found {} texts.'.format(len(texts)))

    return texts, labels, label2idx
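
A brief usage sketch for the loader above; the 20news-18828 archive contains 18,828 posts across 20 newsgroups:

# Usage sketch for the 20 Newsgroups loader above.
texts, labels, label2idx = build_dataset()
print(len(texts), len(label2idx))  # expected: 18828 20
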
Example #6
    def test_bert_embedding(self):
        text, label = ChineseDailyNerCorpus.load_data()
        is_bold = np.random.randint(1, 3, (len(text), 12))

        bert_path = get_file(
            'bert_sample_model',
            "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
            cache_dir=DATA_PATH,
            untar=True)

        text_embedding = BERTEmbedding(bert_path,
                                       task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        tensor = stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        print(tensor[0][0].shape)
        print(tensor[0][1].shape)
        print(tensor[1].shape)
        print(stack_embedding.embed_model.input_shape)
        print(stack_embedding.embed_model.summary())
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        assert r.shape == (3, 12, 24)
Example #7
 def load_weights(self):
     weights_path = get_file(
         'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
         WEIGHTS_PATH_NO_TOP,
         cache_subdir='models',
         file_hash='6d6bbae143d832006294945121d1f1fc')
     self.created_model.load_weights(weights_path, by_name=True)
Example #8
def load_data():
    base = "file:///D:/fashionmnist/"
    files = [
        'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
    ]

    paths = []
    for fname in files:
        paths.append(get_file(fname, origin=base + fname))

    with gzip.open(paths[0], 'rb') as lbpath:
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        x_train = np.frombuffer(imgpath.read(), np.uint8,
                                offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(imgpath.read(), np.uint8,
                               offset=16).reshape(len(y_test), 28, 28)

    return (x_train, y_train), (x_test, y_test)
Example #9
    def create(self):
        """
            Creates the VGG16 network architecture and loads the pretrained weights.

            Args:   None
            Returns:   None
        """
        model = self.model = Sequential()
#        model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))
        model.add(Lambda(vgg_preprocess, input_shape=(224,224, 3)))

        self.ConvBlock(2, 64)
        self.ConvBlock(2, 128)
        self.ConvBlock(3, 256)
        self.ConvBlock(3, 512)
        self.ConvBlock(3, 512)

        model.add(Flatten())
        self.FCBlock()
        self.FCBlock()
        model.add(Dense(1000, activation='softmax'))

        fname = 'vgg16_weights_tf_dim_ordering_tf_kernels.h5'
        model.load_weights(get_file(fname, self.FILE_PATH+fname, cache_subdir='models'))
Example #10
 def setUpClass(cls):
     cls.embedding_class = BERTEmbedding
     bert_path = get_file(
         'bert_sample_model',
         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
         cache_dir=DATA_PATH,
         untar=True)
     cls.config = {'model_folder': bert_path}
Example #11
 def get_classes(self):
     """
          Downloads the ImageNet class index file and loads it into self.classes.
          The file is downloaded only if it is not already in the cache.
     """
     fname = 'imagenet_class_index.json'
     fpath = get_file(fname, self.FILE_PATH+fname, cache_subdir='models')
     with open(fpath) as f:
         class_dict = json.load(f)
     self.classes = [class_dict[str(i)][1] for i in range(len(class_dict))]
Example #12
def load_data():
    '''
    load data from MovieLens 100K Dataset
    http://grouplens.org/datasets/movielens/

    Note that this method uses ua.base and ua.test in the dataset.

    :return: train_users, train_x, test_users, test_x
    :rtype: list of int, numpy.array, list of int, numpy.array
    '''
    path = get_file(
        'ml-100k.zip', origin='http://files.grouplens.org/datasets/movielens/ml-100k.zip')
    with ZipFile(path, 'r') as ml_zip:
        max_item_id = -1
        train_history = {}
        with ml_zip.open('ml-100k/ua.base', 'r') as file:
            for line in file:
                user_id, item_id, rating, timestamp = line.decode(
                    'utf-8').rstrip().split('\t')
                if int(user_id) not in train_history:
                    train_history[int(user_id)] = [int(item_id)]
                else:
                    train_history[int(user_id)].append(int(item_id))

                if max_item_id < int(item_id):
                    max_item_id = int(item_id)

        test_history = {}
        with ml_zip.open('ml-100k/ua.test', 'r') as file:
            for line in file:
                user_id, item_id, rating, timestamp = line.decode(
                    'utf-8').rstrip().split('\t')
                if int(user_id) not in test_history:
                    test_history[int(user_id)] = [int(item_id)]
                else:
                    test_history[int(user_id)].append(int(item_id))

    max_item_id += 1  # item_id starts from 1
    train_users = list(train_history.keys())
    train_x = numpy.zeros((len(train_users), max_item_id), dtype=numpy.int32)
    print(train_x.shape)
    for i, hist in enumerate(train_history.values()):
        # print(hist)
        mat = to_categorical(hist, max_item_id)
        # print(mat.shape)
        train_x[i] = numpy.sum(mat, axis=0)
        # print(len(train_x[i]))

    test_users = list(test_history.keys())
    test_x = numpy.zeros((len(test_users), max_item_id), dtype=numpy.int32)
    for i, hist in enumerate(test_history.values()):
        mat = to_categorical(hist, max_item_id)
        test_x[i] = numpy.sum(mat, axis=0)

    return train_users, train_x, test_users, test_x
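
A usage sketch for the MovieLens loader above. ML-100K has 943 users and 1,682 items, and both ua.base and ua.test cover all users, so the shapes below are what one would typically expect:

# Usage sketch for the MovieLens ML-100K loader above.
train_users, train_x, test_users, test_x = load_data()
print(len(train_users), train_x.shape)  # typically: 943 (943, 1683) -- column 0 is unused
print(len(test_users), test_x.shape)    # typically: 943 (943, 1683)
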
Example #13
    def setUpClass(cls):
        sample_w2v_path = get_file('sample_w2v.txt',
                                   "http://s3.bmio.net/kashgari/sample_w2v.txt",
                                   cache_dir=DATA_PATH)

        cls.embedding_class = WordEmbedding

        cls.config = {
            'w2v_path': sample_w2v_path
        }
        cls.embedding_size = 100
Example #14
    def load_data(cls,
                  subset_name: str = 'train',
                  shuffle: bool = True,
                  cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
        """
        Load dataset as sequence classification format, char level tokenized

        features: ``[['听', '新', '闻', '。'], ['电', '视', '台', '在', '播', '什', '么'], ...]``

        labels: ``['news', 'epg', ...]``

        Samples::
            train_x, train_y = SMP2018ECDTCorpus.load_data('train')
            test_x, test_y = SMP2018ECDTCorpus.load_data('test')

        Args:
            subset_name: {train, test, valid}
            shuffle: should shuffle or not, default True.
            cutter: sentence cutter, {char, jieba}

        Returns:
            dataset_features and dataset labels
        """

        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if cutter not in ['char', 'jieba', 'none']:
            raise ValueError(
                'cutter error, please use one of {char, jieba, none}')

        df_path = os.path.join(corpus_path, f'{subset_name}.csv')
        df = pd.read_csv(df_path)
        if cutter == 'jieba':
            try:
                import jieba
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    "please install jieba, `$ pip install jieba`")
            x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
        else:
            # 'char' (and 'none') fall back to char-level tokenization
            x_data = [list(item) for item in df['query'].to_list()]
        y_data = df['label'].to_list()

        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                      f"x[0]: {x_data[0]}\n"
                      f"y[0]: {y_data[0]}")
        return x_data, y_data
Example #15
def CrossModalityXception(num_classes,
                          pre_trained,
                          cross_modality_pre_training,
                          input_shape,
                          include_feature_fields=False):
    cross_modality_pre_training = cross_modality_pre_training and pre_trained

    # create the model
    model = Xception(classes=num_classes,
                     weights=None,
                     input_shape=input_shape,
                     include_top=True)
    channels = input_shape[2]

    # load weight file >>> downloads some file from github
    weights_path = get_file(
        'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',
        TF_WEIGHTS_PATH_NO_TOP,
        cache_subdir='models',
        file_hash='b0042744bf5b25fce3cb969f33bebb97')

    weight_values_ = get_named_layer_weights_from_h5py(weights_path)
    symbolic_weights_ = get_symbolic_filtered_layer_weights_from_model(
        model)[:len(weight_values_)]

    if cross_modality_pre_training:  # use a pretrained convolution weight
        # update it (name,[kernel,bias])
        # cross modality pre-training for kernel
        # leave bias as is of course
        weight_values_[0] = (
            "conv1_cross_modality",
            [
                cross_modality_init(
                    kernel=weight_values_[0][1][0], in_channels=channels
                ),  # 0 = first layer , 1 = weight_value , 0 = kernel
                # Xception has no bias
            ])

    else:  # start the first convolution layer as random glorot
        symbolic_weights_ = symbolic_weights_[1:]
        weight_values_ = weight_values_[1:]

    if pre_trained:
        # do weight loading
        load_layer_weights(weight_values=weight_values_,
                           symbolic_weights=symbolic_weights_)

    if include_feature_fields:
        return Model(model.inputs,
                     [layer.output for layer in model.layers[-2:]])
    else:
        return model
Example #16
def resnet50_encoder(pretrained_weights='imagenet',
                     num_classes=20,
                     input_size=(256, 256, 1)):

    pretrained_url = "https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5"

    inp = Input(input_size)

    x = ZeroPadding2D((3, 3))(inp)
    x = Conv2D(64, (7, 7), strides=(2, 2))(x)
    feature_map_1 = x

    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((3, 3), strides=(2, 2))(x)

    x = conv_block(x, [64, 64, 256], strides=(1, 1))
    x = identity_block(x, [64, 64, 256])
    x = identity_block(x, [64, 64, 256])
    feature_map_2 = ZeroPadding2D((1, 1))(x)
    feature_map_2 = Lambda(lambda x: x[:, :-1, :-1, :])(feature_map_2)

    x = conv_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    x = identity_block(x, [128, 128, 512])
    feature_map_3 = x

    x = conv_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    x = identity_block(x, [256, 256, 1024])
    feature_map_4 = x

    x = conv_block(x, [512, 512, 2048])
    x = identity_block(x, [512, 512, 2048])
    x = identity_block(x, [512, 512, 2048])
    feature_map_5 = x

    model = Model(inputs=inp, outputs=feature_map_5)

    if pretrained_weights == 'imagenet':
        weights_path = utils.get_file(
            pretrained_url.split("/")[-1], pretrained_url)
        model.load_weights(weights_path)

    return inp, [
        feature_map_1, feature_map_2, feature_map_3, feature_map_4,
        feature_map_5
    ]
Example #17
    def load_data(cls, model_name):
        """
        Download pretrained GPT-2 models
        Args:
            model_name: {117M, 345M}

        Returns:
            GPT-2 model folder
        """
        model_folder: pathlib.Path = pathlib.Path(
            os.path.join(macros.DATA_PATH, 'datasets', f'gpt2-{model_name}'))
        model_folder.mkdir(exist_ok=True, parents=True)

        for filename in [
                'checkpoint', 'encoder.json', 'hparams.json',
                'model.ckpt.data-00000-of-00001', 'model.ckpt.index',
                'model.ckpt.meta', 'vocab.bpe'
        ]:
            url = "https://storage.googleapis.com/gpt-2/models/" + model_name + "/" + filename
            get_file(os.path.join(f'gpt2-{model_name}', filename),
                     url,
                     cache_dir=macros.DATA_PATH)
        return str(model_folder)
Example #18
def CrossModalityResNet50(num_classes, pre_trained,
                          cross_modality_pre_training, input_shape):
    """Pretrained Resnet50 model from keras which uses cross modality pretraining to obtain a convolution weight which suits 20 channels needed by motion stream"""
    cross_modality_pre_training = cross_modality_pre_training and pre_trained

    # create the model
    model = ResNet50(classes=num_classes,
                     weights=None,
                     input_shape=input_shape,
                     include_top=True)
    channels = input_shape[2]

    # load weight file >>> downloads some file from github
    weights_path = get_file(
        'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
        WEIGHTS_PATH_NO_TOP,
        cache_subdir='models',
        md5_hash='a268eb855778b3df3c7506639542a6af')

    # get the named weights of each layer
    weight_values_ = get_named_layer_weights_from_h5py(weights_path)
    # get the symbolic weights of each layer
    symbolic_weights_ = get_symbolic_filtered_layer_weights_from_model(
        model)[:len(weight_values_)]

    if cross_modality_pre_training:  # use a pretrained convolution weight
        # update it (name,[kernel,bias])
        # cross modality pre-training for kernel
        # leave bias as is of course
        weight_values_[0] = (
            "conv1_cross_modality",
            [
                cross_modality_init(
                    kernel=weight_values_[0][1][0], in_channels=channels
                ),  # 0 = first layer , 1 = weight_value , 0 = kernel
                weight_values_[0][1][1]
            ]  # 0 = first layer , 1 = weight_value , 1 = bias
        )

    else:  # start the first convolution layer as random glorot
        symbolic_weights_ = symbolic_weights_[1:]
        weight_values_ = weight_values_[1:]

    if pre_trained:
        # do weight loading
        load_layer_weights(weight_values=weight_values_,
                           symbolic_weights=symbolic_weights_)

    return model
Example #19
    def _load_data(self):
        dirname = 'cifar-100-python'
        origin = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
        path = get_file(dirname, origin=origin, untar=True)

        fpath = os.path.join(path, 'train')
        x_train, yf_train, yc_train = self._extract_data(fpath)

        fpath = os.path.join(path, 'test')
        x_test, yf_test, yc_test = self._extract_data(fpath)

        # Put channel last
        x_train = x_train.transpose(0, 2, 3, 1)
        x_test = x_test.transpose(0, 2, 3, 1)

        return (x_train, yc_train, yf_train), (x_test, yc_test, yf_test)
Example #20
def load_imdb(seed=123):
    """ Loads the Imdb movie reviews sentiment analysis dataset.

    Parameters
    ----------
    seed: int
        Seed for randomizer.

    Returns
    -------
    (train_texts, train_labels), (test_texts, test_labels):
        Tuples of training and test texts with their labels.
    """

    zipped = get_file('aclImdb_v1.tar.gz', IMDB_URL, extract=True)
    data_path = zipped[:-10]  # remove .tar.gz

    # Load the training data
    train_texts = []
    train_labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(data_path, 'train', category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    train_texts.append(f.read())
                train_labels.append(0 if category == 'neg' else 1)

    # Load the validation data.
    test_texts = []
    test_labels = []
    for category in ['pos', 'neg']:
        test_path = os.path.join(data_path, 'test', category)
        for fname in sorted(os.listdir(test_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    test_texts.append(f.read())
                test_labels.append(0 if category == 'neg' else 1)

    # Shuffle the training data and labels.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return (train_texts, np.array(train_labels)), (test_texts,
                                                   np.array(test_labels))
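
A usage sketch for the IMDb loader above; the aclImdb v1 archive contains 25,000 labeled training reviews and 25,000 labeled test reviews:

# Usage sketch for the IMDb loader above.
(train_texts, train_labels), (test_texts, test_labels) = load_imdb(seed=123)
print(len(train_texts), len(test_texts))  # expected: 25000 25000
print(train_labels[:5])                   # 0 = negative, 1 = positive
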
Example #21
    def _get_model(self, file_name="megadetector_v3.pb", model_url=None):
        if model_url is None:
            model_url = self.MODEL_URL

        cache_subdir = "megadetector"
        model_path = zamba.config.cache_dir / cache_subdir / file_name

        if not model_path.exists():
            model_path = get_file(
                fname=file_name,
                origin=model_url,
                cache_dir=zamba.config.cache_dir,
                cache_subdir=cache_subdir,
                extract=True,
            )

        return model_path
Example #22
    def load_data(
            cls,
            subset_name: str = 'train',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load dataset as sequence labeling format, char level tokenized

        features: ``[['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', ...], ...]``

        labels: ``[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', ...], ...]``

        Sample::

            train_x, train_y = ChineseDailyNerCorpus.load_data('train')
            test_x, test_y = ChineseDailyNerCorpus.load_data('test')

        Args:
            subset_name: {train, test, valid}
            shuffle: should shuffle or not, default True.

        Returns:
            dataset_features and dataset labels
        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if subset_name == 'train':
            file_path = os.path.join(corpus_path, 'example.train')
        elif subset_name == 'test':
            file_path = os.path.join(corpus_path, 'example.test')
        else:
            file_path = os.path.join(corpus_path, 'example.dev')

        x_data, y_data = DataReader.read_conll_format_file(file_path)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
Example #23
    def _get_model(self,
                   file_name="zamba-and-obj-rec-0.859.joblib",
                   model_url=None):
        if model_url is None:
            model_url = self.MODEL_URL

        cache_subdir = "blanknonblank"
        model_path = zamba.config.cache_dir / cache_subdir / file_name

        if not model_path.exists():
            model_path = get_file(
                fname=file_name,
                origin=model_url,
                cache_dir=zamba.config.cache_dir,
                cache_subdir=cache_subdir,
                extract=True,
            )

        return model_path
Example #24
    def test_build_with_BERT_and_fit(self):
        from kashgari.embeddings import BERTEmbedding
        from tensorflow.python.keras.utils import get_file
        from kashgari.macros import DATA_PATH

        sample_bert_path = get_file(
            'bert_sample_model',
            "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
            cache_dir=DATA_PATH,
            untar=True)

        processor = MultiOutputProcessor()
        embedding = BERTEmbedding(model_folder=sample_bert_path,
                                  processor=processor)
        m = MultiOutputModel(embedding=embedding)
        m.build_model(train_x, (output_1, output_2))
        m.fit(train_x, (output_1, output_2), epochs=2)
        res = m.predict(train_x[:10])
        assert len(res) == 2
        assert res[0].shape == (10, 3)
Example #25
def generate_data():
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
    challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
    with tarfile.open(path) as tar:
        train = get_stories(tar.extractfile(challenge.format('train')))
        test = get_stories(tar.extractfile(challenge.format('test')))
    vocab = set()
    for story, q, answer in train + test:
        vocab |= set(story + q + [answer])
    # 36 words
    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    story_maxlen = max(map(len, (x for x, _, _ in train + test)))
    query_maxlen = max(map(len, (x for _, x, _ in train + test)))
    # xs:1000 * 552  xqs: 1000*5  ys: 1000*36
    train_sqa_vec = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
    test_sqa_vec = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
    return train_sqa_vec, test_sqa_vec, vocab_size, story_maxlen, query_maxlen
Example #26
    def get_classes(self):
        '''
            Downloads the ImageNet class index file and loads it into
            self.classes unless it is already cached.

            Args:
                None

            Returns:
                None
        '''

        # Get the ImageNet class indexes and cache them
        fname = 'imagenet_class_index.json'
        fpath = get_file(fname,
                         self.FILE_PATH + fname,
                         cache_subdir = 'models')

        # Open file and parse json
        with open(fpath) as f:
            class_dict = json.load(f)
        self.classes = [class_dict[str(i)][1] for i in range(len(class_dict))]
Example #27
def main(args):
    global cold
    was_cold = cold
    cold = False

    try:
        input_bucket = args.get("input_bucket", 1)
        object_key = args.get("object_key", 1)

        model_object_key = args.get(
            "model_object_key",
            1)  # example : squeezenet_weights_tf_dim_ordering_tf_kernels.h5
        model_bucket = args.get("model_bucket", 1)

        # download_path = tmp + '{}{}'.format(uuid.uuid4(), object_key)
        download_path = get_file(
            '{}{}'.format(uuid.uuid4(), object_key),
            "https://github.com/kmu-bigdata/serverless-faas-workbench/raw/master/dataset/image/animal-dog.jpg",
            cache_dir='/tmp/')
        # s3_client.download_file(input_bucket, object_key, download_path)

        model_path = tmp + '{}{}'.format(uuid.uuid4(), model_object_key)
        # s3_client.download_file(model_bucket, model_object_key, model_path)

        latency, result = predict(download_path)

        _tmp_dic = {x[1]: {'N': str(x[2])} for x in result[0]}

        return {"body": {"latency": latency, "msg": msg, "cold": was_cold}}
    except Exception as e:
        err = "whelp"
        try:
            err = traceback.format_exc()
        except Exception as fug:
            err = str(fug)
        return {"body": {"cust_error": msg, "thing": err, "cold": was_cold}}
Example #28
    def __init__(
        self,
        cnn_out_channels=(64, 64),
        cnn_percent_on=(0.095, 0.125),
        linear_units=1000,
        linear_percent_on=0.1,
        linear_weight_sparsity=0.4,
        boost_strength=1.5,
        boost_strength_factor=0.9,
        k_inference_factor=1.5,
        duty_cycle_period=1000,
        data_format=IMAGE_DATA_FORMAT,
        pre_trained=False,
        name=None,
        batch_norm=True,
        **kwargs,
    ):
        super(GSCSparseCNN, self).__init__(name=name, **kwargs)

        if data_format == "channels_first":
            axis = 1
            input_shape = (1, 32, 32)
        else:
            axis = -1
            input_shape = (32, 32, 1)

        self.add(
            keras.layers.Conv2D(
                name="cnn1",
                data_format=data_format,
                input_shape=input_shape,
                filters=cnn_out_channels[0],
                kernel_size=5,
            ))
        if batch_norm:
            self.add(
                keras.layers.BatchNormalization(
                    name="cnn1_batchnorm",
                    axis=axis,
                    epsilon=1e-05,
                    momentum=0.9,
                    center=False,
                    scale=False,
                ))
        self.add(
            keras.layers.MaxPool2D(
                name="cnn1_maxpool",
                pool_size=2,
                padding="same",
                data_format=data_format,
            ))
        self.add(
            KWinners2d(
                name="cnn1_kwinner",
                data_format=data_format,
                percent_on=cnn_percent_on[0],
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
                duty_cycle_period=duty_cycle_period,
            ))
        self.add(
            keras.layers.Conv2D(
                name="cnn2",
                data_format=data_format,
                filters=cnn_out_channels[1],
                kernel_size=5,
            ))
        if batch_norm:
            self.add(
                keras.layers.BatchNormalization(
                    name="cnn2_batchnorm",
                    axis=axis,
                    epsilon=1e-05,
                    momentum=0.9,
                    center=False,
                    scale=False,
                ))
        self.add(
            keras.layers.MaxPool2D(
                name="cnn2_maxpool",
                pool_size=2,
                padding="same",
                data_format=data_format,
            ))
        self.add(
            KWinners2d(
                name="cnn2_kwinner",
                data_format=data_format,
                percent_on=cnn_percent_on[1],
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
                duty_cycle_period=duty_cycle_period,
            ))
        self.add(keras.layers.Flatten(name="flatten", data_format=data_format))
        self.add(
            keras.layers.Dense(
                name="linear",
                units=linear_units,
                kernel_constraint=SparseWeights(linear_weight_sparsity),
            ))
        if batch_norm:
            self.add(
                keras.layers.BatchNormalization(name="linear_bn",
                                                epsilon=1e-05,
                                                momentum=0.9,
                                                center=False,
                                                scale=False))
        self.add(
            KWinners(
                name="linear_kwinner",
                percent_on=linear_percent_on,
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
                duty_cycle_period=duty_cycle_period,
            ))
        self.add(keras.layers.Dense(name="output", units=12))
        self.add(keras.layers.Softmax(axis=1))

        if pre_trained:
            if not batch_norm:
                raise NotImplementedError(
                    "Unable to load pre-trained models with no BatchNorm")
            model_url, model_hash = MODEL_URLS["gsc_sparse_cnn"]
            file_name = "gsc_sparse_cnn-{:.8}".format(model_hash)
            archive_path = get_file(
                fname="{}.tar.gz".format(file_name),
                origin=model_url,
                file_hash=model_hash,
                extract=True,
                cache_subdir="models",
            )
            cache_dir = os.path.dirname(archive_path)
            self.load_weights(os.path.join(cache_dir, "gsc_sparse_cnn.h5"))
Example #29
    def download(self, data_dir=None, dataset='all'):
        '''Cornell Grasping Dataset - about 5GB total size

        http://pr.cs.cornell.edu/grasping/rect_data/data.php

        Downloads to `~/.keras/datasets/cornell_grasping` by default.
        Includes grasp_listing.txt with all files in all datasets;
        the feature csv files which specify the dataset size,
        the features (data channels), and the number of grasps;
        and the tfrecord files which actually contain all the data.

        If `grasp_listing_hash.txt` is present, an additional
        hashing step will be completed to verify dataset integrity.
        `grasp_listing_hash.txt` is generated automatically when
        downloading with `dataset='all'`.

        # Arguments

            dataset: The name of the dataset to download. Downloads all by default
                (the '' parameter); '102' downloads the 102-feature dataset
                found in grasp_listing.txt.

        # Returns

           list of paths to the downloaded files

        '''
        dataset = self._update_dataset_param(dataset)
        if data_dir is None:
            if self.data_dir is None:
                data_dir = FLAGS.data_dir
            else:
                data_dir = self.data_dir
        hypertree_utilities.mkdir_p(data_dir)
        print('Downloading datasets to: ', data_dir)

        url_prefix = ''
        # If a hashed version of the listing is available,
        # download the dataset and verify hashes to prevent data corruption.
        listing_hash = os.path.join(data_dir, 'grasp_listing_hash.txt')
        if os.path.isfile(listing_hash):
            files_and_hashes = np.genfromtxt(listing_hash,
                                             dtype='str',
                                             delimiter=' ')
            files = [
                get_file(fpath.split('/')[-1],
                         url_prefix + fpath,
                         cache_subdir=data_dir,
                         file_hash=hash_str,
                         extract=True)
                for fpath, hash_str in tqdm(files_and_hashes)
                if '_' + str(dataset) in fpath
            ]
        else:
            # If a hashed version of the listing is not available,
            # simply download the dataset normally.
            listing_url = 'https://raw.githubusercontent.com/ahundt/robot-grasp-detection/master/grasp_listing.txt'
            grasp_listing_path = get_file('grasp_listing.txt',
                                          listing_url,
                                          cache_subdir=data_dir)
            grasp_files = np.genfromtxt(grasp_listing_path, dtype=str)
            files = [
                get_file(fpath.split('/')[-1],
                         url_prefix + fpath,
                         cache_subdir=data_dir,
                         extract=True) for fpath in tqdm(grasp_files)
                if '_' + dataset in fpath
            ]

            # If all files are downloaded, generate a hashed listing.
            if dataset == 'all' or dataset == '':
                print('Hashing all dataset files to prevent corruption...')
                hashes = []
                for i, f in enumerate(tqdm(files)):
                    hashes.append(_hash_file(f))
                file_hash_np = np.column_stack([grasp_files, hashes])
                with open(listing_hash, 'wb') as hash_file:
                    np.savetxt(hash_file,
                               file_hash_np,
                               fmt='%s',
                               delimiter=' ',
                               header='file_path sha256')
                print(
                    'Hashing complete, {} contains each url plus hash, and will be used to verify the '
                    'dataset during future calls to download().'.format(
                        listing_hash))

        return files
Example #30
 def load(path):
     """Ensures that a file is downloaded locally, then unzips and reads it."""
     return GzipFile(get_file(Path(path).name, path)).read()