Example #1
    def _download_and_unzip(self, md5=None, file_to_extract=None):
        if self.pkl_file.is_file():
            self.logger.info("Found binary format embedding %s" %
                             self.pkl_file)
            return

        maybe_download(self.download_url,
                       store_path=self.store_folder,
                       filename=self.download_file.name,
                       md5=md5)

        if Path(self.download_url).suffix != ".zip":
            return

        with zipfile.ZipFile(self.download_file) as zf:
            # assume there is only one file in zip

            if file_to_extract is None:
                [unzipped_file_name] = zf.namelist()
            else:
                unzipped_file_name = file_to_extract

            if not (self.store_folder / unzipped_file_name).is_file():
                self.logger.info("Unzipping the embedding file %s" %
                                 self.download_file)
                zf.extract(member=unzipped_file_name, path=self.store_folder)

            self.download_file = self.store_folder / unzipped_file_name
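Across the examples on this page, maybe_download is a project-specific helper and its signature varies from example to example (positional URL plus size, source_url/target_directory/filetype keywords, md5 checks, and so on). As a rough orientation only, a minimal "download unless the file already exists" sketch might look like the following; the name matches the examples, but the signature, the md5 handling, and every other detail are assumptions rather than any project's actual implementation.

import hashlib
import urllib.request
from pathlib import Path


def maybe_download(url, store_path='.', filename=None, md5=None):
    """Hypothetical sketch: fetch url into store_path unless a valid copy exists."""
    target = Path(store_path) / (filename or Path(url).name)
    if target.is_file():
        # Reuse the cached copy if no checksum was given or if it matches.
        if md5 is None or hashlib.md5(target.read_bytes()).hexdigest() == md5:
            return target
    target.parent.mkdir(parents=True, exist_ok=True)
    urllib.request.urlretrieve(url, str(target))
    return target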
Example #2
def main():
    savepath = './save_point'
    filepath = './save_point/keras_example_checkpoint.h5'

    # Extract MNIST dataset
    train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
    train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
    test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
    test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

    train_data = extract_data(train_data_filename, 60000, dense=False)
    train_data = train_data.reshape((60000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
    test_data = extract_data(test_data_filename, 10000, dense=False)
    test_data = test_data.reshape((10000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)

    validation_data = train_data[:VALIDATION_SIZE, ...]
    validation_labels = train_labels[:VALIDATION_SIZE, :]
    validation_set = (validation_data, validation_labels)
    train_data = train_data[VALIDATION_SIZE:, ...]
    train_labels = train_labels[VALIDATION_SIZE:, ...]

    # Model construction
    model = Sequential()
    model.add(Convolution2D(32, 3, 3, border_mode='same',
              input_shape=(1, 28, 28)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 3, 3, border_mode='same'))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10))
    model.add(Activation('softmax'))

    # Define optimizer and configure training process
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    model.fit(
        train_data,
        train_labels,
        nb_epoch=NUM_EPOCHS,
        batch_size=1000,
        validation_data=validation_set)

    print 'Save model weights'
    if not os.path.isdir (savepath):
        os.mkdir (savepath)
    model.save_weights(filepath, overwrite=True)

    predict = model.predict(test_data, batch_size=1000)

    print 'Test err: %.1f%%' % error_rate(predict, test_labels)

    print 'Test loss: %.4f, accuracy: %.4f' % \
        tuple(model.evaluate(test_data, test_labels, batch_size=1000))
Example #3
  def __init__(self, root, train, download=True, transform=None):

    if download:
      maybe_download('https://www.dropbox.com/s/wczryi3tdzsa182/CIFAR10poly16.zip?dl=0',
                     'CIFAR10poly16', root, 'zip')

    if train:
      self.root = os.path.join(root, 'CIFAR10poly16', 'train')
    else:
      self.root = os.path.join(root, 'CIFAR10poly16', 'test')

    self.transform=transform
Example #4
  def __init__(self, root, train, transform=None, size=28):

    if size==28:
      maybe_download('https://www.dropbox.com/s/lajg1qorz3h3909/fashionMNISTpoly28.zip?dl=0',
                     'fashionMNISTpoly'+str(size), root, 'zip')

    if train:
      self.root = os.path.join(root, 'fashionMNISTpoly'+str(size), 'train')
    else:
      self.root = os.path.join(root, 'fashionMNISTpoly'+str(size), 'test')

    self.transform=transform
Example #5
  def __init__(self, root, train, transform=None, size=19):

    if size==32:
      maybe_download('https://www.dropbox.com/s/wczryi3tdzsa182/MNISTpoly.zip?dl=0',
                     'MNISTpoly'+str(size), root, 'zip')
    elif size==19:
      maybe_download('https://www.dropbox.com/s/jspn547dz473shr/MNISTpoly19.zip?dl=0',
                     'MNISTpoly'+str(size), root, 'zip')
    if train:
      self.root = os.path.join(root, 'MNISTpoly'+str(size), 'train')
    else:
      self.root = os.path.join(root, 'MNISTpoly'+str(size), 'test')

    self.transform=transform
Example #6
  def __init__(self, root='/root/datasets', transform=None):

    self.transform = transform
    self.root = join(root, 'htr_assets/crowdsource/processed')
    maybe_download(source_url='https://www.dropbox.com/s/dsg41kaajrfvfvj/htr_assets.zip?dl=0',
                   filename='htr_assets', target_directory=root, filetype='zip') # qyk added, the source is yq's dropbox
    # custom dataset loader
    allfiles = glob(join(self.root, '**/*.jpg'), recursive=True)
    labels = [basename(f)[:-4] if basename(f).find('empty-')==-1 else '_' for f in allfiles] # if filename has 'empty-', then the ground truth is nothing
    self.samples = list(zip(allfiles,labels))

    # makes list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
Example #7
    def __init__(self, root='/root/datasets', transform=None):

        self.transform = transform
        self.root = join(root, 'iam_handwriting')

        # download and put dataset in correct directory
        maybe_download(
            'https://www.dropbox.com/sh/tdd0784neuv9ysh/AABm3gxtjQIZ2R9WZ-XR9Kpra?dl=0',
            'iam_handwriting', root, 'folder')
        if exists(join(self.root, 'words.tgz')):
            if not exists(join(self.root, 'words')):
                os.makedirs(join(self.root, 'words'))
                os.system('tar xvzf ' + join(self.root, 'words.tgz') +
                          ' --directory ' + join(self.root, 'words'))
                os.system('rm ' + join(self.root, 'words.tgz'))

        # begin collecting all words in the IAM dataset from the words.txt summary file at the root of the IAM directory
        labelsFile = open(join(self.root, 'words.txt'))
        chars = set()
        self.samples = []
        for line in labelsFile:

            # ignore comment line
            if not line or line[0] == '#':
                continue

            lineSplit = line.strip().split(' ')
            assert len(lineSplit) >= 9

            # filename: part1-part2-part3 --> part1/part1-part2/part1-part2-part3.png
            fileNameSplit = lineSplit[0].split('-')
            fileName = join(self.root, 'words/') + fileNameSplit[0] + '/' + fileNameSplit[0] + '-' + fileNameSplit[1] + '/' + \
                       lineSplit[0] + '.png'

            # GT text is in the columns starting at 9
            label = ' '.join(lineSplit[8:])

            # put sample into list
            # qyk exclude empty images
            if '---' not in label:  # qyk: data clean
                img_test = cv2.imread(fileName,
                                      cv2.IMREAD_GRAYSCALE)  #qyk: data clean
                if not (img_test is None
                        or np.min(img_test.shape) <= 1):  #qyk: data clean
                    self.samples.append((fileName, label))  #qyk

                    # makes list of characters
                    chars = chars.union(set(list(label)))
        self.charList = sorted(list(chars))
Example #8
  def __init__(self, root='/root/datasets', transform=None):

    self.transform = transform
    self.root = join(root, 'irs_handwriting')
    maybe_download(source_url='https://www.dropbox.com/s/54jarzcb0mju32d/img_cropped_irs.zip?dl=0',
                   filename='irs_handwriting', target_directory=root, filetype='zip')
    if exists(join(root, 'img_cropped_irs')):
        os.system('mv ' + join(root, 'img_cropped_irs') + ' ' + self.root)

    folder_depth = 2
    allfiles = glob(join(self.root, '**/'*folder_depth+'*.jpg'))[:2000]
    labels = [basename(f)[:-4] for f in allfiles]
    #print(labels[0])
    self.samples = list(zip(allfiles, labels))
    # makes list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
Example #9
def load_data(dst_dir='./dataset'):
    """Loads CIFAR10 dataset.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    dirname = 'cifar-10-batches-py'
    origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    path = maybe_download(dirname, origin, dst_dir, untar=True)

    nb_train_samples = 50000

    x_train = np.zeros((nb_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.zeros((nb_train_samples, ), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = load_batch(fpath)
        x_train[(i - 1) * 10000:i * 10000, :, :, :] = data
        y_train[(i - 1) * 10000:i * 10000] = labels

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train)))
    y_test = np.reshape(y_test, (len(y_test)))

    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
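As a usage note (not part of the original example), calling the loader mirrors keras.datasets.cifar10; the dst_dir value below is illustrative.

# Hypothetical usage of load_data(); dst_dir is an illustrative value.
(x_train, y_train), (x_test, y_test) = load_data(dst_dir='./dataset')
print(x_train.shape, y_train.shape)  # expected: (50000, 32, 32, 3) (50000,)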
Example #10
def distributed_maybe_download(path, local_rank, mpi_size):
    if not path.startswith('gs://'):
        return path
    filename = path[5:].replace('/', '-')
    with first_rank_first(local_rank, mpi_size):
        fp = maybe_download(path, filename)
    return fp
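first_rank_first and the gs:// handling inside maybe_download belong to the original project; the call below is only a hedged illustration with a made-up bucket path and rank values.

# Hypothetical call site: rank 0 downloads the blob first, the remaining
# ranks then resolve the same gs:// path from the local copy.
local_path = distributed_maybe_download('gs://example-bucket/model.pkl',
                                        local_rank=0, mpi_size=8)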
Example #11
    def __init__(self, root='/root/datasets', transform=None):

        self.transform = transform
        self.root = join(root, 'artifact_images_no_intersect',
                         'artifact_images_no_intersect')  #zip problem, sorry

        # download and put dataset in correct directory
        maybe_download(
            'https://www.dropbox.com/s/rogd4d5ilfm4g5e/artifact_images_no_intersect.zip?dl=0',
            'artifact_images_no_intersect', root, 'folder')
        #if exists(join(self.root,'words.tgz')):
        #  if not exists(join(self.root, 'words')):
        #    os.makedirs(join(self.root, 'words'))
        #    os.system('tar xvzf '+join(self.root, 'words.tgz')+' --directory '+join(self.root, 'words'))
        #    os.system('rm '+join(self.root,'words.tgz'))

        # begin collecting all samples from the databook.txt summary file at the root of the dataset directory

        labelsFile = open(join(self.root, 'databook.txt'))
        #chars = set()
        self.samples = []
        #ct=0
        for line in labelsFile:
            #ct+=1
            # ignore comment line
            if not line or line[0] == '#':
                continue

            lineSplit = line.strip().split(' ')
            assert len(lineSplit) == 3

            #fileNameSplit = lineSplit[0].split('-')
            imgPath = lineSplit[0].replace(
                '/root/datasets/artifact_images_no_intersect', self.root)
            # label image path is the second column
            labelPath = lineSplit[1].replace(
                '/root/datasets/artifact_images_no_intersect', self.root)

            gt_text = lineSplit[2]

            # put sample into list
            # qyk exclude empty images
            #      if '---' not in label: # qyk: data clean
            #        img_test=cv2.imread(fileName, cv2.IMREAD_GRAYSCALE) #qyk: data clean
            #        if not (img_test is None or np.min(img_test.shape) <= 1): #qyk: data clean
            #            self.samples.append( (fileName, label) ) #qyk
            self.samples.append((imgPath, labelPath, gt_text))
Example #12
  def __init__(self, root='/root/datasets', transform=None):

    self.transform = transform
    self.root = join(root, 'img_print_single')
    maybe_download(source_url='https://www.dropbox.com/s/xw8vd3n2jkz1n93/img_print_single.zip?dl=0',
                   filename='img_print_single', target_directory=root,
                   filetype='zip')  #'https://www.dropbox.com/s/cbhpy6clfi9a5lz/img_print_100000_clean.zip?dl=0'
    #yq patch delete unrecognized non-english samples in linux
    #os.system('find '+ root+' -maxdepth 1 -name "*.jpg" -type f -delete') find ./logs/examples -maxdepth 1 -name "*.log"
    #if exists(join(root, 'img_print_single')): os.system('mv ' + join(root, 'img_print_single') + ' ' + self.root)

    folder_depth = 1
    allfiles = glob(join(self.root, '**/' * folder_depth + '*.jpg'))
    #allfiles = [f for f in allfiles if len(basename(f))-4<=25 and len(basename(f))-4 >=1 and (not '#U' in f) and (not '---' in f)] # screen out non-recognized characters qyk
    labels = [basename(f)[:-4] for f in allfiles]
    self.samples = list(zip(allfiles, labels))
    # makes list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
Example #13
  def __init__(self, root='/root/datasets', transform=None):

    self.transform = transform
    self.root = join(root, 'text_recognition')
    maybe_download(source_url='https://www.dropbox.com/s/n1pq94xu9kpur1a/text_recognition.zip?dl=0',
                   filename='text_recognition', target_directory=root,
                   filetype='zip')  #'https://www.dropbox.com/s/cbhpy6clfi9a5lz/img_print_100000_clean.zip?dl=0'
    #yq patch delete unrecognized non-english samples in linux
    #os.system('find '+ root+' -maxdepth 1 -name "*.jpg" -type f -delete') find ./logs/examples -maxdepth 1 -name "*.log"
    #if exists(join(root, 'img_print_100000_en')): os.system('mv ' + join(root, 'img_print_100000_en') + ' ' + self.root)

    #folder_depth = 0
    with open(join(self.root, 'catalog.txt'), 'r') as f:
        recs = f.readlines()
    self.samples = list(map(lambda x: x.strip('\n').rsplit(' ', 1), recs))
    print('screened :' + str(len(self.samples)))
    # makes list of characters
    _, labels = zip(*self.samples)
    chars = set.union(*map(set, labels))
    self.charList = sorted(list(chars))
Example #14
def get_babi_en(get_10k=False):
    data_dir = "datasets/tasks_1-20_v1-2/en/"
    if get_10k == True:
        data_dir = "datasets/tasks_1-20_v1-2/en-10k/"

    maybe_download(
        'https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz',
        'datasets', 11745123)
    file = tarfile.open("datasets/babi_tasks_1-20_v1-2.tar.gz", "r:gz")
    file.extractall("datasets")
    file.close()
    print("Some housekeeping...")
    if not os.path.exists("datasets/babi"):
        os.makedirs("datasets/babi")
    for path, dir, files in os.walk(data_dir):
        for file in files:
            os.rename(os.path.join(data_dir, file),
                      os.path.join("datasets/babi", file))
    os.remove("datasets/babi_tasks_1-20_v1-2.tar.gz")
    rmtree("datasets/tasks_1-20_v1-2")
    print("Finished.")
Example #15
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 200
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples/batch_size)

# Loading the data

audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

fs, audio = wav.read(audio_filename)

inputs = mfcc(audio, samplerate=fs)
# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]

# Read the targets
with open(target_filename, 'r') as f:

    #Only the last line is necessary
    line = f.readlines()[-1]
Example #16
import tensorflow as tf
import numpy as np
import cv2
from matplotlib.pyplot import plot, imshow, colorbar, show, axis
from PIL import Image
import os
import random
from os.path import join, basename, dirname
from glob import glob
import utils
import torch

home = os.environ['HOME']

ckptroot = join(home, 'ckpt', 'poisoncifar')
filename = 'liam_resnet18'
url = 'https://www.dropbox.com/s/6x0vxrous1kbb1s/liam_resnet18?dl=0'
utils.maybe_download(url, filename, ckptroot, filetype='file')
filepath = join(ckptroot, filename)
ckpt = torch.load(filepath)
weights = ckpt['model']
weights = {
    k: v.cpu().numpy()
    for k, v in weights.items() if k.find('running_') == -1
}
weights = {
    k: v
    for k, v in weights.items() if k.find('num_batches_tracked') == -1
}
Example #17
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 10000
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 2
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data

audio_filename = maybe_download('red.wav', 96044)
target_filename = maybe_download('red.txt', 12)

audio_filename2 = maybe_download('blue.wav', 96044)
target_filename2 = maybe_download('blue.txt', 19)

fs, audio = wav.read(audio_filename)

fs2, audio2 = wav.read(audio_filename2)

inputs = mfcc(audio, samplerate=fs)
inputs2 = mfcc(audio2, samplerate=fs2)

# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
#train_inputs = np.asarray(inputs)
Example #18
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1 + 1 + 1

# Hyper-parameters
num_epochs = 200
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data
audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

inputs = spectrogram_from_file(audio_filename,
                               step=10,
                               window=20,
                               max_freq=8000,
                               eps=1e-14)

# Transform into a 3D array
#print(len(inputs))
train_inputs = np.asarray(inputs[np.newaxis, :])
#print(len(train_inputs))
train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]
print(train_seq_len)
Example #19
                        help='Maximum number of lines')

    parser.add_argument('output', type=str, help='Output file')
    parser.add_argument('--tmp-dir',
                        type=str,
                        help='Location directory',
                        default='/tmp/')

    args = parser.parse_args()

    # download data
    output_file = Path(args.output)
    target_dir = Path(args.tmp_dir)

    firstnames_archive_path = utils.maybe_download(
        ARCHIVE_PREFIX + FIRSTNAMES_ARCHIVE_NAME + '.zip', target_dir,
        FIRSTNAMES_ARCHIVE_URL)
    utils.maybe_extract(firstnames_archive_path,
                        target_dir / FIRSTNAMES_ARCHIVE_DIR_NAME)
    extracted_firstnames_file = target_dir / f'{ARCHIVE_PREFIX}{FIRSTNAMES_ARCHIVE_NAME}/{FIRSTNAMES_ARCHIVE_NAME}.txt'

    lastnames_archive_path = utils.maybe_download(
        ARCHIVE_PREFIX + LASTNAME_ARCHIVE_NAME + '.zip', target_dir,
        LASTNAME_ARCHIVE_URL)
    utils.maybe_extract(lastnames_archive_path,
                        target_dir / LASTNAME_ARCHIVE_DIR_NAME)
    extracted_lastnames_file = target_dir / f'{ARCHIVE_PREFIX}{LASTNAME_ARCHIVE_NAME}/{LASTNAME_ARCHIVE_NAME}.txt'

    # read files
    print('extract lastnames')
    lastnames = get_most_common_lastnames(extracted_lastnames_file,
Example #20
    plt.show()

    shift = lambda x, w: convolve(
        x.reshape((28, 28)), mode='constant', weights=w).ravel()
    X = np.concatenate([X] + [
        np.apply_along_axis(shift, 1, X, vector)
        for vector in direction_vectors
    ])
    print X.shape
    y = np.concatenate([y for _ in range(len(direction_vectors) + 1)], axis=0)
    print y.shape
    return X, y


# Extract data
train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

X_train = extract_data(train_data_filename, 60000, dense=True)
y_train = extract_labels(train_labels_filename, 60000, one_hot=False)
X_test = extract_data(test_data_filename, 10000, dense=True)
y_test = extract_labels(test_labels_filename, 10000, one_hot=False)

#################################################
# Test for decision tree classifier without dimensionality reduction
Tree = DecisionTreeClassifier()
Tree.fit(X_train, y_train)
print 'Without dimensionality reduction: ', Tree.score(X_test, y_test)
Example #21
def main():
    alphabet_txt = os.path.join(LANG.model_dir, 'alphabet.txt')
    raw_txt_gz = os.path.join(LANG.model_dir, 'raw.txt.gz')
    unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
    prepared_txt = os.path.join(LANG.model_dir, 'prepared.txt')
    vocabulary_txt = os.path.join(LANG.model_dir, 'vocabulary.txt')
    unfiltered_arpa = os.path.join(LANG.model_dir, 'unfiltered.arpa')
    filtered_arpa = os.path.join(LANG.model_dir, 'filtered.arpa')
    lm_binary = os.path.join(LANG.model_dir, 'lm.binary')
    kenlm_scorer = os.path.join(LANG.model_dir, 'kenlm.scorer')
    temp_prefix = os.path.join(LANG.model_dir, 'tmp')

    section('Writing alphabet file', empty_lines_before=1)
    with open(alphabet_txt, 'w', encoding='utf-8') as alphabet_file:
        alphabet_file.write('\n'.join(LANG.alphabet) + '\n')

    redo = ARGS.force_download

    section('Downloading text data')
    redo = maybe_download(LANG.text_url, raw_txt_gz, force=redo)

    section('Unzipping text data')
    redo = maybe_ungzip(raw_txt_gz, unprepared_txt, force=redo)

    redo = redo or ARGS.force_prepare

    section('Preparing text and building vocabulary')
    if redo or not os.path.isfile(prepared_txt) or not os.path.isfile(vocabulary_txt):
        redo = True
        announce('Preparing {} shards of "{}"...'.format(ARGS.workers, unprepared_txt))
        counters = Queue(ARGS.workers)
        source_bytes = os.path.getsize(unprepared_txt)
        aggregator_process = Process(target=aggregate_counters, args=(vocabulary_txt, source_bytes, counters))
        aggregator_process.start()
        counter_processes = list(map(lambda index: Process(target=count_words, args=(index, counters)),
                                     range(ARGS.workers)))
        try:
            for p in counter_processes:
                p.start()
            for p in counter_processes:
                p.join()
            counters.put(STOP_TOKEN)
            aggregator_process.join()
            print('')
            partials = list(map(lambda i: get_partial_path(i), range(ARGS.workers)))
            join_files(partials, prepared_txt)
            for partial in partials:
                os.unlink(partial)
        except KeyboardInterrupt:
            aggregator_process.terminate()
            for p in counter_processes:
                p.terminate()
            raise
    else:
        announce('Files "{}" and \n\t"{}" existing - not preparing'.format(prepared_txt, vocabulary_txt))

    redo = redo or ARGS.force_generate

    section('Building unfiltered language model')
    if redo or not os.path.isfile(unfiltered_arpa):
        redo = True
        lmplz_args = [
            KENLM_BIN + '/lmplz',
            '--temp_prefix', temp_prefix,
            '--memory', '80%',
            '--discount_fallback',
            '--limit_vocab_file', vocabulary_txt,
            '--text', prepared_txt,
            '--arpa', unfiltered_arpa,
            '--skip', 'symbols',
            '--order', str(LANG.order)
        ]
        if len(LANG.prune) > 0:
            lmplz_args.append('--prune')
            lmplz_args.extend(list(map(str, LANG.prune)))
        subprocess.check_call(lmplz_args)
    else:
        announce('File "{}" existing - not generating'.format(unfiltered_arpa))

    section('Filtering language model')
    if redo or not os.path.isfile(filtered_arpa):
        redo = True
        with open(vocabulary_txt, 'rb') as vocabulary_file:
            vocabulary_content = vocabulary_file.read()
        subprocess.run([
            KENLM_BIN + '/filter',
            'single',
            'model:' + unfiltered_arpa,
            filtered_arpa
        ], input=vocabulary_content, check=True)
    else:
        announce('File "{}" existing - not filtering'.format(filtered_arpa))

    section('Generating binary representation')
    if redo or not os.path.isfile(lm_binary):
        redo = True
        subprocess.check_call([
            KENLM_BIN + '/build_binary',
            '-a', '255',
            '-q', '8',
            '-v',
            'trie',
            filtered_arpa,
            lm_binary
        ])
    else:
        announce('File "{}" existing - not generating'.format(lm_binary))

    section('Building scorer')
    if redo or not os.path.isfile(kenlm_scorer):
        redo = True
        words = set()
        vocab_looks_char_based = True
        with open(vocabulary_txt) as vocabulary_file:
            for line in vocabulary_file:
                for word in line.split():
                    words.add(word.encode())
                    if len(word) > 1:
                        vocab_looks_char_based = False
        announce("{} unique words read from vocabulary file.".format(len(words)))
        announce(
            "{} like a character based model.".format(
                "Looks" if vocab_looks_char_based else "Doesn't look"
            )
        )
        if ARGS.alphabet_mode == 'auto':
            use_utf8 = vocab_looks_char_based
        elif ARGS.alphabet_mode == 'utf8':
            use_utf8 = True
        else:
            use_utf8 = False
        serialized_alphabet = get_serialized_utf8_alphabet() if use_utf8 else LANG.get_serialized_alphabet()
        from ds_ctcdecoder import Scorer, Alphabet
        alphabet = Alphabet()
        err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
        if err != 0:
            announce('Error loading alphabet: {}'.format(err))
            sys.exit(1)
        scorer = Scorer()
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(LANG.alpha, LANG.beta)
        scorer.load_lm(lm_binary)
        scorer.fill_dictionary(list(words))
        shutil.copy(lm_binary, kenlm_scorer)
        scorer.save_dictionary(kenlm_scorer, True)  # append, not overwrite
        announce('Package created in {}'.format(kenlm_scorer))
        announce('Testing package...')
        scorer = Scorer()
        scorer.load_lm(kenlm_scorer)
    else:
        announce('File "{}" existing - not building'.format(kenlm_scorer))
Example #22
def main(argv=None):  # pylint: disable=unused-argument
  # Get the data.
  train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
  train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
  test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
  test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')


  # Extract it into numpy arrays.
  train_data = extract_data(train_data_filename, 60000, dense=False)
  train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
  test_data = extract_data(test_data_filename, 10000, dense=False )
  test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)


  # Generate a validation set.
  validation_data = train_data[:VALIDATION_SIZE, ...]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_data = train_data[VALIDATION_SIZE:, ...]
  train_labels = train_labels[VALIDATION_SIZE:]
  num_epochs = NUM_EPOCHS
  train_size = train_labels.shape[0]

  # This is where training samples and labels are fed to the graph.
  # These placeholder nodes will be fed a batch of training data at each
  # training step using the {feed_dict} argument to the Run() call below.
  train_data_node = tf.placeholder(
      tf.float32,
      shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
  train_labels_node = tf.placeholder(tf.float32,
                                     shape=(BATCH_SIZE, NUM_LABELS))
  eval_data = tf.placeholder(
      tf.float32,
      shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

  # The variables below hold all the trainable weights. They are passed an
  # initial value which will be assigned when we call:
  # {tf.initialize_all_variables().run()}

  # First convolutional layer
  conv1_weights = tf.Variable(
      tf.truncated_normal([3, 3, NUM_CHANNELS, 32],  # 3x3 filter, depth 32.
                          stddev=0.1,
                          seed=SEED))
  conv1_biases = tf.Variable(tf.zeros([32]))

  # Two parallel second convolutional layers: a 5x5 filter and a 3x3 filter.
  conv2_weights = tf.Variable(
      tf.truncated_normal([5, 5, 32, 64],
                          stddev=0.1,
                          seed=SEED))
  conv2_biases = tf.Variable(tf.constant(0.01, shape=[64]))

  conv2_weights2 = tf.Variable(
      tf.truncated_normal([3, 3, 32, 64],
                          stddev=0.1,
                          seed=SEED))
  conv2_biases2 = tf.Variable(tf.constant(0.01, shape=[64]))

  # First fully connected layer after conv layer
  fc1_weights = tf.Variable(  # fully connected, depth 512.
      tf.truncated_normal(
          [IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 128, 512],
          stddev=0.05,
          seed=SEED))
  fc1_biases = tf.Variable(tf.constant(0.01, shape=[512]))

  # Second fully connected layer
  fc2_weights = tf.Variable(
      tf.truncated_normal([512, 256],
                          stddev=0.05,
                          seed=SEED))
  fc2_biases = tf.Variable(tf.constant(0.1, shape=[256]))

  # Output layer
  fc3_weights = tf.Variable(
      tf.truncated_normal([256, NUM_LABELS],
                          stddev=0.04,
                          seed=SEED))
  fc3_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))


  # We will replicate the model structure for the training subgraph, as well
  # as the evaluation subgraphs, while sharing the trainable parameters.
  def model(data, train=False):
    """The Model definition."""
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
    # the same size as the input). Note that {strides} is a 4D array whose
    # shape matches the data layout: [image index, y, x, depth].
    conv = tf.nn.conv2d(data,
                        conv1_weights,
                        strides=[1, 1, 1, 1],
                        padding='SAME')
    # Bias and rectified linear non-linearity.
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
    if train:
        relu = tf.nn.dropout(relu, .5)
    # Max pooling. The kernel size spec {ksize} also follows the layout of
    # the data. Here we have a pooling window of 2, and a stride of 2.
    pool = tf.nn.max_pool(relu,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')
    conv = tf.nn.conv2d(pool,
                        conv2_weights,
                        strides=[1, 1, 1, 1],
                        padding='SAME')
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
    conv2 = tf.nn.conv2d(pool,
                         conv2_weights2,
                         strides=[1, 1, 1, 1],
                         padding='SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases2))

    pool = tf.nn.max_pool(relu,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')
    pool2 = tf.nn.max_pool(relu2,
                           ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME')
    # Reshape the feature map cuboid into a 2D matrix to feed it to the
    # fully connected layers.
    pool = tf.concat(3, [pool, pool2])
    pool_shape = pool.get_shape().as_list()
    reshape = tf.reshape(
        pool,
        [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
    # Fully connected layer. Note that the '+' operation automatically
    # broadcasts the biases.
    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
    hidden = tf.nn.relu(tf.matmul(hidden, fc2_weights) + fc2_biases)
    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
      hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
    return tf.matmul(hidden, fc3_weights) + fc3_biases

  def extract_filter (data):
    conv = tf.nn.conv2d(data,
                        conv1_weights,
                        strides=[1, 1, 1, 1],
                        padding='SAME')
    # Bias and rectified linear non-linearity.
    relu1 = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))

    # Max pooling. The kernel size spec {ksize} also follows the layout of
    # the data. Here we have a pooling window of 2, and a stride of 2.
    pool = tf.nn.max_pool(relu1,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')
    conv = tf.nn.conv2d(pool,
                        conv2_weights,
                        strides=[1, 1, 1, 1],
                        padding='SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
    conv2 = tf.nn.conv2d(pool,
                         conv2_weights2,
                         strides=[1, 1, 1, 1],
                         padding='SAME')
    relu3 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases2))

    return relu1, relu2, relu3


  # Training computation: logits + cross-entropy loss.
  logits = model(train_data_node, True)
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
      logits, train_labels_node))

  # L2 regularization for the fully connected parameters.
  regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                  tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases) +
                  tf.nn.l2_loss(fc3_weights) + tf.nn.l2_loss(fc3_biases))
  # Add the regularization term to the loss.
  loss += 5e-4 * regularizers

  # Optimizer: set up a variable that's incremented once per batch and
  # controls the learning rate decay.
  batch = tf.Variable(0)
  # Decay once per epoch, using an exponential schedule starting at 0.01.
  learning_rate = tf.train.exponential_decay(
      0.01,                # Base learning rate.
      batch * BATCH_SIZE,  # Current index into the dataset.
      train_size,          # Decay step.
      0.95,                # Decay rate.
      staircase=True)
  # Use simple momentum for the optimization.
  optimizer = tf.train.MomentumOptimizer(learning_rate,
                                         0.9).minimize(loss,
                                                       global_step=batch)

  # Predictions for the current training minibatch.
  train_prediction = tf.nn.softmax(logits)

  # Predictions for the test and validation, which we'll compute less often.
  eval_prediction = tf.nn.softmax(model(eval_data))

  # Small utility function to evaluate a dataset by feeding batches of data to
  # {eval_data} and pulling the results from {eval_predictions}.
  # Saves memory and enables this to run on smaller GPUs.
  def eval_in_batches(data, sess):
    """Get all predictions for a dataset by running it in small batches."""
    size = data.shape[0]
    if size < EVAL_BATCH_SIZE:
      raise ValueError("batch size for evals larger than dataset: %d" % size)
    predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
    for begin in xrange(0, size, EVAL_BATCH_SIZE):
      end = begin + EVAL_BATCH_SIZE
      if end <= size:
        predictions[begin:end, :] = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[begin:end, ...]})
      else:
        batch_predictions = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
        predictions[begin:, :] = batch_predictions[begin - size:, :]
    return predictions

  # Create a local session to run the training.
  saver = tf.train.Saver()
  start_time = time.time()
  with tf.Session() as sess:
    # Run all the initializers to prepare the trainable parameters.
    if FLAGS.model:
      saver.restore(sess, FLAGS.model)  # If model exists, load it
    else:
      sess.run(tf.initialize_all_variables())  # If there is no model, initialize randomly
    if FLAGS.train:
      # Loop through training steps.
      for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
        # Compute the offset of the current minibatch in the data.
        # Note that we could use better randomization across epochs.
        offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
        batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
        batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
        # This dictionary maps the batch data (as a numpy array) to the
        # node in the graph it should be fed to.
        feed_dict = {train_data_node: batch_data,
                     train_labels_node: batch_labels}
        # Run the graph and fetch some of the nodes.
        _, l, lr, predictions = sess.run(
            [optimizer, loss, learning_rate, train_prediction],
            feed_dict=feed_dict)
        if step % EVAL_FREQUENCY == 0:
          elapsed_time = time.time() - start_time
          start_time = time.time()
          print('Step %d (epoch %.2f), %.1f ms' %
                (step, float(step) * BATCH_SIZE / train_size,
                 1000 * elapsed_time / EVAL_FREQUENCY))
          print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
          print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
          print('Validation error: %.1f%%' % error_rate(
              eval_in_batches(validation_data, sess), validation_labels))
          sys.stdout.flush()
      # Finally print the result!
      test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
      print('Test error: %.1f%%' % test_error)
      print ('Optimization done')
      print ('Save models')
      if not tf.gfile.Exists("./conv_save"):
          tf.gfile.MakeDirs("./conv_save")
      saver_path = saver.save(sess, "./conv_save/model.ckpt")
      print ('Successfully saved file: %s' % saver_path)
    else:  # If train flag is false, execute image extraction routine
      print ("Filter extraction routine")
      aa = train_data[1:2, :, :, :]
      print (aa.shape)
      # Run extract filter operations (conv1, conv2 and conv3 layers)
      images = sess.run(extract_filter(train_data[1:2, :, :, :]))
      print (images[2].shape)
      plt.imshow (images[2][0, :, :, 32] * 255 + 255 / 2, cmap='gray')
      # plt.imshow (images[2][0, :, :, 32], cmap='gray')
      plt.show ()
      # Save all outputs
      for i in range (3):
        filter_shape = images[i].shape
        img_size = [filter_shape[1], filter_shape[2]]
        print (img_size)
Example #23
    'of the target word.')
parser.add_argument('--min_count',
                    type=int,
                    default=5,
                    help='The minimum number of word occurrences for it to be '
                    'included in the vocabulary.')
parser.add_argument(
    '--sampling_factor',
    type=float,
    default=1e-3,
    help='Subsample threshold for word occurrence. Words that appear '
    'with higher frequency will be randomly down-sampled. Set '
    'to 0 to disable.')
args = parser.parse_args()

zip_filename = maybe_download('http://mattmahoney.net/dc/text8.zip')
text_file = unzip(zip_filename)
sentences = word2vec.Text8Corpus(text_file)
sentences = [' '.join(sent) for sent in sentences]
tokenizer = Tokenizer(filters=base_filter() + "'")
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_sequences(sentences)
V = len(tokenizer.word_index) + 1


def build_model():
    target_word = Sequential()
    target_word.add(Embedding(V, args.embedding_size, input_length=1))

    context = Sequential()
    context.add(Embedding(V, args.embedding_size, input_length=1))
Example #24
    plt.show()

    shift = lambda x, w: convolve(x.reshape((28, 28)), mode='constant',
                                  weights=w).ravel()
    X = np.concatenate([X] +
                       [np.apply_along_axis(shift, 1, X, vector)
                        for vector in direction_vectors])
    print X.shape
    y = np.concatenate([y for _ in range(len(direction_vectors) + 1)], axis=0)
    print y.shape
    return X, y


# Extract data
train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

X_train = extract_data(train_data_filename, 60000, dense=True)
y_train = extract_labels(train_labels_filename, 60000, one_hot=False)
X_test = extract_data(test_data_filename, 10000, dense=True)
y_test = extract_labels(test_labels_filename, 10000, one_hot=False)


#################################################
# Test for decision tree classifier without dimensionality reduction
Tree = DecisionTreeClassifier()
Tree.fit(X_train, y_train)
print 'Without dimensionality reduction: ', Tree.score(X_test, y_test)
Example #25
def get_loader(data_root,
               batchsize,
               poison=False,
               fracdirty=.5,
               cifar100=False,
               noaugment=False,
               nogan=True,
               cinic=False,
               tanti=False,
               svhn=False,
               surface=False,
               nworker=1):
    '''return loaders for cifar'''

    ## transforms
    def get_transform(datamean, datastd):
        transform_train = transforms.Compose([
            # transforms.RandomCrop(32, padding=4),
            # transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_tanti = transforms.Compose([
            # transforms.RandomCrop(32, padding=6),
            # transforms.Lambda(transforms.functional.hflip), # temporary
            # transforms.RandomRotation(5),
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_switchable = transform_test if noaugment else transform_train
        return transform_train, transform_test, transform_switchable, transform_tanti

    ## multiplex between cifar and cinic and svhn
    if not cinic and not svhn:
        datamean, datastd = (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        Dataset = torchvision.datasets.CIFAR100 if cifar100 else torchvision.datasets.CIFAR10
        testset = Dataset(root=data_root,
                          train=False,
                          download=True,
                          transform=transform_test)
        args_trainset = dict(root=data_root, train=True, download=True)
    elif cinic:
        cinic_root = join(data_root, 'CINIC-10')
        utils.maybe_download(
            source_url=
            'https://datashare.is.ed.ac.uk/bitstream/handle/10283/3192/CINIC-10.tar.gz',
            filename='CINIC-10',
            target_directory=cinic_root,
            filetype='tar')
        datamean, datastd = [0.47889522, 0.47227842,
                             0.43047404], [0.24205776, 0.23828046, 0.25874835]
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        Dataset = torchvision.datasets.ImageFolder
        testset = Dataset(cinic_root + '/test', transform=transform_test)
        args_trainset = dict(root=cinic_root + '/train')
    elif svhn:
        datamean, datastd = [0.43768212, 0.44376972,
                             0.47280444], [0.1200278, 0.12307685, 0.10515254]
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        svhn_root = join(data_root, 'SVHN')
        # trainset = torchvision.datasets.SVHN(svhn_root, 'train', transform=transform_test)
        trainset = torchvision.datasets.SVHN(svhn_root,
                                             'train',
                                             transform=transform_train,
                                             download=True)
        if not surface:
            testset = torchvision.datasets.SVHN(svhn_root,
                                                'test',
                                                transform=transform_test,
                                                download=True)
            ganset = torchvision.datasets.SVHN(svhn_root,
                                               'extra',
                                               transform=transform_test,
                                               download=True)

    ## dataset objects
    if svhn:
        pass
    elif poison:
        trainset = Dataset(transform=transform_switchable, **args_trainset)
        if tanti: ganset = Dataset(transform=transform_tanti, **args_trainset)
        elif nogan:
            trainset, ganset = torch.utils.data.random_split(
                trainset, [25000, 25000])
            # else: ganset = CifarGan(root=data_root, transform=transform_test if nogan else transform_switchable)
        else:
            ganset = Dataset(root=cinic_root + '/valid',
                             transform=transform_train)
    else:
        trainset = Dataset(transform=transform_switchable, **args_trainset)

    ## dataloader objects
    if not surface:
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=100,
                                                 shuffle=False,
                                                 num_workers=nworker)
    else:
        testloader = None
    if poison:
        gansize = int(batchsize * fracdirty)
        trainsize = batchsize - gansize
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=trainsize,
                                                  shuffle=True,
                                                  num_workers=nworker)
        ganloader = torch.utils.data.DataLoader(ganset,
                                                batch_size=gansize,
                                                shuffle=True,
                                                num_workers=nworker)
    else:
        trainsize = batchsize
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=trainsize,
                                                  shuffle=True,
                                                  num_workers=nworker)
        ganloader = None

    return trainloader, ganloader, testloader
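A hedged usage sketch for the loader above: with poison=True each step draws one batch from the clean train loader and one from the "gan" loader, with sizes set by fracdirty. The argument values below are illustrative, not taken from the source.

# Hypothetical call; batchsize and fracdirty are example values, so each
# combined step sees 64 samples from trainloader and 64 from ganloader.
trainloader, ganloader, testloader = get_loader('/root/datasets',
                                                batchsize=128,
                                                poison=True,
                                                fracdirty=0.5)
for (clean_x, clean_y), (gan_x, gan_y) in zip(trainloader, ganloader):
    break  # one combined step is enough for this illustration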
Example #26
def main():
	savepath = './save_point'
	filepath = './save_point/model_api_checkpoint.h5'
	train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
	train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
	test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
	test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

	train_data = extract_data(train_data_filename, 60000, dense=False)
	train_data = train_data.reshape((60000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
	train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
	test_data = extract_data(test_data_filename, 10000, dense=False)
	test_data = test_data.reshape((10000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
	test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)

	validation_data = train_data[:VALIDATION_SIZE, ...]
	validation_labels = train_labels[:VALIDATION_SIZE, :]
	validation_set = (validation_data, validation_labels)
	train_data = train_data[VALIDATION_SIZE:, ...]
	train_labels = train_labels[VALIDATION_SIZE:, ...]

	img = Input(shape=(1, 28, 28))
	conv1 = Convolution2D(32, 3, 3, border_mode='same')(img)
	conv1 = Activation('relu')(conv1)
	pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
	conv2_1 = Convolution2D(64, 3, 3, border_mode='same')(pool1)
	conv2_2 = Convolution2D(64, 5, 5, border_mode='same')(pool1)
	conv2_1 = Activation('relu')(conv2_1)
	conv2_2 = Activation('relu')(conv2_2)
	pool2_1 = MaxPooling2D(pool_size=(2, 2))(conv2_1)
	pool2_2 = MaxPooling2D(pool_size=(2, 2))(conv2_2)
	dense1 = Flatten()(pool2_1)
	dense2 = Flatten()(pool2_2)
	dense = merge([dense1, dense2], mode='concat', concat_axis=1)
	dense = Dense(512)(dense)
	dense = Activation('relu')(dense)
	dense = Dense(256)(dense)
	dense = Activation('relu')(dense)
	dense = Dense(10)(dense)
	output = Activation('softmax')(dense)

	model = Model(input=[img], output=[output])

	sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)

	model.compile(
					optimizer=sgd,
					loss=['categorical_crossentropy'],
					metrics=["accuracy"])

	model.fit(
					[train_data],
					[train_labels],
					nb_epoch=1,
					verbose=1,
					batch_size=1000,
					validation_data=validation_set)

	print 'Save model weights'
	if not os.path.isdir (savepath):
		os.mkdir (savepath)
	model.save_weights(filepath, overwrite=True)


	predictions = model.predict([test_data],
	                            batch_size=1000)

	print 'Test error: %.1f%%' % error_rate(predictions, test_labels)

	print 'Test loss: %.14f, Test accuracy %.4f' % \
	      tuple(model.evaluate([test_data], [test_labels], batch_size=1000))
Example #27
# -------------- TensorBoard summaries -----------------

summ_D_loss_real = tf.summary.scalar("D_loss_real", D_loss_real)
summ_D_loss_fake = tf.summary.scalar("D_loss_fake", D_loss_fake)
summ_D_loss = tf.summary.scalar("D_loss", D_loss)

summ_D_losses = tf.summary.merge(
    [summ_D_loss_real, summ_D_loss_fake, summ_D_loss])

summ_G_loss = tf.summary.scalar("G_loss", G_loss)

# -------------- Load the dataset ------------------------

# download mnist if needed
utils.maybe_download(FLAGS.input_path, FLAGS.mnist)

# import mnist dataset
data = input_data.read_data_sets(FLAGS.input_path, one_hot=True)

# -------------- Train models ------------------------

# create session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# create summary writer
summary_writer = tf.summary.FileWriter(FLAGS.log_path,
                                       graph=tf.get_default_graph())

for i in range(FLAGS.train_steps):
Example #28
                            parse_args=False,
                            project_name='mfseq_transition_matrix')
    experiment.log_parameters(vars(args))

    # front matter
    home = os.environ['HOME']
    autoname = 'rank_%s/lr_%s' % (args.rank, args.lrnrate)
    experiment.set_name(autoname)
    args.logdir = join(home, 'ckpt', autoname)
    os.makedirs(args.logdir, exist_ok=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # load data from file
    maybe_download(
        'https://www.dropbox.com/s/lu38zp3ixjpth9e/graph_data_cube.pkl?dl=0',
        'graph_data_cube.pkl',
        join(home, 'datasets'),
        filetype='file')
    with gzip.open(join(home, 'datasets', 'graph_data_cube.pkl'), 'rb') as f:
        datacube = pickle.load(f)
    ntime, nnode, nfeat = datacube.shape

    # build tf graph
    model = Model(args, ntime, nnode, nfeat)

    # run optimizer on training data
    model.fit(datacube)

    # plot visualizations
    model.plot(ending=True)
Example #29
# download BERT multi-lingual
from utils import maybe_download
import os

BERT_MODELS_DIR = 'bert_models'
BERT_BASE_DIR = 'bert_models/multilingual_L-12_H-768_A-12'

BERT_MODEL_URL = 'https://storage.googleapis.com/bert_models/2018_11_03/'
BERT_BASE_MULTI_FILE = 'multilingual_L-12_H-768_A-12.zip'

if __name__ == '__main__':
    maybe_download(BERT_MODEL_URL, BERT_BASE_MULTI_FILE, BERT_MODELS_DIR)
    os.system(
        "7z x bert_models/multilingual_L-12_H-768_A-12.zip -obert_models")