def _download_and_unzip(self, md5=None, file_to_extract=None):
    if self.pkl_file.is_file():
        self.logger.info("Found binary format embedding %s" % self.pkl_file)
        return
    maybe_download(self.download_url, store_path=self.store_folder,
                   filename=self.download_file.name, md5=md5)
    if Path(self.download_url).suffix != ".zip":
        return
    with zipfile.ZipFile(self.download_file) as zf:
        # assume there is only one file in the zip
        if file_to_extract is None:
            [unzipped_file_name] = zf.namelist()
        else:
            unzipped_file_name = file_to_extract
        if not (self.store_folder / unzipped_file_name).is_file():
            self.logger.info("Unzipping the embedding file %s" % self.download_file)
            zf.extract(member=unzipped_file_name, path=self.store_folder)
        self.download_file = self.store_folder / unzipped_file_name
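# The maybe_download helper used above is not shown in this collection. Below is a
# minimal, hypothetical sketch consistent with the keyword call in the method above
# (url, store_path, filename, md5); the signature, MD5 check, and return value are
# assumptions for illustration, not the actual implementation.
import hashlib
import urllib.request
from pathlib import Path


def maybe_download(url, store_path, filename, md5=None):
    """Download url into store_path/filename unless a valid copy is already cached."""
    target = Path(store_path) / filename
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.is_file():
        # keep the cached file when no checksum is given or the checksum matches
        if md5 is None or hashlib.md5(target.read_bytes()).hexdigest() == md5:
            return target
    urllib.request.urlretrieve(url, str(target))
    return target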
def main():
    savepath = './save_point'
    filepath = './save_point/keras_example_checkpoint.h5'

    # Extract MNIST dataset
    train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
    train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
    test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
    test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

    train_data = extract_data(train_data_filename, 60000, dense=False)
    train_data = train_data.reshape((60000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
    test_data = extract_data(test_data_filename, 10000, dense=False)
    test_data = test_data.reshape((10000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)

    validation_data = train_data[:VALIDATION_SIZE, ...]
    validation_labels = train_labels[:VALIDATION_SIZE, :]
    validation_set = (validation_data, validation_labels)
    train_data = train_data[VALIDATION_SIZE:, ...]
    train_labels = train_labels[VALIDATION_SIZE:, ...]

    # Model construction
    model = Sequential()
    model.add(Convolution2D(32, 3, 3, border_mode='same', input_shape=(1, 28, 28)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 3, 3, border_mode='same'))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10))
    model.add(Activation('softmax'))

    # Define optimizer and configure training process
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])
    model.fit(
        train_data, train_labels,
        nb_epoch=NUM_EPOCHS,
        batch_size=1000,
        validation_data=validation_set)

    print 'Save model weights'
    if not os.path.isdir(savepath):
        os.mkdir(savepath)
    model.save_weights(filepath, overwrite=True)

    predict = model.predict(test_data, batch_size=1000)
    print 'Test err: %.1f%%' % error_rate(predict, test_labels)
    print 'Test loss: %.1f%%, accuracy: %.1f%%' % \
        tuple(model.evaluate(test_data, test_labels, batch_size=1000))
def __init__(self, root, train, download=True, transform=None):
    if download:
        maybe_download('https://www.dropbox.com/s/wczryi3tdzsa182/CIFAR10poly16.zip?dl=0',
                       'CIFAR10poly16', root, 'zip')
    if train:
        self.root = os.path.join(root, 'CIFAR10poly16', 'train')
    else:
        self.root = os.path.join(root, 'CIFAR10poly16', 'test')
    self.transform = transform
def __init__(self, root, train, transform=None, size=28):
    if size == 28:
        maybe_download('https://www.dropbox.com/s/lajg1qorz3h3909/fashionMNISTpoly28.zip?dl=0',
                       'fashionMNISTpoly' + str(size), root, 'zip')
    if train:
        self.root = os.path.join(root, 'fashionMNISTpoly' + str(size), 'train')
    else:
        self.root = os.path.join(root, 'fashionMNISTpoly' + str(size), 'test')
    self.transform = transform
def __init__(self, root, train, transform=None, size=19):
    if size == 32:
        maybe_download('https://www.dropbox.com/s/wczryi3tdzsa182/MNISTpoly.zip?dl=0',
                       'MNISTpoly' + str(size), root, 'zip')
    elif size == 19:
        maybe_download('https://www.dropbox.com/s/jspn547dz473shr/MNISTpoly19.zip?dl=0',
                       'MNISTpoly' + str(size), root, 'zip')
    if train:
        self.root = os.path.join(root, 'MNISTpoly' + str(size), 'train')
    else:
        self.root = os.path.join(root, 'MNISTpoly' + str(size), 'test')
    self.transform = transform
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'htr_assets/crowdsource/processed')
    # qyk added, the source is yq's dropbox
    maybe_download(source_url='https://www.dropbox.com/s/dsg41kaajrfvfvj/htr_assets.zip?dl=0',
                   filename='htr_assets', target_directory=root, filetype='zip')

    # custom dataset loader
    allfiles = glob(join(self.root, '**/*.jpg'), recursive=True)
    # if the filename has 'empty-', then the ground truth is nothing
    labels = [basename(f)[:-4] if basename(f).find('empty-') == -1 else '_' for f in allfiles]
    self.samples = list(zip(allfiles, labels))

    # make the list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'iam_handwriting')

    # download and put dataset in the correct directory
    maybe_download(
        'https://www.dropbox.com/sh/tdd0784neuv9ysh/AABm3gxtjQIZ2R9WZ-XR9Kpra?dl=0',
        'iam_handwriting', root, 'folder')
    if exists(join(self.root, 'words.tgz')):
        if not exists(join(self.root, 'words')):
            os.makedirs(join(self.root, 'words'))
        os.system('tar xvzf ' + join(self.root, 'words.tgz') +
                  ' --directory ' + join(self.root, 'words'))
        os.system('rm ' + join(self.root, 'words.tgz'))

    # collect all words in the IAM dataset from the words.txt summary file
    # at the root of the IAM directory
    labelsFile = open(join(self.root, 'words.txt'))
    chars = set()
    self.samples = []
    for line in labelsFile:
        # ignore comment lines
        if not line or line[0] == '#':
            continue

        lineSplit = line.strip().split(' ')
        assert len(lineSplit) >= 9

        # filename: part1-part2-part3 --> part1/part1-part2/part1-part2-part3.png
        fileNameSplit = lineSplit[0].split('-')
        fileName = join(self.root, 'words/') + fileNameSplit[0] + '/' + \
            fileNameSplit[0] + '-' + fileNameSplit[1] + '/' + lineSplit[0] + '.png'

        # GT text are the columns starting at 9
        label = ' '.join(lineSplit[8:])

        # put sample into list
        # qyk: exclude empty images (data clean)
        if '---' not in label:
            img_test = cv2.imread(fileName, cv2.IMREAD_GRAYSCALE)
            if not (img_test is None or np.min(img_test.shape) <= 1):
                self.samples.append((fileName, label))

        # make the list of characters
        chars = chars.union(set(list(label)))

    self.charList = sorted(list(chars))
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'irs_handwriting')
    maybe_download(source_url='https://www.dropbox.com/s/54jarzcb0mju32d/img_cropped_irs.zip?dl=0',
                   filename='irs_handwriting', target_directory=root, filetype='zip')
    if exists(join(root, 'img_cropped_irs')):
        os.system('mv ' + join(root, 'img_cropped_irs') + ' ' + self.root)

    folder_depth = 2
    allfiles = glob(join(self.root, '**/' * folder_depth + '*.jpg'))[:2000]
    labels = [basename(f)[:-4] for f in allfiles]
    # print(labels[0])
    self.samples = list(zip(allfiles, labels))

    # make the list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
def load_data(dst_dir='./dataset'):
    """Loads the CIFAR10 dataset.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    dirname = 'cifar-10-batches-py'
    origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    path = maybe_download(dirname, origin, dst_dir, untar=True)

    nb_train_samples = 50000
    x_train = np.zeros((nb_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.zeros((nb_train_samples,), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = load_batch(fpath)
        x_train[(i - 1) * 10000:i * 10000, :, :, :] = data
        y_train[(i - 1) * 10000:i * 10000] = labels

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train)))
    y_test = np.reshape(y_test, (len(y_test)))

    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
def distributed_maybe_download(path, local_rank, mpi_size):
    if not path.startswith('gs://'):
        return path
    filename = path[5:].replace('/', '-')
    with first_rank_first(local_rank, mpi_size):
        fp = maybe_download(path, filename)
    return fp
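# Hypothetical usage sketch for distributed_maybe_download; the bucket path and the
# rank/size values are made up, and first_rank_first is assumed to serialize the
# download so one process fetches the object while the others reuse the cached copy.
local_path = distributed_maybe_download('gs://some-bucket/checkpoints/model.npz',
                                        local_rank=0, mpi_size=8)
print(local_path)  # non-gs:// paths are returned unchanged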
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'artifact_images_no_intersect',
                     'artifact_images_no_intersect')  # zip problem, sorry

    # download and put dataset in the correct directory
    maybe_download(
        'https://www.dropbox.com/s/rogd4d5ilfm4g5e/artifact_images_no_intersect.zip?dl=0',
        'artifact_images_no_intersect', root, 'folder')

    # if exists(join(self.root, 'words.tgz')):
    #     if not exists(join(self.root, 'words')):
    #         os.makedirs(join(self.root, 'words'))
    #     os.system('tar xvzf ' + join(self.root, 'words.tgz') + ' --directory ' + join(self.root, 'words'))
    #     os.system('rm ' + join(self.root, 'words.tgz'))

    # collect all samples from the databook.txt summary file at the root of the dataset directory
    labelsFile = open(join(self.root, 'databook.txt'))
    # chars = set()
    self.samples = []
    # ct = 0
    for line in labelsFile:
        # ct += 1
        # ignore comment lines
        if not line or line[0] == '#':
            continue

        lineSplit = line.strip().split(' ')
        assert len(lineSplit) == 3

        # fileNameSplit = lineSplit[0].split('-')
        imgPath = lineSplit[0].replace('/root/datasets/artifact_images_no_intersect', self.root)
        labelPath = lineSplit[1].replace('/root/datasets/artifact_images_no_intersect', self.root)
        gt_text = lineSplit[2]

        # put sample into list
        # qyk exclude empty images
        # if '---' not in label:  # qyk: data clean
        #     img_test = cv2.imread(fileName, cv2.IMREAD_GRAYSCALE)  # qyk: data clean
        #     if not (img_test is None or np.min(img_test.shape) <= 1):  # qyk: data clean
        #         self.samples.append((fileName, label))  # qyk
        self.samples.append((imgPath, labelPath, gt_text))
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'img_print_single')
    maybe_download(source_url='https://www.dropbox.com/s/xw8vd3n2jkz1n93/img_print_single.zip?dl=0',
                   filename='img_print_single', target_directory=root, filetype='zip')
    # 'https://www.dropbox.com/s/cbhpy6clfi9a5lz/img_print_100000_clean.zip?dl=0'

    # yq patch: delete unrecognized non-English samples on linux
    # os.system('find ' + root + ' -maxdepth 1 -name "*.jpg" -type f -delete')  find ./logs/examples -maxdepth 1 -name "*.log"
    # if exists(join(root, 'img_print_single')): os.system('mv ' + join(root, 'img_print_single') + ' ' + self.root)

    folder_depth = 1
    allfiles = glob(join(self.root, '**/' * folder_depth + '*.jpg'))
    # allfiles = [f for f in allfiles if len(basename(f)) - 4 <= 25 and len(basename(f)) - 4 >= 1
    #             and (not '#U' in f) and (not '---' in f)]  # screen out non-recognized characters qyk
    labels = [basename(f)[:-4] for f in allfiles]
    self.samples = list(zip(allfiles, labels))

    # make the list of characters
    chars = set.union(*[set(l) for l in labels])
    self.charList = sorted(list(chars))
def __init__(self, root='/root/datasets', transform=None):
    self.transform = transform
    self.root = join(root, 'text_recognition')
    maybe_download(source_url='https://www.dropbox.com/s/n1pq94xu9kpur1a/text_recognition.zip?dl=0',
                   filename='text_recognition', target_directory=root, filetype='zip')
    # 'https://www.dropbox.com/s/cbhpy6clfi9a5lz/img_print_100000_clean.zip?dl=0'

    # yq patch: delete unrecognized non-English samples on linux
    # os.system('find ' + root + ' -maxdepth 1 -name "*.jpg" -type f -delete')  find ./logs/examples -maxdepth 1 -name "*.log"
    # if exists(join(root, 'img_print_100000_en')): os.system('mv ' + join(root, 'img_print_100000_en') + ' ' + self.root)
    # folder_depth = 0

    with open(join(self.root, 'catalog.txt'), 'r') as f:
        recs = f.readlines()
    self.samples = list(map(lambda x: x.strip('\n').rsplit(' ', 1), recs))
    print('screened :' + str(len(self.samples)))

    # make the list of characters
    _, labels = zip(*self.samples)
    chars = set.union(*map(set, labels))
    self.charList = sorted(list(chars))
def get_babi_en(get_10k=False):
    data_dir = "datasets/tasks_1-20_v1-2/en/"
    if get_10k:
        data_dir = "datasets/tasks_1-20_v1-2/en-10k/"

    maybe_download(
        'https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz',
        'datasets', 11745123)

    file = tarfile.open("datasets/babi_tasks_1-20_v1-2.tar.gz", "r:gz")
    file.extractall("datasets")
    file.close()

    print("Some housekeeping...")
    if not os.path.exists("datasets/babi"):
        os.makedirs("datasets/babi")
    for path, dir, files in os.walk(data_dir):
        for file in files:
            os.rename(os.path.join(data_dir, file), os.path.join("datasets/babi", file))

    os.remove("datasets/babi_tasks_1-20_v1-2.tar.gz")
    rmtree("datasets/tasks_1-20_v1-2")
    print("Finished.")
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 200
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data
audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

fs, audio = wav.read(audio_filename)

inputs = mfcc(audio, samplerate=fs)

# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]

# Reading targets
with open(target_filename, 'r') as f:
    # Only the last line is necessary
    line = f.readlines()[-1]
import tensorflow as tf
import numpy as np
import cv2
from matplotlib.pyplot import plot, imshow, colorbar, show, axis
from PIL import Image
import os
import random
from os.path import join, basename, dirname
from glob import glob
import utils
import torch

home = os.environ['HOME']
ckptroot = join(home, 'ckpt', 'poisoncifar')
filename = 'liam_resnet18'
url = 'https://www.dropbox.com/s/6x0vxrous1kbb1s/liam_resnet18?dl=0'
utils.maybe_download(url, filename, ckptroot, filetype='file')
filepath = join(ckptroot, filename)

ckpt = torch.load(filepath)
weights = ckpt['model']
weights = {k: v.cpu().numpy() for k, v in weights.items() if k.find('running_') == -1}
weights = {k: v for k, v in weights.items() if k.find('num_batches_tracked') == -1}
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 10000
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 2
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data
audio_filename = maybe_download('red.wav', 96044)
target_filename = maybe_download('red.txt', 12)
audio_filename2 = maybe_download('blue.wav', 96044)
target_filename2 = maybe_download('blue.txt', 19)

fs, audio = wav.read(audio_filename)
fs2, audio2 = wav.read(audio_filename2)

inputs = mfcc(audio, samplerate=fs)
inputs2 = mfcc(audio2, samplerate=fs2)

# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
# train_inputs = np.asarray(inputs)
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1 + 1 + 1

# Hyper-parameters
num_epochs = 200
num_hidden = 50
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data
audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

inputs = spectrogram_from_file(audio_filename, step=10, window=20, max_freq=8000, eps=1e-14)

# Transform into a 3D array
# print(len(inputs))
train_inputs = np.asarray(inputs[np.newaxis, :])
# print(len(train_inputs))
train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]
print(train_seq_len)
                    help='Maximum number of lines')
parser.add_argument('output', type=str, help='Output file')
parser.add_argument('--tmp-dir', type=str, help='Location directory', default='/tmp/')
args = parser.parse_args()

# download data
output_file = Path(args.output)
target_dir = Path(args.tmp_dir)

firstnames_archive_path = utils.maybe_download(
    ARCHIVE_PREFIX + FIRSTNAMES_ARCHIVE_NAME + '.zip', target_dir, FIRSTNAMES_ARCHIVE_URL)
utils.maybe_extract(firstnames_archive_path, target_dir / FIRSTNAMES_ARCHIVE_DIR_NAME)
extracted_firstnames_file = target_dir / f'{ARCHIVE_PREFIX}{FIRSTNAMES_ARCHIVE_NAME}/{FIRSTNAMES_ARCHIVE_NAME}.txt'

lastnames_archive_path = utils.maybe_download(
    ARCHIVE_PREFIX + LASTNAME_ARCHIVE_NAME + '.zip', target_dir, LASTNAME_ARCHIVE_URL)
utils.maybe_extract(lastnames_archive_path, target_dir / LASTNAME_ARCHIVE_DIR_NAME)
extracted_lastnames_file = target_dir / f'{ARCHIVE_PREFIX}{LASTNAME_ARCHIVE_NAME}/{LASTNAME_ARCHIVE_NAME}.txt'

# read files
print('extract lastnames')
lastnames = get_most_common_lastnames(extracted_lastnames_file,
    plt.show()

    shift = lambda x, w: convolve(x.reshape((28, 28)), mode='constant', weights=w).ravel()
    X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector)
                              for vector in direction_vectors])
    print X.shape
    y = np.concatenate([y for _ in range(len(direction_vectors) + 1)], axis=0)
    print y.shape
    return X, y


# Extract data
train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

X_train = extract_data(train_data_filename, 60000, dense=True)
y_train = extract_labels(train_labels_filename, 60000, one_hot=False)
X_test = extract_data(test_data_filename, 10000, dense=True)
y_test = extract_labels(test_labels_filename, 10000, one_hot=False)

#################################################
# Test for decision tree classifier without dimensionality reduction
Tree = DecisionTreeClassifier()
Tree.fit(X_train, y_train)
print 'Without dimensionality reduction: ', Tree.score(X_test, y_test)
def main():
    alphabet_txt = os.path.join(LANG.model_dir, 'alphabet.txt')
    raw_txt_gz = os.path.join(LANG.model_dir, 'raw.txt.gz')
    unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
    prepared_txt = os.path.join(LANG.model_dir, 'prepared.txt')
    vocabulary_txt = os.path.join(LANG.model_dir, 'vocabulary.txt')
    unfiltered_arpa = os.path.join(LANG.model_dir, 'unfiltered.arpa')
    filtered_arpa = os.path.join(LANG.model_dir, 'filtered.arpa')
    lm_binary = os.path.join(LANG.model_dir, 'lm.binary')
    kenlm_scorer = os.path.join(LANG.model_dir, 'kenlm.scorer')
    temp_prefix = os.path.join(LANG.model_dir, 'tmp')

    section('Writing alphabet file', empty_lines_before=1)
    with open(alphabet_txt, 'w', encoding='utf-8') as alphabet_file:
        alphabet_file.write('\n'.join(LANG.alphabet) + '\n')

    redo = ARGS.force_download

    section('Downloading text data')
    redo = maybe_download(LANG.text_url, raw_txt_gz, force=redo)

    section('Unzipping text data')
    redo = maybe_ungzip(raw_txt_gz, unprepared_txt, force=redo)

    redo = redo or ARGS.force_prepare

    section('Preparing text and building vocabulary')
    if redo or not os.path.isfile(prepared_txt) or not os.path.isfile(vocabulary_txt):
        redo = True
        announce('Preparing {} shards of "{}"...'.format(ARGS.workers, unprepared_txt))
        counters = Queue(ARGS.workers)
        source_bytes = os.path.getsize(unprepared_txt)
        aggregator_process = Process(target=aggregate_counters,
                                     args=(vocabulary_txt, source_bytes, counters))
        aggregator_process.start()
        counter_processes = list(map(lambda index: Process(target=count_words, args=(index, counters)),
                                     range(ARGS.workers)))
        try:
            for p in counter_processes:
                p.start()
            for p in counter_processes:
                p.join()
            counters.put(STOP_TOKEN)
            aggregator_process.join()
            print('')
            partials = list(map(lambda i: get_partial_path(i), range(ARGS.workers)))
            join_files(partials, prepared_txt)
            for partial in partials:
                os.unlink(partial)
        except KeyboardInterrupt:
            aggregator_process.terminate()
            for p in counter_processes:
                p.terminate()
            raise
    else:
        announce('Files "{}" and \n\t"{}" existing - not preparing'.format(prepared_txt, vocabulary_txt))

    redo = redo or ARGS.force_generate

    section('Building unfiltered language model')
    if redo or not os.path.isfile(unfiltered_arpa):
        redo = True
        lmplz_args = [
            KENLM_BIN + '/lmplz',
            '--temp_prefix', temp_prefix,
            '--memory', '80%',
            '--discount_fallback',
            '--limit_vocab_file', vocabulary_txt,
            '--text', prepared_txt,
            '--arpa', unfiltered_arpa,
            '--skip', 'symbols',
            '--order', str(LANG.order)
        ]
        if len(LANG.prune) > 0:
            lmplz_args.append('--prune')
            lmplz_args.extend(list(map(str, LANG.prune)))
        subprocess.check_call(lmplz_args)
    else:
        announce('File "{}" existing - not generating'.format(unfiltered_arpa))

    section('Filtering language model')
    if redo or not os.path.isfile(filtered_arpa):
        redo = True
        with open(vocabulary_txt, 'rb') as vocabulary_file:
            vocabulary_content = vocabulary_file.read()
        subprocess.run([
            KENLM_BIN + '/filter',
            'single',
            'model:' + unfiltered_arpa,
            filtered_arpa
        ], input=vocabulary_content, check=True)
    else:
        announce('File "{}" existing - not filtering'.format(filtered_arpa))

    section('Generating binary representation')
    if redo or not os.path.isfile(lm_binary):
        redo = True
        subprocess.check_call([
            KENLM_BIN + '/build_binary',
            '-a', '255',
            '-q', '8',
            '-v',
            'trie',
            filtered_arpa,
            lm_binary
        ])
    else:
        announce('File "{}" existing - not generating'.format(lm_binary))

    section('Building scorer')
    if redo or not os.path.isfile(kenlm_scorer):
        redo = True
        words = set()
        vocab_looks_char_based = True
        with open(vocabulary_txt) as vocabulary_file:
            for line in vocabulary_file:
                for word in line.split():
                    words.add(word.encode())
                    if len(word) > 1:
                        vocab_looks_char_based = False
        announce("{} unique words read from vocabulary file.".format(len(words)))
        announce("{} like a character based model.".format(
            "Looks" if vocab_looks_char_based else "Doesn't look"))

        if ARGS.alphabet_mode == 'auto':
            use_utf8 = vocab_looks_char_based
        elif ARGS.alphabet_mode == 'utf8':
            use_utf8 = True
        else:
            use_utf8 = False
        serialized_alphabet = get_serialized_utf8_alphabet() if use_utf8 else LANG.get_serialized_alphabet()

        from ds_ctcdecoder import Scorer, Alphabet
        alphabet = Alphabet()
        err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
        if err != 0:
            announce('Error loading alphabet: {}'.format(err))
            sys.exit(1)

        scorer = Scorer()
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(LANG.alpha, LANG.beta)
        scorer.load_lm(lm_binary)
        scorer.fill_dictionary(list(words))

        shutil.copy(lm_binary, kenlm_scorer)
        scorer.save_dictionary(kenlm_scorer, True)  # append, not overwrite
        announce('Package created in {}'.format(kenlm_scorer))

        announce('Testing package...')
        scorer = Scorer()
        scorer.load_lm(kenlm_scorer)
    else:
        announce('File "{}" existing - not building'.format(kenlm_scorer))
def main(argv=None):  # pylint: disable=unused-argument
    # Get the data.
    train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
    train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
    test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
    test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

    # Extract it into numpy arrays.
    train_data = extract_data(train_data_filename, 60000, dense=False)
    train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
    test_data = extract_data(test_data_filename, 10000, dense=False)
    test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)

    # Generate a validation set.
    validation_data = train_data[:VALIDATION_SIZE, ...]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_data = train_data[VALIDATION_SIZE:, ...]
    train_labels = train_labels[VALIDATION_SIZE:]
    num_epochs = NUM_EPOCHS
    train_size = train_labels.shape[0]

    # This is where training samples and labels are fed to the graph.
    # These placeholder nodes will be fed a batch of training data at each
    # training step using the {feed_dict} argument to the Run() call below.
    train_data_node = tf.placeholder(
        tf.float32,
        shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.float32, shape=(BATCH_SIZE, NUM_LABELS))
    eval_data = tf.placeholder(
        tf.float32,
        shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

    # The variables below hold all the trainable weights. They are passed an
    # initial value which will be assigned when we call:
    # {tf.initialize_all_variables().run()}
    # First convolutional layer
    conv1_weights = tf.Variable(
        tf.truncated_normal([3, 3, NUM_CHANNELS, 32],  # 3x3 filter, depth 32.
                            stddev=0.1, seed=SEED))
    conv1_biases = tf.Variable(tf.zeros([32]))
    # Two second convolutional layers: a 5 x 5 filter and a 3 x 3 filter.
    conv2_weights = tf.Variable(
        tf.truncated_normal([5, 5, 32, 64], stddev=0.1, seed=SEED))
    conv2_biases = tf.Variable(tf.constant(0.01, shape=[64]))
    conv2_weights2 = tf.Variable(
        tf.truncated_normal([3, 3, 32, 64], stddev=0.1, seed=SEED))
    conv2_biases2 = tf.Variable(tf.constant(0.01, shape=[64]))
    # First fully connected layer after the conv layers
    fc1_weights = tf.Variable(  # fully connected, depth 512.
        tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 128, 512],
                            stddev=0.05, seed=SEED))
    fc1_biases = tf.Variable(tf.constant(0.01, shape=[512]))
    # Second fully connected layer
    fc2_weights = tf.Variable(
        tf.truncated_normal([512, 256], stddev=0.05, seed=SEED))
    fc2_biases = tf.Variable(tf.constant(0.1, shape=[256]))
    # Output layer
    fc3_weights = tf.Variable(
        tf.truncated_normal([256, NUM_LABELS], stddev=0.04, seed=SEED))
    fc3_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))

    # We will replicate the model structure for the training subgraph, as well
    # as the evaluation subgraphs, while sharing the trainable parameters.
    def model(data, train=False):
        """The Model definition."""
        # 2D convolution, with 'SAME' padding (i.e. the output feature map has
        # the same size as the input). Note that {strides} is a 4D array whose
        # shape matches the data layout: [image index, y, x, depth].
        conv = tf.nn.conv2d(data, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
        # Bias and rectified linear non-linearity.
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
        if train:
            relu = tf.nn.dropout(relu, .5)
        # Max pooling. The kernel size spec {ksize} also follows the layout of
        # the data. Here we have a pooling window of 2, and a stride of 2.
        pool = tf.nn.max_pool(relu, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        conv = tf.nn.conv2d(pool, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
        conv2 = tf.nn.conv2d(pool, conv2_weights2, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases2))
        pool = tf.nn.max_pool(relu, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape the feature map cuboid into a 2D matrix to feed it to the
        # fully connected layers.
        pool = tf.concat(3, [pool, pool2])
        pool_shape = pool.get_shape().as_list()
        reshape = tf.reshape(
            pool,
            [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
        # Fully connected layer. Note that the '+' operation automatically
        # broadcasts the biases.
        hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
        hidden = tf.nn.relu(tf.matmul(hidden, fc2_weights) + fc2_biases)
        # Add a 50% dropout during training only. Dropout also scales
        # activations such that no rescaling is needed at evaluation time.
        if train:
            hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
        return tf.matmul(hidden, fc3_weights) + fc3_biases

    def extract_filter(data):
        conv = tf.nn.conv2d(data, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
        # Bias and rectified linear non-linearity.
        relu1 = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
        # Max pooling. The kernel size spec {ksize} also follows the layout of
        # the data. Here we have a pooling window of 2, and a stride of 2.
        pool = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        conv = tf.nn.conv2d(pool, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
        conv2 = tf.nn.conv2d(pool, conv2_weights2, strides=[1, 1, 1, 1], padding='SAME')
        relu3 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases2))
        return relu1, relu2, relu3

    # Training computation: logits + cross-entropy loss.
    logits = model(train_data_node, True)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases) +
                    tf.nn.l2_loss(fc3_weights) + tf.nn.l2_loss(fc3_biases))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once per batch and
    # controls the learning rate decay.
    batch = tf.Variable(0)
    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,                # Base learning rate.
        batch * BATCH_SIZE,  # Current index into the dataset.
        train_size,          # Decay step.
        0.95,                # Decay rate.
        staircase=True)
    # Use simple momentum for the optimization.
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(loss, global_step=batch)

    # Predictions for the current training minibatch.
    train_prediction = tf.nn.softmax(logits)
    # Predictions for the test and validation, which we'll compute less often.
    eval_prediction = tf.nn.softmax(model(eval_data))

    # Small utility function to evaluate a dataset by feeding batches of data to
    # {eval_data} and pulling the results from {eval_predictions}.
    # Saves memory and enables this to run on smaller GPUs.
    def eval_in_batches(data, sess):
        """Get all predictions for a dataset by running it in small batches."""
        size = data.shape[0]
        if size < EVAL_BATCH_SIZE:
            raise ValueError("batch size for evals larger than dataset: %d" % size)
        predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
        for begin in xrange(0, size, EVAL_BATCH_SIZE):
            end = begin + EVAL_BATCH_SIZE
            if end <= size:
                predictions[begin:end, :] = sess.run(
                    eval_prediction, feed_dict={eval_data: data[begin:end, ...]})
            else:
                batch_predictions = sess.run(
                    eval_prediction, feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
                predictions[begin:, :] = batch_predictions[begin - size:, :]
        return predictions

    # Create a local session to run the training.
    saver = tf.train.Saver()
    start_time = time.time()
    with tf.Session() as sess:
        # Run all the initializers to prepare the trainable parameters.
        if FLAGS.model:
            saver.restore(sess, FLAGS.model)  # If a saved model exists, load it
        else:
            sess.run(tf.initialize_all_variables())  # Otherwise initialize randomly

        if FLAGS.train:
            # Loop through training steps.
            for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
                # Compute the offset of the current minibatch in the data.
                # Note that we could use better randomization across epochs.
                offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
                batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
                batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
                # This dictionary maps the batch data (as a numpy array) to the
                # node in the graph it should be fed to.
                feed_dict = {train_data_node: batch_data,
                             train_labels_node: batch_labels}
                # Run the graph and fetch some of the nodes.
                _, l, lr, predictions = sess.run(
                    [optimizer, loss, learning_rate, train_prediction],
                    feed_dict=feed_dict)
                if step % EVAL_FREQUENCY == 0:
                    elapsed_time = time.time() - start_time
                    start_time = time.time()
                    print('Step %d (epoch %.2f), %.1f ms' %
                          (step, float(step) * BATCH_SIZE / train_size,
                           1000 * elapsed_time / EVAL_FREQUENCY))
                    print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
                    print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
                    print('Validation error: %.1f%%' % error_rate(
                        eval_in_batches(validation_data, sess), validation_labels))
                    sys.stdout.flush()

            # Finally print the result!
            test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
            print('Test error: %.1f%%' % test_error)
            print('Optimization done')

            print('Save models')
            if not tf.gfile.Exists("./conv_save"):
                tf.gfile.MakeDirs("./conv_save")
            saver_path = saver.save(sess, "./conv_save/model.ckpt")
            print('Successfully saved file: %s' % saver_path)
        else:
            # If the train flag is false, execute the filter extraction routine
            print("Filter extraction routine")
            aa = train_data[1:2, :, :, :]
            print(aa.shape)
            # Run extract filter operations (conv1, conv2 and conv3 layers)
            images = sess.run(extract_filter(train_data[1:2, :, :, :]))
            print(images[2].shape)
            plt.imshow(images[2][0, :, :, 32] * 255 + 255 / 2, cmap='gray')
            # plt.imshow(images[2][0, :, :, 32], cmap='gray')
            plt.show()

            # Save all outputs
            for i in range(3):
                filter_shape = images[i].shape
                img_size = [filter_shape[1], filter_shape[2]]
                print(img_size)
                         'of the target word.')
parser.add_argument('--min_count', type=int, default=5,
                    help='The minimum number of word occurrences for it to be '
                         'included in the vocabulary.')
parser.add_argument(
    '--sampling_factor', type=float, default=1e-3,
    help='Subsample threshold for word occurrence. Words that appear '
         'with higher frequency will be randomly down-sampled. Set '
         'to 0 to disable.')
args = parser.parse_args()

zip_filename = maybe_download('http://mattmahoney.net/dc/text8.zip')
text_file = unzip(zip_filename)
sentences = word2vec.Text8Corpus(text_file)
sentences = [' '.join(sent) for sent in sentences]

tokenizer = Tokenizer(filters=base_filter() + "'")
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_sequences(sentences)
V = len(tokenizer.word_index) + 1


def build_model():
    target_word = Sequential()
    target_word.add(Embedding(V, args.embedding_size, input_length=1))

    context = Sequential()
    context.add(Embedding(V, args.embedding_size, input_length=1))
def get_loader(data_root, batchsize, poison=False, fracdirty=.5, cifar100=False, noaugment=False,
               nogan=True, cinic=False, tanti=False, svhn=False, surface=False, nworker=1):
    '''return loaders for cifar'''

    ## transforms
    def get_transform(datamean, datastd):
        transform_train = transforms.Compose([
            # transforms.RandomCrop(32, padding=4),
            # transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_tanti = transforms.Compose([
            # transforms.RandomCrop(32, padding=6),
            # transforms.Lambda(transforms.functional.hflip),  # temporary
            # transforms.RandomRotation(5),
            transforms.ToTensor(),
            transforms.Normalize(datamean, datastd),
        ])
        transform_switchable = transform_test if noaugment else transform_train
        return transform_train, transform_test, transform_switchable, transform_tanti

    ## multiplex between cifar and cinic and svhn
    if not cinic and not svhn:
        datamean, datastd = (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        Dataset = torchvision.datasets.CIFAR100 if cifar100 else torchvision.datasets.CIFAR10
        testset = Dataset(root=data_root, train=False, download=True, transform=transform_test)
        args_trainset = dict(root=data_root, train=True, download=True)
    elif cinic:
        cinic_root = join(data_root, 'CINIC-10')
        utils.maybe_download(
            source_url='https://datashare.is.ed.ac.uk/bitstream/handle/10283/3192/CINIC-10.tar.gz',
            filename='CINIC-10', target_directory=cinic_root, filetype='tar')
        datamean, datastd = [0.47889522, 0.47227842, 0.43047404], [0.24205776, 0.23828046, 0.25874835]
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        Dataset = torchvision.datasets.ImageFolder
        testset = Dataset(cinic_root + '/test', transform=transform_test)
        args_trainset = dict(root=cinic_root + '/train')
    elif svhn:
        datamean, datastd = [0.43768212, 0.44376972, 0.47280444], [0.1200278, 0.12307685, 0.10515254]
        transform_train, transform_test, transform_switchable, transform_tanti = get_transform(
            datamean, datastd)
        svhn_root = join(data_root, 'SVHN')
        # trainset = torchvision.datasets.SVHN(svhn_root, 'train', transform=transform_test)
        trainset = torchvision.datasets.SVHN(svhn_root, 'train', transform=transform_train, download=True)
        if not surface:
            testset = torchvision.datasets.SVHN(svhn_root, 'test', transform=transform_test, download=True)
        ganset = torchvision.datasets.SVHN(svhn_root, 'extra', transform=transform_test, download=True)

    ## dataset objects
    if svhn:
        pass
    elif poison:
        trainset = Dataset(transform=transform_switchable, **args_trainset)
        if tanti:
            ganset = Dataset(transform=transform_tanti, **args_trainset)
        elif nogan:
            trainset, ganset = torch.utils.data.random_split(trainset, [25000, 25000])
        # else: ganset = CifarGan(root=data_root, transform=transform_test if nogan else transform_switchable)
        else:
            ganset = Dataset(root=cinic_root + '/valid', transform=transform_train)
    else:
        trainset = Dataset(transform=transform_switchable, **args_trainset)

    ## dataloader objects
    if not surface:
        testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=nworker)
    else:
        testloader = None
    if poison:
        gansize = int(batchsize * fracdirty)
        trainsize = batchsize - gansize
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=trainsize, shuffle=True, num_workers=nworker)
        ganloader = torch.utils.data.DataLoader(ganset, batch_size=gansize, shuffle=True, num_workers=nworker)
    else:
        trainsize = batchsize
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=trainsize, shuffle=True, num_workers=nworker)
        ganloader = None

    return trainloader, ganloader, testloader
def main():
    savepath = './save_point'
    filepath = './save_point/model_api_checkpoint.h5'

    train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
    train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
    test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
    test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

    train_data = extract_data(train_data_filename, 60000, dense=False)
    train_data = train_data.reshape((60000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    train_labels = extract_labels(train_labels_filename, 60000, one_hot=True)
    test_data = extract_data(test_data_filename, 10000, dense=False)
    test_data = test_data.reshape((10000, NUM_CHANNELS, IMG_SIZE, IMG_SIZE))
    test_labels = extract_labels(test_labels_filename, 10000, one_hot=True)

    validation_data = train_data[:VALIDATION_SIZE, ...]
    validation_labels = train_labels[:VALIDATION_SIZE, :]
    validation_set = (validation_data, validation_labels)
    train_data = train_data[VALIDATION_SIZE:, ...]
    train_labels = train_labels[VALIDATION_SIZE:, ...]

    img = Input(shape=(1, 28, 28))
    conv1 = Convolution2D(32, 3, 3, border_mode='same')(img)
    conv1 = Activation('relu')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2_1 = Convolution2D(64, 3, 3, border_mode='same')(pool1)
    conv2_2 = Convolution2D(64, 5, 5, border_mode='same')(pool1)
    conv2_1 = Activation('relu')(conv2_1)
    conv2_2 = Activation('relu')(conv2_2)
    pool2_1 = MaxPooling2D(pool_size=(2, 2))(conv2_1)
    pool2_2 = MaxPooling2D(pool_size=(2, 2))(conv2_2)
    dense1 = Flatten()(pool2_1)
    dense2 = Flatten()(pool2_2)
    dense = merge([dense1, dense2], mode='concat', concat_axis=1)
    dense = Dense(512)(dense)
    dense = Activation('relu')(dense)
    dense = Dense(256)(dense)
    dense = Activation('relu')(dense)
    dense = Dense(10)(dense)
    output = Activation('softmax')(dense)

    model = Model(input=[img], output=[output])
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
    model.compile(optimizer=sgd, loss=['categorical_crossentropy'], metrics=["accuracy"])
    model.fit(
        [train_data], [train_labels],
        nb_epoch=1,
        verbose=1,
        batch_size=1000,
        validation_data=validation_set)

    print 'Save model weights'
    if not os.path.isdir(savepath):
        os.mkdir(savepath)
    model.save_weights(filepath, overwrite=True)

    predictions = model.predict([test_data], batch_size=1000)
    print 'Test error: %.1f%%' % error_rate(predictions, test_labels)
    print 'Test loss: %.14f, Test accuracy %.4f' % \
        tuple(model.evaluate([test_data], [test_labels], batch_size=1000))
# -------------- TensorBoard summaries -----------------
summ_D_loss_real = tf.summary.scalar("D_loss_real", D_loss_real)
summ_D_loss_fake = tf.summary.scalar("D_loss_fake", D_loss_fake)
summ_D_loss = tf.summary.scalar("D_loss", D_loss)
summ_D_losses = tf.summary.merge([summ_D_loss_real, summ_D_loss_fake, summ_D_loss])
summ_G_loss = tf.summary.scalar("G_loss", G_loss)

# -------------- Load the dataset ------------------------
# download mnist if needed
utils.maybe_download(FLAGS.input_path, FLAGS.mnist)
# import mnist dataset
data = input_data.read_data_sets(FLAGS.input_path, one_hot=True)

# -------------- Train models ------------------------
# create session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# create summary writer
summary_writer = tf.summary.FileWriter(FLAGS.log_path, graph=tf.get_default_graph())

for i in range(FLAGS.train_steps):
                        parse_args=False, project_name='mfseq_transition_matrix')
experiment.log_parameters(vars(args))

# front matter
home = os.environ['HOME']
autoname = 'rank_%s/lr_%s' % (args.rank, args.lrnrate)
experiment.set_name(autoname)
args.logdir = join(home, 'ckpt', autoname)
os.makedirs(args.logdir, exist_ok=True)
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# load data from file
maybe_download('https://www.dropbox.com/s/lu38zp3ixjpth9e/graph_data_cube.pkl?dl=0',
               'graph_data_cube.pkl', join(home, 'datasets'), filetype='file')
with gzip.open(join(home, 'datasets', 'graph_data_cube.pkl'), 'rb') as f:
    datacube = pickle.load(f)
ntime, nnode, nfeat = datacube.shape

# build tf graph
model = Model(args, ntime, nnode, nfeat)

# run optimizer on training data
model.fit(datacube)

# plot visualizations
model.plot(ending=True)
# download BERT multi-lingual
from utils import maybe_download
import os

BERT_MODELS_DIR = 'bert_models'
BERT_BASE_DIR = 'bert_models/multilingual_L-12_H-768_A-12'
BERT_MODEL_URL = 'https://storage.googleapis.com/bert_models/2018_11_03/'
BERT_BASE_MULTI_FILE = 'multilingual_L-12_H-768_A-12.zip'

if __name__ == '__main__':
    maybe_download(BERT_MODEL_URL, BERT_BASE_MULTI_FILE, BERT_MODELS_DIR)
    os.system("7z x bert_models/multilingual_L-12_H-768_A-12.zip -obert_models")