def load_speaker_set(speaker_set):
    filename_pattern = "%(speaker)s_%(digit)s%(repetition)s.wav"
    url_pattern = ("http://www.ee.columbia.edu/~dpwe/sounds/tidigits/%(speaker)s/"
                   + filename_pattern)
    digits = ["O", "1", "2", "3", "4", "5", "6", "7", "8", "9", "Z"]
    repetitions = ["A", "B"]
    X = []
    y = []
    for speaker in speaker_set:
        for digit_ind, digit in enumerate(digits):
            for repetition in repetitions:
                filename = filename_pattern % locals()
                url = url_pattern % locals()
                print(url)
                path = get_file(filename, origin=url)
                _fq, samples = wavfile.read(path)
                # Check that there is only one channel.
                assert samples.ndim == 1
                X.append(samples)
                y.append(digit_ind)
    return X, y
def start(values):
    url = values["url"]
    path = "default"
    if url:
        # get_file downloads to the Keras cache and returns the local path.
        path = get_file(path, origin=url)
    if values["path"]:
        path = values["path"]
    data = open(path).read().lower()
    maxlen = values["maxlen"]
    characters = set(data)
    sentences = build_sentences(data, maxlen)
    model = Sequential()
    model.add(LSTM(512, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(len(characters)))
    model.add(Activation('softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="adadelta")
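# A minimal sketch (not part of the original snippet) of how the model compiled
# above might be trained. It assumes numpy is available as np and that
# build_sentences returns fixed-length character windows together with the
# character that follows each window; both names/shapes are assumptions.
import numpy as np

def train_on_sentences(model, sentences, next_chars, characters):
    char_indices = dict((c, i) for i, c in enumerate(sorted(characters)))
    X = np.zeros((len(sentences), len(sentences[0]), len(characters)), dtype=np.bool)
    y = np.zeros((len(sentences), len(characters)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    # nb_epoch is the epoch keyword used by Keras of this era.
    model.fit(X, y, batch_size=128, nb_epoch=1)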
def loadText(path, origin="", vocsize=1000, maxlen=25, training_type=1, verbose=True):
    """Load a text file and turn it into sequences of word indices.

    path    : path of the text file to save to
    origin  : URL where the text can be downloaded
    vocsize : vocabulary size
    maxlen  : maximum size of one sentence

    Returns: x_train, y_train, vocabulary, word_indices, indices_word

    e.g. x, y, voc, w2i, i2w = loadText('pg11.txt',
             origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
    """
    filesource = get_file(path, origin=origin)
    text = open(filesource).read()
    text = SENTENCE_START_TOKEN + text + SENTENCE_END_TOKEN
    if verbose:
        print('corpus length:', len(text))

    tokens = word_tokenize(text)
    word_freq = nltk.FreqDist(tokens)
    if verbose:
        print("Found %d unique words tokens." % len(word_freq.items()))

    # Keep the most frequent words; reserve three slots for the special tokens.
    vocab = word_freq.most_common(vocsize - 3)
    indices_word = [x[0] for x in vocab]
    indices_word.append(UNKNOWN_TOKEN)
    indices_word.append(SENTENCE_START_TOKEN)
    indices_word.append(SENTENCE_END_TOKEN)
    word_indices = dict([(w, i) for i, w in enumerate(indices_word)])

    # Replace out-of-vocabulary words with the unknown token.
    for i, word in enumerate(tokens):
        tokens[i] = word if word in word_indices else UNKNOWN_TOKEN
    # Now the whole text is indices of words in the vocabulary.
    for i, word in enumerate(tokens):
        tokens[i] = word_indices[word]

    # Create the training data: each target is the next word in the text.
    xx = np.asarray(tokens[:-1], dtype=np.int32)
    yy = np.asarray(tokens[1:], dtype=np.int32)

    return xx, yy, vocab, word_indices, indices_word
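# Hedged usage example for loadText above. The Gutenberg URL is the one given in
# the docstring; the vocsize value and the prints are illustrative only.
if __name__ == "__main__":
    x, y, voc, w2i, i2w = loadText('pg11.txt',
                                   origin="http://www.gutenberg.org/cache/epub/11/pg11.txt",
                                   vocsize=2000)
    print("sequence length:", len(x), "vocabulary:", len(i2w))
    print("first 10 tokens:", [i2w[i] for i in x[:10]])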
def load_speaker_set(speaker_set):
    filename_pattern = "%(speaker)s_%(digit)s%(repetition)s.wav"
    url_pattern = ("http://www.ee.columbia.edu/~dpwe/sounds/tidigits/%(speaker)s/"
                   + filename_pattern)
    digits = ['O', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'Z']
    repetitions = ['A', 'B']
    X = []
    y = []
    for speaker in speaker_set:
        for digit_ind, digit in enumerate(digits):
            for repetition in repetitions:
                filename = filename_pattern % locals()
                url = url_pattern % locals()
                path = get_file(filename, origin=url)
                _fq, samples = wavfile.read(path)
                # Check that there is only one channel.
                assert samples.ndim == 1
                X.append(samples)
                y.append(digit_ind)
    return X, y
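# Hedged usage sketch for load_speaker_set. The speaker IDs below are
# placeholders, not actual directory names on the server. Note that the returned
# waveforms have different lengths, so they would need padding or truncation
# before being batched into a model.
if __name__ == "__main__":
    speakers = ["speaker1", "speaker2"]  # hypothetical IDs; replace with real ones
    X, y = load_speaker_set(speakers)
    print("loaded %d utterances" % len(X))
    print("example lengths:", [len(s) for s in X[:5]])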
def load_data_2label():
    dirname = "cifar-100-python"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 10000
    nb_train_samples = 50000

    fpath = os.path.join(path, "train")
    X_train, coarseLabel_train, fineLabel_train = load_batch_2label(fpath)

    fpath = os.path.join(path, "test")
    X_test, coarseLabel_test, fineLabel_test = load_batch_2label(fpath)

    coarseLabel_train = np.reshape(coarseLabel_train, (len(coarseLabel_train), 1))
    fineLabel_train = np.reshape(fineLabel_train, (len(fineLabel_train), 1))
    coarseLabel_test = np.reshape(coarseLabel_test, (len(coarseLabel_test), 1))
    fineLabel_test = np.reshape(fineLabel_test, (len(fineLabel_test), 1))

    return (X_train, coarseLabel_train, fineLabel_train), (X_test, coarseLabel_test, fineLabel_test)
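# load_batch_2label is not defined in this snippet. A plausible sketch, assuming
# the standard CIFAR-100 python pickle layout (keys 'data', 'coarse_labels',
# 'fine_labels') and Python 2 cPickle as elsewhere in this code; np is assumed to
# be numpy, as in the function above.
def load_batch_2label(fpath):
    import cPickle
    with open(fpath, 'rb') as f:
        d = cPickle.load(f)
    # 'data' is stored as (N, 3072) uint8; reshape to (N, 3, 32, 32).
    data = d['data'].reshape(d['data'].shape[0], 3, 32, 32)
    return data, np.array(d['coarse_labels']), np.array(d['fine_labels'])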
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.datasets.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
import numpy as np
import random
import sys

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
parser.add_argument('--max_epochs', default=50, type=int,
                    help='number of full passes through the training data')
parser.add_argument('--nb_epochs', default=1, type=int,
                    help='number of epochs per iteration')
parser.add_argument('--grad_clip', default=5, type=float,
                    help='clip gradients at this value')
parser.add_argument('--seed', default=0, type=int,
                    help='numpy manual random number generator seed')

args = parser.parse_args()
if args.seq_step is None:
    args.seq_step = args.seq_length

# writing outputs
filename = (args.name + '_rnn' + str(args.rnn_size) + '_layers' + str(args.num_layers)
            + '_seqlen' + str(args.seq_length) + '_batch' + str(args.batch_size)
            + '_epochs' + str(args.max_epochs) + '_' + str(args.nb_epochs))
generated_text_file = open('output/' + filename + '.txt', 'w')

path = get_file(args.name, origin=args.url)
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.seq_step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
        y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))


RNN = recurrent.GRU
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(
    RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

path = get_file('babi-tasks-v1-2.tar.gz',
                origin='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)

# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))

vocab = sorted(reduce(lambda x, y: x | y,
                      (set(story + q + [answer]) for story, q, answer in train + test)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
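# A sketch of the step that typically follows in the Keras bAbI examples: build
# the word index (index 0 stays reserved for padding) and the maximum story and
# query lengths, which the vectorizing function whose tail appears above relies
# on. Variable names here follow that convention and are assumptions.
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))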
def load_data(label_mode='fine'):
    dirname = "cifar-10-batches-py"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 10000
    nb_train_samples = 50000

    X_train2 = np.zeros((nb_train_samples, 3, 32, 32), dtype="uint8")
    y_train2 = np.zeros((nb_train_samples,), dtype="uint8")
    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = cifar.load_batch(fpath)
        X_train2[(i - 1) * 10000:i * 10000, :, :, :] = data
        y_train2[(i - 1) * 10000:i * 10000] = labels

    fpath = os.path.join(path, 'test_batch')
    X_test2, y_test2 = cifar.load_batch(fpath)

    y_train2 = np.reshape(y_train2, (len(y_train2), 1))
    y_test2 = np.reshape(y_test2, (len(y_test2), 1))

    ################################################################
    if label_mode not in ['fine', 'coarse']:
        raise Exception('label_mode must be one of "fine" or "coarse".')

    dirname = "cifar-100-python"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 500
    nb_train_samples = 2500

    fpath = os.path.join(path, 'train')
    X_train1, y_train1 = load_batch(fpath, label_key=label_mode + '_labels')

    fpath = os.path.join(path, 'test')
    X_test1, y_test1 = load_batch(fpath, label_key=label_mode + '_labels')

    y_train1 = np.reshape(y_train1, (len(y_train1), 1))
    y_test1 = np.reshape(y_test1, (len(y_test1), 1))

    #####################################################################
    # Merge the CIFAR-100 and CIFAR-10 splits.
    print(type(X_train1))
    print(type(X_train2))
    X_train = X_train1.tolist() + X_train2.tolist()
    print("X_train transformation worked")
    X_test = X_test1.tolist() + X_test2.tolist()
    print("X_test transformation worked")
    X_test = np.asarray(X_test)
    print("X_test conversion worked")
    X_train = np.asarray(X_train)
    print("X_train conversion worked")

    print(type(y_test1))
    print(type(y_test2))
    y_test = y_test1.tolist() + y_test2.tolist()
    y_train = y_train1.tolist() + y_train2.tolist()
    y_test = np.asarray(y_test)
    y_train = np.asarray(y_train)

    nb_test_samples = len(X_test)
    print(nb_test_samples)
    nb_train_samples = len(X_train)
    print(nb_train_samples)

    return (X_train, y_train), (X_test, y_test)
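# The tolist()/asarray() round trip above works but is slow and memory-hungry for
# ~60k images. An alternative sketch (not the original author's code) that does
# the same merge directly in NumPy along the sample axis:
def merge_splits(a, b):
    # Concatenate two (N, ...) arrays of samples into one array.
    return np.concatenate([a, b], axis=0)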
vocab, vocabcount = get_vocab(heads + desc)

empty = 0             # RNN mask of no data
eos = 1               # end of sentence
start_idx = eos + 1   # first real word


def get_idx(vocab, vocabcount):
    word2idx = dict((word, idx + start_idx) for idx, word in enumerate(vocab))
    word2idx['<empty>'] = empty
    word2idx['<eos>'] = eos
    idx2word = dict((idx, word) for word, idx in word2idx.iteritems())
    return word2idx, idx2word


word2idx, idx2word = get_idx(vocab, vocabcount)

fname = 'glove.6B.%dd.txt' % embedding_dim

import os
import sys
from keras.datasets.data_utils import get_file

datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
if not os.access(datadir_base, os.W_OK):
    datadir_base = os.path.join('/tmp', '.keras')
datadir = os.path.join(datadir_base, 'datasets')
glove_name = os.path.join(datadir, fname)
if not os.path.exists(glove_name):
    path = 'glove.6B.zip'
    path = get_file(path, origin="http://nlp.stanford.edu/data/glove.6B.zip")
    # unzip {datadir}/{path}
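# The "# unzip" comment above is left as a TODO in the original. A hedged sketch
# of what it might do: extract the requested embedding file from glove.6B.zip into
# the datasets directory and read it into a {word: vector} dict (each line of the
# GloVe text files is a token followed by its float components).
import zipfile
import numpy as np

def load_glove(zip_path, datadir, fname):
    with zipfile.ZipFile(zip_path) as zf:
        zf.extract(fname, datadir)
    embeddings = {}
    with open(os.path.join(datadir, fname)) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings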
def old_one():
    path = get_file('nietzsche.txt',
                    origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    text = open(path).read().lower()
    return text
# parser.add_argument('--print_every', default=1, type=int, help='how many steps/minibatches between printing out the loss')
# parser.add_argument('--eval_val_every', default=1000, type=int, help='every how many iterations should we evaluate on validation data?')
# parser.add_argument('--checkpoint_dir', default='cv', help='output directory where checkpoints get written')
# parser.add_argument('--savefile', default='lstm', help='filename to autosave the checkpoint to. Will be inside checkpoint_dir/')
# GPU/CPU
# parser.add_argument('--gpuid', default=0, help='which gpu to use. -1 = use CPU')

args = parser.parse_args()
if args.step is None:
    args.step = args.seq_length
print('step size is', args.step)

np.random.seed(args.seed)  # for reproducibility

path = get_file('patriotAct.txt', origin="http://genekogan.com/txt/patriotAct.txt")
#path = get_file('marquez.txt', origin="http://pauladaunt.com/books/MARQUES,%20Gabriel%20Garcia%20-%20One%20Hundred%20Years%20of%20Solitude.txt")
text = open(path).read()  # .lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
def old_one():
    path = get_file('nietzsche.txt',
                    origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    text = open(path).read().lower()
    return text
                    type=int, help='numpy manual random number generator seed')

args = parser.parse_args()
if args.seq_step is None:
    args.seq_step = args.seq_length

# writing outputs
filename = args.name + '_rnn' + str(args.rnn_size) + '_layers' + str(
    args.num_layers) + '_seqlen' + str(args.seq_length) + '_batch' + str(
        args.batch_size) + '_epochs' + str(args.max_epochs) + '_' + str(
            args.nb_epochs)
generated_text_file = open('output/' + filename + '.txt', 'w')

path = get_file(args.name, origin=args.url)
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.seq_step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
            new_sentence.extend(s)
        newX.append(new_sentence)
    print("maxlen", max(map(len, newX)))
    newX = pad_sequences(newX, maxlen=max(map(len, newX)), padding="pre")
    print(newX.shape)
    return newX


def getmaxsentencelength(X, word_idx):
    max_sentence = 0
    for i, story in enumerate(X):
        sentences = splitter([word_idx["."], word_idx["?"]], list(story))
        sentences = filter(lambda a: a != [], sentences)
        for sentence in sentences:
            sentence = np.array(sentence)
            # print(sentence[-1])
            s = sentence[sentence > 0]
            max_sentence = max(max_sentence, len(s))
    return max_sentence


path = get_file('babi-tasks-v1-2.tar.gz',
                origin='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)
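# splitter() is used above but not defined in this snippet. A plausible sketch
# (an assumption, not the original helper): split a flat list of word indices
# into sublists at any of the given delimiter tokens, keeping each delimiter at
# the end of its sentence. Empty trailing pieces are filtered out by the caller.
def splitter(delimiters, tokens):
    sentences, current = [], []
    for tok in tokens:
        current.append(tok)
        if tok in delimiters:
            sentences.append(current)
            current = []
    sentences.append(current)
    return sentences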
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys

path = get_file('', origin="")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
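# The snippet stops right after allocating X and y. The one-hot fill loop that
# usually follows (as in the standard Keras lstm_text_generation example) is
# sketched here for completeness:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1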
import random, sys

'''
Example script to generate text from Nietzsche's writings.

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
# parser.add_argument('--print_every', default=1, type=int, help='how many steps/minibatches between printing out the loss')
# parser.add_argument('--eval_val_every', default=1000, type=int, help='every how many iterations should we evaluate on validation data?')
# parser.add_argument('--checkpoint_dir', default='cv', help='output directory where checkpoints get written')
# parser.add_argument('--savefile', default='lstm', help='filename to autosave the checkpoint to. Will be inside checkpoint_dir/')
# GPU/CPU
# parser.add_argument('--gpuid', default=0, help='which gpu to use. -1 = use CPU')

args = parser.parse_args()
if args.step is None:
    args.step = args.seq_length
print('step size is', args.step)

np.random.seed(args.seed)  # for reproducibility

path = get_file('patriotAct.txt', origin="http://genekogan.com/txt/patriotAct.txt")
#path = get_file('marquez.txt', origin="http://pauladaunt.com/books/MARQUES,%20Gabriel%20Garcia%20-%20One%20Hundred%20Years%20of%20Solitude.txt")
text = open(path).read()  # .lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
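# Character-level scripts like the ones above typically pair the trained model
# with a sampling helper that re-weights the softmax output by a temperature
# before drawing the next character. A sketch of that helper (names are the
# conventional ones, not taken from this snippet; np is numpy, as above):
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    # Draw one character index from the re-weighted distribution.
    return np.argmax(np.random.multinomial(1, preds, 1))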