Example #1
from keras.datasets.data_utils import get_file
from scipy.io import wavfile


def load_speaker_set(speaker_set):

    filename_pattern = "%(speaker)s_%(digit)s%(repetition)s.wav"
    url_pattern = "http://www.ee.columbia.edu/~dpwe/sounds/tidigits/%(speaker)s/" + filename_pattern

    digits = ["O", "1", "2", "3", "4", "5", "6", "7", "8", "9", "Z"]

    repetitions = ["A", "B"]

    X = []
    y = []

    for speaker in speaker_set:
        for digit_ind, digit in enumerate(digits):
            for repetition in repetitions:

                filename = filename_pattern % locals()
                url = url_pattern % locals()

                print(url)

                path = get_file(filename, origin=url)

                _fq, samples = wavfile.read(path)

                # Check that there is only one channel.
                assert samples.ndim == 1

                X.append(samples)
                y.append(digit_ind)

    return X, y
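
# --- Usage sketch (not from the original source): the waveforms returned by
# load_speaker_set() have different lengths, so they are typically zero-padded
# into one fixed-size array before training. The random arrays below merely
# stand in for real TIDIGITS samples.
import numpy as np

waves = [np.random.randint(-32768, 32767, size=n, dtype=np.int16) for n in (8000, 12000, 9500)]
max_len = max(len(w) for w in waves)
padded = np.zeros((len(waves), max_len), dtype=np.int16)
for i, w in enumerate(waves):
    padded[i, :len(w)] = w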
Example #2
from keras.datasets.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM


def start(values):
    url = values["url"]
    path = "default"
    if url:
        path = get_file(path, origin=url)

    if values["path"]:
        path = values["path"]
    data = open(path).read().lower()

    maxlen = values["maxlen"]
    characters = set(data)
    sentences = build_sentences(data, maxlen)

    model = Sequential()
    model.add(LSTM(512, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(len(characters)))
    model.add(Activation('softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="adadelta")
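

# build_sentences() is not defined in this snippet. A minimal sketch of what it
# presumably does, following the semi-redundant sequence cutting used by the
# other char-RNN examples in this listing (the step size of 3 is an assumption;
# the real helper may also return the target characters):
def build_sentences(data, maxlen, step=3):
    # Cut the corpus into overlapping windows of maxlen characters.
    return [data[i:i + maxlen] for i in range(0, len(data) - maxlen, step)]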
Example #3
def loadText(path,
             origin="",
             vocsize=1000,
             maxlen=25,
             training_type=1,
             verbose=True):
    """
    type(path): string
    path      : path of text file to save to
    origin    : URL where text is 
    vocsize   : vocabulary size
    maxlen    : max size of one sentence
      
    Return:
    x_train, y_train, vocabulary

    eg: x,y,voc,i2w,w2i = loadData('pg11.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")

    """

    filesource = get_file(path, origin=origin)
    text = open(filesource).read()

    text = SENTENCE_START_TOKEN + text + SENTENCE_END_TOKEN

    if verbose:
        print('corpus length:', len(text))

    tokens = word_tokenize(text)
    word_freq = nltk.FreqDist(tokens)

    if verbose:
        print("Found %d unique words tokens." % len(word_freq.items()))

    vocab = word_freq.most_common(vocsize - 3)
    indices_word = [x[0] for x in vocab]
    indices_word.append(UNKNOWN_TOKEN)
    indices_word.append(SENTENCE_START_TOKEN)
    indices_word.append(SENTENCE_END_TOKEN)

    word_indices = dict([(w, i) for i, w in enumerate(indices_word)])

    # Map every token to its index in the vocabulary, replacing
    # out-of-vocabulary words with the unknown token, so the whole
    # text becomes a sequence of vocabulary indices.
    for i, word in enumerate(tokens):
        if word not in word_indices:
            word = UNKNOWN_TOKEN
        tokens[i] = word_indices[word]

    # Create the training data
    xx = np.asarray(tokens[:-1], dtype=np.int32)
    yy = np.asarray(tokens[1:], dtype=np.int32)

    return xx, yy, vocab, word_indices, indices_word
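
# Small illustration (toy indices, not from the original source) of the shape of
# the returned training data: yy is xx shifted left by one position, so each
# input word index is paired with the index of the word that follows it.
import numpy as np

toy_tokens = [5, 9, 2, 7, 3]
toy_x = np.asarray(toy_tokens[:-1], dtype=np.int32)  # [5, 9, 2, 7]
toy_y = np.asarray(toy_tokens[1:], dtype=np.int32)   # [9, 2, 7, 3]
assert (toy_y[:-1] == toy_x[1:]).all()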
Example #4
def load_speaker_set(speaker_set):

    filename_pattern = "%(speaker)s_%(digit)s%(repetition)s.wav"
    url_pattern = "http://www.ee.columbia.edu/~dpwe/sounds/tidigits/%(speaker)s/" + filename_pattern
    
    digits = [
        'O',
        '1',
        '2',
        '3',
        '4',
        '5',
        '6',
        '7',
        '8',
        '9',
        'Z']

    repetitions = ['A', 'B']

    X = []
    y = []
    
    for speaker in speaker_set:
        for digit_ind, digit in enumerate(digits): 
            for repetition in repetitions:
                       
                filename = filename_pattern % locals()
                url = url_pattern % locals()
            
                path = get_file(filename, origin=url)
                    
                _fq, samples = wavfile.read(path)

                # Check that there is only one channel.
                assert samples.ndim == 1
                
                X.append(samples)
                y.append(digit_ind)
                
    return X, y
Example #5
def load_data_2label():
    dirname = "cifar-100-python"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 10000
    nb_train_samples = 50000

    fpath = os.path.join(path, "train")
    X_train, coarseLabel_train, fineLabel_train = load_batch_2label(fpath)

    fpath = os.path.join(path, "test")
    X_test, coarseLabel_test, fineLabel_test = load_batch_2label(fpath)

    coarseLabel_train = np.reshape(coarseLabel_train, (len(coarseLabel_train), 1))
    fineLabel_train = np.reshape(fineLabel_train, (len(fineLabel_train), 1))

    coarseLabel_test = np.reshape(coarseLabel_test, (len(coarseLabel_test), 1))
    fineLabel_test = np.reshape(fineLabel_test, (len(fineLabel_test), 1))

    return (X_train, coarseLabel_train, fineLabel_train), (X_test, coarseLabel_test, fineLabel_test)
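
# load_batch_2label() is not shown above. A plausible sketch (Python 3 pickle
# API), assuming the standard CIFAR-100 batch layout with 'data',
# 'coarse_labels' and 'fine_labels' keys:
import pickle

def load_batch_2label(fpath):
    with open(fpath, 'rb') as f:
        batch = pickle.load(f, encoding='latin1')
    data = batch['data'].reshape(-1, 3, 32, 32)
    return data, batch['coarse_labels'], batch['fine_labels']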
Example #6
'''
Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function

from keras.datasets.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
import numpy as np
import random
import sys

path = get_file('nietzsche.txt',
                origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
Example #7
parser.add_argument('--max_epochs',default=50, type=int, help='number of full passes through the training data')
parser.add_argument('--nb_epochs',default=1, type=int, help='number of epochs per iteration')
parser.add_argument('--grad_clip',default=5, type=float, help='clip gradients at this value')
parser.add_argument('--seed',default=0, type=int, help='numpy manual random number generator seed')

args = parser.parse_args()

if args.seq_step is None:
    args.seq_step = args.seq_length

# writing outputs
filename = args.name+'_rnn'+str(args.rnn_size)+'_layers'+str(args.num_layers)+'_seqlen'+str(args.seq_length)+'_batch'+str(args.batch_size)+'_epochs'+str(args.max_epochs)+'_'+str(args.nb_epochs)
generated_text_file = open('output/'+filename+'.txt','w')


path = get_file(args.name, origin=args.url)
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.seq_step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
Example #8
        y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)

RNN = recurrent.GRU
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

path = get_file('babi-tasks-v1-2.tar.gz', origin='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)
# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))

vocab = sorted(reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, answer in train + test)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
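
# A tiny self-contained illustration (toy stories, not bAbI data) of how the
# reduce(...) line above unions the word sets of every (story, question, answer)
# triple into one sorted vocabulary:
from functools import reduce

toy = [(['Mary', 'went', 'home'], ['Where', 'is', 'Mary'], 'home'),
       (['John', 'slept'], ['Where', 'is', 'John'], 'bed')]
toy_vocab = sorted(reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, answer in toy)))
# ['John', 'Mary', 'Where', 'bed', 'home', 'is', 'slept', 'went']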
Example #9
def load_data(label_mode='fine'):
    dirname = "cifar-10-batches-py"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 10000
    nb_train_samples = 50000

    X_train2 = np.zeros((nb_train_samples, 3, 32, 32), dtype="uint8")
    y_train2 = np.zeros((nb_train_samples,), dtype="uint8")

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = cifar.load_batch(fpath)
        X_train2[(i-1)*10000:i*10000, :, :, :] = data
        y_train2[(i-1)*10000:i*10000] = labels

    fpath = os.path.join(path, 'test_batch')
    X_test2, y_test2 = cifar.load_batch(fpath)

    y_train2 = np.reshape(y_train2, (len(y_train2), 1))
    y_test2 = np.reshape(y_test2, (len(y_test2), 1))
    ################################################################
    if label_mode not in ['fine', 'coarse']:
        raise Exception('label_mode must be one of "fine", "coarse".')

    dirname = "cifar-100-python"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 500
    nb_train_samples = 2500

    fpath = os.path.join(path, 'train')
    X_train1, y_train1 = load_batch(fpath, label_key=label_mode+'_labels')

    fpath = os.path.join(path, 'test')
    X_test1, y_test1 = load_batch(fpath, label_key=label_mode+'_labels')

    y_train1 = np.reshape(y_train1, (len(y_train1), 1))
    y_test1 = np.reshape(y_test1, (len(y_test1), 1))

    #####################################################################
    print(type(X_train1))
    print(type(X_train2))
    X_train = X_train1.tolist() + X_train2.tolist()
    print("X_train transformation worked")
    X_test = X_test1.tolist() + X_test2.tolist()
    print("X_test transformation worked")
    X_test = np.asarray(X_test)
    print("X_test reversion worked")
    X_train = np.asarray(X_train)
    print("X_train reversion worked")
    print(type(y_test1))
    print(type(y_test2))
    y_test = y_test1.tolist() + y_test2.tolist()
    y_train = y_train1.tolist() + y_train2.tolist()
    y_test = np.asarray(y_test)
    y_train = np.asarray(y_train)

    nb_test_samples = len(X_test)
    print(nb_test_samples)
    nb_train_samples = len(X_train)
    print(nb_train_samples)
    return (X_train, y_train), (X_test, y_test)
Example #10
vocab, vocabcount = get_vocab(heads + desc)
empty = 0  # RNN mask of no data
eos = 1  # end of sentence
start_idx = eos + 1  # first real word


def get_idx(vocab, vocabcount):
    word2idx = dict((word, idx + start_idx) for idx, word in enumerate(vocab))
    word2idx['<empty>'] = empty
    word2idx['<eos>'] = eos

    idx2word = dict((idx, word) for word, idx in word2idx.items())

    return word2idx, idx2word


word2idx, idx2word = get_idx(vocab, vocabcount)
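
# Toy usage (made-up vocabulary, not from the source) showing that real words
# start at start_idx == 2, with 0 and 1 reserved for <empty> and <eos>:
toy_w2i, toy_i2w = get_idx(['the', 'cat', 'sat'], None)
# toy_w2i == {'the': 2, 'cat': 3, 'sat': 4, '<empty>': 0, '<eos>': 1}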
fname = 'glove.6B.%dd.txt' % embedding_dim
import os
import sys
from keras.datasets.data_utils import get_file
datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
if not os.access(datadir_base, os.W_OK):
    datadir_base = os.path.join('/tmp', '.keras')
datadir = os.path.join(datadir_base, 'datasets')
glove_name = os.path.join(datadir, fname)
if not os.path.exists(glove_name):
    path = 'glove.6B.zip'
    path = get_file(path, origin="http://nlp.stanford.edu/data/glove.6B.zip")
    # Unzip the downloaded archive so that glove_name exists afterwards.
    import zipfile
    with zipfile.ZipFile(path) as archive:
        archive.extractall(datadir)
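
# Not in the original snippet: the extracted GloVe file is typically parsed into
# a word -> vector dict (each line is a word followed by its float components).
# embedding_dim must match the file chosen above.
import numpy as np

embeddings_index = {}
with open(glove_name) as f:
    for line in f:
        parts = line.rstrip().split(' ')
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')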
Example #11
def old_one():
    path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    text = open(path).read().lower()
    return text
Example #12
# parser.add_argument('--print_every',default=1, type=int, help='how many steps/minibatches between printing out the loss')
# parser.add_argument('--eval_val_every',default=1000, type=int, help='every how many iterations should we evaluate on validation data?')
# parser.add_argument('--checkpoint_dir', default='cv', help='output directory where checkpoints get written')
# parser.add_argument('--savefile',default='lstm', help='filename to autosave the checkpoint to. Will be inside checkpoint_dir/')
# GPU/CPU
# parser.add_argument('--gpuid',default=0, help='which gpu to use. -1 = use CPU')
args = parser.parse_args()

if args.step is None:
    args.step = args.seq_length

print('step size is', args.step)

np.random.seed(args.seed) # for reproducibility

path = get_file('patriotAct.txt', origin="http://genekogan.com/txt/patriotAct.txt")
#path = get_file('marquez.txt', origin="http://pauladaunt.com/books/MARQUES,%20Gabriel%20Garcia%20-%20One%20Hundred%20Years%20of%20Solitude.txt")
text = open(path).read()#.lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
Example #13
def old_one():
    path = get_file(
        'nietzsche.txt',
        origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    text = open(path).read().lower()
    return text
Example #14
                    type=int,
                    help='numpy manual random number generator seed')

args = parser.parse_args()

if args.seq_step is None:
    args.seq_step = args.seq_length

# writing outputs
filename = args.name + '_rnn' + str(args.rnn_size) + '_layers' + str(
    args.num_layers) + '_seqlen' + str(args.seq_length) + '_batch' + str(
        args.batch_size) + '_epochs' + str(args.max_epochs) + '_' + str(
            args.nb_epochs)
generated_text_file = open('output/' + filename + '.txt', 'w')

path = get_file(args.name, origin=args.url)
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.seq_step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
Example #15
            new_sentence.extend(s)

        newX.append(new_sentence)

    print("maxlen", max(map(len, newX)))
    newX = pad_sequences(newX, maxlen=max(map(len, newX)), padding="pre")
    print(newX.shape)
    return newX


def getmaxsentencelength(X, word_idx):

    max_sentence = 0
    for i, story in enumerate(X):
        sentences = splitter([word_idx["."], word_idx["?"]], list(story))
        sentences = filter(lambda a: a != [], sentences)

        for sentence in sentences:
            sentence = np.array(sentence)
            #print (sentence[-1])
            s = sentence[sentence > 0]
            max_sentence = max(max_sentence, len(s))
    return max_sentence


path = get_file(
    'babi-tasks-v1-2.tar.gz',
    origin='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz'
)
tar = tarfile.open(path)
Example #16
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys

path = get_file('', origin="")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
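
# A likely continuation (not part of this excerpt), following the same one-hot
# vectorization used by the other char-RNN snippets in this listing: mark each
# character of every sentence and its target character.
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1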
Example #17
import random, sys

'''
    Example script to generate text from Nietzsche's writings.

    At least 20 epochs are required before the generated text
    starts sounding coherent.

    It is recommended to run this script on GPU, as recurrent
    networks are quite computationally intensive.

    If you try this script on new data, make sure your corpus 
    has at least ~100k characters. ~1M is better.
'''

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
Example #18
# parser.add_argument('--print_every',default=1, type=int, help='how many steps/minibatches between printing out the loss')
# parser.add_argument('--eval_val_every',default=1000, type=int, help='every how many iterations should we evaluate on validation data?')
# parser.add_argument('--checkpoint_dir', default='cv', help='output directory where checkpoints get written')
# parser.add_argument('--savefile',default='lstm', help='filename to autosave the checkpoint to. Will be inside checkpoint_dir/')
# GPU/CPU
# parser.add_argument('--gpuid',default=0, help='which gpu to use. -1 = use CPU')
args = parser.parse_args()

if args.step is None:
    args.step = args.seq_length

print('step size is', args.step)

np.random.seed(args.seed)  # for reproducibility

path = get_file('patriotAct.txt',
                origin="http://genekogan.com/txt/patriotAct.txt")
#path = get_file('marquez.txt', origin="http://pauladaunt.com/books/MARQUES,%20Gabriel%20Garcia%20-%20One%20Hundred%20Years%20of%20Solitude.txt")
text = open(path).read()  #.lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = args.seq_length
step = args.step
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):