Beispiel #1
0
def augment(V, crop, grayscale=False):
    """Cut a randomly positioned crop x crop window out of the volume V.

    The output is always square (crop x crop). When crop already equals
    V.sx the input volume itself is returned untouched and the grayscale
    flag is not consulted.

    V         -- source volume; read through V.sx, V.sy, V.depth and V.get
    crop      -- side length of the square window
    grayscale -- when true, return a depth-1 volume that holds only
                 channel 0 of the crop
    """
    if V.sx == crop:
        return V

    # random offset of the window inside the input volume
    off_x = randi(0, V.sx - crop)
    off_y = randi(0, V.sy - crop)

    cropped = Vol(crop, crop, V.depth)
    for col in xrange(crop):
        src_x = col + off_x
        for row in xrange(crop):
            src_y = row + off_y
            # positions that fall outside the input are skipped and keep
            # whatever value Vol() gave them
            if 0 <= src_x < V.sx and 0 <= src_y < V.sy:
                for channel in xrange(V.depth):
                    cropped.set(col, row, channel,
                                V.get(src_x, src_y, channel))

    if not grayscale:
        return cropped

    # flatten into a depth=1 volume by copying channel 0 only
    gray = Vol(crop, crop, 1, 0.0)
    for col in xrange(crop):
        for row in xrange(crop):
            gray.set(col, row, 0, cropped.get(col, row, 0))
    return gray
def makeTerrainData(n_points=1000):
    """Build the toy 'terrain' dataset and append it to the global lists.

    Generates n_points samples with two features (grade, bumpy) and a
    0/1 label, seeds the random module with 42 so the data is repeatable,
    and splits 75% / 25% into training_data / testing_data. Each stored
    entry is a (Vol(features), int(label)) tuple.

    Adapted from:
    https://github.com/udacity/ud120-projects/blob/master/choose_your_own/prep_terrain_data.py
    """
    global training_data, testing_data

    # make the toy dataset; the three feature lists are drawn one after
    # another so the random stream is consumed in a fixed order
    random.seed(42)
    indices = range(0, n_points)
    grade = [random.random() for _ in indices]
    bumpy = [random.random() for _ in indices]
    error = [random.random() for _ in indices]

    labels = []
    for g, b, e in zip(grade, bumpy, error):
        if g > 0.8 or b > 0.8:
            # a large value in either feature forces the positive class
            labels.append(1.0)
        else:
            labels.append(round(g * b + 0.3 + 0.1 * e))

    # split into train/test sets
    points = [[g, b] for g, b in zip(grade, bumpy)]
    cut = int(0.75 * n_points)

    for features, label in zip(points[:cut], labels[:cut]):
        training_data.append((Vol(features), int(label)))
    for features, label in zip(points[cut:], labels[cut:]):
        testing_data.append((Vol(features), int(label)))
Beispiel #3
0
    def __init__(self, opt={}):
        """Configure a convolution layer from the option dict opt.

        Required keys (looked up directly, so a missing one fails on the
        lookup): 'filters' (stored as out_depth), 'sx' (filter width),
        'in_depth', 'in_sx', 'in_sy'.

        Optional keys, read through getopt() with the fallback shown:
        'sy' (self.sx), 'stride' (1), 'pad' (0), 'l1_decay_mul' (0.0),
        'l2_decay_mul' (1.0), 'bias_pref' (0.0).
        """
        self.out_depth = opt['filters']
        self.sx = opt['sx']  # filter size: should be odd if possible
        self.in_depth = opt['in_depth']
        self.in_sx = opt['in_sx']
        self.in_sy = opt['in_sy']

        # optional
        self.sy = getopt(opt, 'sy', self.sx)
        # stride at which we apply filters to input volume
        self.stride = getopt(opt, 'stride', 1)
        self.pad = getopt(opt, 'pad', 0)  # padding to borders of input volume
        self.l1_decay_mul = getopt(opt, 'l1_decay_mul', 0.0)
        self.l2_decay_mul = getopt(opt, 'l2_decay_mul', 1.0)

        # Note we are doing floor, so if the strided convolution of the
        # filter doesn't fit into the input volume exactly, the output
        # volume will be trimmed and not contain the (incomplete) computed
        # final application.
        self.out_sx = int(
            floor((self.in_sx - self.sx + 2 * self.pad) / self.stride + 1))
        self.out_sy = int(
            floor((self.in_sy - self.sy + 2 * self.pad) / self.stride + 1))
        self.layer_type = 'conv'

        bias = getopt(opt, 'bias_pref', 0.0)
        # Each filter spans sx by sy; the height must honour 'sy' so that
        # non-square filters match the out_sy computed above.
        self.filters = [
            Vol(self.sx, self.sy, self.in_depth)
            for i in xrange(self.out_depth)
        ]
        self.biases = Vol(1, 1, self.out_depth, bias)
Beispiel #4
0
 def fromJSON(self, json):
     """Restore this layer's attributes from the mapping json.

     Scalar settings are copied over by key; every entry of
     json['filters'] and json['biases'] is turned back into a volume via
     Vol(0, 0, 0, 0).fromJSON(...), whose return value is what gets
     stored. A missing key fails on the lookup.
     """
     scalar_fields = ('out_depth', 'out_sx', 'out_sy', 'layer_type',
                      'num_inputs', 'l1_decay_mul', 'l2_decay_mul')
     for field in scalar_fields:
         setattr(self, field, json[field])

     restored = []
     for serialized in json['filters']:
         restored.append(Vol(0, 0, 0, 0).fromJSON(serialized))
     self.filters = restored
     self.biases = Vol(0, 0, 0, 0).fromJSON(json['biases'])
Beispiel #5
0
def _titanic_features(pclass, sex, age, sibsp, parch, fare, embarked):
    """Encode the raw CSV fields of one passenger as a list of numbers."""
    port = embarked.replace('\r', '')
    if port == 'C':
        emb = 0.0
    elif port == 'Q':
        emb = 1.0
    else:
        emb = 2.0

    return [
        float(pclass),
        0.0 if sex == 'male' else 1.0,
        0 if age == '' else float(age),  # missing age is encoded as 0
        float(sibsp),
        float(parch),
        0 if fare == '' else float(fare),  # missing fare is encoded as 0
        emb,
    ]


def _titanic_rows(path):
    """Return the comma-split data rows of the CSV at path, header dropped."""
    with open(path) as infile:
        lines = infile.read().split('\n')[1:]
    # a trailing newline yields an empty last "line"; skip blanks so the
    # fixed-width tuple unpacking in load_data() does not fail on it
    return [line.split(',') for line in lines if line.strip()]


def load_data():
    """Load the Titanic train/test CSVs into the global data lists.

    training_data receives (Vol(features), int(Survived)) tuples and
    testing_data receives bare Vol(features) objects. The feature vector
    is [Pclass, sex, age, SibSp, Parch, Fare, embarked-code].

    Rows are split on every comma, so the passenger name is expected to
    contribute exactly two fields (the part before and after its comma).
    """
    global training_data, testing_data

    for row in _titanic_rows('./data/titanic-kaggle/train.csv'):
        (_passenger_id, survived, pclass, _name, _name_rest, sex, age,
         sibsp, parch, _ticket, fare, _cabin, embarked) = row
        vec = _titanic_features(pclass, sex, age, sibsp, parch, fare,
                                embarked)
        training_data.append((Vol(vec), int(survived)))

    for row in _titanic_rows('./data/titanic-kaggle/test.csv'):
        (_passenger_id, pclass, _name, _name_rest, sex, age,
         sibsp, parch, _ticket, fare, _cabin, embarked) = row
        vec = _titanic_features(pclass, sex, age, sibsp, parch, fare,
                                embarked)
        testing_data.append(Vol(vec))

    print('Data loaded...')
Beispiel #6
0
    def __init__(self, opt={}):
        """Configure a 'sim' layer from the option dict opt.

        Reads 'num_neurons' (stored as out_depth) and the input geometry
        'in_sx', 'in_sy', 'in_depth' directly from opt; 'l1_decay_mul',
        'l2_decay_mul' and 'bias_pref' go through getopt() with fallbacks
        0.0, 1.0 and 0.0. One 1x1xnum_inputs weight volume is created per
        neuron, plus a single bias volume.
        """
        self.out_depth = opt['num_neurons']
        self.l1_decay_mul = getopt(opt, 'l1_decay_mul', 0.0)
        self.l2_decay_mul = getopt(opt, 'l2_decay_mul', 1.0)

        # the input volume is treated as one flat vector
        in_sx, in_sy, in_depth = opt['in_sx'], opt['in_sy'], opt['in_depth']
        self.num_inputs = in_sx * in_sy * in_depth
        self.out_sx = self.out_sy = 1
        self.layer_type = 'sim'

        initial_bias = getopt(opt, 'bias_pref', 0.0)
        weights = []
        for _ in xrange(self.out_depth):
            weights.append(Vol(1, 1, self.num_inputs))
        self.filters = weights
        self.biases = Vol(1, 1, self.out_depth, initial_bias)
Beispiel #7
0
 def fromJSON(self, json):
     """Restore this layer's attributes from the mapping json.

     Filter geometry, stride, padding, depths, output size, layer type
     and decay multipliers are copied over by key; json['filters'] and
     json['biases'] are turned back into volumes via
     Vol(0, 0, 0, 0).fromJSON(...), whose return value is what gets
     stored. A missing key fails on the lookup.
     """
     scalar_fields = ('sx', 'sy', 'stride', 'in_depth', 'out_depth',
                      'out_sx', 'out_sy', 'layer_type', 'l1_decay_mul',
                      'l2_decay_mul', 'pad')
     for field in scalar_fields:
         setattr(self, field, json[field])

     restored = []
     for serialized in json['filters']:
         restored.append(Vol(0, 0, 0, 0).fromJSON(serialized))
     self.filters = restored
     self.biases = Vol(0, 0, 0, 0).fromJSON(json['biases'])
Beispiel #8
0
def fill():
    """Predict a sentiment for every test phrase and write a CSV.

    Reads ./data/sentiment-kaggle/test.tsv (header line skipped), takes
    the phrase id from column 0 and the phrase from column 2, averages
    the 80-dimensional embeddings of the phrase's known words (all zeros
    when no word is known), runs the global network on that vector and
    writes 'PhraseId,Sentiment' rows to ./data/sentiment-kaggle/out1.csv.
    The written sentiment is network.getPrediction() + 1.

    Lines with fewer than three tab-separated columns, and phrases whose
    embedding vectors are shorter than 80 entries, are skipped.
    """
    global embeddings

    with open('./data/sentiment-kaggle/test.tsv') as infile:
        raw = infile.read().split('\n')[1:]

    rows = ['PhraseId,Sentiment\n']
    for idx, line in enumerate(raw):
        values = line.split('\t')
        if len(values) < 3:
            # blank or truncated line
            continue
        phrase_id = values[0]
        phrase = values[2]

        vectors = [embeddings[word] for word in phrase.split()
                   if word in embeddings]
        try:
            if vectors:
                count = float(len(vectors))
                avgs = [sum(vec[n] for vec in vectors) / count
                        for n in range(80)]
            else:
                avgs = [0.0] * 80
        except IndexError:
            # an embedding with fewer than 80 components
            continue

        network.forward(Vol(avgs))
        rows.append('{},{}\n'.format(phrase_id, network.getPrediction() + 1))

        print(idx)

    with open('./data/sentiment-kaggle/out1.csv', 'w') as outfile:
        outfile.write(''.join(rows))

    print('Done')
def volumize(dist):
    """Turn a frequency distribution into a 1x1xN input volume.

    Entry i of the returned volume's weights is dist.freq() of the i-th
    entry of the global words list; N and words are module globals.
    """
    global words

    volume = Vol(1, 1, N, 0.0)
    frequencies = (dist.freq(term) for term in words)
    for position, frequency in enumerate(frequencies):
        volume.w[position] = frequency
    return volume
Beispiel #10
0
    def forward(self, V, is_training):
        """Maxout forward pass: each output is the max over one input group.

        Output channel i is the largest value among the group_size input
        channels starting at i * group_size (first one wins on ties). The
        input index of every winner is recorded in self.switches so that
        it can be looked up later.

        V           -- input volume
        is_training -- unused here
        Returns the output volume, also stored as self.out_act.
        """
        self.in_act = V
        N = self.out_depth
        group_size = self.group_size
        V2 = Vol(self.out_sx, self.out_sy, self.out_depth, 0.0)

        if self.out_sx == 1 and self.out_sy == 1:
            # optimization branch: the input is a plain vector
            for i in xrange(N):
                offset = i * group_size
                # look at this group only, not at the whole tail of V.w
                group = V.w[offset:offset + group_size]
                m = max(group)
                V2.w[i] = m
                self.switches[i] = offset + group.index(m)
        else:
            # one switch per output element, filled in visiting order
            # (assumes self.switches has out_sx * out_sy * out_depth slots
            # -- TODO confirm against the constructor)
            switch_counter = 0
            for x in xrange(V.sx):
                for y in xrange(V.sy):
                    for i in xrange(N):
                        ix = i * group_size
                        elem = V.get(x, y, ix)
                        elem_i = 0
                        for j in range(1, group_size):
                            elem2 = V.get(x, y, ix + j)
                            if elem2 > elem:
                                elem = elem2
                                elem_i = j
                        V2.set(x, y, i, elem)
                        self.switches[switch_counter] = ix + elem_i
                        switch_counter += 1

        self.out_act = V2
        return self.out_act
Beispiel #11
0
    def forward(self, V, in_training):
        """Forward pass of the similarity layer.

        For every neuron i the output is
            (dot(filter_i, V) + bias_i) / (|V| * |filter_i|)
        i.e. a cosine-style similarity with the bias added to the
        numerator. When either norm is zero the output is 0 instead of
        dividing by zero.

        V           -- input volume; only V.w is read
        in_training -- unused here
        Returns the 1x1xout_depth output volume, also stored as
        self.out_act.
        """
        self.in_act = V
        A = Vol(1, 1, self.out_depth, 0.0)
        Vw = V.w

        def norm(vec):
            return sqrt(sum(c * c for c in vec))

        normv = norm(Vw)

        # compute cos sim between V and filters
        for i in xrange(self.out_depth):
            fiw = self.filters[i].w
            sum_a = 0.0
            for d in xrange(self.num_inputs):
                sum_a += Vw[d] * fiw[d]
            sum_a += self.biases.w[i]  # dot(W, v) + b

            denom = normv * norm(fiw)
            # explicit zero check instead of catching every exception
            A.w[i] = sum_a / denom if denom else 0

        self.out_act = A
        return self.out_act
Beispiel #12
0
def test():
    """Print up to five 'topics' of the Gettysburg Address.

    The address is tokenized, lower-cased and filtered against
    ENGLISH_STOP_WORDS and punctuation; the relative frequency of every
    entry of the global words list is fed through the global network.
    Words are then visited from the highest to the lowest output
    activation, and the first five that occur among the address tokens
    (compared as tokenized, i.e. case-sensitively) are printed.

    Fewer than five topics are printed when not enough words match.
    """
    global N, words, network

    print('In testing.')

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w

    # Rank the output indices by activation instead of deleting entries
    # from pred: deleting shifts the remaining indices so they no longer
    # line up with words, and it also mutates the network's own output.
    # The sort is stable, so equal activations keep their original order.
    ranked = sorted(range(len(pred)), key=lambda idx: pred[idx], reverse=True)
    address_tokens = set(gettysburg_tokens)

    topics = []
    for topic_idx in ranked:
        topic = words[topic_idx]
        if topic in address_tokens:
            topics.append(topic)
            if len(topics) == 5:
                break

    print('Topics of the Gettysburg Address:')
    print(topics)
Beispiel #13
0
def load_data(train=True):
    """Sample N character triples from ./data/big.txt as training pairs.

    A random window of 3 * N characters is cut from the text and read in
    consecutive groups of three: the first two characters become the
    input (their ord() positions are set to 1.0 in a 1x1x255 volume) and
    ord() of the third is the label. The global frequencies table is
    incremented for both input characters, and normalize() is called
    once at the end.

    train -- unused here
    Returns a list of (Vol, label) tuples.
    """
    global N, frequencies

    with open('./data/big.txt', 'r') as infile:
        text = infile.read()

    skip = 3
    size = skip * N
    start = randint(0, len(text) - size)
    window = text[start:start + size]

    data = []
    offset = 0
    while offset < len(window):
        first, second, target = window[offset:offset + skip]
        offset += skip

        code1, code2 = ord(first), ord(second)
        for code in (code1, code2):
            frequencies[code] += 1

        volume = Vol(1, 1, 255, 0.0)
        for code in (code1, code2):
            volume.w[code] = 1.0
        data.append((volume, ord(target)))

    normalize()

    return data
def load_data(training=True):
    """Load MNIST images and labels from the IDX files under ./data.

    Adapted from http://g.sweyla.com/blog/2012/mnist-numpy/

    training -- True reads the 'train-*' files, False the 't10k-*' files
    Returns zip(inputs, labels): each input is the result of
    augment(V, 24) on a 28x28x1 volume whose pixels are scaled to
    [0, 1] by dividing the unsigned bytes by 255.0.
    """
    path = './data'

    if training:
        fname_img = os.path.join(path, 'train-images-idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels-idx1-ubyte')
    else:
        fname_img = os.path.join(path, 't10k-images-idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte')

    # Inputs: a 16-byte header of four big-endian uint32 values, then the
    # raw pixel bytes. 'with' closes the file even if parsing fails.
    with open(fname_img, 'rb') as fimg:
        _magic_nr, _size, _rows, _cols = struct.unpack(">IIII", fimg.read(16))
        imgs = pyarray("B", fimg.read())

    imgs = [imgs[n:n + 784] for n in xrange(0, len(imgs), 784)]
    inputs = []
    # V is reused as scratch space: its weights are replaced per image
    # and augment() hands back a separate 24x24 crop each time
    V = Vol(28, 28, 1, 0.0)
    for img in imgs:
        V.w = [(px / 255.0) for px in img]
        inputs.append(augment(V, 24))

    # Outputs: an 8-byte header of two big-endian uint32 values, then one
    # byte per label
    with open(fname_lbl, 'rb') as flbl:
        _magic_nr, _size = struct.unpack(">II", flbl.read(8))
        labels = pyarray("b", flbl.read())

    return zip(inputs, labels)
Beispiel #15
0
def load_data():
    """Build next-word training pairs from tokens 100..999 of the corpus.

    All corpus words are joined and re-tokenized with the \\w+ pattern;
    the slice [100:1000] of that token stream is stored in the global
    words, and the global N is set to the number of distinct tokens in
    it. Every 4-gram (w1, w2, w3, pred) becomes one sample: a 1x1xN
    volume with the positions of w1, w2 and w3 set to 1, labelled with
    the position of pred.

    Returns a list of (Vol, label) tuples.
    """
    global N, words

    raw = [word for fileid in corpus.fileids()
           for word in corpus.words(fileid)]
    words = list(RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens_l = list(set(words))
    N = len(tokens_l)
    print('Corpus size: {} words'.format(N))

    # position of every distinct token; replaces four linear
    # list.index() scans per n-gram with constant-time lookups
    index_of = {token: i for i, token in enumerate(tokens_l)}

    step = 4
    data = []
    for w1, w2, w3, pred in ngrams(words, step):
        V = Vol(1, 1, N, 0.0)
        for context_word in (w1, w2, w3):
            V.w[index_of[context_word]] = 1
        data.append((V, index_of[pred]))

    return data
Beispiel #16
0
    def forward(self, V, is_training):
        """Forward pass producing a running log-mean-exp of the input.

        Output entry i is log(mean(exp(V.w[0]), ..., exp(V.w[i]))) divided
        by self.zeta, computed for as many entries as the output volume
        holds.

        V           -- input volume; only V.w is read
        is_training -- unused here
        Returns the output volume, also stored as self.out_act.
        """
        self.in_act = V
        out = Vol(self.out_sx, self.out_sy, self.out_depth, 0.0)

        running_exp_sum = 0.0
        for position in xrange(len(out.w)):
            running_exp_sum += exp(V.w[position])
            seen = position + 1
            mean_exp = running_exp_sum / seen
            out.w[position] = log(mean_exp) / self.zeta

        self.out_act = out
        return self.out_act
Beispiel #17
0
    def policy(self, s):
        """
        compute the value of doing any action in this state
        and return the argmax action and its value

        s -- state, wrapped in a Vol before being fed to self.value_net
        Returns {'action': index of the largest output,
                 'value': that largest output}; the first index wins
        on ties.
        """

        V = Vol(s)
        action_values = self.value_net.forward(V)
        weights = action_values.w
        max_val = max(weights)
        # look up max_val itself; the former name 'maxval' was undefined
        # and raised NameError on every call
        max_k = weights.index(max_val)
        return {'action': max_k, 'value': max_val}
Beispiel #18
0
def load_data():
    """Load the iris dataset into the global iris_data.

    Every row of load_iris().data is wrapped in a Vol and zipped with
    the matching entry of load_iris().target.
    """
    global iris_data

    dataset = load_iris()

    inputs = []
    for row in dataset.data:
        inputs.append(Vol(list(row)))

    iris_data = zip(inputs, list(dataset.target))
    print('Data loaded...')
def train():
    global training_data, n, t, training_data2

    print 'In training...'
    print 'k', 'time\t\t  ', 'loss\t  '
    print '----------------------------------------------------'
    training_data2 = []
    try:
        for x, y in training_data:
            stats = t.train(x, x.w)
            print stats['k'], stats['time'], stats['loss']
            training_data2.append((Vol(n.forward(x).w), y))
    except:  #hit control-c or other
        return
Beispiel #20
0
    def forward(self, V, is_training):
        """Forward pass that adds self.delta to part of the input.

        The first self.skip entries of V.w are copied unchanged; each
        following entry n gets self.delta[n - self.skip] added. The loop
        stops as soon as self.num_neurons additions have been made, so
        entries after that point keep the value the output volume was
        created with.

        V           -- input volume; only V.w is read
        is_training -- unused here
        Returns the 1x1xnum_inputs output volume, also stored as
        self.out_act.
        """
        self.in_act = V

        out = Vol(1, 1, self.num_inputs, 0.0)
        shifted = 0
        for idx in xrange(self.num_inputs):
            value = V.w[idx]
            if idx >= self.skip:
                value = value + self.delta[idx - self.skip]
                shifted += 1
            out.w[idx] = value
            if shifted == self.num_neurons:
                break

        self.out_act = out
        return self.out_act
Beispiel #21
0
    def forward(self, V, in_training):
        """Fully connected forward pass: output i = dot(filter_i, V) + b_i.

        V           -- input volume; its first num_inputs weights are read
        in_training -- unused here
        Returns the 1x1xout_depth output volume, also stored as
        self.out_act.
        """
        self.in_act = V
        result = Vol(1, 1, self.out_depth, 0.0)
        inputs = V.w

        for neuron in xrange(self.out_depth):
            weights = self.filters[neuron].w
            # dot(W, x), accumulated left to right
            total = 0.0
            feature = 0
            while feature < self.num_inputs:
                total += inputs[feature] * weights[feature]
                feature += 1
            # ... + b
            total += self.biases.w[neuron]
            result.w[neuron] = total

        self.out_act = result
        return self.out_act
Beispiel #22
0
def generate(n, train=True):
    """Generate a two-cluster toy dataset of n // 2 points per class.

    Class 1 points are (2 + randint(-1, 1), 7 + randint(-1, 1)) in both
    modes. Class 0 points are (7.5 + randint(-2, 2), 2 + randint(-1, 1))
    when train is true and (7 + randint(-1, 1), 2 + randint(-1, 1))
    otherwise. All class 1 points are generated before the class 0 ones.

    Returns a list of (Vol(point), label) tuples.
    """
    # explicit floor division: same result as the old 'n / 2' for ints,
    # but does not depend on Python 2 division semantics
    per_class = n // 2

    if train:
        x_center, x_spread = 7.5, 2
    else:
        x_center, x_spread = 7, 1

    data = []
    for _ in range(per_class):
        point = (2 + randint(-1, 1), 7 + randint(-1, 1))
        data.append((point, 1))
    for _ in range(per_class):
        point = (x_center + randint(-x_spread, x_spread),
                 2 + randint(-1, 1))
        data.append((point, 0))

    return [(Vol(x), label) for x, label in data]
def test_text(text, ngenerate=10, delete=True):
    """Generate ngenerate words that continue text, one word at a time.

    On every step the embeddings of all words in text are concatenated
    and fed to the global network. With probability 0.5 the next word is
    tokens_l[network.getPrediction()], otherwise it is drawn with
    weightedSample(). The new word is appended to the context and the
    oldest context word is dropped.

    text      -- whitespace separated seed words
    ngenerate -- number of words to generate
    delete    -- unused here
    Returns the generated words, each preceded by a space, or the string
    'word: <w> not in corpus' as soon as a context word has no embedding.
    """
    generated = []
    for _ in xrange(ngenerate):
        context = text.split()

        features = []
        for word in context:
            if word not in embeddings:
                return 'word: {} not in corpus'.format(word)
            features.extend(embeddings[word])

        output = network.forward(Vol(features)).w
        pred = network.getPrediction()

        if random() < 0.5:
            new = tokens_l[pred]
        else:
            # NOTE(review): the candidates are embeddings.keys() while the
            # weights are the network outputs -- confirm both have the
            # same length and ordering
            new = weightedSample(embeddings.keys(), output)

        generated.append(' ' + new)
        text = ' '.join(context[1:] + [new])
    return ''.join(generated)
Beispiel #24
0
    def forward(self, V, is_training):
        """Softmax forward pass over the weights of V.

        The largest activation is subtracted before exponentiating so the
        exponentials cannot blow up; the results are divided by their sum.
        The normalized list is stored both as self.es and as the weights
        of the returned volume (the same list object).

        V           -- input volume; only V.w is read
        is_training -- unused here
        Returns the 1x1xout_depth output volume, also stored as
        self.out_act.
        """
        self.in_act = V
        out = Vol(1, 1, self.out_depth, 0.0)

        # max activation
        peak = max(V.w)

        # compute exponentials (carefully to not blow up)
        shifted = []
        for activation in V.w:
            shifted.append(exp(activation - peak))

        # normalize
        total = float(sum(shifted))
        probabilities = [value / total for value in shifted]

        self.es = probabilities
        out.w = probabilities

        self.out_act = out
        return self.out_act
Beispiel #25
0
def load_data():
    """Load word embeddings and the sentiment training phrases.

    Fills the global embeddings dict (lower-cased word -> list of floats)
    from ./data/word_projections-80.txt, then reads
    ./data/sentiment-kaggle/train.tsv (header line skipped). For every
    line the phrase (column 2) is turned into the average of the
    80-dimensional embeddings of its known words (all zeros when none is
    known) and paired with the label int(column 3) - 1.

    Embedding lines that are blank or contain non-numeric components,
    and training lines that are truncated, have a non-integer label or
    hit an embedding shorter than 80 entries, are skipped.

    Returns a list of (Vol, label) tuples.
    """
    global embeddings

    embeddings = {}
    with open('./data/word_projections-80.txt') as infile:
        raw = infile.read()
    # the first 9 characters are dropped -- presumably a header; verify
    # against the data file
    for elem in raw[9:].split('\n'):
        fields = elem.split()
        try:
            word = fields[0].lower()
            vector = [float(v) for v in fields[1:]]
        except (IndexError, ValueError):
            continue
        embeddings[word] = vector

    with open('./data/sentiment-kaggle/train.tsv') as infile:
        lines = infile.read().split('\n')[1:]

    data = []
    for line in lines:
        values = line.split('\t')
        try:
            phrase = values[2]
            sentag = int(values[3]) - 1

            vectors = [embeddings[word] for word in phrase.split()
                       if word in embeddings]
            if vectors:
                count = float(len(vectors))
                avgs = [sum(vec[n] for vec in vectors) / count
                        for n in range(80)]
            else:
                avgs = [0.0] * 80
        except (IndexError, ValueError):
            continue

        data.append((Vol(avgs), sentag))

    return data
Beispiel #26
0
def load_data():
    """Build one frequency vector per corpus file.

    The global words becomes the list of distinct words over all files,
    minus ENGLISH_STOP_WORDS and anything found in punctuation; the
    global N is its length. For every file a 1x1xN volume is filled with
    dist.freq(word) for each vocabulary word.

    Returns a list of (V, V.w) tuples -- the target of every sample is
    the very same list object as its input weights.
    """
    global N, words

    freqs = []
    for fileid in corpus.fileids():
        freqs.append(FreqDist(corpus.words(fileid)))

    def keep(word):
        return word not in ENGLISH_STOP_WORDS and word not in punctuation

    vocabulary = set(word
                     for dist in freqs
                     for word in dist.keys()
                     if keep(word))
    words = list(vocabulary)
    N = len(words)

    data = []
    for dist in freqs:
        volume = Vol(1, 1, N, 0.0)
        for position, word in enumerate(words):
            volume.w[position] = dist.freq(word)
        data.append((volume, volume.w))

    return data
Beispiel #27
0
    def backward(self, reward):
        """Record a reward and, when learning is on, train on past experience.

        The reward is stored as latest_reward, added to the average
        reward window and pushed onto the sliding reward_window. Nothing
        else happens unless self.learning is set. Otherwise:

        * once more than temporal_window + 1 forward passes have been
          made, the second-to-last window entries (state, action, reward)
          plus the last state are stored as a new Experience -- appended
          while the replay memory is below experience_size, afterwards
          written over a slot chosen with randi();
        * once the replay memory holds more than start_learn_threshold
          entries, tdtrainer.batch_size entries are drawn with randi()
          and trained towards reward0 + gamma * (best value of state1);
          the mean loss is printed and added to average_loss_window.
        """
        self.latest_reward = reward
        self.average_reward_window.add(reward)
        # slide the reward window forward by one
        self.reward_window.pop(0)
        self.reward_window.append(reward)

        if not self.learning:
            return

        self.age += 1

        # it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as
        # a new experience (given that an appropriate number of state
        # measurements already exist, of course)
        if self.forward_passes > self.temporal_window + 1:
            last = self.window_size - 1
            prev = last - 1
            transition = Experience(self.net_window[prev],
                                    self.action_window[prev],
                                    self.reward_window[prev],
                                    self.net_window[last])

            if len(self.experience) >= self.experience_size:
                slot = randi(0, self.experience_size)
                self.experience[slot] = transition
            else:
                self.experience.append(transition)

        # learn based on experience, once we have some samples to go on
        if len(self.experience) <= self.start_learn_threshold:
            return

        total_loss = 0.0
        for _ in xrange(self.tdtrainer.batch_size):
            sample = self.experience[randi(0, len(self.experience))]

            state = Vol(1, 1, self.net_inputs)
            state.w = sample.state0

            best_next = self.policy(sample.state1)
            target = sample.reward0 + self.gamma * best_next['value']

            stats = self.tdtrainer.train(
                state, {'dim': sample.action0, 'val': target})
            total_loss += stats['loss']

        mean_loss = total_loss / self.tdtrainer.batch_size
        print(mean_loss)
        self.average_loss_window.add(mean_loss)
Beispiel #28
0
def load_data():
    """Build word-frequency vectors for the first 10000 corpus posts.

    The global words becomes the list of distinct whitespace-separated
    words over all posts, minus ENGLISH_STOP_WORDS and anything found in
    punctuation; the global N is its length and the global labels is the
    list of distinct post.get('class') values. Every post becomes a
    1x1xN volume of dist.freq(word) values, labelled with the position
    of its class in labels.

    Returns a list of (Vol, label index) tuples.
    """
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    # Count words, not characters: handing the raw text string to
    # FreqDist iterates it character by character. A post without text
    # simply yields an empty distribution.
    freqs = [FreqDist((post.text or '').split()) for post in posts]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                     word not in punctuation))

    labels = list(set([post.get('class') for post in posts]))
    label_index = {label: i for i, label in enumerate(labels)}

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, label_index[post.get('class')]))

    return data
Beispiel #29
0
def load_data():
    """Fetch the LFW faces and split them into the global data lists.

    Every row of lfw_people.data is loaded into a 50x37x1 volume and
    passed through augment(V, 30); the results are split 75% / 25% with
    train_test_split() and stored as zipped (input, label) pairs in
    training_data and testing_data.
    """
    global training_data, testing_data

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    def to_volume(face):
        # NOTE(review): Vol is created as (50, 37) -- confirm this matches
        # the row/column layout of the flattened face vector
        volume = Vol(50, 37, 1, 0.0)
        volume.w = list(face)
        return augment(volume, 30)

    labels = list(lfw_people.target)
    inputs = [to_volume(face) for face in lfw_people.data]

    x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)

    training_data = zip(x_tr, y_tr)
    testing_data = zip(x_te, y_te)

    print('Dataset made...')
def load_data():
    """Load word embeddings and build next-word samples from train_tiny.

    Fills the global embeddings dict (lower-cased word -> list of floats)
    from ./data/word_projections-80.txt, reads every file under
    ./data/text/train_tiny as whitespace separated words, and stores the
    distinct words in the global tokens_l (N is their count). Every
    window of four consecutive words (w1, w2, w3, pred) whose words all
    have an embedding becomes one sample: the three context embeddings
    concatenated, labelled with the position of pred in tokens_l.

    Embedding lines that are blank or contain non-numeric components are
    skipped.

    Returns a list of (Vol, label) tuples.
    """
    global embeddings, N, tokens_l

    embeddings = {}
    with open('./data/word_projections-80.txt') as infile:
        raw = infile.read()
    # the first 9 characters are dropped -- presumably a header; verify
    # against the data file
    for elem in raw[9:].split('\n'):
        fields = elem.split()
        try:
            word = fields[0].lower()
            vector = [float(v) for v in fields[1:]]
        except (IndexError, ValueError):
            continue
        embeddings[word] = vector

    path = './data/text/train_tiny'
    words = []
    for fname in os.listdir(path):
        with open(os.path.join(path, fname)) as infile:
            words.extend(infile.read().split())
    tokens_l = list(set(words))
    N = len(tokens_l)
    print('Corpus size: {} words'.format(N))

    # constant-time label lookup instead of tokens_l.index() per sample
    index_of = {token: i for i, token in enumerate(tokens_l)}

    step = 4
    data = []
    # + 1 so that the final complete window is included as well
    for n in xrange(len(words) - step + 1):
        w1, w2, w3, pred = words[n:n + step]

        if not (w1 in embeddings and w2 in embeddings and w3 in embeddings
                and pred in embeddings):
            continue

        V = Vol(embeddings[w1] + embeddings[w2] + embeddings[w3])
        data.append((V, index_of[pred]))

    return data