Python Vol.Vol Beispiele, vol.Vol.Vol Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: vol_util.py Projekt: liyuming1978/PyTrafficCar

def augment(V, crop, grayscale=False):
    """Return a randomly positioned ``crop`` x ``crop`` window of ``V``.

    The output is always square. When ``crop`` already equals ``V.sx`` the
    input volume itself is returned unchanged (``grayscale`` is then not
    applied).

    Args:
        V: source volume; must expose ``sx``, ``sy``, ``depth`` and ``get``.
        crop: side length of the square window to extract.
        grayscale: when true, return a depth-1 volume that holds only
            channel 0 of the cropped window.

    Returns:
        ``V`` itself, or a new ``Vol`` holding the cropped data.
    """
    if crop == V.sx:
        return V

    # Window origin; the x offset is drawn before the y offset.
    off_x = randi(0, V.sx - crop)
    off_y = randi(0, V.sy - crop)

    window = Vol(crop, crop, V.depth)
    for x in xrange(crop):
        src_x = x + off_x
        for y in xrange(crop):
            src_y = y + off_y
            # Cells whose source lies outside V keep their initial value.
            if not (0 <= src_x < V.sx and 0 <= src_y < V.sy):
                continue
            for d in xrange(V.depth):
                window.set(x, y, d, V.get(src_x, src_y, d))

    if not grayscale:
        return window

    # Collapse to a single channel by copying channel 0 only.
    flat = Vol(crop, crop, 1, 0.0)
    for x in xrange(crop):
        for y in xrange(crop):
            flat.set(x, y, 0, window.get(x, y, 0))
    return flat

Beispiel #2

0

Datei anzeigen

Datei: udacity_terrain.py Projekt: liyuming1978/PyTrafficCar

def makeTerrainData(n_points=1000):
    """Build the toy "terrain" dataset and append it to the global lists.

    Adapted from
    https://github.com/udacity/ud120-projects/blob/master/choose_your_own/prep_terrain_data.py

    Every sample has two features, ``[grade, bumpy]``. The first 75% of the
    samples are appended to ``training_data`` and the remainder to
    ``testing_data`` as ``(Vol, int_label)`` pairs. The ``random`` module is
    reseeded with 42 on every call.

    Args:
        n_points: total number of samples to generate.
    """
    global training_data, testing_data

    random.seed(42)
    # Draw list by list so the random stream is consumed in a fixed order.
    grade = [random.random() for _ in range(n_points)]
    bumpy = [random.random() for _ in range(n_points)]
    error = [random.random() for _ in range(n_points)]

    labels = []
    for g, b, e in zip(grade, bumpy, error):
        if g > 0.8 or b > 0.8:
            # Very steep or very bumpy samples are always labelled 1.
            labels.append(1.0)
        else:
            labels.append(round(g * b + 0.3 + 0.1 * e))

    # Split into train/test sets.
    points = [[g, b] for g, b in zip(grade, bumpy)]
    cut = int(0.75 * n_points)

    for point, label in zip(points[:cut], labels[:cut]):
        training_data.append((Vol(point), int(label)))
    for point, label in zip(points[cut:], labels[cut:]):
        testing_data.append((Vol(point), int(label)))

Beispiel #3

0

Datei anzeigen

    def __init__(self, opt={}):
        """Configure a convolution layer from the option dict ``opt``.

        Required keys: ``filters`` (number of filters), ``sx`` (filter
        width), ``in_depth``, ``in_sx``, ``in_sy``.
        Optional keys (default): ``sy`` (``sx``), ``stride`` (1), ``pad`` (0),
        ``l1_decay_mul`` (0.0), ``l2_decay_mul`` (1.0), ``bias_pref`` (0.0).

        Raises:
            KeyError: if a required key is missing from ``opt``.
        """
        self.out_depth = opt['filters']
        self.sx = opt['sx']  # filter size: should be odd if possible
        self.in_depth = opt['in_depth']
        self.in_sx = opt['in_sx']
        self.in_sy = opt['in_sy']

        # optional
        self.sy = getopt(opt, 'sy', self.sx)
        # stride at which we apply filters to input volume
        self.stride = getopt(opt, 'stride', 1)
        self.pad = getopt(opt, 'pad', 0)  # padding to borders of input volume
        self.l1_decay_mul = getopt(opt, 'l1_decay_mul', 0.0)
        self.l2_decay_mul = getopt(opt, 'l2_decay_mul', 1.0)

        # Note we are doing floor, so if the strided convolution of the
        # filter doesn't fit into the input volume exactly, the output volume
        # will be trimmed and not contain the (incomplete) computed final
        # application.
        self.out_sx = int(
            floor((self.in_sx - self.sx + 2 * self.pad) / self.stride + 1))
        self.out_sy = int(
            floor((self.in_sy - self.sy + 2 * self.pad) / self.stride + 1))
        self.layer_type = 'conv'

        bias = getopt(opt, 'bias_pref', 0.0)
        # Filters are sx wide and sy high; the height used to be taken from
        # sx as well, which ignored a caller-supplied 'sy'.
        self.filters = [
            Vol(self.sx, self.sy, self.in_depth)
            for _ in xrange(self.out_depth)
        ]
        self.biases = Vol(1, 1, self.out_depth, bias)

Beispiel #4

0

Datei anzeigen

 def fromJSON(self, json):
     """Restore this layer's settings and weights from the mapping ``json``.

     ``json`` must provide the scalar settings copied below plus
     ``filters`` (a sequence of serialized volumes) and ``biases`` (one
     serialized volume).
     """
     # Scalar settings are copied over verbatim, in this order.
     for key in ('out_depth', 'out_sx', 'out_sy', 'layer_type',
                 'num_inputs', 'l1_decay_mul', 'l2_decay_mul'):
         setattr(self, key, json[key])

     # Whatever Vol.fromJSON returns is what gets stored.
     restored = []
     for serialized in json['filters']:
         restored.append(Vol(0, 0, 0, 0).fromJSON(serialized))
     self.filters = restored
     self.biases = Vol(0, 0, 0, 0).fromJSON(json['biases'])

Beispiel #5

0

Datei anzeigen

Datei: titanic.py Projekt: liyuming1978/PyTrafficCar

def _titanic_features(Pclass, Sex, Age, SibSp, Parch, Fare, Embarked):
    """Convert raw Titanic CSV fields into the 7-element feature list."""
    sex = 0.0 if Sex == 'male' else 1.0
    # Missing age / fare are encoded as zero.
    age = 0.0 if Age == '' else float(Age)
    fare = 0.0 if Fare == '' else float(Fare)
    # The last column may carry a stray carriage return.
    port = Embarked.replace('\r', '')
    # 'C' -> 0, 'Q' -> 1, anything else (including empty) -> 2.
    emb = {'C': 0.0, 'Q': 1.0}.get(port, 2.0)
    return [float(Pclass), sex, age, float(SibSp), float(Parch), fare, emb]


def _titanic_rows(path):
    """Return the comma-split rows of ``path`` without header or blank lines."""
    with open(path) as infile:
        lines = infile.read().split('\n')[1:]
    return [line.split(',') for line in lines if line.strip()]


def load_data():
    """Load the Kaggle Titanic CSVs into the global data lists.

    ``training_data`` receives ``(Vol, survived)`` pairs and
    ``testing_data`` receives bare ``Vol`` objects, each built from the
    features ``[Pclass, sex, age, SibSp, Parch, fare, embarked]``.

    Rows are split on every comma, so the quoted "Last, First" name column
    occupies two fields (``Name`` and ``NameRest``); a name containing a
    different number of commas makes the unpacking raise ``ValueError``.
    """
    global training_data, testing_data

    for row in _titanic_rows('./data/titanic-kaggle/train.csv'):
        (PassengerId, Survived, Pclass, Name, NameRest, Sex, Age, SibSp,
         Parch, Ticket, Fare, Cabin, Embarked) = row
        vec = _titanic_features(Pclass, Sex, Age, SibSp, Parch, Fare,
                                Embarked)
        training_data.append((Vol(vec), int(Survived)))

    # The test file has no 'Survived' column.
    for row in _titanic_rows('./data/titanic-kaggle/test.csv'):
        (PassengerId, Pclass, Name, NameRest, Sex, Age, SibSp, Parch,
         Ticket, Fare, Cabin, Embarked) = row
        vec = _titanic_features(Pclass, Sex, Age, SibSp, Parch, Fare,
                                Embarked)
        testing_data.append(Vol(vec))

    print('Data loaded...')

Beispiel #6

0

Datei anzeigen

Datei: similarity.py Projekt: xsongx/ConvNetPy

    def __init__(self, opt={}):
        """Configure the similarity layer from the option dict ``opt``.

        Required keys: ``num_neurons``, ``in_sx``, ``in_sy``, ``in_depth``.
        Optional keys (default): ``l1_decay_mul`` (0.0),
        ``l2_decay_mul`` (1.0), ``bias_pref`` (0.0).
        """
        self.out_depth = opt['num_neurons']
        self.l1_decay_mul = getopt(opt, 'l1_decay_mul', 0.0)
        self.l2_decay_mul = getopt(opt, 'l2_decay_mul', 1.0)

        # The input volume is treated as one flat vector.
        self.num_inputs = opt['in_sx'] * opt['in_sy'] * opt['in_depth']
        self.out_sx, self.out_sy = 1, 1
        self.layer_type = 'sim'

        bias = getopt(opt, 'bias_pref', 0.0)
        # One 1 x 1 x num_inputs filter per output neuron.
        filters = []
        for _ in xrange(self.out_depth):
            filters.append(Vol(1, 1, self.num_inputs))
        self.filters = filters
        self.biases = Vol(1, 1, self.out_depth, bias)

Beispiel #7

0

Datei anzeigen

 def fromJSON(self, json):
     """Restore this layer's settings and weights from the mapping ``json``.

     ``json`` must provide the scalar settings copied below plus
     ``filters`` (a sequence of serialized volumes) and ``biases`` (one
     serialized volume).
     """
     # Scalar settings are copied over verbatim, in this order.
     for key in ('sx', 'sy', 'stride', 'in_depth', 'out_depth', 'out_sx',
                 'out_sy', 'layer_type', 'l1_decay_mul', 'l2_decay_mul',
                 'pad'):
         setattr(self, key, json[key])

     # Whatever Vol.fromJSON returns is what gets stored.
     restored = []
     for serialized in json['filters']:
         restored.append(Vol(0, 0, 0, 0).fromJSON(serialized))
     self.filters = restored
     self.biases = Vol(0, 0, 0, 0).fromJSON(json['biases'])

Beispiel #8

0

Datei anzeigen

Datei: sentiment.py Projekt: liyuming1978/PyTrafficCar

def fill():
    """Predict a sentiment for every test phrase and write the result CSV.

    Reads ``./data/sentiment-kaggle/test.tsv`` (header skipped), represents
    each phrase by the element-wise mean of the embeddings of its known
    words (all zeros when no word is known), runs the global ``network`` on
    it and writes ``PhraseId,Sentiment`` rows to
    ``./data/sentiment-kaggle/out1.csv``. The written sentiment is the
    network prediction plus one. Lines that cannot be processed are skipped.
    """
    global embeddings

    dim = 80  # length of the averaged vector fed to the network

    with open('./data/sentiment-kaggle/test.tsv') as infile:
        raw = infile.read().split('\n')[1:]

    # Collect rows and join once instead of growing one big string.
    rows = ['PhraseId,Sentiment\n']
    for idx, line in enumerate(raw):
        try:
            values = line.split('\t')
            phrase_id = values[0]
            phrase = values[2]

            vectors = [
                embeddings[word] for word in phrase.split()
                if word in embeddings
            ]

            avgs = [0.0] * dim
            if vectors:
                count = float(len(vectors))
                for n in range(dim):
                    for vec in vectors:
                        avgs[n] += vec[n]
                    avgs[n] /= count

            network.forward(Vol(avgs))
            rows.append(
                '{},{}\n'.format(phrase_id, network.getPrediction() + 1))

            print(idx)
        except Exception:
            # Best effort: skip lines that fail, but no longer swallow
            # KeyboardInterrupt / SystemExit.
            continue

    with open('./data/sentiment-kaggle/out1.csv', 'w') as outfile:
        outfile.write(''.join(rows))

    print('Done')

Beispiel #9

0

Datei anzeigen

Datei: similarity.py Projekt: liyuming1978/PyTrafficCar

def volumize(dist):
    """Encode a frequency distribution as a ``1 x 1 x N`` volume.

    Entry ``i`` of the result is ``dist.freq(words[i])`` for the global
    vocabulary ``words``; remaining entries stay at 0.0.
    """
    global words

    encoded = Vol(1, 1, N, 0.0)
    for slot, term in enumerate(words):
        encoded.w[slot] = dist.freq(term)
    return encoded

Beispiel #10

0

Datei anzeigen

    def forward(self, V, is_training):
        """Maxout forward pass: keep the largest activation of each group.

        Consecutive groups of ``self.group_size`` input channels are reduced
        to their maximum. For every output value the index of the winning
        input channel is stored in ``self.switches``.

        Args:
            V: input volume.
            is_training: unused here.

        Returns:
            The output volume (also stored as ``self.out_act``).
        """
        self.in_act = V
        N = self.out_depth
        group_size = self.group_size
        V2 = Vol(self.out_sx, self.out_sy, self.out_depth, 0.0)

        if self.out_sx == 1 and self.out_sy == 1:
            for i in xrange(N):
                offset = i * group_size
                # Only this group's slice competes; previously the slice ran
                # to the end of the vector and let later groups win.
                candidates = V.w[offset:offset + group_size]
                best = max(candidates)
                V2.w[i] = best
                self.switches[i] = offset + candidates.index(best)
        else:
            switch_counter = 0  # running index over all output cells
            for x in xrange(V.sx):
                for y in xrange(V.sy):
                    for i in xrange(N):
                        ix = i * group_size
                        elem = V.get(x, y, ix)
                        elem_i = 0
                        for j in xrange(1, group_size):
                            elem2 = V.get(x, y, ix + j)
                            if elem2 > elem:
                                elem = elem2
                                elem_i = j
                        V2.set(x, y, i, elem)
                        # One switch per output cell; indexing by i overwrote
                        # the same N slots for every (x, y).
                        # NOTE(review): assumes self.switches has
                        # out_sx * out_sy * out_depth slots - confirm.
                        self.switches[switch_counter] = ix + elem_i
                        switch_counter += 1

        self.out_act = V2
        return self.out_act

Beispiel #11

0

Datei anzeigen

Datei: similarity.py Projekt: xsongx/ConvNetPy

    def forward(self, V, in_training):
        """Compute a cosine-style similarity between ``V`` and every filter.

        Output ``i`` is ``(dot(filters[i], V) + biases[i]) / (|V| * |filters[i]|)``
        and 0 when that denominator is zero.

        Args:
            V: input volume; its weights are read as a flat vector.
            in_training: unused here.

        Returns:
            The ``1 x 1 x out_depth`` output volume (also ``self.out_act``).
        """
        self.in_act = V
        A = Vol(1, 1, self.out_depth, 0.0)
        Vw = V.w

        def norm(vec):
            # Euclidean length of vec.
            return sqrt(sum(c * c for c in vec))

        normv = norm(Vw)

        # compute cos sim between V and filters
        for i in xrange(self.out_depth):
            sum_a = 0.0
            fiw = self.filters[i].w
            for d in xrange(self.num_inputs):
                sum_a += Vw[d] * fiw[d]
            sum_a += self.biases.w[i]  # dot(W, v) + b

            normf = norm(fiw)
            try:
                A.w[i] = sum_a / (normv * normf)
            except ZeroDivisionError:
                # Zero-length input or filter: define the similarity as 0.
                A.w[i] = 0

        self.out_act = A
        return self.out_act

Beispiel #12

0

Datei anzeigen

Datei: topics.py Projekt: liyuming1978/PyTrafficCar

def test():
    """Print up to five "topics" of the Gettysburg Address.

    The address is tokenized, lower-cased and filtered, turned into a
    frequency vector over the global vocabulary ``words`` and passed through
    the global ``network``. Vocabulary words are then visited from highest
    to lowest output activation and the first five that occur among the
    address tokens are printed. Fewer than five are printed if the
    vocabulary runs out.
    """
    global N, words, network

    print('In testing.')

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w

    # Rank vocabulary indices by activation instead of deleting entries from
    # ``pred``: deleting shifted every later index out of step with
    # ``words`` and also mutated the network's own output list. The sort is
    # stable, so ties still resolve to the lowest index.
    ranked = sorted(range(len(pred)), key=lambda idx: pred[idx], reverse=True)

    # Membership is tested against the original-case tokens, as before.
    token_set = set(gettysburg_tokens)
    topics = []
    for topic_idx in ranked:
        topic = words[topic_idx]
        if topic in token_set:
            topics.append(topic)
            if len(topics) == 5:
                break

    print('Topics of the Gettysburg Address:')
    print(topics)

Beispiel #13

0

Datei anzeigen

def load_data(train=True):
    """Sample character-trigram examples from ``./data/big.txt``.

    A random window of ``3 * N`` characters is cut from the text and split
    into consecutive triples ``(c1, c2, c3)``. Each triple yields a volume
    with the entries at ``ord(c1)`` and ``ord(c2)`` set to 1.0 and the label
    ``ord(c3)``. The global ``frequencies`` counter is incremented for both
    input characters and ``normalize()`` is called once at the end.

    Args:
        train: accepted but not used by this function.

    Returns:
        A list of ``(Vol, label)`` pairs.
    """
    global N, frequencies

    with open('./data/big.txt', 'r') as infile:
        text = infile.read()

    chunk_len = 3
    window_len = chunk_len * N
    begin = randint(0, len(text) - window_len)
    window = text[begin:begin + window_len]

    samples = []
    for pos in range(0, len(window), chunk_len):
        first, second, target = window[pos:pos + chunk_len]

        code1, code2 = ord(first), ord(second)
        frequencies[code1] += 1
        frequencies[code2] += 1

        # NOTE(review): depth is 255, so a character code of 255 or above
        # presumably falls outside the volume - confirm against Vol.
        encoded = Vol(1, 1, 255, 0.0)
        encoded.w[code1] = 1.0
        encoded.w[code2] = 1.0
        samples.append((encoded, ord(target)))

    normalize()

    return samples

Beispiel #14

0

Datei anzeigen

Datei: dark_knowledge.py Projekt: liyuming1978/PyTrafficCar

def load_data(training=True):
    """Load MNIST images and labels from ``./data``.

    Adapted from http://g.sweyla.com/blog/2012/mnist-numpy/

    Pixels are scaled to [0, 1], placed in a 28 x 28 x 1 volume and passed
    through ``augment(V, 24)`` to obtain a 24 x 24 crop per image.

    Args:
        training: read the ``train-*`` files when true, else the ``t10k-*``
            files.

    Returns:
        ``zip(inputs, labels)`` pairing each cropped volume with its label.
    """
    path = './data'

    if training:
        fname_img = os.path.join(path, 'train-images-idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels-idx1-ubyte')
    else:
        fname_img = os.path.join(path, 't10k-images-idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte')

    # Inputs: 16-byte big-endian header, then one unsigned byte per pixel.
    # ``with`` closes the file even if parsing raises.
    with open(fname_img, 'rb') as fimg:
        magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        imgs = pyarray("B", fimg.read())

    pixels_per_image = 784  # 28 * 28
    imgs = [
        imgs[n:n + pixels_per_image]
        for n in xrange(0, len(imgs), pixels_per_image)
    ]
    inputs = []
    # One scratch volume is reused; augment() returns a fresh cropped volume
    # because the crop size differs from the scratch volume's width.
    V = Vol(28, 28, 1, 0.0)
    for img in imgs:
        V.w = [(px / 255.0) for px in img]
        inputs.append(augment(V, 24))

    # Outputs: 8-byte big-endian header, then one byte per label.
    with open(fname_lbl, 'rb') as flbl:
        magic_nr, size = struct.unpack(">II", flbl.read(8))
        labels = pyarray("b", flbl.read())

    return zip(inputs, labels)

Beispiel #15

0

Datei anzeigen

def load_data():
    """Build next-word training samples from a slice of the corpus.

    All corpus words are joined, re-tokenized on ``\\w+`` and tokens
    100..999 are kept as the global ``words``. ``N`` becomes the number of
    distinct tokens. Every 4-gram ``(w1, w2, w3, pred)`` produces a
    ``1 x 1 x N`` volume with the entries of the three context words set to
    1 and the index of ``pred`` as label.

    Returns:
        A list of ``(Vol, label)`` pairs.
    """
    global N, words

    raw = [word for fileid in corpus.fileids()
           for word in corpus.words(fileid)]
    words = list(RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens_l = list(set(words))
    N = len(tokens_l)
    print('Corpus size: {} words'.format(N))

    # Token -> position lookup built once; replaces four O(N) list.index
    # scans per sample and yields the same indices (tokens are unique).
    position = {token: idx for idx, token in enumerate(tokens_l)}

    step = 4
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[position[w1]] = 1
        V.w[position[w2]] = 1
        V.w[position[w3]] = 1
        data.append((V, position[pred]))

    return data

Beispiel #16

0

Datei anzeigen

Datei: similarity.py Projekt: xsongx/ConvNetPy

    def forward(self, V, is_training):
        """Emit a running log-mean-exp of the input, scaled by ``1 / zeta``.

        Output ``i`` is ``log(mean(exp(V.w[0..i]))) / self.zeta``.

        Args:
            V: input volume.
            is_training: unused here.

        Returns:
            The output volume (also stored as ``self.out_act``).
        """
        self.in_act = V
        out = Vol(self.out_sx, self.out_sy, self.out_depth, 0.0)

        running = 0.0  # sum of exp(V.w[0..i])
        for i in xrange(len(out.w)):
            running += exp(V.w[i])
            seen = i + 1
            out.w[i] = log(running / seen) / self.zeta

        self.out_act = out
        return self.out_act

Beispiel #17

0

Datei anzeigen

Datei: deepqlearn.py Projekt: pombredanne/ConvNetPy

    def policy(self, s):
        """Return the best action for state ``s`` and its predicted value.

        The state is wrapped in a ``Vol`` and passed through
        ``self.value_net``; the index of the largest output is the action.

        Args:
            s: state features accepted by the ``Vol`` constructor.

        Returns:
            ``{'action': index_of_max_output, 'value': max_output}``; ties
            resolve to the lowest index.
        """
        V = Vol(s)
        action_values = self.value_net.forward(V)
        weights = action_values.w
        max_val = max(weights)
        # Was ``weights.index(maxval)``: an undefined name that raised
        # NameError on every call.
        max_k = weights.index(max_val)
        return {'action': max_k, 'value': max_val}

Beispiel #18

0

Datei anzeigen

Datei: iris.py Projekt: liyuming1978/PyTrafficCar

def load_data():
    """Load the iris dataset into the global ``iris_data``.

    ``iris_data`` becomes ``zip(inputs, labels)`` where every input is a
    ``Vol`` built from one feature row and every label is the matching
    entry of the dataset's ``target``.
    """
    global iris_data

    dataset = load_iris()

    volumes = []
    for row in dataset.data:
        volumes.append(Vol(list(row)))
    targets = list(dataset.target)

    iris_data = zip(volumes, targets)
    print('Data loaded...')

Beispiel #19

0

Datei anzeigen

Datei: darkencoder.py Projekt: liyuming1978/PyTrafficCar

def train():
    global training_data, n, t, training_data2

    print 'In training...'
    print 'k', 'time\t\t  ', 'loss\t  '
    print '----------------------------------------------------'
    training_data2 = []
    try:
        for x, y in training_data:
            stats = t.train(x, x.w)
            print stats['k'], stats['time'], stats['loss']
            training_data2.append((Vol(n.forward(x).w), y))
    except:  #hit control-c or other
        return

Beispiel #20

0

Datei anzeigen

    def forward(self, V, is_training):
        """Copy the input and add ``self.delta`` to part of it.

        The first ``self.skip`` entries are copied unchanged. Each later
        entry ``n`` becomes ``V.w[n] + self.delta[n - self.skip]``. Once
        ``self.num_neurons`` deltas have been applied the loop stops and the
        remaining outputs stay at 0.0.

        Args:
            V: input volume.
            is_training: unused here.

        Returns:
            The ``1 x 1 x num_inputs`` output volume (also ``self.out_act``).
        """
        self.in_act = V

        out = Vol(1, 1, self.num_inputs, 0.0)
        applied = 0  # number of delta terms added so far
        for idx in xrange(self.num_inputs):
            value = V.w[idx]
            if idx >= self.skip:
                value = value + self.delta[idx - self.skip]
                applied += 1
            out.w[idx] = value
            # Checked after every element, including the skipped prefix.
            if applied == self.num_neurons:
                break

        self.out_act = out
        return self.out_act

Beispiel #21

0

Datei anzeigen

    def forward(self, V, in_training):
        """Fully connected pass: ``out[i] = dot(filters[i], V) + biases[i]``.

        Args:
            V: input volume; its first ``self.num_inputs`` weights are used.
            in_training: unused here.

        Returns:
            The ``1 x 1 x out_depth`` output volume (also ``self.out_act``).
        """
        self.in_act = V
        inputs = V.w
        out = Vol(1, 1, self.out_depth, 0.0)

        for neuron in xrange(self.out_depth):
            weights = self.filters[neuron].w
            acc = 0.0
            for k in xrange(self.num_inputs):
                acc += inputs[k] * weights[k]
            # The bias is added last, after the full dot product.
            acc += self.biases.w[neuron]
            out.w[neuron] = acc

        self.out_act = out
        return self.out_act

Beispiel #22

0

Datei anzeigen

def generate(n, train=True):
    """Generate a two-cluster 2-D toy dataset with integer jitter.

    ``n // 2`` points of class 1 are drawn around (2, 7), followed by
    ``n // 2`` points of class 0 around (7.5, 2) with x jitter of +/-2 when
    ``train`` is true, or around (7, 2) with x jitter of +/-1 otherwise.
    All other jitter is +/-1, each drawn with ``randint``.

    Args:
        n: total number of samples requested (an odd ``n`` yields ``n - 1``).
        train: selects the class-0 distribution described above.

    Returns:
        A list of ``(Vol, label)`` pairs, class 1 first.
    """
    # Floor division keeps range() valid under true division as well.
    half = n // 2

    data = []
    # Class 1 is generated the same way for training and test data.
    for _ in range(half):
        data.append(((2 + randint(-1, 1), 7 + randint(-1, 1)), 1))

    for _ in range(half):
        if train:
            point = (7.5 + randint(-2, 2), 2 + randint(-1, 1))
        else:
            point = (7 + randint(-1, 1), 2 + randint(-1, 1))
        data.append((point, 0))

    return [(Vol(x), label) for x, label in data]

Beispiel #23

0

Datei anzeigen

Datei: next_word_embeddings.py Projekt: liyuming1978/PyTrafficCar

def test_text(text, ngenerate=10, delete=True):
    """Generate ``ngenerate`` follow-up words for ``text``.

    On every step the embeddings of the current context words are
    concatenated and fed to the global ``network``. With probability 0.5 the
    predicted token ``tokens_l[pred]`` is taken; otherwise a word is drawn
    with ``weightedSample`` from the embedding keys using the network output
    as weights. The context then drops its first word and gains the new one.

    Args:
        text: whitespace-separated seed words.
        ngenerate: number of words to generate.
        delete: accepted but not used by this function.

    Returns:
        The generated words, each preceded by a space, or an error message
        string as soon as a context word has no embedding.
    """
    generated = []
    for _ in xrange(ngenerate):
        context = text.split()
        features = []
        for word in context:
            if word not in embeddings:
                return 'word: {} not in corpus'.format(word)
            features.extend(embeddings[word])

        output = network.forward(Vol(features)).w
        pred = network.getPrediction()
        if random() < 0.5:
            new = tokens_l[pred]
        else:
            new = weightedSample(embeddings.keys(), output)

        generated.append(new)
        # Slide the context window by one word.
        text = ' '.join(context[1:] + [new])

    return ''.join(' ' + word for word in generated)

Beispiel #24

0

Datei anzeigen

Datei: loss.py Projekt: xsongx/ConvNetPy

    def forward(self, V, is_training):
        """Softmax forward pass over the weights of ``V``.

        Args:
            V: input volume.
            is_training: unused here.

        Returns:
            A ``1 x 1 x out_depth`` volume whose weights are the normalized
            exponentials (also stored as ``self.out_act``). The same list
            is kept in ``self.es``.
        """
        self.in_act = V
        out = Vol(1, 1, self.out_depth, 0.0)

        # Subtract the largest activation first so exp() does not blow up.
        peak = max(V.w)
        shifted = [exp(act - peak) for act in V.w]
        total = float(sum(shifted))

        probs = []
        for value in shifted:
            probs.append(value / total)

        # self.es and the output weights refer to the same list object.
        self.es = probs
        out.w = probs

        self.out_act = out
        return self.out_act

Beispiel #25

0

Datei anzeigen

Datei: sentiment.py Projekt: liyuming1978/PyTrafficCar

def load_data():
    """Load word embeddings and build the sentiment training set.

    The global ``embeddings`` is rebuilt from
    ``./data/word_projections-80.txt`` (the first 9 characters are skipped;
    each remaining line is ``word v1 v2 ...`` and the word is lower-cased).
    Each phrase of ``./data/sentiment-kaggle/train.tsv`` (header skipped) is
    represented by the element-wise mean of the embeddings of its known
    words, or all zeros if none is known, and labelled with the sentiment
    column minus one. Malformed lines are skipped.

    Returns:
        A list of ``(Vol, label)`` pairs.
    """
    global embeddings

    dim = 80  # length of the averaged vector

    embeddings = {}
    with open('./data/word_projections-80.txt') as infile:
        raw = infile.read()
    raw = raw[9:]
    for elem in raw.split('\n'):
        try:
            fields = elem.split()
            word = fields[0].lower()
            vector = [float(v) for v in fields[1:]]
        except (IndexError, ValueError):
            # Blank line or non-numeric component: ignore the entry.
            continue
        embeddings[word] = vector

    with open('./data/sentiment-kaggle/train.tsv') as infile:
        lines = infile.read().split('\n')[1:]

    data = []
    for line in lines:
        try:
            values = line.split('\t')
            phrase = values[2]
            sentag = int(values[3]) - 1

            # NOTE(review): phrase words are looked up as-is although the
            # embedding keys were lower-cased - confirm this is intended.
            vectors = [
                embeddings[word] for word in phrase.split()
                if word in embeddings
            ]

            avgs = [0.0] * dim
            if vectors:
                count = float(len(vectors))
                for n in range(dim):
                    for vec in vectors:
                        avgs[n] += vec[n]
                    avgs[n] /= count
        except (IndexError, ValueError):
            # Missing column, bad label or short vector: skip the line.
            continue

        data.append((Vol(avgs), sentag))

    return data

Beispiel #26

0

Datei anzeigen

Datei: topics.py Projekt: liyuming1978/PyTrafficCar

def load_data():
    """Build one self-reconstruction sample per corpus file.

    The global ``words`` becomes the list of distinct corpus words that are
    neither in ``ENGLISH_STOP_WORDS`` nor in ``punctuation`` and ``N`` its
    length. For every file a ``1 x 1 x N`` volume of relative word
    frequencies is created and paired with its own weight list as target.

    Returns:
        A list of ``(Vol, Vol.w)`` pairs.
    """
    global N, words

    freqs = []
    for fileid in corpus.fileids():
        freqs.append(FreqDist(corpus.words(fileid)))

    words = list(set(
        word
        for dist in freqs
        for word in dist.keys()
        if not (word in ENGLISH_STOP_WORDS or word in punctuation)))
    N = len(words)

    samples = []
    for dist in freqs:
        encoded = Vol(1, 1, N, 0.0)
        for slot, term in enumerate(words):
            encoded.w[slot] = dist.freq(term)
        # The target is the volume's own weight list (same object).
        samples.append((encoded, encoded.w))

    return samples

Beispiel #27

0

Datei anzeigen

Datei: deepqlearn.py Projekt: liyuming1978/PyTrafficCar

    def backward(self, reward):
        """Record ``reward`` and, when learning, train on replayed experience.

        The reward is always pushed into the reward windows. If
        ``self.learning`` is false nothing else happens. Otherwise the age
        is increased, the latest transition (s_t, a_t, r_t, s_{t+1}) is
        stored once enough forward passes exist, and - once more than
        ``self.start_learn_threshold`` experiences are stored - one batch of
        randomly picked experiences is trained and its mean loss is printed
        and added to ``self.average_loss_window``.

        Args:
            reward: reward obtained for the most recent action.
        """
        self.latest_reward = reward
        self.average_reward_window.add(reward)
        # Sliding window: drop the oldest reward, append the newest.
        self.reward_window.pop(0)
        self.reward_window.append(reward)

        if not self.learning:
            return

        self.age += 1

        # It is time t+1: store (s_t, a_t, r_t, s_{t+1}) as a new experience,
        # provided enough state measurements already exist.
        if self.forward_passes > self.temporal_window + 1:
            last = self.window_size - 1
            prev = last - 1
            transition = Experience(
                self.net_window[prev], self.action_window[prev],
                self.reward_window[prev], self.net_window[last])

            if len(self.experience) >= self.experience_size:
                # Memory is full: overwrite a randomly chosen slot.
                slot = randi(0, self.experience_size)
                self.experience[slot] = transition
            else:
                self.experience.append(transition)

        # Learn from stored experience once there are enough samples.
        if len(self.experience) <= self.start_learn_threshold:
            return

        total_loss = 0.0
        for _ in xrange(self.tdtrainer.batch_size):
            pick = randi(0, len(self.experience))
            sample = self.experience[pick]
            state = Vol(1, 1, self.net_inputs)
            state.w = sample.state0
            best_next = self.policy(sample.state1)
            # Target: immediate reward plus discounted best next value.
            target = sample.reward0 + self.gamma * best_next['value']
            result = self.tdtrainer.train(
                state, {'dim': sample.action0, 'val': target})
            total_loss += result['loss']

        total_loss /= self.tdtrainer.batch_size
        print(total_loss)
        self.average_loss_window.add(total_loss)

Beispiel #28

0

Datei anzeigen

def load_data():
    """Build a classification dataset from the first 10000 corpus posts.

    The global ``words`` becomes the list of distinct ``FreqDist`` keys that
    are neither in ``ENGLISH_STOP_WORDS`` nor in ``punctuation``, ``N`` its
    length, and ``labels`` the distinct values of each post's ``class``
    attribute. Every post becomes a ``1 x 1 x N`` frequency volume paired
    with the index of its class in ``labels``.

    Returns:
        A list of ``(Vol, label_index)`` pairs.
    """
    global N, words, labels

    posts = corpus.xml_posts()[:10000]

    # NOTE(review): FreqDist is fed ``post.text`` directly; if that is a
    # plain string the distribution is over characters - confirm.
    freqs = []
    for post in posts:
        freqs.append(FreqDist(post.text))

    words = list(set(
        word
        for dist in freqs
        for word in dist.keys()
        if not (word in ENGLISH_STOP_WORDS or word in punctuation)))
    N = len(words)

    labels = list(set([post.get('class') for post in posts]))

    samples = []
    for post, dist in zip(posts, freqs):
        encoded = Vol(1, 1, N, 0.0)
        for slot, term in enumerate(words):
            encoded.w[slot] = dist.freq(term)
        samples.append((encoded, labels.index(post.get('class'))))

    return samples

Beispiel #29

0

Datei anzeigen

Datei: faces.py Projekt: liyuming1978/PyTrafficCar

def load_data():
    """Fetch the LFW faces and fill the global train/test sets.

    Each flattened face is loaded into a ``Vol(50, 37, 1)`` and reduced with
    ``augment(V, 30)``. The samples are split 75% / 25% with
    ``train_test_split`` and stored as ``zip(inputs, labels)`` in
    ``training_data`` and ``testing_data``.
    """
    global training_data, testing_data

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    inputs = []
    for pixels in lfw_people.data:
        # NOTE(review): assumes each row holds 50 * 37 values and that the
        # width/height order matches Vol's layout - confirm.
        face = Vol(50, 37, 1, 0.0)
        face.w = list(pixels)
        inputs.append(augment(face, 30))
    labels = list(lfw_people.target)

    x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)

    training_data = zip(x_tr, y_tr)
    testing_data = zip(x_te, y_te)

    print('Dataset made...')

Beispiel #30

0

Datei anzeigen

Datei: next_word_embeddings.py Projekt: liyuming1978/PyTrafficCar

def load_data():
    """Load embeddings and build next-word samples from the tiny corpus.

    The global ``embeddings`` is rebuilt from
    ``./data/word_projections-80.txt`` (the first 9 characters are skipped;
    each remaining line is ``word v1 v2 ...`` and the word is lower-cased).
    All files in ``./data/text/train_tiny`` are split on whitespace;
    ``tokens_l`` becomes the list of distinct tokens and ``N`` its length.
    Every window of four consecutive words whose members all have an
    embedding yields the concatenated embeddings of the first three words
    as input and the ``tokens_l`` index of the fourth as label.

    Returns:
        A list of ``(Vol, label)`` pairs.
    """
    global embeddings, N, tokens_l

    embeddings = {}
    with open('./data/word_projections-80.txt') as infile:
        raw = infile.read()
    raw = raw[9:]
    for elem in raw.split('\n'):
        try:
            fields = elem.split()
            word = fields[0].lower()
            vector = [float(v) for v in fields[1:]]
        except (IndexError, ValueError):
            # Blank line or non-numeric component: ignore the entry.
            continue
        embeddings[word] = vector

    path = './data/text/train_tiny'
    words = []
    for fname in os.listdir(path):
        # ``with`` closes every corpus file as soon as it has been read.
        with open(os.path.join(path, fname)) as infile:
            words.extend(infile.read().split())

    tokens_l = list(set(words))
    N = len(tokens_l)
    print('Corpus size: {} words'.format(N))

    # Token -> label lookup built once instead of list.index per sample.
    position = {token: idx for idx, token in enumerate(tokens_l)}

    step = 4
    data = []
    # "+ 1" so the final full window is included; it used to be dropped.
    for n in xrange(0, len(words) - step + 1):
        w1, w2, w3, pred = words[n:n + step]

        # Every word comes from ``words``, so only embedding coverage needs
        # checking.
        if not (w1 in embeddings and w2 in embeddings and w3 in embeddings
                and pred in embeddings):
            continue

        V = Vol(embeddings[w1] + embeddings[w2] + embeddings[w3])
        data.append((V, position[pred]))

    return data