def load_data(self, fpath):
    words = [line[:-1] for line in open(fpath)]
    max_len = max(len(w) for w in words) + 2  # +2 for the start and end chars
    nwords = len(words)

    # Build the character vocabulary, reserving the first three slots
    # for the padding, start, and end markers.
    chars = sorted(set(''.join(words)))
    chars = [self.CHAR_NONE, self.CHAR_START, self.CHAR_END] + chars
    charmap = {c: i for i, c in enumerate(chars)}
    nchars = len(chars)

    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)

    # Disk-backed arrays (paths hardcoded for the 866k-word dataset) so the
    # full one-hot tensor never has to fit in memory.
    data = DiskArray('866k_training_data.array',
                     shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray('866k_labels_data.array',
                       shape=(nwords, nchars), dtype=np.float32)

    for i in range(nwords):
        w = words[i][:-1]
        last_char = words[i][-1]  # the last character is the label
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    data.flush()
    labels.flush()
    return data, labels
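# A minimal usage sketch for load_data (the instance name and input path are
# assumptions): each line of the input file is one word whose last character
# becomes the prediction label. The returned DiskArrays are disk-backed and
# can be sliced lazily:
#
#   data, labels = trainer.load_data('words.txt')
#   data.shape    # (nwords, max_len, nchars)
#   labels.shape  # (nwords, nchars)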
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 128)])

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        # Train from a generator so the one-hot batches are built on the
        # fly instead of materializing the whole dataset.
        generator = self.generator(max_len, nchars, nwords, words, charmap, 2048)
        model.fit_generator(generator,
                            steps_per_epoch=math.ceil(nwords / 2048),
                            epochs=self.args.epochs)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    if self.args.layer == 'lstm_layer':
        self.log.info('Accessing the layer weights')
        # Rebuild a model that ends at the LSTM layer so predict()
        # returns the 128-dim hidden state instead of the softmax output.
        new_model = Sequential()
        new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
        new_model.set_weights(model.layers[0].get_weights())
        model_p = new_model
    else:
        model_p = model

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1,
                                                [word], charmap)
        p_out = model_p.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
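# self.generator above is not shown in this snippet. A minimal sketch of what
# fit_generator expects -- an endless iterator yielding (batch_data,
# batch_labels) -- assuming the same encoding as load_data and the
# module-level imports (np, to_categorical) already used there:
def generator(self, max_len, nchars, nwords, words, charmap, batch_size):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    while True:  # Keras expects the generator to loop forever
        for start in range(0, nwords, batch_size):
            batch = words[start:start + batch_size]
            data = np.zeros((len(batch), max_len, nchars), dtype=np.float32)
            labels = np.zeros((len(batch), nchars), dtype=np.float32)
            for i, word in enumerate(batch):
                # Input is the word minus its last char, bracketed by the
                # start/end markers; the label is the last char.
                w = '%s%s%s' % (self.CHAR_START, word[:-1], self.CHAR_END)
                enc = [to_categorical(charmap[c], num_classes=nchars) for c in w]
                enc += [char_none] * (max_len - len(enc))
                data[i] = enc
                labels[i] = to_categorical(charmap[word[-1]], num_classes=nchars)
            yield data, labels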
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    # The vector dimension here must match the loaded model's output
    # layer (108 in this setup).
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 108)])
    model = load_model(self.args.model_name)

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1,
                                                [word], charmap)
        p_out = model.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 128)])

    # Reuse the cached one-hot training data if it already exists on disk.
    if not os.path.exists(self.args.training_data):
        data, labels = self.load_data(max_len, nchars, nwords, charmap)
    else:
        data = DiskArray(self.args.training_data, dtype=np.float32)
        labels = DiskArray(self.args.labels_data, dtype=np.float32)

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        self.log.info('Started training the model')
        history = model.fit(data[:], labels[:],
                            epochs=self.args.epochs, batch_size=128)
        plt.plot(history.history['loss'])
        plt.savefig(self.args.image_name)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    self.log.info('Accessing the layer weights')
    # Truncate the trained model at the LSTM layer so predict() yields
    # the 128-dim hidden state as the word vector.
    new_model = Sequential()
    new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
    new_model.set_weights(model.layers[0].get_weights())

    self.log.info('started predicting')
    for word in open(fpath):
        word = word.strip()
        test_data, test_labels = self.get_test_data(max_len, nchars, 1,
                                                    [word], charmap)
        p_out = new_model.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
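# get_test_data is referenced above but not shown. A plausible sketch that
# mirrors load_data's encoding while returning plain in-memory arrays for a
# handful of words (the name and signature follow the call site; the body is
# an assumption):
def get_test_data(self, max_len, nchars, nwords, words, charmap):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    data = np.zeros((nwords, max_len, nchars), dtype=np.float32)
    labels = np.zeros((nwords, nchars), dtype=np.float32)
    for i, word in enumerate(words):
        w = '%s%s%s' % (self.CHAR_START, word[:-1], self.CHAR_END)
        enc = [to_categorical(charmap[c], num_classes=nchars) for c in w]
        enc += [char_none] * (max_len - len(enc))
        data[i] = enc
        labels[i] = to_categorical(charmap[word[-1]], num_classes=nchars)
    return data, labels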
def load_data(self, max_len, nchars, nwords, charmap):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    data = DiskArray(self.args.training_data,
                     shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray(self.args.labels_data,
                       shape=(nwords, nchars), dtype=np.float32)

    f = open(self.args.text)
    for i, line in enumerate(f):
        line = line.strip()
        w = line[:-1]
        last_char = line[-1]
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    self.log.info('generating vectors is done')
    data.flush()
    labels.flush()
    return data, labels
import sys

import numpy as np
from keras.models import load_model
from keras import backend as K
from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


# The model was saved with a custom loss, so it has to be supplied again
# when loading.
model = load_model(
    sys.argv[1],
    custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], shape=(0,), dtype=[('vec', np.float32, 300)])
wv = WordVecSpaceMem(sys.argv[3])


def get_trans_vectors():
    # Run every vector in the space through the model and store the
    # transformed 300-dim output.
    for i in range(wv.nvecs):
        word = wv.get_word_at_index(i)
        vec = wv.get_word_vector(word, raise_exc=True)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec[0],))


get_trans_vectors()
out_f.flush()
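# Expected invocation, per the sys.argv indices above (the script name is
# hypothetical):
#
#   python transform_vectors.py model.h5 out_vectors.array /path/to/wvspace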
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format.
    """

    def __init__(self, in_dir, outdir, nvecs_per_shard=0,
                 shard_name="shard", full_name="full"):
        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0,),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0,),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        # Normalize each vector and pair it with the word and occurrence
        # count read from the vocab file.
        for word, vec in vfile.iter_vectors():
            vec = np.fromstring(vec, dtype="float32")
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")
            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = int(vidx / self.nvecs_per_shard)
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(self, out_fpath, nvecs, dim, N, t_occur, in_fpath,
                         m_info={}, full=False, num_vecs=None, nvps=None):
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )
        with open(self.J(out_fpath, "manifest.json"), "w") as fp:
            fp.write(json.dumps(mfc))

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            with open(m_file, "r") as fp:
                c = json.loads(fp.read())
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"), "r",
                          encoding="utf-8", errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        num_shards = math.ceil(nvecs / N) if N else 1

        t_occur = 0
        count = -1
        for index, (vec, word, mag, occur) in enumerate(vecs):
            # Every N vectors, close out the current shard and write its
            # manifest before opening the next one.
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                # Shards carry only vectors; the word/index maps live in
                # the "full" directory.
                self.wtoi[word] = index
                self.itow[index] = word
                self.mags.append(mag)
                self.occurs.append(occur)
                w.write(vec=vec, mag=mag, occur=occur)
            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)
            s_occur += occur

        # Close out the final (possibly partial) shard.
        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N)),
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()
            self.mags.flush()
            self.mags.close()
            self.occurs.flush()
            self.occurs.close()
            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
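# A minimal usage sketch, assuming in_dir holds Google word2vec output
# (vectors.bin plus a "word count" vocab.txt); paths and the shard size are
# hypothetical:
#
#   converter = GW2VectoWordVecSpaceFile('/data/w2v_out', '/data/wvspace',
#                                        nvecs_per_shard=1000000)
#   converter.start()
#
# With nvecs_per_shard=0 (the default) no sharding happens and a single
# WordVecSpaceFile is written to outdir.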
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(),),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        return input("Enter the search token: ")

    def get_shape(self):
        # Infer how many records the existing train file already holds.
        if not os.path.exists(self.args.train_f):
            return 0
        dtype = self.get_dtype()
        return os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize

    def get_nearest_token(self, token):
        url = ('http://dev0.servers.deepcompute.com:8888/api/v1/'
               'get_k_nearest_cosine?word={}&k=10').format(token)
        response = requests.get(url).json()
        return response.get('result')

    def get_user_label(self, token, nearest_token):
        print('the nearest token is %s' % token)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))
        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)
        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S', self.VEC_DIM),
            ('word2', 'S', self.VEC_DIM),
            ('label', np.int),
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = self.get_user_label(token, nearest_token)
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
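# Typical invocation, assuming BaseScript parses the arguments from
# define_args and then calls run() (paths are hypothetical):
#
#   python correctional_training.py train.array /path/to/wvspace
#
# Any label other than 0 or 1 skips the pair, so a reviewer can pass on
# uncertain neighbours; Ctrl-C ends the session and the finally block still
# flushes the DiskArray.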