Example #1
    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
Example #2
    def _prepare_word_index_wvspace(self, dim, initialize=False, mode='r+'):
        def J(x): return os.path.join(self.dirpath, x)

        v_path = J('vectors')
        m_path = J('magnitudes')
        o_path = J('occurrences')

        # FIXME: Support taking memmap array from diskarray
        m_array = DiskArray(m_path, dtype='float32', mode=mode,
                            growby=self._growby, log=self.log)
        o_array = DiskArray(o_path, dtype='uint64', mode=mode,
                            growby=self._growby, log=self.log)

        if not initialize:
            v_array = DiskArray(v_path, dtype='float32', mode=mode,
                                growby=self._growby, log=self.log)
            vec_l = int(len(v_array)/dim)
            v_array = v_array[:].reshape(vec_l, dim)
            m_array = m_array[:]
            o_array = o_array[:]
        else:
            v_array = DiskArray(v_path, shape=(0, dim), dtype='float32', mode=mode,
                                growby=self._growby, log=self.log)

        wtoi = itow = None
        if not self.sharding:
            wtoi = DiskDict(J('wordtoindex'))
            itow = DiskDict(J('indextoword'))

        return v_array, o_array, m_array, wtoi, itow
Example #3
    def __init__(self,
                 in_dir,
                 outdir,
                 nvecs_per_shard=0,
                 shard_name="shard",
                 full_name="full"):

        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )
Example #4
    def _prepare_word_index_wvspace(self, dim, initialize=False):
        v_dtype, o_dtype = self._make_dtype(dim)

        def J(x):
            return os.path.join(self.dirpath, x)

        def S(f):
            return os.stat(f).st_size

        v_path = J('vectors')
        o_path = J('occurrences')

        nvecs = noccurs = 0
        if not initialize:
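            # number of records on disk = file size in bytes / bytes per record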
            nvecs = S(v_path) / np.dtype(v_dtype).itemsize
            noccurs = S(o_path) / np.dtype(o_dtype).itemsize

        v_array = DiskArray(v_path,
                            shape=(int(nvecs), ),
                            dtype=v_dtype,
                            growby=self._growby,
                            log=self.log)
        o_array = DiskArray(o_path,
                            shape=(int(noccurs), ),
                            dtype=o_dtype,
                            growby=self._growby,
                            log=self.log)

        w_index = DiskDict(J('wordtoindex'))
        i_word = DiskDict(J('indextoword'))

        return v_array, o_array, w_index, i_word
Example #5
    def load_data(self, fpath):
        words = [line[:-1] for line in open(fpath)]
        max_len = max(len(w)
                      for w in words) + 2  # adding 2 for start and end chars
        nwords = len(words)

        chars = list(sorted(list(set(list(''.join(words))))))
        chars = [self.CHAR_NONE, self.CHAR_START, self.CHAR_END] + chars
        charmap = {c: i for i, c in enumerate(chars)}
        nchars = len(chars)
        char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)

        data = DiskArray('866k_training_data.array',
                         shape=(nwords, max_len, nchars),
                         dtype=np.float32)
        labels = DiskArray('866k_labels_data.array',
                           shape=(nwords, nchars),
                           dtype=np.float32)

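        # one-hot encode each word: add start/end markers, pad with CHAR_NONE to max_len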
        for i in range(nwords):
            w = words[i][:-1]
            last_char = words[i][-1]

            w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
            w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
            w = w + ([char_none] * (max_len - len(w)))

            data[i] = w
            labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

        data.flush()
        labels.flush()
        return data, labels
Example #6
    def run(self):
        self.train_d = DiskArray(self.args.trainf, dtype=self._get_dtype())
        self.test_d = DiskArray(self.args.testf, dtype=self._get_dtype())

        model = self.init_model()
        self.compile_model(model)
        self.train_model(model)
        self.test_model(model)
        self.save_model(model)
Example #7
    def __init__(self):
        super(GridScript, self).__init__()
        self.train_d = DiskArray(self.args.train_d, dtype=self._get_dtype())
        self.test_d = DiskArray(self.args.test_d, dtype=self._get_dtype())
        self.csv = open(self.args.outf, 'w')
        self.outf = csv.writer(self.csv)
        self.outf.writerow([
            'num of hidden layers', 'loss', 'activation', 'optimizer',
            'epochs', 'cohens_d', 'accuracy'
        ])
        self.hyper_parameters = []
Example #8
    def __init__(self):
        super(Generate_Histogram, self).__init__()
        self.d = DiskArray(self.args.array, dtype=self._get_dtype())
        self.psame = []
        self.pnsame = []
        csv_f = open(self.args.csv_f, 'w')
        self.csv_f = csv.writer(csv_f)
Example #9
    def run(self):
        self.train_d = DiskArray(self.args.trainf, dtype=self._get_dtype())
        self.train_d = np.array([i for i in range(10000)], dtype=np.int)

        model = self.init_model()
        self.compile_model(model)
        self.train_model(model)
        self.test_model(model)
        model.save(self.args.model)
Example #10
    def run(self):

        fpath = self.args.text

        max_len, nchars, nwords, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f,
                               shape=(0, ),
                               dtype=[('vec', np.float32, 128)])
        if not os.path.exists(self.args.training_data):
            data, labels = self.load_data(max_len, nchars, nwords, charmap)
        else:
            data = DiskArray(self.args.training_data, dtype=np.float32)
            labels = DiskArray(self.args.labels_data, dtype=np.float32)

        if not os.path.exists(self.args.model_name):
            model = self.create_model(128, max_len, nchars)
            self.log.info('Started training the model')
            history = model.fit(data[:],
                                labels[:],
                                epochs=self.args.epochs,
                                batch_size=128)
            plt.plot(history.history['loss'])
            plt.savefig(self.args.image_name)
        else:
            model = load_model(self.args.model_name)

        model.save(self.args.model_name)

        self.log.info('Accessing the layer weights')
        new_model = Sequential()
        new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
        weights = model.layers[0].get_weights()
        new_model.set_weights(weights)

        self.log.info('started predicting')
        for word in open(fpath):
            word = word.strip()
            test_data, test_labels = self.get_test_data(
                max_len, nchars, 1, [word], charmap)
            p_out = new_model.predict(test_data)
            disk_array.append((p_out[0], ))

        disk_array.flush()
Example #11
def test():
    d = DiskArray(inp_f, dtype=[('vec', np.float32, 128)])
    mapping = DiskDict(dict_file)

    print('The given word is', mapping[str(index)])
    vectors = d['vec']
    vec = vectors[index].reshape(1, len(vectors[0]))
    vectors_t = vectors.T

    dists = np.dot(vec, vectors_t)
    # argsort is ascending; reverse so the largest dot products (most similar) come first
    k_near = np.argsort(dists[0])[::-1]

    words = []
    for i in k_near:
        words.append(mapping[str(i)])

    return words
Example #12
    def run(self):

        fpath = self.args.text
        fpath_pickled = self.args.text + ".pickled"

        max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f,
                               shape=(0, ),
                               dtype=[('vec', np.float32, 128)])
        '''
        if not os.path.exists(fpath_pickled):
            data, labels = self.load_data(max_len, nchars, nwords, words, charmap)
            pickle.dump((data, labels), open(fpath_pickled, 'wb'))
        else:
            data, labels = pickle.load(open(fpath_pickled, 'rb'))
        '''

        if not os.path.exists(self.args.model_name):
            model = self.create_model(128, max_len, nchars)

            #history = model.fit(data, labels, epochs=self.args.epochs, batch_size=128)
            generator = self.generator(max_len, nchars, nwords, words, charmap,
                                       2048)
            model.fit_generator(generator,
                                steps_per_epoch=nwords // 2048,
                                epochs=self.args.epochs)
        else:
            model = load_model(self.args.model_name)

        model.save(self.args.model_name)

        if self.args.layer == 'lstm_layer':
            self.log.info('Accessing the layer weights')
            new_model = Sequential()
            new_model.add(LSTM(128, input_shape=(max_len, nchars),
                               unroll=True))
            weights = model.layers[0].get_weights()
            new_model.set_weights(weights)
            model_p = new_model
        else:
            model_p = model

        self.log.info('started predicting')
        for word in words:
            test_data, test_labels = self.load_data(max_len, nchars, 1, [word],
                                                    charmap)
            p_out = model_p.predict(test_data)
            disk_array.append((p_out[0], ))

        disk_array.flush()
Example #13
    def run(self):
        model = load_model(
            self.args.model_name,
            custom_objects=dict(
                _euclidean_distance=_euclidean_distance,
                _eucl_dist_output_shape=_eucl_dist_output_shape))

        test_d = DiskArray(self.args.test_f,
                           dtype=[('vec1', np.float32, 300),
                                  ('vec2', np.float32, 300), ('label', np.int)])
        csv_f = open(self.args.csv_file, 'w')
        csv_file = csv.writer(csv_f)
        csv_file.writerow(['label', 'prediction'])
        for i in range(len(test_d['vec1'])):
            vec1 = test_d['vec1'][i]
            vec2 = test_d['vec2'][i]
            pred_val = self.get_prediction(vec1, vec2, model)
            label = test_d['label'][i]
            csv_file.writerow([label, pred_val])
Example #14
def k_nearest(wvspace, disk_f, word):
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])
    index = wv.get_word_index(word)

    result = wv.get_nearest(index, k=10)
    print(wv.get_word_at_indices(result))

    vec = da['vec'][index].reshape(1, 300)
    vecs = da['vec']

    #dist = distance.cdist(vec, vecs, 'cosine')
    dist = distance.cdist(vec, vecs, 'euclidean')
    #dist = np.dot(vec, vecs.T)

    dist = pd.Series(dist[0])
    res = dist.nsmallest(10).keys()
    print('\n')
    print(wv.get_word_at_indices(list(res)))
Example #15
def k_nearest(wvspace, disk_f, words, metric, image_name):
    f = open(words)
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    vecs = da['vec']

    psame = []
    pnsame = []
    for line in f:
        words = json.loads(line.strip())
        index1 = wv.get_word_index(words[0])
        index2 = wv.get_word_index(words[1])

        if 'clinicaltrials' in words[0] or 'clinicaltrials' in words[1]:
            continue

        vec1 = vecs[index1].reshape(1, 300)
        vec2 = vecs[index2].reshape(1, 300)

        if metric == 'cosine':
            vspace_dist = wv.get_distance(words[0], words[1])
            tvspace_dist = distance.cosine(vec1, vec2)
        else:
            vspace_dist = wv.get_distance(words[0], words[1], metric='euclidean')
            tvspace_dist = distance.euclidean(vec1, vec2)

        if words[2] == 0:
            psame.append(tvspace_dist)
        else:
            pnsame.append(tvspace_dist)

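    # Cohen's d: absolute difference of the two means divided by the
    # average of the two standard deviations (a simplified pooled SD)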
    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))

    d = nm / dm
    print('the cohens D distance is', d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)
Example #16
    def load_data(self, max_len, nchars, nwords, charmap):
        char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
        data = DiskArray(self.args.training_data,
                         shape=(nwords, max_len, nchars),
                         dtype=np.float32)
        labels = DiskArray(self.args.labels_data,
                           shape=(nwords, nchars),
                           dtype=np.float32)

        f = open(self.args.text)
        for i, line in enumerate(f):
            line = line.strip()
            w = line[:-1]
            last_char = line[-1]
            w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
            w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
            w = w + ([char_none] * (max_len - len(w)))
            data[i] = w
            labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

        self.log.info('generating vectors is done')
        data.flush()
        labels.flush()
        return data, labels
Example #17
    def run(self):

        fpath = self.args.text

        max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f, shape=(0,), dtype=[('vec', np.float32, 108)])

        model = load_model(self.args.model_name)

        self.log.info('started predicting')
        for word in words:
            test_data, test_labels = self.load_data(max_len, nchars, 1, [word], charmap)
            p_out = model.predict(test_data)
            disk_array.append((p_out[0],))

        disk_array.flush()
Example #18
    def __init__(self):
        super(DistanceFunction, self).__init__()
        self.test_d = DiskArray(self.args.testf, dtype=self._get_dtype())
Example #19
import sys

import numpy as np
from keras import backend as K
from keras.models import load_model

from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


model = load_model(
    sys.argv[1], custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], dtype=[('vec', np.float32, 300)])

wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    nvecs = len(wv.vectors)
    for num in range(nvecs):
        vec = wv.get_word_vector(num)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec, ))
Example #20
def disk_array_test():

    timeout = 500
    disks = 10

    # Set up a disk model, disk array, and some files for testing purposes
    test_disk = DiskModel(
        2.5,  # spin up time
        30,  # spin up energy
        10,  # spin down energy
        3,  # idle power
        7,  # read power
        8,  # write power
        300 * units.MiB,  # speed
        0.003,  # seek time
        500 * units.GiB)  # capacity
    disk_array = DiskArray(0, test_disk, disks, test_disk, timeout)

    file1 = FileInfo("file1", "/", FileType.TEXT, 1 * units.GiB)
    file1_compressed_size = 300 * units.MiB
    file2 = FileInfo("file2", "/", FileType.BINARY, 40 * units.MiB)
    file2_compressed_size = 35 * units.MiB

    # Tests
    passed = True

    # Write before the disks turn off
    current_time = timeout / 2
    disk_array.update_time(current_time)
    time = disk_array.write(file1, file1_compressed_size)
    energy = disk_array.get_energy_usage()

    expected_time = test_disk.seek_time + (file1_compressed_size /
                                           test_disk.speed)
    expected_energy = (timeout / 2) * test_disk.idle_power * disks + \
                      expected_time * test_disk.write_power
    if not floateq(time, expected_time):
        passed = False
        print("Failed write test 1 for time")
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed write test 1 for energy")

    # Update the time to when most of the disks turn off
    current_time += timeout / 2
    disk_array.update_time(current_time)
    energy = disk_array.get_energy_usage()
    expected_energy += disks * (timeout / 2) * test_disk.idle_power + \
                       (disks - 1) * test_disk.spin_down_energy
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed test 2")

    # Update the time so that the last disk turns off
    current_time += timeout / 2
    disk_array.update_time(current_time)
    energy = disk_array.get_energy_usage()
    expected_energy += (timeout / 2) * test_disk.idle_power + \
                       test_disk.spin_down_energy
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed test 3")

    # Turn a disk back on to read
    current_time += timeout * 10
    disk_array.update_time(current_time)
    time = disk_array.read(file1)
    energy = disk_array.get_energy_usage()

    read_time = file1_compressed_size / test_disk.speed
    expected_time = test_disk.spin_up_time + read_time
    expected_energy += test_disk.spin_up_energy + \
                       read_time * test_disk.read_power
    if not floateq(time, expected_time):
        passed = False
        print("Failed read test 4 for time")
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed read test 4 for energy")

    # Try to read a file that's not there
    exception_occurred = False
    try:
        disk_array.read(file2)
    except Exception:
        exception_occurred = True
    if not exception_occurred:
        passed = False
        print("Failed read test for non-existent file")

    # Try to allocate some cache disks
    exception_occurred = False
    try:
        disk_array = DiskArray(1, test_disk, disks, test_disk, timeout)
    except Exception:
        exception_occurred = True
    if not exception_occurred:
        passed = False
        print("Failed cache disk allocation test")

    # TODO: add a test where multiple disks are involved (one disk is still on
    # from a previous operation while others are turned on for new operations)

    if passed:
        print("All disk array tests passed")
    else:
        print("Disk array tests FAILED")
Example #21
import sys

import numpy as np
from keras.models import load_model
from keras import backend as K

from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


model = load_model(
    sys.argv[1], custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], shape=(0, ), dtype=[('vec', np.float32, 300)])

wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    for i in range(wv.nvecs):
        word = wv.get_word_at_index(i)
        vec = wv.get_word_vector(word, raise_exc=True)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec[0], ))


get_tras_vectors()
out_f.flush()
Example #22
import numpy as np
from diskarray import DiskArray

d = DiskArray('/tmp/pras.array', shape=(2, 2), dtype=np.float32)

a = np.ndarray((2, 2), dtype=np.float32)
d.extend(a)

print(d[:])
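
The snippet above only exercises extend on a fixed-shape array. Below is a minimal sketch of the append-and-reopen pattern the other examples rely on; the /tmp path, the growby value, and the 3-wide record are illustrative assumptions, not part of the original example.

import numpy as np
from diskarray import DiskArray

# Create a growable record array on disk (shape=(0, ) as in the examples above)
da = DiskArray('/tmp/vecs.array', shape=(0, ),
               dtype=[('vec', np.float32, 3)], growby=100)
da.append((np.zeros(3, dtype=np.float32), ))
da.flush()

# Reopen without a shape: the length is recovered from the file on disk,
# the way Example #4 derives it from os.stat and the dtype itemsize
da2 = DiskArray('/tmp/vecs.array', dtype=[('vec', np.float32, 3)])
print(da2['vec'])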
Example #23
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format.
    """
    def __init__(self,
                 in_dir,
                 outdir,
                 nvecs_per_shard=0,
                 shard_name="shard",
                 full_name="full"):

        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        for word, vec in vfile.iter_vectors():
            vec = np.frombuffer(vec, dtype="float32")
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")

            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = int(vidx / self.nvecs_per_shard)
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(
        self,
        out_fpath,
        nvecs,
        dim,
        N,
        t_occur,
        in_fpath,
        m_info={},
        full=False,
        num_vecs=None,
        nvps=None,
    ):
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )

        fp = open(self.J(out_fpath, "manifest.json"), "w")
        fp.write(json.dumps(mfc))
        fp.close()

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            fp = open(m_file, "r")
            c = json.loads(fp.read())
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"),
                          "r",
                          encoding="utf-8",
                          errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        if N:
            num_shards = math.ceil(nvecs / N)
        else:
            num_shards = 1

        t_occur = 0
        count = -1
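        # Stream vectors one by one; every N vectors, close the current shard
        # writer, emit a manifest for that shard, and open a new writer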
        for index, (vec, word, mag, occur) in enumerate(vecs):
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                self.wtoi[word] = index
                self.itow[index] = word

                self.mags.append(mag)
                self.occurs.append(occur)

                w.write(vec=vec, mag=mag, occur=occur)

            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)

            s_occur += occur

        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N)),
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()

            self.mags.flush()
            self.mags.close()

            self.occurs.flush()
            self.occurs.close()

            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
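
Per its docstring, the class above converts Google Word2Vec output into WordVecSpaceFile format. A minimal sketch of driving it, assuming in_dir contains the vectors.bin and vocab.txt files that start() reads; the paths and shard size are illustrative assumptions:

# Convert, sharding the space every 100000 vectors
converter = GW2VectoWordVecSpaceFile(
    in_dir='/data/w2v_out',
    outdir='/data/wvspace_out',
    nvecs_per_shard=100000,
)
converter.start()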
Example #24
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        token = input("Enter the search token: ")

        return token

    def get_shape(self):
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        #url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_nn_model_k_nearest?word={}&k=10'.format(token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')

        return result

    def get_user_label(self, token, nearest_token):
        #name = nearest_token.get('name', '')
        #nearest_token = nearest_token.get('word2', '')
        name = token
        '''
        if not name:
            name = nearest_token
        '''
        print('the nearest token is %s' % name)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))

        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)

        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S%d' % self.VEC_DIM),
            ('word2', 'S%d' % self.VEC_DIM),
            ('label', np.int),
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = int(self.get_user_label(token, nearest_token))
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
Example #25
    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)
Example #26
class TrainData(BaseScript):

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        #self.model = load_model(self.args.model)

    def get_shape(self):
        if not os.path.exists(self.args.train_file):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize
        return shape

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('label', np.int),
        ]

    def get_random_point(self):
        return random.randint(0, len(self.wvspace) - 1)

    def near_pair(self):
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        diff_vec = abs(vec1 - vec2)
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        self.words_f.write(word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        for i in range(self.args.n_samples):
            self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        parser.add_argument('n_samples', metavar='num-of-pairs', type=int)
Example #27
    def __init__(self, actual_vspace, transformed_vspace):

        self.wvspace = WordVecSpaceMem(actual_vspace)
        self.t_vspace = DiskArray(transformed_vspace,
                                  dtype=[('vec', np.float32, 300)])
Example #28
def main():
    '''
    Main function: parses command line arguments, generates several objects,
    and passes them all to the simulation.  Then runs the simulation and writes
    the results to an output file.
    '''

    # Set some defaults in case command line arguments are not supplied
    DEFAULT_TRACE_FILE_NAME = "./trace"
    DEFAULT_SPIN_DOWN_TIMEOUT = float("inf")  # seconds
    DEFAULT_COMPRESSION_THRESHOLD = 0.3  # compression ratio
    DEFAULT_COMPRESSION_ALG = "g"
    DEFAULT_OUTPUT_FILE_NAME = "output.csv"

    # Generate a parser for the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        "--trace",
                        help="file name of trace file to execute",
                        default=DEFAULT_TRACE_FILE_NAME,
                        metavar="TRACE_FILE")
    parser.add_argument("-t",
                        "--timeout",
                        help="spin down timeout for disks",
                        default=DEFAULT_SPIN_DOWN_TIMEOUT,
                        metavar="SPIN_DOWN_TIMEOUT")
    parser.add_argument(
        "-c",
        "--compression_alg",
        help=
        "compression algorithm to use in the simulation (g = gzip, b = bzip2, 7 = 7z, l, = lzop, s = snappy, fx = gzip x times faster, gx = gzip with x times better compression)",
        default=DEFAULT_COMPRESSION_ALG,
        choices=[
            "g", "b", "7", "l", "s", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
            "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17",
            "f18", "f19", "f20", "g1", "g2", "g3", "g4", "g5"
        ])

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-n",
                       "--none",
                       action="store_true",
                       help="do not compress any files")
    group.add_argument("-a",
                       "--all",
                       action="store_true",
                       help="compress all files")
    group.add_argument(
        "-r",
        "--compression_ratio",
        help="compression ratio below which files will be compressed",
        default=DEFAULT_COMPRESSION_THRESHOLD,
        type=float,
        metavar="COMPRESSION_RATIO")

    parser.add_argument(
        "-o",
        "--output",
        help="output file name (results will be appended to this file)",
        default=DEFAULT_OUTPUT_FILE_NAME,
        metavar="OUTPUT_FILE")

    # Parse the command line arguments
    args = parser.parse_args()

    trace_file_name = args.trace
    spin_down_timeout = args.timeout
    output_file_name = args.output

    if args.compression_alg == "g":
        compression_alg = gzip_alg
    elif args.compression_alg == "b":
        compression_alg = bzip2_alg
    elif args.compression_alg == "7":
        compression_alg = sevenz_alg
    elif args.compression_alg == "s":
        compression_alg = snappy_alg
    elif args.compression_alg == "l":
        compression_alg = lzop_alg
    elif args.compression_alg == "f1":
        compression_alg = faster1_alg
    elif args.compression_alg == "f2":
        compression_alg = faster2_alg
    elif args.compression_alg == "f3":
        compression_alg = faster3_alg
    elif args.compression_alg == "f4":
        compression_alg = faster4_alg
    elif args.compression_alg == "f5":
        compression_alg = faster5_alg
    elif args.compression_alg == "f6":
        compression_alg = faster6_alg
    elif args.compression_alg == "f7":
        compression_alg = faster7_alg
    elif args.compression_alg == "f8":
        compression_alg = faster8_alg
    elif args.compression_alg == "f9":
        compression_alg = faster9_alg
    elif args.compression_alg == "f10":
        compression_alg = faster10_alg
    elif args.compression_alg == "f11":
        compression_alg = faster11_alg
    elif args.compression_alg == "f12":
        compression_alg = faster12_alg
    elif args.compression_alg == "f13":
        compression_alg = faster13_alg
    elif args.compression_alg == "f14":
        compression_alg = faster14_alg
    elif args.compression_alg == "f15":
        compression_alg = faster15_alg
    elif args.compression_alg == "f16":
        compression_alg = faster16_alg
    elif args.compression_alg == "f17":
        compression_alg = faster17_alg
    elif args.compression_alg == "f18":
        compression_alg = faster18_alg
    elif args.compression_alg == "f19":
        compression_alg = faster19_alg
    elif args.compression_alg == "f20":
        compression_alg = faster20_alg
    elif args.compression_alg == "g1":
        compression_alg = greater1_alg
    elif args.compression_alg == "g2":
        compression_alg = greater2_alg
    elif args.compression_alg == "g3":
        compression_alg = greater3_alg
    elif args.compression_alg == "g4":
        compression_alg = greater4_alg
    elif args.compression_alg == "g5":
        compression_alg = greater5_alg

    if args.none:
        selection_alg = NoCompressionSelectionAlgorithm()
        compression_threshold = 0
    elif args.all:
        selection_alg = CompressEverythingSelectionAlgorithm()
        compression_threshold = float("inf")
    else:
        compression_threshold = args.compression_ratio
        selection_alg = ThresholdCompressionAlgorithm(compression_threshold,
                                                      compression_alg)

    # The following parameters are hard coded
    processor_model = xeonE52658v2
    num_cache_disks = 0
    cache_disk_model = siliconDriveA100ssd
    num_passive_disks = 100
    passive_disk_model = savvio10k6hd

    trace = Trace(trace_file_name)
    disk_array = DiskArray(num_cache_disks, cache_disk_model,
                           num_passive_disks, passive_disk_model,
                           spin_down_timeout)
    sim = Simulation(trace, compression_alg, selection_alg, processor_model,
                     disk_array)

    results = sim.run()

    # Write the parameters and results to the output file.  The file is written
    # in CSV format, which is easy to parse and can be read by many spreadsheet
    # applications.

    # Do some calculations up front
    average_read_time = 0
    if results.read_count > 0:
        average_read_time = results.total_read_time / results.read_count

    average_write_time = 0
    if results.write_count > 0:
        average_write_time = results.total_write_time / results.write_count

    total_energy_usage = results.processor_energy_usage + \
        results.disk_energy_usage

    # Open (or create) the file for appending.
    # Technically there is a bug here because the file might spontaneously
    # spring into existence between the time it is checked and the time it is
    # opened, but there's no need to worry about that for non-production code.
    file_exists = os.path.exists(output_file_name)
    output_file = open(output_file_name, 'a')

    # Write out the header, if needed
    if not file_exists:
        output_file.write("trace_file_name,compression_algorithm,"
                          "compression_threshold,spin_down_timeout,"
                          "total_read_time,read_count,avg_read_time,"
                          "total_write_time,write_count,avg_write_time,"
                          "processor_energy_used,disk_energy_used,"
                          "total_energy_used,total_capacity_used,"
                          "parse_error_occurred\n")

    # Write out the input parameters
    output_file.write(trace_file_name)
    output_file.write(",")
    output_file.write(compression_alg.name)
    output_file.write(",")
    output_file.write(str(compression_threshold))
    output_file.write(",")
    output_file.write(str(spin_down_timeout))
    output_file.write(",")

    # Write out the results
    output_file.write(str(results.total_read_time))
    output_file.write(",")
    output_file.write(str(results.read_count))
    output_file.write(",")
    output_file.write(str(average_read_time))
    output_file.write(",")
    output_file.write(str(results.total_write_time))
    output_file.write(",")
    output_file.write(str(results.write_count))
    output_file.write(",")
    output_file.write(str(average_write_time))
    output_file.write(",")
    output_file.write(str(results.processor_energy_usage))
    output_file.write(",")
    output_file.write(str(results.disk_energy_usage))
    output_file.write(",")
    output_file.write(str(total_energy_usage))
    output_file.write(",")
    output_file.write(str(results.total_capacity_usage))
    output_file.write(",")
    output_file.write(str(results.parse_error_occurred))
    output_file.write("\n")

    # TODO: it might be nice to provide finer grained metrics: for example,
    # energy used by reads vs. writes vs. idle, and separate processor time
    # from disk time for reads and writes.

    # Clean up
    output_file.close()