def _prepare_word_index_wvspace(self, dim, initialize=False, mode='r+'):
    def J(x):
        return os.path.join(self.dirpath, x)

    v_path = J('vectors')
    m_path = J('magnitudes')
    o_path = J('occurrences')

    # FIXME: Support taking memmap array from diskarray
    m_array = DiskArray(m_path, dtype='float32', mode=mode,
                        growby=self._growby, log=self.log)
    o_array = DiskArray(o_path, dtype='uint64', mode=mode,
                        growby=self._growby, log=self.log)

    if not initialize:
        v_array = DiskArray(v_path, dtype='float32', mode=mode,
                            growby=self._growby, log=self.log)
        vec_l = int(len(v_array) / dim)
        v_array = v_array[:].reshape(vec_l, dim)
        m_array = m_array[:]
        o_array = o_array[:]
    else:
        v_array = DiskArray(v_path, shape=(0, dim), dtype='float32', mode=mode,
                            growby=self._growby, log=self.log)

    wtoi = itow = None
    if not self.sharding:
        wtoi = DiskDict(J('wordtoindex'))
        itow = DiskDict(J('indextoword'))

    return v_array, o_array, m_array, wtoi, itow
def _prepare_word_index_wvspace(self, dim, initialize=False):
    v_dtype, o_dtype = self._make_dtype(dim)

    def J(x):
        return os.path.join(self.dirpath, x)

    def S(f):
        return os.stat(f).st_size

    v_path = J('vectors')
    o_path = J('occurrences')

    nvecs = noccurs = 0
    if not initialize:
        nvecs = S(v_path) / np.dtype(v_dtype).itemsize
        noccurs = S(o_path) / np.dtype(o_dtype).itemsize

    v_array = DiskArray(v_path, shape=(int(nvecs), ), dtype=v_dtype,
                        growby=self._growby, log=self.log)
    o_array = DiskArray(o_path, shape=(int(noccurs), ), dtype=o_dtype,
                        growby=self._growby, log=self.log)

    w_index = DiskDict(J('wordtoindex'))
    i_word = DiskDict(J('indextoword'))

    return v_array, o_array, w_index, i_word
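# For context, a hedged sketch of what _make_dtype (not shown in this snippet) could
# return: a structured record per vector and a plain uint64 per occurrence count. The
# exact field layout here is an assumption for illustration, not taken from the source.
import numpy as np

def _make_dtype(dim, word_len=256):
    v_dtype = [('vector', np.float32, dim), ('mag', np.float32), ('word', 'S%d' % word_len)]
    o_dtype = [('occurrence', np.uint64)]
    return v_dtype, o_dtype

v_dtype, o_dtype = _make_dtype(300)
# Number of records on disk = file size / itemsize, exactly as S(v_path) / itemsize above.
print(np.dtype(v_dtype).itemsize, np.dtype(o_dtype).itemsize)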
def load_data(self, fpath):
    words = [line[:-1] for line in open(fpath)]
    max_len = max(len(w) for w in words) + 2  # adding 2 for start and end chars
    nwords = len(words)

    chars = list(sorted(set(''.join(words))))
    chars = [self.CHAR_NONE, self.CHAR_START, self.CHAR_END] + chars
    charmap = {c: i for i, c in enumerate(chars)}
    nchars = len(chars)

    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)

    data = DiskArray('866k_training_data.array', shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray('866k_labels_data.array', shape=(nwords, nchars), dtype=np.float32)

    for i in range(nwords):
        w = words[i][:-1]
        last_char = words[i][-1]
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    data.flush()
    labels.flush()
    return data, labels
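# A quick illustration of the encoding used above: to_categorical turns a character
# index into a one-hot row, and shorter words are padded with the CHAR_NONE row up to
# max_len. Standalone sketch with a made-up character set and word.
import numpy as np
from keras.utils import to_categorical

chars = ['_', '^', '$', 'a', 'b', 'c']   # NONE, START, END plus a tiny vocabulary
charmap = {c: i for i, c in enumerate(chars)}
nchars = len(chars)
max_len = 6

word = '^ab$'                            # start char + word + end char
rows = [to_categorical(charmap[c], num_classes=nchars) for c in word]
rows += [to_categorical(charmap['_'], num_classes=nchars)] * (max_len - len(rows))
print(np.array(rows).shape)              # (6, 6): max_len x nchars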
def run(self):
    self.train_d = DiskArray(self.args.trainf, dtype=self._get_dtype())
    self.test_d = DiskArray(self.args.testf, dtype=self._get_dtype())
    model = self.init_model()
    self.compile_model(model)
    self.train_model(model)
    self.test_model(model)
    self.save_model(model)
def __init__(self):
    super(GridScript, self).__init__()
    self.train_d = DiskArray(self.args.train_d, dtype=self._get_dtype())
    self.test_d = DiskArray(self.args.test_d, dtype=self._get_dtype())
    self.csv = open(self.args.outf, 'w')
    self.outf = csv.writer(self.csv)
    self.outf.writerow([
        'num of hidden layers', 'loss', 'activation', 'optimizer',
        'epochs', 'cohens_d', 'accuracy'
    ])
    self.hyper_parameters = []
def __init__(self):
    super(Generate_Histogram, self).__init__()
    self.d = DiskArray(self.args.array, dtype=self._get_dtype())
    self.psame = []
    self.pnsame = []
    csv_f = open(self.args.csv_f, 'w')
    self.csv_f = csv.writer(csv_f)
def run(self):
    self.train_d = DiskArray(self.args.trainf, dtype=self._get_dtype())
    # NOTE: the next line overrides the DiskArray loaded above with a synthetic range
    self.train_d = np.array([i for i in range(10000)], dtype=int)
    model = self.init_model()
    self.compile_model(model)
    self.train_model(model)
    self.test_model(model)
    model.save(self.args.model)
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0, ),
                           dtype=[('vec', np.float32, 128)])

    if not os.path.exists(self.args.training_data):
        data, labels = self.load_data(max_len, nchars, nwords, charmap)
    else:
        data = DiskArray(self.args.training_data, dtype=np.float32)
        labels = DiskArray(self.args.labels_data, dtype=np.float32)

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        self.log.info('Started training the model')
        history = model.fit(data[:], labels[:], epochs=self.args.epochs, batch_size=128)
        plt.plot(history.history['loss'])
        plt.savefig(self.args.image_name)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    self.log.info('Accessing the layer weights')
    new_model = Sequential()
    new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
    weights = model.layers[0].get_weights()
    new_model.set_weights(weights)

    self.log.info('started predicting')
    for word in open(fpath):
        word = word.strip()
        test_data, test_labels = self.get_test_data(max_len, nchars, 1, [word], charmap)
        p_out = new_model.predict(test_data)
        disk_array.append((p_out[0], ))

    disk_array.flush()
def test():
    d = DiskArray(inp_f, dtype=[('vec', np.float32, 128)])
    mapping = DiskDict(dict_file)
    print('The given word is', mapping[str(index)])

    vectors = d['vec']
    vec = vectors[index].reshape(1, len(vectors[0]))
    vectors_t = vectors.T

    # Dot products act as similarity scores here, so sort descending to get
    # the most similar words first.
    dists = np.dot(vec, vectors_t)
    k_near = np.argsort(dists)[0][::-1]

    words = []
    for i in k_near:
        words.append(mapping[str(i)])
    return words
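# If the stored vectors are L2-normalized, the dot product above is exactly cosine
# similarity. A self-contained sketch of the same nearest-neighbour lookup on a toy
# matrix (the data is random; only the mechanics mirror test() above).
import numpy as np

vectors = np.random.rand(5, 128).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)   # unit-length rows

query = vectors[0].reshape(1, -1)
sims = np.dot(query, vectors.T)                             # cosine similarity for unit vectors
nearest = np.argsort(sims[0])[::-1][:3]                     # top-3, most similar first
print(nearest)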
def run(self):
    fpath = self.args.text
    fpath_pickled = self.args.text + ".pickled"
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0, ),
                           dtype=[('vec', np.float32, 128)])

    '''
    if not os.path.exists(fpath_pickled):
        data, labels = self.load_data(max_len, nchars, nwords, words, charmap)
        pickle.dump((data, labels), open(fpath_pickled, 'wb'))
    else:
        data, labels = pickle.load(open(fpath_pickled, 'rb'))
    '''

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        #history = model.fit(data, labels, epochs=self.args.epochs, batch_size=128)
        generator = self.generator(max_len, nchars, nwords, words, charmap, 2048)
        model.fit_generator(generator, steps_per_epoch=nwords / 2048, epochs=self.args.epochs)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    if self.args.layer == 'lstm_layer':
        self.log.info('Accessing the layer weights')
        new_model = Sequential()
        new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
        weights = model.layers[0].get_weights()
        new_model.set_weights(weights)
        model_p = new_model
    else:
        model_p = model

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1, [word], charmap)
        p_out = model_p.predict(test_data)
        disk_array.append((p_out[0], ))

    disk_array.flush()
def run(self):
    model = load_model(
        self.args.model_name,
        custom_objects=dict(
            _euclidean_distance=_euclidean_distance,
            _eucl_dist_output_shape=_eucl_dist_output_shape))

    # Field names must be unique and match the lookups below ('vec1', 'vec2', 'label').
    test_d = DiskArray(self.args.test_f,
                       dtype=[('vec1', np.float32, 300),
                              ('vec2', np.float32, 300),
                              ('label', int)])

    csv_f = open(self.args.csv_file, 'w')
    csv_file = csv.writer(csv_f)
    csv_file.writerow(['label', 'prediction'])

    for i in range(len(test_d['vec1'])):
        vec1 = test_d['vec1'][i]
        vec2 = test_d['vec2'][i]
        pred_val = self.get_prediction(vec1, vec2, model)
        label = test_d['label'][i]
        csv_file.writerow([label, pred_val])
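# The custom objects passed to load_model above are usually the Lambda-layer distance
# function and its output-shape helper used at training time. A minimal sketch of the
# conventional siamese-network definitions; this is an assumption about their shape,
# not code taken from the source.
from keras import backend as K

def _euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def _eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)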
def k_nearest(wvspace, disk_f, word):
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    index = wv.get_word_index(word)
    result = wv.get_nearest(index, k=10)
    print(wv.get_word_at_indices(result))

    vec = da['vec'][index].reshape(1, 300)
    vecs = da['vec']

    #dist = distance.cdist(vec, vecs, 'cosine')
    dist = distance.cdist(vec, vecs, 'euclidean')
    #dist = np.dot(vec, vecs.T)

    dist = pd.Series(dist[0])
    res = dist.nsmallest(10).keys()

    print('\n')
    print(wv.get_word_at_indices(list(res)))
def k_nearest(wvspace, disk_f, words, metric, image_name):
    f = open(words)
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])
    vecs = da['vec']

    psame = []
    pnsame = []
    for line in f:
        words = json.loads(line.strip())
        index1 = wv.get_word_index(words[0])
        index2 = wv.get_word_index(words[1])
        if 'clinicaltrials' in words[0] or 'clinicaltrials' in words[1]:
            continue

        vec1 = vecs[index1].reshape(1, 300)
        vec2 = vecs[index2].reshape(1, 300)

        if metric == 'cosine':
            vspace_dist = wv.get_distance(words[0], words[1])
            tvspace_dist = distance.cosine(vec1, vec2)
        else:
            vspace_dist = wv.get_distance(words[0], words[1], metric='euclidean')
            tvspace_dist = distance.euclidean(vec1, vec2)

        if words[2] == 0:
            psame.append(tvspace_dist)
        else:
            pnsame.append(tvspace_dist)

    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))
    d = nm / dm
    print("the Cohen's d distance is", d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)
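# The statistic printed above is a Cohen's-d-style effect size: the absolute difference
# of the two group means divided by the average of their standard deviations. A tiny
# check with made-up numbers.
import numpy as np

same = np.array([0.20, 0.25, 0.30, 0.22])
not_same = np.array([0.80, 0.75, 0.90, 0.85])

pooled_sd = (np.std(same) + np.std(not_same)) / 2
cohens_d = abs(np.mean(same) - np.mean(not_same)) / pooled_sd
print(cohens_d)   # a large value means the two distance distributions are well separated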
def load_data(self, max_len, nchars, nwords, charmap):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)

    data = DiskArray(self.args.training_data, shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray(self.args.labels_data, shape=(nwords, nchars), dtype=np.float32)

    f = open(self.args.text)
    for i, line in enumerate(f):
        line = line.strip()
        w = line[:-1]
        last_char = line[-1]
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    self.log.info('generating vectors is done')
    data.flush()
    labels.flush()
    return data, labels
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 108)])
    model = load_model(self.args.model_name)

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1, [word], charmap)
        p_out = model.predict(test_data)
        disk_array.append((p_out[0],))

    disk_array.flush()
def __init__(self):
    super(DistanceFunction, self).__init__()
    self.test_d = DiskArray(self.args.testf, dtype=self._get_dtype())
import sys

import numpy as np
from keras import backend as K
from keras.models import load_model
from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


model = load_model(
    sys.argv[1],
    custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], dtype=[('vec', np.float32, 300)])
wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    nvecs = len(wv.vectors)
    for num in range(nvecs):
        vec = wv.get_word_vector(num)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec, ))
def disk_array_test():
    timeout = 500
    disks = 10

    # Set up a disk model, disk array, and some files for testing purposes
    test_disk = DiskModel(
        2.5,               # spin up time
        30,                # spin up energy
        10,                # spin down energy
        3,                 # idle power
        7,                 # read power
        8,                 # write power
        300 * units.MiB,   # speed
        0.003,             # seek time
        500 * units.GiB)   # capacity
    disk_array = DiskArray(0, test_disk, disks, test_disk, timeout)
    file1 = FileInfo("file1", "/", FileType.TEXT, 1 * units.GiB)
    file1_compressed_size = 300 * units.MiB
    file2 = FileInfo("file2", "/", FileType.BINARY, 40 * units.MiB)
    file2_compressed_size = 35 * units.MiB

    # Tests
    passed = True

    # Write before the disks turn off
    current_time = timeout / 2
    disk_array.update_time(current_time)
    time = disk_array.write(file1, file1_compressed_size)
    energy = disk_array.get_energy_usage()
    expected_time = test_disk.seek_time + (file1_compressed_size / test_disk.speed)
    expected_energy = (timeout / 2) * test_disk.idle_power * disks + \
        expected_time * test_disk.write_power
    if not floateq(time, expected_time):
        passed = False
        print("Failed write test 1 for time")
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed write test 1 for energy")

    # Update the time to when most of the disks turn off
    current_time += timeout / 2
    disk_array.update_time(current_time)
    energy = disk_array.get_energy_usage()
    expected_energy += disks * (timeout / 2) * test_disk.idle_power + \
        (disks - 1) * test_disk.spin_down_energy
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed test 2")

    # Update the time so that the last disk turns off
    current_time += timeout / 2
    disk_array.update_time(current_time)
    energy = disk_array.get_energy_usage()
    expected_energy += (timeout / 2) * test_disk.idle_power + \
        test_disk.spin_down_energy
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed test 3")

    # Turn a disk back on to read
    current_time += timeout * 10
    disk_array.update_time(current_time)
    time = disk_array.read(file1)
    energy = disk_array.get_energy_usage()
    read_time = file1_compressed_size / test_disk.speed
    expected_time = test_disk.spin_up_time + read_time
    expected_energy += test_disk.spin_up_energy + \
        read_time * test_disk.read_power
    if not floateq(time, expected_time):
        passed = False
        print("Failed read test 4 for time")
    if not floateq(energy, expected_energy):
        passed = False
        print("Failed read test 4 for energy")

    # Try to read a file that's not there
    exception_occurred = False
    try:
        disk_array.read(file2)
    except:
        exception_occurred = True
    if not exception_occurred:
        passed = False
        print("Failed read test for non-existent file")

    # Try to allocate some cache disks
    exception_occurred = False
    try:
        disk_array = DiskArray(1, test_disk, disks, test_disk, timeout)
    except:
        exception_occurred = True
    if not exception_occurred:
        passed = False
        print("Failed test for allocating cache disks")

    # TODO: add a test where multiple disks are involved (one disk is still on
    # from a previous operation while others are turned on for new operations)

    if passed:
        print("All disk array tests passed")
    else:
        print("Disk array tests FAILED")
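# Sanity check of write test 1 above, assuming the powers are in watts and sizes in
# MiB: 300 MiB written at 300 MiB/s plus a 0.003 s seek, while 10 disks idled at 3 W
# for the preceding 250 s.
seek_time = 0.003
write_time = 300 / 300                                # 1.0 s
expected_time = seek_time + write_time                # 1.003 s
expected_energy = 250 * 3 * 10 + expected_time * 8    # 7500 J idle + ~8.02 J writing
print(expected_time, expected_energy)                 # 1.003, ~7508.02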
import sys

import numpy as np
from keras.models import load_model
from keras import backend as K
from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


model = load_model(
    sys.argv[1],
    custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], shape=(0, ), dtype=[('vec', np.float32, 300)])
wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    for i in range(wv.nvecs):
        word = wv.get_word_at_index(i)
        vec = wv.get_word_vector(word, raise_exc=True)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec[0], ))


get_tras_vectors()
out_f.flush()
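# Once flushed, the transformed vectors can be re-opened as a structured DiskArray and
# read like a numpy record array; a minimal sketch, with a placeholder path standing in
# for the file written above.
import numpy as np
from diskarray import DiskArray

t_vecs = DiskArray('transformed_vectors.array', dtype=[('vec', np.float32, 300)])
print(len(t_vecs['vec']), t_vecs['vec'][0][:5])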
import numpy as np
from diskarray import DiskArray

d = DiskArray('/tmp/pras.array', shape=(2, 2), dtype=np.float32)
a = np.ndarray((2, 2), dtype=np.float32)
d.extend(a)
print(d[:])
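# Most snippets in this collection use the append()/flush() pattern with a structured
# dtype rather than extend(); a minimal sketch of that pattern, mirroring the example
# above (the path and field names are placeholders).
import numpy as np
from diskarray import DiskArray

d = DiskArray('/tmp/pairs.array', shape=(0,),
              dtype=[('vec', np.float32, 4), ('label', np.int64)])
d.append((np.ones(4, dtype=np.float32), 1))
d.flush()
print(d['vec'][:], d['label'][:])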
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format.
    """

    def __init__(self, in_dir, outdir, nvecs_per_shard=0,
                 shard_name="shard", full_name="full"):
        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        for word, vec in vfile.iter_vectors():
            vec = np.fromstring(vec, dtype="float32")
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")
            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = int(vidx / self.nvecs_per_shard)
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(
        self,
        out_fpath,
        nvecs,
        dim,
        N,
        t_occur,
        in_fpath,
        m_info={},
        full=False,
        num_vecs=None,
        nvps=None,
    ):
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )
        fp = open(self.J(out_fpath, "manifest.json"), "w")
        fp.write(json.dumps(mfc))
        fp.close()

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            fp = open(m_file, "r")
            c = json.loads(fp.read())
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"), "r",
                          encoding="utf-8", errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        if N:
            num_shards = math.ceil(nvecs / N)
        else:
            num_shards = 1

        t_occur = 0
        count = -1
        for index, (vec, word, mag, occur) in enumerate(vecs):
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                self.wtoi[word] = index
                self.itow[index] = word

                self.mags.append(mag)
                self.occurs.append(occur)

                w.write(vec=vec, mag=mag, occur=occur)
            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)

            s_occur += occur

        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N)),
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()

            self.mags.flush()
            self.mags.close()

            self.occurs.flush()
            self.occurs.close()

            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
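# For reference, the shard-level manifest written by _create_manifest above is plain
# JSON; the keys mirror the code, the values below are made up for illustration.
import json
from datetime import datetime

mfc = dict(
    num_shards=4,
    num_vecs_in_shard=250000,
    num_vecs=1000000,
    num_vecs_per_shard=250000,
    dimension=300,
    num_words=987654321,
    dt_creation=datetime.utcnow().isoformat(),
    input_path="/data/w2v",
    manifest_info={},
)
print(json.dumps(mfc, indent=2))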
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f, shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        token = input("Enter the search token: ")
        return token

    def get_shape(self):
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(token)
        #url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_nn_model_k_nearest?word={}&k=10'.format(token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')
        return result

    def get_user_label(self, token, nearest_token):
        #name = nearest_token.get('name', '')
        #nearest_token = nearest_token.get('word2', '')
        name = token
        '''
        if not name:
            name = nearest_token
        '''
        print('the nearest token is %s' % name)
        label = input("Mark the distance between {} and {}: ".format(token, nearest_token))
        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)
        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            # Fixed-length byte strings so the appended words are actually stored
            # (a bare 'S' has itemsize 0 and would truncate them).
            ('word1', 'S%d' % self.VEC_DIM),
            ('word2', 'S%d' % self.VEC_DIM),
            ('label', int),
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = int(self.get_user_label(token, nearest_token))
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token, nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
class TrainData(BaseScript):
    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file, shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        #self.model = load_model(self.args.model)

    def get_shape(self):
        if not os.path.exists(self.args.train_file):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_file).st_size // np.dtype(dtype).itemsize
        return shape

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('label', int),
        ]

    def get_random_point(self):
        return random.randint(0, len(self.wvspace))

    def near_pair(self):
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        diff_vec = abs(vec1 - vec2)

        # NOTE: requires self.model, whose loading is commented out in __init__
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1

        self.train_f.append((vec1, vec2, p_value))
        self.words_f.write(word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        for i in range(self.args.n_samples):
            self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        parser.add_argument('n_samples', type=int, metavar='num-of-pairs')
def __init__(self, actual_vspace, transformed_vspace):
    self.wvspace = WordVecSpaceMem(actual_vspace)
    self.t_vspace = DiskArray(transformed_vspace, dtype=[('vec', np.float32, 300)])
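# A plausible companion method for this class would compare a word pair's distance in
# the original space with the distance between its transformed vectors on disk. Sketch
# only: the method name is made up, and it assumes the WordVecSpaceMem calls used in
# the other snippets (get_word_index, get_distance) plus scipy's euclidean.
from scipy.spatial import distance

def compare_distance(self, word1, word2):
    actual = self.wvspace.get_distance(word1, word2, metric='euclidean')

    i1 = self.wvspace.get_word_index(word1)
    i2 = self.wvspace.get_word_index(word2)
    transformed = distance.euclidean(self.t_vspace['vec'][i1], self.t_vspace['vec'][i2])

    return actual, transformed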
def main():
    '''
    Main function: parses command line arguments, generates several objects,
    and passes them all to the simulation. Then runs the simulation and
    writes the results to an output file.
    '''
    # Set some defaults in case command line arguments are not supplied
    DEFAULT_TRACE_FILE_NAME = "./trace"
    DEFAULT_SPIN_DOWN_TIMEOUT = float("inf")  # seconds
    DEFAULT_COMPRESSION_THRESHOLD = 0.3       # compression ratio
    DEFAULT_COMPRESSION_ALG = "g"
    DEFAULT_OUTPUT_FILE_NAME = "output.csv"

    # Generate a parser for the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--trace",
                        help="file name of trace file to execute",
                        default=DEFAULT_TRACE_FILE_NAME,
                        metavar="TRACE_FILE")
    parser.add_argument("-t", "--timeout",
                        help="spin down timeout for disks",
                        default=DEFAULT_SPIN_DOWN_TIMEOUT,
                        metavar="SPIN_DOWN_TIMEOUT")
    parser.add_argument(
        "-c", "--compression_alg",
        help="compression algorithm to use in the simulation "
             "(g = gzip, b = bzip2, 7 = 7z, l = lzop, s = snappy, "
             "fx = gzip x times faster, gx = gzip with x times better compression)",
        default=DEFAULT_COMPRESSION_ALG,
        choices=["g", "b", "7", "l", "s"] +
                ["f%d" % i for i in range(1, 21)] +
                ["g%d" % i for i in range(1, 6)])
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-n", "--none", action="store_true",
                       help="do not compress any files")
    group.add_argument("-a", "--all", action="store_true",
                       help="compress all files")
    group.add_argument(
        "-r", "--compression_ratio",
        help="compression ratio below which files will be compressed",
        default=DEFAULT_COMPRESSION_THRESHOLD,
        type=float,
        metavar="COMPRESSION_RATIO")
    parser.add_argument(
        "-o", "--output",
        help="output file name (results will be appended to this file)",
        default=DEFAULT_OUTPUT_FILE_NAME,
        metavar="OUTPUT_FILE")

    # Parse the command line arguments
    args = parser.parse_args()
    trace_file_name = args.trace
    spin_down_timeout = args.timeout
    output_file_name = args.output

    # Map the short option strings to the corresponding algorithm objects
    compression_algs = {
        "g": gzip_alg, "b": bzip2_alg, "7": sevenz_alg,
        "s": snappy_alg, "l": lzop_alg,
        "f1": faster1_alg, "f2": faster2_alg, "f3": faster3_alg,
        "f4": faster4_alg, "f5": faster5_alg, "f6": faster6_alg,
        "f7": faster7_alg, "f8": faster8_alg, "f9": faster9_alg,
        "f10": faster10_alg, "f11": faster11_alg, "f12": faster12_alg,
        "f13": faster13_alg, "f14": faster14_alg, "f15": faster15_alg,
        "f16": faster16_alg, "f17": faster17_alg, "f18": faster18_alg,
        "f19": faster19_alg, "f20": faster20_alg,
        "g1": greater1_alg, "g2": greater2_alg, "g3": greater3_alg,
        "g4": greater4_alg, "g5": greater5_alg,
    }
    compression_alg = compression_algs[args.compression_alg]

    if args.none:
        selection_alg = NoCompressionSelectionAlgorithm()
        compression_threshold = 0
    elif args.all:
        selection_alg = CompressEverythingSelectionAlgorithm()
        compression_threshold = float("inf")
    else:
        compression_threshold = args.compression_ratio
        selection_alg = ThresholdCompressionAlgorithm(compression_threshold,
                                                      compression_alg)

    # The following parameters are hard coded
    processor_model = xeonE52658v2
    num_cache_disks = 0
    cache_disk_model = siliconDriveA100ssd
    num_passive_disks = 100
    passive_disk_model = savvio10k6hd

    trace = Trace(trace_file_name)
    disk_array = DiskArray(num_cache_disks, cache_disk_model,
                           num_passive_disks, passive_disk_model,
                           spin_down_timeout)
    sim = Simulation(trace, compression_alg, selection_alg, processor_model,
                     disk_array)
    results = sim.run()

    # Write the parameters and results to the output file. The file is written
    # in CSV format, which is easy to parse and can be read by many spreadsheet
    # applications.

    # Do some calculations up front
    average_read_time = 0
    if results.read_count > 0:
        average_read_time = results.total_read_time / results.read_count
    average_write_time = 0
    if results.write_count > 0:
        average_write_time = results.total_write_time / results.write_count
    total_energy_usage = results.processor_energy_usage + \
        results.disk_energy_usage

    # Open (or create) the file for appending.
    # Technically there is a bug here because the file might spontaneously
    # spring into existence between the time it is checked and the time it is
    # opened, but there's no need to worry about that for non-production code.
    file_exists = os.path.exists(output_file_name)
    output_file = open(output_file_name, 'a')

    # Write out the header, if needed
    if not file_exists:
        output_file.write("trace_file_name,compression_algorithm,"
                          "compression_threshold,spin_down_timeout,"
                          "total_read_time,read_count,avg_read_time,"
                          "total_write_time,write_count,avg_write_time,"
                          "processor_energy_used,disk_energy_used,"
                          "total_energy_used,total_capacity_used,"
                          "parse_error_occurred\n")

    # Write out the input parameters and results as one CSV row
    fields = [
        trace_file_name,
        compression_alg.name,
        compression_threshold,
        spin_down_timeout,
        results.total_read_time,
        results.read_count,
        average_read_time,
        results.total_write_time,
        results.write_count,
        average_write_time,
        results.processor_energy_usage,
        results.disk_energy_usage,
        total_energy_usage,
        results.total_capacity_usage,
        results.parse_error_occurred,
    ]
    output_file.write(",".join(str(f) for f in fields) + "\n")

    # TODO: it might be nice to provide finer grained metrics: for example,
    # energy used by reads vs writes vs idle, and separate out processor time
    # from disk time for reads and writes.

    # Clean up
    output_file.close()