def load_data(self, fpath):
    words = [line[:-1] for line in open(fpath)]
    max_len = max(len(w) for w in words) + 2  # +2 for the start and end chars
    nwords = len(words)

    # Build the character vocabulary, reserving the first three slots
    # for the padding, start, and end markers.
    chars = sorted(set(''.join(words)))
    chars = [self.CHAR_NONE, self.CHAR_START, self.CHAR_END] + chars
    charmap = {c: i for i, c in enumerate(chars)}
    nchars = len(chars)

    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)

    # Disk-backed arrays (paths hardcoded for the 866k-word dataset) so the
    # full one-hot tensor never has to fit in memory.
    data = DiskArray('866k_training_data.array',
                     shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray('866k_labels_data.array',
                       shape=(nwords, nchars), dtype=np.float32)

    for i in range(nwords):
        w = words[i][:-1]
        last_char = words[i][-1]  # the last character is the label
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    data.flush()
    labels.flush()
    return data, labels
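# A minimal usage sketch for load_data (the instance name and input path are
# assumptions): each line of the input file is one word whose last character
# becomes the prediction label. The returned DiskArrays are disk-backed and
# can be sliced lazily:
#
#   data, labels = trainer.load_data('words.txt')
#   data.shape    # (nwords, max_len, nchars)
#   labels.shape  # (nwords, nchars)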
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 128)])

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        # Train from a generator so the one-hot batches are built on the
        # fly instead of materializing the whole dataset.
        generator = self.generator(max_len, nchars, nwords, words, charmap, 2048)
        model.fit_generator(generator,
                            steps_per_epoch=math.ceil(nwords / 2048),
                            epochs=self.args.epochs)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    if self.args.layer == 'lstm_layer':
        self.log.info('Accessing the layer weights')
        # Rebuild a model that ends at the LSTM layer so predict()
        # returns the 128-dim hidden state instead of the softmax output.
        new_model = Sequential()
        new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
        new_model.set_weights(model.layers[0].get_weights())
        model_p = new_model
    else:
        model_p = model

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1,
                                                [word], charmap)
        p_out = model_p.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
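# self.generator above is not shown in this snippet. A minimal sketch of what
# fit_generator expects -- an endless iterator yielding (batch_data,
# batch_labels) -- assuming the same encoding as load_data and the
# module-level imports (np, to_categorical) already used there:
def generator(self, max_len, nchars, nwords, words, charmap, batch_size):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    while True:  # Keras expects the generator to loop forever
        for start in range(0, nwords, batch_size):
            batch = words[start:start + batch_size]
            data = np.zeros((len(batch), max_len, nchars), dtype=np.float32)
            labels = np.zeros((len(batch), nchars), dtype=np.float32)
            for i, word in enumerate(batch):
                # Input is the word minus its last char, bracketed by the
                # start/end markers; the label is the last char.
                w = '%s%s%s' % (self.CHAR_START, word[:-1], self.CHAR_END)
                enc = [to_categorical(charmap[c], num_classes=nchars) for c in w]
                enc += [char_none] * (max_len - len(enc))
                data[i] = enc
                labels[i] = to_categorical(charmap[word[-1]], num_classes=nchars)
            yield data, labels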
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)
    # The vector dimension here must match the loaded model's output
    # layer (108 in this setup).
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 108)])
    model = load_model(self.args.model_name)

    self.log.info('started predicting')
    for word in words:
        test_data, test_labels = self.load_data(max_len, nchars, 1,
                                                [word], charmap)
        p_out = model.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
def run(self):
    fpath = self.args.text
    max_len, nchars, nwords, charmap = self.get_char_to_int(fpath)
    disk_array = DiskArray(self.args.out_f, shape=(0,),
                           dtype=[('vec', np.float32, 128)])

    # Reuse the cached one-hot training data if it already exists on disk.
    if not os.path.exists(self.args.training_data):
        data, labels = self.load_data(max_len, nchars, nwords, charmap)
    else:
        data = DiskArray(self.args.training_data, dtype=np.float32)
        labels = DiskArray(self.args.labels_data, dtype=np.float32)

    if not os.path.exists(self.args.model_name):
        model = self.create_model(128, max_len, nchars)
        self.log.info('Started training the model')
        history = model.fit(data[:], labels[:],
                            epochs=self.args.epochs, batch_size=128)
        plt.plot(history.history['loss'])
        plt.savefig(self.args.image_name)
    else:
        model = load_model(self.args.model_name)

    model.save(self.args.model_name)

    self.log.info('Accessing the layer weights')
    # Truncate the trained model at the LSTM layer so predict() yields
    # the 128-dim hidden state as the word vector.
    new_model = Sequential()
    new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
    new_model.set_weights(model.layers[0].get_weights())

    self.log.info('started predicting')
    for word in open(fpath):
        word = word.strip()
        test_data, test_labels = self.get_test_data(max_len, nchars, 1,
                                                    [word], charmap)
        p_out = new_model.predict(test_data)
        disk_array.append((p_out[0],))
    disk_array.flush()
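# get_test_data is referenced above but not shown. A plausible sketch that
# mirrors load_data's encoding while returning plain in-memory arrays for a
# handful of words (the name and signature follow the call site; the body is
# an assumption):
def get_test_data(self, max_len, nchars, nwords, words, charmap):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    data = np.zeros((nwords, max_len, nchars), dtype=np.float32)
    labels = np.zeros((nwords, nchars), dtype=np.float32)
    for i, word in enumerate(words):
        w = '%s%s%s' % (self.CHAR_START, word[:-1], self.CHAR_END)
        enc = [to_categorical(charmap[c], num_classes=nchars) for c in w]
        enc += [char_none] * (max_len - len(enc))
        data[i] = enc
        labels[i] = to_categorical(charmap[word[-1]], num_classes=nchars)
    return data, labels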
def load_data(self, max_len, nchars, nwords, charmap):
    char_none = to_categorical(charmap[self.CHAR_NONE], num_classes=nchars)
    data = DiskArray(self.args.training_data,
                     shape=(nwords, max_len, nchars), dtype=np.float32)
    labels = DiskArray(self.args.labels_data,
                       shape=(nwords, nchars), dtype=np.float32)

    f = open(self.args.text)
    for i, line in enumerate(f):
        line = line.strip()
        w = line[:-1]
        last_char = line[-1]
        w = '%s%s%s' % (self.CHAR_START, w, self.CHAR_END)
        w = [to_categorical(charmap[x], num_classes=nchars) for x in w]
        w = w + ([char_none] * (max_len - len(w)))
        data[i] = w
        labels[i] = to_categorical(charmap[last_char], num_classes=nchars)

    self.log.info('generating vectors is done')
    data.flush()
    labels.flush()
    return data, labels
import sys

import numpy as np
from keras.models import load_model
from keras import backend as K
from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


# The model was saved with a custom loss, so it has to be supplied again
# when loading.
model = load_model(
    sys.argv[1],
    custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], shape=(0,), dtype=[('vec', np.float32, 300)])
wv = WordVecSpaceMem(sys.argv[3])


def get_trans_vectors():
    # Run every vector in the space through the model and store the
    # transformed 300-dim output.
    for i in range(wv.nvecs):
        word = wv.get_word_at_index(i)
        vec = wv.get_word_vector(word, raise_exc=True)
        vec = vec.reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec[0],))


get_trans_vectors()
out_f.flush()
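# Expected invocation, per the sys.argv indices above (the script name is
# hypothetical):
#
#   python transform_vectors.py model.h5 out_vectors.array /path/to/wvspace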
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format.
    """

    def __init__(self, in_dir, outdir, nvecs_per_shard=0,
                 shard_name="shard", full_name="full"):
        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0,),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0,),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        # Normalize each vector and pair it with the word and occurrence
        # count read from the vocab file.
        for word, vec in vfile.iter_vectors():
            vec = np.fromstring(vec, dtype="float32")
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")
            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = int(vidx / self.nvecs_per_shard)
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(self, out_fpath, nvecs, dim, N, t_occur, in_fpath,
                         m_info={}, full=False, num_vecs=None, nvps=None):
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )
        with open(self.J(out_fpath, "manifest.json"), "w") as fp:
            fp.write(json.dumps(mfc))

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            with open(m_file, "r") as fp:
                c = json.loads(fp.read())
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"), "r",
                          encoding="utf-8", errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        num_shards = math.ceil(nvecs / N) if N else 1

        t_occur = 0
        count = -1
        for index, (vec, word, mag, occur) in enumerate(vecs):
            # Every N vectors, close out the current shard and write its
            # manifest before opening the next one.
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                # Shards carry only vectors; the word/index maps live in
                # the "full" directory.
                self.wtoi[word] = index
                self.itow[index] = word
                self.mags.append(mag)
                self.occurs.append(occur)
                w.write(vec=vec, mag=mag, occur=occur)
            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)
            s_occur += occur

        # Close out the final (possibly partial) shard.
        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N)),
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()
            self.mags.flush()
            self.mags.close()
            self.occurs.flush()
            self.occurs.close()
            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
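# A minimal usage sketch, assuming in_dir holds Google word2vec output
# (vectors.bin plus a "word count" vocab.txt); paths and the shard size are
# hypothetical:
#
#   converter = GW2VectoWordVecSpaceFile('/data/w2v_out', '/data/wvspace',
#                                        nvecs_per_shard=1000000)
#   converter.start()
#
# With nvecs_per_shard=0 (the default) no sharding happens and a single
# WordVecSpaceFile is written to outdir.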
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(),),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        return input("Enter the search token: ")

    def get_shape(self):
        # Infer how many records the existing train file already holds.
        if not os.path.exists(self.args.train_f):
            return 0
        dtype = self.get_dtype()
        return os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize

    def get_nearest_token(self, token):
        url = ('http://dev0.servers.deepcompute.com:8888/api/v1/'
               'get_k_nearest_cosine?word={}&k=10').format(token)
        response = requests.get(url).json()
        return response.get('result')

    def get_user_label(self, token, nearest_token):
        print('the nearest token is %s' % token)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))
        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)
        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S', self.VEC_DIM),
            ('word2', 'S', self.VEC_DIM),
            ('label', np.int),
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = self.get_user_label(token, nearest_token)
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
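# Typical invocation, assuming BaseScript parses the arguments from
# define_args and then calls run() (paths are hypothetical):
#
#   python correctional_training.py train.array /path/to/wvspace
#
# Any label other than 0 or 1 skips the pair, so a reviewer can pass on
# uncertain neighbours; Ctrl-C ends the session and the finally block still
# flushes the DiskArray.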