class KNearestService(object):
    """Compare the k nearest neighbours of a word in the original and
    transformed vector spaces, and report how well they overlap."""

    def __init__(self, actual_vspace, transformed_vspace):
        # Original word-vector space, loaded into memory.
        self.wvspace = WordVecSpaceMem(actual_vspace)
        # Transformed vectors stored on disk, 300-dim float32 each.
        self.t_vspace = DiskArray(transformed_vspace,
                                  dtype=[('vec', np.float32, 300)])

    def k_nearest(self, word: str, k: int = 10, metric: str = 'angular') -> dict:
        """Return the k nearest words in both spaces plus the recall
        (fraction of original-space neighbours recovered in the
        transformed space)."""
        index = self.wvspace.get_word_index(word)

        # Neighbours according to the original space.
        nearest = self.wvspace.get_nearest(index, k, metric=metric)
        actual_results = self.wvspace.get_word_at_indices(nearest)

        # Neighbours according to the transformed space.
        query_vec = self.t_vspace['vec'][index].reshape(1, 300)
        all_vecs = self.t_vspace['vec']
        # scipy names the angular metric 'cosine'.
        metric = 'cosine' if metric == 'angular' else metric
        dists = pd.Series(distance.cdist(query_vec, all_vecs, metric)[0])
        t_indices = list(dists.nsmallest(k).keys())
        trans_results = self.wvspace.get_word_at_indices(t_indices)

        recall = len(set(actual_results) & set(trans_results)) / k

        return dict(vspace_results=actual_results,
                    T_vspace_results=trans_results,
                    recall=recall)
def __init__(self):
    super(TrainData, self).__init__()
    # In-memory word-vector space used for lookups.
    self.wvspace = WordVecSpaceMem(self.args.wvspace)
    # On-disk training array, sized to whatever is already on disk.
    existing = self.get_shape()
    self.train_f = DiskArray(self.args.train_file,
                             shape=(existing, ),
                             dtype=self.get_dtype())
    # Human-readable log of the word pairs written to the training set.
    self.words_f = open(self.args.words_file, 'w')
def __init__(self):
    super(MineTriplet, self).__init__()
    # Manually-curated cluster file the triplets are mined from.
    self.inp_cluster_f = DD(self.args.manual_cluster_f)
    # Word-vector space for vector lookups.
    self.vspace = WordVecSpaceMem(self.args.wvspace_f)
    # Output diskarray for hard-triplet batches; starts empty and grows.
    self.out_train_d = DA(self.args.hard_triplet_batch,
                          shape=(0, ),
                          dtype=self._get_dtype())
def k_nearest(wvspace, disk_f, words, metric, image_name):
    """Histogram transformed-space distances for same/not-same word pairs
    and print the effect size (Cohen's d) between the two groups.

    words: path to a file of JSON lines [word1, word2, label]; label 0 is
    treated as a 'same' pair, anything else as 'not same' — TODO confirm
    the label convention against the producer of this file.
    metric: 'cosine' for cosine distance, anything else means euclidean.
    image_name: path the histogram figure is saved to.
    """
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])
    vecs = da['vec']

    psame, pnsame = [], []
    # fixed: the file handle was opened and never closed.
    with open(words) as f:
        for line in f:
            pair = json.loads(line.strip())
            # fixed: this skip ran *after* the index lookups it should
            # guard, wasting work on pairs that get discarded anyway.
            if 'clinicaltrials' in pair[0] or 'clinicaltrials' in pair[1]:
                continue
            index1 = wv.get_word_index(pair[0])
            index2 = wv.get_word_index(pair[1])
            vec1 = vecs[index1].reshape(1, 300)
            vec2 = vecs[index2].reshape(1, 300)
            if metric == 'cosine':
                tvspace_dist = distance.cosine(vec1, vec2)
            else:
                tvspace_dist = distance.euclidean(vec1, vec2)
            # NOTE(review): the original also computed the actual-space
            # distance (wv.get_distance) here but never used it; dropped.
            if pair[2] == 0:
                psame.append(tvspace_dist)
            else:
                pnsame.append(tvspace_dist)

    # Cohen's d: separation of the means over the pooled spread.
    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))
    d = nm / dm
    print('the cohens D distance is', d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)
def test(inpf, model, outf):
    """Sample 1000 random vector triples and check whether the model's
    pairwise distances satisfy the triangle inequality; each triple is
    logged to `outf` and the number of violations is printed."""
    wv = WordVecSpaceMem(inpf)
    model = load_model(model, custom_objects=dict(
        _euclidean_distance=_euclidean_distance,
        _dist_output_shape=_dist_output_shape))

    violations = 0
    for _ in range(1000):
        i1, i2, i3 = random.sample(range(wv.nvecs), 3)
        v1 = _reshape(wv.get_word_vector(i1))
        v2 = _reshape(wv.get_word_vector(i2))
        v3 = _reshape(wv.get_word_vector(i3))

        d13 = model.predict([v1, v3])
        d12 = model.predict([v1, v2])
        d23 = model.predict([v2, v3])

        # d(1,3) <= d(1,2) + d(2,3) must hold for a true metric.
        holds = d13 <= (d12 + d23)
        outf.writerow([i1, i2, i3, d13, d12, d23, holds])
        if not holds:
            violations += 1
    print(violations)
def k_nearest(wvspace, disk_f, word):
    """Print the 10 nearest words in the original space, then the 10
    nearest (by euclidean distance) in the on-disk transformed space."""
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    # Neighbours in the original space.
    idx = wv.get_word_index(word)
    neighbours = wv.get_nearest(idx, k=10)
    print(wv.get_word_at_indices(neighbours))

    # Neighbours in the transformed space.
    query = da['vec'][idx].reshape(1, 300)
    dists = pd.Series(distance.cdist(query, da['vec'], 'euclidean')[0])
    nearest_idx = list(dists.nsmallest(10).keys())
    print('\n')
    print(wv.get_word_at_indices(nearest_idx))
import sys

import numpy as np                      # was missing: np is used for the dtype below
from keras import backend as K          # was missing: K is used in the loss below
from keras.models import load_model     # fixed: module is 'keras.models', not 'keras.Models'
from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    """Euclidean-distance loss the model was trained with; must be passed
    to load_model() so the saved model can be deserialized."""
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=0))


# Script arguments: <model file> <output diskarray> <wvspace file>
model = load_model(
    sys.argv[1],
    custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
out_f = DiskArray(sys.argv[2], dtype=[('vec', np.float32, 300)])
wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    """Run every word vector through the model and append the transformed
    vector to the output diskarray."""
    nvecs = len(wv.vectors)
    for num in range(nvecs):
        vec = wv.get_word_vector(num).reshape(1, 300)
        t_vec = model.predict(vec)
        out_f.append((t_vec, ))
class CorrectionalTraining(BaseScript):
    """Interactively collect user-labelled (token, nearest-token) pairs and
    append them, with their vectors, to an on-disk training array."""

    VEC_DIM = 300
    LABELS = [0, 1]        # only these labels are accepted from the user

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        """Prompt the user for the search token."""
        token = input("Enter the search token: ")
        return token

    def get_shape(self):
        """Number of records already in the train file (0 if absent)."""
        if not os.path.exists(self.args.train_f):
            return 0
        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        """Fetch the 10 nearest tokens for `token` from the k-nearest service."""
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')
        return result

    def get_user_label(self, token, nearest_token):
        """Ask the user to label the (token, nearest_token) pair.

        Raises ValueError if the input is not an integer.
        """
        name = token
        print('the nearest token is %s' % name)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))
        return int(label)

    def get_token_vector(self, token, nearest_token):
        """Look up the word vectors for both tokens."""
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)
        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        # NOTE(review): ('word1', 'S', VEC_DIM) declares an array of 300
        # zero-length byte strings, not a 300-byte string; 'S300' was
        # probably intended. Kept as-is to match any existing on-disk
        # layout — TODO confirm.
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S', self.VEC_DIM),
            ('word2', 'S', self.VEC_DIM),
            ('label', int),    # fixed: np.int was removed in NumPy 1.24
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = int(self.get_user_label(token, nearest_token))
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(
                        vec1, vec2, token, nearest_token, label)
        finally:
            # Always persist whatever was collected, even on Ctrl-C.
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
def __init__(self):
    super(CorrectionalTraining, self).__init__()
    # On-disk training array sized to the records already on disk.
    existing = self.get_shape()
    self.train_f = DiskArray(self.args.train_f,
                             shape=(existing, ),
                             dtype=self.get_dtype())
    # In-memory word-vector space for vector lookups.
    self.wv = WordVecSpaceMem(self.args.wvspace_f)
def __init__(self, actual_vspace, transformed_vspace):
    # Original (actual) space, loaded into memory.
    self.wvspace = WordVecSpaceMem(actual_vspace)
    # Transformed space: 300-dim float32 vectors on disk.
    t_dtype = [('vec', np.float32, 300)]
    self.t_vspace = DiskArray(transformed_vspace, dtype=t_dtype)
class TrainData(BaseScript):
    """Generate labelled (vec1, vec2, label) training pairs from a
    word-vector space and append them to an on-disk training array."""

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        # NOTE(review): add_pair() calls self.model.predict, but the model
        # load below was commented out — add_pair raises AttributeError
        # until this is restored. TODO confirm intent.
        #self.model = load_model(self.args.model)

    def get_shape(self):
        """Number of records already in the train file (0 if absent)."""
        # fixed: the exists check read self.args.train_f, but only
        # 'train_file' is defined in define_args(), so it always raised.
        if not os.path.exists(self.args.train_file):
            return 0
        dtype = self.get_dtype()
        return os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('label', int),    # fixed: np.int was removed in NumPy 1.24
        ]

    def get_random_point(self):
        """Random valid index into the vector space."""
        # fixed: randint's upper bound is inclusive, so len(...) itself
        # could be returned — an out-of-range index.
        return random.randint(0, len(self.wvspace) - 1)

    def near_pair(self):
        """Pair a random word with its nearest neighbour and record it."""
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        # presumably n_words[0] is word1 itself, so [1] is the closest
        # *other* word — TODO confirm against get_nearest's contract.
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        """Score the pair with the model, binarize the score, and append
        the record to the train file and the words log."""
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        # fixed: the file object was being *called* (self.words_f(...)),
        # which raises TypeError; write the record, one per line.
        self.words_f.write(
            word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        """Pair two independently random words (likely far apart)."""
        # fixed: both lookups used the undefined name 'index' instead of
        # index1/index2, so word1 == word2 (or a NameError).
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        for i in range(self.args.n_samples):
            # fixed: near_pair() returns None, so the old unpacking
            # 'word1, word2 = self.near_pair()' always raised TypeError.
            self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        # fixed: parsed as str before, which breaks range() in run().
        parser.add_argument('n_samples', metavar='num-of-pairs', type=int)