def test(inpf, model, outf):
    """Spot-check the triangle inequality on model-predicted distances.

    Samples 1000 random triples of word vectors from the space at *inpf*,
    predicts pairwise distances with the Keras model at *model*, and writes
    one row per triple to the csv writer *outf*:
    [index1, index2, index3, d(1,3), d(1,2), d(2,3), holds].
    Finally prints the number of triples that VIOLATE the inequality
    d(1,3) <= d(1,2) + d(2,3).

    NOTE(review): the original collapsed source is ambiguous about whether
    the count was printed per-violation or once at the end; printed once
    here as a summary — confirm against the intended behavior.
    """
    wv = WordVecSpaceMem(inpf)
    model = load_model(model, custom_objects=dict(
        _euclidean_distance=_euclidean_distance,
        _dist_output_shape=_dist_output_shape))

    violation_count = 0
    for _ in range(1000):
        index1, index2, index3 = random.sample(range(wv.nvecs), 3)

        vec1 = _reshape(wv.get_word_vector(index1))
        vec2 = _reshape(wv.get_word_vector(index2))
        vec3 = _reshape(wv.get_word_vector(index3))

        dist_v13 = model.predict([vec1, vec3])
        dist_v12 = model.predict([vec1, vec2])
        dist_v23 = model.predict([vec2, vec3])

        # Triangle inequality: the direct distance must not exceed the
        # sum of the two legs.
        holds = dist_v13 <= (dist_v12 + dist_v23)

        outf.writerow([
            index1, index2, index3,
            dist_v13, dist_v12, dist_v23,
            holds,
        ])
        if not holds:
            violation_count += 1

    print(violation_count)
class CorrectionalTraining(BaseScript):
    """Interactively collect human labels for word pairs into a DiskArray.

    For each token the user enters, fetches its 10 nearest neighbours from
    the remote word-vector service and asks the user to label the distance
    for each (token, neighbour) pair.  Labelled pairs (vectors + words +
    label) are appended to the on-disk training array.
    """

    VEC_DIM = 300
    LABELS = [0, 1]  # only these labels are persisted; anything else is skipped

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(
            self.args.train_f,
            shape=(self.get_shape(),),
            dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        """Prompt the user for the token to search neighbours of."""
        return input("Enter the search token: ")

    def get_shape(self):
        """Number of records already present in the train file (0 if absent)."""
        if not os.path.exists(self.args.train_f):
            return 0
        dtype = self.get_dtype()
        return os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize

    def get_nearest_token(self, token):
        """Fetch the 10 nearest neighbours of *token* from the remote service.

        Returns the service's 'result' payload (a list of neighbours), or
        None if the response carries no 'result' key.
        """
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        response = requests.get(url)
        response = response.json()
        return response.get('result')

    def get_user_label(self, token, nearest_token):
        """Ask the user to label the distance between the two tokens.

        Raises ValueError if the user enters a non-integer.
        """
        print('the nearest token is %s' % token)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))
        return int(label)

    def get_token_vector(self, token, nearest_token):
        """Return the vectors for *token* and *nearest_token*."""
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)
        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        """Persist one labelled pair as a single record."""
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        """Structured dtype of one training record.

        NOTE: the word fields use a fixed-width byte string ('S300').  The
        previous form ('word1', 'S', VEC_DIM) made numpy build a subarray
        of 300 zero-length strings, so the words were silently dropped.
        This widens the record, so files written with the old dtype are
        not layout-compatible.
        """
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S{}'.format(self.VEC_DIM)),
            ('word2', 'S{}'.format(self.VEC_DIM)),
            # np.int was removed in numpy >= 1.24; it was an alias for the
            # builtin int, so this preserves the stored dtype exactly.
            ('label', int),
        ]

    def run(self):
        """Main label-collection loop; flushes the DiskArray on exit."""
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    # get_user_label already returns an int.
                    label = self.get_user_label(token, nearest_token)
                    if label not in self.LABELS:
                        continue  # skip out-of-range labels instead of storing garbage
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(
                        vec1, vec2, token, nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
class TrainData(BaseScript):
    """Generate labelled word-pair training records from a word-vector space.

    Samples pairs of words (currently near pairs: a random word and its
    closest neighbour), labels them via a distance model, and appends
    (vec1, vec2, label) records to a DiskArray while logging the word
    pairs to a text file.
    """

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(
            self.args.train_file,
            shape=(self.get_shape(),),
            dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        # NOTE(review): add_pair() uses self.model, but the model load is
        # disabled here and no 'model' argument is defined — running this
        # as-is will raise AttributeError in add_pair.  Confirm whether the
        # model should be re-enabled or add_pair changed.
        #self.model = load_model(self.args.model)

    def get_shape(self):
        """Number of records already present in the train file (0 if absent)."""
        # Fixed: the argument is named 'train_file'; the old code read the
        # non-existent self.args.train_f and raised AttributeError.
        if not os.path.exists(self.args.train_file):
            return 0
        dtype = self.get_dtype()
        return os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize

    def get_dtype(self):
        """Structured dtype of one training record."""
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            # np.int was removed in numpy >= 1.24; it was an alias for the
            # builtin int, so this preserves the stored dtype exactly.
            ('label', int),
        ]

    def get_random_point(self):
        """Return a valid random index into the vector space.

        Uses randrange (exclusive upper bound): randint(0, len) could
        return len itself, an out-of-range index.
        """
        return random.randrange(len(self.wvspace))

    def near_pair(self):
        """Sample a random word and pair it with its nearest neighbour."""
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        # n_words[0] is the word itself; [1] is the closest other word.
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        """Label the pair with the model and persist it.

        Distances below 3 are labelled 0 (near), otherwise 1 (far).
        """
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        # Fixed: the file object was being *called* (self.words_f(...));
        # write the pair, newline-terminated, instead.
        self.words_f.write(
            word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        """Sample two independent random words and pair them."""
        # Fixed: the old code referenced an undefined name 'index' for
        # both lookups instead of index1/index2.
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        # Fixed: near_pair() returns None; the old unpacking
        # 'word1, word2 = self.near_pair()' raised TypeError.
        for _ in range(self.args.n_samples):
            self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        # Fixed: parsed as int so range(n_samples) works; argparse
        # positionals default to str.
        parser.add_argument('n_samples', metavar='num-of-pairs', type=int)