Ejemplo n.º 1
0
def test(inpf, model, outf):
    wv = WordVecSpaceMem(inpf)
    model = load_model(model,
                       custom_objects=dict(
                           _euclidean_distance=_euclidean_distance,
                           _dist_output_shape=_dist_output_shape))

    inequality_count = 0

    for i in range(1000):
        index1, index2, index3 = random.sample(range(wv.nvecs), 3)

        vec1 = wv.get_word_vector(index1)
        vec2 = wv.get_word_vector(index2)
        vec3 = wv.get_word_vector(index3)

        vec1 = _reshape(vec1)
        vec2 = _reshape(vec2)
        vec3 = _reshape(vec3)
        dist_v13 = model.predict([vec1, vec3])
        dist_v12 = model.predict([vec1, vec2])
        dist_v23 = model.predict([vec2, vec3])
        '''
        diff_vec12 = get_diff_vec(vec1, vec2)
        diff_vec13 = get_diff_vec(vec1, vec3)
        diff_vec23 = get_diff_vec(vec2, vec3)

        dist_v13 = 1 - model.predict(diff_vec13)[0][0]
        dist_v12 = 1 - model.predict(diff_vec12)[0][0]
        dist_v23 = 1 - model.predict(diff_vec23)[0][0]
        '''

        is_inequality = dist_v13 <= (dist_v12 + dist_v23)
        outf.writerow([
            index1, index2, index3, dist_v13, dist_v12, dist_v23, is_inequality
        ])
        if not is_inequality:
            inequality_count += 1

    print(inequality_count)
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        token = input("Enter the search token: ")

        return token

    def get_shape(self):
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        #url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_nn_model_k_nearest?word={}&k=10'.format(token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')

        return result

    def get_user_label(self, token, nearest_token):
        #name = nearest_token.get('name', '')
        #nearest_token = nearest_token.get('word2', '')
        name = token
        '''
        if not name:
            name = nearest_token
        '''
        print('the nearest token is %s' % name)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))

        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)

        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('word1', 'S', self.VEC_DIM),
            ('word2', 'S', self.VEC_DIM),
            ('label', np.int),
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = int(self.get_user_label(token, nearest_token))
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
Ejemplo n.º 3
0
class TrainData(BaseScript):

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        #self.model = load_model(self.args.model)

    def get_shape(self):
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize
        return shape

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('label', np.int),
        ]

    def get_random_point(self):
        return random.randint(0, len(self.wvspace))

    def near_pair(self):
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        diff_vec = abs(vec1 - vec2)
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        self.words_f(word1 + '<====>' + word2 + '<======>' + str(p_value))

    def far_pair(self):
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index)
        self.add_pair(word1, word2)

    def run(self):
        for i in range(self.args.n_samples):
            word1, word2 = self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        parser.add_argument('n_samples', metavar='num-of-pairs')