コード例 #1
0
class KNearestService(object):
    """Serve k-nearest-neighbour lookups over two embedding spaces.

    Compares neighbours found in the original word-vector space against a
    brute-force scan of a transformed vector space, and reports the overlap
    (recall) between the two result sets.
    """

    def __init__(self, actual_vspace, transformed_vspace):
        # Original embeddings, held fully in memory.
        self.wvspace = WordVecSpaceMem(actual_vspace)
        # Transformed embeddings stored on disk as 300-dim float32 records.
        self.t_vspace = DiskArray(transformed_vspace,
                                  dtype=[('vec', np.float32, 300)])

    def k_nearest(self,
                  word: str,
                  k: int = 10,
                  metric: str = 'angular') -> dict:
        """Return the k nearest words to *word* in both spaces plus recall."""
        word_index = self.wvspace.get_word_index(word)

        # Neighbours according to the original vector space.
        nearest_indices = self.wvspace.get_nearest(word_index, k, metric=metric)
        actual_results = self.wvspace.get_word_at_indices(nearest_indices)

        # Brute-force neighbour search in the transformed space.
        query_vec = self.t_vspace['vec'][word_index].reshape(1, 300)
        all_vecs = self.t_vspace['vec']

        # scipy's cdist calls the angular metric 'cosine'.
        metric = 'cosine' if metric == 'angular' else metric

        pair_dists = pd.Series(distance.cdist(query_vec, all_vecs, metric)[0])
        nearest_keys = pair_dists.nsmallest(k).keys()
        trans_results = self.wvspace.get_word_at_indices(list(nearest_keys))

        # Fraction of original-space neighbours recovered in the
        # transformed space.
        recall = len(set(actual_results) & set(trans_results)) / k

        return dict(vspace_results=actual_results,
                    T_vspace_results=trans_results,
                    recall=recall)
コード例 #2
0
 def __init__(self):
     # Run the BaseScript setup (argument parsing etc.) first so that
     # self.args is available below.
     super(TrainData, self).__init__()

     # In-memory word-vector space loaded from the given path.
     self.wvspace = WordVecSpaceMem(self.args.wvspace)

     # Disk-backed training array; layout and current size come from the
     # helper methods on this class.
     train_dtype = self.get_dtype()
     self.train_f = DiskArray(self.args.train_file,
                              shape=(self.get_shape(), ),
                              dtype=train_dtype)

     # Plain-text log of the word pairs written alongside the vectors.
     self.words_f = open(self.args.words_file, 'w')
コード例 #3
0
    def __init__(self):
        # BaseScript setup first so self.args is populated.
        super(MineTriplet, self).__init__()

        # Manually-curated clusters used as triplet supervision.
        self.inp_cluster_f = DD(self.args.manual_cluster_f)

        # Word-vector space the triplets are mined from.
        self.vspace = WordVecSpaceMem(self.args.wvspace_f)

        # Output disk array for the mined hard-triplet batches; starts
        # empty and grows as batches are appended.
        triplet_dtype = self._get_dtype()
        self.out_train_d = DA(self.args.hard_triplet_batch,
                              shape=(0, ),
                              dtype=triplet_dtype)
コード例 #4
0
def k_nearest(wvspace, disk_f, words, metric, image_name):
    """Measure how well the transformed space separates labelled word pairs.

    Reads JSON lines of ``[word1, word2, label]`` from the *words* file,
    computes each pair's distance in the transformed space (*disk_f*),
    prints Cohen's d between the label==0 ("same") and other pairs, and
    saves a histogram of both distance distributions to *image_name*.

    Fixes over the original: the input file is closed via a context
    manager (it was leaked), the loop variable no longer shadows the
    `words` parameter, and the ragged indentation in the cosine branch
    is normalized.
    """
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    vecs = da['vec']

    psame = []   # transformed-space distances for label == 0 pairs
    pnsame = []  # transformed-space distances for every other label

    with open(words) as pair_file:
        for line in pair_file:
            pair = json.loads(line.strip())
            index1 = wv.get_word_index(pair[0])
            index2 = wv.get_word_index(pair[1])

            # Skip clinicaltrials tokens entirely.
            if 'clinicaltrials' in pair[0] or 'clinicaltrials' in pair[1]:
                continue

            vec1 = vecs[index1].reshape(1, 300)
            vec2 = vecs[index2].reshape(1, 300)

            if metric == 'cosine':
                # NOTE(review): vspace_dist is computed but never used;
                # kept in case wv.get_distance matters — TODO confirm.
                vspace_dist = wv.get_distance(pair[0], pair[1])
                tvspace_dist = distance.cosine(vec1, vec2)
            else:
                vspace_dist = wv.get_distance(pair[0], pair[1], metric='euclidean')
                tvspace_dist = distance.euclidean(vec1, vec2)

            if pair[2] == 0:
                psame.append(tvspace_dist)
            else:
                pnsame.append(tvspace_dist)

    # Effect size (Cohen's d): mean separation over pooled spread.
    # NOTE(review): if either group is empty np.std/np.mean yield nan —
    # assumes the input file contains both labels.
    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))

    d = nm / dm
    print('the cohens D distance is', d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)
コード例 #5
0
def test(inpf, model, outf):
    """Check the triangle inequality on model-predicted distances.

    Samples 1000 random index triples, predicts the three pairwise
    distances with the loaded model, writes each triple and its result to
    *outf* (a csv writer), and prints how many triples violate
    d(1,3) <= d(1,2) + d(2,3).
    """
    wv = WordVecSpaceMem(inpf)
    model = load_model(model,
                       custom_objects=dict(
                           _euclidean_distance=_euclidean_distance,
                           _dist_output_shape=_dist_output_shape))

    inequality_count = 0

    for _ in range(1000):
        # Three distinct random vector indices.
        i, j, k = random.sample(range(wv.nvecs), 3)

        v1 = _reshape(wv.get_word_vector(i))
        v2 = _reshape(wv.get_word_vector(j))
        v3 = _reshape(wv.get_word_vector(k))

        # Pairwise model distances (same evaluation order as before).
        dist_v13 = model.predict([v1, v3])
        dist_v12 = model.predict([v1, v2])
        dist_v23 = model.predict([v2, v3])

        # True when the triangle inequality holds for this triple.
        is_inequality = dist_v13 <= (dist_v12 + dist_v23)
        outf.writerow([i, j, k, dist_v13, dist_v12, dist_v23, is_inequality])
        if not is_inequality:
            inequality_count += 1

    # Number of violating triples.
    print(inequality_count)
コード例 #6
0
def k_nearest(wvspace, disk_f, word):
    """Print the 10 nearest words to *word* in both embedding spaces.

    First prints neighbours from the original vector space, then
    neighbours found by a brute-force euclidean scan of the transformed
    vectors stored in *disk_f*.
    """
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    word_index = wv.get_word_index(word)

    # Neighbours according to the original vector space.
    neighbour_indices = wv.get_nearest(word_index, k=10)
    print(wv.get_word_at_indices(neighbour_indices))

    # Brute-force euclidean scan of the transformed space.
    query = da['vec'][word_index].reshape(1, 300)
    all_vecs = da['vec']
    dists = pd.Series(distance.cdist(query, all_vecs, 'euclidean')[0])

    nearest_keys = list(dists.nsmallest(10).keys())
    print('\n')
    print(wv.get_word_at_indices(nearest_keys))
コード例 #7
0
import sys

import numpy as np
from keras import backend as K
from keras.models import load_model

from wordvecspace import WordVecSpaceMem
from diskarray import DiskArray


def _euclidean_dis_loss(y_true, y_pred):
    """Keras loss: euclidean distance between prediction and target along axis 0."""
    squared_diff = K.square(y_pred - y_true)
    return K.sqrt(K.sum(squared_diff, axis=0))


# Load the trained transformation model from argv[1]; the custom loss
# must be registered so Keras can deserialize it.
model = load_model(
    sys.argv[1], custom_objects=dict(_euclidean_dis_loss=_euclidean_dis_loss))
# Disk-backed output array (argv[2]) for the transformed 300-dim vectors.
out_f = DiskArray(sys.argv[2], dtype=[('vec', np.float32, 300)])

# Source word-vector space (argv[3]) whose vectors will be transformed.
wv = WordVecSpaceMem(sys.argv[3])


def get_tras_vectors():
    """Transform every vector in `wv` with `model` and append to `out_f`.

    (Name kept as-is — "tras" — since external callers may rely on it.)
    """
    total = len(wv.vectors)
    for i in range(total):
        original = wv.get_word_vector(i).reshape(1, 300)
        transformed = model.predict(original)
        out_f.append((transformed, ))
コード例 #8
0
class CorrectionalTraining(BaseScript):
    """Interactive collection of corrective training labels.

    Fetches nearest-neighbour candidates for a user-supplied token from a
    remote HTTP service, asks the user to label each (token, neighbour)
    pair, and appends the labelled vector pairs to a disk-backed training
    array.
    """

    VEC_DIM = 300
    LABELS = [0, 1]  # only these labels are persisted

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        # Disk-backed array of labelled training records.
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        # In-memory word-vector space used to look up token vectors.
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        """Prompt for and return the token to search for."""
        token = input("Enter the search token: ")

        return token

    def get_shape(self):
        """Return the number of records already in the training file (0 if absent)."""
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        """Query the nearest-neighbour HTTP service; return its 'result' list."""
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        # Alternative endpoint (NN-model based ranking):
        #url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_nn_model_k_nearest?word={}&k=10'.format(token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')

        return result

    def get_user_label(self, token, nearest_token):
        """Ask the user to label the (token, nearest_token) pair; return the int label."""
        # NOTE(review): prints the *search* token, not the neighbour —
        # looks deliberate given the commented-out history, but confirm.
        name = token
        print('the nearest token is %s' % name)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))

        return int(label)

    def get_token_vector(self, token, nearest_token):
        """Return the stored vectors for both tokens."""
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)

        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        """Append one labelled record to the training DiskArray."""
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        """Record layout of one training sample."""
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            # NOTE(review): ('S', VEC_DIM) declares 300 zero-length byte
            # strings — 'S300' (one 300-byte string) may have been the
            # intent; left unchanged to keep the on-disk layout stable.
            ('word1', 'S', self.VEC_DIM),
            ('word2', 'S', self.VEC_DIM),
            # `int` replaces the deprecated alias `np.int`, which was
            # removed in NumPy 1.24; the resulting dtype is identical.
            ('label', int),
        ]

    def run(self):
        """Main loop: collect labels until interrupted, then flush the array."""
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    # get_user_label already returns an int; the original
                    # redundant int() wrapper has been dropped.
                    label = self.get_user_label(token, nearest_token)
                    if label not in self.LABELS:
                        continue  # ignore labels outside the accepted set
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            # Ensure buffered records reach disk even on Ctrl-C.
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
コード例 #9
0
 def __init__(self):
     # BaseScript setup (argument parsing) before reading self.args.
     super(CorrectionalTraining, self).__init__()

     # Disk-backed array holding the labelled training records; the
     # current size and layout come from the class's helper methods.
     record_dtype = self.get_dtype()
     self.train_f = DiskArray(self.args.train_f,
                              shape=(self.get_shape(), ),
                              dtype=record_dtype)

     # In-memory word-vector space used to look up token vectors.
     self.wv = WordVecSpaceMem(self.args.wvspace_f)
コード例 #10
0
    def __init__(self, actual_vspace, transformed_vspace):
        # Original embeddings, loaded fully into memory.
        self.wvspace = WordVecSpaceMem(actual_vspace)

        # Transformed embeddings stored on disk as 300-dim float32 records.
        self.t_vspace = DiskArray(
            transformed_vspace, dtype=[('vec', np.float32, 300)])
コード例 #11
0
class TrainData(BaseScript):
    """Generate labelled training pairs from a word-vector space.

    Near pairs are drawn via nearest-neighbour lookups, labelled by a
    model prediction, and appended to a disk-backed training array plus a
    human-readable words file.
    """

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        # NOTE(review): add_pair() uses self.model; this load must be
        # restored or add_pair will raise AttributeError.
        #self.model = load_model(self.args.model)

    def get_shape(self):
        """Return the number of records already in the training file (0 if absent)."""
        # Fixed: the original checked self.args.train_f, which is not a
        # registered argument (define_args registers 'train_file').
        if not os.path.exists(self.args.train_file):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize
        return shape

    def get_dtype(self):
        """Record layout: two float32 vectors plus an integer label."""
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            # `int` replaces the deprecated alias `np.int`, removed in
            # NumPy 1.24; the resulting dtype is identical.
            ('label', int),
        ]

    def get_random_point(self):
        """Return a random valid index into the vector space."""
        # randrange excludes the upper bound, fixing the off-by-one of
        # randint(0, len(...)) which could yield an out-of-range index.
        return random.randrange(len(self.wvspace))

    def near_pair(self):
        """Sample a word and its closest distinct neighbour; record the pair.

        Returns the (word1, word2) tuple so callers can unpack it.
        """
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        word2 = n_words[1]  # index 0 is the query word itself
        self.add_pair(word1, word2)
        # Fixed: the original returned None, breaking the tuple
        # unpacking in run().
        return word1, word2

    def add_pair(self, word1, word2):
        """Label a word pair with the model and append it to the outputs."""
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        # (The unused `diff_vec = abs(vec1 - vec2)` local was removed.)
        # NOTE(review): self.model is never assigned in __init__ (the
        # load is commented out) — restore it before running this path.
        p_value = self.model.predict(vec1, vec2)
        p_value = 0 if p_value < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        # Fixed: the original called the file object directly
        # (self.words_f(...)); files are written with .write().
        self.words_f.write(word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        """Sample two independent random words and record them as a pair."""
        index1 = self.get_random_point()
        # Fixed: the original read the undefined name `index` for both
        # lookups instead of index1/index2.
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        """Generate n_samples near pairs."""
        for i in range(self.args.n_samples):
            word1, word2 = self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        # type=int so range(self.args.n_samples) in run() works; argparse
        # delivers strings by default.
        parser.add_argument('n_samples', type=int, metavar='num-of-pairs')