Code Example #1
    def embedding_test_master(self,
                              input_file,
                              embedding_file,
                              block_size=10000):
        """
        the master of mult-Theading for test by embedding model
        """
        version = begin_time()
        self.word2vec = load_bigger(embedding_file)
        self.origin_sample = load_bigger(input_file)
        threadings = queue.Queue()
        waitthreadings = queue.Queue()
        num = len(self.origin_sample)
        start = 0
        end = min(block_size, num - 1)
        # split the samples into block_size-sized chunks, one agent
        # thread per chunk
        for block in range(num // block_size + 1):
            work = threading.Thread(target=self.embedding_test_agent,
                                    args=(start, end, block))
            threadings.put(work)
            start = end + 1
            end = min(num - 1, block_size * (block + 2))
        # start every worker, then join them all
        while not threadings.empty():
            tempwork = threadings.get()
            tempwork.start()
            waitthreadings.put(tempwork)
        while not waitthreadings.empty():
            waitthreadings.get().join()

        # merge per-block score lists in block order
        result = [self.wordresult[k] for k in sorted(self.wordresult.keys())]
        results = sum(result, [])
        totalnum = len(results)
        correctnum = 0
        top3num = 0
        block_sizes = 10  # candidate responses scored per query
        for index in range(totalnum // block_sizes):
            pre = results[index * block_sizes:(index + 1) * block_sizes]
            temp_index = np.array(pre).argmax()
            top3 = np.array(pre).argsort()[-3:][::-1]
            if temp_index == 0:  # the ground-truth candidate sits at index 0
                correctnum += 1
            if 0 in top3:
                top3num += 1
        # report top-1/top-3 hit counts and accuracies (ratios truncated
        # to five characters for display)
        print(correctnum, top3num, totalnum // block_sizes,
              spend_time(version),
              str(correctnum / (totalnum // block_sizes))[:5],
              str(top3num / (totalnum // block_sizes))[:5])
        end_time(version)
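
A minimal driver sketch for the method above. The enclosing class name (EmbeddingTest) and the file paths are invented for illustration; begin_time, end_time, spend_time, and load_bigger are assumed to come from the project's utils, and embedding_test_agent is assumed to fill self.wordresult with per-block score lists:

# hypothetical driver; class name and paths are illustrative only
tester = EmbeddingTest()
tester.embedding_test_master(input_file='SMN/data/test_samples.pkl',
                             embedding_file='SMN/data/word2vec.pkl',
                             block_size=10000)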
Code Example #2
File: SMN_Last.py Project: iofu728/Model_retrieval
def test_model(dataset_file='SMN/data/datasets_test11.pkl',
               pre_file='SMN/data/smn_test11.pkl',
               model_name='SMN/data/model.bin',
               result_file='SMN/data/result_test11.txt'):
    """
    test model return accuracy
    """
    version = begin_time()
    datasets = load_bigger(dataset_file)
    pre = pickle.load(open(pre_file, "rb"))
    wordvecs = pre[1]  # second element of the pickle holds the embedding wrapper (.W)
    predict(datasets,
            wordvecs.W,
            batch_size=200,
            max_l=50,
            hidden_size=200,
            word_embedding_size=200,
            model_name=model_name,
            result_file=result_file)
    sampleConduct = SampleConduct()
    end_time(version)
    return sampleConduct.calculate_test(result_file)
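
Called with its defaults, the function reads the pickled test set, scores it with the trained SMN model, writes per-candidate scores to result_file, and returns the accuracy computed by SampleConduct.calculate_test. A minimal sketch:

# uses the default paths from the signature above
accuracy = test_model()
print('SMN test accuracy:', accuracy)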
Code Example #3
    def word2ids(self,
                 input_file,
                 embedding_file,
                 output1_file='SMN/data/weibo/word2id.pkl',
                 output2_file='SMN/data/weibo/word_embedding.pkl',
                 output3_file='SMN/data/weibo/word2id'):
        """
        word 2 id
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            origin_sample = f.readlines()
        word_embedding = load_bigger(embedding_file)
        words = []
        word_map = {}
        embedding_lists = []

        # reserve ids 0 and 1 for the OOV and EOS tokens, each with a
        # 200-dim zero vector
        word_map['_OOV_'] = 0
        word_map['_EOS_'] = 1
        embedding_lists.append([0] * 200)
        embedding_lists.append([0] * 200)
        for index in origin_sample:
            if index == '\r\n':  # skip blank lines
                continue
            words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
            # words.update(set(index.replace('\r\n', '').split()))
        words = Counter(words)
        # keep only words seen more than twice
        words = [index for index in words if words[index] > 2]
        word2id = ['_OOV_ 0', '_EOS_ 1']

        print('Step 2: Begin')
        index_num = 2
        # keep only words covered by the pretrained embedding
        for idx, index in enumerate(words):
            if index in word_embedding:
                if index not in word_map:
                    word_map[index] = index_num
                    index_num += 1
                    word2id.append(index + ' ' + str(word_map[index]))
                    embedding_lists.append(
                        list(word_embedding[index].astype('float16')))
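            # The commented-out block below is an earlier fallback that
            # matched 3/2/1-character prefixes against the embedding
            # vocabulary; Example #4 replaces it with an n-gram average.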
            # elif index[:3] in word_embedding:
            #     if index[:3] not in word_map:
            #         word_map[index[:3]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:3] + ' ' + str(word_map[index[:3]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:3]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:3]]
            #         word2id.append(index + ' ' + str(word_map[index]))
            # elif index[:2] in word_embedding:
            #     if index[:2] not in word_map:
            #         word_map[index[:2]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:2] + ' ' + str(word_map[index[:2]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:2]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:2]]
            #         word2id.append(index + ' ' + str(word_map[index]))
            # elif index[:1] in word_embedding:
            #     if index[:1] not in word_map:
            #         word_map[index[:1]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:1] + ' ' + str(word_map[index[:1]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:1]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:1]]
            #         word2id.append(index + ' ' + str(word_map[index]))
        print(index_num)  # final vocabulary size
        with open(output3_file, 'w') as f:
            f.write(list2str(word2id))
        print('Step 2: Over')

        # return embedding_lists, word_map
        pickle.dump(embedding_lists, open(output2_file, "wb"))
        pickle.dump(word_map, open(output1_file, "wb"))
        end_time(version)
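
A usage sketch for this variant. It assumes word2ids lives on the project's SampleConduct class (as in Example #2) and that embedding_file unpickles, via load_bigger, into a mapping that supports `word in embedding` and `embedding[word].astype(...)` with 200-dimensional vectors; the input paths are otherwise illustrative:

# hypothetical caller; input paths are illustrative only
conduct = SampleConduct()
conduct.word2ids(input_file='SMN/data/weibo/corpus.txt',
                 embedding_file='SMN/data/weibo/word2vec.pkl')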
Code Example #4
    def word2ids(self,
                 input_file,
                 embedding_file,
                 output1_file='SMN/data/weibo/word2id.pkl',
                 output2_file='SMN/data/weibo/word_embedding.pkl',
                 output3_file='SMN/data/weibo/word2id',
                 min_n=1,
                 max_n=3):
        """
        word 2 id
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            origin_sample = f.readlines()
        word_embedding = load_bigger(embedding_file)
        words = []
        word_map = {}
        embedding_lists = []

        # reserve ids 0 and 1 for the OOV and EOS tokens; note the zero
        # vectors are hard-coded to 200 dims and should match word_size below
        word_map['_OOV_'] = 0
        word_map['_EOS_'] = 1
        embedding_lists.append([0] * 200)
        embedding_lists.append([0] * 200)
        for index in origin_sample:
            if index == '\r\n':  # skip blank lines
                continue
            words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
            # words.update(set(index.replace('\r\n', '').split()))
        words = list(Counter(words))  # unique words, no frequency cutoff here
        word2id = ['_OOV_ 0', '_EOS_ 1']
        word_size = word_embedding.wv.syn0[0].shape[0]  # embedding dim (gensim < 4 API)

        print('Step 2: Begin')
        index_num = 2
        for idx, index in enumerate(words):
            if index in word_map:
                continue
            if index in word_embedding.wv.vocab:
                # in-vocabulary word: copy its pretrained vector
                word_map[index] = index_num
                index_num += 1
                word2id.append(index + ' ' + str(word_map[index]))
                embedding_lists.append(word_embedding[index].astype('float32'))
            else:
                # OOV word: average the vectors of its character n-grams,
                # preferring n-grams longer than a single character
                ngrams = compute_ngrams(index, min_n=min_n, max_n=max_n)
                word_vec = np.zeros(word_size, dtype=np.float32)
                ngrams_found = 0
                ngrams_single = [ng for ng in ngrams if len(ng) == 1]
                ngrams_more = [ng for ng in ngrams if len(ng) > 1]
                for ngram in ngrams_more:
                    if ngram in word_embedding.wv.vocab:
                        word_vec += word_embedding[ngram]
                        ngrams_found += 1
                if ngrams_found == 0:
                    # no multi-character n-gram matched: fall back to
                    # single characters
                    for ngram in ngrams_single:
                        if ngram in word_embedding.wv.vocab:
                            word_vec += word_embedding[ngram]
                            ngrams_found += 1
                if word_vec.any():
                    word_vec /= max(1, ngrams_found)
                    word_map[index] = index_num
                    index_num += 1
                    word2id.append(index + ' ' + str(word_map[index]))
                    embedding_lists.append(word_vec)
        print(index_num)  # final vocabulary size
        with open(output3_file, 'w') as f:
            f.write(list2str(word2id))
        print('Step 2: Over')

        # return embedding_lists, word_map
        pickle.dump(embedding_lists, open(output2_file, "wb"))
        pickle.dump(word_map, open(output1_file, "wb"))
        end_time(version)
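
Example #4 depends on a compute_ngrams helper that is not shown. A minimal sketch of the behavior the loop appears to expect, assuming a plain enumeration of character n-grams with no FastText-style boundary markers (the project's actual helper may differ):

def compute_ngrams(word, min_n=1, max_n=3):
    # sketch only: every character n-gram of length min_n..max_n, deduplicated
    ngrams = set()
    for n in range(min_n, min(max_n, len(word)) + 1):
        for i in range(len(word) - n + 1):
            ngrams.add(word[i:i + n])
    return list(ngrams)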