Exemple #1
0
def make_data(revs,
              word_idx_map,
              max_l=50,
              filter_h=3,
              val_test_splits=(2, 3),
              validation_num=500000):
    """
    Transform sentences into 2-d index matrices split into val / train.

    Every entry of ``revs`` is read through its ``"m"``, ``"r"`` and ``"y"``
    keys. One output row is the concatenation of
    ``get_idx_from_sent_msg(rev["m"], ...)``,
    ``get_idx_from_sent(rev["r"], ...)`` and ``get_session_mask(rev["m"])``,
    with ``int(rev["y"])`` appended as the last column.

    The first ``validation_num`` rows go to the validation set; every later
    row goes to the training set. No test rows are produced here.

    Args:
        revs: iterable of mappings providing the keys "m", "r" and "y".
        word_idx_map: forwarded unchanged to the index helpers.
        max_l: forwarded unchanged to the index helpers.
        filter_h: unused; kept so existing callers keep working.
        val_test_splits: unused; kept so existing callers keep working.
            The default is an immutable tuple so it cannot be shared and
            mutated across calls.
        validation_num: number of leading rows placed in the validation set.

    Returns:
        ``[train, val, test]`` as integer numpy arrays; ``test`` is always
        empty.
    """
    version = begin_time()
    train, val = [], []
    for rev in revs:
        sent = get_idx_from_sent_msg(rev["m"], word_idx_map, max_l, True)
        sent += get_idx_from_sent(rev["r"], word_idx_map, max_l, True)
        sent += get_session_mask(rev["m"])
        sent.append(int(rev["y"]))
        # Fill the validation set first; everything after that is training.
        if len(val) < validation_num:
            val.append(sent)
        else:
            train.append(sent)

    train = np.array(train, dtype="int")
    val = np.array(val, dtype="int")
    # Nothing is ever assigned to the test split in this function.
    test = np.array([], dtype="int")
    print('trainning data', len(train), 'val data', len(val), 'spend time:',
          spend_time(version))
    return [train, val, test]
    def embedding_test_master(self,
                              input_file,
                              embedding_file,
                              block_size=10000):
        """
        Master of the multi-threaded test for the embedding model.

        Loads ``embedding_file`` into ``self.word2vec`` and ``input_file``
        into ``self.origin_sample`` (both through ``load_bigger``), then
        creates one thread per block of sample indices. Each thread runs
        ``self.embedding_test_agent(start, end, block)``. All threads are
        created first, then all are started, then all are joined.

        After the workers finish, the values of ``self.wordresult`` are
        concatenated in sorted key order (presumably the workers fill it,
        keyed by block number -- confirm against ``embedding_test_agent``).
        The flat result list is cut into consecutive groups of 10 scores;
        a group counts as "correct" when its largest score is at position 0
        and as "top 3" when position 0 is among its three largest scores.

        Prints: correct count, top-3 count, number of groups, elapsed time,
        and both ratios truncated to five characters. With no complete
        group the ratios are reported as 0.0 instead of dividing by zero.

        Args:
            input_file: path handed to ``load_bigger`` for the samples.
            embedding_file: path handed to ``load_bigger`` for the embedding.
            block_size: number of sample indices handed to each thread.
        """
        version = begin_time()
        self.word2vec = load_bigger(embedding_file)
        self.origin_sample = load_bigger(input_file)
        num = len(self.origin_sample)

        # Same partitioning as before: (start, end) index pairs per block.
        workers = []
        start = 0
        end = min(block_size, num - 1)
        for block in range(int(num / block_size) + 1):
            workers.append(
                threading.Thread(target=self.embedding_test_agent,
                                 args=(
                                     start,
                                     end,
                                     block,
                                 )))
            start = end + 1
            end = min(num - 1, block_size * (block + 2))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # Flatten in key order with a single pass; ``sum(lists, [])`` copies
        # the accumulated list on every addition and is quadratic.
        results = [
            score
            for key in sorted(self.wordresult)
            for score in self.wordresult[key]
        ]

        group_size = 10
        group_num = len(results) // group_size
        correctnum = 0
        top3num = 0
        for index in range(group_num):
            scores = np.array(
                results[index * group_size:(index + 1) * group_size])
            # Position 0 of every group is the reference candidate.
            if scores.argmax() == 0:
                correctnum += 1
            if 0 in scores.argsort()[-3:][::-1]:
                top3num += 1

        # Guard the ratios: fewer than ``group_size`` results used to raise
        # ZeroDivisionError here.
        correct_ratio = correctnum / group_num if group_num else 0.0
        top3_ratio = top3num / group_num if group_num else 0.0
        print(correctnum, top3num, group_num,
              spend_time(version),
              str(correct_ratio)[:5],
              str(top3_ratio)[:5])
        end_time(version)
Exemple #3
0
def make_data_train(revs,
                    word_idx_map,
                    max_l=50,
                    validation_num=50000,
                    block_size=200000):
    """
    Build the train / val index matrices from ``revs`` with worker threads.

    The index range of ``revs`` is cut into blocks of ``block_size`` and one
    thread per block runs ``make_data_theading(revs, word_idx_map, max_l,
    validation_num, start, end)``. All threads are created, then started,
    then joined. Afterwards the module-level lists ``trains`` and ``vals``
    are concatenated and converted to integer numpy arrays (presumably the
    workers append their per-block rows to those globals -- confirm against
    ``make_data_theading``).

    Returns:
        ``[train, val, test]`` as integer numpy arrays; ``test`` is always
        empty.
    """
    global trains, vals

    version = begin_time()
    total = len(revs)
    last_index = total - 1

    # Create every worker up front; none of them runs until the next loop.
    workers = []
    lower, upper = 0, min(block_size, last_index)
    for block_no in range(int(total / block_size) + 1):
        worker_args = (revs, word_idx_map, max_l, validation_num, lower,
                       upper)
        workers.append(
            threading.Thread(target=make_data_theading, args=worker_args))
        lower = upper + 1
        upper = min(last_index, block_size * (block_no + 2))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Merge the per-block lists collected in the module-level globals.
    train = np.array(sum(trains, []), dtype="int")
    val = np.array(sum(vals, []), dtype="int")
    test = np.array([], dtype="int")
    print('trainning data', len(train), 'val data', len(val), 'spend time:',
          spend_time(version))
    return [train, val, test]
 def calculate_test(self, input_file, block_size=10):
     """
     Calculate top-1 and top-3 accuracy from a file of scores.

     ``input_file`` is read as UTF-8 and every line is parsed as a float
     (assumes one numeric score per line -- confirm against the writer of
     this file). The scores are cut into consecutive groups of
     ``block_size``; a group counts as "correct" when its largest score is
     at position 0 and as "top 3" when position 0 is among its three
     largest scores. Trailing lines that do not fill a whole group are
     ignored.

     Prints: correct count, top-3 count, number of groups, elapsed time,
     and both ratios truncated to five characters.

     Args:
         input_file: path of the score file.
         block_size: number of scores per group.

     Returns:
         The top-1 ratio as a string truncated to five characters
         (``'0.0'`` when the file holds no complete group).

     Raises:
         ValueError: if a line cannot be parsed as a float.
     """
     version = begin_time()
     with codecs.open(input_file, 'r', 'utf-8') as f:
         # Compare numbers, not text: ranking the raw lines would order
         # them lexicographically (e.g. "1e-05" above "0.9").
         results = [float(line) for line in f.readlines()]

     group_num = len(results) // block_size
     correctnum = 0
     top3num = 0
     for index in range(group_num):
         scores = np.array(
             results[index * block_size:(index + 1) * block_size])
         # Position 0 of every group is the reference candidate.
         if scores.argmax() == 0:
             correctnum += 1
         if 0 in scores.argsort()[-3:][::-1]:
             top3num += 1

     # Guard the ratios: a file shorter than one group used to raise
     # ZeroDivisionError here.
     correct_ratio = correctnum / group_num if group_num else 0.0
     top3_ratio = top3num / group_num if group_num else 0.0
     print(correctnum, top3num, group_num,
           spend_time(version),
           str(correct_ratio)[:5],
           str(top3_ratio)[:5])
     return str(correct_ratio)[:5]