def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(
        self.parameters['test_corpus_dir'], [])
    # pair each line with its source file so the class label can still be
    # recovered after shuffling
    lines = []
    for file_name in test_file_list:
        lines += [(line, file_name)
                  for line in utils.read_lines_small_file(file_name)]
    random.shuffle(lines)
    for text, file_name in lines:
        text = preProcess.filtUrl(text)
        text = text.replace(' ', '').replace(
            '\n', '')[:self.parameters['max_text_length']]
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        # the parent directory of each file is its class label
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        # optionally cap the test set per class:
        # if class_num_map[class_label] > 20: continue
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    return np.array(x_batch), np.array(y_batch)
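# trans_char2id is used throughout this file but not defined in it. A
# minimal sketch under stated assumptions: self.char_id_map is the map
# pickled by get_char_id_map below, unknown characters fall back to 'unk',
# and sequences are padded to max_text_length with 'pad_char'. The padding
# scheme is a guess, not something this file confirms.
def trans_char2id_sketch(self, text):
    ids = [self.char_id_map.get(ch, self.char_id_map['unk']) for ch in text]
    ids = ids[:self.parameters['max_text_length']]
    pad_id = self.char_id_map['pad_char']
    ids += [pad_id] * (self.parameters['max_text_length'] - len(ids))
    return ids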
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    batch_size = 50
    for epoch in range(10000):
        x_batch = []
        y_batch = []
        # pair each line with its source file, then shuffle
        lines = []
        for file_name in file_list:
            lines += [(line, file_name)
                      for line in utils.read_lines_small_file(file_name)]
        random.shuffle(lines)
        for text, file_name in lines:
            text = preProcess.filtUrl(text)
            text = text.replace(' ', '').replace(
                '\n', '')[:self.parameters['max_text_length']]
            if len(text) == 0:
                continue
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            x_batch.append(id_list)
            y_batch.append(self.class_label_one_hot[class_label])
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        # train on mini-batches; fetch only the train op and the metrics
        # (the original also fetched self.Y and self.prob_dist, unused)
        for i in range(0, y_batch.shape[0], batch_size):
            a_x_batch = x_batch[i:i + batch_size, :]
            a_y_batch = y_batch[i:i + batch_size, :]
            _, loss_1, accuracy = self.sess.run(
                [self.train, self.losses, self.accuracy],
                feed_dict={self.X: a_x_batch, self.Y: a_y_batch})
        # evaluate on the held-out test set once per epoch
        loss, accuracy = self.sess.run(
            [self.losses, self.accuracy],
            feed_dict={self.X: test_input, self.Y: test_output})
        merg = self.sess.run(
            self.merged,
            feed_dict={self.X: test_input, self.Y: test_output})
        self.writer.add_summary(merg, epoch)
        print(epoch, loss_1, 'test loss:', loss, 'test accuracy:', accuracy)
        self.saver.save(self.sess,
                        self.parameters['check_points_dir'] + '/model')
    tf.reset_default_graph()
    self.writer.close()
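# For context, a minimal TF1-style sketch of the graph attributes fit()
# relies on (self.X, self.Y, self.prob_dist, self.losses, self.accuracy,
# self.train, self.merged, self.saver). Only the attribute names come from
# the code above; the mean-pooled char-embedding model, the embedding size
# and the Adam optimizer are assumptions.
def build_graph_sketch(self, num_classes):
    self.X = tf.placeholder(tf.int32,
                            [None, self.parameters['max_text_length']])
    self.Y = tf.placeholder(tf.float32, [None, num_classes])
    emb = tf.get_variable('emb', [self.parameters['char_set_size'], 200])
    pooled = tf.reduce_mean(tf.nn.embedding_lookup(emb, self.X), axis=1)
    logits = tf.layers.dense(pooled, num_classes)
    self.prob_dist = tf.nn.softmax(logits)
    self.losses = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.Y,
                                                   logits=logits))
    correct = tf.equal(tf.argmax(logits, 1), tf.argmax(self.Y, 1))
    self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    self.train = tf.train.AdamOptimizer(1e-3).minimize(self.losses)
    tf.summary.scalar('loss', self.losses)
    self.merged = tf.summary.merge_all()
    self.saver = tf.train.Saver()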
def fit(self, if_static_embeding=True):
    model = None
    file_list = utils.find_all_files(
        self.parameters['train_corpus_for_embedding'], [])
    print('Number of training files:', len(file_list))
    count = 0
    step = 0
    x_batch = []
    random.shuffle(file_list)
    for file_name in file_list:
        lines = utils.read_lines_small_file(file_name)
        # each raw line is '#'-separated; field 6 holds the post body,
        # wrapped in a crawler delimiter and a leftover HTML class attribute
        lines = [x.split('#') for x in lines]
        lines = [x for x in lines if len(x) == 8 and len(x[6]) > 50]
        lines = [x[6].split('kabukabu')[1].replace(
            'd_post_content j_d_post_content clearfix"> ', '')
            for x in lines]
        text = ''.join(lines).replace(' ', '')
        if len(text) == 0:
            continue
        text = list(text)  # character-level tokens
        count += 1
        x_batch.append(text)
        if len(x_batch) == 10:
            # shuffle the 10 buffered documents before training
            random_index = list(range(10))
            random.shuffle(random_index)
            x_batch = np.array(x_batch)[random_index]
            print('training step', step)
            # size= and iter= follow the pre-4.0 gensim API
            if model is None:
                model = Word2Vec(x_batch, size=200, window=5,
                                 min_count=5, workers=8, iter=200)
            else:
                model.build_vocab(x_batch, update=True)
                model.train(x_batch, total_examples=x_batch.shape[0],
                            epochs=200)
            step += 1
            x_batch = []
            if count % 50 == 0:
                model.save('./model/word2vec.model')
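# Usage sketch: loading the incrementally trained character vectors back.
# These calls are the standard pre-4.0 gensim API, matching the size=/iter=
# parameters used above; the query character is only an example.
def load_char_vectors(path='./model/word2vec.model'):
    model = Word2Vec.load(path)
    vec = model.wv['好']                           # one 200-dim vector
    neighbors = model.wv.most_similar('好', topn=5)
    return vec, neighbors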
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    count = 0
    x_batch = []
    y_batch = []
    for epoch in range(1000):
        random.shuffle(file_list)
        for file_name in file_list:
            lines = utils.read_lines_small_file(file_name)
            text = self.get_title_content(lines)
            if len(text) == 0:
                continue
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            class_label_one_hot = self.class_label_one_hot[class_label]
            count += 1
            x_batch.append(id_list)
            y_batch.append(class_label_one_hot)
            if len(x_batch) == 500:
                # shuffle the batch before a training step
                random_index = list(range(500))
                random.shuffle(random_index)
                x_batch = np.array(x_batch)[random_index]
                y_batch = np.array(y_batch)[random_index]
                _, loss, accuracy = self.sess.run(
                    [self.train, self.losses, self.accuracy],
                    feed_dict={self.X: x_batch, self.Y: y_batch})
                x_batch = []
                y_batch = []
                if count % 5000 == 0:
                    print('epoch', epoch, 'loss is', loss,
                          'accuracy is', accuracy)
                    loss, accuracy = self.sess.run(
                        [self.losses, self.accuracy],
                        feed_dict={self.X: test_input, self.Y: test_output})
                    print('test loss:', loss, 'test accuracy:', accuracy)
                    self.saver.save(
                        self.sess,
                        self.parameters['check_points_dir'] + '/model')
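# self.class_label_one_hot is assumed to map each class label to a one-hot
# numpy vector. A sketch of how it could be built from the
# class_label_id_map pickled by get_char_id_map below; the construction is
# an assumption, only the map's contents come from that method.
def build_one_hot_map_sketch(self):
    with open(self.parameters['class_label_id_map_file'], 'rb') as f:
        class_label_id_map = pickle.load(f)
    n = len(class_label_id_map)
    self.class_label_one_hot = {}
    for label, idx in class_label_id_map.items():
        one_hot = np.zeros(n, dtype=np.float32)
        one_hot[idx] = 1.0
        self.class_label_one_hot[label] = one_hot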
def get_char_id_map(self):
    class_label_id_map = {}
    char_freq_map = {}
    all_files = []
    print('Reading the corpus and counting character frequencies')
    utils.find_all_files(self.parameters['train_corpus_dir'], all_files)
    count = 0
    for file_name in all_files:
        # the parent directory of each file is its class label
        class_name = file_name.split('/')[-2]
        if class_name not in class_label_id_map:
            class_label_id_map[class_name] = len(class_label_id_map)
        this_class_sample_size = 0
        # iterate lines directly (the original nested loop iterated the
        # characters of each line, so no line ever passed the length check)
        for line in utils.read_lines_small_file(file_name):
            line = preProcess.filtUrl(line)
            line = line.replace(' ', '').replace('\n', '')
            if len(line) < 10:
                continue
            count += 1
            this_class_sample_size += 1
            # optionally cap samples per class:
            # if this_class_sample_size == 100: break
            if count % 10000 == 0:
                print('read', count, 'lines;',
                      'distinct characters:', len(char_freq_map))
            for char in line:
                char_freq_map[char] = char_freq_map.get(char, 0) + 1
    print('Assigning an id to each character')
    char_id_map = {'unk': 0, 'pad_char': 1, 'stop_char': 2}
    id_char_map = {0: 'unk', 1: 'pad_char', 2: 'stop_char'}
    init_char_id_map_size = len(char_id_map)
    # keep the most frequent characters, leaving room for the special tokens
    char_freq_list = sorted(char_freq_map.items(),
                            key=lambda x: x[1], reverse=True)
    char_freq_list = char_freq_list[:self.parameters['char_set_size']
                                    - len(id_char_map)]
    for i in range(len(char_freq_list)):
        char, _ = char_freq_list[i]
        if char not in stop_chars:  # skipped stop chars leave id gaps
            char_id_map[char] = i + init_char_id_map_size
            id_char_map[i + init_char_id_map_size] = char
    with open(self.parameters['char_id_map_file'], 'wb') as f:
        pickle.dump(char_id_map, f)
    with open(self.parameters['id_char_map_file'], 'wb') as f:
        pickle.dump(id_char_map, f)
    with open(self.parameters['class_label_id_map_file'], 'wb') as f:
        pickle.dump(class_label_id_map, f)
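# Usage sketch: reading the three pickled maps back at startup. The
# attribute names are assumptions; only the parameter keys come from
# get_char_id_map above.
def load_maps_sketch(self):
    with open(self.parameters['char_id_map_file'], 'rb') as f:
        self.char_id_map = pickle.load(f)
    with open(self.parameters['id_char_map_file'], 'rb') as f:
        self.id_char_map = pickle.load(f)
    with open(self.parameters['class_label_id_map_file'], 'rb') as f:
        self.class_label_id_map = pickle.load(f)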
def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(
        self.parameters['test_corpus_dir'], [])
    for file_name in test_file_list:
        lines = utils.read_lines_small_file(file_name)
        text = self.get_title_content(lines)
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        if class_num_map[class_label] > 20:
            continue  # keep at most 20 test samples per class
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    return np.array(x_batch), np.array(y_batch)
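# get_title_content is called by fit() and load_test_data() above but not
# defined in this file. A purely hypothetical sketch: join the raw lines,
# strip URLs and whitespace, and truncate, mirroring the per-line cleanup
# done elsewhere in this file.
def get_title_content_sketch(self, lines):
    text = preProcess.filtUrl(''.join(lines))
    text = text.replace(' ', '').replace('\n', '')
    return text[:self.parameters['max_text_length']]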