Ejemplo n.º 1
0
    def get_char_id_map(self):
        """Build the character/id and class-label/id maps and pickle them.

        Every file found under ``self.parameters['train_corpus_dir']`` is
        read; the name of each file's parent directory is registered as a
        class label, and the characters of every cleaned line of at least 10
        characters are counted.  The most frequent characters (limited by
        ``self.parameters['char_set_size']`` minus the three reserved
        entries) receive ids that follow the reserved ones; characters found
        in ``stop_chars`` are skipped, which leaves their id unused.

        The three resulting maps are written to the paths stored under
        ``'char_id_map_file'``, ``'id_char_map_file'`` and
        ``'class_label_id_map_file'`` in ``self.parameters``.
        Returns nothing.
        """
        class_label_id_map = {}
        char_freq_map = {}
        all_files = []
        print("正字读取语料,并统计字的频率")
        # find_all_files fills the list passed as its second argument.
        utils.find_all_files(self.parameters['train_corpus_dir'], all_files)
        count = 0
        for file_name in all_files:
            # The class label is the name of the file's parent directory.
            class_name = file_name.split("/")[-2]
            if class_name not in class_label_id_map:
                class_label_id_map[class_name] = len(class_label_id_map)

            # NOTE(review): sibling methods unpack each item returned by
            # read_lines_small_file as ``[text, file_name]``; if that also
            # holds here, the inner loop counts the file name as a line too.
            # Confirm against utils before changing.
            for lines in utils.read_lines_small_file(file_name):
                for line in lines:
                    line = preProcess.filtUrl(line)
                    line = line.replace(' ', '').replace('\n', '')
                    if len(line) < 10:
                        continue
                    count += 1
                    if count % 10000 == 0:
                        print("已经读取了", count, '行。', "字符数量是",
                              len(char_freq_map))
                    for char in line:
                        char_freq_map[char] = char_freq_map.get(char, 0) + 1

        print("正在为每一个字分配一个id")
        # Ids 0-2 are reserved for the special tokens.
        char_id_map = {'unk': 0, 'pad_char': 1, 'stop_char': 2}
        id_char_map = {0: 'unk', 1: 'pad_char', 2: 'stop_char'}
        init_char_id_map_size = len(char_id_map)
        keep = self.parameters['char_set_size'] - len(id_char_map)
        char_freq_list = sorted(char_freq_map.items(),
                                key=lambda item: item[1],
                                reverse=True)[:keep]
        for rank, (char, _) in enumerate(char_freq_list):
            if char not in stop_chars:
                char_id_map[char] = rank + init_char_id_map_size
                id_char_map[rank + init_char_id_map_size] = char

        # Context managers make sure each pickle file is flushed and closed.
        with open(self.parameters['char_id_map_file'], 'wb') as out_file:
            pickle.dump(char_id_map, out_file)
        print(char_id_map.keys())
        with open(self.parameters['id_char_map_file'], 'wb') as out_file:
            pickle.dump(id_char_map, out_file)
        with open(self.parameters['class_label_id_map_file'],
                  'wb') as out_file:
            pickle.dump(class_label_id_map, out_file)
    def load_test_data(self):
        """Load the whole test corpus as a pair of numpy arrays.

        All samples from the files under
        ``self.parameters['test_corpus_dir']`` are gathered and shuffled.
        Each sample is a ``[text, file_name]`` pair; the text is cleaned,
        truncated to ``self.parameters['max_text_length']`` characters and
        converted with ``self.trans_char2id``.  The label is looked up in
        ``self.class_label_one_hot`` by the name of the file's parent
        directory.  Samples whose cleaned text is empty are dropped.

        Returns:
            ``(x_batch, y_batch)``: arrays of id lists and matching labels.
        """
        samples = []
        for path in utils.find_all_files(self.parameters['test_corpus_dir'],
                                         []):
            samples.extend(utils.read_lines_small_file(path))
        random.shuffle(samples)

        features = []
        labels = []
        per_class_counts = {}
        for raw_text, source_path in samples:
            cleaned = preProcess.filtUrl(raw_text)
            cleaned = cleaned.replace(" ", '').replace('\n', '')
            cleaned = cleaned[:self.parameters['max_text_length']]
            if len(cleaned) < 1:
                continue
            char_ids = self.trans_char2id(cleaned)
            # The parent directory name is the class label.
            label = source_path.split('/')[-2]
            per_class_counts[label] = per_class_counts.get(label, 0) + 1
            one_hot = self.class_label_one_hot[label]
            features.append(char_ids)
            labels.append(one_hot)
        return np.array(features), np.array(labels)
    def fit(self, if_static_embeding=True):
        """Train on the training corpus, evaluating on the test set each epoch.

        For each of 10000 epochs the training files under
        ``self.parameters['train_corpus_dir']`` are re-read and shuffled,
        converted to id lists / one-hot labels, and fed to ``self.train`` in
        mini-batches of 50.  After every epoch the test set returned by
        ``self.load_test_data`` is evaluated, the merged summary is written
        through ``self.writer``, the losses are printed and a checkpoint is
        saved to ``self.parameters['check_points_dir'] + '/model'``.

        Args:
            if_static_embeding: not used by this method; kept for interface
                compatibility.
        """
        file_list = utils.find_all_files(self.parameters['train_corpus_dir'],
                                         [])
        test_input, test_output = self.load_test_data()
        batch_size = 50
        test_feed = {self.X: test_input, self.Y: test_output}

        for epoch in range(10000):
            lines = []
            for file_name in file_list:
                lines += utils.read_lines_small_file(file_name)
            random.shuffle(lines)

            x_batch = []
            y_batch = []
            for text, file_name in lines:
                text = preProcess.filtUrl(text)
                text = text.replace(" ", '').replace(
                    '\n', '')[:self.parameters['max_text_length']]
                id_list = self.trans_char2id(text)
                # The parent directory name is the class label.
                class_label = file_name.split('/')[-2]
                x_batch.append(id_list)
                y_batch.append(self.class_label_one_hot[class_label])
            x_batch = np.array(x_batch)
            y_batch = np.array(y_batch)

            # Train.  loss_1 keeps the loss of the last mini-batch; it stays
            # None when the epoch had no samples, so the report below cannot
            # hit an unbound name.
            loss_1 = None
            for i in range(0, y_batch.shape[0], batch_size):
                a_x_batch = x_batch[i:i + batch_size, :]
                a_y_batch = y_batch[i:i + batch_size, :]
                _, loss_1 = self.sess.run(
                    [self.train, self.losses],
                    feed_dict={self.X: a_x_batch, self.Y: a_y_batch})

            # Evaluate on the test set once, so that the printed numbers and
            # the written summary come from the same run.
            loss, accuracy, merg = self.sess.run(
                [self.losses, self.accuracy, self.merged],
                feed_dict=test_feed)

            self.writer.add_summary(merg, epoch)
            print(epoch, loss_1, "在测试集中的loss为", loss, 'accuracy为', accuracy)
            self.saver.save(self.sess,
                            self.parameters['check_points_dir'] + '/model')
            # The original called tf.reset_default_graph() here.  TensorFlow
            # documents that as undefined behaviour while a session is
            # active, and self.sess is reused by the next epoch, so the call
            # was removed.
        self.writer.close()
Ejemplo n.º 4
0
    def fit(self, if_static_embeding=True):
        """Train a character-level Word2Vec model incrementally.

        Files under ``self.parameters['train_corpus_for_embedding']`` are
        visited in random order.  Each file's lines are split on ``'#'``;
        rows with exactly 8 fields whose 7th field is longer than 50
        characters contribute the part of that field after ``'kabukabu'``.
        The concatenated text of a file (spaces removed) becomes one document
        made of single characters.

        Documents are trained in batches of 10: the first batch creates the
        model, later batches extend the vocabulary and continue training.
        The model is saved to ``./model/word2vec.model`` after every 50
        documents and once more at the end.  A trailing batch with fewer
        than 10 documents is not trained.

        Args:
            if_static_embeding: not used by this method; kept for interface
                compatibility.
        """
        model = None
        file_list = utils.find_all_files(
            self.parameters['train_corpus_for_embedding'], [])
        print("训练数据的文件数量是", len(file_list))
        count = 0
        x_batch = []
        step = 0

        random.shuffle(file_list)
        for file_name in file_list:
            lines = utils.read_lines_small_file(file_name)
            rows = (line.split('#') for line in lines)
            posts = [
                row[6].split('kabukabu')[1].replace(
                    'd_post_content j_d_post_content  clearfix"> ', '')
                for row in rows
                if len(row) == 8 and len(row[6]) > 50
            ]
            text = ''.join(posts).replace(' ', '')
            if len(text) == 0:
                continue

            count += 1
            # One document = the list of its characters.
            x_batch.append(list(text))
            if len(x_batch) == 10:
                # Shuffle the batch in place.  Documents differ in length, so
                # they are kept as plain lists instead of being packed into
                # an ndarray (a ragged array is rejected by recent NumPy).
                random.shuffle(x_batch)
                # Train.
                print("这是第", step)
                if model is None:
                    model = Word2Vec(x_batch,
                                     size=200,
                                     window=5,
                                     min_count=5,
                                     workers=8,
                                     iter=200)
                else:
                    model.build_vocab(x_batch, update=True)
                    model.train(x_batch,
                                total_examples=len(x_batch),
                                epochs=200)
                step += 1
                x_batch = []
            if count % 50 == 0 and model is not None:
                model.save("./model/word2vec.model")

        # Persist whatever was trained after the last periodic save.
        if model is not None and count % 50 != 0:
            model.save("./model/word2vec.model")
Ejemplo n.º 5
0
    def fit(self, if_static_embeding=True):
        """Train in batches of 500 samples, checkpointing every 5000 samples.

        For 1000 epochs the files under
        ``self.parameters['train_corpus_dir']`` are shuffled and read one by
        one.  Each non-empty text from ``self.get_title_content`` is
        converted with ``self.trans_char2id`` and labelled by the name of its
        parent directory.  Whenever 500 samples have accumulated they are
        shuffled and passed to ``self.train``.  Every 5000 accepted samples
        the last training loss is printed, the test set is evaluated and a
        checkpoint is saved.

        Args:
            if_static_embeding: not used by this method.
        """
        train_files = utils.find_all_files(
            self.parameters['train_corpus_dir'], [])
        test_input, test_output = self.load_test_data()
        seen = 0
        features = []
        labels = []
        for epoch in range(1000):
            random.shuffle(train_files)
            for path in train_files:
                print(path)
                text = self.get_title_content(
                    utils.read_lines_small_file(path))
                if len(text) < 1:
                    continue
                char_ids = self.trans_char2id(text)
                # The parent directory name is the class label.
                label = path.split('/')[-2]
                print(self.class_label_one_hot)
                one_hot = self.class_label_one_hot[label]
                seen += 1

                features.append(char_ids)
                labels.append(one_hot)
                if len(features) == 500:
                    # Shuffle the full batch with one shared permutation.
                    order = list(range(500))
                    random.shuffle(order)
                    features = np.array(features)[order]
                    labels = np.array(labels)[order]

                    # Train on the batch, then start collecting a new one.
                    _, loss, accuracy = self.sess.run(
                        [self.train, self.losses, self.accuracy],
                        feed_dict={self.X: features, self.Y: labels})
                    features = []
                    labels = []

                if seen % 5000 == 0:
                    print('epoch ', epoch, " loss is ", loss, '。 accuracy is ',
                          accuracy)
                    loss, accuracy = self.sess.run(
                        [self.losses, self.accuracy],
                        feed_dict={self.X: test_input, self.Y: test_output})
                    print("在测试集中的loss为", loss, 'accuracy为', accuracy)
                    self.saver.save(
                        self.sess,
                        self.parameters['check_points_dir'] + '/model')
Ejemplo n.º 6
0
def parse_original_xmls(dirname, pickle=True):
    """Return the parsed annotations for the ``.xml`` files under *dirname*.

    When *pickle* is true, ``<dirname>/annotation.pkl`` is used as a cache:
    if that file exists the annotations are loaded from it; otherwise the
    XML files are parsed and the result is written to it.  When *pickle* is
    false the XML files are always parsed and nothing is written.

    Args:
        dirname: directory searched for ``.xml`` files (via
            ``find_all_files``) and holding the cache file.
        pickle: whether to read/write the ``annotation.pkl`` cache.

    Returns:
        The list of values returned by ``parse`` for each XML file, or the
        object stored in the cache file.
    """
    pickle_file = os.path.join(dirname, 'annotation.pkl') if pickle else None
    if pickle and os.path.isfile(pickle_file):
        logging.info("Loading annotations from file %s", pickle_file)
        # Pickle data is binary; text mode can corrupt it on some platforms.
        with open(pickle_file, 'rb') as f:
            annotations = cPickle.load(f)
        logging.info("Load annotations complete")
        return annotations

    logging.info("Reading annotations")
    annotations = [parse(f) for f in find_all_files(dirname, '.xml')]
    if pickle:
        logging.info("Saving annotations to file %s", pickle_file)
        with open(pickle_file, 'wb') as f:
            cPickle.dump(annotations, f)
    return annotations
Ejemplo n.º 7
0
    def cleanup_checkpoints(self) -> None:
        """Delete old "latest" checkpoints, keeping only the newest ones.

        Does nothing unless ``self.is_remove_old_checkpoint`` is set.
        Otherwise the files found in ``self.log_dir`` that match
        ``self.latest_checkpoint_pattern`` are ordered by the integer
        captured in group 1 of that pattern (largest first), and every file
        after the first ``self.num_latest_checkpoints_kept`` is removed.
        """
        if not self.is_remove_old_checkpoint:
            # Nothing to clean up when old checkpoints are meant to be kept.
            return

        def captured_number(path):
            # Integer captured by group 1 of the pattern in the file name.
            found = re.search(self.latest_checkpoint_pattern, path.name)
            return int(found.group(1))

        candidates = find_all_files(
            checkpoint_dir=self.log_dir,
            search_pattern=self.latest_checkpoint_pattern)
        # Most recent (largest number) first.
        candidates.sort(key=captured_number, reverse=True)

        stale_paths = candidates[self.num_latest_checkpoints_kept:]
        for stale in stale_paths:
            print(f"Removing old checkpoint \"{stale}\"", flush=True)
            stale.unlink()
Ejemplo n.º 8
0
 def load_test_data(self):
     """Load at most 20 test samples per class as numpy arrays.

     Every file under ``self.parameters['test_corpus_dir']`` is read and
     turned into text with ``self.get_title_content``; files with empty
     text are skipped.  The label comes from ``self.class_label_one_hot``,
     keyed by the name of the file's parent directory.  Once a class has
     contributed 20 samples, further files of that class are ignored.

     Returns:
         ``(x_batch, y_batch)``: arrays of id lists and matching labels.
     """
     features = []
     labels = []
     per_class_counts = {}
     for path in utils.find_all_files(self.parameters['test_corpus_dir'],
                                      []):
         text = self.get_title_content(utils.read_lines_small_file(path))
         if len(text) < 1:
             continue
         char_ids = self.trans_char2id(text)
         # The parent directory name is the class label.
         label = path.split('/')[-2]
         per_class_counts[label] = per_class_counts.get(label, 0) + 1
         if per_class_counts[label] > 20:
             continue
         print(self.class_label_one_hot)
         one_hot = self.class_label_one_hot[label]
         features.append(char_ids)
         labels.append(one_hot)
     return np.array(features), np.array(labels)
Ejemplo n.º 9
0
    def execute_operation(self):
        """Run the search selected by the radio buttons and record the result.

        Calls a different ``utils`` search function depending on the radio
        button value and fills the dictionaries used later for report
        generation:

        * 1 - all files in the folder
        * 2 - files with a given extension
        * 3 - files containing a keyword
        * 4 - files selected by date

        An error dialog is shown when a required field is empty or when the
        folder is not an existing directory.  On success the verified files
        are appended to ``self.match_hashset``, the matching report
        dictionary is updated and the results are displayed.
        """
        folder = self.katalog_entry.get()
        ext = self.ext_entry.get()
        keyword = self.keyword_entry.get()
        date = self.date_entry.get()

        # mode -> (required inputs, search call, report key,
        #          report dict attribute, dialog shown when inputs are missing)
        operations = {
            1: ((folder, ),
                lambda: utils.find_all_files(folder),
                folder, 'allfiles',
                ('Error', 'Du måste ange en katalog!')),
            2: ((folder, ext),
                lambda: utils.find_specific_files(folder, ext),
                ext, 'specificfiles',
                ('Error', 'Du måste ange både katalog och filändelse!')),
            3: ((folder, ext, keyword),
                lambda: utils.search_files(folder, ext, keyword),
                keyword, 'infofiles',
                ('Error!', 'Du måste ange katalog, filändelse och sökord!')),
            4: ((folder, date),
                lambda: utils.find_modified_files(folder, date),
                date, 'datefiles',
                ('Error!', 'Du måste ange katalog och datum!')),
        }

        operation = operations.get(self.radiovar.get())
        if operation is None:
            return
        required, search, report_key, report_attr, missing_dialog = operation

        if not all(required):
            tkinter.messagebox.showerror(*missing_dialog)
            return
        if not os.path.isdir(folder):
            tkinter.messagebox.showerror('Error',
                                         'Detta är inte en äkta katalog!')
            return

        found = search()
        self.match_hashset += utils.verify_files(found)
        utils.create_dict(report_key, getattr(self, report_attr), found)
        self.display_results(found)
Ejemplo n.º 10
0
def _ask_or_back(prompt):
    """Prompt the user; return None when they typed ``q`` or ``back``."""
    answer = input(prompt)
    if answer in ("q", "back"):
        return None
    return answer


def _search_menu(allfiles, specificfiles, infofiles, datefiles,
                 match_hashset):
    """Run the search sub-menu until the user goes back.

    Search results are stored in the report dictionary that matches the
    search type, and the verified files are appended to *match_hashset* in
    place so that every search contributes to the report.
    """
    while True:
        print("\n")
        print("##########################################")
        print("# [1] Find all files [2] File Extension  #")
        print("# [3] By date        [4] Search in files #")
        print('#  q or "back" to go back                #')
        print("##########################################")
        ch2 = input("$ ")

        if ch2 in ("q", "back"):
            return

        if ch2 == "1":
            path = _ask_or_back("$ Path to folder: ")
            if path is None:
                return
            list_tmp = utils.find_all_files(path)
            utils.create_dict(path, allfiles, list_tmp)
        elif ch2 == "2":
            ext = _ask_or_back("$ Extension: ")
            if ext is None:
                return
            folder = _ask_or_back("$ Path to folder: ")
            if folder is None:
                return
            list_tmp = utils.find_specific_files(folder, ext)
            utils.create_dict(ext, specificfiles, list_tmp)
        elif ch2 == "3":
            folder = _ask_or_back("$ Path to folder: ")
            if folder is None:
                return
            date = _ask_or_back("$ Date (Ex format: 2020-03-03): ")
            if date is None:
                return
            list_tmp = utils.find_modified_files(folder, date)
            utils.create_dict(date, datefiles, list_tmp)
        elif ch2 == "4":
            folder = _ask_or_back("$ Path to folder: ")
            if folder is None:
                return
            ext = _ask_or_back("$ Extension: ")
            if ext is None:
                return
            keyword = _ask_or_back("$ Keyword: ")
            if keyword is None:
                return
            list_tmp = utils.search_files(folder, ext, keyword)
            utils.create_dict(keyword, infofiles, list_tmp)
        else:
            # Unknown choice: show the menu again.
            continue

        # Accumulate (never overwrite) the matches of every search type.
        match_hashset.extend(utils.verify_files(list_tmp))
        print_results(list_tmp)


def _encryption_menu():
    """Run the encrypt/decrypt sub-menu until the user goes back."""
    while True:
        print("\n")
        print("###########################")
        print("# [1] Encrypt [2] Decrypt #")
        print('#  q or "back" to go back #')
        print("###########################")
        ch2 = input("$ ")

        if ch2 in ("q", "back"):
            return

        if ch2 == "1":
            filename = _ask_or_back("$ Path to file: ")
            if filename is None:
                return
            utils.encrypt_file(filename)
            print(filename + " has been encrypted.")
        elif ch2 == "2":
            filename = _ask_or_back("$ Path to file: ")
            if filename is None:
                return
            utils.decrypt_file(filename)
            print(filename + " has been decrypted.")


def _difference_menu():
    """Compare pairs of files word by word until the user goes back."""
    while True:
        print("\n")
        print(' q or "back" to go back')
        file1 = _ask_or_back("$ File 1: ")
        if file1 is None:
            return
        file2 = _ask_or_back("$ File 2: ")
        if file2 is None:
            return

        file1_diff, file2_diff = utils.word_difference(file1, file2)
        print()
        print("Words in file 1, but not in file 2:")
        print_results(file1_diff)
        print("Words in file 2, but not in file 1:")
        print_results(file2_diff)


def main():
    """Run the interactive console menu until the user quits.

    The main menu offers file search, encryption, file difference, system
    information and report generation.  Results of all searches performed
    during the session are collected and passed to ``utils.gen_report``
    when the report option is chosen.
    """
    allfiles = {}
    specificfiles = {}
    infofiles = {}
    datefiles = {}
    match_hashset = []

    while True:
        print("\n")
        print("################################################")
        print("# [1]Search  [2]Encryption  [3]File Difference #")
        print("# [4]System Info [5]Generate report            #")
        print('#  q or "exit" to exit                         #')
        print("################################################")
        ch = input("$ ")

        if ch == "1":
            _search_menu(allfiles, specificfiles, infofiles, datefiles,
                         match_hashset)
        elif ch == "2":
            _encryption_menu()
        elif ch == "3":
            _difference_menu()
        elif ch == "4":
            print_results(utils.system_information())
        elif ch == "5":
            dictionary = {
                'sys': utils.system_information(),
                'hashset': match_hashset,
                'allfiles': allfiles,
                'extfiles': specificfiles,
                'infofiles': infofiles,
                'datefiles': datefiles,
            }
            utils.gen_report(dictionary)
            print("The report has been generated!")
        elif ch in ("q", "exit"):
            print("\n")
            print(" Cya! ")
            print("\n")
            break