import os
import math
import random

import numpy as np

# Project-local helpers (files_processing, segment_files_list, save_content_list,
# load_wordVectors, word2indexMat, cat_labels_indexMat) are assumed to be
# imported or defined elsewhere in this module.


def batch_processing_files(files_list, segment_out_dir, batchSize, stopwords=None):
    '''
    Segment files into words in batches, merging every batchSize files into one output file.
    :param files_list: list of input file paths
    :param segment_out_dir: output directory for the segmented files
    :param batchSize: number of files merged into one output file
    :param stopwords: stop words to filter out
    :return:
    '''
    if stopwords is None:  # avoid the mutable-default-argument pitfall
        stopwords = []
    if not os.path.exists(segment_out_dir):
        os.makedirs(segment_out_dir)
    # Clear any files left over in the output directory
    files_processing.delete_dir_file(segment_out_dir)

    sample_num = len(files_list)
    batchNum = int(math.ceil(1.0 * sample_num / batchSize))
    for i in range(batchNum):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))
        start = i * batchSize
        end = min((i + 1) * batchSize, sample_num)
        batch_files = files_list[start:end]
        content_list = segment_files_list(batch_files, stopwords, segment_type='word')
        # content_list = padding_sentences(content_list, padding_token='<PAD>', padding_sentence_length=15)
        save_content_list(segment_out_name, content_list, mode='ab')
        print("segment files:{}".format(segment_out_name))
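
# A minimal usage sketch for batch_processing_files. The corpus directory
# 'data/source', the output directory 'data/segment', and the
# files_processing.get_files_list helper are assumptions; adjust them to
# the actual project layout.
def demo_batch_segmentation():
    # Collect every *.txt file under the (hypothetical) raw-corpus directory,
    # then segment the files and merge each group of 1000 into one file.
    files_list = files_processing.get_files_list('data/source', postfix='*.txt')
    batch_processing_files(files_list, 'data/segment', batchSize=1000, stopwords=[])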
def save_multi_file(files_list, labels_list, word2vec_path, out_dir, prefix, batchSize,
                    max_sentence_length, labels_set=None, shuffle=False):
    '''
    Map file contents to word-index matrices and save the data as multiple *.npy files.
    :param files_list: list of input file paths
    :param labels_list: labels corresponding to files_list
    :param word2vec_path: path to the word2vec model
    :param out_dir: directory where the files are saved
    :param prefix: filename prefix for the saved files
    :param batchSize: number of files whose contents are saved into one output file
    :param max_sentence_length: sentences are padded/truncated to this length
    :param labels_set: set of labels
    :param shuffle: whether to shuffle files and labels in unison
    :return:
    '''
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    # Delete all files already in the output directory
    files_processing.delete_dir_file(out_dir)

    if shuffle:
        # Re-seed before each shuffle so files and labels stay aligned
        random.seed(100)
        random.shuffle(files_list)
        random.seed(100)
        random.shuffle(labels_list)

    sample_num = len(files_list)
    w2vModel = load_wordVectors(word2vec_path)
    if labels_set is None:
        labels_set = files_processing.get_labels_set(labels_list)  # fixed: was undefined label_list

    labels_list, labels_set = files_processing.labels_encoding(labels_list, labels_set)
    labels_list = labels_list.tolist()
    batchNum = int(math.ceil(1.0 * sample_num / batchSize))
    for i in range(batchNum):
        start = i * batchSize
        end = min((i + 1) * batchSize, sample_num)
        batch_files = files_list[start:end]
        batch_labels = labels_list[start:end]
        # Read file contents and segment them into words
        batch_content = files_processing.read_files_list_to_segment(batch_files,
                                                                    max_sentence_length,
                                                                    padding_token='<PAD>',
                                                                    segment_type='word')
        # Convert the words to an index matrix
        batch_indexMat = word2indexMat(w2vModel, batch_content, max_sentence_length)
        batch_labels = np.asarray(batch_labels)
        batch_labels = batch_labels.reshape([len(batch_labels), 1])
        # Concatenate labels with the index matrix and save as a *.npy file
        filename = os.path.join(out_dir, prefix + '{0}.npy'.format(i))
        labels_indexMat = cat_labels_indexMat(batch_labels, batch_indexMat)
        np.save(filename, labels_indexMat)
        print('step:{}/{}, save:{}, data.shape:{}'.format(i + 1, batchNum, filename, labels_indexMat.shape))
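
# A minimal usage sketch for save_multi_file. The labeled-corpus loader
# files_processing.get_files_labels, the model path 'models/word2vec.model',
# and all directory names are hypothetical placeholders; each saved .npy
# file packs the encoded labels with the index matrices of 1000 texts.
def demo_save_multi_file():
    files_list, labels_list = files_processing.get_files_labels('data/source')
    save_multi_file(files_list, labels_list,
                    word2vec_path='models/word2vec.model',
                    out_dir='data/train_data', prefix='train_data_',
                    batchSize=1000, max_sentence_length=300,
                    labels_set=None, shuffle=True)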