コード例 #1
0
def batch_processing_files(files_list,
                           segment_out_dir,
                           batchSize,
                           stopwords=None):
    '''
    Segment files into words in batches, writing the segmented content of
    every ``batchSize`` input files into one output file named
    ``segment_<i>.txt`` inside ``segment_out_dir``.
    :param files_list: list of input files to segment
    :param segment_out_dir: output directory for the segmented files; it is
        created when missing and handed to ``files_processing.delete_dir_file``
        before anything is written (presumably to clear earlier output --
        confirm against that helper)
    :param batchSize: number of input files merged into each output file
    :param stopwords: stop words forwarded to ``segment_files_list``;
        ``None`` (the default) is treated as an empty list
    :return: None
    '''
    # Use a fresh list per call instead of one mutable default object that
    # would be shared by every call and by the helper it is passed to.
    if stopwords is None:
        stopwords = []

    if not os.path.exists(segment_out_dir):
        os.makedirs(segment_out_dir)
    files_processing.delete_dir_file(segment_out_dir)

    sample_num = len(files_list)
    # Round up so a trailing partial batch still gets its own output file.
    batchNum = int(math.ceil(1.0 * sample_num / batchSize))
    for i in range(batchNum):
        segment_out_name = os.path.join(segment_out_dir,
                                        'segment_{}.txt'.format(i))
        start = i * batchSize
        # Clamp the upper bound: the last batch may hold fewer files.
        end = min((i + 1) * batchSize, sample_num)
        batch_files = files_list[start:end]
        content_list = segment_files_list(batch_files,
                                          stopwords,
                                          segment_type='word')
        save_content_list(segment_out_name, content_list, mode='ab')
        print("segment files:{}".format(segment_out_name))
コード例 #2
0
def save_multi_file(files_list, labels_list, word2vec_path, out_dir, prefix, batchSize, max_sentence_length,
                    labels_set=None, shuffle=False):
    '''
    Map file contents to word-index matrices and save the data, together with
    the encoded labels, as multiple ``*.npy`` files (one per batch).
    :param files_list: list of input files
    :param labels_list: labels, one per entry of ``files_list``
    :param word2vec_path: location of the word2vec model
    :param out_dir: directory the output files are saved to; created when
        missing and handed to ``files_processing.delete_dir_file`` first
    :param prefix: file-name prefix of the saved files
    :param batchSize: number of input files whose content goes into one saved file
    :param max_sentence_length: forwarded to the segmentation and
        index-mapping helpers
    :param labels_set: set of labels; derived from ``labels_list`` when None
    :param shuffle: when True, ``files_list`` and ``labels_list`` are shuffled
        IN PLACE (the caller's lists are reordered) with a fixed seed
    :return: None
    '''

    if not os.path.exists(out_dir):
        # makedirs (not mkdir) so a nested output path can be created too.
        os.makedirs(out_dir)
    # Delete all files already present in the output directory.
    files_processing.delete_dir_file(out_dir)
    if shuffle:
        # Re-seeding with the same value before each shuffle applies the same
        # permutation to both (equal-length) lists, keeping files and labels
        # aligned.
        random.seed(100)
        random.shuffle(files_list)
        random.seed(100)
        random.shuffle(labels_list)
    sample_num = len(files_list)
    w2vModel = load_wordVectors(word2vec_path)
    if labels_set is None:
        # Fixed: the original referenced the undefined name ``label_list``,
        # raising NameError whenever labels_set was left at its default.
        labels_set = files_processing.get_labels_set(labels_list)
    labels_list, labels_set = files_processing.labels_encoding(labels_list, labels_set)
    labels_list = labels_list.tolist()
    # Round up so a trailing partial batch still gets its own file.
    batchNum = int(math.ceil(1.0 * sample_num / batchSize))
    for i in range(batchNum):
        start = i * batchSize
        # Clamp the upper bound: the last batch may hold fewer samples.
        end = min((i + 1) * batchSize, sample_num)
        batch_files = files_list[start:end]
        batch_labels = labels_list[start:end]
        # Read the file contents and segment them into words.
        batch_content = files_processing.read_files_list_to_segment(batch_files,
                                                                    max_sentence_length,
                                                                    padding_token='<PAD>',
                                                                    segment_type='word')
        # Convert the words into an index matrix.
        batch_indexMat = word2indexMat(w2vModel, batch_content, max_sentence_length)
        # Turn the labels into a column vector (one row per sample).
        batch_labels = np.asarray(batch_labels)
        batch_labels = batch_labels.reshape([len(batch_labels), 1])
        # Save the combined labels + index matrix as a *.npy file.
        filename = os.path.join(out_dir, prefix + '{0}.npy'.format(i))
        labels_indexMat = cat_labels_indexMat(batch_labels, batch_indexMat)
        np.save(filename, labels_indexMat)
        print('step:{}/{}, save:{}, data.shape{}'.format(i, batchNum, filename, labels_indexMat.shape))