import datetime
import pickle

from sklearn.datasets.base import Bunch

# Load the training corpus
data_set = {}
# Path to the pickled training corpus
train_path = 'text_corpus1_wordbag/train_set.data'
file_obj = open(train_path, 'rb')

# Read back the persisted object
data_set = pickle.load(file_obj)
file_obj.close()

# Define the bag-of-words data structure
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
wordbag.filenames = data_set.filenames

# Build the corpus
corpus = data_set.contents

# Load the stop-word list from file
stpwrdpath = 'extra_dict/hlt_stop_words.txt'
stpwrd_dic = open(stpwrdpath, 'r', encoding='utf-8')  # assumes the stop-word file is UTF-8 encoded
stpwrd_content = stpwrd_dic.read()

# Convert the stop words to a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()

# Time the bag-of-words construction: record the start time
start = datetime.datetime.now()
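
# A minimal sketch of the vectorization step this setup leads up to: fill
# wordbag.tdm and wordbag.vocabulary with a TF-IDF weighted term-document
# matrix, report the elapsed time, and persist the result. The TfidfVectorizer
# parameters and the output path 'text_corpus1_wordbag/tfidf_set.data' are
# assumptions, not taken from the original code.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
wordbag.tdm = vectorizer.fit_transform(corpus)  # sparse term-document matrix
wordbag.vocabulary = vectorizer.vocabulary_     # term -> column index mapping

# Elapsed time since `start`
print('wordbag built in', datetime.datetime.now() - start)

# Persist the bag-of-words object (hypothetical output path)
with open('text_corpus1_wordbag/tfidf_set.data', 'wb') as out_obj:
    pickle.dump(wordbag, out_obj)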
                            bunch.lable.append(list[0])
                elif children.tag == 'contenttitle':
                    contenttitle = children.text
                elif children.tag == 'content':
                    content = str(contenttitle)+' '+str(children.text)
                    if (len(content) > 0):
                        seg = jieba.cut(content, cut_all=False)
                        bunch.contents.append(' '.join(seg))
                    else:
                        bunch.contents.append('null')
        print('finish train file:', filePath)
fileutils.saveBatchObj(trainRawPath, bunch)
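
# fileutils.saveBatchObj is a project-local helper that is not shown in this
# snippet. A plausible pickle-based stand-in, given purely as an assumption
# about what it does (serialize the Bunch to disk):
import pickle

def save_batch_obj(path, obj):
    # illustrative stand-in for fileutils.saveBatchObj
    with open(path, 'wb') as f:
        pickle.dump(obj, f)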

# parse all the test data and save it into the bunch
bunch.lable = []
bunch.filenames = []
bunch.contents = []
contenttitle = ''
for file in os.listdir(testDataPath):
    filePath = testDataPath + os.sep + file
    if os.path.isdir(filePath):
        print(file, 'is a directory, skipping')
        continue
    with open(filePath, 'r') as file:
        text = file.read()
        text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text)
        root = ET.fromstring(text)
        for child in root:
            # second-level node: tag name and attributes; walk the third level of the XML document
            for children in child:
                # third-level node: tag name and attributes
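                # A hedged sketch of the rest of this branch, mirroring the
                # train-data branch above; the 'url' handling and testRawPath
                # are assumptions, not taken from the original code.
                if children.tag == 'url':
                    # assumed: extract the class label from the URL, as the
                    # train loop does with bunch.lable.append(...)
                    pass
                elif children.tag == 'contenttitle':
                    contenttitle = children.text
                elif children.tag == 'content':
                    content = str(contenttitle) + ' ' + str(children.text)
                    if len(content) > 0:
                        seg = jieba.cut(content, cut_all=False)
                        bunch.contents.append(' '.join(seg))
                    else:
                        bunch.contents.append('null')
        print('finish test file:', filePath)
fileutils.saveBatchObj(testRawPath, bunch)  # testRawPath: assumed counterpart of trainRawPath
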
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask


datasets = load_mask_images()

data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
        ):
    # rect: (min_x, min_y, max_x, max_y)
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))
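
# bounding_rect_of_mask comes from the project-local dip.mask module and its
# implementation is not shown here. Purely as an illustration of the idea (an
# assumption, not the project's code): the bounding rectangle of a 2D mask can
# be read off the row/column indices of its foreground pixels.
def bounding_rect_sketch(mask_array):
    # returns (min_x, min_y, max_x, max_y) of the nonzero pixels
    ys, xs = np.nonzero(mask_array)
    return int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())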

bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
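
# A minimal sketch of reading the persisted Bunch back for later use, assuming
# the same 'rects.pkl.gz' path written above.
with gzip.open('rects.pkl.gz', 'rb') as f:
    rects = pickle.load(f)
print(rects.description)
print(rects.data.shape, len(rects.filenames))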