# Load the training corpus
import pickle
import datetime

from sklearn.datasets.base import Bunch  # sklearn.utils.Bunch in newer scikit-learn

data_set = {}
# Path to the persisted training set
train_path = 'text_corpus1_wordbag/train_set.data'
file_obj = open(train_path, 'rb')
# Read the pickled object back
data_set = pickle.load(file_obj)
file_obj.close()
# Define the bag-of-words data structure
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
wordbag.filenames = data_set.filenames
# Build the corpus from the segmented document texts
corpus = data_set.contents
# Load the stop-word list from file
stpwrdpath = 'extra_dict/hlt_stop_words.txt'
stpwrd_dic = open(stpwrdpath, 'rb')
stpwrd_content = stpwrd_dic.read()
# Decode and split the stop words into a list of strings
stpwrdlst = stpwrd_content.decode('utf-8').splitlines()
stpwrd_dic.close()
# Time the bag-of-words construction: record the start time
start = datetime.datetime.now()
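# The snippet above only stages the inputs (corpus, stop words, timer);
# the `tdm` and `vocabulary` slots it reserves are filled by the
# vectorization step. A minimal sketch of that step, assuming
# scikit-learn's TfidfVectorizer (the parameter values here are
# illustrative, not taken from the original):
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF term-document matrix over the segmented corpus,
# filtering out the stop words loaded above
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
wordbag.tdm = vectorizer.fit_transform(corpus)
wordbag.vocabulary = vectorizer.vocabulary_

# Stop the timer started above and report the elapsed time
elapsed = datetime.datetime.now() - start
print('wordbag built in', elapsed)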
# (Excerpt: the enclosing loop over the training files and the first
#  branch of this if/elif chain are truncated in the source.)
                bunch.label.append(list[0])  # `list` is assigned in the truncated code above and shadows the builtin
            elif children.tag == 'contenttitle':
                contenttitle = children.text
            elif children.tag == 'content':
                content = str(contenttitle) + ' ' + str(children.text)
                if len(content) > 0:
                    # Segment the Chinese text with jieba (accurate mode)
                    seg = jieba.cut(content, cut_all=False)
                    bunch.contents.append(' '.join(seg))
                else:
                    bunch.contents.append('null')
    print('finish train file:', filePath)
fileutils.saveBatchObj(trainRawPath, bunch)

# Parse all test data and save it to a bunch
bunch.label = []
bunch.filenames = []
bunch.contents = []
contenttitle = ''
for file in os.listdir(testDataPath):
    filePath = testDataPath + os.sep + file
    if os.path.isdir(filePath):
        print(file, ' is dir. continue')
        continue
    with open(filePath, 'r') as f:
        text = f.read()
        # Strip control characters and separators that break the XML parser
        text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text)
        root = ET.fromstring(text)
        for child in root:           # second-level nodes; descend to the third level
            for children in child:   # third-level nodes (tag name and attributes)
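# For reference, a self-contained sketch of the per-document parse the
# excerpt above performs, assuming a Sogou-news-style layout
# <doc><url>...</url><contenttitle>...</contenttitle><content>...</content></doc>.
# The <url> tag and the way the class label is derived from it are
# assumptions, since the first branch of the chain is truncated above.
import xml.etree.ElementTree as ET
import jieba

sample = '''<docs>
  <doc>
    <url>http://sports.example.com/news/1.htm</url>
    <contenttitle>比赛结果</contenttitle>
    <content>主队以三比一获胜。</content>
  </doc>
</docs>'''

labels, contents = [], []
root = ET.fromstring(sample)
for doc in root:
    title = ''
    for node in doc:
        if node.tag == 'url':
            # e.g. take the subdomain ('sports') as the class label (assumed scheme)
            labels.append(node.text.split('//')[1].split('.')[0])
        elif node.tag == 'contenttitle':
            title = node.text
        elif node.tag == 'content':
            body = str(title) + ' ' + str(node.text)
            contents.append(' '.join(jieba.cut(body, cut_all=False)))

print(labels, contents)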
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch  # sklearn.utils.Bunch in newer scikit-learn

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask

datasets = load_mask_images()

data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
        ):
    # rect: (min_x, min_y, max_x, max_y), matching the description below
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))

# Collect the rects and metadata into a Bunch and persist it gzipped
bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
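# Reading the persisted Bunch back is the mirror of the dump above
# (a usage sketch; 'rects.pkl.gz' is the file written by this script):
import gzip
import pickle

with gzip.open('rects.pkl.gz', 'rb') as f:
    rects = pickle.load(f)

print(rects.description)
print(rects.data.shape)  # one (min_x, min_y, max_x, max_y) row per image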