def split_train_test(new_databunch, split_value):
    """Split a data bunch into train/test parts, holding out `split_value` of it for testing."""
    length = len(new_databunch.contents)           # total number of samples
    split_point = int((1 - split_value) * length)  # index where the test split starts
    trainbunch = Bunch()
    testbunch = Bunch()
    trainbunch.contents = new_databunch.contents[:split_point]
    testbunch.contents = new_databunch.contents[split_point:]
    trainbunch.accu = new_databunch.accu[:split_point]
    testbunch.accu = new_databunch.accu[split_point:]
    return trainbunch, testbunch
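# Usage sketch (not from the original source): build a tiny Bunch with parallel
# `contents` and `accu` lists and hold out 30% of it as the test split. The demo
# data below is made up purely for illustration.
_demo = Bunch()
_demo.contents = ['doc one', 'doc two', 'doc three', 'doc four', 'doc five']
_demo.accu = ['A', 'B', 'A', 'B', 'A']
_train, _test = split_train_test(_demo, split_value=0.3)
print(len(_train.contents), len(_test.contents))  # 3 2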
def tfidfspace(bunch_file, tfidf_file, train_bunch_file=None):
    tfidfbunch = Bunch(label=[], contents=[], tdm=[], vocabulary={})
    # Load the bunch from bunch_file and copy its labels and contents into tfidfbunch.
    with open(bunch_file, "rb") as f:
        bunch = pickle.load(f)
    tfidfbunch.label = bunch.label
    tfidfbunch.contents = bunch.contents
    if train_bunch_file is None:
        # Training data: fit the TF-IDF space and keep the fitted vocabulary.
        vectorizer = TfidfVectorizer(max_df=0.4, sublinear_tf=True)
        tfidfbunch.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfbunch.vocabulary = vectorizer.vocabulary_
    else:
        # Test data: reuse the training vocabulary so both sets share the same feature space.
        with open(train_bunch_file, "rb") as f:
            train_bunch = pickle.load(f)
        tfidfbunch.vocabulary = train_bunch.vocabulary
        vectorizer = TfidfVectorizer(max_df=0.4, sublinear_tf=True,
                                     vocabulary=train_bunch.vocabulary)
        tfidfbunch.tdm = vectorizer.fit_transform(bunch.contents)
    # Write tfidfbunch to tfidf_file.
    with open(tfidf_file, "wb") as f:
        pickle.dump(tfidfbunch, f)
    # Persist the fitted TF-IDF vectorizer.
    joblib.dump(vectorizer, TFIDF_FILE)
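# Usage sketch (not from the original source; TRAIN_BUNCH, TRAIN_TFIDF, TEST_BUNCH and
# TEST_TFIDF are hypothetical path names): the first call fits the vocabulary on the
# training bunch, and the second call reuses that vocabulary so the train and test
# matrices share the same feature columns. TFIDF_FILE (the joblib dump target above)
# is assumed to be a module-level constant defined elsewhere in this project.
#
#   tfidfspace(TRAIN_BUNCH, TRAIN_TFIDF)
#   tfidfspace(TEST_BUNCH, TEST_TFIDF, train_bunch_file=TRAIN_TFIDF)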
            elif children.tag == 'contenttitle':
                contenttitle = children.text
            elif children.tag == 'content':
                content = str(contenttitle) + ' ' + str(children.text)
                if len(content) > 0:
                    # Segment the title + body with jieba before storing it.
                    seg = jieba.cut(content, cut_all=False)
                    bunch.contents.append(' '.join(seg))
                else:
                    bunch.contents.append('null')
    print('finish train file:', filePath)

fileutils.saveBatchObj(trainRawPath, bunch)

# Parse all test data and save it to a bunch.
bunch.label = []
bunch.filenames = []
bunch.contents = []
contenttitle = ''
for file in os.listdir(testDataPath):
    filePath = testDataPath + os.sep + file
    if os.path.isdir(filePath):
        print(file, ' is dir. continue')
        continue
    with open(filePath, 'r') as f:
        text = f.read()
    # Strip control characters, '|' and '&' so the text parses as well-formed XML.
    text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text)
    root = ET.fromstring(text)
    for child in root:           # second-level nodes; descend into the third level of the XML document
        for children in child:   # third-level nodes (tag name and attributes)
            bunch.filenames.append(filePath)
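# Sketch of how the test-file loop presumably continues (this part is not shown in the
# excerpt above): the inner branches would mirror the training branches, and
# `testRawPath` is a hypothetical output path, named by analogy with `trainRawPath`.
#
#             if children.tag == 'contenttitle':
#                 contenttitle = children.text
#             elif children.tag == 'content':
#                 content = str(contenttitle) + ' ' + str(children.text)
#                 if len(content) > 0:
#                     seg = jieba.cut(content, cut_all=False)
#                     bunch.contents.append(' '.join(seg))
#                 else:
#                     bunch.contents.append('null')
#     print('finish test file:', filePath)
# fileutils.saveBatchObj(testRawPath, bunch)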