#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2019-01-10 11:48:03
# @Author : anxi.xue ([email protected])
# @Link : ${link}
# @Version : $Id$

import os
import sys
sys.path.append('..')
from datautils import fileutils

# Data locations, all derived from the directory returned by
# fileutils.getDataPath(); `root` always ends with a path separator.
root = fileutils.getDataPath() + os.sep
trainDataPath = root + os.sep.join(('news', 'train'))
testDataPath = root + os.sep.join(('news', 'test'))


def addStartEndTagForData(folderPath):
    """Wrap every regular file directly inside *folderPath* in start tags.

    Each file is opened in read/write text mode and, unless its content
    already begins with '<start>' or '</start>', is rewritten in place as::

        <start>
        ...original content...
        </start>

    Sub-directories are skipped; the folder is not walked recursively.

    Args:
        folderPath: Path of the directory whose files should be tagged.
    """
    # NOTE(review): the '</start>' prefix test mirrors the original logic;
    # it may have been intended as an `endswith` check -- confirm.
    taggedPrefixes = ('<start>', '</start>')

    for entryName in os.listdir(folderPath):
        entryPath = os.sep.join((folderPath, entryName))
        if os.path.isdir(entryPath):
            continue

        with open(entryPath, 'r+') as handle:
            text = handle.read()
            # Rewind so the wrapped text overwrites the file from the top.
            handle.seek(0, 0)
            if text.startswith(taggedPrefixes):
                continue
            handle.write('<start>\n' + text + '\n</start>')


# Transcode the original source material
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2018-12-29 18:09:10
# @Author : anxi.xue ([email protected])
# @Link : ${link}
# @Version : $Id$

import os
import sys
sys.path.append('..')
from datautils import fileutils
from lxml import etree, html

# Earlier experiment, kept for reference: load an HTML page and print its
# text content using lxml.
# path = rootdatas.getDataPath() + os.sep + "a_pyplot.html"
# file = open(path, 'rb')
# content = file.read()
# file.close()
# page = html.document_fromstring(content)
# text = page.text_content()
# print(text)

# Read the whole of <data path>/news/test.txt and echo it to stdout.
path = fileutils.getDataPath() + os.sep + "news" + os.sep + 'test.txt'

# The context manager guarantees the handle is closed even if read() raises,
# which the previous open()/read()/close() sequence did not.
with open(path, 'r') as file:
    content = file.read()

print(content)