Beispiel #1
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2019-01-10 11:48:03
# @Author  : anxi.xue ([email protected])
# @Link    : ${link}
# @Version : $Id$

import os
import sys
sys.path.append('..')
from datautils import fileutils

# Data root (with a trailing separator) as reported by the project's
# fileutils helper; the news train/test locations are built beneath it.
root = "".join([fileutils.getDataPath(), os.sep])
trainDataPath = "".join([root, 'news', os.sep, "train"])
testDataPath = "".join([root, 'news', os.sep, "test"])


def addStartEndTagForData(folderPath):
    """Wrap every plain file directly inside ``folderPath`` in start/end tags.

    Each non-directory entry is opened for in-place update. Unless its text
    already begins with ``<start>`` or ``</start>``, the file is rewritten
    from offset 0 as ``<start>``, a newline, the previous text, a newline
    and ``</start>``. Subdirectories are skipped, not descended into.

    Args:
        folderPath: Directory whose entries are processed.
    """
    skipPrefixes = ('<start>', '</start>')
    for entryName in os.listdir(folderPath):
        entryPath = os.sep.join((folderPath, entryName))
        if os.path.isdir(entryPath):
            continue
        with open(entryPath, 'r+') as handle:
            original = handle.read()
            if original.startswith(skipPrefixes):
                continue
            # Rewind so the wrapped text overwrites the file from the start;
            # the new text is a superset of the old, so nothing stale remains.
            handle.seek(0, 0)
            handle.write('<start>\n' + original + '\n</start>')


# Transcode the original source material
Beispiel #2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-12-29 18:09:10
# @Author  : anxi.xue ([email protected])
# @Link    : ${link}
# @Version : $Id$

import os
import sys
sys.path.append('..')
from datautils import fileutils
from lxml import etree, html

# path = rootdatas.getDataPath() + os.sep + "a_pyplot.html"
# file = open(path, 'rb')
# content = file.read()
# file.close()

# page = html.document_fromstring(content)
# text = page.text_content()
# print(text)


# Read the sample news text file under the project's data directory and echo
# it to stdout.
path = fileutils.getDataPath() + os.sep + "news" + os.sep + \
    'test.txt'
# A context manager guarantees the handle is closed even if read() raises;
# the original open/read/close sequence leaked it on error.
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm that matches how test.txt is encoded.
with open(path, 'r') as file:
    content = file.read()
print(content)