def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path;
                                        # each subdirectory name is a category name
    # print("Segmenting... Please wait.")
    # Process all files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"
        seg_dir = seg_path + mydir + "/"
        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # All texts of one category in the unsegmented corpus
        # Traverse and process every file in the category directory
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    print("Segmenting like crazy...")
    # Process all files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # Path of the category subdirectory
        seg_dir = seg_path + mydir + "/"  # Corresponding output directory for the segmented texts
        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # All texts of one category in the unsegmented corpus
        for file_path in file_list:  # Traverse every file in the category directory
            fullname = class_path + file_path  # Full file path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # Read the file content
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented-corpus directory
    print("Chinese corpus segmentation finished!")
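A minimal usage sketch for the function above. The directory names are illustrative, and readfile is assumed to live in the same Tools module as savefile; only savefile is confirmed to come from Tools elsewhere in the project.

import os
import jieba
from Tools import readfile, savefile  # assumed helpers; readfile is not shown in the source

if __name__ == "__main__":
    # Illustrative paths: raw category folders in, space-separated tokens out.
    corpus_segment("./train_corpus/", "./train_corpus_seg/")
    corpus_segment("./test_corpus/", "./test_corpus_seg/")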
def predicted_classifier():
    corpus_path = "./upload_corpus/"
    content = request.args.get('content', '')  # Text to classify, passed as a GET parameter
    hl = md5()
    hl.update(content.encode(encoding='utf-8'))
    filename = hl.hexdigest()  # Use the MD5 hash of the content as the file name
    savefile(corpus_path + filename + ".txt", content.encode('utf-8'))  # Store the uploaded text
    data = dict(class_type=classifier(filename))  # Run the classifier on the stored file
    resp = jsonify(data)  # Return the predicted class as JSON
    return resp
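predicted_classifier reads the text from a GET parameter, stores it under an MD5-derived file name, and returns the predicted class as JSON. A minimal sketch of how it might be wired into a Flask app; the app object, the /predict route, and the port are assumptions, and classifier() is the project's own pipeline, not shown here.

from hashlib import md5
from flask import Flask, request, jsonify

app = Flask(__name__)

# Register the view defined above under an assumed route name.
app.add_url_rule('/predict', 'predicted_classifier', predicted_classifier)

# Example call once the server is running (hypothetical URL):
#   curl "http://127.0.0.1:5000/predict?content=今天股市大涨"

if __name__ == "__main__":
    app.run(debug=True)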
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    '''
    The subdirectory names are the category names.
    In train_corpus/it/21.txt, 'train_corpus/' is corpus_path and 'it' is one member of catelist.
    '''
    print("Segmenting like crazy...")
    # Process all files under each directory (category)
    for mydir in catelist:
        '''
        Here mydir is the 'it' in train_corpus/it/21.txt (i.e. one category in catelist)
        '''
        class_path = corpus_path + mydir + "/"  # Path of the category subdirectory, e.g. train_corpus/it/
        seg_dir = seg_path + mydir + "/"  # Corresponding output directory for the segmented texts, e.g. train_corpus_seg/it/
        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # All texts of one category in the unsegmented corpus
        '''
        21.txt, 22.txt, 23.txt ... in train_corpus/it/
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # Traverse every file in the category directory
            fullname = class_path + file_path  # Full file path, e.g. train_corpus/it/21.txt
            content = readfile(fullname)  # Read the file content
            '''At this point content holds every character of the original text, including extra
            spaces, blank lines, carriage returns, etc. Next we strip out these irrelevant characters,
            leaving compact text separated only by punctuation.
            '''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented-corpus directory
    print("Chinese corpus segmentation finished!")
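readfile and savefile come from the project's Tools module, which is not shown. A plausible minimal implementation, assuming they work on raw bytes (which matches the .encode('utf-8') calls above), would look like this; it is a sketch, not the project's actual code.

def readfile(path):
    """Read a file and return its raw bytes."""
    with open(path, "rb") as fp:
        return fp.read()

def savefile(savepath, content):
    """Write raw bytes to a file, e.g. the space-joined segmentation result."""
    with open(savepath, "wb") as fp:
        fp.write(content)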
def corpus_segment(filename):  # Segment an uploaded file
    corpus_path = "./upload_corpus/" + filename + ".txt"   # Path of the unsegmented upload
    seg_path = "./upload_corpus_seg/" + filename + ".txt"  # Path of the segmented output
    '''
    corpus_path is the path of the unsegmented text
    seg_path is the path where the segmented text is stored
    '''
    content = readfile(corpus_path)  # Read the file content
    '''At this point content holds every character of the original text, including extra
    spaces, blank lines, carriage returns, etc. Next we strip out these irrelevant characters,
    leaving compact text separated only by punctuation.
    '''
    content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
    content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
    content_seg = jieba.cut(content)  # Segment the file content
    savefile(seg_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented directory
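For reference, jieba.cut returns a generator of tokens, and joining it with spaces produces the space-separated text that gets written out above. A quick standalone check; the sample sentence is arbitrary.

import jieba

sample = "自然语言处理很有趣"  # arbitrary sample sentence
tokens = jieba.cut(sample)     # generator of segmented words
print(" ".join(tokens))        # e.g. "自然语言 处理 很 有趣" (exact split depends on jieba's dictionary)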
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the corpus before segmentation
    seg_path is the path of the corpus after segmentation
    '''
    catelist = os.listdir(corpus_path)
    '''
    catelist records all the folder names in corpus_path,
    e.g. 'art', 'literature', 'education', ...
    '''
    print("jieba is working...")
    # Process the files under each folder
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):  # Create the segmented-output directory, e.g. train_corpus_seg/art/
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)
        for file_path in file_list:  # Visit every file in file_list
            fullname = class_path + file_path  # Full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # Read the .txt file
            '''Delete whitespace, empty lines and carriage returns'''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete whitespace
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file to seg_path
    print("The segmentation is finished!")
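The two replace calls above operate on bytes, which is why both the search patterns and the replacements are encoded to UTF-8 first. A small standalone illustration with an arbitrary sample:

raw = "第一段  文本\r\n第二段 文本\r\n".encode("utf-8")  # bytes, as returned by readfile
cleaned = raw.replace("\r\n".encode("utf-8"), "".encode("utf-8")).strip()   # drop line breaks
cleaned = cleaned.replace(" ".encode("utf-8"), "".encode("utf-8")).strip()  # drop spaces
print(cleaned.decode("utf-8"))  # prints "第一段文本第二段文本"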
"""
@file: fetch.py
@time: 2018/8/4 16:12
@software: PyCharm
"""
from bs4 import BeautifulSoup
import requests
import os
from Tools import savefile

# Crawl a corpus of financial news articles
corpus_path = "./train_corpus/C2-Financial/C2-Financial"
count = 0
start = 338000
end = 338100
for article_id in range(start, end):
    url = 'https://wallstreetcn.com/articles/' + str(article_id)
    res = requests.get(url)
    html = res.text
    """
    soup holds the parsed HTML content
    "html.parser" is the parser used for parsing
    """
    soup = BeautifulSoup(html, "html.parser")
    soup_content = soup.find_all("div", class_="node-article-content")
    if soup_content:
        text = soup_content[0].get_text()
        count += 1
        file_path = corpus_path + str(count) + '.txt'
        savefile(file_path, text.encode('utf-8'))
        print(text)
        print(file_path)
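Requesting 100 consecutive article IDs in a tight loop can fail on network errors and offers no pause between hits. A gentler variant of the request step is sketched below; the one-second delay and the error handling are additions for illustration, not part of the original script.

import time
import requests

def fetch_article(url, delay=1.0):
    """Fetch one article page, returning its HTML text or None on failure."""
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        return res.text
    except requests.RequestException:
        return None
    finally:
        time.sleep(delay)  # pause between requests (added here, not in the original script)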