def execute_from_command_line():
    """Concatenate the text of the numbered HTML files into ``all.txt``.

    Looks for ``1.html``, ``2.html``, ... inside the hard-coded ``yahoo_dir``
    directory, stopping at the first number for which no file exists. Each
    file's content is passed through ``html2txt`` and the result is appended
    to ``all.txt`` in that same directory (the output file is truncated on
    every run).

    No encoding detection or conversion is performed here; the raw file
    content is handed to ``html2txt`` as read.
    """
    yahoo_dir = 'J:\\yahoo_data\\'
    # `with` guarantees the output handle is closed even if a read,
    # a conversion, or a write fails part-way through the batch.
    with open(yahoo_dir + 'all.txt', 'w') as yahoo_txt_file:
        start = 1
        while 1:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # First gap in the numbering ends the batch.
                break
            with open(filename, 'r') as fp:
                htmltxt = fp.read()
            yahoo_txt_file.write(html2txt(htmltxt))
            # Single-argument print(...) emits the same text as the
            # Python 2 print statement.
            print('Success change html to txt')
            start += 1
def html_to_txt():
    """Merge the numbered HTML files into one text file encoded as UTF-8/ASCII.

    Looks for ``<yahoo_dir>1.html``, ``<yahoo_dir>2.html``, ... and stops at
    the first number for which no file exists. For each file the encoding is
    detected with ``chardet``; content that is not already reported as
    ``utf-8`` or ``ascii`` is re-encoded to UTF-8. The content is then passed
    through ``html2txt`` and appended to the file at ``yahoo_txt`` (truncated
    on every run). Empty HTML files are skipped.

    NOTE(review): another ``html_to_txt`` is defined later in this file; the
    later definition is the one bound to the name after import.
    """
    # `with` guarantees the output handle is closed even if detection,
    # conversion, or a write fails part-way through the batch.
    with open(yahoo_txt, 'w') as ft:
        start = 1
        while 1:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # First gap in the numbering ends the batch.
                break
            with open(filename, 'r') as fp:
                htmltxt = fp.read()
            if not htmltxt:
                # Advance past the empty file before skipping it; a bare
                # `continue` here would re-open the same file forever.
                start += 1
                continue
            # Encoding as detected before any conversion.
            codedetect = chardet.detect(htmltxt)["encoding"]
            print(codedetect)
            if codedetect not in ('utf-8', 'ascii'):
                # NOTE(review): chardet can presumably report None for
                # undetectable input, in which case this decode raises --
                # confirm the desired handling.
                htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
                # Encoding as detected after the conversion.
                codedetect = chardet.detect(htmltxt)["encoding"]
                print('change %s' % codedetect)
            ft.write(html2txt(htmltxt))
            print('Success change html to txt %s' % start)
            start += 1
def html_to_txt():
    """Merge the numbered HTML files into one text file encoded as UTF-8/ASCII.

    Looks for ``<yahoo_dir>1.html``, ``<yahoo_dir>2.html``, ... and stops at
    the first number for which no file exists. For each file the encoding is
    detected with ``chardet``; content that is not already reported as
    ``utf-8`` or ``ascii`` is re-encoded to UTF-8. The content is then passed
    through ``html2txt`` and appended to the file at ``yahoo_txt`` (truncated
    on every run). Empty HTML files are skipped.

    NOTE(review): this redefines an ``html_to_txt`` that appears earlier in
    this file; this later definition is the one bound to the name after
    import.
    """
    # The context manager closes the merged output file on every exit path,
    # including an exception raised while processing one of the inputs.
    with open(yahoo_txt, 'w') as ft:
        start = 1
        while 1:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # The sequence of numbered files ends at the first missing one.
                break
            with open(filename, 'r') as fp:
                htmltxt = fp.read()
            if not htmltxt:
                # The counter must move on before skipping an empty file,
                # otherwise the loop retries the same filename indefinitely.
                start += 1
                continue
            # Detected encoding of the raw content.
            codedetect = chardet.detect(htmltxt)["encoding"]
            print(codedetect)
            if codedetect not in ('utf-8', 'ascii'):
                # NOTE(review): if chardet reports None (presumably possible
                # for undetectable input) this decode raises -- confirm
                # whether such files should be skipped or passed through.
                htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
                # Detected encoding after re-encoding to UTF-8.
                codedetect = chardet.detect(htmltxt)["encoding"]
                print('change %s' % codedetect)
            ft.write(html2txt(htmltxt))
            print('Success change html to txt %s' % start)
            start += 1