def main():
    """Scrape every file matching ``tmp/*`` into records grouped by category.

    Each file is read, decoded as BIG5 (undecodable bytes are ignored) and
    parsed with ``etree.HTML``.  From the third ``<table>`` directly under
    ``<body>`` the function collects:

    * the ``<th>`` texts of the first table nested inside its first nested
      table -- the first text is the title, the rest are joined with ``,``
      as the alias;
    * the texts of all ``<p>`` elements, with empty and whitespace-only
      pieces dropped, concatenated behind a leading '【概述】' marker.

    The category is the comma-joined link text found under
    ``div#divStayTopRight`` / ``font.header_s``.  Missing titles and
    categories fall back to the string ``'None'`` and are reported on
    stdout together with the file path.

    Returns:
        A ``defaultdict(list)`` mapping each category string to a list of
        ``defaultdict(str)`` records with the keys ``'title'``, ``'alias'``
        and ``'content'``.  Every stored string has been passed through
        ``ftoj`` (defined elsewhere in this file).

    Raises:
        IndexError: if a page lacks the expected table nesting.
    """
    # NOTE(review): as written, the character searched for and its
    # replacement look identical; possibly one of them was a full-width
    # colon originally -- confirm against the upstream file.
    colon = ':'.decode('utf8')

    def clean(text):
        # Shared normalisation applied to every stored field.
        return ftoj(text).replace(':', colon)

    cats = defaultdict(list)
    for file_path in glob.glob('tmp/*'):
        with open(file_path) as f:
            raw = f.read()
            try:
                html = raw.decode('BIG5', 'ignore')
            except:
                # Name the offending file before letting the error propagate.
                print(file_path)
                raise

            page = etree.HTML(html)
            main_table = page.xpath('//body/table')[2]
            inner_table = main_table.xpath('.//table')[0].xpath('.//table')[0]
            headings = inner_table.xpath('.//th//text()')

            # The first heading is the regular title, any others are aliases
            # (joining an empty slice yields '' when there are none).
            alias = ','.join(headings[1:])
            title = headings[0] if headings else 'None'

            paragraphs = main_table.xpath('.//p/text()')
            links = page.xpath(
                '//div[@id="divStayTopRight"]//font[@class="header_s"]/a/text()'
            )
            cat = clean(','.join(links) or 'None')

            # Keep only text nodes that are non-empty and not pure whitespace.
            kept = [p for p in paragraphs if unicode(p) and p.strip()]
            body = ''.join(['【概述】'.decode('utf8')] + kept)

            # Splitting the text into one field per '【...】' heading was
            # tried earlier and dropped on request; everything is stored as
            # a single 'content' string instead.
            record = defaultdict(str)
            record['title'] = clean(title)
            record['alias'] = clean(alias)
            record['content'] = clean(body).replace('\n', '').replace('\r', '')
            cats[cat].append(record)

            # Single-argument form: prints the same text whether ``print``
            # is the statement or the function.
            if cat == 'None':
                print('no cat' + ' ' + file_path)
            if title == 'None':
                print('no title' + ' ' + file_path)
    return cats
def main():
    """Build a category -> records mapping from the pages stored in ``tmp/``.

    For every path matched by ``tmp/*`` the file content is decoded as BIG5
    (ignoring undecodable bytes), parsed with ``etree.HTML`` and reduced to
    one record holding ``"title"``, ``"alias"`` and ``"content"``:

    * title / alias come from the ``<th>`` texts of the first table nested
      in the first table nested in the third ``<body>``-level table; the
      first text is the title and the remaining ones, comma-joined, are
      the alias;
    * content is the concatenation of that third table's ``<p>`` texts
      (empty and whitespace-only ones removed), prefixed with "【概述】"
      and stripped of ``\\n`` and ``\\r``.

    The category is the comma-joined ``<a>`` text below
    ``div#divStayTopRight`` / ``font.header_s``; ``"None"`` stands in for
    a missing title or category, and such files are reported on stdout.

    Returns:
        ``defaultdict(list)`` keyed by category; each value is a list of
        ``defaultdict(str)`` records.  All stored strings went through
        ``ftoj`` (defined elsewhere in this file).

    Raises:
        IndexError: when a page does not have the expected tables.
    """
    grouped = defaultdict(list)
    for file_path in glob.glob("tmp/*"):
        with open(file_path) as handle:
            data = handle.read()
            try:
                data = data.decode("BIG5", "ignore")
            except:
                # Show which file broke, then re-raise unchanged.
                print(file_path)
                raise

            tree = etree.HTML(data)
            outer = tree.xpath("//body/table")[2]
            names = outer.xpath(".//table")[0].xpath(".//table")[0].xpath(".//th//text()")

            # First entry is the normal title, everything after it is an alias.
            alias = ",".join(names[1:]) if len(names) > 1 else ""
            title = names[0] if names else "None"

            texts = outer.xpath(".//p/text()")

            cat = ",".join(
                tree.xpath('//div[@id="divStayTopRight"]//font[@class="header_s"]/a/text()')
            ) or "None"
            # NOTE(review): the searched and the replacement character look
            # identical here; possibly one was a full-width colon -- confirm.
            cat = ftoj(cat).replace(":", ":".decode("utf8"))

            # Two passes, as before: drop empty nodes, then blank ones.
            texts = [t for t in texts if unicode(t)]
            texts = [t for t in texts if t.strip()]
            content = "".join(["【概述】".decode("utf8")] + texts)

            # A per-section split keyed on the "【...】" headings used to
            # live here; it was not wanted, so the text stays in one piece.
            entry = defaultdict(str)
            for key, value in (("title", title), ("alias", alias)):
                entry[key] = ftoj(value).replace(":", ":".decode("utf8"))
            content = ftoj(content).replace(":", ":".decode("utf8"))
            entry["content"] = content.replace("\n", "").replace("\r", "")
            grouped[cat].append(entry)

            # One-argument print: same output as statement or function.
            if cat == "None":
                print("no cat" + " " + file_path)
            if title == "None":
                print("no title" + " " + file_path)
    return grouped
def has_cn(txt):
    """Tell whether *txt* contains a dominant share of CN_CHAR matches.

    Args:
        txt: UTF-8 encoded byte string, or text that is already decoded.
            Byte strings are decoded with errors ignored; decoded text is
            used as is.

    Returns:
        bool: True when, after passing the text through ``ftoj``, there are
        at least 3 ``CN_CHAR`` matches and they outnumber the ``JP_CHAR``
        matches by more than a factor of 5; False otherwise.

    ``ftoj``, ``CN_CHAR`` and ``JP_CHAR`` are defined elsewhere in this
    file (presumably a text converter and two compiled regexes -- confirm).
    """
    # Decode only real byte strings: calling .decode() on text that is
    # already unicode makes Python 2 encode it as ASCII first, which fails
    # on any non-ASCII character.
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8', 'ignore')
    txt = ftoj(txt)
    cn = len(CN_CHAR.findall(txt))
    jp = len(JP_CHAR.findall(txt))
    # Always hand back a real bool instead of True / implicit None.
    return cn >= 3 and cn > jp * 5
def has_cn(txt):
    """Report whether CN_CHAR matches clearly dominate *txt*.

    Args:
        txt: either a UTF-8 byte string (decoded here, bad bytes ignored)
            or already-decoded text (left untouched).

    Returns:
        bool: True if the ``ftoj``-converted text has 3 or more ``CN_CHAR``
        matches and more than five times as many of them as ``JP_CHAR``
        matches; False in every other case.

    ``ftoj``, ``CN_CHAR`` and ``JP_CHAR`` come from elsewhere in this file
    (presumably a text converter and two compiled regexes -- confirm).
    """
    # Guard the decode: on Python 2, unicode.decode() implicitly encodes
    # to ASCII first and raises for non-ASCII input.
    if isinstance(txt, bytes):
        txt = txt.decode("utf-8", "ignore")
    txt = ftoj(txt)
    cn = len(CN_CHAR.findall(txt))
    jp = len(JP_CHAR.findall(txt))
    # Explicit boolean result rather than True / fall-through None.
    return cn >= 3 and cn > jp * 5