Esempio n. 1
0
def main():
    files = glob.glob('tmp/*')
    cats = defaultdict(list)
    for file_path in files:
        with open(file_path) as f:
            a = defaultdict(str)
            content = f.read()
            try:
                content = content.decode('BIG5', 'ignore')
            except:
                print file_path
                raise

            parser = etree.HTML(content)
            it = parser.xpath('//body/table')[2]
            title = it.xpath('.//table')[0].xpath('.//table')[0].xpath(
                './/th//text()')
            #first is normal, else alias
            if len(title) > 1:
                alias = ','.join(title[1:])
            else:
                alias = ''
            title = title[0] if title else 'None'

            ps = it.xpath('.//p/text()')
            cat = parser.xpath(
                '//div[@id="divStayTopRight"]//font[@class="header_s"]/a/text()'
            )
            cat = ','.join(cat) or 'None'
            cat = ftoj(cat).replace(':', ':'.decode('utf8'))

            ps = filter(unicode, ps)
            ps = filter(lambda x: x.strip(), ps)

            ps.insert(0, '【概述】'.decode('utf8'))
            ps = ''.join(ps)

            # 分段的..她不要
            #current_key = '【概述】'.decode('utf8')
            #for i in ps:
            #    if '【'.decode('utf8') in i:
            #        current_key = ftoj(i)
            #        continue
            #    a[current_key] += ftoj(i)

            a['title'] = ftoj(title).replace(':', ':'.decode('utf8'))
            a['alias'] = ftoj(alias).replace(':', ':'.decode('utf8'))
            a['content'] = ftoj(ps).replace(':', ':'.decode('utf8')).replace(
                '\n', '').replace('\r', '')

            cats[cat].append(a)
            if cat == 'None':
                print 'no cat', file_path
            if title == 'None':
                print 'no title', file_path
    return cats
Esempio n. 2
0
def main():
    files = glob.glob("tmp/*")
    cats = defaultdict(list)
    for file_path in files:
        with open(file_path) as f:
            a = defaultdict(str)
            content = f.read()
            try:
                content = content.decode("BIG5", "ignore")
            except:
                print file_path
                raise

            parser = etree.HTML(content)
            it = parser.xpath("//body/table")[2]
            title = it.xpath(".//table")[0].xpath(".//table")[0].xpath(".//th//text()")
            # first is normal, else alias
            if len(title) > 1:
                alias = ",".join(title[1:])
            else:
                alias = ""
            title = title[0] if title else "None"

            ps = it.xpath(".//p/text()")
            cat = parser.xpath('//div[@id="divStayTopRight"]//font[@class="header_s"]/a/text()')
            cat = ",".join(cat) or "None"
            cat = ftoj(cat).replace(":", ":".decode("utf8"))

            ps = filter(unicode, ps)
            ps = filter(lambda x: x.strip(), ps)

            ps.insert(0, "【概述】".decode("utf8"))
            ps = "".join(ps)

            # 分段的..她不要
            # current_key = '【概述】'.decode('utf8')
            # for i in ps:
            #    if '【'.decode('utf8') in i:
            #        current_key = ftoj(i)
            #        continue
            #    a[current_key] += ftoj(i)

            a["title"] = ftoj(title).replace(":", ":".decode("utf8"))
            a["alias"] = ftoj(alias).replace(":", ":".decode("utf8"))
            a["content"] = ftoj(ps).replace(":", ":".decode("utf8")).replace("\n", "").replace("\r", "")

            cats[cat].append(a)
            if cat == "None":
                print "no cat", file_path
            if title == "None":
                print "no title", file_path
    return cats
Esempio n. 3
0
def has_cn(txt):
    """Report whether ``CN_CHAR`` matches clearly outnumber ``JP_CHAR`` matches.

    Args:
        txt: The text to inspect.  A byte string is decoded as UTF-8 with
            undecodable bytes ignored; text that is already decoded is used
            as is.

    Returns:
        True when, after passing the text through ``ftoj``, there are at
        least three ``CN_CHAR`` matches and more than five times as many of
        them as ``JP_CHAR`` matches; False otherwise.
    """
    # Decode only byte strings: calling .decode() on already-decoded text
    # makes Python 2 implicitly ASCII-encode it first, which raises
    # UnicodeEncodeError for any non-ASCII character.
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8', 'ignore')
    txt = ftoj(txt)
    cn = len(CN_CHAR.findall(txt))
    jp = len(JP_CHAR.findall(txt))
    # Always return a bool instead of falling through to an implicit None.
    return cn >= 3 and cn > jp * 5
Esempio n. 4
0
def has_cn(txt):
    """Report whether ``CN_CHAR`` matches clearly outnumber ``JP_CHAR`` matches.

    Args:
        txt: The text to inspect.  A byte string is decoded as UTF-8 with
            undecodable bytes ignored; text that is already decoded is used
            as is.

    Returns:
        True when, after passing the text through ``ftoj``, there are at
        least three ``CN_CHAR`` matches and more than five times as many of
        them as ``JP_CHAR`` matches; False otherwise.
    """
    # Decode only byte strings: calling .decode() on already-decoded text
    # makes Python 2 implicitly ASCII-encode it first, which raises
    # UnicodeEncodeError for any non-ASCII character.
    if isinstance(txt, bytes):
        txt = txt.decode("utf-8", "ignore")
    txt = ftoj(txt)
    cn = len(CN_CHAR.findall(txt))
    jp = len(JP_CHAR.findall(txt))
    # Always return a bool instead of falling through to an implicit None.
    return cn >= 3 and cn > jp * 5