Example #1
def parse_all(fnames=None, renew=False, proxy=None):
    """
    批量解析页面
    :param fnames:
    :param renew 是否重新解析所有文件
    :return:
    """
    so = SQLiteOper("data/scrap.db")
    if renew:
        # re-parse every downloaded page and drop the previously stored rows
        fnames = glob.glob(r'data/secwiki/*.html')
        so.execute('delete from `secwiki_detail`')

    if not fnames:
        print "no new secwiki pages to parse"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
                            values(?,?,?,?,?,?,?);"""

    # file handler

    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(
            result_fname) and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')
    for k in nos.keys():
        fname = nos[k]

        with open(fname, mode='r') as html_hd:
            results_list = {}
            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    key = content[0] + content[2]

                    results_list[key] = content

                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
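Both examples rely on a small SQLiteOper wrapper (along with project helpers such as parse_item, sort_fname, scrap_item, and path) that is not shown on this page. As a rough sketch only, assuming the wrapper simply forwards to the standard sqlite3 module with the keyword names seen above, it might look like this:

import sqlite3


class SQLiteOper(object):
    # Hypothetical sketch of the SQLite helper the examples assume;
    # the project's real implementation may differ.

    def __init__(self, db_path):
        # open (or create) the database file used by the scraper
        self.conn = sqlite3.connect(db_path)

    def execute(self, sql, args=None):
        # run a single statement, e.g. the `delete from ...` issued when renew=True
        cur = self.conn.cursor()
        cur.execute(sql, args or ())
        self.conn.commit()

    def executemany(self, sql, operate_list=None):
        # bulk-insert parsed rows; `operate_list` mirrors the keyword
        # argument used in the examples above
        cur = self.conn.cursor()
        cur.executemany(sql, operate_list or [])
        self.conn.commit()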
Example #2
def parse_all(renew=False, ndays=None, proxy=None):
    """
    解析多个页面
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # parse the pages for the requested days, scraping any that are missing locally
    fname_lists = []
    if ndays is not None:
        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]
            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname):
                fname = scrap_item(cur_day)
                if fname is None:
                    print "%s news does not exist" % cur_day
                else:
                    fname_lists.append(fname)

    if renew:
        # re-parse every page already on disk and drop the previously stored rows
        so.execute('delete from `xuanwu_detail`')
        fname_lists = glob.glob(r'data/xuanwu/*/*/*/index.html')

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)
        sql = """
                    insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
                        values(?,?,?,?,?,?,?,?);
                    """
        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(start=start,
                                                                   end=end))

        if not renew and os.path.isfile(
                result_fname) and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:

            fname = path(fname)

            results_list = {}
            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]

                    results_list[k] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())

        result_fh.close()
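From the date slicing above, ndays is evidently an iterable of YYYYMMDD strings. A hedged usage sketch (the seven-day window is only an illustration, not part of the original project):

from datetime import date, timedelta

# parse the last seven days, scraping any index.html that is missing locally
today = date.today()
ndays = [(today - timedelta(days=i)).strftime("%Y%m%d") for i in range(7)]
parse_all(ndays=ndays)

# or re-parse everything already downloaded
parse_all(renew=True)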