import codecs
import glob
import os

# SQLiteOper, path, sort_fname, parse_item, scrap_item and getstartendfrompath
# are this project's own helpers, assumed imported from the repo's other modules.


def parse_all(fnames=None, renew=False, proxy=None):
    """
    Parse secwiki pages in batch.
    :param fnames: list of cached HTML files to parse
    :param renew: if True, re-parse every cached file and rebuild the table
    :param proxy: optional proxy handed through to parse_item
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    if renew:
        fnames = []
        for f in glob.iglob(r'data/secwiki/*.html'):
            fnames.append(f)
        # clear previously parsed rows once, before re-inserting everything
        so.execute('delete from `secwiki_detail`')

    if fnames is None:
        print "no new secwiki"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
             values(?,?,?,?,?,?,?);"""

    # file handler
    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(result_fname) \
            and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')

    for no in nos.keys():
        fname = nos[no]

        with open(fname, mode='r') as html_hd:
            results_list = {}

            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    # de-duplicate on timestamp + url; a distinct name here
                    # avoids shadowing the outer loop variable
                    key = content[0] + content[2]
                    results_list[key] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
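
# A minimal usage sketch, assuming this function lives in the secwiki parser
# module and runs from the repo root with data/scrap.db and the data/secwiki
# cache in place; the file names below are hypothetical.
def _demo_parse_secwiki():
    # rebuild secwiki_detail from every cached page
    parse_all(renew=True)
    # or parse only freshly downloaded pages
    parse_all(fnames=['data/secwiki/2018-01.html',
                      'data/secwiki/2018-02.html'])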
def parse_all(renew=False, ndays=None, proxy=None):
    """
    Parse multiple xuanwu daily pages.
    :param renew: if True, re-parse every cached page and rebuild the table
    :param ndays: list of days as "YYYYMMDD" strings; missing pages are scraped
    :param proxy: optional proxy handed through to parse_item
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # parse the requested days, scraping any page that is not cached yet
    fname_lists = []

    if ndays is not None:
        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]

            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname):
                fname = scrap_item(cur_day)

            if fname is None:
                print "%s news does not exist" % cur_day
            else:
                fname_lists.append(fname)

    if renew:
        # re-parse every cached page
        fname_lists = []
        so.execute('delete from `xuanwu_detail`')
        for fname in glob.iglob(r'data/xuanwu/*/*/*/index.html'):
            fname_lists.append(fname)

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)

        sql = """insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
                 values(?,?,?,?,?,?,?,?);"""

        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(
            start=start, end=end))

        if not renew and os.path.isfile(result_fname) \
                and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:
            fname = path(fname)
            results_list = {}

            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    # de-duplicate on timestamp + url
                    key = content[0] + content[2]
                    results_list[key] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())

        # close the result file once all pages have been written
        result_fh.close()
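
# A minimal usage sketch, assuming this function lives in the xuanwu parser
# module and scrap_item can download a missing day on demand; the dates below
# are hypothetical.
def _demo_parse_xuanwu():
    # parse two specific days, scraping any that are missing from the cache
    parse_all(ndays=['20180101', '20180102'])
    # or rebuild xuanwu_detail from every cached index.html
    parse_all(renew=True)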