def _enqueue_child_urls(taskid, rec, urls, power):
    ''' Queue every link in urls that passes rec's url filter and is not yet stored for the task. '''
    flag = rec['urlflag']
    for url in urls:
        # When a url filter is configured, only follow links that contain it.
        if flag and flag not in url:
            continue
        # Skip links for which dbm already returns a record under this task.
        if dbm.getoneurl(taskid, url):
            continue
        print('add url: %s' % url)
        dbm.addoneurl(
            taskid=taskid,
            pid=rec['id'],
            url=url,
            keyword=rec['keyword'],
            type=rec['type'],
            deep=rec['deep'] + 1,
            urlflag=rec['urlflag'],
            power=power,
            maxdeep=rec['maxdeep'],
        )


def dooneurl(taskid=0):
    ''' Run a single crawl step for the given task.

    Takes one record from dbm.getoneurl(taskid), marks it as being crawled,
    fetches its page, stores the title / keyword contexts / html on the
    record, queues the page's links while the record is shallower than its
    maxdeep, and finally marks the record as crawled or failed.

    Returns True when a record was processed (whether the fetch succeeded
    or not) and False when dbm.getoneurl returned nothing for the task.
    '''
    rec = dbm.getoneurl(taskid)
    if not rec:
        print('task%d is success' % taskid)
        return False

    rec['status'] = 1  # being crawled
    rec['fetchtime'] = dbm.timestring()
    dbm.setoneurl(**rec)
    dbm.commit()

    res = gethtml(rec['url'])
    html = res['html']
    stat = res['status']
    baseurl = rec['baseurl']
    # The fetch reported a different url than requested (presumably a
    # redirect), so links are resolved against the base of the final url.
    if rec['url'] != res['url']:
        baseurl = dbm.getbaseurl(res['url'])

    if stat:
        ctxs = getcontex(html, rec['keyword'])
        rec['title'] = gettitle(html)
        if rec['type'] == TYPERECORDALLHTML:
            rec['html'] = html

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; count/context/html are only written when there are matches.
        count = 0
        if ctxs:
            count = len(ctxs)
            rec['count'] = count
            rec['context'] = ';;;'.join(ctxs)
            if rec['type'] == TYPERECORDMATCHHTML:
                rec['html'] = html

        # Keep following links until the configured depth is reached.
        if rec['deep'] < rec['maxdeep']:
            urls = geturls(html, baseurl)
            if urls:
                rec['childcount'] = len(urls)
                _enqueue_child_urls(taskid, rec, urls, count)

        rec['status'] = 2  # crawled
    else:
        print('get url fail:%s' % rec['url'])
        # On failure the 'html' field is printed and stored as the failure info.
        print('fail info:%s' % html)
        rec['html'] = html
        rec['status'] = 3  # error

    rec['completetime'] = dbm.timestring()
    dbm.setoneurl(**rec)
    dbm.commit()
    return True
def main():
    ''' Convert every row from dbm.findall() with map_row, save it, then commit. '''
    # NOTE(review): the source was whitespace-collapsed; this assumes a single
    # commit after all rows are saved rather than one per row -- confirm.
    for record in dbm.findall():
        dbm.save2(map_row(record))
    dbm.commit()
def dotask(taskid):
    ''' Run one crawl step for the task, tolerating errors raised by the step.

    Returns whatever dooneurl(taskid) returns. If the step raises an
    ordinary exception, the traceback is printed and True is returned so
    the caller's loop moves on to the next step. KeyboardInterrupt and
    SystemExit are not caught, so the crawl loop can still be stopped.
    '''
    import traceback

    try:
        return dooneurl(taskid)
    except Exception:
        # Best effort: one failing step must not abort the whole crawl,
        # but the failure should be visible instead of silently dropped.
        traceback.print_exc()
        return True


if __name__ == '__main__':
    taskid = 1
    # Start from an empty store and seed it with a single start url.
    dbm.clear()
    dbm.addoneurl(
        taskid=taskid,
        url='http://news.baidu.com/',
        urlflag='news.baidu.com',
        keyword='H7N9',
        maxdeep=2,
        type=TYPERECORDMATCHHTML,
    )
    dbm.commit()
    # Keep crawling until dotask reports there is nothing left to do.
    while dotask(taskid):
        pass
    print('fetch ok')