def save_metadata(dbcurr, binhash, address, start_time, data):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    # placeholders for category-specific handling (nothing implemented yet)
    if info['category'] == u'安装包':
        pass
    elif info['category'] == u'压缩文件':
        pass
    elif info['category'] == u'图像':
        pass
    elif info['category'] == u'文档书籍':
        pass

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                           (info['info_hash'], json.dumps(info['files'])))
        except:
            print name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        try:
            print '\n', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Saved', info['info_hash'], info['name'], (time.time() - start_time), 's', address[0], geoip.country_name_by_addr(address[0]),
        except:
            print '\n', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Saved', info['info_hash'], sys.exc_info()[1]
        ret = dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                             'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                             (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                              info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
                              info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
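# The functions in this section delegate classification to metautils.get_extension() and
# metautils.get_category(), which are not shown here. Below is a minimal, hypothetical
# stand-in so the snippets can be read end to end; the extension table and any category
# name other than the four branched on above are assumptions, not the project's real mapping.
import os

_CATEGORY_BY_EXT = {
    'exe': u'安装包', 'msi': u'安装包', 'apk': u'安装包',
    'zip': u'压缩文件', 'rar': u'压缩文件', '7z': u'压缩文件',
    'jpg': u'图像', 'png': u'图像', 'gif': u'图像',
    'pdf': u'文档书籍', 'epub': u'文档书籍', 'txt': u'文档书籍',
}


def get_extension(filename):
    # '/path/movie.MKV' -> 'mkv'; callers additionally apply .lower() to the result
    return os.path.splitext(filename)[1].lstrip('.')


def get_category(ext):
    # fall back to a catch-all category when the extension is unknown (assumed name)
    return _CATEGORY_BY_EXT.get(ext.lower(), u'其他')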
def got_torrent(self):
    utcnow = datetime.datetime.utcnow()
    binhash, address, data, dtype, start_time = self.metadata_queue.get()
    if dtype == 'pt':
        self.n_downloading_pt -= 1
    elif dtype == 'lt':
        self.n_downloading_lt -= 1
    if not data:
        return
    self.n_valid += 1
    try:
        info = self.parse_torrent(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                                (info['info_hash'], json.dumps(info['files'])))
        except:
            print self.name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        print '\n', 'Saved', info['info_hash'], dtype, info['name'], (time.time() - start_time), 's', address[0],
        ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                                  'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                  (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                                   info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
                                   info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
        self.dbconn.commit()
    except:
        print self.name, 'save error', info
        traceback.print_exc()
        return
    self.n_new += 1
def save_metadata(dbcurr, binhash, address, start_time, data, blacklist):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    # skip torrents whose name contains a blacklisted keyword
    for item in blacklist:
        if str(item) in info['name']:
            return

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                           (info['info_hash'], json.dumps(info['files'])))
        except:
            print name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        try:
            print '\n', 'Saved', info['info_hash'], info['name'], (time.time() - start_time), 's', address[0], geoip.country_name_by_addr(address[0]),
        except:
            print '\n', 'Saved', info['info_hash'], sys.exc_info()[1]
        try:
            ret = dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                                 'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                 (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                                  info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
                                  info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
        except:
            print 'insert search_hash err: ', info['info_hash']
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
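# The save_metadata() variants above rely on a parse_metadata() helper that turns the raw
# bencoded metadata into the info dict being saved (name, length, files, data_hash,
# create_time). A minimal sketch under the assumption that the third-party bencode package
# (bencode.bdecode) is used and that `data` is the bencoded info dictionary; the project's
# real parser may differ.
import datetime
import hashlib

import bencode


def parse_metadata(data):
    try:
        metadata = bencode.bdecode(data)
    except Exception:
        return None
    info = {}
    encoding = 'utf8'
    info['data_hash'] = hashlib.md5(data).hexdigest()  # fingerprint of the raw metadata blob
    name = metadata.get('name.utf-8', metadata.get('name', ''))
    info['name'] = name.decode(encoding, 'ignore')
    info['create_time'] = datetime.datetime.utcnow()
    if 'files' in metadata:
        # multi-file torrent: keep per-file path and length, plus the total length
        info['files'] = []
        total = 0
        for f in metadata['files']:
            path = '/'.join(f.get('path.utf-8', f.get('path', [])))
            info['files'].append({'path': path.decode(encoding, 'ignore'),
                                  'length': f['length']})
            total += f['length']
        info['length'] = total
    else:
        info['length'] = metadata.get('length', 0)
    return info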
def got_torrent(self):
    if self.metadata_queue.qsize() == 0:
        return
    binhash, address, data, start_time = self.metadata_queue.get()
    if not data:
        return
    try:
        info = self.parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return

    temp = time.time()
    x = time.localtime(float(temp))
    utcnow = time.strftime("%Y-%m-%d %H:%M:%S", x)  # get time now

    info_hash = binhash.encode('hex')  # magnet info_hash
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['create_time'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    try:
        try:
            print '\n', 'Saved', info['info_hash'], info['name'], (time.time() - start_time), 's', address[0]
        except:
            print '\n', 'Saved', info['info_hash']
        ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                                  'length,create_time,last_seen,requests) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                  (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                                   info['classified'], info['source_ip'], info['tagged'], info['length'],
                                   info['create_time'], info['last_seen'], info['requests']))
        # commit in batches of 50 and reset the counter periodically
        if self.count % 50 == 0:
            self.dbconn.commit()
            if self.count > 100000:
                self.count = 0
    except:
        print self.name, 'save error', info
        traceback.print_exc()
        return
def save_metadata(dbcurr, binhash, address, start_time, data):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                           (info['info_hash'], json.dumps(info['files'])))
        except Exception:
            pass
        del info['files']

    try:
        sql = (
            'INSERT INTO search_hash(info_hash,category,data_hash,name,extension,'
            'classified,source_ip,tagged,length,create_time,last_seen,requests,'
            'comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        ret = dbcurr.execute(
            sql,
            (info['info_hash'], info['category'], info['data_hash'], info['name'],
             info['extension'], info['classified'], info['source_ip'], info['tagged'],
             info['length'], info['create_time'], info['last_seen'], info['requests'],
             info.get('comment', ''), info.get('creator', '')))
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
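# A sketch of how the five-argument save_metadata() above might be driven: a worker thread
# pulls (binhash, address, start_time, data) tuples from a queue and writes through a
# MySQLdb cursor. The queue, connection parameters and thread wiring are illustrative
# assumptions, not the original project's setup.
import Queue
import threading

import MySQLdb

metadata_queue = Queue.Queue()


def metadata_worker():
    dbconn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='ssbc', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    while True:
        # blocks until a downloaded metadata blob is available
        binhash, address, start_time, data = metadata_queue.get()
        save_metadata(dbcurr, binhash, address, start_time, data)


worker = threading.Thread(target=metadata_worker)
worker.setDaemon(True)
worker.start()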
def save_metadata(dbcurr, infohash, address, start_time, info):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    # try:
    #     info = parse_metadata(data)
    #     if not info:
    #         raise Exception("no info!")
    # except:
    #     traceback.print_exc()
    #     return
    info_hash = infohash
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT IGNORE `ssbc`.`search_filelist` VALUES(%s, %s) ',
                           (info['info_hash'], json.dumps(info['files'])))
        except:
            logging.error('%s insert error %s' % (name, sys.exc_info()[1]))
        del info['files']

    try:
        logging.debug('Saved : ' + info['info_hash'])
        ret = dbcurr.execute(
            'INSERT INTO `ssbc`.`search_hash`(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
            'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ' +
            'ON DUPLICATE KEY UPDATE requests = requests + 1',
            (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
             info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
             info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
        dbcurr.connection.commit()
    except:
        logging.error('%s save error %s' % (name, info))
        traceback.print_exc()
    return
def run(self):
    self.name = threading.currentThread().getName()
    print self.name, 'started'
    n_reqs = n_valid = n_new = 0
    while True:
        x = q.get()
        print 'get + ' + x + ' :' + self.getName() + '\n'
        n_reqs += 1
        utcnow = datetime.datetime.utcnow()
        date = (utcnow + datetime.timedelta(hours=8))
        date = datetime.datetime(date.year, date.month, date.day)
        while True:
            # Check if we have this info_hash
            self.dbcurr.execute('SELECT id FROM movie_hash WHERE hash=%s', (x,))
            y = self.dbcurr.fetchone()
            if not y:
                try:
                    data = self.get_torrent(x)
                except:
                    traceback.print_exc()
                    break
                if not data:
                    sys.stdout.write('!')
                    break
                try:
                    info = self.parse_torrent(data)
                except:
                    traceback.print_exc()
                    break
                info['info_hash'] = x
                info['reqtimes'] = 1
                info['updatetime'] = utcnow
                info['source'] = '127.0.0.1'
                info['create_time'] = utcnow
                if info.get('files'):
                    files = [z for z in info['files'] if not z['path'].startswith('_')]
                    if not files:
                        files = info['files']
                else:
                    files = [{'path': info['name'], 'length': info['length']}]
                files.sort(key=lambda z: z['length'], reverse=True)
                bigfname = files[0]['path']
                info['extension'] = metautils.get_extension(bigfname).lower()
                info['category'] = metautils.get_category(info['extension'])
                try:
                    if 'files' in info:
                        self.dbcurr.execute('INSERT INTO filelists VALUES(%s, %s)',
                                            (info['info_hash'], json.dumps(info['files'])))
                        del info['files']
                except:
                    traceback.print_exc()
                try:
                    ret = self.dbcurr.execute('INSERT INTO movie_hash(hash,category,name,source,filesize,createtime,updatetime,reqtimes) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)',
                                              (info['info_hash'], info['category'], info['name'], info['source'],
                                               info['length'], info['create_time'], info['updatetime'], info['reqtimes']))
                except:
                    traceback.print_exc()
                    break
                n_new += 1
            n_valid += 1
            sys.stdout.write('#')
            sys.stdout.flush()
            self.dbconn.commit()
            break
        if n_reqs >= MAX_READ:
            self.dbcurr.execute('INSERT INTO statusreport(date,new_hashs,total_requests, valid_requests) VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE ' +
                                'total_requests=total_requests+%s, valid_requests=valid_requests+%s, new_hashs=new_hashs+%s',
                                (date, n_new, n_reqs, n_valid, n_reqs, n_valid, n_new))
            n_reqs = n_valid = n_new = 0
def got_torrent(self):
    utcnow = datetime.datetime.utcnow()
    binhash, address, data, dtype, start_time = self.metadata_queue.get()
    if dtype == 'pt':
        self.n_downloading_pt -= 1
    elif dtype == 'lt':
        self.n_downloading_lt -= 1
    if not data:
        return
    self.n_valid += 1
    try:
        info = self.parse_torrent(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                                (info['info_hash'], json.dumps(info['files'])))
        except:
            print self.name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        try:
            print '\n', 'Saved', info['info_hash'], dtype, info['name'], (time.time() - start_time), 's', address[0],
        except:
            print '\n', 'Saved', info['info_hash'],
        ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                                  'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                  (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                                   info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
                                   info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
        self.dbconn.commit()
    except:
        print self.name, 'save error', info
        traceback.print_exc()
        return
    self.n_new += 1
def run(self):
    self.name = threading.currentThread().getName()
    print self.name, 'started'
    n_reqs = n_valid = n_new = 0
    while True:
        x = q.get()
        n_reqs += 1
        utcnow = datetime.datetime.utcnow()
        date = (utcnow + datetime.timedelta(hours=8))
        date = datetime.datetime(date.year, date.month, date.day)
        while True:
            # Check if we have this info_hash
            self.dbcurr.execute('SELECT id FROM search_hash WHERE info_hash=%s', (x['info_hash'],))
            y = self.dbcurr.fetchone()
            if not y:
                try:
                    data = self.fetch_torrent(x['info_hash'])
                except:
                    traceback.print_exc()
                    break
                if not data:
                    sys.stdout.write('!')
                    # print self.name, 'Missing torrent file', x['info_hash'], 'from', x['ip']
                    break
                try:
                    info = self.parse_torrent(data)
                except:
                    traceback.print_exc()
                    break
                info['info_hash'] = x['info_hash']
                # need to build tags
                info['tagged'] = False
                info['classified'] = False
                info['requests'] = 1
                info['last_seen'] = utcnow
                info['source_ip'] = x['ip']
                if info.get('files'):
                    files = [z for z in info['files'] if not z['path'].startswith('_')]
                    if not files:
                        files = info['files']
                else:
                    files = [{'path': info['name'], 'length': info['length']}]
                files.sort(key=lambda z: z['length'], reverse=True)
                bigfname = files[0]['path']
                info['extension'] = metautils.get_extension(bigfname).lower()
                info['category'] = metautils.get_category(info['extension'])
                if 'files' in info:
                    self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                                        (info['info_hash'], json.dumps(info['files'])))
                    del info['files']
                try:
                    # db.basic.save(info, w=1)
                    ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                                              'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                              (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'],
                                               info['classified'], info['source_ip'], info['tagged'], info['length'], info['create_time'],
                                               info['last_seen'], info['requests'], info.get('comment', ''), info.get('creator', '')))
                except:
                    print 'save error', self.name, info
                    traceback.print_exc()
                    break
                n_new += 1
            n_valid += 1
            # Check if we have already logged this request
            # _id_data = '%s-%s-%s' % (x['info_hash'], x['ip'], x['port'])
            # tmpid = binascii.crc32(_id_data) & 0xffffffff
            # if db_log.requests_user.find_one({'_id': tmpid}):
            #     sys.stdout.write('S')
            #     sys.stdout.flush()
            #     break
            # db_log.requests_user.save({'_id': tmpid,
            #                            'time': utcnow})
            # update last-seen time and request count
            if y:
                self.dbcurr.execute('UPDATE search_hash SET last_seen=%s, requests=requests+1 WHERE info_hash=%s',
                                    (utcnow, x['info_hash']))
            # update daily request statistics
            # colname = 'requests_d_' + date.strftime('%y%m%d')
            # db_log[colname].update(
            #     {'_id': bid},
            #     {'$inc': {'requests': 1}},
            #     upsert=True)
            sys.stdout.write('#')
            sys.stdout.flush()
            self.dbconn.commit()
            break
        if n_reqs >= MAX_READ:
            self.dbcurr.execute('INSERT INTO search_statusreport(date,new_hashes,total_requests, valid_requests) VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE ' +
                                'total_requests=total_requests+%s, valid_requests=valid_requests+%s, new_hashes=new_hashes+%s',
                                (date, n_new, n_reqs, n_valid, n_reqs, n_valid, n_new))
            n_reqs = n_valid = n_new = 0
def startSpider(self):
    if self.spider_queue.empty():
        fetched_users = self.db.execute('SELECT * from spider_list ORDER BY weight DESC limit 0,20')
        if fetched_users <= 0:
            print 'nothing to spider,spider_list is empty'
            return False
        self.start = 'start'
        self.errno = ERR_NO
        fetchall = self.db.fetchall()
        # add the pending sharers fetched from the database to the crawl queue
        for item in fetchall:
            self.spider_queue.put({
                'sid': item[0],
                'uk': item[1],
                'file_fetched': item[2],
                'follow_fetched': item[3],
                'follow_done': item[4],
                'file_done': item[5],
                'weight': item[6],
                'uid': item[7]
            })
        self.got_follow_count = 0
        self.got_files_count = 0
        self.while_count = 0

    while not self.spider_queue.empty():
        self.while_count += 1
        share_user = self.spider_queue.get()
        # crawl the sharer's file list first
        if not share_user['file_done']:
            print '%d now spidering file ,%d file fetched' % (share_user['uk'], share_user['file_fetched'])
            rs = self.getShareLists(share_user['uk'], share_user['file_fetched'])
            if not rs:
                print 'uk:%d error to fetch files,try again later...' % share_user['uk']
                return True
            total_count, fetched_count, file_list = rs
            total_fetched = share_user['file_fetched'] + fetched_count
            print 'fetched_file_count:%d' % fetched_count
            if total_fetched >= total_count or total_count == 0:
                share_user['file_done'] = 1  # all files of this sharer have been fetched
            if total_count == 0:
                self.db.execute("UPDATE spider_list set file_done=%s WHERE sid=%s", (1, share_user['sid']))
                self.db.commit()
            else:
                try:
                    files_count = 0
                    for file in file_list:
                        files_count += 1
                        ext = ''
                        file_type = ''
                        file_type_i = -1
                        if file['isdir'] == 0 and file['feed_type'] == 'share':
                            ext = metautils.get_extension(file['title']).lower()
                            file_type = metautils.get_category(ext)
                            file_type_i = self.file_type_t[file_type]
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (file['title'], file['uk'], file['shareid'], file['shorturl'], file['isdir'],
                             file['size'], file['md5'], ext, file['feed_time'], time_stamp, file_type_i,
                             share_user['uid'], file['feed_type']))
                except:
                    share_user['file_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    self.db.execute(
                        "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                        (total_fetched, share_user['file_done'], share_user['sid']))
                    self.db.execute(
                        "UPDATE share_users set fetched=%s WHERE uid=%s",
                        (total_fetched, share_user['uid']))
                    share_user['file_fetched'] = total_fetched
                    self.got_files_count += files_count
                    self.db.commit()

        # crawl the follow list only after the file list is done
        if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
            print '%d now spidering follow ,%d follow fetched' % (share_user['uk'], share_user['follow_fetched'])
            rs = self.getFollows(share_user['uk'], share_user['follow_fetched'])
            if not rs:
                print 'error to fetch follows,try again later...'
                return
            total_count, fetched_count, follow_list = rs
            total_fetched = share_user['follow_fetched'] + fetched_count
            print 'fetched_follow_count:%d' % fetched_count
            if total_fetched >= total_count or total_count == 0:
                share_user['follow_done'] = 1
            if total_count == 0:
                self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                self.db.commit()
            else:
                try:
                    follow_count = 0
                    for follow in follow_list:
                        follow_count += 1
                        # skip users that are already in the share_users table
                        if self.db.execute('SELECT * FROM share_users WHERE uk=%s', (follow['follow_uk'],)) > 0:
                            print 'uk:%d is already in share_users table' % follow['follow_uk']
                            continue
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,"
                            "fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (follow['follow_uk'], follow['follow_uname'], follow['avatar_url'], follow['intro'],
                             follow['follow_count'], follow['album_count'], follow['fans_count'],
                             follow['pubshare_count'], time_stamp, time_stamp, 5))
                        # add the newly found sharer to the crawl list
                        self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",
                                        (follow['follow_uk'], self.db.last_row_id()))
                except:
                    share_user['follow_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    if share_user['follow_done'] == 1:
                        # follows fully fetched; this sharer is done, remove it from the pending list
                        print 'delete follow fetched sid:%d from spider_list' % share_user['sid']
                        self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                    else:
                        self.db.execute(
                            "UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",
                            (total_fetched, share_user['follow_done'], share_user['sid']))
                    share_user['follow_fetched'] = total_fetched
                    self.got_follow_count += follow_count
                    self.db.commit()

        # if the follow list is not done, this sharer is not finished yet;
        # put it back on the work queue and keep crawling
        if share_user['follow_done'] == 0:
            self.spider_queue.put(share_user)
        else:
            print '%d has done' % share_user['uk']
            del share_user
        time.sleep(SPIDER_INTERVAL)

    print '-----------------Done------------------'
    print 'while_count:%d' % self.while_count
    print 'got_follow_count:%d' % self.got_follow_count
    print 'got_files_count:%d' % self.got_files_count
    return True
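# startSpider() above talks to the database through a self.db helper exposing execute(),
# fetchall(), commit(), rollback() and last_row_id(). A minimal sketch of such a wrapper
# over MySQLdb; the class name and connection parameters are assumptions, not the
# original project's code.
import MySQLdb


class SpiderDB(object):
    def __init__(self, host='127.0.0.1', user='root', passwd='', db='pan'):
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8')
        self.cursor = self.conn.cursor()

    def execute(self, sql, args=None):
        # returns the affected/selected row count, as the callers above expect
        return self.cursor.execute(sql, args)

    def fetchall(self):
        return self.cursor.fetchall()

    def last_row_id(self):
        # auto-increment id of the last INSERT, used when linking spider_list to share_users
        return self.cursor.lastrowid

    def commit(self):
        self.conn.commit()

    def rollback(self):
        self.conn.rollback()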