def initialize():
    """Prepare directories, record tables and the seed-URL queue.

    Steps, in order:

    1. Create the storage, HTML and cache directories if they do not exist.
    2. Create the sharded record tables named ``<record.DB_TABLE>_<n>``.
    3. Initialise the bloom filter's bit array, then insert one
       ``STATUS_NOTYET`` record for every seed URL in ``urls.init_url_list``
       that the filter does not report as existing, spreading the records
       round-robin over ``THREAD_COUNT`` tables.
    4. Save the bloom filter's bit array.
    """
    # Order matters if the HTML/cache paths live under the storage path,
    # so the storage directory is always handled first.
    for path in (define.STORAGE_PATH, define.HTML_PATH, define.CACHE_PATH):
        if not os.path.exists(path):
            os.mkdir(path)

    # Records below are sharded with ``index % THREAD_COUNT``; previously
    # exactly five tables were created regardless of THREAD_COUNT, so a
    # larger THREAD_COUNT would insert into tables that do not exist.
    # Keep at least the historical five tables for backward compatibility.
    table_count = max(5, THREAD_COUNT)
    for shard in range(table_count):
        _logic.create_record_table('%s_%s' % (record.DB_TABLE, shard))

    bloom_fliter.init_bitarray()

    index = 0
    for url in urls.init_url_list:
        # NOTE(review): presumably url_exist() also registers the URL in the
        # filter (the bit array is saved afterwards) - confirm in bloom_fliter.
        if bloom_fliter.url_exist(url):
            continue
        record_obj = record.Record(
            define.UNDEFINE, url, record.STATUS_NOTYET, '', '')
        table = '%s_%s' % (record.DB_TABLE, index % THREAD_COUNT)
        _logic.insert_record(table, record_obj)
        # Advance only after an insert so new records stay evenly spread.
        index += 1

    bloom_fliter.save_bitarray()
def run_listener():
    """Serve a local control socket until a ``stop`` command arrives.

    Listens on ``localhost:8002`` and handles one connection at a time.
    Each connection gets a 5 second timeout and a single ``recv`` of up to
    1024 bytes. When the received payload is exactly ``'stop'`` the crawler
    and analyzer are stopped, the bloom filter bit array is saved, and the
    function returns. Any other payload is ignored; a receive timeout
    prints ``time out`` and the listener keeps accepting connections.

    Both the accepted connection and the listening socket are always
    closed, including when an unexpected exception propagates.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('localhost', 8002))
        sock.listen(5)
        while True:
            connection, _address = sock.accept()
            try:
                connection.settimeout(5)
                try:
                    buf = connection.recv(1024)
                except socket.timeout:
                    print('time out')
                    continue
                # NOTE: exact match on a Python 2 byte string; under
                # Python 3 recv() returns bytes and this would need b'stop'.
                if buf == 'stop':
                    print('正在停止...')
                    stop_crawler()
                    stop_analyzer()
                    bloom_fliter.save_bitarray()
                    print('已经成功保存数据')
                    break
            finally:
                # Runs on every path (stop, timeout, other payload, error)
                # and only after the stop work is done, so a client can
                # treat the close as the completion signal.
                connection.close()
    finally:
        # Release the listening port; it was previously leaked on exit.
        sock.close()