def __init__(self, url, **options):
    if pylibrtmp is None:
        raise RuntimeError("No RTMP Download support.")
    DownloadFunction.__init__(self, None)
    print "RTMP Download init"
    url = url.encode("utf-8")
    try:
        url, query = url.split("?", 1)
    except (TypeError, ValueError):
        options = dict()
    else:
        options = dict(urlparse.parse_qsl(query))
    if is_rtmplink(url):
        url, options = load_rtmplink(url)
    self.url = url
    self.options = options
    self.rtmp = None
    self.last_index = 0
    self.next_update = 1
    if "tcurl" not in options:
        options["tcurl"] = self.url
    if "swfurl" in options and "swfvfy" not in options:
        options["swfvfy"] = "1"
    self.thread = threadpool.ThreadPool(1)
    self.stopped = False
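# A minimal sketch of the pattern above in isolation: a single-worker pool
# that keeps a blocking download call off the main loop. download_blocking
# and its URL are hypothetical stand-ins; this assumes the gevent-style
# threadpool API (spawn() returning a result object with get()).
from gevent.threadpool import ThreadPool

def download_blocking(url):
    # stand-in for a blocking C-library call such as pylibrtmp
    return "payload from %s" % url

pool = ThreadPool(1)
result = pool.spawn(download_blocking, "rtmp://example.com/stream")
print(result.get())  # blocks only the calling greenlet, not the process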
def init():
    lock = ThreadingLock()
    init_event = Event()
    icons = glob.glob(os.path.join(settings.menuiconfolder, "*.icns"))
    for i in icons:
        name = os.path.basename(i)
        name = name[:name.rfind(".")]
    thread = threadpool.ThreadPool(1)
    options = [
        (_X("Open"), bmp_factory('open'),
         lambda *_: event.call_from_thread(common.open_browser)),
        (_X("Select browser"), bmp_factory('browser'),
         lambda *_: event.call_from_thread(common.select_browser)),
        (_X("Logout"), bmp_factory('logout'),
         lambda *_: event.call_from_thread(common.relogin)),
        (_X("Quit"), bmp_factory('quit'), 'QUIT')]
    icon = settings.taskbaricon_inactive
    if not icon:
        return
    thread.spawn(
        SysTray, icon, "Download.am Client", options,
        lambda *_: event.call_from_thread(common.quit),
        0, "download.am",
        lock=lock,
        init_callback=lambda _: event.call_from_thread(init_event.set))
    init_event.wait()

    @event.register('login:changed')
    def on_login_changed(*_):
        guest = login.is_guest() or not login.has_login()
        if guest and len(options) == 4:
            with lock:
                options.insert(
                    2, (_X("Register"), bmp_factory('register'),
                        lambda *_: event.call_from_thread(common.register)))
                SysTray.instance.init_menu_options(options)
        elif not guest and len(options) == 5:
            with lock:
                options.pop(2)
                SysTray.instance.init_menu_options(options)

    try:
        on_login_changed()
    except:
        pass
def __init__(self):
    super(Spider, self).__init__()
    self.plugin_path = None
    self.start_urls = []
    # self.default_rule = None
    self._env = {}
    # self._rules = {}
    # self._sched = None
    # self._loader = None
    # self._downloader = None
    self._headers = None
    self.disabled = False
    self.model = ''
    # -----------------------------------
    self._log = None
    self._loader = None
    self._evtmgr = None
    self._urlmgr = None
    self._downloader = None
    self._parser = None
    self._sched = Scheduler()
    # -----------------------------------
    self._stopping = True
    # self._qcount = 0  # length of the unfinished queue
    self._ccount = 0  # number of crawls performed
    self._cfg_file = None
    self._pool = threadpool.ThreadPool(4)
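# The constructor above only allocates resources; the four-worker pool is
# what later absorbs the spider's jobs. A hypothetical sketch of feeding
# such a pool (dispatch() is illustrative, not part of the snippet; the
# gevent-style threadpool API is assumed):
from gevent.threadpool import ThreadPool

def dispatch(pool, jobs):
    # one async result per job; get() re-raises worker exceptions here
    results = [pool.spawn(job) for job in jobs]
    return [r.get() for r in results]

pool = ThreadPool(4)
print(dispatch(pool, [lambda i=i: i * i for i in range(8)]))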
def __init__(self,
             concurrent_num=20,
             crawl_tags=[],
             depth=3,
             max_url_num=300,
             internal_timeout=60,
             spider_timeout=6 * 3600,
             crawler_mode=0,
             same_origin=True,
             dynamic_parse=False):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : list of tags whose URLs are collected while crawling
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: multi-threaded, 1: gevent)
    same_origin      : restrict crawling to the same origin
    dynamic_parse    : use WebKit for dynamic parsing
    """
    self.logger.setLevel(logging.DEBUG)
    hd = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timer = Timeout(internal_timeout)
    self.crawler_mode = crawler_mode  # crawler model
    self.concurrent_num = concurrent_num
    self.fetcher_pool = pool.Pool(self.concurrent_num)
    if self.crawler_mode == 0:
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip'
    ]
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin
    self.depth = depth
    self.max_url_num = max_url_num
    self.dynamic_parse = dynamic_parse
    if self.dynamic_parse:
        self.webkit = WebKit()
    self.crawler_stopped = event.Event()
def __init__(self,
             concurrent_num=20,
             crawl_tags=[],
             custom_headers={},
             plugin=[],
             depth=10,
             max_url_num=3000,
             internal_timeout=60,
             spider_timeout=1800,
             dir_max_url=15,
             crawler_mode=0,
             same_origin=True,
             dynamic_parse=False,
             login_dict={},
             scan_task_id=0):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : list of tags whose URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: multi-threaded, 1: gevent)
    same_origin      : restrict crawling to the same origin
    dynamic_parse    : use WebKit for dynamic parsing
    """
    self.logger.setLevel(logging.DEBUG)
    hd = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timeout = internal_timeout
    self.internal_timer = Timeout(internal_timeout)
    self.spider_stop_time = time() + spider_timeout
    self.crawler_mode = crawler_mode  # crawler model
    self.concurrent_num = concurrent_num
    self.fetcher_pool = pool.Pool(self.concurrent_num)
    if self.crawler_mode == 0:
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    self.default_crawl_tags = [
        'script', 'a', 'base', 'iframe', 'frame', 'object'
    ]
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip', 'swf', 'ico'
    ]
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin
    self.depth = depth
    self.max_url_num = max_url_num
    self.dir_max_url = dir_max_url
    self.dynamic_parse = dynamic_parse
    if self.dynamic_parse:
        self.webkit = WebKit(login_dict)
        if login_dict:
            self.webkit.auto_login()
        # elif custom_headers.get('Cookie'):
        #     self.webkit.set_cookie(custom_headers)

    self.crawler_stopped = event.Event()
    self.plugin_handler = plugin  # register plugins used by the Crawler
    self.custom_headers = custom_headers
    self.scan_task_id = scan_task_id
def __init__(self,
             concurrent_num=20,
             crawl_tags=[],
             custom_headers={},
             plugin=[],
             depth=3,
             max_url_num=300,
             internal_timeout=60,
             spider_timeout=6 * 3600,
             crawler_mode=0,
             same_origin=True,
             dynamic_parse=False):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : list of tags whose URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: multi-threaded, 1: gevent)
    same_origin      : restrict crawling to the same origin
    dynamic_parse    : use WebKit for dynamic parsing
    """
    # Logging setup
    self.logger.setLevel(logging.DEBUG)  # log level
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  # log format
    hd = logging.StreamHandler()
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timeout = internal_timeout  # timeout for internal calls
    self.internal_timer = Timeout(internal_timeout)
    self.crawler_mode = crawler_mode  # crawler model
    self.concurrent_num = concurrent_num  # number of parallel crawlers and fetchers

    # The fetcher uses the gevent model.
    self.fetcher_pool = pool.Pool(self.concurrent_num)

    # Crawler model setup: the crawler parses HTML and feeds extracted URLs
    # to the fetcher; the fetcher retrieves HTML and feeds it to the crawler.
    if self.crawler_mode == 0:
        # thread pool model
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        # gevent model
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # Fetcher and crawler work independently, linked only through queues.
    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    # Default list of tags whose URLs are collected while crawling.
    self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
    # URL extensions ignored while crawling.
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip'
    ]
    # Final list of tags whose URLs are collected while crawling.
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin  # same-origin restriction
    self.depth = depth  # crawl depth limit
    self.max_url_num = max_url_num  # maximum number of URLs to collect
    self.dynamic_parse = dynamic_parse  # use WebKit for dynamic parsing

    # If dynamic parsing is enabled
    if self.dynamic_parse:
        self.webkit = WebKit()

    self.crawler_stopped = event.Event()
    self.plugin_handler = plugin  # register plugins used by the Crawler
    # Custom HTTP headers.
    self.custom_headers = custom_headers
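# A condensed, self-contained sketch of the fetcher/crawler split shared by
# the three spider variants above: two concurrent stages linked by a bounded
# queue. fetch() and the URLs are illustrative; plain gevent primitives are
# assumed in place of the snippets' threadpool.Queue wrapper.
import gevent
from gevent import pool, queue

fetcher_pool = pool.Pool(20)
crawler_queue = queue.JoinableQueue(maxsize=2000)

def fetch(url):
    html = "<html>%s</html>" % url      # stand-in for an HTTP GET
    crawler_queue.put(html)

def crawl():
    while True:
        html = crawler_queue.get()
        # parse html and feed discovered URLs back to the fetcher here
        crawler_queue.task_done()

gevent.spawn(crawl)
fetcher_pool.map(fetch, ["http://example.com/%d" % i for i in range(5)])
crawler_queue.join()   # returns once every fetched page has been parsed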
if __name__ == '__main__':
    # req = requests.session()

    # Initial request, used to learn the total page count.
    resp = crawling('https://shop.10086.cn/list/134_200_200_1_0_0_0_0_0.html')
    if resp.status_code != 200:
        print '[-] Get total page error'
        exit()

    # Extract the total page count.
    total_page = int(re.findall(r'第1/([\d]+)页', resp.content)[0])
    print '[+] Total page: %s' % total_page

    # Thread pool of 20 workers.
    p = threadpool.ThreadPool(20)
    time_start = time.time()
    threads = [
        p.spawn(crawling_by_page, page) for page in range(1, total_page + 1)
    ]

    # Wait for all tasks to complete.
    gevent.joinall(threads)

    numbers = []
    for t in threads:
        res = t.get()
        numbers = numbers + res
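# The same fan-out/gather idiom in isolation: one task per page on a capped
# pool, join, then merge the per-page results. crawl_page is a hypothetical
# stand-in for crawling_by_page; gevent.joinall() works here because the
# pool's spawn() returns result objects gevent can wait on.
import gevent
from gevent.threadpool import ThreadPool

def crawl_page(page):
    return ["number-%d-%d" % (page, n) for n in range(3)]

p = ThreadPool(20)
tasks = [p.spawn(crawl_page, page) for page in range(1, 6)]
gevent.joinall(tasks)                  # wait for every worker to finish
numbers = [n for t in tasks for n in t.get()]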
def init():
    lock = ThreadingLock()
    init_event = Event()
    icons = glob.glob(os.path.join(settings.menuiconfolder, "*.icns"))
    for i in icons:
        name = os.path.basename(i)
        name = name[:name.rfind(".")]
    thread = threadpool.ThreadPool(1)
    _open = (_X("Open"), bmp_factory('open'),
             lambda *_: event.call_from_thread(common.open_browser))
    _browser = (_X("Select browser"), bmp_factory('browser'),
                lambda *_: event.call_from_thread(common.select_browser))
    _register = (_X("Register"), bmp_factory('register'),
                 lambda *_: event.call_from_thread(common.register))
    _login = (_X("Login"), bmp_factory('login'),
              lambda *_: event.call_from_thread(common.relogin))
    _logout = (_X("Logout"), bmp_factory('logout'),
               lambda *_: event.call_from_thread(common.relogin))
    _quit = (_X("Quit"), bmp_factory('quit'), 'QUIT')
    options = [_open, _browser, _login, _register, _quit]

    icon = settings.taskbaricon_inactive
    if not icon:
        return

    def update_tooltip():
        text = common.generate_tooltip_text()
        if SysTray.instance.tooltip_text != text:
            with lock:
                SysTray.instance.tooltip_text = text
                SysTray.instance.refresh_icon()

    thread.spawn(
        SysTray, icon, options,
        lambda *_: event.call_from_thread(common.quit),
        0, "download.am",
        lock=lock,
        init_callback=lambda _: event.call_from_thread(init_event.set),
        update_tooltip_callback=update_tooltip)
    init_event.wait()

    @event.register('login:changed')
    def on_login_changed(*_):
        opts = list()
        opts.append(_open)
        opts.append(_browser)
        if login.is_guest() or not login.has_login():
            opts.append(_login)
            opts.append(_register)
        elif login.has_login():
            opts.append(_logout)
        else:
            opts.append(_register)
        opts.append(_quit)
        if opts != options:
            with lock:
                options[:] = opts
                SysTray.instance.init_menu_options(options)

    @event.register('loader:initiialized')
    @core.GlobalStatus.files.changed
    @core.GlobalStatus.files_working.changed
    def on_update_tooltip(*_):
        event.fire_once_later(1, 'systray.win:update_tooltip')

    try:
        on_login_changed()
    except:
        pass
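# Both systray variants above share one shape: a blocking message loop runs
# inside a one-thread pool while the caller waits on an Event that the
# loop's init callback sets. This sketch reproduces that shape with the
# stdlib only; the real code instead routes the callback through its own
# event.call_from_thread() so it executes on the main loop's thread.
import threading

ready = threading.Event()

def message_loop(init_callback):
    init_callback()        # tell the spawner the tray icon is up
    # ... pump UI events here until a QUIT option is selected ...

worker = threading.Thread(target=message_loop, args=(ready.set,))
worker.daemon = True
worker.start()
ready.wait()               # init() returns only once the tray exists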
def PlugRunPluginsIntoApp(cls, app):
    sockets = Sockets(app)
    thread_pool = threadpool.ThreadPool(5)

    @app.route("/rekall/runplugin/cancel/<cell_id>", methods=["POST"])
    def cancel_execution(cell_id):  # pylint: disable=unused-variable
        worksheet = app.config["worksheet"]

        # Signal the worksheet to abort this cell.
        worksheet.aborted_cells.add(int(cell_id))

        return "OK", 200

    @sockets.route("/rekall/runplugin")
    def rekall_run_plugin_socket(ws):  # pylint: disable=unused-variable
        cell = json.loads(ws.receive())
        cell_id = cell["cell_id"]
        source = cell["source"]
        worksheet = app.config["worksheet"]

        # If the data is cached locally just return it.
        cache_key = GenerateCacheKey(source)
        cache = worksheet.GetData("%s.data" % cell_id)
        if cache and cache.get("cache_key") == cache_key:
            logging.debug("Dumping request from cache")
            ws.send(json.dumps(cache.get("data")))
            return

        kwargs = source.get("arguments", {})

        # Must provide the correct session to run this on.
        session_id = int(source.pop("session_id"))
        session = worksheet.session.find_session(session_id)

        output = cStringIO.StringIO()
        output_queue = Queue.Queue()
        renderer = WebConsoleRenderer(
            session=session, output=output, cell_id=cell_id,
            output_queue=output_queue, worksheet=worksheet)

        # Clear the interruption state of this cell.
        worksheet.aborted_cells.discard(cell_id)

        def RunPlugin():
            with renderer.start():
                try:
                    session.RunPlugin(source["plugin"]["name"],
                                      renderer=renderer,
                                      **kwargs)
                except Exception:
                    message = traceback.format_exc()
                    renderer.report_error(message)

        run_plugin_result = thread_pool.spawn(RunPlugin)

        sent_messages = []

        def HandleSentMessages():
            while not run_plugin_result.ready() or not output_queue.empty():
                while not output_queue.empty():
                    message = output_queue.get()
                    sent_messages.append(message)
                    ws.send(json.dumps([message],
                                       cls=json_renderer.RobustEncoder))
                run_plugin_result.wait(0.1)

        handle_messages_thread = gevent.spawn(HandleSentMessages)

        gevent.joinall([run_plugin_result, handle_messages_thread])

        # Cache the data in the worksheet.
        worksheet.StoreData("%s.data" % cell_id,
                            dict(cache_key=cache_key, data=sent_messages))
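# The forwarding loop at the heart of the handler above, in isolation: a
# pool worker streams messages into a queue while a greenlet drains it until
# the worker reports ready() and the queue is empty. send() stands in for
# ws.send(); the gevent-style threadpool API is assumed.
import gevent
from gevent.threadpool import ThreadPool
import Queue

output_queue = Queue.Queue()
pool = ThreadPool(5)

def run_plugin():
    for i in range(3):
        output_queue.put("chunk %d" % i)

def send(message):
    print(message)         # stand-in for ws.send(json.dumps(...))

result = pool.spawn(run_plugin)

def forward():
    # Drain until the plugin finished AND everything queued has been sent.
    while not result.ready() or not output_queue.empty():
        while not output_queue.empty():
            send(output_queue.get())
        result.wait(0.1)   # yield briefly, then re-check both conditions

gevent.joinall([result, gevent.spawn(forward)])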