Example 1
    def __init__(self, url, **options):
        if pylibrtmp is None:
            raise RuntimeError("No RTMP Download support.")

        DownloadFunction.__init__(self, None)
        print "RTMP Download init"
        url = url.encode("utf-8")
        try:
            url, query = url.split("?", 1)
        except (TypeError, ValueError):
            options = dict()
        else:
            options = dict(urlparse.parse_qsl(query))
        if is_rtmplink(url):
            url, options = load_rtmplink(url)
        self.url = url
        self.options = options
        self.rtmp = None
        self.last_index = 0
        self.next_update = 1
        if "tcurl" not in options:
            options["tcurl"] = self.url
        if "swfurl" in options and "swfvfy" not in options:
            options["swfvfy"] = "1"
        self.thread = threadpool.ThreadPool(1)
        self.stopped = False
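
The excerpt above builds the pool but does not show it being consumed. Below is a minimal standalone sketch of the pool API these examples rely on, assuming that threadpool refers to gevent's gevent.threadpool (the later examples that mix it with gevent.joinall point that way); download_one is a hypothetical stand-in for the real blocking work.

# Minimal sketch of the ThreadPool usage the snippet above sets up.
# Assumptions: "threadpool" is gevent.threadpool, and download_one()
# is a hypothetical placeholder for the real blocking call.
from gevent import threadpool

def download_one(url):
    # stand-in for a blocking operation such as an RTMP read loop
    return len(url)

pool = threadpool.ThreadPool(1)      # one worker thread, as above
result = pool.spawn(download_one, "rtmp://example.invalid/stream")
print(result.get())                  # block until the worker finishes
pool.join()                          # wait for the pool to drain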
Example 2
def init():
    lock = ThreadingLock()
    init_event = Event()

    icons = glob.glob(os.path.join(settings.menuiconfolder, "*.icns"))
    for i in icons:
        name = os.path.basename(i)
        name = name[:name.rfind(".")]

    thread = threadpool.ThreadPool(1)
    options = [(_X("Open"), bmp_factory('open'),
                lambda *_: event.call_from_thread(common.open_browser)),
               (_X("Select browser"), bmp_factory('browser'),
                lambda *_: event.call_from_thread(common.select_browser)),
               (_X("Logout"), bmp_factory('logout'),
                lambda *_: event.call_from_thread(common.relogin)),
               (_X("Quit"), bmp_factory('quit'), 'QUIT')]

    icon = settings.taskbaricon_inactive
    if not icon:
        return

    thread.spawn(
        SysTray,
        icon,
        "Download.am Client",
        options,
        lambda *_: event.call_from_thread(common.quit),
        0,
        "download.am",
        lock=lock,
        init_callback=lambda _: event.call_from_thread(init_event.set))
    init_event.wait()

    @event.register('login:changed')
    def on_login_changed(*_):
        guest = login.is_guest() or not login.has_login()
        if guest and len(options) == 4:
            with lock:
                options.insert(
                    2, (_X("Register"), bmp_factory('register'),
                        lambda *_: event.call_from_thread(common.register)))
                SysTray.instance.init_menu_options(options)
        elif not guest and len(options) == 5:
            with lock:
                options.pop(2)
                SysTray.instance.init_menu_options(options)

    try:
        on_login_changed()
    except:
        pass
Example 3
    def __init__(self):
        super(Spider, self).__init__()
        
        self.plugin_path = None
        self.start_urls = []
        # self.default_rule = None
        self._env = {}
        # self._rules = {}
        # self._sched = None
        # self._loader = None
        # self._downloader = None
        self._headers = None
        self.disabled = False

        self.model = ''

        #-----------------------------------
        self._log = None
        self._loader = None
        self._evtmgr = None
        self._urlmgr = None
        self._downloader = None
        self._parser = None
        self._sched = Scheduler()
        #-----------------------------------

        self._stopping = True


        # self._qcount = 0            # length of the unfinished queue
        self._ccount = 0            # number of crawl operations

        self._cfg_file = None


        self._pool = threadpool.ThreadPool(4)
Example 4
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : number of concurrent crawlers and fetchers
        crawl_tags        : list of tags whose URLs are collected while crawling
        depth             : crawl depth limit
        max_url_num       : maximum number of URLs to collect
        internal_timeout  : timeout for internal calls
        spider_timeout    : overall spider timeout
        crawler_mode      : crawler model (0: thread pool, 1: gevent)
        same_origin       : whether to restrict crawling to the same origin
        dynamic_parse     : whether to use WebKit for dynamic parsing
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()
Example 5
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=10,
                 max_url_num=3000,
                 internal_timeout=60,
                 spider_timeout=1800,
                 dir_max_url=15,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False,
                 login_dict={},
                 scan_task_id=0):
        """
        concurrent_num    : number of concurrent crawlers and fetchers
        crawl_tags        : list of tags whose URLs are collected while crawling
        custom_headers    : custom HTTP request headers
        plugin            : list of custom plugins
        depth             : crawl depth limit
        max_url_num       : maximum number of URLs to collect
        internal_timeout  : timeout for internal calls
        spider_timeout    : overall spider timeout
        crawler_mode      : crawler model (0: thread pool, 1: gevent)
        same_origin       : whether to restrict crawling to the same origin
        dynamic_parse     : whether to use WebKit for dynamic parsing
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)
        self.spider_stop_time = time() + spider_timeout
        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'script', 'a', 'base', 'iframe', 'frame', 'object'
        ]
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip', 'swf', 'ico'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dir_max_url = dir_max_url
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit(login_dict)
            if login_dict:
                self.webkit.auto_login()
            # elif custom_headers.get('Cookie'):
            #
            #     self.webkit.set_cookie(custom_headers)

        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the Crawler
        self.custom_headers = custom_headers
        self.scan_task_id = scan_task_id
Example 6
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : number of concurrent crawlers and fetchers
        crawl_tags        : list of tags whose URLs are collected while crawling
        custom_headers    : custom HTTP request headers
        plugin            : list of custom plugins
        depth             : crawl depth limit
        max_url_num       : maximum number of URLs to collect
        internal_timeout  : timeout for internal calls
        spider_timeout    : overall spider timeout
        crawler_mode      : crawler model (0: thread pool, 1: gevent)
        same_origin       : whether to restrict crawling to the same origin
        dynamic_parse     : whether to use WebKit for dynamic parsing
        """

        #   logging setup
        self.logger.setLevel(logging.DEBUG)  #   log level
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  #   log format
        hd = logging.StreamHandler()
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout  #   timeout for internal calls
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  #   crawler model
        self.concurrent_num = concurrent_num  #   number of concurrent crawlers and fetchers

        #   the fetcher uses the gevent model
        self.fetcher_pool = pool.Pool(self.concurrent_num)

        #   crawler model setup
        #   the crawler parses URLs out of the HTML and feeds them to the fetcher;
        #   the fetcher downloads the HTML and feeds it back to the crawler
        if self.crawler_mode == 0:
            #   thread pool model
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            #   gevent model
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #   the fetcher and crawler work independently and are linked only through the queues
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame',
                                   'object']  #   default tags whose URLs are collected while crawling
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]  #   URL extensions ignored while crawling
        self.crawl_tags = list(set(self.default_crawl_tags)
                               | set(crawl_tags))  #   tags whose URLs are collected while crawling
        self.same_origin = same_origin  #   same-origin restriction
        self.depth = depth  #   crawl depth limit
        self.max_url_num = max_url_num  #   maximum number of URLs to collect
        self.dynamic_parse = dynamic_parse  #   whether to use WebKit for dynamic parsing

        #   if dynamic parsing is enabled
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the Crawler
        #   custom HTTP headers
        self.custom_headers = custom_headers
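
The comments in this example describe the data flow: the crawler parses URLs out of fetched HTML and feeds them to the fetcher, the fetcher downloads pages and feeds them back to the crawler, and the two queues link the halves. A minimal standalone sketch of that loop shape follows; gevent primitives are assumed, fetch_html and extract_links are hypothetical placeholders, and for brevity the workers are plain greenlets rather than the thread/greenlet pools configured above.

# Sketch of the fetcher/crawler producer-consumer loop implied by the
# queues above. fetch_html() and extract_links() are hypothetical
# stand-ins; a real spider also tracks depth, dedupes URLs and handles
# stop events.
import gevent
from gevent import queue

fetcher_queue = queue.Queue()        # URLs waiting to be downloaded
crawler_queue = queue.Queue()        # fetched pages waiting to be parsed

def fetch_html(url):
    return "<html>fetched %s</html>" % url

def extract_links(html):
    return []                        # stop after one hop in this sketch

def fetcher():
    while True:
        try:
            url = fetcher_queue.get(timeout=1)
        except queue.Empty:
            break
        crawler_queue.put((url, fetch_html(url)))

def crawler():
    while True:
        try:
            url, html = crawler_queue.get(timeout=1)
        except queue.Empty:
            break
        for link in extract_links(html):
            fetcher_queue.put(link)

fetcher_queue.put("http://example.invalid/")
gevent.joinall([gevent.spawn(fetcher), gevent.spawn(crawler)])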
Example 7
if __name__ == '__main__':

    # req = requests.session()

    # first request, used to get the total page count
    resp = crawling('https://shop.10086.cn/list/134_200_200_1_0_0_0_0_0.html')
    if resp.status_code != 200:
        print '[-] Get total page error'
        exit()

    # extract the total page count
    total_page = int(re.findall(r'第1/([\d]+)页', resp.content)[0])
    print '[+] Total page: %s' % total_page

    # thread pool with 20 workers
    p = threadpool.ThreadPool(20)

    time_start = time.time()

    threads = [
        p.spawn(crawling_by_page, page) for page in range(1, total_page + 1)
    ]

    # wait for all tasks to finish
    gevent.joinall(threads)

    numbers = []
    for t in threads:
        res = t.get()
        numbers = numbers + res
Example 8
def init():
    lock = ThreadingLock()
    init_event = Event()

    icons = glob.glob(os.path.join(settings.menuiconfolder, "*.icns"))
    for i in icons:
        name = os.path.basename(i)
        name = name[:name.rfind(".")]

    thread = threadpool.ThreadPool(1)

    _open = (_X("Open"), bmp_factory('open'),
             lambda *_: event.call_from_thread(common.open_browser))
    _browser = (_X("Select browser"), bmp_factory('browser'),
                lambda *_: event.call_from_thread(common.select_browser))
    _register = (_X("Register"), bmp_factory('register'),
                 lambda *_: event.call_from_thread(common.register))
    _login = (_X("Login"), bmp_factory('login'),
              lambda *_: event.call_from_thread(common.relogin))
    _logout = (_X("Logout"), bmp_factory('logout'),
               lambda *_: event.call_from_thread(common.relogin))
    _quit = (_X("Quit"), bmp_factory('quit'), 'QUIT')
    options = [_open, _browser, _login, _register, _quit]

    icon = settings.taskbaricon_inactive
    if not icon:
        return

    def update_tooltip():
        text = common.generate_tooltip_text()

        if SysTray.instance.tooltip_text != text:
            with lock:
                SysTray.instance.tooltip_text = text
                SysTray.instance.refresh_icon()

    thread.spawn(
        SysTray,
        icon,
        options,
        lambda *_: event.call_from_thread(common.quit),
        0,
        "download.am",
        lock=lock,
        init_callback=lambda _: event.call_from_thread(init_event.set),
        update_tooltip_callback=update_tooltip)
    init_event.wait()

    @event.register('login:changed')
    def on_login_changed(*_):
        opts = list()
        opts.append(_open)
        opts.append(_browser)
        if login.is_guest() or not login.has_login():
            opts.append(_login)
            opts.append(_register)
        elif login.has_login():
            opts.append(_logout)
        else:
            opts.append(_register)
        opts.append(_quit)

        if opts != options:
            with lock:
                options[:] = opts
                SysTray.instance.init_menu_options(options)

    @event.register('loader:initiialized')
    @core.GlobalStatus.files.changed
    @core.GlobalStatus.files_working.changed
    def on_update_tooltip(*_):
        event.fire_once_later(1, 'systray.win:update_tooltip')

    try:
        on_login_changed()
    except:
        pass
Example 9
    def PlugRunPluginsIntoApp(cls, app):
        sockets = Sockets(app)
        thread_pool = threadpool.ThreadPool(5)

        @app.route("/rekall/runplugin/cancel/<cell_id>", methods=["POST"])
        def cancel_execution(cell_id):  # pylint: disable=unused-variable
            worksheet = app.config["worksheet"]
            # Signal the worksheet to abort this cell.
            worksheet.aborted_cells.add(int(cell_id))

            return "OK", 200

        @sockets.route("/rekall/runplugin")
        def rekall_run_plugin_socket(ws):  # pylint: disable=unused-variable
            cell = json.loads(ws.receive())
            cell_id = cell["cell_id"]
            source = cell["source"]
            worksheet = app.config["worksheet"]

            # If the data is cached locally just return it.
            cache_key = GenerateCacheKey(source)
            cache = worksheet.GetData("%s.data" % cell_id)
            if cache and cache.get("cache_key") == cache_key:
                logging.debug("Dumping request from cache")
                ws.send(json.dumps(cache.get("data")))
                return

            kwargs = source.get("arguments", {})

            # Must provide the correct session to run this on.
            session_id = int(source.pop("session_id"))
            session = worksheet.session.find_session(session_id)

            output = cStringIO.StringIO()
            output_queue = Queue.Queue()
            renderer = WebConsoleRenderer(session=session,
                                          output=output,
                                          cell_id=cell_id,
                                          output_queue=output_queue,
                                          worksheet=worksheet)

            # Clear the interruption state of this cell.
            worksheet.aborted_cells.discard(cell_id)

            def RunPlugin():
                with renderer.start():
                    try:
                        session.RunPlugin(source["plugin"]["name"],
                                          renderer=renderer,
                                          **kwargs)

                    except Exception:
                        message = traceback.format_exc()
                        renderer.report_error(message)

            run_plugin_result = thread_pool.spawn(RunPlugin)

            sent_messages = []

            def HandleSentMessages():
                while (not run_plugin_result.ready()
                       or not output_queue.empty()):
                    while not output_queue.empty():
                        message = output_queue.get()
                        sent_messages.append(message)
                        ws.send(
                            json.dumps([message],
                                       cls=json_renderer.RobustEncoder))
                    run_plugin_result.wait(0.1)

            handle_messages_thread = gevent.spawn(HandleSentMessages)

            gevent.joinall([run_plugin_result, handle_messages_thread])

            # Cache the data in the worksheet.
            worksheet.StoreData("%s.data" % cell_id,
                                dict(cache_key=cache_key, data=sent_messages))