コード例 #1
0
ファイル: chrome_factory.py プロジェクト: pingf/netboy2
    def anaylse_it(self, responses):
        """Run every analyser listed in ``self.info['analysers']``.

        For each analyser the current one is recorded in
        ``self.info['analyser']``.  In ``'celery'`` mode (the default when
        ``self.info`` has no ``'mode'``) an ``analyser_task`` is queued with
        the info dict and the current responses; otherwise the analyser is
        loaded and called in-process, and a non-``None`` return value
        replaces ``responses`` for the following analysers.

        Any exception raised while dispatching or running an analyser is
        logged at critical level and the loop moves on.

        :param responses: the collected crawl responses.
        :return: the (possibly replaced) responses.
        """
        info = self.info
        for analyser in info.get('analysers') or ():
            info['analyser'] = analyser
            # here the payload is the info
            task_kwargs = {'payload': info, 'response': responses}
            try:
                if info.get('mode', 'celery') != 'celery':
                    # An analyser is either a loadable name or a dict that
                    # carries the name under 'analyser'.
                    if isinstance(analyser, str):
                        target = analyser
                    else:
                        target = analyser.get('analyser')
                    outcome = load(target)(responses)
                    if outcome is not None:
                        responses = outcome
                else:
                    queue_name = self.info.get('queue', 'worker')
                    App().app.send_task(
                        'netboy.celery.tasks.analyser_task',
                        kwargs=task_kwargs,
                        countdown=1,
                        queue=queue_name,
                        routing_key=queue_name)
            except Exception as err:
                self.log.critical('analyser failed: ' + str(err))

        return responses
コード例 #2
0
ファイル: tasks.py プロジェクト: pingf/netboy2
def analyser_task(payload, response):
    """Run the analyser named in ``payload['analyser']`` and bundle the outcome.

    ``payload['analyser']`` may be a loadable name (``str``) or a dict that
    carries the name under its own ``'analyser'`` key.  The loaded callable
    is invoked as ``analyser(payload, response)``.  When no usable analyser
    is configured (key missing, ``None``, or a dict without ``'analyser'``)
    nothing is run and only the inputs are echoed back.

    :param payload: dict describing the job; read for ``'analyser'``.
    :param response: the response(s) handed to the analyser.
    :return: dict with ``'payload'`` and ``'response'``, plus the analyser's
        return value when one ran.
    """
    result = {}
    analyser = payload.get('analyser')
    if isinstance(analyser, str):
        # NOTE(review): the string form stores its outcome under 'analyse'
        # while the dict form uses 'analyser'; kept as-is because callers
        # may read either key -- confirm which one is intended.
        result['analyse'] = load(analyser)(payload, response)
    elif isinstance(analyser, dict) and 'analyser' in analyser:
        result['analyser'] = load(analyser['analyser'])(payload, response)
    # Any other value (including a missing key) means there is nothing to
    # run; previously ``'analyser' in None`` raised TypeError here.
    result['payload'] = payload
    result['response'] = response
    return result
コード例 #3
0
ファイル: base_factory.py プロジェクト: pingf/netboy2
    def trigger_it(self, payload, response):
        """Fire every trigger listed in ``payload['triggers']``.

        ``'triggers'`` is removed from ``payload``.  Each entry is either a
        loadable name or a dict with the name under ``'trigger'`` (plus an
        optional ``'mode'``); string entries are normalised to the dict form
        and the current one is stored in ``payload['trigger']``.

        A trigger is queued as a ``trigger_task`` only when both the trigger
        and the payload have mode ``'celery'``; otherwise it is loaded and
        called as ``trigger(payload, response)``.  A truthy result whose
        ``'update'`` entry is truthy replaces ``response``.  Exceptions are
        logged at critical level and do not stop the remaining triggers.

        After the loop a truthy ``'update'`` flag on a dict response is reset
        to ``False``.

        :param payload: the request dict (mutated as described above).
        :param response: the response produced for that request.
        :return: the (possibly replaced) response.
        """
        triggers = payload.pop('triggers', None)
        if not triggers:
            return response

        for entry in triggers:
            spec = {'trigger': entry} if isinstance(entry, str) else entry
            payload['trigger'] = spec
            task_kwargs = {'payload': payload, 'response': response}
            try:
                use_celery = (spec.get('mode', 'sync') == 'celery'
                              and payload.get('mode') == 'celery')
                if use_celery:
                    queue_name = payload.get('queue')
                    App().app.send_task('netboy.celery.tasks.trigger_task',
                                        kwargs=task_kwargs, countdown=1,
                                        queue=queue_name,
                                        routing_key=queue_name)
                else:
                    outcome = load(spec.get('trigger'))(payload, response)
                    if outcome and outcome.get('update'):
                        response = outcome
            except Exception as err:
                self.log.critical('trigger failed: ' + str(err))

        if isinstance(response, dict) and response.get('update'):
            response['update'] = False
        return response
コード例 #4
0
ファイル: tasks.py プロジェクト: pingf/netboy2
def trigger_task(payload, response):
    """Run the trigger named in ``payload['trigger']`` and bundle the outcome.

    ``payload['trigger']`` may be a loadable name (``str``) or a dict that
    carries the name under its own ``'trigger'`` key.  The loaded callable
    is invoked as ``trigger(payload, response)`` and its return value is
    stored under ``'trigger'`` in the result.  When no usable trigger is
    configured (key missing, ``None``, or a dict without ``'trigger'``)
    nothing is run.

    The ``'data'`` entry is dropped from a dict ``response`` before it is
    placed in the result.

    :param payload: dict describing the job; read for ``'trigger'``.
    :param response: the response handed to the trigger (mutated: ``'data'``
        is removed when it is a dict).
    :return: dict with ``'payload'`` and ``'response'``, plus ``'trigger'``
        when a trigger ran.
    """
    result = {}
    trigger = payload.get('trigger')
    if isinstance(trigger, str):
        result['trigger'] = load(trigger)(payload, response)
    elif isinstance(trigger, dict) and 'trigger' in trigger:
        result['trigger'] = load(trigger['trigger'])(payload, response)
    # Any other value (including a missing key) means there is nothing to
    # run; previously ``'trigger' in None`` raised TypeError here.
    if isinstance(response, dict):
        response.pop('data', None)
    result['payload'] = payload
    result['response'] = response
    return result
コード例 #5
0
ファイル: tasks.py プロジェクト: pingf/netboy2
def final_callback(data, info):
    """Invoke the optional final hook named in ``info['final']``.

    :param data: value passed through to the hook as its first argument.
    :param info: dict read for ``'final'``; also passed to the hook.
    :return: the hook's return value, or ``None`` when no hook is configured
        or the loader yields a falsy object.
    """
    hook_name = info.get('final')
    if not hook_name:
        return None
    hook = load(hook_name)
    if not hook:
        return None
    return hook(data, info)
コード例 #6
0
ファイル: base_factory.py プロジェクト: pingf/netboy2
    async def run_core_async(self, spider, func, extra=None):
        """Crawl every entry of ``self.updated`` with the coroutine ``func``.

        Each entry first goes through ``self.prepare_it``.  A dict result
        with a truthy ``'skip'`` drops the entry; a truthy ``'cover'`` uses
        the prepared dict as the response without calling ``func``.

        Otherwise ``await func(entry, extra)`` is timed.  On success the
        elapsed time is stored in ``entry['time']``.  A ``None`` result or a
        raised exception is logged as a warning and replaced by an error
        response with code ``-2`` or ``-1`` respectively.  Every response is
        passed through ``self.trigger_it`` before it is collected, and the
        whole list is finally handed to ``self.anaylse_it``.

        :param spider: value reported under ``'spider'`` in error responses.
        :param func: coroutine function, or a name resolved with ``load``.
        :param extra: opaque second argument forwarded to ``func``.
        :return: list of collected responses.
        """
        if isinstance(func, str):
            func = load(func)

        def error_response(target, code, elapsed):
            # Shared shape of the placeholder returned for a failed crawl.
            return {
                'url': target,
                'effect': target,
                'data': '',
                'title': '',
                'spider': spider,
                'state': 'error',
                "code": code,
                "time": elapsed
            }

        collected = []
        print(self.updated, '.....')
        for item in self.updated:
            prepared = self.prepare_it(item)
            url = item.get('url') if isinstance(item, dict) else item

            if isinstance(prepared, dict):
                if prepared.get('skip'):
                    continue
                if prepared.get('cover'):
                    collected.append(self.trigger_it(item, prepared))
                    continue

            started = time.time()
            try:
                response = await func(item, extra)
                elapsed = '%s' % (time.time() - started)
                if response is None:
                    self.log.warning("failed! url: " + str(url))
                    response = error_response(url, -2, elapsed)
                else:
                    item['time'] = elapsed
                    self.log.info("success! url: " + str(url))
            except Exception as err:
                elapsed = '%s' % (time.time() - started)
                self.log.warning("failed! url: " + str(url) + ' errtype: '
                                 + str(type(err)) + ' errmsg: ' + str(err))
                response = error_response(url, -1, elapsed)
            collected.append(self.trigger_it(item, response))
        self.anaylse_it(collected)
        return collected
コード例 #7
0
ファイル: base_factory.py プロジェクト: pingf/netboy2
    def prepare_it(self, data):
        """Run every prepare hook listed in ``data['prepares']``.

        ``'prepares'`` is removed from ``data``.  Each entry is a loadable
        name or a dict with the name under ``'prepare'``; the current entry
        is stored in ``data['prepare']`` and the loaded hook is called with
        ``data``.  A non-``None`` result whose ``'update'`` entry is truthy
        replaces ``data`` for the following hooks.  Exceptions are logged at
        critical level and do not stop the remaining hooks.

        After the loop a truthy ``'update'`` flag on a dict ``data`` is reset
        to ``False``.

        :param data: the request dict (mutated as described above).
        :return: the (possibly replaced) request dict.
        """
        prepares = data.pop('prepares', None)
        if not prepares:
            return data

        for step in prepares:
            data['prepare'] = step
            try:
                target = step if isinstance(step, str) else step.get('prepare')
                outcome = load(target)(data)
                if outcome is not None and outcome.get('update'):
                    data = outcome
            except Exception as err:
                self.log.critical('prepare failed: ' + str(err) + ' type: ' + str(type(err)))

        if isinstance(data, dict) and data.get('update'):
            data['update'] = False
        return data
コード例 #8
0
def setup_curl(c):
    """Configure the pycurl handle ``c`` from the options in ``c.data``.

    ``c.data`` is read as a dict of request options (``url``/``effect``,
    ``method``, timeouts, proxy settings, cookies, ...).  The function:

    * attaches a fresh ``BytesIO`` body buffer as ``c.databuf`` and a header
      accumulator as ``c.headers``;
    * installs header/write callbacks (the built-in ones below, or callables
      resolved with ``load`` from ``headerfunction`` / ``writefunction``);
    * applies redirect, timeout, TLS, user-agent, method, proxy, cookie and
      HTTP-version options via ``c.setopt``.

    Nothing is returned; the handle is configured in place.
    """
    d = c.data
    databuf = BytesIO()
    # 'content' holds one dict per header block ({lower-cased name: [values]});
    # 'count' is the index of the block currently being filled.
    headers = {'count': 0, 'content': [{}]}
    # NOTE(review): filled by header_function but never attached to ``c`` or
    # returned within this function.
    set_cookies = []

    c.databuf = databuf
    c.headers = headers


    def header_function(header_line):
        """Record one raw header line (bytes) into ``headers``."""
        # NOTE(review): the match is case-sensitive, and '.' also matches a
        # trailing '\r', so captured values may keep the carriage return.
        match = re.match("^Set-Cookie: (.*)$", header_line.decode('utf8', 'ignore'))
        if match:
            set_cookies.append(match.group(1))
        count = headers['count']
        header_line = header_line.decode('iso-8859-1')
        if ':' not in header_line and not header_line.startswith('HTTP'):
            # print(header_line)
            # Neither a "name: value" line nor a status line.  When it
            # contains CRLF (presumably the blank line that ends a header
            # block) start a new block for whatever response follows.
            if '\r\n' in header_line:
                headers['count'] += 1
                headers['content'].append({})
            return
        # Break the header line into header name and value.
        # Lines without ':' reach here only when they start with 'HTTP'
        # (a status line); those are split at the first space instead.
        if ':' in header_line:
            name, value = header_line.rstrip('\r\n').split(':', 1)
        else:
            name, value = header_line.rstrip('\r\n').split(' ', 1)

        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()

        # Header names are case insensitive.
        # Lowercase name here.
        name = name.lower()

        # Now we can actually record the header name and value.
        # Repeated headers accumulate in a list under the same name.
        if name in headers['content'][count]:
            headers['content'][count][name].append(value)
        else:
            headers['content'][count][name] = [value]

    def write_function(buf):
        """Append a body chunk to ``databuf`` until the size cap is reached."""
        # The cap is checked before writing, so the buffer can exceed
        # 4096000 bytes by at most one chunk.
        size = databuf.getbuffer().nbytes
        if size < 4096000:
            databuf.write(buf)
            return len(buf)
        # Reporting fewer bytes than were supplied; per libcurl's write
        # callback contract this should abort the transfer -- TODO confirm.
        return 0

    # Basic transfer options; every value can be overridden through c.data.
    c.setopt(pycurl.FOLLOWLOCATION, d.get('followlocation', 1))
    c.setopt(pycurl.MAXREDIRS, d.get('maxredirs', 5))
    c.setopt(pycurl.CONNECTTIMEOUT, d.get('connecttimeout', 10))
    c.setopt(pycurl.TIMEOUT, d.get('timeout', 20))
    c.setopt(pycurl.NOSIGNAL, d.get('nosignal', 1))
    c.setopt(pycurl.USERAGENT, d.get('useragent', DEFAULT_USER_AGENT))
    # Certificate and host verification are off unless explicitly requested.
    c.setopt(pycurl.SSL_VERIFYPEER, d.get('ssl_verifypeer', 0))
    c.setopt(pycurl.SSL_VERIFYHOST, d.get('ssl_verifyhost', 0))

    # 'effect' takes precedence over 'url'; a target that does not start
    # with 'http' gets an 'http://' prefix.
    # NOTE(review): raises AttributeError when neither key is set.
    crawl_url = d.get('effect') or d.get('url')
    if not crawl_url.startswith('http'):
        crawl_url = 'http://' + crawl_url
    c.setopt(pycurl.URL, crawl_url.encode('utf-8'))
    # c.setopt(pycurl.URL, url)

    # Callbacks: use the closures above unless c.data names replacements.
    headerfunction = d.get('headerfunction')
    if headerfunction is None:
        c.setopt(pycurl.HEADERFUNCTION, header_function)
    else:
        c.setopt(pycurl.HEADERFUNCTION, load(headerfunction))
    writefunction = d.get('writefunction')
    if writefunction is None:
        c.setopt(pycurl.WRITEFUNCTION, write_function)
    else:
        c.setopt(pycurl.WRITEFUNCTION, load(writefunction))

    # Method-specific setup.  Recognised values are 'get', 'post' (JSON body)
    # and 'postform' (pre-encoded form body); any other value configures
    # nothing here.
    method = d.get('method', 'get')

    if method == 'get':
        httpheader = d.get('httpheader')
        if httpheader:
            c.setopt(c.HTTPHEADER, httpheader)
    elif method == 'post':
        httpheader = d.get('httpheader', ['Accept: application/json', "Content-type: application/json"])
        if httpheader:
            # curl.setopt(pycurl.HEADER, p.get('header', 1))
            c.setopt(pycurl.HTTPHEADER, httpheader)
        post301 = getattr(pycurl, 'POST301', None)
        if post301 is not None:
            # Added in libcurl 7.17.1.
            c.setopt(post301, True)
        c.setopt(pycurl.POST, 1)
        postfields = d.get('postfields')
        if postfields:
            # The body is serialised to JSON before being handed to curl.
            postfields = json.dumps(postfields)
            c.setopt(pycurl.POSTFIELDS, postfields)
    elif method == 'postform':
        httpheader = d.get('httpheader', ["Content-Type: application/x-www-form-urlencoded"])
        if httpheader:
            c.setopt(pycurl.HTTPHEADER, httpheader)
        post301 = getattr(pycurl, 'POST301', None)
        if post301 is not None:
            # Added in libcurl 7.17.1.
            c.setopt(post301, True)
        c.setopt(pycurl.POST, 1)
        # 'postform' is passed through unchanged (no encoding is done here).
        httppost = d.get('postform')
        if httppost:
            c.setopt(pycurl.POSTFIELDS, httppost)
    # Proxy settings; each option is applied only when present in c.data.
    proxy_type = d.get('proxytype')
    proxy = d.get('proxy')
    proxy_port = d.get('proxyport')
    proxy_userpwd = d.get('proxyuserpwd')
    if proxy:
        c.setopt(pycurl.PROXY, proxy)
    if proxy_port:
        c.setopt(pycurl.PROXYPORT, proxy_port)
    if proxy_userpwd:
        c.setopt(pycurl.PROXYUSERPWD, proxy_userpwd)
    if proxy_type:
        # Substring test: a type containing '4' selects SOCKS4a, one
        # containing '5' selects SOCKS5 with proxy-side hostname resolution,
        # anything else is treated as an HTTP proxy.
        # assumes proxy_type is a string -- TODO confirm
        if '4' in proxy_type:
            proxy_type = pycurl.PROXYTYPE_SOCKS4A
        elif '5' in proxy_type:
            proxy_type = pycurl.PROXYTYPE_SOCKS5_HOSTNAME
        else:
            proxy_type = pycurl.PROXYTYPE_HTTP
        c.setopt(pycurl.PROXYTYPE, proxy_type)

    verbose = d.get('verbose')
    if verbose:
        c.setopt(pycurl.VERBOSE, True)
    # with the line below, redirect cookie can update
    c.setopt(pycurl.COOKIEFILE, "")

    cookie = d.get('cookie')
    if cookie:
        c.setopt(pycurl.COOKIE, cookie)
    c.setopt(pycurl.FAILONERROR, True)

    # HTTP/1.1 is used only when requested as '1.1' or 1.1; every other
    # value, including an absent key, selects HTTP/1.0.
    http_version = d.get('http_version')
    if http_version == '1.1' or http_version == 1.1:
        c.setopt(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_1)
    else:
        c.setopt(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_0)
コード例 #9
0
ファイル: chrome_factory.py プロジェクト: pingf/netboy2
    def run(self):
        """Crawl every entry of ``self.updated`` with a headless Chrome driver.

        When ``self.driver`` is not set, a Chrome webdriver is created from
        the options in ``self.info`` (binary path, proxy, user agent,
        language, timeouts); a failure to start it is logged and re-raised.

        Each entry first goes through ``self.prepare_it``: a dict result with
        a truthy ``'skip'`` drops the entry, a truthy ``'cover'`` uses the
        prepared dict as the response without crawling.  Otherwise the entry
        is crawled with ``crawl``; on success the optional ``'interactive'``
        hook runs and the elapsed time is stored in the entry's ``'time'``.
        A ``None`` result or an exception yields an error response with code
        ``-2`` or ``-1``.  Every response passes through ``self.trigger_it``
        and, in ``'celery'`` mode, loses its ``'data'`` and ``'screen'``
        entries.  The collected list is handed to ``self.anaylse_it``.

        The driver is always quit and cleared when the loop ends.

        :return: list of collected responses.
        """
        page_timeout = self.info.get('timeout', 15)
        js_timeout = self.info.get('script_timeout', 15)
        wait_seconds = self.info.get('wait', 5)
        if not self.driver:
            binary = self.info.get('chrome',
                                   '/opt/google/chrome-beta/chrome')
            scheme = self.info.get('proxytype')
            agent = self.info.get('useragent', DEFAULT_USER_AGENT)

            languages = self.info.get('lang', 'zh,zh-CN')
            # Short proxy-type codes map to URL schemes; anything else is
            # used as given.
            if scheme == '5':
                scheme = 'socks5h'
            elif scheme == '4':
                scheme = 'socks4'
            host = self.info.get('proxy')
            port = self.info.get('proxyport')

            options = webdriver.ChromeOptions()
            options.binary_location = binary
            options.set_headless(headless=True)
            for flag in ("--dns-prefetch-disable", '--no-referrers'):
                options.add_argument(flag)

            if host and scheme and port:
                options.add_argument('--proxy-server=%s://%s:%d' %
                                     (scheme, host, port))

            if agent:
                options.add_argument("--user-agent=" + agent)

            for flag in ('--disable-gpu', '--disable-audio', '--no-sandbox',
                         '--ignore-certificate-errors',
                         '--allow-insecure-localhost'):
                options.add_argument(flag)
            if languages:
                options.add_experimental_option(
                    'prefs', {'intl.accept_languages': languages})

            capabilities = DesiredCapabilities.CHROME.copy()
            for cap in ('acceptSslCerts', 'acceptInsecureCerts'):
                capabilities[cap] = True
            try:
                self.driver = webdriver.Chrome(
                    chrome_options=options, desired_capabilities=capabilities)
            except Exception as err:
                self.log.warning('error: ' + str(err) + ' error_type: ' +
                                 str(type(err)))
                raise err
            self.driver.implicitly_wait(wait_seconds)
            self.driver.set_page_load_timeout(page_timeout)
            self.driver.set_script_timeout(js_timeout)

        def error_response(target, code, elapsed):
            # Shared shape of the placeholder returned for a failed crawl.
            return {
                'url': target,
                'effect': target,
                'data': '',
                'title': '',
                'spider': 'chrome',
                'state': 'error',
                "code": code,
                "time": elapsed
            }

        def slim(resp):
            # In celery mode the bulky fields are dropped from the response.
            if self.info.get('mode') == 'celery':
                resp.pop('data', None)
                resp.pop('screen', None)
            return resp

        collected = []
        try:
            for item in self.updated:
                prepared = self.prepare_it(item)

                if isinstance(prepared, dict):
                    if prepared.get('skip'):
                        continue
                    if prepared.get('cover'):
                        collected.append(
                            slim(self.trigger_it(item, prepared)))
                        continue

                started = time.time()
                url = item.get('url') if isinstance(item, dict) else item
                try:
                    response = crawl(self.driver, item)
                    if response is None:
                        elapsed = '%s' % (time.time() - started)
                        self.log.warning("failed! url: " + str(url))
                        response = error_response(url, -2, elapsed)
                    else:
                        hook = item.get('interactive')
                        if hook:
                            load(hook)(item, self.driver)
                        item['time'] = '%s' % (time.time() - started)
                        self.log.info("success! url: " + str(url) +
                                      ' effect: ' +
                                      str(self.driver.current_url))
                except Exception as err:
                    elapsed = '%s' % (time.time() - started)
                    self.log.warning("failed! url: " + str(url) +
                                     ' errtype: ' + str(type(err)) +
                                     ' errmsg: ' + str(err))
                    response = error_response(url, -1, elapsed)
                collected.append(slim(self.trigger_it(item, response)))
            self.anaylse_it(collected)
        finally:
            if self.driver:
                self.driver.quit()
                self.driver = None
        return collected
コード例 #10
0
ファイル: aiohttp_factory.py プロジェクト: pingf/netboy2
    async def fetch(self, data, session, json_response=False):
        """GET ``data['url']`` through ``session`` and collect selected fields.

        :param data: a URL string, or a dict with ``'url'`` and optional
            ``'filter'`` (list of fields to collect; defaults to
            ``['data', 'title', 'code']``), ``'charset'`` and ``'stream'``.
            ``'stream'`` is a dict with ``'func'`` (a callable, or a name
            resolved with ``load``), optional ``'chunk'`` size (default 512)
            and optional ``'file'`` path.
        :param session: object whose ``get(url)`` returns an async context
            manager yielding the response (e.g. an aiohttp client session).
        :param json_response: when true the body is parsed with
            ``response.json()`` instead of being read as text.
        :return: dict with ``'url'`` plus every requested field among
            ``data``, ``charset``, ``title``, ``headers``, ``cookies``,
            ``effect``, ``code`` and ``method``; in streaming mode a
            ``'stream'`` entry with the chunk size and the number of reads
            (the terminating empty read is counted) replaces the body fields.
        """
        if isinstance(data, str):
            data = {
                'url': data,
            }
        url = data.get('url')
        wanted = data.get('filter') or ['data', 'title', 'code']

        results = {'url': url}
        async with session.get(url) as response:
            stream = data.get('stream')
            if isinstance(stream, dict):
                stream_func = stream.get('func')
                if isinstance(stream_func, str):
                    stream_func = load(stream_func)
                stream_chunk = stream.get('chunk', 512)
                stream_file = stream.get('file')
                count = 0
                if stream_file:
                    # With a target file the callback also receives the open
                    # file object and decides what to write.
                    with open(stream_file, 'wb') as fd:
                        while True:
                            chunk = await response.content.read(stream_chunk)
                            count += 1
                            if not chunk:
                                break
                            stream_func(chunk, data, fd)
                else:
                    while True:
                        # Fixed: this used the undefined name ``resp``.
                        chunk = await response.content.read(stream_chunk)
                        count += 1
                        if not chunk:
                            break
                        stream_func(chunk, data)
                results['stream'] = {'chunk': stream_chunk, 'count': count}
            elif 'data' in wanted or 'charset' in wanted or 'title' in wanted:
                charset = data.get('charset')
                if json_response is False:
                    if charset:
                        # NOTE(review): the body is decoded as UTF-8 whatever
                        # the value of 'charset' -- confirm this is intended.
                        raw = await response.read()
                        content = raw.decode('utf8', errors='ignore')
                    else:
                        content = await response.text()
                else:
                    content = await response.json()

                # A parsed JSON body is not text; only search strings for a
                # <title> element (re.search raised TypeError otherwise).
                title = ''
                if isinstance(content, str):
                    match = re.search(r'<title[^>]*>([^<]+)</title>', content,
                                      re.IGNORECASE)
                    if match:
                        title = match.group(1)

                if 'data' in wanted:
                    results['data'] = content
                if 'charset' in wanted:
                    results['charset'] = charset
                if 'title' in wanted:
                    results['title'] = title
            if 'headers' in wanted:
                results['headers'] = dict(response.headers.items())
            if 'cookies' in wanted:
                results['cookies'] = dict(response.cookies.items())
            if 'effect' in wanted:
                results['effect'] = response.real_url
            if 'code' in wanted:
                results['code'] = response.status
            if 'method' in wanted:
                results['method'] = response.method
            return results