Code Example #1
    def crawl_info(self, data, payload, begin_time):
        end_time = datetime.datetime.now()
        elapsed = (end_time - begin_time).total_seconds()
        tid = payload.get('id')
        url = payload.get('url')
        if isinstance(data, dict):
            data['time'] = elapsed
            if tid:
                data['id'] = tid
            data['spider'] = 'chrome'
            data['url'] = url
            data['payload'] = payload
            # data['chrome_id'] = payload.get('chrome_id')
        else:
            data = {
                'time': elapsed,
                'spider': 'chrome',
                'url': url,
            }

        post_func = payload.get('post_func')
        if isinstance(post_func, str):
            post_func = func.load(post_func)
        if post_func:
            data2 = post_func(payload, data)
            if isinstance(data2, dict) and len(data2) >= len(data):
                data = data2
        return data
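
A post_func hook receives (payload, data) and returns the result dict; crawl_info adopts the hook's return value only when it is a dict at least as large as the current one. A minimal sketch of such a hook (the name and the added field are hypothetical):

def my_post(payload, data):
    # Hypothetical hook: tag the result and return it so crawl_info keeps it.
    data['handled_by'] = 'my_post'
    return data
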
Code Example #2
File: request.py Project: zhuangyan/falsy
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            charset = None
            # Response headers are collected per hop in headers['content'];
            # the top-level dict only has 'count' and 'content', so look at
            # the last non-empty header block for the charset.
            last_headers = next(
                (h for h in reversed(headers['content']) if h), {})
            ct = last_headers.get('content-type')
            if ct:
                content_type = (ct[0] if isinstance(ct, list) else ct).lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    charset = match.group(1)
                    print('Decoding using %s' % charset)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                charset = 'utf-8'
            else:
                if charset is None:
                    dammit = UnicodeDammit(
                        body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                        smart_quotes_to="html")
                    data = dammit.unicode_markup
                    charset = dammit.original_encoding
                else:
                    data = body.decode(charset, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup_lxml = BeautifulSoup(data, 'lxml')
            soup_html = BeautifulSoup(data, 'html.parser')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': get_title(soup_lxml),
                'links': get_links(soup_lxml),
                'links2': get_links2(soup_lxml),
                'metas': get_metas(soup_lxml),
                'images': get_images(soup_lxml),
                'scripts': get_scripts(soup_lxml),
                'text': get_text(soup_html),
                'data': data,
                'headers': headers,
                'charset': charset,
                'spider': 'pycurl',
                'payload': payload,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close()
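
Everything get_request needs arrives in the payload dict. A minimal call, run inside a coroutine (the URL, timeout, and dotted post_func path below are placeholders):

async def demo():
    payload = {
        'url': 'http://example.com/',     # page to fetch
        'aiohttp_timeout': 30,            # overall guard; defaults to 60
        'post_func': 'mymodule.my_post',  # hypothetical dotted path for load()
    }
    resp = await get_request(payload)
    print(resp['title'], resp['charset'], len(resp['links']))
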
Code Example #3
File: decorator.py Project: zhuangyan/falsy
    def __exit__(self, e_type, e_value, e_trace):
        if e_type is None or e_value is None:
            return
        to = self.__dict__.get('to')
        if to:
            exceptions = self.__dict__.get('exceptions', ())
            if issubclass(e_type, exceptions):
                propagate = func.load(to)(e_type, e_value, e_trace)
                if propagate in [True, False]:
                    return propagate
Code Example #4
File: decorator.py Project: abeusher/falsy
    def __exit__(self, e_type, e_value, e_trace):
        on_error = self.__dict__.get('on_error')
        if on_error:
            propagate = func.load(on_error)(e_type, e_value, e_trace)
            if propagate in [True, False]:
                return propagate

        for supp in self.__dict__.get('suppress_list', []):
            if isinstance(e_value, supp):
                return True
        return False
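
In both decorator excerpts the handler named by to / on_error is resolved with func.load and called with the standard __exit__ triple; returning True suppresses the exception, returning False propagates it, and any other value falls through. A hypothetical handler:

def log_and_suppress(e_type, e_value, e_trace):
    # Hypothetical handler: report the exception, then suppress it.
    print('caught %s: %s' % (e_type.__name__, e_value))
    return True
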
Code Example #5
File: request.py Project: koshikraj/falsy
async def get_request(payload):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            encoding = None
            # Response headers are collected per hop in headers['content'];
            # the top-level dict only has 'count' and 'content', so look at
            # the last non-empty header block for the charset.
            last_headers = next(
                (h for h in reversed(headers['content']) if h), {})
            ct = last_headers.get('content-type')
            if ct:
                content_type = (ct[0] if isinstance(ct, list) else ct).lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)
                    print('Decoding using %s' % encoding)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                encoding = 'utf-8'
            else:
                if encoding is None:
                    dammit = UnicodeDammit(
                        body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                        smart_quotes_to="html")
                    data = dammit.unicode_markup
                    encoding = dammit.original_encoding
                else:
                    data = body.decode(encoding, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup = BeautifulSoup(data, 'lxml')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': str(soup.title.get_text()) if soup.title else '',
                'links': [str(link) for link in soup.find_all('a', href=True)],
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close()
Code Example #6
def load(id, args, error_handler=None):
    if args and error_handler:
        return func.load(id).s(args).on_error(func.load(error_handler).s())
    if args and not error_handler:
        return func.load(id).s(args)
    if not args and error_handler:
        return func.load(id).s().on_error(func.load(error_handler).s())
    return func.load(id).s()
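
load composes Celery signatures: .s(args) binds the single positional argument and .on_error(...) attaches an error callback. A sketch of a call, with hypothetical dotted task ids for func.load:

sig = load('tasks.fetch', {'url': 'http://example.com/'},
           error_handler='tasks.report_failure')
# Equivalent to:
#   func.load('tasks.fetch').s({'url': ...}).on_error(func.load('tasks.report_failure').s())
sig.delay()  # enqueue, assuming a configured Celery app
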
Code Example #7
File: request.py Project: zhuangyan/falsy
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            body = data_buf.getvalue()
            encoding = 'utf-8'
            data = body.decode(encoding, 'ignore') if len(body) > 0 else ''

            headers['content'] = [h for h in headers['content'] if len(h) > 0]

            resp.update({
                # 'url': payload.get('url'),
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if isinstance(post_func, str):
                post_func = load(post_func)
            if post_func:
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close()
Code Example #8
def loads(payload):
    if payload.get('type') != 'normal':
        raise Exception('celery task loader only supports normal mode')
    tasks = payload.get('tasks', [])
    cts = []
    for task in tasks:
        ops = [
            load(id, task.get('args'), task.get('on_error'))
            if i == 0 else load(id, None, task.get('on_error'))
            for i, id in enumerate(task['ids'])
        ]
        cts.append(chain(ops))
    callback = payload.get('callback')
    if callback:
        return chord(header=group(cts), body=func.load(callback).s())
    return group(cts)
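
loads expects a payload shaped as below: each task entry is a chain of task ids in which only the first link receives args, and an optional callback turns the group into a chord. All ids here are hypothetical:

payload = {
    'type': 'normal',
    'tasks': [{
        'ids': ['tasks.fetch', 'tasks.parse'],   # chained: fetch -> parse
        'args': {'url': 'http://example.com/'},  # passed to the first link only
        'on_error': 'tasks.report_failure',
    }],
    'callback': 'tasks.aggregate',  # optional; makes the group a chord
}
result = loads(payload).delay()
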
Code Example #9
    def load_handler(self, name):
        if name is None:
            return None
        return load(name)
Code Example #10
File: utils.py Project: Duroktar/falsy
def setup_curl_basic(c, p, data_buf, headers=None):
    def header_function(header_line):
        count = headers['count']
        header_line = header_line.decode('iso-8859-1')

        if ':' not in header_line and not header_line.startswith('HTTP'):
            # print(header_line)
            if '\r\n' in header_line:
                headers['count'] += 1
                headers['content'].append({})
            return

        # Break the header line into header name and value.
        if ':' in header_line:
            name, value = header_line.rstrip('\r\n').split(':', 1)
        else:
            name, value = header_line.rstrip('\r\n').split(' ', 1)

        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()

        # Header names are case insensitive.
        # Lowercase name here.
        name = name.lower()

        # Now we can actually record the header name and value.
        if name in headers['content'][count]:
            headers['content'][count][name].append(value)
        else:
            headers['content'][count][name] = [value]

    def write_function(buf):
        # Buffer at most ~4 MB of body; returning 0 instead of len(buf)
        # tells pycurl to abort the transfer with a write error.
        size = data_buf.getbuffer().nbytes
        if size < 4096000:
            data_buf.write(buf)
            return len(buf)
        return 0

    url = p.get('url')
    c._raw_url = url
    c._raw_id = p.get('id', str(uuid.uuid1()))
    c._raw_post_func = p.get('post_func')
    c._raw_payload = p
    c.setopt(pycurl.URL, url.encode('utf-8'))
    c.setopt(pycurl.FOLLOWLOCATION, p.get('followlocation', 1))
    c.setopt(pycurl.MAXREDIRS, p.get('maxredirs', 5))

    # c.setopt(pycurl.WRITEHEADER, header_buf)
    headerfunction = p.get('headerfunction')
    if headerfunction is None:
        c.setopt(pycurl.HEADERFUNCTION, header_function)
    else:
        c.setopt(pycurl.HEADERFUNCTION, load(headerfunction))
    writefunction = p.get('writefunction')
    if writefunction is None:
        c.setopt(pycurl.WRITEFUNCTION, write_function)
    else:
        c.setopt(pycurl.WRITEFUNCTION, load(writefunction))
    c.setopt(pycurl.USERAGENT, p.get('useragent', DEFAULT_USER_AGENT))
    c.setopt(pycurl.SSL_VERIFYPEER, p.get('ssl_verifypeer', 0))
    c.setopt(pycurl.SSL_VERIFYHOST, p.get('ssl_verifyhost', 0))
    c.setopt(pycurl.NOSIGNAL, p.get('nosignal', 1))
    c.setopt(pycurl.CONNECTTIMEOUT, p.get('connecttimeout', 7))
    c.setopt(pycurl.TIMEOUT, p.get('timeout', 15))
    c.setopt(pycurl.DNS_CACHE_TIMEOUT, p.get('dns_cache_timeout', 360))
    c.setopt(pycurl.DNS_USE_GLOBAL_CACHE, p.get('dns_use_global_cache', 1))
    c.setopt(pycurl.TCP_NODELAY, p.get('tcp_nodelay', 1))
    c.setopt(pycurl.IPRESOLVE, p.get('ipresolve', pycurl.IPRESOLVE_V4))
    c.setopt(pycurl.ENCODING, p.get('encoding', 'gzip, deflate'))

    c.setopt(pycurl.HTTP_VERSION,
             p.get('http_version', pycurl.CURL_HTTP_VERSION_1_0))
    c.setopt(pycurl.FORBID_REUSE, p.get('forbid_reuse', 1))
    c.setopt(pycurl.FRESH_CONNECT, p.get('fresh_connect', 1))
    c.setopt(c.AUTOREFERER, p.get('autoreferer', 1))

    referer = p.get('referer')
    if referer:
        c.setopt(c.REFERER, referer)
    cookiejar = p.get('cookiejar')
    if cookiejar:
        print('cookiejar', cookiejar)
        c.setopt(c.COOKIEJAR, cookiejar)
    cookiefile = p.get('cookiefile')
    if cookiefile:
        print('cookiefile', cookiefile)
        c.setopt(c.COOKIEFILE, cookiefile)

    dns_servers = p.get('dns_servers')
    if dns_servers:
        c.setopt(c.DNS_SERVERS, dns_servers)

    debug = p.get('debugfunction')
    if debug:
        c.setopt(pycurl.DEBUGFUNCTION, load(debug))
    c.setopt(pycurl.VERBOSE, p.get('verbose', 0))

    proxy = p.get('proxy')
    proxyport = p.get('proxyport')
    proxytype = p.get('proxytype')
    proxyuserpwd = p.get('proxyuserpwd')
    if proxy and proxyport and proxytype:
        c.setopt(pycurl.PROXY, proxy)
        c.setopt(pycurl.PROXYPORT, proxyport)
        c.setopt(pycurl.PROXYTYPE, proxytype)
        if proxyuserpwd:
            c.setopt(pycurl.PROXYUSERPWD, proxyuserpwd)
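
Every tunable in setup_curl_basic is read from the payload p with a default, so a caller only supplies the keys it wants to override. An example p exercising the cookie and proxy branches (all values are placeholders):

p = {
    'url': 'http://example.com/',
    'timeout': 30,                       # overrides the 15 s default
    'cookiejar': '/tmp/cookies.txt',     # where libcurl writes cookies
    'cookiefile': '/tmp/cookies.txt',    # where libcurl reads cookies
    'proxy': '127.0.0.1',
    'proxyport': 8080,
    'proxytype': pycurl.PROXYTYPE_HTTP,  # all three proxy keys must be set
}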