Esempio n. 1
0
def fix_javascript(url, content):
    """Work around anti-crawler JavaScript tricks on exchange sites.

    (Original note: "中南文交所" anti-crawler shenanigans; assumes node.js
    is installed, since execjs needs a JS runtime.)

    :param url: the URL the response came from; used to pick the workaround
    :param content: raw response body (bytes)
    :return: the original or re-fetched response body (bytes)
    """
    import execjs
    try:
        if 'znypjy' in url:
            text = content.decode('gb18030', 'ignore')
            # Extract the inline decoder function injected before the
            # window.location redirect and run it under node via execjs.
            m = re.compile(r'(function.*?;})window.location').search(text)
            if m:
                script = m.group(1)
                code = execjs.compile(script).call('decoder')
                content = session.get(
                    url + '?' + code, timeout=(5, 10)).content
        elif 'xhcae' in url:
            text = content.decode('gb18030', 'ignore')
            # WebShield embeds a session-verify redirect path in the page;
            # follow it to obtain the real content.
            m = re.compile(
                r'/notice/\w+/\?WebShieldSessionVerify=\w+').search(text)
            if m:
                url = m.group(0)
                content = session.get(
                    'http://www.xhcae.com' + url, timeout=(5, 10)).content
    except Exception:
        # Best effort: log the failure and fall back to the original bytes.
        log.exception('')
    return content
Esempio n. 2
0
def fix_javascript(url, content):
    """Work around anti-crawler JavaScript tricks on exchange sites.

    (Original note: "中南文交所" anti-crawler shenanigans; assumes node.js
    is installed, since execjs needs a JS runtime.)

    :param url: the URL the response came from; used to pick the workaround
    :param content: raw response body (bytes)
    :return: the original or re-fetched response body (bytes)
    """
    import execjs
    try:
        if 'znypjy' in url:
            text = content.decode('gb18030', 'ignore')
            # Extract the inline decoder function injected before the
            # window.location redirect and run it under node via execjs.
            m = re.compile(r'(function.*?;})window.location').search(text)
            if m:
                script = m.group(1)
                code = execjs.compile(script).call('decoder')
                content = session.get(url + '?' + code,
                                      timeout=(5, 10)).content
        elif 'xhcae' in url:
            text = content.decode('gb18030', 'ignore')
            # WebShield embeds a session-verify redirect path in the page;
            # follow it to obtain the real content.
            m = re.compile(r'/notice/\w+/\?WebShieldSessionVerify=\w+').search(
                text)
            if m:
                url = m.group(0)
                content = session.get('http://www.xhcae.com' + url,
                                      timeout=(5, 10)).content
    except Exception:
        # Best effort: log the failure and fall back to the original bytes.
        log.exception('')
    return content
Esempio n. 3
0
def crawl_all():
    """Crawl the first page of every configured site, retrying on failure.

    Each site gets up to 5 attempts; a failed attempt is logged with the
    remaining retry count so one broken site never aborts the whole run.
    """
    for site in SITES:
        retries = 5
        while retries > 0:
            retries -= 1
            try:
                crawl(site, maxpage=1)
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C / SystemExit
                # still stop the crawler; everything else is logged.
                log.exception('站点{}爬取失败, retries={}'.format(site, retries))
            else:
                # Success: move on to the next site.
                break
Esempio n. 4
0
def crawl_all():
    """Crawl the first page of every configured site, retrying on failure.

    Each site gets up to 5 attempts; a failed attempt is logged with the
    remaining retry count so one broken site never aborts the whole run.
    """
    for site in SITES:
        retries = 5
        while retries > 0:
            retries -= 1
            try:
                crawl(site, maxpage=1)
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C / SystemExit
                # still stop the crawler; everything else is logged.
                log.exception('站点{}爬取失败, retries={}'.format(site, retries))
            else:
                # Success: move on to the next site.
                break
Esempio n. 5
0
File: cli.py Progetto: maocis/ybk
 def __exit__(self, type, value, traceback):
     """Context-manager exit: log any in-context exception, delete the
     marker file, and suppress the exception.

     Returning True unconditionally means *every* exception raised inside
     the `with` block is swallowed; the only trace is the log entry.
     NOTE(review): `type` shadows the builtin of the same name, but the
     parameter order is fixed by the __exit__ protocol.
     """
     # `value` is the exception instance (or None); any exception instance
     # is truthy, so this branch runs exactly when an error occurred.
     if value:
         crawl_log.exception('出错啦')
     # Remove the file regardless of success or failure — presumably a
     # pathlib.Path lock/marker created in __enter__; confirm against the
     # rest of cli.py.
     path.unlink()
     return True
     return True