Example #1
import time
import urllib.error
import urllib.request

def validate_ip(proxy):
    headers = {
        'User-Agent': get_random_useragent()
    }
    # Proxy settings: route HTTP traffic through the candidate proxy
    proxy_handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)

    # URL used to test the proxy
    validate_url = 'https://www.baidu.com'
    req = urllib.request.Request(url=validate_url, headers=headers)
    # Throttle before sending the request
    time.sleep(4)

    # Check the result
    try:
        res = urllib.request.urlopen(req)
        # Throttle before reading the response
        time.sleep(2)
        content = res.read()
        # Record working proxies to a file
        if content and res.status == 200:
            print('%s is ok' % proxy)
            write('./proxy.txt', proxy)
        else:
            # Proxy did not pass the check
            print('%s is not ok' % proxy)
    except urllib.error.URLError as e:
        print('%s error %s' % (proxy, e.reason))
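
All of these examples lean on small helpers defined elsewhere in the project, chiefly get_random_useragent() and the write() used above. Their real bodies aren't shown anywhere in this listing; the following is only a minimal sketch of what they plausibly look like (the user-agent pool and the append-mode write are assumptions):

import random

USER_AGENTS = [
    # Assumed pool; the real project likely carries a longer list.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]

def get_random_useragent():
    """Pick a User-Agent at random so requests look less uniform."""
    return random.choice(USER_AGENTS)

def write(path, line):
    """Append one line to a file (sketch of the helper used by validate_ip)."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
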
def main():
    target_url = 'https://www.jianshu.com/c/yD9GAd?utm_medium=index-collections&utm_source=desktop'
    server_url = 'https://www.jianshu.com'

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.jianshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }

    # Warm up a session first so jianshu's cookies carry over to later requests
    session = requests.session()
    session.get('https://www.jianshu.com', headers=headers)
    print(session.cookies.get_dict())

    resp = session.get(target_url, headers=headers)
    print(resp.cookies.get_dict())

    # Map article titles to their relative URLs, then fetch and save each article
    title_dict = get_title_url(resp.text)
    for title, url in title_dict.items():
        r = session.get(server_url + url, headers=headers)
        print(r.cookies.get_dict())

        content = get_content(r.text)
        with open('./' + title + '.txt', 'w', encoding='utf-8') as fw:
            fw.write(content)
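
get_title_url and get_content are project helpers whose bodies aren't shown. A plausible shape, assuming the usual BeautifulSoup approach (the CSS selectors here are guesses, not jianshu's confirmed markup):

from bs4 import BeautifulSoup

def get_title_url(html):
    """Map article title -> relative URL. Selector is an assumption."""
    soup = BeautifulSoup(html, 'html.parser')
    return {a.get_text(strip=True): a['href']
            for a in soup.select('a.title')}

def get_content(html):
    """Extract the article body as plain text. Selector is an assumption."""
    soup = BeautifulSoup(html, 'html.parser')
    node = soup.select_one('div.show-content')
    return node.get_text('\n', strip=True) if node else ''
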
Example #3
def download(url, referer):
    try:
        headers = {'User-Agent': get_random_useragent(), 'Referer': referer}
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp
        else:
            return None
    except Exception as e:
        print(e)
        return None
def main():
    headers = {
        'User-Agent': get_random_useragent(),
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    }
    target_url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%88%90%E9%83%BD&needAddtionalResult=false'

    # Get the session cookie before hitting the Ajax endpoint
    session = requests.session()
    session.get('https://www.lagou.com', headers=headers)

    # Headers lagou expects on the Ajax request ('X-Anit-Forge-*' is the
    # site's own spelling, kept as-is)
    headers['Referer'] = 'https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='
    headers['X-Anit-Forge-Code'] = '0'
    headers['X-Anit-Forge-Token'] = 'None'
    headers['X-Requested-With'] = 'XMLHttpRequest'
    positions = []
    for x in range(1, 8):
        # pn is the page number, kd the search keyword
        data = {
            'first': 'true',
            'pn': x,
            'kd': 'python'
        }
        resp = session.post(target_url, headers=headers, data=data)
        json_str = resp.json()
        page_position = json_str['content']['positionResult']['result']
        for position in page_position:
            position_dict = {
                'positionId': position['positionId'],
                'positionName': position['positionName'],
                'positionDesc': crawl_positiondesc(session, position['positionId']),
                'education': position['education'],
                'city': position['city'],
                'createTime': position['createTime'],
                'companyShortName': position['companyShortName'],
                'financeStage': position['financeStage'],
                'salary': position['salary'],
                'industryField': position['industryField'],
                'district': position['district'],
                'positionAdvantage': position['positionAdvantage'],
                'companySize': position['companySize'],
                'companyLabelList': position['companyLabelList'],
                'workYear': position['workYear'],
                'positionLables': position['positionLables'],
                'companyFullName': position['companyFullName'],
                'firstType': position['firstType'],
                'secondType': position['secondType'],
                'subwayline': position['subwayline'],
                'stationname': position['stationname']
            }
            positions.append(position_dict)
        # print(positions)
        time.sleep(10)  # throttle between pages to avoid being blocked
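
main() accumulates everything in positions but never persists or returns it. A minimal way to keep the data, assuming JSON output is acceptable (the ./positions.json path is an arbitrary choice, not from the original project):

import json

# At the end of main(), after the page loop:
with open('./positions.json', 'w', encoding='utf-8') as f:
    json.dump(positions, f, ensure_ascii=False, indent=2)
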
def crawl_positiondesc(session, p_id):
    url = 'https://www.lagou.com/jobs/%s.html' % p_id
    headers = {
        'User-Agent': get_random_useragent(),
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1'
    }

    resp = session.get(url=url, headers=headers)
    # requests falls back to iso-8859-1 when no charset is declared, so
    # re-decode the body as UTF-8; return it too, since main() stores it
    # as the positionDesc field
    text = resp.text.encode('iso-8859-1').decode('utf-8')
    print(text)
    return text
    def download(self, url):
        try:
            headers = {
                'User-Agent': get_random_useragent(),
            }
            resp = requests.get(url, headers=headers)
            if resp.status_code == 200:
                return resp.text
            else:
                return None
        except Exception as e:
            print(e)
            return None
Example #7
    def download(self, url):
        """
        Download the page at the given url.
        :param url: url
        :return: html string
        """
        try:
            headers = {'User-Agent': get_random_useragent()}
            resp = requests.get(url, headers=headers)
            return resp.text
        except Exception as e:
            print(e)
            return None
    def download(self, url, referer):
        headers = {
            'Host': 'www.mmjpg.com',
            'Referer': referer,
            'User-Agent': get_random_useragent()
        }

        try:
            resp = requests.get(url=url, headers=headers)
            if resp.status_code == 200:
                return resp.text
            return None
        except Exception as e:
            print(e)
            return None
Example #9
def download(url):
    headers = {
        'Host': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/',
        'User-Agent': get_random_useragent()
    }

    try:
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except Exception as e:
        print(e)
        return None
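
Example #9 fetches xicidaili's free-proxy listing, and Example #1's validate_ip is its natural consumer. A rough pipeline sketch; the ip:port regex is an assumption about the page text (the real page is an HTML table, so a proper parser would be more robust):

import re

html = download('http://www.xicidaili.com/nn/')
if html:
    # Assumed markup: adjacent <td>ip</td><td>port</td> cells
    for ip, port in re.findall(r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)', html):
        validate_ip('%s:%s' % (ip, port))
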
Example #10
from requests.exceptions import RequestException

def downloader(url):
    """Download the page."""
    headers = {
        'Host': 'maoyan.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    try:
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException as e:
        print('Request for %s failed' % url)
        print(e)
        return None
Example #11
def main():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.jianshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    session = requests.session()
    # The first visit seeds the session's cookie jar
    session.get('https://www.jianshu.com', headers=headers)
    print(session.cookies.get_dict())
    # Later requests reuse those cookies automatically; resp.cookies only
    # shows cookies newly set by each individual response
    resp1 = session.get('https://www.jianshu.com/p/cc3754e1a761', headers=headers)
    resp2 = session.get('https://www.jianshu.com/p/1f9663d82b58', headers=headers)
    print(resp1.cookies.get_dict())
    print(resp2.cookies.get_dict())
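
The point of Example #11 is that a requests.Session persists cookies across calls, while a bare requests.get starts from an empty jar each time. A self-contained illustration, using httpbin.org as a stand-in endpoint rather than jianshu:

import requests

session = requests.session()
session.get('https://httpbin.org/cookies/set/demo/1')
print(session.cookies.get_dict())                          # {'demo': '1'}
print(session.get('https://httpbin.org/cookies').json())   # server sees the cookie

print(requests.get('https://httpbin.org/cookies').json())  # bare call: no cookie sent
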
Example #12
    def download(self, session, url, referer=None):
        print(session.cookies.get_dict())
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.qiushibaike.com',
            'User-Agent': get_random_useragent()
        }

        if referer:
            headers['Referer'] = referer

        try:
            resp = session.get(url, headers=headers)
            return resp.text
        except Exception as e:
            print('Request failed! Crawl failed!', e)
            return None
def downloader(url):
    """
    Downloader function: fetch the page the url points to.
    :param url: url
    :return: the downloaded page as a string
    """
    headers = {
        'Host': 'www.weather.com.cn',
        'Referer': 'http://www.weather.com.cn/forecast/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }

    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            # requests falls back to iso-8859-1 when no charset is declared,
            # so re-decode the body as the UTF-8 it actually is
            return resp.text.encode('iso-8859-1').decode('utf-8')
    except Exception as e:
        print(e)
    return None
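
The encode('iso-8859-1').decode('utf-8') round-trip here (and in crawl_positiondesc above) works, but requests also lets you set the encoding explicitly before reading .text. A short sketch, reusing the headers from downloader:

resp = requests.get('http://www.weather.com.cn/forecast/', headers=headers)
resp.encoding = 'utf-8'   # or resp.apparent_encoding to auto-detect
text = resp.text          # now decoded as UTF-8 directly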