Code Example #1
def get_page(url, options={}):
    ua = UserAgent()
    # Build default headers with a random User-Agent, then merge in any caller overrides
    base_headers = {
        'User-Agent': ua.random(),
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    headers = dict(base_headers, **options)
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        print('Crawling Failed', url)
        return None
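The excerpt above calls requests and UserAgent without showing its imports. A minimal sketch of what it appears to assume; note that the except clause only catches requests failures if ConnectionError is the one from requests.exceptions:

# Assumed imports for Code Example #1 (not shown in the original excerpt)
import requests
from requests.exceptions import ConnectionError
from my_fake_useragent import UserAgent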
Code Example #2
File: cram_news.py Project: xunfeng191/-StockTrading
def gen_news():
    ua = UserAgent()
    user_agent = ua.random()

    referer = 'https://tushare.pro/login?next=%2Fnews%2Fnews_sina'

    headers = {
        'User-Agent': user_agent,
        'Host': 'tushare.pro',
        'Origin': 'https://tushare.pro',
        'Referer': referer
    }

    stockPageRequest = request.urlopen('http://finance.eastmoney.com/news/cdfsd.html')
    htmlTitleContent = str(stockPageRequest.read(), 'utf-8')
    # Regex patterns for pulling headline titles out of the page source;
    # p_title is the pattern actually applied below.
    titlePattern = re.compile('<span class="l3 a3">title="(.*?)"</span>', re.S)
    p_title = 'title="(.*?)"(.*?)'
    title = re.findall(p_title, htmlTitleContent)
    # Keep only titles that start with '【' (str.find() returns 0 for a match at index 0)
    title = [t[0] for t in title if not t[0].find('【')]

    news = []
    for t in title:
        a = t.find('【')
        b = t.find('】')
        news.append({'title': t[a+1:b], 'content': t[b+1:]})
    # news = News.objects.all()
    return news
Code Example #3
def test():
    ua = UserAgent(family='chrome', os_family='linux')

    for i in range(100):
        res = ua.random()

        print(res)
Code Example #4
 def _build_chrome_options(self, headless=True, random_user=False):
     chrome_options = Options()
     chrome_options.add_argument("--disable-notifications")
     chrome_options.add_argument("--verbose")
     chrome_options.add_argument("--window-size=1920x1080")
     chrome_options.add_argument("--no-sandbox")
     chrome_options.add_experimental_option(
         "prefs",
         {
             "download.default_directory": self.download_dir,
             "download.prompt_for_download": False,
             "download.directory_upgrade": True,
             "safebrowsing_for_trusted_sources_enabled": False,
             "safebrowsing.enabled": False,
         },
     )
     chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-software-rasterizer")
     if headless:
         chrome_options.add_argument("--headless")
     if random_user:
         ua = UserAgent(family="chrome")
         randomua = ua.random()
         chrome_options.add_argument(f"user-agent={randomua}")
     return chrome_options
Code Example #5
def set_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    ua = UserAgent(family='chrome')
    randomua = ua.random()
    chrome_options.add_argument(f'user-agent={randomua}')
    print(randomua)
    return chrome_options
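Code Examples #4 and #5 both return a configured ChromeOptions object but do not show how it is consumed. A minimal, hypothetical usage sketch with Selenium; the target URL and the presence of a compatible chromedriver on PATH are assumptions, not part of the original snippets:

# Hypothetical usage of the option builders above (not in the original code)
from selenium import webdriver

chrome_options = set_options()
driver = webdriver.Chrome(options=chrome_options)  # assumes chromedriver is on PATH
driver.get('https://example.com')  # placeholder URL
driver.quit()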
Code Example #6
    def youtube_scrapper(self, query, number_results=2):
        "Function to scrape results from Youtube Search"
        query = urllib.parse.quote_plus(query)  # Format into URL encoding
        ua = UserAgent(family='chrome')
        assert isinstance(query, str)  #Search term must be a string
        assert isinstance(number_results,
                          int)  #Number of results must be an integer
        escaped_search_term = query.replace(' ', '+')
        google_url = "https://www.google.com/search?q={}&num={}".format(
            query + "+site:youtube.com", 1)
        #print(google_url)
        # Pass the random User-Agent as a header (the original passed it as query params)
        response = requests.get(google_url, headers={"User-Agent": ua.random()})
        soup = BeautifulSoup(response.text, "html.parser")
        result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
        self.Links = []
        self.Titles = []
        for r in result_div:
            # Checks if each element is present, else, raise exception
            try:
                link = r.find('a', href=True)
                title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()

                # Check to make sure everything is present before appending
                if link != '' and title != '':
                    self.Links.append(link['href'])
                    self.Titles.append(title)
                    if (len(self.Links) == number_results):
                        break
            # Next loop if one element is not present
            except:
                continue

        for i in range(0, len(self.Links)):
            self.Links[i] = self.Links[i].replace("/url?q=", "")
        for i in range(0, len(self.Links)):
            if (self.Links[i].find("watch") != -1):
                self.Links[i] = self.Links[i].replace("%3F", "?")
                self.Links[i] = self.Links[i].replace("%3D", "=")
                self.Links[i] = self.Links[i].split("&")[0]
            else:
                continue
        if (len(self.Links) == 0):
            return
        else:
            for i in range(0, len(self.Links)):
                d = dict()
                d["title"] = self.Titles[i]
                d["linktopage"] = self.Links[i]
                self.youtube_result.append(d)
Code Example #7
class DownloadImg():
    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        # Download a single image and save it to saved_path
        header = {
            "User-Agent": "{}".format(self.ua.random().strip()),
            'Connection': 'close'}
        r = requests.get(img_url, headers=header, stream=True)
        print("请求图片状态码 {}".format(r.status_code))  # 返回状态码
        if r.status_code == 200:  # 写入图片
            with open(saved_path, mode="wb") as f:
                f.write(r.content)
            print("download {} success!".format(saved_path))
        del r
        return saved_path
Code Example #8
File: test.py Project: hiddenxx/Scripts
import requests
from my_fake_useragent import UserAgent

URL = 'https://pastr.io/login'
client = requests.Session()

ua = UserAgent()
print(ua.random())
header = {'User-Agent': str(ua.random())}

login_payload = {
    "email": "*****@*****.**",
    "password": "******",
    "remember": False,
}

r = client.post(URL, data=login_payload, headers=header)
print(r)
Code Example #9
File: utils.py Project: ypanaberdi/indeed-python-api
def getRandomUserAgent():
    ua = UserAgent()
    return ua.random()
Code Example #10
File: blle.py Project: liyye/mysite
import re
import requests
from lxml import etree
from my_fake_useragent import UserAgent
import MySQLdb
conn = MySQLdb.connect(host='127.0.0.1',
                       port=3306,
                       user='******',
                       passwd='123',
                       db='yunyun',
                       charset='utf8')
cursor = conn.cursor()

a = UserAgent()
p = a.random()
headers = {
    'User-Agent': p,
    # 'cookie': '__cfduid=dce1ed34975ff71acb9b22d4959d0263b1563521810; ASP.NET_SessionId=1oj0zvk0wttwcudymxjeftpt; UM_distinctid=16c0928d2b2448-03463007e150d9-e343166-144000-16c0928d2b32f6; CNZZDATA1255263807=653621382-1563520703-%7C1563520703; ViewHistory_4=1oj0zvk0wttwcudymxjeftpt; .ynzpauth=869D169A9273686FE3F281194E66EAF796DA177B8799BC0686C9AFD983575676620178F545B8CC60F7FEAA6886B258DF06E4D0E13BBE33ABBA3DCF46FB3A659EE847BBE2696F2256B15111D8D1BDD642178E9567CF7161BDEA9BC44159707D7DF2F8D7D349B8397F87AA820265CC36F284BFECA0EF6E38D76411703DA70E1B5EB03806C9211CD2EC6C800D8E4E9CC840A8734ACC7E31910E493DCF0B2D859E27; viewedResume=2088560%2C1515707%2C727002%2C1218946%2C1623681%2C2131167%2C2121066'
}

for i in range(2957, 10000):
    url = 'http://www.bole.com.cn/resume/resume-show.php?id=' + str(i) + ''
    # print(url)
    try:
        with requests.session() as s:
            a = s.get(url, headers=headers)
            pr = a.text
            # print(pr)
            pattern = re.compile('<div class="personal_info_item">(.*?)</div>')
            rev1 = pattern.findall(pr)
            # print(rev1)
Code Example #11
from bs4 import BeautifulSoup
import requests
import csv
from my_fake_useragent import UserAgent

# Mimic the access to the website like a browser
ua = UserAgent(family='chrome')
BrowserUserAgent = ua.random()

# Define URL and Requests object
f = csv.writer(open('drug-names.csv', 'w'))
f.writerow(['Name'])
pages = []
headers = {'User-Agent': BrowserUserAgent}

firstAlphaNumeric = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9', ''
]
# secondAlphaNumeric = firstAlphaNumeric
finalList = []

for first in firstAlphaNumeric:
    for second in firstAlphaNumeric:
        url = 'https://www.drugs.com/alpha/' + str(first) + str(
            second) + '.html'
        pages.append(url)

for item in pages:
    page = requests.get(item, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
Code Example #12
class IpPool:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random()}
        # Proxy IP API endpoint
        self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
        # Redis connection
        self.redi = redis.Redis(host='127.0.0.1',
                                port=6379,
                                db=0,
                                decode_responses=True,
                                password='******')
        # Counter for failed API requests
        self.count = 0

    # Fetch a proxy IP from the API
    def get_ip(self):
        try:
            res = requests.get(url=self.ipurl,
                               headers=self.headers,
                               timeout=10)
            print(res.status_code)
            print(
                '获取时间:{}'.format(
                    str(
                        time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(int(time.time()))))),
                res.text)
            if res.status_code != 200:
                self.count += 1
            else:
                self.count -= 1
            # Data returned by the API looks like:
            # {"code":0,"data":[{"ip":"223.241.61.18","port":"4336"}],"msg":"0","success":true}
            json_obj = res.json()
            if res.status_code == 200 and json_obj['data'][0]:
                if self.proxyip(json_obj['data'][0]['ip']):
                    return json_obj['data'][0]
                    # return {'ip': '127.0.0.1', 'port': '1234'}
        except:
            self.count += 1

    # Store an IP in Redis
    def set_ip(self, ip):
        print('存入:', ip)
        self.redi.lpush('ip:iplist', json.dumps(ip))

    # Check that an IP is reachable
    def test_ip(self, item):
        item = json.loads(item)
        try:
            telnetlib.Telnet(item['ip'], port=item['port'], timeout=10)
        except:
            return False
        else:
            return True

    def proxyip(self, ip):
        url = 'https://iphunter.net/ip/{}'.format(ip)
        headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        e = etree.HTML(res.text)
        data = ''.join(e.xpath('/html/body/article/script[3]/text()'))
        if '代理' not in data and '爬虫' not in data:
            return True
        else:
            return False

    # Engine: keep the Redis pool topped up with working proxies
    def engine(self):
        while True:
            if self.redi.llen('ip:iplist') >= 19:
                for item in self.redi.lrange('ip:iplist', 0, -1):
                    print(
                        '检测时间:{}'.format(
                            str(
                                time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime(int(
                                                  time.time()))))), item)
                    if item is None:
                        print(None)
                        # Remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # Top up with a fresh IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
                    if not self.test_ip(item):
                        print(self.test_ip(item))
                        # Remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # Top up with a fresh IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            else:
                for i in range(20):
                    time.sleep(2)
                    if self.redi.llen('ip:iplist') <= 20:
                        print('ip数量小于20')
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            time.sleep(30)

    # Pick a random proxy address for clients
    def random_ip(self):
        try:
            iplist = self.redi.lrange('ip:iplist', 0, -1)
        except:
            iplist = []
        if iplist:
            while True:
                ip = random.choice(iplist)
                if ip:
                    ip = json.loads(ip)
                    # ip_info = '183.166.164.209:4370'
                    ip_info = ip['ip'] + ':' + ip['port']
                    proxies = {'https': ip_info}
                    return ip_info
                    # proxies = {'https': '119.5.74.242:4385'}
        else:
            return None

    # Entry point: record the process id in Redis, then start the engine
    def run(self):
        pid = str(os.getpid())
        self.redi.set('pid:ip_pool', pid)
        self.engine()
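The IpPool class above references several modules that the excerpt never imports. A minimal sketch of the imports it appears to assume:

# Assumed imports for Code Example #12 (not shown in the original excerpt)
import json
import os
import random
import telnetlib
import time

import redis
import requests
from lxml import etree
from my_fake_useragent import UserAgent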
Code Example #13
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Cedar
# @Date  : 2021/3/22
# @Desc  :

from my_fake_useragent import UserAgent


ua = UserAgent(phone=True)
print(ua.random())
Code Example #14
    def crawl(self):
        ua = UserAgent()
        headers = {'User-Agent': '{}'.format(ua.random())}
        print(self.spider_name, 'now crawling', self.url_key)

        try:
            raw_contents = requests.get(self.url, headers=headers).text
            match_pattern = r'<td(.*?)</td>'
            level_1_soup_list = re.findall(match_pattern, raw_contents,
                                           re.S | re.M)
            level_2_soup_list = []
            for level_1_soup in level_1_soup_list:
                level_2_soup = level_1_soup.split('>')[1]
                level_2_soup_list.append(level_2_soup)

            project_name = level_2_soup_list[1]
            project_number = level_2_soup_list[3]
            project_intro = level_2_soup_list[5]
            project_link = level_2_soup_list[7].split('\"')[
                1]  # Special Design
            project_purpose = level_2_soup_list[9]
            project_size = level_2_soup_list[11]
            project_duration = level_2_soup_list[13]
            project_apr = level_2_soup_list[15]
            project_repay_start = level_2_soup_list[17]
            project_repay_method = level_2_soup_list[19].strip(
            )  # Special Design
            project_repay_details = level_2_soup_list[21]
            project_status = level_2_soup_list[23].strip()  # Special Design
            project_raise_start = level_2_soup_list[25]
            project_guarantee = level_2_soup_list[27]
            project_repay_source = level_2_soup_list[29]
            project_risk = level_2_soup_list[31]
            project_expense = level_2_soup_list[33]
            project_template_number = level_2_soup_list[35]
            project_lender_notice = level_2_soup_list[37]
            project_borrower_type = level_2_soup_list[39].strip(
            )  # Special Design
            project_borrower_name = level_2_soup_list[43]
            project_document_type = level_2_soup_list[45].strip(
            )  # Special Design
            project_document_number = level_2_soup_list[47]
            project_borrower_job = level_2_soup_list[49]
            project_borrower_other_info = level_2_soup_list[51]
            project_borrower_credit = level_2_soup_list[53]
            project_borrower_default_times = level_2_soup_list[55]
            project_borrower_default_amounts = level_2_soup_list[57]
            project_borrower_income_and_debt = level_2_soup_list[59]

            self.list_of_attribute = [
                self.url_key, project_name, project_number, project_intro,
                project_link, project_purpose, project_size, project_duration,
                project_apr, project_repay_start, project_repay_method,
                project_repay_details, project_status, project_raise_start,
                project_guarantee, project_repay_source, project_risk,
                project_expense, project_template_number,
                project_lender_notice, project_borrower_type,
                project_borrower_name, project_document_type,
                project_document_number, project_borrower_job,
                project_borrower_other_info, project_borrower_credit,
                project_borrower_default_times,
                project_borrower_default_amounts,
                project_borrower_income_and_debt
            ]

            print(self.spider_name, 'has finished the crawling from',
                  self.url_key)

        except:
            project_name = "FAIL"
            project_number = "FAIL"
            project_intro = "FAIL"
            project_link = "FAIL"
            project_purpose = "FAIL"
            project_size = "FAIL"
            project_duration = "FAIL"
            project_apr = "FAIL"
            project_repay_start = "FAIL"
            project_repay_method = "FAIL"
            project_repay_details = "FAIL"
            project_status = "FAIL"
            project_raise_start = "FAIL"
            project_guarantee = "FAIL"
            project_repay_source = "FAIL"
            project_risk = "FAIL"
            project_expense = "FAIL"
            project_template_number = "FAIL"
            project_lender_notice = "FAIL"
            project_borrower_type = "FAIL"
            project_borrower_name = "FAIL"
            project_document_type = "FAIL"
            project_document_number = "FAIL"
            project_borrower_job = "FAIL"
            project_borrower_other_info = "FAIL"
            project_borrower_credit = "FAIL"
            project_borrower_default_times = "FAIL"
            project_borrower_default_amounts = "FAIL"
            project_borrower_income_and_debt = "FAIL"

            self.list_of_attribute = [
                "FAIL", project_name, project_number, project_intro,
                project_link, project_purpose, project_size, project_duration,
                project_apr, project_repay_start, project_repay_method,
                project_repay_details, project_status, project_raise_start,
                project_guarantee, project_repay_source, project_risk,
                project_expense, project_template_number,
                project_lender_notice, project_borrower_type,
                project_borrower_name, project_document_type,
                project_document_number, project_borrower_job,
                project_borrower_other_info, project_borrower_credit,
                project_borrower_default_times,
                project_borrower_default_amounts,
                project_borrower_income_and_debt
            ]
            print(self.spider_name, "has failed and gives", self.url_key,
                  "to another spider")
Code Example #15
import requests
from my_fake_useragent import UserAgent
import json
from pymongo import MongoClient
from pyquery import PyQuery as pq
import random
import time

ua = UserAgent()
headers = {"User-Agent": ua.random()}

client = MongoClient(host="localhost", port=27017)
collection = client["发改委"]['辽宁1']


def parse_detail(html, url):
    ret = {}
    doc = pq(html)
    ret['url'] = url
    ret['title'] = doc(".news-content-main h1").text()
    ret['sourceTime'] = doc(".news-info").text()
    ret['content'] = doc('#ContTextSize').text()
    ret['contentUrl'] = doc("#ContTextSize a").attr("href")
    print(ret)
    collection.insert_one(ret)


def parse_index(html):
    doc = pq(html)
    items = doc(".mod-body2 ul li").items()
    for item in items:
Code Example #16
File: crawl.py Project: eternalstop/spider
 def __init__(self):
     self.proxies = []  # proxy list
     ua = UserAgent()  # use a random User-Agent
     self.headers = {"User-Agent": ua.random()}
Code Example #17
def get_request_headers():
    ua = UserAgent()

    return {"User-Agent": ua.random()}