Example #1
 def _build_chrome_options(self, headless=True, random_user=False):
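     # Build a Selenium ChromeOptions object: silent downloads into self.download_dir,
     # optional headless mode, and (if random_user) a random Chrome user agent.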
     chrome_options = Options()
     chrome_options.add_argument("--disable-notifications")
     chrome_options.add_argument("--verbose")
     chrome_options.add_argument("--window-size=1920x1080")
     chrome_options.add_argument("--no-sandbox")
     chrome_options.add_experimental_option(
         "prefs",
         {
             "download.default_directory": self.download_dir,
             "download.prompt_for_download": False,
             "download.directory_upgrade": True,
             "safebrowsing_for_trusted_sources_enabled": False,
             "safebrowsing.enabled": False,
         },
     )
     chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-software-rasterizer")
     if headless:
         chrome_options.add_argument("--headless")
     if random_user:
         ua = UserAgent(family="chrome")
         randomua = ua.random()
         chrome_options.add_argument(f"user-agent={randomua}")
     return chrome_options
Example #2
def gen_news():
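    # Scrape headline titles from the eastmoney news page and split each
    # '【title】content' string into a title/content dict.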
    ua = UserAgent()
    user_agent = ua.random()

    referer = 'https://tushare.pro/login?next=%2Fnews%2Fnews_sina'

    headers = {
        'User-Agent': user_agent,
        'Host': 'tushare.pro',
        'Origin': 'https://tushare.pro',
        'Referer': referer
    }

    stockPageRequest = request.urlopen('http://finance.eastmoney.com/news/cdfsd.html')
    htmlTitleContent = str(stockPageRequest.read(), 'utf-8')
    # regex-match the headline titles (pull every title="..." attribute from the page source)
    p_title = 'title="(.*?)"(.*?)'
    title = re.findall(p_title, htmlTitleContent)
    # keep only titles that start with '【' (str.find returns 0 in that case)
    title = [t[0] for t in title if not t[0].find('【')]

    news = []
    for t in title:
        a = t.find('【')
        b = t.find('】')
        news.append({'title': t[a+1:b], 'content': t[b+1:]})
    # news = News.objects.all()
    return news
Example #3
def test():
    ua = UserAgent(family='chrome', os_family='linux')
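    # draw and print 100 random Chrome-on-Linux user agent strings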

    for i in range(100):
        res = ua.random()

        print(res)
Example #4
def get_page(url, options={}):
    try:
        ua = UserAgent()
    except Exception:
        ua = None
    try:
        base_headers = {
            'User-Agent': ua.random(),
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
    except Exception:
        # fall back to a fixed UA string if UserAgent() could not be created
        base_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
    headers = dict(base_headers, **options)
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        print('Crawling Failed', url)
        return None
Example #5
def set_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    ua = UserAgent(family='chrome')
    randomua = ua.random()
    chrome_options.add_argument(f'user-agent={randomua}')
    print(randomua)
    return chrome_options
Example #6
 def __init__(self):
     self.ua = UserAgent()
     self.headers = {'User-Agent': self.ua.random()}
     # proxy IP API
     self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
     # Redis connection
     self.redi = redis.Redis(host='127.0.0.1',
                             port=6379,
                             db=0,
                             decode_responses=True,
                             password='******')
     # counter for failed API requests
     self.count = 0
Example #7
    def youtube_scrapper(self, query, number_results=2):
        "Function to scrape results from Youtube Search"
        query = urllib.parse.quote_plus(query)  # Format into URL encoding
        ua = UserAgent(family='chrome')
        assert isinstance(query, str)  #Search term must be a string
        assert isinstance(number_results,
                          int)  #Number of results must be an integer
        escaped_search_term = query.replace(' ', '+')
        google_url = "https://www.google.com/search?q={}&num={}".format(
            query + "+site:youtube.com", 1)
        #print(google_url)
        response = requests.get(google_url, headers={"User-Agent": ua.random()})
        soup = BeautifulSoup(response.text, "html.parser")
        result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
        self.Links = []
        self.Titles = []
        for r in result_div:
            # Checks if each element is present, else, raise exception
            try:
                link = r.find('a', href=True)
                title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()

                # Check to make sure everything is present before appending
                if link != '' and title != '':
                    self.Links.append(link['href'])
                    self.Titles.append(title)
                    if (len(self.Links) == number_results):
                        break
            # Next loop if one element is not present
            except:
                continue

        for i in range(0, len(self.Links)):
            self.Links[i] = self.Links[i].replace("/url?q=", "")
        for i in range(0, len(self.Links)):
            if (self.Links[i].find("watch") != -1):
                self.Links[i] = self.Links[i].replace("%3F", "?")
                self.Links[i] = self.Links[i].replace("%3D", "=")
                self.Links[i] = self.Links[i].split("&")[0]
            else:
                continue
        if (len(self.Links) == 0):
            return
        else:
            for i in range(0, len(self.Links)):
                d = dict()
                d["title"] = self.Titles[i]
                d["linktopage"] = self.Links[i]
                self.youtube_result.append(d)
Example #8
def spider(url):
    response = requests.get(url, headers={'User-Agent': UserAgent().random()})  # random UA to get past anti-crawling checks
    res = response.content
    html = str(res, 'utf-8')  # decode the HTML page
    html_tree = bs(html, 'lxml')
    # grab the content under the class="wz_content" divs
    html_text = html_tree.find_all("div", class_="wz_content")
    All_text = []
    for text in html_text:
        one_text = []
        text_url = text.find('a')['href']  # link of the current article
        text_title = text.find('h3')  # title
        text_cout = text.find("span", class_="count")
        # skip http://youxian.cnki links: they do not open / have no data (probably require login); roughly 1 in 20
        if re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url) or re.match(r'http://cdmd.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url):
            # visit the article's own page to collect the remaining fields
            text_all = datespider(text_url)
            one_text.append(text_title.get_text().replace('\xa0', '').replace('\n', ''))  # article title
            one_text.append(text_cout.get_text().replace('\xa0', '').replace('\n', '').replace('下载次数', '').replace('被引次数', '').replace('(', '').replace(')', ''))  # download/citation counts
            for item in text_all:  # append the info returned by datespider: author, institution, degree, classification number
                one_text.append(item.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '').replace('年', ''))
            one_text.append(text_url)  # append the article link

            All_text.append(one_text)
    return All_text
Example #9
class DoubanDownloaderMiddleware(object):

    def __init__(self):
        self.count = 0

        '''headless mode'''
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=options)
        self.driver.implicitly_wait(3)
        self.driver.get('https://www.douban.com')

        '''switch to the login iframe'''
        frame = self.driver.find_element_by_xpath('//body//div[@class="login"]/iframe')
        self.driver.switch_to.frame(frame)

        try:
            self.driver.find_element_by_xpath('//body/div[1]/div[1]/ul[1]/li[2]').click()
            time.sleep(0.5)
            self.driver.find_element_by_xpath('//input[@id="username"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//input[@id="password"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//div[@class="account-form-field-submit "]').click()
            '''wait for the login response'''
            time.sleep(1)

        # if the elements were not found, try once more
        except NoSuchElementException as e:
            print('Loading again: %s' % e)
            self.driver.find_element_by_xpath('//body/div[1]/div[1]/ul[1]/li[2]').click()
            time.sleep(0.5)
            self.driver.find_element_by_xpath('//input[@id="username"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//input[@id="password"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//div[@class="account-form-field-submit "]').click()
            time.sleep(1)

        else:
            print('Successful login!')

    def process_request(self, request, spider):
        '''
        Special flag: only runs once, used for the login.
        Pagination was planned but never worked, so it is fine to leave it at that.
        '''
        self.count += 1
        if self.count <= 1:
            return HtmlResponse(url=request.url, status=200, request=request,
                                encoding='utf-8', body=self.driver.page_source)

        # add a User-Agent, using the third-party library my_fake_useragent
        else:
            ua = UserAgent(family='chrome', os_family='Windows')
            res = ua.random()
            request.headers['User-Agent'] = res

            '''A random proxy IP could also be added below; define the PROXIES pool in settings first'''

            '''
Example #10
def random_header():
    ua = UserAgent()
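    # interpolate a random UA string into a JSON header template and parse it into a dict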
    random_header = json.loads(r'''{
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.dogforum.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent":"%s"
    }''' % ua.random())
    return random_header
Example #11
 def __init__(self):
     self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
     self.q = Queue()  # queue that holds all URL addresses
     self.i = 0
     self.id_list = []  # list that will hold all category ids
     # open the output file
     self.f = open('xiaomi.csv', 'a', newline="")
     self.writer = csv.writer(self.f)
     self.lock = Lock()  # create a lock
     self.ua = UserAgent()
Example #12
def getListProxies():
    ip_list = []
    session = requests.session()
    headers = {'User-Agent': UserAgent().random()}
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        ip_list.append(proxy)
    return ip_list
Example #13
 def __init__(self):
     self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
     self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"
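     # static request headers for the JSON record-list endpoint and for the HTML detail pages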
     self.headers = {
         "Accept": "application/json, text/javascript, */*",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
         "Connection": "keep-alive",
         "Content-Length": "169",
         "Content-Type": "application/x-www-form-urlencoded",
         "Cookie": "insert_cookie=32151754",
         "Host": "wzzxbs.mofcom.gov.cn",
         "Origin": "http://wzzxbs.mofcom.gov.cn",
         "Referer":
         "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
         "X-Requested-With": "XMLHttpRequest"
     }
     self.detail_headers = {
         "Accept":
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding":
         "gzip, deflate",
         "Accept-Language":
         "zh-CN,zh;q=0.9,en;q=0.8",
         "Cache-Control":
         "max-age=0",
         "Connection":
         "keep-alive",
         "Cookie":
         "insert_cookie=32151754",
         "Host":
         "wzzxbs.mofcom.gov.cn",
         "Upgrade-Insecure-Requests":
         "1",
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
     }
     self.data = {
         "params.entpName": "",
         "page.currentPage": "",
         "page.limit": "2000",
         "page.option": "next",
         "page.start": "",
         "page.rowCount": "",
         "listGrid.col": "1:showRecordInfo(0),2,3,4",
         "listGrid.type": "link,ro,ro,ro"
     }
     self.detail_data = {"params.recordId": "", "time": ""}
     self.util = Util()
     self.user_agent = UserAgent()
Example #14
def test_login():
    """
    仅作返回cookies
    :return:
    """
    data = {"username": "******", "password": 123456}
    url = "http://pre-admin.mofangcar.com/cms/login"
    headers = {"User-Agent": UserAgent().random()}
    # "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    res = requests.post(url, data=data, headers=headers, verify=False)
    print(res.json())
    # print(res.text)
    # print(res.cookies['JSESSIONID'])  # some APIs store the cookie info in cookies, others put it in the returned data
    return res.cookies
Example #15
def postRequestListUrl(name,url,data,code,params):
    if name in listPageRequestHooks:
        if "headers" in listPageRequestHooks[name]:
            headers = listPageRequestHooks[name]['headers']()
        if "cookies" in listPageRequestHooks[name]:
            cookies = listPageRequestHooks[name]['cookies']()
        if "url" in listPageRequestHooks[name]:
            url = listPageRequestHooks[name]['url']()
        if "params" in listPageRequestHooks[name]:
            params = listPageRequestHooks[name]['params']()
        if "timeout" in listPageRequestHooks[name]:
            timeout = listPageRequestHooks[name]['timeout']()
        if "data" in listPageRequestHooks[name]:
            data = listPageRequestHooks[name]['data']()
    result = post(url,
                  headers=headers if "headers" in locals() else {"User-Agent": UserAgent().random()},
                  timeout=timeout if "timeout" in locals() else 60,
                  params=params if "params" in locals() else None,
                  data=data,
                  cookies=cookies if "cookies" in locals() else None,
                  code=code)
    return result
Example #16
class DownloadImg():
    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        # download a single image
        header = {
            "User-Agent": "{}".format(self.ua.random().strip()),
            'Connection': 'close'}
        r = requests.get(img_url, headers=header, stream=True)
        print("请求图片状态码 {}".format(r.status_code))  # 返回状态码
        if r.status_code == 200:  # 写入图片
            with open(saved_path, mode="wb") as f:
                f.write(r.content)
            print("download {} success!".format(saved_path))
        del r
        return saved_path
Example #17
def datespider(date_url):
    # set a UserAgent to get past anti-crawling checks
    response_try = requests.get(date_url, headers={'User-Agent': UserAgent().random()})
    # parse with BeautifulSoup
    response_tree = bs(response_try.text, 'html.parser')
    if response_tree is None:
        return []
    else:
        # match the needed fields at their locations
        res_date = response_tree.find("font", {"color": "#0080ff"})
        res_name = response_tree.find("div", {"style": "text-align:center; width:740px; height:30px;"})
        res_msg = response_tree.find("div", {"style": "text-align:left;"})

        # date
        if res_date is None:
            response_date = None
        else:
            response_date = res_date.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
        # author
        if res_name is None:
            response_name = None
        else:
            response_name = res_name.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
        # other info
        if res_msg is None:
            response_point = []
        else:
            # strip the unwanted parts
            response_msg = res_msg.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t','')\
                .replace('】', '').replace('学位授予单位:', '').replace('学位级别:', '').replace('作者单位:', '').replace('学位授予年份:','').replace('分类号:', '')
            # split response_msg on '【' into the response_point list
            response_point = response_msg.split("【")
        # build the result list and return it
        response_All = []
        response_All.append(response_date)
        response_All.append(response_name)
        # concatenate the lists
        for item in range(1, len(response_point)):
            response_All.append(response_point[item])

        return response_All
Example #18
import logging
import threading
from collections import defaultdict
from enum import Enum
from urllib.parse import urlsplit, urljoin, urldefrag
from datetime import datetime

import requests
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup, SoupStrainer
from my_fake_useragent import UserAgent
from my_python_module.cache_utils import cachedb, func_cache
from my_python_module.datetime_utils import get_timestamp, get_dt_fromtimestamp
from my_python_module.pathlib import mkdirs

logger = logging.getLogger(__name__)

ua = UserAgent(family=['chrome', 'firefox'])
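# module-level UserAgent shared by the crawler; random() draws from Chrome and Firefox UA strings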


class URLType(Enum):
    """
    refUrl: except for an Absolute URL, every other URL type needs the refUrl
    of the article it appears in to be resolved into an absolute URL.
    """
    Absolute = 1
    # 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'
    MissScheme = 2
    # '//www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm' (needs refUrl)
    RelativeSite = 3
    # '/htbooks/nmr/chap-10/chap-10.htm' (needs refUrl)
    RelativeFolder = 4
    # 'chap-10.html' (needs refUrl)
    RelativeArticle = 5
Example #19
import re
import requests
from lxml import etree
from my_fake_useragent import UserAgent
import MySQLdb
conn = MySQLdb.connect(host='127.0.0.1',
                       port=3306,
                       user='******',
                       passwd='123',
                       db='yunyun',
                       charset='utf8')
cursor = conn.cursor()

a = UserAgent()
p = a.random()
headers = {
    'User-Agent': p,
    # 'cookie': '__cfduid=dce1ed34975ff71acb9b22d4959d0263b1563521810; ASP.NET_SessionId=1oj0zvk0wttwcudymxjeftpt; UM_distinctid=16c0928d2b2448-03463007e150d9-e343166-144000-16c0928d2b32f6; CNZZDATA1255263807=653621382-1563520703-%7C1563520703; ViewHistory_4=1oj0zvk0wttwcudymxjeftpt; .ynzpauth=869D169A9273686FE3F281194E66EAF796DA177B8799BC0686C9AFD983575676620178F545B8CC60F7FEAA6886B258DF06E4D0E13BBE33ABBA3DCF46FB3A659EE847BBE2696F2256B15111D8D1BDD642178E9567CF7161BDEA9BC44159707D7DF2F8D7D349B8397F87AA820265CC36F284BFECA0EF6E38D76411703DA70E1B5EB03806C9211CD2EC6C800D8E4E9CC840A8734ACC7E31910E493DCF0B2D859E27; viewedResume=2088560%2C1515707%2C727002%2C1218946%2C1623681%2C2131167%2C2121066'
}

for i in range(2957, 10000):
    url = 'http://www.bole.com.cn/resume/resume-show.php?id=' + str(i) + ''
    # print(url)
    try:
        with requests.session() as s:
            a = s.get(url, headers=headers)
            pr = a.text
            # print(pr)
            pattern = re.compile('<div class="personal_info_item">(.*?)</div>')
            rev1 = pattern.findall(pr)
            # print(rev1)
Example #20
import time
import random
import requests
from my_fake_useragent import UserAgent

from policy_crawl.common.logger import errorlog
headers = {"User-Agent": UserAgent().random()}


def get(url,
        params=None,
        headers=headers,
        code="utf-8",
        timeout=160,
        **kwargs):
    res = requests.get(url,
                       params=params,
                       headers=headers,
                       timeout=timeout,
                       **kwargs)
    if res.status_code in [200, 201, 301]:
        return res.content.decode(code)
    else:
        errorlog.logger.error("bad status_code for url: %s, status_code: %s" %
                              (url, res.status_code))
        raise ConnectionError("no connection")


def post(url, data=None, headers=headers, code="utf-8", timeout=160, **kwargs):
    res = requests.post(url,
                        data=data,
Example #21
 def __init__(self):
     self.headers = {'User-Agent': UserAgent().random()}
     print(self.headers)
Example #22
class IpPool:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random()}
        # proxy IP API
        self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
        # Redis connection
        self.redi = redis.Redis(host='127.0.0.1',
                                port=6379,
                                db=0,
                                decode_responses=True,
                                password='******')
        # counter for failed API requests
        self.count = 0

    # fetch a proxy IP
    def get_ip(self):
        try:
            res = requests.get(url=self.ipurl,
                               headers=self.headers,
                               timeout=10)
            print(res.status_code)
            print(
                'fetched at: {}'.format(
                    str(
                        time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(int(time.time()))))),
                res.text)
            if res.status_code != 200:
                self.count += 1
            else:
                self.count -= 1
            # API response payload, e.g.:
            # {"code":0,"data":[{"ip":"223.241.61.18","port":"4336"}],"msg":"0","success":true}
            json_obj = res.json()
            if res.status_code == 200 and json_obj['data'][0]:
                if self.proxyip(json_obj['data'][0]['ip']):
                    return json_obj['data'][0]
                    # return {'ip': '127.0.0.1', 'port': '1234'}
        except:
            self.count += 1

    # store the IP
    def set_ip(self, ip):
        print('storing:', ip)
        self.redi.lpush('ip:iplist', json.dumps(ip))

    # check whether the IP is reachable
    def test_ip(self, item):
        item = json.loads(item)
        try:
            telnetlib.Telnet(item['ip'], port=item['port'], timeout=10)
        except:
            return False
        else:
            return True

    def proxyip(self, ip):
        url = 'https://iphunter.net/ip/{}'.format(ip)
        headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        e = etree.HTML(res.text)
        data = ''.join(e.xpath('/html/body/article/script[3]/text()'))
        if '代理' not in data and '爬虫' not in data:
            return True
        else:
            return False

    # engine
    def engine(self):
        while True:
            if self.redi.llen('ip:iplist') >= 19:
                for item in self.redi.lrange('ip:iplist', 0, -1):
                    print(
                        'checked at: {}'.format(
                            str(
                                time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime(int(
                                                  time.time()))))), item)
                    if item is None:
                        print(None)
                        # remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # # add a valid replacement IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
                    if not self.test_ip(item):
                        print(self.test_ip(item))
                        # remove the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # # add a valid replacement IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            else:
                for i in range(20):
                    time.sleep(2)
                    if self.redi.llen('ip:iplist') <= 20:
                        print('fewer than 20 IPs in the pool')
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            time.sleep(30)

    # random IP for a client
    def random_ip(self):
        try:
            iplist = self.redi.lrange('ip:iplist', 0, -1)
        except:
            iplist = []
        if iplist:
            while True:
                ip = random.choice(iplist)
                if ip:
                    ip = json.loads(ip)
                    # ip_info = '183.166.164.209:4370'
                    ip_info = ip['ip'] + ':' + ip['port']
                    proxies = {'https': ip_info}
                    return ip_info
                    # proxies = {'https': '119.5.74.242:4385'}
        else:
            return None

    # run
    def run(self):
        pid = str(os.getpid())
        self.redi.set('pid:ip_pool', pid)
        self.engine()
Example #23
from bs4 import BeautifulSoup
import requests
import csv
from my_fake_useragent import UserAgent

# Mimic the access to the website like a browser
ua = UserAgent(family='chrome')
BrowserUserAgent = ua.random()

# Define URL and Requests object
f = csv.writer(open('drug-names.csv', 'w'))
f.writerow(['Name'])
pages = []
headers = {'User-Agent': BrowserUserAgent}

firstAlphaNumeric = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9', ''
]
# secondAlphaNumeric = firstAlphaNumeric
finalList = []

for first in firstAlphaNumeric:
    for second in firstAlphaNumeric:
        url = 'https://www.drugs.com/alpha/' + str(first) + str(
            second) + '.html'
        pages.append(url)

for item in pages:
    page = requests.get(item, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
Example #24
def get_request_headers():
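    # return a headers dict with a freshly drawn random User-Agent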
    ua = UserAgent()

    return {"User-Agent": ua.random()}

Example #25
proxy = ip_list[0]
error = 0
while 1:
    queue_len = r.llen('queue:ftshop')
    queue_index = 0
    s = requests.session()
    n = str(q.get_nowait(), encoding='utf8')
    data = json.loads(n)
    shopid = data['shopid']
    region = data['region']
    area = data['area'].encode("utf-8").decode("utf-8")
    headers = {
        'User-Agent':
        UserAgent().random(),
        'Referer':
        'https://m.dianping.com/shenzhen/ch10/r{0}'.format(region_dict[area])
    }
    url = 'https://m.dianping.com/shop/' + shopid
    try:
        respon = s.get(url, headers=headers, proxies=proxy)
    except Exception as e:
        error = 1
    i = 0
    while error == 1 or '验证中心' in respon.text or '抱歉!页面暂' in respon.text or respon.status_code != 200:
        i = i + 1
        if i < len(ip_list):
            proxy = ip_list[i]
            try:
                respon = s.get(url, headers=headers, proxies=proxy)
Example #26
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
ua = UserAgent()


def Organizer(request):
    lr = LowRange()


P = []


def LowRange():
    Alink = r'https://www.amazon.in/s?i=electronics&bbn=1805560031&rh=n%3A976419031%2Cn%3A976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cn%3A1805560031%2Cp_36%3A1318506031%2Cp_89%3AAsus%7CHUAWEI%7CHonor%7CLenovo%7CMi%7CMotorola%7CNokia%7COPPO%7CRedmi%7CSamsung%7CVivo%7CXiaomi%7Crealme%2Cp_n_operating_system_browse-bin%3A1485077031%2Cp_72%3A1318476031%2Cp_n_feature_seven_browse-bin%3A8561133031%2Cp_n_feature_eight_browse-bin%3A8561112031%7C8561116031%2Cp_n_feature_three_browse-bin%3A1897963031%2Cp_n_condition-type%3A8609960031%2Cp_n_feature_five_browse-bin%3A8561106031%2Cp_n_feature_two_browse-bin%3A1898707031%2Cp_n_pct-off-with-tax%3A2665399031%2Cp_n_feature_thirteen_browse-bin%3A8561102031&dc&fst=as%3Aoff&qid=1567887652&rnid=8561098031&ref=sr_nr_p_n_feature_thirteen_browse-bin_2'
    Flink = r'https://www.flipkart.com/mobiles/smartphones~trunype/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.sim_type%255B%255D%3DDual%2BSim&p%5B%5D=facets.internal_storage%255B%255D%3D64%2B-%2B127.9%2BGB&p%5B%5D=facets.number_of_cores%255B%255D%3DOcta%2BCore&p%5B%5D=facets.rating%255B%255D%3D4%25E2%2598%2585%2B%2526%2Babove&p%5B%5D=facets.price_range.from%3D10000&p%5B%5D=facets.price_range.to%3D20000&p%5B%5D=facets.availability%255B%255D%3DExclude%2BOut%2Bof%2BStock&p%5B%5D=facets.type%255B%255D%3DSmartphones&p%5B%5D=facets.operating_system%255B%255D%3DAndroid&p%5B%5D=facets.screen_size%255B%255D%3D6%2Binch%2B%2526%2Babove&p%5B%5D=facets.battery_capacity%255B%255D%3D5000%2BmAh%2B%2526%2BAbove&p%5B%5D=facets.battery_capacity%255B%255D%3D4000%2B-%2B4999%2BmAh&p%5B%5D=facets.clock_speed%255B%255D%3D2.5%2BGHz%2B%2526%2BAbove&p%5B%5D=facets.clock_speed%255B%255D%3D2%2B-%2B2.5%2BGHz&p%5B%5D=facets.network_type%255B%255D%3D4G%2BVOLTE&p%5B%5D=facets.brand%255B%255D%3DSamsung&p%5B%5D=facets.brand%255B%255D%3DMi&p%5B%5D=facets.brand%255B%255D%3DHonor&p%5B%5D=facets.brand%255B%255D%3DHuawei&p%5B%5D=facets.brand%255B%255D%3DMotorola&p%5B%5D=facets.brand%255B%255D%3DOPPO&p%5B%5D=facets.brand%255B%255D%3DNokia&p%5B%5D=facets.brand%255B%255D%3DVivo&p%5B%5D=facets.brand%255B%255D%3DPOCO&p%5B%5D=facets.ram%255B%255D%3D4%2BGB&p%5B%5D=facets.ram%255B%255D%3D6%2BGB%2B%2526%2BAbove&p%5B%5D=facets.serviceability%5B%5D%3Dfalse'
    Slink = r'https://www.snapdeal.com/products/mobiles-mobile-phones/filters/Form_s~Smartphones?sort=plrty&q=Form_s%3ASmartphones%7CPrice%3A10000%2C20000%7CBrand%3AVivo%5EOppo%5EMI%5EMoto%7CRAM_s%3A4%20GB%5E6%20GB%7CConnectivity_s%3AVoLTE%7CScreensize_s%3A6.0%20%26%20Above%7CPrimaryCamera_s%3A8MP-13MP%7C'
    linklist = {Alink: AV.Hello, Flink: FV.Hello, Slink: SV.Hello}
Example #27
 def __init__(self):
     self.ua = UserAgent()
Example #28
import requests
from my_fake_useragent import UserAgent
import json
from pymongo import MongoClient
from pyquery import PyQuery as pq
import random
import time

ua = UserAgent()
headers = {"User-Agent": ua.random()}

client = MongoClient(host="localhost", port=27017)
collection = client["发改委"]['辽宁1']


def parse_detail(html, url):
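    # extract title, source/date line, body text and the first embedded link, then store the record in MongoDB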
    ret = {}
    doc = pq(html)
    ret['url'] = url
    ret['title'] = doc(".news-content-main h1").text()
    ret['sourceTime'] = doc(".news-info").text()
    ret['content'] = doc('#ContTextSize').text()
    ret['contentUrl'] = doc("#ContTextSize a").attr("href")
    print(ret)
    collection.insert_one(ret)


def parse_index(html):
    doc = pq(html)
    items = doc(".mod-body2 ul li").items()
    for item in items:
Example #29
    def crawl(self):
        ua = UserAgent()
        headers = {'User-Agent': '{}'.format(ua.random())}
        print(self.spider_name, 'now crawling', self.url_key)

        try:
            raw_contents = requests.get(self.url, headers=headers).text
            match_pattern = r'<td(.*?)</td>'
            level_1_soup_list = re.findall(match_pattern, raw_contents,
                                           re.S | re.M)
            level_2_soup_list = []
            for level_1_soup in level_1_soup_list:
                level_2_soup = level_1_soup.split('>')[1]
                level_2_soup_list.append(level_2_soup)

            project_name = level_2_soup_list[1]
            project_number = level_2_soup_list[3]
            project_intro = level_2_soup_list[5]
            project_link = level_2_soup_list[7].split('\"')[
                1]  # Special Design
            project_purpose = level_2_soup_list[9]
            project_size = level_2_soup_list[11]
            project_duration = level_2_soup_list[13]
            project_apr = level_2_soup_list[15]
            project_repay_start = level_2_soup_list[17]
            project_repay_method = level_2_soup_list[19].strip(
            )  # Special Design
            project_repay_details = level_2_soup_list[21]
            project_status = level_2_soup_list[23].strip()  # Special Design
            project_raise_start = level_2_soup_list[25]
            project_guarantee = level_2_soup_list[27]
            project_repay_source = level_2_soup_list[29]
            project_risk = level_2_soup_list[31]
            project_expense = level_2_soup_list[33]
            project_template_number = level_2_soup_list[35]
            project_lender_notice = level_2_soup_list[37]
            project_borrower_type = level_2_soup_list[39].strip(
            )  # Special Design
            project_borrower_name = level_2_soup_list[43]
            project_document_type = level_2_soup_list[45].strip(
            )  # Special Design
            project_document_number = level_2_soup_list[47]
            project_borrower_job = level_2_soup_list[49]
            project_borrower_other_info = level_2_soup_list[51]
            project_borrower_credit = level_2_soup_list[53]
            project_borrower_default_times = level_2_soup_list[55]
            project_borrower_default_amounts = level_2_soup_list[57]
            project_borrower_income_and_debt = level_2_soup_list[59]

            self.list_of_attribute = [
                self.url_key, project_name, project_number, project_intro,
                project_link, project_purpose, project_size, project_duration,
                project_apr, project_repay_start, project_repay_method,
                project_repay_details, project_status, project_raise_start,
                project_guarantee, project_repay_source, project_risk,
                project_expense, project_template_number,
                project_lender_notice, project_borrower_type,
                project_borrower_name, project_document_type,
                project_document_number, project_borrower_job,
                project_borrower_other_info, project_borrower_credit,
                project_borrower_default_times,
                project_borrower_default_amounts,
                project_borrower_income_and_debt
            ]

            print(self.spider_name, 'has finished the crawling from',
                  self.url_key)

        except:
            project_name = "FAIL"
            project_number = "FAIL"
            project_intro = "FAIL"
            project_link = "FAIL"
            project_purpose = "FAIL"
            project_size = "FAIL"
            project_duration = "FAIL"
            project_apr = "FAIL"
            project_repay_start = "FAIL"
            project_repay_method = "FAIL"
            project_repay_details = "FAIL"
            project_status = "FAIL"
            project_raise_start = "FAIL"
            project_guarantee = "FAIL"
            project_repay_source = "FAIL"
            project_risk = "FAIL"
            project_expense = "FAIL"
            project_template_number = "FAIL"
            project_lender_notice = "FAIL"
            project_borrower_type = "FAIL"
            project_borrower_name = "FAIL"
            project_document_type = "FAIL"
            project_document_number = "FAIL"
            project_borrower_job = "FAIL"
            project_borrower_other_info = "FAIL"
            project_borrower_credit = "FAIL"
            project_borrower_default_times = "FAIL"
            project_borrower_default_amounts = "FAIL"
            project_borrower_income_and_debt = "FAIL"

            self.list_of_attribute = [
                "FAIL", project_name, project_number, project_intro,
                project_link, project_purpose, project_size, project_duration,
                project_apr, project_repay_start, project_repay_method,
                project_repay_details, project_status, project_raise_start,
                project_guarantee, project_repay_source, project_risk,
                project_expense, project_template_number,
                project_lender_notice, project_borrower_type,
                project_borrower_name, project_document_type,
                project_document_number, project_borrower_job,
                project_borrower_other_info, project_borrower_credit,
                project_borrower_default_times,
                project_borrower_default_amounts,
                project_borrower_income_and_debt
            ]
            print(self.spider_name, "has failed and gives", self.url_key,
                  "to another spider")
Example #30
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Cedar
# @Date  : 2021/3/22
# @Desc  :

from my_fake_useragent import UserAgent


ua = UserAgent(phone=True)
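# print one random mobile (phone) user agent string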
print(ua.random())