Example #1
def box_office():
    fake = FakeUserAgent()
    data = ''
    for i in range(5):
        z = requests.get('http://dianying.nuomi.com/movie/boxrefresh',
                         headers={
                             'User-Agent': fake.random,
                             'referer':
                             'http://dianying.nuomi.com/movie/boxoffice'
                         })
        try:
            # Keep the response only if it looks like a full payload.
            data = z.json() if len(z.text) > 1000 else data
        except ValueError:
            continue
    movies = []
    if not data:
        return movies
    n = 1
    for movie_ in data['real']['data']['detail']:
        movie = dict()
        movie['rank'] = n
        movie['movieName'] = movie_['movieName']
        movie['上映天数'] = movie_['attribute']['1']['attrValue']  # days in release
        movie['实时票房'] = movie_['attribute']['3']['attrValue']  # real-time box office
        movie['累计票房'] = movie_['attribute']['2']['attrValue']  # cumulative box office
        movie['票房占比'] = movie_['attribute']['4']['attrValue']  # box-office share
        movie['排片占比'] = movie_['attribute']['5']['attrValue']  # screening share
        movie['上座率'] = movie_['attribute']['6']['attrValue']    # attendance rate
        movie['排座占比'] = movie_['attribute']['7']['attrValue']  # seat share
        movie['场次'] = movie_['attribute']['8']['attrValue']      # number of screenings
        movie['人次'] = movie_['attribute']['9']['attrValue']      # admissions
        movies.append(movie)
        n += 1
    # print(movies)
    return movies
Example #2
    def startrequest(self):

        ua = FakeUserAgent().random
        req = request.urlopen(request.Request(self.url, headers={'User-Agent': ua}))
        con = req.read().decode('gbk', errors='ignore')  # gbk is a superset of gb2312
        obj = BeautifulSoup(con, 'html5lib')
        return obj
Example #3
    def __init__(self, *, username: str, password: str, query_hash: str, limit: int, mode: Mode, ajax_header: str,
                 requests_interval: float, random_intervals: bool, on_error_interval: float,
                 session_file_path: str = 'session.pickle'):
        self.username: str = username
        self.password: str = password
        self.query_hash: str = query_hash
        self.limit: int = limit
        self.mode: Mode = mode
        self.requests_interval: float = requests_interval
        self.random_intervals: bool = random_intervals
        self.on_error_interval: float = on_error_interval
        self.session_file_path: str = session_file_path

        self.send_requests_qty = 0
        self.user_id: Optional[int] = None
        self.session: Optional[Session] = None
        self.user_agent: str = FakeUserAgent().random
        self.defaultHeaders: Dict[str, str] = {
            "User-Agent": self.user_agent,
            "Accept": "*/*",
            "Accept-Language": "en,en-US;q=0.7,ru;q=0.3",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": "https://www.instagram.com/accounts/login/?source=auth_switcher",
            "X-Instagram-AJAX": ajax_header,  # TODO: find this variable source
            "Content-Type": "application/x-www-form-urlencoded",
            "X-Requested-With": "XMLHttpRequest",
            "DNT": "1",
            "Connection": "keep-alive",
            "TE": "Trailers",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }
Example #4
def get_detail(url):
    headers = {
        'User-Agent': FakeUserAgent().random
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    sel = parsel.Selector(response.text)

    detail_url_list = sel.xpath('//ul[@id="tam_newlist"]/li/a/@href').getall()
    for link in detail_url_list:
        detail_url = parse.urljoin(base_url, link)
        content = get_html(detail_url)
        name = content.xpath('//h2[@class="person_top_tt1"]/text()').get()
        name = re.findall('【.*】(.*?)举报信息', name or '')
        if name:
            print(name[0].strip())
        rows = content.xpath('//div[@class="commentList"]/table/tbody/tr')
        for row in rows:
            cells = row.xpath('./td/text()').getall()
            print('|'.join(cells))
        print('*'*50)
    next_url = sel.xpath('//a[text()="下一页"]/@href').get()
    if next_url:
        next_url = parse.urljoin(base_url, next_url)
        print(next_url)
        get_detail(next_url)
Example #5
 def __init__(self, lists):
     self.agents = FakeUserAgent()
     self.proxys = lists
     self.maxnum = 20
     self.i = 0
     self.User_Agent = self.agents.random
     self.proxy = random.choice(self.proxys)
Example #6
    def __init__(
        self,
        username: str,
        password: str,
        region: str = "US",
        user_agent: Optional[str] = None,
        update_handler: Optional[Callable[[dict], None]] = None,
    ):

        self._log = logging.getLogger(__file__)

        if user_agent is None:
            try:
                user_agent = FakeUserAgent().data_browsers["chrome"][0]
            except Exception:
                user_agent = FALLBACK_UA
        self._ua = user_agent_parser.Parse(user_agent)

        self._reset_session()

        self.username = username
        self.password = password
        self.region = region

        self._playlists = {}
        self._channels = None
        self._favorite_channels = None

        # vars to manage session cache
        self.last_renew = None
        self.update_interval = 30

        # hook function to call whenever the playlist updates
        self.update_handler = update_handler
Example #7
def get_html1(url):
    log = 'Crawling page {}'.format(count)
    with open('./log.txt', 'a', encoding='utf-8') as f:
        f.write(log +url+'\n')
    print(log)
    headers = {
        # 'User-agent': random.choice(USER_AGENT),
        'User-agent': FakeUserAgent().random,
        # 'Cookie':'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; SNUID=996B4AB36264F68B4FC4D4C9620BAAE7; JSESSIONID=aaaeIO7VN6dpFbkt0Or1w; pgv_pvi=3442375680; pgv_si=s6097819648; IPLOC=CN5200; PHPSESSID=5kf4b1b4nadodal6bqq6i33hq7; sct=28; ppinf=5|1571121825|1572331425|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTEyMTgyNXxyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=lNSU0p8_k_81ts_7ftcwBoO929s-mMZ1y68X7ZNwuR9F_V-IbWhMJfWLfAcgMbco_l-PywMeJEya7nloyKubTvvUBzxXYIS92nqXRPuYZqWneCRNq_-1ckgDtRCc8-Phusq4Xn-vCEpqrn_u-lGC5tEZLkOB5Ev6oJtRit04qW8; sgid=01-41643723-AV2laqH199g92jHZhBez6fo; ppmdig=15711218260000003e3302d16cca953cb84620fac0b12bb2'
        'Cookie':'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; IPLOC=CN5201; JSESSIONID=aaa6JFgpvpWLM42NI5s1w; ppinf=5|1571104843|1572314443|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTEwNDg0M3xyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=NBAJZcXga_YmqmTh25Dh1GW_gtNkDl-o7FDOxa-rUraCmXVXLXvLq0mqfAv6Qqd40Ic5MQ9xiCw-5C__AFcqHPIYgSkdkLhWiPyJ9dRs8OCepd-5ljMhBFzlLX7Qfgi6w1zEF5L3sK5wKZoqqhR0A5UNPNuEucfyQmMnLgkw8Lg; sgid=01-41643723-AV2lKEtJ7RkRxZuFsapgV8s; PHPSESSID=cd9e6l9l8ecvdo4v60a871rcu0; SNUID=996B4AB36264F68B4FC4D4C9620BAAE7; sct=22; ppmdig=157111096600000005de28951f5fd4d60e1df92d5fa1d9cc'
         }
    # proxy1 = get_proxy()
    proxy1 = get_ip()
    print('Using proxy IP ', proxy1)
    proxies = {'http': 'http://' + proxy1}
    time.sleep(1)
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10, allow_redirects=False)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            print('200', 'page parsed correctly')
            return response.text
        if response.status_code == 302:
            print('got a 302', 'retrying')
            return get_html1(url)  # note: recursion is unbounded if the redirect persists
    except Exception as e:
        print('proxy connection error', 'retrying the request!', e)
        return get_html1(url)  # note: recursion is unbounded if the proxy keeps failing
Example #8
def get_html(url,count=1):
    print('Crawling', url)
    global proxy
    if count >= max_count:
        print('try too many!')
        return None
    headers = {
        'User-agent':FakeUserAgent().random,
        # 'Cookie':'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; IPLOC=CN5201; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; JSESSIONID=aaaeDUh4W3WC_hg1on62w; PHPSESSID=ht3i283356j29g4j7js5m9eo92; SNUID=8A7350AB787DECACE9CA67627917514F; sct=15; ppinf=5|1571026592|1572236192|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTAyNjU5MnxyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=eWznPWzWx7ILqN6BrKy-ZfGkc_-UGAdVGVMrBOM1HVv_pIZrVt4FTdeV9NbiwhaVQscDogAhXd03jtvUti_Ig6lhpYPzyDNAne_wyOuAuudtkCL_cDCJ_589m57LZuNX-scF1yWVwpjtTkLzRnn-8v1JY72KKUG4xfurSCW6Va4; sgid=01-41643723-AV2j9qDHUXicUUnwUayN4f8o; ppmdig=15710265920000007021a731e5255822c7b8fdfc31eab41d'
    }
    try:
        if proxy:
            print(proxy)
            proxies = {'http':'http://'+proxy}
            response = requests.get(url,headers=headers,proxies=proxies,allow_redirects=False)
        else:
            response = requests.get(url,headers=headers,allow_redirects=False)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response.text
        if response.status_code == 302:
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                return get_html(url, count + 1)  # pass the count along so retries stay bounded
            else:
                print('Get Proxy Failed')
                return None
    except Exception as e:
        print(e)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)
Example #9
def main(url, bankid, provinceid, cityid, key=''):

    """GET request format: bank=1&province=1&city=35&key="""
    try:
        ua = FakeUserAgent()
        # print(ua.random)
        user_agent = ua.random
        # user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
        headers = {"User-Agent": user_agent}
        params = {
            'bank': bankid,
            'province': provinceid,
            'city': cityid,
            'key': key
        }
        time.sleep(1)
        resp = requests.get(url=url, params=params, headers=headers, timeout=5)
        text = resp.text
        parse(text, bankid, provinceid, cityid)

        FINISHED.append([bankid, provinceid, cityid])
    except Exception as e:
        print(e)
        with open("finished.txt", "w+", encoding="utf-8") as f:
            for item in FINISHED:
                f.write(str(item) + '\n')  # one finished (bank, province, city) triple per line
        main(url, bankid, provinceid, cityid, key)  # retry; note this recursion is unbounded
Example #10
def make_headers():
    ua = FakeUserAgent()
    headers = {
        "User-Agent": ua.chrome,
        "Referer": "https://www.1point3acres.com/bbs/",
        "Host": "www.1point3acres.com",
    }
    return headers
Example #11
def get_html(url):
    headers = {
        'User-Agent': FakeUserAgent().random
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    sel = parsel.Selector(response.text)
    return sel
Example #12
    def __init__(self, cache=None):
        self.user_agent = FakeUserAgent()
        self.headers = {"user-agent": self.user_agent.random}
        if cache:
            self.cache = cache
        else:
            self.cache = MongoCache(db_name="hupu_crawler")

        self.logger = logging.getLogger("hupu_crawler")
Example #13
def get_html(url):  # fetch the page contents
    try:
        response = requests.get(
            url, headers={'User-Agent': FakeUserAgent().chrome})
        response.raise_for_status()
        html = response.content
        return html
    except (requests.RequestException, ValueError):
        print('Something went wrong')
        return False
Example #14
def get_url():
    header = {'user-agent': FakeUserAgent().chrome}
    response = requests.get('https://www.baidu.com/', headers=header)
    code = response.encoding
    html = parsel.Selector(text=response.content.decode(code))
    data = []
    for item in html.xpath('//ul[@class="s-hotsearch-content"]/li/a/span[2]'
                           ).css('::text').getall():
        data.append(item)
    return data
Example #15
 def __init__(self, fallback=None, file=None):
     self.agent_file = file
     if file is not None:
         logger.info('Using local file for user agents: ' + self.agent_file)
         self.useragents = self.load_user_agents(self.agent_file)
     else:
         logger.info('Using fake-useragent package for user agents.')
         if fallback is None:
             fallback = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
         self.fakeuseragent = FakeUserAgent(fallback=fallback, cache=False)
Example #16
 def get_agent(choice='random'):
     # Build a disguised User-Agent header
     ua = FakeUserAgent()
     browser = {'safari': ua.safari,
                'random': ua.random,
                'chrome': ua.chrome,
                'ie': ua.internetexplorer,
                'opera': ua.opera,
                'firefox': ua.firefox}
     return {'User-Agent': browser[choice]}
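
A minimal usage sketch for the get_agent helper above, assuming requests is available; the target URL is a placeholder.

import requests

# Request a page with a Chrome User-Agent built by get_agent(); example.com is a placeholder.
response = requests.get('https://example.com', headers=get_agent('chrome'))
print(response.status_code)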
Example #17
def get_content(link):
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(link, headers=headers)
    response.encoding = 'utf-8'
    print(response.request.url)
    print(response.status_code)
    content = response.text
    print(content)
    name = re.findall(r" agentName:'(.*?)',", content, re.S)
    print(name)
Example #18
class RandomUserAgentMiddleware(Middleware):
    engine = FakeUserAgent()

    def process_request(self, request, spider):
        random_user_agent = self.get_random_user_agent()
        request.headers.setdefault('User-Agent', random_user_agent)
        spider.log(f'Using {random_user_agent}', logging.INFO)

    def get_random_user_agent(self):
        return self.engine.chrome
Example #19
 def __init__(self, url=None, ipq=None, savehd=None):
     self.starturl = url
     self.infohd = savehd
     self.ua = FakeUserAgent()
     self.ipqueue = ipq
     self.ips = []
     self.opener = None
     self.reqnum = 0
     self.iterips = None
     self.curip = None
Example #20
def get_ip():
    headers = {
        'User-Agent':FakeUserAgent().random
    }
    url = 'http://api3.xiguadaili.com/ip/?tid=556756079976571&num=1&category=2&sortby=time&filter=on'
    try:
        response = requests.get(url,headers=headers)
        if response.status_code ==200:
            return response.text
    except Exception:
        return get_ip()  # retry on failure; note this recursion is unbounded
Example #21
 def __init__(self, cache=None):
     self.user_agent = FakeUserAgent()
     self.headers = {"user-agent": self.user_agent.random}
     if cache:
         self.cache = cache
     else:
         self.cache = MongoCache(db_name="hupu_crawler",
                                 username="******",
                                 password="******")
     self.logger = set_logger("hupu_crawler")
     self.redis_client = Redis()
Example #22
def get_user_agent(num):
    """
    生成不同的 user-agent
    :param num: 生成个数
    :return: list
    """
    ua = FakeUserAgent()
    user_agent = []
    for i in range(num):
        user_agent.append({'User-Agent': ua.random})
    return user_agent
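
A short sketch of rotating the generated header list across a batch of requests, assuming requests is available; the URLs are placeholders.

import requests

# Pair each URL with one of the pre-generated User-Agent headers; the URLs below are placeholders.
urls = ['https://example.com/page/1', 'https://example.com/page/2']
headers_pool = get_user_agent(len(urls))
for url, headers in zip(urls, headers_pool):
    resp = requests.get(url, headers=headers)
    print(url, resp.status_code)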
Example #23
def get_request_headers():
    headers = {
        'User-Agent': FakeUserAgent().random,
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7",
    }
    return headers
Example #24
class LeaderDifference:
    __ua = FakeUserAgent()

    def get_data(self, stock_id, begin, end, test=False):
        try:
            if test:
                print('Test mode')
                with open("res/test/result.txt", "r", encoding='utf-8') as f:
                    l = f.read()
            else:
                begin_date = begin.strftime('%Y%m%d')
                end_date = end.strftime('%Y%m%d')
                url = 'https://histock.tw/stock/branch.aspx?no={}&from={}&to={}'.format(
                    str(stock_id), end_date,
                    begin_date)  #end date is earlier than begin date

                time.sleep(1)
                for _ in range(0, 5):
                    proxy_index = randomproxy.random_proxy()
                    proxy = randomproxy.proxies[proxy_index]
                    # Make the call
                    try:
                        r = requests.get(
                            url,
                            headers={'User-Agent': self.__ua.random},
                            proxies={
                                'http':
                                '{0}:{1}'.format(proxy['ip'], proxy['port'])
                            })
                        r.encoding = 'utf-8'
                        l = r.text
                        break
                    except requests.RequestException:  # If error, delete this proxy and find another one
                        del randomproxy.proxies[proxy_index]
                        print('Proxy ' + proxy['ip'] + ':' + proxy['port'] +
                              ' deleted.')

            soup = BeautifulSoup(l, 'html.parser')
            pattern = re.compile(r'var jsonDatas', re.MULTILINE | re.DOTALL)
            script = soup.find("script", text=pattern)

            start = script.string.find('eval(') + 5
            end = script.string.find('});') + 1
            json_s = script.string[start:end]

            json_obj = json.loads(json_s)  # json.loads() no longer accepts an 'encoding' argument on Python 3.9+

            return json_obj
        except Exception as e:
            print("except: {}".format(str(e)))
            return None
Example #25
class FakeUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(FakeUserAgentMiddleware, self).__init__()
        self.ua = FakeUserAgent()
        self.ua_type = crawler.settings.get("RANDOM_USER_AGENT_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent',
                                   getattr(self.ua, self.ua_type))
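
A sketch of how a downloader middleware like this would typically be enabled in a Scrapy project's settings.py; the dotted module path below is a hypothetical assumption.

# settings.py (sketch); 'myproject.middlewares' is a hypothetical module path
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.FakeUserAgentMiddleware': 400,
}
# Browser family handed out by FakeUserAgent, read by the middleware above: "random", "chrome", "firefox", ...
RANDOM_USER_AGENT_TYPE = 'chrome'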
Example #26
def get_link():
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(url, headers=headers)
    response.encoding = 'gb2312'
    content = response.text
    sel = parsel.Selector(content)
    link = sel.xpath(
        '//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href'
    ).getall()[6:]
    for i in link:
        # urls = 'https://zu.fang.com'+i
        urls = 'http://search.fang.com/captcha-b64c3c4d4e3190bb69/redirect?h=https://zu.fang.com/chuzu/1_61211134_-1.htm'
        get_content(urls)
        break
Example #27
 def __init__(self):
     ua = FakeUserAgent()
     self.station_name = station()
     self.from_station = input('Enter departure station: ')
     from_station = self.station_name[self.from_station]
     self.to_station = input('Enter arrival station: ')
     to_station = self.station_name[self.to_station]
     self.date = input('Enter departure date: ')
     self.url = 'https://kyfw.12306.cn/otn/leftTicket/queryZ?' \
                'leftTicketDTO.train_date=%s&' \
                'leftTicketDTO.from_station=%s&' \
                'leftTicketDTO.to_station=%s&' \
                'purpose_codes=ADULT' % (self.date, from_station, to_station)
     self.headers = {'user-agent': ua.chrome}
Example #28
def chekout_proxy(ip):
    ip = {'http': ip}
    proxy = request.ProxyHandler(ip)
    opener = request.build_opener(proxy)
    ua = FakeUserAgent()
    url = 'http://www.baidu.com'
    headinfo = {'User-Agent': ua.random}
    reqhd = request.Request(url, headers=headinfo)
    try:
        req = opener.open(reqhd, timeout=5)
    except Exception as e:
        print('invalid ip:', ip, e)
        return
    if req.code == 200:
        return ip
Example #29
async def get_html_test(url):
    browser = await launch()
    page = await browser.newPage()
    await page.setUserAgent(FakeUserAgent().chrome)
    await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: () => undefined})')
    htmls = ''
    for i in url:
        await page.goto(i)
        await page.waitFor(random.randrange(2, 5, 1))
        for j in range(12):
            await page.keyboard.press('PageDown')
            await asyncio.sleep(random.randrange(1, 4, 1))  # non-blocking pause instead of time.sleep
        htmls += await page.content()
    await browser.close()
    return htmls
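
A minimal driver sketch for running the coroutine above from synchronous code; the URL list is a placeholder.

import asyncio

# asyncio.run drives the coroutine to completion; the URLs below are placeholders.
pages = asyncio.run(get_html_test(['https://example.com/a', 'https://example.com/b']))
print(len(pages))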
Example #30
def main():
    session = HTMLSession()
    headers = {
        'user-agent': FakeUserAgent().chrome,
        'referer': 'https://dxy.com/',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applic'
                  'ation/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'upgrade-insecure-requests': '1'}
    session, link = get_link(session, headers)
    session, disease, tag_name = get_disease_link(session, link, headers)
    data = get_tag(session, disease, headers)
    # if len(tag_name) != len(data):
    #     print('An exception occurred while scraping!')
    #     exit()
    print(data)