def box_office():
    fake = FakeUserAgent()
    data = {}
    # Retry up to five times; the endpoint sometimes returns a short error page.
    for _ in range(5):
        z = requests.get(
            'http://dianying.nuomi.com/movie/boxrefresh',
            headers={
                'User-Agent': fake.random,
                'referer': 'http://dianying.nuomi.com/movie/boxoffice',
            },
        )
        try:
            data = z.json() if len(z.text) > 1000 else data
        except ValueError:
            continue
        if data:
            break
    if not data:
        return []
    movies = []
    for n, movie_ in enumerate(data['real']['data']['detail'], start=1):
        movie = {
            'rank': n,
            'movieName': movie_['movieName'],
            '上映天数': movie_['attribute']['1']['attrValue'],
            '实时票房': movie_['attribute']['3']['attrValue'],
            '累计票房': movie_['attribute']['2']['attrValue'],
            '票房占比': movie_['attribute']['4']['attrValue'],
            '排片占比': movie_['attribute']['5']['attrValue'],
            '上座率': movie_['attribute']['6']['attrValue'],
            '排座占比': movie_['attribute']['7']['attrValue'],
            '场次': movie_['attribute']['8']['attrValue'],
            '人次': movie_['attribute']['9']['attrValue'],
        }
        movies.append(movie)
    # print(movies)
    return movies
def startrequest(self):
    ua = FakeUserAgent().random
    # Send the random User-Agent with the request.
    req = request.urlopen(request.Request(self.url, headers={'User-Agent': ua}))
    con = req.read().decode('gb2312')
    obj = BeautifulSoup(con, 'html5lib')
    return obj
def __init__(self, *, username: str, password: str, query_hash: str, limit: int,
             mode: Mode, ajax_header: str, requests_interval: float,
             random_intervals: bool, on_error_interval: float,
             session_file_path: str = 'session.pickle'):
    self.username: str = username
    self.password: str = password
    self.query_hash: str = query_hash
    self.limit: int = limit
    self.mode: Mode = mode
    self.requests_interval: float = requests_interval
    self.random_intervals: bool = random_intervals
    self.on_error_interval: float = on_error_interval
    self.session_file_path: str = session_file_path
    self.send_requests_qty: int = 0
    self.user_id: Optional[int] = None
    self.session: Optional[Session] = None
    self.user_agent: str = FakeUserAgent().random
    self.defaultHeaders: Dict[str, str] = {
        "User-Agent": self.user_agent,
        "Accept": "*/*",
        "Accept-Language": "en,en-US;q=0.7,ru;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.instagram.com/accounts/login/?source=auth_switcher",
        "X-Instagram-AJAX": ajax_header,  # TODO: find this variable source
        "Content-Type": "application/x-www-form-urlencoded",
        "X-Requested-With": "XMLHttpRequest",
        "DNT": "1",
        "Connection": "keep-alive",
        "TE": "Trailers",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
def get_detail(url):
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    sel = parsel.Selector(response.text)
    detail_url_list = sel.xpath('//ul[@id="tam_newlist"]/li/a/@href').getall()
    for href in detail_url_list:
        detail_url = parse.urljoin(base_url, href)
        content = get_html(detail_url)
        name = content.xpath('//h2[@class="person_top_tt1"]/text()').get()
        name = re.findall('【.*】(.*?)举报信息', name)
        if name:
            print(name[0].strip())
        rows = content.xpath('//div[@class="commentList"]/table/tbody/tr')
        for row in rows:
            cells = row.xpath('./td/text()').getall()
            print('|'.join(cells))
        print('*' * 50)
    # Follow the "next page" ("下一页") link, if present.
    next_url = sel.xpath('//a[text()="下一页"]/@href').get()
    if next_url:
        next_url = parse.urljoin(base_url, next_url)
        print(next_url)
        get_detail(next_url)
def __init__(self, lists):
    self.agents = FakeUserAgent()
    self.proxys = lists
    self.maxnum = 20
    self.i = 0
    self.User_Agent = self.agents.random
    self.proxy = random.choice(self.proxys)
def __init__(
    self,
    username: str,
    password: str,
    region: str = "US",
    user_agent: Optional[str] = None,
    update_handler: Optional[Callable[[dict], None]] = None,
):
    self._log = logging.getLogger(__file__)
    if user_agent is None:
        try:
            user_agent = FakeUserAgent().data_browsers["chrome"][0]
        except Exception:
            user_agent = FALLBACK_UA
    self._ua = user_agent_parser.Parse(user_agent)
    self._reset_session()
    self.username = username
    self.password = password
    self.region = region
    self._playlists = {}
    self._channels = None
    self._favorite_channels = None
    # vars to manage session cache
    self.last_renew = None
    self.update_interval = 30
    # hook function to call whenever the playlist updates
    self.update_handler = update_handler
def get_html1(url):
    # ``count`` is a module-level page counter maintained by the caller.
    log = 'Crawling page {}'.format(count)
    with open('./log.txt', 'a', encoding='utf-8') as f:
        f.write(log + url + '\n')
    print(log)
    headers = {
        # 'User-agent': random.choice(USER_AGENT),
        'User-agent': FakeUserAgent().random,
        # 'Cookie': 'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; SNUID=996B4AB36264F68B4FC4D4C9620BAAE7; JSESSIONID=aaaeIO7VN6dpFbkt0Or1w; pgv_pvi=3442375680; pgv_si=s6097819648; IPLOC=CN5200; PHPSESSID=5kf4b1b4nadodal6bqq6i33hq7; sct=28; ppinf=5|1571121825|1572331425|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTEyMTgyNXxyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=lNSU0p8_k_81ts_7ftcwBoO929s-mMZ1y68X7ZNwuR9F_V-IbWhMJfWLfAcgMbco_l-PywMeJEya7nloyKubTvvUBzxXYIS92nqXRPuYZqWneCRNq_-1ckgDtRCc8-Phusq4Xn-vCEpqrn_u-lGC5tEZLkOB5Ev6oJtRit04qW8; sgid=01-41643723-AV2laqH199g92jHZhBez6fo; ppmdig=15711218260000003e3302d16cca953cb84620fac0b12bb2'
        'Cookie': 'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; IPLOC=CN5201; JSESSIONID=aaa6JFgpvpWLM42NI5s1w; ppinf=5|1571104843|1572314443|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTEwNDg0M3xyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=NBAJZcXga_YmqmTh25Dh1GW_gtNkDl-o7FDOxa-rUraCmXVXLXvLq0mqfAv6Qqd40Ic5MQ9xiCw-5C__AFcqHPIYgSkdkLhWiPyJ9dRs8OCepd-5ljMhBFzlLX7Qfgi6w1zEF5L3sK5wKZoqqhR0A5UNPNuEucfyQmMnLgkw8Lg; sgid=01-41643723-AV2lKEtJ7RkRxZuFsapgV8s; PHPSESSID=cd9e6l9l8ecvdo4v60a871rcu0; SNUID=996B4AB36264F68B4FC4D4C9620BAAE7; sct=22; ppmdig=157111096600000005de28951f5fd4d60e1df92d5fa1d9cc'
    }
    # proxy1 = get_proxy()
    proxy1 = get_ip()
    print('Using proxy IP', proxy1)
    proxies = {'http': 'http://' + proxy1}
    time.sleep(1)
    try:
        response = requests.get(url, headers=headers, proxies=proxies,
                                timeout=10, allow_redirects=False)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            print('200', 'page fetched and decoded')
            return response.text
        if response.status_code == 302:
            print('Got a 302 redirect, retrying')
            return get_html1(url)
    except Exception as e:
        print('Proxy connection failed, retrying the request!', e)
        return get_html1(url)
def get_html(url, count=1):
    print('Crawling', url)
    global proxy
    if count >= max_count:
        print('try too many!')
        return None
    headers = {
        'User-agent': FakeUserAgent().random,
        # 'Cookie': 'SUID=A553BC753120910A000000005D579E9B; CXID=43D3BD73396C255D4F0D62E2A30FACD5; ABTEST=0|1570799385|v1; IPLOC=CN5201; weixinIndexVisited=1; SUV=00FF3F47D2280AF15DA07F1AF093F191; JSESSIONID=aaaeDUh4W3WC_hg1on62w; PHPSESSID=ht3i283356j29g4j7js5m9eo92; SNUID=8A7350AB787DECACE9CA67627917514F; sct=15; ppinf=5|1571026592|1572236192|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNSU5OSVBMnxjcnQ6MTA6MTU3MTAyNjU5MnxyZWZuaWNrOjk6JUU1JTk5JUEyfHVzZXJpZDo0NDpvOXQybHVGdGJ3MXhuMmV3V1hSRENTU3lVejlJQHdlaXhpbi5zb2h1LmNvbXw; pprdig=eWznPWzWx7ILqN6BrKy-ZfGkc_-UGAdVGVMrBOM1HVv_pIZrVt4FTdeV9NbiwhaVQscDogAhXd03jtvUti_Ig6lhpYPzyDNAne_wyOuAuudtkCL_cDCJ_589m57LZuNX-scF1yWVwpjtTkLzRnn-8v1JY72KKUG4xfurSCW6Va4; sgid=01-41643723-AV2j9qDHUXicUUnwUayN4f8o; ppmdig=15710265920000007021a731e5255822c7b8fdfc31eab41d'
    }
    try:
        if proxy:
            print(proxy)
            proxies = {'http': 'http://' + proxy}
            response = requests.get(url, headers=headers, proxies=proxies,
                                    allow_redirects=False)
        else:
            response = requests.get(url, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response.text
        if response.status_code == 302:
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # Count this attempt too, so retries stay bounded by max_count.
                return get_html(url, count + 1)
            else:
                print('Get Proxy Failed')
                return None
    except Exception as e:
        print(e)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)
def main(url, bankid, provinceid, cityid, key=''):
    """GET query format: bank=1&province=1&city=35&key="""
    try:
        ua = FakeUserAgent()
        # print(ua.random)
        user_agent = ua.random
        # user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
        headers = {"User-Agent": user_agent}
        params = {
            'bank': bankid,
            'province': provinceid,
            'city': cityid,
            'key': key,
        }
        time.sleep(1)
        resp = requests.get(url=url, params=params, headers=headers, timeout=5)
        text = resp.text
        parse(text, bankid, provinceid, cityid)
        FINISHED.append([bankid, provinceid, cityid])
    except Exception as e:
        print(e)
        # Checkpoint progress before retrying the same query.
        with open("finished.txt", "w+", encoding="utf-8") as f:
            for item in FINISHED:
                f.write(str(item))
        main(url, bankid, provinceid, cityid, key)
def make_headers():
    ua = FakeUserAgent()
    headers = {
        "User-Agent": ua.chrome,
        "Referer": "https://www.1point3acres.com/bbs/",
        "Host": "www.1point3acres.com",
    }
    return headers
def get_html(url):
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    sel = parsel.Selector(response.text)
    return sel
def __init__(self, cache=None):
    self.user_agent = FakeUserAgent()
    self.headers = {"user-agent": self.user_agent.random}
    if cache:
        self.cache = cache
    else:
        self.cache = MongoCache(db_name="hupu_crawler")
    self.logger = logging.getLogger("hupu_crawler")
def get_html(url):
    # Fetch the page content.
    try:
        response = requests.get(
            url, headers={'User-Agent': FakeUserAgent().chrome})
        response.raise_for_status()
        html = response.content
        return html
    except (requests.RequestException, ValueError):
        print('Something went wrong')
        return False
def get_url():
    header = {'user-agent': FakeUserAgent().chrome}
    response = requests.get('https://www.baidu.com/', headers=header)
    code = response.encoding
    html = parsel.Selector(text=response.content.decode(code))
    data = []
    for item in html.xpath('//ul[@class="s-hotsearch-content"]/li/a/span[2]'
                           ).css('::text').getall():
        data.append(item)
    return data
def __init__(self, fallback=None, file=None):
    self.agent_file = file
    if file is not None:
        logger.info('Using local file for user agents: ' + self.agent_file)
        self.useragents = self.load_user_agents(self.agent_file)
    else:
        logger.info('Using fake-useragent package for user agents.')
        if fallback is None:
            fallback = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/67.0.3396.87 Safari/537.36')
        self.fakeuseragent = FakeUserAgent(fallback=fallback, cache=False)
def get_agent(choice='random'):
    # Pick a spoofed User-Agent for the requested browser.
    ua = FakeUserAgent()
    browser = {
        'safari': ua.safari,
        'random': ua.random,
        'chrome': ua.chrome,
        'ie': ua.internetexplorer,
        'opera': ua.opera,
        'firefox': ua.firefox,
    }
    return {'User-Agent': browser[choice]}
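# Usage sketch (assumed, not part of the original snippet): get_agent() returns a
# headers dict that plugs straight into requests; the URL here is hypothetical.
import requests

resp = requests.get('https://example.com', headers=get_agent('chrome'))
print(resp.status_code)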
def get_content(link):
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(link, headers=headers)
    response.encoding = 'utf-8'
    print(response.request.url)
    print(response.status_code)
    content = response.text
    print(content)
    name = re.findall(r" agentName:'(.*?)',", content, re.S)
    print(name)
class RandomUserAgentMiddleware(Middleware):
    engine = FakeUserAgent()

    def process_request(self, request, spider):
        random_user_agent = self.get_random_user_agent()
        request.headers.setdefault('User-Agent', random_user_agent)
        spider.log(f'Using {random_user_agent}', logging.INFO)

    def get_random_user_agent(self):
        return self.engine.chrome
def __init__(self, url=None, ipq=None, savehd=None):
    self.starturl = url
    self.infohd = savehd
    self.ua = FakeUserAgent()
    self.ipqueue = ipq
    self.ips = []
    self.opener = None
    self.reqnum = 0
    self.iterips = None
    self.curip = None
def get_ip():
    headers = {'User-Agent': FakeUserAgent().random}
    url = 'http://api3.xiguadaili.com/ip/?tid=556756079976571&num=1&category=2&sortby=time&filter=on'
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except Exception:
        # Propagate the retry's result instead of silently returning None.
        return get_ip()
def __init__(self, cache=None):
    self.user_agent = FakeUserAgent()
    self.headers = {"user-agent": self.user_agent.random}
    if cache:
        self.cache = cache
    else:
        self.cache = MongoCache(db_name="hupu_crawler",
                                username="******",
                                password="******")
    self.logger = set_logger("hupu_crawler")
    self.redis_client = Redis()
def get_user_agent(num):
    """
    Generate a list of distinct User-Agent headers.

    :param num: how many headers to generate
    :return: list of header dicts
    """
    ua = FakeUserAgent()
    user_agent = []
    for i in range(num):
        user_agent.append({'User-Agent': ua.random})
    return user_agent
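# Usage sketch (assumed): build a pool of headers once, then draw one per request.
# The URL below is hypothetical.
import random
import requests

pool = get_user_agent(10)
resp = requests.get('https://example.com', headers=random.choice(pool))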
def get_request_headers():
    headers = {
        'User-Agent': FakeUserAgent().random,
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7",
    }
    return headers
class LeaderDifference:
    __ua = FakeUserAgent()

    def get_data(self, stock_id, begin, end, test=False):
        try:
            if test:
                print('Test mode')
                with open("res/test/result.txt", "r", encoding='utf-8') as f:
                    html = f.read()
            else:
                begin_date = begin.strftime('%Y%m%d')
                end_date = end.strftime('%Y%m%d')
                # end date is earlier than begin date
                url = 'https://histock.tw/stock/branch.aspx?no={}&from={}&to={}'.format(
                    str(stock_id), end_date, begin_date)
                time.sleep(1)
                for _ in range(5):
                    proxy_index = randomproxy.random_proxy()
                    proxy = randomproxy.proxies[proxy_index]
                    # Make the call
                    try:
                        r = requests.get(
                            url,
                            headers={'User-Agent': self.__ua.random},
                            proxies={
                                'http': '{0}:{1}'.format(proxy['ip'], proxy['port'])
                            })
                        r.encoding = 'utf-8'
                        html = r.text
                        break
                    except requests.RequestException:
                        # If error, delete this proxy and find another one
                        del randomproxy.proxies[proxy_index]
                        print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
            # Extract the JSON payload from the inline <script> defining jsonDatas.
            soup = BeautifulSoup(html, 'html.parser')
            pattern = re.compile(r'var jsonDatas', re.MULTILINE | re.DOTALL)
            script = soup.find("script", text=pattern)
            start = script.string.find('eval(') + 5
            stop = script.string.find('});') + 1
            json_s = script.string[start:stop]
            return json.loads(json_s)
        except Exception as e:
            print("except: {}".format(str(e)))
            return None
class FakeUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(FakeUserAgentMiddleware, self).__init__()
        self.ua = FakeUserAgent()
        self.ua_type = crawler.settings.get("RANDOM_USER_AGENT_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent', getattr(self.ua, self.ua_type))
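# Sketch of wiring the middleware above into a Scrapy project (assumed, not from
# the original source): the module path 'myproject.middlewares' and priority 543
# are illustrative; RANDOM_USER_AGENT_TYPE matches the setting the middleware reads.
# In settings.py:
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.FakeUserAgentMiddleware': 543,
}
RANDOM_USER_AGENT_TYPE = 'chrome'  # any FakeUserAgent attribute name, e.g. 'random'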
def get_link():
    headers = {'User-Agent': FakeUserAgent().random}
    response = requests.get(url, headers=headers)
    response.encoding = 'gb2312'
    content = response.text
    sel = parsel.Selector(content)
    link = sel.xpath(
        '//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href'
    ).getall()[6:]
    for i in link:
        # urls = 'https://zu.fang.com' + i
        urls = 'http://search.fang.com/captcha-b64c3c4d4e3190bb69/redirect?h=https://zu.fang.com/chuzu/1_61211134_-1.htm'
        get_content(urls)
        break
def __init__(self):
    ua = FakeUserAgent()
    self.station_name = station()
    self.from_station = input('Enter departure station: ')
    from_station = self.station_name[self.from_station]
    self.to_station = input('Enter arrival station: ')
    to_station = self.station_name[self.to_station]
    self.date = input('Enter departure date: ')
    self.url = 'https://kyfw.12306.cn/otn/leftTicket/queryZ?' \
               'leftTicketDTO.train_date=%s&' \
               'leftTicketDTO.from_station=%s&' \
               'leftTicketDTO.to_station=%s&' \
               'purpose_codes=ADULT' % (self.date, from_station, to_station)
    self.headers = {'user-agent': ua.chrome}
def chekout_proxy(ip):
    ip = {'http': ip}
    proxy = request.ProxyHandler(ip)
    opener = request.build_opener(proxy)
    ua = FakeUserAgent()
    url = 'http://www.baidu.com'
    headinfo = {'User-Agent': ua.random}
    reqhd = request.Request(url, headers=headinfo)
    try:
        req = opener.open(reqhd, timeout=5)
    except Exception as e:
        print('invalid ip:', ip, e)
        return
    if req.code == 200:
        return ip
async def get_html_test(url):
    browser = await launch()
    page = await browser.newPage()
    await page.setUserAgent(FakeUserAgent().chrome)
    # Hide the headless-browser fingerprint.
    await page.evaluateOnNewDocument(
        'Object.defineProperty(navigator, "webdriver", {get: () => undefined})')
    htmls = ''
    for i in url:
        await page.goto(i)
        await page.waitFor(random.randrange(2, 5) * 1000)  # waitFor takes milliseconds
        for j in range(12):
            await page.keyboard.press('PageDown')
            # Use asyncio.sleep, not time.sleep, to avoid blocking the event loop.
            await asyncio.sleep(random.randrange(1, 4))
        htmls += await page.content()
    await browser.close()
    return htmls
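# Usage sketch (assumed): the coroutine above needs an event loop to run; the URL
# list below is hypothetical.
import asyncio

html = asyncio.get_event_loop().run_until_complete(
    get_html_test(['https://example.com']))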
def main():
    session = HTMLSession()
    headers = {
        'user-agent': FakeUserAgent().chrome,
        'referer': 'https://dxy.com/',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,'
                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'upgrade-insecure-requests': '1',
    }
    session, link = get_link(session, headers)
    session, disease, tag_name = get_disease_link(session, link, headers)
    data = get_tag(session, disease, headers)
    # if len(tag_name) != len(data):
    #     print('The crawl hit an error!')
    #     exit()
    print(data)