# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from fake_useragent import UserAgent

BOT_NAME = 'guazi'

SPIDER_MODULES = ['guazi.spiders']
NEWSPIDER_MODULE = 'guazi.spiders'

LOG_LEVEL = 'WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
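# Note: USER_AGENT above is drawn once when the settings module is imported, so the
# whole crawl reuses a single string. If a fresh User-Agent per request is preferred,
# a small downloader middleware can set the header instead. The sketch below is an
# illustration only; the guazi/middlewares.py location and the class name are
# assumptions, not part of the original project.

# guazi/middlewares.py (hypothetical module)
from fake_useragent import UserAgent


class RandomUserAgentMiddleware:
    """Set a newly drawn random User-Agent on every outgoing request."""

    def __init__(self):
        # Build the UserAgent pool once; drawing .random afterwards is cheap.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random

# It would then be enabled through DOWNLOADER_MIDDLEWARES, as sketched after the
# 'movie' settings file further down.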
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import numpy as np
import time
import tqdm

from fake_useragent import UserAgent

url = "https://www.hltv.org/results?offset="
pd.options.mode.chained_assignment = None  # default='warn'
ua = UserAgent(cache=False)


def team_matches(team, df):
    # Keep only the matches in which the team appears on either side.
    return df.loc[(df['team1'] == team) | (df['team2'] == team)].reset_index(drop=True)


class hltv(object):
    """Scraper for HLTV match results pages."""

    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.dataframe = pd.DataFrame()

    def results(self, num):
        df = pd.DataFrame(
            columns=['team1', 'score1', 'score2', 'team2', 'url', 'date'])
        session = requests.Session()
        session.headers.update({'User-Agent': self.user_agent})
        for index_of_page in range(num):
def __init__(self, login, password, like_per_day=1000, media_max_like=150, media_min_like=0, follow_per_day=0, follow_time=5 * 60 * 60, #Cannot be zero unfollow_per_day=0, start_at_h=0, start_at_m=0, end_at_h=23, end_at_m=59, database_name='follows_db.db', comment_list=[["this", "the", "your"], ["photo", "picture", "pic", "shot", "snapshot"], ["is", "looks", "feels", "is really"], ["great", "super", "good", "very good", "good", "wow", "WOW", "cool", "GREAT", "magnificent", "magical", "very cool", "stylish", "beautiful", "so beautiful", "so stylish", "so professional", "lovely", "so lovely", "very lovely", "glorious", "so glorious", "very glorious", "adorable", "excellent", "amazing"], [".", "..", "...", "!", "!!", "!!!"]], comments_per_day=0, tag_list=['cat', 'car', 'dog'], max_like_for_one_tag=5, unfollow_break_min=15, unfollow_break_max=30, log_mod=0, proxy="", user_blacklist={}, tag_blacklist=[], unwanted_username_list=[], unfollow_whitelist=[]): self.database_name = database_name self.follows_db = sqlite3.connect(database_name, timeout=0, isolation_level=None) self.follows_db_c = self.follows_db.cursor() check_and_update(self) fake_ua = UserAgent() self.user_agent = check_and_insert_user_agent(self, str(fake_ua.random)) self.bot_start = datetime.datetime.now() self.bot_start_ts = time.time() self.start_at_h = start_at_h self.start_at_m = start_at_m self.end_at_h = end_at_h self.end_at_m = end_at_m self.unfollow_break_min = unfollow_break_min self.unfollow_break_max = unfollow_break_max self.user_blacklist = user_blacklist self.tag_blacklist = tag_blacklist self.unfollow_whitelist = unfollow_whitelist self.comment_list = comment_list self.instaloader = instaloader.Instaloader() self.time_in_day = 24 * 60 * 60 # Like self.like_per_day = like_per_day if self.like_per_day != 0: self.like_delay = self.time_in_day / self.like_per_day # Follow self.follow_time = follow_time #Cannot be zero self.follow_per_day = follow_per_day if self.follow_per_day != 0: self.follow_delay = self.time_in_day / self.follow_per_day # Unfollow self.unfollow_per_day = unfollow_per_day if self.unfollow_per_day != 0: self.unfollow_delay = self.time_in_day / self.unfollow_per_day # Comment self.comments_per_day = comments_per_day if self.comments_per_day != 0: self.comments_delay = self.time_in_day / self.comments_per_day # Don't like if media have more than n likes. self.media_max_like = media_max_like # Don't like if media have less than n likes. self.media_min_like = media_min_like # Auto mod seting: # Default list of tag. self.tag_list = tag_list # Get random tag, from tag_list, and like (1 to n) times. 
self.max_like_for_one_tag = max_like_for_one_tag # log_mod 0 to console, 1 to file self.log_mod = log_mod self.s = requests.Session() self.c = requests.Session() # if you need proxy make something like this: # self.s.proxies = {"https" : "http://proxyip:proxyport"} # by @ageorgios if proxy != "": proxies = { 'http': 'http://' + proxy, 'https': 'http://' + proxy, } self.s.proxies.update(proxies) # convert login to lower self.user_login = login.lower() self.user_password = password self.bot_mode = 0 self.media_by_tag = [] self.media_on_feed = [] self.media_by_user = [] self.current_user_info = '' self.unwanted_username_list = unwanted_username_list now_time = datetime.datetime.now() self.check_for_bot_update() log_string = 'Instabot v1.2.0/1 started at %s:' % \ (now_time.strftime("%d.%m.%Y %H:%M")) self.write_log(log_string) self.login() self.populate_user_blacklist() signal.signal(signal.SIGTERM, self.cleanup) atexit.register(self.cleanup)
def get_header():
    ua = UserAgent(verify_ssl=False)
    return {'User-Agent': ua.random}
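# A quick usage sketch for get_header() above: the returned dict plugs straight into
# requests. The endpoint is only an example (httpbin echoes the request back).
import requests

response = requests.get('https://httpbin.org/get', headers=get_header(), timeout=10)
print(response.status_code)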
def test_fake_update_use_cache_server():
    ua = UserAgent(cache=False, use_cache_server=True)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        ua.update()

        _probe(ua)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
        settings.CACHE_SERVER,
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        with pytest.raises(FakeUserAgentError):
            ua.update()
def test_fake_update_cache(path):
    assert not os.path.isfile(path)

    ua = UserAgent(path=path, cache=False, use_cache_server=False)

    assert not os.path.isfile(path)

    with pytest.raises(AssertionError):
        ua.update(cache='y')

    ua.update(cache=True)

    assert os.path.isfile(path)

    _probe(ua)
def resolver(mongoconf, exit_event, debug=False): ua = UserAgent() ua.update() db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']] linkscoll = db['links'] tweetscoll = db['tweets'] while not exit_event.is_set(): done = 0 todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)])) urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])])) alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})} tweetsdone = [] batchidsdone = set() for tweet in todo: if tweet.get("proper_links", []): tweetsdone.append(tweet["_id"]) continue tweetid = tweet.get('retweet_id') or tweet['_id'] if tweetid in batchidsdone: continue if exit_event.is_set(): continue gdlinks = [] for link in tweet.get("links", []): if link in alreadydone: gdlinks.append(alreadydone[link]) continue good = resolve_url(link, user_agent=ua) gdlinks.append(good) try: linkscoll.save({'_id': link, 'real': good}) except Exception as e: log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e)) if link != good: done += 1 tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True) batchidsdone.add(tweetid) if debug and done: left = tweetscoll.count({"links_to_resolve": True}) log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left)) # clear tweets potentially rediscovered if tweetsdone: tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True) log("INFO", "FINISHED resolver")
def test_custom_path():
    location = os.path.join(
        tempfile.gettempdir(),
        'fake_useragent' + uuid.uuid1().hex + '.json',
    )

    ua = UserAgent(path=location)

    assert utils.exist(location)

    check_dict(ua.data)

    mtime = os.path.getmtime(location)

    ua.update()

    assert os.path.getmtime(location) != mtime

    clear(location)
def test_fake_user_agent_browsers():
    ua = UserAgent(cache=False, use_cache_server=False)

    _probe(ua)

    with pytest.raises(FakeUserAgentError):
        ua.non_existing

    with pytest.raises(FakeUserAgentError):
        ua['non_existing']

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear()
    del ua

    ua = UserAgent()

    assert utils.exist()

    data1 = ua.data

    clear()

    ua.update()

    assert utils.exist()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
def __init__(self, crawler):
    super(RandomUserAgentMiddlware, self).__init__()
    # self.user_agent_list = crawler.settings.get("user_agent_list", [])
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
from fake_useragent import UserAgent

us = UserAgent()
print(us.random)
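# Besides .random, the same object exposes per-browser accessors, as the test
# snippets further down exercise. A small illustration:
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)            # a Chrome User-Agent string
print(ua.firefox)           # a Firefox User-Agent string
print(ua['google chrome'])  # dict-style access works as well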
# coding=utf-8
!pip install -U fake-useragent
!pip install -U func_timeout

from bs4 import BeautifulSoup
from lxml import html as h
from fake_useragent import UserAgent
from google.colab import drive
from math import ceil
from posixpath import normpath
from urllib.parse import urlencode, urljoin, urlparse, urlunparse
from datetime import date, datetime, timedelta
import pandas as pd
import csv, func_timeout, html, os.path, pickle, re, requests, string, time

# drive.mount('/content/drive')
print(str(UserAgent().random))


@func_timeout.func_set_timeout(30)  # the argument of func_set_timeout is the timeout in seconds (30 s here)
def askChoice(slogan):  # wrap the question we want to ask the user in a function
    inputs = input(f'{slogan}\n')
    return inputs
# The program calls askChoice first and starts the timer.
# If the user answers within the time limit, the input is passed on normally.
# If the user times out, func_timeout.exceptions.FunctionTimedOut is raised;
# catch it with try...except and continue from there.


def getHTMLText(url, code='utf-8'):
    Headers = {'User-Agent': str(UserAgent().random)}
    r = requests.get(url, headers=Headers, timeout=30)
    r.raise_for_status()
    r.encoding = code
    return r
def getHTMLText(url, code='utf-8'):
    Headers = {'User-Agent': str(UserAgent().random)}
    r = requests.get(url, headers=Headers, timeout=30)
    r.raise_for_status()
    r.encoding = code
    return r
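# getHTMLText returns the whole requests.Response rather than the page text, so a
# caller takes .text before parsing. A usage sketch reusing the function above; the
# URL and parser choice are placeholders.
from bs4 import BeautifulSoup

resp = getHTMLText('https://example.com')
soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.title)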
import requests
import re
import sys
import time
import os
import argparse
from bs4 import BeautifulSoup
from functools import partial
from multiprocessing import Pool, TimeoutError, cpu_count
from fake_useragent import UserAgent

ua = UserAgent().random

parser = argparse.ArgumentParser(
    description='Argument parser for dork-scanner')
parser.add_argument('-S', '--search', help='String to be searched for',
                    default='1')
parser.add_argument('-E', '--engine', help='Search engine to be used',
                    default='google')
parser.add_argument('-P', '--page', help='Number of pages to search in',
                    default='1')
parser.add_argument('-Pr', '--process', help='Number of parallel processes',
                    default='1')
def change_web_scraping_info(self):
    print('Changing user-agent and the proxy...')
    ua = UserAgent()
    self.user_agent = str(ua.random)
def test_user_agent(): clear(settings.DB) assert not utils.exist(settings.DB) ua = UserAgent(cache=False) assert ua.ie is not None assert ua.msie is not None assert ua.internetexplorer is not None assert ua.internet_explorer is not None assert ua['internet explorer'] is not None assert ua.google is not None assert ua.chrome is not None assert ua.googlechrome is not None assert ua.google_chrome is not None assert ua['google chrome'] is not None assert ua.firefox is not None assert ua.ff is not None assert ua.ie is not None assert ua.safari is not None assert ua.random is not None assert ua['random'] is not None try: ua.non_existing except FakeUserAgentError: pass else: assert False try: assert ua['non_existing'] except FakeUserAgentError: pass else: assert False data1 = ua.data ua.update(settings.DB) data2 = ua.data assert data1 == data2 assert data1 is not data2 clear(settings.DB) del ua ua = UserAgent() assert utils.exist(settings.DB) data1 = ua.data clear(settings.DB) ua.update(settings.DB) assert utils.exist(settings.DB) data2 = ua.data assert data1 == data2 assert data1 is not data2 clear(settings.DB)
from selenium.common.exceptions import TimeoutException, NoSuchElementException from selenium.webdriver import DesiredCapabilities #跳过SSL验证证书 import ssl #设置忽略SSL验证 from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.proxy import ProxyType from fake_useragent import UserAgent from config import phantomjs_driver, generate_cookie_url, chrome_driver, generate_cookie_url, cookie_max_num, \ cookie_timeout, cookie_retry_num, cookie_interval_time, log_dir, crawl_interval_mintime, crawl_interval_maxtime, \ base_url1 from proxy_ip import _proxy ssl._create_default_https_context = ssl._create_unverified_context ua = UserAgent(verify_ssl=False) is_clear = False # 请求头设置 user_agent = [ "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
def __init__(self, crawler):
    super(RandomUserAgentMiddleware, self).__init__()
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
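# The snippet above only shows the constructor. One common way such a middleware is
# completed (a sketch, not the original author's code) is a from_crawler factory plus
# a process_request that looks the configured type up on the UserAgent object.
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    """Sketch of the full middleware; only __init__ appears in the original snippet."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware with access to the settings.
        return cls(crawler)

    def process_request(self, request, spider):
        # RANDOM_UA_TYPE can be 'random', 'chrome', 'firefox', ...;
        # getattr maps it onto the matching UserAgent attribute.
        request.headers['User-Agent'] = getattr(self.ua, self.ua_type)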
def parse(self,pass_url): #解析当前微博下的所有评论用户 first_req=requests.get(pass_url+str(1),cookies=self.new_cookies()).content if 'not exist' in str(first_req): return None html = etree.HTML(first_req) #获取中断的页面 try: with open('page_num.txt','r') as f: broken_page_num=int(f.readlines()[0])+1 except: broken_page_num=1 #评论总页数 try: page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1] except: #print('[-----]页面请求错误') return self.parse(pass_url=pass_url) for page in range(broken_page_num,int(page_num)+1): print(page) if page % 5 == 0: with open('page_num.txt','w') as f: f.write(str(page)) fi=set() #保存当前运行状态 cookies=self.new_cookies() #print('[++++++++]当前cookies:',str(cookies)) try: req=requests.get(pass_url+str(page),cookies=cookies,headers={"User-Agent":UserAgent().random}).content html=etree.HTML(req) fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href') fans_name=html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()') except: while True: #print('[!!!!!]出现错误,未获取到内容:') time.sleep(5) try: req = requests.get(pass_url + str(page),headers={"User-Agent":UserAgent().random},cookies=cookies).content html = etree.HTML(req) fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href') fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()') break except: pass for i,j in enumerate(fans): #防止底部返回链接的干扰 if '5644764907' in j: continue fans_url='https://weibo.cn/'+j.split('/u/')[1]+'/info' fans_weibo='https://weibo.cn'+j m_url="https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}".format(j.split('/u/')[1],j.split('/u/')[1]) name=fans_name[i] if name in fi: pass else: fi.add(name) self.db.lpush(fans_url) self.db1.lpush(fans_weibo) self.db2.lpush(m_url) print('[+++][+++][+++]',name) #在应对限制ip的反爬措施中,效率最高的等待时间 time.sleep(0.35) #爬完该篇微博的所有评论后 time.sleep(1) with open('page_num.txt','w') as f: f.write('0')
else: proxies = re.findall( re.compile('<td>([\d.]+)</td>'), str(requests.get('https://www.sslproxies.org/').content)) proxies = [ '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2])) ] print('%d proxies successfully loaded!' % len(proxies)) proxy = Proxy() proxy.proxy_type = ProxyType.MANUAL if args.user_agent: if path.isfile(args.user_agent): user_agents = list( filter(None, open(args.user_agent, 'r').read().split('\n'))) else: user_agents = [args.user_agent] else: user_agents = UserAgent() for i in range(args.threads): t = Thread(target=bot, args=(i, args.url)) t.daemon = True t.start() sleep(uniform(2.0, 4.0)) stdin.read(1) exit(0) except KeyboardInterrupt: exit(0) except: exit(1)
def __init__(self):
    self.headers = dict()
    self.headers['User-Agent'] = UserAgent().random
    self.my_session = requests.session()
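# The headers dict above is built but not yet attached to the session. A standalone
# sketch of the two usual ways to combine them (the httpbin URL is just an example):
import requests
from fake_useragent import UserAgent

session = requests.session()
headers = {'User-Agent': UserAgent().random}

# Attach the headers once so every request made through the session carries the UA...
session.headers.update(headers)

# ...or pass them explicitly on a single call.
resp = session.get('https://httpbin.org/get', headers=headers, timeout=10)
print(resp.status_code)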
def get_df(self): print('get data from services...') sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id') serv_df = sfb_df[sfb_df['type'] == 'services'] list_url = serv_df['URL'].values final_df = pd.DataFrame(columns=[ 'date', 'type', 'category_id', 'category_title', 'site_title', 'price_new', 'price_old', 'site_unit', 'site_link', 'site_code' ]) #mgts n = 0 url = list_url[n] print(url) html = requests.get(url, headers={ 'User-Agent': UserAgent().chrome }).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') price_list = soup.findAll('div', {'class': 'slider_slide'}) #0 заменить for price_elem in price_list: if price_elem.findAll('div', {'class': 'texts'})[0].text == 'Безлимитный': price_dict = dict() price_dict['date'] = Global().date price_dict['site_code'] = 'services' id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_id'] = id_n price_dict['category_title'] = serv_df['cat_title'].loc[ price_dict['category_id']] price_dict['type'] = 'services' price_dict['site_title'] = price_elem.findAll( 'div', {'class': 'texts'})[0].text price_dict['price_new'] = int( price_elem.findAll('div', {'class': 'slider_price_val'})[0].text) price_dict['price_old'] = '' price_dict['site_unit'] = price_elem.findAll( 'div', {'class': 'slider_price_rub1' })[0].text + '/' + price_elem.findAll( 'div', {'class': 'slider_price_rub2'})[0].text price_dict['site_link'] = url final_df = final_df.append(price_dict, ignore_index=True) break #Помывка в бане в общем отделении, билет http://legkiipar.ru/menraz.html try: n = 1 url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') #Будние дни с 08:00 до 22:00 pattern = re.compile(r'Будние дни') price_dict = dict() price_dict['date'] = Global().date price_dict['site_code'] = 'services' price_dict['type'] = 'services' price_dict['site_title'] = soup(text=pattern)[0] price_1 = soup.findAll('span', {'class': 'стиль6'}) price_dict['price_new'] = re.findall('\d+', price_1[1].text)[0] price_dict['price_old'] = '' price_dict['site_unit'] = re.findall('\d+ часа', price_1[4].text[:-1])[0] price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df['cat_title'].loc[ price_dict['category_id']].values[0] price_dict['site_link'] = url final_df = final_df.append(price_dict, ignore_index=True) except: print('DAMN! 
{} can not be parsed'.format(url)) #Помывка в бане в общем отделении, билет http://banya-lefortovo.ru/price.html n = 2 price_dict = dict() price_dict['date'] = Global().date price_dict['site_code'] = 'services' url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') pattern = re.compile(r'Русская общая баня') price_dict['site_title'] = soup(text=pattern)[0] price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] price_dict['type'] = 'services' price_dict['price_new'] = int( re.findall('\d+', re.findall('\d+ рублей', soup(text=pattern)[0])[0])[0]) price_dict['price_old'] = '' price_dict['site_unit'] = re.findall('\d+ часа', soup(text=pattern)[0])[0] price_dict['site_link'] = url final_df = final_df.append(price_dict, ignore_index=True) #Помывка в бане в общем отделении, билет https://rzhevskie-bani.ru/rb/bani.html n = 3 price_dict = dict() url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') price_dict['price_new'] = int( re.findall('\d+', soup.findAll('td', {'class': 'price'})[0].text)[0]) pattern = re.compile(r'Стоимость') soup.findAll('td') price_dict['date'] = Global().date price_dict['site_code'] = 'services' price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] price_dict['site_title'] = soup(text=pattern)[0] price_dict['type'] = 'services' price_dict['site_unit'] = re.findall('(\d+.*\d часа)', soup(text=pattern)[0][-9:])[0] price_dict['site_link'] = url final_df = final_df.append(price_dict, ignore_index=True) #Помывка в бане в общем отделении, билет http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad n = 4 price_dict = dict() price_dict['date'] = Global().date price_dict['site_code'] = 'services' url = list_url[n] print(url) price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) try: html = requests.get(url, headers={ 'User-Agent': UserAgent().chrome }, timeout=10).content except: proxy = get_proxy(url) html = requests.get(url, headers={ 'User-Agent': UserAgent().chrome }, proxies=proxy).content soup = BeautifulSoup(html, 'lxml') price_div = soup.findAll('div', {'class': 'price-head'})[0] price_dict['price_new'] = int( re.findall('\d+', price_div.findAll('span', {'class': 'price'})[0].text)[0]) price_dict['price_old'] = '' price_dict['site_title'] = price_div.find('p').text.replace( '\xa0', ' ') price_dict['site_unit'] = re.findall('\d+ часа', price_dict['site_title'])[0] price_dict['type'] = 'services' price_dict['site_link'] = url price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] final_df = final_df.append(price_dict, ignore_index=True) #Постановка набоек, пара https://masterskaya-obuvi.ru/tseny ''' n=5 price_dict=dict() price_dict['date']=Global().date price_dict['site_code']='services' url=list_url[n] print(url) html=requests.get(url).content#, headers={'User-Agent': UserAgent().chrome} soup=BeautifulSoup(html, 'lxml') price_dict['category_id']=int(serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0] for elem in soup.findAll('tr'): if re.findall('износоустойчивой 
резины',elem.text)!=[]: price_div=elem price_dict['site_title']=re.findall('[А-Яа-яёз(). ]+',elem.text)[0] price_dict['site_unit']=re.findall('[А-Яа-яёз(). ]+',elem.text)[1] price_dict['price_new']=int(price_div.findAll('td',{'width':"356"})[0].text) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_link']=url break final_df=final_df.append(price_dict,ignore_index=True) ''' #Постановка набоек, пара https://masterskaya-obuvi.ru/tseny n = 6 price_dict = dict() price_dict['date'] = Global().date price_dict['site_code'] = 'services' url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] for elem in soup.findAll('tr'): if re.findall('эконом', elem.text) != []: price_div = elem price_dict['site_title'] = self.wspex_space( re.findall( '[А-Яа-яёз(). ]+', price_div.findAll('td', {'align': 'left'})[0].text)[0]) price_text = price_div.findAll('strong')[0].text price_dict['price_new'] = int(re.findall('\d+', price_text)[0]) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = re.findall( '\([А-Яа-я]*\)', price_dict['site_title'])[0][1:-1] price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) #Билет на 1 поездку - мосгортранс n = 7 price_dict = dict() price_dict['site_code'] = 'services' price_dict['date'] = Global().date url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') #soup.findAll('td')#,{'class':'text-center'})[0] price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'] for elem in soup.findAll('td'): if re.findall('не более', elem.text) != []: price_div = elem site_title = price_div.text break for elem in soup.findAll('tr'): if re.findall('не более', elem.text) != []: price_div = elem price_dict['site_title'] = price_div.find('td').text price_dict['price_new'] = int( re.findall('\d{2,3}', price_div.text)[0]) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = 'поездка' price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) # # стрижка try: n = 8 price_dict = dict() price_dict['site_code'] = 'services' price_dict['date'] = Global().date url = list_url[n] print(url) html = requests.get( url).content # , headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') # soup.findAll('td')#,{'class':'text-center'})[0] for elem in soup.findAll('tr'): if re.findall('(любой длины)', elem.text) != []: price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[-1]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] price_text = elem.text price_dict['site_title'] = re.findall( '[А-Яа-я ()]+', price_text)[0] price_dict['price_new'] = re.findall('\d+', price_text)[0] price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = 'стрижка' price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) except: print('DAMN! 
{} can not be parsed'.format(url)) #стрижка try: n = 9 price_dict = dict() price_dict['site_code'] = 'services' price_dict['date'] = Global().date url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') for elem in soup.findAll('tr'): if re.findall('Женская', elem.text) != []: price_div = elem price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] price_dict['site_title'] = price_div.find( 'td', { 'class': 'services-table__name' }).text price_dict['price_new'] = int( self.wspex( price_div.find( 'td', { 'class': 'services-table__price services-table__price-small' }).text)) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = 'стрижка' price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) except: print('DAMN! {} can not be parsed'.format(url)) #стрижка n = 10 price_dict = dict() price_dict['site_code'] = 'services' price_dict['date'] = Global().date url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') for elem in soup.findAll('tr'): if re.findall('лопаток', elem.text) != []: price_div = elem price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(list_url[n - 1])].index[0]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'].values[0] price_dict['site_title'] = price_div.find( 'td', { 'height': '17' }).text price_dict['price_new'] = int( self.wspex(price_div.find('td', { 'width': '157' }).text)) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = 'стрижка' price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) #Билет на 1 поездку - мосгортранс n = 11 price_dict = dict() price_dict['site_code'] = 'services' price_dict['date'] = Global().date url = list_url[n] print(url) html = requests.get( url).content #, headers={'User-Agent': UserAgent().chrome} soup = BeautifulSoup(html, 'lxml') for elem in soup.findAll('tr'): if re.findall('не более', elem.text) != []: price_div = elem price_dict['category_id'] = int( serv_df[serv_df['URL'].str.contains(url)].index[-1]) price_dict['category_title'] = serv_df.loc[ price_dict['category_id']]['cat_title'] price_dict['site_title'] = price_div.find('td').text price_dict['price_new'] = int( re.findall('\d{2,3}', price_div.text)[0]) price_dict['price_old'] = '' price_dict['type'] = 'services' price_dict['site_unit'] = 'поездка' price_dict['site_link'] = url break final_df = final_df.append(price_dict, ignore_index=True) final_df = final_df[final_df.site_title.notna()] print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!') return final_df
from bs4 import BeautifulSoup
import os
import urllib.parse as rep
import urllib.request as req
from fake_useragent import UserAgent

opener = req.build_opener()
opener.addheaders = [('User-agent', UserAgent().chrome)]
req.install_opener(opener)

base = "https://search.naver.com/search.naver?sm=tab_hty.top&where=image&query="
quote = req.quote('카구야')
url = base + quote
print('Request URL : {}'.format(url))

res = req.urlopen(url)

savePath = "C:/Users/hsm01/Desktop/web/crawl/result2"

try:
    if not (os.path.isdir(savePath)):
        os.makedirs(os.path.join(savePath))
except OSError as e:
    print("folder creation failed")
    print(f"folder name : {e.filename}")
    raise RuntimeError("system exit!")
else:
    print("folder created")

soup = BeautifulSoup(res, "html.parser")
# print(soup.prettify())
img_list = soup.select('div.img_area > a.thumb._thumb > img')
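# The snippet stops after selecting the thumbnail tags. A hypothetical continuation
# that saves them into savePath, assuming each tag exposes its URL in the src
# attribute (Naver's markup may use a different attribute); img_list, savePath, os
# and req come from the snippet above.
for i, img in enumerate(img_list, start=1):
    src = img.get('src')  # assumption: thumbnail URL lives in the src attribute
    if not src:
        continue
    target = os.path.join(savePath, f'image_{i}.jpg')
    # urlretrieve goes through the opener installed above, so the UA header applies.
    req.urlretrieve(src, target)
    print('saved', target)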
print(proxy.http_proxy) chrome_options.add_argument('user-agent="{}"'.format(agent.random)) capabilities = webdriver.DesiredCapabilities.CHROME proxy.add_to_capabilities(capabilities) driver = webdriver.Chrome(options=chrome_options, desired_capabilities=capabilities) driver.get(args.url) sleep(args.duration) driver.close() except: _exit(0) if args.proxies: proxies = open(args.proxies, 'r').read().split('\n') else: proxies = re.findall( re.compile('<td>([\d.]+)</td>'), str(requests.get('https://free-proxy-list.net/').content)) proxies = ['%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))] print('%d proxies successfully loaded!' % len(proxies)) proxy = Proxy() proxy.proxy_type = ProxyType.MANUAL agent = UserAgent() chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--mute-audio') for i in range(args.threads): t = Thread(target=bot, args=(args.url, )) t.deamon = True t.start() sleep(uniform(1.5, 3.0))
def process_request(self, request, spider):
    ua = UserAgent()
    request.headers['User-Agent'] = ua.random
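# Building a UserAgent inside process_request recreates the pool on every request.
# The constructor-cached pattern used by the other middlewares in this collection is
# cheaper, and the fallback argument (seen in the ClientSession snippet below) guards
# against the upstream data sources being unreachable. A sketch; the fallback string
# is only an example.
from fake_useragent import UserAgent


class CachedRandomUserAgentMiddleware(object):
    def __init__(self):
        # Build the pool once; fall back to a fixed UA if the data cannot be fetched.
        self.ua = UserAgent(fallback='Mozilla/5.0 (X11; Linux x86_64) '
                                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                                     'Chrome/80.0.3987.116 Safari/537.36')

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random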
def proxy_chrome(PROXY_HOST,PROXY_PORT,PROXY_USER,PROXY_PASS): manifest_json = """ { "version": "1.0.0", "manifest_version": 2, "name": "Chrome Proxy", "permissions": [ "proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking" ], "background": { "scripts": ["background.js"] }, "minimum_chrome_version":"22.0.0" } """ background_js = """ var config = { mode: "fixed_servers", rules: { singleProxy: { scheme: "http", host: "%(host)s", port: parseInt(%(port)d) }, bypassList: ["foobar.com"] } }; chrome.proxy.settings.set({value: config, scope: "regular"}, function() {}); function callbackFn(details) { return { authCredentials: { username: "******", password: "******" } }; } chrome.webRequest.onAuthRequired.addListener( callbackFn, {urls: ["<all_urls>"]}, ['blocking'] ); """ % { "host": PROXY_HOST, "port": PROXY_PORT, "user": PROXY_USER, "pass": PROXY_PASS, } pluginfile = 'extension\\proxy_auth_plugin.zip' with zipfile.ZipFile(pluginfile, 'w') as zp: zp.writestr("manifest.json", manifest_json) zp.writestr("background.js", background_js) co = Options() # extension support is not possible in incognito mode for now # co.add_argument('--incognito') co.add_argument('--disable-gpu') # disable infobars co.add_argument('--disable-infobars') co.add_argument("--start-maximized") ua = UserAgent() userAgent = ua.random co.add_argument('--user-agent="' + str(userAgent) + '"') co.add_experimental_option("excludeSwitches",["ignore-certificate-errors"]) # location of chromedriver, please change it according to your project. chromedriver = os.getcwd()+'\\Chromedriver\\chromedriver.exe' co.add_extension(pluginfile) driver = webdriver.Chrome(chromedriver,chrome_options=co) # return the driver with added proxy configuration. return driver
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'movie'

SPIDER_MODULES = ['movie.spiders']
NEWSPIDER_MODULE = 'movie.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from fake_useragent import UserAgent
ua = UserAgent()
USER_AGENT = ua.random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
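# As in the 'guazi' settings earlier, USER_AGENT here is fixed for the whole crawl.
# To rotate it per request, the random-UA middleware pattern from the other snippets
# can be wired in through DOWNLOADER_MIDDLEWARES. A sketch: the movie.middlewares
# path, the class name and the priority value are assumptions, not the project's code.
DOWNLOADER_MIDDLEWARES = {
    'movie.middlewares.RandomUserAgentMiddleware': 543,                # hypothetical class
    # Disable Scrapy's built-in middleware so it does not overwrite the header.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
RANDOM_UA_TYPE = 'random'  # consumed by the middleware sketched earlier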
def __init__(self):
    self.ua = UserAgent(use_cache_server=False)
from requests import Session as _Session
from requests.exceptions import ConnectionError, ChunkedEncodingError, Timeout, HTTPError
from requests.adapters import HTTPAdapter
import logging
import time

from .cookiejar import ClientCookieJar

try:
    from fake_useragent import UserAgent
except ImportError:
    UserAgent = None
    ua = None
    ua_str = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
else:
    ua = UserAgent(
        fallback='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    )
    ua_str = ua.chrome

session_logger = logging.getLogger('showroom.session')


class ClientSession(_Session):
    """
    Wrapper for requests.Session.

    Mainly used to catch temporary errors and set a Timeout

    Overrides requests.Session.get() and increases max pool size

    Raises:
import time
import json
import hashlib
import requests
from fake_useragent import UserAgent

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": UserAgent().random}


def main():

    def get_salt():
        """Return a 13-digit millisecond timestamp used as the salt."""
        salt = str(int(round(time.time() * 1000)))
        return salt

    def get_sign():
        """Build the sign value expected by the API."""
        # Note: the salt used here should match the one placed in data['salt'].
        sign = "fanyideskweb" + keywords + get_salt() + "6x(ZHw]mwzX#u0V7@yfwK"
        hl = hashlib.md5()
        hl.update(sign.encode(encoding='utf-8'))
        return hl.hexdigest()  # return the md5 digest, not the raw concatenation

    data = {
        'i': keywords,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': get_salt(),
        'sign': get_sign(),
# Testing ttk
# from tkinter import *
# from tkinter import ttk
# help(ttk.Button)
# root = Tk()
# ttk.Style().configure("TButton", padding=15, relief=FLAT, background="green", foreground="blue")
# # On macOS the ttk Button still does not seem to honour the bg colour
# ttk.Button(root, text="撒放放假啊阱n").pack()
# # Button(root, text="NO.2", relief=SUNKEN, fg="purple").pack()
# # The plain tk button does not render the text either
# mainloop()

from fake_useragent import UserAgent

headers = {UserAgent().random}  # note: braces without a key build a set, not a dict
print(headers)
print('==================================')

str1 = "sasas, sajfqn http:http\n llsaajghttps"
a = str1.rfind("https")
print(a)
print('==================================')

with open("Burning.txt", 'a+') as f:
    # f.write("sssak\n")
    f.seek(0, 0)
    # print(f.read())
    print(f.readlines())
print('==================================')
def genUA():
    """Return a fake random user-agent string."""
    return str(UserAgent().random)
from urllib.request import Request, build_opener
from urllib.request import HTTPHandler
from fake_useragent import UserAgent

url = 'http://httpbin.org/get'
headers = {
    'User-Agent': UserAgent().random
}

request = Request(url, headers=headers)
handler = HTTPHandler(debuglevel=1)
opener = build_opener(handler)
response = opener.open(request)
# print(response.read().decode())
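# debuglevel=1 above prints the raw HTTP exchange, which is one way to verify the
# header. httpbin.org/get also echoes the request back as JSON, so the sent
# User-Agent can be checked programmatically; a small self-contained check:
import json
from urllib.request import Request, build_opener
from fake_useragent import UserAgent

req = Request('http://httpbin.org/get', headers={'User-Agent': UserAgent().random})
body = json.loads(build_opener().open(req).read().decode())
# 'headers' in the echoed JSON contains what was actually sent.
print(body['headers'].get('User-Agent'))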
def main(): START = datetime.now() bad = 'Our systems have detected unusual traffic from your computer network. This page checks to see if it's really you sending the requests, and not a robot.' parser = argparse.ArgumentParser(description="Get URLS from Bing") parser.add_argument( 'mot', nargs="?", help= "The phrase you want to look up. Put a '+' between the terms. eg: paris+the+city+of+love", type=str) parser.add_argument('-n', '--number', default=50, help="Minimum number of links you want (default=50)", type=int) args = parser.parse_args() nb_links = args.number if len(sys.argv) <= 1: parser.print_help() sys.exit(1) url_lis, ignored_link, total = [], 0, 0 ua = UserAgent() header = {'User-Agent': str(ua.random)} print(colored(header, 'yellow')) page = requests.get("https://www.bing.com/search?q=" + args.mot.lower(), headers=header) soup = BeautifulSoup(page.content, 'html.parser') out = open("out.txt", 'w') while len(url_lis) < nb_links: time.sleep(round(random.uniform(3, 7), 2)) # take it easy, don't get banned... # we get the h2 links of the search page h2links = soup.findAll("h2") good_link = re.findall('<a h="ID=SERP.{7}".href="(http.*?)"', str(h2links)) for link in good_link: total += 1 if isValid(link) and link not in url_lis: out.write(link + '\n') print(link) url_lis.append(link) else: ignored_link += 1 print(colored('{} links gotten'.format(len(url_lis)), 'red')) next_url = str(page.content) if re.findall(bad, str(next_url)): print( colored("they're coming after you, run !", 'red', 'on_yellow')) sys.exit(0) #we get here the link of the "Next" button #If you're not searching from francophone areas, you need to change the title of the link eg: Página siguiente, Volgende pagina, Nächste Seite... next_url = re.findall('title="Page suivante" href="(.*?)"', next_url) try: next_url = "https://www.bing.com" + html.unescape(next_url[0]) except IndexError: print(colored('No more results, sorry', 'yellow')) sys.exit(0) print('\n', colored(next_url, 'green'), sep="") page = requests.get(next_url, headers=header) soup = BeautifulSoup(page.content, 'html.parser') out.close() print('\n\nOutput file : out.txt') print( colored('links ignored : ' + str(ignored_link) + ' of ' + str(total), 'blue')) END = (datetime.now() - START).total_seconds() print(colored("Done in {} secs".format(round(END, 2)), 'yellow'))
def __init__(self, crawler):
    super(RandomUserAgentMiddlware, self).__init__()
    self.ua = UserAgent()
def __init__(self):
    self.ua = UserAgent()
api_args['max_id'] = min(api_args.get('max_id', tw['id']), tw['id']-1) metas = prepare_tweet(tw) metas.pop('_id') tw.update(metas) for po in ['user', 'entities', 'extended_entities']: if po in tw: tw.pop(po) db.tweets.update({'_id': tw['id']}, {"$set": tw}, upsert=True) print "...collected %s new tweets" % len(tweets) tweets = api.call('statuses.user_timeline', api_args) db.users.update({'_id': user['twitter']}, {"$set": {"done": True}}) # TODO: refacto all of this with gazouilloire/run.py ua = UserAgent() ua.update() todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)])) left = tweetscoll.count({"links_to_resolve": True}) print "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left while todo: done = 0 urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])])) alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})} tweetsdone = [] batchidsdone = set() for tweet in todo: if tweet.get("proper_links", []): tweetsdone.append(tweet["_id"]) continue tweetid = tweet.get('retweet_id') or tweet['_id']
def test_fake_update():
    ua = UserAgent(cache=False, use_cache_server=False)

    ua.update()

    _probe(ua)
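# The _probe helper these tests rely on is not included in the snippets. A minimal
# hypothetical stand-in that exercises the common accessors might look like this;
# the project's real helper may check considerably more.
def _probe(ua):
    # Touch the usual accessors so a broken data set fails loudly.
    assert ua.random is not None
    assert ua.chrome is not None
    assert ua.firefox is not None
    assert ua['random'] is not None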