def test_fake_update_use_cache_server():
    ua = UserAgent(cache=False, use_cache_server=True)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        ua.update()

        _probe(ua)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
        settings.CACHE_SERVER,
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        with pytest.raises(FakeUserAgentError):
            ua.update()
def __init__(self):
    super().__init__()
    proxy = QNetworkProxy()
    proxy.setType(QNetworkProxy.HttpProxy)
    proxy.setHostName("109.173.124.250")
    proxy.setPort(7793)
    QNetworkProxy.setApplicationProxy(proxy)
    try:
        ua = UserAgent()
        ua.update()
        useragent = ua.random
    except FakeUserAgentError:
        # Fall back to a fixed Chrome user agent if fake_useragent fails.
        useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/" \
                    "72.0.3626.121 Safari/537.36"
    profile = QWebEngineProfile()
    profile.setHttpUserAgent(useragent)
    page = QWebEnginePage(profile)
    page.setUrl(QUrl("https://www.instagram.com/"))
    self.setPage(page)
    self.page().proxyAuthenticationRequired.connect(
        self.handle_proxy_auth_req)
    self.imit_peop = threading.Thread(target=self.imitation_people, args=())
    self._timer = QTimer()
    self.loadFinished.connect(self.startTimer)
    self._timer.timeout.connect(self.start_thread_imitation)
def resolver(pile_links, mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set() or not pile_links.empty():
        todo = []
        while not pile_links.empty() and len(todo) < 10:
            todo.append(pile_links.get())
        if not todo:
            if not exit_event.is_set():
                time.sleep(1)
            continue
        done = 0
        for tweet in todo:
            gdlinks = []
            for link in tweet["links"]:
                good = linkscoll.find_one({'_id': link})
                if good:
                    gdlinks.append(good['real'])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                linkscoll.save({'_id': link, 'real': good})
                if link != good:
                    done += 1
            tweetscoll.update({'_id': tweet['_id']}, {'$set': {'proper_links': gdlinks}}, upsert=False)
        if debug and done:
            log("DEBUG", "[links] +%s links resolved (out of %s/%s)" % (done, len(todo), pile_links.qsize()))
    log("INFO", "FINISHED resolver")
class RotateUserAgentMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent
        self.ua = UserAgent(fallback=self.user_agent)

        cache_file = settings.DB
        ONEDAY_DELTA = 86400
        now = time.time()
        cache_age = 0
        try:
            cache_age = getmtime(cache_file)
        except OSError:
            pass

        if cache_age <= now - ONEDAY_DELTA:
            self.ua.update()

        with open(cache_file, mode='rb') as fp:
            browser_data = json.load(fp)

        test = browser_data.get('browsers', {}).get('chrome', [])
        if not test:
            d = dirname(sys.modules["alascrapy"].__file__)
            backup_filename = 'fake_useragent_%s.json' % settings.__version__
            copyfile(join(d, backup_filename), cache_file)

    def process_request(self, request, spider):
        manual_user_agent = request.meta.get('User-Agent', None)
        if manual_user_agent:
            request.headers['User-Agent'] = manual_user_agent
        else:
            new_user_agent = self.ua.random
            if new_user_agent:
                request.headers['User-Agent'] = new_user_agent
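A hedged sketch of how a rotating user-agent middleware like the one above is typically wired into a Scrapy project; the module path alascrapy.middlewares is assumed for illustration and is not confirmed by the snippet.

# Hypothetical settings.py entry; only the built-in middleware path is a known Scrapy name.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the default UA middleware
    'alascrapy.middlewares.RotateUserAgentMiddleware': 400,              # assumed module path
}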
def process_request(self, request, spider):
    # Set the User-Agent header value.
    try:
        ua = UserAgent()
        ua.update()
        request.headers.setdefault('User-Agent', ua.random)
    except FakeUserAgentError as e:
        print(e)


# # Use random proxies to get around IP-based anti-scraping checks.
# class RandomProxy(object):
#     def __init__(self, iplist):
#         self.iplist = iplist
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         return cls(crawler.settings.getlist('IPLIST'))
#
#     def process_request(self, request, spider):
#         '''
#         Attach a proxy to the request.
#         :param request:
#         :param spider:
#         :return:
#         '''
#         proxy = random.choice(self.iplist)
#         request.meta['proxy'] = proxy
def main():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    log_file_handler = RotatingFileHandler('/tmp/fake_useragent.log',
                                           maxBytes=1024**2,
                                           encoding='utf-8')
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s'
    )
    log_file_handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(log_file_handler)

    logging.info('fake_useragent fetch data now.....')
    try:
        ua = UserAgent()
        ua.update()
    except FakeUserAgentError as e:
        logging.error('fake_useragent fetch data failed, exception occurred: %s' % e,
                      exc_info=True)
def __init__(self, path="./phantomjs"): useragent = UserAgent() useragent.update() dcap = dict(DesiredCapabilities.PHANTOMJS) dcap['phantomjs.page.settings.userAgent'] = useragent.random self.processEngine = webdriver.PhantomJS(executable_path=path, desired_capabilities=dcap)
def _get_headers() -> Dict:
    """
    Return a header dict with a random User-Agent, to accompany requests
    and to avoid being blocked by the server.
    """
    ua = UserAgent()
    ua.update()
    return {"User-Agent": ua.random}
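A minimal usage sketch for the helper above, assuming the requests package is available; the URL is illustrative only.

import requests

resp = requests.get("https://example.com", headers=_get_headers())
print(resp.status_code)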
def get_user_agents_generator(verbose=False, verify_ssl=False):
    if verbose:
        print("retrieving updated user-agent list...")
    ua = UserAgent(verify_ssl=verify_ssl)
    ua.update()
    if verbose:
        print("Done.")
    return ua
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None
    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data
    ua.update()
    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear()
    del ua

    ua = UserAgent()
    assert utils.exist()
    data1 = ua.data

    clear()
    ua.update()
    assert utils.exist()
    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert not ua.ie is None
    assert not ua.msie is None
    assert not ua.internetexplorer is None
    assert not ua.internet_explorer is None
    assert not ua['internet explorer'] is None
    assert not ua.google is None
    assert not ua.chrome is None
    assert not ua.googlechrome is None
    assert not ua.google_chrome is None
    assert not ua['google chrome'] is None
    assert not ua.firefox is None
    assert not ua.ff is None
    assert not ua.ie is None
    assert not ua.safari is None
    assert not ua.random is None
    assert not ua['random'] is None
    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data
    ua.update()
    data2 = ua.data

    assert data1 == data2
    assert not data1 is data2

    clear()
    del ua

    ua = UserAgent()
    assert utils.exist()
    data1 = ua.data

    clear()
    ua.update()
    assert utils.exist()
    data2 = ua.data

    assert data1 == data2
    assert not data1 is data2
def post(CPM, rawCPM, wrong, words, ip, keys, char):
    ua = UserAgent()
    ua.update()
    header = {
        'Origin': 'https://typing-speed-test.aoeu.eu',
        'User-Agent': str(ua.chrome)
    }
    return requests.post(STATS_URL,
                         createParameters(CPM, rawCPM, wrong, words, ip, keys, char, 0, 0, 0, 60),
                         headers=header)
class FakeUA:
    """
    Provides user agents as a singleton: use fake_useragent when it works,
    otherwise fall back to the built-in UA list. Get a random UA via FakeUA.random.
    """

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, 'instance'):
            cls.instance = super(FakeUA, cls).__new__(cls)
        return cls.instance

    def __init__(self, if_update_fake_ua=False):
        self.fake_ua = None
        try:
            if if_update_fake_ua:
                self.fake_ua = UserAgent(path='fake_useragent%s.json' % VERSION)
                self.fake_ua.update()
            else:
                raise FakeUserAgentError()
        except FakeUserAgentError:
            self.some = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
                'Opera/8.0 (Windows NT 5.1; U; en)',
                'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
                'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
            ]

    def __getattr__(self, item):
        if item == 'random':
            if self.fake_ua is not None:
                return self.fake_ua.random
            return random.choice(self.some)
        # Any other attribute is unsupported; raise instead of silently returning None.
        raise AttributeError("Object doesn't have attribute '%s'" % item)
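A small sketch of how the singleton above might be exercised; because __new__ caches the instance, repeated constructions return the same object.

ua1 = FakeUA()
ua2 = FakeUA()
assert ua1 is ua2      # both names point at the cached instance
print(ua1.random)      # a UA string from fake_useragent, or from the built-in fallback list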
def resolve_links(db):
    tweetscoll = db['tweets']
    linkscoll = db['links']
    ua = UserAgent()
    ua.update()
    todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
    left = tweetscoll.count({"links_to_resolve": True})
    print >> sys.stderr, "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left
    while todo:
        done = 0
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        ct = 0
        for tweet in todo:
            ct += 1
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                print >> sys.stderr, " %s / %s " % (ct, left), link
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                alreadydone[link] = good
                try:
                    linkscoll.save({'_id': link, 'real': good})
                    if good != link:
                        print >> sys.stderr, " ->", good
                except Exception as e:
                    print >> sys.stderr, "- WARNING: Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e)
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
        if done:
            left = tweetscoll.count({"links_to_resolve": True})
            print >> sys.stderr, "- [LINKS RESOLVING] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left)
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
def test_fake_update_cache(path):
    assert not os.path.isfile(path)

    ua = UserAgent(path=path, cache=False, use_cache_server=False)

    assert not os.path.isfile(path)

    with pytest.raises(AssertionError):
        ua.update(cache='y')

    ua.update(cache=True)

    assert os.path.isfile(path)

    _probe(ua)
def user_agent(browser) -> str:
    result = BROWSER.get(browser)
    if result is None:
        global _ua
        if _ua is None:
            _ua = UserAgent(fallback=BAIDUSPIDER_USER_AGENT)
        else:
            try:
                delta = now() - fromtimestamp(getmtime(_ua.path))
                if delta.days >= 7:
                    _ua.update()
            except FileNotFoundError:
                pass
        result = _ua[browser]
    return result
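The seven-day staleness check used above can be isolated into a small helper; this is a sketch under the assumption that the cache is a local file whose mtime marks the last refresh (the helper name is illustrative).

from datetime import datetime
from os.path import getmtime

def cache_is_stale(path: str, max_days: int = 7) -> bool:
    """Hypothetical helper: True when the cache file is missing or older than max_days."""
    try:
        age = datetime.now() - datetime.fromtimestamp(getmtime(path))
    except FileNotFoundError:
        return True
    return age.days >= max_days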
class RandomUserAgentMiddleware(object):
    # Rotate the User-Agent at random for each request.
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua.update()
        self.ua_type = crawler.settings.get('RANDOM_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
def resolver(mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set():
        done = 0
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        for tweet in todo:
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            if exit_event.is_set():
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                alreadydone[link] = good
                try:
                    linkscoll.save({'_id': link, 'real': good})
                except Exception as e:
                    log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e))
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        if debug and done:
            left = tweetscoll.count({"links_to_resolve": True})
            log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left))
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
    log("INFO", "FINISHED resolver")
def resolver(mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set():
        done = 0
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        for tweet in todo:
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            if exit_event.is_set():
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                try:
                    linkscoll.save({'_id': link, 'real': good})
                except Exception as e:
                    log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e))
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        if debug and done:
            left = tweetscoll.count({"links_to_resolve": True})
            log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left))
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
    log("INFO", "FINISHED resolver")
def test_custom_path():
    location = os.path.join(
        tempfile.gettempdir(),
        'fake_useragent' + uuid.uuid1().hex + '.json',
    )

    ua = UserAgent(path=location)

    assert utils.exist(location)

    check_dict(ua.data)

    mtime = os.path.getmtime(location)

    ua.update()

    assert os.path.getmtime(location) != mtime

    clear(location)
def test_fake_user_agent_browsers():
    ua = UserAgent(cache=False, use_cache_server=False)

    _probe(ua)

    with pytest.raises(FakeUserAgentError):
        ua.non_existing

    with pytest.raises(FakeUserAgentError):
        ua['non_existing']

    data1 = ua.data
    ua.update()
    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
def main(argv):
    ua = UserAgent()
    ua.update()
    base_url = argv[1]
    url = 'http://web.archive.org/__wb/calendarcaptures?url=http://' + base_url + '&selected_year=2017'
    headers = {'User-Agent': ua.random}
    # Pass the randomized User-Agent along with the request.
    response = requests.get(url, headers=headers)
    json_obj = json.loads(response.text)
    for month in json_obj:
        if month:
            for week in month:
                if week:
                    for day in week:
                        if day:
                            for ts in day['ts']:
                                time_stamps.add(ts)
    for ts in time_stamps:
        urls.append('http://web.archive.org/web/' + str(ts) + '/http://' + base_url)
    print(urls)
def random_header(logger):
    # Create a dict of Accept headers for each user-agent family.
    accepts = {
        "Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Safari, Chrome": "application/xhtml+xml,application/xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"
    }

    # Get a random user-agent. We used Chrome and Firefox user agents.
    # More at: https://pypi.org/project/fake-useragent/
    try:
        # Getting a user agent using the fake_useragent package
        ua = UserAgent(cache=True)
        ua.update()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox
    # In case there's a problem with the fake-useragent package, we still want the scraper to function,
    # so there's a list of ua's (https://developers.whatismybrowser.com/) that we created to swap to another ua.
    # Be aware that the list needs to be updated periodically.
    except FakeUserAgentError as error:
        # Save a message to the logs file.
        logger.error(
            "FakeUserAgent didn't work. Generating headers from the pre-defined list of headers. error: {}"
            .format(error))
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
        ]
        random_user_agent = random.choice(user_agents)
    # Create the headers dictionary. The ua must be matched with the corresponding Accept header.
    finally:
        valid_accept = accepts["Firefox"] if random_user_agent.find("Firefox") > 0 else accepts["Safari, Chrome"]
        headers = {"User-Agent": random_user_agent, "Accept": valid_accept}
    return headers
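An illustrative call of the helper above; the logger configuration is assumed, not taken from the snippet.

import logging

logging.basicConfig(level=logging.INFO)
headers = random_header(logging.getLogger(__name__))
print(headers["User-Agent"], "->", headers["Accept"])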
def __init__(
    self,
    username: str,
    password: str,
    region: RegionChoice = RegionChoice.US,
    quality: QualitySize = QualitySize.LARGE_256k,
    user_agent: Optional[str] = None,
    update_handler: Optional[Callable[[dict], None]] = None,
):
    self._log = logging.getLogger(__file__)

    if user_agent is None:
        try:
            ua = UserAgent(use_cache_server=False)
            ua.update()
            user_agent = ua.chrome
        except Exception:
            user_agent = FALLBACK_UA

    self._ua = user_agent_parser.Parse(user_agent)

    self.reset_session()

    self.username = username
    self.password = password
    self.region = region
    self.stream_quality = quality
    self._playlists = {}
    self._channels = None
    self._favorite_channels = None
    self._use_primary = True

    # vars to manage session cache
    self.last_renew = None
    self.update_interval = 30

    # hook function to call whenever the playlist updates
    self.update_handler = update_handler
def test_user_agent():
    clear(settings.DB)
    assert not utils.exist(settings.DB)

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    try:
        ua.non_existing
    except FakeUserAgentError:
        pass
    else:
        assert False

    try:
        assert ua['non_existing']
    except FakeUserAgentError:
        pass
    else:
        assert False

    data1 = ua.data
    ua.update(settings.DB)
    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
    del ua

    ua = UserAgent()
    assert utils.exist(settings.DB)
    data1 = ua.data

    clear(settings.DB)
    ua.update(settings.DB)
    assert utils.exist(settings.DB)
    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
def pull_items_search(self): """Pulls items down from amazon for the given pages.""" print("Retrieving Items...") connection = sqlite3.connect(self._database_name) cursor = connection.cursor() ua = UserAgent() ua.update() for page in self._pages: headers = {'User-Agent': '{}'.format(ua.random)} r = requests.get(page, headers=headers) asoup = BeautifulSoup(r.text, 'lxml') if asoup.head.title.string == "Robot Check": print( "You've been discovered as a bot. Take a break and come back tomorrow." ) print(headers['User-Agent']) return 'bot' # s-result-item is the tag that contains all the data on an item. items_list = asoup.find_all('li', class_="s-result-item") if len(items_list) > 0: first_page_num = ceil(int(self._range[0]) / len(items_list)) # Getting one extra page because the item count can change between here and when the items are found. last_page_num = ceil(int(self._range[1]) / len(items_list)) + 1 else: print("No items found.") with open("./output/no_items.html", 'w') as f: f.write(asoup.prettify()) print("Item less html written to ./output/no_items.html") break # Getting the category category_chain = asoup.find('h2', id="s-result-count").span.contents categorystr = '' for category in category_chain: if category.string is not None: categorystr += category.string # If there are new lines in the category this was a search and the last piece is pulled from the search box. if '\n' in categorystr: categorystr = categorystr.replace('\n', '') + asoup.find( 'input', id="twotabsearchtextbox")["value"] categorystr = categorystr.replace(':', '-') print(categorystr) # Fast forwarding to the first page in the range next_page = "https://www.amazon.com" + asoup.find( 'span', class_="pagnLink").contents[0]['href'] first_valid_page = next_page.replace( "page=2", "page={}".format(first_page_num)) r = requests.get(first_valid_page) # Going through all the pages and getting their items. for page_num in range(first_page_num, last_page_num + 1): asoup = BeautifulSoup(r.text, 'lxml') # s-result-item is the tag that contains all the data on an item. items = asoup.find_all('li', class_="s-result-item") # Scrapping the item information and adding it to the database. for item in items: linkstr = item.find( 'a', class_="a-link-normal a-text-normal")['href'] namestr = item.find('h2').string reviewscorestr = '' if item.find( 'i', class_=re.compile( "a-icon a-icon-star a-star-.")) is not None: reviewscorestr = item.find( 'i', class_=re.compile( "a-icon a-icon-star a-star-.")).string reviewersstr = '0' psbl_num_reviewers_tag = item.find( 'a', class_="a-size-small a-link-normal a-text-normal") if psbl_num_reviewers_tag and "#customerReviews" in psbl_num_reviewers_tag[ 'href']: reviewersstr = item.find( 'a', class_="a-size-small a-link-normal a-text-normal" ).string pricestr = '0.00' # The price is broken up into a whole section and fractional section. if item.find( 'span', class_="sx-price-whole") is not None and item.find( 'sup', class_="sx-price-fractional") is not None: whole = item.find('span', class_="sx-price-whole").string fract = item.find('sup', class_="sx-price-fractional").string pricestr = "{}.{}".format(whole, fract) # The price is just its own full string value. 
elif pricestr == '0.00' and item.find( 'span', class_="a-size-small s-padding-right-micro" ) is not None: pricestr = item.find( 'span', class_="a-size-small s-padding-right-micro").string asinstr = item['data-asin'] # The ranks on amazon are zero based ranknum = int(item['id'][len("result_"):]) + 1 sql_command = """INSERT OR IGNORE INTO items (item_number, category, name, reviewscore, price, link, rank, asin, reviewers) VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?);""" cursor.execute( sql_command, (categorystr, namestr, reviewscorestr, pricestr, linkstr, ranknum, asinstr, reviewersstr)) connection.commit() try: next_page = "https://www.amazon.com" + asoup.find( 'a', id="pagnNextLink")['href'] except TypeError: if asoup.head.title.string != "Robot Check": print( "You've been discovered as a bot. Take a break and come back tomorrow." ) print(headers['User-Agent']) return 'bot' elif asoup.head.title.string != "503 Service Unavailable Error": print( "503 Service Unavailable Error from Amazon. Try again." ) else: print( "Error: No more pages, range higher than number of items." ) print(next_page) with open( "./output/failed_{}.html".format( categorystr.replace(" ", "")), 'w') as f: f.write(asoup.prettify()) print( "Failed html written to ./output/failed_{}.html" .format(categorystr.replace(" ", ""))) break if page_num != last_page_num: time.sleep(5) r = requests.get(next_page) if page is not self._pages[-1]: time.sleep(45)
class MeituanSpider(): def __init__(self): self.count = 0 self.ip_lst = [] self.user_agent = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', ] # self.ua = UserAgent(verify_ssl=False) # 通过库随机产生useragent self.ua.update() # self.user_agent = self.ua.get_useragent_list() def get_ip(self, savefile): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) \ Chrome/83.0.4103.116 Safari/537.36' } for num in range(1, 60): url = 'https://www.kuaidaili.com/free/inha/%d/' % num res = requests.get(url, headers=headers, timeout=5) html = etree.HTML(res.text) ips = html.xpath('//*[@id="list"]/table//tr/td[1]/text()') ports = html.xpath('//*[@id="list"]/table//tr/td[2]/text()') [ self.ip_lst.append('%s:%s' % (ip, port)) for ip, port in zip(ips, ports) ] time.sleep(2) print('IP获取成功,数量%d' % len(self.ip_lst)) with open(savefile, 'w') as sf: json.dump(self.ip_lst, sf, ensure_ascii=False) # 获取页面数据,将json object转化为python dict def _get_url(self, url, city): simulateBrowserData = { 'Accept': '*/*', 'Accept-Encoding': 'br, gzip, deflate', 'Accept-Language': 'zh-cn', 'Connection': 'keep-alive', 'Host': city + '.meituan.com', 'Referer': 'https://' + city + '.meituan.com/meishi/', # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15', 'User-Agent': self.ua.random, # 'User-Agent': random.choice(self.user_agent) } proxy = {'http': random.choice(self.ip_lst)} response = requests.get(url, headers=simulateBrowserData, proxies=proxy, allow_redirects=False) soup = BeautifulSoup(response.text) # for script in soup.find_all('script'): # print(script.contents) all_scripts = soup.find_all('script') text = None for number, script in enumerate(all_scripts): if 'window._appState' in script.text: text = script.text # print(number, text) if text == None: print("服务器拒绝访问") # print(simulateBrowserData['User-Agent']) time.sleep(1 + 2 * random.random()) self.count += 1 if self.count > 6: print('sleep 600s...') time.sleep(600) # 连续失败n次,休息10分钟再请求 self.count = 0 return self._get_url(url, city) # user-agent 失效情况 继续随机一个进行请求 else: self.count = 0 data = json.loads(text.split("=", 1)[1][:-1]) print(simulateBrowserData['User-Agent']) return data # 获取店铺评论数据 def _get_comment(self, ): '''https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=763c7334407e4fb1aef8.1622721180.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F913072238%2F&riskLevel=1&optimusCode=10&id=913072238&userId=147136762&offset=0&pageSize=10&sortType=1''' # 获取一个城市多区域多类目商家信息 def get_city_shops_infos(self, city, categoryID, sf): '''美团商家信息最大显示67页,所以应该按各个小分类进行爬取,尽量列出所有店铺''' if not self.ip_lst: with open('ips.json', 'r') as f: self.ip_lst = json.load(f) success_shops = set() failed_shops = set() # city = 'bj' url = 'https://' + city + '.meituan.com/meishi/' + categoryID + 
'/' data = self._get_url(url, city) n = 0 filters = data.get('filters', {}) cates = filters.get('cates', []) # 分类 areas = filters.get('areas', []) # 区域 dinners = filters.get('dinnerCountsAttr', []) # 用餐人数 sorts = filters.get('sortTypesAttr', []) # 店铺排序类型 # 这里只考虑'区域' for _area in areas[:]: '''一级:大区域''' area_id = _area['id'] # int area_name = _area['name'] print('大区域:', area_name) area_url = _area['url'] sub_areas = _area.get('subAreas', []) if not sub_areas: continue for _sub_area in sub_areas[1:]: # 第一个元素为大区域"全部" '''二级:小区域''' sub_area_id = _sub_area['id'] # int sub_area_name = _sub_area['name'] print('小区域:', sub_area_name) sub_area_url = _sub_area['url'].replace('http:', 'https:') print(sub_area_url) a_data = self._get_url(sub_area_url, city) shopCounts = a_data.get('poiLists').get( 'totalCounts') # 最小区域内的商家数量 pageNum = math.ceil(shopCounts / 15) # 美团一页展示15个商家 for pn in range(1, pageNum + 1): print('page:', pn) try: for eachShopInfo in a_data.get('poiLists').get( 'poiInfos'): shopId = eachShopInfo.get('poiId') # 店铺id try: if shopId in success_shops: continue n += 1 # if n < 1215: # todo: 中断跳过操作 # print('===== 跳过 =====') # continue print(n) shopName = eachShopInfo.get('title') # 店铺名 print(shopName) avgScore = eachShopInfo.get('avgScore') # 评分 allCommentNum = eachShopInfo.get( 'allCommentNum') # 评论条数 address = eachShopInfo.get('address') # 地址 avgPrice = eachShopInfo.get('avgPrice') # 人均价格 # 店铺详细信息 shop_url = 'https://www.meituan.com/meishi/%s/' % shopId print(shop_url) shop_data = self._get_url(shop_url, 'nc') recommended = shop_data.get('recommended', []) # 推荐菜 shopClass = [ _d['title'] for _d in shop_data.get('crumbNav', []) ] # 店铺层级 phone = shop_data.get('detailInfo', {}).get('phone', '') # 电话 address_detail = shop_data.get( 'detailInfo', {}).get('address', '') # 详细地址 openTime = shop_data.get('detailInfo', {}).get( 'openTime', '') # 营业时间 dealList = shop_data.get('dealList', {}).get('deals', []) # 套餐列表 success_shops.add(shopId) out_data = [ city, categoryID, area_name, sub_area_name, shopId, shopName, avgScore, allCommentNum, avgPrice, address_detail, '/'.join(shopClass), phone, openTime, shop_url, json.dumps(recommended, ensure_ascii=False), json.dumps(dealList, ensure_ascii=False) ] out_data = [ str(_str).replace('\t', '|') for _str in out_data ] out_str = '\t'.join(out_data) sf.write(out_str + '\n') if sf else print(out_str) time.sleep(20 + 5 * random.random()) except Exception as e: print("error:%s" % e) print("shopId:%s" % shopId) time.sleep(1 + 6 * random.random()) except Exception as e: print("error:%s" % e) print("page:%s" % pn) time.sleep(1 + 6 * random.random()) pn += 1 next_page_url = sub_area_url + 'pn%s/' % pn if pn > pageNum: break a_data = self._get_url(next_page_url, city)
def RequestsAndcheck(URL, Xpath, proxies, dict_rate2): dict_rate = GetRate() URL = covertURL(URL) htmls = {} for j in range(10): # try get price with regular counect try: regular_result = getPrice(URL, Xpath) time.sleep(2.6) if regular_result: print(regular_result) break except: pass # proxy UserAgents = [ 'Mozilla/5.0 (Linux; Android 5.0; Nexus 6 Build/LRX21D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 5.1.1; KFAUWI Build/LVY48F) AppleWebKit/537.36 (KHTML, like Gecko) Silk/68.2.6 like Chrome/68.0.3440.85 Safari/537.36', 'Mozilla/5.0 (Linux; Android 6.0; Nexus 6P Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 7.0; SM-T715 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Safari/537.36', 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel Build/OPR3.170623.007) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G935F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 9; Pixel 2 XL Build/PPR1.180610.009) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36', 'Mozilla/5.0 (Android 9; Tablet; rv:61.0) Gecko/61.0 Firefox/61.0', 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13C75 Safari/601.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4', 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/546.10 (KHTML, like Gecko) Version/6.0 Mobile/7E18WD Safari/8536.25', 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25', 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3', 'Mozilla/5.0 (iPad; CPU OS 9_0 like Mac OS X) AppleWebKit/601.1.17 (KHTML, like Gecko) Version/8.0 Mobile/13A175 Safari/600.1.4', 'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4', 'Mozilla/5.0 (iPad; CPU OS 7_0_2 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/11A501', 'Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25', 'Mozilla/5.0 (iPad; CPU OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134', 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)', 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.64', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0' ] for i in range(0, len(proxies)): # Get a proxy from the pool proxy = proxies[i] print("Request #%d" % i) try: if (len(UserAgents) > 0): browser = UserAgents[random.randint(0, len(UserAgents) - 1)] else: try: ua = UserAgent(cache=False, use_cache_server=False) ua.update() UserAgents = ([ str(ua.safari), str(ua.ff), str(ua.firefox), str(ua.google), str(ua.chrome), str(ua.opera), str(ua["Internet Explorer"]), str(ua["google chrome"]), str(ua.msie), str(ua.ie), str(ua.chrome) ]) browser = ua.random except: UserAgents = [ 'Mozilla/5.0 (Linux; Android 5.0; Nexus 6 Build/LRX21D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 5.1.1; KFAUWI Build/LVY48F) AppleWebKit/537.36 (KHTML, like Gecko) Silk/68.2.6 like Chrome/68.0.3440.85 Safari/537.36', 'Mozilla/5.0 (Linux; Android 6.0; Nexus 6P Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 7.0; SM-T715 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Safari/537.36', 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel Build/OPR3.170623.007) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G935F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 9; Pixel 2 XL Build/PPR1.180610.009) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36', 'Mozilla/5.0 (Android 9; Tablet; rv:61.0) Gecko/61.0 Firefox/61.0', 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13C75 Safari/601.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4', 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/546.10 (KHTML, like Gecko) Version/6.0 Mobile/7E18WD Safari/8536.25', 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25', 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3', 'Mozilla/5.0 (iPad; CPU OS 9_0 like Mac OS X) AppleWebKit/601.1.17 (KHTML, like Gecko) Version/8.0 Mobile/13A175 Safari/600.1.4', 'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4', 
'Mozilla/5.0 (iPad; CPU OS 7_0_2 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/11A501', 'Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25', 'Mozilla/5.0 (iPad; CPU OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134', 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)', 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.64', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0' ] browser = UserAgents[random.randint( 0, len(UserAgents) - 1)] price_list = getPrice(URL, Xpath, proxy, browser) if not price_list: pass else: print(price_list) htmls[(proxy, browser)] = price_list UserAgents.remove(browser) except: print(proxy + " : " + browser + " Skipping. Connnection error") htmls = BastPrice(htmls, dict_rate, dict_rate2) if not regular_result and not htmls: return ('unsupported', False) if not regular_result: for Proxy_UA in htmls.keys(): return (Proxy_UA, False) if not htmls: return ('only regular find', False) htmls["regular"] = regular_result htmls = BastPrice(htmls, dict_rate, dict_rate2) if not htmls: return ('Erorr', False) else: for Proxy_UA in htmls.keys(): if Proxy_UA == "regular": return (Proxy_UA, True) return (Proxy_UA, False)
metas = prepare_tweet(tw)
metas.pop('_id')
tw.update(metas)
for po in ['user', 'entities', 'extended_entities']:
    if po in tw:
        tw.pop(po)
db.tweets.update({'_id': tw['id']}, {"$set": tw}, upsert=True)
print "...collected %s new tweets" % len(tweets)
tweets = api.call('statuses.user_timeline', api_args)
db.users.update({'_id': user['twitter']}, {"$set": {"done": True}})

# TODO: refacto all of this with gazouilloire/run.py
ua = UserAgent()
ua.update()
todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
left = tweetscoll.count({"links_to_resolve": True})
print "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left
while todo:
    done = 0
    urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
    alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
    tweetsdone = []
    batchidsdone = set()
    for tweet in todo:
        if tweet.get("proper_links", []):
            tweetsdone.append(tweet["_id"])
            continue
        tweetid = tweet.get('retweet_id') or tweet['_id']
        if tweetid in batchidsdone:
def get_random_header(self):
    '''Return a freshly updated UserAgent instance from which random user agents (headers) can be drawn.'''
    ua = UserAgent()
    ua.update()
    return ua
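A hedged usage sketch: the method returns the UserAgent object itself, so the caller draws a concrete header from it; `Scraper` is an assumed name for the surrounding class.

scraper = Scraper()                      # hypothetical instance of the class defining get_random_header()
ua = scraper.get_random_header()
headers = {"User-Agent": ua.random}      # a fresh random UA string per request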
node_list.sort()
headers = requests.utils.default_headers()
node_cat_map = {}
ua = UserAgent()
url = "https://snoopsnoo.com/r/"
index = sum(1 for line in open("node_cat_map.txt"))
f = open("node_cat_map.txt", "a+")
agent_counter = 0
headers.update({'User-Agent': ua.firefox})
for it in range(index, len(node_list)):
    if agent_counter >= 10:
        ua.update()
        headers.update({'User-Agent': ua.firefox})
        agent_counter = 0
    try:
        # Pass the headers as keyword argument (the second positional argument would be params).
        req = requests.get(url + node_list[it], headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        cat = soup.body.select(".breadcrumb")[0].find_all("a")[1].get_text(strip=True)
        f.write(node_list[it] + ":" + cat + "\r")
        print(node_list[it] + ": " + cat + " has been written to the file successfully [" + str(it) + "/" + str(len(node_list)) + "]")
    except requests.exceptions.RequestException as e:
        print(e)
f.close()
def test_fake_update():
    ua = UserAgent(cache=False, use_cache_server=False)

    ua.update()

    _probe(ua)
class Bot:
    def __init__(self, pair):
        self.region = pair[1]
        self.category = pair[0]
        self.key = 'af0deccbgcgidddjgnvljitntccdduijhdinfgjgfjir'
        self.conn = http.client.HTTPSConnection("m.avito.ru", timeout=10)
        self.ua = UserAgent()

    def get_id_location(self):
        payload = ''
        headers = {
            'Cookie': 'u=2kfmrcai.1cy6s0w.wt2thhj49000; buyer_selected_search_radius4=0_general; '
                      'buyer_local_priority_v2=0; buyer_selected_search_radius0=200; buyer_location_id=638920; '
                      'sx=H4sIAAAAAAACAw3JwQqAIAwA0H%2FZucPMZcu%2FkZBFCyQsR0j%2FXu%2F6Osx8P0eSLV'
                      '%2B1MpmZFissbBA7NIiQ93CmNoVxMWVSWpEI%2F6ciJkVhgAzRBWT06L173w9VOrM1VAAAAA%3D%3D; '
                      'sessid=8bc4e5dea8b325ce05e21935d12b0464.1608066636; _mlocation=638920; v=1608070541; '
                      'dfp_group=87',
            # Use a random UA string; str(self.ua) would only yield the object's repr.
            'user-agent': self.ua.random
        }
        self.ua.update()
        self.conn.request(
            "GET",
            "/api/1/slocations?key=" + self.key + "&q=" + urllib.parse.quote_plus(self.region),
            payload, headers)
        res = self.conn.getresponse()
        data = res.read().decode("utf-8")
        id = json.loads(data)
        return id["result"]["locations"][0]['id']

    def get_count(self, id_location, time, page=1, count=0):
        payload = ''
        headers = {
            'Cookie': 'u=2kfmrcai.1cy6s0w.wt2thhj49000; buyer_selected_search_radius4=0_general; '
                      'buyer_local_priority_v2=0; buyer_selected_search_radius0=200; buyer_location_id=638920; '
                      'sx=H4sIAAAAAAACAw3JwQqAIAwA0H%2FZucPMZcu%2FkZBFCyQsR0j%2FXu%2F6Osx8P0eSLV'
                      '%2B1MpmZFissbBA7NIiQ93CmNoVxMWVSWpEI%2F6ciJkVhgAzRBWT06L173w9VOrM1VAAAAA%3D%3D; '
                      'sessid=8bc4e5dea8b325ce05e21935d12b0464.1608066636; _mlocation=638920; v=1608070541; '
                      'dfp_group=87',
            'user-agent': self.ua.random
        }
        url = ("/api/10/items?key=" + self.key + "&locationId=" + str(id_location) +
               '&page=' + str(page) + "&query=" + urllib.parse.quote_plus(self.category) + '&sort=date')
        self.conn.request("GET", url, payload, headers)
        res = self.conn.getresponse()
        data = res.read().decode("utf-8")
        items = json.loads(data)['result']["items"]
        if len(items) > 0:
            positiv = 0
            for i in range(len(items)):
                item = items[i]['value']
                if item['time'] >= time:
                    positiv += 1
            if positiv != 0:
                return self.get_count(id_location, time=time, page=page + 1, count=count + positiv)
            else:
                self.conn.close()
                return count
        else:
            self.conn.close()
            return count