def get_header():
    header = Headers(browser="chrome", os="win", headers=True)
    return header.generate()
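# A minimal usage sketch (added, not from the original code) for get_header() above:
# the generated dict can be passed straight to requests; the URL is a placeholder.
if __name__ == "__main__":
    import requests

    resp = requests.get("https://example.com/", headers=get_header())
    print(resp.status_code)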
def startsHere():
    header = Headers()
    uheaders = header.generate()
    urls = list(map(lambda x: x["_id"], db["subreddits"].find({})))
    turls = {}
    for url in urls:
        strCollection = url.split("/r/")[-1].split("/")[0]
        collection = db[strCollection]
        try:
            nele = len(list(collection.find({})))
        except Exception:
            nele = 0
        turls[url] = nele
    urls = sorted(turls.items(), key=lambda x: x[1])
    for url, _ in urls:
        try:
            print(url)
            ourl = url
            url = url + "new.json?limit=1000"
            scrap(url, uheaders, ourl)
        except Exception as e:
            print(e)
def parse(self, response):
    try:
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",          # Generate only Windows platform
            headers=True       # generate misc headers
        )
        header1 = header.generate()
        print(len(listing_urls))
        for i in range(len(listing_urls)):
            yield scrapy.Request(url=listing_urls[i],
                                 callback=self.parse_data,
                                 meta={
                                     'listing_url': listing_urls[i],
                                     'thumb_urls': thumb_urls[i],
                                     'categories': categories[i],
                                     'buying_format': buying_format[i],
                                     'titles': titles[i]
                                 },
                                 dont_filter=True,
                                 headers=header1)
    except Exception as e:
        print(e)
def get_more_suggestions(request_text):
    query_words = request_text.split(" ")
    finSug = []
    for el in query_words:
        newurl = 'https://wbxsearch.wildberries.ru/suggests/common?query=REQUEST'.replace(
            "REQUEST", str(el))
        session = requests.Session()
        headers = Headers(browser="chrome", os="win", headers=True)
        session.headers = headers.generate()
        lst_req_text = request_text.split(" ")
        res = session.get(url=newurl)
        res.raise_for_status()
        suggestions = json.loads(res.text)
        for item in suggestions:
            vector_sg = str(item["name"]).split(" ")
            for inItem in vector_sg:
                # distinct loop variable so the outer `el` is not shadowed
                for word in lst_req_text:
                    if inItem.find(word) != -1:
                        finSug.append(inItem)
    return finSug
def get_links(self):
    """Return a list of all chapter links from https://mangareader.cc/."""
    ua = Headers(headers=False)  # random User-Agent only
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # hide the insecure-request warning
    # send the request and store the response
    response = requests.get(self.URL, headers=ua.generate(), verify=False)
    if response.status_code >= 400 and response.status_code < 500:  # client error (e.g. 404)
        print("Server Error\nTry again later")
    if response.status_code >= 200 and response.status_code < 300:
        soup = BeautifulSoup(response.content, "html.parser")
        unorder_list = soup.findAll("ul")[2]
        all_hyperlink_tags = unorder_list.findChildren('a')
        all_hrefs = list(
            reversed([hyperlink.get('href') for hyperlink in all_hyperlink_tags]))
        return all_hrefs
def get_chapter_list(self):
    """Return a list of all chapter names from the given manga."""
    ua = Headers(headers=False)  # random User-Agent only
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # hide the insecure-request warning
    # send the request and store the response
    response = requests.get(self.URL, headers=ua.generate(), verify=False)
    if response.status_code >= 400 and response.status_code < 500:  # client error (e.g. 404)
        print("Server Error!\nTry again later")
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        unorder_lists = soup.findAll("ul")
        all_spans = unorder_lists[2].findChildren('span', {'class': 'leftoff'})
        all_chapters = list(
            reversed(list(map(self.remove_trails, [span.text for span in all_spans]))))
        return all_chapters
async def parse_page(redis_client, url: str, session, netloc: str, spell_checker):
    header = Headers()
    assert spell_checker['pinterest'] == True
    print(f'analyzing {url}')
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        if resp.status in [403, 429]:
            number_of_errors = redis_client.hincrby('4xxerrors', url, 1)
            # TODO: I don't think this is the correct redis location
            if number_of_errors > 3:
                redis_client.srem(f'active:{netloc}')
            return
        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words_with_punctuation = get_text(soup)
        pattern = re.compile(r'[\W_]+', re.UNICODE)
        visible_words_strip_punctuation = {
            pattern.sub('', word) for word in visible_words_with_punctuation
        }
        wrong_words_set = spell_checker.unknown(visible_words_strip_punctuation)
        # drop empty strings left over after stripping punctuation
        wrong_words_set_clean = {word for word in wrong_words_set if word}
        add_set_to_redis(netloc, url, visible_words_with_punctuation,
                         wrong_words_set_clean, spell_checker, redis_client)
        redis_client.sadd(f'processed:{netloc}', url)
        # this is essentially a recursive search that re-calls parse_page() until all the URLs are done
        await extract_and_queue_local_links(soup, netloc, redis_client, session, spell_checker)
def on_start(self):
    header = Headers(
        browser="firefox",
        os="linux",
        headers=True  # generate misc headers
    )
    headerNow = header.generate()
    self.client.get("/?q=panda&atb=v183-1&ia=web", headers=headerNow)
def test_get_text():
    header = Headers()
    resp = requests.get("http://example.com/", headers=header.generate())
    soup = BeautifulSoup(resp.text, "html.parser")
    correct_resp = ['Example', 'Domain', 'This', 'domain', 'is', 'for', 'use', 'in',
                    'illustrative', 'examples', 'in', 'documents.', 'You', 'may', 'use',
                    'this', 'domain', 'in', 'literature', 'without', 'prior', 'coordination',
                    'or', 'asking', 'for', 'permission.', 'More', 'information...']
    assert get_text(soup) == correct_resp
def test_proxy_connection():
    proxies = config.PROXY_LIST.strip('][').split(', ')
    for proxy in proxies:
        header = Headers()
        proxy_sample = {"http": proxy}
        resp = requests.get("http://example.com/",
                            proxies=proxy_sample,
                            headers=header.generate())
        assert resp.status_code == 200
def start_requests(self):
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",          # Generate only Windows platform
        headers=True       # generate misc headers
    )
    header1 = header.generate()
    yield scrapy.Request(self.urls, self.parse, headers=header1)
def sp_headers():
    if __name__ == "__main__":
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",          # Generate only Windows platform
            headers=True       # generate misc headers
        )
        for i in range(10):
            header.generate()
def init(self):
    # website info
    self.base_url = "https://footdistrict.com"
    self.endpoints = ["/zapatillas/f/b/converse/"]
    # create a random headers generator, configured to generate random Windows headers
    self.headers_gen = Headers(os="win", headers=True)
    # max links to be monitored
    self.max_links = 5
    self.found_links = []  # type: List[str]
def gosreestr_parse_new_uids(fpath, existed_uids, timeout, error_timeout, luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)
    _existed_uids = existed_uids
    if os.path.exists(fpath):
        parsed_uids = [u.split(';')[0] for u in read_lines(fpath)]
        page_index = int(read_lines(fpath).pop().split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)
    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
    status = ''
    new_uids_count = 0
    new_uids = list()
    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()
        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                break
        new_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)
        form_data = prepare_request_data(FORM_DATA, page_index)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error', 0)
            sleep(error_timeout)
        else:
            page_index += 1
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.', 0)
            sleep(timeout)
    return new_uids
def header_generator(self):
    """Generate request headers."""
    header = Headers()
    headers = header.generate()
    headers["Accept-Language"] = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"
    headers["Accept"] = "application/json, text/plain, */*"
    headers["Accept-Encoding"] = "gzip, deflate, br"
    headers["Referer"] = "https://ruz.fa.ru/ruz/main"
    headers["Sec-Fetch-Site"] = "same-origin"
    headers["Sec-Fetch-Mode"] = "cors"
    headers["Sec-Fetch-Dest"] = "empty"
    self.headers = headers
def generate_header(browser='chrome', ops='win', random_args=False, **kwargs):
    """Generate a random request header."""
    header = Headers(
        browser=browser,     # str, chrome/firefox/opera. User Agent browser. Default: random
        os=ops,              # str, win/mac/lin. OS of User Agent. Default: random
        headers=random_args  # bool, True/False. Generate random misc headers or not. Default: False
    )
    headers = header.generate()
    # any extra keyword arguments are merged into the generated header dict
    for key, value in kwargs.items():
        headers[key] = value
    return headers
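# A hedged usage sketch for generate_header() above (added, not part of the original
# code): extra keyword arguments are merged into the generated dict, which is handy
# for per-site fields such as Referer. The Referer value here is a placeholder.
if __name__ == "__main__":
    custom_headers = generate_header(browser='chrome', ops='win', random_args=True,
                                     Referer='https://example.com/')
    print(custom_headers.get('User-Agent'))
    print(custom_headers.get('Referer'))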
def __init__(self, request_name, New_updated_data=None, links_storage=c.Links_storage):
    # avoid the mutable-default-argument pitfall: build a fresh list per instance
    self.New_updated_data = New_updated_data if New_updated_data is not None else []
    self.redisDB = redis.Redis(db=1)
    self.request_name = request_name
    if len(request_name) == 0:
        return
    self.links_storage = links_storage
    self.session = requests.Session()
    headers = Headers(browser="chrome", os="win", headers=True)
    self.session.headers = headers.generate()
    self.products = {}
async def parse_page(redis, url: str, session) -> None:
    header = Headers()
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        current_netloc = urlparse(url).netloc
        # Get the url's parent
        try:
            domain = await Domain.query.where(
                Domain.domain == f'http://{current_netloc}').gino.first()
        except Exception:
            logging.error(f'Failed at finding {current_netloc}', exc_info=True)
        # Break out 403 errors for multiple tries
        if resp.status in [403, 429]:
            redis.hincrby("403errors", url, 1)
            await redis.srem('domainbeingcrawled:active', current_netloc)
            number_of_errors = await redis.hget('403errors', url)
            number_of_errors = int(number_of_errors.decode('utf8'))
            if number_of_errors >= 5:
                await Page.create(page=url,
                                  errors=[],
                                  page_response=resp.status,
                                  domain=domain.id)
                await redis.srem('pagestobecrawled:queue', url)
            return
        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words = get_text(soup)
        wrong_words = await check_if_spelled_right(redis, words=visible_words)
        try:
            await Page.create(page=url,
                              errors=wrong_words,
                              page_response=resp.status,
                              domain=domain.id)
            await extract_and_queue_local_links(soup=soup,
                                                root_domain=resp.host,
                                                redis=redis)
        except Exception as e:
            logging.error(e)
        print(f'successfully processed {url}')
        print(f'About to pop {current_netloc}')
        await redis.srem('pagestobecrawled:queue', url)
        await redis.srem('domainbeingcrawled:active', current_netloc)
        print('popped!')
def __init__(self):
    self.session = requests.Session()
    headers = Headers(browser="chrome", os="win", headers=True)
    self.session.headers = headers.generate()
    self.links = [
        [
            'https://wbxcatalog-ru.wildberries.ru/nm-2-card/catalog?spp=0&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&lang=ru&curr=rub&nm=IDS;',
            "W_iD"
        ],
        [
            'https://api.retailrocket.net/api/1.0/partner/5ba1feda97a5252320437f20/items/?itemsIds=IDS&stock=&format=json',
            "E_iD"
        ],
        ['https://my-shop.ru/cgi-bin/shop2.pl?q=product&id=IDS', "M_iD"]
    ]
    self.result = []
def init_driver(browser_name):
    def set_properties(browser_option):
        ua = Headers().generate()['User-Agent']  # fake user agent string
        browser_option.add_argument('--headless')
        browser_option.add_argument('--disable-extensions')
        browser_option.add_argument('--incognito')
        browser_option.add_argument('--disable-gpu')
        browser_option.add_argument('--log-level=3')
        browser_option.add_argument(f'user-agent={ua}')
        browser_option.add_argument('--disable-notifications')
        browser_option.add_argument('--disable-popup-blocking')
        return browser_option

    try:
        browser_name = browser_name.strip().title()
        # automating and opening URL in headless browser
        if browser_name.lower() == "chrome":
            browser_option = ChromeOptions()
            browser_option = set_properties(browser_option)
            # chromedriver's path in first argument
            driver = webdriver.Chrome(ChromeDriverManager().install(), options=browser_option)
        elif browser_name.lower() == "firefox":
            browser_option = FirefoxOptions()
            browser_option = set_properties(browser_option)
            driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                                       options=browser_option)
        else:
            driver = "Browser Not Supported!"
        return driver
    except Exception as ex:
        print(ex)
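# A brief usage sketch for init_driver() above (added for illustration); it assumes the
# selenium and webdriver-manager imports used by the function are available, and the
# target URL is a placeholder.
if __name__ == "__main__":
    driver = init_driver("chrome")
    if not isinstance(driver, str):  # init_driver returns an error string for unsupported browsers
        driver.get("https://example.com/")
        print(driver.title)
        driver.quit()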
def mainChecker(proxy_type, proxy, position):
    checked[position] = None
    proxyDict = {
        "http": f"{proxy_type}://{proxy}",
        "https": f"{proxy_type}://{proxy}",
    }
    try:
        header = Headers(headers=False).generate()
        agent = header['User-Agent']
        headers = {
            'User-Agent': f'{agent}',
        }
        response = requests.get('https://www.youtube.com/', headers=headers,
                                proxies=proxyDict, timeout=30)
        status = response.status_code
        print(bcolors.OKBLUE + f"Tried {position+1} |" + bcolors.OKGREEN +
              f' {proxy} | GOOD | Type : {proxy_type} | Response : {status}' + bcolors.ENDC)
        print(proxy, file=open('GoodProxy.txt', 'a'))
    except:
        print(bcolors.OKBLUE + f"Tried {position+1} |" + bcolors.FAIL +
              f' {proxy} | {proxy_type} |BAD ' + bcolors.ENDC)
        checked[position] = proxy_type
def link_checker(shop):
    shop_name = shop['name']
    fail_url = shop['fail_url']
    for item in shop['items']:
        item_name = item['name']
        headers = Headers(headers=True).generate()
        try:
            response = requests.get(item['link'], allow_redirects=True, headers=headers, timeout=5)
            if response.status_code == 200 and response.url == fail_url:
                status = 'OOS'
                log_result(shop_name, item_name, status)
            elif response.status_code == 200 and response.url != fail_url:
                status = format_hyperlink(item['link'], 'IN STOCK')
                log_result(shop_name, item_name, status)
            elif response.status_code == 404:
                log_result(shop_name, item_name, 'PAGE NOT FOUND')
            else:
                log_result(shop_name, item_name, 'ERROR: RESPONSE CODE ' + str(response.status_code))
        except requests.exceptions.Timeout:
            log_result(shop_name, item_name, 'TIMEOUT')
        except requests.exceptions.TooManyRedirects as t:
            log_result(shop_name, item_name, str(t))
        except requests.exceptions.RequestException as e:
            log_result(shop_name, item_name, str(e))
def yandex_parser(m_class):
    global https_p
    res = []
    _https = https_p.copy()
    urls = set()
    count = 0  # number of proxies that returned at least one image URL
    for i in range(len(_https)):
        try:
            htps = random.choice(_https)
            print(htps)
            _https.remove(htps)
            proxies = {'https': "https://" + htps}
            url = "https://yandex.ru/images/search?text=" + str(m_class) + str(i)
            page = requests.get(url, headers=Headers().generate(), proxies=proxies)
            soup = BeautifulSoup(page.text, "html.parser")
            result = soup.find_all("div", {"class": "serp-item"}, limit=50)
            for r in result:
                if count > 3:
                    return list(urls)
                jsonify = json.loads(r["data-bem"])
                urls.add(jsonify['serp-item']['preview'][0]['url'])
            if len(urls) >= 1:
                res.append(htps)
                count += 1
        except:
            pass
    return res
def get_image_links(self, chapter_number):
    """Return all image links present for a manga chapter."""
    URLS = self.get_links()  # all chapter links, method from the Chapter_list base class
    headers = Headers(headers=False).generate()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # disable warning
    response = requests.get(URLS[int(chapter_number)], headers=headers, verify=False)  # fetch the chapter page
    if response.status_code < 500 and response.status_code >= 400:  # client error (e.g. 404)
        print("Server Error!\nTry Again later")
        exit()
    if response.status_code >= 200 and response.status_code < 300:  # if response is success
        # make bs4 object with response's content, parsed with html.parser
        soup = BeautifulSoup(response.content, "html.parser")
        paragraph = soup.find("p", {"id": "arraydata"})
        all_image_hrefs = paragraph.text.split(',')
        return all_image_hrefs
def run(self):
    # start timing
    start = time.time()
    pure_ip_address = self.proxyip.split(':')[0]
    # verify where the IP is located
    if not getChinaIP(pure_ip_address):
        pass
        # raise ValueError('not a valid IP')
    # suppress the warning caused by disabling certificate verification
    urllib3.disable_warnings()
    headers = Headers(headers=True).generate()
    headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
    headers['Pragma'] = 'no-cache'
    # headers['Host'] = 'ga.314300.cn'
    # headers['x-forward-for'] = pure_ip_address
    headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
    # print(headers)
    headers['User-Agent'] = ('Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) '
                             'AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 '
                             'MicroMessenger/5.3')
    html = requests.get(headers=headers,
                        url=targetUrl,
                        proxies={"http": 'http://' + self.proxyip},
                        verify=False,
                        timeout=12).content.decode()
    # stop timing
    end = time.time()
    # print the result
    print(threading.current_thread().getName() + " using proxy IP, took " +
          str(end - start) + " seconds, " + self.proxyip + " fetched the following HTML:\n" +
          html + "\n*************")
def run(self):
    # start timing
    start = time.time()
    pure_ip_address = self.proxyip.split(':')[0]
    # verify where the IP is located
    if not getChinaIP(pure_ip_address):
        # pass
        raise ValueError('not a valid IP')
    # suppress the warning caused by disabling certificate verification
    urllib3.disable_warnings()
    headers = Headers(headers=True).generate()
    headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
    headers['Pragma'] = 'no-cache'
    headers['Host'] = 'bb.cf08tp.cn'
    headers['x-forward-for'] = pure_ip_address
    headers['Cookie'] = 'PHPSESSID={}'.format(''.join(str(uuid.uuid1()).split('-')))
    print(headers)
    html = requests.get(headers=headers,
                        url=targetUrl,
                        proxies={
                            "http": 'http://' + self.proxyip,
                            "https": 'https://' + self.proxyip
                        },
                        verify=False,
                        timeout=2).content.decode()
    # stop timing
    end = time.time()
    # print the result
    print(threading.current_thread().getName() + " using proxy IP, took " +
          str(end - start) + " seconds, " + self.proxyip + " fetched the following HTML:\n" +
          html + "\n*************")
def gosreestr_parse_companies(fpath: str, struct=None):
    page_index = 23
    s = requests.Session()
    headers = Headers(headers=True)
    form_data = prepare_request_data(FORM_DATA, page_index)
    table_raw = s.post(LIST_URL, data=form_data).text
    mapping = {f.name: f.metadata['label_key'] for f in attr.fields(GosreestrCompany)}
    timeout_error = False
    while not check_empty_table(table_raw):
        ids = parse_ids_from_table(table_raw)
        if not timeout_error:
            for _id in ids:
                url = DETAIL_URL.format(_id)
                try:
                    s.headers = headers.generate()
                    company_raw = s.get(url, timeout=10).text
                except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
                    print('company request ban')
                    timeout_error = True
                    sleep(90)
                else:
                    timeout_error = False
                    d = parse_company_info(company_raw, mapping)
                    print(d)
                    # sleep(15)
        page_index += 1
        form_data = prepare_request_data(FORM_DATA, page_index)
        sleep(300)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=10).text
        except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
            print('table request ban')
            timeout_error = True
            sleep(300)
        else:
            timeout_error = False
def startsHere():
    header = Headers()
    uheaders = header.generate()
    users = list(map(lambda x: x["_id"], db["users"].find({"viewed": False})))
    url = "https://www.reddit.com/user/"
    for usr in users:
        try:
            nurl = url + usr + "/.json?limit=1000"
            scrap(nurl, uheaders)
        except Exception:
            pass
        db["users"].update_one({"_id": usr}, {"$set": {"viewed": True}})
def get_header(self, browser=None, os=None, headers=None):
    if not browser:
        browser = "chrome"
    if not os:
        os = "win"
    if not headers:
        headers = False
    return Headers(browser=browser, os=os, headers=headers)
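# Usage note (added): unlike the module-level get_header() at the top of this section,
# which returns a generated dict, this method returns the configured Headers generator
# itself, so .generate() must still be called on the result, e.g. (assuming `scraper`
# is an instance of the enclosing class):
#     request_headers = scraper.get_header(browser="firefox", os="lin").generate()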
def get_page(uri):
    """Read a webpage given the URI."""
    # make request for uri
    HeadersGenerator = Headers(os='mac', headers=False)
    response = requests.get(uri, headers=HeadersGenerator.generate())
    # check status code
    status_code = response.status_code
    if status_code != 200:
        print(status_code)
    # get and return content as bytes
    content = response.content
    return content
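# A short usage sketch for get_page() above (added for illustration); the URI is a
# placeholder, not from the original code.
if __name__ == "__main__":
    content = get_page("https://example.com/")
    print(len(content), "bytes downloaded")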