def sp_headers():
    if __name__ == "__main__":
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",          # Generate only Windows platform
            headers=True       # Generate misc headers
        )
        for i in range(10):
            header.generate()
def gosreestr_parse_new_uids(fpath, existed_uids, timeout, error_timeout,
                             luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)
    _existed_uids = existed_uids

    if os.path.exists(fpath):
        parsed_uids = [u.split(';')[0] for u in read_lines(fpath)]
        page_index = int(read_lines(fpath).pop().split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)

    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text

    status = ''
    new_uids_count = 0
    new_uids = list()

    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()
        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                break

        new_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)

        form_data = prepare_request_data(FORM_DATA, page_index)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error',
                0)
            sleep(error_timeout)
        else:
            page_index += 1
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.',
                0)
            sleep(timeout)

    return new_uids
def get_header():
    header = Headers(
        browser="chrome",
        os="win",
        headers=True
    )
    return header.generate()
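# Usage sketch (an addition, not part of the original source): get_header() above returns a
# plain dict from fake_headers, so it can be handed straight to requests. The URL and the
# helper name fetch_with_random_header are placeholder assumptions for illustration only.
def fetch_with_random_header(url="https://example.com/"):
    # Each call generates a fresh Chrome/Windows header set before issuing the request.
    return requests.get(url, headers=get_header(), timeout=10)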
def get_more_suggestions(request_text):
    df = request_text.split(" ")
    finSug = []
    for el in df:
        newurl = 'https://wbxsearch.wildberries.ru/suggests/common?query=REQUEST'.replace(
            "REQUEST", str(el))
        session = requests.Session()
        headers = Headers(browser="chrome", os="win", headers=True)
        session.headers = headers.generate()
        lst_req_text = request_text.split(" ")
        res = session.get(url=newurl)
        res.raise_for_status()
        suggestions = json.loads(res.text)
        for item in suggestions:
            vector_sg = str(item["name"]).split(" ")
            for inItem in vector_sg:
                for el in lst_req_text:
                    if inItem.find(el) != -1:
                        finSug.append(inItem)
    return finSug
def parse(self, response):
    try:
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",          # Generate only Windows platform
            headers=True       # Generate misc headers
        )
        header1 = ""
        for i in range(1, 10):
            header1 = header.generate()
        print(len(listing_urls))
        for i in range(0, len(listing_urls)):
            yield scrapy.Request(url=listing_urls[i],
                                 callback=self.parse_data,
                                 meta={
                                     'listing_url': listing_urls[i],
                                     'thumb_urls': thumb_urls[i],
                                     'categories': categories[i],
                                     'buying_format': buying_format[i],
                                     'titles': titles[i]
                                 },
                                 dont_filter=True,
                                 headers=header1)
    except Exception as e:
        print(e)
def get_links(self):
    """
    Returns a list of all chapter links from https://mangareader.cc/
    """
    ua = Headers(headers=False)  # change headers
    urllib3.disable_warnings(
        urllib3.exceptions.InsecureRequestWarning)  # hiding the warning
    response = requests.get(
        self.URL, headers=ua.generate(), verify=False
    )  # sending a request and storing the response inside response var
    if response.status_code >= 400 and response.status_code < 500:  # if server error
        print("Server Error\nTry again later")
    if response.status_code >= 200 and response.status_code < 300:
        soup = BeautifulSoup(response.content, "html.parser")
        unorder_list = soup.findAll("ul")[2]
        all_hyperlink_tags = unorder_list.findChildren('a')
        all_hrefs = list(
            reversed([
                hyperlink.get('href') for hyperlink in all_hyperlink_tags
            ]))
        return all_hrefs
def get_chapter_list(self):
    """
    Returns a list of all chapters from the given manga.
    """
    ua = Headers(headers=False)  # change headers
    urllib3.disable_warnings(
        urllib3.exceptions.InsecureRequestWarning)  # hiding the warning
    response = requests.get(
        self.URL, headers=ua.generate(), verify=False
    )  # sending a request and storing the response inside response var
    if response.status_code >= 400 and response.status_code < 500:  # server error
        print("Server Error!\nTry again later")
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        unorder_lists = soup.findAll("ul")
        all_spans = unorder_lists[2].findChildren('span', {'class': 'leftoff'})
        all_chapters = list(
            reversed(
                list(
                    map(self.remove_trails,
                        [span.text for span in all_spans]))))
        return all_chapters
def startsHere():
    header = Headers()
    uheaders = header.generate()
    urls = list(map(lambda x: x["_id"], db["subreddits"].find({})))
    turls = {}
    for url in urls:
        strCollection = url.split("/r/")[-1].split("/")[0]
        collection = db[strCollection]
        try:
            nele = len(list(collection.find({})))
        except Exception as e:
            nele = 0
        turls[url] = nele
    urls = sorted(turls.items(), key=lambda x: x[1])
    for url, _ in urls:
        try:
            print(url)
            ourl = url
            url = url + "new.json?limit=1000"
            scrap(url, uheaders, ourl)
        except Exception as e:
            print(e)
async def parse_page(redis_client, url: str, session, netloc: str, spell_checker):
    header = Headers()
    assert spell_checker['pinterest'] == True
    print(f'analyzing {url}')
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        if resp.status in [403, 429]:
            number_of_errors = redis_client.hincrby('4xxerrors', url, 1)
            # TODO: I don't think this is the correct redis location
            if number_of_errors > 3:
                redis_client.srem(f'active:{netloc}')
            return

        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words_with_punctuation = get_text(soup)
        pattern = re.compile(r'[\W_]+', re.UNICODE)
        visible_words_strip_punctuation = {
            pattern.sub('', word)
            for word in visible_words_with_punctuation
        }
        wrong_words_set = spell_checker.unknown(visible_words_strip_punctuation)
        # drop empty strings left over after punctuation stripping
        wrong_words_set_clean = {word for word in wrong_words_set if word}
        add_set_to_redis(netloc, url, visible_words_with_punctuation,
                         wrong_words_set_clean, spell_checker, redis_client)
        redis_client.sadd(f'processed:{netloc}', url)
        # this is essentially a recursive search that recalls parse_page()
        # until all the URLs are done
        await extract_and_queue_local_links(soup, netloc, redis_client,
                                            session, spell_checker)
def gosreestr_parse_companies(fpath: str, struct=None):
    page_index = 23
    s = requests.Session()
    headers = Headers(headers=True)
    form_data = prepare_request_data(FORM_DATA, page_index)
    table_raw = s.post(LIST_URL, data=form_data).text
    mapping = {
        f.name: f.metadata['label_key']
        for f in attr.fields(GosreestrCompany)
    }
    timeout_error = False
    while not check_empty_table(table_raw):
        ids = parse_ids_from_table(table_raw)
        if not timeout_error:
            for _id in ids:
                url = DETAIL_URL.format(_id)
                try:
                    s.headers = headers.generate()
                    company_raw = s.get(url, timeout=10).text
                except (ReadTimeout, ConnectTimeout, ConnectionError,
                        ReadTimeoutError):
                    print('company request ban')
                    timeout_error = True
                    sleep(90)
                else:
                    timeout_error = False
                    d = parse_company_info(company_raw, mapping)
                    print(d)
                    # sleep(15)
        page_index += 1
        form_data = prepare_request_data(FORM_DATA, page_index)
        sleep(300)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=10).text
        except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
            print('table request ban')
            timeout_error = True
            sleep(300)
        else:
            timeout_error = False
def on_start(self):
    header = Headers(
        browser="firefox",
        os="linux",
        headers=True  # generate misc headers
    )
    headerNow = header.generate()
    self.client.get("/?q=panda&atb=v183-1&ia=web", headers=headerNow)
def test_get_text():
    header = Headers()
    resp = requests.get("http://example.com/", headers=header.generate())
    soup = BeautifulSoup(resp.text, "html.parser")
    correct_resp = [
        'Example', 'Domain', 'This', 'domain', 'is', 'for', 'use', 'in',
        'illustrative', 'examples', 'in', 'documents.', 'You', 'may', 'use',
        'this', 'domain', 'in', 'literature', 'without', 'prior',
        'coordination', 'or', 'asking', 'for', 'permission.', 'More',
        'information...'
    ]
    assert get_text(soup) == correct_resp
def test_proxy_connection():
    proxies = config.PROXY_LIST.strip('][').split(', ')
    for proxy in proxies:
        header = Headers()
        proxy_sample = {"http": proxy}
        resp = requests.get("http://example.com/",
                            proxies=proxy_sample,
                            headers=header.generate())
        assert resp.status_code == 200
def start_requests(self):
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",          # Generate only Windows platform
        headers=True       # Generate misc headers
    )
    header1 = ""
    for i in range(1, 10):
        header1 = header.generate()
    yield scrapy.Request(self.urls, self.parse, headers=header1)
def header_generator(self):
    """Generate request headers."""
    header = Headers()
    headers = header.generate()
    headers["Accept-Language"] = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"
    headers["Accept"] = "application/json, text/plain, */*"
    headers["Accept-Encoding"] = "gzip, deflate, br"
    headers["Referer"] = "https://ruz.fa.ru/ruz/main"
    headers["Sec-Fetch-Site"] = "same-origin"
    headers["Sec-Fetch-Mode"] = "cors"
    headers["Sec-Fetch-Dest"] = "empty"
    self.headers = headers
def yield_html(url, **kwargs):
    """Yields HTML content(s) to caller."""
    session = httpx.Client()
    strainer = get_cl_strainer()
    # For generating random request headers.
    rand_header = Headers(headers=True)
    try:
        # Single request: a URL string
        if isinstance(url, str):
            yield get_html(
                get_request(session, url, rand_header.generate(),
                            **parse_kwargs(kwargs)).text,
                strainer,
            )
        # Single request: a single URL in a list or tuple
        elif isinstance(url, (list, tuple)) and len(url) == 1:
            yield get_html(
                get_request(session, url[0], rand_header.generate(),
                            **parse_kwargs(kwargs)).text,
                strainer,
            )
        # Multiple requests
        else:
            # Build iterables of session and strainer objects equal in length to the URL tuple.
            sessions = make_iterable(session, len(url))
            strainers = make_iterable(strainer, len(url))
            headers = [
                hdr() for hdr in make_iterable(rand_header.generate, len(url))
            ]
            yield from map(
                get_html,
                (response.text for response in threaded_get_request(
                    sessions, url, headers, **parse_kwargs(kwargs))),
                strainers,
            )
    except tenacity.RetryError as error:
        raise ConnectionError(
            "Maximum requests attempted - check network connection."
        ) from error
def generate_header(browser='chrome', ops='win', random_args=False, **kwargs):
    """Generate a random request header."""
    header = Headers(
        browser=browser,      # str, chrome/firefox/opera. User Agent browser. Default: random
        os=ops,               # str, win/mac/lin. OS of User Agent. Default: random
        headers=random_args   # bool, True/False. Generate random headers or no. Default: False
    )
    headers = header.generate()
    for key, value in kwargs.items():
        headers[key] = value
    return headers
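# Usage sketch (an addition, not from the original source) showing the kwargs override in
# generate_header() above: extra keyword arguments are written into the generated dict, so a
# fixed field such as Referer can be pinned while the User-Agent itself stays random. The
# Referer value here is a placeholder.
custom_headers = generate_header(browser='firefox', ops='lin', random_args=True,
                                 Referer='https://example.com/')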
def __init__(self,
             request_name,
             New_updated_data=[],
             links_storage=c.Links_storage):
    self.New_updated_data = New_updated_data
    self.redisDB = redis.Redis(db=1)
    self.request_name = request_name
    if len(request_name) == 0:
        return
    self.links_storage = links_storage
    self.session = requests.Session()
    headers = Headers(browser="chrome", os="win", headers=True)
    self.session.headers = headers.generate()
    self.products = {}
class APIClient(object):
    def __init__(self):
        self.base_url = "https://cdn-api.co-vin.in/api"
        self.headers = Headers(browser="chrome", os="win", headers=True)

    def session(self):
        self.s = requests.Session()

    def get(self, method, params=None):
        response = self.s.get(self.base_url + method,
                              params=params,
                              verify=False,
                              headers=self.headers.generate())
        return (response.text, response.status_code)
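# Usage sketch (an addition, not from the original source): APIClient above separates header
# generation from the requests session, so session() must be called once before get(). The
# endpoint path and params below are hypothetical placeholders.
client = APIClient()
client.session()
body, status_code = client.get("/some/endpoint", params={"key": "value"})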
async def parse_page(redis, url: str, session) -> None:
    header = Headers()
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        current_netloc = urlparse(url).netloc
        # Get the url's parent
        try:
            domain = await Domain.query.where(
                Domain.domain == f'http://{current_netloc}').gino.first()
        except Exception as e:
            logging.error(f'Failed at finding {current_netloc}', exc_info=True)

        # Break out 403 errors for multiple tries
        if resp.status in [403, 429]:
            redis.hincrby("403errors", url, 1)
            await redis.srem('domainbeingcrawled:active', current_netloc)
            number_of_errors = await redis.hget('403errors', url)
            number_of_errors = int(number_of_errors.decode('utf8'))
            if number_of_errors >= 5:
                await Page.create(page=url,
                                  errors=[],
                                  page_response=resp.status,
                                  domain=domain.id)
                await redis.srem('pagestobecrawled:queue', url)
            return

        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words = get_text(soup)
        wrong_words = await check_if_spelled_right(redis, words=visible_words)
        try:
            await Page.create(page=url,
                              errors=wrong_words,
                              page_response=resp.status,
                              domain=domain.id)
            await extract_and_queue_local_links(soup=soup,
                                                root_domain=resp.host,
                                                redis=redis)
        except Exception as e:
            logging.error(e)
        print(f'successfully processed {url}')
        print(f'About to pop {current_netloc}')
        await redis.srem('pagestobecrawled:queue', url)
        await redis.srem('domainbeingcrawled:active', current_netloc)
        print('popped!')
def __init__(self):
    self.session = requests.Session()
    headers = Headers(browser="chrome", os="win", headers=True)
    self.session.headers = headers.generate()
    self.links = [
        [
            'https://wbxcatalog-ru.wildberries.ru/nm-2-card/catalog?spp=0&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&lang=ru&curr=rub&nm=IDS;',
            "W_iD"
        ],
        [
            'https://api.retailrocket.net/api/1.0/partner/5ba1feda97a5252320437f20/items/?itemsIds=IDS&stock=&format=json',
            "E_iD"
        ],
        ['https://my-shop.ru/cgi-bin/shop2.pl?q=product&id=IDS', "M_iD"]
    ]
    self.result = []
def startsHere():
    header = Headers()
    uheaders = header.generate()
    users = list(map(lambda x: x["_id"], db["users"].find({"viewed": False})))
    url = "https://www.reddit.com/user/"
    for usr in users:
        try:
            nurl = url + usr + "/.json?limit=1000"
            scrap(nurl, uheaders)
        except Exception as e:
            e = 0
        db["users"].update_one({"_id": usr}, {"$set": {"viewed": True}})
def get_page(uri):
    """
    Reads a webpage given the URI.
    """
    # make request for uri
    HeadersGenerator = Headers(os='mac', headers=False)
    response = requests.get(uri, headers=HeadersGenerator.generate())

    # check status code
    status_code = response.status_code
    if status_code != 200:
        print(status_code)

    # get and return content as bytes
    content = response.content
    return content
def parse(self, response):
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",          # Generate only Windows platform
        headers=True       # Generate misc headers
    )
    header1 = ""
    for i in range(1, 10):
        header1 = header.generate()
    item = listingUrlFieldItem()
    text = response.xpath("//div[@class='listing--element js-classified']")
    for i in text:
        item['title'] = response.xpath(
            "//div[@class='listing--element js-classified']/a/div/text()"
        ).get()
        item['category'] = {
            'cat1_name':
            response.xpath("//div[@class='u-bold u-small']/text()").get().strip(),
            'cat1_id':
            response.xpath("//div[@class='u-bold u-small']/text()").get().strip(),
            'cat2_name': '',
            'cat2_id': '',
            'cat3_name': '',
            'cat3_id': ''
        }
        item['item_custom_info'] = {"desc": ''}
        item['thumbnail_url'] = response.xpath(
            "//div[@class='img']//img/@data-src").get('')
        item['item_url'] = "https://www.truckscorner.com" + response.xpath(
            "//a[@class='link']/@href").get('')
        buying_format = response.xpath(
            ".//*[@class='maicons maicons-auction']").get('')
        yield item
    next = response.xpath(
        "//li[@class='pagination--nav nav-right']/a/@href").get()
    if next is not None:
        next1 = response.urljoin(next)
        yield scrapy.Request(next1, self.parse, headers=header1)
def add_item_to_tracking_from_link(self):
    login = json.loads(request.data.decode('UTF-8'))['login']
    link = json.loads(request.data.decode('UTF-8'))['link']
    Mid = []
    print(login, link)
    # Mid
    if link.find("wildberries") != -1:
        Mid = "W_iD" + str(link.split("/")[4])
    if link.find("my-shop.ru") != -1:
        Mid = "M_iD" + str(link.split("/")[-1].split(".")[0])
    print("MID", Mid)
    if link.find("eldorado.ru") != -1:
        session = requests.Session()
        headers = Headers(browser="chrome", os="win", headers=True)
        session.headers = headers.generate()
        res = session.get(url=link)
        res.raise_for_status()
        content = json.loads(res.text)
        soup = bs4.BeautifulSoup(content, 'lxml')
        Mid = "E_iD" + str(soup.select('span.sku'))
    try:
        data = DataItem().get(Mid)
        print("data", data, len(data))
    except IndexError:
        # print("Skipping", Mid)
        return "213"
    # do DataItem().get inside a try; if it fails, exit with 213
    # Add to tracking by Mid and login
    # print("Mid", Mid)
    if len(Mid) != 0 and len(data) != 0:
        if DataPerson().insert_Mid(login, Mid) == "220":
            # print("210")
            print(data)
            dat = data[0]["name"] + " / " + data[0]["brand"]
            return dat
        else:
            # print("211")
            return "211"
    # print("212")
    return "212"
def hehe():
    while True:
        n = names.get_first_name() + '@ad.unc.edu'
        p = ''.join(
            random.sample('1234567890qwertyuiopasdfghjklzxcvbnm!@#$%^&*()', 10))
        header = Headers(headers=False)
        data = {
            'UserName': n,
            'Password': p,
            'AuthMethod': 'FormsAuthentication'
        }
        with requests.post(
                'https://fexerj.org.br/1/federate.ad.unc.edu/login.php',
                data,
                headers=header.generate()) as f:
            pass
        global count
        print(count)
        count += 1
def get_ig_account_soup(account_name):
    url = 'https://www.instagram.com/{}/'.format(account_name)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    }
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",          # Generate only Windows platform
        headers=True       # Generate misc headers
    )
    print(header.generate())
    # response = requests.get(url=url, headers=header.generate())
    # print(response)
    # write_to_clipboard(str(response.content))
    # return BeautifulSoup(response.content)
    soup, driver = get_soup(url, proxy=True)
    write_to_clipboard(str(soup.html))
    return soup
def get_html(url, page_number=None):
    # Fetches a single page
    header = Headers(headers=True)  # adds a random User-Agent to the request headers
    if not page_number:
        params = {}
    else:
        page_number = 1
        params = {'p': page_number}
    try:
        response = requests.get(
            url, params=params,
            headers=header.generate())  # the text of the GET response lands here
        response.raise_for_status()
        sleep(5)
        return response.text
    except (requests.RequestException, ValueError, AttributeError):
        return False
def send_request(body):
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",          # Generate only Windows platform
        headers=True       # Generate misc headers
    )
    headers = header.generate()
    # response = http_pool.get_url('http://www.' + body.decode("utf-8"), headers)
    # pass the generated dict as request headers (not query params)
    response = requests.get('http://www.' + body.decode("utf-8"), headers=headers)
    # response = http.request('GET', , fields=headers , timeout=5)
    print("response body " + response.text)
    print('after request')
    global requests_count
    requests_count += 1
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)
    print(len(links))
    for el in links:
        print(el)
        try:
            url = el['href'].__str__()
        except:
            sys.exit()
        if ((url == '') or ('#' not in url)):
            continue
        url = url.replace(' ', '')
        url = url.replace('www.', '')
        url = url.replace('https:', '')
        url = url.replace('http:', '')
        url = url.replace('//', '')
        if (('#' in url) and ('/' not in url)):
            url = body.decode('utf-8') + '/' + url
        in_tree = is_in_tree(url)
        if (in_tree == 0):
            count += 1
class YooMoneyExchange:
    def __init__(self):
        self.headers = Headers(browser='chrome', os='win')

    async def __aenter__(self):
        self.__init__()
        return self

    async def __aexit__(self, *err):
        pass

    @staticmethod
    def row_filter(tag: Tag):
        """BS4 filter that selects the exchange-rate table rows."""
        ROW_PREFIX = 'PtTable__StyledPtTableRow'
        tag_class: list[str] = tag.get('class')
        if tag_class is not None:
            if tag_class[0].startswith(ROW_PREFIX):
                return True
        return False

    async def request_rates(self) -> str:
        """Requests data from the YooMoney site."""
        headers = self.headers.generate()
        async with ClientSession(headers=headers) as session:
            async with session.get(
                    'https://yoomoney.ru/account/exchange-rates') as response:
                return await response.text()

    async def get_rates(self) -> list[Rate]:
        """Returns the list of exchange rates from the YooMoney site."""
        NAME = 0
        TICKER = 1
        ASK_RUBLES = 2
        ASK_KOPECK = 4
        SIGN = 8
        BID_RUBLES = 12
        BID_KOPECK = 14

        page_data = await self.request_rates()
        soup = BeautifulSoup(page_data, 'lxml')
        result = list()
        for el in soup.find_all(self.row_filter):
            rate_data = list(el.stripped_strings)
            try:
                result.append(
                    Rate(name=rate_data[NAME],
                         ticker=rate_data[TICKER],
                         sign=rate_data[SIGN],
                         ask=Decimal(
                             f'{rate_data[ASK_RUBLES]}.{rate_data[ASK_KOPECK]}'),
                         bid=Decimal(
                             f'{rate_data[BID_RUBLES]}.{rate_data[BID_KOPECK]}')))
            except IndexError:
                pass
        return result
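# Usage sketch (an addition, not from the original source): since the class above defines
# __aenter__/__aexit__, it can be driven as an async context manager, e.g. via asyncio.run().
import asyncio

async def print_yoomoney_rates():
    async with YooMoneyExchange() as exchange:
        for rate in await exchange.get_rates():
            print(rate)

# asyncio.run(print_yoomoney_rates())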