def crawl_by_keywords(keywords):
    connector = db.MongoConnector(config.DB_HOST, config.DB_USER_NAME,
                                  config.DB_PASSWORD, config.DB_NAME)
    backend = MongoQABackend(connector, config.QA_COLLECT_NAME)
    keywords = util.read_txt_lines(args.kw_file)
    keywords = util.expand_keywords(keywords, ['飲食'])
    kw_request = KeywordQueryRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    for keyword in keywords:
        start_url = 'http://so.120ask.com/?kw=%s' % (keyword)
        current_url = start_url
        while True:
            page_src = kw_request.send(current_url)
            if page_src is None:
                break
            page = KeywordQueryPage(page_src)
            links = page.parse_question_links()
            qids = page.parse_question_ids()
            pending = []
            for qid, link in zip(qids, links):
                cb = AsyncHealthPageCallback(qid, backend)
                arq = AsyncHealthQuestionRequest(asession, link, cb)
                pending.append(arq)
            if len(pending) > 0:
                asession.run(*[r.send for r in pending])
            next_link = page.parse_next_page_link()
            if next_link is None:
                break
            current_url = urljoin(start_url, next_link)
async def logic(urls):
    try:
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        session = AsyncHTMLSession()
        browser = await launch({
            'ignoreHTTPSErrors': True,
            'headless': True,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False,
            'args': ['--no-sandbox', '--disable-setuid-sandbox'],
        })
        session._browser = browser
        urls1 = urls.split(',')
        emails1 = []
        for url in urls1:
            emails = await fetch(url, session)
            for email in emails:
                emails1.append(email)
        # Addresses appear to come back in duplicated pairs; keep one copy of each.
        emails1 = emails1[::2]
        returndict = {}
        for i in range(len(emails1)):
            returndict.update({'email' + str(i + 1): emails1[i]})
        return returndict
    except Exception as e:
        print(e)
        falseret = []
        return falseret
def find_department_of_keywords(keywords, filepath):
    kw_request = KeywordQueryRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    with open(filepath, 'w', encoding='utf-8') as f:
        for keyword in keywords:
            start_url = 'http://so.120ask.com/?kw=%s' % (keyword)
            current_url = start_url
            page_src = kw_request.send(current_url)
            assert page_src is not None
            page = KeywordQueryPage(page_src)
            links = page.parse_question_links()
            qids = page.parse_question_ids()
            pending = []
            for qid, link in zip(qids, links):
                cb = DepartmentOfKeywordCallback()
                arq = AsyncHealthQuestionRequest(asession, link, cb)
                pending.append(arq)
            assert len(pending) > 3
            res = asession.run(*[r.send for r in pending])
            # The department that appears most often among the questions is
            # taken to be the department of the keyword.
            c = Counter(res)
            department, url = c.most_common()[0][0]
            f.write('%s-->%s,%s\n' % (keyword, department, url))
async def get_async_response(self):
    """Asynchronously fetch self.url so its JavaScript can be rendered afterwards."""
    asession = AsyncHTMLSession()
    # asession = self.set_session(asession)
    aresponse = asession.get(self.url.geturl())
    return await aresponse
def async_get(event_loop):
    """
    The AsyncHTMLSession cannot be created at module level, since that would
    create a different event loop from the one pytest-asyncio uses.
    """
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return partial(async_session.get, url)
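# A minimal sketch of how the fixture above could be consumed, assuming
# async_get is registered as a pytest fixture and pytest-asyncio is installed;
# the test name and assertions are illustrative, not part of the original suite.
import pytest

@pytest.mark.asyncio
async def test_async_get_local_file(async_get):
    r = await async_get()
    assert r.status_code == 200
    assert r.html is not None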
def spider_opened(self, spider: Spider) -> None:
    """Open an AsyncHTMLSession when the spider starts"""
    try:
        self.session = AsyncHTMLSession(**self.settings)
    except TypeError:
        self.session = AsyncHTMLSession()
        raise AttributeError(
            "DEFAULT_SCRAPY_REQUESTS_SETTINGS is not "
            "aligned with requests-html session settings.\n"
            "Please check www.github.com/psf/requests-html/blob/026c4e5217cfc8347614148aab331d81402f596b/requests_html.py#L759"
        )
async def search_rootme_user_challenges(username: str):
    url = f"https://www.root-me.org/{username}?inc=score"
    session = AsyncHTMLSession()

    async def get_profile():
        r = await session.get(url)
        data = {}
        data['score'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[1]/span/text()"
        )[0].split("\xa0")[0][1:]
        data['ranking'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[2]/span"
        )[0].text
        data['rank'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[3]/span"
        )[0].text
        categories_list = r.html.xpath(
            "/html/body/div/div/div[2]/main/div/div/div/div/div[2]")[0].find("div")
        categories = {}
        for x in categories_list:
            category = x.find('div')[0]
            try:
                title = category.find('h4')[0].text.split('\n')[1]
                categories[title] = {
                    "percentage": category.find('h4')[0].text.split('\n')[0]
                }
                points, _, completion = category.find("span")[1].text.split('\xa0')
                categories[title]['points'] = points
                categories[title]['completion'] = completion
                categories[title]['challenges'] = {}
                challenges = category.find("ul")[0].find('li')
                for challenge in challenges:
                    categories[title]['challenges'][challenge.text[2:]] = {
                        'completed': challenge.text[0] == 'o'
                    }
                    categories[title]['challenges'][challenge.text[2:]]['points'] = \
                        challenge.find('a')[0].attrs['title'].split(' ')[0]
            except Exception:
                # Skip category blocks that do not match the expected layout.
                pass
        data['challenges'] = categories
        return data

    return session.run(get_profile)[0]
async def main():
    if not os.path.exists('./img'):
        os.mkdir('img')
    # sneaker_links_parser = Sneaker_Links("https://sneakerlinks.com", "test")
    # sneaker_links_parser.get_data()
    asession = AsyncHTMLSession()
    Solelinks_parser = Solelinks("https://vagu.space", "solelinks", asession)
    # await Solelinks_parser.get_data()
    # await task
    asession.run(Solelinks_parser.get_page)
def __init__(self, session: AsyncHTMLSession = None,
             config={"EKSI_URL": "https://eksisozluk.com/"}):
    """Initializes the class."""
    # Reuse the provided session if it really is an AsyncHTMLSession;
    # otherwise (None or any other type) create a fresh one.
    if isinstance(session, AsyncHTMLSession):
        self.session = session
    else:
        self.session = AsyncHTMLSession()
    self.config = config
    self.eksi = self.config["EKSI_URL"]
def __init__(self, gitlabAddress: str) -> None:
    self._orunSession = AsyncHTMLSession(workers=20)
    self._gitlabAddress = gitlabAddress
    # do not stop the loop on self.getAllProjectMetadata()
    self._dontStopLoop = True
    self._pagesCount = 0
    # raw data
    self.projectsMetadata = list()
    self.projectCommitsMetadata = list()
    # **start** Orunmila knows
    self._commitsByYear = dict()
    self._numberOfProjects = 0
async def main():
    pictures_queue = Queue()
    workers_count = 300
    connection = {
        'user': '******',                  # input your postgres username
        'database': 'your database name',  # input your database name here
        'host': '127.0.0.1',               # change your host if it's not local
        'password': '******'               # input your password for this database
    }
    dsn = 'postgresql://{user}:{password}@{host}/{database}'.format(**connection)
    engine = create_engine(dsn)
    result = engine.execute('''select picture from "your_table_name"''')
    res_list = []
    for row in result:
        clean_jpg = row['picture'].split("\n")
        for i in clean_jpg:
            res_list.append(i)
    print(len(res_list))
    for pic in res_list:
        pictures_queue.put_nowait(pic)
    session = AsyncHTMLSession()
    tasks = []
    for num in range(workers_count):
        task = worker(pictures_queue, num, session)
        tasks.append(task)
    await asyncio.gather(*tasks)
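# The worker coroutine used above is not shown; this is a hypothetical sketch
# of what it might look like (the body, the empty-queue exit condition, and
# the on-disk filename are assumptions for illustration only).
async def worker(pictures_queue, num, session):
    # Drain the queue until it is empty, saving each picture to disk.
    while not pictures_queue.empty():
        url = pictures_queue.get_nowait()
        try:
            response = await session.get(url)
            with open(url.split('/')[-1], 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(f'worker {num} failed on {url}: {e}')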
async def moz_parser(urls_q):
    session = AsyncHTMLSession()
    while urls_q.qsize() > 0:
        url = await urls_q.get()
        try:
            response = await session.get(url)
            # Non-blocking pause between requests so the event loop keeps running.
            await asyncio.sleep(5)
        except Exception as e:
            print(type(e), e)
            await urls_q.put(url)
            continue
        links = response.html.xpath('//h2/a')
        for link in links:
            href = link.attrs['href']
            name = link.text
            with open('results.txt', 'a', encoding='utf-8') as f:
                f.write(f'{href}\t{name}\n')
        print(f'SUCCESS | {url}')
def main():
    session = AsyncHTMLSession()
    working_list = set(load(file))
    print(f'Len of working list {len(working_list)}')
    tasks = [
        ping_creator(session, url)
        for url in working_list
        if filtered_url(url)
    ]
async def kuaidaili():
    res = list()
    sess = AsyncHTMLSession()
    resp = await sess.get('https://www.kuaidaili.com/free/inha/')
    for ip_row in resp.html.find('#list table tr'):
        ip = ip_row.find('td[data-title="IP"]', first=True)
        port = ip_row.find('td[data-title="PORT"]', first=True)
        if ip and port:
            res.append(Proxy(
                ip_port=f"{ip.text}:{port.text}",
                scheme=SCHEME_HTTP,
                status=STATUS_NEW,
            ))
    # Pause briefly before hitting the second listing page.
    await asyncio.sleep(5)
    resp = await sess.get('https://www.kuaidaili.com/free/intr/')
    for ip_row in resp.html.find('#list table tr'):
        ip = ip_row.find('td[data-title="IP"]', first=True)
        port = ip_row.find('td[data-title="PORT"]', first=True)
        if ip and port:
            res.append(Proxy(
                ip_port=f"{ip.text}:{port.text}",
                scheme=SCHEME_HTTP,
                status=STATUS_NEW,
            ))
    await sess.close()
    return res
async def http_proxy():
    res = list()
    sess = AsyncHTMLSession()
    for u in [
        'https://proxyhttp.net/free-list/proxy-anonymous-hide-ip-address/',
        'https://proxyhttp.net/',
        'https://proxyhttp.net/free-list/anonymous-server-hide-ip-address/2#proxylist',
    ]:
        resp = await sess.get(u)
        await resp.html.arender(wait=1.5, timeout=10.0)
        for ip_row in resp.html.find('table.proxytbl tr'):
            ip = ip_row.find('td:nth-child(1)', first=True)
            port = ip_row.find('td:nth-child(2)', first=True)
            try:
                if ip and port:
                    port_str = re.search(r'//]]> (\d+)', port.text).group(1)
                    res.append(Proxy(
                        ip_port=f"{ip.text}:{port_str}",
                        scheme=SCHEME_HTTP,
                        status=STATUS_NEW,
                    ))
            except AttributeError:
                pass
    await sess.close()
    return res
async def __get(url: str, proxy: str = None) -> Response:
    session = AsyncHTMLSession()
    if proxy is not None:
        result = await session.get(url, proxies=Scrapper.__proxies(proxy),
                                   timeout=Scrapper.PROXY_TIMEOUT)
    else:
        result = await session.get(url, timeout=Scrapper.GET_TIMEOUT)
    await session.close()
    return result
async def l_wine(message):
    search_string = message.text
    session = AsyncHTMLSession()
    link = 'https://l-wine.ru/collection/?'
    headers = {
        'authority': 'l-wine.ru',
        'cache-control': 'no-store, no-cache, must-revalidate',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36',
        'sec-fetch-dest': 'empty',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://l-wine.ru',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
    }
    params = {
        'q': search_string,
        'PAGEN_1': '2',
        'bxajaxid': '556e3da235ad5f10fd6e4d11c79000cf',
        'parent_bxajaxid': '5a96feaafb623ec95f9c643be02cc78f'
    }
    r = await session.get(link, headers=headers, params=params)
    # await r.html.arender(timeout=20)
    print(r.text)
async def get_stock():
    bestbuy_base_url = "https://www.bestbuy.com/site/computer-cards-components/video-graphics-cards/abcat0507002.c?id=abcat0507002"
    bestbuy_model_stub = Template(
        "qp=gpusv_facet%3DGraphics%20Processing%20Unit%20(GPU)~NVIDIA%20GeForce%20RTX%20$Model"
    )
    # Get the current time and append it to the end of the url just to add some
    # minor difference between scrapes.
    t = int(round(time.time() * 1000))
    urls = {
        f"3070-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3070')}&t={t}",
        f"3070-=https://www.newegg.com/p/pl?N=100007709%20601357250&PageSize=96&t={t}",
        f"3080-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3080')}&t={t}",
        f"3080-=https://www.newegg.com/p/pl?N=100007709%20601357247&PageSize=96&t={t}",
        f"3090-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3090')}&t={t}",
        f"3090-=https://www.newegg.com/p/pl?N=100007709%20601357248&PageSize=96&t={t}"
    }
    s = AsyncHTMLSession()
    tasks = (parse_url(s, url.split("-=")[1], url.split("-=")[0]) for url in urls)
    return await asyncio.gather(*tasks)
async def main2():
    urlList = await main()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    # print(urlList)
    for link in urlList:
        asession = AsyncHTMLSession()
        r = await asession.get(link, headers=headers)
        await r.html.arender()
        # find() returns a list of matches, so an empty result means the page
        # has no embedded video and should be skipped.
        if not r.html.find('.video-wrap'):
            continue
        h1 = r.html.find('.disable-download')
        # print(h1)
        newItem = str(h1)
        start = newItem.find("src='") + len("src='")
        end = newItem.find("' style='background")
        link = newItem[start:end]
        print(link)
        webbrowser.open(link, new=2)
        time.sleep(8)
async def qfdocs(self, ctx, arg):
    query_url = f'https://qiskit.org/documentation/search.html?q={arg}&check_keywords=yes&area=default#'
    try:
        session = AsyncHTMLSession()
        response = await session.get(query_url)
    except:
        return await ctx.send('`Failed to Establish Connection.`')
    else:
        await response.html.arender(sleep=7)
        soup = BeautifulSoup(response.html.html, "html.parser")
        summary = soup.select('.search')
        # return await ctx.send('`Request Timed Out`')
        description = ''
        for li in summary[0].find_all('li')[0:10]:
            link = li.find('a', href=True)
            self.res.append(
                f'[`{link.contents[0]}`]({self.render_link + link["href"]})')
        if self.res == []:
            self.title = '`No Results Found`'
        else:
            self.title = f'`Results for: {arg}`'
        embed = discord.Embed(title=self.title,
                              description='\n'.join(self.res),
                              color=0xe8e3e3)
        return await ctx.send(embed=embed)
async def fetch(self, link) -> Optional[Tuple]:
    session = AsyncHTMLSession()
    try:
        r = await session.get(link)
        container = r.html.find(".zn-body-text", first=True)
        contents: List[Element] = container.find()
        html = ""
        cover = None
        for content in contents:
            element_class = content.attrs.get('class')
            if element_class:
                if "zn-body__paragraph" in element_class:
                    html += content.html
                if "el__embedded" in element_class:
                    img = content.find('img', first=True)
                    if img:
                        caption = img.attrs.get('alt')
                        src = img.attrs.get('data-src-large')
                        src = f"https:{src}"
                        if not cover:
                            cover = src
                        html += f'<img src="{src}" />\n'
                        html += f"<span>{caption}</span>\n"
        self.parser.parse(html)
        return self.parser.convert(), str(self.parser), cover
    except Exception as e:
        print(e)
        return None, None, None
async def get_character_name(gear_url, message):
    """
    It is *sometimes* the case that discord users don't update their username
    to be their character name (e.g. for alts). This method renders the
    gear_url in an HTML session and parses the page to attempt to find the
    character's name. This assumes a specific format of the page: player names
    are nested in an h3 element with a css class named 'class-[player class]'.

    Returns the character's name if successful, otherwise returns the message
    sender's display name in discord.
    """
    name = message.author.display_name
    if not re.match(SIXTY_UPGRADES_REGEX, gear_url):
        return name
    for i in range(MAX_FETCH_CHARACTER_NAME_RETRIES):
        try:
            asession = AsyncHTMLSession()
            webpage = await asession.get(gear_url)
            await webpage.html.arender()
            query_selector = "h3[class^='class-']"
            name = webpage.html.find(query_selector, first=True).text
            break
        except Exception as e:
            logging.error(e)
        finally:
            await asession.close()
    return name
async def getErrorQuestion(self, urlAt):
    print("start error session " + urlAt[0] + " " + urlAt[1])
    errorSession = AsyncHTMLSession()
    repeat = True
    while repeat:
        try:
            print("connecting: " + urlAt[0])
            response = await errorSession.get(urlAt[0])
        except:
            pass
        else:
            repeat = False
            print("rendering: " + urlAt[0])
            try:
                rendered = await self.renderQuestion(response)
            except:
                print("error was on " + urlAt[0])
                self.errors.append(urlAt)
                response.close()
            else:
                del self.errors[self.errors.index(urlAt)]
                response.close()
                print("rendered correctly on: " + urlAt[0])
                self.responses.append([rendered, urlAt[1]])
    await errorSession.close()
async def fetch(self, link: str) -> Optional[Tuple]:
    try:
        session = AsyncHTMLSession()
        r = await session.get(link)
        content = r.html.find(".Mid2L_con", first=True)
        content_list = content.find()
        texts = []
        images = []
        content = ""
        cover = None
        for c in content_list:
            image = c.find("img", first=True)
            if image:
                if image.attrs['src'] not in images:
                    content += image.html
                    images.append(image.attrs['src'])
                    if not cover:
                        cover = image.attrs['src']
            else:
                if c.text not in texts:
                    content += c.html
                    texts.append(c.text)
        self.parser.parse(content)
        return self.parser.convert(), str(self.parser), cover
    except Exception as e:
        return None, None, None
async def get_stock():
    bestbuy_base_url = "https://www.bestbuy.com/site/computer-cards-components/video-graphics-cards/abcat0507002.c?id=abcat0507002"
    bestbuy_model_stub = Template(
        "qp=gpusv_facet%3DGraphics%20Processing%20Unit%20(GPU)~NVIDIA%20GeForce%20RTX%20$Model"
    )
    # Get the current time and append it to the end of the url just to add some
    # minor difference between scrapes.
    t = int(round(time.time() * 1000))
    urls = {
        # f"3070-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3070')}&t={t}",
        # f"3070-=https://www.newegg.com/p/pl?N=100007709%20601357250&PageSize=96&t={t}",
        f"3080-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3080')}&t={t}",
        f"3080-=https://www.newegg.com/p/pl?N=100007709%20601357247&PageSize=96&t={t}",
        f"3090-={bestbuy_base_url}&{bestbuy_model_stub.substitute(Model='3090')}&t={t}",
        f"3090-=https://www.amazon.com/s?k=rtx+3090&i=computers&rh=n%3A17923671011%2Cn%3A284822%2Cp_n_availability%3A1248801011&dc&qid=1605664070&rnid=1248799011&t=%7Bt%7D&t={t}",
        f"3090-=https://www.newegg.com/p/pl?N=100007709%20601357248&PageSize=96&t={t}",
        f"5900X-=https://www.bestbuy.com/site/promo/amd-ryzen-5000?qp=numberofcores_facet%3DNumber%20of%20Cores~12-core&t={t}",
        f"5900X-=https://www.newegg.com/p/pl?N=100007671%20601359154%20601301117&t={t}",
        f"5900X-=https://www.amazon.com/s?k=5900x&i=computers&rh=n%3A229189%2Cp_n_availability%3A1248801011&dc&qid=1605664558&rnid=1248799011&t={t}",
    }
    s = AsyncHTMLSession()
    tasks = (parse_url(s, url.split("-=")[1], url.split("-=")[0]) for url in urls)
    return await asyncio.gather(*tasks)
def test_async_run():
    asession = AsyncHTMLSession()

    async def test1():
        return await asession.get('https://xkcd.com/1957/')

    async def test2():
        return await asession.get('https://reddit.com/')

    async def test3():
        return await asession.get('https://smile.amazon.com/')

    r = asession.run(test1, test2, test3)
    assert len(r) == 3
    assert isinstance(r[0], HTMLResponse)
async def proxynova():
    res = list()
    sess = AsyncHTMLSession()
    resp = await sess.get('https://www.proxynova.com/proxy-server-list/')
    for tr in resp.html.find('#tbl_proxy_list > tbody:nth-child(2) > tr'):
        if 'data-proxy-id' not in tr.attrs:
            continue
        script_element = tr.find('td:nth-child(1) > abbr > script', first=True)
        port_element = tr.find('td:nth-child(2)', first=True)
        if not script_element or not port_element:
            continue
        groups = re.findall(r"document\.write\('(.*?)'\);", script_element.text)
        if not groups or len(groups) != 1:
            continue
        ip = groups[0]
        port = port_element.text
        res.append(Proxy(
            ip_port=f"{ip}:{port}",
            scheme=SCHEME_HTTP,
            status=STATUS_NEW,
        ))
    await sess.close()
    return res
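# A minimal sketch of running the proxy scrapers above concurrently; it assumes
# the kuaidaili, http_proxy and proxynova coroutines live in the same module,
# and the collect_proxies name is made up for illustration.
async def collect_proxies():
    batches = await asyncio.gather(kuaidaili(), http_proxy(), proxynova())
    # Flatten the per-site lists into a single list of Proxy objects.
    return [proxy for batch in batches for proxy in batch]

# Example usage: proxies = asyncio.run(collect_proxies())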
async def getRanks(platform, profile):
    asession = AsyncHTMLSession()
    url = "https://rocketleague.tracker.network/profile/" + platform.strip() + "/" + profile
    r = await asession.get(url)
    await r.html.arender(retries=8)
    ranks = {}
    # parsing stuff below
    parse = r.html.text.split("(Top")
    highest = ["", 0]
    for i in range(1, len(parse) - 2):
        if "Ranked" in parse[i] and "-" not in parse[i]:
            if ("Grand Champion Division" in parse[i]
                    or "Unranked Division" in parse[i]
                    or "I Division I" in parse[i]):
                gameMode = parse[i].split("v")[0].split("\n")[-1][:-1].strip()
                rank = parse[i].split(gameMode)[1].split("\n")[0][5:].strip()
                mmr = int(parse[i].split("\n")[-2].strip().replace(",", ""))
                ranks[gameMode] = {"Rank": "", "MMR": 0}
                ranks[gameMode]["Rank"] = rank
                ranks[gameMode]["MMR"] = mmr
                if mmr > highest[1]:
                    highest = [rank, mmr]
    r.close()
    return ranks, highest
async def get_data_asynchronous():
    urls = [
        'http://www.fpb.pt/fpb2014/!site.go?s=1&show=jog&id=258215'
    ]
    with ThreadPoolExecutor(max_workers=20) as executor:
        with AsyncHTMLSession() as session:
            # Set any session parameters here before calling `fetch`.
            # Initialize the event loop.
            loop = asyncio.get_event_loop()
            # Use a list comprehension to create a list of tasks to complete.
            # The executor runs the `fetch` function for each url in the urls list.
            tasks = [
                await loop.run_in_executor(
                    executor,
                    fetch,
                    *(session, url)  # Allows us to pass multiple arguments to `fetch`.
                )
                for url in urls
            ]
            # Initializes the tasks to run and awaits their results.
            for response in await asyncio.gather(*tasks):
                parseWebpage(response)
async def get_site(self):
    new_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(new_loop)
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False
    })
    session._browser = browser
    url = 'https://money.tmx.com/en/quote/' + self.symbol
    resp_page = await session.get(url)
    await resp_page.html.arender()
    return resp_page