def crawl_by_keywords(keywords):
    connector = db.MongoConnector(config.DB_HOST, config.DB_USER_NAME,
                                  config.DB_PASSWORD, config.DB_NAME)
    backend = MongoQABackend(connector, config.QA_COLLECT_NAME)
    # NOTE: the `keywords` parameter is immediately overwritten here, so the
    # function always crawls the keywords listed in args.kw_file.
    keywords = util.read_txt_lines(args.kw_file)
    keywords = util.expand_keywords(keywords, ['飲食'])
    kw_request = KeywordQueryRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    for keyword in keywords:
        start_url = 'http://so.120ask.com/?kw=%s' % keyword
        current_url = start_url
        while True:
            page_src = kw_request.send(current_url)
            if page_src is None:
                break
            page = KeywordQueryPage(page_src)
            links = page.parse_question_links()
            qids = page.parse_question_ids()
            requests_batch = []
            for qid, link in zip(qids, links):
                cb = AsyncHealthPageCallback(qid, backend)
                requests_batch.append(
                    AsyncHealthQuestionRequest(asession, link, cb))
            if requests_batch:
                asession.run(*[r.send for r in requests_batch])
            next_link = page.parse_next_page_link()
            if next_link is None:
                break
            current_url = urljoin(start_url, next_link)
def find_department_of_keywords(keywords, filepath):
    kw_request = KeywordQueryRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    with open(filepath, 'w', encoding='utf-8') as f:
        for keyword in keywords:
            start_url = 'http://so.120ask.com/?kw=%s' % keyword
            page_src = kw_request.send(start_url)
            assert page_src is not None
            page = KeywordQueryPage(page_src)
            links = page.parse_question_links()
            qids = page.parse_question_ids()
            requests_batch = []
            for qid, link in zip(qids, links):
                cb = DepartmentOfKeywordCallback()
                requests_batch.append(
                    AsyncHealthQuestionRequest(asession, link, cb))
            assert len(requests_batch) > 3
            res = asession.run(*[r.send for r in requests_batch])
            # the department shared by most of the questions is taken as
            # the department of the keyword
            c = Counter(res)
            department, url = c.most_common()[0][0]
            f.write('%s-->%s,%s\n' % (keyword, department, url))
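# A hedged usage sketch for find_department_of_keywords above; the keyword
# file and output path are illustrative placeholders, and the util/config
# wiring is assumed to match the snippet.
keywords = util.read_txt_lines('./keywords.txt')
find_department_of_keywords(keywords, './keyword_departments.txt')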
def test_async_add_data(self):
    from requests_html import AsyncHTMLSession, HTMLSession

    session = HTMLSession()
    asession = AsyncHTMLSession()
    proxies = {}

    async def get_pythonorg():
        r = await asession.get('https://baidu.com/', proxies=proxies)
        print(r.text)

    async def get_reddit():
        r = await asession.get('https://sina.com/', proxies=proxies)
        print(r.text)

    async def get_google():
        r = await asession.get('https://163.com/', proxies=proxies)
        print(r.text)

    asession.run(get_pythonorg, get_reddit, get_google)
async def search_rootme_user_challenges(username: str):
    url = f"https://www.root-me.org/{username}?inc=score"
    session = AsyncHTMLSession()

    async def get_profile():
        r = await session.get(url)
        data = {}
        data['score'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[1]/span/text()"
        )[0].split("\xa0")[0][1:]
        data['ranking'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[2]/span"
        )[0].text
        data['rank'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[3]/span"
        )[0].text
        categories_list = r.html.xpath(
            "/html/body/div/div/div[2]/main/div/div/div/div/div[2]")[0].find("div")
        categories = {}
        for x in categories_list:
            category = x.find('div')[0]
            try:
                title = category.find('h4')[0].text.split('\n')[1]
                categories[title] = {
                    "percentage": category.find('h4')[0].text.split('\n')[0]
                }
                points, _, completion = category.find("span")[1].text.split('\xa0')
                categories[title]['points'] = points
                categories[title]['completion'] = completion
                categories[title]['challenges'] = {}
                challenges = category.find("ul")[0].find('li')
                for challenge in challenges:
                    # a leading 'o' marks a solved challenge in the listing
                    categories[title]['challenges'][challenge.text[2:]] = {
                        'completed': challenge.text[0] == 'o'
                    }
                    categories[title]['challenges'][challenge.text[2:]]['points'] = \
                        challenge.find('a')[0].attrs['title'].split(' ')[0]
            except Exception:
                # skip categories whose markup does not match the expected layout
                pass
        data['challenges'] = categories
        return data

    # NOTE: AsyncHTMLSession.run() drives its own event loop, so although this
    # function is declared async it behaves like a blocking call.
    return session.run(get_profile)[0]
def test_async_run():
    asession = AsyncHTMLSession()

    async def test1():
        return await asession.get('https://xkcd.com/1957/')

    async def test2():
        return await asession.get('https://reddit.com/')

    async def test3():
        return await asession.get('https://smile.amazon.com/')

    r = asession.run(test1, test2, test3)
    assert len(r) == 3
    assert isinstance(r[0], HTMLResponse)
def async_requests(RequestManagers):
    async def make_request(RequestManager, asession):
        rm_dict = RequestManager.dict()
        link = rm_dict['link']
        headers = rm_dict['headers']
        proxies = rm_dict['proxies']
        try:
            # the original passed `proxies=proxy_num`, an undefined name
            r = await asession.get(link, headers=headers, proxies=proxies)
            rm_dict['response'] = r
        except Exception:
            pass
        rm_dict['t1'] = datetime.now()
        RequestManager.set_dict(rm_dict)
        return RequestManager

    r1, r2, r3, r4, r5, r6, r7, r8 = LinkClient.getLinks()
    asession = AsyncHTMLSession()

    # Each wrapper must await make_request; the original neither awaited it
    # (so the coroutines never ran) nor spelled `asession` correctly in get_link1.
    async def get_link1():
        return await make_request(r1, asession)

    async def get_link2():
        return await make_request(r2, asession)

    async def get_link3():
        return await make_request(r3, asession)

    async def get_link4():
        return await make_request(r4, asession)

    async def get_link5():
        return await make_request(r5, asession)

    async def get_link6():
        return await make_request(r6, asession)

    async def get_link7():
        return await make_request(r7, asession)

    async def get_link8():
        return await make_request(r8, asession)

    return asession.run(get_link1, get_link2, get_link3, get_link4,
                        get_link5, get_link6, get_link7, get_link8)
def multi_request(urls, headers=None):
    if len(urls) == 0:
        return []
    if headers is None:
        headers = {}
    session = AsyncHTMLSession()
    scrape_fns = []
    for url in urls:
        # bind `url` as a default argument so each closure keeps its own URL
        async def get_site_content(url=url):
            return await session.get(url, headers=headers)
        scrape_fns.append(get_site_content)
    results = session.run(*scrape_fns)
    session.close()
    return results
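# A minimal usage sketch for multi_request above; the httpbin URLs and the
# header are placeholders standing in for real targets.
pages = multi_request(
    ['https://httpbin.org/html', 'https://httpbin.org/links/3'],
    headers={'User-Agent': 'Mozilla/5.0'},
)
for page in pages:
    print(page.url, len(page.html.links))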
def getnearbyweathers(url_list, weather_data):
    """
    Get the weather for multiple locations asynchronously to reduce delay time.
    :param url_list: list of URLs to get the weather data from
    :return: weather_data: list of dictionary objects containing the weather data
    """
    asession = AsyncHTMLSession()

    async def get_weather1():
        return await asession.get(url_list[0])

    async def get_weather2():
        return await asession.get(url_list[1])

    async def get_weather3():
        return await asession.get(url_list[2])

    # Pass the coroutine functions themselves; the original called them,
    # handing already-created coroutines to run() instead of callables.
    responses = asession.run(get_weather1, get_weather2, get_weather3)
    print(responses)
    for response in responses:
        weather_data.append(get_weather_data_from_url(response))
    return weather_data
async def main():
    if not os.path.exists('./img'):
        os.mkdir('img')
    # sneaker_links_parser = Sneaker_Links("https://sneakerlinks.com", "test")
    # sneaker_links_parser.get_data()
    asession = AsyncHTMLSession()
    Solelinks_parser = Solelinks("https://vagu.space", "solelinks", asession)
    # await Solelinks_parser.get_data()  # await task
    asession.run(Solelinks_parser.get_page)
    # asession.run(solelinks)  # `solelinks` is never defined in this file
    # asyncio.get_event_loop().run_until_complete(solelinks())
    # asession = AsyncHTMLSession()
    # async def get_qq():
    #     r = await asession.get('https://vagu.space/')
    #     await r.html.arender()
    #     print(r.html.raw_html)
    # async def get_toutiao():
    #     r = await asession.get('https://www.toutiao.com/')
    #     x = await r.html.arender()
    #     print(x.html.raw_html)
# Instantiate list for champion dictionaries
champlist = []
# Initialise dictionary list index
index = 0
# Open async session
asession = AsyncHTMLSession()

while index < end:
    # Initialise secondary function parameters (number of functions = chunk)
    get_resp1 = getfunc1(tierLinkList[index + 0]["url"])
    get_resp2 = getfunc2(tierLinkList[index + 1]["url"])
    get_resp3 = getfunc3(tierLinkList[index + 2]["url"])
    get_resp4 = getfunc4(tierLinkList[index + 3]["url"])
    get_resp5 = getfunc5(tierLinkList[index + 4]["url"])
    # Start async secondary functions
    resps = asession.run(get_resp1, get_resp2, get_resp3, get_resp4, get_resp5)
    # Process responses
    for i, r in enumerate(resps):
        cdir = get_champ(r, tierLinkList[index + i])
        champlist.append(cdir)
        log.info(f"{cdir.get('Name', 'Not Found')} ({d}) Done")
        # Temporary supplement information to create csv from largest dict items
        # and smallest dict items to investigate any parsing errors
        maxtuple, mintuple = tmp_info(maxtuple, mintuple)
        d += 1
        # End of temporary info
    index += chunk

rem = len(tierLinkList) % chunk
# Asynchronous requests
async def get_lagou():
    # `await` suspends here until the response arrives
    response = await asession.get(url=url_lagou)
    return response

async def get_boss():
    response = await asession.get(url_boss)
    return response

async def get_qcwy():
    response = await asession.get(url_qcwy)
    return response

# Run the three coroutine functions concurrently
# (the original passed only get_qcwy, contradicting its own comment)
results = asession.run(get_lagou, get_boss, get_qcwy)
# print(results, list(map(lambda x: x, results)))
for i in results:
    # XPath predicates take square brackets; the original used parentheses
    titles = i.html.xpath("/html/body/div[@class='dw_table']")
    for a in titles:
        title = i.html.xpath(".//a[@target='_blank']/@title")
        print(title)
    # i.html.url returns the URL of the current page
    # print(i.html.url)
    # grab all URLs on the page:
    # links = i.html.links
    # print(links)
    # absolute_links completes relative URLs, much like response.urljoin():
    # absolute_links = i.html.absolute_links
class Orunmila:
    def __init__(self, gitlabAddress: str) -> None:
        self._orunSession = AsyncHTMLSession(workers=20)
        self._gitlabAddress = gitlabAddress
        # do not stop the loop in self.getAllProjectsMetadata()
        self._dontStopLoop = True
        self._pagesCount = 0
        # raw data
        self.projectsMetadata = list()
        self.projectCommitsMetadata = list()
        # **start** Orunmila knows
        self._commitsByYear = dict()
        self._numberOfProjects = 0
        # **end** Orunmila knows

    # secondary method
    async def _getProjectMetadata(self) -> None:
        gitlabPlatResponse = await self._orunSession.get(
            f"{self._gitlabAddress}/api/v4/projects?&per_page=100&page={self._pagesCount}"
        )
        if gitlabPlatResponse.json() == []:
            self._dontStopLoop = False
            return
        for pMetadata in gitlabPlatResponse.json():
            self.projectsMetadata.append(dict(pMetadata))
        self._pagesCount += 1

    # secondary method
    async def _getCommitsMetadata(self) -> None:
        pageCount = 0
        gitlabPlatResponse = await self._orunSession.get(
            f"{self._gitlabAddress}/api/v4/projects/{self._projectCurrentTd}/repository/commits?&per_page=100&page={pageCount}"
        )
        while gitlabPlatResponse.json() != []:
            tmp_gitlabPlatResponse = gitlabPlatResponse.json()
            for pCommit in tmp_gitlabPlatResponse:
                try:
                    self.projectCommitsMetadata.append(dict(pCommit))
                except ValueError as error:
                    print(
                        f"Error getting commits: {error}, on repository id: {self._projectCurrentTd}"
                    )
            pageCount += 1
            gitlabPlatResponse = await self._orunSession.get(
                f"{self._gitlabAddress}/api/v4/projects/{self._projectCurrentTd}/repository/commits?&per_page=100&page={pageCount}"
            )
        print(f"getting data of project {self._projectCurrentTd}")

    def getAllProjectsMetadata(self) -> List:
        while self._dontStopLoop:
            self._orunSession.run(self._getProjectMetadata)
            # print(len(self.projectsMetadata))
        # cleanup
        self._pagesCount = 0
        self._dontStopLoop = True
        return self.projectsMetadata

    def getAllCommitsMetadata(self) -> List:
        for project in self.projectsMetadata:
            self._projectCurrentTd = project["id"]
            self._orunSession.run(self._getCommitsMetadata)
        # cleanup
        del self._projectCurrentTd
        return self.projectCommitsMetadata
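# A hedged usage sketch for the Orunmila client above; the GitLab address is
# a placeholder, and note the snippet paginates from page 0 even though
# GitLab's API pages conventionally start at 1.
orun = Orunmila("https://gitlab.example.com")
projects = orun.getAllProjectsMetadata()
commits = orun.getAllCommitsMetadata()
print(f"collected {len(projects)} projects and {len(commits)} commits")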
    # ...tail of process_teacher_courses(teacher_section):
    name, *courses_filtered = re.split(
        "\n|,", teacher_section.find("p", first=True).text)
    courses = [course.strip() for course in courses_filtered if course]
    a_link = teacher_section.find("a", first=True)
    site = a_link.attrs.get("href") if a_link else None
    # return a tuple of the name, courses and site
    return (name, courses, site)

results = [get_page_function(url) for url in urls]
results_html = asession.run(*results)

classes = {}
for result in results_html:
    response = result.html
    # split the cells up, discarding the leading one
    _, *response_td = response.find('td')
    for td in response_td:
        teacher_name, courses, site = process_teacher_courses(td)
        teacher = (teacher_name, site)
        logo,
        elem["name"],
        elem["symbol"],
        # description, tags, status,
        elem["max_supply"],
        elem["circulating_supply"],
        elem["total_supply"],
        elem["quote"]["USD"]["market_cap"],
        elem["quote"]["USD"]["price"],
        elem["quote"]["USD"]["volume_24h"],
        elem["quote"]["USD"]["percent_change_1h"],
        elem["quote"]["USD"]["percent_change_24h"],
        elem["quote"]["USD"]["percent_change_7d"],
        elem["last_updated"],
    ]
    with open("crypto_asset_meta.csv", "a") as f1:
        csv_writer_1 = csv.writer(f1)
        csv_writer_1.writerow(asset_info)


if __name__ == '__main__':
    asession = AsyncHTMLSession()
    url_list = [
        "https://web-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?convert=USD&cryptocurrency_type=all&limit=1000&sort=market_cap&sort_dir=desc&start=1",
    ]
    # bind `url` as a default argument so each lambda keeps its own URL
    asession.run(
        *[lambda url=url: get_asset_info(asession, url) for url in url_list])
class LandChinaBot:
    info_all = []
    url = 'https://www.landchina.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        # 'Cookie': 'security_session_mid_verify=b92679e3c892fc921cb78030f1e86157',
        'Cookie': f'Hm_lvt_83853859c7247c5b03b527894622d3fa=1587194605,1587302414,1587386859,1587430790; ASP.NET_SessionId=uamiaoeymyazhuxclijuqxpm; security_session_verify=b85755a9c9b76a4d6195095ceaf8620d; security_session_high_verify=f06c25fa45c4419e26e53b51dff6f097; security_session_mid_verify=53623062e0ea9dd09f49fe550795e88a; Hm_lpvt_83853859c7247c5b03b527894622d3fa={int(time.time())}'
    }
    data = None

    def __init__(self, city_code, city_name):
        self.city_name = city_name
        self.getCityInfo(city_code, city_name)
        self.async_session = AsyncHTMLSession()

    def getCityInfo(self, city_code, city_name):
        # 894e12d9-6b0f-46a2-b053-73c49d2f706d: transfer announcements after 2011
        # city_info = unquote(f'894e12d9-6b0f-46a2-b053-73c49d2f706d:{city_code}' + u"▓~" + city_name)
        city_info = unquote(
            f'b4a43cbb-3c47-47ee-81cf-8b993d5bda89:{city_code}' + u"▓~" + city_name)
        city_info = city_info.encode("gb18030")
        self.data = {
            'TAB_QuerySubmitConditionData': city_info,
        }

    def to_csv(self, datas):
        """
        CSV storage logic.
        :param datas: one record as a dict
        :return: True on success, False otherwise
        """
        import csv
        if not os.path.exists('./中国土地市场-出让公告2011前.csv'):
            names = [name for name in datas.keys()]
            with open('./中国土地市场-出让公告2011前.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                if isinstance(names, list):
                    # write the header row
                    if names:
                        writer.writerow(names)
        # store the data
        data = [i for i in datas.values()]
        try:
            with open('./中国土地市场-出让公告2011前.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                if isinstance(data, list):
                    # write a single row
                    if data:
                        writer.writerow(data)
                    return True
                else:
                    # print(type(data))
                    return False
        except Exception:
            # the original `raise e.args` would itself fail; re-raise instead
            raise

    @staticmethod
    def stringToHex():
        width = str(GetSystemMetrics(0))
        height = str(GetSystemMetrics(1))
        screendate = width + "," + height
        val = ""
        for i in range(len(screendate)):
            if val == "":
                val = binascii.b2a_hex(screendate[i].encode('utf-8'))
            else:
                val += binascii.b2a_hex(screendate[i].encode('utf-8'))
        return val.decode('utf-8')

    async def getCookie(self):
        response = await self.async_session.get(self.url, headers=self.headers)
        security_verify_data = self.stringToHex()
        link = f'{self.url}?security_verify_data={security_verify_data}'
        response = await self.async_session.get(link, headers=self.headers)
        # print(self.async_session.cookies)

    async def getInfo(self, session):
        # detail_link = []
        link = f'{self.url}default.aspx?tabid=324'
        for page in range(1, 2):
            self.data['TAB_QuerySubmitPagerData'] = str(page)
            # print(self.data)
            try:
                response = requests.post(link, data=self.data,
                                         headers=self.headers)
                # print(response.content.decode('gbk'))
                info = Selector(text=response.content.decode('gbk')).xpath(
                    '//*[@id="TAB_contentTable"]/tbody/tr')
                # info = response.html.xpath('//*[@id="TAB_contentTable"]/tbody/tr')
                for sub_raw in info[1:]:
                    info_basic = {}
                    basic_value = []
                    sub_list = sub_raw.xpath('td')
                    for i, info_sub in enumerate(sub_list):
                        if i == 0:
                            info_sub = info_sub.xpath('text()').extract()[0][:-1]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                        elif i > 0 and i != 2:
                            info_sub = info_sub.xpath('text()').extract()[0]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                        else:
                            link_sub = info_sub.xpath('a/@href').extract()[0]
                            # detail_link.append(link_sub)
                            try:
                                info_sub = info_sub.xpath('a/text()').extract()[0]
                            except IndexError:
                                info_sub = info_sub.xpath('a/span/@title').extract()[0]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                    # print('\n')
                    details = await self.getDetail(link_sub, self.async_session)
                    info_basic['城市'] = self.city_name
                    # info_basic['序号'] = basic_value[0]
                    info_basic['行政区'] = basic_value[1]
                    info_basic['供应公告标题'] = basic_value[2]
                    info_basic['公告编号'] = ''.join(
                        re.findall(r'[\s|\S]*(\([\s|\S]*\))', str(basic_value[2])))
                    info_basic['公告类型'] = basic_value[3]
                    info_basic['发布时间'] = basic_value[4]
                    info_basic['网上创建时间'] = basic_value[5]
                    # info_basic['地块公示信息'] = details
                    for det in details:
                        info_ba = {**info_basic, **det}
                        all_data = {
                            '城市': '', '行政区': '', '供应公告标题': '',
                            '公告编号': '', '公告类型': '', '发布时间': '',
                            '网上创建时间': '', '宗地编号:': '', '宗地面积:': '',
                            '宗地坐落:': '', '出让年限:': '', '容积率:': '',
                            '建筑密度(%):': '', '绿化率(%):': '', '建筑限高(米):': '',
                            '土地用途:': '', '投资强度:': '', '保证金:': '',
                            '起始价:': '', '加价幅度:': '', '挂牌开始时间:': '',
                            '挂牌截止时间:': '', '提交书面申请地': '',
                            '缴纳竞买保证金截止时间': '', '确认竞买资格时间': '',
                            '拍卖开始时间': '', '拍卖挂牌进行地点': '', '联系地址': '',
                            '联系人': '', '联系电话': '', '开户单位': '',
                            '开户银行': '', '银行账号': ''
                        }
                        info_all = {**all_data, **info_ba}
                        print(info_all)
                        self.to_csv(info_all)
                        # self.info_all.append(info_all)
            except Exception:
                continue
        # return detail_link

    async def getDetail(self, link, session):
        link = f'{self.url}{link}'
        print(link)
        # link = 'https://www.landchina.com//DesktopModule/BizframeExtendMdl/workList/bulWorkView.aspx?wmguid=20aae8dc-4a0c-4af5-aedf-cc153eb6efdf&recorderguid=4eff5dbf-6bce-4cef-a3a8-61b51cd4dc21&sitePath='
        # response = await session.get(link, headers=self.headers)
        response = requests.get(link, headers=self.headers)
        # print(response.content.decode('gb18030'))
        # the page obfuscates text with a custom web font; fetch the font file
        ttf_url = re.findall(
            r"truetype[\s|\S]*styles/fonts/([\s|\S]*?)'[\s|\S]*woff'\)",
            response.content.decode('gb18030'))[0]
        # print(ttf_url)
        # ttf_content = await session.get(f'{self.url}/styles/fonts/{ttf_url}', headers=self.headers)
        ttf_content = requests.get(f'{self.url}/styles/fonts/{ttf_url}',
                                   headers=self.headers)
        new_font_name = f"{link.split('recorderguid=')[1]}.ttf"
        with open(new_font_name, 'wb') as f:
            f.write(ttf_content.content)
        info_text_all = Selector(text=response.content.decode('gb18030')).xpath(
            '//*[@id="tdContent"]//td/div')
        other_text = Selector(text=response.content.decode('gb18030')).xpath(
            '//*[@id="tdContent"]//td/p//text()').extract()
        # info = ''.join([str(ir).replace('\r\n','').replace(' ','') for ir in info_text])
        bg_all = []
        for info_t in info_text_all:
            info_text = info_t.xpath('table//text()').extract()
            info_temp = '#'.join(
                list(filter(None, [
                    str(ir).replace(' ', '').replace('\t', '')
                    for ir in info_text
                ])))
            bg_all.append(info_temp)
        info = '$$$$'.join(bg_all)
        other = list(
            filter(None, [
                str(ot).replace(' ', '').replace('\t', '') for ot in other_text
            ]))
        # info = [str(ir).replace(' ','').replace('\t','') for ir in info_text]
        # info = '#'.join(info)
        other = '&'.join(other)
        # map the obfuscated font glyphs back to real characters
        info_all = replace_content(f'{info}****{other}',
                                   link.split('recorderguid=')[1])
        if not info_all:
            return False
        # print(info_all)
        names = [
            '宗地编号:', '宗地面积:', '宗地坐落:', '出让年限:', '容积率:',
            '建筑密度(%):', '绿化率(%):', '建筑限高(米):', '土地用途:', '投资强度:',
            '保证金:', '起始价:', '加价幅度:', '挂牌开始时间:', '挂牌截止时间:',
            '提交书面申请地', '缴纳竞买保证金截止时间', '确认竞买资格时间',
            '拍卖开始时间', '拍卖挂牌进行地点', '联系地址', '联系人', '联系电话',
            '开户单位', '开户银行', '银行账号'
        ]
        keys = [
            '宗地编号', '宗地面积', '宗地坐落', '出让年限', '容积率', '建筑密度',
            '绿化率', '建筑限高', '土地用途', '投资强度', '保证金', '起始价',
            '加价幅度', '挂牌开始时间', '挂牌截止时间', '提交书面申请地',
            '缴纳竞买保证金截止时间', '确认竞买资格时间', '拍卖开始时间',
            '拍卖挂牌进行地点', '联系地址', '联系人', '联系电话', '开户单位',
            '开户银行', '银行账号'
        ]
        info = info_all.split('****')[0]   # table information
        other = info_all.split('****')[1]  # other body text
        infos = list(filter(None, info.split('$$$$')))
        # infos = info.split('$$$$')
        # infos_all = []
        # multi-table parsing logic
        content_all = []
        for info in infos:
            result_info = info.split('#')
            # print(result_info)
            # infos_all.append(result_info)
            # append an empty element to avoid an index error below
            result_info.append('')
            # table parsing logic
            content_dict = dict()
            for i, inf in enumerate(result_info):
                if inf in names:
                    if result_info[i + 1] in names:
                        content_dict[inf] = ''
                    # elif inf in ['用途名称', '面积']:
                    #     if inf == '用途名称':
                    #         yt = re.findall(r'用途名称#面积#([\u4E00-\u9FA5]+).*?#投资强度', info)
                    #         if yt:
                    #             content_dict[inf] = yt[0]
                    #         else:
                    #             content_dict[inf] = ''
                    #     else:
                    #         mj = re.findall(r'用途名称#面积#.*?([\d|\.|#]+?)#投资强度', info)
                    #         if mj:
                    #             content_dict[inf] = mj[0][1:]
                    #         else:
                    #             content_dict[inf] = ''
                    else:
                        content_dict[inf] = str(result_info[i + 1])
                else:
                    pass
            # non-table information
            content_dict['提交书面申请地'] = ''.join(
                re.findall(r'五、申请人可于[\s|\S]*&到&([\s|\S]*?)&向我局提交书面申请', other))
            content_dict['缴纳竞买保证金截止时间'] = ''.join(
                re.findall(r'竞买保证金的截止时间为&([\s|\S]*?)&', other))
            content_dict['确认竞买资格时间'] = ''.join(
                re.findall(r'具备申请条件的,我局将在&([\s|\S]*?)&前确认其竞买资格', other))
            try:
                content_dict['拍卖开始时间'] = ''.join(
                    re.findall(r'拍卖活动定于&([\s|\S]*?)&在&', other))
                content_dict['拍卖挂牌进行地点'] = ''.join(
                    re.findall(r'&在&([\s|\S]*?)&进行', other))
            except Exception:
                # e.g. '&号地块:&2020年05月18土09时30分&至&'
                # content_dict['拍卖开始时间'] = ''
                # content_dict['拍卖进行地点'] = ''
                # content_dict['挂牌开始时间'] = ''.join(re.findall(r'&号地块:&([\s|\S]*?)&至&', other))
                # content_dict['挂牌进行地点'] = ''.join(re.findall(r'&在&([\s|\S]*?)&进行', other))
                pass
            content_dict['联系地址'] = ''.join(
                re.findall(r'联系地址:([\s|\S]*?)&', other))
            content_dict['联系人'] = ''.join(
                re.findall(r'&联系人:([\s|\S]*?)&', other))
            content_dict['联系电话'] = ''.join(
                re.findall(r'&联系电话:([\s|\S]*?)&', other))
            content_dict['开户单位'] = ''.join(
                re.findall(r'&开户单位:([\s|\S]*?)&', other))
            content_dict['开户银行'] = ''.join(
                re.findall(r'&开户银行:([\s|\S]*?)&', other))
            content_dict['银行账号'] = ''.join(
                re.findall(r'&银行帐号:([\s|\S]*)', other))
            content_dict['内容链接'] = link
            content_dict['主键MD5'] = self.to_md5(str(link))
            content_dict['爬取时间'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
            content_all.append(content_dict)
        # one record per table
        return content_all

    async def run(self):
        await self.getCookie()
        await self.getInfo(self.async_session)

    def to_md5(self, txt):
        import hashlib
        m = hashlib.md5()
        m.update(txt.encode())
        return m.hexdigest()

    def main(self):
        self.async_session.run(self.run)
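# A hedged usage sketch for this LandChinaBot; the city code/name pair is a
# hypothetical placeholder that must match what landchina.com expects, and
# the hard-coded session cookies in `headers` will likely need refreshing.
if __name__ == '__main__':
    bot = LandChinaBot('1100', '北京市')
    bot.main()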
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__Author__ = "Alvin Liu"

# sample image URLs:
# 'http://www-mipengine-org.mipcdn.com/i/p3.manhuapan.com/2020/12/11211014200587.jpg'
# 'http://www-mipengine-org.mipcdn.com/i/p3.manhuapan.com/2020/12/11211014200599.jpg'

from requests_html import AsyncHTMLSession
from requests_html import HTMLSession

async_session = AsyncHTMLSession()


async def fetchBaidu():
    """
    Fetch the page and inspect which fields the interface returns.
    (The name says Baidu, but the URL actually points at manhua.fzdm.com.)
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'
    result = await async_session.get(
        'https://manhua.fzdm.com/2/998/index_1.html', headers={'user-agent': ua})
    for i in result.html.find('img'):
        # show the scraped HTML content
        print(i)


# fetchBaidu()
async_session.run(fetchBaidu)
import time

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()


async def delay_1():
    r = await asession.get('https://httpbin.org/delay/1')
    return r


async def delay_2():
    r = await asession.get('https://httpbin.org/delay/2')
    return r


async def delay_3():
    r = await asession.get('https://httpbin.org/delay/3')
    return r


t1 = time.perf_counter()
results = asession.run(delay_1, delay_2, delay_3)
for result in results:
    print(result.html.url)
t2 = time.perf_counter()
print(f'finished in {round(t2 - t1, 2)}')
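# For contrast, a sequential baseline under the same httpbin delays, using the
# blocking HTMLSession: total time is roughly the sum of the delays (~6 s)
# rather than the longest single delay (~3 s) seen with asession.run above.
from requests_html import HTMLSession

session = HTMLSession()
t1 = time.perf_counter()
for n in (1, 2, 3):
    session.get(f'https://httpbin.org/delay/{n}')
t2 = time.perf_counter()
print(f'sequential finished in {round(t2 - t1, 2)}')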
    time1 = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
    return r_crude.html.xpath(
        '//*[@id="quote-header-info"]/div[3]/div/div/span[1]')[0].text, "Crude", time1


async def get_gold():
    r_gold = await asession.get("https://finance.yahoo.com/quote/GC=F?p=GC=F")
    ts = time.time()
    time1 = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
    return r_gold.html.xpath(
        '//*[@id="quote-header-info"]/div[3]/div/div/span[1]')[0].text, "Gold", time1


conn = sqlite3.connect("portfoliostocks.db")
print(type(conn))
cur = conn.cursor()
# cur.execute("""CREATE TABLE stocks_prices(
#              stock_name text,
#              price REAL,
#              time1 text)""")

for i in range(100):
    results = asession.run(get_snp, get_dow, get_nasdaq, get_crude, get_gold)
    for result in results:
        conn.execute("INSERT INTO stocks_prices VALUES (?,?,?)",
                     (result[1], result[0], result[2]))
        print(result[1], result[0], result[2])
    time.sleep(20)
    conn.commit()

conn.close()
class LandChinaBot:
    info_all = []
    url = 'https://www.landchina.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        # 'Cookie': 'security_session_mid_verify=b92679e3c892fc921cb78030f1e86157',
    }
    data = None

    def __init__(self, city_code, city_name):
        self.getCityInfo(city_code, city_name)
        self.async_session = AsyncHTMLSession()

    def getCityInfo(self, city_code, city_name):
        city_info = unquote(
            f'42ad98ae-c46a-40aa-aacc-c0884036eeaf:{city_code}' + u"▓~" + city_name)
        city_info = city_info.encode("gb18030")
        self.data = {
            'TAB_QuerySubmitConditionData': city_info,
        }

    @staticmethod
    def stringToHex():
        width = str(GetSystemMetrics(0))
        height = str(GetSystemMetrics(1))
        screendate = width + "," + height
        val = ""
        for i in range(len(screendate)):
            if val == "":
                val = binascii.b2a_hex(screendate[i].encode('utf-8'))
            else:
                # the original had a stray `pass` before this line
                val += binascii.b2a_hex(screendate[i].encode('utf-8'))
        return val.decode('utf-8')

    async def getCookie(self):
        response = await self.async_session.get(self.url, headers=self.headers)
        security_verify_data = self.stringToHex()
        link = f'{self.url}?security_verify_data={security_verify_data}'
        response = await self.async_session.get(link, headers=self.headers)
        # print(self.async_session.cookies)

    async def getInfo(self, session):
        # detail_link = []
        link = f'{self.url}default.aspx?tabid=263'
        response = await session.post(link, data=self.data, headers=self.headers)
        # print(response.text)
        info = response.html.xpath('//*[@id="TAB_contentTable"]/tbody/tr')
        for sub_raw in info[1:]:
            info_basic = {}
            basic_value = []
            sub_list = sub_raw.xpath('//td')
            for i, info_sub in enumerate(sub_list):
                if i != 2:
                    info_sub = info_sub.xpath('//text()')[0]
                    # print(info_sub, end=' ')
                    basic_value.append(info_sub)
                else:
                    link_sub = info_sub.xpath('//a/@href')[0]
                    # detail_link.append(link_sub)
                    try:
                        info_sub = info_sub.xpath('//a/text()')[0]
                    except IndexError:
                        info_sub = info_sub.xpath('//a/span/@title')[0]
                    # print(info_sub, end=' ')
                    basic_value.append(info_sub)
            # print('\n')
            details = await self.getDetail(link_sub, self.async_session)
            info_basic['序号'] = basic_value[0][:-1]
            info_basic['行政区'] = basic_value[1]
            info_basic['土地坐落'] = basic_value[2]
            info_basic['总面积'] = basic_value[3]
            info_basic['土地用途'] = basic_value[4]
            info_basic['供应方式'] = basic_value[5]
            info_basic['签订日期'] = basic_value[6]
            info_basic['供地结果信息'] = details
            self.info_all.append(info_basic)
        # return detail_link

    async def getDetail(self, link, session):
        link = f'{self.url}{link}'
        # print(link)
        response = await session.get(link, headers=self.headers)
        # print(response.text)
        info = response.html.xpath(
            '//*[contains(@id, "mainModuleContainer_1855_1856_ctl00_ctl00_p1_")]/text()'
        )
        # print(info)
        if not info:
            return False
        info_new = ''
        pay_i = right_i = None
        # Work out the actual value of '土地来源' (land source); drop empty
        # values and turn the data into '#'-separated key/value pairs.
        for i, info_sub in enumerate(info):
            if '土地来源' in info_sub:
                if float(info[i + 1]) == float(info[i - 1]):
                    info[i + 1] = '现有建设用地'
                elif float(info[i + 1]) == 0:
                    info[i + 1] = '新增建设用地'
                else:
                    info[i + 1] = '新增建设用地(来自存量库)'
            elif '分期支付约定' in info_sub:
                pay_i = i
            elif '土地使用权人' in info_sub:
                right_i = i
            if info_sub != '\xa0':
                info_sub = f'"{info_sub}"'
                if ':' in info_sub or ':' in info_sub:
                    info_sub = f'#{info_sub[:-2]}":'
                # mark the span holding the value of '分期支付约定'
                if pay_i == i:
                    info_sub = info_sub + '['
                if not right_i and pay_i and i > pay_i:
                    info_sub = info_sub + ','
                if right_i == i:
                    info_sub = ']' + info_sub
                info_new += info_sub
        # The original left this split commented out, but the list form is
        # required by the .index()/.pop() calls below.
        info_new = info_new.split('#')
        # work out the actual value of '约定容积率' (agreed plot ratio)
        volume_i = info_new.index('"约定容积率":')
        if info_new[volume_i + 1][-1] == ':':
            info_new[volume_i + 1] += '""'
        info_new[volume_i + 1] = '{' + info_new[volume_i + 1] + ','
        if info_new[volume_i + 2][-1] == ':':
            info_new[volume_i + 2] += '""'
        info_new[volume_i + 2] = info_new[volume_i + 2] + '}'
        info_new[volume_i] = (
            f'{info_new[volume_i]}{info_new[volume_i + 1]}{info_new[volume_i + 2]}')
        info_new.pop(volume_i + 1)
        info_new.pop(volume_i + 1)
        # fill in empty values and assemble a dict literal
        info = '{'
        for i, info_sub in enumerate(info_new[1:]):
            if len(info_sub) > 1 and info_sub[-1] == ':':
                info_sub += '""'
            info += f'{info_sub},'
        info += '}'
        info = eval(info)
        # work out the actual value of '分期支付约定' (installment payment terms)
        pay_info = info['分期支付约定'][4:]
        if pay_info:
            pay_info_new = '['
            for i, info_sub in enumerate(pay_info):
                info_sub = f"'{info_sub}',"
                if re.match(r"^\'\d+\',$", info_sub):
                    info_sub = '[' + info_sub
                    if i > 1:
                        pay_info_new = pay_info_new[:-1] + "],"
                pay_info_new += info_sub
            pay_info_new = pay_info_new[:-1] + "]]"
            pay_info_new = eval(pay_info_new)
            # de-duplicate by installment number
            info_index = None
            pay_info = []
            for info_sub in pay_info_new:
                if info_index != info_sub[0]:
                    # pad missing fields
                    info_pay = {}
                    for i in range(4 - len(info_sub)):
                        info_sub.append('')
                    info_pay['支付期号'] = info_sub[0]
                    info_pay['约定支付日期'] = info_sub[1]
                    info_pay['约定支付金额(万元)'] = info_sub[2]
                    info_pay['备注'] = info_sub[3]
                    pay_info.append(info_pay)
                    info_index = info_sub[0]
            info['分期支付约定'] = pay_info
        else:
            info['分期支付约定'] = []
        # print(info, '\n\n')
        return info

    async def run(self):
        await self.getCookie()
        await self.getInfo(self.async_session)
        for info_sub in self.info_all:
            print(info_sub, '\n\n')

    def main(self):
        self.async_session.run(self.run)
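# The detail parser above assembles dict/list literals as text and executes
# them with eval(). A hedged hardening sketch: since the assembled text is
# intended to be a pure literal, ast.literal_eval is a drop-in replacement
# that refuses anything executable.
import ast

# instead of `info = eval(info)` and `pay_info_new = eval(pay_info_new)`:
info = ast.literal_eval(info)                  # raises on non-literal text
pay_info_new = ast.literal_eval(pay_info_new)  # rather than executing it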
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

url = 'http://1463-50e62051-2d46-4d20.nss.ctfer.vip:9080/shop?page={}'
find_str = 'lv6.png'
start, end = 0, 200


async def download(link, text):
    res = await asession.get(link)
    if text in res.text:
        print(link)


if __name__ == "__main__":
    lst = [url.format(i) for i in range(start, end)]
    funcs = [lambda x=x: download(x, find_str) for x in lst]
    asession.run(*funcs)
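# The `lambda x=x:` default argument in the comprehension above is what keeps
# each task pointed at its own page. A minimal demonstration of the
# late-binding pitfall it avoids:
late = [lambda: i for i in range(3)]
bound = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2] -- every closure sees the final i
print([f() for f in bound])  # [0, 1, 2] -- defaults are evaluated immediately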
    ascdep_date = ascdep_url[i]
    asession = AsyncHTMLSession()

    # NOTE: in both URLs below, '&curren' had been swallowed as the HTML
    # entity '¤'; the '&currency=GBP' parameter is restored here.
    async def getasc():
        r = await asession.get(
            f"https://booking.maykenbel.com/?chain=19159&template=maykenbel&shell=MKNBL2018&start=availresults&brand=maykenbe&currency=GBP&lang=1&arrive={ascdate[5:7]}%2F{ascdate[8:10]}%2F{ascdate[:4]}&depart={ascdep_date[5:7]}%2F{ascdep_date[8:10]}%2F{ascdep_date[:4]}&hotel=70825&dpArrive={ascdate[8:10]}%2F{ascdate[5:7]}%2F{ascdate[:4]}&dpDepart={ascdep_date[8:10]}%2F{ascdep_date[5:7]}%2F{ascdep_date[:4]}&rooms=1&adult=1&promo="
        )
        return r

    async def getchc():
        r = await asession.get(
            f"https://secure.chevalcollection.com/convert/site/Cheval%20Harrington%20Court[wsJsZoGCLg62hr_WrMSMy9dIwRklPItcNUhU30wAXMo]/en/results.php?checkin={chcstart_url[i]}&nights={chc_nights[i]}&currency=GBP&resultViewType=sda&viewtype=rateroom&partya=0"
        )
        return r

    results = asession.run(getasc, getchc)
    for result in results:
        print(result)
        match = re.search("cheval", result.html.url)
        print("Date " + ascdate)
        if match:
            print("Cheval Harrington Court")
            try:
                discchc1bed = result.html.find("#mbprice_4932506_15069_123",
                                               first=True).text
                if discchc1bed:
                    chc1bed = chc_calc(discchc1bed, chc_nights[i])
"labels": labels, "url": url, "img_url": img_url } f.write(title + des + level + users + labels + url + img_url + '\n') # if c_list: # items = c_list.find('') # for item in items: # title = item.find(".course-card-name").text() # 查找title # des = item.find(".course-card-desc").text() # level = item.find(".course-card-info>span:eq(0)").text() # users = item.find(".course-card-info>span:eq(1)").text() # labels = item.find(".course-label").text().split(" ") # url = item("https://www.imooc.com/learn/", item.find("a").attr("href")) # url拼接 # img_url = item("https://img3.mukewang.com/", item.find("img").attr("src")) # url拼接 # dict = { # "title":title, # "des":des, # "level":level, # "users":users, # "labels":labels, # "url":url, # "img_url":img_url # } # print(dict) if __name__ == '__main__': result = asession.run(get_html)
    for l in f:
        # strip returns a new string; the original discarded the result
        l = util.rlstrip(l)
        m = re.search(r'(.*)-->(.*),(.*)', l)
        kw, d, url = m.group(1), m.group(2), m.group(3)
        yield kw, d, url


if __name__ == '__main__':
    print('Start script of crawl departments')
    connector = db.MongoConnector(config.DB_HOST, config.DB_USER_NAME,
                                  config.DB_PASSWORD, config.DB_NAME)
    backend = MongoQABackend(connector, config.QA_COLLECT_NAME)
    r = DepartmentListRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    for kw, dep, url in read_keyword_urls(args.url_file):
        print('crawl %s-->%s' % (kw, url))
        current_url = url
        while current_url is not None:
            page_src = r.send(current_url)
            page = PageUnderDepartment('呼吸內科', page_src)
            qurls = page.parse_questions()
            requests_batch = []
            for qid, link in qurls:
                cb = AsyncHealthPageCallback(qid, backend)
                requests_batch.append(
                    AsyncHealthQuestionRequest(asession, link, cb))
            if requests_batch:
                asession.run(*[req.send for req in requests_batch])
            next_link = page.parse_next_link()
            current_url = next_link
from json import loads
from base64 import b64decode
from requests_html import AsyncHTMLSession

a = AsyncHTMLSession()
url = 'https://www.proxyfish.com/proxylist/server_processing.php?type='


async def 一():
    return await a.get(url + 'HTTP')  # ~1.5k


async def 二():
    return await a.get(url + 'HTTPS')  # ~500


results = a.run(一, 二)

for result in results:
    req = result.json()['data']  # request JSON data
    data = b64decode(req)        # decode the base64 payload
    d = loads(data)              # load the JSON list
    for p in d:
        # only the proxy IP and port
        print(p[1] + ':' + p[2])
class asyncio_crawler(object):
    def __init__(self, url, depth=10, numworkers=None):
        # create an async requests_html session
        self.asession = AsyncHTMLSession(workers=numworkers)
        if numworkers:
            self.numworkers = numworkers
        else:
            self.numworkers = multiprocessing.cpu_count() * 5
        self.url = url
        self.host = urlparse.urlparse(url).netloc
        self.regex = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,6}\b",
                                re.IGNORECASE)
        self.depth = depth
        self.runtime = 0
        self.gen_link_counter = 0
        self.mails = set()
        self.links = set()
        self.links_done = set()
        LOGGER.info('-----> ASYNCIO_CRAWLER INITIALIZED <-----')

    async def Task(self):
        '''This function is used to generate tasks for the asyncio call.'''
        if self.links:
            # TODO: check whether this is also possible with asyncio.Queue
            url = self.links.pop()
            # LOGGER.info(url)
            self.links_done.add(url)
            # use the async requests_html framework to do the request,
            # get the HTML text and generate new links
            try:
                response = await self.asession.get(url)
                for link in response.html.absolute_links:
                    if self.host in link:
                        self.links.add(link)
                # use a regular expression to scan for mail addresses
                for mail_addr in re.findall(self.regex, response.html.full_text):
                    self.mails.add(mail_addr)
            except Exception as e:
                # the request failed, but we don't care too much
                LOGGER.warning(e)

    def crawl(self):
        '''Main loop: use requests_html to collect links and mails,
        starting from all links found on the start URL.'''
        self.links.add(self.url)
        while self.links and self.gen_link_counter < self.depth:
            # reset per generation so stale tasks are not re-run
            # (the original accumulated Task_list across generations)
            Task_list = []
            for link in self.links:
                Task_list.append(self.Task)
            # run the tasks with the run command of async requests_html
            self.asession.run(*Task_list)
            self.links.difference_update(self.links_done)
            self.gen_link_counter += 1

    def run(self):
        '''Method to be called from the executable.'''
        LOGGER.info('-----> RUN ASYNCIO_CRAWLER <-----')
        starttime = time.time()
        try:
            self.crawl()
        except KeyboardInterrupt:
            # abort the crawler using CTRL+C
            pass
        except Exception:
            raise
        self.runtime = time.time() - starttime
        self.report()

    def report(self):
        # report all results to the console
        LOGGER.info('-----> ASYNCIO_CRAWLER FINISHED <-----')
        LOGGER.info('-----> REPORT FOLLOWS <-----')
        LOGGER.info('-----> Links done: <-----')
        for link in self.links_done:
            LOGGER.info(link)
        LOGGER.info('-----> Got mails: <-----')
        for mail in self.mails:
            LOGGER.info(mail)
        LOGGER.info('-----> Mails found: <-----')
        LOGGER.info(len(self.mails))
        LOGGER.info('-----> Finished linklist: <-----')
        empty = not bool(self.links)
        LOGGER.info(empty)
        LOGGER.info('-----> Generation of links: <-----')
        LOGGER.info(self.gen_link_counter)
        LOGGER.info('-----> Links done: <-----')
        LOGGER.info(len(self.links_done))
        LOGGER.info('-----> Number of Workers: <-----')
        LOGGER.info(self.numworkers)
        LOGGER.info('-----> ASYNCIO_Crawler runtime [s]: <-----')
        LOGGER.info(self.runtime)
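# A hedged usage sketch for asyncio_crawler above; the start URL, depth and
# worker count are placeholders.
crawler = asyncio_crawler('https://example.com', depth=3, numworkers=10)
crawler.run()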
# A session currently appears to be required even for a single get request
# with this library; the constructor call was truncated in this snippet and
# is restored minimally here:
a = AsyncHTMLSession()


async def 一():
    return await a.get('https://free-proxy-list.net')  # 300


async def 二():
    return await a.get('https://www.us-proxy.org')  # 200


async def 三():
    return await a.get('https://www.sslproxies.org')  # 100


results = a.run(一, 二, 三)

for result in results:
    cells = result.html.find('td')
    p = ''
    for cell in cells:
        c = cell.text
        # c.lower().islower() is True only when the cell contains letters,
        # so this keeps just the IP and port cells
        if not c.lower().islower():
            if '.' in c:
def main():
    session = AsyncHTMLSession()
    tasks = [ping_creator(session, url) for url in load(file)]
    session.run(*tasks)
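# `ping_creator` and `load` are not defined in this snippet. A plausible
# sketch of ping_creator, assuming it wraps a single GET per URL into the
# zero-argument coroutine function that session.run expects:
def ping_creator(session, url):
    async def ping():
        r = await session.get(url)
        print(url, r.status_code)
    return ping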
# setup implied by the snippet: the time module and a shared async session
import time

from requests_html import AsyncHTMLSession

async_session = AsyncHTMLSession()


async def get_delay1():
    r = await async_session.get('https://httpbin.org/delay/1')
    return r


async def get_delay2():
    r = await async_session.get('https://httpbin.org/delay/2')
    return r


async def get_delay3():
    r = await async_session.get('https://httpbin.org/delay/3')
    return r


t1 = time.perf_counter()
results = async_session.run(get_delay1, get_delay2, get_delay3)

# Each item in the results list is a response object
# and can be interacted with as such
for result in results:
    response = result.html.url
    print(response)

t2 = time.perf_counter()
print(f'Asynchronous: {t2 - t1} seconds')
    assert numbers, 'Something went wrong'
    functions = []
    for number in numbers:
        for region in REGIONS:
            # bind region and number as defaults so each closure keeps its own pair
            async def fetch_page(region=region, number=number):
                response = await session.get(f'{URL}{region}/{number}')
                return response

            functions.append(fetch_page)
    return functions


session = AsyncHTMLSession()
results = session.run(*generate_requests(62))

for page in results:
    games = page.html.find('.bracket-game')
    for game in games:
        players = {}
        p_id = 0
        # a very specific selector is needed to keep bracket-popup
        # elements from being included
        for player_cell in game.find('div.bracket-game > div > div:first-child'):
            if 'bracket-popup' in player_cell.attrs['class']:
                continue
            # increment p_id only after checking for invalid elements
            p_id += 1