async def req():
    # Request the redirect page
    resp = await requests.get(MAIN_PAGE_URL)
    resp_text = await resp.text()
    # Extract the image URLs and request them concurrently
    image_urls = [f"{HOST}{image.get('src')}" for image in etree.HTML(resp_text).xpath('//img')]
    await asyncio.gather(*[requests.get(image_url) for image_url in image_urls])
    # Fetch the content page
    resp1 = await requests.get(MAIN_PAGE_URL)
    resp1_text = await resp1.text()
    # print(resp1_text)
    doc = etree.HTML(resp1_text)
    # Call the JS to generate the CSS
    # os.path.dirname(__file__) gives the directory containing the current .py file
    js = execjs.compile(open(f"{os.path.dirname(__file__)}/js/exam1.js", encoding="utf-8").read())
    css = base64.b64decode(js.call("get_css", resp1_text)).decode()
    print(css)
    # Parse the CSS and write the recovered values into the text of the span tags
    css_dict = css2dict(css)
    spans = doc.xpath('//span')
    for span in spans:
        span.text = css_dict.get(span.get("class"))
    # Remove p and script tags, see: https://stackoverflow.com/questions/7981840/how-to-remove-an-element-in-lxml
    for bad in doc.xpath("//body/p|//body/script"):
        bad.getparent().remove(bad)
    # Take all text under body via xpath, strip surrounding whitespace/newlines, and join into one string
    exam_text = "".join([text.strip() for text in doc.xpath('//body//text()')])
    print(exam_text)
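# css2dict is not defined in this snippet. Below is a minimal sketch of what it
# might look like, assuming the generated CSS maps class selectors to glyphs via
# rules like `.cls::before { content: "X"; }` (the exact shape of the CSS is an
# assumption, not taken from the original code).
import re

def css2dict(css: str) -> dict:
    """Map each CSS class name to the text carried in its content property."""
    pattern = re.compile(r'\.([\w-]+)\s*(?:::?before)?\s*\{[^}]*content\s*:\s*"([^"]*)"', re.S)
    return {cls: content for cls, content in pattern.findall(css)}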
def get_video(source):
    cheemses = ["https://www.youtube.com/channel/UChZWowQd_y6usuF7vSL4jmA"]
    channels = [
        "https://www.youtube.com/channel/UCYd6CmhFvvq6yruUBmGXjuA/videos",
        "https://www.youtube.com/channel/UCX2laRqGQhqoChYmlaUgOiw/videos",
        "https://www.youtube.com/user/wettitab/videos",
        "https://www.youtube.com/channel/UC38r7_x7oMPAZweB2fvGDXQ/videos",
        "https://www.youtube.com/channel/UC-xjitW_J39_Q1ure2HlJew/videos",
        "https://www.youtube.com/channel/UCHh-cQr-viOcimjPhxr3xRQ/videos",
        "https://www.youtube.com/channel/UCAJI1a4L0R5HkvTHTxZOd6g/videos",
        "https://www.youtube.com/user/shibainusaki/videos",
        "https://www.youtube.com/channel/UCOE2s_EwBM0es4TfC6ce7Fg/videos",
        "https://www.youtube.com/channel/UCkEdaRw8w0daEvGgzKff8TA",
        "https://www.youtube.com/channel/UC_WUkVnPROmHC1qnGHQAMDA",
        "https://www.youtube.com/channel/UChZWowQd_y6usuF7vSL4jmA",
    ]
    sources = {"shibes": channels, "cheems": cheemses}
    all_vids = []
    for i in sources[source]:
        url = i
        page = requests.get(url).content
        data = str(page).split(" ")
        item = 'href="/watch?'
        vids = [
            line.replace('href="', "youtube.com")
            for line in data
            if item in line
        ]  # list of all videos, each listed twice
        all_vids.extend(vids)
    return random.choice(all_vids)
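# Hypothetical usage of get_video: pick a random video URL from either channel
# list. Assumes `requests` and `random` are already imported at module level.
if __name__ == "__main__":
    print(get_video("shibes"))
    print(get_video("cheems"))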
async def request():
    # Using the plain aiohttp client
    with show_duration('aiohttp', extra_newline=True):
        async with aiohttp.ClientSession() as session:
            async with session.get('https://www.google.com') as response:
                content = await response.text()
                print('Status:', response.status)  # 200
                print('Length:', len(content))     # 10597

    # The above becomes a bit easier, without the nested context managers, using `aiohttp_requests`
    with show_duration('aiohttp_requests', extra_newline=True):
        response = await requests.get('https://www.google.com')
        content = await response.text()
        print('Status: ', response.status)  # 200
        print('Length: ', len(content))     # 10625

    # Now, let's do some concurrent requests
    with show_duration('100 concurrent requests'):
        status_count = defaultdict(int)
        get_futures = [
            requests.get('https://www.google.com') for _ in range(100)
        ]
        for get_future in asyncio.as_completed(get_futures):
            response = await get_future
            status_count[response.status] += 1
        print(status_count)  # {200: 100}
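# show_duration is not shown above. A minimal sketch of a compatible timing
# context manager, assuming it just prints the label and the elapsed wall-clock
# time (the extra_newline flag and the output format are assumptions):
import time
from contextlib import contextmanager

@contextmanager
def show_duration(label, extra_newline=False):
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"{label}: {time.perf_counter() - start:.2f}s")
        if extra_newline:
            print()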
async def req():
    resp = await requests.get(MAIN_PAGE_URL)
    resp_text = await resp.text()
    print(resp_text)
    image_urls = [
        f"{HOST}{image.get('src')}"
        for image in etree.HTML(resp_text).xpath('//img')
    ]
    await asyncio.gather(
        *[requests.get(image_url) for image_url in image_urls])
    resp1 = await requests.get(MAIN_PAGE_URL)
    resp1_text = await resp1.text()
    print('###############################')
    print(resp1_text)
    doc = etree.HTML(resp1_text)
    js = execjs.compile(
        open(f"{os.path.dirname(__file__)}/js/exam1.js",
             encoding="utf-8").read())
    css = base64.b64decode(js.call('get_css', resp1_text)).decode()
    print('###############################')
    print(css)
    css_dict = css2dict(css)
    spans = doc.xpath('//span')
    for span in spans:
        span.text = css_dict.get(span.get("class"))
    for bad in doc.xpath("//body/p|//body/script"):
        bad.getparent().remove(bad)
    exam_text = "".join(
        [text.strip() for text in doc.xpath('//body//text()')])
    print('###############################')
    print(exam_text)
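# The constants and imports used by req() live outside this snippet. A sketch
# of the module-level setup it appears to assume; the URL values here are
# placeholders, not taken from the original code:
import asyncio
import base64
import os

import execjs
from aiohttp_requests import requests
from lxml import etree

HOST = "http://example.com"        # placeholder host
MAIN_PAGE_URL = f"{HOST}/exam1"    # placeholder page URL

if __name__ == "__main__":
    asyncio.run(req())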
async def watch_current():
    sleep_amount = 3
    while True:
        prev = chain[-1] if chain else None
        current = prev['to'] if prev else initial
        print('check ' + current)
        try:
            response = await asyncio.wait_for(
                requests.get(
                    f"https://api.zksync.io/api/v0.1/account/{current}/history/0/15"
                ),
                timeout=15.0)
        except asyncio.TimeoutError:
            print('timeout!')
            await asyncio.sleep(2)
            continue
        data = await response.json()
        for tx in data:
            # print(json.dumps(tx, indent=4))
            created_at = calendar.timegm(parse(tx['created_at']).timetuple())
            if (tx['tx']['type'] == 'Transfer'
                    and tx['tx']['from'] == current
                    and tx['tx']['token'] == 'TBTC'
                    and (not prev or created_at > prev['timestamp'])):
                print("found tx -> " + tx['tx']["to"])
                chain.append({
                    "from": tx['tx']["from"],
                    "to": tx['tx']["to"],
                    "amount": tx['tx']["amount"],
                    "fee": tx['tx']["fee"],
                    "tx_id": tx['tx_id'],
                    "date": tx['created_at'],
                    "timestamp": created_at,
                })
                sleep_amount = 3
                break
        else:
            sleep_amount = 60
        await asyncio.sleep(sleep_amount)
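# watch_current() relies on module-level state (chain, initial) and imports
# that are not part of the snippet. A sketch of that surrounding setup, with
# the starting zkSync address left as a placeholder:
import asyncio
import calendar

from aiohttp_requests import requests
from dateutil.parser import parse

chain = []         # transfers discovered so far, in order
initial = "0x..."  # starting zkSync account address (placeholder)

if __name__ == "__main__":
    asyncio.run(watch_current())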
import asyncio

from aiohttp_requests import requests
import requests  # NOTE: shadows the aiohttp_requests import above, so requests.get below is the synchronous client

'''url = 'https://api.pushshift.io/reddit/comment/search/'

async def get_com(url, query):
    resp = await requests.get(url, params={'q': {query}})
    data = await resp.json()
    return data'''

'''url = 'https://api.pushshift.io/reddit/comment/search/'

def get_com(url, query):
    resp = requests.get(url, params={'q': {query}})
    data = resp.json()
    return data

print(get_com(url, "author",))'''

respon = requests.get('https://api.pushshift.io/reddit/comment/search/')
print(respon)
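# To actually use the async client here, the plain `import requests` would have
# to go, since it shadows aiohttp_requests. A sketch of running the commented-out
# async helper end to end; the query value is just an example, and the set
# literal around the param value has been dropped to pass a plain string:
import asyncio

from aiohttp_requests import requests

async def get_com(url, query):
    resp = await requests.get(url, params={'q': query})
    data = await resp.json()
    return data

if __name__ == "__main__":
    url = 'https://api.pushshift.io/reddit/comment/search/'
    print(asyncio.run(get_com(url, "python")))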