def __init__(self, proxy, loop, localsession, id=0):
    """Prepare one voting attempt that goes through ``proxy``.

    Args:
        proxy: Proxy address; also used as the seed of the fingerprint.
        loop: Event loop handed to the newly created CloudflareScraper.
        localsession: Externally supplied session, stored unchanged.
        id: Marks which voting attempt this is.
    """
    self.proxy = proxy
    self.loop = loop
    request_headers = genHeaders()
    self.session = CloudflareScraper(
        headers=request_headers,
        timeout=timeoutConfig,
        loop=loop,
    )
    self.localsession = localsession
    self.id = id  # marks which voting attempt this is
    # The fingerprint is the md5 hex digest of the proxy string plus a salt.
    salted_proxy = proxy + 'ChNeWi'
    self.fingerprint = md5(salted_proxy.encode()).hexdigest()
async def tiktok_video_no_watermark(self, url: str):
    """Ask ssstiktok.io for a watermark-free download link for ``url``.

    Args:
        url: Address of the TikTok video, sent as the ``id`` form field.

    Returns:
        ``{'url': <decoded link>}`` built from the first anchor with an
        ``href`` in the service's answer, or ``{'url': None}`` when the
        landing page is not 200, the expected form or its attributes are
        missing, the token regex does not match, or no link is found.
    """
    async with CloudflareScraper(
            headers={'user-agent': self._user_agent}) as session:
        page = await session.get('https://ssstiktok.io/ru')
        if page.status != 200:
            return {'url': None}

        # Parse the landing page to locate the request form.
        soup = BeautifulSoup(await page.text(), features="html.parser")
        form = soup.find(class_='pure-form pure-g hide-after-request')
        if form is None:
            # find() returns None when the page layout changed; treat this
            # as "no link" instead of crashing on the subscript below.
            return {'url': None}

        endpoint = form.get('data-hx-post')
        vals = form.get('include-vals')
        if not endpoint or not vals:
            return {'url': None}

        m = self._re_donor_no_wm.search(vals)
        if not m:
            return {'url': None}
        tt, ts = m.group('tt', 'ts')

        response = await session.post(
            f'https://ssstiktok.io{endpoint}',
            data={
                'id': url,
                'locale': 'ru',
                'tt': tt,
                'ts': ts
            })
        soup = BeautifulSoup(await response.text(), features="html.parser")
        # Only anchors that actually carry an href can hold the link.
        for link in soup.find_all('a', href=True):
            return {
                'url': base64_decode(link['href'].split('/dl?url=').pop())
            }
    return {'url': None}
async def test(id):
    """Instantiate the ccxt exchange named ``id`` and load its markets.

    Args:
        id: Attribute name of the exchange class on the ``ccxt`` module.

    Returns:
        The value returned by ``exchange.load_markets()``, or ``None`` when
        loading failed with a ``ccxt.BaseError`` (the error is printed).
    """
    print('Instantiating ' + id + ' exchange')
    session = CloudflareScraper(loop=asyncio.get_event_loop())
    markets = None
    try:
        # instantiate the exchange by id
        exchange = getattr(ccxt, id)({
            'timeout': 20000,
            'session': session,
        })
        try:
            # load all markets from the exchange
            markets = await exchange.load_markets()
        except ccxt.BaseError as e:
            print(type(e).__name__, str(e))
            print('Failed.')
        finally:
            # Close the exchange even when an unexpected error propagates.
            await exchange.close()
    finally:
        # The session is owned by this function, so always release it.
        await session.close()
    return markets
async def post(self, url, payload=None, headers: dict = None, loop=None):
    """POST ``payload`` to ``url`` with the stored CSRF token attached.

    Args:
        url: Target address.
        payload: Passed as ``data=`` to the session's ``post``.
        headers: Optional extra request headers. The mapping is copied, so
            neither the caller's dict nor a shared default is modified.
        loop: Event loop forwarded to ``CloudflareScraper``.

    Returns:
        The response body as text.
    """
    # Build a fresh dict per call: writing the token into a shared default
    # (or into the caller's mapping) would leak state between calls.
    request_headers = dict(headers) if headers else {}
    request_headers["XSRF-TOKEN"] = self.csrf_token
    async with CloudflareScraper(loop=loop) as session:
        async with session.post(url=url, data=payload,
                                headers=request_headers) as resp:
            data = await resp.text()
    return data
async def tiktok_by_url(self, url: str):
    """Look up a TikTok video from one of the supported URL shapes.

    The URL is matched against the desktop pattern and two mobile patterns.
    A desktop or first-mobile match yields the id directly. A second-mobile
    match is fetched and the final response URL is matched against the
    desktop pattern.

    Returns:
        The result of ``self.tiktok_by_id(...)``, or ``False`` when no
        pattern yields an id.
    """
    desktop_match = self._re_tiktok_pc_url.search(url)
    mobile_match = self._re_tiktok_mobile_url.search(url)
    short_match = self._re_tiktok_mobile_2_url.search(url)

    # Desktop takes precedence over the first mobile pattern.
    direct_match = desktop_match or mobile_match
    if direct_match:
        return await self.tiktok_by_id(int(direct_match.group('tiktok_id')))

    if not short_match:
        return False

    request_headers = {
        'authority': 'm.tiktok.com',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.9',
        'referrer': 'https://www.tiktok.com/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': self._user_agent
    }
    async with CloudflareScraper(loop=self._loop,
                                 headers=request_headers) as session:
        async with session.get(url) as response:
            # The URL of the final response is matched as a desktop URL.
            final_match = self._re_tiktok_pc_url.search(str(response.url))
            if not final_match:
                return False
            return await self.tiktok_by_id(
                int(final_match.group('tiktok_id')))
async def _create_client_session(self):
    """Create the CloudflareScraper client and store it on ``self._client``.

    Runs while holding ``self._creation_semaphore``.

    Raises:
        ImportError: if the ``aiocfscrape`` package cannot be imported.
    """
    async with self._creation_semaphore:
        try:
            from aiocfscrape import CloudflareScraper as scraper_factory
        except ImportError:
            raise ImportError('Cfcrawler requires aiocfscrape')
        else:
            self._client = scraper_factory()
async def get_election_offices():
    """Entry point of the New York scraper.

    Reads BASE_URL, schedules one scraping task per ``<area>`` element,
    writes the collected records to ``new_york.json`` and returns them.

    @return: list of scraped results, sorted by ``countyName``.
    """
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # read() is a coroutine, so it has to be awaited
            text = await s.read()
        soup = bS(text, "html5lib")

        # Every <area> element carries the county name and its page link.
        info_list = soup.findAll("area")
        counties = [info['alt'] for info in info_list]
        county_urls = [info['href'] for info in info_list]

        # One task per county; they run concurrently.
        tasks: List[Task] = [
            asyncio.create_task(scrape_one_county(session, name, link))
            for name, link in zip(counties, county_urls)
        ]

        master_list = []
        # Results are consumed in completion order, not submission order.
        for num_scraped, finished in enumerate(asyncio.as_completed(tasks),
                                               start=1):
            (
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            ) = await finished
            master_list.append(
                format_data_into_schema(
                    address,
                    county_website,
                    phone_number,
                    email_address,
                    county_name,
                ))
            print(f"[New York] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(counties)} .... "
                  f"[{round((num_scraped / len(counties)) * 100, 2)}%]")

    master_list.sort(key=lambda county: county['countyName'])

    output_path = os.path.join(ROOT_DIR, "scrapers", "new_york",
                               "new_york.json")
    with open(output_path, "w") as f:
        json.dump(master_list, f)
    return master_list
async def get(self, url, loop=None):
    """GET ``url`` and remember the XSRF token cookie of the response.

    Args:
        url: Address to fetch.
        loop: Event loop forwarded to ``CloudflareScraper``.

    Returns:
        The response body as text.
    """
    async with CloudflareScraper(loop=loop) as session:
        async with session.get(url) as resp:
            body = await resp.text()
            fresh_token = resp.cookies["XSRF-TOKEN"].value

    # Keep the stored token only when it is set and already up to date.
    token_is_current = self.csrf_token and self.csrf_token == fresh_token
    if not token_is_current:
        self.csrf_token = fresh_token
    return body
async def request(self,
                  url: str,
                  kwargs: dict = None,
                  return_bytes=False,
                  payload=None) -> dict:
    """Sign ``url`` and perform the request with browser-like headers.

    Args:
        url: Address to request; it is replaced by the signed URL returned
            from ``self._browser.signature``.
        kwargs: Extra parameters forwarded to the signer and to
            ``self.captcha``. A fresh dict is used when omitted.
        return_bytes: When true, the GET branch returns
            ``response.content`` instead of decoding JSON.
        payload: When not ``None``, a POST with ``json=payload`` is made and
            the response text is returned.

    Returns:
        The response text (POST), ``response.content`` (``return_bytes``),
        the decoded JSON when its ``code`` is not ``'10000'``, or otherwise
        whatever ``self.captcha`` returns.

    Raises:
        Exception: ``'Invalid Response!!!'`` when decoding or captcha
            handling fails; the original error is chained as the cause.
    """
    # Avoid sharing one default dict between calls: it is handed to helpers
    # that are free to modify it.
    if kwargs is None:
        kwargs = {}

    # NOTE(review): cookies are read from ``self.browser`` while signing
    # uses ``self._browser`` -- confirm both attributes exist.
    cookie_header = ';'.join([
        f'{key}={value}'
        for key, value in self.browser.cookies.items()
    ])
    async with CloudflareScraper(
            loop=self._loop,
            headers={
                'authority': 'm.tiktok.com',
                'accept': 'application/json, text/plain, */*',
                'accept-encoding': 'gzip, deflate',
                'accept-language': 'en-US,en;q=0.9',
                'referrer': 'https://m.tiktok.com/',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-site',
                'user-agent': self._user_agent,
                'cookie': cookie_header
            }) as session:
        url = await self._browser.signature(url, kwargs)

        if payload is not None:
            async with session.post(url, json=payload) as response:
                return await response.text()

        async with session.get(url) as response:
            if return_bytes:
                # NOTE(review): this hands back the response's content
                # stream, not bytes, and the response context is left right
                # after -- confirm callers do not need
                # ``await response.read()`` here.
                return response.content
            try:
                _json = await response.json(content_type=None)
                code = _json.get('code', -1)
                if code != '10000':
                    return _json
                return await self.captcha(_json, url, kwargs, return_bytes)
            except Exception as e:
                logging.error(e, exc_info=True)
                print(
                    f'Failed on {url}; Converting to json error; Text: {await response.text()}'
                )
                # Chain the cause so the real failure stays visible.
                raise Exception('Invalid Response!!!') from e
async def coin_name(self, symbol: str) -> str:
    """Return the full currency name listed by liqui.io for ``symbol``.

    Raises:
        LiquiPairNamesException: when the currency list cannot be fetched
            or decoded, or when no non-empty name is found for ``symbol``.
    """
    try:
        async with CloudflareScraper() as session:
            async with session.get(
                    'https://liqui.io/Market/Currencies/') as resp:
                currencies = await resp.json()
    except Exception as e:
        raise LiquiPairNamesException(e)

    # The first entry whose symbol matches wins.
    matched_name = None
    for entry in currencies:
        if entry['Symbol'] == symbol:
            matched_name = entry['Name']
            break

    if matched_name:
        return matched_name
    raise LiquiPairNamesException(f'cannot find coin {symbol!r}')
async def get_election_offices():
    """Entry point of the Florida scraper.

    Reads BASE_URL, schedules one scraping task per county, writes the
    collected records to ``florida.json`` and returns them.

    @return: list of scraped results, in task completion order.
    """
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # read() is a coroutine, so it has to be awaited
            text = await s.read()
        soup = bS(text.decode("utf-8"), "html.parser")

        test_county_data = get_county_codes_and_names(soup)
        county_data = sorted(test_county_data,
                             key=lambda k: k["countyName"])

        # One task per county; they run concurrently.
        tasks: List[Task] = [
            asyncio.create_task(
                scrape_one_county(session, county["countyCode"],
                                  county["countyName"]))
            for county in county_data
        ]

        master_list = []
        # Results are consumed in completion order, not submission order.
        for num_scraped, finished in enumerate(asyncio.as_completed(tasks),
                                               start=1):
            # The third element of the result tuple is not used here.
            cleaned_string, protected_email, _, county_name = await finished
            master_list.append(
                format_data_into_schema(cleaned_string, protected_email,
                                        county_name))
            print(
                f"[Florida] Scraped {county_name} county: "
                f"#{num_scraped} of {len(county_data)} .... "
                f"[{round((num_scraped / len(county_data)) * 100, 2)}%]"
            )

    output_path = os.path.join(ROOT_DIR, "scrapers", "florida",
                               "florida.json")
    with open(output_path, "w") as f:
        json.dump(master_list, f)
    return master_list
async def cs_page(url):
    """Fetch ``url`` through a CloudflareScraper session and return its text."""
    async with CloudflareScraper() as session:
        async with session.get(url) as resp:
            page_text = await resp.text()
            return page_text
async def read_logs():
    """Download new SCUM server log lines from g-portal into local files.

    Loads the configuration (falling back to an empty one), makes sure every
    expected key exists, logs in, lists the available log files and appends
    to ``<folder><log name>`` every line that comes after the last line
    recorded in the configuration. The newest file name and last line per
    log type are stored back into the configuration.

    Any error raised while talking to the server or processing the logs is
    reported through ``log`` followed by ``help()``; it is not propagated.
    """
    global configini

    values = ('user', 'password', 'serverid', 'loc', 'folder', 'admin_file',
              'admin_line', 'chat_file', 'chat_line', 'kill_file',
              'kill_line', 'login_file', 'login_line', 'violations_file',
              'violations_line')
    print(
        'scumlogs v1.0, scum server logs downloader from gportal\nby htttps://GAMEBotLand.com'
    )

    try:
        load_configini()
    except Exception:
        # Start from an empty configuration when it cannot be loaded.
        # ``Exception`` (not a bare except) lets Ctrl-C and task
        # cancellation through.
        configini = {}

    for value in values:
        if value not in configini:
            configini[value] = ''

    # Make sure a configured folder ends with a path separator.
    if configini['folder'] != '':
        if configini['folder'][-1:] != '/' and configini['folder'][-1:] != '\\':
            configini['folder'] = configini['folder'] + '/'
    save_configini()

    # NOTE(review): the URLs are built from configini['loc'] as stored; an
    # empty value is not replaced by a default domain suffix.
    URL_LOGIN = '******'.format(
        configini['loc'])
    URL_LOGS = 'https://www.g-portal.{}/en/scum/logs/{}'.format(
        configini['loc'], configini['serverid'])

    async with CloudflareScraper() as session:
        try:
            log('connecting g-portal...')
            payload = {
                '_method': 'POST',
                'login': configini['user'],
                'password': configini['password'],
                'rememberme': '1'
            }
            async with session.post(URL_LOGIN, data=payload) as raw_response:
                response = await raw_response.text()
            async with session.get(URL_LOGS) as raw_response:
                response = await raw_response.text()

            html = BeautifulSoup(response, 'html.parser')
            select = html.find('div', {'class': 'wrapper logs'})
            logs = json.loads(select['data-logs'])

            # Entries are keyed "file_1" .. "file_<n>".
            for number in range(1, len(logs) + 1):
                remote_path = logs["file_" + str(number)]
                # File name: everything 5 characters past the 'Logs' marker.
                log_name = remote_path[int(remote_path.find('Logs')) + 5:]
                log_kind = log_name.split('_')[0]

                # Skip files that sort before the last one processed.
                last_file = configini[log_kind + '_file']
                if last_file != '' and log_name < last_file:
                    continue

                payload = {
                    '_method': 'POST',
                    'load': 'true',
                    'ExtConfig[config]': remote_path
                }
                async with session.post(URL_LOGS,
                                        data=payload) as raw_response:
                    response = await raw_response.text()
                content = json.loads(response)
                lines = content["ExtConfig"]["content"].splitlines()

                found = False
                writing = False
                # The context manager closes the file even if a write fails.
                with open(configini['folder'] + log_name, "a+",
                          encoding='utf-8') as log_file:
                    for line in lines:
                        if log_name == last_file and not found:
                            # Still inside the part that was saved earlier:
                            # skip lines up to and including the recorded
                            # last line.
                            if line == configini[log_kind + '_line']:
                                found = True
                        else:
                            log_file.write(line + '\n')
                            writing = True

                if writing:
                    if found:
                        log('updating {}'.format(log_name))
                    else:
                        log('creating {}'.format(log_name))

                configini[log_kind + '_file'] = log_name
                configini[log_kind + '_line'] = lines[-1]

            save_configini()
        except Exception:
            log('error connecting, check connectivity and scumlogs.ini')
            help()
    # No explicit session.close() is needed: leaving the ``async with``
    # block has already closed the session.
async def __get_js(self):
    """Download the ``acrawler.js`` script and return its source text."""
    script_url = 'https://sf-tb-sg.ibytedtos.com/obj/rc-web-sdk-sg/acrawler.js'
    async with CloudflareScraper(loop=self._loop, headers={}) as session:
        async with session.get(script_url) as response:
            script_source = await response.text()
            return script_source
async def url_2_image(url: str):
    """Fetch ``url`` and return the raw body returned by ``response.read()``."""
    async with CloudflareScraper() as session:
        async with session.get(url) as response:
            raw_body = await response.read()
            return raw_body
future.add_done_callback(functools.partial(printer)) task = self.PostFingerprint() future = asyncio.ensure_future(task, loop=self.loop) future.add_done_callback(functools.partial(printer)) return ('%d %s %s' % (self.id, self.proxy, result)) if ('refresh' in result): #session expired等各种原因 print('%d %s %s %s' % (self.id, self.proxy, result, '开始重试整个投票流程')) await self.Vote() #再来一次! #if('An entry' in result): #这个ip被抢先投票 return ('%d %s %s' % (self.id, self.proxy, result)) #结束投票 except RetryExhausted: return ('%d %s %s' % (self.id, self.proxy, '连续重试次数超限')) except (aiohttp.ClientError, asyncio.TimeoutError): return ('%d %s %s' % (self.id, self.proxy, '代理可能失效,放弃治疗')) def Launch(self): vote = self.Vote() vote_future = asyncio.ensure_future(vote, loop=self.loop) vote_future.add_done_callback(functools.partial(printer)) ## res=await vote_future ## if(res==300): ## vote_future.add_done_callback(functools.partial(self.Launch)) if __name__ == '__main__': voter = Voter('192.168.1.1:9999', asyncio.get_event_loop(), CloudflareScraper()) print(voter)
future=asyncio.ensure_future(task,loop=self.loop) future.add_done_callback(functools.partial(printer)) task=self.PostFingerprint() future=asyncio.ensure_future(task,loop=self.loop) future.add_done_callback(functools.partial(doNothing)) return('%d %s %s'%(self.id,self.proxy,result)) if('refresh' in result): #session expired等各种原因 print('%d %s %s %s'%(self.id,self.proxy,result,'开始重试整个投票流程')) if random.random()<0.6: await self.Vote() #再来一次! else: return('%d %s %s %s'%(self.id,self.proxy,result,'放弃治疗')) #结束投票 #if('An entry' in result): #这个ip被抢先投票 return('%d %s %s'%(self.id,self.proxy,result)) #结束投票 except RetryExhausted: return('%d %s %s'%(self.id,self.proxy,'连续重试次数超限')) except (aiohttp.ClientError,asyncio.TimeoutError): return('%d %s %s'%(self.id,self.proxy,'代理可能失效,放弃治疗')) def Launch(self): vote=self.Vote() vote_future=asyncio.ensure_future(vote,loop=self.loop) vote_future.add_done_callback(functools.partial(printer)) ## res=await vote_future ## if(res==300): ## vote_future.add_done_callback(functools.partial(self.Launch)) if __name__=='__main__': voter=Voter('192.168.1.1:9999',asyncio.get_event_loop(),CloudflareScraper()) print(voter)
class Voter:
    """Run one complete ISML voting flow through a single proxy.

    Attributes set by ``__init__``:
        proxy: proxy passed to every proxied request.
        loop: event loop used for the session and for spawned tasks.
        session: CloudflareScraper used for proxied requests.
        localsession: externally supplied session used without a proxy.
        id: marks which voting attempt this is.
        fingerprint: md5 hex digest derived from the proxy string.

    Design note kept from the original author: a global retry budget
    (give up once too many connection failures accumulate after a global
    timeout) was considered but judged not useful, so it is not implemented.
    """

    def __init__(self, proxy, loop, localsession, id=0):
        """Store the collaborators and create the proxied session.

        Args:
            proxy: Proxy address, also the seed of ``self.fingerprint``.
            loop: Event loop for the session and background tasks.
            localsession: Session that talks directly, without a proxy.
            id: Marks which voting attempt this is.
        """
        self.proxy = proxy
        self.loop = loop
        self.session = CloudflareScraper(headers=genHeaders(),
                                         timeout=timeoutConfig,
                                         loop=loop)
        self.localsession = localsession
        self.id = id  # marks which voting attempt this is
        # PostFingerprint relies on this being set at construction time.
        self.fingerprint = md5((proxy + 'ChNeWi').encode()).hexdigest()

    # ``retry`` is used as a decorator while the class body is executed, so
    # at that point it is a plain function and must NOT take ``self``:
    # otherwise the first exception type would be bound to ``self`` and
    # silently dropped from the retried set. It is wrapped in
    # ``staticmethod`` at the end of the class so that both
    # ``Voter.retry(...)`` and ``instance.retry(...)`` keep working.
    def retry(*exceptions, retries=5, cooldown=0):
        """Decorate an async function to run it a few times before giving up.

        Args:
            *exceptions: Exception types that trigger another attempt.
            retries (int): Number of failed attempts after which
                ``RetryExhausted`` is raised.
            cooldown (int): Seconds to wait before the next attempt.

        Raises (from the decorated function):
            RetryExhausted: after ``retries`` consecutive failures.
        """

        def wrap(func):
            @wraps(func)
            async def inner(*args, **kwargs):
                retries_count = 0
                while True:
                    try:
                        return await func(*args, **kwargs)
                    except exceptions as err:
                        retries_count += 1
                        if retries_count >= retries:
                            print('已连续错误%d次,放弃治疗' % (retries))
                            raise RetryExhausted from err
                        print('出现错误:{}. 正在重试{}/{}'.format(
                            err, retries_count, retries))
                        await asyncio.sleep(cooldown)

            return inner

        return wrap

    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _get(self, url, timeout=timeoutConfig):
        """GET ``url`` through the proxy.

        Returns text when the content type contains ``'text'``, otherwise
        the raw body bytes. Statuses >= 400 go through
        ``raise_for_status()``.
        """
        async with self.session.get(url, proxy=self.proxy,
                                    timeout=timeout) as response:
            body = await response.read()
            if response.status < 400:
                if 'text' in response.content_type:
                    return await response.text(errors='ignore')
                return body
            response.raise_for_status()
            print('get连续失败太多次!')

    # Direct GET without the proxy. The original author advises against
    # using it on the ISML site for now: one shared local session would
    # issue very many requests and could hit different firewall challenges
    # if the firewall is switched on midway.
    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _localget(self, url, timeout=captchaTimeoutConfig):
        """GET ``url`` from the local IP (no proxy).

        Returns utf-8 decoded text when the content type contains
        ``'text'``, otherwise the raw body bytes.
        """
        async with self.localsession.get(url, timeout=timeout) as response:
            body = await response.read()
            if response.status < 400:
                if 'text' in response.content_type:
                    return body.decode(encoding='utf-8')
                return body
            response.raise_for_status()
            print('localget连续失败太多次!')

    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _post(self, url, data, timeout=timeoutConfig):
        """POST ``data`` to ``url`` through the proxy and return the text."""
        async with self.session.post(url, data=data, proxy=self.proxy,
                                     timeout=timeout) as response:
            text = await response.text()
            if response.status < 400:
                return text
            response.raise_for_status()
            print('post连续失败太多次!')

    # Local POST to the captcha server, without the proxy. Only meant for
    # captcha requests: it returns text and is not suitable for binary
    # responses.
    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _localpost(self, url, data, timeout=timeoutConfig):
        """POST ``data`` to ``url`` from the local IP and return the text.

        NOTE(review): a successful status with the body ``'!'`` is not
        returned; ``raise_for_status()`` does not raise for it, so the
        method prints a message and returns ``None`` -- confirm whether a
        retry was intended.
        """
        async with self.localsession.post(url, data=data,
                                          timeout=timeout) as response:
            text = await response.text()
            if response.status < 400 and text != '!':
                return text
            response.raise_for_status()
            print('localpost连续失败太多次!验证码服务器可能有严重问题!')

    async def EnterISML(self):
        """Open the voting page and extract the voting token.

        On success stores ``self.html``, ``self.voting_token`` and
        ``self.startTime``.

        Raises:
            NoVotingToken: when ``repattern`` does not match the page.
        """
        text = await self._get('http://www.internationalsaimoe.com/voting')
        voting_token = re.search(repattern, text)
        if voting_token:
            self.html = text
            self.voting_token = voting_token.group(1)
            self.startTime = time.time()
            print(self.id, self.proxy, '进入ISML成功')
        else:
            print(self.id, self.proxy, '找不到voting_token')
            raise NoVotingToken

    # Posting the fingerprint and solving the captcha are independent and
    # could run concurrently.
    async def PostFingerprint(self):
        """Send ``self.fingerprint`` (set in ``__init__``) to the site."""
        await self._post("https://www.internationalsaimoe.com/security",
                         data={"secure": self.fingerprint})
        print(self.id, self.proxy, '发指纹成功')
        return '%d %s 发指纹成功' % (self.id, self.proxy)

    async def AIDeCaptcha(self):
        """Fetch captchas until one is judged recognisable, then solve it.

        Each image is downloaded, converted to an inverted grayscale array
        and checked with ``judge``. The first accepted image is posted to
        the next captcha server from ``csGen``. The number of attempts is
        currently unbounded.

        Returns:
            The captcha text, also stored as ``self.captcha``.
        """
        tries = 0
        while True:
            tries += 1
            raw_img = await self._get(
                'https://www.internationalsaimoe.com/captcha/%s/%s' %
                (self.voting_token, int(time.time() * 1000)),
                timeout=captchaTimeoutConfig)
            img = Image.open(BytesIO(raw_img))
            img = 255 - np.array(img.convert('L'))  # inverted grayscale
            if judge(img):
                del img
                print(self.id, self.proxy, '第%d次获取验证码,能够识别' % (tries))
                captcha = await self._localpost(next(csGen), raw_img)
                self.captcha = captcha
                return captcha

    async def DeCaptcha(self):
        """Placeholder for delegating the captcha to a solving service.

        Raises:
            NotImplementedError: always; this variant is not written yet.
                (It is a ``RuntimeError`` subclass, which is what the
                previous bare ``raise`` produced.)
        """
        print('丢给打码平台的版本还未完成!')
        await asyncio.sleep(0)
        raise NotImplementedError('丢给打码平台的版本还未完成!')

    async def Submit(self):
        """Submit the vote once 120 seconds have passed since entering.

        Returns:
            The response text of the submit request.
        """
        postdata = selector(self.html, self.voting_token, self.captcha)
        # Remaining part of the 120-second wait, counted from EnterISML.
        sleepTime = 120 - (time.time() - self.startTime)
        if sleepTime > 0:
            print(self.id, self.proxy, '开始等待%d秒' % (sleepTime))
            await asyncio.sleep(sleepTime)
        print(self.id, self.proxy, '开始Submit')
        result = await self._post(
            "https://www.internationalsaimoe.com/voting/submit",
            data=postdata)
        return result

    async def SaveHTML(self):
        """Save the voting page as a receipt under ``./HTML/<captcha>.html``.

        Returns:
            A status string; file errors are reported in it rather than
            raised.
        """
        text = await self._get('https://www.internationalsaimoe.com/voting')
        try:
            # ``with`` closes the file even when the write itself fails.
            with open('./HTML/%s.html' % (self.captcha), 'w',
                      encoding=('utf-8')) as f:
                f.write(text)
            return '%d %s 存票根成功' % (self.id, self.proxy)
        except Exception:
            return '%d %s 由于硬盘原因,存票根失败,可能硬盘过载!!!!!' % (self.id,
                                                          self.proxy)

    async def Vote(self):
        """Run the whole voting flow; normally started through ``Launch``.

        Returns:
            A status string ``"<id> <proxy> <outcome>"``.
        """
        try:
            await self.EnterISML()
        except NoVotingToken:
            return '%d %s %s' % (self.id, self.proxy, '找不到voting_token')
        except RetryExhausted:
            return '%d %s %s' % (self.id, self.proxy, '连续重试次数超限')
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return '%d %s %s' % (self.id, self.proxy, '代理可能失效,放弃治疗')

        try:
            await self.PostFingerprint()
            await self.AIDeCaptcha()
            # Waits for the remaining time, then submits.
            result = await self.Submit()

            # Wrong captcha: solve a new one and submit again, at most
            # twice. ``result`` must hold the submit response -- not the
            # captcha text -- for the checks below to be meaningful.
            for _ in range(2):
                if 'Invalid' not in result:
                    break
                await self.AIDeCaptcha()
                result = await self.Submit()

            if 'successful' in result:
                # Save the receipt and re-post the fingerprint in the
                # background; neither is awaited here.
                future = asyncio.ensure_future(self.SaveHTML(),
                                               loop=self.loop)
                future.add_done_callback(printer)
                future = asyncio.ensure_future(self.PostFingerprint(),
                                               loop=self.loop)
                future.add_done_callback(doNothing)
                return '%d %s %s' % (self.id, self.proxy, result)

            if 'refresh' in result:
                # Session expired or similar: sometimes restart the flow.
                print('%d %s %s %s' %
                      (self.id, self.proxy, result, '开始重试整个投票流程'))
                if random.random() < 0.6:
                    # Report the outcome of the new attempt instead of the
                    # stale 'refresh' response.
                    return await self.Vote()
                return '%d %s %s %s' % (self.id, self.proxy, result, '放弃治疗')

            return '%d %s %s' % (self.id, self.proxy, result)
        except RetryExhausted:
            return '%d %s %s' % (self.id, self.proxy, '连续重试次数超限')
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return '%d %s %s' % (self.id, self.proxy, '代理可能失效,放弃治疗')

    def Launch(self):
        """Schedule ``Vote`` on ``self.loop`` and print its result when done."""
        vote_future = asyncio.ensure_future(self.Vote(), loop=self.loop)
        vote_future.add_done_callback(printer)

    # Expose the decorator factory as a static method now that every
    # in-class use of it has been evaluated.
    retry = staticmethod(retry)
async def get_election_offices():
    """Entry point of the Georgia scraper.

    Reads REGISTRAR_URL, schedules one scraping task per ``<option>`` of the
    ``idTown`` form field, writes the collected records to ``georgia.json``
    and returns them.

    @return: list of scraped results, in task completion order.
    """
    async with CloudflareScraper() as session:
        async with session.get(REGISTRAR_URL) as s:
            # read() is a coroutine, so it has to be awaited
            text = await s.read()
        soup = bS(text, "html5lib")

        # The options of the first "idTown" element list every county.
        town_field = soup.findAll(attrs={"name": "idTown"})[0]
        county_option_list = town_field.findAll("option")
        id_list = [
            county_option["value"] for county_option in county_option_list
        ]
        county_list = [
            county_option.string for county_option in county_option_list
        ]

        # One task per county; they run concurrently.
        tasks: List[Task] = [
            asyncio.create_task(
                scrape_one_county(session, county_id, county_label))
            for county_id, county_label in zip(id_list, county_list)
        ]

        master_list = []
        # Results are consumed in completion order, not submission order.
        for num_scraped, finished in enumerate(asyncio.as_completed(tasks),
                                               start=1):
            (
                registrar_name,
                phys_address,
                mail_address,
                phone_number,
                email_address,
                county_name,
            ) = await finished
            master_list.append(
                format_data_into_schema(
                    registrar_name,
                    phys_address,
                    mail_address,
                    phone_number,
                    email_address,
                    county_name,
                ))
            print(f"[Georgia] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(county_list)} .... "
                  f"[{round((num_scraped / len(county_list)) * 100, 2)}%]")

    output_path = os.path.join(ROOT_DIR, "scrapers", "georgia",
                               "georgia.json")
    with open(output_path, "w") as f:
        json.dump(master_list, f)
    return master_list
async def read_logs():
    """Fetch new chat and kill log lines from g-portal and return them.

    Loads the configuration (falling back to an empty one), makes sure every
    expected key exists, logs in, lists the available log files and, for the
    'chat' and 'kill' logs only, collects every line that comes after the
    last line recorded in the configuration. The newest file name and last
    line per log type are stored back into the configuration.

    Returns:
        ``([chat_lines, kill_lines], True)`` when at least one new line was
        collected, otherwise ``([], False)``. ``([], False)`` is also
        returned when an ``Exception`` is raised while talking to the server
        or processing the logs (the traceback is printed).
    """
    result_chat_lines = []
    result_kill_lines = []
    # Configuration keys that must exist; missing ones default to ''.
    values = ('user', 'password', 'serverid', 'loc', 'folder', 'admin_file',
              'admin_line', 'chat_file', 'chat_line', 'kill_file',
              'kill_line', 'login_file', 'login_line', 'violations_file',
              'violations_line')
    try:
        load_configini()
    except Exception:
        # Start from an empty configuration when loading fails.
        global configini
        configini = {}
    for value in values:
        if value not in configini:
            configini[value] = ''
    # Make sure a configured folder ends with a path separator.
    if configini['folder'] != '':
        if configini['folder'][-1:] != '/' and configini['folder'][-1:] != '\\':
            configini['folder'] = configini['folder'] + '/'
    save_configini()
    URL_LOGIN = '******'.format(
        configini['loc'])
    URL_LOGS = 'https://www.g-portal.{}/en/scum/logs/{}'.format(
        configini['loc'], configini['serverid'])
    async with CloudflareScraper() as session:
        try:
            log('connecting g-portal...')
            payload = {
                '_method': 'POST',
                'login': configini['user'],
                'password': configini['password'],
                'rememberme': '1'
            }
            # Log in first; the response body of the login is not used.
            async with session.post(URL_LOGIN, data=payload) as raw_response:
                response = await raw_response.text()
            async with session.get(URL_LOGS) as raw_response:
                response = await raw_response.text()
            html = BeautifulSoup(response, 'html.parser')
            select = html.find('div', {'class': 'wrapper logs'})
            loglist = select['data-logs']
            logs = json.loads(loglist)
            # Entries are keyed "file_1" .. "file_<n>".
            for i in range(len(logs)):
                getid = logs["file_" + str(i + 1)]
                # File name: everything 5 characters past the 'Logs' marker.
                id = (getid[int(getid.find('Logs')) + 5:])
                # The log type is the file-name prefix before the first '_'.
                type = id.split('_')[0]
                if type in ['chat', 'kill']:
                    # Skip files that sort before the last one processed.
                    if configini[type + '_file'] != '':
                        if id < configini[type + '_file']:
                            continue
                    payload = {
                        '_method': 'POST',
                        'load': 'true',
                        'ExtConfig[config]': getid
                    }
                    async with session.post(URL_LOGS,
                                            data=payload) as raw_response:
                        response = await raw_response.text()
                    content = json.loads(response)
                    lines = content["ExtConfig"]["content"].splitlines()
                    found = False
                    writing = False
                    for line in lines:
                        # Replace every "%" with ";" in the stored copy of
                        # the line, because .ini files can't save the "%"
                        # symbol. Only ``lines`` is updated; ``line`` itself
                        # keeps the original text.
                        if "%" in line:
                            ready_line = ""
                            # continue
                            # NOTE: this inner loop reuses the name ``i`` of
                            # the outer index loop.
                            for i in line:
                                if i == "%":
                                    ready_line += ";"
                                else:
                                    ready_line += i
                            # index() returns the first equal line.
                            index = lines.index(line)
                            lines[index] = ready_line
                        if id == configini[type + '_file'] and not found:
                            # Still inside the part handled earlier: skip
                            # lines up to and including the recorded one.
                            if line == configini[type + '_line']:
                                found = True
                                continue
                            # The recorded line was saved with "%" replaced
                            # by ";", so compare the replaced form as well.
                            elif line.find("%") > -1:
                                ready_line = ""
                                for i in line:
                                    if i == "%":
                                        ready_line += ";"
                                    else:
                                        ready_line += i
                                if ready_line == configini[type + "_line"]:
                                    found = True
                                    continue
                        else:
                            # New line: collect the original text (with any
                            # "%" intact) in the list for its log type.
                            if type == "chat":
                                result_chat_lines.append(line)
                            else:
                                result_kill_lines.append(line)
                            writing = True
                    if writing:
                        if found:
                            log('updating {}'.format(id))
                        else:
                            log('creating {}'.format(id))
                    # file.close()
                    # Remember where to resume next time.
                    configini[type + '_file'] = id
                    configini[type + '_line'] = lines[-1]
            save_configini()
            if not result_chat_lines and not result_kill_lines:
                return [], False
            return [result_chat_lines, result_kill_lines], True
        except Exception:
            print(traceback.format_exc())
            return [], False
        # NOTE(review): appears unreachable -- both the try body and the
        # except handler end in a return; the ``async with`` block closes
        # the session anyway.
        await session.close()
"Accept-Language": "zh-CN,zh;q=0.8", "Upgrade-Insecure-Requests": "1", "User-Agent": uaGen.random, } ##def printer(future): ## print(future.result()) request_timeout = 50 #单次http请求的默认超时。 #你可以随时暂时覆盖这一设置 captcha_timeout = 60 #单次取验证码的默认超时 timeoutConfig = aiohttp.ClientTimeout(total=request_timeout) captchaTimeoutConfig = aiohttp.ClientTimeout(total=captcha_timeout) localsession = CloudflareScraper(headers=headers, loop=worker_loop, timeout=timeoutConfig) #async def localsession_get(url='https://coinone.co.kr/'):#珂以测试防火墙 async def localsession_get(url="https://www.internationalsaimoe.com"): async with localsession.get(url) as res: text = await res.text() return ('Ignaleo:本地session请求%s,状态码为%d' % (url, res.status)) #print('Ignaleo:本地session请求%s,状态码为%d'%(url,res.status)) #return res.status ## await asyncio.sleep(80) ## async with localsession.post(url,data=b'test',ssl=False) as res: ## text = await res.text()