def all_properties():
    """Collect property metadata from the AWS properties reference page.

    Returns an OrderedDict keyed by property name; each value carries the
    whitespace-normalised link title and the link's href (stored under
    "description").
    """
    page = get_pq(PROPERTIES_REFERENCE)
    result = OrderedDict()
    for anchor in page('#divContent li a'):
        href = q(anchor).attr("href")
        result[property_name_from_href(href)] = {
            "title": " ".join(anchor.text.split()),
            "description": href,
        }
    return result
async def on_callback_query(self, msg):
    """Handle inline-keyboard callbacks: fetch the selected Appmedia
    ranking table and send it back as a Markdown message.

    Fixes:
      * ``rank_url`` was unbound (NameError) for callback data other than
        the two known Appmedia payloads — now mapped via a dict with an
        early return.
      * ``str.rstrip('.png')`` strips *characters*, not the suffix, and
        could eat a trailing 'p'/'n'/'g' from a rank name — replaced with
        an explicit suffix check.
    """
    query_id, from_id, data = telepot.glance(msg, flavor='callback_query')
    print('Callback query:', query_id, from_id, data)
    if not self._message_with_inline_keyboard:
        return
    rank_urls = {
        "appmedia_ssr": "https://appmedia.jp/fategrandorder/96261",
        "appmedia_sr": "https://appmedia.jp/fategrandorder/93558",
    }
    rank_url = rank_urls.get(data)
    if rank_url is None:
        # Unknown callback payload: nothing to do.
        return
    r = requests.get(rank_url)
    if r.status_code != 200:
        return
    rq = q(r.content.decode("utf-8"))
    tables = rq(".post-content > table")
    table_rank = q(tables[6])
    # Skip the header row (thead).
    table_trs = table_rank("tr")[1:]
    reply = "%s\n" % rank_url
    reply += "サーヴァント 総合 高難 周回\n"
    for row in table_trs:
        qrow = q(row)
        # Only data rows have exactly 5 cells.
        if len(qrow.children()) != 5:
            continue
        cells = qrow("td")
        link = qrow("a")
        icon = qrow("img")[2]
        # Rank is encoded in the icon filename, e.g. .../A-1.png.
        rank = q(icon).attr("src").split('/')[-1]
        if rank.endswith('.png'):
            rank = rank[:-len('.png')]
        if rank == "A-1":
            rank = "A+"
        reply += "[{}]({}) {} {} {}\n".format(
            q(cells[0]).text(), link.attr("href"), rank,
            q(cells[3]).text(), q(cells[4]).text())
    await self.sender.sendMessage(reply, parse_mode="Markdown")
def parse_properties_from_href(href):
    """Parse a resource documentation page into (properties, required).

    ``properties`` maps each property name to its parsed type schema;
    ``required`` lists the dt texts whose docs contain "Required : Yes"
    (excluding the conditional VPC-security-group wording).
    """
    page = tools.get_pq(href)
    # Keep only definition lists that actually declare a type.
    typed_dls = page('#main-col-body .variablelist dl').filter(
        lambda i: 'Type :' in q(this).text()
    )
    term_def_pairs = [
        (q(dt), q(dd))
        for dt, dd in zip(typed_dls.children('dt'), typed_dls.children('dd'))
    ]
    properties = OrderedDict()
    for term, definition in term_def_pairs:
        properties[term.text().split()[0]] = get_type(term, definition)
    required = []
    for term, definition in term_def_pairs:
        required_hits = definition('p').filter(
            lambda i: 'Required : Yes' in q(this).text()
            and not 'Yes, for VPC security groups' in q(this).text()
        )
        if required_hits:
            required.append(term.text())
    return properties, required
def get_page_info(url):
    """Fetch a qq.com news page and return a plain-text summary.

    The summary contains title, URL, author/media, publish time, tags
    (parsed from a JSON blob embedded in the page's last <script>), and
    the article paragraphs.

    Fixes:
      * local ``re`` shadowed the regex module — renamed to ``resp``;
      * ``json.loads(..., encoding=...)`` raises TypeError on Python
        3.9+ (the kwarg was removed) — dropped;
      * a swallowed JSONDecodeError left ``j`` unbound and crashed with
        NameError — now returns the partial text instead.
    """
    t = ''
    # Fetch the page.
    resp = requests.get(url)
    info = q(resp.text)
    # Title.
    title = info('body > div.qq_conent.clearfix > div.LEFT > h1').text()
    t += title + '\n\n' + url + '\n'
    # The page embeds a JSON object with article metadata in its last
    # <script>; slice off the leading assignment prefix (14 chars).
    raw_json = info('head > script:last').text()[14:]
    try:
        j = json.loads(raw_json)
    except JSONDecodeError as e:
        # Metadata unparseable: return what we have so far.
        print(e)
        return t
    # Publishing outlet.
    media = j['media']
    t += '作者:' + media + '\n'
    # Publish time.
    pubtime = j['pubtime']
    t += '发布时间:' + pubtime + '\n'
    # Article tags.
    tags = j['tags']
    t += '关键字:' + tags + '\n\n'
    # Article body, one paragraph at a time.
    ps = info(
        'body > div.qq_conent.clearfix > div.LEFT > div.content.clearfix > div.content-article > p'
    )
    for p in ps:
        t += q(p).text()
        t += '\n\n'
    return t
def fit2(url):
    """Crawl one imooc course page: for every chapter, find each lesson's
    video id, resolve its media paths via the ajax API, and download the
    best-quality stream into .\\<course>\\<chapter>\\<lesson>.mp4.

    NOTE(review): the loop variable ``i`` and the name ``link`` are both
    rebound inside the inner loop — the iterators are already captured,
    so behavior is correct, but the names are fragile; also ``r`` is
    reused for selection, response, json and path list in turn.
    """
    s = q(url, headers=headers)
    r = s('div').filter('.chapter ')
    # Course title (used as the top-level folder name).
    name1 = s('h2').text()
    for i in r:
        a = q(i)
        # Chapter title, e.g. "第1章 ..." up to the first '&'.
        name2 = re.findall(r'第\d.+?(?=&)', str(a))
        link = a('a').filter('.J-media-item')
        for i in link:
            # Lesson video id extracted from the href (".../video/<id>").
            link = re.findall(r'(?<=video/).+?(?=")', str(q(i)))
            b = q(i).text()
            # Lesson label like "1-2 ..." up to the carriage return.
            name3 = re.findall(r'\d-\d.+(?=\r)', str(b))
            if link:
                url = 'http://www.imooc.com/course/ajaxmediainfo/?mid=' + link[-1] + '&mode=flash'
                r = requests.get(url)
                r = r.json()
                # mpath is ordered worst-to-best quality.
                r = r['data']['result']['mpath']
                H = r[-1]  # BD (best, the one downloaded)
                M = r[-2]  # HD (unused)
                L = r[-3]  # SD (unused)
                mkdr = '.\\' + name1 + '\\' + name2[-1]
                if os.path.exists(mkdr) == False:
                    os.makedirs(mkdr)
                name = name1 + '\\' + name2[-1] + '\\' + name3[-1] + '.mp4'
                thre.download(H, name, blocks=3, proxies={})
def fit3_1(url):
    """Open one course-list page and crawl every course linked from it."""
    page = q(url, headers=headers)
    for wrap in page('div').filter('.moco-course-wrap'):
        matches = re.findall(r'learn/\d{1,6}(?=")', str(q(wrap)))
        if not matches:
            continue
        course_url = 'http://www.imooc.com/' + matches[-1]
        print(course_url)
        fit2(course_url)
def all_properties():
    """Map property names to {title, description} entries scraped from
    the properties reference page ("description" holds the href)."""
    doc = get_pq(PROPERTIES_REFERENCE)
    entries = (
        (
            property_name_from_href(q(link).attr("href")),
            {
                "title": " ".join(link.text.split()),
                "description": q(link).attr("href"),
            },
        )
        for link in doc('#divContent li a')
    )
    return OrderedDict(entries)
def page_get(root_url):
    """Return absolute URLs for every paging link (.hsbn) on the page.

    Fix: the original joined URLs with ``os.path.join``, which emits
    backslash-separated (broken) URLs on Windows and mishandles absolute
    hrefs; ``urljoin`` resolves each href relative to root_url with
    proper URL semantics.
    """
    from urllib.parse import urljoin

    resp = requests.get(root_url)
    query = q(resp.text)
    url_list = []
    for res_page in query.find('.hsbn'):
        href = q(res_page).attr('href')
        # Resolve relative to root_url's directory (e.g. .../futaba.htm).
        url_list.append(urljoin(root_url, href))
    return url_list
def parse_parameters():
    """Build a schema fragment for the template Parameters section.

    Every documented attribute defaults to a string schema; Type,
    AllowedValues and NoEcho then get more specific schemas.
    """
    doc = tools.get_pq(tools.BASE + 'parameters-section-structure.html')
    first_dl = q(doc('#main-col-body .variablelist dl').eq(0))
    attributes = OrderedDict(
        (q(term).text(), q(definition))
        for term, definition in zip(first_dl.children('dt'),
                                    first_dl.children('dd'))
    )
    result = OrderedDict()
    # Type has its own nested definition list of allowed values.
    result['Type'] = parse_paremeter_types(attributes.pop('Type'))
    for name in attributes:
        result[name] = {'type': 'string'}
    # Two attributes are not plain strings.
    result['AllowedValues']['type'] = 'array'
    result['NoEcho']['type'] = ['string', 'boolean']
    return result
async def auto_join(self, response):
    """Register every element carrying both id and state attributes as a
    component, then join each one."""
    document = q(response.content)
    for element in document('[id][state]'):
        # Prefer the custom-element name over the raw tag.
        name = element.get('is') or element.tag
        initial_state = json.loads(element.get('state'))
        new_id = self.add_component(name, initial_state)
        await self.send_join(new_id)
def all_resource_properties_hrefs():
    """Map href -> link text for every property listed on the AWS
    product property reference page."""
    page = get_pq(
        'http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-product-property-reference.html'
    )
    result = OrderedDict()
    for anchor in page('#divContent li a'):
        wrapped = q(anchor)
        result[wrapped.attr('href')] = wrapped.text()
    return result
def get_pq(uri=BASE + 'aws-template-resource-type-ref.html'):
    """Fetch *uri* as a PyQuery document with absolute links, sending a
    user-agent header that identifies this project."""
    project_ua = 'https://github.com/fungusakafungus/cloudformation-jsonschema'
    document = q(uri, headers={'user-agent': project_ua})
    document.make_links_absolute()
    return document
def parse_paremeter_types(dd):
    """Schema for the parameter Type attribute: a string restricted to
    the documented type names plus the undocumented List<String>."""
    allowed = [q(item).text() for item in dd('dl dt')]
    allowed.append('List<String>')  # undocumented but accepted by AWS
    schema = OrderedDict()
    schema['type'] = 'string'
    schema['enum'] = allowed
    return schema
def getUserUploads(username):
    """Scrape a user's uploads page, notify each new video (stopping at
    the last-seen id ``latest``), then persist the newest video id.

    Fixes: the state file is now closed via ``with``; an empty scrape no
    longer raises IndexError when writing latest.sav.
    """
    doc = q("https://www.youtube.com/user/%s/videos" % username)
    vids = doc('#channels-browse-content-grid .yt-lockup-title a[href^="/watch"]')
    vids = vids.map(lambda i, e: {
        'id': videoIdPattern.findall(q(e).attr('href')),
        'title': q(e).attr('title'),
    })
    for v in vids:
        if len(v['id']) and v['title']:
            # Stop once we reach the video we already notified about.
            if v['id'][0] == latest:
                break
            notifyVideo(v['id'][0], v['title'])
    # Remember the newest upload so the next run knows where to stop.
    if vids and vids[0]['id']:
        with open('latest.sav', 'w') as f:
            f.write(vids[0]['id'][0])
def fit4(url):
    """Walk every category link (.item) and crawl its course list."""
    page = q(url, headers=headers)
    items = page('div').filter('.item')
    for query_string in re.findall(r'c=.+?(?=")', str(items)):
        list_url = 'http://www.imooc.com/course/list?' + query_string
        print(list_url)
        fit3(list_url)
def all_resource_hrefs():
    """Map resource link text -> href from the resource-type reference."""
    page = get_pq(
        'http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html'
    )
    hrefs = OrderedDict()
    for anchor in page('#divContent li a'):
        wrapped = q(anchor)
        hrefs[wrapped.text()] = wrapped.attr('href')
    return hrefs
def fit3(url):
    """Crawl every results page of a course list.

    Fix: the pagination regex was ``\\d`` (single digits), which splits
    page numbers >= 10 into separate bogus single-digit pages; use
    ``\\d+`` to capture whole numbers.
    """
    page_doc = q(url, headers=headers)
    pager = page_doc('div').filter('.page')
    if pager:
        for page_no in re.findall(r'\d+', str(pager.text())):
            fit3_1(url + '&page=' + page_no)
    else:
        # No pager element: a single page of results.
        fit3_1(url)
def all_resource_properties_hrefs():
    """Map href -> link text for the product property reference page."""
    page = get_pq(BASE + 'aws-product-property-reference.html')
    pairs = []
    for raw in page('#main-col-body li a'):
        anchor = q(raw)
        pairs.append((anchor.attr('href'), anchor.text()))
    return OD(pairs)
def set_resource_properties(res_type):
    """Merge the documented properties of *res_type* into the schema.

    Fix: the local holding the href map was named ``all``, shadowing the
    builtin ``all``; renamed to ``hrefs``.
    """
    hrefs = all_resource_hrefs()
    h = get_pq(hrefs[res_type])
    schema = load()
    dl = h('#divContent .variablelist dl').eq(0)
    resources = resources_dict(schema)
    pairs = [(q(dt), q(dd)) for dt, dd in zip(dl('dt'), dl('dd'))]
    shortcut = resources[res_type]['properties']
    shortcut['Properties'] = OrderedDict()
    shortcut['Properties']['properties'] = OrderedDict(
        (dt.text(), get_type(dd)) for dt, dd in pairs)
    # A property is required when its docs contain "Required : Yes".
    required = [
        k.text() for k, v in pairs
        if v('p').filter(lambda i: 'Required : Yes' in q(this).text())
    ]
    if required:
        shortcut['Properties']['required'] = required
    resources[res_type]['required'] = ['Properties']
    return schema
def all_res_properties():
    """Map property names to (title, descriptionURL, type) metadata read
    from the properties reference page."""
    page = tools.get_pq(PROPERTIES_REFERENCE)
    res = OrderedDict()
    for anchor in page('#main-col-body li a'):
        target = q(anchor).attr("href")
        res[property_name_from_href(target)] = OD((
            ("title", " ".join(anchor.text.split())),
            ("descriptionURL", target),
            ("type", "object"),
        ))
    return res
def getPageText(base_url, next_page):
    """Download chapter pages starting at *next_page*, persist each one,
    and follow the '下一页' (next page) link until it disappears.

    Fixes: pagination now iterates instead of recursing, so long
    documents cannot hit Python's recursion limit; the output path is a
    raw string (same bytes as before — the old literal relied on
    invalid escape sequences being passed through).
    """
    while True:
        result = requests.get(base_url + next_page)
        result.encoding = 'UTF-8'
        t = q(result.text)
        title = t('div.zhong').text()
        print(title)
        text = t('article#nr').text()
        # TODO: make the persistence target configurable.
        serializeToFile(r'D:\myCode\python\crawler\gyzz.txt', title, text)
        exist_next_page = False
        for aa in t('a.dise'):
            if aa.text == '下一页':
                next_page = q(aa).attr.href
                print(next_page)
                exist_next_page = True
                break
        if not exist_next_page:
            return
def fit1(url):
    """Download the single imooc lesson at *url* in the best quality.

    The media API's ``mpath`` list is ordered worst-to-best; only the
    last (BD) entry is downloaded. Fix: dropped the unused HD/SD locals,
    which also crashed with IndexError when fewer than three paths were
    returned.
    """
    lesson_id = url.split('/', -1)[-1]
    api_url = ('http://www.imooc.com/course/ajaxmediainfo/?mid='
               + lesson_id + '&mode=flash')
    page = q(url, headers=headers)
    heading = page('em').text()
    # Lesson label like "1-2 ..." becomes the file name.
    video = re.findall(r'\d-.+?(?=\d)', heading)
    name = video[-1] + '.mp4'
    info = requests.get(api_url).json()
    paths = info['data']['result']['mpath']
    best = paths[-1]  # BD; paths[-2]/paths[-3] would be HD/SD
    thre.download(best, name, blocks=4, proxies={})
def set_resource_properties(res_type):
    """Fill the schema's Properties entry for *res_type* from its
    documentation page.

    Fix: renamed the local ``all`` — it shadowed the ``all`` builtin.
    """
    resource_hrefs = all_resource_hrefs()
    h = get_pq(resource_hrefs[res_type])
    schema = load()
    dl = h('#divContent .variablelist dl').eq(0)
    resources = resources_dict(schema)
    pairs = [(q(dt), q(dd)) for dt, dd in zip(dl('dt'), dl('dd'))]
    shortcut = resources[res_type]['properties']
    shortcut['Properties'] = OrderedDict()
    shortcut['Properties']['properties'] = OrderedDict(
        (dt.text(), get_type(dd)) for dt, dd in pairs
    )
    # Required when the property docs say "Required : Yes".
    required = [
        k.text() for k, v in pairs
        if v('p').filter(lambda i: 'Required : Yes' in q(this).text())
    ]
    if required:
        shortcut['Properties']['required'] = required
    resources[res_type]['required'] = ['Properties']
    return schema
def get_type(dt, dd_):
    """Derive a JSON-schema fragment for one property from its docs.

    dt:  PyQuery of the property's <dt> (its name).
    dd_: PyQuery of the property's <dd> (its description).

    Falls back to embedding the raw HTML as a description when the type
    text cannot be recognised.
    """
    # Paragraph(s) that start with "Type" carry the type declaration.
    dd = dd_('p').filter(lambda x: q(this).text().startswith('Type'))
    t = dd.text().lower()
    # First match in type_patterns wins — order matters.
    for pattern, schema_fragment in type_patterns:
        if pattern in t:
            return schema_fragment
    # A link inside the Type paragraph points at a nested property page.
    if dd('a'):
        return property_ref(dt, dd_, dd, t)
    if dd_('.type') and len(dd_('.type')):
        # Special case: SecurityGroup lists are lists of strings.
        if (dd_('.type').text() == 'AWS::EC2::SecurityGroup'
                and 'list of' in t):
            return {"$ref": "basic_types.json#/definitions/list<string>"}
    # Unrecognised: log a short excerpt around "type :" for debugging.
    ind = t.find('type :')
    extract = t[ind:ind + 50]
    log.warning('Could not parse resource property type: "%s"\n"%s"',
                extract, dd_.html())
    return {'description': dd_.html()}
def apply_diff(self, html_diff):
    """Rebuild the document HTML from a diff stream.

    Each diff entry is either literal text (inserted as-is), a negative
    int (skip that many characters of the previous HTML) or a positive
    int (copy that many characters). The rebuilt HTML replaces
    ``last_received_html``, the PyQuery doc is refreshed, and a root
    ``state`` attribute, when present, is decoded into ``self.state``.
    """
    pieces = []
    position = 0
    for op in html_diff:
        if isinstance(op, str):
            # Literal insertion.
            pieces.append(op)
            continue
        if op < 0:
            # Negative count: advance past deleted text.
            position -= op
        else:
            # Positive count: keep a run of the old HTML.
            pieces.append(self.last_received_html[position:position + op])
            position += op
    self.last_received_html = ''.join(pieces)
    self.doc = q(self.last_received_html)
    state = self.doc.attr['state']
    if state:
        self.state = json.loads(state)
def get_type(dd_):
    """Translate a property's Type paragraph into a schema fragment.

    Recognises the simple scalar/list forms first, then linked property
    references, then the SecurityGroup list special case; returns {}
    (after warning) when nothing matches.
    """
    type_paragraph = dd_('p').filter(
        lambda x: q(this).text().startswith('Type'))
    t = type_paragraph.text().lower()
    # Checked in this order — first hit wins.
    simple_types = (
        ('type : string', {'type': 'string'}),
        ('list of strings', {'type': 'array', 'items': {'type': 'string'}}),
        ('type : integer', {'type': 'integer'}),
        ('type : boolean', {'type': 'boolean'}),
    )
    for needle, fragment in simple_types:
        if needle in t:
            return fragment
    if type_paragraph('a'):
        return property_ref_from_href(type_paragraph('a').attr('href'))
    if dd_('.type') and len(dd_('.type')):
        if (dd_('.type').text() == 'AWS::EC2::SecurityGroup'
                and 'list of' in t):
            return {'type': 'array', 'items': {'type': 'string'}}
    warn('Could not parse resource property type: "%s"', dd_.html())
    return {}
def find_route(src, dst, dt, options):
    """Query Yahoo! Transit for a route *src* -> *dst* departing at
    datetime *dt*, and print the formatted result.

    ``options`` is a mapping of booleans; 'shinkansen' and
    'limited_express' toggle those transport modes in the search.
    """

    def _opt_to_char(opt):
        # The search form expects '1'/'0' flags for boolean options.
        return '1' if options[opt] else '0'

    head_info = '{} ==> {} at {}'.format(src, dst, dt.strftime('%Y-%m-%d %H:%M'))
    print(head_info)
    # The form splits the timestamp into y/m/d/hh plus the minute as two
    # single-digit fields (m1, m2).
    dt_str = dt.strftime('%Y%m%d%H%M')
    root = q(str(requests.get(
        'https://transit.yahoo.co.jp/search/result',
        params={
            'flatlon': '',
            'from': src,
            'tlatlon': '',
            'to': dst,
            'viacode': '',
            'via': '',
            'y': dt_str[:4],
            'm': dt_str[4:6],
            'd': dt_str[6:8],
            'hh': dt_str[8:10],
            'm1': dt_str[10:11],
            'm2': dt_str[11:12],
            'ticket': 'ic',
            'expkind': '1',
            'ws': '3',
            's': '0',
            'al': '1',
            'shin': _opt_to_char('shinkansen'),
            'ex': _opt_to_char('limited_express'),
            'hb': '1',
            'lb': '1',
            'sr': '1',
            'kw': '',
        }).content, 'utf-8'))
    RouteDisplay(options).format_root(root)
    print('=' * 80)
    print(head_info)
def __init__(self, *args, **kwargs):
    """Initialise the bot and build the servant index.

    Scrapes the atwiki servant table once at startup and caches, per
    servant class and per table column index, a text blob of
    "name\\nhttps:<link>\\n" entries in self._servants.
    """
    super(Fgobot, self).__init__(*args, **kwargs)
    self._count = 0
    # {class name: {column index: "name\nhttps:<href>\n"...}}
    self._servants = OrderedDict()
    self._message_with_inline_keyboard = None
    s_table = requests.get("https://www9.atwiki.jp/f_go/pages/671.html")
    q_table = q(s_table.content.decode("utf-8"))
    q_table = q_table("#wikibody table")
    t_list = list(q_table('tr').items())
    for i in t_list:
        # First cell of a row names the servant class (empty for
        # header/filler rows, which are skipped).
        sclass = q(i('td')[0]).text()
        if sclass:
            self._servants[sclass] = OrderedDict()
            # Remaining cells are buckets; n is the column index.
            for n, j in enumerate(i('td')[1:]):
                t_list_tr_text = q(j).text()
                if t_list_tr_text:
                    servants_text = ""
                    pqj = q(j)
                    for s in pqj('a'):
                        # atwiki hrefs are protocol-relative; prefix https:.
                        servants_text += "%s\nhttps:%s\n" % (q(s).text(), q(s).attr("href"))
                    self._servants[sclass][n] = servants_text
def all_resource_hrefs():
    """Map stripped resource link text -> href from the reference page."""
    page = get_pq(BASE + 'aws-template-resource-type-ref.html')
    result = OD()
    for raw in page('#main-col-body li a'):
        anchor = q(raw)
        result[anchor.text().strip()] = anchor.attr('href')
    return result
def all_resource_hrefs():
    """Map resource link text -> href scraped from the resource-type
    reference page."""
    page = get_pq(
        'http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html'
    )
    anchors = (q(a) for a in page('#divContent li a'))
    return OrderedDict(
        (anchor.text(), anchor.attr('href')) for anchor in anchors)
def parse(target_url_list):
    """Fetch each URL and hand the parsed document to dig_image."""
    for target in target_url_list:
        body = requests.get(target).text
        dig_image(q(body))
def _no_comment(i):
    """Yield each node of *i* wrapped in q, skipping HTML comments."""
    for node in i:
        if not isinstance(node, HtmlComment):
            yield q(node)
def reply_text(self, text):
    """Dispatch a plain-text command to a (reply, markup) pair.

    Recognised commands: /appmedia, /servant, "★<rare> <class>", /hougu,
    /drop, /wiki [keyword], /price, /gamewith, /summon, /help, /start.
    Returns the reply string plus a telepot keyboard markup (a
    ReplyKeyboardRemove by default).
    """
    reply = "meow"
    markup = ReplyKeyboardRemove()
    if text.startswith("/appmedia"):
        # Offer the two Appmedia rankings as inline buttons; the
        # callback_data values are handled by on_callback_query.
        reply = "Which ranking?"
        markup = InlineKeyboardMarkup(inline_keyboard=[
            [InlineKeyboardButton(text='Appmedia SSR Ranking', callback_data='appmedia_ssr')],
            [InlineKeyboardButton(text='Appmedia SR Ranking', callback_data='appmedia_sr')]
        ])
    if text.startswith("/servant"):
        # One keyboard row per servant class; one "★<rare> <class>"
        # button per rarity bucket cached in self._servants.
        keyboards = []
        for i in self._servants.keys():
            keyboards.append([KeyboardButton(text="★{} {}".format(j, i)) for j in self._servants[i].keys()])
        markup = ReplyKeyboardMarkup(keyboard=keyboards, one_time_keyboard=True)
        # Classes: "Saber", "Archer", "Lancer", "Rider", "Caster", "Assassin", "Berserker", "Shielder", "Ruler", "Avenger", "MoonCancer", "AlterEgo", "Foreigner"
        reply = "Please choose class and rare"
    if text.startswith("★"):
        # Answer a "★<rare> <class>" button press from the cache.
        fgorare, fgoclass = text.split()
        fgorare = int(fgorare[1])
        reply = self._servants[fgoclass][fgorare]
    if text.startswith("/hougu"):
        reply = """
早见表
https://docs.google.com/spreadsheets/d/1ru35rHQ9DMsQcBXHPgUD5XDO-mSvR_j1fFTB_V507zw/htmlview
fc2 宝具計算
fgotouka.web.fc2.com
国人宝具計算
https://xianlechuanshuo.github.io/fgo2/calc4.html
"""
    if text.startswith("/drop"):
        reply = "FGO効率劇場\nhttps://docs.google.com/spreadsheets/d/1TrfSDteVZnjUPz68rKzuZWZdZZBLqw03FlvEToOvqH0/htmlview?sle=true#"
    if text.startswith("/wiki"):
        servant_name = " ".join(text.split()[1:])
        if servant_name:
            # Search atwiki and keep the first 10 result links that are
            # not comment/voice/stats pages.
            query_page_url = "https://www9.atwiki.jp/f_go/?cmd=wikisearch&keyword={}".format(servant_name)
            r = requests.get(query_page_url)
            if r.status_code == 200:
                rq = q(r.content.decode("utf-8"))
                links = rq("#wikibody li a")
                filtered_links = [q(x) for x in links if not any(("コメント" in q(x).text(), "ボイス" in q(x).text(), "性能" in q(x).text()))][:10]
                reply = ""
                for i in filtered_links:
                    # atwiki hrefs are protocol-relative; prefix https:.
                    reply += "{}\nhttps:{}\n\n".format(i.html(), i.attr("href"))
            else:
                reply = "connection timeout {}".format(r.status_code)
        else:
            # No keyword: link the servant index page instead.
            reply = "https://www9.atwiki.jp/f_go/pages/671.html"
    if text.startswith("/price"):
        reply = "google: 9800 JPY = "
        # NOTE(review): this Google Finance converter endpoint is
        # long-deprecated — confirm it still answers.
        google_finance_url = "https://finance.google.com/finance/converter?a={}&from={}&to={}".format(9800, "JPY", "CNY")
        result = requests.get(google_finance_url)
        if result.status_code == 200:
            rcontent = q(result.content)
            rcontent = rcontent("#currency_converter_result .bld").text()
            reply += rcontent
        jeanne_h5_url = "http://h5.m.taobao.com/awp/core/detail.htm?id=553971150031"
        reply += "\nJeanne {}".format(jeanne_h5_url)
        tu_jihua_url = "https://item.taobao.com/item.htm?spm=2013.1.w4023-16844942798.13.5692e503t594AU&id=558505049792"
        reply += "\n秃计划 {}".format(tu_jihua_url)
    if text.startswith("/gamewith"):
        reply = "https://gamewith.jp/fgo/article/show/62409"
    if text.startswith("/summon"):
        # Extract the "sites" JSON array embedded in the simulator's JS
        # and list the 10 most recent pools.
        simulator_url = "https://konatasick.github.io/test_simulator/pool.html?"
        simulator_js = "https://konatasick.github.io/test_simulator/js/index.js"
        reply = "Summon list\n%s\n\n" % simulator_url
        r = requests.get(simulator_js)
        if r.status_code == 200:
            rcontent = r.content.decode("utf-8")
            rcontent = rcontent.replace("\n", "")
            summon_json = re.findall('"sites"\:(.*)\/\*', rcontent)[0].rstrip('}')
            summon_json = json.loads(summon_json)
            summon_json_last_ten = summon_json[::-1][:10]
            for i in summon_json_last_ten:
                reply += "{} {}{}\n".format(i["name"], simulator_url, i["info"])
    if text.startswith("/help") or text.startswith("/start"):
        reply = """
Author: @fdb713
/appmedia - appmedia ranking
/drop - drop statistics
/gamewith - gamewith ranking link
/hougu - hougu damage quick reference
/price - compare JPY to CNY and 3rd-party charge
/servant - send link of servants by rare and class from atwiki
/summon - simulate summon
/wiki - search and send link of servant or other keywords on atwiki page
/help or /start - show this message
"""
    return reply, markup
def readTextFromXmlFile():
    """Load assets/picf.xml, replace every <xref> inside a <p> with its
    text (padded with spaces), and select all non-caption <p> elements.

    NOTE(review): the final selection is bound to ``text`` but never
    returned or used — presumably this should ``return text`` (or its
    .text()); confirm intent against callers before changing.
    """
    xmlFilePath = os.path.join(os.getcwd(), 'assets', 'picf.xml')
    with open(xmlFilePath, 'r', encoding='utf8') as f:
        qxml = q(f.read())
        # Inline cross-references as plain text, space-padded so words
        # don't run together.
        qxml("p xref").replaceWith(lambda i, e: ' ' + qxml(e).text() + ' ')
        text = qxml("p:not(caption)")
def get_pq(
    uri='http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html'
):
    """Load *uri* into a PyQuery document and absolutize its links."""
    document = q(uri)
    document.make_links_absolute()
    return document
def all_resource_properties_hrefs():
    """href -> link text for every entry on the product property page."""
    page = get_pq(BASE + 'aws-product-property-reference.html')
    return OD(
        (link.attr('href'), link.text())
        for link in (q(a) for a in page('#main-col-body li a'))
    )
def all_resource_hrefs():
    """Stripped link text -> href from the resource-type reference page."""
    page = get_pq(BASE + 'aws-template-resource-type-ref.html')
    mapping = OD()
    for raw_anchor in page('#main-col-body li a'):
        anchor = q(raw_anchor)
        mapping[anchor.text().strip()] = anchor.attr('href')
    return mapping
def get_pq(uri='http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html'):
    """Fetch *uri* as a PyQuery document with links made absolute."""
    doc = q(uri)
    doc.make_links_absolute()
    return doc
def all_resource_properties_hrefs():
    """href -> text for property links on the product property reference."""
    page = get_pq(
        'http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-product-property-reference.html'
    )
    result = OrderedDict()
    for element in page('#divContent li a'):
        link = q(element)
        result[link.attr('href')] = link.text()
    return result
def dig_image(query):
    """Save every <a href> in *query* that points at a /src/ *.jpg file.

    Fixes: the pattern is now a raw string (the old literal relied on
    the invalid escapes '\\/' collapsing to '/'), and anchors without an
    href no longer crash re.match with None.
    """
    jpg_link = re.compile(r'.*/src/.*\.jpg$')
    for anchor in query.find('a'):
        href = q(anchor).attr('href')
        # attr() returns None when the anchor has no href attribute.
        if href and jpg_link.match(href):
            save_image(href)