def parsePage(html):
    # Dictionary to store info
    athInfo = {}

    # Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    # Have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    scraperwiki.sqlite.save(unique_keys=['Bib'], data=athInfo,
                            table_name="RESULTS", verbose=0)
def signIn(username, password):
    raw = requests.get(PATHS['login'])
    session_left_slice = raw.headers['set-cookie'].find('=') + 1
    session_right_slice = raw.headers['set-cookie'].find(';')
    session_id = raw.headers['set-cookie'][session_left_slice:session_right_slice]

    html = lxml.html.fromstring(raw.text)
    # cssselect returns a list, so take the first match before reading .value
    db_viewstate = html.cssselect("input#__DATABASE_VIEWSTATE")[0].value
    print db_viewstate
    ev_validation = html.cssselect("input#__EVENTVALIDATION")[0].value

    # Create the form payload
    username_key = 'txtUserID'
    password_key = 'txtPassword'
    login_button = 'btnSubmit'
    form_payload = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': db_viewstate,
        '__EVENTVALIDATION': ev_validation,
        username_key: username,
        password_key: password,
    }

    session = requests.session()
    session.post(PATHS['login'], data=form_payload)
    return session
def get_metadata(url):
    resp = requests.head(url, headers=headers, timeout=5)
    resp.raise_for_status()
    # Guard against a missing Content-Type header
    if 'text/html' not in resp.headers.get('Content-Type', ''):
        return {'url': url}

    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    html = lxml.html.fromstring(resp.content.decode('utf8'))

    tags = html.cssselect('meta[property], meta[name]')
    meta = {}
    for tag in tags:
        prop = tag.attrib.get('property', tag.attrib.get('name'))
        data = tag.attrib.get('content')
        if data is not None:
            meta[prop] = data

    can = html.cssselect('link[rel="canonical"]')
    if can:
        meta['canonical'] = can[0].attrib['href']

    # Canonical data
    meta['url'] = _get(meta, 'canonical', 'og:url', default=url)
    meta['description'] = _get(meta, 'description', 'og:description', 'twitter:description')
    meta['title'] = _get(meta, 'og:title', 'twitter:title', url)
    return meta
def scrape_restaurant_data(self, example):
    # get this from yelp
    html = obtain_html(example["url"])
    html.make_links_absolute(example["url"])
    title = html.cssselect("h1.biz-page-title")[0].text.strip()

    review_highlights = html.cssselect("ul.review-highlights-list")
    if len(review_highlights) > 0:
        description = tree_to_str(clean_up_highlights(review_highlights[0]))
    else:
        description = create_description_highlights(html)

    images = html.cssselect("img.photo-box-img")
    image_url = None
    if len(images) > 0:
        image_url = images[0].attrib["src"]

    return {
        "title": title,
        "description": description,
        "categories": example["categories"],
        "image_url": image_url,
        "rating": rating_to_string(example["rating"]),
        "price": example["price"]
    }
def scrape_daft_page(url):
    info = {'url': url}
    http_client = CachedHttpClient()
    data = http_client.get(url)
    html = lxml.html.fromstring(data)

    content_tag = html.xpath('//div[@id="content"]')[0]
    title_tag = content_tag.cssselect('.smi-info h1')[0]
    info['title'] = title_tag.text

    image_tag = html.cssselect('#smi-gallery-img-main img')[0]
    image_src = image_tag.attrib['src']
    if image_src.startswith('//'):
        image_src = 'https:' + image_src
    info['image'] = image_src

    price_tag = html.cssselect('#smi-price-string')[0]
    info['price'] = price_tag.text

    header_text = html.cssselect('#smi-summary-items .header_text')
    hdrtext = [t.text for t in header_text]
    info['beds'] = hdrtext[1]
    info['baths'] = hdrtext[2]

    info['description'] = '\n\n'.join(
        elem.text_content()
        for elem in html.cssselect('#smi-tab-overview .description_block'))
    # info['description'] = content_tag.cssselect('.overview')[0].text
    return info
def test_render(self):
    # Quick-and-dirty button that displays "Accueil -> [ Jardins, Variétés ]"
    column = DropDownLinkColumn(links=[
        Link(text=u'Main button', viewname='s5appadherant:accueil'),
        Link(text=u'Jardins', viewname='s5appadherant:jardin_all', args=()),
        Link(text=u'Variété', viewname='s5appadherant:variete_list')
    ])
    output = column.render(G(Adherant))
    html = lxml.html.fromstring(output)

    elements = html.cssselect(".btn-group > a")
    self.assertEqual(1, len(elements))
    self.assertEqual(u'Main button', elements[0].text)
    self.assertEqual(reverse('s5appadherant:accueil'), elements[0].attrib['href'])

    elements = html.cssselect("button.dropdown-toggle")
    self.assertEqual(1, len(elements))
    self.assertEqual('dropdown', elements[0].attrib['data-toggle'])

    elements = html.cssselect("ul.dropdown-menu li a")
    self.assertEqual(2, len(elements))
    self.assertEqual(u'Jardins', elements[0].text)
    self.assertEqual(reverse('s5appadherant:jardin_all'), elements[0].attrib['href'])
    self.assertEqual(u'Variété', elements[1].text)
    self.assertEqual(reverse('s5appadherant:variete_list'), elements[1].attrib['href'])
def get_citation(title):
    """Given a paper title, attempts to get citation strings for that paper
    from Google Scholar."""
    # Search for the paper by title
    resp = requests.get(BASE_URL, params={'q': title})
    html = lxml.html.fromstring(resp.content)
    result_els = html.cssselect('.gs_r')
    if not result_els:
        return None

    # Only consider the first match
    result_el = result_els[0]
    # result_title = result_el.cssselect('.gs_rt a')[0].text

    # Request the citations
    result_id = result_el.attrib['data-cid']
    resp = requests.get(BASE_URL, params={
        'q': 'info:{}:scholar.google.com/'.format(result_id),
        'output': 'cite'
    })
    html = lxml.html.fromstring(resp.content)
    citations = {}
    for format_el, citation_el in zip(html.cssselect('th'),
                                      html.cssselect('td .gs_citr')):
        format = format_el.text
        citation = citation_el.text_content()
        citations[format] = citation
    return citations
def sign_in(u, p):
    session = requests.session()
    raw = session.get(LOGIN_URL)
    html = lxml.html.fromstring(raw.text)
    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value

    payload = {
        "ctl00$cphMain$Logon1$_resolution": "1440x900",
        "ctl00$cphMain$Logon1$_email": u,
        "ctl00$cphMain$Logon1$_password": p,
        "ctl00$cphMain$Logon1$_login": "******",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
    }
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    r = session.post(LOGIN_URL, data=payload, headers=headers)
    assert "unread messages" in r.content
    return session
def parsePage(html):
    # Dictionary to store info
    athInfo = {}

    # Now start populating our data object
    athInfo['ATHLETE_NAME'] = html.cssselect("h2")[0].text
    athInfo['DIVISION_RANK'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OVERALL_RANK'] = html.cssselect("#div-rank *")[0].tail.strip()

    # infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ['BIB', 'DIVISION', 'STATE', 'COUNTRY', 'PROFESSION']
    detailsFields = ['TOTAL_SWIM', 'TOTAL_BIKE', 'TOTAL_RUN', 'TOTAL_TIME']

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    # Have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HAS_RESULTS'] = 1
    athInfo['SCRAPED'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    scraperwiki.sqlite.save(unique_keys=['BIB'], data=athInfo,
                            table_name="RESULTS", verbose=0)
def test_templates_course_detail_one_open_course_run(self):
    """
    For a course with one open course run, the course run should be in the header
    and the side column should display an indication that there is no other
    course run.
    """
    course = CourseFactory()
    page = course.extended_object

    # Create an open course run
    now = timezone.now()
    CourseRunFactory(
        direct_course=course,
        start=now + timedelta(hours=1),
        enrollment_start=now - timedelta(hours=1),
        enrollment_end=now + timedelta(hours=1),
    )

    self.assertTrue(page.publish("fr"))
    url = page.get_absolute_url()
    response = self.client.get(url)
    self.assertEqual(response.status_code, 200)

    html = lxml.html.fromstring(response.content)

    # Check syllabus intro
    header = str(etree.tostring(html.cssselect(".subheader__intro")[0]))
    self.assertEqual(header.count("course-detail__run-descriptions"), 1)
    self.assertIn("S’inscrire maintenant", header)

    # Check syllabus aside column
    aside = str(etree.tostring(html.cssselect(".course-detail__aside")[0]))
    self.assertNotIn("course-detail__run-descriptions", aside)
    self.assertNotIn("S’inscrire maintenant", aside)
    self.assertIn("Aucune autre session ouverte", aside)
def get_agent_by_html(html):
    # print tostring(html)
    try:
        name = html.cssselect('a')[0].get('title').encode(
            'ascii', 'ignore').decode('ascii')
        agent_url = html.cssselect('a')[0].get('href')
        estate_name = html.cssselect('a')
        estate_name = estate_name[len(estate_name) - 1].get('title')
        try:
            reg_number = re.search(
                re.escape(r'CEA Registration Number :') + '\s(.{8})',
                tostring(html), re.I).group(1)
        except AttributeError:
            reg_number = None
        try:
            lic_number = re.search(
                re.escape(r'Agency Licence Number :') + '\s(.{9})',
                tostring(html), re.I).group(1)
        except AttributeError:
            lic_number = None
        phone_number = html.cssselect('span a')
        if phone_number:
            phone_number = get_phone_number(phone_number[0])
        agent = AgentIProperty(name=name,
                               phone_number=phone_number,
                               estate_name=estate_name,
                               reg_number=reg_number,
                               lic_number=lic_number,
                               url=agent_url)
        # print agent
        return agent
    except IndexError:
        return None
def test_templates_course_detail_two_open_course_runs(self):
    """
    For a course with two open course runs, the course run starting next should be
    in the header and the other course run should be in the side column.
    """
    course = CourseFactory()
    page = course.extended_object
    url = page.get_absolute_url()

    # Create 2 open course runs
    now = timezone.now()
    start1, start2 = random.sample(
        [now + timedelta(days=1), now + timedelta(days=2)], 2
    )
    CourseRunFactory(
        direct_course=course,
        start=start1,
        enrollment_start=now - timedelta(hours=1),
        enrollment_end=now + timedelta(hours=1),
    )
    CourseRunFactory(
        direct_course=course,
        start=start2,
        enrollment_start=now - timedelta(hours=1),
        enrollment_end=now + timedelta(hours=1),
    )

    self.assertTrue(page.publish("fr"))
    response = self.client.get(url)
    self.assertEqual(response.status_code, 200)

    html = lxml.html.fromstring(response.content)

    # Check syllabus intro
    header = str(
        etree.tostring(
            html.cssselect(".subheader__intro")[0],
            encoding="iso8859-1",
            method="html",
        ).decode("utf-8")
    )
    self.assertEqual(header.count("course-detail__runs--open"), 1)
    self.assertIn("S’inscrire maintenant", header)
    date_string = formats.date_format(min(start1, start2))
    with translation.override("fr"):
        self.assertIn(f"Du {date_string}", header)

    # Check syllabus aside column
    aside = str(
        etree.tostring(
            html.cssselect(".course-detail__aside")[0],
            encoding="iso8859-1",
            method="html",
        ).decode("utf-8")
    )
    self.assertEqual(aside.count("course-detail__runs--open"), 1)
    self.assertIn("S’inscrire maintenant", aside)
    date_string = formats.date_format(max(start1, start2))
    with translation.override("fr"):
        self.assertIn(f"Du {date_string}", aside)
def current_usage_info(session):
    response = session.get(current_usage_url)
    html = lxml.html.fromstring(response.text)

    def convert(text):
        m = re.search(r'(\d+(?:\.\d+)?) GB', text)
        if m:
            return float(m.group(1))
        m = re.search(r'(\d+(?:\.\d+)?) MB', text)
        if m:
            return float(m.group(1)) / 1024.0

    tds = html.cssselect('#usageInformation')[0].xpath('.//td')
    info = {
        'download_usage': convert(_condense_whitespace(tds[2].text_content())),
        'upload_usage': convert(_condense_whitespace(tds[4].text_content())),
        'total_usage': convert(_condense_whitespace(tds[6].text_content())),
        'allowance': convert(_condense_whitespace(tds[8].text_content())),
        'billing_period': re.sub(
            r'Details for ?', '',
            _condense_whitespace(
                html.cssselect('#currentBillingPeriod')[0].text_content())),
    }
    info['left'] = info['allowance'] - info['total_usage']
    return info
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--username', help='Username')
    parser.add_argument('-p', '--password', help='Password')
    args = parser.parse_args()

    with requests.Session() as s:
        homepage = s.get('http://trithuc.vinacontrol.com.vn/')
        html = lxml.html.fromstring(homepage.text)
        payloads = {
            'name': args.username,
            'pass': args.password,
            'form_build_id': html.cssselect('input[name=form_build_id]')[0].attrib['value'],
            'form_id': html.cssselect('input[name=form_id]')[0].attrib['value'],
            'op': 'Đăng nhập'
        }
        s.post('http://trithuc.vinacontrol.com.vn/node', data=payloads)

        res = s.get('http://trithuc.vinacontrol.com.vn/ds-cauhoi?field_quiz_phanloai_tid[0]=438&items_per_page=All')
        html = lxml.html.fromstring(res.text)
        table = html.cssselect('table[data-view-name=ds_cauhoi]')[0]
        columns = ['STT', 'Phân loại', 'Câu hỏi', 'Trả lời']
        df = pd.DataFrame(columns=columns)
        for row in table.cssselect('tbody>tr'):
            df = df.append(pd.DataFrame([parse_row(row)], columns=columns))
        df.to_excel('test.xls')
def _result_type(html):
    if html.cssselect(".tinfodownloadbutton a"):
        return "info"
    elif html.cssselect(".tlistdownload a"):
        return "list"
    else:
        return "empty"
def scrape_comment(self, html, parent):
    c = HTMLDocument(
        text=html.cssselect("div.text-holder"),
        headline=html.cssselect("a.commentTitle")[0].text_content().strip(),
        section=parent.props.section,
        date=readDate(" ".join(
            [t.text for t in html.cssselect("ul.meta li.createdate, li.createtime")])),
        author=html.cssselect("ul.meta li.by")[0].text.strip().lstrip("By").strip(),
        url=parent.props.url + "#{}".format(html.cssselect("a.commentTitle")[0].get('id')))
    c.props._parent = "{p.props.headline}, {p.props.date}".format(p=parent)
    return c
def nextPage(html, base_url=''):
    # logger.info('have many page')
    car_body = lxml.html.tostring(html.cssselect('.text')[-1])
    while len(html.cssselect('.next')) > 0 and len(html.cssselect('.nextBtn')) == 0:
        nextpage = requests.get(base_url + html.cssselect('.next')[0].get('href'))
        nexthtml = lxml.html.fromstring(nextpage.content)
        body = lxml.html.tostring(nexthtml.cssselect('.text')[-1])
        car_body += body
        html = nexthtml
    return car_body
def scrape_detail_page(response: requests.Response) -> dict:
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content': [h3.text_content() for h3 in html.cssselect('#content > h3')]
    }
    return ebook
def scrape_detail_page(response):
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'key': extract_key(response.url),
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content': [normalize_spaces(h3.text_content())
                    for h3 in html.cssselect('#content>h3')],
    }
    return ebook
def download_councillors():
    with open(WEBPAGESTXT, 'r') as txtfile:
        urls = txtfile.readlines()
    urls = [url.strip() for url in urls]

    session = http.client.HTTPSConnection('www.berlin.de', timeout=10)
    councillors = {}
    for url in urls:
        if councillors:
            time.sleep(2)
        bezirk = bezirk_from_url(url)

        headers = {'Accept-Encoding': 'gzip', 'Connection': 'keep-alive'}
        session.request('GET', url, headers=headers)
        response = session.getresponse()
        response = response.read()
        response = zlib.decompress(response, 47)
        try:
            response = response.decode('latin-1', 'strict')
        except UnicodeDecodeError:
            response = response.decode('windows-1252', 'strict')

        html = lxml.html.fromstring(response)
        html.make_links_absolute(url)

        tablerows = html.cssselect('.zl12')
        tablerows += html.cssselect('.zl11')

        number = html.cssselect('table.tk1:nth-child(8)')[0]
        number = number.text_content()
        _, number = number.split(':')
        number = number.strip()
        if number.isdigit():
            number = int(number)
        if not number == len(tablerows):
            print('%s:' % bezirk,
                  '%s councillors were found.' % len(tablerows),
                  'Should be %s councillors.' % number)

        for row in tablerows:
            councillor = extract_councillor(row)
            councillor['BEZIRK'] = bezirk
            identifier = normalized_name(councillor['ANZEIGENAME'])
            try:
                councillors[bezirk][identifier] = councillor
            except KeyError:
                councillors[bezirk] = {identifier: councillor}
    session.close()
    return councillors
def parse(page_text):
    urls = []
    html = get_html(page_text)
    url = html.cssselect('link[rel = "canonical"]')[0].get('href')
    print("URL:", url)
    breadcrumbs = html.cssselect('div.breadcrumbs>span.almost_bold')[0].text
    if breadcrumbs == "History":
        pagination = int(html.cssselect('div.rating_pagination.pagination>span')[0].text)
        if pagination == 1:
            rating_pagination = html.cssselect("div.rating_pagination.pagination")[0]
            a_list = rating_pagination.cssselect('div.rating_pagination.pagination>a')
            for a in a_list:
                urls.append(main_type + a.get("href"))
        href_list = []
        a_list = html.cssselect('table.rating.responsive>tr:not([class])>td[style="text-align:left"]>a')
        for a in a_list:
            href_list.append(main_type + a.get("href"))
        href_list = list(set(href_list))
        for href in href_list:
            urls.append(href)
    else:
        type = html.cssselect('div.breadcrumbs>span[itemprop = "itemListElement"]')[1].cssselect('a')[0].get('title')
        group_list = html.cssselect('div.tbt2.row_heading>div>h2')[1:]
        title = html.cssselect('meta[property = "og:title"]')[0].get("content")
        # print("Title:", title)
        id_device = add_device(title, type, url)
        # print("Product ID:", id_device)
        tables = html.cssselect('div.tbt1.single>div.table')
        # print(len(tables))
        for index, table in enumerate(tables):
            group = group_list[index].text
            tbts = table.cssselect('div.tbt5')
            for tbt in tbts:
                divs = tbt.cssselect('div')
                one_block = divs[1].text
                # Break out of malformed tables
                if one_block == None:
                    # add = False
                    break
                two_block = divs[2].text
                if two_block == None:
                    try:
                        two_block = divs[2].cssselect('span')[0].text
                    except:
                        two_block = divs[2].cssselect('a')[0].text
                if two_block == "+":
                    two_block = 1
                else:
                    if two_block == "-":
                        two_block = 0
                # print("Group:", group)
                id_device_variables = add_device_variables(one_block, group)
                # print("VARIABLE ID:", id_device_variables)
                id_device_value = add_device_value(id_device, id_device_variables, two_block)
    return urls
def test_render(self):
    column = self.get_column()
    output = column.render(G(Adherant))
    html = lxml.html.fromstring(output)

    elements = html.cssselect('a[data-toggle="modal"]')
    self.assertEqual(1, len(elements))
    self.assertEqual(u"Accueil", elements[0].text)

    modal_id = elements[0].attrib['data-target']
    modal = html.cssselect(u'[id="%s"]' % modal_id[1:])
    self.assertEqual(1, len(modal))
def upload_course(session, title="Lorem Ipsum", filename="test.jpg",
                  filedata="", filetype="image/png"):
    response = session.get("https://www.iscp.ac.uk/evidence/course.aspx")
    html = lxml.html.fromstring(response.text)
    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value
    viewstate_encrypted = html.cssselect("input#__VIEWSTATEENCRYPTED")[0].value
    event_validation = html.cssselect("input#__EVENTVALIDATION")[0].value

    payload = {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": event_validation,
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
        "__VIEWSTATEENCRYPTED": viewstate_encrypted,
        "ctl00$cphMain$txtDate": "01/01/1970",
        "ctl00$cphMain$txtEndDate": "",
        "ctl00$cphMain$drpTitles": 6,  # Other
        "ctl00$cphMain$txtOtherTitle": title,
        "ctl00$cphMain$drpTypes": 0,
        "ctl00$cphMain$txtOtherType": "",
        "ctl00$cphMain$txtAwardingBody": "",
        "ctl00$cphMain$txtFeedback": "",
        "ctl00$cphMain$txtLearn": "",
        "ctl00$cphMain$txtImprove": "",
        "ctl00$cphMain$txtActionPlan": "",
        "ctl00$cphMain$topicChooser1$hidScrollTop": "",
        "ctl00$cphMain$topicChooser1$hidTpcExpanded": "True",
        "ctl00$cphMain$topicChooser1$hidSelectedTopics": "",
        "ctl00$cphMain$topicChooser1$hdnPopUpShowing": "",
        "ctl00$cphMain$topicChooser1$hidTab": "",
        "ctl00$cphMain$btnInsert": "Save Course/seminar",
        "ctl00$TraineeReport1$download_token_value_id": "17/05/2015 12:20:46",
        "ctl00$TraineeReport1$txtStartDate": "16/05/2014",
        "ctl00$TraineeReport1$txtEndDate": "16/05/2015",
        "ctl00$txtFeedbackComments": "",
    }
    files = {
        "ctl00$cphMain$fupControl1": (
            filename,
            filedata,
            filetype,  # FIXME
        ),
    }
    r = session.post("https://www.iscp.ac.uk/evidence/course.aspx",
                     data=payload, files=files)
    pprint.pprint(r.text)
def theme(data):
    html = lxml.html.fromstring(data)  # equivalent to the etree.HTML function
    data_theme = ""
    a = html.cssselect('#tab_sdyj > tfoot > tr')
    for i in range(len(a)):
        if (len(html.cssselect(
                '#tab_sdyj > tfoot > tr:nth-child(' + str(i + 1) + ') > td:nth-child(2)')) != 0
                and len(html.cssselect(
                    '#tab_sdyj > tfoot > tr:nth-child(' + str(i + 1) + ') > td:nth-child(6)')) != 0):
            data_theme += html.cssselect(
                '#tab_sdyj > tfoot > tr:nth-child(' + str(i + 1) + ')')[0].text_content()
    return data_theme
def download(self, link):
    parent_url = 'http://link.springer.com'
    source = requests.get(link).content
    html = lxml.html.fromstring(source)
    book_title = html.cssselect('h1#title')[0].text_content()
    chapter = 1
    for i in html.cssselect('li.toc-item'):
        url = urljoin(parent_url,
                      i.cssselect('div.actions')[0].cssselect('span.action')[0]
                       .cssselect('a')[0].get('href'))
        pdf = requests.get(url).content
        f = open(book_title + str(chapter) + '.pdf', 'wb+')
        f.write(pdf)
        chapter += 1
        print url
    Download_Book().concatenate_pdf(book_title)
def test_cms_plugins_program_fallback_when_never_published(self):
    """
    The program plugin should render in the fallback language when the program
    page has never been published in the current language.
    """
    # Create a program
    program = ProgramFactory(
        page_title={
            "en": "public program",
            "fr": "programme publique"
        },
        fill_cover={
            "original_filename": "cover.jpg",
            "default_alt_text": "my cover",
        },
    )
    program_page = program.extended_object

    # Create a page to add the plugin to
    page = create_i18n_page({"en": "A page", "fr": "Une page"})
    placeholder = page.placeholders.get(slot="maincontent")
    add_plugin(placeholder, ProgramPlugin, "en", **{"page": program_page})
    add_plugin(placeholder, ProgramPlugin, "fr", **{"page": program_page})

    # Publish only the French version of the program
    program_page.publish("fr")

    # Check the page content in English
    page.publish("en")
    url = page.get_absolute_url(language="en")
    response = self.client.get(url)
    html = lxml.html.fromstring(response.content)

    # The program's full name should be wrapped in a link within an h2
    title = html.cssselect(".program-glimpse__title")[0]
    link = title.cssselect(".program-glimpse__link")[0]
    self.assertEqual(link.text_content().strip(), "programme publique")
    self.assertNotContains(response, "public program")

    # Program's cover should be present
    cover = html.cssselect(".program-glimpse__media")[0]
    self.assertEqual(cover.get("aria-hidden"), "true")
    img = cover.cssselect("img")[0]
    self.assertIsNotNone(
        re.search(
            r"/media/filer_public_thumbnails/filer_public/.*cover\.jpg__300x170",
            img.get("src"),
        ))
def getpage(url, f, count):
    html = geturl(url)
    html = lxml.html.fromstring(html)
    title = html.cssselect('h2')[0].text
    print('第 %d 页(%s)已下载' % (count, title))
    patt = '(%\w+)+'
    f.write(title + '\n\n')
    content = html.cssselect('script')[7].text
    content = content[content.find('unescape') + 10:content.find('\"))')]
    # print(content)
    content = content.replace('%3Cbr%2F%3E%3Cbr%2F%3E', '\n')
    content = content.replace('%', '\\').encode()
    content = content.decode('unicode_escape')
    f.write(content + '\n')
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
def current_month_daily_breakdown(session):
    summary_markers = [
        '',
        'Total Usage (GB)',
        'Usage Allowance',
        'Additional Use',
        'Usage',
    ]
    response = session.get(day_to_day_url)
    html = lxml.html.fromstring(response.text)
    rows = html.cssselect(".internetUsageDataContainer .mainSection tr")
    data = []
    summary = []
    for row in rows:
        # Materialise the cells so the row can be indexed and iterated again
        row = [_format_cell_data(td) for td in row.cssselect('td')]
        if row[0] in summary_markers:
            cleaned = [td for td in row if td]
            if len(cleaned):
                summary.append(cleaned)
        else:
            data.append(row)
    return DailyBreakdown(data, summary)
def get_submission_list(self, problem_name):
    self.check_problem_exist(problem_name)
    request = urllib2.Request(url=SITE_PREFIX + 'judge/submission/recent/?problem=' + problem_name)
    response = self.opener.open(request)
    try:
        import lxml.html
    except ImportError:
        print 'lxml library is needed for parsing HTML'
        return
    html = lxml.html.fromstring(unicode(response.read().decode('utf8')))
    context = {}
    fields = ('id', 'problem', 'user', 'language', 'length', 'state', 'stats', 'submitted_on')
    length = {'id': 9, 'problem': 15, 'user': 15, 'language': 5, 'length': 7,
              'state': 15, 'stats': 7, 'submitted_on': 15}
    template = u'%(id)s %(problem)s %(user)s %(language)s %(length)s %(state)s %(stats)s %(submitted_on)s'

    def width(string):
        return sum(1 + (unicodedata.east_asian_width(c) in 'WF') for c in string)

    for tr in html.cssselect('table.submission_list tr'):
        for field in fields:
            element = tr.find_class(field)
            if element:
                context[field] = unicode(element[0].text_content().strip())
            else:
                context[field] = u''
            context[field] = ' ' * (length[field] - width(context[field])) + context[field]
        print template % context
def search(topic):
    url = 'https://lobste.rs/search?what=stories&order=newest&q=' + topic
    response = requests.get(url)
    html = lxml.html.fromstring(response.text)
    for item in html.cssselect('.link a'):
        text = item.text_content()
        print(text)
def download():
    i = 0
    aux = 0
    for i in range(int(MAX_BOOKS)):
        try:
            if not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'jpg')) and \
               not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'png')):
                link = 'http://www.dragonball-multiverse.com/es/page-' + str(i) + '.html'
                source = requests.get(link).content
                html = lxml.html.fromstring(source)
                book_title = html.cssselect('div')[7].cssselect('img')[0].get('src')
                url = urljoin(parent_url, book_title)
                img_format = book_title[-3:]  # file extension caught (.jpg or .png)
                img_file = open(get_url_from_pne(img_temp, 'DBM_', i, img_format), 'wb')
                img_file.write(requests.get(url).content)
                img_file.close()
                time.sleep(0.5)
                print("Downloaded in: " + get_url_from_pne(img_temp, 'DBM_', i, img_format))
            else:
                print("Skipping book number %s\r" % i)
        except Exception as e:
            print(e)
            print("It cannot be downloaded")
            BLACK_LIST.append(i)
            time.sleep(.5)
def __enter__(self):
    # Similar to assertContains(), we verify the status code
    self.test_case.assertEqual(self.response.status_code, self.status_code)

    # TODO consider validating self.response['Content-Type']

    # Parse the response as HTML
    html = lxml.html.fromstring(self.response.content.decode('utf-8'))

    if self.selector is not None:
        # Use cssselect to filter the elements
        elements = html.cssselect(self.selector)

        # Ensure some data exists
        if len(elements) == 0:
            raise SelectorNotFound(
                'No selector matches found for {0}'.format(self.selector)
            )
        return elements

    if self.element_id is not None:
        try:
            return html.get_element_by_id(self.element_id)
        except KeyError:
            raise ElementIDNotFound(
                'Element with id, {0}, not present'.format(self.element_id)
            )

    # No filtering defined, return the entire parsed HTML document
    return html
def spiderboy(url):
    page = requests.get(url, headers=headers)
    base_url = '/'.join(page.url.split('/')[:-4])
    html = lxml.html.fromstring(page.content.decode('gbk', 'ignore'))
    items = html.cssselect('.anewsnotitle')
    for item in items:
        car_link = base_url + item.cssselect('.newstext h3 a')[0].get('href')
        logger.info('link: ' + car_link)
        try:
            Car.objects.get(car_link=car_link)
            pass
        except Exception as e:
            car_title = str(item.cssselect('.newstext h3 a')[0].text_content())
            logger.info('title: ' + car_title)
            car_icon = base_url + item.cssselect('.newspic a img')[0].get('src')
            logger.info('icon_url: ' + car_icon)
            car_des = str(item.cssselect('.newstext p')[0].text_content())
            logger.info('get des')
            innerpage = requests.get(car_link, headers=headers)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk', 'ignore'))
            try:
                next = innerhtml.cssselect('.cpagesizebottom a')[-1]
                if next.text_content() == u'下一页':
                    mid_body = nextPage(innerhtml, base_url)
                else:
                    mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                mid_body = cleaner.clean_html(mid_body)
            except:
                mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                mid_body = cleaner.clean_html(mid_body)
            pattern = re.compile(r'(?:src|href)="([^http].*?[\.jpg])"', re.VERBOSE)
            test = pattern.findall(mid_body)
            test = list(set(test))
            for i in test:
                mid_body = mid_body.replace(i, base_url + i)
            car_body = mid_body
            logger.info('body: catch')
            car_cate = category_select(url, catechoice)
            logger.info('category: ' + car_cate)
            ca = Car(car_title=car_title, car_des=car_des, car_link=car_link,
                     car_body=car_body, car_icon=car_icon, car_source="neeu",
                     car_cate=car_cate)
            ca.save()
            logger.info('done one')
def selectMany(self, css):
    html = self.get_html()
    selection = html.cssselect(css)
    if not len(selection):
        self.fail('No elements matching: %r' % css)
    self.state['selection'] = selection
    return selection
async def main():
    try:
        all_urls = ALL_URLS.read_text().split()
    except FileNotFoundError:
        all_urls = []

    async with aiohttp.ClientSession(raise_for_status=True) as session:
        async with session.get("https://p.eagate.573.jp/game/sdvx/") as response:
            html = lxml.html.fromstring(await response.text())
        urls = [
            img.attrib["data-original"]
            for img in html.cssselect("div.news_box img")
        ]
        if urls:
            for url in urls:
                if WEBHOOK and url not in all_urls:
                    print(url)
                    all_urls.append(url)
                    async with session.get(url) as response:
                        image_data = await response.read()
                    data = aiohttp.FormData()
                    data.add_field("file", image_data, filename=URL(url).name)
                    await session.post(WEBHOOK, data=data)
            ALL_URLS.write_text("".join(f"{url}\n" for url in all_urls))
            CURRENT_URLS.write_text("".join(f"{url}\n" for url in urls))
def parse_article_oneline_list(url, text=None):
    '''
    XPATH: //div[@class="memo_list"]/form[@id="listForm"]/ul/dl

    source:
        <dl>
          <dt class="profile_block reply_size">
            <div id="pimgWrap_0_6475" class="fl">
              <img src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d1.bmp"
                   width="32" height="32" alt=""
                   onmouseover="MemoFormController.showProfileLayer(this, 'pimg_0_6475');"
                   onmouseout="MemoFormController.hideProfileLayer();">
              <img id="pimg_0_6475"
                   src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d3.bmp"
                   width="150" height="150" style="display: none;" alt="프로필 이미지" />
            </div>
          </dt>
          <dd class="content_block ">
            <div id="memoViewer_0_6475" class="content_viewer ">
              <p class="nickname">
                <a href="#" onclick="showSideView(this, 'Zo6UMXQoclc0', '', 'Ellen[\uC774\uACBD\uBBFC]'); return false;" class="b">Ellen[이경민]</a>
                <span class="txt_sub num">12.07.11. 09:45</span>
              </p>
              <div class="content_memo">
                7/15(일) 오후 2시에 강서구 등촌동 저희집에서 집들이 할께요! <br />
                참석 가능하시면 댓글 달아주세요~ ㅎㅎ좀 멀긴하지만 맛있는 음식과 술이 기다리고 있을거예요~ ^^
                <img src="http://i1.daumcdn.net/cafeimg/cf_img2/img_blank2.gif" width="8" height="12" alt="새글" class="icon_new" />
                <b>
                  <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="txt_point">
                    [<span id="commentReplyCount_0_6475" class="txt_point">8</span>]
                  </a>
                </b>
              </div>
            </div><!-- content_viewer -->
            <div id="memoModify_0_6475" class="content_modify"></div>
            <div id="memoBtns_0_6475" class="memo_btns p11">
              <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="p11">답글</a>
            </div>
          </dd><!-- end content_block -->
        </dl>
    '''
    _type = namedtuple('BriefArticleInfo',
                       'article_num title post_date author path url'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)
    html = lxml.html.fromstring(text)

    results = []
    articles = html.cssselect('div.memo_list form#listForm ul dl')
    for dl in articles:
        content = dl.cssselect('div.content_viewer div.content_memo')[0].xpath('child::text()')
        nick = dl.cssselect('div.content_viewer p.nickname a')[0]
        date = dl.cssselect('div.content_viewer p.nickname span.txt_sub.num')[0]
        article_num = dl.cssselect('div.content_viewer')[0].attrib['id'].rsplit('_', 1)[-1]

        results.append(_type(
            int(article_num),
            "\n".join(content).strip(),
            date.text.strip(),
            nick.text.strip(),
            None,
            None,
        ))
    return results
def fetch_elements(html, lo_name, locate_source):
    """.. :py:method:: Fetch elements according to the object (locator) repository"""
    path = locate_source[lo_name]
    if isinstance(path, tuple):
        if len(path) == 3:
            return html.cssselect(path[0])[path[1]:path[2]]
        elif len(path) == 2:
            return html.cssselect(path[0])[path[1]]
        else:
            raise Exception
    elif isinstance(path, basestring):
        return html.cssselect(path)
    else:
        raise Exception
def run(self):
    base_cookie = self.get_base_cookie()
    req = requests.get(self.ENDPOINT, headers=self.HEADERS, cookies=base_cookie,
                       verify=False, allow_redirects=False)
    if req.status_code != 200:
        print req.status_code
        print req.content
        print 'Error'
        return

    html = lxml.html.fromstring(req.content)
    categs = html.cssselect('.tabel_categorie')
    for categ in categs:
        a = categ.cssselect('a')[0]
        name = a.text
        name = name.replace('Calorii ', '').lower()
        categories = name.split(" ")
        if len(categories) > 1:
            category_name = "{}-{}".format(categories[0], categories[1])
        else:
            category_name = categories[0]
        url = a.get('href')
        req = requests.get(url, headers=self.HEADERS, cookies=base_cookie,
                           verify=False, allow_redirects=False)
        self._process_category_table(req.content, category_name)
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('table.cassetteitem_other > tbody > tr > td:nth-child(9) > a'):
        url = a.get('href')
        yield url
def getpoll():
    pageURL = "http://polldaddy.com/poll/7575405/"
    html = lxml.html.parse(pageURL).getroot()
    votebutton = html.cssselect('.vote-button')
    datavote = votebutton[0].get("data-vote")
    datadict = ast.literal_eval(datavote)
    return datadict
def parse_cafe_inner_url_from_official(url):
    '''Parse cafe official url and return real url.

    <frame name="down" id="down" src="http://cafe986.daum.net/_c21_/home?grpid=ccJT"
           width="100%" height="100%" frameborder="0" marginwidth="0" marginheight="0"
           title="카페 메인 프레임">
    '''
    #CAFE_HOME_PATTERN = re.compile(u'''
    #    # get src of frame#down
    #    <frame [^>]*
    #    (
    #        (id="down" [^>]*src="([^"]*)")
    #        |
    #        (src="([^"]*)" [^>]*id="down")
    #    )
    #    [^>]*>
    #''', re.S | re.X)
    site1 = urlread(url, timeouts=ARTICLE_TIMEOUTS)
    #match = CAFE_HOME_PATTERN.search(site1)
    #if not match:
    #    raise Exception("parse error")
    #url = match.group(3) or match.group(5)
    html = lxml.html.fromstring(site1)
    frame = html.cssselect('frame#down')[0]
    url = frame.get('src')
    return url
def parse_article_album_list(url, text=None):
    '''
    parse article phone list and result list of article information as a tuple:
        (article_num, title, post_date, author, path, url)
    '''
    _type = namedtuple('BriefArticleInfo',
                       'article_num title post_date author path url fldid grpid'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)
    html = lxml.html.fromstring(text)

    articles = html.cssselect('div.albumListBox li')

    def _parse(li):
        subject = li.cssselect('dd.subject a')[0]
        author = li.cssselect('dd.nick a')[0]
        article_num, post_date = li.cssselect('dd.txt_sub.p11 span.num')
        href = subject.get('href')
        path = unescape(href)
        query_dict = urlparse.parse_qs(urllib.splitquery(path)[-1])
        return _type(
            int(article_num.text.strip()),
            subject.text.strip(),
            post_date.text.strip(),
            author.text.strip(),
            href,
            get_domain(url, href),
            query_dict.get('fldid', [None])[0],
            query_dict.get('grpid', [None])[0],
        )

    return [_parse(li) for li in articles if not li.cssselect('div.blank_thumb')]
def parsing_list_product(request):
    html = get_html(request)
    a_list = html.cssselect('ul.sitemap>li>a')
    for a in a_list:
        href = a.get('href')
        url_product.append(href)
        log.info('Ссылка на товар:%s', href)
def parsing_category(request):
    global categories_data, filename_category
    parent_category_one = 0
    parent_category_two = 0
    html = get_html(request)
    ul = html.cssselect('ul.sitemap')[0]
    li_list = ul.cssselect('li')
    for li in li_list:
        li_class = li.get('class')
        if li_class == 'level-0':
            parent_category = 0
            a = li.cssselect('a')[0]
            category_name = a.text
            category_url = a.get('href')
            temp = [index_add(), parent_category, category_name, category_url]
            categories_data.append(temp)
            parent_category_one = categories_data[-1][0]
            log.info(
                'PARENT_CATEGORY_ID: %s, CATEGORY_NAME: %s, CATEGORY_URL: %s',
                parent_category, category_name, category_url)
        elif li_class == "level-1":
            a = li.cssselect('a')[0]
            parent_category = parent_category_one
            category_name = a.text
            category_url = a.get('href')
            temp = [index_add(), parent_category, category_name, category_url]
            categories_data.append(temp)
            parent_category_two = categories_data[-1][0]
        elif li_class == "level-2":
            parent_category = parent_category_two
            a = li.cssselect('a')[0]
            category_name = a.text
            category_url = a.get('href')
            temp = [index_add(), parent_category, category_name, category_url]
            categories_data.append(temp)

    with open(filename_category, "w", newline="", encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(categories_data)
def test_testing_topic_announce(self):
    """Checks that announcement-type topics don't have sorting options"""
    # Create posts for announcement topics
    forum = ForumFactory()
    PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))
    PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))

    user = UserFactory()
    assign_perm("can_read_forum", user, forum)
    self.client.force_login(user)

    response = self.client.get(f"/forum/forum/{forum.slug}-{forum.pk}/")
    html = lxml.html.fromstring(response.content)

    # Select the header block of the announcement block, the first block
    announce_block = str(
        etree.tostring(html.cssselect(".topiclist .card-header")[0]))

    # Check that announce_block is about announcements and not topics
    self.assertIn("Announcements", announce_block)
    self.assertNotIn("Topics", announce_block)
    self.assertIn("Replies", announce_block)
    self.assertIn("Views", announce_block)
    self.assertIn("Last post", announce_block)
    # There is no sortable information
    self.assertNotIn("sortable sorted", announce_block)
    # There is no column with a sorting link
    self.assertNotIn("<a href=", announce_block)
    # There is no sorting toggle
    self.assertNotIn("Toggle sorting", announce_block)
def get_calendar(data: dict, limit: int = 31) -> str:
    result = []
    now = datetime.datetime.now(tz=TZ)
    for days in range(limit):
        date = now + datetime.timedelta(days=days)
        temp = data.get(date.year, {})
        temp = temp.get(date.month, {})
        temp = temp.get(date.day, {})
        if not temp:
            break
        info = []
        for key, value in temp.items():
            # "qdhd": celebration events
            # "tdz": clan battles
            # "tbhd": special events
            # "jqhd": story events
            # "jssr": character birthdays
            if value and key in ['qdhd', 'tdz', 'tbhd', 'jqhd', 'jssr']:
                html = lxml.html.fromstring(value)
                nodes = html.cssselect('.cl-t')
                for node in nodes:
                    info.append(node.text)
        msg = '\n'.join(info)
        if not msg:
            continue
        result.append('\n'.join(['==========', date.strftime('%Y-%m-%d'), msg]))
    return '\n'.join(result)
def get_hrefs(data, css_select):
    # soup = BeautifulSoup(data,'lxml')
    # ele = soup.select(css_select)
    html = lxml.html.fromstring(data)
    ele = html.cssselect(css_select)
    hrefs = [e for e in ele]
    cell = (hrefs[0].text, hrefs[1].get('href'))
    return cell
def _get_metadata_citation(html, format=None):
    if format is None:
        format = 'chicago'
    citation = html.cssselect("div.citation#cite_%s" % format)
    citation = citation[0].text_content()
    # Un-escape angle brackets left in the citation text
    return citation.replace('&lt;', '<').replace('&gt;', '>')
def spiderboy(url):
    page = requests.get(url)
    html = lxml.html.fromstring(page.content.decode('gbk'))
    items = html.cssselect('#ATitle')
    for item in items:
        car_link = item.get('href')
        # logger.info('link: '+car_link)
        try:
            Car.objects.get(car_link=car_link)
            # logger.info('already have ' + car_link)
            pass
        except:
            car_title = str(item.text_content())
            # logger.info('title: '+car_title)
            car_icon = 'http://x.autoimg.cn/news/index/img/20110801/logo_new.png'
            car_des = ''
            innerpage = requests.get(car_link)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))
            try:
                next = base_url + innerhtml.cssselect('.page-item-readall')[0].get('href')
            except:
                next = None
            if next:
                innerpage = requests.get(next)
                innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))
            try:
                innerhtml.cssselect('.diversion-box')[0].drop_tree()
            except:
                pass
            try:
                innerhtml.cssselect('.btn.font-normal')[0].drop_tree()
            except:
                pass
            article = innerhtml.cssselect('#articleContent')[0]
            mid_body = lxml.html.tostring(article, encoding=unicode)
            mid_body2 = cut(mid_body)
            r = re.compile(r'<a>|</a>')
            mid_body3 = cleaner.clean_html(mid_body2)
            car_body = mid_body3
            car_body = r.sub('', car_body)
            ca = Car(car_title=car_title, car_des=car_des, car_link=car_link,
                     car_body=car_body, car_icon=car_icon, car_source="autohome",
                     car_cate='car')
            ca.save()
def _parseResponse(self, queryText, url, html):
    for userEntry in html.cssselect('table.NoteDivWidth'):
        userInfo = userEntry.cssselect('tr table')[0]
        nickname = userInfo.cssselect('tr th')[0].text
        exchangeUrl = userInfo.cssselect('tr td')[-1].cssselect('a')[0].attrib['href']
        if any(source in exchangeUrl.lower() for source in self.sourceSubstringsToExclude):
            yield None
        else:
            shopFound = not exchangeUrl.endswith('.html')
            if shopFound:
                cardSource = exchangeUrl
                self.logger.warning('Found new shop: {}', exchangeUrl)
            else:
                cardSource = self.getTitle() + '/' + nickname

            userCards = userEntry.cssselect('table.CardInfo')
            if len(userCards) > 0:
                self.estimatedCardsCount += len(userCards) - 1

            for cardInfo in userCards:
                cardName = cardInfo.cssselect('th.txt0')[0].text
                cardUrl = exchangeUrl
                if not shopFound:
                    cardUrl += '?Title={}'.format(cardName)

                idSource = cardInfo.cssselect('nobr.txt0')[0].text
                cardId = int(re.match(r'[^\d]*(\d+)[^\d]*', idSource).group(1)) if idSource else None

                price = None
                priceSource = cardInfo.cssselect('td.txt15')[-1].cssselect('b')
                if len(priceSource) > 0:
                    possiblePrice = priceSource[-1].text
                    if possiblePrice is not None:
                        possiblePrice = possiblePrice.split()[0]
                        if possiblePrice.isdigit():
                            price = decimal.Decimal(possiblePrice)

                foilness = len(cardInfo.cssselect('#FoilCard')) > 0

                language = None
                languageSource = cardInfo.cssselect('td.txt15')[0].cssselect('font')
                if len(languageSource) > 0:
                    language = languageSource[0].text

                setSource = cardInfo.cssselect('#table0 td img')[0].attrib['alt']
                yield {
                    'id': cardId,
                    'name': cardName,
                    'foilness': foilness,
                    'set': setSource,
                    'language': language,
                    'price': price,
                    'currency': core.utils.Currency.RUR,
                    'count': int(cardInfo.cssselect('td.txt15 b')[0].text.split()[0]),
                    'source': self.packSource(cardSource, cardUrl),
                }
def _download_from_list_page(html, term):
    for title_node in html.cssselect(".tlistname a"):
        if term in title_node.text_content():
            download_node = title_node.xpath("../..")[0].cssselect(".tlistdownload a")[0]
            torrent_url = download_node.get("href")
            torrent = urlopen(torrent_url).read()
            return torrent
    logger.error("Search results from %s does not contain that term -- "
                 "there probably is no torrent by that name.", term)
    return False
def getBookInfo(str):
    html = lxml.html.fromstring(str)
    book = {}
    title = html.cssselect('title')[0].text
    book['title'] = title[0:title.find(',')]
    book['author'] = html.cssselect('a[href^="/book/author/"]')[0].get('alt')
    print("书名:", book['title'])
    print("作者:", book['author'])

    def getlink(a):
        link = a.get('href')
        return ("http://www.tadu.com" + link)

    def geta(div):
        return div.cssselect('a')[0]

    chapter = list(map(geta, html.cssselect('div.chapter_t')))
    book['links'] = list(map(getlink, chapter))
    print("共 %d 页" % len(book['links']))
    return book