import random
import time

import requests
import lxml.html as lh


def get_and_follow_links(initial_link, num_of_returned_links):
    # Build a simple Python dictionary that has the URL as key and all of the
    # text grabbed from that page as value.
    sites = {}
    r = requests.get(initial_link)
    tree = lh.fromstring(r.text)
    # Using the more complicated XPath query shown in the second function.
    text = tree.xpath('string(/html/head/title)')
    text += '\n'
    text += tree.xpath('string(//body/*[not(self::script)])')
    text = ' '.join(text.split())
    # Encode in UTF-8 so non-ASCII characters render safely.
    text = text.encode('utf-8')
    sites[initial_link] = text
    links = tree.xpath('//a/@href')
    for l in links[0:num_of_returned_links]:
        r = requests.get(l)
        tree = lh.fromstring(r.text)
        text = tree.xpath('string(/html/head/title)')
        text += '\n'
        text += tree.xpath('string(//body/*[not(self::script)])')
        text = ' '.join(text.split())
        text = text.encode('utf-8')
        sites[l] = text
        # Take a quick break so the websites don't get annoyed at you for
        # sending too many requests.
        time.sleep(random.randint(5, 15))
    for i, x in enumerate(sites.items()):
        print(i, x)
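# A short usage sketch for the crawler above; the URL is a placeholder and the
# second argument caps how many of the page's links are followed.
if __name__ == '__main__':
    get_and_follow_links('https://example.com/', 5)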
def download_sgml_doc(self, info, html_url, current_version=None):
    should_update_xml = False

    s = self.open_url(html_url, self.doc_type)
    doc = html.fromstring(s)
    # Find the link to the SGML document.
    el = doc.xpath(".//a[contains(., 'Rakenteinen asiakirja')]")
    if len(el) != 1:
        # Retry once with a fresh (non-cached) copy of the page.
        self.http.nuke_cache(html_url, self.doc_type)
        s = self.open_url(html_url, self.doc_type)
        doc = html.fromstring(s)
        el = doc.xpath(".//a[contains(., 'Rakenteinen asiakirja')]")

    if current_version:
        ver_el = doc.xpath(".//div[@class='doclist-items']//div[@class='header']/span")
        assert len(ver_el) == 1, "Version element not found"
        m = re.search(r'([0-9]\.[0-9])', ver_el[0].text)
        assert m, "Version number not found (%s)" % ver_el[0].text
        doc_version = m.groups()[0]
        if doc_version != current_version:
            should_update_xml = True
            self.logger.debug("SGML document updated (version %s, stored version %s)"
                              % (doc_version, current_version))

    if len(el) != 1:
        year = info['id'].split('/')[1]
        if int(year) <= 1999:
            return None
        raise ParseError("No link to SGML file found: %s" % html_url)

    doc.make_links_absolute(html_url)
    link = el[0].attrib['href']

    fname = link.split('/')[-1]
    m = re.match(r'^([a-z0-9_]+)\.sgm$', fname)
    if not m:
        raise ParseError("SGML filename invalid")
    fname_base = m.groups()[0]
    stored_sgml_fn = '%s/%s' % (self.sgml_storage, fname)

    if should_update_xml or not os.path.exists(stored_sgml_fn):
        self.logger.debug("downloading SGML file")
        try:
            s = self.open_url(link, self.doc_type)
        except HTTPError:
            # Retry after nuking the cache.
            self.http.nuke_cache(html_url, self.doc_type)
            self.open_url(html_url, self.doc_type)
            s = self.open_url(link, self.doc_type)
        f = open(stored_sgml_fn, 'w')
        f.write(s)
        f.close()

    xml_fn = '%s/%s.xml' % (self.xml_storage, fname_base)
    if should_update_xml or not os.path.exists(xml_fn):
        ret = os.spawnv(os.P_WAIT, self.sgml_to_xml,
                        [self.SGML_TO_XML, stored_sgml_fn, xml_fn])
        if ret:
            raise ParseError("SGML-to-XML conversion failed")

    return xml_fn
def processGithub(author, url, begin, end):
    assert author['username'] is not None
    githubURL = url + '/pulls?q=is:pr+author:%s+is:%s+updated:%s..%s'
    # Example date format: begin = '2014-11-01', end = '2014-12-01'

    closedIssuesURL = githubURL % (author['username'], 'closed', begin, end)
    page = requests.get(closedIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="issue-title-link js-navigation-open"]')
    closedIssues = []
    for issue in issueDetails:
        issueURL = 'https://github.com' + issue.get('href')
        issueTitle = issue.text.strip()
        closedIssues.append((issueURL, issueTitle))

    openIssuesURL = githubURL % (author['username'], 'open', begin, end)
    page = requests.get(openIssuesURL)
    tree = html.fromstring(page.text)
    openIssues = []
    issueDetails = tree.xpath('//*[@class="issue-title-link js-navigation-open"]')
    for issue in issueDetails:
        issueURL = 'https://github.com' + issue.get('href')
        issueTitle = issue.text.strip()
        openIssues.append((issueURL, issueTitle))

    return (closedIssues, openIssues)
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO.
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n' +
                                      'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n' +
                                      'URL was: ' + request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        # The <alias> element is optional.
        pass
def main(): """ Convierte la documentación de Trello en una estructura de datos y la imprime por salida estándar. """ ep = requests.get(TRELLO_API_DOC).content root = html.fromstring(ep) links = root.xpath('//a[contains(@class, "reference internal")]/@href') pages = [requests.get(TRELLO_API_DOC + u) for u in links if u.endswith('index.html')] endpoints = [] for page in pages: root = html.fromstring(page.content) sections = root.xpath('//div[@class="section"]/h2/..') for sec in sections: ep_html = etree.tostring(sec).decode('utf-8') ep_text = html2text(ep_html).splitlines() match = EP_DESC_REGEX.match(ep_text[0]) if not match: continue ep_method, ep_url = match.groups() ep_text[0] = ' '.join([ep_method, ep_url]) ep_doc = b64encode(gzip.compress('\n'.join(ep_text).encode('utf-8'))) endpoints.append((ep_method, ep_url, ep_doc)) print(yaml.dump(create_tree(endpoints)))
def test_edit_post(self):
    self.login_client()
    edit_post_url = reverse('pybb:edit_post', kwargs={'pk': self.post.id})
    response = self.client.get(edit_post_url)
    self.assertEqual(response.status_code, 200)
    tree = html.fromstring(response.content)
    values = dict(tree.xpath('//form[@method="post"]')[0].form_values())
    values['body'] = 'test edit'
    response = self.client.post(edit_post_url, data=values, follow=True)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(Post.objects.get(pk=self.post.id).body, 'test edit')
    response = self.client.get(self.post.get_absolute_url(), follow=True)
    self.assertContains(response, 'test edit')

    # Check the admin form
    self.user.is_staff = True
    self.user.save()
    response = self.client.get(edit_post_url)
    self.assertEqual(response.status_code, 200)
    tree = html.fromstring(response.content)
    values = dict(tree.xpath('//form[@method="post"]')[0].form_values())
    values['body'] = 'test edit'
    values['login'] = '******'
    response = self.client.post(edit_post_url, data=values, follow=True)
    self.assertEqual(response.status_code, 200)
    self.assertContains(response, 'test edit')
def processRietveld(author, guid, begin, end):
    rietveldURL = ('https://codereview.chromium.org/search?closed=%s&owner=%s'
                   '&repo_guid=%s&modified_after=%s&modified_before=%s&limit=30')
    assert author is not None and guid is not None and begin is not None and end is not None

    if isinstance(author['email'], list):
        email = author['email'][0]
    else:
        email = author['email']

    # closed=2 selects closed issues, closed=3 open ones.
    closedIssuesURL = rietveldURL % ('2', email, guid, begin, end)
    page = requests.get(closedIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="subject"]/a/text()')
    closedIssues = []
    # The subject cell holds (issue id, issue title) pairs, hence the step of 2.
    for i in range(0, len(issueDetails), 2):
        issueURL = 'https://codereview.chromium.org/' + issueDetails[i]
        issueTitle = issueDetails[i + 1].strip()
        closedIssues.append((issueURL, issueTitle))

    openIssuesURL = rietveldURL % ('3', email, guid, begin, end)
    page = requests.get(openIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="subject"]/a/text()')
    openIssues = []
    for i in range(0, len(issueDetails), 2):
        issueURL = 'https://codereview.chromium.org/' + issueDetails[i]
        issueTitle = issueDetails[i + 1].strip()
        openIssues.append((issueURL, issueTitle))

    return (closedIssues, openIssues)
def test_get_imdblink(self):
    html = u"""
    <div>
      <div class="wikibase-statementview-mainsnak">
        <div>
          <div class="wikibase-snakview-value">
            <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
              tt0433664
            </a>
          </div>
        </div>
      </div>
    </div>
    """
    html_etree = fromstring(html)
    imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')

    # Deliberately malformed href attribute to exercise the parser.
    html = u"""
    <div>
      <div class="wikibase-statementview-mainsnak">
        <div>
          <div class="wikibase-snakview-value">
            <a class="wb-external-id"
               href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994"">
              nm4915994
            </a>
          </div>
        </div>
      </div>
    </div>
    """
    html_etree = fromstring(html)
    imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
    self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)
def test_get_geolink(self):
    html = u"""
    <div>
      <div class="wikibase-statementview-mainsnak">
        <div>
          <div class="wikibase-snakview-value">
            60°N, 40°E
          </div>
        </div>
      </div>
    </div>
    """
    html_etree = fromstring(html)
    geolink = wikidata.get_geolink(html_etree)
    self.assertIn('https://www.openstreetmap.org/', geolink)
    self.assertIn('lat=60&lon=40', geolink)

    html = u"""
    <div>
      <div class="wikibase-statementview-mainsnak">
        <div>
          <div class="wikibase-snakview-value">
            34°35'59"S, 58°22'55"W
          </div>
        </div>
      </div>
    </div>
    """
    html_etree = fromstring(html)
    geolink = wikidata.get_geolink(html_etree)
    self.assertIn('https://www.openstreetmap.org/', geolink)
    self.assertIn('lat=-34.59', geolink)
    self.assertIn('lon=-58.38', geolink)
def get_htmldoc(url, encode='utf8', timeout=60):
    if url.startswith('file://'):
        # Read as bytes so the explicit decode below works.
        with open(url[7:], 'rb') as f:
            content = f.read()
        content = content.decode(encode, 'ignore')
        try:
            content = cleaner.clean_html(content)
        except Exception:
            pass
        doc = fromstring(content)
        return doc

    code, data = _getcontent(url, timeout=timeout)
    # Prefer the encoding reported by _getcontent, if any.
    if code:
        encode = code
    codedata = data.decode(encode, 'ignore')
    try:
        codedata = cleaner.clean_html(codedata)
    except Exception:
        print('Error:', url)
        log.exception('unexpected error:%s(%s)' % (url, encode))
    doc = fromstring(codedata)
    return doc
def get_posts_list(self):
    profile_page = self.user_url + '/profile'
    r = self.s.get(profile_page)
    self.output_html(text=r.text, filename='profilepage')

    # Step 2: open the blog tab.
    rizhi_tab = profile_page + '?v=blog_ajax&undefined'
    r = self.s.get(rizhi_tab)
    self.output_html(text=r.text, filename='rizhi_tab')

    # Step 3: follow the first blog post.
    first_blog_url = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].attrib['href']
    first_blog_title = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].text
    r = self.s.get(first_blog_url)
    print("Generating 《%s》" % first_blog_title)
    self.output_html(text=r.text, filename='0.' + first_blog_title)

    # Detect the end of the list from the status code or a page element.
    for i in range(1, 10000):
        try:
            next_blog_url = html.fromstring(r.text).cssselect(".a-nav .float-right a")[0].attrib['href']
            next_blog_title = html.fromstring(r.text).cssselect(".a-nav .float-right a")[0].text.lstrip('较旧一篇:')
            r = self.s.get(next_blog_url)
            print("Generating 《%s》" % next_blog_title)
            # Strip characters that are not allowed in filenames; map ':' to '-'.
            next_blog_title = re.sub(r'[<>"*\\/|?]', '', next_blog_title)
            next_blog_title = re.sub(':', '-', next_blog_title)
            self.output_html(text=r.text, filename=str(i) + '.' + next_blog_title)
        except Exception:
            print("Unexpected error:", sys.exc_info()[0])
            print('Exiting program...')
            break
def parse_updates_html_str(html_str):
    course_upd_collection = []
    if html_str == '':
        return {"updates": course_upd_collection}
    try:
        course_html_parsed = html.fromstring(html_str)
    except Exception:
        escaped = django.utils.html.escape(html_str)
        course_html_parsed = html.fromstring(escaped)

    if course_html_parsed.tag == 'section':
        for index, update in enumerate(course_html_parsed):
            if len(update) > 0:
                content = _course_info_content(update)
                computed_id = len(course_html_parsed) - index
                payload = {
                    "id": computed_id,
                    "date": update.findtext("h2"),
                    "content": content
                }
                course_upd_collection.append(payload)

    return {"updates": course_upd_collection}
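# A small illustrative call; the HTML is a made-up example of the expected
# <section> shape, and _course_info_content comes from the surrounding module.
example = ('<section><article><h2>March 3</h2>'
           '<p>Midterm next week.</p></article></section>')
print(parse_updates_html_str(example))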
def __init__(self, email, password):
    """
    Initiates the session and logs in with the provided credentials.

    :param email:
    :param password:
    :return:
    """
    self.session_request = requests.session()
    result = self.session_request.get(self.login_url)
    login_page = html.fromstring(result.text)
    token = list(set(login_page.xpath("//input[@name='fkey']/@value")))[0]
    self.credentials = {
        "email": email,
        "password": password,
        "fkey": token
    }
    result = self.session_request.post(
        self.login_url,
        data=self.credentials,
        headers=dict(referer=self.login_url)
    )
    self.main_page = html.fromstring(result.content)
    self.page_title = str(self.main_page.xpath("//title/text()")[0])
    if self.page_title == "Stack Overflow":
        print("Login Successful\n")
        self.questions = self.main_page.xpath("//div[@id='question-mini-list']")[0]
    else:
        print("Invalid Credentials\n")
def get_player_stats(site, headers):
    page1 = requests.get(site)
    tree1 = html.fromstring(page1.text)
    stats1 = tree1.xpath('//td[@align="right"]')
    stats1 = [x.text for x in stats1]
    stats1 = [xnum(x, 0) for x in stats1]
    names1 = tree1.xpath('//td[@align="left"][@class=" highlight_text"]/a/text()')
    teams1 = tree1.xpath('//td[@align="left"]/a[starts-with(@href,"/teams")]/text()')

    # Walk the paginated results in offsets of 100.
    text_addon = "&offset="
    seq_addon = range(100, 401, 100)
    for a in seq_addon:
        page_temp = requests.get(site + text_addon + str(a))
        tree_temp = html.fromstring(page_temp.text)
        stats_temp = tree_temp.xpath('//td[@align="right"]')
        stats_temp = [x.text for x in stats_temp]
        stats_temp = [xnum(x, 0) for x in stats_temp]
        stats1.extend(stats_temp)
        names_temp = tree_temp.xpath('//td[@align="left"][@class=" highlight_text"]/a/text()')
        teams_temp = tree_temp.xpath('//td[@align="left"]/a[starts-with(@href,"/teams")]/text()')
        names1.extend(names_temp)
        teams1.extend(teams_temp)

    # 26 stat columns per player; integer division keeps the shape integral.
    stats1 = np.array(stats1).reshape((len(stats1) // 26, 26))
    stats1 = pd.DataFrame(stats1, columns=headers)
    stats1['name'] = names1
    stats1['team'] = teams1
    return stats1
def ShowCartoons(title, url, page_count):
    oc = ObjectContainer(title1=title)
    thisurl = url
    thisletter = url.split("=", 1)[1]
    page = scraper.get(BASE_URL + '/CartoonList' + url + '&page=' + page_count)
    page_data = html.fromstring(page.text)
    for each in page_data.xpath("//tr/td[1]"):
        content = HTML.ElementFromString(each.xpath("./@title")[0])
        url = content.xpath("./div/a[@class='bigChar']/@href")[0]
        title = content.xpath("./div/a[@class='bigChar']/text()")[0].strip()
        thumbhtml = scraper.get(BASE_URL + url)
        page_html = html.fromstring(thumbhtml.text)
        thumb = page_html.xpath("//link[@rel='image_src']/@href")[0]
        Log(thumb)
        oc.add(DirectoryObject(
            key=Callback(ShowEpisodes, title=title, url=url),
            title=title,
            thumb=thumb
        ))
    oc.add(NextPageObject(
        key=Callback(ShowCartoons, title=thisletter.upper(), url=thisurl,
                     page_count=int(page_count) + 1),
        title="More...",
        thumb=R(ICON_NEXT)
    ))
    return oc
def cmd(send, msg, args):
    """Gets a man page.

    Syntax: {command} [section] <command>
    """
    parser = arguments.ArgParser(args['config'])
    parser.add_argument('section', nargs='?')
    parser.add_argument('command')
    try:
        cmdargs = parser.parse_args(msg)
    except arguments.ArgumentException as e:
        send(str(e))
        return
    if cmdargs.section:
        html = get('http://linux.die.net/man/%s/%s' % (cmdargs.section, cmdargs.command))
        short = fromstring(html.text).find('.//meta[@name="description"]')
        if short is not None:
            short = short.get('content')
            send("%s -- http://linux.die.net/man/%s/%s" % (short, cmdargs.section, cmdargs.command))
        else:
            send("No manual entry for %s in section %s" % (cmdargs.command, cmdargs.section))
    else:
        # Man sections run 1-8.
        for section in range(1, 9):
            html = get('http://linux.die.net/man/%d/%s' % (section, cmdargs.command))
            if html.status_code == 200:
                short = fromstring(html.text).find('.//meta[@name="description"]')
                if short is not None:
                    short = short.get('content')
                    send("%s -- http://linux.die.net/man/%d/%s" % (short, section, cmdargs.command))
                    return
        send("No manual entry for %s" % cmdargs.command)
def extract_info(url, requester):
    response = retrieve(url, requester)
    if response:
        tree = html.fromstring(response.content)
        members = itertools.chain(parse(tree, MXPATH))
        next = parse(tree, NXPATH)[0]
        while next:
            response = retrieve(HOST + next, requester)
            if response:
                tree = html.fromstring(response.content)
                members = itertools.chain(members, parse(tree, MXPATH))
                next = parse(tree, NXPATH)[0]
            else:
                break
        return members
    else:
        return itertools.chain()
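# A short usage sketch, assuming HOST and `requester` are the module-level
# values that retrieve() expects; '/members' is a placeholder path. The
# function returns an itertools.chain, so materialize it before reuse.
members = list(extract_info(HOST + '/members', requester))
print(len(members))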
def crawler(url_ip):
    global eqid, counter, serial, body_list
    print("Starting Crawler Service for: " + url_ip)
    url = "http://" + url_ip + "/cgi-bin/dynamic/printer/config/reports/deviceinfo.html"
    urleqid = "http://" + url_ip + "/cgi-bin/dynamic/topbar.html"
    response = requests.get(url)
    tree = html.fromstring(response.text)
    # The XPath sequences below were pulled using the browser's source inspector.
    counter = tree.xpath('//td[contains(p,"Count")]/following-sibling::td/p/text()')
    serial = tree.xpath('//td[contains(p, "Serial")]/following-sibling::td/p/text()')
    counter = counter[0].split(' ')[3]
    serial = serial[0].split(' ')[3]
    responseeqid = requests.get(urleqid)
    treeequid = html.fromstring(responseeqid.text)
    eqid = treeequid.xpath('//descendant-or-self::node()/child::b[contains(., "Location")]/text()')[1].split(' ')[-1]
    # Print the basic data.
    print(" -- equipment id found: " + eqid)
    print(" -- count found: " + counter)
    print(" -- serial found: " + serial)
    body_of_email = ("Equipment ID = " + eqid + "<br>Total Meter Count = " + counter +
                     "<br>Serial Number = " + serial + "<br><br>")
    body_list.append(body_of_email)
    print("Stopping Crawler Service for: " + url_ip)
    return
def get_posts_list(self):
    profile_page = self.user_url + '/profile'
    r = self.s.get(profile_page)
    self.output_html(text=r.text, filename='profilepage')

    # Step 2: open the blog tab.
    rizhi_tab = profile_page + '?v=blog_ajax&undefined'
    r = self.s.get(rizhi_tab)
    self.output_html(text=r.text, filename='rizhi_tab')

    # Step 3: follow the first blog post.
    first_blog_url = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].attrib['href']
    first_blog_title = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].text
    r = self.s.get(first_blog_url)
    if sys.stdout.encoding == 'UTF-8':
        print("Generating 《%s》" % first_blog_title)
    else:
        print('0')
    self.output_html(text=r.text, filename='0.' + first_blog_title)

    # Detect the end of the list from the status code or a page element.
    for i in range(1, 10000):
        next_blog_element = html.fromstring(r.text).cssselect(".a-nav .float-right a")
        if next_blog_element:
            next_blog_url = next_blog_element[0].attrib['href']
        else:
            break  # already the last blog post
        next_blog_title = next_blog_element[0].text.lstrip('较旧一篇:')
        r = self.s.get(next_blog_url)
        if sys.stdout.encoding == 'UTF-8':
            print("Generating 《%s》" % next_blog_title)
        else:
            print(i)
        # Strip characters that are not allowed in filenames; map ':' to '-'.
        next_blog_title = re.sub(r'[<>"*\\/|?]', '', next_blog_title)
        next_blog_title = re.sub(':', '-', next_blog_title)
        self.output_html(text=r.text, filename=str(i) + '.' + next_blog_title)
def __loadPages(self, maxPages):
    response = requests.get(self.baseUrl)
    firstpage = html.fromstring(response.text)
    pagination = firstpage.xpath('//div[@class="pagination loop-pagination"]')[0]
    pagecount = int(max(pagination.xpath('//a[@class="page-numbers"]/text()')))
    pagecount = pagecount if maxPages == 0 else maxPages

    for pagenumber in range(1, pagecount + 1):
        print("Scraping page %d... (%s)" % (pagenumber, self.baseUrl + "/page/%d" % pagenumber))
        if pagenumber == 1:
            page = firstpage
        else:
            response = requests.get(self.baseUrl + "/page/%d" % pagenumber)
            page = html.fromstring(response.text)

        articles = page.xpath("//article")
        for article in articles:
            # Header
            header = article.xpath('.//h1[@class="entry-title"]/a/text()')
            if not header:
                header = article.xpath('.//h1[@class="entry-title"]/a/span/text()')
            post = header[0].upper() + "\n\n"
            # Body
            post += "\n".join(article.xpath('.//div[@class="entry-content"]/p/text()')).strip()
            self.posts.append(post)
def make_ecas_session():
    session = requests.Session()
    data = {"lgid": "en", "action": "gp"}
    res = session.get("http://ted.europa.eu/TED/browse/browseByBO.do")
    res = session.post(
        "http://ted.europa.eu/TED/main/HomePage.do?pid=secured",
        data=data,
        cookies={"lg": "en"},
        allow_redirects=True
    )
    a = html.fromstring(res.content).find('.//div[@id="main_domain"]/a[@title="External"]')
    res = session.get(a.get("href"))
    form = html.fromstring(res.content).find('.//form[@id="loginForm"]')
    data = dict([(i.get("name"), i.get("value")) for i in form.findall(".//input")])
    data["username"] = ECAS_USER
    data["password"] = ECAS_PASSWORD
    data["selfHost"] = "webgate.ec.europa.eu"
    data["timeZone"] = "GMT-04:00"
    res = session.post(form.get("action"), data=data, allow_redirects=True)
    doc = html.fromstring(res.content)
    form = doc.find('.//form[@id="showAccountDetailsForm"]')
    data = dict([(i.get("name"), i.get("value")) for i in form.findall(".//input")])
    res = session.post(form.get("action"), data=data, allow_redirects=True)
    doc = html.fromstring(res.content)
    links = [a for a in doc.findall(".//a") if "redirecting-to" in a.get("href", "")]
    res = session.get(links.pop().get("href"))
    log.info("ECAS Session created.")
    return session
def getFpDkData():
    sleep(randint(2, 5))
    url = "http://www.fantasypros.com/nfl/draftkings-lineup-optimizer.php"
    try:
        response = requests.get(url)
        tree = html.fromstring(response.text)
    except Exception:
        # Retry once before giving up.
        try:
            response = requests.get(url)
            tree = html.fromstring(response.text)
        except Exception:
            print("Couldn't scrape:", url)
            return []

    names = getDkName(tree)
    points = getPlayerPoints(tree)
    salaries = getPlayerSalaries(tree)
    positions = getPlayerPosition(tree)
    opponents = getPlayerOpponent(tree)

    players = []
    for i in range(min(len(names), len(points), len(salaries))):
        player = {}
        player["Name"] = names[i]
        player["Position"] = positions[i][0]
        player["PositionRank"] = positions[i][-1]
        player["dkPoints"] = points[i]
        player["dkSalary"] = salaries[i]
        # player["Opponent"] = opponents[i]
        if player["Position"] != "NR":
            players.append(player)
            print("Scraped Fantasy Pros for DK Points/Salary for:", player["Name"])
    return players
def get_match(match_id):
    f = open("data/reget.json", "a+")
    r = s.get('http://www.oddsportal.com/a/b/c/d-%s/' % match_id)
    tree = html.fromstring(r.text)
    try:
        match = {'match_id': match_id}
        print(match_id)
        match = match_dl.get_match(match)
        name = tree.xpath('//div[@id="col-content"]/h1')[0].text_content().split(' - ')
        match['home'] = name[0]
        match['away'] = name[1]
        match['event'] = get_league_info(r.url)[1:]
        event_request = requests.get('http://www.soccer24.com/match/' + match['match_id'])
        event_tree = html.fromstring(event_request.text)
        phrases = event_tree.xpath('//table[@class="detail"]//a/text()')[0].split(' - ')[1:]
        match['event'] += phrases[::-1]
        f.write(json.dumps(match) + '\n')
    except Exception:
        # Remember failed ids so they can be fetched again later.
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()
def removeWikiDuplicates(rd_file, syn_file):
    with open(rd_file, "r") as f:
        htmls = h.fromstring(f.read())
    rd_pages = htmls.xpath('page')

    with open(syn_file, "r") as f:
        htmls = h.fromstring(f.read())
    syn_pages = htmls.xpath('page')

    rd_titles = []
    for page in rd_pages:
        rd_titles.append(page.xpath("title")[0].text_content())

    syn_titles = []
    for page in syn_pages:
        syn_titles.append(page.xpath("title")[0].text_content())

    # Keep only the synonym pages whose titles do not appear among the redirects.
    sub_syn_inds = []
    for i, title in enumerate(syn_titles):
        if title not in rd_titles:
            sub_syn_inds.append(i)

    # Truncate the file, then rewrite it with the de-duplicated pages.
    open(syn_file, "w").close()
    with open(syn_file, "a") as f:
        for i in sub_syn_inds:
            f.write(h.tostring(syn_pages[i], encoding='unicode'))
def test_process_idname(self):
    from mobilize.components import CssPath, XPath
    src_html = '''<div>
<nav>
<a href="/A">A</a>
<a href="/B">B</a>
</nav>
'''
    # Check that default_idname is required if self.idname is not defined.
    c1 = CssPath('nav')
    c1.extract(html.fromstring(src_html))
    with self.assertRaises(AssertionError):
        c1.process()
    # Check the idname argument.
    c2 = CssPath('nav', idname='foo')
    c2.extract(html.fromstring(src_html))
    c2.process()  # no AssertionError on this line means the test passes
    # Check that default_idname suppresses the error.
    c3 = CssPath('nav')
    c3.extract(html.fromstring(src_html))
    c3.process('foo')  # no AssertionError on this line means the test passes
def get_info_from_filmup(film_url):
    page = requests.get(film_url)
    tree = html.fromstring(page.content)

    # Fetch the infos as a list.
    info = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/table/tr/td/font/node()')
    info = [get_text(i) for i in info]
    plot = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/font/text()')
    image = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/table/form/tr/td/a[@class="filmup"]/@href')
    plot[1] = parse_apostrophe(plot[1])
    plot[1] = substitue_accents(plot[1])

    # Fetch the large image URL.
    image_page = requests.get('http://filmup.leonardo.it/' + image[0])
    tree = html.fromstring(image_page.content)
    image_big = tree.xpath('//div[@id="container"]/table/tr/td/div/div/img/@src')

    # Download the image into the local folder.
    s = r'/sc_(.[^\.]*)\.htm'
    img_title = "images/" + re.findall(s, film_url)[0] + ".jpg"
    urlretrieve('http://filmup.leonardo.it' + image_big[0], img_title)

    # Resize the image.
    image = Image.open(img_title)
    image_small = resize_image(image, height=330)
    image_small.save("images/" + re.findall(s, film_url)[0] + "_small.jpg")
    image_fullsize = resize_image(image, height=600)
    image_fullsize.save("images/" + re.findall(s, film_url)[0] + ".jpg")

    res = merge_infos(info, plot)
    return res
def test_innerhtml(self):
    from mobilize.components import XPath
    html_str = '''<table><tr><td>Hello</td></tr></table>'''
    # Test for innerhtml=False.
    component_f = XPath('//td', idname='foo', innerhtml=False)
    component_f.extract(html.fromstring(html_str))
    extracted = component_f.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo"><td>Hello</td></div>'
    e = normxml(expected)
    a = normxml(extracted_str)
    self.assertSequenceEqual(e, a)
    # Test for innerhtml=True.
    component_t = XPath('//td', idname='foo', innerhtml=True)
    component_t.extract(html.fromstring(html_str))
    extracted = component_t.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo">Hello</div>'
    self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
    # Test for ineffectiveness of innerhtml=True with multiple matching elements.
    component_t = XPath('//td', idname='foo', innerhtml=True)
    component_t.extract(html.fromstring('''
<table><tr>
<td>Hello</td>
<td>Goodbye</td>
</tr></table>
'''))
    extracted = component_t.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'
    self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
def pyld_36kr():
    """<a style="color:#000000;" target="_blank" href="http://36kr.com/" title="36氪是一个关注互联网创业的科技博客,旨在帮助互联网创业者实现创业梦。我们相信每个人都可以像来氪星人超人那样强大无比。还行吧,有质有量还有料">36kr-首页</a>"""
    starttime = time.time()
    my_title = pyld_36kr.__doc__
    title_clean = re.sub("<.*?>", "", my_title)
    column = 6
    iscover = 1
    try:
        r = requests.get("http://36kr.com/")
        xpath1 = fromstring(r.text).xpath
        items = xpath1("//article")
        newurl = "http://36kr.com" + xpath1('//a[@id="info_flows_next_link"]/@href')[0]
        r = requests.get(newurl)
        items = items + fromstring(r.text).xpath("//article")
        items = [i for i in items if i.xpath("./div/div/span/time/@datetime")]
        urls = ["http://36kr.com" + i.xpath("./a/@href")[0] for i in items]
        covers = [i.xpath("./a/@data-lazyload")[0] for i in items]
        titles = [i.xpath("./div/a/text()")[0] for i in items]
        sums = [i.xpath('./div/div[@class="brief"]/text()')[0] for i in items]
        ptime = ['<div align="right"><br>%s</div>'
                 % re.sub(r" \+\d\d\d\d$", "", i.xpath("./div/div/span/time/@datetime")[0])
                 for i in items]
        sums = ["<br>".join(i) for i in list(zip(sums, ptime))]
        aa = [i for i in list(zip(covers, titles, urls, sums))
              if thisday.strftime("%Y-%m-%d") in i[3]]
    except Exception as e:
        print("%s %s" % (title_clean, e))
        aa = [["error"] * 4]
        iscover = 0
    runtime1 = round(time.time() - starttime, 3)
    print(title_clean, "finished in %s seconds" % runtime1)
    return [my_title, aa, column, iscover]
def open(self, url=None):
    try:
        if url is not None:
            if url in self.storedlinks:
                return -1
            else:
                if ("http://" in url) or ("https://" in url):
                    self.current = ulib.urlopen(url)
                    self.source = self.current.read()
                    self.storedlinks.add(url)
                    self.soup = html.fromstring(self.source)
                    return 1
                else:
                    return None
        else:
            self.current = ulib.urlopen(self.root)
            self.source = "".join(self.current.readlines())
            self.soup = html.fromstring(self.source)
            return 1
    except ValueError as ve:
        print(ve)
        return None
    except ulib.HTTPError as ht:
        print(ht)
        return None
    except ulib.URLError as u:
        print("Could not connect to given URL")
        return None
def main():
    session_requests = requests.session()

    # Get the login CSRF token.
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

    # Create the payload.
    payload = {
        "username": username,
        "password": password,
        "csrfmiddlewaretoken": authenticity_token
    }

    # Perform the login.
    result = session_requests.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))

    # Scrape the URL.
    result = session_requests.get(URL, headers=dict(referer=URL))
    tree = html.fromstring(result.content)
    bucket_elems = tree.findall(".//a[@class='execute']")
    bucket_names = [bucket_elem.text_content().replace("\n", "").strip()
                    for bucket_elem in bucket_elems]
    print(bucket_names)
    }
    print('Turning page ###################', begin)
    query_fakeid_response = requests.get(appmsg_url, cookies=cookies,
                                         headers=header, params=query_id_data)
    fakeid_list = query_fakeid_response.json().get('app_msg_list')
    try:
        for item in fakeid_list:
            msg_link = item.get('link')
            msg_title = item.get('title')
            # Only keep articles updated after 2018-03-01 CST (epoch 1519833600).
            if int(item.get('update_time')) > int('1519833600'):
                print(msg_link + 'continue')
                get_msg_response = requests.get(msg_link)
                tree = html.fromstring(get_msg_response.text)
                try:
                    # if tree.xpath('//*[@id="meta_content"]/em[2]/text()')[0] == kevin:
                    print(msg_title, end='\n')
                    # Process the article text.
                    msg_content = tree.xpath('//*[@id="js_content"]//text()')
                    with open('sentence.txt', 'a', encoding='utf-8') as f:
                        f.write('\n\n====================================================================\n\n'
                                + msg_title + '\n\n')
                        for text in msg_content:
                            f.write(text + '\n')
                        f.write("\n\n====================================================================\n\n")
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches to a item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for bas64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': src_url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    return results
def parseProduct(asin, amazon_url, retrying_time):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    url = amazon_url + "/dp/" + asin
    try:
        # verify=False could be added here to avoid SSL-related issues.
        response = requests.get(url, headers=headers)
        doc = html.fromstring(response.content)

        XPATH_NAME = '//h1[@id="title"]//text()'
        XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
        XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
        XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        XPATH_RATING = '//span[@id="acrPopover"]'
        XPATH_REVIEWS_NUMBER = '//span[@id="acrCustomerReviewText"]//text()'

        raw_name = doc.xpath(XPATH_NAME)
        raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
        raw_category = doc.xpath(XPATH_CATEGORY)
        raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
        raw_availability = doc.xpath(XPATH_AVAILABILITY)
        raw_rating_elem = doc.xpath(XPATH_RATING)
        raw_product_rating = []
        if raw_rating_elem != []:
            for elem in raw_rating_elem:
                raw_product_rating = elem.attrib['title']
        raw_number_of_review = doc.xpath(XPATH_REVIEWS_NUMBER)

        name = ' '.join(''.join(raw_name).split()) if raw_name else None
        sale_price = ' '.join(''.join(raw_sale_price).split()).strip() if raw_sale_price else None
        category = ' > '.join([i.strip() for i in raw_category]) if raw_category else None
        original_price = ''.join(raw_original_price).strip() if raw_original_price else None
        availability = ''.join(raw_availability).strip() if raw_availability else None
        # The rating and review-count strings come from the Italian storefront.
        rating = ''.join(raw_product_rating).replace(' su 5 stelle', '') if raw_product_rating else None
        reviews_number = ''.join(raw_number_of_review).replace(' recensioni clienti', '') if raw_number_of_review else None

        if not original_price:
            original_price = sale_price

        # Retry once in case of a captcha page (only the first time).
        if not name:
            if retrying_time:
                raise ValueError('captcha')
            return parseProduct(asin, amazon_url, True)

        data = {
            'name': name,
            'salePrice': sale_price,
            'category': category,
            'originalPrice': original_price,
            'availability': availability,
            'url': url,
            'date': datetime.datetime.now().strftime("%d-%m-%Y %H:%M"),
            'numberOfReviews': reviews_number,
            'productRating': rating
        }
        return data
    except Exception:
        print("Error scraping product info")
# date: 2019.05.09
# author: Bartłomiej 'furas' Burek
# https://stackoverflow.com/questions/56059703/how-can-i-make-lxml-save-two-pages-to-the-pages-so-it-can-be-read-by-the-tree

from lxml import html
import requests

data = {
    'BTC': 'id-bitcoin',
    'TRX': 'id-tron',
    # ...
    'HC': 'id-hypercash',
    'XZC': 'id-zcoin',
}

all_results = {}

for url in ('https://coinmarketcap.com/', 'https://coinmarketcap.com/2'):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    print(tree.cssselect('body'))

    for key, val in data.items():
        result = tree.xpath('//*[@id="' + val + '"]/td[4]/a/text()')
        print(key, result)
        if result:
            all_results[key] = result[0]

    print('---')

print(all_results)
from lxml import html, etree
from urllib.request import urlopen

pagina = urlopen("https://www.pythonparatodos.com.br/formulario.html")
tree = html.fromstring(pagina.read())

tr = tree.xpath('//tr[2]')
print(tr)
for elemento in tr:
    print(etree.tostring(elemento))
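# A follow-up sketch on XPath semantics: positions are 1-based, and '//tr[2]'
# selects the second <tr> within each parent table, not the second <tr> of the
# whole document. Parenthesizing the step changes that:
segunda_linha = tree.xpath('(//tr)[2]')  # at most one element, document-wide
print(segunda_linha)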
def parseResponse(response, amazon_url, rev_index):
    p_resp = response
    reviews = p_resp.split('&&&')

    XPATH_ID = './/div[@data-hook="review"]'
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_TITLE = './/a[@data-hook="review-title"]//text()'
    XPATH_AUTHOR = './/a[@data-hook="review-author"]//text()'
    XPATH_AUTHOR_PROFILE = './/a[@data-hook="review-author"]'
    XPATH_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_VERIFIED_PURCHASE = './/span[@data-hook="avp-badge"]//text()'
    XPATH_BODY = './/span[@data-hook="review-body"]//text()'
    XPATH_HELPFUL = './/span[@data-hook="helpful-vote-statement"]//text()'

    reviews_list = {}

    # The first 3 fields of the array can be discarded since they are useless.
    for i in range(3, len(reviews)):
        rev_fields = reviews[i].split('\",\"')
        if len(rev_fields) == 3:
            # The 3rd field (rev_fields[2]) contains the HTML of the review.
            review = rev_fields[2].replace("\\\"", "\"")
            is_amazon_vine = "Recensione Vine " in review
            parser = html.fromstring(review)

            # DATA COMPUTATION
            # If no 'id' attribute is found, this chunk is not the HTML of a
            # review (the last 3 fields contain no information), so stop here.
            review_id = ""
            for elem in parser.xpath(XPATH_ID):
                review_id = elem.attrib['id']
            if review_id == "":
                break

            raw_review_rating = parser.xpath(XPATH_RATING)
            raw_review_header = parser.xpath(XPATH_TITLE)
            raw_review_author = parser.xpath(XPATH_AUTHOR)
            # Retrieve the link to the review author's profile.
            for elem in parser.xpath(XPATH_AUTHOR_PROFILE):
                raw_review_author_profile = elem.attrib['href']
            raw_review_posted_date = parser.xpath(XPATH_POSTED_DATE)
            raw_review_verified_purchase = parser.xpath(XPATH_VERIFIED_PURCHASE)
            raw_review_body = parser.xpath(XPATH_BODY)
            raw_review_helpful_vote = parser.xpath(XPATH_HELPFUL)

            # DATA COMPOSITION
            author_name = ' '.join(' '.join(raw_review_author).split()) if raw_review_author else ""
            author_profile = amazon_url + ''.join(raw_review_author_profile) if raw_review_author_profile else ""
            author_code = (author_profile.split("account.")[1]).split("/")[0]
            review_rating = ''.join(raw_review_rating).replace(' su 5 stelle', '') if raw_review_rating else ""
            review_header = ' '.join(' '.join(raw_review_header).split()) if raw_review_header else ""
            review_text = ' '.join(' '.join(raw_review_body).split()) if raw_review_body else ""
            review_verified_purchase = bool(raw_review_verified_purchase)
            review_posted_date = parseDate(raw_review_posted_date) if raw_review_posted_date else None
            try:
                review_helpful_vote = int([
                    x for x in ''.join(raw_review_helpful_vote).split(' ') if x != ''
                ][1]) if raw_review_helpful_vote else 0
            except Exception:
                review_helpful_vote = 1

            author_id, author_rank = ac.getCustomerId(author_code, amazon_url)
            if author_id is not None:
                author_helpful_votes, author_reviews_count = ac.getHelpfulVotesAndTotalReviewsCount(author_id, amazon_url)
            else:
                author_helpful_votes, author_reviews_count = -1, -1
            latest_author_reviews = ac.getLatestCustomerReviewAndTextAnalisys(author_code, amazon_url)

            # Set a sleeping time between requests.
            if execute_sleep:
                # sleep(random.uniform(2, 4))
                sleep(2)

            review_summary = {
                'reviewId': review_id,
                'reviewLink': amazon_url + "/gp/customer-reviews/" + review_id,
                'reviewText': review_text,
                'reviewPostedDate': review_posted_date,
                'reviewHeader': review_header,
                'reviewRating': review_rating,
                'reviewAuthor': {
                    'name': author_name,
                    'profileLink': author_profile,
                    'code': author_code,
                    'id': author_id,
                    'rank': author_rank,
                    'helpfulVotes': author_helpful_votes,
                    'totalReviewsCount': author_reviews_count,
                    'latestReviews': latest_author_reviews
                },
                'reviewVerifiedPurchase': review_verified_purchase,
                'reviewHelpfulVote': review_helpful_vote,
                'isAmazonVineReviewer': is_amazon_vine
            }
            reviews_list[rev_index] = review_summary
            rev_index += 1

    return reviews_list
def parse_html(self, root, first_page=False):
    if random() > 0.8:
        if len(root.xpath("//div[@class='controls']/a/text()")):
            self.display.exit(self.display.api_error(" "))

    book_content = root.xpath("//div[@id='sbo-rt-content']")
    if not len(book_content):
        self.display.exit(
            "Parser: book content's corrupted or not present: %s (%s)" %
            (self.filename, self.chapter_title)
        )

    page_css = ""
    stylesheet_links = root.xpath("//link[@rel='stylesheet']")
    if len(stylesheet_links):
        stylesheet_count = 0
        for s in stylesheet_links:
            css_url = urljoin("https:", s.attrib["href"]) if s.attrib["href"][:2] == "//" \
                else urljoin(self.base_url, s.attrib["href"])
            if css_url not in self.css:
                self.css.append(css_url)
                self.display.log("Crawler: found a new CSS at %s" % css_url)
            page_css += "<link href=\"Styles/Style{0:0>2}.css\" " \
                        "rel=\"stylesheet\" type=\"text/css\" />\n".format(stylesheet_count)
            stylesheet_count += 1

    stylesheets = root.xpath("//style")
    if len(stylesheets):
        for css in stylesheets:
            if "data-template" in css.attrib and len(css.attrib["data-template"]):
                css.text = css.attrib["data-template"]
                del css.attrib["data-template"]
            try:
                page_css += html.tostring(css, method="xml", encoding='unicode') + "\n"
            except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
                self.display.error(parsing_error)
                self.display.exit(
                    "Parser: error trying to parse one CSS found in this page: %s (%s)" %
                    (self.filename, self.chapter_title)
                )

    # TODO: add all not-covered tags for the `link_replace` function
    svg_image_tags = root.xpath("//image")
    if len(svg_image_tags):
        for img in svg_image_tags:
            image_attr_href = [x for x in img.attrib.keys() if "href" in x]
            if len(image_attr_href):
                svg_url = img.attrib.get(image_attr_href[0])
                svg_root = img.getparent().getparent()
                new_img = svg_root.makeelement("img")
                new_img.attrib.update({"src": svg_url})
                svg_root.remove(img.getparent())
                svg_root.append(new_img)

    book_content = book_content[0]
    book_content.rewrite_links(self.link_replace)

    xhtml = None
    try:
        if first_page:
            is_cover = self.get_cover(book_content)
            if is_cover is not None:
                page_css = "<style>" \
                           "body{display:table;position:absolute;margin:0!important;height:100%;width:100%;}" \
                           "#Cover{display:table-cell;vertical-align:middle;text-align:center;}" \
                           "img{height:90vh;margin-left:auto;margin-right:auto;}" \
                           "</style>"
                cover_html = html.fromstring("<div id=\"Cover\"></div>")
                cover_div = cover_html.xpath("//div")[0]
                cover_img = cover_div.makeelement("img")
                cover_img.attrib.update({"src": is_cover.attrib["src"]})
                cover_div.append(cover_img)
                book_content = cover_html
                self.cover = is_cover.attrib["src"]

        xhtml = html.tostring(book_content, method="xml", encoding='unicode')
    except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
        self.display.error(parsing_error)
        self.display.exit(
            "Parser: error trying to parse HTML of this page: %s (%s)" %
            (self.filename, self.chapter_title)
        )

    return page_css, xhtml
SCOPES = 'https://www.googleapis.com/auth/calendar'
store = file.Storage('token.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets(os.environ['CREDENTIALS_PATH'], SCOPES)
    creds = tools.run_flow(flow, store)
service = build('calendar', 'v3', http=creds.authorize(Http()))

# Call the Calendar API.
now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time

data = open('example.html', 'rb').read().decode("utf-8")
tree = html.fromstring(data)
table = tree.xpath('/html/body/table/tr[6]/td/table')[0]
for row in table.getchildren()[1:]:
    children = row.getchildren()
    if len(children) > 1:
        time_str = children[3].getchildren()[0].text.strip()
        date_str = children[4].getchildren()[0].text.strip()
        start, end = time_str.split('-')
        start = start.strip()
        end = end.strip()
        month, day, year = date_str.split('.')
        # Build ISO-8601 timestamps (year-month-day).
        start_date = f'{year}-{month}-{day}T{start}:00+03:00'
        end_date = f'{year}-{month}-{day}T{end}:00+03:00'
        print(f'Meeting from {start_date} to {end_date}')
        event = {
if file == "": file = "TestCSV.csv" try: with open(file, newline='') as csvfile: puts(colored.yellow("lese Datei: " + file)) logoreader = csv.DictReader(csvfile) for row in logoreader: r = requests.post( "http://www.vereinswappen.de/vereine.php?option=vereinssuchen", data={ 'verein': row['verein'], 'suchen': 'Suchen' }) tree = html.fromstring(r.content) #print(r.content) vereinswappen = tree.xpath( '//img[@style="max-width: 100px; width: 100px;"]/@src') i = 1 for x in vereinswappen: if not os.path.exists('Wappen'): os.makedirs('Wappen') filename = "Wappen/" + row['verein'] + str(i) + ".png" urllib.request.urlretrieve(x, filename) i += 1 #print(vereinswappen) puts(colored.yellow('Lade Wappen ' + row['verein'] + '...')) print( emoji.emojize('Success! :white_check_mark:', use_aliases=True))
import requests
from lxml import html


def tree_from_url(url: str, decode: str = None):
    content = requests.get(url).content
    if decode:
        content = content.decode(decode)
    return html.fromstring(content)
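# A short usage sketch (the URL is a placeholder): pass a codec name when the
# site's declared encoding is unreliable; otherwise let lxml work on raw bytes.
tree = tree_from_url('https://example.com/', decode='utf-8')
print(tree.findtext('.//title'))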
def fetch_rfc(number, force=False):
    url = 'https://tools.ietf.org/html/rfc%d' % number
    output_dir = 'data/%04d' % (number // 1000 % 10 * 1000)
    output_file = '%s/rfc%d.json' % (output_dir, number)

    # If the output file already exists, stop here (unless --force is given).
    if not force and os.path.isfile(output_file):
        return 0

    # Create the output directory.
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the DOM tree of the RFC page. Note that requests.get()'s second
    # positional argument is `params`, so headers must be passed by keyword.
    headers = {'User-agent': '', 'referer': url}
    page = requests.get(url, headers=headers)
    tree = html.fromstring(cleanhtml(page.content))

    # Get the title.
    title = tree.xpath('//title/text()')
    if len(title) == 0:
        raise RFCNotFound
    title = title[0]

    # Check whether the page exists.
    content_h1 = tree.xpath('//div[@class="content"]/h1/text()')
    if len(content_h1) >= 1 and content_h1[0].startswith('Not found:'):
        raise RFCNotFound

    # Extract the text from the DOM tree.
    contents = tree.xpath(
        '//pre/text() | '    # body text
        '//pre/a/text() | '  # links inside the body
        # section titles
        '//pre/span[@class="h1" or @class="h2" or @class="h3" or '
        '@class="h4" or @class="h5" or @class="h6"]//text() |'
        '//pre/span/a[@class="selflink"]/text() |'  # section numbers
        '//a[@class="invisible"]'                   # page breaks
    )

    # Handle paragraphs that span a page break.
    contents_len = len(contents)
    for i, content in enumerate(contents):
        # At a page break:
        if (isinstance(content, html.HtmlElement)
                and content.get('class') == 'invisible'):
            contents[i - 1] = contents[i - 1].rstrip()  # strip trailing whitespace on the previous page
            contents[i + 0] = ''  # remove the page break itself

            if i + 1 >= contents_len:
                continue
            contents[i + 1] = ''  # remove the extra newline
            if i + 2 >= contents_len:
                continue
            contents[i + 2] = ''  # remove the extra whitespace
            if i + 3 >= contents_len:
                continue
            if not isinstance(contents[i + 3], str):
                continue
            contents[i + 3] = contents[i + 3].lstrip('\n')  # strip the leading newline on the next page

            # Decide whether a paragraph continues across the page break.
            first, last = 0, -1
            prev_last_line = contents[i - 1].split('\n')[last]    # last line of the previous page
            next_first_line = contents[i + 3].split('\n')[first]  # first line of the next page
            indent1 = get_indent(prev_last_line)
            indent2 = get_indent(next_first_line)

            # A paragraph is considered to span the page break when:
            # 1) the indentation of the last paragraph of the previous page
            #    equals that of the first paragraph of the next page, and
            # 2) the last line of the previous page does not end a sentence
            #    with '.' or ';'.
            if (not prev_last_line.endswith('.')
                    and not prev_last_line.endswith(';')
                    and re.match(r'^ *[a-zA-Z0-9(]', next_first_line)
                    and indent1 == indent2):
                # The content spans pages: insert BREAK, which is later
                # replaced by a space in prose and by a newline in code.
                contents[i + 3] = BREAK + contents[i + 3]
            else:
                # The content does not span pages: insert a paragraph
                # separator (two newlines).
                contents[i + 0] = '\n\n'

    # Hide the page numbers.
    contents[-1] = re.sub(r'.*\[Page \d+\]$', '', contents[-1].rstrip()).rstrip()

    # Join all paragraphs (paragraphs are separated by \n\n).
    text = ''.join(contents).strip()
    paragraphs = Paragraphs(text)

    # Convert the paragraph information to JSON.
    obj = {
        'title': {'text': title},
        'number': number,
        'created_at': str(datetime.now(JST)),
        'updated_by': '',
        'contents': [],
    }
    for paragraph in paragraphs:
        obj['contents'].append({
            'indent': paragraph.indent,
            'text': paragraph.text,
        })
        if paragraph.is_section_title:
            obj['contents'][-1]['section_title'] = True
        if paragraph.is_code:
            obj['contents'][-1]['raw'] = True
        if paragraph.is_toc:
            obj['contents'][-1]['toc'] = True

    with open(output_file, 'w') as json_file:
        json.dump(obj, json_file, indent=2, ensure_ascii=False)
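# A short usage sketch (the RFC number is arbitrary): the function writes
# data/<bucket>/rfcNNNN.json and raises RFCNotFound for unknown numbers.
try:
    fetch_rfc(2324)
except RFCNotFound:
    print('no such RFC')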
def __init__(self, args):
    self.args = args
    self.display = Display("info_%s.log" % escape(args.bookid))
    self.display.intro()

    self.cookies = {}
    self.jwt = {}

    if not args.cred:
        if not os.path.isfile(COOKIES_FILE):
            self.display.exit("Login: unable to find cookies file.\n"
                              "    Please use the --cred option to perform the login.")
        self.cookies = json.load(open(COOKIES_FILE))
    else:
        self.display.info("Logging into Safari Books Online...", state=True)
        self.do_login(*args.cred)
        if not args.no_cookies:
            json.dump(self.cookies, open(COOKIES_FILE, "w"))

    self.book_id = args.bookid
    self.api_url = self.API_TEMPLATE.format(self.book_id)

    self.display.info("Retrieving book info...")
    self.book_info = self.get_book_info()
    self.display.book_info(self.book_info)

    self.display.info("Retrieving book chapters...")
    self.book_chapters = self.get_book_chapters()

    self.chapters_queue = self.book_chapters[:]

    if len(self.book_chapters) > sys.getrecursionlimit():
        sys.setrecursionlimit(len(self.book_chapters))

    self.book_title = self.book_info["title"]
    self.base_url = self.book_info["web_url"]

    self.clean_book_title = "".join(self.escape_dirname(self.book_title).split(",")[:2]) \
        + " ({0})".format(self.book_id)

    books_dir = os.path.join(PATH, "Books")
    if not os.path.isdir(books_dir):
        os.mkdir(books_dir)

    self.BOOK_PATH = os.path.join(books_dir, self.clean_book_title)
    self.css_path = ""
    self.images_path = ""
    self.create_dirs()
    self.display.info("Output directory:\n    %s" % self.BOOK_PATH)

    self.chapter_title = ""
    self.filename = ""
    self.css = []
    self.images = []

    self.display.info("Downloading book contents... (%s chapters)" % len(self.book_chapters), state=True)
    self.BASE_HTML = self.BASE_01_HTML + (self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML

    self.cover = False
    self.get()
    if not self.cover:
        self.cover = self.get_default_cover()
        cover_html = self.parse_html(
            html.fromstring("<div id=\"sbo-rt-content\"><img src=\"Images/{0}\"></div>".format(self.cover)),
            True
        )

        self.book_chapters = [{
            "filename": "default_cover.xhtml",
            "title": "Cover"
        }] + self.book_chapters

        self.filename = self.book_chapters[0]["filename"]
        self.save_page_html(cover_html)

    self.css_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
    self.display.info("Downloading book CSSs... (%s files)" % len(self.css), state=True)
    self.collect_css()
    self.images_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
    self.display.info("Downloading book images... (%s files)" % len(self.images), state=True)
    self.collect_images()

    self.display.info("Creating EPUB file...", state=True)
    self.create_epub()

    if not args.no_cookies:
        json.dump(self.cookies, open(COOKIES_FILE, "w"))

    self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
    self.display.unregister()

    if not self.display.in_error and not args.log:
        os.remove(self.display.log_file)

    sys.exit(0)
import requests
from lxml import html


def handler(event, context):
    url = "https://twitter.com/realDonaldTrump"
    response = requests.request("GET", url)
    tree = html.fromstring(response.content)
    vecTweets = tree.xpath('//div[@class="js-tweet-text-container"]//p')
    return vecTweets[0].text_content()
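# A minimal local test for the Lambda handler above; the empty event dict and
# None context stand in for what AWS would pass in.
if __name__ == '__main__':
    print(handler({}, None))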
import requests
from lxml import html


def tree_from_html(url: str):
    page = requests.get(url)
    return html.fromstring(page.content)