def interscience(url):
    '''downloads the PDF from sciencedirect given a link to an article'''
    url = str(url)
    buffer = StringIO()
    curl = pycurl.Curl()
    curl.setopt(curl.URL, url)
    curl.setopt(curl.WRITEFUNCTION, buffer.write)
    curl.setopt(curl.VERBOSE, 0)
    curl.setopt(curl.USERAGENT, user_agent)
    curl.setopt(curl.TIMEOUT, 20)
    curl.perform()
    curl.close()

    buffer = buffer.getvalue().strip()
    html = lxml.html.parse(StringIO(buffer))
    image = html.findall("//img[@name='pdf']")[0]
    link = image.getparent()
    pdf_href = link.attrib["href"]

    # now let's get the article title
    title_div = html.findall("//div[@class='articleTitle']")[0]
    paper_title = title_div.text
    paper_title = paper_title.replace("\n", "")
    if paper_title[-1] == " ":
        paper_title = paper_title[:-1]
    # re.sub returns the sanitized string; the original discarded the result
    paper_title = re.sub(r'[^a-zA-Z0-9_\-.() ]+', '', paper_title)

    # now fetch the document for the user
    os.system("wget --user-agent=\"pyscholar/blah\" --output-document=\"%s.pdf\" \"%s\""
              % (paper_title, pdf_href))
    print "\n\n"

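# A minimal driver for interscience(); hedged sketch -- the URL below is a
# placeholder, and the module-level names the function relies on (pycurl,
# lxml.html, re, os, StringIO and the user_agent string) are assumed to be
# defined as in the original scraper.
def _demo_interscience():
    interscience("http://www.sciencedirect.com/science/article/pii/EXAMPLE")
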
def get_realtime_title():
    """Get ALL Category and Source Realtime news from chinatimes

    realtime url may change or become invalid when it is not *realtime*

    return: dict{category, source, time, title, url}
    """
    response, content = h.request(news_list_url)
    html = lxml.html.fromstring(content.decode('big5', 'ignore'))
    html.make_links_absolute(base_url)
    # Get news-list section
    div = html.findall("*div")[1]
    # Get all title-info to list
    tr = list(div.iterdescendants("tr"))[1:]
    result_list = []
    for title_info in tr:
        news_url = list(title_info.iterlinks())[0][2]
        info_list = map(lambda x: x.text_content(), list(title_info))
        info_dict = {"title": info_list[0].strip("\r\n "),
                     "time": info_list[1],
                     "category": info_list[2],
                     "source": info_list[3],
                     "url": news_url}
        result_list.append(info_dict)
    return result_list

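# Note on the '*div' path above (the same idiom recurs in the other
# realtime scrapers below): ElementPath tokenizes '*div' as '*' followed by
# 'div', which appears to behave like '*/div' -- div elements that are
# grandchildren of the current node. A tiny self-contained check of that
# assumption:
def _demo_star_path():
    import lxml.html
    root = lxml.html.fromstring('<html><body><div id="b"></div></body></html>')
    print([e.get('id') for e in root.findall('*div')])  # expected: ['b']
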
def get_semester_course_data(self, url, semester):
    """inputs: url (str) to call, semester (str) that the url finds data for"""
    print(f"Obtaining and indexing information for {semester}")
    html = self.call_url_and_get_html_object(url)
    tables = html.findall(
        ".//table[@class='courseListing basicTable courseListingSetWidths']")
    # Parse html to get course offering data
    for table in tables:
        fields = table.findall(".//td")
        spans = table.findall(".//span")
        course_number = str(spans[1].text.strip())
        title = str(fields[4].text).strip()
        professor = str(fields[6].text).strip()
        status = str(fields[0].text)
        crn = str(fields[1].text)
        # Add course offering data to dictionary of course classes
        if course_number not in self.course_dict:
            # If course doesn't already exist in dictionary keys, instantiate a Course for it
            self.course_dict[course_number] = Course(title=title,
                                                     semester=semester,
                                                     professor=professor,
                                                     crn=crn,
                                                     status=status)
        else:
            self.course_dict[course_number].add_instance_of_course(
                semester, professor, crn, status)

def compass(answers=None):
    answers = answers or def_answers.copy()
    questions = {}
    post_args = {}
    while post_args is not None:
        # Post previous responses, get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)

        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError("Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)
            # Assemble responses
            post_args = {'answer_' + str(key): answers[key]
                         for key in curr_questions}
            # Print responses
            for num in sorted(curr_questions):
                print(str(num) + ":\t" + curr_questions[num] + "\n\t" +
                      values[int(answers[num])] + '\n')
            submit_tag = html.find(".//input[@type='submit']")
            post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'
        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        #     f.write(html_text)

    h2 = html.find(".//h2")
    print(h2.text_content())
    lines = h2.text_content().split('\n')
    x = float(lines[0].split(":")[1])
    y = float(lines[1].split(":")[1])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions

def get_realtime_title(pages=5):
    """
    Get ALL Category Realtime news from libertytimes

    realtime url may change or become invalid when it is not **realtime**

    get_realtime_title(pages=5)

    *pages*: get page 1 to pages, default is 5 pages

    return: dict{time, title, url}
    """
    result_list = []
    for page in xrange(1, pages + 1):
        response, content = h.request("%s&ipage=%d" % (news_list_url, page))
        html = lxml.html.fromstring(content.decode("utf-8", "ignore"))
        html.make_links_absolute(base_url)
        # Get news-list section
        div = html.findall("*div")[0]
        # Get all title-info to list
        tr = list(div.iterdescendants("tr"))[1:-1]
        for title_info in tr:
            news_url = list(title_info.iterlinks())[1][2]
            info_list = map(lambda x: x.text_content(), list(title_info))
            try:
                info_dict = {"title": info_list[0].strip("\r\n "),
                             "time": info_list[1],
                             "url": news_url}
            except IndexError, error_information:
                # skip malformed rows instead of appending a stale dict
                continue
            result_list.append(info_dict)
    return result_list

def debate_of_term1(id):
    """Parse a debate transcript in term 1 format and return list of its
    paragraphs' text content."""
    # download the debate transcript or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.html' % id)
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            content = f.read()
    else:
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        content = scrapeutils.download(url)
        if 'Unexpected error!' in content:
            raise RuntimeError("Debate with id '%s' does not exist" % id)

    # fix markup and parse to HTML tree
    content = content.replace('12. 9. 1995<o:p></o:p>', '12. septembra 1995')
    content = content.replace('<o:p></o:p>', '')
    html = lxml.html.fromstring(content)

    # extract paragraph texts, use blank line as paragraph separator
    result = []
    text = ''
    for par in html.findall('.//p'):
        line = scrapeutils.plaintext(par.text_content())
        if len(line) > 0 and not re.match(r'\w+ deň rokovania', line):
            text += '\n%s' % line
        else:
            if text:
                result.append(scrapeutils.clear_hyphens(text, '\n'))
            text = line
    result.append(scrapeutils.clear_hyphens(text, '\n'))
    return scrapeutils.plaintext(result)

def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    url = ('http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec'
           '&PoslanecID=%s&CisObdobia=%s' % (id, term))
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)

def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    # Jump from the search results to the product page
    further_url = ("http://www.rebuy.de/" +
                   html.find('.//a[@class="productConversion"]').attrib["href"])
    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)

    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    result["type"] = TYPE_TRANSLATE[
        html.xpath('.//p[contains(@class, "category-icon")]')[0].text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"]

    attribs = dict()
    for i in html.findall(".//ul[@id='main-info-facts']/li"):
        name, sep, val = i.text_content().strip().partition(":")
        attribs[name] = val
    result["created"] = defNone(attribs.get("Erscheinungsdatum"),
                                lambda x: toDBDate(x.strip(), "%d.%m.%Y"))
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None
    return result

def _parse_departures(self, html):
    ns = html.get('xmlns', '')
    rows = html.findall(
        './/*[@id="dnn_ctr1608_ModuleContent"]//{%(ns)s}table'
        '//{%(ns)s}table//{%(ns)s}tr' % {'ns': ns})[1:-1]
    return [Departure(self, row) for row in rows]

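# The '{%(ns)s}tag' interpolation above uses ElementPath's '{uri}tag'
# syntax so the same query works when tags carry an XML namespace. A
# minimal illustration with a made-up XHTML fragment, using the namespace
# URI directly instead of reading it from an attribute:
def _demo_ns_findall():
    import lxml.etree
    doc = lxml.etree.fromstring(
        '<html xmlns="http://www.w3.org/1999/xhtml">'
        '<body><table><tr><td>dep</td></tr></table></body></html>')
    ns = 'http://www.w3.org/1999/xhtml'
    print(doc.findall('.//{%(ns)s}tr' % {'ns': ns}))  # one namespaced <tr>
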
def clean_up_html(html, method='html'):
    html = autolink_html(html, link_regexes=_link_regexes)
    html = lxml.html.fromstring(cleaner.clean_html(html))
    for h1 in html.findall('h1'):
        h1.tag = 'h2'
    for a in html.cssselect('a'):
        a.attrib['target'] = '_blank'
    return lxml.html.tostring(html, encoding='utf-8', method=method)

def handle(self, **options):
    self.stdout.write('Loading file %s...' % options['html_file'])
    self.stdout.write('Save %s...' % options['save'])
    save = options['save']
    #save = True
    html = lxml.html.parse(options['html_file'])
    tables = html.findall(".//table")
    self.stdout.write('nb tables %s' % len(tables))
    for table in html.iterfind(".//table"):
        self.error_warning = 0
        # First child must be a caption
        self.caption = table[0]
        # Next row: parse brand name, url, and date
        row = self.caption.getnext()
        # compare strings with !=; the original used 'is not', which tests identity
        if row[0].get('colspan') != '5':
            self.stdout.write('\tNot a brand table')
            continue
        if row[1].get('colspan') != '5':
            self.stdout.write('\tNot a brand table')
            continue
        # Parse Brand
        self.parseBrand(row[1])
        brand_note = None
        new_company = None
        if self.brand_note != []:
            brand_note = Note(note='. '.join(self.brand_note))
            self.stdout.write("\tNote: %s" % brand_note.note)
            if save:
                brand_note.save()
        if save:
            new_company = Company(
                name=self.caption.text,
                validation_date=self.date,
                note=brand_note,
                certification=' / '.join(self.certification)
            )
            new_company.save()
            #new_company = Company.objects.get( name=self.caption.text )
        for name in self.brand_name.split('/'):
            if save:
                brand = Brand(name=name.strip(), company=new_company)
                brand.save()
        for url in self.url:
            url = url.strip('/')
            if save:
                new_site = Site(domain=url, company=new_company)
                new_site.save()
        # Next row must be for table header
        row = row.getnext()
        text = row.find("td").xpath("string()")
        if text != "Description":
            self.stdout.write(red + 'Header table first column %s is not Description' % text + reset)
        # Parse products
        self.parseProducts(row, new_company, options['save'])

def get_string_ids(self):
    """docstring for get_string_ids"""
    print "Check available items .."
    ids = []
    data = self.fetch_url(self.string_list_url)
    html = lxml.html.fromstring(data)
    for i in html.findall('li'):
        ids.append(i.get('id').replace('pstring_', ''))
    return ids

def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean=ean))

    # Check if something was found
    if "Ihre Suche ergab leider keine Treffer" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    result = defaultdict()
    transform = list()

    # Check media type
    result["type"] = html.find('.//span[@class="noCategory"]').text_content().strip()
    resolve_author = lambda: defNone(html.find('.//span[@class="oAuthorLinked"]'),
                                     lambda x: x.text_content())
    if result["type"].startswith("Buch"):
        result["type"] = "book"
        result["author"] = resolve_author()
        result["artists"] = None
    elif result["type"] == "Hörbuch":
        result["type"] = "audiobook"
        result["author"] = resolve_author()
        result["artists"] = None
    else:
        result["type"] = "movie"
        result["artists"] = [elm.text for elm in
                             html.findall('.//span[@class="oAuthorLinked"]/a')]
        result["author"] = None

    # Extract simple attributes from the head of the page
    result["title"] = html.find('.//span[@class="oProductTitle"]').text.strip()
    result["imgurl"] = html.find('.//img[@id="elevateZoom"]').attrib["src"]
    result["description"] = defNone(html.find('.//dd[@class="cTypeBeschreibung"]'),
                                    lambda x: x.text_content().strip())

    # Extract attributes of the dd/dt table next to the article picture
    attr_container = html.find('.//dl[@class="dlCols30_70"]')
    attr_list = dict()
    for elm in attr_container.getchildren():
        if elm.tag == "dt":
            curName = elm.text.strip()
        if elm.tag == "dd":
            attr_list[curName] = elm.text_content().strip()

    result["duration"] = defNone(attr_list.get("Spieldauer"),
                                 lambda x: int(x.replace("Minuten", "")))
    result["studio"] = attr_list.get("Studio")
    result["genre"] = attr_list.get("Genre")

    # Erscheinungsdatum is a German-language date, so parse it under a German locale
    import locale
    oldlocale = locale.getlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, "de_DE.utf8")
    result["created"] = defNone(attr_list.get("Erscheinungsdatum"),
                                lambda x: interpDate(x))
    locale.setlocale(locale.LC_TIME, oldlocale)
    return result

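# The locale round-trip at the end of resolve_ean() exists because the
# release date on the page is written with German month names. A
# self-contained hedged illustration of the same trick (assumes the
# de_DE.utf8 locale is installed, just as the function above does):
def _demo_german_date():
    import locale
    from datetime import datetime
    old = locale.setlocale(locale.LC_TIME)  # remember current LC_TIME
    locale.setlocale(locale.LC_TIME, 'de_DE.utf8')
    try:
        # %B now matches German month names like "Januar"
        return datetime.strptime('1. Januar 2014', '%d. %B %Y')
    finally:
        locale.setlocale(locale.LC_TIME, old)
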
def _parse_stations(self, html):
    options = html.findall(
        './/*select[@datevaluefield="StationName"]/option')
    stations = []
    for option in options[1:]:
        data = urlencode({'stationname': option.get('value')})
        name = '%s' % (option.get('value')).rsplit(' Stn', 1)[0]
        url = '%s?%s' % (self.url, data)
        stations += [Station(name, url)]
    return stations

def get_string(html_text):
    html = lxml.html.fromstring(html_text)
    remove_tags = ('.//style', './/script', './/noscript')
    for remove_tag in remove_tags:
        for tag in html.findall(remove_tag):
            tag.drop_tree()  # dropping here mutates the parsed tree in place

    codeframe_list = []
    lang_list = []
    # remove code frames
    for tag in html.findall(".//div[@class='code-frame']"):
        codeframe_list.append(tag.text_content())
        lang_list.append(tag.attrib["data-lang"])
        tag.drop_tree()

    atext_list = []
    ahref_list = []
    # remove href links
    for tag in html.cssselect('a'):
        if tag.text is not None:
            atext_list.append(tag.text)
        if tag.get('href') is not None:
            ahref_list.append(tag.get('href'))
        tag.drop_tree()

    code_list = []
    # remove inline code
    for cc in html.cssselect('code'):
        if cc.text is not None:
            code_list.append(cc.text)
        cc.drop_tree()

    text = html.text_content().strip('\n')
    return pd.Series(
        [
            "".join(text.split('\n')),
            ",".join(codeframe_list),
            ",".join(lang_list),
            ",".join(code_list),
            ",".join(atext_list),
            ",".join(ahref_list)
        ],
        index=['text', 'code-frame', 'lang', 'code', 'a-text', 'a-href'])

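# Hedged usage sketch for get_string(); the snippet below is made-up
# markup that mirrors the "code-frame" structure (Qiita-style, by the look
# of the selectors) this function targets. Assumes pandas is imported as
# pd at module level, as get_string() itself already requires:
def _demo_get_string():
    sample = ('<html><body><p>intro</p>'
              '<div class="code-frame" data-lang="python">print(1)</div>'
              '<a href="http://example.com">link</a></body></html>')
    series = get_string(sample)
    print(series['text'], series['lang'], series['a-href'])
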
def _parse_stations(self, html):
    ns = html.get('xmlns', '')
    options = html.findall(
        './/*[@id="EntryForm"]//{%(ns)s}select/{%(ns)s}option' % {'ns': ns})
    stations = []
    for option in options:
        data = urlencode({'stationname': option.get('value')})
        name = '%s' % (option.get('value')).rsplit(' Stn', 1)[0]
        url = '%s?%s' % (self.url, data)
        stations += [Station(name, url)]
    return stations

def parse_html(self, url):
    page = url.split('articles/')[-1]
    if self.base_path.joinpath(page).exists():
        html = lxml.html.parse(page)
        logging.info('HTML page `{}` exists, and parses.'.format(url))

        # Dateline is in the first p, unless that is an image, then it is in the third.
        dateline = html.find('.//{*}p')
        if dateline.text is None:
            dateline = html.findall('.//{*}p')[2]
        if 'BLACKSBURG, Va.' in dateline.text:
            self.spatial_coverage = 'Blacksburg, Va.'
        else:
            date_issued = self.date_issued.strftime(', %b')
            self.spatial_coverage = dateline.text.split(date_issued)[0].title()
            if (len(self.spatial_coverage) > 25
                    or '\n' in self.spatial_coverage
                    or ' ' == self.spatial_coverage):
                # Sanity check: These are symptoms of errors. Change them to Blacksburg.
                self.spatial_coverage = 'Blacksburg, Va.'
        logging.debug('Spatial Coverage: {}'.format(self.spatial_coverage))

        # Author is in the first li of the last ul, or the one before that, if it exists.
        html_lists = html.findall('.//{*}ul')
        author = html_lists[-1].find('./{*}li').text
        if author is None:
            try:
                author = html_lists[-2].find('./{*}li').text
            except IndexError as e:
                logging.error('No author found.')
        if author is not None:
            author = ' '.join(author.split())
        self.author = author
        logging.debug('Author: {}'.format(self.author))

        # Any img tag is a related file.
        for image in html.iterfind('.//{*}img'):
            self.image_urls.add(image.get('src'))
        if len(self.image_urls) > 0:
            logging.debug('All image urls: {}'.format(self.image_urls))
    else:
        logging.error('Url `{}` does not map to an HTML file in the archive.'.format(url))
        self.error_urls.add(url)

def _parse_stations(self, html):
    select = [
        select for select in
        html.findall('.//*div[@id="divTrainLineStationOption"]//select')
        if select.get('name').endswith('TrainStation')][0]
    stations = []
    for option in select.findall('option')[1:]:
        data = urlencode({'stationname': option.get('value')})
        name = '%s' % (option.get('value')).rsplit(' Stn', 1)[0]
        url = '%s?%s' % (self.url, data)
        stations += [Station(name, url)]
    return sorted(stations)

def compass():
    answers = def_answers.copy()
    questions = {}
    post_args = {}
    while post_args is not None:
        # Post previous responses, get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)

        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError("Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)
            # Assemble responses
            post_args = {'answer_' + str(key): answers[key]
                         for key in curr_questions}
            # Print responses
            for num in sorted(curr_questions):
                print(str(num) + ":\t" + curr_questions[num] + "\n\t" +
                      values[int(answers[num])] + '\n')
            submit_tag = html.find(".//input[@type='submit']")
            post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'
        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        #     f.write(html_text)

    h2 = html.find(".//h2")
    print(h2.text_content())
    lines = h2.text_content().split('\n')
    x = float(lines[0][-6:])
    y = float(lines[1][-6:])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions

def get_realtime_title(pages=5, encoding="UTF-8"): """ Get ALL Category Realtime news from appledaily realtime url may change or invaild when it is not **realtime** get_realtime_title(pages=5, encoding="UTF-8") *pages*: get page 1 to pages, default is 5 pages *encoding*: html text encoding return: dict{time, title, url} """ result_list = [] for page in xrange(1, pages + 1): response, content = h.request("%s/index/type/apple/page/%d" % (news_list_url, page)) html = lxml.html.fromstring(content.decode("utf-8", "ignore")) html.make_links_absolute(base_url) # Get news-list section div = html.findall("*div")[0] # Get all title-info to list li = list(div.iterdescendants("li"))[10:-29] for title_info in li: news_url = list(title_info.iterlinks())[0][2] info_list = map(lambda x: x.text_content().encode(encoding), list(title_info)) # info_list = info_list[0].strip("\r\n ").replace("\n", "") # time = info_list[:5] # category = info_list[5:11] # title = info_list[11:].strip("\r\n ") # title = title[: title.rfind("(") - 1] arr = [i.strip() for i in info_list[0].split("\n")] time = arr[1][:5] category = arr[1][5:] title = arr[2] try: info_dict = {"title": title, "time": time, "category": category, "url": news_url} except IndexError, error_infomation: pass result_list.append(info_dict)
def get_data_urls_and_terms(self, subject):
    """Return list of urls to obtain data from available previous semesters"""
    print("Finding terms with course information available")
    html = self.call_url_and_get_html_object(
        "https://my.gwu.edu/mod/pws/")  # Call home page url
    term_elements = html.findall(".//div[@class='tableHeaderFont']")  # Obtain list of available terms
    terms = [term.text.lower().strip() for term in term_elements]
    term_urls = []
    for term in terms:
        num_term = translate_term_to_numerical(term)
        self.terms.append(int(num_term))
        term_urls.append((self.get_url_from_term_string(num_term, subject), term))
    return term_urls

def read():
    import requests
    import lxml.html

    # the website URL to read
    url = "https://status.aws.amazon.com/"

    # fetch the page with Requests
    r = requests.get(url)

    # parse the page with lxml
    html = lxml.html.fromstring(r.text)

    # use lxml's findall to pull the headline titles out of the table cells
    elems = html.findall(".//td")
    for elem in elems:
        print(elem.text)

def grab_cloudflare(url):
    sess = requests.Session()
    sess.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0"}
    # evaluate the challenge arithmetic with no builtins available
    safe_eval = lambda s: eval(s, {"__builtins__": {}}) if "#" not in s and "__" not in s else ""
    page = sess.get(url).content
    if "a = $('#jschl_answer');" in page:
        # Cloudflare anti-bots is on
        html = lxml.html.fromstring(page)
        challenge = html.find(".//input[@name='jschl_vc']").attrib["value"]
        script = html.findall(".//script")[-1].text_content()
        domain = url.split("/")[2]
        math = re.search(r"a\.val\((\d.+?)\)", script).group(1)
        answer = str(safe_eval(math) + len(domain))
        data = {"act": "jschl", "jschl_vc": challenge, "jschl_answer": answer}
        return sess.post(url, data).content
    else:
        return page

def mp_list(term=None):
    """Parse list of MPs."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())
    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=%s' % term
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)
    result = {
        'url': url,
        '_items': [{
            'id': re.search(r'PoslanecID=(\d+)', mp.get('href')).group(1),
            'meno': mp.text,
        } for mp in html.findall('.//div[@class="mps_list"]//li/a')]
    }
    return scrapeutils.plaintext(result)

def get_queues(html):
    all_items = html.findall('.//div[@class="quote"]')
    for item in all_items:
        all_text = item.findall('./span[@class="text"]')
        for txt in all_text:
            print(txt.text_content())
        all_authors = item.findall('./span/small[@class="author"]')
        for txt in all_authors:
            print('author:', txt.text_content())
        all_urls = item.findall('./span/a')
        for txt in all_urls:
            print('url:', txt.text_content(), txt.attrib['href'])
        all_tags = item.findall('./div/a')
        for txt in all_tags:
            print('tag:', txt.text_content(), txt.attrib['href'])
        print('---')

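# Hedged driver for get_queues(); the selectors above match the markup of
# the http://quotes.toscrape.com/ scraping sandbox, so that is assumed to
# be the target here:
def _demo_get_queues():
    import requests
    import lxml.html
    page = requests.get('http://quotes.toscrape.com/')
    get_queues(lxml.html.fromstring(page.text))
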
def debate_of_terms234(id):
    """Parse a debate transcript in terms 2-4 format and return list of its
    paragraphs' text content."""
    # download RTF file or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.rtf' % id)
    if not os.path.exists(filename):
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        rtf = scrapeutils.download(url)
        filename = os.path.join(scrapeutils.WEBCACHE_PATH, 'debate_%s.rtf' % id)
        with open(filename, 'w') as f:
            f.write(rtf)

    # convert from RTF to HTML with unoconv (LibreOffice)
    content = subprocess.check_output(['unoconv', '-f', 'html', '--stdout', filename])
    html = lxml.html.fromstring(content)

    result = []
    for par in html.findall('./body/p'):
        result.append(par.text_content())
    return scrapeutils.plaintext(result)

def get_realtime_title(encoding="UTF-8"): """ Get ALL Category Realtime news from udn news realtime url may change or invaild when it is not **realtime** get_realtime_title(encoding="UTF-8") *encoding*: html text encoding return: dict{category, time, title, url} """ response, content = h.request(news_list_url) html = lxml.html.fromstring(content.decode('big5', 'ignore')) html.make_links_absolute(base_url) # Get news-list section table = html.findall("*table")[0] # Get all title-info to list tr = list(table.iterdescendants("tr"))[13: -3] result_list = [] for title_info in tr: news_url = list(title_info.iterlinks()) if not news_url or "gif" in news_url[0][2]: continue news_url = news_url[0][2] info_list = map(lambda x: x.text_content().encode(encoding), list(title_info)) try: info_dict = {"title": info_list[1].strip("\r\n "), "time": info_list[0], "category": info_list[2], "url": news_url} except IndexError, error_infomation: pass result_list.append(info_dict)
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)
    result = dict()

    title_elm = html.find(".//span[@itemprop='name']")
    # When the title is not found on the page, the product seems to be in
    # the unsorted section of geizhals...
    if title_elm is None:
        return None
    result["title"] = title_elm.text_content()
    result["genre"] = html.find(".//li[@class='ghnavhi']").text_content()

    description = html.find(".//div[@id='gh_proddesc']").text_content()
    result["firstrelease"] = defNone(re.search(r"Ersterscheinung: (\d+)", description),
                                     lambda x: x.group(1))

    for i in html.findall(".//a[@class='revlink']"):
        if "imdb" in i.attrib["href"]:
            result["imdb_link"] = i.attrib["href"]
            break
    return result

def grab_cloudflare(url, *args, **kwargs):
    sess = requests.Session()
    sess.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0"}
    # evaluate the challenge arithmetic with no builtins available
    safe_eval = lambda s: eval(s, {"__builtins__": {}}) if "#" not in s and "__" not in s else ""
    page = sess.get(url, *args, **kwargs)
    if "a = document.getElementById('jschl-answer');" in page.content:
        logger.info("Encountered CloudFlare anti-bot wall")
        # Cloudflare anti-bots is on
        html = lxml.html.fromstring(page.content)
        challenge = html.find(".//input[@name='jschl_vc']").attrib["value"]
        script = html.findall(".//script")[-1].text_content()
        domain_parts = url.split("/")
        domain = domain_parts[2]
        math = re.search(r"a\.value = (\d.+?);", script).group(1)
        answer = str(safe_eval(math) + len(domain))
        data = {"jschl_vc": challenge, "jschl_answer": answer}
        get_url = domain_parts[0] + '//' + domain + "/cdn-cgi/l/chk_jschl"
        return sess.get(get_url, params=data, headers={'referer': url}, *args, **kwargs)
    else:
        return page

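# Hedged usage sketch for the second grab_cloudflare() variant above,
# which returns a requests Response; the URL is a placeholder, and the
# module-level requests/lxml/re/logger names the function relies on are
# assumed:
def _demo_grab_cloudflare():
    response = grab_cloudflare("http://example.com/", timeout=20)
    print(response.status_code)
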
def old_debates_list(term):
    """Parse list of debates for the given term of office from NRSR Digital Library.
    Appropriate for older terms (1.-4.) where debates are not split by speaker."""
    if term not in ['1', '2', '3', '4']:
        raise ValueError("Old style transcripts are not available for term '%s'" % term)

    base_url = 'http://www.nrsr.sk/dl/Browser/Grid?nodeType=DocType&legId=13&chamberId=0' + \
        '&categoryId=1&committeeId=0&documentTypeId=5&folderId=0&meetingNr=' + \
        '&termNr=%s' % term
    result = {
        'url': base_url,
        '_items': []
    }
    page = 0
    while True:
        url = base_url + '&pageIndex=%s' % page
        content = scrapeutils.download(url)
        html = lxml.html.fromstring(content)

        # extract all debates from the current page
        for tr in html.findall('.//table[@class="resultTable"]//tr'):
            sequence_number = tr.findtext('td[1]/a')
            title = tr.find('td[2]/a')
            doc_id = re.search(r'documentId=(\d+)', title.get('href'))
            debate = {
                'časť': sequence_number,
                'názov': title.text,
                'url': 'http://www.nrsr.sk' + title.get('href'),
                'id': doc_id.group(1)
            }
            result['_items'].append(debate)

        page += 1
        pages = html.findtext('.//div[@class="pager"]/span[last()]')
        if page >= int(pages):
            break

    return scrapeutils.plaintext(result)

def session_list(term=None):
    """Parse list of sessions in one term of office of the parliament."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    url = 'http://www.nrsr.sk/web/default.aspx?sid=schodze/hlasovanie/schodze'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            '_sectionLayoutContainer$ctl01$_termsCombo': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': url,
        '_items': []
    }
    for li in html.findall('.//div[@id="_sectionLayoutContainer__panelContent"]//ul//li'):
        a = li.find('a')
        link = a.get('href')
        session = {
            'číslo': re.search(r'CisSchodze=(\d+)', link).group(1),
            'názov': a.text,
            'trvanie': re.search(r'\((.+?)\)', li.text_content()).group(1),
            'url': 'http://www.nrsr.sk/web/' + link,
        }
        result['_items'].append(session)
    return scrapeutils.plaintext(result)

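# The POST emulation above (and in group_list() and new_debates_list()
# below) hinges on echoing ASP.NET's __VIEWSTATE and __EVENTVALIDATION
# hidden fields back to the server. A small hedged helper that extracts
# just those fields (the input ids are the standard ASP.NET ones; the
# helper name is ours, not part of the original code):
def _aspnet_postback_fields(html):
    return {
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
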
def deputy_speakers():
    """Parse current deputy speakers (podpredsedovia) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=podpredsedovia'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = []
    for div in html.findall(".//div[@class='vicechairman_bigbox']"):
        name = div.find('.//a')
        link = name.get('href')
        id = re.search(r'PoslanecID=(\d+)', link)
        description = div.find(".//div[@class='vicechairman_description']")
        result.append({
            'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
            'meno': name.text,
            'url': 'http://www.nrsr.sk/web/' + link,
            'id': id.group(1),
            'kandidoval(a) za': description.find('div[1]/strong').tail,
            'narodený(á):': description.find('div[2]/strong').tail,
            'národnosť': description.find('div[3]/strong').tail,
        })
    return scrapeutils.plaintext(result)

def reap_questions(html):
    questions = {}
    for tag in html.findall(".//label[1]/input[@type='radio']"):
        num = int(tag.name.split('_')[-1])
        # "../../../td[1]" climbs from the radio input to its table row and
        # takes the first cell, which holds the question text (the original
        # "....../td[1]" spelling is equivalent in ElementPath)
        questions[num] = tag.find("../../../td[1]").text_content()
    return questions

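# A tiny self-contained illustration of the parent-axis trick used in
# reap_questions(); the markup is a made-up stand-in for the real quiz
# table (input nested in label, label in the row's second cell):
def _demo_parent_axis():
    import lxml.html
    row = lxml.html.fromstring(
        '<table><tr><td>Q1 text</td>'
        '<td><label><input type="radio" name="answer_1"/></label></td></tr></table>')
    tag = row.find(".//input[@type='radio']")
    print(tag.find("../../../td[1]").text_content())  # -> Q1 text
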
def format_body(self, html):
    body = html.findall('body')
    body_list = []
    footer = self.format_footer(body[-1].getchildren())
    for b in body[:-1]:
        body_list.append(etree.tostring(b).replace('\t', '').replace('\n', ''))
    html_body = '''
    <script type="text/javascript">
        var indexer = 0;
        var aryTest = %s;

        function nextData() {
            if (indexer < aryTest.length - 1) {
                indexer += 1;
                document.forms[0].prev.disabled = false;
                document.getElementById("openerp_data").innerHTML = aryTest[indexer];
                document.getElementById("counter").innerHTML = indexer + 1 + ' / ' + aryTest.length;
            } else {
                document.forms[0].next.disabled = true;
            }
        }

        function prevData() {
            if (indexer > 0) {
                indexer -= 1;
                document.forms[0].next.disabled = false;
                document.getElementById("openerp_data").innerHTML = aryTest[indexer];
                document.getElementById("counter").innerHTML = indexer + 1 + ' / ' + aryTest.length;
            } else {
                document.forms[0].prev.disabled = true;
            }
        }
    </script>
    </head>
    <body>
        <div id="openerp_data">%s</div>
        <div>%s</div>
        <br>
        <form>
            <table>
                <tr>
                    <td align="left">
                        <input name="prev" type="button" value="Previous" onclick="prevData();">
                    </td>
                    <td>
                        <div id="counter">%s / %s</div>
                    </td>
                    <td align="right">
                        <input name="next" type="button" value="Next" onclick="nextData();">
                    </td>
                </tr>
            </table>
        </form>
    </body></html>''' % (body_list, body_list[0], footer, '1', len(body_list))
    return html_body

def format_header(self, html):
    head = html.findall('head')
    header = ''
    for node in head:
        header += etree.tostring(node)
    return header

def _render(self, data, template_name=None):
    """Render output of view function to HTML.

    :param data: Data dictionary from view function
    :param template_name: Name of template file
    :return: Rendered HTML
    """
    nested = template_name is None
    template_name = template_name or self.template_name
    if nested and self.detect_render_nested:
        try:
            renderer = self.detect_renderer(None, template_name)
        except KeyError:
            renderer = self.renderer
    else:
        renderer = self.renderer

    # Catch errors and return appropriate debug divs
    # todo: add debug parameter
    try:
        rendered = renderer(self.template_dir, template_name, data)
    except IOError:
        return '<div>Template {} not found.</div>'.format(template_name)

    html = lxml.html.fragment_fromstring(rendered, create_parent='remove')
    for element in html.findall('.//*[@mod-meta]'):
        # Render nested template
        template_rendered, is_replace = self.render_element(element, data)
        original = lxml.html.tostring(element)
        if is_replace:
            replacement = template_rendered
        else:
            replacement = original
            replacement = replacement.replace('><', '>' + template_rendered + '<')
        rendered = rendered.replace(original, replacement)

    ## Parse HTML using html5lib; lxml is too strict and e.g. throws
    ## errors if missing parent container; htmlparser mangles whitespace
    ## and breaks replacement
    #parsed = BeautifulSoup(rendered, 'html5lib')
    #subtemplates = parsed.find_all(
    #    lambda tag: tag.has_attr('mod-meta')
    #)
    #
    #for element in subtemplates:
    #
    #    # Extract HTML of original element
    #    element_html = str(element)
    #
    #    # Render nested template
    #    template_rendered, is_replace = self.render_element(element, data)
    #
    #    # Build replacement
    #    if is_replace:
    #        replacement = template_rendered
    #    else:
    #        element.string = template_rendered
    #        replacement = str(element)
    #
    #    # Replace
    #    rendered = rendered.replace(element_html, replacement)

    return rendered

def _render(self, data, template_name=None):
    """Render output of view function to HTML.

    :param data: Data dictionary from view function
    :param template_name: Name of template file
    :return: Rendered HTML
    """
    nested = template_name is None
    template_name = template_name or self.template_name
    if nested and self.detect_render_nested:
        try:
            renderer = self.detect_renderer(None, template_name)
        except KeyError:
            renderer = self.renderer
    else:
        renderer = self.renderer

    # Catch errors and return appropriate debug divs
    # todo: add debug parameter
    try:
        # TODO: Seems like Jinja2 and handlebars renderers would not work with this call sig
        rendered = renderer(self.template_dir, template_name, data, trust=self.trust)
    except IOError:
        return '<div>Template {} not found.</div>'.format(template_name)

    html = lxml.html.fragment_fromstring(rendered, create_parent='remove')
    for element in html.findall('.//*[@mod-meta]'):
        # Render nested template
        template_rendered, is_replace = self.render_element(element, data)
        original = lxml.html.tostring(element)
        if is_replace:
            replacement = template_rendered
        else:
            replacement = original
            replacement = replacement.replace('><', '>' + template_rendered + '<')
        rendered = rendered.replace(original, replacement)

    ## Parse HTML using html5lib; lxml is too strict and e.g. throws
    ## errors if missing parent container; htmlparser mangles whitespace
    ## and breaks replacement
    #parsed = BeautifulSoup(rendered, 'html5lib')
    #subtemplates = parsed.find_all(
    #    lambda tag: tag.has_attr('mod-meta')
    #)
    #
    #for element in subtemplates:
    #
    #    # Extract HTML of original element
    #    element_html = str(element)
    #
    #    # Render nested template
    #    template_rendered, is_replace = self.render_element(element, data)
    #
    #    # Build replacement
    #    if is_replace:
    #        replacement = template_rendered
    #    else:
    #        element.string = template_rendered
    #        replacement = str(element)
    #
    #    # Replace
    #    rendered = rendered.replace(element_html, replacement)

    return rendered

def group_list(type, term=None):
    """Parse list of groups of a given type (committee, parliamentary group,
    delegation, friendship group)."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=77',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=69',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/delegacie/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/sp/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
    }
    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    content = scrapeutils.download(types[type]['url'])
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            types[type]['term_param_name']: term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(types[type]['url'], 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': types[type]['url'],
        '_items': []
    }
    for li in html.findall('.//ul[@class="longlist"]//li'):
        a = li.find('a')
        group = {
            'id': re.search(r'(ID|SkupinaId)=(\d+)', a.get('href')).group(2),
            'názov': a.text,
        }
        line = li.text_content()
        info = re.search(group['názov'] + r'\s*(\((.+?) - (.+?)\))?\s*(\S.*)?$',
                         line, re.DOTALL)
        if info:
            if info.group(2):
                group['od'] = info.group(2)
                group['do'] = info.group(3)
            if info.group(4):
                group['poznámka'] = info.group(4)
        result['_items'].append(group)
    return scrapeutils.plaintext(result)

def new_debates_list(term, since_date=None, until_date=None):
    """Parse list of debate parts for the given term of office from NRSR web.
    Appropriate for newer terms (since 5th) where split debates are available.
    If `since_date` or `until_date` is given in ISO format only the debate parts
    since/until that date are returned.
    """
    if term not in ['5', '6', '7']:
        raise ValueError("Parsed transcripts are not available for term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/rozprava'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # a POST request to emulate choice of term in second selectbox and pressing the button
    data = {
        '_sectionLayoutContainer$ctl01$_termNr': term,
        '_sectionLayoutContainer$ctl01$_search': 'Vyhľadať',
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    base_ext = '|new|%s' % term
    if since_date:
        data['_sectionLayoutContainer$ctl01$_dateFrom$dateInput'] = since_date + '-00-00-00'
        base_ext += '|s%s' % since_date
    if until_date:
        # the original used since_date here, a copy-paste bug
        data['_sectionLayoutContainer$ctl01$_dateTo$dateInput'] = until_date + '-00-00-00'
        base_ext += '|u%s' % until_date
    content = scrapeutils.download(url, 'POST', data, base_ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all debate parts from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            session_number = tr.find('td[1]')
            date = tr.find('td[2]')
            time_interval = tr.find('td[3]')
            time = re.search(r'(.*?) - (.*)', time_interval.text)
            part_type = time_interval.find('em')
            speaker = tr.find('td[4]')
            speaker_label = speaker.find('br').tail.strip('( ')
            debate_part = {
                'schôdza': session_number.text.replace('.', ''),
                'dátum': date.text,
                'trvanie': {'od': time.group(1), 'do': time.group(2)},
                'druh': part_type.text or '',
                'osoba': {'meno': speaker.findtext('strong'), 'funkcia': speaker_label}
            }
            speaker_link = speaker.find('a')
            if speaker_link is not None:
                speaker_url = speaker_link.get('href')
                id = re.search(r'PoslanecID=(\d+)', speaker_url)
                debate_part['osoba']['url'] = speaker_url
                debate_part['osoba']['id'] = id.group(1)
            for a in tr.findall('td[5]/a'):
                link = a.get('href')
                src = a.find('img').get('src')
                if 'speak' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['video'] = {'url': link, 'id': id.group(1)}
                elif 'all' in src:
                    debate_part['video_rokovania'] = {'url': link}
                elif 'rewrite' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['prepis'] = {'url': link, 'id': id.group(1)}
                else:
                    raise RuntimeError('Unrecognized link in section %s/%s/%s' %
                                       (session_number.text, date.text, time_interval.text))
            result['_items'].append(debate_part)

        # test if there is a link to next page
        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]//tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        page += 1
        # a POST request to emulate pager click
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_newDebate',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '_sectionLayoutContainer$ctl01$_termNr': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = base_ext + '|%s' % page
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)

def get_next_page(html):
    all_items = html.findall('.//li[@class="next"]/a')
    next_href = None
    for item in all_items:
        # the original printed and returned an undefined `txt`; use `item`
        print('url:', item.text_content(), item.attrib['href'])
        next_href = item.attrib['href']
    return next_href

def get_top_tags(html):
    all_items = html.findall('.//span[@class="tag-item"]/a')
    for item in all_items:
        print(item.text_content(), '->', item.attrib['href'])

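# Hedged crawl sketch tying get_next_page() and get_top_tags() together;
# as with get_queues() above, the selectors match quotes.toscrape.com, so
# that site is assumed as the target:
def _demo_crawl():
    import requests
    import lxml.html
    base = 'http://quotes.toscrape.com'
    html = lxml.html.fromstring(requests.get(base + '/').text)
    get_top_tags(html)
    next_href = get_next_page(html)
    if next_href:
        print('next page:', base + next_href)
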