def parse(self):
    self.has_metadata = bool(self.metadata["schemata"])
    self.metadata_count = len(self.metadata["schemata"])
    self.visited = bool(self.html)
    for k, v in parse_schemata(self.__dict__).items():
        setattr(self, k, v)
    try:
        tree = parse_html(self.html, self.base_url)

        def find_one(selector):
            try:
                return format_text(tree.xpath(selector)[0].text_content())
            except Exception:
                return ""

        if not self.headline:
            self.headline = find_one("//h1")
        if not self.articlebody:
            self.articlebody = "\n".join([
                format_text(node.text_content()) for node in tree.xpath("//p")
            ])
        print(self.articlebody)
    except Exception as e:
        print(e)
        # self.html = ""
    return self.__dict__

async def extract_schema_objects(self, responses):
    """Iterate through a collection of HTTP response objects, extract any
    embedded JSON objects from the DOM (possibly an empty list), load those
    data structures into memory, and append them to the response."""
    for response in responses:
        html = response.html
        tree = parse_html(html)
        schemata = tree.xpath("//script[contains(@type, 'json')]/text()")
        jsonized = []
        errors = []
        for schema in schemata:
            try:
                jsonized.append(json.loads(schema))
            except Exception as e:
                serialized = [f"{e.__class__.__name__} :: {e}", schema]
                errors.append(serialized)
        response.metadata = {"schemata": jsonized, "errors": errors}
        response.has_metadata = bool(jsonized)
        response.metadata_count = len(jsonized)
    return responses

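# A minimal usage sketch for extract_schema_objects() above; it is not part of
# the original code. The owning class ("Extractor") and the lightweight stub
# response object are assumptions made purely for illustration.
import asyncio


class _StubResponse:
    """Stand-in for whatever HTTP response object the extractor receives."""

    def __init__(self, html):
        self.html = html


async def _demo_extract(extractor):
    responses = [_StubResponse(
        '<html><head><script type="application/ld+json">'
        '{"@type": "NewsArticle", "headline": "Example"}'
        '</script></head></html>')]
    for response in await extractor.extract_schema_objects(responses):
        print(response.has_metadata, response.metadata_count)

# asyncio.run(_demo_extract(Extractor()))  # "Extractor" is a hypothetical owning class
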
def get_url_title(url):
    r"""
    Request HTML for the page at the URL indicated and return its <title>
    property.

    >>> get_url_title('mozilla.com').strip()
    'Internet for people, not profit\n — Mozilla'
    """
    parsed_url = try_parse_url(url)
    if parsed_url is None:
        return None
    try:
        r = requests.get(parsed_url.geturl(), stream=False,
                         allow_redirects=True, timeout=5)
        tree = parse_html(r.content)
        title = tree.findtext('.//title')
        return title
    except ConnectionError:
        logging.error(
            'Unable to connect to internet to retrieve URL {}'.format(
                parsed_url.geturl()))
        logging.error(format_exc())
    except (InvalidURL, InvalidSchema, InvalidHeader, MissingSchema):
        logging.warning('Unable to retrieve URL {}'.format(parsed_url.geturl()))
        logging.error(format_exc())

def get_sensor_status():
    """ Parses PDU status HTML and returns sensor readings. """
    url = '/sensors.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    temp1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[4]/font/b/font/b')
    temp2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[4]/font/b/font/b')
    hum1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[5]/font/b/font/b')
    hum2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[5]/font/b/font/b')
    hum1 = hum1.replace(' %', '')
    hum2 = hum2.replace(' %', '')
    temp1 = temp1.replace(' Deg. F', '')
    temp2 = temp2.replace(' Deg. F', '')
    res = [{'id': id1, 'label': lab1, 'temp': temp1, 'hum': hum1},
           {'id': id2, 'label': lab2, 'temp': temp2, 'hum': hum2}]
    return res

def summary(response):
    logger.info('Processing summary from %s' % response.url)
    html = parse_html(response.content.replace(b' ', b''))
    html.make_links_absolute(response.url)

    titles = html.xpath('.//strong/a/text()')
    if len(titles) == 1:
        title = str(titles[0])
    else:
        title = ''
        logger.warning('Found no title in %s' % response.url)

    bodies = html.xpath('.//div[@class="da_black"]')
    if len(bodies) == 1:
        body = bodies[0].text_content()
    else:
        body = ''
        logger.warning('Found no body in %s' % response.url)

    def xpath(query):
        xs = html.xpath(query)
        if len(xs) == 1:
            return xs[0]
        else:
            logger.warning('Found %d results for "%s", skipping' % (len(xs), query))
            return ''

    record = {
        'article_id': subparsers.article_id(response.url),
        # 'url': response.url,
        'post_date': subparsers.date(xpath('.//em[contains(text(), "Posted:")]/text()')),
        'expiration_date': subparsers.date(xpath('.//em[contains(text(), "Expiration date:")]/text()')),
        # 'title': title,
        'applicant': subparsers.applicant(body),
        'linear_feet': subparsers.linear_feet(body),
        'county': subparsers.county(body),
        'body': subparsers._strip_html_ws(body),
        'attachments': subparsers.attachments(html),
        'hydrologic_unit_codes': subparsers.hucs(body),
        'coastal_use_permits': subparsers.cups(body),
        'water_quality_certifications': subparsers.wqcs(body),
    }
    record.update(subparsers.permit_manager(body))

    maybe_pan = da_number(title)
    if maybe_pan:
        record.update(maybe_pan)

    a = subparsers.applicant(body)
    if a:
        record['applicant'] = a

    fallbacks = subparsers.soup(body)
    record['longitude'] = record['latitude'] = None
    for k in fallbacks:
        if not record[k]:
            record[k] = fallbacks[k]
    return record

def scrape_row(session, row):
    profile_url = urljoin(base_url, row.xpath('.//a/@href')[0])
    constituency, island, group = (None,) * 3
    if 'Parliamentary Secretaries' not in row.xpath('string(//title)'):
        constituency, island, group = ((*i.xpath('./text()'), '')[0].strip()
                                       for i in row.xpath('./td[position() > 1]'))
    name, = row.xpath('.//a/text()')
    last, first = (i.strip() for i in ft.reduce(lambda s, r: s.replace(r, ''),
                                                honorifics, name).split(','))
    session.visit(profile_url)
    html = parse_html(session.html)
    image, = html.xpath('//img[@class = "alignLeft sidePicture"]/@src')
    image = urljoin(base_url, image)
    return (first + ' ' + last,
            last + ', ' + first,
            last,
            first,
            extract_birth_date(html.xpath('//div[text() = "Biography"]'
                                          '/following-sibling::p')),
            image,
            group and group.strip('()'),
            constituency,
            island,
            profile_url)

def run(self):
    with open(self.outfile, "w+", encoding="utf-8") as file:
        for link in self.links:
            try:
                content = parse_html(self.get_article(link)).find(
                    ".//div[@id='MainContent']")
                breadcrumbs = content.findall(
                    ".//div[@id='BreadCrumb']/div/a")
                if len(breadcrumbs) == 0:
                    breadcrumbs = content.findall(
                        ".//div[@class='ThebreadCrumbContainer']//a")
                categories = [
                    a.text.strip() for a in breadcrumbs[:self.categories]
                ]
                title = content.find(
                    ".//div[@class='Details_MainTitle']").text.strip()
                body = content.find(".//div[@id='detailedBody']")
                if body is None:
                    body = content.find(
                        ".//div[@class='DetailsArticleSummary']")
                body = self.whitespaces.sub(" ", body.text_content()).strip()
                file.write(
                    "\t".join([link[37:], *categories, title, body]) + "\n")
                print("Added Article:", title)
            except Exception as error:
                print("In link", link, "Error", error)

def load_youtube(ytid):
    tmp_url = YT_URL + ytid
    tmp_title = parse_html(urlopen(tmp_url)).find('.//title').text
    tmp_submitter_ip = request.remote_addr
    tmp_submitter = gethostbyaddr(tmp_submitter_ip)[0]
    if tmp_title == 'YouTube':
        return 'Invalid youtube id {}'.format(ytid)
    if tmp_submitter_ip not in song_queues:
        q = Queue()
        song_queues[tmp_submitter_ip] = q
    else:
        q = song_queues[tmp_submitter_ip]
    tmp_record = RecordType(title=tmp_title, url=tmp_url,
                            submitter_host=tmp_submitter,
                            submitter_ip=tmp_submitter_ip)
    q.put(tmp_record)
    elements = q.qsize()
    dump_queue()
    return '{}<br/>{}<br/>Queue size:{}'.format(tmp_title, tmp_submitter, elements)

def gather_people(session):
    session.visit(base_url)
    while True:
        yield from iter(parse_html(session.html)
                        .xpath('//table[@class = "detailTable detailTable_full"]/tbody/tr'))
        next_page = session.find_by_xpath('//a[@title = "Link to next page"]')
        if not next_page:
            break
        next_page.click()

def main():
    global base_url, page_num_of_article, page_start, page_step, page_encoding, page_sleep, count
    count = 0
    last_page = scraperwiki.sqlite.get_var('last_page', -1)
    latest_article = None
    start_over = False
    if last_page == -1:
        last_page = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', None)
        start_over = True
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0'), ('Referer', base_url)]
    urllib2.install_opener(opener)
    error_count = 0
    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)
        try:
            html = scraperwiki.scrape(page_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'):
                print e.reason
            elif hasattr(e, 'code'):
                print e.code
            error_count += 1
            if error_count < 3:
                continue
            else:
                break
        try:
            html = html.decode(page_encoding)
        except UnicodeDecodeError:
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()
        num_of_article = scrape(parse_html(html), latest_article, start_over)
        page = last_page / page_step
        if page_start == 0:
            page += 1
        scraperwiki.sqlite.save_var('last_page', last_page)
        print 'Page', page, ',', num_of_article, 'article(s)'
        last_page += page_step
        if not page_exists(html, last_page):
            break
        time.sleep(page_sleep)

def get_links_for_date(date):
    """Retrieve the comic image links for a given date."""
    comic_url = f"http://www.girlgeniusonline.com/comic.php?date={date}"
    try:
        resp = urlopen(comic_url)
        if resp.status == 200:
            doc = parse_html(resp.read())
            return doc.xpath('//img[@src and @alt="Comic"]/@src')
    except (TimeoutError, URLError):
        print(f"Error getting comic for {date}")

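# Hedged usage sketch for get_links_for_date() above, not part of the original
# snippet. The "YYYYMMDD" date value and the output filenames are assumptions
# based on the comic.php?date=... URL scheme used in the function.
from urllib.request import urlretrieve


def download_comic_for_date(date="20071119"):
    # get_links_for_date() returns None on network errors, hence the `or []`.
    for i, src in enumerate(get_links_for_date(date) or []):
        urlretrieve(src, f"girl_genius_{date}_{i}.jpg")
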
def main():
    with urlopen(Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})) as r, \
            sqlite3.connect('data.sqlite') as c:
        c.execute('''\
CREATE TABLE IF NOT EXISTS data
(name, sort_name, family_name, given_name, gender, term, area,
 UNIQUE (name, term, area))''')
        c.executemany('''\
INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?)''',
                      gather_people(parse_html(r.read().decode())))

def get_extensions(self, with_plain_descriptions=True):
    extensions = json.loads(
        self.make_request('/extensions/').read()
    )['extensions']
    if with_plain_descriptions:
        for extension in extensions:
            extension['description'] = fix_whitespace(parse_html(
                extension['description']
            ).text_content()).strip()
    return extensions

def collect_rows(session):
    for list_url in list_urls:
        session.visit(urljoin(base_url, list_url))
        while True:
            yield from iter(parse_html(session.html)
                            .xpath('//table[@class = "detailTable detailTable_full"]/tbody/tr'))
            next_page = session.find_by_xpath('//a[@title = "Link to next page"]')
            if not next_page:
                break
            next_page.click()

def scrape_courses(response, from_list_of=False):
    from lxml.html import fromstring as parse_html

    # Determine scraped attribute and location of course id depending on
    # whether the sent data is from the list_of() or courses() functions
    attr, start, end = ("href", 54, -7) if from_list_of else ("onclick", 87, -24)
    # Return dictionary of courses' Blackboard ids mapped to courses' myUDC ids
    return {
        link.text[:7]: link.attrib[attr][start:end]
        for link in parse_html(response).xpath("//a")
    }

def map_error_response(solr_response):
    if 'response' in solr_response and solr_response['response'].code >= 400:
        real_response = solr_response['response']
        document = parse_html(real_response.body)
        title = tostring(document.xpath('//title').pop(), method='text')
        reason = title.strip()
        body_element = document.xpath('//body').pop()
        raw_body = tostring(body_element, method='text').strip()
        original_message = re.sub(r'(\s+)|(Powered.*$)', ' ', raw_body).strip()
        return {'reason': reason,
                'original_message': original_message,
                'response': real_response}
    else:
        return solr_response

def get_metadata_from_schol_html(path):
    if (path / 'results.json').exists():
        return
    doc = parse_html(str(path / 'scholarly.html'))
    metadata = {}
    metadata['title'] = {'value': [all_whitespace_to_space(
        get_text_from_selector(doc, '.title-group')[0])]}
    metadata['doi'] = {'value': [all_whitespace_to_space(
        get_text_from_selector(doc, '.doi')[0]).replace('doi: ', '')]}
    metadata['date'] = {'value': [all_whitespace_to_space(
        get_text_from_selector(doc, '.pub-date-epub')[0]).replace('epub: ', '')]}
    metadata['journal'] = {'value': [all_whitespace_to_space(
        get_text_from_selector(doc, '.journal-title')[0])]}
    with open(str(path / 'results.json'), 'w') as f:
        json.dump(metadata, f, indent=4)

def get_all_uses_of_citation(fname_or_etree, doi="", title="", n_sentences=0):
    # print("Looking for %s in %s" % (doi, fname_or_etree))
    if type(fname_or_etree) is not lxml.etree._ElementTree:
        html = parse_html(fname_or_etree)
    else:
        html = fname_or_etree

    # Try locating the reference div by DOI first
    div = None
    if doi != "":
        doi_element = get_doi_element(html, doi)
        if doi_element is None:
            div = None
        else:
            div = doi_element.getparent()

    # If we couldn't find the DOI then try the title
    if div is None and title != "":
        title_element = get_title_element(html, title)
        if title_element is None:
            div = None
        else:
            div = title_element.getparent()

    if div is None:
        return

    # print(all_whitespace_to_space(div.text_content()))
    li = div.getparent()
    ref_id = li.find('a').attrib['name']
    # print(ref_id)
    sel = CSSSelector('a[href="#%s"]' % ref_id)
    res = sel(html)
    # print(res)
    text = [get_sentence(r, n_around=n_sentences) for r in res]
    if len(text) == 0:
        # It is in the list of references, but we can't find the citation.
        # This is probably because it was something like reference number 4
        # and was cited as [2-5], so we return some text that explains the error.
        text = [
            'ERROR: In reference list, but cannot find citation. Check manually.'
        ]
    return text

def scrape_rows(session, rows):
    for row in rows:
        profile_link = urllib.parse.urljoin(base_url, row.xpath('.//a/@href')[0])
        constituency, island, group = ([*i.xpath('./text()'), ''][0].strip()
                                       for i in row.xpath('./td[position() > 1]'))
        name, = row.xpath('.//a/text()')
        last, first = (i.strip() for i in ft.reduce(
            lambda s, r: s.replace(r, ''), honorifics, name).split(','))
        session.visit(profile_link)
        image, = parse_html(session.html)\
            .xpath('//img[@class = "alignLeft sidePicture"]/@src')
        image = urllib.parse.urljoin(base_url, image)
        yield (first + ' ' + last,
               last + ', ' + first,
               last,
               first,
               image,
               group.strip('()'),
               constituency,
               island,
               profile_link)

def extract_birth_date(text):
    try:
        text = next(p.text_content() for p in text if 'born' in p.text_content())
        # We're arbitrarily limiting it to ten tokens after 'born' so that
        # we don't accidentally pick up dates other than birth dates
        text = ' '.join(text[text.find('born'):].split()[:10])
    except StopIteration:
        return
    with urlopen('http://nlp.stanford.edu:8080/sutime/process',
                 data=urlencode({'q': text, 'rules': 'english'}).encode()) as r:
        date, = parse_html(r.read())\
            .xpath('//h3[text() = "Temporal Expressions"]'
                   '/following-sibling::table[1]//tr[2]/td[2]/text()') or (None,)
    if not date:
        print('Unable to extract birth date from {!r}'.format(text),
              file=sys.stderr)
    return date

def get_outlet_status():
    """ Parses PDU status HTML and returns outlet statuses. """
    url = '/outctrl.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[3]/font/b')
    stat1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[5]/font')
    stat2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[5]/font')
    return [{'id': id1, 'label': lab1, 'status': stat1},
            {'id': id2, 'label': lab2, 'status': stat2}]

def __init__(self, url, html, row, soup=None, lxml=None, fix_encoding_errors=True):
    self.url = url
    self.sitemap_data = row
    self.html = (fix_text_segment(html.replace("\xa0", " "), uncurl_quotes=False)
                 if fix_encoding_errors else html)
    try:
        self.soup = soup if soup else BeautifulSoup(self.html)
    except Exception as e:
        raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
    self.meta = Haystack(html)
    # print(json.dumps(self.meta, indent=4))
    try:
        if isinstance(self.html, str):
            self.html = self.html.encode("utf-8")
        self.lxml = lxml if lxml else parse_html(self.html)
    except Exception as e:
        raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
    self.data = {
        "content": self.content,
        "url": self.url,
        "title": self.title,
        "published_at": self.published_at,
        "description": self.summary,
        "author": self.author,
        "image_url": self.image_url,
        "section": self.section,
        "publisher": self.publisher,
        "keywords": self.keywords,
        "metadata": {k: v for k, v in self.meta.data.items()},
    }
    self.data.update(
        {k: row[k] for k in self.passthrough_attrs if row and k in row})

def extract_examples(docs_html_filepaths, output_directory):
    for docs_html_filepath in docs_html_filepaths:
        with open(docs_html_filepath, 'r') as docs_html_file:
            dom = parse_html(docs_html_file)
        log('Extracting from', docs_html_filepath)
        examples = dom.xpath('//*[@data-example-id]')
        for example in examples:
            example_id = example.get('data-example-id')
            if not example_id:
                continue
            example_filename = example_id + '.html'
            example_filepath = joinpath(output_directory, example_filename)
            example_html = EXAMPLE_TEMPLATE.format(example_html=etree.tostring(example))
            with open(example_filepath, 'w') as example_html_file:
                example_html_file.write(example_html)
            log('Wrote', example_filepath)

def main():
    with urlopen(base_url) as r:
        src = r.read().decode('windows-1253')
    now = dt.datetime.now().isoformat()

    con = sqlite3.connect('data.sqlite')
    with con:
        con.execute('''\
CREATE TABLE IF NOT EXISTS first_reading_archive
(src, time_last_scraped, UNIQUE (src))''')
        con.execute('''\
INSERT OR REPLACE INTO first_reading_archive VALUES (?, ?)''', (src, now))
    with con:
        con.execute('''\
CREATE TABLE IF NOT EXISTS first_reading
(number, title, sponsors, committees, date_tabled, time_last_scraped,
 UNIQUE (number, title, date_tabled))''')
        con.executemany('''\
INSERT OR REPLACE INTO first_reading VALUES (?, ?, ?, ?, ?, ?)''',
                        gather_docs(parse_html(src), now))

def main():
    global page_num_of_article, page_start, page_step, page_encoding, count
    count = 0
    last_page = scraperwiki.sqlite.get_var('last_page', page_start)
    latest_article = ''
    if last_page == -1:
        last_page = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', '')
    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)
        html = scraperwiki.scrape(page_url).decode(page_encoding)
        num_of_article = scrape(parse_html(html), latest_article)
        scraperwiki.sqlite.save_var('last_page', last_page)
        last_page += page_step
        if not page_exists(html, last_page):
            break
    scraperwiki.sqlite.save_var('last_page', -1)
    print '%d article(s) have been scraped.' % count

def summary(response):
    html = parse_html(response.content.replace(b' ', b''))
    html.make_links_absolute(response.url)

    titles = html.xpath('//strong/a/text()')
    if len(titles) == 1:
        title = str(titles[0])
    else:
        title = ''
        logger.warning('Found no title in %s' % response.url)

    bodies = html.xpath('//div[@class="da_black"]')
    if len(bodies) == 1:
        body = bodies[0].text_content()
    else:
        body = ''
        logger.warning('Found no body in %s' % response.url)

    def xpath(query):
        xs = html.xpath(query)
        if len(xs) == 1:
            return xs[0]
        else:
            logger.warning('Found %d results for "%s", skipping' % (len(xs), query))
            return ''

    record = {
        'article_id': subparsers.article_id(response.url),
        # 'url': response.url,
        'post_date': subparsers.date(xpath('//em[contains(text(), "Posted:")]/text()')),
        'expiration_date': subparsers.date(xpath('//em[contains(text(), "Expiration date:")]/text()')),
        # 'title': title,
        'body': body.strip('\r\n '),
        'attachments': subparsers.attachments(html),
        'hydrologic_unit_codes': subparsers.hucs(body),
        'coastal_use_permits': subparsers.cups(body),
        'water_quality_certifications': subparsers.wqcs(body),
    }

    maybe_pan = da_number(title)
    if maybe_pan:
        record.update(maybe_pan)
        applicant, location, character, leftover = subparsers.body(html, url=response.url)
        record.update({
            'applicant': applicant.strip('\r\n '),
            'location': location.strip('\r\n '),
            'character': character.strip('\r\n '),
        })
    else:
        record.update({
            'applicant': '',
            'location': '',
            'character': '',
        })

    fallbacks = pdf.parse(body)
    record['longitude'] = record['latitude'] = None
    for k in fallbacks:
        if not record[k]:
            record[k] = fallbacks[k]
    return record

def get(self, path):
    """Return an lxml document parsed from the page at path."""
    rv = self.client.get(path, follow_redirects=True)
    return parse_html(rv.data)

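# Hedged usage sketch for the get() test helper above; the route and the
# assertion are placeholders, not taken from the original test suite.
def test_index_has_heading(self):
    doc = self.get('/')
    assert doc.xpath('//h1/text()'), 'expected at least one <h1> on the index page'
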
def parse(html_text, today=None):
    html_tree = parse_html(html_text)
    return [
        parse_post(post_el, today=today)
        for post_el in html_tree.cssselect('.sprofile-post')
    ]

from lxml.html import parse as parse_html
from lxml.html import tostring as tostring_html
from urlparse import urlparse
import sys

url = sys.argv[1]
url_obj = urlparse(url)
base_url = url_obj.scheme + '://' + url_obj.hostname + '/' + ('/'.join(url_obj.path.split('/')[:-1]))
target_filename = url_obj.path.split('/')[-2] + '.html'
print 'Base URL:', base_url
print 'TOC URL:', url

data = {}
toc_doc = parse_html(fetch_html(url))
title_el = toc_doc.xpath('//div[@id="ct_title"]/h1')[0]
data['title'] = title_el.text
data['author'] = title_el.getchildren() and title_el.getchildren()[0].text or 'Anonymous'
chapters = []
data['chapters'] = chapters
for el in toc_doc.xpath('//div[@id="catalog_list"]/ul/li/a'):
    ch_url = el.attrib.get('href')
    if ch_url.startswith('http://vip'):
        continue
    print 'Fetching', ch_url
    ch_data = {}

def get_sections(curriculum_code):
    r = requests.get(BASE_URL.format(curriculum_code))
    r.raise_for_status()
    tree = parse_html(BytesIO(r.content))
    return list(map(build_section, tree.xpath(TABLES_XPATH)[RELEVANT_SECTIONS]))

async def detalhes(linha):
    linha_dash = linha if '-' in linha else linha + '-0'

    # Look the line up by its number
    pag_query = await fetch_url(
        f'http://www.emdec.com.br/ABusInf/consultarlinha.asp?linha={linha_dash}&consulta=1')
    for line in pag_query.splitlines():
        pag_query_regex = '\\s*document.JnInformacoes.action = "detalhelinha.asp\\?(.*)";'
        match = re.match(pag_query_regex, line)
        if match:
            url_detalhes = f'http://www.emdec.com.br/ABusInf/detalhelinha.asp?{match.group(1)}'
            break
    pag_detalhes = parse_html(await fetch_url(url_detalhes))
    pag_map = await fetch_url(
        f'http://www.emdec.com.br/ABusInf/{pag_detalhes.cssselect("#mapFrame")[0].get("src")}')
    map_data = parseMap(pag_map)
    processed_map = await process_map(map_data)

    def schedules(dom):
        ret = {}
        for group in dom.xpath('div'):
            title_node = group.xpath('p')[-1]
            name = {
                u'Horários Sábado': 'saturday',
                u'Horários Sábado (Referência)': 'saturday',
                u'Horários Domingo': 'sunday',
                u'Horários Domingo (Referência)': 'sunday',
                u'Horários Útil': 'weekday',
                u'Horários Útil (Referência)': 'weekday',
            }[strip(title_node.text)]
            trips = []
            ret[name] = {
                'trips': trips,
                'vehicles': int(re.search('\\d+', title_node.tail).group())
            }
            # print(strip(group.xpath('p')[-1].tail))
            for cell in group.xpath('div/table/tr/td'):
                trips.append({
                    'time': strip(cell.xpath('table/tr/td')[0].text),
                    'wheelchair_accessible': bool(cell.xpath('table/tr/td/img'))
                })
        return ret

    def stops(dom):
        return [strip(td.text) for td in dom.cssselect('div > table > tr > td')]

    def trecho(dom, map_data):
        details = {}
        for tr in dom.xpath('div/table/tr'):
            details[strip(tr.cssselect('td')[0].text)[:-1]] = \
                tr.cssselect('td input')[0].get('value')
        main_panels = dom.xpath('table/tr/td')
        ret = {}
        ret["details"] = details
        # ret["end_location"] = geocode(details["Letreiro"])
        ret["schedules"] = schedules(main_panels[0])
        # ret["stops"] = stops(main_panels[1])
        ret["map"] = map_data
        return ret

    trechos = [
        trecho(div, processed_map[map_index])
        for (div, map_index) in zip(pag_detalhes.cssselect('#tabs > div'), [1, 0])
    ]
    trechos = [trecho for trecho in trechos
               if trecho['map']['shape'] and trecho['details']['Letreiro'] != 'ESPECIAL']

    route_long_name = fix_route_name(
        linha, get_text(pag_detalhes, 'txtPesquisa').split(' - ', 1)[-1])

    ret = {}
    ret["route_short_name"] = linha
    ret["route_long_name"] = route_long_name
    ret["company"] = get_text(pag_detalhes, 'txtEmpresa')
    ret["comments"] = get_text(pag_detalhes, 'txtObservacao')
    ret["updated"] = strip(
        pag_detalhes.cssselect('#conteudo font[size="1"]')[0].text.split('\n')[1])
    ret["route_url"] = 'http://www.portalinterbuss.com.br/campinas/linhas/%s' % linha
    # http://www.portalinterbuss.com.br/campinas/layout-da-frota
    ret["route_color"] = {
        '1': '1985E9',  # light blue
        '2': 'e91919',  # red
        '3': '0D2447',  # green
        '4': '0D2447',  # dark blue
        '5': '9D9D9D',  # white with figures
    }[linha[0]]
    ret["route_text_color"] = {
        '1': 'C6E4FF',  # light blue
        '2': 'FFDDDD',  # red
        '3': 'E8FFDC',  # green
        '4': 'CFE2FF',  # dark blue
        '5': 'EFEFEF',  # white with figures
    }[linha[0]]
    ret["directions"] = trechos
    print(f"Fetched details from {linha}")
    return ret
