def obtain(url, j):
    res = req.get(url)
    tree = lxml.html.fromstring(res.text)
    id_date = css('.auto-style5:nth-child(1)')
    id_g_num = css('.auto-style5:nth-child(2)')
    id_s_num = css('.auto-style5:nth-child(3)')
    date_list = id_date(tree)
    g_num_list = id_g_num(tree)
    s_num_list = id_s_num(tree)
    if j == 1:
        g_num_temp = [g_num_list[i].text for i in range(1, len(g_num_list))]

        def split(txt):
            # drop non-breaking spaces, then split on commas and cast to int
            # (the original split twice and discarded the first result)
            u0 = re.split(',', txt.replace('\xa0', ''))
            u1 = np.array(u0).astype(int)
            return u1

        g_num = np.array([split(g_num_temp[i]) for i in range(len(g_num_temp))])
        dataset = g_num
    elif j == 2:
        dataset = np.matrix([
            s_num_list[i].text for i in range(1, len(s_num_list))
        ]).astype(int).transpose()
    else:
        dataset = np.matrix(
            [date_list[i].text for i in range(1, len(date_list))]).transpose()
    return dataset
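# A minimal usage sketch (assumed): the URL is a placeholder, and the meaning of
# each column (dates, main numbers, supplementary number) is inferred from the
# selectors above rather than documented anywhere.
draws_url = "http://example.com/lottery/history.html"  # hypothetical
dates = obtain(draws_url, 3)    # column matrix of draw-date strings
main_nums = obtain(draws_url, 1)  # 2-D int array, one row of numbers per draw
supp_nums = obtain(draws_url, 2)  # column matrix of supplementary ints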
def test_to_xml_multi_form(self):
    data = dict(self.make_mgmt_data(2).items() +
                self.make_sub_form_data(0).items() +
                self.make_sub_form_data(1).items())
    form = Form(self.uih, data, prefix=self.prefix)
    self.assertTrue(form.is_valid())
    root = create_root('root')
    form.to_xml(root)
    self.assertTrue(css('skymaker[id="6"]')(root))
    self.assertTrue(css('skymaker module-version')(root))
    self.assertTrue(css('skymaker parents item')(root))
    self.assertEqual(len(css('skymaker images item')(root)), 2)
def get_els(page_html, filters, selector='.hdrlnk'):
    '''Yields (to_crawl, skip) pairs.'''
    for el in css(selector)(html.fromstring(page_html)):
        if any(i.upper() in el.text.upper() for i in filters):
            yield el, None
        else:
            yield None, el
def selector(s):
    if s.startswith('css:'):
        return css(s[4:]).path
    elif s.startswith('xpath:'):
        return s[6:]
    else:
        return s
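# A minimal usage sketch (assumed): `selector` normalises a prefixed selector
# string to an XPath expression via CSSSelector's `.path` attribute. The
# example selector strings below are hypothetical.
print(selector('css:div.item a'))   # e.g. "descendant-or-self::div[...]/descendant-or-self::*/a"
print(selector('xpath://div//a'))   # "//div//a", prefix stripped
print(selector('//span'))           # unprefixed strings pass through unchanged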
def parse_summary_page(html):
    doc = etree.HTML(html)
    li_elms = css("li.j_thread_list")(doc)
    base_url = "http://tieba.baidu.com%s"
    link_items = []
    for li in li_elms:
        link = css("a.j_th_tit")(li)[0]
        href = base_url % link.get('href')
        reply_str = css("span.j_reply_data")(li)[0].text
        link_items.append(PageItem(href, reply_str))
    next_page_links = css("a.next")(doc)
    if next_page_links:
        next_link = base_url % next_page_links[0].get("href")
    else:
        next_link = None
    return (next_link, link_items)
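# A minimal crawl loop (assumed) built on parse_summary_page: keep following
# the "next" link until it is exhausted. `fetch` is a hypothetical helper that
# returns the raw HTML for a URL (e.g. requests.get(url).text).
def crawl_thread_list(start_url, fetch):
    url, items = start_url, []
    while url:
        url, page_items = parse_summary_page(fetch(url))
        items.extend(page_items)
    return items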
def scrape_organization_info(url):
    doc = get_document(url)
    org = {}
    try:
        org['name'] = css("#voicearea h1 span")(doc)[0].text
        for link in css("#voicearea div.contactinfo a[href]")(doc):
            ref = link.get('href')
            if ref.startswith('mailto:'):
                org['mailto'] = ref.replace('mailto:', '').strip()
            elif ref.startswith('http'):
                if "eniro.se/" not in ref and "hitta.se/" not in ref:
                    org['url'] = ref
            else:
                pass  # print " # unexpected href: %r" % ref
        return org
    except:
        print "ERROR in content from <%s>:" % url
        print etree.tostring(doc)
        raise
def parse_detail_page(self, html):
    core_selector = css("div.core")
    doc = etree.HTML(html)
    core_div = core_selector(doc)
    if core_div:
        content_list = core_div[0]
    else:
        return
    post_contents = css("div.l_post")(content_list)
    main_author = None
    contents = []
    created_at = None
    for div in post_contents:
        dumped_data = div.get("data-field")
        try:
            data = json.loads(dumped_data)
        except:
            print "Parse json error"
            continue
        _content = data["content"]
        _author = data["author"]
        user_id = _author.get("id") or _author.get("name")
        main_author = user_id if main_author is None else main_author
        created_at = _content["date"]
        if main_author != user_id:
            break
        post_content = css("cc div.d_post_content")(div)[0]
        current_content = tostring(post_content, encoding="UTF-8", method="text")
        contents.append(current_content)
    self.title = css(".core_title_txt")(content_list)[0].text.encode("UTF-8")
    self.content = "\n\n".join(contents)
    self.created_at = created_at
    self.save_to_sqlite()
def parse():
    url = 'http://eu4.paradoxwikis.com/Countries'
    r = requests.get(url)
    parser = etree.HTML(r.content)
    # url = 'Countries - Europa Universalis 4 Wiki.html'
    # with open(url, 'r') as f:
    #     parser = etree.HTML(f.read())

    # CSS selectors
    sel = css('table.wikitable tr')
    flags = []
    for row in sel(parser):
        children = row.getchildren()
        tdflag = children[1]
        tdtag = children[2]
        flag = tdflag.cssselect('img')
        if len(flag) == 0:
            continue
        # Original sauce
        src = flag[0].get('src')
        src = src.split('/')
        del src[-1]
        del src[2]
        src = "/".join(src)
        tag = str(tostring(tdtag)[5:8])[2:-1]
        flags.append(str(tag) + "-" + src)
    with open(CACHE, 'w') as f:
        f.write("\n".join(flags))
    print('Wrote {0} countries to file'.format(len(flags)))
def handle(self, *args, **options):
    if len(args) != 1:
        raise CommandError("Please specify a directory.")
    directory = args[0]
    if not os.path.exists(directory):
        os.mkdir(directory)

    # start scraper
    scraper = scrapelib.Scraper(requests_per_minute=60, allow_cookies=True,
                                follow_robots=True)

    # open contractor CSV
    contractor_file = open(os.path.join(directory, 'contractors.csv'), 'wb')
    contractor_csv = csv.DictWriter(contractor_file, CONTRACTOR_FIELDS,
                                    restval='', extrasaction='ignore')
    contractor_csv.writer.writerow(CONTRACTOR_FIELDS)

    # first grab overall search page
    print 'Scraping main listing...'
    overall_text = scraper.urlopen("http://www.contractormisconduct.org/index.cfm/1,73,224,html?pnContractorID=0&pstDispositionTypeID=0&prtCourtTypeID=0&mcType=0&eaType=0&ContractType=0&dollarAmt=-1%2F-1&dateFrom=01%2F01%2F1985&dateTo=01%2F01%2F2025&submit=sort")
    overall_doc = document_fromstring(overall_text)

    # enumerate the organizations
    for org_option in css('select[name=pnContractorID] option')(overall_doc):
        if org_option.attrib['value'] != '0':
            contractor_csv.writerow({
                'Contractor': org_option.text,
                'URL': 'http://www.contractormisconduct.org/index.cfm/1,73,221,html?ContractorID=%s' % org_option.attrib['value'],
            })
    contractor_file.close()

    # open instance CSV
    instance_file = open(os.path.join(directory, 'instances.csv'), 'wb')
    instance_csv = csv.DictWriter(instance_file, INSTANCE_FIELDS,
                                  restval='', extrasaction='ignore')
    instance_csv.writer.writerow(INSTANCE_FIELDS)

    # iterate over links from main page and grab their data
    links = css('td.caseRow a')(overall_doc)
    for i in range(len(links)):
        link = links[i]
        url = urlparse.urljoin("http://www.contractormisconduct.org/index.cfm/1,73,224,html",
                               link.attrib['href'])
        print 'Scraping %s (%s of %s)' % (url, i + 1, len(links))

        instance_text = scraper.urlopen(url)
        instance_doc = document_fromstring(UnicodeDammit(instance_text, isHTML=True).unicode)

        row = {
            'Contractor': css('#primecontent > h2')(instance_doc)[0].text,
            'Instance': sanitize(css('#incident > h2')(instance_doc)[0].text),
            'URL': url,
        }

        for field in css('#incident > p > strong')(instance_doc):
            field_name = field.text.replace(':', '')
            field_contents = sanitize(field.tail.strip())

            if field_name == 'Date':
                date_parts = field_contents.split(None, 1)
                row['Date'] = date_parts[0]
                row['Year'] = row['Date'].split('/')[-1]
                row['Significance of Date'] = date_parts[1][1:-1] if len(date_parts) > 1 else ''
            elif field_name == 'Amount':
                row['Misconduct Penalty Amount'] = field_contents.replace('$', '').replace(',', '') if DOLLARS.match(field_contents) else ''
            else:
                row[field_name] = field_contents

        instance_csv.writerow(row)

    instance_file.close()
def get_image_4chan(self, path, staging, rate_limit=0):
    """
    Create image from path

    If the path is local, simply read the local path and return an Image
    representing it. If not, attempt to download the image from elsewhere,
    and cache the downloaded result if possible, else discard the file
    afterwards.

    :param path:  Path to image, either a local path or a URL
    :param rate_limit:  Seconds to wait after downloading, if downloading
    :return Image:  Image object, or nothing if loading it failed
    """
    # do we have the file locally?
    filename = Path(config.PATH_IMAGES, path.split("/")[-1])
    if filename.exists():
        return Image.open(str(filename))

    while self.previous_download > time.time() - rate_limit:
        time.sleep(0.1)
    self.previous_download = time.time()

    rate_regex = re.compile(r"Search limit exceeded. Please wait ([0-9]+) seconds before attempting again.")

    # get link to image from external HTML search results
    # detect rate limiting and wait until we're good to go again
    page = requests.get(path)
    rate_limited = rate_regex.search(page.content.decode("utf-8"))
    while rate_limited:
        self.log.debug("Rate-limited by external source. Waiting %s seconds." % rate_limited[1])
        time.sleep(int(rate_limited[1]))
        page = requests.get(path)
        rate_limited = rate_regex.search(page.content.decode("utf-8"))

    # get link to image file from HTML returned
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)
    image_url = css("a.thread_image_link")(tree)[0].get("href")

    # download image itself
    image = requests.get(image_url, stream=True)
    if image.status_code != 200:
        raise FileNotFoundError

    # cache the image for later, if needed
    if config.PATH_IMAGES and Path(config.PATH_ROOT, config.PATH_IMAGES).exists():
        md5 = hashlib.md5()
        based_hash = path.split("/")[-1].split(".")[0].replace("_", "/")
        extension = image_url.split(".")[-1].lower()
        md5.update(base64.b64decode(based_hash))
        local_path = Path(config.PATH_IMAGES, md5.hexdigest() + "." + extension)
        delete_after = False
    else:
        local_path = staging.joinpath("temp-image")
        delete_after = True

    # save file, somewhere
    with local_path.open('wb') as file:
        for chunk in image.iter_content(1024):
            file.write(chunk)

    # avoid getting rate-limited by image source
    time.sleep(rate_limit)
    picture = Image.open(local_path)

    # if no image folder is configured, delete the temporary file
    if delete_after:
        local_path.unlink()

    return picture
def get_els(page_html, selector='.athing'):
    for tr in css(selector)(html.fromstring(page_html)):
        a = css('a')(tr.getchildren()[2])[0]
        yield tr, a
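# A minimal usage sketch (assumed): iterate over story rows and print each
# title link. `page_html` is hypothetical input, e.g. the body of a page
# fetched with requests.get(...).text.
for tr, a in get_els(page_html):
    print(a.text_content(), a.get('href'))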
def process(self):
    """
    This takes a 4CAT results file as input, extracts Wikipedia links from the
    post bodies, looks up the categories of each linked page, and writes a GDF
    network file of pages and their categories.
    """
    months = {}

    # we use these to extract URLs and host names if needed
    link_regex = re.compile(r"https?://en.wikipedia\.org/wiki/[^\s.]+")
    # note: the original pattern started with a stray "[", turning it into a
    # character class; "[[Page]]" wiki-link syntax is what is meant here
    wiki_page = re.compile(r"\[\[[^\]]+\]\]")
    category_regex = re.compile(r"\[\[Category:[^\]]+\]\]")
    trailing_comma = re.compile(r",$")

    # initialise
    links = {}
    all_categories = {}
    counter = 1
    errors = 0
    page_categories = {}
    page_links = {}
    deep_pages = {}

    # find all links in post bodies
    self.dataset.update_status("Reading source file")
    for post in self.iterate_csv_items(self.source_file):
        wiki_links = link_regex.findall(post["body"])
        wiki_links = [trailing_comma.sub("", link) for link in wiki_links]

        # if we have a per-post URL, include that as well
        if "url" in post and post["url"] and link_regex.match(post["url"]):
            wiki_links.append(post["url"])

        for link in wiki_links:
            link = "/wiki/".join(link.split("/wiki/")[1:]).split("#")[0]
            if link not in links:
                links[link] = 0
            links[link] += 1

    # just a helper function to get the HTML content of a node
    def stringify_children(node):
        from lxml.etree import tostring
        from itertools import chain
        parts = ([node.text] +
                 list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
                 [node.tail])
        # filter removes possible Nones in texts and tails
        return ''.join(filter(None, parts))

    self.dataset.update_status("Fetching categories from Wikipedia API...")
    for link in links:
        if link not in page_categories:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while fetching data from Wikipedia")

            page_categories[link] = set()
            self.dataset.update_status("Fetching categories from Wikipedia API, page %i of %i" % (counter, len(links)))
            counter += 1

            # fetch wikipedia source
            url = "https://en.wikipedia.org/w/index.php?title=" + link + "&action=edit"
            try:
                page = requests.get(url)
            except requests.RequestException:
                errors += 1
                continue

            if page.status_code != 200:
                errors += 1
                continue

            # get the wikitext source from the edit form in the returned HTML
            parser = etree.HTMLParser()
            tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)
            try:
                wiki_source = stringify_children(css("#wpTextbox1")(tree)[0])
            except IndexError:
                # not a source page?
                errors += 1
                continue

            # extract category names from category link syntax
            categories = category_regex.findall(wiki_source)
            categories = set([":".join(category.split(":")[1:])[:-2].split("|")[0] for category in categories])

            # save category links
            for category in categories:
                # Add " (cat)" to the category strings.
                # This is needed because pages can sometimes have the same
                # name as the category, which would give duplicate nodes and
                # a faulty graph.
                category += " (cat)"

                if category not in all_categories:
                    all_categories[category] = 0
                all_categories[category] += 1
                page_categories[link].add(category)

            # if needed, also include pages linked to from within the
            # fetched page source
            if self.parameters.get("deep_pages", None):
                linked_pages = wiki_page.findall(wiki_source)
                for page in linked_pages:
                    page = page.split("|")[0]
                    if page not in deep_pages:
                        deep_pages[page] = 0
                    deep_pages[page] += 1

                    if link not in page_links:
                        page_links[link] = set()
                    page_links[link].add(page)

    # write GDF file
    with self.dataset.get_results_path().open("w", encoding="utf-8") as results:
        results.write("nodedef>name VARCHAR,type VARCHAR,weight INTEGER\n")
        for page in page_categories:
            results.write("'" + page.replace("_", " ").replace(",", "") + "',page," + str(links[page]).replace(",", "") + "\n")
        for category in all_categories:
            results.write("'" + category.replace("_", " ").replace(",", "") + "',category," + str(all_categories[category]).replace(",", "") + "\n")

        results.write("edgedef>node1 VARCHAR, node2 VARCHAR, weight INTEGER\n")
        for page in page_categories:
            for category in page_categories[page]:
                results.write("'" + page.replace("_", " ").replace(",", "") + "','" + category.replace("_", " ").replace(",", "") + "'\n")

    self.dataset.finish(len(page_categories))
def test_form_in_response(self):
    code, html = self.get_html()
    # assertTrue(code, 200) treats 200 as the failure message and never fails;
    # the intended check is equality
    self.assertEqual(code, 200)
    self.assertTrue(css('.id_mock_image-apply_mock_image')(html))
    self.assertTrue(css('#mock_image_params')(html))
def process_html(text):
    tree = html.fromstring(text)
    for img in css("img")(tree):
        src = img.get("src")
        print "src: %s" % src
def test_one_sub_form_in_response(self):
    code, html = self.get_html()
    # as above, assertEqual is the intended assertion for the status code
    self.assertEqual(code, 200)
    self.assertTrue(css('#single_form_template')(html))
                'checkout': '05%2F05%2F2017',
                'neighborhoods[]': '%s' % (hood),
            }
        },
        'active': 1,
    } for hood in neighborhoods],
    'to_parser': {
        'raw_html': False,
        'object_types': {
            'next_page': {
                'objects': {
                    'next_page': {
                        'parse_func': sel_attr,
                        'kwargs': {
                            'attr': 'href',
                            'selector': css('.next_page a'),
                        },
                        'follow': 1,
                    },
                },
            },
            'airbnb_listing': {
                'pre_selector': css('.listing'),
                'objects': {
                    'url': {
                        'parse_func': sel_attr,
                        'kwargs': {
                            'attr': 'data-url',
                        },
                    },
                    'reviews': {
def comment_els(comment_html, text_class='.c00',
                match_on=r'.*amazon\.co.*/.*|.*amzn\.co.*/.*'):
    for comment in css(text_class)(html.fromstring(comment_html)):
        links = css('a')(comment)
        for link in links:
            if ('reply' not in link.text
                    and re.match(match_on, link.get('href'))
                    and 'aws.' not in link.get('href')):
                yield comment, link
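# A minimal usage sketch (assumed): pull amazon.co / amzn.co links out of a
# fetched comment page. `comment_html` is hypothetical input, e.g. the body of
# an item page fetched with requests.
for comment, link in comment_els(comment_html):
    print(link.get('href'))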
dumpert_class = [{
    'name': 'Dumpert',
    'domain': 'http://dumpert.nl',
    'num_get': 1,
    'phases': [{
        'to_getter': [{
            'url': 'http://dumpert.nl/{}/'.format(i if i else ''),
            'active': 1,
        } for i in range(2)],
        'to_parser': {
            'object_types': {
                'upload': {
                    'pre_selector': css('a.dumpthumb'),
                    'to_store': {
                        'func': store_json,
                        'kwargs': {
                            'filename': 'dumpert',
                        }
                    },
                    'attrs': {
                        'url': {
                            'func': sel_attr,
                            'kwargs': {
                                'attr': 'href',
                            }
                        },
                        'title': {
                            'func': sel_text,
import mechanize
from lxml import html
from lxml.cssselect import CSSSelector as css
from urlparse import urljoin
from topps import scraper
from math import floor
from topps import util

db = util.connect_db(scraper)
cursor = db.cursor()

br = mechanize.Browser()
index_tree = html.fromstring(br.open("http://www.nfl.com/players").read())

get_team_links = css("#byTeamRosterTable a")
get_divisions = css("#byTeamRosterTable .bold")

team_links = [(anchor.text, anchor.get('href'))
              for anchor in get_team_links(index_tree)]
divisions = [div.text.split(" ") for div in get_divisions(index_tree)]
conferences = set([div[0] for div in divisions])
conference_ids = []

# for conf in conferences:
#     cursor.execute("""INSERT INTO conference (name) VALUES ("{0}");""".format(conf))
#     print cursor.fetchall()

# for division in divisions:
#     print division[0], division[1]
#     cursor.execute("""INSERT INTO division (name, conference_name) VALUES ("{1}", "{0}");""".format(division[0], " ".join(division)))
#     db.commit()
def get_image(self, path, rate_limit=0):
    """
    Create image from path

    If the path is local, simply read the local path and return an Image
    representing it. If not, attempt to download the image from elsewhere,
    and cache the downloaded result if possible, else discard the file
    afterwards.

    :param path:  Path to image, either a local path or a URL
    :param rate_limit:  Seconds to wait after downloading, if downloading
    :return Image:  Image object, or nothing if loading it failed
    """
    rate_regex = re.compile(r"Search limit exceeded. Please wait ([0-9]+) seconds before attempting again.")

    if isinstance(path, Path):
        # local file
        return Image.open(path)

    # get link to image from external HTML search results
    # detect rate limiting and wait until we're good to go again
    page = requests.get(path, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15"})
    rate_limited = rate_regex.search(page.content.decode("utf-8"))
    while rate_limited:
        self.log.debug("Rate-limited by external source. Waiting %s seconds." % rate_limited[1])
        time.sleep(int(rate_limited[1]))
        page = requests.get(path)
        rate_limited = rate_regex.search(page.content.decode("utf-8"))

    # get link to image file from HTML returned
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)
    image_url = css("a.thread_image_link")(tree)[0].get("href")

    # download image itself
    image = requests.get(image_url, stream=True)

    # if not available, the thumbnail may be
    if image.status_code != 200:
        thumbnail_url = ".".join(image_url.split(".")[:-1]) + "s." + image_url.split(".")[-1]
        image = requests.get(thumbnail_url, stream=True)

    if image.status_code != 200:
        raise FileNotFoundError

    # cache the image for later, if needed
    if config.PATH_IMAGES:
        md5 = hashlib.md5()
        based_hash = path.split("/")[-1].split(".")[0].replace("_", "/")
        extension = image_url.split(".")[-1].lower()
        md5.update(base64.b64decode(based_hash))
        local_path = Path(config.PATH_IMAGES, md5.hexdigest() + "." + extension)
        delete_after = False
    else:
        query_result = self.dataset.get_results_path()
        local_path = Path(query_result.parent, query_result.name + "-temp")
        delete_after = True

    # save file, somewhere
    with open(local_path, 'wb') as file:
        for chunk in image.iter_content(1024):
            file.write(chunk)

    # avoid getting rate-limited by image source
    time.sleep(rate_limit)
    picture = Image.open(local_path)

    # if no image folder is configured, delete the temporary file
    if delete_after:
        local_path.unlink()

    return picture
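# A minimal usage sketch (assumed): `processor` stands in for whatever object
# owns get_image; the URL is a placeholder for an external archive page that
# exposes the image behind an "a.thread_image_link" element.
pic = processor.get_image("https://archive.example/image/abcdef.jpg", rate_limit=1)
pic.thumbnail((100, 100))  # PIL Image, so the usual operations apply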
from lxml.cssselect import CSSSelector as css
import time

clss = {
    'mediamarkt': {
        'start': [
            {
                'url': '/scholenoverzicht/vo/',
                'active': 1,
            },
        ],
        'list_url': 'http://www.onderwijsconsument.nl',
        'object_url': 'http://www.onderwijsconsument.nl',
        # 'iter_class': css('li.pagination-next a'),
        'css': {
            'list_class': css('#lijst .school a'),
            'sections': {
                'school': {
                    'selector': css('#school'),
                    'css': {
                        'onderwijs': {
                            'func': sel_text,
                            'params': {
                                'selector': css('.lead'),
                            },
                        },
                        'name': {
                            'func': sel_text,
                            'params': {
                                'selector': css('h2')
                            }
from parse_functions import *  # noqa
from lxml.cssselect import CSSSelector as css

clss = {
    'lyrics': {
        'skip_object': '',
        'start': [
            {
                'url': 'wiki/LyricWiki:Top_100',
                'active': True,
            },
        ],
        'list_url': 'http://lyrics.wikia.com/',
        'object_url': 'http://lyrics.wikia.com/',
        'css': {
            'list_class': css('li b a:not(a.new)'),
            'sections': {
                'lyrics': {
                    'artist': {
                        'func': parse_attr,
                        'params': {
                            'attr': 'content',
                            'selector': css('meta[property=title]')
                        }
                    },
                    'lyric': {
                        'func': parse_regex,
                        'params': {
                            'selector': css('.lyricbox'),
                            'regex': ";([a-zA-z\d.,? '’\"!\(\)-]*)\n",
                        }
clss = [
    {
        'name': 'mediamarkt',
        'domain': 'http://www.mediamarkt.nl',
        'phases': [
            {
                'to_getter': [
                    {
                        'url': 'http://www.mediamarkt.nl/',
                        'active': 1,
                    },
                ],
                'to_parser': {
                    'object_types': {
                        'link': {
                            'pre_selector': css('#top-navigation'),
                            'attrs': {
                                'menu_item': {
                                    'func': sel_attr,
                                    'kwargs': {
                                        'selector': css('li.item a'),
                                        'index': 0,
                                        'attr': 'href'},
                                    'follow': {'forward': 1},
                                },
                            },
                        },
                    }
                },
            },
            {
                'to_parser': {