def render(self, info=None, format='html', encoding='unicode'):
    '''
    Render string from template.

    :keyword info: data to substitute into a document; when not ``None``
        it is applied with ``self.__imod__(info)`` before rendering
    :keyword str format: output flavour, one of ``'html'``, ``'xhtml10'``
        or ``'xhtml11'``
    :param str encoding: encoding type for return string
    :returns: whatever ``htostring`` produces for the selected doctype
    :raises ValueError: if ``format`` is not one of the supported values
    '''
    # Validate first: previously an unknown format only failed at the final
    # ``return`` (unbound ``doc``), after ``info`` had already been merged.
    if format not in ('html', 'xhtml10', 'xhtml11'):
        raise ValueError('unsupported format: %r' % (format,))
    # substitute any info into the document
    if info is not None:
        self.__imod__(info)
    # serialize the result of _copytree rather than self._tree itself
    tree = _copytree(self._tree)
    if format == 'html':
        # strip namespace prefix from HTML and use the _html5 doctype
        xhtml_to_html(tree)
        return htostring(tree, encoding=encoding, doctype=_html5)
    # both XHTML flavours are serialized as XML and differ only in doctype
    doctype = _xhtml10 if format == 'xhtml10' else _xhtml11
    return htostring(
        tree,
        method='xml',
        encoding=encoding,
        doctype=doctype,
    )
def scrape_category(url, title): category_slug = slugify(title) #if testing and category_slug != 'storage-servers-nas': # return try: f = urlopen(url) except ValueError: if trace: print 'Retrying:', url url = 'http://eracks.com' + url.replace(' ', '%20') if trace: print 'As:', url f = urlopen(url) doc = html5lib.parse( f, treebuilder='lxml', namespaceHTMLElements=False ) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) prods = jQuery('#products a').filter( lambda not_used: 'config?sku=' in PyQuery(this).attr('href')) for a in prods: scrape_product(PyQuery(a).attr('href'), category_slug)
def scrape(slug, url, name, title=None): f = urlopen(url) doc = f.read() doc, errs = tidy_document( doc, options={ "output-html": 1, #'indent':1, "clean": 1, "drop-font-tags": 1, }, ) if errs: # raise Exception, errs print errs doc = html5lib.parse(doc, treebuilder="lxml") # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) td = jQuery("td#content") assert len(td) == 1 for img in td("img"): # print 'img:', PyQuery (img) img = PyQuery(img) src = img.attr("src") # alt = img.attr('alt') # if src.startswith ('/image'): rslt = getimage(src, slug.split("/")[0]) img.attr("src", rslt) if trace: print rslt # td = # no_fonts (td) # need to fix links here content = PyQuery(td[0]) # content = content.html() content = no_namespaces(content.html()) print slug, content[:60] # .html() # [:60] if dbteeth: # q, created = QuickPage.objects.get_or_create ( qp, created = create_or_update( QuickPage, keys=dict(slug=slug), fields=dict( name=name, title=title if title else name, content=content, # defaults = dict (sortorder = sortorder), ), )
def scrape_category (url, title): category_slug = slugify (title) try: f = urlopen (url) except ValueError: if trace: print 'Retrying:', url url = 'http://eracks.com' + url.replace (' ','%20') if trace: print 'As:', url f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) page_title = jQuery ('title').text() if page_title.startswith ("eRacks Open Source Systems: "): page_title = page_title.partition ("eRacks Open Source Systems: ") [-1] if page_title.startswith ("eRacks "): page_title = page_title.partition ("eRacks ") [-1] content = jQuery ('td#content') links = content ('a') images = content ('img') for link in links: a = PyQuery (link) href = a.attr('href') skus = find_sku.findall (href) if skus: sku = skus [0] #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku))) a.attr ('href', '/products/%s/%s/' % (category_slug, sku)) elif href.startswith ('/Legacy'): sku = slugify (href.split ('/') [-1]) #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku))) a.attr ('href', '/products/%s/%s/' % (category_slug, sku)) print 'link:', a.attr('href') for image in images: img = PyQuery (image) src = img.attr('src') newsrc = getimage (src, 'categories/' + category_slug) img.attr ('src', newsrc) print 'image:', newsrc description = content.html() if trace: print description if dbteeth: cat = Categories.objects.get (name=title) cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today()) cat.description = description cat.title = page_title cat.save() print '..saved.'
def scrape(slug, url, name, title=None): f = urlopen(url) doc = f.read() doc, errs = tidy_document( doc, options={ 'output-html': 1, #'indent':1, 'clean': 1, 'drop-font-tags': 1, }) if errs: #raise Exception, errs print errs doc = html5lib.parse( doc, treebuilder='lxml' ) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) td = jQuery('td#content') assert len(td) == 1 for img in td('img'): #print 'img:', PyQuery (img) img = PyQuery(img) src = img.attr('src') #alt = img.attr('alt') #if src.startswith ('/image'): rslt = getimage(src, slug.split('/')[0]) img.attr('src', rslt) if trace: print rslt #td = #no_fonts (td) # need to fix links here content = PyQuery(td[0]) #content = content.html() content = no_namespaces(content.html()) print slug, content[:60] #.html() # [:60] if dbteeth: #q, created = QuickPage.objects.get_or_create ( qp, created = create_or_update( QuickPage, keys=dict(slug=slug), fields=dict( name=name, title=title if title else name, content=content, #defaults = dict (sortorder = sortorder), ))
def clean_html(tree):
    """Serialize ``tree`` as HTML text with namespaced attributes removed.

    The work is done on a ``deepcopy`` of ``tree``, so the argument itself
    is not modified.  Attributes whose name starts with ``{`` are dropped,
    then the copy is passed through ``xhtml_to_html`` and ``normalize_ns``
    before being serialized with ``etree.tostring(method="html")``.
    """
    mytree = deepcopy(tree)
    for elem in mytree.iter():
        # Iterate over a snapshot of the names: attributes are deleted
        # inside the loop, and the values are never needed.
        for attr in list(elem.attrib.keys()):
            if attr.startswith('{'):
                del elem.attrib[attr]
    xhtml_to_html(mytree)
    return etree.tostring(normalize_ns(mytree), method="html", encoding=unicode)
def parse_table():
    """Return the longest ``<table>`` element of the city list page.

    The page is read from the local file ``citylist.html`` when it exists;
    otherwise it is parsed from ``WIKI_URL`` and written to that file.
    "Longest" is by ``len()`` of the table element.
    """
    cache_path = 'citylist.html'
    cached = os.path.isfile(cache_path)
    origin = cache_path if cached else WIKI_URL

    log("Reading %s" % origin)
    tree = etree.parse(origin)
    if not cached:
        # keep a local copy for the next run
        tree.write(cache_path, encoding='utf-8')

    html.xhtml_to_html(tree)
    by_length = sorted(tree.findall('//table'), key=len)
    return by_length[-1]  # longest table
def scrape_product (url, title): f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml') # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) cat = Categories.objects.get (name=title) #name = title #slug = slugify (title) description = jQuery ('td#content').html() #print description [:50] cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today()) cat.description = description cat.save() print '..saved.'
def scrape_product(url, title): f = urlopen(url) doc = html5lib.parse( f, treebuilder='lxml' ) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) cat = Categories.objects.get(name=title) #name = title #slug = slugify (title) description = jQuery('td#content').html() #print description [:50] cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str( datetime.date.today()) cat.description = description cat.save() print '..saved.'
def scrape_category (url, title): category_slug = slugify (title) #if testing and category_slug != 'storage-servers-nas': # return try: f = urlopen (url) except ValueError: if trace: print 'Retrying:', url url = 'http://eracks.com' + url.replace (' ','%20') if trace: print 'As:', url f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) prods = jQuery ('#products a').filter (lambda not_used: 'config?sku=' in PyQuery(this).attr('href')) for a in prods: scrape_product (PyQuery(a).attr ('href'), category_slug)
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element, or an object with a ``getroot()`` method, in
    which case the root element is cleaned.  What gets stripped is decided
    by the instance options read below (``scripts``, ``javascript``,
    ``style``, ``comments``, ...).  Nothing is returned.

    Raises ValueError if both ``allow_tags`` and ``remove_unknown_tags``
    are set.
    """
    try:
        getroot = doc.getroot
    except AttributeError:
        pass  # Element instance
    else:
        doc = getroot()  # ElementTree instance, instead of an element
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: dropped together with their content (drop_tree);
    # remove_tags: only the tag itself is dropped (drop_tag).
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            # snapshot the names: entries are deleted inside the loop
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                # (chain on ``new`` -- substituting on ``old`` again would
                # throw away the javascript substitution made just above)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments:
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, mutate afterwards, so the tree is not modified while
    # doc.iter() is walking it.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags"
            )
        allow_tags = set(defs.tags)
    if allow_tags:
        # make sure we do not remove comments/PIs if users want them (which is rare enough)
        if not self.comments:
            allow_tags.add(etree.Comment)
        if not self.processing_instructions:
            allow_tags.add(etree.ProcessingInstruction)

        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # the root cannot be dropped; turn it into a bare <div>
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # leave rel alone when it already lists nofollow
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element, or an object with a ``getroot()`` method, in
    which case the root element is cleaned.  What gets stripped is decided
    by the instance options read below (``scripts``, ``javascript``,
    ``style``, ``comments``, ...).  Nothing is returned.

    Raises ValueError if both ``allow_tags`` and ``remove_unknown_tags``
    are set.
    """
    if hasattr(doc, "getroot"):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter("image"):
        el.tag = "img"
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: dropped together with their content (drop_tree);
    # remove_tags: only the tag itself is dropped (drop_tag).
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add("script")
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            # snapshot the names: entries are deleted inside the loop
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith("on"):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link, resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get("style")
                new = _css_javascript_re.sub("", old)
                new = _css_import_re.sub("", new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib["style"]
                elif new != old:
                    el.set("style", new)
        if not self.style:
            for el in list(doc.iter("style")):
                if el.get("type", "").lower().strip() == "text/javascript":
                    el.drop_tree()
                    continue
                old = el.text or ""
                new = _css_javascript_re.sub("", old)
                # The imported CSS can do anything; we just can't allow:
                # (chain on ``new`` -- substituting on ``old`` again would
                # throw away the javascript substitution made just above)
                new = _css_import_re.sub("", new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = "/* deleted */"
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add("style")
    if self.inline_style:
        etree.strip_attributes(doc, "style")
    if self.links:
        kill_tags.add("link")
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter("link")):
            if "stylesheet" in el.get("rel", "").lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add("meta")
    if self.page_structure:
        remove_tags.update(("head", "html", "title"))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter("param")):
            parent = el.getparent()
            while parent is not None and parent.tag not in ("applet", "object"):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(("applet",))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(("iframe", "embed", "layer", "object", "param"))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add("form")
        kill_tags.update(("button", "input", "select", "textarea"))
    if self.annoying_tags:
        remove_tags.update(("blink", "marquee"))

    # Collect first, mutate afterwards, so the tree is not modified while
    # doc.iter() is walking it.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = "div"
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != "html":
            el.tag = "div"
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags"
            )
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # the root cannot be dropped; turn it into a bare <div>
                el = bad.pop(0)
                el.tag = "div"
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get("rel")
                if rel:
                    # leave rel alone when it already lists nofollow
                    if "nofollow" in rel and " nofollow " in (" %s " % rel):
                        continue
                    rel = "%s nofollow" % rel
                else:
                    rel = "nofollow"
                el.set("rel", rel)
print d('p').filter(lambda i: i == 1) print d('p').filter(lambda i: i == 2) print d('p').filter(lambda not_used: PyQuery(this).text() == 'Hi') ''' import re find_sku = re.compile ('config\?sku=(.*)\&.*') #print sku.search ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578') #print sku.findall ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578') #sys.exit() f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml') # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) for a in jQuery ('#products a'): # [-1:]: # skip 'Legacy' at the end a = PyQuery (a) title = a.text() href = a.attr ('href') #assert title == unquote (urlparse (href).path).split ('/') [-1] ## link = '/products/ print 'Working on:', slugify (title), title #, href scrape_category (href, title) print 'Done:', title print
print d('p').filter(lambda i: i == 2) print d('p').filter(lambda not_used: PyQuery(this).text() == 'Hi') ''' import re find_sku = re.compile('config\?sku=(.*)\&.*') #print sku.search ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578') #print sku.findall ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578') #sys.exit() f = urlopen(url) doc = html5lib.parse( f, treebuilder='lxml' ) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) for a in jQuery('#products a'): # [-1:]: # skip 'Legacy' at the end a = PyQuery(a) title = a.text() href = a.attr('href') #assert title == unquote (urlparse (href).path).split ('/') [-1] ## link = '/products/ print 'Working on:', slugify(title), title #, href scrape_category(href, title) print 'Done:', title print
## globals url = "http://eracks.com/customers" teeth = 0 # whether to write scraped images ## main f = urlopen(url) doc = html5lib.parse(f, treebuilder="lxml") # this didn't work, but above three lines did: encoding='utf-8', print html.xhtml_to_html(doc) jQuery = PyQuery([doc]) def getimage(src): f = urlopen(src) info = f.info() fname = src.split("/")[-1] + "." + info.getsubtype() path = "/home/joe/Projects/django_eracks/static/images/customers/" + fname if teeth: open(path, "wb").write(f.read()) return "/images/customers/" + fname # url for retrieval href = src = caption = loc = title = ""
def scrape_product(url, category_slug): f = urlopen(url) doc = html5lib.parse( f, treebuilder='lxml', namespaceHTMLElements=False ) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html(doc) jQuery = PyQuery([doc]) #content = jQuery ('td#content table').eq(0) content = jQuery('td#content') content('form').remove() # used to do this, but some models (eg blades) don't have tables: #content = jQuery ('td#content table td').eq (0) #if content.is_('table'): # content = content ('table td').eq (0) # nope, this was too simplistic - let's take apart the tables - see below in final save # nope, this doens't work either. I give up. skus = find_sku.findall(url) sku = skus[0] slug = slugify(sku) print sku ''' if sku in ['ESERVE', 'NAS6X', 'NAS16X', 'PREMIUM', 'TWINSERVE', 'PREMIUM2', 'SANDYCORE', 'i7CORE', 'i7SHORT',]: print 'Skipping..' return #elif testing and sku != 'NAS12': # print 'Skipping due to testing..' # return ''' content('.small').filter(lambda notused: PyQuery(this).text().startswith( "Per single unit, this configuration's price")).remove() content('.small').filter(lambda notused: PyQuery(this).text().startswith( "The base price with this configuration is")).remove() content('.small').filter(lambda notused: PyQuery(this).text().startswith( "All eRacks systems come with a Standard")).remove() content('.small').filter(lambda notused: PyQuery(this).text().startswith( "The price differences between the default")).remove() content('.small').filter(lambda notused: PyQuery(this).text().startswith( "Contact eRacks to inquire about leasing")).remove() content('form').remove() content('#pricetext').remove() content('#warrantynote').remove() content('#closenote').remove() xbig = content('.xbig') if xbig: xbig('a').remove() inner = xbig.html().replace(':', '').strip() xbig.replaceWith('<h5 class=xbig>%s</h5>' % inner) print 'xbig replaced:', inner font = content('font[size=4], font[size=5]') if font: font('a').remove() inner = 
font.text().replace(':', '').strip() font.replaceWith('<h5 class="product">%s</h5>' % inner) print 'font replaced:', inner if testing: print print sku, 'content:' print content.html() links = content('a') images = content('img') for link in links: a = PyQuery(link) href = a.attr('href') if href: if '?' in href: href = href.split('?')[ 0] # doesn't this get rid of all get parms? a.attr('href', href) linkskus = find_sku.findall(href) # That this is looking for?!! else: print "Empty Link:", a.html() linkskus = [] print content.html() if linkskus: linksku = linkskus[0] a.attr('href', '/products/%s/%s/' % (category_slug, linksku)) print 'New link:', a.attr('href') elif href.startswith('/Legacy'): linksku = slugify(href.split('/')[-1]) a.attr('href', '/products/%s/%s/' % (category_slug, linksku)) print 'New link:', a.attr('href') elif 'ore photos' in a.text(): print 'Scraping:', href scrape_photos(url, href, slug) #print 'Removing link (scraped):', href #a.remove() print 'Updating "more photos" link:', href a.attr('href', '#photos') a.attr('onclick', '$("#photos-tab").click();') elif href.endswith('_photos'): print 'Scraping:', href scrape_photos(url, href, slug) print 'Updating "<prod>_photos" link:', href a.attr('href', '#photos') a.attr('onclick', '$("#photos-tab").click();') for image in images: img = PyQuery(image) src = img.attr('src') newsrc = getimage(src, 'products/' + slug) img.attr('src', newsrc) print 'image:', newsrc if dbteeth: #prod, created = Product.objects.get_or_create (sku=sku) # prods are already in the db, silly! prod = Product.objects.get(sku=sku) prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str( datetime.date.today()) #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')]) # content.html() prod.description = content.html() # save image(s): # prod.image = # prod.images.add (name, title, src, etc) prod.save() print '..saved.'
def scrape_product (url, category_slug): f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) #content = jQuery ('td#content table').eq(0) content = jQuery ('td#content') content ('form').remove() # used to do this, but some models (eg blades) don't have tables: #content = jQuery ('td#content table td').eq (0) #if content.is_('table'): # content = content ('table td').eq (0) # nope, this was too simplistic - let's take apart the tables - see below in final save # nope, this doens't work either. I give up. skus = find_sku.findall (url) sku = skus [0] slug = slugify (sku) print sku ''' if sku in ['ESERVE', 'NAS6X', 'NAS16X', 'PREMIUM', 'TWINSERVE', 'PREMIUM2', 'SANDYCORE', 'i7CORE', 'i7SHORT',]: print 'Skipping..' return #elif testing and sku != 'NAS12': # print 'Skipping due to testing..' # return ''' content ('.small').filter (lambda notused: PyQuery (this).text().startswith ("Per single unit, this configuration's price")).remove() content ('.small').filter (lambda notused: PyQuery (this).text().startswith ("The base price with this configuration is")).remove() content ('.small').filter (lambda notused: PyQuery (this).text().startswith ("All eRacks systems come with a Standard")).remove() content ('.small').filter (lambda notused: PyQuery (this).text().startswith ("The price differences between the default")).remove() content ('.small').filter (lambda notused: PyQuery (this).text().startswith ("Contact eRacks to inquire about leasing")).remove() content ('form').remove() content ('#pricetext').remove() content ('#warrantynote').remove() content ('#closenote').remove() xbig = content ('.xbig') if xbig: xbig ('a').remove() inner = xbig.html().replace (':','').strip() xbig.replaceWith ('<h5 class=xbig>%s</h5>' % inner) print 'xbig replaced:', inner font = content('font[size=4], font[size=5]') if font: font 
('a').remove() inner = font.text().replace (':','').strip() font.replaceWith ('<h5 class="product">%s</h5>' % inner) print 'font replaced:', inner if testing: print print sku, 'content:' print content.html() links = content ('a') images = content ('img') for link in links: a = PyQuery (link) href = a.attr('href') if href: if '?' in href: href = href.split('?')[0] # doesn't this get rid of all get parms? a.attr ('href', href) linkskus = find_sku.findall (href) # That this is looking for?!! else: print "Empty Link:", a.html() linkskus=[] print content.html() if linkskus: linksku = linkskus [0] a.attr ('href', '/products/%s/%s/' % (category_slug, linksku)) print 'New link:', a.attr('href') elif href.startswith ('/Legacy'): linksku = slugify (href.split ('/') [-1]) a.attr ('href', '/products/%s/%s/' % (category_slug, linksku)) print 'New link:', a.attr('href') elif 'ore photos' in a.text(): print 'Scraping:', href scrape_photos (url, href, slug) #print 'Removing link (scraped):', href #a.remove() print 'Updating "more photos" link:', href a.attr ('href', '#photos') a.attr ('onclick', '$("#photos-tab").click();') elif href.endswith ('_photos'): print 'Scraping:', href scrape_photos (url, href, slug) print 'Updating "<prod>_photos" link:', href a.attr ('href', '#photos') a.attr ('onclick', '$("#photos-tab").click();') for image in images: img = PyQuery (image) src = img.attr('src') newsrc = getimage (src, 'products/' + slug) img.attr ('src', newsrc) print 'image:', newsrc if dbteeth: #prod, created = Product.objects.get_or_create (sku=sku) # prods are already in the db, silly! prod = Product.objects.get (sku=sku) prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today()) #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')]) # content.html() prod.description = content.html() # save image(s): # prod.image = # prod.images.add (name, title, src, etc) prod.save() print '..saved.'
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element, or an object with a ``getroot()`` method, in
    which case the root element is cleaned.  What gets stripped is decided
    by the instance options read below (``scripts``, ``javascript``,
    ``style``, ``comments``, ...).  Nothing is returned.

    Raises ValueError if both ``allow_tags`` and ``remove_unknown_tags``
    are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: dropped together with their content (drop_tree);
    # remove_tags: only the tag itself is dropped (drop_tag).
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # snapshot the names: entries are deleted inside the loop
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # chain on ``new`` -- substituting on ``old`` again would
                # throw away the javascript substitution made just above
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, mutate afterwards, so the tree is not modified while
    # doc.iter() is walking it.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    # ``allow_tags`` still holds the set built from self.allow_tags above;
    # the second, redundant rebinding to self.allow_tags has been dropped.
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # the root cannot be dropped; turn it into a bare <div>
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                # Extend an existing rel value instead of overwriting it,
                # so other link relations on the element are kept.
                rel = el.get('rel')
                if rel:
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
## globals url = 'http://eracks.com/customers' teeth = 0 # whether to write scraped images ## main f = urlopen(url) doc = html5lib.parse( f, treebuilder='lxml' ) # this didn't work, but above three lines did: encoding='utf-8', print html.xhtml_to_html(doc) jQuery = PyQuery([doc]) def getimage(src): f = urlopen(src) info = f.info() fname = src.split('/')[-1] + '.' + info.getsubtype() path = '/home/joe/Projects/django_eracks/static/images/customers/' + fname if teeth: open(path, 'wb').write(f.read()) return '/images/customers/' + fname # url for retrieval href = src = caption = loc = title = '' sortorder = 100