def run_parse():
    page = urllib.request.urlopen(base_url)
    doc = html.document_fromstring(page.read())
    doc.make_links_absolute(base_url=base_url)
    for link in html.iterlinks(doc):
        if ("forumdisplay.php" in link[2]) and ("f=43" in link[2]):
            v_chapter_name = link[0].text_content()
            v_path = folder_prefix + v_chapter_name
            v_link = link[2]
            # Create the folder
            if not os.path.exists(v_path):
                os.makedirs(v_path)
            page = urllib.request.urlopen(v_link)
            doc = html.document_fromstring(page.read())
            doc.make_links_absolute(base_url=base_url)
            for link_topics in html.iterlinks(doc):
                parsed_url = urllib.parse.urlparse(link_topics[2])
                parsed_q = urllib.parse.parse_qs(parsed_url.query)
                # Find the link to the first page of the topic
                if (parsed_url.path == "/showthread.php") and ("t" in parsed_q) \
                        and not ("page" in parsed_q) \
                        and (link_topics[0].text_content() != "1") \
                        and (parsed_q["t"][0] == "1537"):
                    parse_topic(link_topics, v_path)

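# For reference across all of these examples: iterlinks() yields
# (element, attribute, link, pos) 4-tuples, which is why the snippets index
# link[0] for the element and link[2] for the URL. A minimal sketch; the
# markup below is invented for the demo:
import lxml.html

doc = lxml.html.fromstring(
    '<a href="/showthread.php?t=1537">1537</a> <img src="pic.png">')
for element, attribute, link, pos in doc.iterlinks():
    print(element.tag, attribute, link, pos)
# -> a href /showthread.php?t=1537 0
# -> img src pic.png 0
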
def fix_inner_link(self, page_html):
    """
    Changes inner book links to point to the saved HTML file rather than
    the URL on CNX
    :param page_html: HTML that could contain page links
    :return: HTML with corrected links
    """
    html = lxml.html.document_fromstring(page_html)
    for element, attribute, link, pos in html.iterlinks():
        if '/contents/' in link and element.text is not None:
            print "fix_inner_link: " + link
            content_index = link.find('/contents/')
            pound_index = link.find('#')
            if pound_index == -1:
                id = link[content_index + 10:]
            else:
                id = link[content_index + 10:pound_index]
            anchor = ''
            if pound_index > -1:
                anchor = link[pound_index:]
            if self.remove_version(id) in self.get_page_ids():
                print "fix_inner_link: " + id
                page = self.get_page_json(id)
                title = page['id'] + ".html"
                link_text = title + anchor
                element.attrib['href'] = link_text
    return lxml.html.tostring(html)

def scrape_ulsterbus():
    """
    For Ulsterbus there is a single index page containing the links to the
    individual service timetables.

    This index page is paginated - 5 pages with <= 100 items on each page.
    The paginator employs 'JS to form POST' buttons, so we need to mimic
    the individual POSTs here.
    """
    fmt = ('ctl00$MainRegion$MainContentRegion$MainBodyRegion'
           '$ctl00$rptPageList$ctl00$ctl03$ctl01$ctl%02d')
    for x in [5, 7, 9, 11, 13]:
        done_routes = set()
        payload = {
            '__EVENTTARGET': fmt % x,
            '__EVENTARGUMENT': '',
        }
        response = requests.post(TRANSLINK_ULSTERBUS_INDEX, data=payload)
        html = parse(response.content)
        for link in html.iterlinks():
            url = link[2]
            if valid_ulsterbus_route_url(url) and url not in done_routes:
                href = TRANSLINK_URL + url
                for row in iter_scrape_route(ULSTERBUS_ID, BUS_ROUTE_TYPE, href):
                    yield row
                done_routes.add(url)

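# What the %02d substitution above produces: the ASP.NET __EVENTTARGET id of
# one paginator button. A quick check (pure string formatting, nothing is
# fetched):
fmt = ('ctl00$MainRegion$MainContentRegion$MainBodyRegion'
       '$ctl00$rptPageList$ctl00$ctl03$ctl01$ctl%02d')
print(fmt % 5)
# ctl00$MainRegion$MainContentRegion$MainBodyRegion$ctl00$rptPageList$ctl00$ctl03$ctl01$ctl05
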
def get_for_flickr_photo(flickr_user, flickr_id):
    """ Return license instance """
    try:
        response = urllib2.urlopen(
            'http://www.flickr.com/photos/%s/%s/' % (flickr_user, flickr_id)
        ).read()
    except urllib2.HTTPError as e:
        # only HTTPError carries a status code; a bare URLError does not
        if e.code == 404:
            return License.objects.get_or_create(
                name='All Rights Reserved')[0]
        else:
            return None

    html = lxml.html.fromstring(response)
    for element, attribute, link, pos in html.iterlinks():
        if (attribute and link and attribute.lower() == 'href' and
                link.startswith('http://creativecommons.org/licenses/') and
                link.endswith('/')):
            return License.objects.get_or_create(
                creative_commons=True, url=link)[0]

    return License.objects.get_or_create(
        name='All Rights Reserved')[0]

def get_for_blendswap_scene(blendswap_id):
    """ Return (license, blendswap url, blendswap username) """
    blendswap_url = 'http://www.blendswap.com/blends/view/%s' % blendswap_id
    print 'Visiting %s...' % blendswap_url
    try:
        response = urllib2.urlopen(blendswap_url).read()
    except urllib2.HTTPError as e:
        # only HTTPError carries a status code; a bare URLError does not
        if e.code == 404:
            license, _ = License.objects.get_or_create(
                name='All Rights Reserved')
            return license, None, None
        else:
            return None, None, None

    html = lxml.html.fromstring(response)
    blendswap_username = html.find_class("user_link")[0].text_content().strip()
    for element, attribute, link, pos in html.iterlinks():
        if (attribute and link and attribute.lower() == 'href' and
                (link.startswith('http://creativecommons.org/licenses/') or
                 link.startswith('http://creativecommons.org/publicdomain/')) and
                link.endswith('/')):
            license, _ = License.objects.get_or_create(
                creative_commons=True, url=link)
            return license, blendswap_url, blendswap_username

    raise ValueError("Could not find license")

def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1 and compares how often `topic` occurs on the
    main page versus on the pages it links to; returns True when the page
    looks suspicious.
    """
    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(response.content, base_url=response.url)
    html.make_links_absolute(resolve_base_href=True)

    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)
    for url in Bar().iter(
            {link for element, attribute, link, pos in html.iterlinks()}):
        childs_topic_cnt += check_content(url, topic)

    ratio = float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0)
    return ratio >= 1.0 or ratio == 0

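# The heuristic flags a page when the topic is at least as frequent on the
# landing page as across its children (or absent everywhere). A worked check
# of the arithmetic, with invented counts:
main_page_topic_cnt = 12   # hits on the landing page
childs_topic_cnt = 5       # hits summed over all linked pages
ratio = float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0)
print(ratio)                        # 2.0
print(ratio >= 1.0 or ratio == 0)   # True -> the page is flagged
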
def image_fixups(content, msgid, archive, richformat, allowimgs):
    "Replace the CID links in stored messages"
    html = local_fromstring(content)
    for element, attribute, link, _ in iterlinks(html):
        if not link.startswith('cid:'):
            if not allowimgs and attribute == 'src':
                element.attrib['src'] = '%simgs/blocked.gif' % media_url()
                element.attrib['title'] = link
                if richformat:
                    if archive:
                        displayurl = url('message-preview-archived-with-imgs',
                                         msgid=msgid)
                    else:
                        displayurl = url('message-preview-with-imgs',
                                         msgid=msgid)
                    flash(ugettext('This message contains external'
                                   ' images, which have been blocked. ') +
                          literal(link_to(ugettext('Display images'),
                                          displayurl)))
        else:
            imgname = link.replace('cid:', '')
            if archive:
                imgurl = url('messages-preview-archived-img',
                             img=imgname.replace('/', '__xoxo__'),
                             msgid=msgid)
            else:
                imgurl = url('messages-preview-img',
                             img=imgname.replace('/', '__xoxo__'),
                             msgid=msgid)
            element.attrib['src'] = imgurl
    return tostring(html)

def parse_links_xpath(filename):
    """question 2b

    Do the same using xpath and the lxml library from http://lxml.de
    rather than regular expressions. Which approach is better?
    (Hint: http://goo.gl/mzl9t)
    """
    from lxml.html import iterlinks

    f = open(filename)
    site = f.read()
    html = iterlinks(site)
    values = []
    keys = []
    for a in html:
        keys.append(a[0].text)
        values.append(a[2])
    dct = dict(zip(keys, values))
    f.close()
    return dct

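# The docstring asks for an xpath solution; a minimal sketch of the same
# anchor-text -> href mapping done with an explicit xpath query (assumes,
# like the function above, that the links of interest are <a> tags; this is
# not the exercise's official answer):
import lxml.html

def parse_links_xpath_only(filename):
    with open(filename) as f:
        tree = lxml.html.fromstring(f.read())
    return {a.text: a.get('href') for a in tree.xpath('//a[@href]')}
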
def grab_urls(content, url):
    urls = {}
    domain = urlparse(url).netloc
    html = document_fromstring(content)
    html.make_links_absolute(url, resolve_base_href=True)
    for element, attribute, link, pos in html.iterlinks():
        if attribute != "href":
            continue
        # skip if not on our domain
        if urlparse(link).netloc != domain and \
                urlparse(link).netloc != "www." + domain:
            continue
        # skip if self referential
        if (url.split("//")[1] + "#") in link:
            continue
        text = element.text_content() if len(element) == 0 \
            else element[0].text_content()
        text = text.lstrip() if text is not None else ""
        # compute relevancy here
        relevance[link] = relevancy(link, text, url)
        urls[link] = 1
        if text != "":
            print text
            print link
            print
    return urls.keys()

def _post_process_html(self, content):
    html = lxml.html.fromstring(content)
    if self.links:
        html.rewrite_links(self._map_cid)
        for link in html.iterlinks():
            link[0].set("target", "_blank")
    else:
        html.rewrite_links(lambda x: None)
    safe_attrs = list(defs.safe_attrs) + ["class", "style"]
    cleaner = Cleaner(
        scripts=True, javascript=True, links=True, page_structure=True,
        embedded=True, frames=True, add_nofollow=True,
        safe_attrs=safe_attrs
    )
    mail_text = lxml.html.tostring(
        cleaner.clean_html(html), encoding="unicode")
    with open("/tmp/output.txt", "w") as fp:
        fp.write(mail_text)
    return smart_text(mail_text)

def _get_desc(self, cr, uid, ids, field_name=None, arg=None, context=None):
    res = dict.fromkeys(ids, '')
    for module in self.browse(cr, uid, ids, context=context):
        path = get_module_resource(module.name,
                                   'static/description/index.html')
        if path:
            with tools.file_open(path, 'rb') as desc_file:
                doc = desc_file.read()
                html = lxml.html.document_fromstring(doc)
                for element, attribute, link, pos in html.iterlinks():
                    if element.get('src') and '//' not in element.get('src') \
                            and 'static/' not in element.get('src'):
                        element.set('src', "/%s/static/description/%s" %
                                    (module.name, element.get('src')))
                res[module.id] = html_sanitize(lxml.html.tostring(html))
        else:
            overrides = {
                'embed_stylesheet': False,
                'doctitle_xform': False,
                'output_encoding': 'unicode',
                'xml_declaration': False,
            }
            output = publish_string(source=module.description or '',
                                    settings_overrides=overrides,
                                    writer=MyWriter())
            res[module.id] = html_sanitize(output)
    return res

def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    "write html sig"
    cleaner = SignatureCleaner(style=True,
                               remove_tags=UNCLEANTAGS,
                               safe_attrs_only=False)
    html = cleaner.clean_html(sig.signature_content)
    html = fragments_fromstring(html)[0]
    for element, attribute, link, pos in iterlinks(html):
        if link.startswith('/settings/imgs/'):
            view, args, kwargs = resolve(link)
            view = None
            args = None
            img = SignatureImg.objects.get(pk=kwargs['img_id'])
            if is_domain:
                imgfile = '%s/domains/imgs/%s' % (basedir, img.name)
            else:
                imgfile = '%s/users/imgs/%s' % (basedir, img.name)
            element.attrib['src'] = 'cid:%s' % img.name
            imghandle = open(imgfile, 'wb')
            imghandle.write(base64.decodestring(img.image))
            imghandle.close()
            logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
            # update the sig with image obtained
            sig.image = img
    if 'link' in locals():
        sig.save()
    sighandle = open(sigfile, 'w')
    if not sig.signature_content.startswith('--'):
        sighandle.write('<br/>--<br/>')
    sighandle.write(tostring(html))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))

def get_links(self, url, starts_with=None, ends_with=None):
    results = []
    content = self.download(url)
    if not content:
        return []
    if not (starts_with or ends_with):
        raise NotImplementedError(
            'get_links requires either `starts_with` or `ends_with`')
    html = lxml.html.document_fromstring(content)
    path = urlparse.urlparse(url).path

    def url_match(link):
        # The link might be something like "/pub/mobile/nightly/"
        # but we're looking for a path that starts with "nightly".
        # So first we need to remove what's part of the base URL
        # to make a fair comparison.
        if starts_with is not None:
            # If the current URL is http://example.com/some/dir/
            # and the link is /some/dir/mypage/ and the thing
            # we're looking for is "myp" then this should be true
            if link.startswith(path):
                link = link.replace(path, '')
            return link.startswith(starts_with)
        elif ends_with:
            return link.endswith(ends_with)
        return False

    for _, _, link, _ in html.iterlinks():
        if url_match(link):
            results.append(urlparse.urljoin(url, link))
    return results

def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the dom into python objects
    html = lxml.html.document_fromstring(r.content)
    # parse the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring(
        "<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to have absolute url
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # a little regex to remove any script tags
    return re.subn(r'<(script).*?</\1>(?s)', '', lxml.html.tostring(html))[0]

def xlinks(page):
    links = {}
    html = lxml.html.document_fromstring(page)
    for (f_name_element, attr, f_link, pos) in html.iterlinks():
        links[f_link] = 1
        if f_link.find("washingtonpost") > 0:
            continue
        if f_link.find("washpost") > 0:
            continue
        if f_link.find("mailto") >= 0:
            continue
        if f_link.find(".gov") >= 0:
            f_link = f_link.rstrip("/")
            continue
        if f_link.find("wiki") >= 0:
            print "external wiki", f_link, pos, attr
    return links

def parse(tid: str, host: str) -> None:
    job = get_current_job()
    print(f"Starting task for {host}")
    job.meta["status"] = "In progress"
    job.save_meta()
    links = []
    r = requests.get(f"https://{host}")
    if r.status_code == 200:
        # iterlinks() yields (element, attribute, link, pos) tuples
        for element, attribute, link, pos in iterlinks(r.text):
            if link.startswith("/"):
                print(link)
                links.append(link)
        json_str = json.dumps(links, ensure_ascii=False, indent=4) + "\n"
        json_bytes = json_str.encode('utf-8')
        with tarfile.open(f"static/{tid}.tar.xz", "w:xz") as xz:
            buf = io.BytesIO(json_bytes)
            info = tarfile.TarInfo(f"{tid}.json")
            info.size = buf.seek(0, io.SEEK_END)
            xz.addfile(info, fileobj=io.BytesIO(buf.getvalue()))
        job.meta["status"] = "Completed"
        job.meta["url"] = f"https://timeweb.com/ru/task/{tid}.tar.xz"
        job.save_meta()
    else:
        job.meta["status"] = f"Error: {r.status_code}"
        job.save_meta()
    print('Task completed')

def get_links(self, noReturn=False):
    if self.links == []:
        so = self.get_source()
        self.links = list(iterlinks(so))
    if not noReturn:
        return self.links

def _get_desc(self):
    for module in self:
        if not module.name:
            module.description_html = False
            continue
        # avoid logging a warning for fake community modules
        module_path = modules.get_module_path(module.name,
                                              display_warning=False)
        if module_path:
            path = modules.check_resource_path(
                module_path, 'static/description/index.html')
        if module_path and path:
            with tools.file_open(path, 'rb') as desc_file:
                doc = desc_file.read()
                html = lxml.html.document_fromstring(doc)
                for element, attribute, link, pos in html.iterlinks():
                    if element.get('src') and '//' not in element.get('src') \
                            and 'static/' not in element.get('src'):
                        element.set('src', "/%s/static/description/%s" %
                                    (module.name, element.get('src')))
                module.description_html = tools.html_sanitize(
                    lxml.html.tostring(html))
        else:
            overrides = {
                'embed_stylesheet': False,
                'doctitle_xform': False,
                'output_encoding': 'unicode',
                'xml_declaration': False,
                'file_insertion_enabled': False,
            }
            output = publish_string(
                source=module.description
                if not module.application and module.description else '',
                settings_overrides=overrides, writer=MyWriter())
            module.description_html = tools.html_sanitize(output)

def fix_inner_link(self, page_html):
    """
    Changes inner book links to point to the saved HTML file rather than
    the URL on CNX
    :param page_html: HTML that could contain page links
    :return: HTML with corrected links
    """
    html = lxml.html.document_fromstring(page_html)
    for element, attribute, link, pos in html.iterlinks():
        if '/contents/' in link and element.text is not None:
            print "fix_inner_link: " + link
            content_index = link.find('/contents/')
            at_index = link.find('@')
            pound_index = link.find('#')
            if at_index == -1:
                id = link[content_index + 10:]
            else:
                id = link[content_index + 10:at_index]
            anchor = ''
            if pound_index > -1:
                anchor = link[pound_index:]
            print "fix_inner_link: " + id
            page = self.get_page_json(id)
            title = page['id'] + ".html"
            link_text = title + anchor
            element.attrib['href'] = link_text
    return lxml.html.tostring(html)

def _get_desc(self):
    for module in self:
        path = modules.get_module_resource(
            module.name, 'static/description/index.html')
        if path:
            with tools.file_open(path, 'rb') as desc_file:
                doc = desc_file.read()
                html = lxml.html.document_fromstring(doc)
                for element, attribute, link, pos in html.iterlinks():
                    if element.get('src') and '//' not in element.get('src') \
                            and 'static/' not in element.get('src'):
                        element.set('src', "/%s/static/description/%s" %
                                    (module.name, element.get('src')))
                module.description_html = tools.html_sanitize(
                    lxml.html.tostring(html))
        else:
            overrides = {
                'embed_stylesheet': False,
                'doctitle_xform': False,
                'output_encoding': 'unicode',
                'xml_declaration': False,
            }
            output = publish_string(source=module.description or '',
                                    settings_overrides=overrides,
                                    writer=MyWriter())
            module.description_html = tools.html_sanitize(output)

def get_page(self, item):
    links = html.iterlinks(item)
    l = []
    for i in links:
        l.append(i)
    # fetch the first link found; l[0][2] is the URL in the
    # (element, attribute, link, pos) tuple
    page = r.get(l[0][2])
    return page

def projects_from_url(url):
    """returns list of projects from the index url"""
    projects = []  # XXX should be a set?
    html = urllib2.urlopen(url).read()
    html = lxml.html.fromstring(html)
    for link in html.iterlinks():
        projects.append(link[2].strip('/'))
    return projects

def mask_links(html_text, site_url):
    document = html.fromstring(html_text)
    for el, attr, val, pos in html.iterlinks(document):
        if el.tag.lower() == "a":
            if attr == 'href' and not is_internal_link(val, site_url):
                el.attrib['target'] = '_blank'
                el.attrib['rel'] = 'nofollow'
    return html.tostring(document)

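# A quick check of the behaviour above. The is_internal_link helper below is
# an invented stand-in for the real one (a link is internal if it is relative
# or shares the site's netloc):
from urllib.parse import urlparse
from lxml import html

def is_internal_link(val, site_url):
    netloc = urlparse(val).netloc
    return netloc == "" or netloc == urlparse(site_url).netloc

print(mask_links('<a href="http://other.example/x">x</a>',
                 'http://mysite.example'))
# b'<a href="http://other.example/x" target="_blank" rel="nofollow">x</a>'
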
def get_links(html, base_url, tags=[]):
    html = lxml.html.document_fromstring(html)
    html.make_links_absolute(base_url)
    links = [x[2] for x in html.iterlinks() if x[0].tag in tags]
    return links

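# Demo call with invented markup: keep only the links carried by <a> and
# <img> tags, made absolute against the base URL.
page = '<a href="/a">a</a> <img src="/i.png"> <link href="/s.css">'
print(get_links(page, "http://example.com", tags=["a", "img"]))
# ['http://example.com/a', 'http://example.com/i.png']
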
def sanitize_html(self, msg):
    "Clean up html"
    cleaner = Cleaner(style=True, remove_tags=UNCLEANTAGS)
    msg = HTMLTITLE_RE.sub('', msg)
    html = cleaner.clean_html(msg)
    html = fromstring(html)
    for element, attribute, link, pos in iterlinks(html):
        element.attrib['src'] = settings.MEDIA_URL + '/imgs/blocked.gif'
    return tostring(html)

def getTodos(projects, objects):
    """ Get todos for each project """
    tags_dict = getTags(objects)
    for project in projects:
        for ref_id in project['ref_ids'].split():
            for object in objects:
                if object.attributes['id'].value == ref_id:
                    attribute_nodes = object.getElementsByTagName("attribute")
                    title = ""
                    content = ""
                    datemodified = ""
                    datecreated = ""
                    datecompleted = ""
                    tags = ""
                    for attribute_node in attribute_nodes:
                        if attribute_node.attributes['name'].value == 'title':
                            if attribute_node.childNodes:
                                title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                            break
                    # Check if todo has a note attached
                    if title:
                        for attribute_node in attribute_nodes:
                            # <attribute name="datemodified" >309306984.40529602766036987305
                            if attribute_node.attributes['name'].value == 'datemodified':
                                datemodified = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            # <attribute name="datecreated" >306520491.00000000000000000000
                            if attribute_node.attributes['name'].value == 'datecreated':
                                datecreated = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            # <attribute name="datecompleted" type="date">292880221.18648099899291992188
                            if attribute_node.attributes['name'].value == 'datecompleted':
                                datecompleted = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            if attribute_node.attributes['name'].value == 'content':
                                # encode later, in writeOutline; first translate
                                # the escaped placeholders back into markup
                                content = attribute_node.childNodes[0].nodeValue
                                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>')
                                html = html.replace('\u2600', '&')
                                html = lxml.html.fromstring(html)
                                content = html.text_content().split('\n')
                                for l in html.iterlinks():
                                    content += [l[2]]
                        relationship_nodes = object.getElementsByTagName("relationship")
                        for relationship_node in relationship_nodes:
                            if relationship_node.attributes['name'].value == 'tags':
                                try:
                                    tags_id = relationship_node.attributes['idrefs'].value
                                    tags = [tags_dict[t_id] for t_id in tags_id.split()]
                                except KeyError:
                                    tags = ""
                        project['todos'].append([title, content, datecreated,
                                                 datemodified, datecompleted,
                                                 tags])
    return projects

def parse_topic_page(doc, path):
    posts = doc.cssselect('table[id^="post"]')
    for post in posts:
        post_content = post.cssselect('div[id^="post_message"]')
        if len(post_content) > 0:
            for link in html.iterlinks(post_content[0]):
                if ("http" in link[2]) and (link[1] == "href"):
                    parse_external_image(link, path)

def scrape_goldline():
    done_routes = set()
    html = parse(scrape(TRANSLINK_GOLDLINE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_goldline_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(GOLDLINE_ID, BUS_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)

def scrape_enterprise():
    done_routes = set()
    html = parse(scrape(TRANSLINK_ENTERPRISE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_enterprise_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(ENTERPRISE_ID, RAIL_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)

def get_google_links(query):
    links = []
    url = GOOGLE_SEARCH_URL.format(urllib.parse.quote(query))
    result = get_result(url)
    html = lxml.html.document_fromstring(result)
    for l in html.iterlinks():
        if is_question(l[2]):
            links.append(l[2])
    return links

def get_pages():
    response_text = requests.get(BASE_URL).content
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(BASE_URL)
    links = []
    for link in html.iterlinks():
        links.append(link[2])
    return links

def fetch_data(url: str) -> None:
    r = requests.get(url)
    html = lxml.html.fromstring(r.content)
    tasks = []
    loop = asyncio.get_event_loop()
    for *_, url, _ in html.iterlinks():
        if url.startswith("fileadmin"):
            tasks.append(
                loop.create_task(fetch_zip(os.path.join(BASE_URL, url))))
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.scan_tag(e.tag):
            if self.scan_attr(a):
                link = Link(self.process_attr(l), text=e.text)
                self.links.append(link)
    links = unique_list(self.links, key=lambda link: link.url) \
        if self.unique else self.links
    return links

def extract_links_from_html(base, body):
    try:
        html = lxml.html.fromstring(body)
        html.make_links_absolute(base)
        for element, attribute, link, pos in html.iterlinks():
            if isinstance(link, str):
                link = link.encode('utf-8', 'ignore')
            yield link
    except Exception:
        logging.warning("(lxml) html parse error")
        import traceback
        traceback.print_exc()

def extract_links_from_html(base, body):
    # Python 2 variant of the function above
    try:
        html = lxml.html.fromstring(body)
        html.make_links_absolute(base)
        for element, attribute, link, pos in html.iterlinks():
            if isinstance(link, unicode):
                link = link.encode('utf-8', 'ignore')
            yield link
    except StandardError:
        logging.warning("(lxml) html parse error")
        import traceback
        traceback.print_exc()

def fetch_links_from_web_page(self, page):
    log.debug('')
    try:
        # [ NOTE ]: Pull out all links after resolving them using any
        # <base> tags found in the document.
        links = [
            link for element, attribute, link, pos in iterlinks(
                resolve_base_href(page.content))
        ]
    except etree.ParseError:
        # [ NOTE ]: If the document is not HTML content this will return
        # an empty list.
        links = []
    return list(set(links))

def getLinks(url, startswith=None, endswith=None):
    page = urllib2.urlopen(url)
    html = lxml.html.document_fromstring(page.read())
    page.close()
    results = []
    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results

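# Hypothetical call (URL invented for the demo): collect every link on an
# index page that ends in .tar.gz.
tarballs = getLinks("http://example.com/pub/", endswith=".tar.gz")
for t in tarballs:
    print t
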
def get_uri(self, url, html):
    if url is not None and html is not None:
        print(url)
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        html.make_links_absolute(url)
        for l in html.iterlinks():
            parsed_uri = urlparse(l[2])
            curr_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            if curr_domain == domain:
                if l[2] not in self.urls:
                    self.pool.put(l[2])
                    self.urls.add(l[2])

def getLinks(url, startswith=None, endswith=None, urllib=urllib2):
    page = urllib.urlopen(url)
    html = lxml.html.document_fromstring(page.read())
    page.close()
    results = []
    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results

def getLinks(url, startswith=None, endswith=None):
    results = []
    content = patient_urlopen(url, sleep_time=30)
    html = lxml.html.document_fromstring(content)
    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results

def main():
    url = sys.argv[1]
    html = parse(url).getroot()
    # make the links absolute for those that are local
    html.make_links_absolute(html.base_url, True)
    # remember the domain
    domain = urlparse(html.base_url).netloc
    # print all links on the page
    for element, attribute, link, pos in html.iterlinks():
        # only print if it's a local link and not self-referential
        link_domain = urlparse(link).netloc
        selfReference = (url.split("//")[1] + "#") in link
        if (link_domain == domain or
                link_domain == "www." + domain) and not selfReference:
            print link

def create_plps(self, category_limit, min_wait_time):
    plps = set([])
    assert self.description == 'homepage', \
        "Must create plp's beginning with the homepage."
    nav_categories = self.tree.xpath('//nav')
    count = 0
    self.last_scrape = time.time()
    for category in nav_categories[2:]:
        for a, b, link, d in html.iterlinks(category):
            self.throttle(min_wait_time)
            if link[0:4] != 'http' and link not in (
                    '/Store/catalog/shopAllBrands.jsp', '#',
                    '/Store/cart/cart.jsp') and count < category_limit:
                url = self.url + link
                print count, 'PLPs loaded so far. Loading PLP from ', url
                plps.add(Page(url, 'plp', self.user_agent, baseurl))
                self.last_scrape = time.time()
                count += 1
    return plps

def walk(self, link):
    print("DEBUG: walk getting called with url " + link)
    try:
        self.base_url
    except AttributeError:
        (scheme, netloc, path, params, query,
         fragment) = urlparse.urlparse(link)
        self.base_url = scheme + "://" + netloc
        print("DEBUG: self.baseurl: " + self.base_url)
    # handle relative urls
    try:
        if link.startswith("/") or (not link.startswith("http://") and
                                    not link.startswith("https://")):
            link = self.base_url + link
    except AttributeError as e:
        print(e)
    if link in self.visited_urls:
        try:
            self.skip_count += 1
        except AttributeError:
            self.skip_count = 1
        print("DEBUG: self.skip_count: " + str(self.skip_count))
        return
    print("DEBUG: walk opening url " + link)
    try:
        resp, content = self.httplib2.request(link, "GET")
        del resp
    except socket.error as error:
        print(error)
        return
    self.visited_urls.append(link)
    print("DEBUG: self.visited_urls:", len(self.visited_urls))
    try:
        html_dom = html.document_fromstring(content)
    except etree.ParserError as e:
        print(e)
        return
    for (element, attribute, link, pos) in html.iterlinks(html_dom):
        for url in self.walk(link):
            yield url
        yield link

def _get_desc(self, cr, uid, ids, field_name=None, arg=None, context=None):
    res = dict.fromkeys(ids, '')
    for module in self.browse(cr, uid, ids, context=context):
        path = get_module_resource(module.name,
                                   'static/description/index.html')
        if path:
            with tools.file_open(path, 'rb') as desc_file:
                doc = desc_file.read()
                html = lxml.html.document_fromstring(doc)
                for element, attribute, link, pos in html.iterlinks():
                    if element.get('src') and '//' not in element.get('src') \
                            and 'static/' not in element.get('src'):
                        element.set('src', "/%s/static/description/%s" %
                                    (module.name, element.get('src')))
                res[module.id] = lxml.html.tostring(html)
        else:
            overrides = dict(embed_stylesheet=False,
                             doctitle_xform=False,
                             output_encoding='unicode')
            output = publish_string(source=module.description,
                                    settings_overrides=overrides,
                                    writer=MyWriter())
            res[module.id] = output
    return res

def _extract_links(self, response_text, response_url, response_encoding):
    links = []
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.tag_func(e.tag):
            if self.attr_func(a):
                l = safe_url_string(l, response_encoding)
                text = u''
                if e.text:
                    text = str_to_unicode(e.text, response_encoding,
                                          errors='replace').strip()
                link = Link(self.process_func(l), text=text)
                links.append(link)
    links = unique_list(links, key=lambda link: link.url) \
        if self.unique else links
    return links

def img_fixups(content, queueid, allowimgs, richformat):
    "Replace the CID links in queued messages"
    html = local_fromstring(content)
    for element, attribute, link, _ in iterlinks(html):
        if not link.startswith('cid:'):
            if not allowimgs and attribute == 'src':
                element.attrib['src'] = '%simgs/blocked.gif' % media_url()
                element.attrib['title'] = link
                if richformat:
                    flash(ugettext('This message contains external '
                                   'images, which have been blocked. ') +
                          literal(link_to(ugettext('Display images'),
                                          url('queue-preview-with-imgs',
                                              queueid=queueid))))
        else:
            imgname = link.replace('cid:', '')
            element.attrib['src'] = url('queue-preview-img',
                                        imgid=imgname.replace('/', '__xoxo__'),
                                        queueid=queueid)
    return tostring(html)

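# When the goal is purely to rewrite every link rather than to inspect the
# (element, attribute, link, pos) tuples by hand, lxml's complementary
# rewrite_links() API (also used in _post_process_html above) is often
# shorter. A minimal sketch with invented markup:
import lxml.html

doc = lxml.html.fromstring('<a href="cid:logo.png">logo</a>')
doc.rewrite_links(lambda link: link.replace('cid:', '/imgs/'))
print(lxml.html.tostring(doc))
# b'<a href="/imgs/logo.png">logo</a>'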