def _toc_from_html(self, opf):
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    html = item.data
    if frag:
        elems = xpath(html, './/*[@id="%s"]' % frag)
        if not elems:
            elems = xpath(html, './/*[@name="%s"]' % frag)
        elem = elems[0] if elems else html
        while elem != html and not xpath(elem, './/h:a[@href]'):
            elem = elem.getparent()
        html = elem
    titles = defaultdict(list)
    order = []
    for anchor in xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        href = item.abshref(urlnormalize(href))
        path, frag = urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        title = xml2text(anchor)
        title = COLLAPSE_RE.sub(' ', title.strip())
        if href not in titles:
            order.append(href)
        titles[href].append(title)
    toc = self.oeb.toc
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True
def get_links(url, depth, atmost_count):
    urldfg = urlparse.urldefrag(url)
    url = urldfg[0]
    urls_list = []
    myopener = MyOpener()
    try:
        page = myopener.open(url)
    except:
        return []
    text = page.read()
    page.close()
    url_parsed = urlparse.urlparse(url)
    domain_name_url_arr = url_parsed.netloc.split(".")
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll('a', href=True):
        if atmost_count == 0:
            break
        tag['href'] = urlparse.urljoin(url, tag['href'])
        new_url = urlparse.urldefrag(tag['href'])[0]
        new_url_parsed = urlparse.urlparse(new_url)
        domain_name_new_url_arr = new_url_parsed.netloc.split('.')
        # Only follow links that stay on the same second-level domain.
        if len(domain_name_url_arr) >= 2 and len(domain_name_new_url_arr) >= 2:
            if (domain_name_url_arr[-1] != domain_name_new_url_arr[-1] or
                    domain_name_url_arr[-2] != domain_name_new_url_arr[-2]):
                continue
        else:
            continue
        if new_url[-4:] == '.pdf':
            continue
        if new_url not in urls_list:
            urls_list.append([new_url, depth + 1])
            atmost_count -= 1
    return urls_list
def make_correct_link(base, link):
    """
    makes links correct:
        http://...  -- pass
        /...        -- add the site before it
        www. ...    -- add http:// before it
        smth.html   -- add the full path before it

    it's planned to be a wrapper over urlparse's functions,
    done in order to handle all possible cases of url presentation

    handles absolute and relative urls
    cleans up all 'fragments' in the url
    (like http://site.ru/1.html#4 -> http://site.ru/1.html)
    """
    defrag_link, _ = urlparse.urldefrag(link)
    defrag_base, _ = urlparse.urldefrag(base)

    # case 'g.html' on 'http://ya.ru/a' ==> 'http://ya.ru/a/g.html'
    # (add a slash after 'a' if it is an unslashed folder;
    # an unslashed folder has an empty query (unlike 'a.php?set=1'),
    # no dots and no closing slash)
    scheme, netloc, url, params, query, fragment = urlparse.urlparse(defrag_base)
    if url and not query and not re.search("/$", url) and not re.search(r"\.", url):
        url += '/'
        defrag_base = urlparse.urlunparse(
            (scheme, netloc, url, params, query, fragment))

    # just rejoining all parts
    return_link = urlparse.urljoin(defrag_base, defrag_link)
    return return_link
def __init__(self, toc, j, renderlist, redirects):
    self.typedoc = StringIO.StringIO()
    self.toc = toc
    self.subs = {}  # type: Dict
    self.docParent = {}  # type: Dict
    self.docAfter = {}  # type: Dict
    self.rendered = set()  # type: Set
    self.redirects = redirects
    self.title = None  # type: str

    for t in j:
        if "extends" in t:
            for e in aslist(t["extends"]):
                add_dictlist(self.subs, e, t["name"])
                # if "docParent" not in t and "docAfter" not in t:
                #     add_dictlist(self.docParent, e, t["name"])

        if t.get("docParent"):
            add_dictlist(self.docParent, t["docParent"], t["name"])

        if t.get("docChild"):
            for c in aslist(t["docChild"]):
                add_dictlist(self.docParent, t["name"], c)

        if t.get("docAfter"):
            add_dictlist(self.docAfter, t["docAfter"], t["name"])

    _, _, metaschema_loader = schema.get_metaschema()
    alltypes = schema.extend_and_specialize(j, metaschema_loader)

    self.typemap = {}  # type: Dict
    self.uses = {}  # type: Dict
    self.record_refs = {}  # type: Dict
    for t in alltypes:
        self.typemap[t["name"]] = t
        try:
            if t["type"] == "record":
                self.record_refs[t["name"]] = []
                for f in t.get("fields", []):
                    p = has_types(f)
                    for tp in p:
                        if tp not in self.uses:
                            self.uses[tp] = []
                        if (t["name"], f["name"]) not in self.uses[tp]:
                            _, frg1 = urlparse.urldefrag(t["name"])
                            _, frg2 = urlparse.urldefrag(f["name"])
                            self.uses[tp].append((frg1, frg2))
                        if tp not in basicTypes and tp not in self.record_refs[t["name"]]:
                            self.record_refs[t["name"]].append(tp)
        except KeyError as e:
            _logger.error("Did not find 'type' in %s", t)
            raise

    for f in alltypes:
        if (f["name"] in renderlist or
            ((not renderlist) and
             ("extends" not in f) and
             ("docParent" not in f) and
             ("docAfter" not in f))):
            self.render_type(f, 1)
def crawlWeb(UrlafterConnect, keyword):
    if not UrlafterConnect:
        print("Url is empty")
        return list()
    # Get all the links
    soup = BeautifulSoup(UrlafterConnect)
    urllist = []
    # check for the existence of keyword IR and crawl on those urls
    if re.search(keyword, str(soup), re.IGNORECASE) != None:
        for link in soup.find_all('a', href=True):
            crawl = link.get('href')
            crawl_url = crawl.encode('utf-8')
            if not crawl_url:
                continue
            # links present in the same directory of /wiki; if so, convert them to http form
            if crawl_url.startswith('/wiki'):
                if (crawl_url.find(':') == -1) and (crawl_url != "/wiki/Main_Page"):
                    crawl_url = urlparse.urljoin("http://en.wikipedia.org", crawl_url)
                    crawl_url, frag = urlparse.urldefrag(crawl_url)
                    urllist.append(crawl_url)
            else:
                # Get only wiki links without colons in them and not redirecting to the main page
                if crawl_url.startswith('http://en.wikipedia.org'):
                    if crawl_url != "http://en.wikipedia.org/wiki/Main_Page":
                        s = "http://en"
                        crawl = crawl_url.lstrip("http://en")
                        if crawl.find(':') == -1:
                            crawl_url, frag = urlparse.urldefrag(crawl_url)
                            urllist.append(crawl_url)
    # Remove duplicate entries from the list while returning
    return list(set(urllist))
def startElementNS(self, name, qname, attrs):
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def _urljoin(base, url):
    """
    Construct a full ("absolute") URL by combining a "base URL" with another
    URL. Informally, this uses components of the base URL, in particular the
    addressing scheme, the network location and (part of) the path, to provide
    missing components in the relative URL.

    Additionally, the fragment identifier is preserved according to the HTTP
    1.1 bis draft.

    @type base: C{bytes}
    @param base: Base URL.

    @type url: C{bytes}
    @param url: URL to combine with C{base}.

    @return: An absolute URL resulting from the combination of C{base} and
        C{url}.

    @see: L{urlparse.urljoin}
    @see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
    """
    base, baseFrag = urldefrag(base)
    url, urlFrag = urldefrag(urljoin(base, url))
    return urljoin(url, b'#' + (urlFrag or baseFrag))
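A standalone sketch of how that fragment-preservation rule plays out (same urldefrag/urljoin calls, but with ordinary str URLs rather than the bytes the Twisted snippet expects; the example.com URLs are made up):

from urlparse import urldefrag, urljoin  # Python 2; use urllib.parse on Python 3

def _urljoin(base, url):
    base, baseFrag = urldefrag(base)
    url, urlFrag = urldefrag(urljoin(base, url))
    return urljoin(url, '#' + (urlFrag or baseFrag))

# The relative URL has no fragment, so the base's fragment is carried over.
print(_urljoin('http://example.com/docs/index.html#intro', 'usage.html'))
# -> http://example.com/docs/usage.html#intro

# A fragment on the relative URL takes precedence over the base's fragment.
print(_urljoin('http://example.com/docs/index.html#intro', 'usage.html#api'))
# -> http://example.com/docs/usage.html#api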
def parse_showings_table(self, response):
    movie_title = response.meta['movieTitle']
    movie_url = response.meta['movieUrl']
    showings_table_value = response.meta['showingsTableValue']
    theater_url = response.meta['theaterUrl']
    version = response.meta['version']
    showings_table = response.xpath(
        '//div[@class="cinema-movie clearfix"]/div[@value="' + showings_table_value + '"]')
    at_least_one_showing_found = False
    jump_links = showings_table.css('.jump-to-show').xpath('a')
    if len(jump_links) >= 1:
        jump_link = jump_links[-1]
        if jump_link.xpath('text()').extract_first().endswith(u'>'):
            jump_url = urldefrag(response.urljoin(jump_link.xpath('@href').extract_first()))[0]
            request = scrapy.Request(jump_url, callback=self.parse_showings_table)
            request.meta['movieTitle'] = movie_title
            request.meta['movieUrl'] = movie_url
            request.meta['showingsTableValue'] = showings_table_value
            request.meta['theaterUrl'] = theater_url
            request.meta['version'] = version
            yield request
    else:
        for showings_column in showings_table.css('.cinema-movie-dates').xpath('li'):
            for showing_cell in showings_column.xpath('ul/li/a'):
                at_least_one_showing_found = True
                dayAndMonth = showings_column.xpath('div[2]/text()').extract_first().split('/')
                day = int(dayAndMonth[0])
                month = int(dayAndMonth[1])
                hourAndMinute = showing_cell.xpath('text()').extract_first().split(':')
                hour = int(hourAndMinute[0])
                minute = int(hourAndMinute[1])
                #seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):len('</div>')]
                seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):-len('</div>')].split('</div><div>')
                date_obj = datetime(datetime.now().year, month, day, hour, minute)
                if date_obj < datetime.now():
                    date_obj = datetime(datetime.now().year + 1, month, day, hour, minute)
                showing = ShowingItem()
                showing['movieTitle'] = movie_title
                showing['movieUrl'] = movie_url
                showing['theaterUrl'] = theater_url
                showing['seatingInfo'] = seating_info
                showing['showingUrl'] = response.urljoin(showing_cell.xpath('@href').extract_first())
                showing['start'] = date_obj.strftime('%Y-%m-%dT%H:%M:00')
                showing['version'] = version
                yield showing
    if at_least_one_showing_found:
        next_page = showings_table.css('.showtimes-extra').xpath('a[last()]')
        if next_page:
            next_page_url = urldefrag(response.urljoin(next_page.xpath('@href')[0].extract()))[0]
            request = scrapy.Request(next_page_url, callback=self.parse_showings_table)
            request.meta['movieTitle'] = movie_title
            request.meta['movieUrl'] = movie_url
            request.meta['showingsTableValue'] = showings_table_value
            request.meta['theaterUrl'] = theater_url
            request.meta['version'] = version
            yield request
def get_links(response):
    if 300 <= response.status_code < 400 and response.headers['location']:
        # redirect
        yield urlparse.urldefrag(
            urlparse.urljoin(response.url, response.headers['location'], False))[0]
    try:
        html = beautify(response)
        for i in html.findAll('a', href=True):
            yield urlparse.urldefrag(
                urlparse.urljoin(response.url, i['href'], False))[0]
    except NotHtmlException:
        pass
def job(self, joborder, basedir, output_callback, **kwargs):
    # Validate job order
    validate.validate_ex(self.names.get_name("input_record_schema", ""), joborder)

    requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    hints = kwargs.get("hints", []) + self.tool.get("hints", [])

    steps = [makeTool(step, basedir) for step in self.tool.get("steps", [])]
    random.shuffle(steps)

    self.state = {}
    self.processStatus = "success"
    for i in self.tool["inputs"]:
        (_, iid) = urlparse.urldefrag(i["id"])
        if iid in joborder:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException(
                "Input '%s' not in input object and does not have a default value." % (i["id"]))

    for s in steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None
        s.completed = False

    completed = 0
    while completed < len(steps):
        made_progress = False
        completed = 0
        for step in steps:
            if step.completed:
                completed += 1
            else:
                for newjob in self.try_make_job(step, basedir,
                                                requirements=requirements,
                                                hints=hints, **kwargs):
                    if newjob:
                        made_progress = True
                        yield newjob
        if not made_progress and completed < len(steps):
            yield None

    wo = {}
    for i in self.tool["outputs"]:
        if "connect" in i:
            (_, src) = urlparse.urldefrag(i['id'])
            if i["connect"]["source"] not in self.state:
                raise WorkflowException(
                    "Connect source '%s' on parameter '%s' does not exist"
                    % (i["connect"]["source"], i["id"]))
            wo[src] = self.state[i["connect"]["source"]].value

    output_callback(wo, self.processStatus)
def crawl_web(scope, tocrawl, index, graph, url_info, limits=[-1, 0, 0.0, 1.0]):
    # returns index, graph of inlinks
    tocrawl_next = []  # used for depth control
    depth = 0
    pages = 0
    max_pages, max_depth, max_time, time_delay = limits
    if max_time > 0.0:
        start_time = time()
    while tocrawl or tocrawl_next:
        if not tocrawl:
            #
            # Descend one more level (depth)
            #
            tocrawl = tocrawl_next
            tocrawl_next = []
            depth += 1
            if max_depth >= 0 and depth > max_depth:
                print 'Reached maximum depth. Interrupting crawler.'
                break
        page = tocrawl.pop(0)
        # Remove fragment portion from the url
        page = urlparse.urldefrag(page)[0]
        if not page in graph:
            pages += 1
            print 'Crawling page:', page
            if max_time != 0.0:
                print 'time = ', time() - start_time, ' max_time = ', max_time
            if max_pages > 0:
                print 'Pages crawled:', pages, 'max_pages = ', max_pages
            # [ToDo:] Transform meta_data into a dictionary
            text, outlinks, meta_data = get_page(page)
            add_page_to_index(index, page, text)
            # Need to filter outlinks only to current scope
            outlinks = [[urlparse.urldefrag(l[0])[0], l[1]] for l in outlinks
                        if is_inscope(scope, l[0]) and
                        (l[0].endswith('.html') or l[0].endswith('.htm'))]
            newlinks = [urlparse.urldefrag(l[0])[0] for l in outlinks]
            graph[page] = outlinks
            url_info[page] = meta_data
            tocrawl_next = list(set(tocrawl_next + newlinks))
            if max_pages > 0 and pages >= max_pages:
                print 'Reached number of pages limit. Interrupting crawler.'
                break
            if max_time > 0.0 and time() - start_time > max_time:
                print 'Reached time limit. Interrupting crawler.'
                break
    tocrawl = list(set(tocrawl + tocrawl_next))
    return tocrawl, index, graph, url_info
def _toc_from_navpoint(self, item, toc, navpoint):
    children = xpath(navpoint, 'ncx:navPoint')
    for child in children:
        title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        href = xpath(child, 'ncx:content/@src')
        if not title:
            self._toc_from_navpoint(item, toc, child)
            continue
        if not href:
            gc = xpath(child, 'ncx:navPoint')
            if not gc:
                # This node is useless
                continue
            href = 'missing.html'
        href = item.abshref(urlnormalize(href[0]))
        path, _ = urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            gc = xpath(child, 'ncx:navPoint')
            if not gc:
                # This node is useless
                continue
        id = child.get('id')
        klass = child.get('class', 'chapter')
        try:
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
        except:
            po = self.oeb.toc.next_play_order()
        authorElement = xpath(child, 'descendant::calibre:meta[@name = "author"]')
        if authorElement:
            author = authorElement[0].text
        else:
            author = None
        descriptionElement = xpath(child, 'descendant::calibre:meta[@name = "description"]')
        if descriptionElement:
            description = etree.tostring(descriptionElement[0],
                                         method='text', encoding=unicode).strip()
            if not description:
                description = None
        else:
            description = None
        index_image = xpath(child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = (index_image[0].text if index_image else None)
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None
        node = toc.add(title, href, id=id, klass=klass, play_order=po,
                       description=description, author=author,
                       toc_thumbnail=toc_thumbnail)
        self._toc_from_navpoint(item, node, child)
def load_path(db, path):
    basepath, fragment = urldefrag(path)
    with closing(urllib.urlopen(basepath)) as f:
        contents = f.read()

    # Is it a db pickle?
    try:
        source = DbSource(contents)
        for a, b, cs in source.word_lists():
            db.extend(a, b, cs)
        return
    except:
        pass

    # Is it a pdf?
    try:
        source = PdfSource(contents)
        for a, b, c in source.triples():
            db.append(a, b, c)
        return
    except:
        pass

    # treat it as text
    source = TextSource(contents, fragment)
    for a, b, c in source.triples():
        db.append(a, b, c)
def __init__(self, toolpath_object, **kwargs):
    try:
        makeTool = kwargs.get("makeTool")
        self.embedded_tool = makeTool(toolpath_object["run"], **kwargs)
    except validate.ValidationException as v:
        raise WorkflowException(
            "Tool definition %s failed validation:\n%s" %
            (toolpath_object["run"]["id"], validate.indent(str(v))))

    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step_" + str(random.randint(1, 1000000000))

    for field in ("inputs", "outputs"):
        for i in toolpath_object[field]:
            inputid = i["id"]
            (_, d) = urlparse.urldefrag(inputid)
            frag = d.split(".")[-1]
            p = urlparse.urljoin(toolpath_object["run"].get("id", self.id), "#" + frag)
            found = False
            for a in self.embedded_tool.tool[field]:
                if a["id"] == p:
                    i.update(a)
                    found = True
            if not found:
                raise WorkflowException(
                    "Did not find %s parameter '%s' in workflow step" % (field, p))
            i["id"] = inputid

    super(WorkflowStep, self).__init__(toolpath_object, "Process", do_validate=False, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but SubworkflowFeatureRequirement not declared")
def __init__(self, request, timeout=180):
    self.url = urldefrag(request.url)[0]
    self.method = request.method
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own logic to handle redirects, there is no
    # need to add the callback _waitForDisconnect.
    # Specifically this avoids the AttributeError exception when
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on len of body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
def grab_links(self):
    if self.document is not None:
        for item in self.document.xpath('//a/@href'):
            item = urldefrag(item)[0]
            url = urlparse(item)
            if (url.geturl() and item not in self.crawler.visited_urls
                    and url.hostname in self.processor.allowed_urls):
                self.crawler.urls.put(item)
def serialize_href(self, href, base=None):
    '''
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="000000000" and a pointer to its location is
    stored in self.href_offsets so that the correct value can be filled
    in at the end.
    '''
    hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urldefrag(urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and base:
        path = base.abshref(path)
    if path and path not in hrefs:
        return False
    buf = self.buf
    item = hrefs[path] if path else None
    if item and item.spine_position is None:
        return False
    path = item.href if item else base.href
    href = '#'.join((path, frag)) if frag else path
    buf.write(b'filepos=')
    self.href_offsets[href].append(buf.tell())
    buf.write(b'0000000000')
    return True
def write_opf(self, guide, toc, spine, resource_map):
    mi = self.header.exth.mi
    if (self.cover_offset is not None and
            self.cover_offset < len(resource_map)):
        mi.cover = resource_map[self.cover_offset]

    if len(list(toc)) < 2:
        self.log.warn('KF8 has no metadata Table of Contents')

        for ref in guide:
            if ref.type == 'toc':
                href = ref.href()
                href, frag = urldefrag(href)
                if os.path.exists(href.replace('/', os.sep)):
                    try:
                        toc = self.read_inline_toc(href, frag)
                    except:
                        self.log.exception('Failed to read inline ToC')

    opf = OPFCreator(os.getcwdu(), mi)
    opf.guide = guide

    def exclude(path):
        return os.path.basename(path) == 'debug-raw.html'

    opf.create_manifest_from_files_in([os.getcwdu()], exclude=exclude)
    opf.create_spine(spine)
    opf.set_toc(toc)

    with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx')
    return 'metadata.opf'
def validate_document(document_loader, workflowobj, uri,
                      enable_dev=False, strict=True, preprocess_only=False):
    # type: (Loader, Dict[unicode, Any], unicode, bool, bool, bool) -> Tuple[Loader, Names, Any, Dict[str, str], unicode]
    """Validate a CWL document."""
    jobobj = None
    if "cwl:tool" in workflowobj:
        jobobj = workflowobj
        uri = urlparse.urljoin(uri, jobobj["cwl:tool"])
        del jobobj["cwl:tool"]
        workflowobj = fetch_document(uri)[1]

    if isinstance(workflowobj, list):
        workflowobj = {
            "$graph": workflowobj
        }

    fileuri = urlparse.urldefrag(uri)[0]

    if "cwlVersion" in workflowobj:
        workflowobj["cwlVersion"] = re.sub(
            r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "",
            workflowobj["cwlVersion"])
    else:
        workflowobj["cwlVersion"] = "draft-2"

    if workflowobj["cwlVersion"] == "draft-2":
        workflowobj = update._draft2toDraft3dev1(
            workflowobj, document_loader, uri, update_steps=False)
        if "@graph" in workflowobj:
            workflowobj["$graph"] = workflowobj["@graph"]
            del workflowobj["@graph"]

    (document_loader, avsc_names) = \
        process.get_schema(workflowobj["cwlVersion"])[:2]

    if isinstance(avsc_names, Exception):
        raise avsc_names

    workflowobj["id"] = fileuri
    processobj, metadata = document_loader.resolve_all(workflowobj, fileuri)

    if preprocess_only:
        return document_loader, avsc_names, processobj, metadata, uri

    document_loader.validate_links(processobj)
    schema.validate_doc(avsc_names, processobj, document_loader, strict)

    if not metadata:
        metadata = {"$namespaces": processobj.get("$namespaces", {}),
                    "$schemas": processobj.get("$schemas", []),
                    "cwlVersion": processobj["cwlVersion"]}

    if metadata.get("cwlVersion") != update.LATEST:
        processobj = update.update(
            processobj, document_loader, fileuri, enable_dev, metadata)

    if jobobj:
        metadata["cwl:defaults"] = jobobj

    return document_loader, avsc_names, processobj, metadata, uri
def serialize_guide(self):
    '''
    The Kindle decides where to open a book based on the presence of
    an item in the guide that looks like
    <reference type="text" title="Start" href="chapter-one.xhtml"/>

    Similarly an item with type="toc" controls where the Goto Table of
    Contents operation on the kindle goes.
    '''

    buf = self.buf
    hrefs = self.oeb.manifest.hrefs
    buf.write(b'<guide>')
    for ref in self.oeb.guide.values():
        path = urldefrag(ref.href)[0]
        if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
            continue

        buf.write(b'<reference type="')
        if ref.type.startswith('other.'):
            self.serialize_text(ref.type.replace('other.', ''), quot=True)
        else:
            self.serialize_text(ref.type, quot=True)
        buf.write(b'" ')
        if ref.title is not None:
            buf.write(b'title="')
            self.serialize_text(ref.title, quot=True)
            buf.write(b'" ')
            if is_guide_ref_start(ref):
                self._start_href = ref.href
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        buf.write(b' />')

    buf.write(b'</guide>')
def reduce_url(cls, url):
    """
    >>> url = "qpfer://mother/qfid#module_and_type_hint"
    >>> qfurl.reduce_url(url)
    'qpfer://mother/qfid'
    """
    return urlparse.urldefrag(url)[0].replace("///", "", 1)
def make_tool(document_loader, avsc_names, processobj, metadata, uri,
              makeTool, kwargs):
    # type: (Loader, Names, Dict[str, Any], Dict[str, Any], unicode, Callable[..., Process], Dict[str, Any]) -> Process
    """Make a Python CWL object."""
    resolveduri = document_loader.resolve_ref(uri)[0]

    if isinstance(resolveduri, list):
        if len(resolveduri) == 1:
            processobj = resolveduri[0]
        else:
            raise WorkflowException(
                u"Tool file contains graph of multiple objects, must specify "
                "one of #%s" % ", #".join(
                    urlparse.urldefrag(i["id"])[1] for i in resolveduri
                    if "id" in i))
    else:
        processobj = cast(Dict[str, Any], resolveduri)

    kwargs = kwargs.copy()
    kwargs.update({
        "makeTool": makeTool,
        "loader": document_loader,
        "avsc_names": avsc_names,
        "metadata": metadata
    })
    tool = makeTool(processobj, **kwargs)

    if "cwl:defaults" in metadata:
        jobobj = metadata["cwl:defaults"]
        for inp in tool.tool["inputs"]:
            if shortname(inp["id"]) in jobobj:
                inp["default"] = jobobj[shortname(inp["id"])]

    return tool
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            cache_key = self.cache_key(name)
            hashed_name = self.cache.get(cache_key)
            if hashed_name is None:
                hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                # set the cache if there was a miss
                # (e.g. if cache server goes down)
                self.cache.set(cache_key, hashed_name)

    final_url = super(CachedFilesMixin, self).url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
def receive_output(self, jobout, processStatus):
    _logger.debug("WorkflowStep output from run is %s", jobout)
    self.output = {}
    for i in self.tool["outputs"]:
        (_, d) = urlparse.urldefrag(i["param"] if "param" in i else i["id"])
        self.output[i["id"]] = jobout[d]
    self.processStatus = processStatus
def map_resources(self, oeb_book):
    for item in oeb_book.manifest:
        if item.media_type in OEB_IMAGES:
            if item.href not in self.images:
                ext = os.path.splitext(item.href)[1]
                fname = '%s%s' % (len(self.images), ext)
                fname = fname.zfill(10)
                self.images[item.href] = fname
        if item in oeb_book.spine:
            self.get_link_id(item.href)
            root = item.data.find(XHTML('body'))
            link_attrs = set(html.defs.link_attrs)
            link_attrs.add(XLINK('href'))
            for el in root.iter():
                attribs = el.attrib
                try:
                    if not isinstance(el.tag, basestring):
                        continue
                except:
                    continue
                for attr in attribs:
                    if attr in link_attrs:
                        href = item.abshref(attribs[attr])
                        href, id = urldefrag(href)
                        if href in self.base_hrefs:
                            self.get_link_id(href, id)
def handle_starttag(self, tag, attrs):
    """
    """
    self.html += "<%s" % tag
    for attr in attrs:
        if attr[0] == "href":
            try:
                # split anchor from url
                baseurl, anchor = urlparse.urldefrag(attr[1])
                o = self.context.restrictedTraverse(urllib.unquote(baseurl))
                if getattr(o, 'absolute_url', None):
                    url = o.absolute_url()
                else:
                    # maybe we got a view instead of a traversal object:
                    if getattr(o, 'context', None):
                        url = o.context.absolute_url()
                    else:
                        url = attr[1]
                if anchor:
                    url = '#' + anchor
            except Exception:
                url = attr[1]
            self.html += ' href="%s"' % self._encode(url)
        else:
            self.html += ' %s="%s"' % (attr)
    self.html += ">"
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urlparse.urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
def find_mention_item(self, items):
    """Returns the mf2 item that mentions (or replies to, likes, etc) the target.

    May modify the items arg, e.g. may set or replace content.html or
    content.value.

    Args:
      items: sequence of mf2 item dicts

    Returns:
      mf2 item dict or None
    """
    # find target URL in source
    for item in items:
        props = item.setdefault('properties', {})

        # find first non-empty content element
        content = props.setdefault('content', [{}])[0]
        text = content.get('html') or content.get('value')

        for type in 'in-reply-to', 'like', 'like-of', 'repost', 'repost-of':
            urls = [urlparse.urldefrag(u)[0] for u in
                    microformats2.get_string_urls(props.get(type, []))]
            if self.any_target_in(urls):
                break
        else:
            if text and self.any_target_in(text):
                type = 'post'
                url = first_value(props, 'url') or self.source_url
                name = first_value(props, 'name') or first_value(props, 'summary')
                text = content['html'] = (
                    'mentioned this in %s.' %
                    util.pretty_link(url, text=name, max_length=280))
            else:
                type = None

        if type:
            # found the target!
            rsvp = first_value(props, 'rsvp')
            if rsvp:
                self.entity.type = 'rsvp'
                if not text:
                    content['value'] = 'RSVPed %s.' % rsvp
            else:
                self.entity.type = {'in-reply-to': 'comment',
                                    'like-of': 'like',
                                    'repost-of': 'repost',
                                    }.get(type, type)
                if not text:
                    content['value'] = {'comment': 'replied to this.',
                                        'like': 'liked this.',
                                        'repost': 'reposted this.',
                                        }[self.entity.type]
            return item

        # check children in case this is eg an h-feed
        found = self.find_mention_item(item.get('children', []))
        if found:
            return found

    return None
def _spine_add_extra(self):
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = XPath('h:body//h:a/@href')
    extras = set()
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                href, _ = urldefrag(href)
                if not href:
                    continue
                try:
                    href = item.abshref(urlnormalize(href))
                except ValueError:
                    # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in OEB_DOCS or \
                        found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        unchecked = new
    version = int(self.oeb.version[0])
    for item in sorted(extras):
        if version >= 2:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
def typefmt(self, tp, redirects, nbsp=False):
    # type: (Any, Dict[str, str], bool) -> Union[str, unicode]
    global primitiveType
    if isinstance(tp, list):
        if nbsp and len(tp) <= 3:
            return " | ".join([self.typefmt(n, redirects) for n in tp])
        else:
            return " | ".join([self.typefmt(n, redirects) for n in tp])
    if isinstance(tp, dict):
        if tp["type"] == "https://w3id.org/cwl/salad#array":
            return "array<%s>" % (self.typefmt(tp["items"], redirects, nbsp=True))
        if tp["type"] in ("https://w3id.org/cwl/salad#record",
                          "https://w3id.org/cwl/salad#enum"):
            frg = schema.avro_name(tp["name"])
            if tp["name"] in redirects:
                return """<a href="%s">%s</a>""" % (redirects[tp["name"]], frg)
            elif tp["name"] in self.typemap:
                return """<a href="#%s">%s</a>""" % (to_id(frg), frg)
            else:
                return frg
        if isinstance(tp["type"], dict):
            return self.typefmt(tp["type"], redirects)
    else:
        if str(tp) in redirects:
            return """<a href="%s">%s</a>""" % (redirects[tp], redirects[tp])
        elif str(tp) in basicTypes:
            return """<a href="%s">%s</a>""" % (primitiveType, schema.avro_name(str(tp)))
        else:
            _, frg = urlparse.urldefrag(tp)
            if frg:
                tp = frg
            return """<a href="#%s">%s</a>""" % (to_id(tp), tp)
def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[], inhead=False, preserve=False): if not isinstance(elem.tag, basestring): # Don't emit any comments or raw entities return nsrmap = copy.copy(nsrmap) attrib = dict(elem.attrib) style = self.stylizer.style(elem) if self.stylizer else None for key, value in elem.nsmap.items(): if value not in nsrmap or nsrmap[value] != key: xmlns = ('xmlns:' + key) if key else 'xmlns' attrib[xmlns] = value nsrmap[value] = key tag = prefixname(elem.tag, nsrmap) tag_offset = self.buf.tell() if tag == 'head': inhead = True flags = FLAG_OPENING if not elem.text and len(elem) == 0: flags |= FLAG_CLOSING if inhead: flags |= FLAG_HEAD if style and self.is_block(style): flags |= FLAG_BLOCK self.write(0, flags) tattrs = self.tattrs[0] if tag in self.tags: index = self.tags[tag] self.write(index) if self.tattrs[index]: tattrs = self.tattrs[index] else: self.write(FLAG_CUSTOM, len(tag)+1, tag) last_break = self.page_breaks[-1][0] if self.page_breaks else None if style and last_break != tag_offset \ and style['page-break-before'] in PAGE_BREAKS: self.page_breaks.append((tag_offset, list(parents))) for attr, value in attrib.items(): attr = prefixname(attr, nsrmap) if attr in ('href', 'src'): value = urlnormalize(value) path, frag = urldefrag(value) if self.item: path = self.item.abshref(path) prefix = unichr(3) if path in self.manifest.hrefs: prefix = unichr(2) value = self.manifest.hrefs[path].id if frag: value = '#'.join((value, frag)) value = prefix + value elif attr in ('id', 'name'): self.anchors.append((value, tag_offset)) elif attr.startswith('ms--'): attr = '%' + attr[4:] elif tag == 'link' and attr == 'type' and value in OEB_STYLES: value = CSS_MIME if attr in tattrs: self.write(tattrs[attr]) else: self.write(FLAG_CUSTOM, len(attr)+1, attr) try: self.write(ATTR_NUMBER, int(value)+1) except ValueError: self.write(len(value)+1, value) self.write(0) old_preserve = preserve if style: preserve = (style['white-space'] in ('pre', 'pre-wrap')) xml_space = elem.get(XML('space')) if xml_space == 'preserve': preserve = True elif xml_space == 'normal': preserve = False if elem.text: if preserve: self.write(elem.text) elif len(elem) == 0 or not elem.text.isspace(): self.write(COLLAPSE.sub(' ', elem.text)) # else: de nada parents.append(tag_offset) child = cstyle = nstyle = None for next in chain(elem, [None]): if self.stylizer: nstyle = None if next is None else self.stylizer.style(next) if child is not None: if not preserve \ and (inhead or not nstyle or self.is_block(cstyle) or self.is_block(nstyle)) \ and child.tail and child.tail.isspace(): child.tail = None self.tree_to_binary(child, nsrmap, parents, inhead, preserve) child, cstyle = next, nstyle parents.pop() preserve = old_preserve if not flags & FLAG_CLOSING: self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0) if elem.tail and tag != 'html': tail = elem.tail if not preserve: tail = COLLAPSE.sub(' ', tail) self.write(tail) if style and style['page-break-after'] not in ('avoid', 'auto'): self.page_breaks.append((self.buf.tell(), list(parents)))
def spine_item(tocitem):
    href = urldefrag(tocitem.href)[0]
    for item in self.oeb.spine:
        if item.href == href:
            return item
def submit_video(request): sitelocation = SiteLocation.objects.get_current() if not (request.user_is_admin() or sitelocation.display_submit_button): raise Http404 # Extract construction hint, if it exists. # This is a hint that plugins can use to slightly change the behavior # of the video submission forms. construction_hint = (request.POST.get('construction_hint', None) or request.GET.get('construction_hint', None)) url = request.POST.get('url') or request.GET.get('url', '') if request.method == "GET" and not url: submit_form = forms.SubmitVideoForm( construction_hint=construction_hint) return render_to_response( 'localtv/submit_video/submit.html', {'form': submit_form}, context_instance=RequestContext(request)) else: url = urlparse.urldefrag(url)[0] submit_form = forms.SubmitVideoForm({'url': url or ''}) if submit_form.is_valid(): existing = Video.objects.filter( Q(website_url=submit_form.cleaned_data['url']) | Q(file_url=submit_form.cleaned_data['url']), site=sitelocation.site) existing.filter(status=Video.REJECTED).delete() if existing.count(): if request.user_is_admin(): # even if the video was rejected, an admin submitting it # should make it approved # FIXME: This initiates a new query against the database - # so the rejected videos which were deleted will not be # marked approved. for v in existing.exclude( status=Video.ACTIVE): v.user = request.user v.status = Video.ACTIVE v.when_approved = datetime.datetime.now() v.save() return HttpResponseRedirect( reverse('localtv_submit_thanks', args=[existing[0].pk])) else: # pick the first approved video to point the user at videos = existing.filter(status=Video.ACTIVE) if videos.count(): video = videos[0] else: video = None return render_to_response( 'localtv/submit_video/submit.html', {'form': forms.SubmitVideoForm( construction_hint=construction_hint), 'was_duplicate': True, 'video': video}, context_instance=RequestContext(request)) vidscraper_video = utils.get_vidscraper_video( submit_form.cleaned_data['url']) get_dict = {'url': submit_form.cleaned_data['url']} if 'construction_hint' in request.GET: get_dict['construction_hint'] = construction_hint if 'bookmarklet' in request.GET: get_dict['bookmarklet'] = '1' get_params = urllib.urlencode(get_dict) if vidscraper_video: if (vidscraper_video.link and vidscraper_video.link != get_dict['url']): request.POST = { 'url': vidscraper_video.link.encode('utf8')} # rerun the view, but with the canonical URL return submit_video(request) if (vidscraper_video.embed_code or (vidscraper_video.file_url and not vidscraper_video.file_url_expires)): return HttpResponseRedirect( reverse('localtv_submit_scraped_video') + '?' + get_params) # otherwise if it looks like a video file if is_video_url(submit_form.cleaned_data['url']): return HttpResponseRedirect( reverse('localtv_submit_directlink_video') + '?' + get_params) else: return HttpResponseRedirect( reverse('localtv_submit_embedrequest_video') + '?' + get_params) else: return render_to_response( 'localtv/submit_video/submit.html', {'form': submit_form}, context_instance=RequestContext(request))
print('scheme = ', p.scheme)   # Print the scheme parameter from the result
print(parse_qs(p.query))       # {'shape': ['square'], 'dpi': ['96']}

print("Query Parameters: ")
r = parse_qs('mode=topographic&pin=Boston&pin=San%20Francisco')
print(r)                       # r is a dictionary

import pprint
pp = pprint.PrettyPrinter(indent=8)
print("Query Parameters using pprint : ")
pp.pprint(r)

# Remove the anchor (#)
u = 'http://docs.python.org/library/urlparse.html#item22'
udfrag = urldefrag(u)          # the return type is a tuple:
# ('http://docs.python.org/library/urlparse.html', 'item22')
print("URL defrag :")
length = len(udfrag)           # Get the number of items in the udfrag tuple
print("Tuple Length :", length)
print(udfrag)                  # ('http://docs.python.org/library/urlparse.html', 'item22')
print(udfrag[0])               # http://docs.python.org/library/urlparse.html
print(udfrag[1])               # 'item22'
print("slice", udfrag[0:length])
print("this is a tuple: %s" % (udfrag, ))   # Another way to print a tuple

# Build a URL by calling its geturl() method.
# When combined with the urlencode() function, which knows how to build
def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)
def __init__(self, toolpath_object, **kwargs):
    (_, self.names, _) = get_schema()
    self.tool = toolpath_object
    self.requirements = kwargs.get("requirements", []) + self.tool.get(
        "requirements", [])
    self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
    if "loader" in kwargs:
        self.formatgraph = kwargs["loader"].graph
    self.validate_hints(self.tool.get("hints", []), strict=kwargs.get("strict"))

    self.schemaDefs = {}

    sd, _ = self.get_requirement("SchemaDefRequirement")
    if sd:
        sdtypes = sd["types"]
        av = schema_salad.schema.make_valid_avro(
            sdtypes, {t["name"]: t for t in sdtypes}, set())
        for i in av:
            self.schemaDefs[i["name"]] = i
        avro.schema.make_avsc_object(av, self.names)

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema", "type": "record", "fields": []
    }
    self.outputs_record_schema = {
        "name": "outputs_record_schema", "type": "record", "fields": []
    }

    for key in ("inputs", "outputs"):
        for i in self.tool[key]:
            c = copy.copy(i)
            doc_url, _ = urlparse.urldefrag(c['id'])
            c["name"] = shortname(c["id"])
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    "Missing `type` in parameter `%s`" % c["name"])

            if "default" in c and "null" not in aslist(c["type"]):
                c["type"] = ["null"] + aslist(c["type"])
            else:
                c["type"] = c["type"]

            if key == "inputs":
                self.inputs_record_schema["fields"].append(c)
            elif key == "outputs":
                self.outputs_record_schema["fields"].append(c)

    try:
        self.inputs_record_schema = schema_salad.schema.make_valid_avro(
            self.inputs_record_schema, {}, set())
        avro.schema.make_avsc_object(self.inputs_record_schema, self.names)
    except avro.schema.SchemaParseException as e:
        raise validate.ValidationException(
            "Got error `%s` while processing inputs of %s:\n%s" %
            (str(e), self.tool["id"],
             json.dumps(self.inputs_record_schema, indent=4)))

    try:
        self.outputs_record_schema = schema_salad.schema.make_valid_avro(
            self.outputs_record_schema, {}, set())
        avro.schema.make_avsc_object(self.outputs_record_schema, self.names)
    except avro.schema.SchemaParseException as e:
        raise validate.ValidationException(
            "Got error `%s` while processing outputs of %s:\n%s" %
            (str(e), self.tool["id"],
             json.dumps(self.outputs_record_schema, indent=4)))
def normalize(seed_url, link):
    # urldefrag splits the link at the first '#': everything after the '#'
    # (the fragment) is dropped, and only the part before it is kept.
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
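For illustration, a minimal standalone check of what normalize produces (the seed URL and link paths here are made up):

import urlparse  # Python 2; `from urllib import parse as urlparse` on Python 3

def normalize(seed_url, link):
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)

# Relative link: the fragment is dropped, then the link is resolved against the seed.
print(normalize('http://example.com/index.html', '/places/default/view/1#comments'))
# -> http://example.com/places/default/view/1

# Absolute link: only the fragment is removed.
print(normalize('http://example.com/index.html', 'http://example.com/about#team'))
# -> http://example.com/about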
def read_inline_toc(self, href, frag): ans = TOC() base_href = '/'.join(href.split('/')[:-1]) with open(href.replace('/', os.sep), 'rb') as f: raw = f.read().decode(self.header.codec) root = parse_html(raw, log=self.log) body = XPath('//h:body')(root) reached = False if body: start = body[0] else: start = None reached = True if frag: elems = XPath('//*[@id="%s"]' % frag)(root) if elems: start = elems[0] def node_depth(elem): ans = 0 parent = elem.getparent() while parent is not None: parent = parent.getparent() ans += 1 return ans # Layer the ToC based on nesting order in the source HTML current_depth = None parent = ans seen = set() links = [] for elem in root.iterdescendants(etree.Element): if reached and elem.tag == XHTML('a') and elem.get('href', False): href = elem.get('href') href, frag = urldefrag(href) href = base_href + '/' + href text = xml2text(elem).strip() if (text, href, frag) in seen: continue seen.add((text, href, frag)) links.append((text, href, frag, node_depth(elem))) elif elem is start: reached = True depths = sorted(set(x[-1] for x in links)) depth_map = {x: i for i, x in enumerate(depths)} for text, href, frag, depth in links: depth = depth_map[depth] if current_depth is None: current_depth = 0 parent.add_item(href, frag, text) elif current_depth == depth: parent.add_item(href, frag, text) elif current_depth < depth: parent = parent[-1] if len(parent) > 0 else parent parent.add_item(href, frag, text) current_depth += 1 else: delta = current_depth - depth while delta > 0 and parent.parent is not None: parent = parent.parent delta -= 1 parent.add_item(href, frag, text) current_depth = depth return ans
def analyze(self, fname, find_sources=False, check_remote=False): """Analyze links on a page.""" rv = False self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] self.internal_redirects = [urljoin('/', _[0]) for _ in self.site.config['REDIRECTIONS']] base_url = urlparse(self.site.config['BASE_URL']) self.existing_targets.add(self.site.config['SITE_URL']) self.existing_targets.add(self.site.config['BASE_URL']) url_type = self.site.config['URL_TYPE'] atom_extension = self.site.config['ATOM_EXTENSION'] deps = {} if find_sources: deps = _call_nikola_list(self.site, self.cache)[1] if url_type in ('absolute', 'full_path'): url_netloc_to_root = urlparse(self.site.config['BASE_URL']).path try: filename = fname if filename.startswith(self.site.config['CACHE_FOLDER']): # Do not look at links in the cache, which are not parsed by # anyone and may result in false positives. Problems arise # with galleries, for example. Full rationale: (Issue #1447) self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename)) return False if not os.path.exists(fname): # Quietly ignore files that don’t exist; use `nikola check -f` instead (Issue #1831) return False if '.html' == fname[-5:]: with open(filename, 'rb') as inf: d = lxml.html.fromstring(inf.read()) extra_objs = lxml.html.fromstring('<html/>') # Turn elements with a srcset attribute into individual img elements with src attributes for obj in list(d.xpath('(*//img|*//source)')): if 'srcset' in obj.attrib: for srcset_item in obj.attrib['srcset'].split(','): extra_objs.append(lxml.etree.Element('img', src=srcset_item.strip().split(' ')[0])) link_elements = list(d.iterlinks()) + list(extra_objs.iterlinks()) # Extract links from XML formats to minimal HTML, allowing those to go through the link checks elif atom_extension == filename[-len(atom_extension):]: d = lxml.etree.parse(filename) link_elements = lxml.html.fromstring('<html/>') for elm in d.findall('*//{http://www.w3.org/2005/Atom}link'): feed_link = elm.attrib['href'].split('?')[0].strip() # strip FEED_LINKS_APPEND_QUERY link_elements.append(lxml.etree.Element('a', href=feed_link)) link_elements = list(link_elements.iterlinks()) elif filename.endswith('sitemap.xml') or filename.endswith('sitemapindex.xml'): d = lxml.etree.parse(filename) link_elements = lxml.html.fromstring('<html/>') for elm in d.getroot().findall("*//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"): link_elements.append(lxml.etree.Element('a', href=elm.text.strip())) link_elements = list(link_elements.iterlinks()) else: # unsupported file type return False for l in link_elements: target = l[2] if target == "#": continue target = urldefrag(target)[0] if any([urlparse(target).netloc.endswith(_) for _ in ['example.com', 'example.net', 'example.org']]): self.logger.debug("Not testing example address \"{0}\".".format(target)) continue # absolute URL to root-relative if target.startswith(base_url.geturl()): target = target.replace(base_url.geturl(), '/') parsed = urlparse(target) # Warn about links from https to http (mixed-security) if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http": self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target)) # Link to an internal REDIRECTIONS page if target in self.internal_redirects: redir_status_code = 301 redir_target = [_dest for _target, _dest in self.site.config['REDIRECTIONS'] if urljoin('/', _target) == target][0] self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" 
and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target)) # Absolute links to other domains, skip # Absolute links when using only paths, skip. if ((parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc) or \ ((parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path')): if not check_remote or parsed.scheme not in ["http", "https"]: continue if target in self.checked_remote_targets: # already checked this exact target if self.checked_remote_targets[target] in [301, 308]: self.logger.warn("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target])) elif self.checked_remote_targets[target] in [302, 307]: self.logger.debug("Remote link temporarily redirected in {0}: {1} [HTTP: {2}]".format(filename, target, self.checked_remote_targets[target])) elif self.checked_remote_targets[target] > 399: self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target])) continue # Skip whitelisted targets if any(re.search(_, target) for _ in self.whitelist): continue # Check the remote link works req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy! resp = requests.head(target, headers=req_headers, allow_redirects=False) # Retry client errors (4xx) as GET requests because many servers are broken if resp.status_code >= 400 and resp.status_code <= 499: time.sleep(0.5) resp = requests.get(target, headers=req_headers, allow_redirects=False) # Follow redirects and see where they lead, redirects to errors will be reported twice if resp.status_code in [301, 302, 307, 308]: redir_status_code = resp.status_code time.sleep(0.5) # Known redirects are retested using GET because IIS servers otherwise get HEADaches resp = requests.get(target, headers=req_headers, allow_redirects=True) # Permanent redirects should be updated if redir_status_code in [301, 308]: self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code)) if redir_status_code in [302, 307]: self.logger.debug("Remote link temporarily redirected to \"{0}\" in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code)) self.checked_remote_targets[resp.url] = resp.status_code self.checked_remote_targets[target] = redir_status_code else: self.checked_remote_targets[target] = resp.status_code if resp.status_code > 399: # Error self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code)) continue elif resp.status_code <= 399: # The address leads *somewhere* that is not an error self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code)) continue self.logger.warn("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target)) continue if url_type == 'rel_path': if target.startswith('/'): target_filename = os.path.abspath( os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/')))) else: # Relative path unquoted_target = unquote(target).encode('utf-8') target_filename = os.path.abspath( os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target)) elif url_type in ('full_path', 'absolute'): relative = False if url_type == 'absolute': # convert to 'full_path' case, ie url relative to root if parsed.path.startswith(url_netloc_to_root): 
url_rel_path = parsed.path[len(url_netloc_to_root):] else: url_rel_path = parsed.path if not url_rel_path.startswith('/'): relative = True else: # convert to relative to base path if target.startswith(url_netloc_to_root): url_rel_path = target[len(url_netloc_to_root):] else: url_rel_path = target if not url_rel_path.startswith('/'): relative = True if url_rel_path == '' or url_rel_path.endswith('/'): url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE']) if relative: unquoted_target = unquote(target).encode('utf-8') target_filename = os.path.abspath( os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target)) else: fs_rel_path = fs_relpath_from_url_path(url_rel_path) target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path) if any(re.search(x, target_filename) for x in self.whitelist): continue elif target_filename not in self.existing_targets: if os.path.exists(target_filename): self.logger.info("Good link {0} => {1}".format(target, target_filename)) self.existing_targets.add(target_filename) else: rv = True self.logger.warn("Broken link in {0}: {1}".format(filename, target)) if find_sources: self.logger.warn("Possible sources:") self.logger.warn("\n".join(deps[filename])) self.logger.warn("===============================\n") except Exception as exc: self.logger.error(u"Error with: {0} {1}".format(filename, exc)) return rv
def linkto(item):
    _, frg = urlparse.urldefrag(item)
    return "[%s](#%s)" % (frg, to_id(frg))
def _manifest_add_missing(self, invalid): import cssutils manifest = self.oeb.manifest known = set(manifest.hrefs) unchecked = set(manifest.values()) cdoc = OEB_DOCS|OEB_STYLES invalid = set() while unchecked: new = set() for item in unchecked: data = None if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')): try: data = item.data except: self.oeb.log.exception(u'Failed to read from manifest ' u'entry with id: %s, ignoring'%item.id) invalid.add(item) continue if data is None: continue if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')): hrefs = [r[2] for r in iterlinks(data)] for href in hrefs: href, _ = urldefrag(href) if not href: continue try: href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme except: self.oeb.log.exception( 'Skipping invalid href: %r'%href) continue if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: try: urls = list(cssutils.getUrls(data)) except: urls = [] for url in urls: href, _ = urldefrag(url) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme if not scheme and href not in known: new.add(href) unchecked.clear() warned = set([]) for href in new: known.add(href) is_invalid = False for item in invalid: if href == item.abshref(urlnormalize(href)): is_invalid = True break if is_invalid: continue if not self.oeb.container.exists(href): if href not in warned: self.logger.warn('Referenced file %r not found' % href) warned.add(href) continue if href not in warned: self.logger.warn('Referenced file %r not in manifest' % href) warned.add(href) id, _ = manifest.generate(id='added') guessed = guess_type(href)[0] media_type = guessed or BINARY_MIME added = manifest.add(id, href, media_type) unchecked.add(added) for item in invalid: self.oeb.manifest.remove(item)
def get_src_obj_url(url):
    """Get and return source object's URL part from full URL."""
    return urldefrag(url)[0]
def urldefrag(url):
    url, fragment = _urlparse.urldefrag(url)
    return urlnormalize(url), fragment
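A self-contained sketch of how a wrapper like this behaves; urlnormalize below is only a stand-in (the real normalizer comes from the surrounding module), and the sample href is invented:

import urlparse as _urlparse           # Python 2; urllib.parse on Python 3
from urllib import quote, unquote      # Python 2

def urlnormalize(href):
    # Stand-in normalizer: re-quote each URL component consistently.
    parts = _urlparse.urlsplit(href)
    parts = [quote(unquote(p), safe='/:#?=&;') for p in parts]
    return _urlparse.urlunsplit(parts)

def urldefrag(url):
    url, fragment = _urlparse.urldefrag(url)
    return urlnormalize(url), fragment

print(urldefrag('Text/chapter 1.xhtml#note 1'))
# -> ('Text/chapter%201.xhtml', 'note 1')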
def render_type(self, f, depth): # type: (Dict[str, Any], int) -> None if f["name"] in self.rendered or f["name"] in self.redirects: return self.rendered.add(f["name"]) if "doc" not in f: f["doc"] = "" f["type"] = copy.deepcopy(f) f["doc"] = "" f = f["type"] if "doc" not in f: f["doc"] = "" def extendsfrom(item, ex): # type: (Dict[str, Any], List[Dict[str, Any]]) -> None if "extends" in item: for e in aslist(item["extends"]): ex.insert(0, self.typemap[e]) extendsfrom(self.typemap[e], ex) ex = [f] extendsfrom(f, ex) enumDesc = {} if f["type"] == "enum" and isinstance(f["doc"], list): for e in ex: for i in e["doc"]: idx = i.find(":") if idx > -1: enumDesc[i[:idx]] = i[idx + 1:] e["doc"] = [ i for i in e["doc"] if i.find(":") == -1 or i.find(" ") < i.find(":") ] f["doc"] = fix_doc(f["doc"]) if f["type"] == "record": for field in f.get("fields", []): if "doc" not in field: field["doc"] = "" if f["type"] != "documentation": lines = [] for l in f["doc"].splitlines(): if len(l) > 0 and l[0] == "#": l = ("#" * depth) + l lines.append(l) f["doc"] = "\n".join(lines) _, frg = urlparse.urldefrag(f["name"]) num = self.toc.add_entry(depth, frg) doc = "## %s %s\n" % (num, frg) else: doc = "" if self.title is None and f["doc"]: self.title = f["doc"][0:f["doc"].index("\n")] if self.title.startswith('# '): self.title = self.title[2:] if f["type"] == "documentation": f["doc"] = number_headings(self.toc, f["doc"]) # if "extends" in f: # doc += "\n\nExtends " # doc += ", ".join([" %s" % linkto(ex) for ex in aslist(f["extends"])]) # if f["name"] in self.subs: # doc += "\n\nExtended by" # doc += ", ".join([" %s" % linkto(s) for s in self.subs[f["name"]]]) # if f["name"] in self.uses: # doc += "\n\nReferenced by" # doc += ", ".join([" [%s.%s](#%s)" % (s[0], s[1], to_id(s[0])) # for s in self.uses[f["name"]]]) doc = doc + "\n\n" + f["doc"] doc = mistune.markdown(doc, renderer=MyRenderer()) if f["type"] == "record": doc += "<h3>Fields</h3>" doc += """<table class="table table-striped">""" doc += "<tr><th>field</th><th>type</th><th>required</th><th>description</th></tr>" required = [] optional = [] for i in f.get("fields", []): tp = i["type"] if isinstance( tp, list) and tp[0] == "https://w3id.org/cwl/salad#null": opt = False tp = tp[1:] else: opt = True desc = i["doc"] # if "inherited_from" in i: # desc = "%s _Inherited from %s_" % (desc, linkto(i["inherited_from"])) rfrg = schema.avro_name(i["name"]) tr = "<td><code>%s</code></td><td>%s</td><td>%s</td>"\ "<td>%s</td>" % ( rfrg, self.typefmt(tp, self.redirects), opt, mistune.markdown(desc)) if opt: required.append(tr) else: optional.append(tr) for i in required + optional: doc += "<tr>" + i + "</tr>" doc += """</table>""" elif f["type"] == "enum": doc += "<h3>Symbols</h3>" doc += """<table class="table table-striped">""" doc += "<tr><th>symbol</th><th>description</th></tr>" for e in ex: for i in e.get("symbols", []): doc += "<tr>" efrg = schema.avro_name(i) doc += "<td><code>%s</code></td><td>%s</td>" % ( efrg, enumDesc.get(efrg, "")) doc += "</tr>" doc += """</table>""" f["doc"] = doc self.typedoc.write(f["doc"]) subs = self.docParent.get(f["name"], []) + \ self.record_refs.get(f["name"], []) if len(subs) == 1: self.render_type(self.typemap[subs[0]], depth) else: for s in subs: self.render_type(self.typemap[s], depth + 1) for s in self.docAfter.get(f["name"], []): self.render_type(self.typemap[s], depth)
def __init__(self, toolpath_object, validateAs, do_validate=True, **kwargs):
    (_, self.names) = get_schema()
    self.tool = toolpath_object

    if do_validate:
        try:
            # Validate tool document
            validate.validate_ex(self.names.get_name(validateAs, ""),
                                 self.tool, strict=kwargs.get("strict"))
        except validate.ValidationException as v:
            raise validate.ValidationException(
                "Could not validate %s as %s:\n%s" %
                (self.tool.get("id"), validateAs, validate.indent(str(v))))

    self.requirements = kwargs.get("requirements", []) + self.tool.get(
        "requirements", [])
    self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
    self.validate_hints(self.tool.get("hints", []), strict=kwargs.get("strict"))

    self.schemaDefs = {}

    sd, _ = self.get_requirement("SchemaDefRequirement")
    if sd:
        for i in sd["types"]:
            avro.schema.make_avsc_object(i, self.names)
            self.schemaDefs[i["name"]] = i

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema", "type": "record", "fields": []
    }
    for i in self.tool["inputs"]:
        c = copy.copy(i)
        doc_url, fragment = urlparse.urldefrag(c['id'])
        c["name"] = fragment
        del c["id"]

        if "type" not in c:
            raise validate.ValidationException(
                "Missing `type` in parameter `%s`" % c["name"])

        if "default" in c:
            c["type"] = ["null"] + aslist(c["type"])
        else:
            c["type"] = c["type"]

        self.inputs_record_schema["fields"].append(c)

    avro.schema.make_avsc_object(self.inputs_record_schema, self.names)

    self.outputs_record_schema = {
        "name": "outputs_record_schema", "type": "record", "fields": []
    }
    for i in self.tool["outputs"]:
        c = copy.copy(i)
        doc_url, fragment = urlparse.urldefrag(c['id'])
        c["name"] = fragment
        del c["id"]

        if "type" not in c:
            raise validate.ValidationException(
                "Missing `type` in parameter `%s`" % c["name"])

        if "default" in c:
            c["type"] = ["null"] + aslist(c["type"])
        else:
            c["type"] = c["type"]

        self.outputs_record_schema["fields"].append(c)

    avro.schema.make_avsc_object(self.outputs_record_schema, self.names)
"""
The urlsplit() function is an alternative to urlparse(). It behaves a little
differently because it does not split the parameters from the URL. This is
useful for URLs following RFC 2396, which supports parameters for each segment
of the path. Since the parameters are not split out, the tuple API will show
five elements instead of six, and there is no params attribute.
"""
from urlparse import urlsplit

url = 'http://*****:*****@NetLoc:80/p1;param/p2;param?query=arg#frag'
parsed = urlsplit(url)
print "using urlsplit=", parsed
print 'scheme :', parsed.scheme
print 'netloc :', parsed.netloc
print 'path:', parsed.path
print 'query :', parsed.query
print 'fragment:', parsed.fragment
print 'username:', parsed.username
print 'password:', parsed.password
print 'hostname:', parsed.hostname, '(netloc in lowercase)'
print 'port:', parsed.port
print "============================================"

"""
To simply strip the fragment identifier from a URL, such as when finding a
base page name from a URL, use urldefrag().
"""
from urlparse import urldefrag

original = 'http://netloc/path;param?query=arg#frag'
print 'original:', original
url, fragment = urldefrag(original)
print 'url:', url
print 'fragment:', fragment
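One detail worth noting when porting these examples: in Python 2, urlparse.urldefrag() returns a plain 2-tuple, while in Python 3 the same function lives in urllib.parse and returns a DefragResult named tuple with url and fragment attributes (it still unpacks like a tuple). A minimal sketch of the Python 3 form, using the same illustrative URL:

from urllib.parse import urldefrag

result = urldefrag('http://netloc/path;param?query=arg#frag')
print(result.url)       # http://netloc/path;param?query=arg
print(result.fragment)  # frag
url, fragment = result  # named tuple still unpacks like a plain tuple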
def resolve_ref( self, ref, # type: Union[CommentedMap, CommentedSeq, unicode] base_url=None, # type: unicode checklinks=True # type: bool ): # type: (...) -> Tuple[Union[CommentedMap, CommentedSeq, unicode], Dict[unicode, Any]] obj = None # type: CommentedMap resolved_obj = None # type: Union[CommentedMap, CommentedSeq, unicode] inc = False mixin = None # type: Dict[unicode, Any] if not base_url: base_url = file_uri(os.getcwd()) + "/" if isinstance(ref, (str, unicode)) and os.sep == "\\": # Convert Windows path separator in ref ref = ref.replace("\\", "/") sl = SourceLine(obj, None, ValueError) # If `ref` is a dict, look for special directives. if isinstance(ref, CommentedMap): obj = ref if "$import" in obj: sl = SourceLine(obj, "$import", RuntimeError) if len(obj) == 1: ref = obj[u"$import"] obj = None else: raise sl.makeError( u"'$import' must be the only field in %s" % (unicode(obj))) elif "$include" in obj: sl = SourceLine(obj, "$include", RuntimeError) if len(obj) == 1: ref = obj[u"$include"] inc = True obj = None else: raise sl.makeError( u"'$include' must be the only field in %s" % (unicode(obj))) elif "$mixin" in obj: sl = SourceLine(obj, "$mixin", RuntimeError) ref = obj[u"$mixin"] mixin = obj obj = None else: ref = None for identifier in self.identifiers: if identifier in obj: ref = obj[identifier] break if not ref: raise sl.makeError( u"Object `%s` does not have identifier field in %s" % (relname(obj), self.identifiers)) if not isinstance(ref, (str, unicode)): raise ValueError(u"Expected CommentedMap or string, got %s: `%s`" % (type(ref), unicode(ref))) url = self.expand_url(ref, base_url, scoped_id=(obj is not None)) # Has this reference been loaded already? if url in self.idx and (not mixin): return self.idx[url], {} sl.raise_type = RuntimeError with sl: # "$include" directive means load raw text if inc: return self.fetch_text(url), {} doc = None if obj: for identifier in self.identifiers: obj[identifier] = url doc_url = url else: # Load structured document doc_url, frg = urlparse.urldefrag(url) if doc_url in self.idx and (not mixin): # If the base document is in the index, it was already loaded, # so if we didn't find the reference earlier then it must not # exist. raise validate.ValidationException( u"Reference `#%s` not found in file `%s`." % (frg, doc_url)) doc = self.fetch(doc_url, inject_ids=(not mixin)) # Recursively expand urls and resolve directives if mixin: doc = copy.deepcopy(doc) doc.update(mixin) del doc["$mixin"] url = None resolved_obj, metadata = self.resolve_all(doc, base_url, file_base=doc_url, checklinks=checklinks) else: resolved_obj, metadata = self.resolve_all(doc if doc else obj, doc_url, checklinks=checklinks) # Requested reference should be in the index now, otherwise it's a bad # reference if url is not None: if url in self.idx: resolved_obj = self.idx[url] else: raise RuntimeError( "Reference `%s` is not in the index. Index contains:\n %s" % (url, "\n ".join(self.idx))) if isinstance(resolved_obj, CommentedMap): if u"$graph" in resolved_obj: metadata = _copy_dict_without_key(resolved_obj, u"$graph") return resolved_obj[u"$graph"], metadata else: return resolved_obj, metadata else: return resolved_obj, metadata
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)
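A quick usage sketch of the helper above (the URLs are made-up examples): the fragment is dropped first, then the remaining link is resolved against the seed URL.

print normalize('http://example.com/index.html', '/about.html#team')
# http://example.com/about.html
print normalize('http://example.com/a/', 'b.html#top')
# http://example.com/a/b.html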
def resolve_ref(self, ref, base_url=None, checklinks=True): # type: (Union[Dict[unicode, Any], unicode], unicode, bool) -> Tuple[Union[List, Dict[unicode, Any], unicode], Dict[unicode, Any]] base_url = base_url or u'file://%s/' % os.path.abspath('.') obj = None # type: Dict[unicode, Any] inc = False # If `ref` is a dict, look for special directives. if isinstance(ref, dict): obj = ref if u"$import" in ref: if len(obj) == 1: ref = obj[u"$import"] obj = None else: raise ValueError("'$import' must be the only field in %s" % (str(obj))) elif u"$include" in obj: if len(obj) == 1: ref = obj[u"$include"] inc = True obj = None else: raise ValueError( "'$include' must be the only field in %s" % (str(obj))) else: ref = None for identifier in self.identifiers: if identifier in obj: ref = obj[identifier] break if not ref: raise ValueError( "Object `%s` does not have identifier field in %s" % (obj, self.identifiers)) if not isinstance(ref, (str, unicode)): raise ValueError("Must be string: `%s`" % str(ref)) url = self.expand_url(ref, base_url, scoped_id=(obj is not None)) # Has this reference been loaded already? if url in self.idx: return self.idx[url], {} # "$include" directive means load raw text if inc: return self.fetch_text(url), {} doc = None if obj: for identifier in self.identifiers: obj[identifier] = url doc_url = url else: # Load structured document doc_url, frg = urlparse.urldefrag(url) if doc_url in self.idx: raise validate.ValidationException( "Reference `#%s` not found in file `%s`." % (frg, doc_url)) doc = self.fetch(doc_url) # Recursively expand urls and resolve directives resolved_obj, metadata = self.resolve_all(doc if doc else obj, doc_url, checklinks=checklinks) # Requested reference should be in the index now, otherwise it's a bad # reference if url is not None: if url in self.idx: resolved_obj = self.idx[url] else: raise RuntimeError("Reference `%s` is not in the index. " "Index contains:\n %s" % (url, "\n ".join(self.idx))) if isinstance(resolved_obj, (dict)): if u"$graph" in resolved_obj: metadata = _copy_dict_without_key(resolved_obj, u"$graph") return resolved_obj[u"$graph"], metadata else: return resolved_obj, metadata else: return resolved_obj, metadata
def normalize(seed_url, link):
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
def storage_volume_deletable(self, request): """ Returns a list of domains that use the given volume. options: [{ 'domainURI': <domain URI>, 'pool': <pool name>, 'source': <file name> }, ...] return: [{ 'domainURI': <domain URI>, 'pool': <pool name>, 'source': <file name> 'deletable': (True|False|None) }, ...] where 'deletebale' is True: disk can be deleted False: disk is shared and should not be deleted None: disk can not be deleted """ _tmp_cache = {} for volume in request.options: # safe default: not deletable volume['deletable'] = None node_uri, domain_uuid = urldefrag(volume['domainURI']) # Must be in a pool pool = self.get_pool(node_uri, volume['pool']) if not pool: continue # Pool must be modifiable if pool['type'] not in POOLS_RW: continue # Pool must be mapped to the file system pool_path = pool['path'] if not pool_path: continue volume_path = volume['source'] # check if volume is used by any other domain success, result = self.uvmm.send('STORAGE_VOLUME_USEDBY', None, volume=volume_path) if not success: raise UMC_Error( _('Failed to check if the drive is used by any other virtual instance' )) if len(result) > 1: # is used by at least one other domain volume['deletable'] = False continue try: domain = _tmp_cache[volume['domainURI']] except LookupError: success, domain = self.uvmm.send('DOMAIN_INFO', None, uri=node_uri, domain=domain_uuid) if not success: raise UMC_Error( _('Could not retrieve details for domain %s') % domain_uuid) _tmp_cache[volume['domainURI']] = domain drive = None for disk in domain.disks: if disk.source == volume_path: drive = disk break else: continue volume['deletable'] = drive.device == Disk.DEVICE_DISK self.finished(request.id, request.options)
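The handler above splits each domainURI with urldefrag(): the part before the hash is the node URI and the fragment is the domain UUID used for the DOMAIN_INFO lookup. A small illustration of that split (the node URI and UUID are made-up values, not from a real UVMM setup):

from urlparse import urldefrag

node_uri, domain_uuid = urldefrag(
    'qemu://node01.example.com/system#0f3adf54-2c03-4f0d-b20c-4f9d3e2a7c11')
print node_uri      # qemu://node01.example.com/system
print domain_uuid   # 0f3adf54-2c03-4f0d-b20c-4f9d3e2a7c11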
def _toc_from_navpoint(self, item, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) href = xpath(child, 'ncx:content/@src') if not title: self._toc_from_navpoint(item, toc, child) continue if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'): # This node is useless continue href = item.abshref(urlnormalize( href[0])) if href and href[0] else '' path, _ = urldefrag(href) if href and path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) gc = xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue id = child.get('id') klass = child.get('class', 'chapter') try: po = int(child.get('playOrder', self.oeb.toc.next_play_order())) except: po = self.oeb.toc.next_play_order() authorElement = xpath( child, 'descendant::calibre:meta[@name = "author"]') if authorElement: author = authorElement[0].text else: author = None descriptionElement = xpath( child, 'descendant::calibre:meta[@name = "description"]') if descriptionElement: description = etree.tostring(descriptionElement[0], method='text', encoding=unicode).strip() if not description: description = None else: description = None index_image = xpath( child, 'descendant::calibre:meta[@name = "toc_thumbnail"]') toc_thumbnail = (index_image[0].text if index_image else None) if not toc_thumbnail or not toc_thumbnail.strip(): toc_thumbnail = None node = toc.add(title, href, id=id, klass=klass, play_order=po, description=description, author=author, toc_thumbnail=toc_thumbnail) self._toc_from_navpoint(item, node, child)
def scandeps(base, doc, reffields, urlfields, loadref, urljoin=urlparse.urljoin): # type: (Text, Any, Set[Text], Set[Text], Callable[[Text, Text], Any], Callable[[Text, Text], Text]) -> List[Dict[Text, Text]] r = [] # type: List[Dict[Text, Text]] deps = None # type: Dict[Text, Any] if isinstance(doc, dict): if "id" in doc: if doc["id"].startswith("file://"): df, _ = urlparse.urldefrag(doc["id"]) if base != df: r.append({"class": "File", "location": df}) base = df if doc.get("class") in ("File", "Directory") and "location" in urlfields: u = doc.get("location", doc.get("path")) if u and not u.startswith("_:"): deps = {"class": doc["class"], "location": urljoin(base, u)} if doc["class"] == "Directory" and "listing" in doc: deps["listing"] = doc["listing"] if doc["class"] == "File" and "secondaryFiles" in doc: deps["secondaryFiles"] = doc["secondaryFiles"] deps = nestdir(base, deps) r.append(deps) else: if doc["class"] == "Directory" and "listing" in doc: r.extend( scandeps(base, doc["listing"], reffields, urlfields, loadref, urljoin=urljoin)) elif doc["class"] == "File" and "secondaryFiles" in doc: r.extend( scandeps(base, doc["secondaryFiles"], reffields, urlfields, loadref, urljoin=urljoin)) for k, v in doc.iteritems(): if k in reffields: for u in aslist(v): if isinstance(u, dict): r.extend( scandeps(base, u, reffields, urlfields, loadref, urljoin=urljoin)) else: sub = loadref(base, u) subid = urljoin(base, u) deps = {"class": "File", "location": subid} sf = scandeps(subid, sub, reffields, urlfields, loadref, urljoin=urljoin) if sf: deps["secondaryFiles"] = sf deps = nestdir(base, deps) r.append(deps) elif k in urlfields and k != "location": for u in aslist(v): deps = {"class": "File", "location": urljoin(base, u)} deps = nestdir(base, deps) r.append(deps) elif k not in ("listing", "secondaryFiles"): r.extend( scandeps(base, v, reffields, urlfields, loadref, urljoin=urljoin)) elif isinstance(doc, list): for d in doc: r.extend( scandeps(base, d, reffields, urlfields, loadref, urljoin=urljoin)) if r: normalizeFilesDirs(r) r = mergedirs(r) return r
def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url
def normalize(seed_url, link):
    # split link into a (url-without-fragment, fragment) pair and keep the url
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
def fetch(self): """Attempt to fetch the contents of the URL. If successful, and the data is HTML, extract further links and add them to the crawler. Redirects are also added back there. """ while self.tries < self.max_tries: self.tries += 1 self.request = None try: self.request = Request(self.log, self.url, self.crawler.pool) yield From(self.request.connect()) yield From(self.request.send_request()) self.response = yield From(self.request.get_response()) self.body = yield From(self.response.read()) h_conn = self.response.get_header('connection').lower() if h_conn != 'close': self.request.close(recycle=True) self.request = None if self.tries > 1: self.log(1, 'try', self.tries, 'for', self.url, 'success') break except (BadStatusLine, OSError) as exc: self.exceptions.append(exc) self.log(1, 'try', self.tries, 'for', self.url, 'raised', repr(exc)) ##import pdb; pdb.set_trace() # Don't reuse the connection in this case. finally: if self.request is not None: self.request.close() else: # We never broke out of the while loop, i.e. all tries failed. self.log(0, 'no success for', self.url, 'in', self.max_tries, 'tries') return next_url = self.response.get_redirect_url() if next_url: self.next_url = urlparse.urljoin(self.url, next_url) if self.max_redirect > 0: self.log(1, 'redirect to', self.next_url, 'from', self.url) self.crawler.add_url(self.next_url, self.max_redirect - 1) else: self.log(0, 'redirect limit reached for', self.next_url, 'from', self.url) else: if self.response.status == 200: self.ctype = self.response.get_header('content-type') self.pdict = {} if self.ctype: self.ctype, self.pdict = cgi.parse_header(self.ctype) self.encoding = self.pdict.get('charset', 'utf-8') if self.ctype == 'text/html': body = self.body.decode(self.encoding, 'replace') # Replace href with (?:href|src) to follow image links. self.urls = set( re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', body)) if self.urls: self.log(1, 'got', len(self.urls), 'distinct urls from', self.url) self.new_urls = set() for url in self.urls: url = unescape(url) url = urlparse.urljoin(self.url, url) url, frag = urlparse.urldefrag(url) if self.crawler.add_url(url): self.new_urls.add(url)
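The last few lines of fetch() show the usual crawler canonicalization: each discovered href is resolved against the current page with urljoin(), then defragged so that page.html and page.html#section count as the same URL. A standalone sketch of that combination (example URLs only):

import urlparse

base = 'http://example.com/docs/page.html'
for href in ['#intro', 'other.html#part2', '/index.html']:
    absolute = urlparse.urljoin(base, href)
    url, _ = urlparse.urldefrag(absolute)
    print url
# http://example.com/docs/page.html
# http://example.com/docs/other.html
# http://example.com/index.html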
def analyze(self, task, find_sources=False): rv = False self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] base_url = urlparse(self.site.config['BASE_URL']) self.existing_targets.add(self.site.config['SITE_URL']) self.existing_targets.add(self.site.config['BASE_URL']) url_type = self.site.config['URL_TYPE'] if url_type == 'absolute': url_netloc_to_root = urlparse(self.site.config['SITE_URL']).path try: filename = task.split(":")[-1] if filename.startswith(self.site.config['CACHE_FOLDER']): # Do not look at links in the cache, which are not parsed by # anyone and may result in false positives. Problems arise # with galleries, for example. Full rationale: (Issue #1447) self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename)) return False d = lxml.html.fromstring(open(filename, 'rb').read()) for l in d.iterlinks(): target = l[0].attrib[l[1]] if target == "#": continue target, _ = urldefrag(target) parsed = urlparse(target) # Warn about links from https to http (mixed-security) if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http": self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target)) # Absolute links when using only paths, skip. if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'): continue # Absolute links to other domains, skip if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc: continue if url_type == 'rel_path': target_filename = os.path.abspath( os.path.join(os.path.dirname(filename), unquote(target))) elif url_type in ('full_path', 'absolute'): if url_type == 'absolute': # convert to 'full_path' case, ie url relative to root url_rel_path = target.path[len(url_netloc_to_root):] else: url_rel_path = target.path if url_rel_path == '' or url_rel_path.endswith('/'): url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE']) fs_rel_path = fs_relpath_from_url_path(url_rel_path) target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path) if any(re.match(x, target_filename) for x in self.whitelist): continue elif target_filename not in self.existing_targets: if os.path.exists(target_filename): self.logger.notice("Good link {0} => {1}".format(target, target_filename)) self.existing_targets.add(target_filename) else: rv = True self.logger.warn("Broken link in {0}: {1}".format(filename, target)) if find_sources: self.logger.warn("Possible sources:") self.logger.warn("\n".join(_call_nikola_list(self.site, ["--deps", task]))) self.logger.warn("===============================\n") except Exception as exc: self.logger.error("Error with: {0} {1}".format(filename, exc)) return rv
def resolve_ref(self, ref, base_url=None):
    base_url = base_url or 'file://%s/' % os.path.abspath('.')

    obj = None

    # If `ref` is a dict, look for special directives.
    if isinstance(ref, dict):
        obj = ref
        if "import" in ref:
            if len(obj) == 1:
                ref = obj["import"]
                obj = None
            else:
                raise ValueError("'import' must be the only field in %s" %
                                 (str(obj)))
        elif "include" in obj:
            if len(obj) == 1:
                ref = obj["include"]
            else:
                raise ValueError("'include' must be the only field in %s" %
                                 (str(obj)))
        else:
            if "id" in obj:
                ref = obj["id"]
            else:
                raise ValueError("Object `%s` does not have `id` field" % obj)

    if not isinstance(ref, basestring):
        raise ValueError("Must be string: `%s`" % str(ref))

    url = expand_url(ref, base_url)

    # Has this reference been loaded already?
    if url in self.idx:
        return self.idx[url]

    # "include" directive means load raw text
    if obj and "include" in obj:
        return self.fetch_text(url)

    if obj:
        obj["id"] = url
        self.idx[url] = obj
    else:
        # Load structured document
        doc_url, frg = urlparse.urldefrag(url)
        if doc_url in self.idx:
            raise validate.ValidationException(
                "Reference `#%s` not found in file `%s`." % (frg, doc_url))
        obj = self.fetch(doc_url)

    # Recursively expand urls and resolve directives
    self.resolve_all(obj, url)

    # Requested reference should be in the index now, otherwise it's a bad
    # reference
    if self.idx.get(url) is not None:
        return self.idx[url]
    else:
        raise RuntimeError(
            "Reference `%s` is not in the index. Index contains:\n %s" %
            (url, "\n ".join(self.idx)))
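In the fallback branch above, the reference URL is split so the whole document can be fetched once and the fragment looked up in the index afterwards. A brief illustration of that split (the reference is a made-up example):

import urlparse

doc_url, frg = urlparse.urldefrag('file:///schemas/metaschema.yml#RecordField')
print doc_url  # file:///schemas/metaschema.yml  -> document to fetch
print frg      # RecordField                     -> entry resolved via self.idx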
def __init__(self, url):
    self.__url = urlparse.urldefrag(url)
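Note that urlparse.urldefrag() returns a (url, fragment) pair, so the snippet above stores a 2-tuple rather than a bare string; a caller would presumably read self.__url[0] for the defragmented URL. A tiny check (example URL only):

import urlparse

print urlparse.urldefrag('http://example.com/page#sec')
# ('http://example.com/page', 'sec')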
def binary_to_text_inner(self, bin, buf, stack):
    # Pop the current parser frame and walk the binary tag stream,
    # emitting markup text via a small state machine.
    (depth, tag_name, current_map, dynamic_tag, errors,
     in_censorship, is_goingdown, state, flags) = stack.pop()
    if state == 'close tag':
        if not tag_name:
            raise LitError('Tag ends before it begins.')
        buf.write(encode(u''.join(('</', tag_name, '>'))))
        dynamic_tag = 0
        tag_name = None
        state = 'text'
    while self.cpos < len(bin):
        c, self.cpos = read_utf8_char(bin, self.cpos)
        oc = ord(c)
        if state == 'text':
            if oc == 0:
                state = 'get flags'
                continue
            elif c == '\v':
                c = '\n'
            elif c == '>':
                c = '&gt;&gt;'
            elif c == '<':
                c = '&lt;&lt;'
            buf.write(encode(c))
        elif state == 'get flags':
            if oc == 0:
                state = 'text'
                continue
            flags = oc
            state = 'get tag'
        elif state == 'get tag':
            state = 'text' if oc == 0 else 'get attr'
            if flags & FLAG_OPENING:
                tag = oc
                buf.write('<')
                if not (flags & FLAG_CLOSING):
                    is_goingdown = True
                if tag == 0x8000:
                    state = 'get custom length'
                    continue
                if flags & FLAG_ATOM:
                    if not self.tag_atoms or tag not in self.tag_atoms:
                        raise LitError(
                            "atom tag %d not in atom tag list" % tag)
                    tag_name = self.tag_atoms[tag]
                    current_map = self.attr_atoms
                elif tag < len(self.tag_map):
                    tag_name = self.tag_map[tag]
                    current_map = self.tag_to_attr_map[tag]
                else:
                    dynamic_tag += 1
                    errors += 1
                    tag_name = '?' + unichr(tag) + '?'
                    current_map = self.tag_to_attr_map[tag]
                    print 'WARNING: tag %s unknown' % unichr(tag)
                buf.write(encode(tag_name))
            elif flags & FLAG_CLOSING:
                if depth == 0:
                    raise LitError('Extra closing tag %s at %d' %
                                   (tag_name, self.cpos))
                break
        elif state == 'get attr':
            in_censorship = False
            if oc == 0:
                state = 'text'
                if not is_goingdown:
                    tag_name = None
                    dynamic_tag = 0
                    buf.write(' />')
                else:
                    buf.write('>')
                    frame = (depth, tag_name, current_map, dynamic_tag,
                             errors, in_censorship, False, 'close tag',
                             flags)
                    stack.append(frame)
                    frame = (depth + 1, None, None, 0, 0, False, False,
                             'text', 0)
                    stack.append(frame)
                    break
            else:
                if oc == 0x8000:
                    state = 'get attr length'
                    continue
                attr = None
                if current_map and oc in current_map and current_map[oc]:
                    attr = current_map[oc]
                elif oc in self.attr_map:
                    attr = self.attr_map[oc]
                if not attr or not isinstance(attr, basestring):
                    raise LitError('Unknown attribute %d in tag %s' %
                                   (oc, tag_name))
                if attr.startswith('%'):
                    in_censorship = True
                    state = 'get value length'
                    continue
                buf.write(' ' + encode(attr) + '=')
                if attr in ['href', 'src']:
                    state = 'get href length'
                else:
                    state = 'get value length'
        elif state == 'get value length':
            if not in_censorship:
                buf.write('"')
            count = oc - 1
            if count == 0:
                if not in_censorship:
                    buf.write('"')
                in_censorship = False
                state = 'get attr'
                continue
            state = 'get value'
            if oc == 0xffff:
                continue
            if count < 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
        elif state == 'get value':
            if count == 0xfffe:
                if not in_censorship:
                    buf.write('%s"' % (oc - 1))
                in_censorship = False
                state = 'get attr'
            elif count > 0:
                if not in_censorship:
                    # Escape markup characters inside attribute values.
                    if c == '"':
                        c = '&quot;'
                    elif c == '<':
                        c = '&lt;'
                    buf.write(c.encode('ascii', 'xmlcharrefreplace'))
                count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write('"')
                    in_censorship = False
                    state = 'get attr'
        elif state == 'get custom length':
            count = oc - 1
            if count <= 0 or count > len(bin) - self.cpos:
                raise LitError('Invalid character count %d' % count)
            dynamic_tag += 1
            state = 'get custom'
            tag_name = ''
        elif state == 'get custom':
            tag_name += c
            count -= 1
            if count == 0:
                buf.write(encode(tag_name))
                state = 'get attr'
        elif state == 'get attr length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            buf.write(' ')
            state = 'get custom attr'
        elif state == 'get custom attr':
            buf.write(encode(c))
            count -= 1
            if count == 0:
                buf.write('=')
                state = 'get value length'
        elif state == 'get href length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            href = ''
            state = 'get href'
        elif state == 'get href':
            href += c
            count -= 1
            if count == 0:
                doc, frag = urldefrag(href[1:])
                path = self.item_path(doc)
                if frag:
                    path = '#'.join((path, frag))
                path = urlnormalize(path)
                buf.write(encode(u'"%s"' % path))
                state = 'get attr'