def browse(browseUrlObject):
    """Fetch one page and register the links found on it.

    Downloads ``browseUrlObject.url``, marks it in the module-level
    ``visited`` table and, for every ``<a>`` element of the page, resolves
    the anchor's ``href`` through ``generateUrl``.  Each non-empty resolved
    URL either has the ``count`` of its existing entry in the module-level
    ``dict`` table incremented by one, or gets a fresh ``link`` entry.

    If the page cannot be downloaded or parsed, its entry is removed from
    ``dict``, the object is printed and it is stored in ``brokenUrls``.

    Args:
        browseUrlObject: object exposing a ``url`` attribute (presumably a
            ``link`` instance -- confirm against callers).

    Returns:
        None.  All results are written to the module-level tables.
    """
    browseUrl = browseUrlObject.url

    try:
        page = URL(browseUrl)
        dom = DOM(page.download())
        visited[browseUrl] = 1
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit still stop the crawl instead of being recorded as a
        # broken link.
        # ``pop`` instead of ``del``: the URL may not be registered, and a
        # KeyError raised here would prevent the broken URL from being
        # recorded below.
        dict.pop(browseUrl, None)
        # Parenthesised so the line is valid on both Python 2 and 3.
        print(browseUrlObject)
        brokenUrls[browseUrl] = browseUrlObject
        return

    try:
        for anchor in dom.by_tag("a"):
            target = generateUrl(baseUrl, browseUrl, anchor.href)
            if target == "":
                # An empty string from generateUrl means "do not follow".
                continue
            if target in dict:
                dict[target].count += 1
            else:
                # NOTE(review): arguments look like (referring page,
                # initial count, url) -- confirm against the ``link`` class.
                dict[target] = link(browseUrl, 0, target)
    except Exception:
        # Link extraction is deliberately best-effort: a failure on one
        # anchor stops processing of this page but never marks the page
        # itself as broken.
        pass
if re.search(r"^/", href): return root + href if href.startswith("http://") or href.startswith("www."): return "" s = baseurl.split("/") s = baseurl.replace("/" + s.pop(), "/") return s + "href" dict = {} brokenUrls = {} visited = {} rootUrl = "http://winzip.com/win/en/index.htm" baseUrl = "http://winzip.com" dict[rootUrl] = link(baseUrl, 0, rootUrl) def browse(browseUrlObject): browseUrl = browseUrlObject.url try: url = URL(browseUrl) dom = DOM(url.download()) visited[browseUrl] = 1 try: for anchor in dom.by_tag("a"): url = generateUrl(baseUrl, browseUrl, anchor.href) if url != "": try: dict[url].count = dict[url].count + 1 except: