コード例 #1
0
ファイル: mycrawler.py プロジェクト: somit/crawlers
def browse(browseUrlObject):
    """Fetch one page and register every outgoing link found on it.

    browseUrlObject: a `link` object whose .url attribute is the page to
    visit (presumably — verify against link's definition elsewhere).
    Side effects on module globals: marks the URL in `visited`, bumps or
    creates entries in `dict`, and on download failure moves the URL
    from `dict` into `brokenUrls`.
    """
    browseUrl = browseUrlObject.url
    try:
        url = URL(browseUrl)
        dom = DOM(url.download())
    except Exception:
        # Download or parse failed: drop the entry and remember it as
        # broken.  pop() instead of `del` so a missing key cannot raise
        # a second exception inside the handler.
        dict.pop(browseUrl, None)
        print(browseUrlObject)
        brokenUrls[browseUrl] = browseUrlObject
        return
    visited[browseUrl] = 1
    for anchor in dom.by_tag("a"):
        try:
            target = generateUrl(baseUrl, browseUrl, anchor.href)
        except Exception:
            # Anchor without a usable href — best effort: skip this one
            # instead of aborting the whole loop as the old bare except
            # around the loop effectively did.
            continue
        if target != "":
            try:
                # Seen before: just bump the reference count.
                dict[target].count += 1
            except KeyError:
                # First sighting: record a new link found on this page.
                dict[target] = link(browseUrl, 0, target)
コード例 #2
0
ファイル: mycrawler.py プロジェクト: somit/crawlers
    if re.search(r"^/", href):
        return root + href
    if href.startswith("http://") or href.startswith("www."):
        return ""
    s = baseurl.split("/")
    s = baseurl.replace("/" + s.pop(), "/")
    return s + "href"


# Module-level crawl state shared by browse() below.
# NOTE(review): `dict` shadows the builtin — kept as-is because the rest
# of this file references the global by this name.
dict = {}          # url -> link object: every URL discovered so far
brokenUrls = {}    # url -> link object: URLs whose download failed
visited = {}       # url -> 1: URLs already fetched

rootUrl = "http://winzip.com/win/en/index.htm"  # crawl entry point
baseUrl = "http://winzip.com"                   # prefix applied to relative hrefs
# Seed the frontier with the root page; link() args look like
# (referrer, count, url) based on usage in browse() — TODO confirm.
dict[rootUrl] = link(baseUrl, 0, rootUrl)


def browse(browseUrlObject):
    browseUrl = browseUrlObject.url
    try:
        url = URL(browseUrl)
        dom = DOM(url.download())
        visited[browseUrl] = 1
        try:
            for anchor in dom.by_tag("a"):
                url = generateUrl(baseUrl, browseUrl, anchor.href)
                if url != "":
                    try:
                        dict[url].count = dict[url].count + 1
                    except: