Example #1
    def crawl(self, url, fun, deep=5):
        page = TylCrawlerPage(url=url)
        fetcher = TylCrawlerFetcher()
        # pages[i] holds every page discovered at crawl depth i
        pages = [[page]]

        try:
            for i in range(deep):
                pn = []
                for p in pages[i]:
                    fetcher.fetch(p)
                    p.fetcher = fetcher
                    links = p.getLinks(self.host)
                    fun(p)

                    # optional politeness delay (needs "import time" at module level)
                    if hasattr(self, "sleepSec"):
                        time.sleep(self.sleepSec)

                    # deepest level reached: fetch only, queue no children
                    if i + 1 == deep:
                        continue
                    for link in links:

                        link = self.fixUrl(link)

                        if self.urlCrawled(link):
                            continue
                        self.crawledList.append(link)
                        pchild = TylCrawlerPage(url=link)
                        pchild.setReferer(p.url)
                        pchild.cookieJar = p.cookieJar
                        pchild.level = i+1
                        pn.append(pchild)
                        # print(link)
                pages.append(pn)
        except ValueError as e:
            print(e)
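
For orientation, a minimal sketch of how this crawl method might be driven. The TylCrawler class name, its constructor signature, and the handlePage callback are assumptions for illustration; only the crawl(url, fun, deep) interface and the optional sleepSec attribute appear in the code above.

from crawler import TylCrawler  # hypothetical module path

def handlePage(page):
    # callback invoked once per fetched page (the "fun" argument above)
    print("%s -> HTTP %s" % (page.url, page.code))

crawler = TylCrawler(host="www.example.com")  # assumed constructor
crawler.sleepSec = 1.0  # optional delay honored via hasattr() in crawl()
crawler.crawl("http://www.example.com/", handlePage, deep=3)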
Example #2
        # needs "import urllib2, cookielib" at module level (Python 2)
        page.fetched = True
        if not hasattr(page, "url"):
            return None
        try:
            req = urllib2.Request(page.url)
            # copy any request headers stored on the page object
            for name, value in page.headers.items():
                req.add_header(name, value)
            if page.cookieJar is None:
                page.cookieJar = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(page.cookieJar))
            r = opener.open(req, timeout=self.timeout)
            response = r.read()
            page.code = r.getcode()
            page.content = response

            # r.info().headers is a list of raw "Name: value" lines (Python 2);
            # split on the first colon only, since values such as dates
            # contain colons themselves
            for header in r.info().headers:
                headerKey, _, headerValue = header.partition(":")
                page.responseHeaders[headerKey.strip()] = headerValue.strip()

        except Exception as e:
            print(e)

if __name__ == "__main__":
    from page import TylCrawlerPage
    fetcher = TylFetcher()
    page = TylCrawlerPage(url="http://www.okbuy.com/")
    fetcher.fetch(page)
    print(page.code)
    print(page.getLinks())
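
Both examples target Python 2 (urllib2, cookielib). As a rough Python 3 counterpart of the fetch step, under the assumption that the page object carries the same url, headers, cookieJar, and responseHeaders attributes as in the original:

import urllib.request
import http.cookiejar

def fetch(page, timeout=10):
    req = urllib.request.Request(page.url)
    for name, value in page.headers.items():
        req.add_header(name, value)
    if page.cookieJar is None:
        page.cookieJar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(page.cookieJar))
    with opener.open(req, timeout=timeout) as r:
        page.code = r.getcode()
        page.content = r.read()
        # r.headers is an email.message.Message; items() yields
        # (name, value) pairs directly, no manual colon splitting needed
        page.responseHeaders = dict(r.headers.items())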