Example #1
class ErrorTry(object):
    def __init__(self, trymax=3, fname="errtry.bloom", errlink="errlink.txt"):
        self.trydict = {}
        self.bfilter = Filter(capacity=100000, fname=fname)
        self.trymax = trymax
        self.errlinkfd = open(errlink, "a+")

    def isTry(self, value):
        if not value:
            return False

        if self.bfilter.isExists(value):
            #print "aaaaaaaaaaaaa"
            return False

        trycnt = self.trydict.get(value, 0)
        if trycnt >= self.trymax:
            self.bfilter.add(value)
            self.trydict.pop(value)
            mylog.warn("maxtry: %s" % value)
            self.errlinkfd.write(value + "\n")
            return False

        # increment the retry counter (trycnt already holds the current count)
        self.trydict[value] = trycnt + 1

        return True

    def sync(self):
        self.bfilter.sync()
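
ErrorTry gates retries per URL: isTry returns True until a value has failed trymax times, after which the value is added to the bloom filter, logged as a warning, and written to the errlink file so it is never retried again. A minimal usage sketch follows, assuming the same Filter class used above is importable; the URL and the do_download helper are hypothetical stand-ins.

def do_download(url):
    # Hypothetical download helper that always fails, to exercise the retry path.
    return False

errtry = ErrorTry(trymax=3, fname="sketch_err.bloom", errlink="sketch_errlink.txt")
url = "http://example.com/flaky-page"

while errtry.isTry(url):      # becomes False once trymax failures are recorded
    if do_download(url):
        break                 # success: stop retrying
errtry.sync()                 # persist the bloom filter to disk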
Example #2
 def check(self, key):
     """
     Check whether the key is already present in the bloom filter.
     True means the data has already been added to the MySQL table,
     so no API call is needed; otherwise an API call is made.
     """
     return Filter.check(key)
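
The docstring describes a check-before-call pattern: a key already in the bloom filter has been stored in MySQL, so the API call is skipped; otherwise the record is fetched, stored, and the filter is updated (the update_filter counterpart appears in Example #6 below). A self-contained sketch of that flow, with the Filter-backed calls mocked by a plain set and fetch_from_api as a hypothetical stand-in:

seen = set()

def check(key):
    # stand-in for the bloom-filter check above
    return key in seen

def update_filter(key):
    # stand-in for Filter.update(key) from Example #6
    seen.add(key)

def fetch_from_api(key):
    # hypothetical API call
    return {"id": key}

def get_record(key):
    if check(key):                # already in the MySQL table: no API call
        return None
    data = fetch_from_api(key)    # otherwise hit the API...
    update_filter(key)            # ...and remember the key for next time
    return data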
Example #3
    def __init__(self):
        bfilter = Filter(capacity=100000000, fname="home_filter.bloom")
        errortry = ErrorTry(fname="home_err.bloom")
        super(CrawlerHome, self).__init__("home",
                                          bfilter,
                                          errortry,
                                          concurent=1)
        self.cache = set()

        self.tre = re.compile(">(.*?)<br>(.*?)<")
Example #4
 def __init__(self):
     bfilter = Filter(capacity=100000000, fname="csdn_filter.bloom")
     errortry = ErrorTry(fname="csdn_err.bloom")
     super(CrawlerCsdn, self).__init__("csdn",
                                       bfilter,
                                       errortry,
                                       concurent=10)
     self.linkfd = open("csdnlinks.txt", "a+")
     self.fd404 = open("csdn404.txt", "a+")
     self.proxies = {"http": "http://127.0.0.1:8080"}
     self.cache = set()
Example #5
    def __init__(self, name, fbloom=None, errtry=None, concurent=5):
        self.concurent = concurent
        self.tasks = Queue.Queue(maxsize=1000000)
        self.details = Queue.Queue(maxsize=10)
        self.tmpl = name
        self.ss = requests.session()
        self.proxies = {}

        day = time.strftime("%Y%m%d", time.localtime())
        if not fbloom:
            fn = day + "_filter.bloom"
            self.bfilter = Filter(fname=fn)
        else:
            self.bfilter = fbloom

        if not errtry:
            fn = day + "_errtry.bloom"
            self.errortry = ErrorTry(fname=fn)
        else:
            self.errortry = errtry
Example #6
 def update_filter(self, key):
     """
     After the movie has been added to the database,
     the filter is updated.
     """
     Filter.update(key)
Example #7
 def __init__(self, trymax=3, fname="errtry.bloom", errlink="errlink.txt"):
     self.trydict = {}
     self.bfilter = Filter(capacity=100000, fname=fname)
     self.trymax = trymax
     self.errlinkfd = open(errlink, "a+")
Example #8
class Crawler(object):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        #"Host": "www.baidu.com",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    }

    def __init__(self, name, fbloom=None, errtry=None, concurent=5):
        self.concurent = concurent
        self.tasks = Queue.Queue(maxsize=1000000)
        self.details = Queue.Queue(maxsize=10)
        self.tmpl = name
        self.ss = requests.session()
        self.proxies = {}

        day = time.strftime("%Y%m%d", time.localtime())
        if not fbloom:
            fn = day + "_filter.bloom"
            self.bfilter = Filter(fname=fn)
        else:
            self.bfilter = fbloom

        if not errtry:
            fn = day + "_errtry.bloom"
            self.errortry = ErrorTry(fname=fn)
        else:
            self.errortry = errtry

    def downloadFail(self, url, pagetype):
        if self.errortry.isTry(url):
            if pagetype == PAGE_TYPE_LIST:
                self.addListUrl(url)
            elif pagetype == PAGE_TYPE_DETAIL:
                self.addDetailUrl(url)

    # Subclasses of a real crawler must override this: seed self.tasks with start URLs.
    def initTasks(self):
        pass

    # Subclasses must override this.
    # Return True on success, False to retry the URL later.
    def procListPage(self, data, url):
        pass

    # Subclasses must override this.
    # Return True on success, False to retry the URL later.
    def procDetailPage(self, data, url):
        pass

    def addListUrl(self, url):
        if not url:
            return
        self.tasks.put(url)

    def addDetailUrl(self, url):
        if not url:
            return
        self.details.put(url)

    def crawlList(self, url):
        hurl = url
        try:
            r = self.ss.get(hurl,
                            verify=False,
                            headers=self.headers,
                            proxies=self.proxies)
        except Exception as e:
            mylog.error("crawl fail: %s %s" % (e, url))
            self.downloadFail(url, PAGE_TYPE_LIST)
            return
        if r.status_code != 200:
            mylog.error("crawl fail: %s %s" % (r.status_code, url))
            self.downloadFail(url, PAGE_TYPE_LIST)
            return
        if not self.procListPage(r.content, url):
            mylog.error("proc fail: %s", url)
            self.downloadFail(url, PAGE_TYPE_LIST)
        else:
            self.bfilter.add(url)

    def crawlDetail(self, url):
        hurl = url
        try:
            r = self.ss.get(hurl,
                            verify=False,
                            headers=self.headers,
                            proxies=self.proxies)
        except Exception as e:
            mylog.error("detail crawl fail: %s %s" % (e, url))
            self.downloadFail(url, PAGE_TYPE_DETAIL)
            return
        if r.status_code != 200:
            mylog.error("detail crawl fail: %s %s" % (r.status_code, url))
            self.downloadFail(url, PAGE_TYPE_DETAIL)
            return
        if not self.procDetailPage(r.content, url):
            mylog.error("detail proc fail: %s" % url)
            self.downloadFail(url, PAGE_TYPE_DETAIL)
        else:
            self.bfilter.add(url)

    def run(self):
        self.initTasks()
        while True:
            if self.tasks.empty():
                break
            gl = []
            for i in xrange(0, self.concurent):
                try:
                    url = self.tasks.get_nowait()
                except Queue.Empty:
                    break
                if not url:
                    break
                if self.bfilter.isExists(url):
                    continue
                g = gevent.spawn(self.crawlList, url)
                gl.append(g)
            if gl:
                gevent.joinall(gl)
            time.sleep(0.3)

            while True:
                if self.details.empty():
                    break
                dgl = []
                for i in xrange(0, self.concurent):
                    try:
                        url = self.details.get_nowait()
                    except Queue.Empty:
                        break
                    if not url:
                        break
                    if self.bfilter.isExists(url):
                        continue
                    g = gevent.spawn(self.crawlDetail, url)
                    dgl.append(g)
                if dgl:
                    gevent.joinall(dgl)
                time.sleep(0.3)
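
The base Crawler leaves initTasks, procListPage and procDetailPage to subclasses, in the same pattern as the CrawlerHome and CrawlerCsdn constructors in Examples #3 and #4. Below is a minimal hedged sketch of such a subclass: the seed URL, the link regex, and the bloom-filter file names are illustrative assumptions, and re plus the project's Filter, ErrorTry and Crawler classes are assumed to be importable.

import re

class CrawlerExample(Crawler):
    """Hypothetical subclass sketch showing the three hooks a real crawler fills in."""

    def __init__(self):
        bfilter = Filter(capacity=100000, fname="example_filter.bloom")
        errortry = ErrorTry(fname="example_err.bloom")
        super(CrawlerExample, self).__init__("example",
                                             bfilter,
                                             errortry,
                                             concurent=5)

    def initTasks(self):
        # Seed the list queue with a start page (illustrative URL).
        self.addListUrl("http://example.com/list?page=1")

    def procListPage(self, data, url):
        # Queue every detail link found on the list page.
        # Return True so the URL is marked done, False to retry it later.
        for link in re.findall(r'href="(/detail/\d+)"', data):
            self.addDetailUrl("http://example.com" + link)
        return True

    def procDetailPage(self, data, url):
        # Parse and persist the detail page; True means success.
        return bool(data)

if __name__ == "__main__":
    CrawlerExample().run()    # drives the list/detail loop from the base class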