Ejemplo n.º 1
0
 def __init__(self):
     """Prepare downloader, parser and the shared Redis work queue.

     If SALVER is truthy this instance acts as the seeder and pushes
     the Guazi listing pages (1..229) onto the queue in one call.
     """
     self.salver = SALVER
     self.r = redis_db.RedisQueue('new')
     self.html = HtmlDownloader()
     self.parser = HtmlParser()
     if self.salver:
         page_template = 'https://www.guazi.com/bj/buy/o{}/#bread'
         seed_urls = list(map(page_template.format, range(1, 230)))
         self.r.put(seed_urls)
Ejemplo n.º 2
0
 def __init__(self):
     """Prepare downloader, parser and the shared Redis work queue.

     If SALVER is truthy this instance acts as the seeder and pushes
     200 paginated Weibo mblog-list AJAX URLs onto the queue.
     """
     self.salver = SALVER
     self.r = redis_db.RedisQueue('new')
     self.html = HtmlDownloader()
     self.parser = HtmlParser()
     if self.salver:
         # Long endpoint kept as adjacent literals; only {page} varies.
         page_template = (
             'https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain'
             '=100505&from=page_100505_profile&wvr=6&mod=data&is_hot=1&pagebar=1&pl_name=Pl_Of'
             'ficial_MyProfileFeed__20&id=1005051713926427&script_uri=/p/1005051713926427/home&feed_typ'
             'e=0&page={}&pre_page=1&domain_op=100505&__rnd=1526295208644'
         )
         seed_urls = [page_template.format(page) for page in range(200)]
         self.r.put(seed_urls)
Ejemplo n.º 3
0
    def __init__(self):
        """Prepare downloader, parser and the shared Redis work queue.

        If SALVER is truthy this instance acts as the seeder: it pushes
        the first Taobao search page, then the paginated AJAX endpoints
        (Taobao offsets step by 44 items per page).
        """
        self.salver = SALVER
        self.r = redis_db.RedisQueue('new')
        self.html = HtmlDownloader()
        self.parser = HtmlParser()
        if self.salver:
            self.r.put('https://s.taobao.com/search?q={}&app=detailproduct&through=1'.format(KEYS))

            page_template = 'https://s.taobao.com/search?data-key=s&data-value=88&ajax=true&_ksTS=1526525628728_733&callback=jsonp734&q={}&imgfile=&ie=utf8&app=detailproduct&through=1&bcoffset=4&p4ppushleft=6%2C48&s={}'
            for page in range(1, PAGE_NUM):
                self.r.put(page_template.format(KEYS, page * 44))
Ejemplo n.º 4
0
    def __init__(self):
        """Connect to the crawl master and bind its shared queues."""
        # Expose the two queue accessors registered by the master process.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print('connect to %s...' % server_addr)

        # Address and authkey must match the master's BaseManager.
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'qiye')
        self.m.connect()

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()
Ejemplo n.º 5
0
class SpiderWorker():
    """Distributed-crawl worker.

    Connects to the master's BaseManager, pulls URLs from the shared
    task queue, downloads and parses them, and pushes newly discovered
    URLs back on the result queue.
    """

    def __init__(self):
        # Expose the two queue accessors registered by the master process.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_adrr = '127.0.0.1'
        print('connect to %s...' % server_adrr)

        # Address and authkey must match the master's BaseManager.
        self.m = BaseManager(address=(server_adrr, 8001), authkey=b'qiye')

        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()

    def crawl(self):
        """Consume task URLs until the 'end' sentinel arrives.

        Returns:
            None once the sentinel is seen.
        """
        while True:
            try:
                # BUG FIX: the original tested `self.task.empty` — the bound
                # method object itself, which is always truthy. Call it and
                # invert so we only fetch when work is actually queued.
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        return None
                    # NOTE(review): printing the encoded bytes shows the
                    # b'...' repr on Python 3 — kept to preserve behavior.
                    print('正在解析 %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.htmlparser.parser(url, content)
                    self.result.put({'new_urls': new_urls})
                    self.dataoutput.output_mongo({'data': data})
            except Exception as e:
                # Best-effort worker: report the error and keep running.
                print(e)
Ejemplo n.º 6
0
class Worker():
    """Taobao search crawler driven by a shared Redis queue.

    When SALVER is truthy this instance seeds the queue with the search
    URLs; `run()` then consumes the queue single- or multi-process.
    """

    def __init__(self):
        self.salver = SALVER
        self.r = redis_db.RedisQueue('new')
        self.html = HtmlDownloader()
        self.parser = HtmlParser()
        if self.salver:
            # Seed: first result page, then the paginated AJAX endpoints
            # (Taobao offsets step by 44 items per page).
            start_urls = 'https://s.taobao.com/search?q={}&app=detailproduct&through=1'.format(KEYS)
            self.r.put(start_urls)

            for i in range(1, PAGE_NUM):
                url = 'https://s.taobao.com/search?data-key=s&data-value=88&ajax=true&_ksTS=1526525628728_733&callback=jsonp734&q={}&imgfile=&ie=utf8&app=detailproduct&through=1&bcoffset=4&p4ppushleft=6%2C48&s={}'.format(
                    KEYS, i * 44)
                self.r.put(url)

    def start(self):
        """Consume URLs from the Redis queue until it is exhausted."""
        while True:
            url = self.r.get_wait()
            if url is None:  # BUG FIX: identity test for None, not ==
                break
            html = self.html.download(url)
            self.parser.parser(url, html)
            time.sleep(1)  # throttle between requests

    def process_start(self):
        """Fan the crawl out over PROCESS_NUM worker processes."""
        workers = []
        t1 = time.time()
        for _ in range(PROCESS_NUM):
            workers.append(Process(target=self.start, args=()))
        # BUG FIX: the original '线程% running...' % i has no conversion
        # after '%' — ' r' parsed as a space-flagged %r and mangled the
        # output ('线程0unning...'); use an explicit %s.
        for i, w in enumerate(workers):
            print('线程%s running...' % i)
            w.start()

        for i, w in enumerate(workers):
            w.join()
            # BUG FIX: same defect — '% c' parsed as %c (emits chr(i)).
            print('线程%s close...' % i)
        t2 = time.time()
        print(t2 - t1)

    def run(self):
        """Entry point: multi- or single-process per PROCESS_BOOL."""
        if PROCESS_BOOL:
            logger.debug('多线程启动')
            self.process_start()
        else:
            logger.debug('单线程启动')
            self.start()
Ejemplo n.º 7
0
class Worker():
    """Weibo mblog-list crawler driven by a shared Redis queue.

    When SALVER is truthy this instance seeds the queue with 200
    paginated AJAX URLs; `run()` then consumes the queue single- or
    multi-process.
    """

    def __init__(self):
        self.salver = SALVER
        self.r = redis_db.RedisQueue('new')
        self.html = HtmlDownloader()
        self.parser = HtmlParser()
        if self.salver:
            start_urls = [
                'https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain'
                '=100505&from=page_100505_profile&wvr=6&mod=data&is_hot=1&pagebar=1&pl_name=Pl_Of'
                'ficial_MyProfileFeed__20&id=1005051713926427&script_uri=/p/1005051713926427/home&feed_typ'
                'e=0&page={}&pre_page=1&domain_op=100505&__rnd=1526295208644'.
                format(i) for i in range(200)
            ]
            self.r.put(start_urls)

    def start(self):
        """Consume URLs from the Redis queue until it is exhausted."""
        while True:
            url = self.r.get_wait()
            if url is None:  # BUG FIX: identity test for None, not ==
                break
            html = self.html.download(url)
            print(html)
            self.parser._get_datas(url, html)

    def process_start(self):
        """Fan the crawl out over PROCESS_NUM worker processes."""
        workers = []
        t1 = time.time()
        for _ in range(PROCESS_NUM):
            workers.append(Process(target=self.start, args=()))
        # BUG FIX: the original '线程% running...' % i has no conversion
        # after '%' — ' r' parsed as a space-flagged %r and mangled the
        # output ('线程0unning...'); use an explicit %s.
        for i, w in enumerate(workers):
            print('线程%s running...' % i)
            w.start()

        for i, w in enumerate(workers):
            w.join()
            # BUG FIX: same defect — '% c' parsed as %c (emits chr(i)).
            print('线程%s close...' % i)
        t2 = time.time()
        print(t2 - t1)

    def run(self):
        """Entry point: multi- or single-process per PROCESS_BOOL."""
        if PROCESS_BOOL:
            logger.debug('多线程启动')
            self.process_start()
        else:
            logger.debug('单线程启动')
            self.start()
Ejemplo n.º 8
0
class Worker():
    """Guazi listing crawler driven by a shared Redis queue.

    When SALVER is truthy this instance seeds the queue with listing
    pages 1..229; `run()` then consumes the queue single- or
    multi-process.
    """

    def __init__(self):
        self.salver = SALVER
        self.r = redis_db.RedisQueue('new')
        self.html = HtmlDownloader()
        self.parser = HtmlParser()
        if self.salver:
            start_urls = [
                'https://www.guazi.com/bj/buy/o{}/#bread'.format(i)
                for i in range(1, 230)
            ]
            self.r.put(start_urls)

    def start(self):
        """Consume URLs from the Redis queue until it is exhausted."""
        while True:
            url = self.r.get_wait()
            if url is None:  # BUG FIX: identity test for None, not ==
                break
            html = self.html.download(url)
            self.parser.parser(url, html)

    def process_start(self):
        """Fan the crawl out over PROCESS_NUM worker processes."""
        workers = []
        t1 = time.time()
        for _ in range(PROCESS_NUM):
            workers.append(Process(target=self.start, args=()))
        # BUG FIX: the original '线程% running...' % i has no conversion
        # after '%' — ' r' parsed as a space-flagged %r and mangled the
        # output ('线程0unning...'); use an explicit %s.
        for i, w in enumerate(workers):
            print('线程%s running...' % i)
            w.start()

        for i, w in enumerate(workers):
            w.join()
            # BUG FIX: same defect — '% c' parsed as %c (emits chr(i)).
            print('线程%s close...' % i)
        t2 = time.time()
        print(t2 - t1)

    def run(self):
        """Entry point: multi- or single-process per PROCESS_BOOL."""
        if PROCESS_BOOL:
            logger.debug('多线程启动')
            self.process_start()
        else:
            logger.debug('单线程启动')
            self.start()