コード例 #1
0
ファイル: spider.py プロジェクト: zhanglaplace/mini_spider
def main():
    """
    Command-line entry point of the mini spider.

    Options (parsed from ``sys.argv[1:]`` with ``getopt``):
        -v         call ``version()`` and exit
        -h         print the help text and exit
        -c <file>  load the spider configuration from <file>

    When no option ends the run early, the crawl is started with
    ``spider.start_work()``.

    :return: nothing
    """
    # Logs go to ./log/spider.log and ./log/spider.log.wf, rotated daily and
    # kept for 7 days.
    log.init_log("./log/spider")
    spider = SpiderEngine.SpiderEngine()
    try:
        opts, _args = getopt.getopt(sys.argv[1:], "vhc:")
    except getopt.GetoptError as err:
        # Let logging do the formatting so it only happens when the record
        # is actually emitted.
        logging.error("get option error : %s.", err)
        return
    for opt, value in opts:
        if opt == "-v":
            version()
            return
        elif opt == "-h":
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("帮助信息:没有帮助^_^")
            return
        elif opt == "-c":
            spider.set_config_by_file(value)
        else:
            # Defensive branch: getopt rejects options missing from "vhc:".
            logging.error("unhandled option")
            print("unhandled option")
            return
    spider.start_work()
コード例 #2
0
    def run(self):
        """
        Worker loop of the thread: crawl URLs from the shared queue (BFS).

        Each iteration takes one URL leaf from ``self.queue``, sleeps for
        ``self.interval``, downloads the URL when ``self.need_download``
        says so, records it in ``self.total_set`` and, unless the next
        level would exceed ``self.max_depth``, enqueues every sub URL one
        level deeper.

        The loop ends as soon as no task can be fetched within
        ``self.timeout``.

        :return: no return
        """
        while True:
            try:
                url_leaf = self.queue.get(block=True, timeout=self.timeout)
            except Exception:
                # Any failure to fetch a task (presumably the queue's
                # "empty" error once the timeout expires) ends this worker.
                logging.info("this thread can not get a task. job done.")
                break
            try:
                # sleep interval
                time.sleep(self.interval)

                # download the url
                if self.need_download(url_leaf.url):
                    UrlParse.UrlParse.download(self.file_path, url_leaf.url)
                # The lock is released even if adding to the set raises.
                with self.lock:
                    self.total_set.add(url_leaf.url)

                # Only look up sub urls when they are shallow enough to be
                # queued; beyond max_depth the result would be thrown away.
                new_level = url_leaf.level + 1
                if new_level <= self.max_depth:
                    for url in UrlParse.UrlParse.get_urls(url_leaf.url):
                        self.queue.put(SpiderEngine.UrlLeaf(url, new_level))
            finally:
                # Mark the task finished only after its sub urls have been
                # queued, so queue.join() cannot return while work that may
                # still add tasks is in flight.
                self.queue.task_done()
コード例 #3
0
 def test_url_leaf(self):
     """
     Smoke-test UrlLeaf: build a leaf for url 'lalal' at level 0 and print it.
     :return: nothing
     """
     print(SpiderEngine.UrlLeaf('lalal', 0))
コード例 #4
0
 def test_set_config(self):
     """
     Smoke-test SpiderEngine.set_config with a fixed set of positional values.
     :return: nothing
     """
     settings = ("urls", "output", 1, 1, 1, "*\.(html|png|jpg|bmp)$", 1)
     engine = SpiderEngine.SpiderEngine()
     engine.set_config(*settings)
コード例 #5
0
 def test_engine(self):
     """
     End-to-end smoke test: configure an engine from "../spider_conf" and
     start it.
     :return: nothing
     """
     engine = SpiderEngine.SpiderEngine()
     engine.set_config_by_file("../spider_conf")
     engine.start_work()
コード例 #6
0
ファイル: example_brand.py プロジェクト: vv1133/spider_engine
        post_data = str(post_data).encode('gbk')
        # print(post_data)
        url = 'http://zzxh.zjsgat.gov.cn:6081/zjwwzzxh/tscreenquery.do?act=query&status=doQueryInputVehInfo'
        self.request_post(url,
                          post_data=post_data,
                          callback=self.get_login_status)

    def _get_car_plate(self, content):
        """
        Extract the plate strings from the button list in an HTML page.

        Every ``<li><input type="button" value="..."  /></li>`` element
        contributes the text of its ``value`` attribute.

        :param content: HTML text to scan
        :return: list of the captured ``value`` strings in document order;
                 empty when nothing matches
        """
        # [^"]* stops at the closing quote, so several buttons on the same
        # line yield one entry each instead of one merged match.
        pattern = r'<li><input type="button" value="([^"]*)"  /></li>'
        return re.findall(pattern, content)

    def get_login_status(self, response):
        """
        Callback that records whether the response shows a logged-in session.

        ``self.login_status`` becomes True when at least one plate can be
        extracted from ``response.text`` and False otherwise.

        :param response: object exposing the page body as ``response.text``
        """
        plates = self._get_car_plate(response.text)
        self.login_status = len(plates) > 0

    def check_login(self):
        """
        Report the login state last stored in ``self.login_status``.

        :return: the current value of ``self.login_status``
        """
        status = self.login_status
        return status

    def parse(self, response):
        """
        Write every plate found in ``response.text`` to ``self.out``, one per
        line, flush the stream, then pause for a random 3 to 11 seconds.

        :param response: object exposing the page body as ``response.text``
        """
        for plate in self._get_car_plate(response.text):
            self.out.write(plate + '\n')
        self.out.flush()

        # Random pause before returning, drawn uniformly from [3, 11] seconds.
        time.sleep(random.uniform(3, 11))


# Script entry: build the engine for MySpider over `urls` and start it.
# NOTE(review): the argument 1 is presumably a worker/thread count — confirm
# against SpiderEngine.start.
SpiderEngine(urls=urls, spider_cls=MySpider).start(1)