import sys
import getopt
import logging

import log
import SpiderEngine


def main():
    """
    the main method to run mini spider
    """
    # Logs go to ./log/spider.log and ./log/spider.log.wf,
    # rotated daily and kept for 7 days.
    log.init_log("./log/spider")
    spider = SpiderEngine.SpiderEngine()
    try:
        opts, args = getopt.getopt(sys.argv[1:], "vhc:")
    except getopt.GetoptError as err:
        logging.error("get option error: %s." % err)
        return
    for o, a in opts:
        if o == "-v":
            version()
            return
        elif o == "-h":
            print("Help: no help available ^_^")
            return
        elif o == "-c":
            spider.set_config_by_file(a)
        else:
            logging.error("unhandled option")
            print("unhandled option")
            return
    spider.start_work()
    return
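# The project-specific log.init_log() is not shown in this excerpt. Below is
# a minimal sketch of how it could be implemented with the standard library,
# assuming the behavior described in the comment above (daily rotation, 7
# backups, and a separate ".wf" file for WARNING and above); the real module
# may differ.
import logging
import logging.handlers
import os


def init_log(log_path, level=logging.INFO):
    """Set up a daily-rotated log at log_path + '.log' and a
    warning-and-above log at log_path + '.log.wf' (assumed layout)."""
    formatter = logging.Formatter(
        "%(levelname)s: %(asctime)s %(filename)s:%(lineno)d * %(message)s")
    log_dir = os.path.dirname(log_path)
    if log_dir and not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger()
    logger.setLevel(level)

    # main log: everything at `level` and above, rotated daily, 7 backups
    handler = logging.handlers.TimedRotatingFileHandler(
        log_path + ".log", when="D", backupCount=7)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ".wf" log: warnings and errors only, same rotation policy
    wf_handler = logging.handlers.TimedRotatingFileHandler(
        log_path + ".log.wf", when="D", backupCount=7)
    wf_handler.setLevel(logging.WARNING)
    wf_handler.setFormatter(formatter)
    logger.addHandler(wf_handler)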
def run(self):
    """
    Run the thread: get a task from the queue and add its sub-URLs
    back into the queue (BFS traversal).
    :return: no return
    """
    while True:
        try:
            url_leaf = self.queue.get(block=True, timeout=self.timeout)
        except Exception:
            logging.info("no task available within timeout; worker done.")
            break
        self.queue.task_done()
        # sleep for the configured crawl interval between requests
        time.sleep(self.interval)
        # download the url if it matches the target pattern
        if self.need_download(url_leaf.url):
            UrlParse.UrlParse.download(self.file_path, url_leaf.url)
        self.lock.acquire()
        self.total_set.add(url_leaf.url)
        self.lock.release()
        # extract sub-URLs and enqueue them one level deeper
        sub_urls = UrlParse.UrlParse.get_urls(url_leaf.url)
        new_level = url_leaf.level + 1
        if new_level > self.max_depth:
            continue
        for url in sub_urls:
            url_leaf_temp = SpiderEngine.UrlLeaf(url, new_level)
            self.queue.put(url_leaf_temp)
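# need_download() is referenced above but not shown. A minimal sketch of one
# plausible implementation: match the URL against the configured target-file
# regex (e.g. the ".*\.(html|png|jpg|bmp)$" pattern from the config) and skip
# URLs already recorded in total_set. The dedup check and attribute names are
# assumptions; the real method may differ.
import re


def need_download(self, url):
    """Return True if url matches the target pattern and is not yet crawled
    (assumed contract; meant as a method of the worker thread class)."""
    with self.lock:
        if url in self.total_set:
            return False
    return re.match(self.target_url, url) is not None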
def test_url_leaf(self):
    """
    test the UrlLeaf class
    :return: nothing
    """
    leaf = SpiderEngine.UrlLeaf('lalal', 0)
    print(leaf)
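# UrlLeaf itself is not included in this excerpt. Judging from its usage
# (UrlLeaf(url, level), leaf.url, leaf.level), it is most likely a plain
# holder for a URL and its BFS depth; this sketch is an assumption.
class UrlLeaf(object):
    """A URL together with its depth in the BFS crawl tree (assumed)."""

    def __init__(self, url, level):
        self.url = url
        self.level = level

    def __str__(self):
        return "%s (level %d)" % (self.url, self.level)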
def test_set_config(self):
    """
    test the engine's setting config
    :return: nothing
    """
    a = SpiderEngine.SpiderEngine()
    a.set_config("urls", "output", 1, 1, 1, r".*\.(html|png|jpg|bmp)$", 1)
def test_engine(self):
    """
    test that the engine works end to end
    :return: nothing
    """
    a = SpiderEngine.SpiderEngine()
    a.set_config_by_file("../spider_conf")
    a.start_work()
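# The spider_conf file read above is not included here. Based on the
# positional arguments passed to set_config() ("urls", "output", 1, 1, 1,
# pattern, 1), it plausibly maps to an INI-style file along these lines;
# the exact key names are assumptions, not confirmed by this excerpt.
#
# [spider]
# url_list_file: ./urls
# output_directory: ./output
# max_depth: 1
# crawl_interval: 1
# crawl_timeout: 1
# target_url: .*\.(html|png|jpg|bmp)$
# thread_count: 1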
        post_data = str(post_data).encode('gbk')
        url = ('http://zzxh.zjsgat.gov.cn:6081/zjwwzzxh/tscreenquery.do'
               '?act=query&status=doQueryInputVehInfo')
        self.request_post(url, post_data=post_data,
                          callback=self.get_login_status)

    def _get_car_plate(self, content):
        za = r'<li><input type="button" value="(.*)" /></li>'
        value = re.findall(za, content)
        return value

    def get_login_status(self, response):
        plate_cnt = len(self._get_car_plate(response.text))
        self.login_status = plate_cnt > 0

    def check_login(self):
        return self.login_status

    def parse(self, response):
        plates = self._get_car_plate(response.text)
        for plate in plates:
            self.out.write(plate + '\n')
            self.out.flush()
        # wait a random 3-11 seconds between requests to avoid being blocked
        delay = random.uniform(3, 11)
        time.sleep(delay)


SpiderEngine(urls=urls, spider_cls=MySpider).start(1)
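# request_post() is provided by the spider framework and not shown here. A
# minimal sketch of the assumed contract, implemented with the requests
# library: POST the body, then hand the response to the callback. The method
# name, parameters, and the SpiderEngine(urls=..., spider_cls=...) driver
# above are taken from usage; the real implementation may differ.
import requests


def request_post(self, url, post_data=None, callback=None, timeout=10):
    """POST post_data to url and pass the response to callback (assumed)."""
    response = requests.post(url, data=post_data, timeout=timeout)
    if callback is not None:
        callback(response)
    return response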