Example 1
 def save_url(self):
     try:
         self.save_file_comment(st.NEW_URL_PATH, self.new_urls)
         self.save_file_comment(st.OLD_URL_PATH, self.old_urls)
     except Exception as f:
         data = '[' + st.URL_NAME + ']' + str(f)
         logger.write_log(level='error', data=data)
Example 2
 def save_file_comment(self, path, data):
     mes = '[%s] saving......' % path
     logger.write_log(level='info', data=mes)
     if isinstance(data, set):
         data = list(data)
     with open(path, 'w') as x:
         json.dump(data, x)
         logger.write_log(level='info', data="file saved......")
Example 3
 def write_head(self):
     try:
         with open(self.filepath, 'w', encoding='utf-8', newline='') as f:  # newline='' keeps csv from adding blank rows on Windows
             writer = csv.writer(f)
             title = [k for k, v in self.data[0].items()]
             writer.writerow(title)
         logger.write_log(level='info', data="header row written......")
     except Exception as e:
         data = '[' + settings.SAVE_NAME + ']' + str(e)
         logger.write_log(level='error', data=data)
Example 4
 def add_url(self, url):
     if url:
         md5 = hashlib.md5()
         md5.update(url.encode())
         hash_url = md5.hexdigest()
         if url not in self.new_urls and hash_url not in self.old_urls:
             mes = "[%s]链接添加......" % url
             logger.write_log(level='run', data=mes)
             self.this_url_counts += 1
             self.new_urls.add(url)
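
Example 9 below calls have_new and get_new_url on url_controler, but those members are not among these examples. The sketch that follows is a hypothetical reconstruction based only on the attributes touched by add_url above (new_urls as a set of pending URLs, old_urls as a set of MD5 digests, this_url_counts); it is not the project's actual code.

import hashlib

class UrlManagerSketch:
    def __init__(self):
        self.new_urls = set()        # pending URLs, filled by add_url
        self.old_urls = set()        # MD5 digests of URLs already handed out
        self.this_url_counts = 0

    @property
    def have_new(self):
        # True while there are still URLs waiting to be crawled
        return bool(self.new_urls)

    def get_new_url(self):
        # Hand out one pending URL and remember its fingerprint so that
        # add_url will reject the same URL if it is ever seen again.
        url = self.new_urls.pop()
        self.old_urls.add(hashlib.md5(url.encode()).hexdigest())
        return url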
Example 5
    def save(self):
        with open(self.filepath, 'a', encoding='utf-8', newline='') as f:  # newline='' keeps csv from adding blank rows on Windows
            writer = csv.writer(f)
            try:
                for book_dict in self.data:
                    info_list = [v for k, v in book_dict.items()]
                    writer.writerow(info_list)
                logger.write_log(level='info', data="file save complete......")
                self.data = []

            except Exception as e:
                data = '[' + settings.SAVE_NAME + ']' + str(e)
                logger.write_log(level='error', data=data)
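
Examples 3 and 5 split the CSV output into two steps: write_head truncates the file and writes a header row built from the keys of the first buffered record, while save appends one row per dict and then clears self.data. The write_data method called in Example 6 is not shown among these examples; the self-contained sketch below is only an assumption about how the three pieces could fit together.

import csv

class BookSaverSketch:
    def __init__(self, filepath):
        self.filepath = filepath
        self.data = []               # buffer of parsed dicts

    def write_head(self):
        # Header row from the first buffered record (compare Example 3).
        with open(self.filepath, 'w', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(list(self.data[0].keys()))

    def write_data(self, records):
        # Assumed behaviour of the write_data call in Example 6:
        # accumulate records until save() flushes them.
        self.data.extend(records)

    def save(self):
        # Append one row per buffered dict, then clear the buffer (compare Example 5).
        with open(self.filepath, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for book in self.data:
                writer.writerow(list(book.values()))
        self.data = []


if __name__ == '__main__':
    saver = BookSaverSketch('books.csv')
    saver.write_data([{'title': 'Book A', 'price': '12.0'},
                      {'title': 'Book B', 'price': '9.5'}])
    saver.write_head()               # header from the first buffered record
    saver.save()                     # append the rows, clear the buffer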
Example 6
    def save_pro(self):
        logger.write_log(level='info', data='data storage process started......')
        while True:
            try:
                if not self.save_q.empty():
                    data = self.save_q.get()
                    time_controler.reset_time(st.SAVE_NAME)
                    if data == 'end':
                        data_controler.save()
                        logger.write_log(level='info', data='storage process finished......')
                        return
                    data_controler.write_data(data)
                else:
                    logger.write_log(level='info', data='waiting for data......')
                    time.sleep(1)
                    if time_controler.judge_timeout(st.SAVE_NAME):
                        logger.write_log(level='info', data='exiting data storage process......')
                        logger.save_all()
                        return

            except Exception as e:
                data = '[' + st.SAVE_NAME + ']' + str(e)
                logger.write_log(level='error', data=data)
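
The loop above, like Examples 8 and 9, leans on time_controler.reset_time and time_controler.judge_timeout to decide when an idle loop should shut down. That helper is not shown among these examples; the minimal sketch below is an assumed implementation of the same reset/judge interface, not the project's code.

import time

class TimeControlerSketch:
    def __init__(self, timeout=30):
        self.timeout = timeout       # assumed idle limit in seconds
        self.last_seen = {}          # loop name -> timestamp of its last activity

    def reset_time(self, name):
        # Called whenever the loop identified by `name` does useful work.
        self.last_seen[name] = time.time()

    def judge_timeout(self, name):
        # True once `name` has been idle for longer than the allowed timeout.
        started = self.last_seen.setdefault(name, time.time())
        return time.time() - started > self.timeout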
Example 7
 def get_file_comment(self, path):
     logger.write_log(level='info', data="loading file......")
     try:
         with open(path, 'r') as x:
             comment = json.load(x)
             logger.write_log(level='info', data="file loaded successfully......")
             if isinstance(comment, list):
                 comment = set(comment)
             return comment
     except Exception as e:
         data = '[' + st.URL_NAME + ']' + str(e)
         logger.write_log(level='error', data=data)
     return set()
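
get_file_comment is the loading counterpart of save_file_comment from Example 2: json cannot serialize a set, so both methods shuttle the URL collections through a list and convert back to a set on load, falling back to an empty set when the file is missing or unreadable. A stand-alone round trip of that pattern, with a made-up file name and data:

import json

urls = {'http://example.com/a', 'http://example.com/b'}

with open('new_urls.json', 'w') as fh:        # core of save_file_comment (Example 2)
    json.dump(list(urls), fh)                 # sets are stored as JSON lists

with open('new_urls.json', 'r') as fh:        # core of get_file_comment (Example 7)
    restored = set(json.load(fh))             # ...and turned back into a set

assert restored == urls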
Example 8
    def get_comment_pro(self):
        logger.write_log(level='info', data='data extraction process started......')
        while True:
            try:
                if not self.res_q.empty():
                    comment = self.res_q.get()
                    logger.write_log(level='info', data='data received......')
                    if comment == 'end':
                        self.save_q.put('end')
                        logger.write_log(level='info', data='notified storage process to finish......')
                        return
                    new_url = comment.get('url')
                    data = comment.get('data')
                    if new_url:
                        self.conn_q.put(new_url)
                    if data:
                        self.save_q.put(data)
                    time_controler.reset_time(st.GET_NAME)
                else:
                    logger.write_log(level='info', data='waiting for crawler output......')
                    time.sleep(1)
                    if time_controler.judge_timeout(st.GET_NAME):
                        logger.write_log(level='info', data='exiting data extraction process......')
                        logger.save_all()
                        return

            except Exception as e:
                data = '[' + st.GET_NAME + ']' + str(e)
                logger.write_log(level='error', data=data)
Example 9
    def url_control_pro(self, start_url):
        logger.write_log(level='info', data="URL manager process started......")
        if not url_controler.have_new:
            url_controler.add_url(start_url)
        while True:
            # Active mode (hand out URLs)
            if not url_controler.diode:
                while url_controler.have_new:
                    new_url = url_controler.get_new_url()
                    self.url_q.put(new_url)
                    if url_controler.this_url_counts > st.NEED_URL_COUNTS:
                        logger.write_log(level='info', data='already crawled %s URLs......' % st.NEED_URL_COUNTS)
                        url_controler.diode = True
                        self.url_q.put('end')
                        logger.write_log(level='info', data="URL manager entering sleep mode......")
                        break  # stop handing out URLs once the crawl limit has been reached

            try:
                if not self.conn_q.empty():
                    new_urls = self.conn_q.get()
                    url_controler.add_all_urls(new_urls)
                    time_controler.reset_time(st.URL_NAME)
                else:
                    logger.write_log(level='info', data='waiting for extracted URLs......')
                    time.sleep(1)
                    if time_controler.judge_timeout(st.URL_NAME):
                        url_controler.save_url()
                        logger.write_log(level='info', data='exiting URL manager process......')
                        logger.save_all()
                        return
            except Exception as e:
                data = '[' + st.URL_NAME + ']' + str(e)
                logger.write_log(level='error', data=data)
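
Examples 6, 8 and 9 are long-running loops that talk to each other only through queues (url_q, conn_q, res_q, save_q), so they can run as separate processes. These examples do not show how the processes are launched; the sketch below is one possible wiring with multiprocessing, where controller stands for an object exposing the three methods and crawler_pro for the spider loop (not shown) that reads url_q and feeds res_q. All of this is an assumption, not the project's actual entry point.

from multiprocessing import Process

def main(controller, crawler_pro, start_url):
    procs = [
        Process(target=controller.url_control_pro, args=(start_url,)),  # Example 9
        Process(target=crawler_pro),                                    # url_q -> res_q (not shown here)
        Process(target=controller.get_comment_pro),                     # Example 8
        Process(target=controller.save_pro),                            # Example 6
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()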