コード例 #1
0
 def __init__(self) -> None:
     """Build the connection handles and helpers this object works with.

     Every value comes from a project-level factory or setting that is
     defined outside this method.
     """
     # Handle returned by the project's get_redis_conn() helper.
     self.redis_conn = get_redis_conn()
     # Value of the 'URLS_CHECK_TASKS' setting
     # (presumably a queue name/key -- confirm against the settings module).
     self.check_queue = SETTINGS['URLS_CHECK_TASKS']
     self.mongo_conn = MongodbClass()
     self.mysql_conn = MySqlDBClass()
     self.logging_actor = update_logging()
     # NOTE(review): "CountIitem" looks like a misspelling of "CountItem";
     # it can only be renamed where the class is defined.
     self.counter = CountIitem()
コード例 #2
0
 def __init__(self) -> None:
     """Start a Chrome WebDriver session and prepare the instance state.

     The driver is launched with the path stored in
     ``SETTINGS['CHROME_PATH']`` and resized to 500x500.
     """
     # NOTE(review): this Options object is created but, as written, never
     # handed to webdriver.Chrome below, so it has no effect on the browser.
     self.chrome_options = Options()
     # Earlier variant kept for reference: headless mode with explicit options.
     #self.chrome_options.add_argument('--headless')
     #self.driver = webdriver.Chrome(chrome_options=self.chrome_options,executable_path=SETTINGS['CHROME_PATH'])
     # The setting is passed as the first positional argument
     # (presumably the chromedriver executable path -- confirm for the
     # installed Selenium version).
     self.driver = webdriver.Chrome(SETTINGS['CHROME_PATH'])
     self.driver.set_window_size(500,500)
     self.redis_conn = get_redis_conn()
     #self.driver = webdriver.Chrome(self.chrome_path)
     # Counter initialised to zero; it is not updated in this method.
     self.count = 0
コード例 #3
0
 def __init__(self,refined_totalpage=2) -> None:
     """Prepare per-spider resources keyed by ``self.name``.

     Args:
         refined_totalpage: Stored unchanged on the instance; defaults to 2.

     ``self.name`` is read here but not assigned in this method, so it must
     already be available (e.g. as a class attribute) before this runs.
     """
     # Starts empty; nothing is appended in this method.
     self.loss_urls = []
     self.refined_totalpage = refined_totalpage
     self.redis_conn = get_redis_conn()
     # Per-spider JSON file: <BASE_JSONFILE_PATH>/<name>.json
     jsonfile = os.path.join(BASE_JSONFILE_PATH,'{}.json'.format(self.name))
     # Whatever Json2XPath(...).get_xpath() returns for that file.
     self.xp = Json2XPath(jsonfile).get_xpath()
     self.check_queue = SETTINGS['URLS_CHECK_TASKS']
     self.mongo_conn = MongodbClass()
     self.mysql_conn = MySqlDBClass()
     self.logging_actor = update_logging()
     # Entry of all_modify_func registered under this spider's name; the
     # lookup fails if no entry exists for self.name.
     self.func_moc = all_modify_func[self.name]
コード例 #4
0
 def __init__(self) -> None:
     """Initialise the cookie-related settings and bookkeeping fields."""
     self.redis_conn = get_redis_conn()
     # Values of the 'COOKIE_WORK' and 'COOKIE_CHECK' settings
     # (presumably queue names/keys -- confirm).
     self.cookie_w_queue = SETTINGS['COOKIE_WORK']
     self.cookie_c_queue = SETTINGS['COOKIE_CHECK']
     self.cookie_batch_size = 6
     # The ten indexes 0..9.
     self.cookie_indexes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     # Both start unset; the call that would populate them is disabled below.
     self.remain_indexes = None
     self.random_indexes = None
     #self.get_ini_random_indexes()
     # Naive local timestamp taken at construction time.
     self.calib_time = datetime.today()
     # NOTE(review): unit is not stated here (presumably seconds -- confirm).
     self.interval = 180
コード例 #5
0
 def __init__(self, spider_name, check_date=None, timeout=100) -> None:
     """Set up the state for one spider, looked up by name.

     Args:
         spider_name: Key into ``all_tasks``; also stored as
             ``spider_name`` and ``logger_filename``.
         check_date: Stored unchanged; defaults to ``None``.
         timeout: NOTE(review): accepted but not used in this method.
     """
     # The lookup fails if all_tasks has no entry for spider_name.
     self.tasks = all_tasks[spider_name]
     # NOTE(review): "reflesh" is presumably "refresh"; the attribute name
     # is kept because other code may read it.
     self.reflesh_urls = []
     self.limitpages = 3
     self.conn = get_redis_conn()
     self.mysql_conn = MySqlDBClass()
     self.outdate = 0  # Used to judge whether any expired date exists; if so, the real-time crawl range is valid
     self.logger_filename = spider_name
     self.check_date = check_date
     self.spider_name = spider_name
     # Debug output written to stdout on every construction.
     print('UpdateFilterClass mysql', self.mysql_conn)
コード例 #6
0
    def __init__(self) -> None:
        """Initialise the parent class, then the queue settings and workers."""
        super().__init__()

        self.redis_conn = get_redis_conn()
        #self.to_mysql = MongoDB_To_MySQL(self.name)
        self.redis_batch_size = 100
        # Values of the 'URLS_WORK_TASKS' and 'URLS_CHECK_TASKS' settings
        # (presumably queue names/keys -- confirm).
        self.work_queue = SETTINGS['URLS_WORK_TASKS']
        self.check_queue = SETTINGS['URLS_CHECK_TASKS']
        #self.sche_updator = UpdateFilterClass(self.name)

        self.pre_suf = None
        # NOTE(review): `workers` is neither a parameter nor a local; it must
        # resolve from an enclosing/module scope. dict() builds a new dict
        # from it, so this instance does not share the original container.
        self.workers = dict(workers)
コード例 #7
0
 def __init__(self, check_date=None, timeout=100) -> None:
     """Create the database handles and the hard-coded name lists.

     Args:
         check_date: Stored unchanged; defaults to ``None``.
         timeout: NOTE(review): accepted but not used in this method.
     """
     self.conn = get_redis_conn()
     self.mysql_conn = MySqlDBClass()
     self.outdate = 0  # Used to judge whether any expired date exists; if so, the real-time crawl range is valid
     self.check_date = check_date
     self.mongo_conn = MongodbClass()
     # Hard-coded names (presumably the spiders that need Selenium --
     # confirm against where sele_spiders is read).
     self.sele_spiders = ['gongshu', 'longyou', 'zhejiangzfcg']
     #print('UpdateFilterClass mysql',self.mysql_conn)
     # Hard-coded list of twelve identifiers (presumably region/spider
     # names -- confirm against where largecities is read).
     self.largecities = [
         'zhejiang', 'hangzhou', 'huzhou', 'jiaxing', 'jinhua', 'lishui',
         'ningboshi', 'quzhou', 'shaoxing', 'taizhou', 'wenzhou', 'zhoushan'
     ]
コード例 #8
0
 def __init__(self) -> None:
     """Store the handle returned by ``get_redis_conn()`` on the instance."""
     self.redis_conn = get_redis_conn()
コード例 #9
0
 def __init__(self) -> None:
     """Create a ``MongodbClass`` instance and fetch the Redis handle."""
     self.mongo_conn = MongodbClass()
     self.redis_conn = get_redis_conn()
コード例 #10
0
 def __init__(self) -> None:
     """Prepare the Redis handle and the per-spider modify function.

     ``self.name`` is read here but not assigned in this method, so it must
     already be available (e.g. as a class attribute) before this runs.
     """
     self.redis_conn = get_redis_conn()
     # Entry of all_modify_func registered under this spider's name; the
     # lookup fails if no entry exists for self.name.
     self.func_moc = all_modify_func[self.name]
     # Value of the 'URLS_CHECK_TASKS' setting
     # (presumably a queue name/key -- confirm).
     self.check_queue = SETTINGS['URLS_CHECK_TASKS']
     self.refined_totalpage = 2
コード例 #11
0
 def _spider_opened(self,spider) -> None:
     """Create the MongoDB and Redis handles.

     Presumably connected as a handler for a spider-opened signal; confirm
     at the place where this method is registered.

     Args:
         spider: Not used in this method.
     """
     self.mongo_instance = MongodbClass()
     self.redis_conn = get_redis_conn()
コード例 #12
0
 def _spider_opened(self,spider) -> None:
     """Create the storage handles and request the 't_zhaobiao' table.

     Presumably connected as a handler for a spider-opened signal; confirm
     at the place where this method is registered.

     Args:
         spider: Not used in this method.
     """
     self.mysql_instance = MySqlDBClass()
     self.mongo_instance = MongodbClass()
     # Called on every invocation; whether an existing table is tolerated
     # depends on MySqlDBClass.create_table, which is not visible here.
     self.mysql_instance.create_table('t_zhaobiao')
     self.redis_conn = get_redis_conn()
     # NOTE(review): "CountIitem" looks like a misspelling of "CountItem";
     # it can only be renamed where the class is defined.
     self.counter = CountIitem()
コード例 #13
0
 def __init__(self, timeout=300) -> None:
     """Fetch the Redis handle and read the two data-file path settings.

     Args:
         timeout: NOTE(review): accepted but not used in this method.
     """
     self.conn = get_redis_conn()
     # Values of the 'CDATA_FILE_PATH' and 'EDATA_FILE_PATH' settings; the
     # second is stored as the error file path.
     self.filepath = SETTINGS['CDATA_FILE_PATH']
     self.error_filepath = SETTINGS['EDATA_FILE_PATH']