Exemple #1
0
	def __init__(self):
		"""Configure the crawler from module-level settings and pick its collection.

		Each setting is handed to ``Crawl.__init__`` as a keyword argument, after
		which ``self.mongo_conn`` is indexed with ``'nguyenkim_product'`` and the
		result is kept on ``self.mongo_collection``.
		"""
		Crawl.__init__(
			self,
			site_name=SITE_NAME,
			init_url=INIT_URL,
			skip_url=SKIP_URL,
			redis_crawling_urls=REDIS_CRAWLING_URLS,
			redis_crawled_urls=REDIS_CRAWLED_URLS,
			redis_product_urls=REDIS_PRODUCT_URLS,
			product_pattern=PRODUCT_PATTERN,
			process_num=PROCESS_NUM,
			use_tor=USE_TOR
		)
		# Select the collection for this site.
		# NOTE(review): mongo_conn is presumably created by Crawl.__init__ -- confirm.
		self.mongo_collection = self.mongo_conn['nguyenkim_product']
Exemple #2
0
	def __init__(self):
		"""Configure the crawler, pick its collection and compile the page-link regex.

		Each module-level setting is handed to ``Crawl.__init__`` as a keyword
		argument. ``self.mongo_conn`` is then indexed with ``'tiki_product'`` and
		a pattern capturing the part of a link before ``?`` plus its ``p=<digits>``
		parameter is stored on ``self.page_link_format``.
		"""
		Crawl.__init__(
			self,
			site_name=SITE_NAME,
			init_url=INIT_URL,
			skip_url=SKIP_URL,
			redis_crawling_urls=REDIS_CRAWLING_URLS,
			redis_crawled_urls=REDIS_CRAWLED_URLS,
			redis_product_urls=REDIS_PRODUCT_URLS,
			product_pattern=PRODUCT_PATTERN,
			process_num=PROCESS_NUM,
			use_tor=USE_TOR
		)
		# Select the collection for this site.
		# NOTE(review): mongo_conn is presumably created by Crawl.__init__ -- confirm.
		self.mongo_collection = self.mongo_conn['tiki_product']

		# Group 1: everything before the query string; group 2: the "p=N" parameter.
		regex_flags = re.MULTILINE | re.DOTALL
		self.page_link_format = re.compile(r'(.*)\?.*(p=\d+).*', regex_flags)
Exemple #3
0
    def __init__(self):
        """Configure the crawler, pick its collection and compile the page-link regex.

        Each module-level setting is handed to ``Crawl.__init__`` as a keyword
        argument. ``self.mongo_conn`` is then indexed with ``'lazada_product'``
        and a pattern capturing the part of a link before ``?`` plus its
        ``page=<digits>`` parameter is stored on ``self.page_link_format``.
        """
        Crawl.__init__(
            self,
            site_name=SITE_NAME,
            init_url=INIT_URL,
            skip_url=SKIP_URL,
            redis_crawling_urls=REDIS_CRAWLING_URLS,
            redis_crawled_urls=REDIS_CRAWLED_URLS,
            redis_product_urls=REDIS_PRODUCT_URLS,
            product_pattern=PRODUCT_PATTERN,
            process_num=PROCESS_NUM,
            use_tor=USE_TOR
        )
        # Select the collection for this site.
        # NOTE(review): mongo_conn is presumably created by Crawl.__init__ -- confirm.
        self.mongo_collection = self.mongo_conn['lazada_product']

        # Group 1: everything before the query string; group 2: the "page=N" parameter.
        regex_flags = re.MULTILINE | re.DOTALL
        self.page_link_format = re.compile(r"(.*)\?.*(page=\d+).*", regex_flags)
Exemple #4
0
 def __init__(self):
     """Initialise the base crawler and pick the collection for this site.

     ``INIT_URL``, ``SKIP_URL`` and ``USE_TOR`` are passed positionally to
     ``Crawl.__init__``; ``self.mongo_conn`` is then indexed with
     ``'cdiscount_product'`` and stored on ``self.mongo_collection``.
     """
     Crawl.__init__(self, INIT_URL, SKIP_URL, USE_TOR)
     # Select the collection for this site.
     # NOTE(review): mongo_conn is presumably created by Crawl.__init__ -- confirm.
     collection_name = 'cdiscount_product'
     self.mongo_collection = self.mongo_conn[collection_name]