def __init__(self, name=None, **kwargs):
    """Create the database handles and the response parser for the spider.

    ``kwargs`` must contain ``mongo_uri`` (a ``KeyError`` is raised
    otherwise); the value is passed to ``DatabaseFactory.get_database`` for
    both the cache and the history database. ``name`` and every keyword
    argument are then forwarded unchanged to the parent initializer.
    """
    # Project modules are imported lazily, inside the method.
    from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

    mongo_uri = kwargs['mongo_uri']
    open_db = DatabaseFactory.get_database
    self._cache_db = open_db(DatabaseTypes.cache, mongo_uri)
    self._history_db = open_db(DatabaseTypes.history, mongo_uri)

    from cwaliexpress.parser.response_parser import ResponseParse

    self._crawl_parser = ResponseParse()

    super(AliExpresssSpider, self).__init__(name, **kwargs)
class AliExpresssBrowserSpider(scrapy.Spider):
    """Spider that drives a Firefox WebDriver session to browse AliExpress.

    The start page is loaded in the browser, a search is submitted through
    the crawl parser, and the items found on the resulting category page are
    yielded.
    """

    name = "aliexpress_browser"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/'
    ]

    def __init__(self, name=None, **kwargs):
        """Start Firefox and create the database handles and parser.

        ``kwargs`` must contain ``mongo_uri`` (``KeyError`` otherwise); it is
        passed to ``DatabaseFactory.get_database`` for the cache and history
        databases. ``name`` and all keyword arguments are forwarded to the
        parent initializer.
        """
        self.driver = webdriver.Firefox()

        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes
        self._cache_db = DatabaseFactory.get_database(
            DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(
            DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssBrowserSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, injecting ``mongo_uri`` from the crawler settings.

        Positional arguments are unpacked (``*args``) rather than passed as a
        single tuple, which previously ended up as the spider's ``name``.
        Caller-supplied keyword arguments are forwarded as well; the
        ``MONGODB_SERVER`` setting always provides ``mongo_uri``.
        """
        kwargs['mongo_uri'] = crawler.settings.get('MONGODB_SERVER')
        return super(AliExpresssBrowserSpider, cls).from_crawler(
            crawler, *args, **kwargs)

    def closed(self, reason):
        """Scrapy's shortcut hook for ``spider_closed``; releases the browser."""
        self.spider_closed(self)

    def spider_closed(self, spider):
        """Shut down the WebDriver session; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole browser session; close() would only
            # close the current window.
            driver.quit()
            self.driver = None

    def parse(self, response):
        """Run the search in the browser and yield the items that were found."""
        self.driver.get(response.url)
        # NOTE(review): time.sleep blocks Scrapy's event loop while waiting
        # for the page to settle; kept as in the original.
        time.sleep(4)

        self._crawl_parser.submit_search(self.driver)
        _href = self._crawl_parser.get_category_href(self.driver)

        self.driver.get(_href)
        time.sleep(4)

        items = self._crawl_parser.get_items_from_pagenate(self.driver)
        for item in items:
            yield item
class AliExpresssDebugSpider(scrapy.Spider):
    """Debug spider that fetches one fixed search page through a WebdriverRequest.

    The start URL is re-requested with ``WebdriverRequest`` and the response
    is handed to the crawl parser; the URL is then recorded in the history
    database.
    """

    name = "aliexpress_debug"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/af/macbook-pro.html?ltype=wholesale&d=y&origin=n&isViewCP=y&catId=0&initiative_id=SB_20160520233312&SearchText=macbook+pro',
    ]

    def __init__(self, name=None, **kwargs):
        """Create the database handles and the response parser.

        ``kwargs`` must contain ``mongo_uri`` (``KeyError`` otherwise); it is
        passed to ``DatabaseFactory.get_database`` for the cache and history
        databases. ``name`` and all keyword arguments are forwarded to the
        parent initializer.
        """
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes
        self._cache_db = DatabaseFactory.get_database(
            DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(
            DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, injecting ``mongo_uri`` from the crawler settings.

        Positional arguments are unpacked (``*args``) rather than passed as a
        single tuple, which previously ended up as the spider's ``name``.
        Caller-supplied keyword arguments are forwarded as well; the
        ``MONGODB_SERVER`` setting always provides ``mongo_uri``.
        """
        kwargs['mongo_uri'] = crawler.settings.get('MONGODB_SERVER')
        return super(AliExpresssDebugSpider, cls).from_crawler(
            crawler, *args, **kwargs)

    def parse(self, response):
        """Re-request the page via ``WebdriverRequest``."""
        # presumably so the page is rendered with JavaScript - confirm
        # against the WebdriverRequest implementation.
        yield WebdriverRequest(response.url, callback=self.parse_search_with_js)

    def parse_search_with_js(self, response):
        """Parse the response into an item, then record the URL in history."""
        item = self._crawl_parser.parse(response.url, response)
        yield item
        # Runs only once the consumer resumes the generator after the item.
        self._history_db.process_item(response.url)
class AliExpresssDebugSpider(scrapy.Spider):
    """Debug spider for a single hard-coded AliExpress search result page.

    ``parse`` re-issues the start URL as a ``WebdriverRequest``; the callback
    feeds the response to the crawl parser, yields the resulting item and
    stores the URL in the history database.
    """

    name = "aliexpress_debug"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/af/macbook-pro.html?ltype=wholesale&d=y&origin=n&isViewCP=y&catId=0&initiative_id=SB_20160520233312&SearchText=macbook+pro',
    ]

    def __init__(self, name=None, **kwargs):
        """Create the cache/history database handles and the parser.

        Requires ``kwargs['mongo_uri']`` (``KeyError`` if absent). ``name``
        and all keyword arguments are passed on to the parent initializer.
        """
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes
        self._cache_db = DatabaseFactory.get_database(
            DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(
            DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Instantiate the spider with ``mongo_uri`` read from the settings.

        ``args`` is unpacked when forwarded; passing the tuple itself made it
        the spider's ``name``. Extra keyword arguments are forwarded too,
        with ``mongo_uri`` always taken from ``MONGODB_SERVER``.
        """
        kwargs['mongo_uri'] = crawler.settings.get('MONGODB_SERVER')
        return super(AliExpresssDebugSpider, cls).from_crawler(
            crawler, *args, **kwargs)

    def parse(self, response):
        """Request the same URL again as a ``WebdriverRequest``."""
        yield WebdriverRequest(response.url, callback=self.parse_search_with_js)

    def parse_search_with_js(self, response):
        """Yield the parsed item and then log the URL to the history database."""
        item = self._crawl_parser.parse(response.url, response)
        yield item
        # Executed only after the yielded item has been consumed.
        self._history_db.process_item(response.url)
class AliExpresssBrowserDebugSpider(scrapy.Spider):
    """Debug variant of the browser-driven AliExpress spider.

    Loads the start page in Firefox, submits a search through the crawl
    parser, opens the category link and yields the items collected from it.
    """

    name = "aliexpress_browser_debug"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/'
    ]

    def __init__(self, name=None, **kwargs):
        """Start Firefox and create the database handles and parser.

        ``kwargs`` must contain ``mongo_uri`` (``KeyError`` otherwise); it is
        passed to ``DatabaseFactory.get_database`` for the cache and history
        databases. ``name`` and all keyword arguments are forwarded to the
        parent initializer.
        """
        self.driver = webdriver.Firefox()

        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes
        self._cache_db = DatabaseFactory.get_database(
            DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(
            DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssBrowserDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, injecting ``mongo_uri`` from the crawler settings.

        Positional arguments are unpacked (``*args``) rather than passed as a
        single tuple, which previously ended up as the spider's ``name``.
        Caller-supplied keyword arguments are forwarded as well; the
        ``MONGODB_SERVER`` setting always provides ``mongo_uri``.
        """
        kwargs['mongo_uri'] = crawler.settings.get('MONGODB_SERVER')
        return super(AliExpresssBrowserDebugSpider, cls).from_crawler(
            crawler, *args, **kwargs)

    def closed(self, reason):
        """Scrapy's shortcut hook for ``spider_closed``; releases the browser."""
        self.spider_closed(self)

    def spider_closed(self, spider):
        """Shut down the WebDriver session; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole browser session; close() would only
            # close the current window.
            driver.quit()
            self.driver = None

    def parse(self, response):
        """Run the search in the browser and yield the items that were found.

        The former ``while True: pass`` busy loop was removed: it never
        terminated, so the category page was never loaded.
        """
        self.driver.get(response.url)
        # NOTE(review): time.sleep blocks Scrapy's event loop while waiting
        # for the page to settle; kept as in the original.
        time.sleep(4)

        self._crawl_parser.submit_search(self.driver)
        _href = self._crawl_parser.get_category_href(self.driver)

        self.driver.get(_href)
        time.sleep(1)

        # Yield the items, as the non-debug browser spider does.
        # NOTE(review): the original discarded this result - confirm that
        # emitting items from the debug spider is wanted.
        items = self._crawl_parser.get_items_from_pagenate(self.driver)
        for item in items:
            yield item
def __init__(self, name=None, **kwargs):
    """Prepare the debug spider's database handles and response parser.

    Reads ``kwargs['mongo_uri']`` (raising ``KeyError`` when it is missing)
    and passes it to ``DatabaseFactory.get_database`` once for the cache
    database and once for the history database. ``name`` and the keyword
    arguments are then handed to the parent initializer as-is.
    """
    # Project modules are imported lazily, inside the method.
    from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

    mongo_uri = kwargs['mongo_uri']
    open_db = DatabaseFactory.get_database
    self._cache_db = open_db(DatabaseTypes.cache, mongo_uri)
    self._history_db = open_db(DatabaseTypes.history, mongo_uri)

    from cwaliexpress.parser.response_parser import ResponseParse

    self._crawl_parser = ResponseParse()

    super(AliExpresssDebugSpider, self).__init__(name, **kwargs)
class AliExpresssSpider(scrapy.Spider):
    """Spider that crawls AliExpress starting from a fixed search result page.

    ``parse`` hands the listing response to the crawl parser;
    ``parse_detail`` turns a detail response into an item and follows the
    links found on that page that are not yet in the history database.
    """

    name = "aliexpress"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/af/macbook-pro.html?ltype=wholesale&d=y&origin=n&isViewCP=y&catId=0&initiative_id=SB_20160520233312&SearchText=macbook+pro',
    ]

    def __init__(self, name=None, **kwargs):
        """Create the database handles and the response parser.

        ``kwargs`` must contain ``mongo_uri`` (``KeyError`` otherwise); it is
        passed to ``DatabaseFactory.get_database`` for the cache and history
        databases. ``name`` and all keyword arguments are forwarded to the
        parent initializer.
        """
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes
        self._cache_db = DatabaseFactory.get_database(
            DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(
            DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, injecting ``mongo_uri`` from the crawler settings.

        Positional arguments are unpacked (``*args``) rather than passed as a
        single tuple, which previously ended up as the spider's ``name``.
        Caller-supplied keyword arguments are forwarded as well; the
        ``MONGODB_SERVER`` setting always provides ``mongo_uri``.
        """
        kwargs['mongo_uri'] = crawler.settings.get('MONGODB_SERVER')
        return super(AliExpresssSpider, cls).from_crawler(
            crawler, *args, **kwargs)

    def parse(self, response):
        """Pass the listing response to the crawl parser."""
        # NOTE(review): the result of parse_paginate is discarded, so no
        # request ever reaches parse_detail from here - confirm whether it
        # should be returned or yielded.
        self._crawl_parser.parse_paginate(response.url, response)

    def parse_detail(self, response):
        """Yield the parsed item, then requests for linked detail pages.

        Links are taken from ``a.card-click-target`` anchors, resolved
        against the response URL, and skipped when the history database
        already contains them.
        """
        item = self._crawl_parser.parse(response.url, response)
        yield item

        # This class defines no ``parse_cluster``; only follow the cluster
        # link when a handler exists (e.g. on a subclass) instead of raising
        # AttributeError.
        cluster_callback = getattr(self, 'parse_cluster', None)
        if cluster_callback is not None:
            yield scrapy.Request(item['cluster'], cluster_callback)
        else:
            self.logger.debug(
                "No parse_cluster handler defined; skipping cluster link for %s",
                response.url)

        # The original referenced undefined ``abstractPath`` and ``title``
        # here. NOTE(review): deriving the URL from each anchor's href and
        # propagating the current request's 'type' meta is an assumption -
        # confirm the intended source of both values.
        select = '//a[@class="card-click-target"]'
        sel = Selector(response)
        navs = sel.xpath(select)
        link_type = response.meta.get('type')

        for href in navs.xpath('@href').extract():
            abstract_path = response.urljoin(href)
            if not self._history_db.check_exist(abstract_path):
                yield scrapy.Request(
                    abstract_path, self.parse_detail, meta={'type': link_type})
def get_Parser(parserType):
    """Return a new parser instance for ``parserType``.

    ``ParserTypes.browser`` gives a ``BrowseParser``, ``ParserTypes.response``
    gives a ``ResponseParse``; any other value gives ``None``.
    """
    # Both parser modules are imported on every call, before the type check.
    from cwaliexpress.parser.browse_parser import BrowseParser
    from cwaliexpress.parser.response_parser import ResponseParse

    if ParserTypes.browser == parserType:
        return BrowseParser()

    if ParserTypes.response == parserType:
        return ResponseParse()

    return None