Example 1
    # __init__ of AliExpresssSpider; the full class appears in Example 7 below.
    def __init__(self, name=None, **kwargs):
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

        self._cache_db = DatabaseFactory.get_database(DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssSpider, self).__init__(name, **kwargs)
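
# None of these snippets show their module-level imports. Judging from the names
# they use (scrapy.Spider, Selector, time.sleep, webdriver.Firefox,
# WebdriverRequest), the shared module header would look roughly like the sketch
# below; the scrapy_webdriver import path is an assumption, since only the class
# name appears in the snippets.
import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from scrapy_webdriver.http import WebdriverRequest  # assumed third-party import path
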
class AliExpresssBrowserSpider(scrapy.Spider):
    name = "aliexpress_browser"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/'
    ]

    def __init__(self, name=None, **kwargs):
        self.driver = webdriver.Firefox()

        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

        self._cache_db = DatabaseFactory.get_database(DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssBrowserSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Unpack positional args and forward the MongoDB URI from the project
        # settings to __init__ as the 'mongo_uri' keyword argument.
        return super(AliExpresssBrowserSpider, cls).from_crawler(crawler,
                                                                 *args,
                                                                 mongo_uri=crawler.settings.get('MONGODB_SERVER')
                                                                 )

    def spider_closed(self, spider):
        # Note: this handler only runs if it is connected to the spider_closed
        # signal elsewhere; Scrapy does not call it automatically by this name.
        self.driver.close()

    def parse(self, response):
        self.driver.get(response.url)
        time.sleep(4)

        self._crawl_parser.submit_search(self.driver)

        _href = self._crawl_parser.get_category_href(self.driver)
        _total_count = 100
        _page_number = 1
        # while True:
        #     pass
        self.driver.get(_href)
        time.sleep(4)

        items = self._crawl_parser.get_items_from_pagenate(self.driver)

        for item in items:
            yield item
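
# How these spiders receive their MongoDB URI: from_crawler() reads the
# MONGODB_SERVER setting and forwards it to __init__ as the 'mongo_uri' keyword
# argument, so a run needs that setting defined, e.g. (hypothetical URI):
#
#   scrapy crawl aliexpress_browser -s MONGODB_SERVER=mongodb://localhost:27017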
class AliExpresssDebugSpider(scrapy.Spider):
    name = "aliexpress_debug"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/af/macbook-pro.html?ltype=wholesale&d=y&origin=n&isViewCP=y&catId=0&initiative_id=SB_20160520233312&SearchText=macbook+pro',
    ]

    def __init__(self, name=None, **kwargs):
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

        self._cache_db = DatabaseFactory.get_database(DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super(AliExpresssDebugSpider, cls).from_crawler(crawler,
                                                         *args,
                                                         mongo_uri=crawler.settings.get('MONGODB_SERVER')
                                                         )

    def parse(self, response):
        yield WebdriverRequest(response.url, callback=self.parse_search_with_js)

    def parse_search_with_js(self, response):
        item = self._crawl_parser.parse(response.url, response)
        yield item

        self._history_db.process_item(response.url)
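
# _history_db is used as a simple seen-URL store: process_item() records a crawled
# URL, and check_exist() (used in Example 7 below) tests for it before scheduling a
# new request. A minimal Mongo-backed sketch of that interface, assuming pymongo
# and a hypothetical database/collection name (the real cwaliexpress class is not
# shown in this listing):
from pymongo import MongoClient


class HistoryDatabaseSketch(object):
    def __init__(self, mongo_uri):
        self._coll = MongoClient(mongo_uri)['cwaliexpress']['history']

    def process_item(self, url):
        # upsert so repeated crawls of the same URL keep a single record
        self._coll.update_one({'url': url}, {'$set': {'url': url}}, upsert=True)

    def check_exist(self, url):
        return self._coll.find_one({'url': url}) is not None
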
class AliExpresssBrowserDebugSpider(scrapy.Spider):
    name = "aliexpress_browser_debug"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/'
    ]

    def __init__(self, name=None, **kwargs):
        self.driver = webdriver.Firefox()

        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

        self._cache_db = DatabaseFactory.get_database(DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssBrowserDebugSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super(AliExpresssBrowserDebugSpider, cls).from_crawler(crawler,
                                                                      *args,
                                                                      mongo_uri=crawler.settings.get('MONGODB_SERVER')
                                                                      )

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        self.driver.get(response.url)
        time.sleep(4)

        self._crawl_parser.submit_search(self.driver)

        _href = self._crawl_parser.get_category_href(self.driver)
        _total_count = 0
        _page_number = 1
        # while True:
        #     pass
        self.driver.get(_href)
        time.sleep(1)

        self._crawl_parser.get_items_from_pagenate(self.driver)

Example 7
class AliExpresssSpider(scrapy.Spider):
    name = "aliexpress"
    allowed_domains = ["aliexpress.com"]
    start_urls = [
        'http://www.aliexpress.com/af/macbook-pro.html?ltype=wholesale&d=y&origin=n&isViewCP=y&catId=0&initiative_id=SB_20160520233312&SearchText=macbook+pro',
    ]

    def __init__(self, name=None, **kwargs):
        from cwaliexpress.database_factory import DatabaseFactory, DatabaseTypes

        self._cache_db = DatabaseFactory.get_database(DatabaseTypes.cache, kwargs['mongo_uri'])
        self._history_db = DatabaseFactory.get_database(DatabaseTypes.history, kwargs['mongo_uri'])

        from cwaliexpress.parser.response_parser import ResponseParse
        self._crawl_parser = ResponseParse()

        super(AliExpresssSpider, self).__init__(name, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super(AliExpresssSpider, cls).from_crawler(crawler,
                                                          *args,
                                                          mongo_uri=crawler.settings.get('MONGODB_SERVER')
                                                          )

    def parse(self, response):
        self._crawl_parser.parse_paginate(response.url, response)

    def parse_detail(self, response):
        item = self._crawl_parser.parse(response.url, response)
        yield item

        yield scrapy.Request(item['cluster'], self.parse_cluster)  # parse_cluster is not shown in this snippet

        # yield scrapy.Request(response.url, self.parse_relatived_app)

        # The block below crawls one related listing picked at random from the page.
        import random

        select = '//a[@class="card-click-target"]'
        sel = Selector(response)
        navs = sel.xpath(select)

        # abstractPath and title are not defined in the original snippet; deriving
        # them from the matched anchors here is an assumption about the intent.
        if navs:
            nav = random.choice(navs)
            abstractPath = response.urljoin(nav.xpath('@href').extract_first())
            title = nav.xpath('string(.)').extract_first()
            if not self._history_db.check_exist(abstractPath):
                yield scrapy.Request(abstractPath, self.parse_detail, meta={'type': title})
    @staticmethod
    def get_Parser(parserType):
        # Assumed to be a static factory method; ParserTypes is not defined in
        # this snippet and is presumably an enum declared alongside this class.

        from cwaliexpress.parser.browse_parser import BrowseParser
        from cwaliexpress.parser.response_parser import ResponseParse

        if ParserTypes.browser == parserType:
            return BrowseParser()
        elif ParserTypes.response == parserType:
            return ResponseParse()
        else:
            return None
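
# Hypothetical usage of the factory above. The enclosing ParserFactory-style class
# and the ParserTypes enum are not shown in this listing, so the call below is an
# assumption about how the method is meant to be invoked:
#
#   parser = ParserFactory.get_Parser(ParserTypes.response)
#   item = parser.parse(response.url, response)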