Example #1
import sys

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_config is a project-specific helper that loads the <name>.json config file;
# its import path depends on the project and is omitted here


def run():
    name = sys.argv[1]  # command-line argument, here 'china'
    custom_settings = get_config(name)  # loads the china.json config file
    # name of the spider used for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())  # copy the project settings into a plain dict so they can be updated
    # merge the settings from china.json into the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawler
    process.crawl(spider, **{'name': name})  # 'universal' resolves to UniversalSpider
    process.start()

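All of these run() variants read a JSON file named after the command-line argument (china → china.json) through get_config. The schema is project-specific; what follows is a hypothetical sketch of what get_config('china') might return, using only the keys the examples on this page actually look up (spider, settings, rules, start_urls, allowed_domains); every concrete value is a placeholder.

# Hypothetical return value of get_config('china'); all concrete values are placeholders.
custom_settings = {
    'spider': 'universal',
    'allowed_domains': ['tech.china.com'],
    'start_urls': {
        'type': 'static',
        'value': ['https://tech.china.com/articles/'],
    },
    'rules': 'china',              # key looked up in rules.py
    'settings': {                  # merged into the project settings
        'USER_AGENT': 'Mozilla/5.0',
        'ROBOTSTXT_OBEY': False,
    },
}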
Example #2
 def __init__(self, name, *args, **kwargs):
     config = get_config(name)
     self.config = config
     self.rules = rules.get(config.get('rules'))
     self.start_urls = config.get('start_urls')
     self.allowed_domains = config.get('allowed_domains')
     super(UniversalSpider, self).__init__(*args, **kwargs)
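The line self.rules = rules.get(config.get('rules')) implies a rules.py module that maps a key from the JSON config to a tuple of CrawlSpider Rule objects. That module is not part of these snippets; a minimal sketch under that assumption (the detail XPath is a placeholder, the pagination XPath is borrowed from Example #17):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Hypothetical rules.py: maps the 'rules' key from the JSON config to Rule tuples.
rules = {
    'china': (
        # extract detail-page links and parse them with parse_item (XPath is a placeholder)
        Rule(LinkExtractor(restrict_xpaths='//div[@class="con_item"]//h4/a'),
             callback='parse_item'),
        # follow "next page" pagination links
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]')),
    ),
}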
Example #3
 def __init__(self, name, *args, **kwargs):  # in the __init__ method
     config = get_config(name)
     self.config = config
     self.rules = rules.get(config.get('rules'))  # the rules attribute is read from the separate rules.py configuration
     self.start_urls = config.get('start_urls')  # start_urls is assigned
     self.allowed_domains = config.get('allowed_domains')  # allowed_domains is assigned
     super(UniversalSpider, self).__init__(*args, **kwargs)
Example #4
def run():
    name = sys.argv[1]
    print(name)
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    print(spider)

    rules = custom_settings.get('rules')
    print("rules: %s " % rules)

    project_settings = get_project_settings()
    print(project_settings)

    settings = dict(project_settings.copy())
    print(settings)

    settings.update(custom_settings.get('settings'))

    print(settings)

    process = CrawlerProcess(settings)
    print("######################################################################process is: %s " % process)
    # c = process.crawl(spider, **{'name': name})
    c = process.crawl(UniversalSpider, **{'name': name})
    print("######################################################################processrrrrr is: %s " % c)

    process.start()
    print("######################################################################process iseeeeeeee: %s " % process)
Example #5
    def __init__(self, name, *args, **kwargs):
        # load the custom JSON config file
        config = get_config(name)
        self.config = config

        # rules configuration
        self.rules = SpiderRules(
            detailUrlXpaths=
            '//div[@class="p-con"]/div[@class="p-box"]/ul[@class="products"]',
            detailTags=('a', 'area'),
            detailAttrs=('href', ),
            detailCallback='parse_item',
            isSplash=True).rules.get(config.get('rules'))
        # self.rules = rules.get(config.get('rules'))

        # start_urls configuration
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                self.start_urls = list(
                    eval('urls.' + start_urls.get('method'))(
                        *start_urls.get('args', [])))

        # allowed_domains configuration
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)
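SpiderRules is a project-specific helper that is never defined in these snippets; judging only from how it is called here and in Example #17, it builds a dict of Rule tuples keyed by names such as 'ruleHref' and 'ruleClick'. A rough, assumption-heavy sketch:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class SpiderRules(object):
    # Hypothetical reconstruction based solely on the constructor arguments used above.
    def __init__(self, detailUrlXpaths=None, pageXpaths=None, detailTags=('a',),
                 detailAttrs=('href',), detailCallback='parse_item', isSplash=False):
        self.isSplash = isSplash  # presumably toggles Splash-rendered requests; unused in this sketch
        detail = LinkExtractor(restrict_xpaths=detailUrlXpaths,
                               tags=detailTags, attrs=detailAttrs)
        self.rules = {
            # follow detail links plus an href-based "next page" link
            'ruleHref': (
                Rule(detail, callback=detailCallback),
                Rule(LinkExtractor(restrict_xpaths=pageXpaths)),
            ),
            # only extract detail links; click-driven paging is handled elsewhere
            'ruleClick': (
                Rule(detail, callback=detailCallback),
            ),
        }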
Example #6
 def __init__(self, name, *args, **kwargs):
     # config is a dict or a list
     config = get_config(name)
     self.config = config
     # this looks up the rules corresponding to this spider
     self.rules = rules.get(config.get('rules'))
     self.start_urls = config.get('start_urls')
     self.allowed_domains = config.get('allowed_domains')
     super(UniversalSpider, self).__init__(*args, **kwargs)
Example #7
def run():
    name = sys.argv[1]
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    process.crawl(spider, **{'name': name})
    process.start()
Example #8
 def __init__(self, name, *args, **kwargs):
     config = get_config(name)
     self.config = config
     self.rules = rules.get(config.get('rules'))
     start_urls = config.get('start_urls')
     if start_urls:
         if start_urls.get('type') == 'static':
             self.start_urls = start_urls.get('value')
         elif start_urls.get('type') == 'dynamic':
             self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', [])))
     self.allowed_domains = config.get('allowed_domains')
     super(UniversalSpider, self).__init__(*args, **kwargs)
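The 'dynamic' branch calls a function from a urls module whose name comes from the config ('method'), with the arguments from 'args'. That module is also project-specific; a minimal sketch of one such generator, with the function name and URL pattern as pure placeholders:

# Hypothetical urls.py: functions that build start URLs from arguments given in the JSON config.
def page_urls(start, end, template='https://example.com/list_{page}.html'):
    # yields one listing URL per page number in [start, end]
    for page in range(start, end + 1):
        yield template.format(page=page)

With 'method': 'page_urls' and 'args': [1, 10] in the config, the eval expression above resolves to list(urls.page_urls(1, 10)).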
Example #9
def run():
    name = sys.argv[1]  # first, get the command-line argument as name; it is the name of the JSON file and of the target site
    custom_settings = get_config(name)  # read the config file defined earlier by passing that name to get_config
    # name of the Spider used for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()   # global project settings
    settings = dict(project_settings.copy())    # copy into a dict so the two sets of settings can be merged
    # merge the settings from the JSON config into the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)  # create a CrawlerProcess, passing in the merged settings
    # start the crawler
    process.crawl(spider, **{'name': name})  # call crawl with the spider name
    process.start()  # call start to begin crawling
Example #10
def run():
    name = sys.argv[1]
    custom_settings = get_config(name)
    # get the name of the Spider to use for the crawl
    spider = custom_settings.get('spider')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge: apply the settings from the JSON file on top of the auto-generated project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawler
    process.crawl(spider, **{'name': name})
    process.start()
Example #11
def run():
    # name = sys.argv[1]  # name of the JSON config file
    name = 'china'
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # the first argument is the spider to start; the second, **{'name': name},
    # is passed to the name parameter of the universal spider's __init__ method
    process.crawl(spider, **{'name': name})
    process.start()
Example #12
def run():
    name = sys.argv[1]
    keyword = sys.argv[2]
    custom_settings = get_config(name)
    # name of the Spider used for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge the settings
    settings.update(random.choice(custom_settings.get('settings')))
    process = CrawlerProcess(settings)
    # start the crawler
    process.crawl(spider, **{'name': name, 'keyword': keyword})
    process.start()
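Two details distinguish this variant: it takes a second command-line argument (a keyword that is forwarded to the spider), and it uses random.choice, so the 'settings' entry in the JSON config is expected to be a list of alternative settings dicts rather than a single dict (for example, to rotate user agents between runs). A hypothetical shape of that list:

# Assumed shape of custom_settings.get('settings') for this variant; values are placeholders.
settings_choices = [
    {'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    {'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'},
]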
Example #13
 def __init__(self, name, keyword, *args, **kwargs):
     config = get_config(name)
     self.config = config
     self.rules = rules.get(config.get('rules'))
     self.name = name
     self.keyword = keyword
     #self.start_urls = ["http://106.38.57.66:8080/oasearch/front/search.do"]
     #if start_urls:
     #if start_urls.get("type") == 'static':
     #self.start_urls = start_urls.get("value")
     #elif start_urls.get("type") == 'dynamic':
     #self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args',[]),keyword))
     self.allowed_domains = config.get('allowed_domains')
     super(UniversalSpider, self).__init__(*args, **kwargs)
Example #14
def run():
    # get the command-line argument
    name = sys.argv[1]
    custom_settings = get_config(name)
    # get the spider name
    spider = custom_settings.get('spider', 'universal')
    # get the project's default settings
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge the settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawler
    process.crawl(spider, **{'name': name})
    process.start()
Example #15
def run():
    '''
    Entry point: reads the command-line argument and starts the Spider.
    With this file in place, the project is launched with: python run.py china
    '''
    name = sys.argv[1]  # command-line argument; this name identifies the spider/config to use
    custom_settings = get_config(name)
    # name of the spider to run
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge all settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawler
    process.crawl(spider, **{'name': name})
    process.start()
Example #16
def run():
    name = sys.argv[1]

    # load the data from the JSON file defined for this site
    custom_settings = get_config(name)

    # get the name of the spider to use for the crawl
    spider = custom_settings.get('spider', 'universal')  # defaults to 'universal' if not present
    print("spider_name: ", spider)
    project_settings = get_project_settings()  # read the project settings file
    settings = dict(project_settings.copy())

    # merge the configs
    settings.update(custom_settings.get('settings'))  # merge the custom settings into the project settings
    process = CrawlerProcess(settings)  # instantiate the crawler with CrawlerProcess

    # start the crawler
    process.crawl(spider, **{'name': name})  # pass in the name and start crawling, roughly equivalent to: scrapy crawl spider_name
    process.start()
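The closing comment compares this to running scrapy crawl from the command line. Because the spider takes its name argument from the JSON config, the nearest CLI equivalent would pass that argument with -a and individual settings overrides with -s, for example (using the 'china' config name from the other examples):

scrapy crawl universal -a name=china -s LOG_LEVEL=INFO

The JSON file's whole 'settings' block cannot be merged this way, which is the main reason run.py exists.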
Example #17
    def __init__(self, name, *args, **kwargs):
        # load the custom JSON config file
        config = get_config(name)
        self.config = config

        self.pageType = 1  # pagination type; determines which paging strategy to use
        self.pageTotal = 4  # total number of pages; click-driven pagination needs this configured
        self.detailUrlXpaths = '//div[@class="p-con"]/div[@class="p-box"]/ul[@class="products"]'  # XPath of the detail-page links
        self.pageXpaths = '//div[@id="pageStyle"]//a[contains(., "下一页")]'
        self.selector = '.laypage_next'
        self.attribute = 'data-page'
        self.title = '//div[@class="pro-property"]/div[@class="pro-info"]/h2/text()'
        self.text = '//div[@class="pro-property"]/div[@class="pro-info"]/p/text()'

        # self.rules = rules.get(config.get('rules'))
        # pick the rules according to the pagination type
        if self.pageType == 0:
            # rules configuration
            self.rules = SpiderRules(detailUrlXpaths=self.detailUrlXpaths,
                                     pageXpaths=self.pageXpaths,
                                     detailCallback='parse_item',
                                     isSplash=False).rules.get('ruleHref')
        elif self.pageType == 1:
            self.rules = SpiderRules(
                detailUrlXpaths=self.detailUrlXpaths,
                detailCallback='parse_item').rules.get('ruleClick')

        # start_urls configuration
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                self.start_urls = list(
                    eval('urls.' + start_urls.get('method'))(
                        *start_urls.get('args', [])))

        # allowed_domains configuration
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)
Example #18
 def __init__(self, *name, **kwargs):
     # this variant ignores the positional arguments and relies on the
     # class-level name attribute (self.name) instead
     super().__init__(self.name, **kwargs)
     config = get_config(self.name)
     self.allowed_domains = config.get('allowed_domains')
     self.start_urls = config.get('start_urls')
     self.config = config