def __init__(self, *args, **kwargs):
    """Configure the CSV exporter from the project settings.

    Forces the delimiter from CSV_DELIMITER (default: tab) and, when
    FIELDS_TO_EXPORT is non-empty, restricts the export to those fields.
    """
    kwargs['delimiter'] = settings.get('CSV_DELIMITER', '\t')
    export_fields = settings.get('FIELDS_TO_EXPORT', [])
    if export_fields:
        kwargs['fields_to_export'] = export_fields
    super(CustomCsvItemExporter, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    """Configure the exporter from the project settings.

    Uses CSV_DELIMITER (default: comma) as the delimiter and, when
    FIELDS_TO_EXPORT is non-empty, restricts the export to those fields.
    """
    kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',')
    export_fields = settings.get('FIELDS_TO_EXPORT', [])
    if export_fields:
        kwargs['fields_to_export'] = export_fields
    super(AmacItemExporter, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    """Configure the CSV exporter from the project settings.

    Uses CSV_DELIMITER (default: comma) as the delimiter, restricts the
    export to FEED_EXPORT_FIELDS when that setting is non-empty, and
    forces the quote-all CSV dialect.
    """
    kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',')
    export_fields = settings.get('FEED_EXPORT_FIELDS', [])
    if export_fields:
        kwargs['fields_to_export'] = export_fields
    kwargs['dialect'] = QuoteAllDialect
    super(SELOGER_MEDIAVACANCESCsvItemExporter, self).__init__(*args, **kwargs)
def __init__(self, *args, settings: scrapy.settings.Settings, **kwargs):
    """Forward the MySQL credentials from the Scrapy settings to the parent.

    Reads _MYSQL_DATABASE, _MYSQL_USER and _MYSQL_PASSWORD from *settings*
    and passes them through as keyword arguments.
    """
    credentials = {
        "database": settings.get("_MYSQL_DATABASE"),
        "user": settings.get("_MYSQL_USER"),
        "password": settings.get("_MYSQL_PASSWORD"),
    }
    super().__init__(*args, **credentials, **kwargs)
def __init__(self, *args, **kwargs): """initialize all params from configs/xxx.json file other details see scrapy """ # main problem: # custom_settings 在对象生成时就已有,无法修改,先于__init__方法 self._name = kwargs.get('_name', self.name) #建议考虑scrapyd 从此处获取spider名 config = get_configs(self._name, self.logger) self._config = config #self.custom_settings = config.get('settings') #自定义settings覆盖(无用) #self.logger.info('Show the custom_settings %s', str(self.custom_settings)) # 获取上次爬取日期 self.last_date = get_dates(self._name, self.logger) # 获取本爬虫的bloomfilter self.bloomfilter = None if config.get("NEED_BLOOM", False): self.bloom_path = settings.get("BLOOM_PATH", None) self.bloomfilter, self.bloomname = get_bloom( self.bloom_path, self.name, self.last_date, self.logger) # 获取rules, allowed_domains, start_urls self.rules = get_rules(self._name, self.logger) #建议rules 通用化 self.allowed_domains = config.get('allowed_domains', []) self.timezone = config.get("timezone", 8) start_urls = config.get("start_urls", {}) if start_urls: Case = start_urls.get("case", 10) Kwargs = start_urls.get("Kwargs", {}) self.start_urls = get_start_urls(self.last_date, self.timezone, Case, self.logger, **Kwargs) else: self.start_urls = [] #最后改日期 self.current_date = set_dates(self._name, logger=self.logger) self.logger.debug("start_urls<<<<:%s" % str(self.start_urls)) # for _compile_rules super(TemplatezweiSpider, self).__init__(*args, **kwargs) self.logger.debug("rules<<<<<:%s" % str(self.rules))
def __init__(self, settings, *args, **kwargs):
    """Prompt for a JD product id and build the comment-page start URLs.

    Reads SAVE_DIR from *settings* and ensures that directory exists,
    reads a numeric product id from stdin (exiting on invalid input),
    then queues one comment-API URL per score value 1-3.

    :param settings: Scrapy settings object providing SAVE_DIR.
    """
    super(CommentSpider, self).__init__(*args, **kwargs)
    save_dir = settings.get('SAVE_DIR')  # storage path for scraped output
    # Create the storage directory if missing. makedirs (vs. the original
    # os.mkdir) also creates missing parent directories instead of failing.
    if not os.path.exists(save_dir):
        print("创建目录:" + save_dir)
        os.makedirs(save_dir, exist_ok=True)
    # Read the product id from stdin; example ids: 26482700253, 5155905.
    product_id = input("请输入商品id:")
    if not product_id.isdigit():
        print("输入id有误!")
        exit(-1)
    self.pid = product_id
    # One start URL per score bucket 1-3 (per the original comment:
    # good / bad / medium reviews — TODO confirm JD's score semantics).
    url_template = "https://sclub.jd.com/comment/productPageComments.action?" \
                   "productId={}&score={}&sortType=6&page=0&pageSize=10"
    self.start_urls = [url_template.format(product_id, score)
                       for score in range(1, 4)]
import scrapy import base64 import md5 import shutil import spiders.common from lib.common import Common from scrapy.pipelines.images import ImagesPipeline from scrapy.exceptions import DropItem from scrapy import log from scrapy import settings from scrapy.utils.project import get_project_settings from urllib import unquote settings = get_project_settings() IMAGES_STORE = settings.get("IMAGES_STORE") #UPLOAD_IMAGE = "true" UPLOAD_IMAGE = "true" class GoodsPipeline(ImagesPipeline): def item_completed(self, results, item, info): commonLib = Common() item_data = {} for key,value in item.items(): item_data[key] = value itemType = item_data.pop("itemType")