Example 1
    def __init__(self, *args, **kwargs):
        # `settings` is assumed to be a module-level project Settings object
        # (e.g. obtained via get_project_settings(), as in Example 7 below)
        kwargs['delimiter'] = settings.get('CSV_DELIMITER', '\t')

        # Restrict the exported columns only if FIELDS_TO_EXPORT is configured
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export

        super(CustomCsvItemExporter, self).__init__(*args, **kwargs)
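The exporter above (and the two that follow) reads its options from a project Settings object; CSV_DELIMITER and FIELDS_TO_EXPORT are custom setting names it looks up, while FEED_EXPORTERS is the standard Scrapy setting that registers a custom exporter for an output format. A minimal sketch of the corresponding settings.py entries, assuming the class lives in a hypothetical myproject/exporters.py and that a module-level settings = get_project_settings() exists as in Example 7:

# settings.py -- illustrative values only
CSV_DELIMITER = '\t'
FIELDS_TO_EXPORT = ['title', 'price', 'url']  # hypothetical field names
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CustomCsvItemExporter',  # hypothetical module path
}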
Example 2
    def __init__(self, *args, **kwargs):
        delimiter = settings.get('CSV_DELIMITER', ',')
        kwargs['delimiter'] = delimiter

        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export

        super(AmacItemExporter, self).__init__(*args, **kwargs)
Example 3
    def __init__(self, *args, **kwargs):
        delimiter = settings.get('CSV_DELIMITER', ',')
        kwargs['delimiter'] = delimiter
        fields_to_export = settings.get('FEED_EXPORT_FIELDS', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export
        kwargs.update({'dialect': QuoteAllDialect})

        super(SELOGER_MEDIAVACANCESCsvItemExporter,
              self).__init__(*args, **kwargs)
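Example 3 passes a `dialect` keyword through to the underlying csv.writer; the QuoteAllDialect it references is not shown in the snippet, but a conventional definition would be a csv dialect subclass that forces quoting on every field:

import csv

class QuoteAllDialect(csv.excel):
    # Quote every field in the output, not only those containing delimiters
    quoting = csv.QUOTE_ALL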
Example 4
    def __init__(self,
        *args,
        settings: scrapy.settings.Settings,
        **kwargs,
    ):
        super().__init__(
            *args,
            database=settings.get("_MYSQL_DATABASE"),
            user=settings.get("_MYSQL_USER"),
            password=settings.get("_MYSQL_PASSWORD"),
            **kwargs,
        )
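Example 4 expects a populated scrapy.settings.Settings object as a keyword-only argument; in a Scrapy pipeline or extension that object is usually supplied through the from_crawler hook. A minimal sketch of such a hook added to the same class (the class name and base class are not shown in the snippet):

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this factory and passes the crawler; crawler.settings is
        # the populated Settings object the __init__ above expects
        return cls(settings=crawler.settings)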
Example 5
    def __init__(self, *args, **kwargs):
        """Initialize all parameters from the configs/xxx.json file.

        See the Scrapy documentation for other details.
        """

        # Main problem: custom_settings is already in place when the object is
        # created and is applied before __init__, so it cannot be modified here.

        self._name = kwargs.get('_name', self.name)  # consider letting scrapyd supply the spider name here
        config = get_configs(self._name, self.logger)
        self._config = config

        #self.custom_settings = config.get('settings')   # overriding custom_settings here has no effect
        #self.logger.info('Show the custom_settings %s', str(self.custom_settings))

        # Fetch the date of the previous crawl
        self.last_date = get_dates(self._name, self.logger)

        # Fetch this spider's Bloom filter
        self.bloomfilter = None
        if config.get("NEED_BLOOM", False):
            # `settings` is assumed to be a module-level project Settings object
            self.bloom_path = settings.get("BLOOM_PATH", None)
            self.bloomfilter, self.bloomname = get_bloom(
                self.bloom_path, self.name, self.last_date, self.logger)

        # Fetch rules, allowed_domains and start_urls
        self.rules = get_rules(self._name, self.logger)  # consider making the rules generic

        self.allowed_domains = config.get('allowed_domains', [])

        self.timezone = config.get("timezone", 8)

        start_urls = config.get("start_urls", {})
        if start_urls:
            Case = start_urls.get("case", 10)
            Kwargs = start_urls.get("Kwargs", {})
            self.start_urls = get_start_urls(self.last_date, self.timezone,
                                             Case, self.logger, **Kwargs)
        else:
            self.start_urls = []

        # Finally, update the crawl date
        self.current_date = set_dates(self._name, logger=self.logger)

        self.logger.debug("start_urls<<<<:%s" % str(self.start_urls))
        # for _compile_rules
        super(TemplatezweiSpider, self).__init__(*args, **kwargs)
        self.logger.debug("rules<<<<<:%s" % str(self.rules))
Example 6
    def __init__(self, settings, *args, **kwargs):
        super(CommentSpider, self).__init__(*args, **kwargs)
        save_dir = settings.get('SAVE_DIR')  # directory in which to save results
        # Create the output directory if it does not exist
        if not os.path.exists(save_dir):
            print("Creating directory: " + save_dir)
            os.mkdir(save_dir)
        # Read the product id and store it in self.pid
        product_id = input("Enter the product id: ")  # example id: 26482700253 5155905
        if not product_id.isdigit():
            print("Invalid product id!")
            exit(-1)
        self.pid = product_id
        # Build the initial list of URLs to crawl
        self.start_urls = []
        for i in range(1, 4):  # iterate over good, bad and neutral review types
            url = "https://sclub.jd.com/comment/productPageComments.action?" \
                  "productId={}&score={}&sortType=6&page=0&pageSize=10" \
                .format(product_id, i)
            self.start_urls.append(url)
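Scrapy does not pass the settings object to a spider's __init__ by default, so a constructor with this signature implies an overridden from_crawler. A sketch of what that override could look like; it is not part of the original snippet:

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Build the spider with the crawler's settings as the first argument,
        # matching the __init__ signature above, then attach the crawler
        spider = cls(crawler.settings, *args, **kwargs)
        spider._set_crawler(crawler)
        return spider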
Example 7
import scrapy
import base64
import md5  # Python 2 only; under Python 3 use hashlib instead
import shutil
import spiders.common

from lib.common import Common
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import log  # legacy Scrapy logging module, removed in recent versions
from scrapy.utils.project import get_project_settings
from urllib import unquote  # Python 2 only; under Python 3 use urllib.parse.unquote

# Project settings are loaded once at module import time
settings = get_project_settings()
IMAGES_STORE = settings.get("IMAGES_STORE")
UPLOAD_IMAGE = "true"

class GoodsPipeline(ImagesPipeline):

    def item_completed(self, results, item, info):

        commonLib = Common()

        # Copy the item fields into a plain dict so keys can be popped freely
        item_data = {}
        for key, value in item.items():
            item_data[key] = value

        itemType = item_data.pop("itemType")