Code example #1
File: test_utils_serialize.py, Project: 01-/scrapy
class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
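The expected strings above illustrate what ScrapyJSONEncoder adds on top of the standard json module: it falls back to string representations for types that json.dumps rejects, such as datetime and Decimal values. A minimal standalone sketch of that behaviour (import path as in the Scrapy codebase):

from datetime import datetime
from decimal import Decimal
from scrapy.utils.serialize import ScrapyJSONEncoder

encoder = ScrapyJSONEncoder()
# datetimes become "YYYY-MM-DD HH:MM:SS" strings, Decimals become numeric strings
print(encoder.encode({"when": datetime(2010, 1, 2, 10, 11, 12), "price": Decimal("1000.12")}))
# expected output: {"when": "2010-01-02 10:11:12", "price": "1000.12"}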
Code example #2
    def process_item(self, item, spider):
        url = "http://localhost:9200/articles/%s" % (item["publication"].lower())
        encoder = ScrapyJSONEncoder()
        json_body = encoder.encode(item)
        resp = requests.post(url, data=json_body)
        log.msg("Item added to elasticSearch node. Response: " + resp.text)

        return item
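The fragment above omits its imports and class wrapper. A minimal self-contained version might look like the following (the class name is hypothetical, and scrapy.log is the legacy logging API that provides log.msg):

import requests
from scrapy import log  # legacy API; newer Scrapy code uses the stdlib logging module
from scrapy.utils.serialize import ScrapyJSONEncoder


class ElasticsearchIndexPipeline(object):  # hypothetical name

    def process_item(self, item, spider):
        # POST the JSON-encoded item into a per-publication Elasticsearch index
        url = "http://localhost:9200/articles/%s" % item["publication"].lower()
        json_body = ScrapyJSONEncoder().encode(item)
        resp = requests.post(url, data=json_body)
        log.msg("Item added to elasticSearch node. Response: " + resp.text)
        return item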
Code example #3
File: base_exporter.py, Project: xunyuw/iFlyQA
class HadoopExporter(BaseItemExporter):
    def __init__(self, hadoop, **kwargs):
        #self.con = file_write.Connection()
        #self.con.connect(hadoop.ip, hadoop.port)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        #self.seq = file_write.SeqFileSaver(self.con, '/common/crawler/%s/' % hadoop.username.replace(".", "/"),
        #                                   1, '%s' % hadoop.username.replace(".", "_"))
        self.encoding = 'utf-8'
        self.fields_to_export = None
        self.export_empty_fields = False
        self.writer = SeqWriter(os.path.join(Utils.settings['SEQFILE_DIR'], hadoop.username.replace(".", "/")),
                                hadoop.username.replace(".", "_"))

    def close_file(self):
        print "close"
        self.writer.close()
        #self.seq.set_is_end()
        #self.con.close()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass

    def export_item(self, item):
        value = self.encoder.encode(dict(self._get_serialized_fields(item)))
        self.writer.writeData(
            item['key'] if 'key' in item else item['url'],
            value
        )
Code example #4
File: pipelines.py, Project: darthbear/scrapy-redis
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port, queue_type):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()
        self.queue_type = queue_type

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        queue_type = settings.get('QUEUE_TYPE', 'FIFO')
        return cls(host, port, queue_type)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        if self.queue_type == 'LIFO':
            self.server.lpush(key, data)
        else:
            self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
Code example #5
File: items_rq.py, Project: dboyliao/scraper
    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name

        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)
Code example #6
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server, redis_server = connection.from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
Code example #7
File: pipelines.py, Project: mezhou887/scrapysystem
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items" % spider.name
Code example #8
File: pipelines.py, Project: huokedu/social_scraper
class RedisPipeline(object):
    """
    Pushes serialized item into a redis.
    Specific for SocialSpiders
    """

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.set(key, data.decode('utf-8'))
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "{}_{}".format(spider.name, item['search_name'])
Code example #9
File: pipelines.py, Project: leveryd/python-security
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
Code example #10
File: exporters.py, Project: ArturGaspar/scrapy
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # there is a small difference between the behaviour of JsonItemExporter.indent
        # and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
        # the addition of newlines everywhere
        json_indent = self.indent if self.indent is not None and self.indent > 0 else None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def _beautify_newline(self):
        if self.indent is not None:
            self.file.write(b'\n')

    def start_exporting(self):
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
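Like every item exporter, this class is driven by start_exporting / export_item / finish_exporting. A short usage sketch (the item here is a plain dict standing in for a scraped item, and the file is opened in binary mode because the exporter writes bytes):

from scrapy.exporters import JsonItemExporter

with open("items.json", "wb") as f:
    exporter = JsonItemExporter(f)
    exporter.start_exporting()
    exporter.export_item({"name": "example", "price": "1000.12"})  # placeholder item
    exporter.finish_exporting()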
Code example #11
class RedisPipeline(object):
    """Pushes serialized item into a scrapy_redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", "localhost")
        port = settings.get("REDIS_PORT", 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns scrapy_redis key based on given spider"""
        return "%s:items" % spider.name
Code example #12
class RedisStoragePipeline(object):
    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.encoder.encode(item)
        if isinstance(item, GubaPostListItem):
            key = self.item_key_list(item, spider)
        elif isinstance(item, GubaPostDetailItem):
            key = self.item_key_detail(item, spider)
        self.server.rpush(key, data)

        return item

    def item_key_list(self, item, spider):
        stock_id = item['stock_id']
        return "%s:list_items" % stock_id

    def item_key_detail(self, item, spider):
        stock_id = item['stock_id']
        return "%s:detail_items" % stock_id
Code example #13
class DockerhubExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')

        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info,
                                signal=signals.spider_closed)

    def store_job_info(self):
        with open(self.job_path, 'w') as f:
            stats = self.crawler.stats.get_stats()
            job_info = {
                'stats': stats
            }
            job_info_json = self.json_encoder.encode(job_info)
            f.write(job_info_json)
Code example #14
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='',
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
Code example #15
File: statusmailer.py, Project: JayveeHe/spider_senz
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0
Code example #16
File: exporters.py, Project: voith/scrapy
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict) + "\n"))
Code example #17
File: pipelines.py, Project: dfdeshom/scrapy-kafka
    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()
Code example #18
File: exporters.py, Project: ahmad-rzk/fbcrawl
 def __init__(self, outputAddress, **kwargs):
     self._configure(kwargs, dont_fail=True)
     # self.file = file
     # needs to be updated when migrating to proto-buff
     self.encoder = ScrapyJSONEncoder(**kwargs)
     self.first_item = True
     # creating ZMQ context and socket
     self.context = zmq.Context()
     self.socket = self.context.socket(zmq.PUB)
     # self.socket.setsockopt(zmq.LINGER, 100)
     self.outputAddress = outputAddress
Code example #19
File: base_exporter.py, Project: xunyuw/iFlyQA
 def __init__(self, hadoop, **kwargs):
     #self.con = file_write.Connection()
     #self.con.connect(hadoop.ip, hadoop.port)
     self.encoder = ScrapyJSONEncoder(**kwargs)
     #self.seq = file_write.SeqFileSaver(self.con, '/common/crawler/%s/' % hadoop.username.replace(".", "/"),
     #                                   1, '%s' % hadoop.username.replace(".", "_"))
     self.encoding = 'utf-8'
     self.fields_to_export = None
     self.export_empty_fields = False
     self.writer = SeqWriter(os.path.join(Utils.settings['SEQFILE_DIR'], hadoop.username.replace(".", "/")),
                             hadoop.username.replace(".", "_"))
Code example #20
    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')

        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info,
                                signal=signals.spider_closed)
Code example #21
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Code example #22
File: exporters.py, Project: ArturGaspar/scrapy
 def __init__(self, file, **kwargs):
     self._configure(kwargs, dont_fail=True)
     self.file = file
     # there is a small difference between the behaviour of JsonItemExporter.indent
     # and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
     # the addition of newlines everywhere
     json_indent = self.indent if self.indent is not None and self.indent > 0 else None
     kwargs.setdefault('indent', json_indent)
     kwargs.setdefault('ensure_ascii', not self.encoding)
     self.encoder = ScrapyJSONEncoder(**kwargs)
     self.first_item = True
Code example #23
File: exporters.py, Project: ArturGaspar/scrapy
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Code example #24
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
Code example #25
File: exports.py, Project: litiblue/letmescrape
class LetMeShopApiExporter(BaseItemExporter):
    api_end_point = ''
    method = 'POST'

    def __init__(self, api_base_url, auth_token, *args, **kwargs):
        super(LetMeShopApiExporter, self).__init__(*args, export_empty_fields=True, **kwargs)
        self.api_base_url = api_base_url
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.headers = {'Authorization': 'Token %s' % auth_token}

    def _fill_missing_fields(self, item, default_value=None):
        if self.fields_to_export is None:
            missing_keys = frozenset(item.fields.iterkeys()).difference(item.iterkeys())
        else:
            missing_keys = frozenset(self.fields_to_export).difference(item.iterkeys())

        for missing_key in missing_keys:
            item[missing_key] = item.fields[missing_key].get('default_value', default_value)

        return item

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        if include_empty is None:
            include_empty = self.export_empty_fields

        if include_empty:
            item = self._fill_missing_fields(item, default_value)

        return super(LetMeShopApiExporter, self)._get_serialized_fields(item, default_value, include_empty)

    @property
    def request_url(self):
        return urljoin(self.api_base_url, self.api_end_point)

    def export_item(self, item_or_items):
        if isinstance(item_or_items, (list, tuple)):
            item_list = item_or_items
            serialized = [dict(self._get_serialized_fields(item)) for item in item_list]
        else:
            item = item_or_items
            serialized = dict(self._get_serialized_fields(item))

        serialized = snake_case_to_camel_case(serialized)
        payload = self.encoder.encode(serialized)

        r = requests.request(self.method, self.request_url, data=payload, headers=self.headers)
        r.raise_for_status()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass
Code example #26
File: pipelines.py, Project: dfdeshom/scrapy-kafka
class KafkaPipeline(object):

    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode

    """

    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overriden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
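To turn this on in a project, the pipeline is registered in ITEM_PIPELINES and the two settings read by from_settings are defined; a sketch, where the module path is a placeholder:

# settings.py (sketch)
ITEM_PIPELINES = {
    'myproject.pipelines.KafkaPipeline': 300,  # placeholder module path
}
SCRAPY_KAFKA_HOSTS = ['localhost:9092']
SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC = 'scrapy_kafka_item'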
Code example #27
File: ad_spider_base.py, Project: YanzheL/adcrawler
class AdSpiderBase(RedisSpider):
    task_encoder = ScrapyJSONEncoder().encode
    task_decoder = ScrapyJSONDecoder().decode

    def next_request(self):
        serialized_task = self.server.rpop(self.redis_key)
        if serialized_task:
            self.logger.info("Got task {}".format(serialized_task))
            return self.make_request_from_task(serialized_task, callback=self.parse, dont_filter=False)

    @staticmethod
    def tagfilter(tag):
        return isinstance(tag, Tag)
Code example #28
    def parse(self, response):

        items = []
        for row in response.xpath('//table[@class="TblDataRecs"]/tr'):
            item = mylanguageexchangeItem()
            name = row.xpath('td[@class="userdata"]//a//b/text()').extract()
            item["name"] = [x.strip() for x in name]
            country = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[1]/td/text()').extract()
            item["country"] = [x.strip() for x in country]
            city = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[2]/td/text()').extract()
            item["city"] = [x.strip().strip('()') for x in city]
            native = row.xpath('td[@class="userdata"][@data-th="Native Language"]//td/text()').extract()
            item["native"] = [x.strip() for x in native]
            practicing = row.xpath('td[@class="userdata"][@data-th="Practicing Language"]//td/text()').extract()
            item["practicing"] = [x.strip() for x in practicing]
            desc = row.xpath('td[@class="userdata"][@data-th="Description"]//td/text()').extract()
            item["desc"] = [x.strip() for x in desc]
            items.append(item)

        _encoder = ScrapyJSONEncoder()
        with open('mylanguageexchange_crawled.json', 'w') as outfile:
            outfile.write(_encoder.encode(items))
Code example #29
File: shopware4_rest.py, Project: nyov/scrapyext
	def __init__(self):
		"""
		Connect to Shopware REST Api using HTTP digest authentication.
		We need an ADMIN role with sufficient access to insert articles.
		Shopware4 (german) API Guide: http://wiki.shopware.de/_detail_861_487.html
		"""
		self.name         = settings['SHOPWARE_SERVICE_NAME']
		self.api_url      = settings['SHOPWARE_API_BASE']
		self.access_token = settings['SHOPWARE_TOKEN_KEY']

		self.request_headers = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json'}
		self.encoder = ScrapyJSONEncoder()

		self.node = {}

		# shopware minimal default item
		self.default_item = RestItem({
			'taxId': 1,
			#'tax': 19,
			'name': 'nexus',
			'mainDetail': {
				'number': 'nex24',
				'prices': [{
					'customerGroupKey': 'EK',
					'basePrice': 16,
					'price': 20, # shop will add VAT (if configured that way)
				}],
			#	'attribute': {
			#		'supplier_url': 'http://example.net',
			#		'supplierUrl': 'http://example.net',
			#	#	'attr19': 'http://example.net',
			#	},
			},
			'active': True,
			'supplier': 'example com',
			'categories': [
				{'id': 5,},
				{'id': 3,},
			],
			'images': [{
				#'id': '1', ## this one is bugged in shopware (doesnt add image to article)
				#'mediaId': '1',
				# needs deduplication on update
				'link': 'http://shopware.local/templates/_emotion/frontend/_resources/images/logo.jpg',
			}],
			'attribute': {
				'attr19': 'http://example.net',
			},
			'description': 'Some Article',
			'descriptionLong': 'Some Article Description',
		})
Code example #30
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode

    """
    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overriden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC',
                             'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
Code example #31
 def __init__(self, host, port, user, password, virtual_host, exchange,
              routing_key, queue):
     self.host = host
     self.port = port
     self.user = user
     self.password = password
     self.virtual_host = virtual_host
     credentials = pika.PlainCredentials(self.user, self.password)
     parameters = pika.ConnectionParameters(self.host, self.port,
                                            self.virtual_host, credentials)
     self.connection = pika.BlockingConnection(parameters=parameters)
     self.channel = self.connection.channel()
     self.exchange = exchange
     self.routing_key = routing_key
     self.queue = queue
     self.channel.exchange_declare(exchange=exchange,
                                   exchange_type="direct",
                                   durable=True)
     self.channel.queue_declare(queue=queue, durable=True)
     self.channel.queue_bind(exchange=exchange,
                             routing_key=routing_key,
                             queue=queue)
     self.encoder = ScrapyJSONEncoder()
Code example #32
class RabbitMQItemPublisherPipeline(object):
    def __init__(self, host, port, user, password, virtual_host, exchange,
                 routing_key, queue):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.virtual_host = virtual_host
        credentials = pika.PlainCredentials(self.user, self.password)
        parameters = pika.ConnectionParameters(self.host, self.port,
                                               self.virtual_host, credentials)
        self.connection = pika.BlockingConnection(parameters=parameters)
        self.channel = self.connection.channel()
        self.exchange = exchange
        self.routing_key = routing_key
        self.queue = queue
        self.channel.exchange_declare(exchange=exchange,
                                      exchange_type="direct",
                                      durable=True)
        self.channel.queue_declare(queue=queue, durable=True)
        self.channel.queue_bind(exchange=exchange,
                                routing_key=routing_key,
                                queue=queue)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("RABBITMQ_HOST"),
            port=crawler.settings.get("RABBITMQ_PORT"),
            user=crawler.settings.get("RABBITMQ_USER"),
            password=crawler.settings.get("RABBITMQ_PASSWORD"),
            virtual_host=crawler.settings.get("RABBITMQ_VIRTUAL_HOST"),
            exchange=crawler.settings.get("RABBITMQ_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_ROUTING_KEY"),
            queue=crawler.settings.get("RABBITMQ_QUEUE"),
        )

    def close_spider(self, spider):
        self.channel.close()
        self.connection.close()

    def process_item(self, item, spider):
        data = self.encoder.encode(item)
        self.channel.basic_publish(
            exchange=self.exchange,
            routing_key=self.routing_key,
            body=data,
        )
        return item
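from_crawler above expects the connection parameters in the project settings; a sketch of the matching settings.py entries, with placeholder values:

# settings.py (sketch; all values are placeholders)
RABBITMQ_HOST = 'localhost'
RABBITMQ_PORT = 5672
RABBITMQ_USER = 'guest'
RABBITMQ_PASSWORD = 'guest'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_EXCHANGE = 'scrapy'
RABBITMQ_ROUTING_KEY = 'items'
RABBITMQ_QUEUE = 'items'
ITEM_PIPELINES = {
    'myproject.pipelines.RabbitMQItemPublisherPipeline': 300,  # placeholder module path
}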
Code example #33
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    """ Allows exporting to JSON directly as Unicode. """
    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        kwargs["ensure_ascii"] = False
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + u"\n")

    def serialize_field(self, field, name, value):
        return value # DON'T call super version, this encodes the Unicode.
Code example #34
File: exporters.py, Project: bf96163/scrapy
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(
            **self._kwargs)  # handles types that plain json cannot, such as datetimes and sets

    def export_item(self, item):
        # briefly: serialize the selected fields and collect them into a dict
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        # write the encoded line to the file
        self.file.write(to_bytes(data, self.encoding))
Code example #35
File: pipelines.py, Project: cheer-cheer/cartographer
    def process_item(self, item, spider):
        due_date = item['due_date']
        last_update = item['last_update']

        data_file_name = './data/' + due_date.strftime('%y%m%d') + '_' + \
            last_update.strftime('%y%m%d') + '.json'

        os.makedirs(os.path.dirname(data_file_name), exist_ok=True)

        with open(data_file_name, 'w+') as f:
            f.write(ScrapyJSONEncoder().encode(item['provinces']))

        print('Saved to data file:', data_file_name)
        return item
Code example #36
    def process_item(self, item, spider):
        """ Handle items. Send items to RabbitMQ.

        :param item:
        :param spider:
        :return:
        """
        self.rbmq_conn.send_message(channel=self.publisher,
                                    message=ScrapyJSONEncoder().encode(
                                        dict(item)),
                                    exchange_name=self.exchange_name,
                                    routing_key=self.routing_key)
        logger.debug('Item scraped')
        return item
Code example #37
class JsonEncoderTestCase(unittest.TestCase):
    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"
        s = {'foo'}
        ss = ['foo']
        dt_set = {dt}
        dt_sets = [dts]

        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds]), (s, ss),
                              (dt_set, dt_sets)]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
Code example #38
class JsonLinesItemSplitFileExporter(BaseItemExporter):

    """An item exporter to organize json lines into separate folders.

    Attributes:
        _configure (func): Uses to configure the Item Exporter by setting the options dictionary.
        encoder (ScrapyJSONEncoder): Encoder used to convert scrapy items into a json format line.

    """

    def __init__(self, **kwargs):
        """Initialize the configuration dictionary and encoder.

        Args:
            **kwargs: Arbitrary keyword arguments for the options dictionary.
        """
        # If dont_fail is set, it won't raise an exception on unexpected options
        self._configure(kwargs, dont_fail=True)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder()
        super(JsonLinesItemSplitFileExporter, self).__init__()

    def export_item(self, item):
        """Export Scrapy items to specific files based on the article_type.

        Args:
            item (scrapy.Item): A Scrapy item that contains a complete scraped information for an article/product.

        """
        # Serialize the item, and perform encoding to create a python dictionary
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + os.linesep

        # If there is only one item in article_type, then the path (folders) would just be
        # scraped_data/spider.name/article_type. Otherwise we would combine all the article_type list except the last
        # item into a path, such as scraped_data/spider.name/article_type[0]/article_type[1], then the item would be
        # a json line placed in scraped_data/spider.name/article_type[0]/article_type[1]/article_type[2].jl.
        if len(item['article_type']) == 1:
            path = os.path.join("scraped_data", item["spider_name"])
            item_path = os.path.join(path, item['article_type'][0]) + ".jl"
        else:
            path = os.path.join(os.path.join("scraped_data", item["spider_name"]),
                                (os.path.join(*item['article_type'][:-1])))
            item_path = os.path.join(path, item['article_type'][-1]) + ".jl"
        if not os.path.exists(path):
            os.makedirs(path)

        # Write in append and byte mode
        open(item_path, 'a+b').write(to_bytes(data, self.encoding))
Code example #39
class SortedJsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('indent', 4)
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)
        self.items = []

    def export_item(self, item):
        self.items.append(dict(self._get_serialized_fields(item)))

    def finish_exporting(self):
        data = self.encoder.encode(sorted(self.items, key=sort_key))
        self.file.write(to_bytes(data, self.encoding))
Code example #40
class KafkaPipeline(object):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_message(self.topic, msg)

    @classmethod
    def from_settings(cls, settings)
Code example #41
class KafkaProducerPipeline(object):
    def __init__(self, kafka_bootstrap_server):
        self.kafka_bootstrap_server = []
        self.kafka_bootstrap_server.append(kafka_bootstrap_server)
        self.collection_name = 'articles'
        self.encoder = ScrapyJSONEncoder()
        self.index = 0  # counter of items sent; incremented in process_item

    @classmethod
    def from_crawler(cls, crawler):
        # pull in information from settings.py
        return cls(kafka_bootstrap_server=crawler.settings.get(
            'KAFKA_BOOTSTRAP_SERVER'), )

    def open_spider(self, spider):
        print("spider name: ", spider.name)
        # initializing py-Kafka producer
        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_bootstrap_server)

        print("kafka_bootstrap_server: ", self.kafka_bootstrap_server)
        if hasattr(spider, 'collection_name'):
            print("spider collection_name: ", spider.collection_name)
            self.collection_name = spider.collection_name

    def close_spider(self, spider):
        # clean up when spider is closed
        self.producer.flush(timeout=60)
        self.producer.close(timeout=60)

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            print("valid - inside process_item...", item['source'], ': ',
                  item['headline'])
            # self.producer.send('articles', self.encoder.encode(item).encode())
            key = str(ord(item['source'][0])) + str(ord(item['source'][1]))
            self.producer.send('articles',
                               value=self.encoder.encode(item).encode(),
                               key=key.encode())
            self.index += 1
            logging.debug("News item sent by Kafka Producer!")
        return item
Code example #42
class ScrapyKafkaTopicWriter(KafkaTopicWriter):
    """ Kafka writer which knows how to handle Scrapy items: they are
    serialized to JSON, and "_id" field is used as Kafka key if present.
    """
    def __init__(self, *args, **kwargs):
        self._encoder = ScrapyJSONEncoder()
        self._exporter = PythonItemExporter(binary=False)
        kwargs.setdefault('value_serializer', self._serialize_value)
        super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)

    def write_item(self, item):
        key = item.get('_id', None)
        msg = self._exporter.export_item(item)
        return self.write(key, msg)

    def _serialize_value(self, value):
        return self._encoder.encode(value).encode('utf8')
Code example #43
File: items_rq.py, Project: yupengyan/scraper
class AddItemPipeline(object):
    """ Pushes serialized item into a RQ """
    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name

        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        db = settings.get('REDIS_DB', 0)
        queue_name = settings.get('RQ_QUEUE', 'default')
        store_id = int(settings.get('STORE', 0))
        return cls(host, port, db, queue_name, store_id)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        ''' '''  # {{{

        ## get global Store URL mapping
        store_id = self.store_id
        if store_id == 0:
            raise DropItem('Not set the store and no Store URL mapping')

        ## assign queue parameters
        item['store_id'] = store_id
        callback = 'worker.save_product_to_db'
        event = self.encoder.encode(
            dict(queue=self.queue_name, value=item, time=time.time()))

        ## push item to redis queue
        self.queue.enqueue(callback, event)

        return item
Code example #44
File: items_rq.py, Project: dboyliao/scraper
class AddItemPipeline(object):
    """ Pushes serialized item into a RQ """

    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name

        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        db = settings.get('REDIS_DB', 0)
        queue_name = settings.get('RQ_QUEUE', 'default')
        store_id = int(settings.get('STORE', 0))
        return cls(host, port, db, queue_name, store_id)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        ''' ''' # {{{

        ## get global Store URL mapping
        store_id = self.store_id
        if store_id == 0:
            raise DropItem('Not set the store and no Store URL mapping')

        ## assign queue parameters
        item['store_id'] = store_id
        callback = 'worker.save_product_to_db'
        event = self.encoder.encode(dict(queue=self.queue_name, value=item, time=time.time()))

        ## push item to redis queue
        self.queue.enqueue(callback, event)
        
        return item
Code example #45
class RedisListPipeline:

    DEFAULT_QUEUE = 'queue'
    DEFAULT_MAX_RETRY = 5

    serializer = ScrapyJSONEncoder().encode

    def __init__(self, conn_url: str, queue: str, max_retry=None):
        try:
            import redis
        except ImportError:
            raise NotConfigured('missing redis library')
        self._conn = redis.from_url(conn_url)
        self.queue = queue or self.DEFAULT_QUEUE
        self.max_retry = max_retry or self.DEFAULT_MAX_RETRY

    @classmethod
    def from_crawler(cls, crawler):
        if hasattr(crawler.spider, 'queue'):
            queue = crawler.spider.queue
        else:
            queue = crawler.settings.get('REDIS_DEFAULT_QUEUE')
        return cls(
            conn_url=crawler.settings.get('REDIS_CONNECTION_URL'),
            queue=queue,
            max_retry=crawler.settings.get('REDIS_MAX_RETRY')
        )

    def process_item(self, item, spider):
        data = self.serializer(item)
        try_time = 0
        while try_time < self.max_retry:
            try:
                self._conn.rpush(self.queue, data)
                return item
            except Exception:
                spider.logger.error('process item failed {}'.format(item))
                try_time += 1
        spider.logger.error('Give up item for failed {} times {}'.format(try_time, item))
        return item

    def close(self):
        self._conn.close()
Code example #46
File: exporters.py, Project: fuzzy69/bns
class CustomJsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault("ensure_ascii", False)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'')

    def finish_exporting(self):
        self.file.write(b'\n')

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        bytes = to_bytes(self.encoder.encode(itemdict))
        self.file.write(bytes)
        self.file.write(b"\n")
Code example #47
class SendToBrokerPipeline(object):

	def __init__(self):
		self.publisher = Publisher('data_distributor')
		self.encoder = ScrapyJSONEncoder()
	

	def process_item(self, item, spider):
		#Runs sending broker in separate thread to prevent it from blocking
		#on single items
		return deferToThread(self._process_item, item, spider)

	def _process_item(self, item, spider):

		item_dict = dict(item)

		data = self.encoder.encode(item_dict)
		self.publisher.send_message(data,'articles')
		return item
Code example #48
class UnicodeJsonLinesItemExporter(BaseItemExporter):

    logger = logging.getLogger('UnicodeJsonLinesLogging')

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(ensure_ascii=False,
                                         encoding='UTF-8',
                                         **kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.logger.info('==============')
        self.file.write(to_bytes(data, self.encoding))

    def finish_exporting(self):
        pass
Code example #49
class BasePipeline(object):
    """

    """

    def __init__(self):
        self.headers = {'Content-Type': 'application/json'}
        self._encoder = ScrapyJSONEncoder()

    def _item_to_json(self, obj):
        """
        Utility that takes a object, serializes it using ScrapyJSONEncoder
        and returns a deserialized version of the data.

        We use this to convert Scrapy models to JSON objects we can to requests.

        :param obj:
        :return dict:
        """
        return json.loads(self._encoder.encode(obj))
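A hypothetical subclass showing how _item_to_json and the prepared headers could be used to POST items to an HTTP API (the endpoint URL and class name are placeholders):

import requests


class HttpApiPipeline(BasePipeline):  # hypothetical subclass

    def process_item(self, item, spider):
        payload = self._item_to_json(item)
        requests.post("http://localhost:8000/api/items/",  # placeholder endpoint
                      json=payload, headers=self.headers)
        return item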
Code example #50
File: __init__.py, Project: xacprod/ve1
class JsonItemExporter(JsonLinesItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
Code example #51
File: exporters.py, Project: voith/scrapy
class JsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[")

    def finish_exporting(self):
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b",\n")
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
Code example #52
File: exporters.py, Project: audiomnia/audiomnia
class GeoJsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'{ "type": "FeatureCollection","features":[\n')

    def finish_exporting(self):
        self.file.write(b"\n]}")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        data = self.encoder.encode(item)
        self.file.write(to_bytes(data, self.encoding))
Code example #53
File: __init__.py, Project: antworteffekt/scrapy
class JsonItemExporter(JsonLinesItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
Code example #54
class FanItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'{"product": [')

    def finish_exporting(self):
        self.file.write(b'\n]}')

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
Code example #55
class JsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[\n")

    def finish_exporting(self):
        self.file.write(b"\n]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
Code example #56
class DockerhubExtension(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')

        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info,
                                signal=signals.spider_closed)

    def store_job_info(self):
        with open(self.job_path, 'w') as f:
            stats = self.crawler.stats.get_stats()
            job_info = {'stats': stats}
            job_info_json = self.json_encoder.encode(job_info)
            f.write(job_info_json)
Code example #57
File: exporters.py, Project: 0daybug/scrapy
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[\n")

    def finish_exporting(self):
        self.file.write(b"\n]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
Code example #58
class KafkaPipeline:
    # https://github.com/dfdeshom/scrapy-kafka/blob/master/scrapy_kafka/pipelines.py
    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        spider.log(msg)
        self.producer.send(self.topic, msg.encode('utf-8'))
        spider.log("Sent to kafka.")

    @classmethod
    def from_settings(cls, settings):
        k_hosts = settings.get('KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')

        prod = KafkaProducer(bootstrap_servers=k_hosts)
        return cls(prod, topic)
Code example #59
class KafkaPipeline:
    def __init__(self, producer, topic):
        self.p = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings
        :rtype: A :class:`~KafkaPipeline` instance
        """
        brokers = settings.get('KAFKA_PRODUCER_BROKERS')
        topic = settings.get('KAFKA_PRODUCER_TOPIC')
        producer = Producer({
            'bootstrap.servers': brokers,
        })
        return cls(producer, topic)

    def process_item(self, item, spider):
        msg = self.encoder.encode(item)
        self.p.produce(self.topic, msg, callback=delivery_report)
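delivery_report is not shown in this snippet; with confluent-kafka's Producer the delivery callback receives (err, msg), so a minimal sketch might be:

def delivery_report(err, msg):
    # Called for each produced message (from poll() or flush()) with the delivery result.
    if err is not None:
        print('Delivery failed: {}'.format(err))
    else:
        print('Delivered to {} [{}]'.format(msg.topic(), msg.partition()))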