class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"
        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
def process_item(self, item, spider):
    url = "http://localhost:9200/articles/%s" % (item["publication"].lower())
    encoder = ScrapyJSONEncoder()
    json_body = encoder.encode(item)
    resp = requests.post(url, data=json_body)
    log.msg("Item added to elasticSearch node. Response: " + resp.text)
    return item
class HadoopExporter(BaseItemExporter):

    def __init__(self, hadoop, **kwargs):
        #self.con = file_write.Connection()
        #self.con.connect(hadoop.ip, hadoop.port)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        #self.seq = file_write.SeqFileSaver(self.con, '/common/crawler/%s/' % hadoop.username.replace(".", "/"),
        #                                   1, '%s' % hadoop.username.replace(".", "_"))
        self.encoding = 'utf-8'
        self.fields_to_export = None
        self.export_empty_fields = False
        self.writer = SeqWriter(os.path.join(Utils.settings['SEQFILE_DIR'],
                                             hadoop.username.replace(".", "/")),
                                hadoop.username.replace(".", "_"))

    def close_file(self):
        print "close"
        self.writer.close()
        #self.seq.set_is_end()
        #self.con.close()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass

    def export_item(self, item):
        value = self.encoder.encode(dict(self._get_serialized_fields(item)))
        self.writer.writeData(
            item['key'] if 'key' in item else item['url'],
            value
        )
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port, queue_type):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()
        self.queue_type = queue_type

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        queue_type = settings.get('QUEUE_TYPE', 'FIFO')
        # pass queue_type through, otherwise __init__ raises a TypeError
        return cls(host, port, queue_type)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        # push once: to the head for LIFO, to the tail for FIFO
        if self.queue_type == 'LIFO':
            self.server.lpush(key, data)
        else:
            self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server, redis_server = connection.from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items" % spider.name
class RedisPipeline(object):
    """
    Pushes serialized item into a redis. Specific for SocialSpiders
    """

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.set(key, data.decode('utf-8'))
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "{}_{}".format(spider.name, item['search_name'])
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # there is a small difference between the behaviour of JsonItemExporter.indent
        # and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
        # the addition of newlines everywhere
        json_indent = self.indent if self.indent is not None and self.indent > 0 else None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def _beautify_newline(self):
        if self.indent is not None:
            self.file.write(b'\n')

    def start_exporting(self):
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
class RedisPipeline(object):
    """Pushes serialized item into a scrapy_redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", "localhost")
        port = settings.get("REDIS_PORT", 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns scrapy_redis key based on given spider"""
        return "%s:items" % spider.name
class RedisStoragePipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.encoder.encode(item)
        if isinstance(item, GubaPostListItem):
            key = self.item_key_list(item, spider)
        if isinstance(item, GubaPostDetailItem):
            key = self.item_key_detail(item, spider)
        self.server.rpush(key, data)
        return item

    def item_key_list(self, item, spider):
        stock_id = item['stock_id']
        return "%s:list_items" % stock_id

    def item_key_detail(self, item, spider):
        stock_id = item['stock_id']
        return "%s:detail_items" % stock_id
class DockerhubExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')
        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info, signal=signals.spider_closed)

    def store_job_info(self):
        with open(self.job_path, 'w') as f:
            stats = self.crawler.stats.get_stats()
            job_info = {'stats': stats}
            job_info_json = self.json_encoder.encode(job_info)
            f.write(job_info_json)
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='', routing_key=key, body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
def __init__(self, recipients, mail, compressor, crawler):
    self.recipients = recipients
    self.mail = mail
    self.encoder = ScrapyJSONEncoder(crawler=crawler)
    self.files = defaultdict(compressor)
    self.num_items = 0
    self.num_errors = 0
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict) + "\n"))
def __init__(self, outputAddress, **kwargs):
    self._configure(kwargs, dont_fail=True)
    # self.file = file  # needs to be updated when migrating to proto-buff
    self.encoder = ScrapyJSONEncoder(**kwargs)
    self.first_item = True
    # creating ZMQ context and socket
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.PUB)
    # self.socket.setsockopt(zmq.LINGER, 100)
    self.outputAddress = outputAddress
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
class LetMeShopApiExporter(BaseItemExporter):
    api_end_point = ''
    method = 'POST'

    def __init__(self, api_base_url, auth_token, *args, **kwargs):
        super(LetMeShopApiExporter, self).__init__(*args, export_empty_fields=True, **kwargs)
        self.api_base_url = api_base_url
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.headers = {'Authorization': 'Token %s' % auth_token}

    def _fill_missing_fields(self, item, default_value=None):
        if self.fields_to_export is None:
            missing_keys = frozenset(item.fields.iterkeys()).difference(item.iterkeys())
        else:
            missing_keys = frozenset(self.fields_to_export).difference(item.iterkeys())
        for missing_key in missing_keys:
            item[missing_key] = item.fields[missing_key].get('default_value', default_value)
        return item

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        if include_empty is None:
            include_empty = self.export_empty_fields
        if include_empty:
            item = self._fill_missing_fields(item, default_value)
        return super(LetMeShopApiExporter, self)._get_serialized_fields(item, default_value, include_empty)

    @property
    def request_url(self):
        return urljoin(self.api_base_url, self.api_end_point)

    def export_item(self, item_or_items):
        if isinstance(item_or_items, (list, tuple)):
            item_list = item_or_items
            serialized = [dict(self._get_serialized_fields(item)) for item in item_list]
        else:
            item = item_or_items
            serialized = dict(self._get_serialized_fields(item))
        serialized = snake_case_to_camel_case(serialized)
        payload = self.encoder.encode(serialized)
        r = requests.request(self.method, self.request_url, data=payload, headers=self.headers)
        r.raise_for_status()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode
    """

    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overridden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
class AdSpiderBase(RedisSpider):
    task_encoder = ScrapyJSONEncoder().encode
    task_decoder = ScrapyJSONDecoder().decode

    def next_request(self):
        serialized_task = self.server.rpop(self.redis_key)
        if serialized_task:
            self.logger.info("Got task {}".format(serialized_task))
            return self.make_request_from_task(serialized_task,
                                               callback=self.parse,
                                               dont_filter=False)

    @staticmethod
    def tagfilter(tag):
        return isinstance(tag, Tag)
def parse(self, response):
    items = []
    for row in response.xpath('//table[@class="TblDataRecs"]/tr'):
        item = mylanguageexchangeItem()
        name = row.xpath('td[@class="userdata"]//a//b/text()').extract()
        item["name"] = [x.strip() for x in name]
        country = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[1]/td/text()').extract()
        item["country"] = [x.strip() for x in country]
        city = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[2]/td/text()').extract()
        item["city"] = [x.strip().strip('()') for x in city]
        native = row.xpath('td[@class="userdata"][@data-th="Native Language"]//td/text()').extract()
        item["native"] = [x.strip() for x in native]
        practicing = row.xpath('td[@class="userdata"][@data-th="Practicing Language"]//td/text()').extract()
        item["practicing"] = [x.strip() for x in practicing]
        desc = row.xpath('td[@class="userdata"][@data-th="Description"]//td/text()').extract()
        item["desc"] = [x.strip() for x in desc]
        items.append(item)
    _encoder = ScrapyJSONEncoder()
    with open('mylanguageexchange_crawled.json', 'w') as outfile:
        outfile.write(_encoder.encode(items))
def __init__(self):
    """
    Connect to Shopware REST Api using HTTP digest authentication.
    We need an ADMIN role with sufficient access to insert articles.

    Shopware4 (german) API Guide:
    http://wiki.shopware.de/_detail_861_487.html
    """
    self.name = settings['SHOPWARE_SERVICE_NAME']
    self.api_url = settings['SHOPWARE_API_BASE']
    self.access_token = settings['SHOPWARE_TOKEN_KEY']
    self.request_headers = {'Content-Type': 'application/json; charset=utf-8',
                            'Accept': 'application/json'}
    self.encoder = ScrapyJSONEncoder()
    self.node = {}
    # shopware minimal default item
    self.default_item = RestItem({
        'taxId': 1,
        #'tax': 19,
        'name': 'nexus',
        'mainDetail': {
            'number': 'nex24',
            'prices': [{
                'customerGroupKey': 'EK',
                'basePrice': 16,
                'price': 20,  # shop will add VAT (if configured that way)
            }],
            # 'attribute': {
            #     'supplier_url': 'http://example.net',
            #     'supplierUrl': 'http://example.net',
            #     # 'attr19': 'http://example.net',
            # },
        },
        'active': True,
        'supplier': 'example com',
        'categories': [
            {'id': 5},
            {'id': 3},
        ],
        'images': [{
            #'id': '1',  ## this one is bugged in shopware (doesnt add image to article)
            #'mediaId': '1',  # needs deduplication on update
            'link': 'http://shopware.local/templates/_emotion/frontend/_resources/images/logo.jpg',
        }],
        'attribute': {
            'attr19': 'http://example.net',
        },
        'description': 'Some Article',
        'descriptionLong': 'Some Article Description',
    })
class RabbitMQItemPublisherPipeline(object):

    def __init__(self, host, port, user, password, virtual_host, exchange, routing_key, queue):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.virtual_host = virtual_host
        credentials = pika.PlainCredentials(self.user, self.password)
        parameters = pika.ConnectionParameters(self.host,
                                               self.port,
                                               self.virtual_host,
                                               credentials)
        self.connection = pika.BlockingConnection(parameters=parameters)
        self.channel = self.connection.channel()
        self.exchange = exchange
        self.routing_key = routing_key
        self.queue = queue
        self.channel.exchange_declare(exchange=exchange,
                                      exchange_type="direct",
                                      durable=True)
        self.channel.queue_declare(queue=queue, durable=True)
        self.channel.queue_bind(exchange=exchange,
                                routing_key=routing_key,
                                queue=queue)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("RABBITMQ_HOST"),
            port=crawler.settings.get("RABBITMQ_PORT"),
            user=crawler.settings.get("RABBITMQ_USER"),
            password=crawler.settings.get("RABBITMQ_PASSWORD"),
            virtual_host=crawler.settings.get("RABBITMQ_VIRTUAL_HOST"),
            exchange=crawler.settings.get("RABBITMQ_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_ROUTING_KEY"),
            queue=crawler.settings.get("RABBITMQ_QUEUE"),
        )

    def close_spider(self, spider):
        self.channel.close()
        self.connection.close()

    def process_item(self, item, spider):
        data = self.encoder.encode(item)
        self.channel.basic_publish(
            exchange=self.exchange,
            routing_key=self.routing_key,
            body=data,
        )
        return item
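For context, a hypothetical settings fragment that would wire this pipeline up. Only the setting names come from `from_crawler` above; the module path and values are illustrative placeholders, not taken from the original project.

# settings.py -- illustrative sketch; only the RABBITMQ_* keys are dictated by from_crawler above
ITEM_PIPELINES = {
    "myproject.pipelines.RabbitMQItemPublisherPipeline": 300,  # assumed module path
}
RABBITMQ_HOST = "localhost"
RABBITMQ_PORT = 5672
RABBITMQ_USER = "guest"
RABBITMQ_PASSWORD = "guest"
RABBITMQ_VIRTUAL_HOST = "/"
RABBITMQ_EXCHANGE = "scrapy"
RABBITMQ_ROUTING_KEY = "items"
RABBITMQ_QUEUE = "items"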
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    """
    Allows exporting to JSON directly as Unicode.
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        kwargs["ensure_ascii"] = False
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + u"\n")

    def serialize_field(self, field, name, value):
        return value  # DON'T call super version, this encodes the Unicode.
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        # ScrapyJSONEncoder handles types that plain JSON cannot, such as datetimes and sets
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def export_item(self, item):
        # serialize the selected fields and turn the item into a dict
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        # write the encoded line to the file
        self.file.write(to_bytes(data, self.encoding))
def process_item(self, item, spider):
    due_date = item['due_date']
    last_update = item['last_update']
    data_file_name = './data/' + due_date.strftime('%y%m%d') + '_' + \
        last_update.strftime('%y%m%d') + '.json'
    os.makedirs(os.path.dirname(data_file_name), exist_ok=True)
    with open(data_file_name, 'w+') as f:
        f.write(ScrapyJSONEncoder().encode(item['provinces']))
    print('Saved to data file:', data_file_name)
    return item
def process_item(self, item, spider):
    """
    Handle items. Send items to RabbitMQ.

    :param item:
    :param spider:
    :return:
    """
    self.rbmq_conn.send_message(channel=self.publisher,
                                message=ScrapyJSONEncoder().encode(dict(item)),
                                exchange_name=self.exchange_name,
                                routing_key=self.routing_key)
    logger.debug(f'Item scraped')
    return item
class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"
        s = {'foo'}
        ss = ['foo']
        dt_set = {dt}
        dt_sets = [dts]
        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds]),
                              (s, ss), (dt_set, dt_sets)]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
class JsonLinesItemSplitFileExporter(BaseItemExporter):
    """An item exporter to organize json lines into separate folders.

    Attributes:
        _configure (func): Used to configure the Item Exporter by setting the options dictionary.
        encoder (ScrapyJSONEncoder): Encoder used to convert scrapy items into a json format line.
    """

    def __init__(self, **kwargs):
        """Initialize the configuration dictionary and encoder.

        Args:
            **kwargs: Arbitrary keyword arguments for the options dictionary.
        """
        # If dont_fail is set, it won't raise an exception on unexpected options
        self._configure(kwargs, dont_fail=True)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder()
        super(JsonLinesItemSplitFileExporter, self).__init__()

    def export_item(self, item):
        """Export Scrapy items to specific files based on the article_type.

        Args:
            item (scrapy.Item): A Scrapy item that contains the complete scraped information
                for an article/product.
        """
        # Serialize the item, and perform encoding to create a python dictionary
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + os.linesep
        # If there is only one item in article_type, the path (folders) is just
        # scraped_data/spider.name/article_type. Otherwise we combine all of the article_type
        # list except the last item into a path, such as
        # scraped_data/spider.name/article_type[0]/article_type[1], and the item becomes a json
        # line placed in scraped_data/spider.name/article_type[0]/article_type[1]/article_type[2].jl.
        if len(item['article_type']) == 1:
            path = os.path.join("scraped_data", item["spider_name"])
            item_path = os.path.join(path, item['article_type'][0]) + ".jl"
        else:
            path = os.path.join(os.path.join("scraped_data", item["spider_name"]),
                                os.path.join(*item['article_type'][:-1]))
            item_path = os.path.join(path, item['article_type'][-1]) + ".jl"
        if not os.path.exists(path):
            os.makedirs(path)
        # Write in append and byte mode
        open(item_path, 'a+b').write(to_bytes(data, self.encoding))
class SortedJsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('indent', 4)
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)
        self.items = []

    def export_item(self, item):
        self.items.append(dict(self._get_serialized_fields(item)))

    def finish_exporting(self):
        data = self.encoder.encode(sorted(self.items, key=sort_key))
        self.file.write(to_bytes(data, self.encoding))
class KafkaPipeline(object):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_message(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        ...  # body truncated in the original snippet
class KafkaProducerPipeline(object):

    def __init__(self, kafka_bootstrap_server):
        self.kafka_bootstrap_server = []
        self.kafka_bootstrap_server.append(kafka_bootstrap_server)
        self.collection_name = 'articles'
        self.encoder = ScrapyJSONEncoder()
        # counter incremented in process_item; must be initialized here
        self.index = 0

    @classmethod
    def from_crawler(cls, crawler):
        # pull in information from settings.py
        return cls(kafka_bootstrap_server=crawler.settings.get('KAFKA_BOOTSTRAP_SERVER'))

    def open_spider(self, spider):
        print("spider name: ", spider.name)
        # initializing py-Kafka producer
        self.producer = KafkaProducer(bootstrap_servers=self.kafka_bootstrap_server)
        print("kafka_bootstrap_server: ", self.kafka_bootstrap_server)
        if hasattr(spider, 'collection_name'):
            print("spider collection_name: ", spider.collection_name)
            self.collection_name = spider.collection_name

    def close_spider(self, spider):
        # clean up when spider is closed
        self.producer.flush(timeout=60)
        self.producer.close(timeout=60)

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            print("valid - inside process_item...", item['source'], ': ', item['headline'])
            # self.producer.send('articles', self.encoder.encode(item).encode())
            key = str(ord(item['source'][0])) + str(ord(item['source'][1]))
            self.producer.send('articles',
                               value=self.encoder.encode(item).encode(),
                               key=key.encode())
            self.index += 1
            logging.debug("News item sent by Kafka Producer!")
        return item
class ScrapyKafkaTopicWriter(KafkaTopicWriter):
    """
    Kafka writer which knows how to handle Scrapy items: they are
    serialized to JSON, and "_id" field is used as Kafka key if present.
    """

    def __init__(self, *args, **kwargs):
        self._encoder = ScrapyJSONEncoder()
        self._exporter = PythonItemExporter(binary=False)
        kwargs.setdefault('value_serializer', self._serialize_value)
        super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)

    def write_item(self, item):
        key = item.get('_id', None)
        msg = self._exporter.export_item(item)
        return self.write(key, msg)

    def _serialize_value(self, value):
        return self._encoder.encode(value).encode('utf8')
class AddItemPipeline(object):
    """
    Pushes serialized item into a RQ
    """

    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name
        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        db = settings.get('REDIS_DB', 0)
        queue_name = settings.get('RQ_QUEUE', 'default')
        store_id = int(settings.get('STORE', 0))
        return cls(host, port, db, queue_name, store_id)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        ## get global Store URL mapping
        store_id = self.store_id
        if store_id == 0:  # equality, not identity, for the integer check
            raise DropItem('Store not set and no Store URL mapping')

        ## assign queue parameters
        item['store_id'] = store_id
        callback = 'worker.save_product_to_db'
        event = self.encoder.encode(dict(queue=self.queue_name,
                                         value=item,
                                         time=time.time()))

        ## push item to redis queue
        self.queue.enqueue(callback, event)
        return item
class RedisListPipeline:
    DEFAULT_QUEUE = 'queue'
    DEFAULT_MAX_RETRY = 5
    serializer = ScrapyJSONEncoder().encode

    def __init__(self, conn_url: str, queue: str, max_retry=None):
        try:
            import redis
        except ImportError:
            raise NotConfigured('missing redis library')
        self._conn = redis.from_url(conn_url)
        self.queue = queue or self.DEFAULT_QUEUE
        self.max_retry = max_retry or self.DEFAULT_MAX_RETRY

    @classmethod
    def from_crawler(cls, crawler):
        if hasattr(crawler.spider, 'queue'):
            queue = crawler.spider.queue
        else:
            queue = crawler.settings.get('REDIS_DEFAULT_QUEUE')
        return cls(
            conn_url=crawler.settings.get('REDIS_CONNECTION_URL'),
            queue=queue,
            max_retry=crawler.settings.get('REDIS_MAX_RETRY')
        )

    def process_item(self, item, spider):
        data = self.serializer(item)
        try_time = 0
        while try_time < self.max_retry:
            try:
                self._conn.rpush(self.queue, data)
                return item
            except Exception:
                spider.logger.error('process item failed {}'.format(item))
                try_time += 1
        spider.logger.error('Give up item for failed {} times {}'.format(try_time, item))
        return item

    def close(self):
        self._conn.close()
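A minimal settings sketch for the pipeline above; only the key names are taken from `from_crawler`, while the values and module path are assumptions for illustration.

# settings.py -- illustrative sketch; only the REDIS_* keys come from RedisListPipeline.from_crawler
ITEM_PIPELINES = {
    "myproject.pipelines.RedisListPipeline": 300,  # assumed module path
}
REDIS_CONNECTION_URL = "redis://localhost:6379/0"
REDIS_DEFAULT_QUEUE = "queue"
REDIS_MAX_RETRY = 5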
class CustomJsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault("ensure_ascii", False)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'')

    def finish_exporting(self):
        self.file.write(b'\n')

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        bytes = to_bytes(self.encoder.encode(itemdict))
        self.file.write(bytes)
        self.file.write(b"\n")
class SendToBrokerPipeline(object):

    def __init__(self):
        self.publisher = Publisher('data_distributor')
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        # Runs the broker send in a separate thread to prevent it from blocking
        # on single items
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        item_dict = dict(item)
        data = self.encoder.encode(item_dict)
        self.publisher.send_message(data, 'articles')
        return item
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    logger = logging.getLogger('UnicodeJsonLinesLogging')

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(ensure_ascii=False, encoding='UTF-8', **kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.logger.info('==============')
        self.file.write(to_bytes(data, self.encoding))

    def finish_exporting(self):
        pass
class BasePipeline(object):
    """ """

    def __init__(self):
        self.headers = {'Content-Type': 'application/json'}
        self._encoder = ScrapyJSONEncoder()

    def _item_to_json(self, obj):
        """
        Utility that takes an object, serializes it using ScrapyJSONEncoder
        and returns a deserialized version of the data. We use this to convert
        Scrapy items to JSON objects we can send in requests.

        :param obj:
        :return dict:
        """
        return json.loads(self._encoder.encode(obj))
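A minimal sketch of how a subclass might use `_item_to_json`; the class name, endpoint URL and use of `requests` are assumptions for illustration, not part of the original pipeline.

import requests


class PostItemPipeline(BasePipeline):
    # Hypothetical subclass: POSTs each item as a JSON dict to an API endpoint.
    API_ENDPOINT = "http://localhost:8000/items/"  # assumed placeholder URL

    def process_item(self, item, spider):
        # Convert the Scrapy item into a plain dict via the shared encoder helper,
        # then send it with the JSON headers prepared in BasePipeline.__init__.
        payload = self._item_to_json(item)
        requests.post(self.API_ENDPOINT, json=payload, headers=self.headers)
        return item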
class JsonItemExporter(JsonLinesItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[")

    def finish_exporting(self):
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b",\n")
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
class GeoJsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'{ "type": "FeatureCollection","features":[\n')

    def finish_exporting(self):
        self.file.write(b"\n]}")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        data = self.encoder.encode(item)
        self.file.write(to_bytes(data, self.encoding))
class JsonItemExporter(JsonLinesItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
class FanItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b'{"product": [')

    def finish_exporting(self):
        self.file.write(b'\n]}')

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[\n")

    def finish_exporting(self):
        self.file.write(b"\n]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
class KafkaPipeline:
    # https://github.com/dfdeshom/scrapy-kafka/blob/master/scrapy_kafka/pipelines.py

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        spider.log(msg)
        self.producer.send(self.topic, msg.encode('utf-8'))
        spider.log("Sent to kafka.")

    @classmethod
    def from_settings(cls, settings):
        k_hosts = settings.get('KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')
        prod = KafkaProducer(bootstrap_servers=k_hosts)
        return cls(prod, topic)
class KafkaPipeline:

    def __init__(self, producer, topic):
        self.p = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        brokers = settings.get('KAFKA_PRODUCER_BROKERS')
        topic = settings.get('KAFKA_PRODUCER_TOPIC')
        producer = Producer({
            'bootstrap.servers': brokers,
        })
        return cls(producer, topic)

    def process_item(self, item, spider):
        msg = self.encoder.encode(item)
        self.p.produce(self.topic, msg, callback=delivery_report)
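The snippet above passes a `delivery_report` callback into `Producer.produce` without defining it. A minimal sketch of such a callback, following the standard confluent-kafka `(err, msg)` signature; the logging behaviour here is an assumption, not taken from the original code.

def delivery_report(err, msg):
    # Standard confluent-kafka delivery callback: invoked once per produced message,
    # either with an error or with metadata about the successful delivery.
    if err is not None:
        print("Delivery failed: {}".format(err))
    else:
        print("Delivered to {} [partition {}]".format(msg.topic(), msg.partition()))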