def process_item(self, item, spider):
    """Validate that 'name' and 'gclass' are non-empty strings.

    Bug fix: the original *returned* DropItem instead of raising it;
    Scrapy only drops an item when DropItem is raised, so the exception
    object itself was passed downstream as if it were an item.

    :param item: scraped item to validate
    :param spider: the spider that produced the item
    :raises DropItem: when 'name' or 'gclass' is missing or not a str
    :return: the unchanged item when both fields are valid
    """
    adapter = ItemAdapter(item)
    if adapter['name'] is None or not isinstance(adapter['name'], str):
        raise DropItem("name field invalid...")
    if adapter['gclass'] is None or not isinstance(adapter['gclass'], str):
        raise DropItem("gclass field invalid")
    return item
def test_exporter_custom_serializer(self):
    """A serialize_field() override must apply to items and plain dicts."""
    class CustomItemExporter(BaseItemExporter):
        def serialize_field(self, field, name, value):
            # bump 'age' by one; delegate everything else to the base class
            if name == 'age':
                return str(int(value) + 1)
            return super().serialize_field(field, name, value)

    item = self.item_class(name='John', age='22')
    adapter = ItemAdapter(item)
    exporter = CustomItemExporter()

    self.assertEqual(
        exporter.serialize_field(
            adapter.get_field_meta('name'), 'name', adapter['name']),
        'John')
    self.assertEqual(
        exporter.serialize_field(
            adapter.get_field_meta('age'), 'age', adapter['age']),
        '23')

    plain = {'name': 'John', 'age': '22'}
    self.assertEqual(exporter.serialize_field({}, 'name', plain['name']), 'John')
    self.assertEqual(exporter.serialize_field({}, 'age', plain['age']), '23')
def test_get_output_value_list(self):
    """Getting output value must not remove value from item"""
    original = self.item_class(name=['foo', 'bar'])
    loader = ItemLoader(item=original)
    # reading the output value must leave the stored value intact
    self.assertEqual(loader.get_output_value('name'), ['foo', 'bar'])
    result = loader.load_item()
    self.assertIsInstance(result, self.item_class)
    self.assertEqual(
        ItemAdapter(result).asdict(),
        {'name': ['foo', 'bar']})
def process_item(self, item, spider):
    """Persist the item into the MongoDB collection it names.

    Every storage pipeline class must define this exact method; it
    performs the actual save. The target collection is taken from the
    item's 'domain_collection' field.
    """
    collection = self.db[item['domain_collection']]
    collection.insert_one(ItemAdapter(item).asdict())
    return item
def test_add_value_singlevalue_singlevalue(self):
    """Values added after initialization should be appended"""
    original = self.item_class(name='foo')
    loader = ItemLoader(item=original)
    loader.add_value('name', 'bar')
    result = loader.load_item()
    self.assertIsInstance(result, self.item_class)
    self.assertEqual(
        ItemAdapter(result).asdict(),
        {'name': ['foo', 'bar']})
def close_spider(self, spider):
    """Flush all collected items to the CSV file and close it.

    Bug fix: the original guard was ``if self.items is not []`` — an
    identity comparison against a brand-new list, which is always True.
    Replaced with a plain truthiness test.
    """
    if self.items:
        for item in self.items:
            adapter = ItemAdapter(item)
            # a fresh writer per item: field_names() may differ between items
            writer = csv.DictWriter(
                self.file,
                fieldnames=adapter.field_names(),
                restval='',
                extrasaction='ignore',
                delimiter=',',
                quoting=csv.QUOTE_NONNUMERIC,
                quotechar="\"",
            )
            # write the header only while the file is still empty
            if self.file.tell() == 0:
                writer.writeheader()
            writer.writerow(adapter.asdict())
    self.file.close()
def process_item(self, item, spider):
    """Append name, type and time as one tab-separated line to the CSV."""
    adapter = ItemAdapter(item)
    fields = (adapter['name'], adapter['m_type'], adapter['m_time'])
    output = "\t".join(str(f) for f in fields) + "\n"
    with open('./scrapy_result.csv', 'a+', encoding="utf-8") as result:
        result.write(output)
    return item
def process_item(self, item, spider):
    """Write every <table> found in the item's HTML to a numbered .tsv file."""
    adapter = ItemAdapter(item)
    html = adapter.get('table_html')
    if html is not None:
        for index, table in enumerate(html.css("table"), start=1):
            path = os.path.join(
                spider.settings['FILES_STORE'],
                adapter['out_dir'],
                'table-%i.tsv' % index,
            )
            with open(path, "w") as out:
                # one tab-separated line per table row
                for row in table.css('tr'):
                    out.write("\t".join(row.css('td span::text').getall()) + "\n")
            spider.logger.debug('Table Path: %s' % path)
    return item
def process_item(self, item, spider):
    """Drop items whose 'bbb_url' has already been seen.

    Bug fix: the original constructed ``DropItem(adapter)`` without
    raising it, so duplicates were logged but still returned and passed
    on to later pipeline stages. DropItem must be raised to take effect.

    NOTE(review): nothing here adds to ``self.seen_urls`` — presumably
    it is populated elsewhere; confirm against the rest of the class.
    """
    adapter = ItemAdapter(item)
    if adapter["bbb_url"] in self.seen_urls:
        spider.logger.info(f"duplicate found {adapter['bbb_url']}")
        raise DropItem(f"duplicate found {adapter['bbb_url']}")
    return item
def process_item(self, item, spider):
    """Append the item's values as one CSV row to Requests.csv.

    Improvement: the file is opened with a ``with`` block so it is
    closed even if writing raises; the original opened/closed the
    handle manually on each call via an instance attribute.
    """
    # ensure the optional date fields exist so the column count is stable
    item.setdefault('start_date', '')
    item.setdefault('end_date', '')
    with open('Requests.csv', 'a', encoding='utf-8', newline='') as out:
        csv.writer(out, delimiter=',').writerow(ItemAdapter(item).values())
    return item
def process_item(self, item, spider):
    """Upsert the item into the collection named after the spider."""
    document = ItemAdapter(item).asdict()
    # match on the full document; insert it when no identical one exists
    self.db[spider.name].update_one(
        filter=document,
        update={"$set": document},
        upsert=True,
    )
    return item
def print_items(self, lvl=None, colour=True):
    """Pretty-print scraped items, optionally restricted to one level.

    :param lvl: level key into self.items; None means all levels
    :param colour: whether to colorize the output
    """
    if lvl is None:
        selected = [entry for batch in self.items.values() for entry in batch]
    else:
        selected = self.items.get(lvl, [])
    print("# Scraped Items ", "-" * 60)
    display.pprint(
        [ItemAdapter(entry).asdict() for entry in selected],
        colorize=colour,
    )
def _exporter_for_item(self, item):
    """Return the JSON exporter for the item's company, creating it lazily.

    One exporter (and output file) is kept open per company in
    self.company_to_exporter.
    """
    company = ItemAdapter(item)['company']
    if company not in self.company_to_exporter:
        handle = open(os.path.join(self.path, f'{company}.json'), 'wb')
        exporter = JsonItemExporter(handle, indent=4)
        exporter.start_exporting()
        self.company_to_exporter[company] = exporter
    return self.company_to_exporter[company]
def post_process(self, output):
    """Fail the contract when any scraped item lacks a required field."""
    for candidate in output:
        if not is_item(candidate):
            continue
        adapter = ItemAdapter(candidate)
        missing = [field for field in self.args if field not in adapter]
        if missing:
            missing_fields = ", ".join(missing)
            raise ContractFail(f"Missing fields: {missing_fields}")
def _get_serialized_fields(self, item, default_value=None, include_empty=None,
                           pre=None, field_filter=None):
    """Copy from BaseItemExporter """
    # Yields (field_name, serialized_value) pairs honouring
    # fields_to_export / export_empty_fields, with two extras over the
    # stock implementation: `pre` is a name prefix joined onto each field
    # name, and `field_filter` is a collection of prefixed names to skip.
    item = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    if self.fields_to_export is None:
        if include_empty:
            # all declared fields, even ones with no value on this item
            field_iter = item.field_names()
        else:
            field_iter = item.keys()
    else:
        if include_empty:
            field_iter = self.fields_to_export
        else:
            field_iter = (x for x in self.fields_to_export if x in item)
    for field_name in field_iter:
        k = None
        if field_filter:
            if pre is not None:
                # prefixed name (pre_join is a project helper — presumably
                # something like "pre.field"; confirm against its definition)
                k = pre_join(pre, field_name)
            if k in field_filter:
                continue  # this prefixed field is explicitly filtered out
        if field_name in item:
            field_meta = item.get_field_meta(field_name)
            # pass the prefix/filter through so nested serialization can
            # apply the same filtering recursively
            value = self.serialize_field(
                field_meta,
                field_name,
                item[field_name],
                pre=k,
                field_filter=field_filter,
            )
        else:
            # field requested via include_empty but absent on the item
            value = default_value
        yield field_name, value
def test_nested_item(self):
    """Items nested as field values must be exported recursively."""
    inner = self.item_class(name='Joseph\xa3', age='22')
    middle = self.item_class(name='Maria', age=inner)
    outer = self.item_class(name='Jesus', age=middle)
    self.ie.start_exporting()
    self.ie.export_item(outer)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    expected = {
        'name': 'Jesus',
        'age': {'name': 'Maria', 'age': ItemAdapter(inner).asdict()},
    }
    self.assertEqual(exported, [expected])
def process_item(self, item, spider):
    """Drop the item when a document with the same 'id' already exists.

    Cleanup: the original built an ``item_dict`` with 'id' and 'src'
    that was never used anywhere — removed as dead code.

    :raises DropItem: when a duplicate 'id' is found in the collection
    """
    adapter = ItemAdapter(item)
    if self.db[self.collection_name].find_one({"id": adapter["id"]}):
        raise DropItem(f"Duplicate item found: {item!r}")
    return item
def process_item(self, item, spider):
    """Deduplicate items by their URL with the last path segment removed."""
    adapter = ItemAdapter(item)
    # strip leading/trailing slashes, then drop everything after the
    # final '/' so items under the same base URL count as duplicates
    url = adapter['url'].strip('/')
    base_url = url[:url.rfind('/')]
    if base_url in self.store_urls:
        raise DropItem("Duplicate url found: %r" % item)
    self.store_urls.add(base_url)
    return item
def parse_item(self, response):
    """Scrape name, url and price from the response body via regexes.

    Fields whose regex does not match are simply left unset.
    """
    adapter = ItemAdapter(self.item_cls())
    name_match = self.name_re.search(response.text)
    if name_match:
        adapter['name'] = name_match.group(1)
    adapter['url'] = response.url
    price_match = self.price_re.search(response.text)
    if price_match:
        adapter['price'] = price_match.group(1)
    return adapter.item
def test_header_export_two_items(self):
    """The CSV header must be written only once even across two exports."""
    # exercise both the item object and its plain-dict form
    for source in (self.i, ItemAdapter(self.i).asdict()):
        output = BytesIO()
        exporter = CsvItemExporter(output)
        exporter.start_exporting()
        exporter.export_item(source)
        exporter.export_item(source)
        exporter.finish_exporting()
        self.assertCsvEqual(
            output.getvalue(),
            b'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
def process_item(self, item, spider):
    """Insert or update the item (keyed by '_id') when it has a title.

    Fix: the original used ``cursor.count()``, which was deprecated in
    pymongo 3.7 and removed in pymongo 4; existence is now checked with
    a single ``find_one`` call instead.
    """
    if item['product_title'] != []:
        collection = self.db[self.collection_name]
        data = ItemAdapter(item).asdict()
        if collection.find_one({"_id": item['_id']}) is None:
            collection.insert_one(data)
        else:
            collection.update_one({"_id": item['_id']}, {"$set": data})
    return item
def process_item(self, item, spider):
    """Serialize TopicItems to per-topic JSON files; pass others through."""
    # Only handle TopicItems — anything else goes to the next stage as-is
    if not isinstance(item, BaseTopicItem):
        return item
    adapter = ItemAdapter(item)
    topic_id = adapter.get('topic_id')
    self.logger.debug(f'exporting TopicItem (id: {topic_id})')
    path = prepare_path(
        base_dir=self.base_dir_path,
        dirname_template=self.dirname_tmplt,
        filename_template=self.filename_tmplt,
        item=item,
    )
    with path.open('w') as handle:
        json.dump(adapter.asdict(), handle)
    self.logger.debug(f'exported TopicItem (id: {topic_id})')
    return item
def insert_into_comment(conn, item):
    """Insert (or replace) one comment row built from the scraped item."""
    adapter = ItemAdapter(item)
    query = """INSERT OR REPLACE INTO comment (comment_id, comment_text, comment_date, comment_author_id, comment_author_username,article_url,article_title) VALUES (?, ?, ?, ?, ?, ?, ?);"""
    # column order must match the VALUES placeholders above
    columns = (
        "comment_id",
        "comment_text",
        "comment_date",
        "comment_author_id",
        "comment_author_username",
        "article_url",
        "article_title",
    )
    params = tuple(adapter[column] for column in columns)
    execute_sql_param(conn, query, params)
def update_body(self, item):
    """Append a summary dict for the scraped item to self.body."""
    data = ItemAdapter(item).asdict()
    domain, path = self.purify_url(data['url'])
    entry = {
        'image': data['image_url'],
        'domain': domain,
        'path': path,
        'des': data['description'],
        'title': data['title'],
    }
    self.body.append(entry)
def test_false(self):
    """Non-items — including item *classes* — must not be detected as items."""
    non_items = (
        int,
        sum,
        1234,
        object(),
        "a string",
        b"some bytes",
        ["a", "list"],
        ("a", "tuple"),
        {"a", "set"},
        dict,
        # classes (as opposed to instances) are not items
        ScrapyItem,
        DataClassItem,
        ScrapySubclassedItem,
        AttrsItem,
        PydanticModel,
    )
    for candidate in non_items:
        self.assertFalse(is_item(candidate))
    for cls in (list, int, tuple):
        self.assertFalse(ItemAdapter.is_item_class(cls))
def test_add_value_list_singlevalue(self):
    """Values added after initialization should be appended"""
    original = self.item_class(name=["foo", "bar"])
    loader = ItemLoader(item=original)
    loader.add_value("name", "qwerty")
    result = loader.load_item()
    self.assertIsInstance(result, self.item_class)
    self.assertEqual(
        ItemAdapter(result).asdict(),
        {"name": ["foo", "bar", "qwerty"]})
def process_item(self, item, spider):
    """Persist the item as a WeiXinCnpcNews row; roll back on failure.

    Bug fixes: the original did ``raise (f"Missing content...")`` —
    raising a plain string is a TypeError in Python 3 — and used a bare
    ``except`` that silently swallowed every error (including SystemExit
    and KeyboardInterrupt). Now raises a proper ValueError and catches
    only Exception.
    """
    new_item = WeiXinCnpcNews(
        title=item.get('title'),
        author=item.get('author'),
        pre_title=item.get('pre_title'),
        preview_img_link=item.get('preview_img_link'),
        pub_time=item.get('pub_time'),
        content=item.get('content'),
        crawl_time=item.get('crawl_time'),
        url=item.get('url'),
        categories=item.get('categories'),
        images_url=str(item.get('image_urls')),
        images=str(item.get('images')),
    )
    adapter = ItemAdapter(item)
    try:
        if adapter.get('content'):
            spider.session.add(new_item)
            spider.session.commit()
        else:
            raise ValueError(f"Missing content in {item}")
    except Exception:
        # keep the session usable after a failed commit
        spider.session.rollback()
    return item
def process_item(self, item, spider):
    """Normalize or predict the paper's 'type'; drop incomplete items.

    Bug fix: the original *returned* DropItem instead of raising it, so
    incomplete items flowed downstream as exception objects. Also avoids
    shadowing the builtin ``type``.

    :raises DropItem: when 'title' or 'description' is missing
    :return: the item with its 'type' field refined
    """
    adapter = ItemAdapter(item=item)
    if not (adapter.get('title') and adapter.get('description')):
        raise DropItem(
            f"Missing field in paper at {adapter.get('source')}")
    refined = self.get_refine_type(str(adapter.get("type")))
    if refined == "UNKNOW":
        # fall back to the text classifier when the raw type is unknown
        text = [
            str(adapter.get('title')) + str(adapter.get('description'))
        ]
        refined = self.classifier.predict(text)[0]
    adapter.update({'type': refined})
    return adapter.item
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
    """Return the fields to export as an iterable of tuples
    (name, serialized_value)
    """
    item = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    if self.fields_to_export is None:
        if include_empty:
            # all declared fields, even ones with no value on this item
            field_iter = item.field_names()
        else:
            field_iter = item.keys()
    elif isinstance(self.fields_to_export, Mapping):
        # mapping form: {item_field_name: output_field_name}
        if include_empty:
            field_iter = self.fields_to_export.items()
        else:
            field_iter = ((x, y) for x, y in self.fields_to_export.items()
                          if x in item)
    else:
        # plain sequence of field names, exported under their own names
        if include_empty:
            field_iter = self.fields_to_export
        else:
            field_iter = (x for x in self.fields_to_export if x in item)
    for field_name in field_iter:
        # field_iter yields either a bare name or an (input, output) pair
        if isinstance(field_name, str):
            item_field, output_field = field_name, field_name
        else:
            item_field, output_field = field_name
        if item_field in item:
            field_meta = item.get_field_meta(item_field)
            value = self.serialize_field(field_meta, output_field,
                                         item[item_field])
        else:
            # field requested via include_empty but absent on the item
            value = default_value
        yield output_field, value
async def process_item(self, item, spider):
    """Upsert scraped documents into MongoDB.

    Courses are upserted directly, keyed by (doctype, id). Other
    doctypes belong to a user: grades are sampled before and after the
    upsert — presumably so changes can be pushed to the user's
    notification systems, though the dispatch loop below is empty.
    """
    print(item)  # debug output left in by the author
    if item["doctype"] == "course":
        self.db.Courses.find_one_and_update(
            {
                "doctype": "course",
                "id": item["id"]
            },
            {"$set": ItemAdapter(item).asdict()},
            upsert=True,
        )
        return item
    # non-course docs: look up the owning user
    user = self.db.Users.find_one({
        "doctype": "user",
        "_id": item["user_id"]
    })
    if not user.get("notifications"):
        # NOTE(review): called with only a filter and no update document —
        # looks incomplete; confirm intended behavior
        self.db.Assignments.find_one_and_update({"doctype": "assignment"})
    else:
        old = await self.process_grades(
            user)  # getting the current grades as an int
        # inserting the doc into db
        # collection name is the capitalized, pluralized doctype,
        # e.g. "assignment" -> "Assignments"
        self.db[item["doctype"][0].upper() + item["doctype"][1:] +
                "s"].find_one_and_update(
                    {
                        "doctype": item["doctype"],
                        "id": item["id"]
                    },
                    {"$set": ItemAdapter(item).asdict()},
                    upsert=True,
                )
        current = await self.process_grades(user)  # updated grade count
        # NOTE(review): `old` and `current` are computed but never
        # compared, and the loop body is empty — notification dispatch
        # appears unfinished
        for system in user["notifications"]:
            pass
    return item