def save_items(self, items):
    """Persist scraped items: INSERT each item, fall back to UPDATE keyed on url.

    Values are now passed as query parameters instead of being quoted into
    the SQL string, which removes the SQL-injection/quoting bugs. Table and
    column names still have to be interpolated (they cannot be parameters),
    but they come from item class/field names, not scraped text.
    """
    for item in items:
        table_name = item.__class__.__name__
        item_dict = ItemAdapter(item).asdict()
        item_dict = self.__check_size(item_dict)
        item_dict = self.__clean_dict(item_dict)
        columns = ', '.join(item_dict.keys())
        placeholders = ', '.join(['%s'] * len(item_dict))
        insert_sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
            table_name, columns, placeholders)
        try:
            self._cursor.execute(insert_sql, list(item_dict.values()))
            print(insert_sql)
        except Exception:
            # INSERT failed -- presumably a duplicate url -- so UPDATE the
            # existing row instead. The url is a parameter, not interpolated.
            url = item_dict.pop('url')
            set_str = ", ".join("{}=%s".format(k) for k in item_dict.keys())
            update_sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                table_name, set_str)
            self._cursor.execute(update_sql, list(item_dict.values()) + [url])
            print(update_sql)
        self._connection.commit()
def process_item(_item: ShiXiSengJobItem, _spider: ShixiSeng) -> ShiXiSengJobItem:
    """Data-cleaning logic for a shixiseng job item.

    Blanks become None, tag lists are trimmed element-wise, the job detail
    is normalized to HTML source, and date-like fields are truncated to the
    date part. Dead ``pass`` statements from the original were removed.
    """
    # Replace empty values with None.
    adapter = ItemAdapter(_item)
    for i in adapter.keys():  # type: str
        if len(adapter[i]) == 0:
            adapter[i] = None
    # Validate / normalize each remaining field.
    for i in adapter.keys():
        if adapter[i] is None or i in ('details_url', 'job_type'):
            # Detail URL (and job type) kept verbatim.
            pass
        elif i in ('job_good_list', 'company_tags'):
            # Both tag lists: strip leading/trailing whitespace element-wise.
            adapter[i] = list(map(trim_all, adapter[i]))
        elif i == 'job_detail':
            # Stored as HTML source code; normalize the markup via lxml.
            adapter[i] = tostring(element_or_tree=adapter[i][0], encoding=str)
            # Strip the stray control characters left behind -- they break
            # the downstream Hive data load.
            adapter[i] = trim_all(adapter[i])
        else:
            # Everything else: extract the first element as a string.
            adapter[i] = trim_all(adapter[i][0])
            if i in ('dead_line', 'profile_requirement'):
                # Keep only the date part after the full-width colon;
                # the textual label before it is discarded.
                adapter[i] = adapter[i].split('：')[1]
    return adapter.item
def process_item(self, item, spider):
    """Create InvestmentName/InvestmentScrape rows for investments in the
    item that do not already exist for the item's platform.

    The duplicated creation code is factored into ``_create_investment``,
    and the casefolded list of existing names is built once per branch
    instead of once per key.
    """
    adapter = ItemAdapter(item)
    if 'inv_id' in adapter.keys():
        # An inv_id is assumed to mean the investment already exists.
        # Might change in future. For now, just pass on this pipeline.
        pass
    elif 'invname' in adapter.keys():
        # invname without inv_id: the investment could not be found in the
        # AssignInvId pipeline, so it must not exist yet.
        platform = adapter['platformname']
        investments = InvestmentName.objects.filter(platform_id=platform)
        existing = [str(i.name).casefold() for i in investments]
        if str(adapter['invname']).casefold() not in existing:
            self._create_investment(adapter, adapter['invname'])
    else:
        # All fees/buy spreads/sell spreads arrive in the same response:
        # every non-excluded key names an investment.
        platform = adapter['platformname']
        investmentnames = InvestmentName.objects.filter(platform_id=platform)
        existing = [str(i.name).casefold() for i in investmentnames]
        for key in [k for k in adapter.keys() if k not in excluded_keys]:
            if str(key).casefold() not in existing:
                self._create_investment(adapter, key)
    return item

def _create_investment(self, adapter, name):
    """Create a new InvestmentName plus its matching InvestmentScrape."""
    new_inv = InvestmentName(name=name.title(),
                             platform=adapter['platformname'])
    new_inv.save()
    new_inv_scrape = InvestmentScrape(name=new_inv,
                                      settings=adapter['settings'])
    new_inv_scrape.save()
    # Should be a log
    print('NEW INV CREATED:', new_inv.name)
def process_item(self, item, spider):
    """Write the item's data onto Investment template rows.

    If the item carries an inv_id, update that single investment; otherwise
    the item is assumed to hold one value per investment name for the
    platform.
    """
    adapter = ItemAdapter(item)
    if 'inv_id' in adapter.keys():
        # Known investment: look it up and save the data directly onto
        # the field named by adapter['type'].
        inv = get_object_or_404(Investment, id=adapter['inv_id'],
                                template=True)
        setattr(inv, adapter['type'], adapter['data'])
        inv.save()
    else:
        # BUGFIX: the lookup dict was built with k.lower() but queried
        # with .casefold(); lower() and casefold() differ for some
        # characters, which could raise KeyError. Fold consistently.
        folded_adapter = {str(k).casefold(): v for k, v in adapter.items()}
        # All investments are in the same item: iterate every template
        # investment for the platform.
        platform = adapter['platformname']
        investments = Investment.objects.filter(name__platform=platform,
                                                template=True)
        for inv in investments:
            # Assign the item's value for this investment to the field
            # named by adapter['type'].
            setattr(inv, adapter['type'],
                    folded_adapter[str(inv.name.name).casefold()])
            inv.save()
    return item
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
    """Return the fields to export as an iterable of tuples
    (name, serialized_value)
    """
    adapter = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    explicit = self.fields_to_export
    if explicit is None:
        # No export list configured: take declared fields (with empties)
        # or just the keys actually present.
        names = adapter.field_names() if include_empty else adapter.keys()
    elif include_empty:
        names = explicit
    else:
        # Only configured fields that are present on this item.
        names = (name for name in explicit if name in adapter)
    for name in names:
        if name not in adapter:
            yield name, default_value
            continue
        meta = adapter.get_field_meta(name)
        yield name, self.serialize_field(meta, name, adapter[name])
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
    """Return the fields to export as an iterable of tuples
    (name, serialized_value)

    Each selected field is serialized with serialize_field.
    """
    item = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    if self.fields_to_export is None:
        if include_empty:
            # Include empty fields: keys declared on the item type even
            # when absent from this particular item.
            field_iter = item.field_names()
        else:
            # Only the keys actually carried by this item.
            field_iter = item.keys()
    elif include_empty:
        # An explicit export list was configured; use it as-is.
        field_iter = self.fields_to_export
    else:
        # Intersection of the configured list and the item's keys.
        field_iter = (x for x in self.fields_to_export if x in item)
    for field_name in field_iter:
        if field_name in item:
            field_meta = item.get_field_meta(field_name)
            value = self.serialize_field(field_meta, field_name,
                                         item[field_name])
        else:
            value = default_value
        yield field_name, value
def process_item(self, item, spider):
    """Drop every field whose value is the literal string "None".

    The original bound the popped value to an unused variable; ``del``
    states the intent directly.
    """
    adapter = ItemAdapter(item)
    # Snapshot the keys: entries are removed while iterating.
    for key in list(adapter.keys()):
        if adapter[key] == "None":
            del adapter[key]
    return item
def process_item(self, item, spider):
    """Drop items that lack an Indeed job key or repeat one already seen."""
    adapter = ItemAdapter(item)
    if 'indeed_job_key' not in adapter.keys():
        raise DropItem(f"Missing job key: {item!r}")
    job_key = adapter['indeed_job_key']
    if job_key in self.ids_seen:
        raise DropItem(f"Duplicate item found: {item!r}")
    # First time we see this key: remember it and pass the item on.
    self.ids_seen.add(job_key)
    return item
def process_item(self, item, spider):
    """Store the scraped PDS link on the matching PlatformNames row.

    BUGFIX: the pipeline never returned the item, so any later pipeline
    stage would receive None; ``return item`` added.
    """
    adapter = ItemAdapter(item)
    print(item['platformid'])  # NOTE(review): debug print -- should be a log
    if 'url' in adapter.keys():
        platformname = (PlatformNames.objects.filter(
            id=item['platformid']))[0]
        platformname.direct_pds_link = adapter['url']
        platformname.save()
    return item
def process_item(self, item, spider):
    """Create an AssetAllocationName for each new, non-excluded key."""
    adapter = ItemAdapter(item)
    existing_aa_names = get_list_or_404(AssetAllocationName)
    for key in adapter.keys():
        if key in excluded_keys:
            continue
        # Case-insensitive membership check against the known names.
        already_known = any(
            key.casefold() == known.name.casefold()
            for known in existing_aa_names
        )
        if not already_known:
            aa_name = AssetAllocationName(name=key.title())
            aa_name.save()
    return item
def process_item(self, item, spider):
    """Append the item's non-media fields to the open HTML output file."""
    header = """<!DOCTYPE HTML><html lang="ru"><head><meta charset="UTF-8"> <title>Название страницы</title><meta name="description" content="Описание страницы" /></head><body>"""
    adapter = ItemAdapter(item)
    self.file.write(header)
    # Media bookkeeping fields are not part of the page body.
    skipped = ('images', 'image_urls', 'files', 'file_urls')
    for key in adapter.keys():
        if key in skipped:
            continue
        self.file.write(str(adapter[key]))
        self.file.write('\n')
    self.file.write('</body>')
    return item
def process_item(self, item, spider):
    """Clean up unnecessary values from an item."""
    adapter = ItemAdapter(item)
    # Materialize the key list first: entries are deleted while iterating.
    for key in tuple(adapter.keys()):
        value = adapter[key]
        unwanted = (self.drop_falsey and not value) or (
            self.drop_values is not None and value in self.drop_values)
        if unwanted:
            del adapter[key]
    return item
def process_item(self, item, spider):
    """Normalize name/price, fold cents in, and store the item in MongoDB."""
    adapter = ItemAdapter(item)
    if not ('name' in adapter.keys() and 'price' in adapter.keys()):
        raise DropItem(f"{item} without name or price")
    adapter['name'] = [unidecode(adapter['name'][0])]
    adapter['price'] = [unidecode(adapter['price'][0])]
    # e.g. '1.234,56' -> '1234.56': drop thousands dots, comma becomes
    # the decimal point.
    raw_price = adapter['price'][0].replace('.', '').replace(',', '.')
    price = float(raw_price)
    if 'cents' in adapter.keys():
        price = price + float(adapter['cents'][0]) / 100
    adapter['price'] = price
    self.col.insert_one(adapter.asdict())
    print("Exporter:", adapter["name"], "| ", adapter["price"], "|",
          adapter['store'])
    return item
def _get_serialized_fields(self, item, default_value=None, include_empty=None, pre=None, field_filter=None):
    """Copy from BaseItemExporter

    Extended with a prefix-joined field filter: fields whose joined name
    appears in ``field_filter`` are skipped entirely.
    """
    adapter = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    if self.fields_to_export is None:
        field_iter = adapter.field_names() if include_empty else adapter.keys()
    elif include_empty:
        field_iter = self.fields_to_export
    else:
        field_iter = (x for x in self.fields_to_export if x in adapter)
    for field_name in field_iter:
        joined = None
        if field_filter:
            if pre is not None:
                joined = pre_join(pre, field_name)
            # Skip fields that the filter explicitly excludes.
            if joined in field_filter:
                continue
        if field_name not in adapter:
            yield field_name, default_value
            continue
        field_meta = adapter.get_field_meta(field_name)
        yield field_name, self.serialize_field(
            field_meta,
            field_name,
            adapter[field_name],
            pre=joined,
            field_filter=field_filter,
        )
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
    """Return the fields to export as an iterable of tuples
    (name, serialized_value)
    """
    adapter = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    configured = self.fields_to_export
    if configured is None:
        # Nothing configured: declared field names (with empties) or the
        # keys actually present on this item.
        field_iter = adapter.field_names() if include_empty else adapter.keys()
    elif isinstance(configured, Mapping):
        # Mapping form: item field name -> output field name.
        if include_empty:
            field_iter = configured.items()
        else:
            field_iter = (
                (src, dst) for src, dst in configured.items()
                if src in adapter
            )
    elif include_empty:
        field_iter = configured
    else:
        field_iter = (x for x in configured if x in adapter)
    for entry in field_iter:
        # Entries are either a bare name or an (item_field, output_field) pair.
        if isinstance(entry, str):
            item_field = output_field = entry
        else:
            item_field, output_field = entry
        if item_field in adapter:
            meta = adapter.get_field_meta(item_field)
            value = self.serialize_field(meta, output_field,
                                         adapter[item_field])
        else:
            value = default_value
        yield output_field, value
def process_item(self, item, spider):
    """Persist douban items to a text file and maoyan items to MySQL.

    Fixes: placeholders are generated per field instead of hard-coding
    three (the statement broke whenever the item schema changed); the bare
    ``except:`` is narrowed to ``except Exception``; the error print now
    actually formats the traceback into the message.
    """
    if spider.name == 'douban':
        content = f"|{item['db_movie_name']}|\t|{item['db_movie_link']}|\t|{item['db_movie_detail']}|\n\n"
        with open('/Users/xiaoyu.gao/PycharmProjects/Python001-class01/week01/db_movie.txt', 'a', encoding='utf8') \
                as f:
            f.write(content)
    elif spider.name == 'maoyan':
        item_dict = ItemAdapter(item).asdict()
        # One %s per field, so the VALUES clause always matches the keys.
        placeholders = ','.join(['%s'] * len(item_dict))
        sql = 'INSERT INTO {table} (`{fields}`) VALUES ({values}) ON DUPLICATE KEY UPDATE {update}'.format(
            table=self.table,
            fields='`,`'.join(item_dict.keys()),
            values=placeholders,
            update=','.join(['`{field}`=VALUES(`{field}`)'.format(field=field)
                             for field in item_dict.keys()]),
        )
        cur = self.conn.cursor()
        try:
            cur.execute(sql, list(item_dict.values()))
            # NOTE(review): no commit here -- presumably done elsewhere
            # (e.g. close_spider) or the connection autocommits; verify.
        except Exception:
            self.conn.rollback()
            print('save to db failed except:%s' % format_exc())
        finally:
            cur.close()
    return item