def save_items(self, items):
    # Assumes `from itemadapter import ItemAdapter` and a DB-API
    # connection/cursor on self (e.g. pymysql, which uses %s placeholders).
    for item in items:
        table_name = item.__class__.__name__
        item_dict = ItemAdapter(item).asdict()
        item_dict = self.__check_size(item_dict)
        item_dict = self.__clean_dict(item_dict)
        try:
            # Parameterized INSERT instead of interpolating quoted values
            # into the SQL string, which breaks on embedded quotes and
            # invites SQL injection.
            columns = ', '.join(item_dict.keys())
            placeholders = ', '.join(['%s'] * len(item_dict))
            sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
                table_name, columns, placeholders)
            self._cursor.execute(sql, list(item_dict.values()))
        except Exception:
            # The row already exists (e.g. duplicate key on url):
            # update it in place instead.
            url = item_dict.pop('url')
            set_str = ', '.join('{}=%s'.format(k) for k in item_dict)
            sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                table_name, set_str)
            self._cursor.execute(sql, list(item_dict.values()) + [url])
        self._connection.commit()
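# A minimal sketch of collapsing the try-INSERT / fallback-UPDATE above into
# a single native upsert. Assumes a MySQL-compatible server and a DB-API
# driver with %s placeholders (e.g. pymysql); `upsert_item` and its
# arguments are hypothetical names, and `url` is assumed to be the unique key.
def upsert_item(cursor, table_name, item_dict):
    columns = ', '.join(item_dict.keys())
    placeholders = ', '.join(['%s'] * len(item_dict))
    # Re-assign every non-key column when the unique key already exists.
    updates = ', '.join(
        '{0}=VALUES({0})'.format(k) for k in item_dict if k != 'url')
    sql = ('INSERT INTO sro.{} ({}) VALUES ({}) '
           'ON DUPLICATE KEY UPDATE {}').format(
        table_name, columns, placeholders, updates)
    cursor.execute(sql, list(item_dict.values()))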
def process_item(self, item, spider):
    # Drop any item that carries an upstream processing-error marker in one
    # of its field values; DropItem is scrapy.exceptions.DropItem, and
    # Scrapy logs the drop reason itself.
    adapter = ItemAdapter(item)
    for val in adapter.values():
        if 'DATA PROCESS ERROR' in str(val):
            raise DropItem(f"Data process error found @ "
                           f"{adapter['SOURCE_ID']}-{adapter['JOB_HASH']}")
    self.__connector.run_query(item)
    return item
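# A sketch of wiring a validation pipeline like the one above into a
# project; the module path, class name, and priority value are placeholders,
# but ITEM_PIPELINES is the standard Scrapy setting, and items dropped via
# DropItem are excluded from all later pipelines.
# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.ValidationPipeline': 300,
}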
def process_item(self, item, spider):
    if spider.name == 'douban':
        content = (f"|{item['db_movie_name']}|\t|{item['db_movie_link']}|\t"
                   f"|{item['db_movie_detail']}|\n\n")
        with open('/Users/xiaoyu.gao/PycharmProjects/Python001-class01/'
                  'week01/db_movie.txt', 'a', encoding='utf8') as f:
            f.write(content)
    elif spider.name == 'maoyan':
        item_dict = ItemAdapter(item).asdict()
        # Derive the placeholder list from the item instead of hard-coding
        # three %s markers, so the query survives schema changes.
        sql = ('INSERT INTO {table} (`{fields}`) VALUES ({values}) '
               'ON DUPLICATE KEY UPDATE {update}').format(
            table=self.table,
            fields='`,`'.join(item_dict.keys()),
            values=','.join(['%s'] * len(item_dict)),
            update=','.join('`{0}`=VALUES(`{0}`)'.format(field)
                            for field in item_dict.keys()),
        )
        cur = self.conn.cursor()
        try:
            cur.execute(sql, list(item_dict.values()))
            # Commit on success; roll back on failure.
            self.conn.commit()
        except Exception:
            # format_exc comes from the traceback module.
            self.conn.rollback()
            print('save to db failed: %s' % format_exc())
        finally:
            cur.close()
    return item
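# A quick, standalone illustration of the statement the maoyan branch builds;
# the table and column names here are invented.
item_dict = {'name': 'Heat', 'genre': 'Crime', 'release_date': '1995-12-15'}
sql = ('INSERT INTO {table} (`{fields}`) VALUES ({values}) '
       'ON DUPLICATE KEY UPDATE {update}').format(
    table='films',
    fields='`,`'.join(item_dict.keys()),
    values=','.join(['%s'] * len(item_dict)),
    update=','.join('`{0}`=VALUES(`{0}`)'.format(f) for f in item_dict))
print(sql)
# INSERT INTO films (`name`,`genre`,`release_date`) VALUES (%s,%s,%s)
# ON DUPLICATE KEY UPDATE `name`=VALUES(`name`),`genre`=VALUES(`genre`),`release_date`=VALUES(`release_date`)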
import csv
import io
from collections.abc import Mapping

from itemadapter import ItemAdapter
from scrapy.exporters import BaseItemExporter
from scrapy.utils.python import to_unicode


class CsvItemExporter(BaseItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',',
                 errors=None, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        self.stream = io.TextIOWrapper(
            file,
            line_buffering=False,
            write_through=True,
            encoding=self.encoding,
            newline='',  # Windows needs this https://github.com/scrapy/scrapy/issues/3034
            errors=errors,
        )
        self.csv_writer = csv.writer(self.stream, **self._kwargs)
        self._headers_not_written = True
        self._join_multivalued = join_multivalued

    def serialize_field(self, field, name, value):
        serializer = field.get('serializer', self._join_if_needed)
        return serializer(value)

    def _join_if_needed(self, value):
        if isinstance(value, (list, tuple)):
            try:
                return self._join_multivalued.join(value)
            except TypeError:  # list in value may not contain strings
                pass
        return value

    def export_item(self, item):
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        fields = self._get_serialized_fields(item, default_value='',
                                             include_empty=True)
        values = list(self._build_row(x for _, x in fields))
        self.csv_writer.writerow(values)

    def _build_row(self, values):
        for s in values:
            try:
                yield to_unicode(s, self.encoding)
            except TypeError:
                yield s

    def _write_headers_and_set_fields_to_export(self, item):
        if self.include_headers_line:
            if not self.fields_to_export:
                # use declared field names, or keys if the item is a dict
                self.fields_to_export = ItemAdapter(item).field_names()
            if isinstance(self.fields_to_export, Mapping):
                fields = self.fields_to_export.values()
            else:
                fields = self.fields_to_export
            row = list(self._build_row(fields))
            self.csv_writer.writerow(row)
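# A minimal sketch of driving the exporter above by hand; the file name and
# item dict are arbitrary. The file must be opened in binary mode, since the
# exporter wraps it in its own TextIOWrapper, and ItemAdapter lets a plain
# dict stand in for a Scrapy Item.
with open('items.csv', 'wb') as f:
    exporter = CsvItemExporter(f)
    exporter.start_exporting()
    exporter.export_item({'title': 'Example', 'url': 'https://example.com/'})
    exporter.finish_exporting()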