Ejemplo n.º 1
0
 def save_items(self, items):
     """Insert each item into its ``sro`` table, updating the row on failure.

     The target table is named after the item's class. Every item is
     converted to a dict and passed through ``self.__check_size`` and
     ``self.__clean_dict`` (defined elsewhere in the class). An ``INSERT``
     is attempted first; if it raises, the row whose ``url`` matches the
     item's ``url`` is updated instead. One commit is issued after all
     items have been processed.

     Args:
         items: Iterable of items that ``ItemAdapter`` can wrap.

     Raises:
         KeyError: If the INSERT fails and the item has no ``url`` field.
         Exception: Whatever the cursor raises if the fallback UPDATE fails.
     """
     for item in items:
         # Table names cannot be bound as query parameters, so the class
         # name is still formatted into the statement.
         table_name = item.__class__.__name__
         item_dict = ItemAdapter(item).asdict()
         item_dict = self.__check_size(item_dict)
         item_dict = self.__clean_dict(item_dict)
         try:
             columns = ', '.join(item_dict)
             # Bind values through the driver (same "%s" style the UPDATE
             # below relies on) instead of quoting them by hand, which was
             # open to SQL injection and broke on values containing quotes.
             placeholders = ', '.join(['%s'] * len(item_dict))
             sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
                 table_name, columns, placeholders)
             self._cursor.execute(sql, list(item_dict.values()))
             print(sql)
         except Exception:
             # NOTE(review): any INSERT failure is treated as "row already
             # exists". Some drivers leave the transaction in a failed state
             # after an error - confirm whether a rollback/savepoint is
             # needed before the UPDATE runs.
             url = item_dict.pop('url')
             set_str = ", ".join("{}=%s".format(k) for k in item_dict)
             sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                 table_name, set_str)
             # The url is bound last to match the trailing WHERE placeholder.
             self._cursor.execute(sql, list(item_dict.values()) + [url])
             print(sql)
     self._connection.commit()
Ejemplo n.º 2
0
 def process_item(self, item, spider):
     """Drop items carrying a processing-error marker; store the rest.

     Every field value is converted to ``str`` and searched for the text
     ``DATA PROCESS ERROR``. If any value contains it, a message naming the
     item's ``SOURCE_ID`` and ``JOB_HASH`` is printed and the item is
     dropped. Otherwise the item is handed to the connector's ``run_query``.

     Args:
         item: The scraped item.
         spider: The spider that produced the item (unused here).

     Returns:
         The unchanged ``item`` when no error marker is found.

     Raises:
         DropItem: If any field value contains the error marker.
     """
     adapter = ItemAdapter(item)
     if any('DATA PROCESS ERROR' in str(val) for val in adapter.values()):
         msg = f"Data process error found @ {adapter['SOURCE_ID']}-{adapter['JOB_HASH']}"
         print(msg)
         raise DropItem(msg)
     # Raising above already leaves the method, so no error flag is needed
     # to guard the success path.
     self.__connector.run_query(item)
     return item
Ejemplo n.º 3
0
 def process_item(self, item, spider):
     """Persist an item according to the spider that produced it.

     * ``douban``: append the movie name, link and detail as one
       pipe-delimited line to a fixed text file.
     * ``maoyan``: upsert the item's fields into ``self.table`` with an
       ``INSERT ... ON DUPLICATE KEY UPDATE`` statement on ``self.conn``.
     * any other spider: nothing is stored.

     Database errors are reported on stdout and rolled back; they never
     propagate to the caller.

     Args:
         item: The scraped item.
         spider: The running spider; only ``spider.name`` is read.

     Returns:
         The unchanged ``item``.
     """
     if spider.name == 'douban':
         content = f"|{item['db_movie_name']}|\t|{item['db_movie_link']}|\t|{item['db_movie_detail']}|\n\n"
         path = '/Users/xiaoyu.gao/PycharmProjects/Python001-class01/week01/db_movie.txt'
         with open(path, 'a', encoding='utf8') as f:
             f.write(content)
     elif spider.name == 'maoyan':
         item_dict = ItemAdapter(item).asdict()
         sql = 'INSERT INTO {table} (`{fields}`) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update}'.format(
             table=self.table,
             fields='`,`'.join(item_dict),
             # One placeholder per field, rather than a hard-coded three.
             placeholders=','.join(['%s'] * len(item_dict)),
             update=','.join('`{field}`=VALUES(`{field}`)'.format(field=field) for field in item_dict),
         )
         cur = self.conn.cursor()
         try:
             # Pass a real sequence; a dict view is not a list or tuple.
             cur.execute(sql, list(item_dict.values()))
             # NOTE(review): no commit is issued here - confirm that the
             # connection autocommits or that a commit happens elsewhere.
         except Exception:
             self.conn.rollback()
             print('save to db failed except:%s' % format_exc())
         finally:
             cur.close()
     return item
Ejemplo n.º 4
0
class CsvItemExporter(BaseItemExporter):
    """Item exporter that writes one CSV row per exported item.

    The given file object is wrapped in a text stream and rows are written
    through a ``csv.writer``. An optional header row is emitted before the
    first item.
    """

    def __init__(self,
                 file,
                 include_headers_line=True,
                 join_multivalued=',',
                 errors=None,
                 **kwargs):
        """Set up the text stream and the CSV writer.

        Args:
            file: File object handed to ``io.TextIOWrapper``.
            include_headers_line: Whether to write a header row first.
            join_multivalued: Separator used to join list/tuple values.
            errors: Encoding error handler passed to ``io.TextIOWrapper``.
            **kwargs: Forwarded to the base exporter's constructor.
        """
        super().__init__(dont_fail=True, **kwargs)
        # Fall back to UTF-8 when the base class left no encoding set.
        self.encoding = self.encoding or 'utf-8'
        self.include_headers_line = include_headers_line
        self._join_multivalued = join_multivalued
        self._headers_not_written = True
        # newline='' is needed on Windows, see
        # https://github.com/scrapy/scrapy/issues/3034
        self.stream = io.TextIOWrapper(
            file,
            encoding=self.encoding,
            errors=errors,
            newline='',
            line_buffering=False,
            write_through=True,
        )
        self.csv_writer = csv.writer(self.stream, **self._kwargs)

    def serialize_field(self, field, name, value):
        """Serialize *value* with the field's ``serializer``, if it has one.

        Without a declared serializer, list/tuple values are joined with the
        configured separator.
        """
        return field.get('serializer', self._join_if_needed)(value)

    def _join_if_needed(self, value):
        """Join a list/tuple into one string; return anything else as is."""
        if not isinstance(value, (list, tuple)):
            return value
        try:
            return self._join_multivalued.join(value)
        except TypeError:
            # The sequence holds non-string elements; leave it untouched.
            return value

    def export_item(self, item):
        """Write *item* as one CSV row, preceded by the headers if pending."""
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        serialized = self._get_serialized_fields(
            item, default_value='', include_empty=True)
        row = list(self._build_row(value for _name, value in serialized))
        self.csv_writer.writerow(row)

    def _build_row(self, values):
        """Yield each value converted with ``to_unicode`` where possible."""
        for raw in values:
            try:
                cell = to_unicode(raw, self.encoding)
            except TypeError:
                # Value rejected by to_unicode: pass it through unchanged.
                cell = raw
            yield cell

    def _write_headers_and_set_fields_to_export(self, item):
        """Write the header row, deriving the field list from *item* if unset.

        Does nothing when header output is disabled.
        """
        if not self.include_headers_line:
            return
        if not self.fields_to_export:
            # Use declared field names, or keys if the item is a dict.
            self.fields_to_export = ItemAdapter(item).field_names()
        to_export = self.fields_to_export
        # For a mapping, its values are what gets written as header cells.
        headers = to_export.values() if isinstance(to_export, Mapping) else to_export
        self.csv_writer.writerow(list(self._build_row(headers)))