Example #1
0
 def save_items(self, items):
     """Write each item to its ``sro.<ItemClass>`` table, then commit once.

     An ``INSERT`` is attempted first for every item; if that statement
     raises, the row whose ``url`` equals the item's ``url`` is updated
     instead. All values are bound as ``%s`` query parameters rather than
     being formatted into the SQL text.

     Args:
         items: Iterable of objects accepted by ``ItemAdapter``. The table
             name is taken from each item's class name.
     """
     for item in items:
         table_name = item.__class__.__name__
         item_dict = ItemAdapter(item).asdict()
         item_dict = self.__check_size(item_dict)
         item_dict = self.__clean_dict(item_dict)

         # Identifiers cannot be bound as parameters, so the table and
         # column names are still formatted in; only values are bound.
         columns = ', '.join(item_dict.keys())
         placeholders = ', '.join(['%s'] * len(item_dict))
         sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
             table_name, columns, placeholders)
         try:
             self._cursor.execute(sql, list(item_dict.values()))
         except Exception:
             # NOTE(review): every insert failure is treated as "row already
             # exists" -- confirm that this fallback is intended.
             url = item_dict.pop('url')
             set_str = ", ".join("{}=%s".format(k)
                                 for k in item_dict.keys())
             sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                 table_name, set_str)
             self._cursor.execute(sql, list(item_dict.values()) + [url])
             print(sql)
         else:
             print(sql)
     self._connection.commit()
    def process_item(_item: ShiXiSengJobItem,
                     _spider: ShixiSeng) -> ShiXiSengJobItem:
        """Data processing logic: normalise the fields of a job item in place.

        Empty values are replaced with ``None``; the remaining values are
        cleaned according to their field name, and the adapted item is
        returned.
        """
        fields = ItemAdapter(_item)

        # Replace blank data with None.
        for name in fields.keys():  # type: str
            if len(fields[name]) == 0:
                fields[name] = None

        # Per-field validation / clean-up.
        for name in fields.keys():
            raw = fields[name]

            # Missing values and these two fields are kept as they are.
            if raw is None or name in ('details_url', 'job_type'):
                continue

            if name in ('job_good_list', 'company_tags'):
                # The two tag lists are converted element by element
                # (original note: strips leading/trailing whitespace).
                fields[name] = list(map(trim_all, raw))
                continue

            if name == 'job_detail':
                # Stored as HTML source text: serialise the first element
                # with tostring, then pass it through trim_all (original
                # note: leftover format-control characters would interfere
                # with the later Hive data load).
                fields[name] = tostring(element_or_tree=raw[0],
                                        encoding=str)
                fields[name] = trim_all(fields[name])
                continue

            # Every other field: take the first element as a string.
            fields[name] = trim_all(raw[0])
            if name in ('dead_line', 'profile_requirement'):
                # Keep only the segment after the first ':' (original
                # note: the textual label is discarded, the date kept).
                fields[name] = fields[name].split(':')[1]

        # Hand the item on for further processing.
        return fields.item
Example #3
0
    def process_item(self, item, spider):
        """Create investments named by the item that the platform lacks.

        Three item shapes are handled:

        * ``inv_id`` present: nothing is done; an ``inv_id`` is assumed to
          mean the investment already exists (might change in future).
        * ``invname`` present (no ``inv_id``): that single name is checked.
        * otherwise: every key not listed in ``excluded_keys`` is treated
          as an investment name (all fees / buy spreads / sell spreads in
          the same response).

        Names are compared case-insensitively against the platform's
        existing ``InvestmentName`` rows. Names created during this call
        are remembered, so two keys differing only by case do not create
        two investments.

        Returns:
            The item, unchanged.
        """
        adapter = ItemAdapter(item)
        if 'inv_id' in adapter.keys():
            return item

        if 'invname' in adapter.keys():
            candidates = [adapter['invname']]
        else:
            candidates = [k for k in adapter.keys() if k not in excluded_keys]

        platform = adapter['platformname']
        # Build the case-insensitive lookup once instead of once per name.
        known = {
            str(inv.name).casefold()
            for inv in InvestmentName.objects.filter(platform_id=platform)
        }

        for candidate in candidates:
            folded = str(candidate).casefold()
            if folded in known:
                continue
            self._create_investment(candidate, adapter)
            known.add(folded)

        return item

    def _create_investment(self, raw_name, adapter):
        """Save a new ``InvestmentName`` and its matching ``InvestmentScrape``."""
        new_inv = InvestmentName(name=raw_name.title(),
                                 platform=adapter['platformname'])
        new_inv.save()
        new_inv_scrape = InvestmentScrape(name=new_inv,
                                          settings=adapter['settings'])
        new_inv_scrape.save()

        # Should be a log
        print('NEW INV CREATED:', new_inv.name)
Example #4
0
    def process_item(self, item, spider):
        """Save scraped data onto template ``Investment`` rows.

        If the item carries ``inv_id``, that investment is looked up and
        ``adapter['data']`` is stored on the attribute named by
        ``adapter['type']``. Otherwise the item is expected to hold one
        entry per investment name; every template investment of the
        platform receives the value stored under its name, matched
        case-insensitively.

        Returns:
            The item, unchanged.

        Raises:
            KeyError: If a template investment of the platform has no
                matching entry in the item.
        """
        adapter = ItemAdapter(item)

        if 'inv_id' in adapter.keys():
            # If inv_id is passed in item, look up the investment instance and
            # save the data directly.
            inv = get_object_or_404(Investment,
                                    id=adapter['inv_id'],
                                    template=True)
            setattr(inv, adapter['type'], adapter['data'])
            inv.save()
        else:
            # Keys are folded with casefold() so they agree with the
            # casefold() applied to the investment name at lookup time.
            casefolded_adapter = {
                k.casefold(): v for k, v in adapter.items()
            }
            # If all investments are in the same item, get all investments for
            # the platform and iterate through them.
            platform = adapter['platformname']
            investments = Investment.objects.filter(name__platform=platform,
                                                    template=True)

            for inv in investments:
                # For each investment, assign the data from the item to the
                # relevant field, named by adapter['type'].
                setattr(inv, adapter['type'],
                        casefolded_adapter[str(inv.name.name).casefold()])
                inv.save()

        return item
Example #5
0
    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        """Yield ``(name, serialized_value)`` pairs for the fields to export.

        ``include_empty`` falls back to ``self.export_empty_fields`` when it
        is ``None``. Fields that are selected but absent from the item are
        yielded with ``default_value``.
        """
        adapter = ItemAdapter(item)
        export_empty = self.export_empty_fields if include_empty is None else include_empty
        configured = self.fields_to_export

        if configured is None:
            names = adapter.field_names() if export_empty else adapter.keys()
        elif export_empty:
            names = configured
        else:
            names = (name for name in configured if name in adapter)

        for name in names:
            if name not in adapter:
                yield name, default_value
                continue
            meta = adapter.get_field_meta(name)
            yield name, self.serialize_field(meta, name, adapter[name])
Example #6
0
    def _get_serialized_fields(self,
                               item,
                               default_value=None,
                               include_empty=None):
        """Yield the fields to export as ``(name, serialized_value)`` tuples.

        Each selected field that is present on the item is serialized with
        ``serialize_field``; selected fields that are absent are yielded
        with ``default_value``.
        """
        adapter = ItemAdapter(item)

        if include_empty is None:
            include_empty = self.export_empty_fields

        if self.fields_to_export is not None:
            if include_empty:
                # Exactly the explicitly configured fields.
                names = self.fields_to_export
            else:
                # Intersection of the configured fields and the item.
                names = (wanted for wanted in self.fields_to_export
                         if wanted in adapter)
        elif include_empty:
            # Include empty values (keys the item defines but that were not
            # passed in).
            names = adapter.field_names()
        else:
            # Only the values the item actually carries.
            names = adapter.keys()

        for name in names:
            serialized = default_value
            if name in adapter:
                serialized = self.serialize_field(
                    adapter.get_field_meta(name), name, adapter[name])
            yield name, serialized
    def process_item(self, item, spider):
        """Remove every field whose value equals the string ``"None"``."""
        adapter = ItemAdapter(item)

        # Collect the keys first so nothing is deleted while iterating.
        placeholder_keys = [
            name for name in adapter.keys() if adapter[name] == "None"
        ]
        for name in placeholder_keys:
            del adapter[name]

        return item
Example #8
0
 def process_item(self, item, spider):
     """Drop items without an ``indeed_job_key`` or with one already seen.

     Keys of accepted items are recorded in ``self.ids_seen``.
     """
     adapter = ItemAdapter(item)
     if 'indeed_job_key' not in adapter.keys():
         raise DropItem(f"Missing job key: {item!r}")

     job_key = adapter['indeed_job_key']
     if job_key in self.ids_seen:
         raise DropItem(f"Duplicate item found: {item!r}")

     self.ids_seen.add(job_key)
     return item
Example #9
0
    def process_item(self, item, spider):
        """Store the item's ``url`` as the platform's ``direct_pds_link``.

        When the item has a ``url`` field, the first ``PlatformNames`` row
        whose ``id`` equals the item's ``platformid`` is updated and saved.

        Returns:
            The item, unchanged, so that later pipeline stages receive it
            instead of ``None``.

        Raises:
            IndexError: If no ``PlatformNames`` row matches ``platformid``.
        """
        adapter = ItemAdapter(item)
        print(adapter['platformid'])

        if 'url' in adapter.keys():
            platformname = (PlatformNames.objects.filter(
                id=adapter['platformid']))[0]

            platformname.direct_pds_link = adapter['url']
            platformname.save()

        return item
Example #10
0
    def process_item(self, item, spider):
        """Create an ``AssetAllocationName`` for each item key not yet known.

        Keys listed in ``excluded_keys`` are skipped. The remaining keys are
        compared case-insensitively with the existing names; unknown keys
        are saved title-cased. Names created during this call are
        remembered, so two keys differing only by case create one row.

        Returns:
            The item, unchanged.
        """
        adapter = ItemAdapter(item)

        # NOTE(review): get_list_or_404 presumably raises Http404 when no
        # AssetAllocationName exists yet -- confirm that is intended here.
        existing_aa_names = get_list_or_404(AssetAllocationName)
        # Build the case-insensitive lookup once instead of once per key.
        known = {i.name.casefold() for i in existing_aa_names}

        for key in adapter.keys():
            if key in excluded_keys:
                continue
            folded = key.casefold()
            if folded not in known:
                aa_name = AssetAllocationName(name=key.title())
                aa_name.save()
                known.add(folded)
        return item
Example #11
0
    def process_item(self, item, spider):
        """Write the item to ``self.file`` as a minimal HTML fragment.

        The page head is written first, then the string form of every field
        except the image/file bookkeeping ones (one per line), then the
        closing ``</body>`` tag.
        """
        skipped = ('images', 'image_urls', 'files', 'file_urls')
        page_head = """<!DOCTYPE HTML><html lang="ru"><head><meta charset="UTF-8">
            <title>Название страницы</title><meta name="description" content="Описание страницы" /></head><body>"""

        out = self.file
        fields = ItemAdapter(item)
        out.write(page_head)
        for name in fields.keys():
            if name in skipped:
                continue
            out.write(str(fields[name]))
            out.write('\n')
        out.write('</body>')
        return item
Example #12
0
    def process_item(self, item, spider):
        """Remove unwanted values from an item.

        A field is deleted when ``self.drop_falsey`` is set and its value is
        falsy, or when ``self.drop_values`` is not ``None`` and contains the
        value.
        """
        adapter = ItemAdapter(item)

        # Iterate over a snapshot of the keys because entries are deleted.
        for name in list(adapter.keys()):
            value = adapter[name]
            unwanted = self.drop_falsey and not value
            if not unwanted and self.drop_values is not None:
                unwanted = value in self.drop_values
            if unwanted:
                del adapter[name]

        return item
Example #13
0
    def process_item(self, item, spider):
        """Normalise name and price, store the item in ``self.col``, report it.

        Items lacking ``name`` or ``price`` are dropped. The price text has
        its ``.`` characters removed and ``,`` turned into ``.`` before
        being parsed as a float; an optional ``cents`` field is added as
        hundredths.
        """
        adapter = ItemAdapter(item)
        if not ('name' in adapter.keys() and 'price' in adapter.keys()):
            raise DropItem(f"{item} without name or price")

        adapter['name'] = [unidecode(adapter['name'][0])]
        adapter['price'] = [unidecode(adapter['price'][0])]

        # Drop '.' and use '.' in place of ',' before parsing.
        normalized = adapter['price'][0].replace('.', '').replace(',', '.')
        total = float(normalized)
        if 'cents' in adapter.keys():
            total = total + float(adapter['cents'][0]) / 100
        adapter['price'] = total

        self.col.insert_one(adapter.asdict())
        print("Exporter:", adapter["name"], "| ", adapter["price"], "|",
              adapter['store'])

        return item
Example #14
0
    def _get_serialized_fields(self,
                               item,
                               default_value=None,
                               include_empty=None,
                               pre=None,
                               field_filter=None):
        """Yield ``(name, serialized_value)`` pairs, honouring a field filter.

        Copied from BaseItemExporter, with two additions: when both
        ``field_filter`` and ``pre`` are given, a field whose
        ``pre_join(pre, name)`` key is in ``field_filter`` is skipped, and
        that key plus ``field_filter`` are passed on to ``serialize_field``.
        """
        adapter = ItemAdapter(item)

        if include_empty is None:
            include_empty = self.export_empty_fields

        if self.fields_to_export is not None:
            if include_empty:
                names = self.fields_to_export
            else:
                names = (n for n in self.fields_to_export if n in adapter)
        elif include_empty:
            names = adapter.field_names()
        else:
            names = adapter.keys()

        for field_name in names:
            prefixed = None
            if field_filter and pre is not None:
                prefixed = pre_join(pre, field_name)
                if prefixed in field_filter:
                    continue

            if field_name not in adapter:
                yield field_name, default_value
                continue

            serialized = self.serialize_field(
                adapter.get_field_meta(field_name),
                field_name,
                adapter[field_name],
                pre=prefixed,
                field_filter=field_filter,
            )
            yield field_name, serialized
Example #15
0
    def _get_serialized_fields(self,
                               item,
                               default_value=None,
                               include_empty=None):
        """Yield ``(output_name, serialized_value)`` pairs for export.

        ``self.fields_to_export`` may be ``None``, an iterable of field
        names, or a ``Mapping`` from item field name to output name. Fields
        that are selected but absent from the item are yielded with
        ``default_value``.
        """
        adapter = ItemAdapter(item)

        if include_empty is None:
            include_empty = self.export_empty_fields

        configured = self.fields_to_export
        if configured is None:
            entries = adapter.field_names() if include_empty else adapter.keys()
        elif isinstance(configured, Mapping):
            if include_empty:
                entries = configured.items()
            else:
                entries = (pair for pair in configured.items()
                           if pair[0] in adapter)
        elif include_empty:
            entries = configured
        else:
            entries = (name for name in configured if name in adapter)

        for entry in entries:
            # A plain string names both the item field and the output
            # field; anything else is an (item_field, output_field) pair.
            if isinstance(entry, str):
                item_field = output_field = entry
            else:
                item_field, output_field = entry

            if item_field not in adapter:
                yield output_field, default_value
                continue

            meta = adapter.get_field_meta(item_field)
            yield output_field, self.serialize_field(meta, output_field,
                                                     adapter[item_field])
Example #16
0
 def process_item(self, item, spider):
     """Persist an item according to the spider that produced it.

     * ``douban``: append one pipe-delimited line to a text file.
     * ``maoyan``: upsert the item's fields into ``self.table`` with
       ``INSERT ... ON DUPLICATE KEY UPDATE``. On failure the connection
       is rolled back and the traceback is printed; the item is still
       returned.

     Returns:
         The item, unchanged.
     """
     if spider.name == 'douban':
         content = f"|{item['db_movie_name']}|\t|{item['db_movie_link']}|\t|{item['db_movie_detail']}|\n\n"
         with open('/Users/xiaoyu.gao/PycharmProjects/Python001-class01/week01/db_movie.txt', 'a', encoding='utf8') \
                 as f:
             f.write(content)
     elif spider.name == 'maoyan':
         item_dict = ItemAdapter(item).asdict()
         fields = list(item_dict.keys())
         # One placeholder per field instead of a hard-coded three.
         sql = 'INSERT INTO {table} (`{fields}`) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update}'.format(
             table=self.table,
             fields='`,`'.join(fields),
             placeholders=','.join(['%s'] * len(fields)),
             update=','.join(['`{field}`=VALUES(`{field}`)'.format(field=field) for field in fields])
         )
         cur = self.conn.cursor()
         try:
             # Pass a real sequence; a dict_values view is not a list/tuple.
             cur.execute(sql, tuple(item_dict.values()))
             # NOTE(review): no commit is visible here -- confirm it happens
             # elsewhere or that the connection autocommits.
         except Exception:
             self.conn.rollback()
             print('save to db failed except:%s' % format_exc())
         finally:
             cur.close()
     return item