コード例 #1
0
 def process_item_helper(self, item, collection_name):
     """Insert a post, or refresh its ``last_reply_date`` when it is already stored.

     The post is looked up by all of its fields except ``last_reply_date``.
     If no stored document matches, the complete post (including
     ``last_reply_date`` when present) is inserted. If a match exists and the
     post carries a non-empty ``last_reply_date``, only that field is updated
     on the matched document.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         collection_name: Key used to select the collection from ``self.db``.
     """
     adapter = ItemAdapter(item)
     lookup = adapter.asdict()
     # The empty string doubles as "this post carries no reply date".
     reply_date = lookup.pop('last_reply_date', '')
     collection = self.db[collection_name]

     already_stored = collection.count_documents(lookup) != 0
     if not already_stored:
         logging.debug("Insert a new item")
         # Insert a fresh copy so the stored post keeps its last_reply_date.
         collection.insert_one(adapter.asdict())
         return

     if reply_date == '':
         # Known post without a reply date: nothing to refresh.
         return

     logging.debug("Updating an item with last_reply_date %s", reply_date)
     collection.update_one(lookup, {"$set": {"last_reply_date": reply_date}})
コード例 #2
0
 def process_item(self, item, spider):
     """Write the item as indented JSON to the entries file or the error file.

     Items accepted by ``self.is_valid_entry`` have their ``word`` added to
     ``self.entries`` via ``bisect.insort`` and are written to ``self.file``;
     every other item is written to ``self.error_file``. Each record is
     followed by a comma.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The original ``item``. The adapter is only a local view of it and is
         deliberately not returned, so whatever consumes the return value
         still receives an item object.
     """
     adapter = ItemAdapter(item)
     if self.is_valid_entry(adapter):
         bisect.insort(self.entries, adapter['word'])
         target = self.file
     else:
         target = self.error_file
     # Both destinations use the same serialization; only the file differs.
     target.write(json.dumps(adapter.asdict(), indent=4, sort_keys=True) + ',')
     return item
コード例 #3
0
 def process_item(self, item, spider):
     """Record items whose word or definitions are missing in the error file.

     An item is considered incomplete when its ``word`` or its
     ``definitions`` is ``None``. Incomplete items are written to
     ``self.error_file`` as indented JSON followed by a comma, the same
     record format for both kinds of failure.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The original ``item`` (not the adapter wrapping it).
     """
     adapter = ItemAdapter(item)
     # ``or`` short-circuits: 'definitions' is only looked at when a word
     # could be extracted.
     if adapter['word'] is None or adapter['definitions'] is None:
         line = json.dumps(adapter.asdict(), indent=4, sort_keys=True) + ','
         self.error_file.write(line)
     # TODO: write complete items to the database.
     return item
コード例 #4
0
 def close_spider(self, spider):
     """Write every collected article to ``self.FILENAME`` as CSV.

     The column names are taken from the keys of the first article. The
     file is always (re)created; when no articles were collected it is left
     empty instead of failing on the missing first row.

     Args:
         spider: Spider being closed (unused).
     """
     rows = [ItemAdapter.asdict(item) for item in self.articles]
     with open(self.FILENAME, 'w', encoding='utf-8', newline='') as f:
         if not rows:
             # No first row to take the header from; leave the file empty.
             return
         writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
         writer.writeheader()
         writer.writerows(rows)
コード例 #5
0
    def process_item(self, item, spider):
        """Let items with a price through and drop the rest.

        The item's contents are printed first. An item whose
        ``product_price`` is missing or falsy is rejected.

        Args:
            item: Scraped item; anything ``ItemAdapter`` can wrap.
            spider: Spider that produced the item (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            DropItem: If ``product_price`` is missing or falsy.
        """
        wrapped = ItemAdapter(item)
        print('In Process Item', wrapped.asdict())
        if not wrapped.get('product_price'):
            raise DropItem("Missing price in %s" % item)
        return item
コード例 #6
0
 def process_item(self, item, spider):
     """Write one line per item to ``self.file``.

     Items with a truthy ``page_num`` produce a page-marker line; all other
     items are written as a JSON object.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The unchanged ``item``.
     """
     adapter = ItemAdapter(item)
     if adapter.get('page_num'):
         # Read through the adapter, not the raw item, so item types that
         # are not subscriptable work as well.
         line = "当前页面是第%d页" % adapter['page_num'] + "\n"
     else:
         # Keep non-ASCII characters unescaped so the file stays readable.
         line = json.dumps(adapter.asdict(), ensure_ascii=False) + "\n"
     self.file.write(line)
     return item
コード例 #7
0
ファイル: pipelines.py プロジェクト: JerryNyoike/jobscrape
    def process_item(self, item, spider):
        """Keep only Kenyan items that do not have too many empty fields.

        An item passes when at most 12 of its values are falsy and its
        ``country`` equals ``"Kenya"``. Anything else is dropped after
        printing a banner; note that the drop message mentions only empty
        values even when the country was the reason.

        Args:
            item: Scraped item; anything ``ItemAdapter`` can wrap.
            spider: Spider that produced the item (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            DropItem: If more than 12 values are falsy or the country is
                not ``"Kenya"``.
        """
        fields = ItemAdapter(item).asdict()
        blank_count = sum(1 for value in fields.values() if not value)

        if blank_count <= 12 and fields.get("country") == "Kenya":
            return item

        print("\n\n***************")
        print("Dropped")
        print("******************\n\n")
        raise DropItem("Too many empty values found.")
コード例 #8
0
ファイル: pipelines.py プロジェクト: KwonL/YOOX-crawler
    def process_item(self, item, spider):
        """Normalize the product fields and append the item to the CSV.

        ``brand`` and ``name`` go through ``self.clean_text``; ``price`` is
        reduced to an integer taken from the second whitespace-separated
        token with thousands commas removed; ``image`` loses everything from
        the first ``?`` on. The cleaned item is then written with
        ``self.csv.writerow``.

        Args:
            item: Scraped item; anything ``ItemAdapter`` can wrap.
            spider: Spider that produced the item (unused).

        Returns:
            The ``item``, with the normalized values written back into it.

        Raises:
            DropItem: If any of the fields is missing or cannot be
                normalized. Such items are not written to the CSV.
        """
        adapter = ItemAdapter(item)
        try:
            adapter["brand"] = self.clean_text(adapter.get("brand"))
            adapter["name"] = self.clean_text(adapter.get("name"))
            adapter["price"] = int(
                float(adapter.get("price").replace(",", "").split()[1]))
            adapter["image"] = adapter.get("image").split("?")[0]
        except Exception as exc:
            # The exception must actually be raised; merely constructing it
            # would let the malformed item fall through to the CSV writer.
            raise DropItem(f"Missing essential properties in {item}") from exc

        self.csv.writerow(adapter.asdict())

        return item
コード例 #9
0
ファイル: pipelines.py プロジェクト: JerryNyoike/jobscrape
    def close_spider(self, spider):
        """Write the buffered items to ``self.file`` as CSV, then close it.

        Each item is written with its own field names as columns. The
        header row is emitted only while the file position is still 0, i.e.
        before anything has been written. The file is closed even if
        writing fails.

        Args:
            spider: Spider being closed (unused).
        """
        try:
            # ``or ()`` makes an empty or unset buffer a no-op.
            for item in self.items or ():
                adapter = ItemAdapter(item)
                writer = csv.DictWriter(self.file,
                                        fieldnames=adapter.field_names(),
                                        restval='',
                                        extrasaction='ignore',
                                        delimiter=',',
                                        quoting=csv.QUOTE_NONNUMERIC,
                                        quotechar="\"")

                if self.file.tell() == 0:
                    writer.writeheader()
                writer.writerow(adapter.asdict())
        finally:
            self.file.close()
コード例 #10
0
    def process_item(self, item, spider):
        """Export topic items to a JSON file each; pass every item through.

        Only instances of ``BaseTopicItem`` are exported. The target path
        comes from ``prepare_path`` using the configured base directory and
        the directory/file name templates; the file is opened for writing
        and receives the item's dict form as JSON.

        Args:
            item: Scraped item.
            spider: Spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        if isinstance(item, BaseTopicItem):
            adapter = ItemAdapter(item)
            topic_id = adapter.get('topic_id')
            self.logger.debug(f'exporting TopicItem (id: {topic_id})')

            target = prepare_path(
                base_dir=self.base_dir_path,
                dirname_template=self.dirname_tmplt,
                filename_template=self.filename_tmplt,
                item=item,
            )
            with target.open('w') as out:
                json.dump(adapter.asdict(), out)

            self.logger.debug(f'exported TopicItem (id: {topic_id})')

        # Items of any other type are handed on untouched.
        return item
コード例 #11
0
    def process_item(self, item, spider):
        """Normalize name and price, store the item, and report it.

        ``name`` and ``price`` are read as one-element sequences and passed
        through ``unidecode``. The price text is parsed with ``.`` removed
        and ``,`` treated as the decimal separator. When a ``cents`` field
        is present, its first element divided by 100 is added. The item is
        then inserted into ``self.col`` and printed.

        Args:
            item: Scraped item; anything ``ItemAdapter`` can wrap.
            spider: Spider that produced the item (unused).

        Returns:
            The ``item``, with ``name`` as a one-element list and ``price``
            as a float.

        Raises:
            DropItem: If the item has no ``name`` or no ``price`` field.
        """
        adapter = ItemAdapter(item)
        if 'name' not in adapter or 'price' not in adapter:
            raise DropItem(f"{item} without name or price")

        adapter['name'] = [unidecode(adapter['name'][0])]
        price_text = unidecode(adapter['price'][0])
        adapter['price'] = [price_text]

        # Remove the '.' separators, then use ',' as the decimal point.
        amount = float(price_text.replace('.', '').replace(',', '.'))
        if 'cents' in adapter:
            amount = amount + float(adapter['cents'][0]) / 100
        adapter['price'] = amount

        self.col.insert_one(adapter.asdict())
        print("Exporter:", adapter["name"], "| ", adapter["price"], "|",
              adapter['store'])

        return item
コード例 #12
0
 def process_item(self, item, spider):
     """Append the item to ``self.file`` as one JSON line and pass it on.

     Non-ASCII characters are written as-is rather than escaped.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The unchanged ``item``.
     """
     record = ItemAdapter(item).asdict()
     self.file.write(json.dumps(record, ensure_ascii=False) + "\n")
     return item
コード例 #13
0
 def process_item(self, item, spider):
     """Append the item to ``self.file`` as one ASCII-only JSON line.

     A short confirmation is printed after the line has been written.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The unchanged ``item``.
     """
     serialized = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=True)
     self.file.write(serialized + '\n')
     print('Item Scraped!')
     return item
コード例 #14
0
 def process_item(self, item, spider):
     """Timestamp the item, buffer it, and store it per keyword.

     ``created_at`` is set to the current local time as a POSIX timestamp.
     A dict copy of the item is appended to ``self.items`` and another copy
     is inserted into the ``kw-<spider.keyword>`` collection of ``self.db``.
     The success message is logged through the spider's logger.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap. Must carry
             a ``pid`` field.
         spider: Spider that produced the item; its ``keyword`` attribute
             selects the collection.

     Returns:
         The ``item`` (with ``created_at`` set), so that whatever consumes
         the return value receives an item rather than a status string.
     """
     adapter = ItemAdapter(item)
     adapter['created_at'] = datetime.now().timestamp()
     # Two separate dict copies: the one handed to insert_one is not the one
     # kept in self.items.
     # NOTE(review): presumably this keeps a driver-added ``_id`` out of the
     # buffered copy -- confirm against the database client in use.
     self.items.append(adapter.asdict())
     self.db[f'kw-{spider.keyword}'].insert_one(adapter.asdict())
     message = f"成功抓取关键词 [ {spider.keyword} ] 下的产品 {adapter['pid']} "
     spider.logger.info(message)
     return item
コード例 #15
0
ファイル: pipelines.py プロジェクト: iit2018062/Keywords
 def process_item(self, item, spider):
     """Upsert the item into the configured collection, keyed by its URL.

     The document whose ``url`` equals the item's ``url`` has all of the
     item's fields set on it; if no such document exists, one is created.

     Args:
         item: Scraped item; anything ``ItemAdapter`` can wrap.
         spider: Spider that produced the item (unused).

     Returns:
         The unchanged ``item``.
     """
     adapter = ItemAdapter(item)
     selector = {'url': adapter.get('url')}
     changes = {'$set': adapter.asdict()}
     self.db[self.collection_name].update_one(selector, changes, upsert=True)
     return item