Esempio n. 1
0
    def process_item(self, item, spider):
        """
        Upsert the scraped item into MongoDB, keyed by its ``url`` field.

        The item is converted to a plain dict and tagged with the spider's
        ``user`` and ``rq_id`` attributes (``None`` when the spider does not
        define them). ``CharterItem`` instances are written to the collection
        named by ``self.collection_name``; every other item is written to the
        ``otherItems`` collection. In both cases an existing document with
        the same url is replaced, which prevents duplicates.

        Returns the original ``item`` unchanged.
        """
        print("Processing item...")

        document = ItemAdapter(item).asdict()
        document["user"] = getattr(spider, "user", None)
        document["rq_id"] = getattr(spider, "rq_id", None)

        if isinstance(item, CharterItem):
            # Match on url; upsert=True inserts when no document matches.
            url_filter = {'url': item['url']}
            charter_collection = self.db[self.collection_name]
            charter_collection.replace_one(url_filter, document, upsert=True)
            logging.debug(f"MongoDB: Inserted {item['url']}.")
            return item

        # Anything that is not a CharterItem goes to a separate collection.
        print("Not an instance of CharterItem")
        print(item['url'])
        self.db['otherItems'].replace_one(
            {'url': item['url']}, document, upsert=True
        )
        return item
Esempio n. 2
0
    def process_item(self, item, spider):
        """
        Merge the result of ``self.get_address_info`` into the item.

        The item is wrapped in an ``ItemAdapter`` so it can be updated
        regardless of its concrete item type; the adapter writes through to
        ``item``, which is then returned.
        """
        wrapped = ItemAdapter(item)
        # Per the original comment this helper calls an external API;
        # limit=1 is passed through to it as-is.
        address_info = self.get_address_info(wrapped, limit=1)
        wrapped.update(address_info)
        return item
Esempio n. 3
0
    def process_item(self, item, spider):
        """
        Upsert the scraped item into MongoDB, keyed by its ``url`` field.

        The MongoDB client is created on the first call and reused for all
        later items. ``self.connection``, ``self.db`` and ``self.collection``
        are still (re)assigned on every call, as before.

        The item is converted to a plain dict and tagged with the spider's
        ``user`` and ``rq_id`` attributes (``None`` when absent).
        ``CharterItem`` instances go to ``self.MONGODB_COLLECTION_TEXT``;
        every other item goes to the ``otherItems`` collection.

        Returns the original ``item`` unchanged.
        """
        print("Processing item...")

        # Opening a new MongoClient per item leaks a connection pool on every
        # call (the old client was never closed). Create it once and reuse it.
        if getattr(self, "connection", None) is None:
            self.connection = pymongo.MongoClient(
                self.MONGO_URI,
                username=self.MONGO_USERNAME,
                password=self.MONGO_PASSWORD,
            )
            print("CONNECTED TO MONGO DB")
        # Database/collection handles are lightweight lookups on the client.
        self.db = self.connection[self.MONGODB_DB]
        self.collection = self.db[self.MONGODB_COLLECTION_TEXT]

        adapted_item = ItemAdapter(item).asdict()
        adapted_item.update({
            "user": getattr(spider, "user", None),
            "rq_id": getattr(spider, "rq_id", None),
        })

        # Items that are not CharterItems are kept in a separate collection.
        if not isinstance(item, CharterItem):
            print("Not an instance of CharterItem")
            print(item['url'])
            self.db['otherItems'].replace_one(
                {'url': item['url']}, adapted_item, upsert=True
            )
            return item

        # Match on url; upsert=True inserts when the query finds no document.
        query = {'url': item['url']}
        self.collection.replace_one(query, adapted_item, upsert=True)
        logging.debug(f"MongoDB: Inserted {item['url']}.")
        return item
Esempio n. 4
0
 def process_item(self, item, spider):
     """
     Normalise the item's ``area`` field via ``self.getSquare``.

     Returns the underlying item with ``area`` replaced by the value
     ``self.getSquare`` produces for the stringified original.

     Raises:
         DropItem: when ``area`` is missing or falsy. The exception is
             raised rather than returned; a returned DropItem instance
             would be handed on as if it were the item (assuming this is
             Scrapy's ``DropItem`` exception).
     """
     adapter = ItemAdapter(item=item)
     raw_area = adapter.get("area")
     if not raw_area:
         raise DropItem(
             f"Missing field in real estate at {adapter.get('source')}")
     adapter.update({"area": self.getSquare(str(raw_area))})
     return adapter.item
Esempio n. 5
0
 def process_item(self, item, spider):
     """
     Normalise the item's ``value`` field via ``self.getPrice``.

     ``self.getPrice`` receives the stringified ``value`` and ``area``
     fields (a missing ``area`` is passed as the string ``"None"``, as
     before). Returns the underlying item with ``value`` replaced.

     Raises:
         DropItem: when ``value`` is missing or falsy. The exception is
             raised rather than returned; a returned DropItem instance
             would be handed on as if it were the item (assuming this is
             Scrapy's ``DropItem`` exception).
     """
     adapter = ItemAdapter(item=item)
     raw_value = adapter.get("value")
     if not raw_value:
         raise DropItem(
             f"Missing real estate in paper at {adapter.get('source')}")
     value = str(raw_value)
     area = str(adapter.get("area"))
     adapter.update({"value": self.getPrice(value, area)})
     return adapter.item
Esempio n. 6
0
 def process_item(self, item, spider):
     """
     Insert the item into ``self.CrawlerData`` unless an identical
     document is already stored, and keep running counters.

     A new document is stamped with ``scantime`` (the ``DateTime`` name
     defined outside this method) before insertion. ``self.NewItems`` is
     incremented for every insert and ``self.TotalItems`` for every item
     seen. Returns the original ``item`` unchanged.
     """
     item_data = ItemAdapter(item).asdict()
     # find_one() replaces find(...).count(): Cursor.count() was removed in
     # PyMongo 4, and an existence check only needs the first match.
     # NOTE(review): assumes self.CrawlerData is a PyMongo collection.
     if self.CrawlerData.find_one(item_data) is None:
         item_data.update({'scantime': DateTime})
         self.CrawlerData.insert_one(item_data)
         self.NewItems += 1
     self.TotalItems += 1
     return item
Esempio n. 7
0
 def process_item(self, item, spider):
     """
     Normalise the item's ``time`` field to a ``dd/mm/YYYY`` style string.

     All spaces are removed, the "ngàyđăng:" label (Vietnamese for
     "posting date:") is stripped, and the relative words "hômnay"
     (today) and "hômqua" (yesterday) are replaced by the matching
     calendar dates. Returns the underlying item with ``time`` replaced.

     Raises:
         DropItem: when ``time`` is missing or falsy. The exception is
             raised rather than returned; a returned DropItem instance
             would be handed on as if it were the item (assuming this is
             Scrapy's ``DropItem`` exception).
     """
     adapter = ItemAdapter(item=item)
     raw_time = adapter.get("time")
     if not raw_time:
         raise DropItem(
             f"Missing field in real estate at {adapter.get('source')}")

     # Sample the clock once so "today" and "yesterday" stay consistent
     # even if the call straddles midnight.
     current_day = date.today()
     today = current_day.strftime("%d/%m/%Y")
     yesterday = (current_day - timedelta(days=1)).strftime("%d/%m/%Y")

     # Spaces are removed first, so the tokens below are matched without
     # spaces.
     posted = str(raw_time).replace(" ", "")
     posted = posted.replace("ngàyđăng:", "")
     posted = posted.replace("hômnay", today).replace("hômqua", yesterday)
     adapter.update({"time": posted})
     return adapter.item
Esempio n. 8
0
 def process_item(self, item, spider):
     """
     Resolve the item's ``type`` field, predicting it when unknown.

     The stringified ``type`` is passed through ``self.get_refine_type``.
     If that yields ``"UNKNOW"``, ``self.classifier.predict`` is called on
     a one-element list holding title and description concatenated, and
     the first prediction is used instead. Returns the underlying item
     with ``type`` replaced.

     Raises:
         DropItem: when ``title`` or ``description`` is missing or falsy.
             The exception is raised rather than returned; a returned
             DropItem instance would be handed on as if it were the item
             (assuming this is Scrapy's ``DropItem`` exception).
     """
     adapter = ItemAdapter(item=item)
     if not (adapter.get('title') and adapter.get('description')):
         raise DropItem(
             f"Missing field in paper at {adapter.get('source')}")

     # Named item_type to avoid shadowing the builtin ``type``.
     item_type = self.get_refine_type(str(adapter.get("type")))
     if item_type == "UNKNOW":
         text = [
             str(adapter.get('title')) + str(adapter.get('description'))
         ]
         item_type = self.classifier.predict(text)[0]
     adapter.update({'type': item_type})
     return adapter.item
Esempio n. 9
0
 def process_item(self, item, spider):
     """
     Split the item's ``address`` into ward, district and province.

     The stringified address is split on commas and each part stripped.
     The last three parts are passed through ``self.extractLocationName``
     and stored as ``ward``, ``district`` and ``province``. Only items
     whose province is "Hà Nội" are kept; the underlying item is returned
     for those.

     Raises:
         DropItem: when the address has fewer than three comma-separated
             parts, or the province is not "Hà Nội". The exception is
             raised rather than returned; a returned DropItem instance
             would be handed on as if it were the item (assuming this is
             Scrapy's ``DropItem`` exception).
     """
     adapter = ItemAdapter(item=item)
     parts = [
         part.strip() for part in str(adapter.get('address')).split(',')
     ]
     if len(parts) < 3:
         raise DropItem(
             f"Missing field in paper at {adapter.get('source')}")

     # The address is read right-to-left: ..., ward, district, province.
     adapter.update({
         'ward': self.extractLocationName(parts[-3]),
         'district': self.extractLocationName(parts[-2]),
         'province': self.extractLocationName(parts[-1]),
     })
     if str(adapter.get('province')) != "Hà Nội":
         raise DropItem(
             f"Missing field in paper at {adapter.get('source')}")
     return adapter.item