def process_item(self, item, spider):
    """Upsert ``item`` into MongoDB, using its ``url`` field as the key.

    The item is converted to a plain dict and tagged with the spider's
    ``user`` and ``rq_id`` attributes (``None`` when the spider lacks them).
    ``CharterItem`` instances are written to ``self.collection_name``; every
    other item is written to the ``otherItems`` collection. In both cases an
    existing document with the same ``url`` is replaced, which prevents
    duplicates.

    Args:
        item: The scraped item; must support ``item['url']``.
        spider: The running spider; ``user`` / ``rq_id`` are read if present.

    Returns:
        The unchanged ``item``.
    """
    print("Processing item...")

    document = ItemAdapter(item).asdict()
    document["user"] = getattr(spider, "user", None)
    document["rq_id"] = getattr(spider, "rq_id", None)

    if isinstance(item, CharterItem):
        # Match on url; upsert=True inserts when no document matches.
        url_filter = {'url': item['url']}
        target = self.db[self.collection_name]
        target.replace_one(url_filter, document, upsert=True)
        logging.debug(f"MongoDB: Inserted {item['url']}.")
        return item

    # Anything that is not a CharterItem is kept in a separate collection.
    print("Not an instance of CharterItem")
    print(item['url'])
    self.db['otherItems'].replace_one({'url': item['url']}, document, upsert=True)
    return item
def process_item(self, item, spider):
    """Merge the result of ``self.get_address_info`` into ``item``.

    The item is wrapped in an ``ItemAdapter`` so it can be updated through a
    uniform interface; the values returned by ``get_address_info`` (called
    with ``limit=1``) are written into it.

    Args:
        item: The scraped item to enrich.
        spider: The running spider (unused).

    Returns:
        The same ``item`` object that was passed in.
    """
    wrapped = ItemAdapter(item)
    # NOTE(review): presumably this performs a remote API lookup - confirm
    # against get_address_info.
    address_info = self.get_address_info(wrapped, limit=1)
    wrapped.update(address_info)
    return item
def process_item(self, item, spider):
    """Upsert ``item`` into MongoDB, using its ``url`` field as the key.

    The MongoDB client is created on first use and then reused for every
    later item. The item is converted to a plain dict and tagged with the
    spider's ``user`` and ``rq_id`` attributes (``None`` when the spider
    lacks them). ``CharterItem`` instances go to the
    ``MONGODB_COLLECTION_TEXT`` collection; every other item goes to the
    ``otherItems`` collection. An existing document with the same ``url`` is
    replaced, which prevents duplicates.

    Args:
        item: The scraped item; must support ``item['url']``.
        spider: The running spider; ``user`` / ``rq_id`` are read if present.

    Returns:
        The unchanged ``item``.
    """
    print("Processing item...")

    # A MongoClient is meant to be created once and shared. Building a new
    # one for every item opened a fresh set of connections per item and
    # never closed the previous client.
    if getattr(self, "connection", None) is None:
        self.connection = pymongo.MongoClient(
            self.MONGO_URI,
            username=self.MONGO_USERNAME,
            password=self.MONGO_PASSWORD,
        )
        print("CONNECTED TO MONGO DB")
    # Database / collection handles are cheap lookups on the client, so they
    # are refreshed on every call exactly as before.
    self.db = self.connection[self.MONGODB_DB]
    self.collection = self.db[self.MONGODB_COLLECTION_TEXT]

    adapted_item = ItemAdapter(item).asdict()
    adapted_item.update({
        "user": getattr(spider, "user", None),
        "rq_id": getattr(spider, "rq_id", None),
    })

    # Only CharterItems are stored in the main collection.
    if not isinstance(item, CharterItem):
        print("Not an instance of CharterItem")
        print(item['url'])
        self.db['otherItems'].replace_one(
            {'url': item['url']}, adapted_item, upsert=True
        )
        return item

    # Match on url; upsert=True inserts when no document matches.
    query = {'url': item['url']}
    self.collection.replace_one(query, adapted_item, upsert=True)
    logging.debug(f"MongoDB: Inserted {item['url']}.")
    return item
def process_item(self, item, spider):
    """Normalise the item's ``area`` field, dropping items that lack one.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The item with ``area`` replaced by ``self.getSquare(str(area))``.

    Raises:
        DropItem: If ``area`` is missing or falsy.
    """
    itemAdapter = ItemAdapter(item=item)
    area = itemAdapter.get("area")
    if not area:
        # DropItem must be raised: Scrapy only discards an item when the
        # exception propagates. Returning it would hand the exception object
        # to the next pipeline stage as if it were the item.
        raise DropItem(
            f"Missing field in real estate at {itemAdapter.get('source')}")
    itemAdapter.update({"area": self.getSquare(str(area))})
    return itemAdapter.item
def process_item(self, item, spider):
    """Normalise the item's ``value`` field, dropping items that lack one.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The item with ``value`` replaced by
        ``self.getPrice(str(value), str(area))``.

    Raises:
        DropItem: If ``value`` is missing or falsy.
    """
    itemAdapter = ItemAdapter(item=item)
    value = itemAdapter.get("value")
    if not value:
        # DropItem must be raised: Scrapy only discards an item when the
        # exception propagates. Returning it would hand the exception object
        # to the next pipeline stage as if it were the item.
        raise DropItem(
            f"Missing real estate in paper at {itemAdapter.get('source')}")
    # area is stringified as-is (a missing area becomes "None"), matching
    # what getPrice has always been given.
    area = str(itemAdapter.get("area"))
    itemAdapter.update({"value": self.getPrice(str(value), area)})
    return itemAdapter.item
def process_item(self, item, spider):
    """Store ``item`` in ``self.CrawlerData`` unless an identical one exists.

    The item is converted to a dict and used as the query filter itself, so
    a document counts as "already stored" only when it matches every field
    of the item. New items are stamped with ``scantime`` before insertion.
    ``self.NewItems`` counts inserted items; ``self.TotalItems`` counts
    every item seen.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The unchanged ``item``.
    """
    item_data = ItemAdapter(item).asdict()
    # find_one stops at the first match. The previous
    # ``find(...).count() == 0`` counted every match and relies on
    # Cursor.count(), which no longer exists in PyMongo 4.
    if self.CrawlerData.find_one(item_data) is None:
        # NOTE(review): DateTime is defined outside this method - presumably
        # the timestamp of the current crawl run; confirm.
        item_data.update({'scantime': DateTime})
        self.CrawlerData.insert_one(item_data)
        self.NewItems += 1
    self.TotalItems += 1
    return item
def process_item(self, item, spider):
    """Normalise the item's ``time`` field, dropping items that lack one.

    All spaces and the "ngàyđăng:" label are stripped, and the relative
    markers "hômnay" / "hômqua" are replaced with today's / yesterday's date
    formatted as ``dd/mm/YYYY``.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The item with its ``time`` field normalised.

    Raises:
        DropItem: If ``time`` is missing or falsy.
    """
    itemAdapter = ItemAdapter(item=item)
    raw_time = itemAdapter.get("time")
    if not raw_time:
        # DropItem must be raised: Scrapy only discards an item when the
        # exception propagates. Returning it would hand the exception object
        # to the next pipeline stage as if it were the item.
        raise DropItem(
            f"Missing field in real estate at {itemAdapter.get('source')}")

    # Read the clock once so "today" and "yesterday" cannot disagree when
    # the call straddles midnight.
    current_day = date.today()
    today = current_day.strftime("%d/%m/%Y")
    yesterday = (current_day - timedelta(days=1)).strftime("%d/%m/%Y")

    # Spaces are removed first, which is why the markers below are matched
    # in their space-less form.
    posted = (
        str(raw_time)
        .replace(" ", "")
        .replace("ngàyđăng:", "")
        .replace("hômnay", today)
        .replace("hômqua", yesterday)
    )
    itemAdapter.update({"time": posted})
    return itemAdapter.item
def process_item(self, item, spider):
    """Resolve the item's ``type`` field, dropping incomplete items.

    The raw ``type`` is passed through ``self.get_refine_type``. When that
    yields ``"UNKNOW"``, the type is instead predicted by
    ``self.classifier`` from the concatenated title and description.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The item with its ``type`` field replaced by the resolved type.

    Raises:
        DropItem: If ``title`` or ``description`` is missing or falsy.
    """
    itemAdapter = ItemAdapter(item=item)
    title = itemAdapter.get('title')
    description = itemAdapter.get('description')
    if not (title and description):
        # DropItem must be raised: Scrapy only discards an item when the
        # exception propagates. Returning it would hand the exception object
        # to the next pipeline stage as if it were the item.
        raise DropItem(
            f"Missing field in paper at {itemAdapter.get('source')}")

    item_type = self.get_refine_type(str(itemAdapter.get("type")))
    if item_type == "UNKNOW":
        # predict() is given a one-element list and its first result is used.
        prediction = self.classifier.predict([str(title) + str(description)])
        item_type = prediction[0]
    itemAdapter.update({'type': item_type})
    return itemAdapter.item
def process_item(self, item, spider):
    """Split ``address`` into ward/district/province and keep only Hà Nội.

    The address is split on commas; its last three parts are taken as ward,
    district and province (in that order), and each is passed through
    ``self.extractLocationName``.

    Args:
        item: The scraped item.
        spider: The running spider (unused).

    Returns:
        The item with ``ward``, ``district`` and ``province`` set.

    Raises:
        DropItem: If the address has fewer than three comma-separated parts,
            or the extracted province is not "Hà Nội".
    """
    itemAdapter = ItemAdapter(item=item)
    parts = [
        piece.strip()
        for piece in str(itemAdapter.get('address')).split(',')
    ]
    if len(parts) < 3:
        # DropItem must be raised: Scrapy only discards an item when the
        # exception propagates. Returning it would hand the exception object
        # to the next pipeline stage as if it were the item.
        raise DropItem(
            f"Missing field in paper at {itemAdapter.get('source')}")

    ward, district, province = parts[-3:]
    itemAdapter.update({
        'ward': self.extractLocationName(ward),
        'district': self.extractLocationName(district),
        'province': self.extractLocationName(province)
    })

    if str(itemAdapter.get('province')) != "Hà Nội":
        raise DropItem(
            f"Missing field in paper at {itemAdapter.get('source')}")
    return itemAdapter.item