def test_drop_duplicate(self):
    """DropDuplicate returns None for a stored id and passes a new item through."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        storage = FileStorage(Path(tmp_dir))
        storage.insert_many("test", self.items)
        pipeline = DropDuplicate(storage, "test")

        # Expected to be dropped; presumably "123" is among self.items
        # (the fixture is defined outside this block).
        known = Item("123", {"x": 123})
        assert pipeline(known) is None

        # An item that is not dropped must come back as the same object.
        fresh = Item("789", {"x": 789})
        assert pipeline(fresh) is fresh
def test_overwrite_true():
    """With overwrite=True, storing the same id twice keeps the latest content."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        storage = FileStorage(Path(tmp_dir))
        pipeline = StoreItem(storage, "test", overwrite=True)

        # Push two versions of item "123" through the pipeline, in order.
        for value in (456, 789):
            pipeline(Item("123", {"x": value}))

        # The second version must be the one that is stored.
        assert storage.get("test", "123")["x"] == 789
def test_update(self):
    """update_many replaces the text of items that were inserted beforehand."""
    # (item id, replacement text) pairs, in the order they are submitted.
    replacements = (("123", "xfoo"), ("456", "xbar"))

    with tempfile.TemporaryDirectory() as tmp_dir:
        storage = FileStorage(Path(tmp_dir))
        storage.insert_many("test", self.data)

        new_items = [Item(item_id, {"text": text}) for item_id, text in replacements]
        storage.update_many("test", new_items)

        # Every replaced item must now read back with its new text.
        for item_id, text in replacements:
            assert storage.get("test", item_id)["text"] == text
def update(self, collection: str, item: Item) -> None:
    """Rewrite the JSON file of an item that is already stored.

    Args:
        collection: Name of the collection holding the item.
        item: Item whose serialized form replaces the stored one.

    Raises:
        ItemNotFoundError: If ``self.exists(collection, item)`` is false.
    """
    already_stored = self.exists(collection, item)
    if not already_stored:
        raise ItemNotFoundError(f"Item ( {item.id} ) not found.")

    target = self._get_item_path(collection, item)
    # Mode "w" truncates the file, so the old content is fully replaced.
    with target.open("w") as fp:
        json.dump(item.to_dict(), fp)
def upsert(self, collection: str, item: Item) -> None:
    """Write the item's JSON to its file, replacing any existing content.

    Args:
        collection: Name of the collection to write into.
        item: Item to serialize with ``to_dict`` and store.
    """
    # Presumably makes sure the collection location exists before writing.
    self._create_collection(collection)
    target = self._get_item_path(collection, item)

    # Mode "w" creates the file or truncates an existing one.
    with target.open("w") as fp:
        json.dump(item.to_dict(), fp)

    logger.debug("save file: %s", str(target))
def get(self, collection: str, item_id: str) -> Item:
    """Load one item of ``collection`` from its JSON file.

    The file is looked up as ``<collection path>/<item_id>``.

    Args:
        collection: Name of the collection to read from.
        item_id: Identifier of the item; used as the file name.

    Returns:
        The item rebuilt by ``Item.from_dict`` from the decoded JSON.

    Raises:
        ItemNotFoundError: If there is no file for ``item_id``.
    """
    path = self._get_collection_path(collection) / item_id

    # Open directly and translate the failure instead of probing with
    # exists() first: a file removed between the check and the open would
    # otherwise surface as a raw FileNotFoundError. NotADirectoryError is
    # included because exists() also reports such paths as missing.
    try:
        f = path.open("r")
    except (FileNotFoundError, NotADirectoryError):
        raise ItemNotFoundError(f"Item ( {item_id} ) not found.") from None

    with f:
        return Item.from_dict(json.load(f))
def test_item():
    """Item exposes its id, supports len() and nested item assignment."""
    item = Item("123", {})

    # Populate two top-level keys, in this order.
    for key, value in (("foo", {"x": 123}), ("bar", {"y": 456})):
        item[key] = value

    # Mutating the nested mapping must be visible through the item.
    item["bar"]["y"] = 789

    assert item.id == "123"
    assert len(item) == 2
    assert item["foo"]["x"] == 123
    assert item["bar"]["y"] == 789
def insert(self, collection: str, item: Item) -> None:
    """Store a new item as a JSON file, refusing to overwrite an existing one.

    Args:
        collection: Name of the collection to write into.
        item: Item to serialize with ``to_dict`` and store.

    Raises:
        ItemDuplicationError: If a file for this item already exists.
    """
    self._create_collection(collection)
    path = self._get_item_path(collection, item)

    # Mode "x" creates the file exclusively, so the existence check and the
    # creation are one atomic step. A separate exists() check followed by
    # open("w") lets two concurrent inserts of the same id both succeed,
    # with the second one silently overwriting the first.
    try:
        f = path.open("x")
    except FileExistsError:
        raise ItemDuplicationError(f"Item id {item.id} already exists.") from None

    with f:
        json.dump(item.to_dict(), f)
    logger.debug("save file: %s", str(path))
def setup(self):
    """Create the single Item fixture stored on ``self.item``."""
    # pylint:disable=attribute-defined-outside-init
    item_id = "123"
    content = {"x": 456}
    self.item = Item(item_id, content)
def _get_item(self, response: httpx.Response) -> Item:
    """Build an Item from a response.

    The item id is the response URL as a string; the content is a dict
    whose ``"text"`` entry is whatever ``self._get_text(response)`` returns.
    """
    return Item(str(response.url), {"text": self._get_text(response)})
def _build_item(video_dict: tp.Dict[str, tp.Any]) -> Item:
    """Wrap a video dict in an Item keyed by the dict's ``"id"`` entry.

    The dict itself (not a copy) becomes the item content.
    """
    return Item(video_dict["id"], video_dict)
def setup(self):
    """Create the list of Item fixtures stored on ``self.data``."""
    # pylint:disable=attribute-defined-outside-init
    id_text_pairs = (("123", "foo"), ("456", "bar"))
    self.data = [Item(item_id, {"text": text}) for item_id, text in id_text_pairs]
def task(path, taskid):
    """Insert one item for ``taskid`` into the "test" collection.

    Opens a TinyDBStorage at ``path`` as a context manager, inserts an item
    whose id is ``str(taskid)``, and returns ``taskid`` unchanged.
    """
    with TinyDBStorage(path) as store:
        record = Item(str(taskid), {"task": taskid})
        store.insert("test", record)
    return taskid
def _run(self) -> tp.Iterator[Item]:
    """Yield one Item per tweet produced by ``self._crawl()``.

    ``self._crawl()`` is iterated as batches of tweets. Each tweet becomes
    an Item whose id is ``str(tweet["id"])`` and whose content is the tweet
    itself.
    """
    for batch in self._crawl():
        # The generator expression stays lazy: one Item is built per request.
        yield from (Item(str(tweet["id"]), tweet) for tweet in batch)