コード例 #1
0
    def test_drop_duplicate(self):
        """Check that DropDuplicate yields None for one item and passes another through."""
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FileStorage(Path(tmpdir))
            storage.insert_many("test", self.items)

            pipeline = DropDuplicate(storage, "test")

            # NOTE(review): presumably id "123" is among self.items (fixture
            # not visible here), which is why the pipeline drops it.
            known = Item("123", {"x": 123})
            assert pipeline(known) is None

            # An item the pipeline does not drop must come back as the same object.
            unseen = Item("789", {"x": 789})
            assert pipeline(unseen) is unseen
コード例 #2
0
ファイル: test_store_item.py プロジェクト: altescy/mincrawler
    def test_overwrite_true():
        """With overwrite=True, storing the same id twice keeps the second content."""
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FileStorage(Path(tmpdir))
            pipeline = StoreItem(storage, "test", overwrite=True)

            # Push two items sharing id "123" through the pipeline, in order.
            first = Item("123", {"x": 456})
            pipeline(first)

            second = Item("123", {"x": 789})
            pipeline(second)

            # The value read back must be the one from the second item.
            assert storage.get("test", "123")["x"] == 789
コード例 #3
0
    def test_update(self):
        """update_many replaces the stored text of both previously inserted items."""
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FileStorage(Path(tmpdir))
            storage.insert_many("test", self.data)

            replacement_123 = Item("123", {"text": "xfoo"})
            replacement_456 = Item("456", {"text": "xbar"})
            storage.update_many("test", [replacement_123, replacement_456])

            # Each id must now resolve to its replacement text.
            fetched_123 = storage.get("test", "123")
            assert fetched_123["text"] == "xfoo"
            fetched_456 = storage.get("test", "456")
            assert fetched_456["text"] == "xbar"
コード例 #4
0
    def update(self, collection: str, item: Item) -> None:
        """Rewrite the JSON file of an item that is already stored.

        Args:
            collection: Name of the collection holding the item.
            item: Item whose ``to_dict()`` output replaces the stored file.

        Raises:
            ItemNotFoundError: If ``self.exists(collection, item)`` is falsy.
        """
        if self.exists(collection, item):
            target = self._get_item_path(collection, item)
            with target.open("w") as fp:
                json.dump(item.to_dict(), fp)
            return

        raise ItemNotFoundError(f"Item ( {item.id} ) not found.")
コード例 #5
0
    def upsert(self, collection: str, item: Item) -> None:
        """Write the item's JSON file, whether or not one already exists.

        Args:
            collection: Name of the collection to write into; it is passed to
                ``self._create_collection`` before the write.
            item: Item whose ``to_dict()`` output is dumped as JSON.
        """
        self._create_collection(collection)
        target = self._get_item_path(collection, item)

        # Mode "w" truncates any existing file, so this both inserts and updates.
        with target.open("w") as fp:
            json.dump(item.to_dict(), fp)

        logger.debug("save file: %s", str(target))
コード例 #6
0
    def get(self, collection: str, item_id: str) -> Item:
        """Load the item stored under ``item_id`` in ``collection``.

        The file is opened directly instead of being checked with
        ``path.exists()`` first. This closes the window in which the file
        could disappear between the check and the open, which would surface
        as a raw ``FileNotFoundError`` rather than ``ItemNotFoundError``.

        Args:
            collection: Name of the collection to read from.
            item_id: Identifier of the item; used as the file name inside the
                collection directory.

        Returns:
            The item rebuilt with ``Item.from_dict`` from the stored JSON.

        Raises:
            ItemNotFoundError: If no file exists for ``item_id``.
        """
        path = self._get_collection_path(collection) / item_id

        # Only the open is guarded, so errors from JSON decoding or
        # Item.from_dict are never mistaken for "not found".
        # NotADirectoryError covers a path component that is a regular file,
        # a case the previous ``path.exists()`` check also reported as missing.
        try:
            f = path.open("r")
        except (FileNotFoundError, NotADirectoryError) as err:
            raise ItemNotFoundError(f"Item ( {item_id} ) not found.") from err

        with f:
            item = Item.from_dict(json.load(f))

        return item
コード例 #7
0
def test_item():
    """Exercise Item's id attribute, len() and (nested) item get/set access."""
    subject = Item("123", {})

    subject["foo"] = {"x": 123}
    subject["bar"] = {"y": 456}
    # Overwrite the nested value through a fresh lookup on the item.
    subject["bar"]["y"] = 789

    assert subject.id == "123"
    assert len(subject) == 2

    foo_entry = subject["foo"]
    assert foo_entry["x"] == 123
    bar_entry = subject["bar"]
    assert bar_entry["y"] == 789
コード例 #8
0
    def insert(self, collection: str, item: Item) -> None:
        """Store ``item`` as a new JSON file in ``collection``.

        The file is opened in exclusive-creation mode (``"x"``), so the
        duplicate check and the file creation happen as one step. A separate
        ``path.exists()`` check followed by ``open("w")`` would let two
        concurrent inserts of the same id both pass the check, with one
        silently overwriting the other.

        Args:
            collection: Name of the collection to write into; it is passed to
                ``self._create_collection`` before the write.
            item: Item whose ``to_dict()`` output is dumped as JSON.

        Raises:
            ItemDuplicationError: If a file for this item already exists.
        """
        self._create_collection(collection)

        path = self._get_item_path(collection, item)

        # Only the open is guarded, so serialization errors are not
        # misreported as duplicates.
        try:
            f = path.open("x")
        except FileExistsError as err:
            raise ItemDuplicationError(f"Item id {item.id} already exists.") from err

        with f:
            json.dump(item.to_dict(), f)

        logger.debug("save file: %s", str(path))
コード例 #9
0
ファイル: test_store_item.py プロジェクト: altescy/mincrawler
 def setup(self):
     """Build an Item fixture and store it on ``self.item``."""
     # pylint:disable=attribute-defined-outside-init
     fixture = Item("123", {"x": 456})
     self.item = fixture
コード例 #10
0
ファイル: single_page.py プロジェクト: altescy/mincrawler
 def _get_item(self, response: httpx.Response) -> Item:
     """Wrap a response in an Item keyed by the response URL.

     The content is a dict with the single key ``"text"``, holding whatever
     ``self._get_text(response)`` returns.
     """
     url_as_id = str(response.url)
     return Item(url_as_id, {"text": self._get_text(response)})
コード例 #11
0
 def _build_item(video_dict: tp.Dict[str, tp.Any]) -> Item:
     """Create an Item whose id is ``video_dict["id"]`` and whose content is the dict itself."""
     # The dict is passed through as-is (not copied) as the item content.
     return Item(video_dict["id"], video_dict)
コード例 #12
0
 def setup(self):
     """Populate ``self.data`` with two Item fixtures (ids "123" and "456")."""
     # pylint:disable=attribute-defined-outside-init
     foo_item = Item("123", {"text": "foo"})
     bar_item = Item("456", {"text": "bar"})
     self.data = [foo_item, bar_item]
コード例 #13
0
 def task(path, taskid):
     """Insert one item derived from ``taskid`` into the "test" collection.

     A TinyDBStorage is created for ``path`` and used as a context manager
     for the insert. Returns ``taskid`` unchanged.
     """
     with TinyDBStorage(path) as opened:
         record = Item(str(taskid), {"task": taskid})
         opened.insert("test", record)
     return taskid
コード例 #14
0
ファイル: tweet_search.py プロジェクト: altescy/mincrawler
 def _run(self) -> tp.Iterator[Item]:
     """Yield one Item per tweet across every batch produced by ``self._crawl()``.

     Each Item uses ``str(tweet["id"])`` as its id and the tweet itself as content.
     """
     # The generator expression keeps iteration lazy, batch by batch.
     yield from (
         Item(str(entry["id"]), entry)
         for batch in self._crawl()
         for entry in batch
     )