Example #1
def test_simple_dedup():
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]

    results = list(dedup.deduplicate(documents, field="text"))
    expected = [
        # First document is untouched
        dict(
            text=text("_Hello", "_World", "I'm so original"),
            original_nlines=3,
            nlines=3,
            text_hash=[
                str_hash(h) for h in ["_hello", "_world", "i'm so original"]
            ],
        ),
        # Second document loses several lines
        dict(
            text="I'm originaler",
            original_nlines=3,
            nlines=1,
            text_hash=[0, str_hash("i'm originaler"), 0],
        ),
    ]

    assert_documents_equal(expected, results, ignoring=CUMBERSOME)
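
These examples reference shared test helpers (text, write_docs, str_hash, load_hashes, as_dict, assert_documents_equal, and the CUMBERSOME/LENGTHS ignore lists) whose definitions are not shown. Below is a minimal sketch of the two simplest ones, inferred from how the assertions use them; the real definitions may differ.

# Hedged sketch of two helpers the examples assume; inferred from usage,
# not copied from the actual test module.
import json
from pathlib import Path
from typing import List


def text(*lines: str) -> str:
    # A document is a single string with one sentence per line.
    return "\n".join(lines)


def write_docs(file: Path, docs: List[List[str]]) -> None:
    # One JSON object per line, with the sentences joined under the "text" field.
    file.parent.mkdir(parents=True, exist_ok=True)
    with open(file, "w") as f:
        for sentences in docs:
            print(json.dumps(dict(text=text(*sentences))), file=f)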
Example #2
    def test_dedup_with_hashes(self):
        tmp = self.get_tmpdir()

        documents = [
            dict(text=text("_Hello", "World", "I'm so original")),
            dict(text=text("Good morning", "World", "I'm originaler")),
        ]
        dump_hashes(tmp("hashes.bin"),
                    [str_hash(h) for h in ["_hello", "i'm originaler"]])
        results = list(
            dedup.deduplicate(documents,
                              field="text",
                              hashes=tmp("hashes.bin"),
                              add_hashes=False))
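        # With add_hashes=False only the pre-seeded hashes ("_hello",
        # "i'm originaler") trigger removals; new lines are not recorded,
        # which is why the repeated "World" survives in both documents.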
        expected = [
            dict(
                text=text("World", "I'm so original"),
                original_nlines=3,
                nlines=2,
                text_hash=[0,
                           str_hash("world"),
                           str_hash("i'm so original")],
            ),
            dict(
                text=text("Good morning", "World"),
                original_nlines=3,
                nlines=2,
                text_hash=[str_hash("good morning"),
                           str_hash("world"), 0],
            ),
        ]

        assert_documents_equal(expected, results, ignoring=CUMBERSOME)
Example #3
def test_remove_duplicates_sharded(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    assert {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    } == as_dict(h0)

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    assert {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    } == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)
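
The hash-set assertions above rely on one convention: a hash added once is stored as a non-duplicate (False) and only flips to duplicate (True) when added again. A small illustration of that assumption, reusing FlatHashSet, str_hash, and as_dict exactly as the test does:

def test_double_add_marks_duplicate():
    # Hedged illustration of the duplicate-marking convention assumed above.
    h = FlatHashSet()
    h.add([str_hash("seen once"), str_hash("seen twice")])
    h.add([str_hash("seen twice")])  # second add flips the value to True
    assert as_dict(h) == {
        str_hash("seen once"): False,
        str_hash("seen twice"): True,
    }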
Example #4
    def test_dedup_fast(self):
        data = self.get_tmpdir()
        part_0 = [["Hello", "_World", "I'm so original"]]
        write_docs(data("part_0.json"), part_0)
        part_1 = [["Good morning", "_World", "I'm originaler"]]
        write_docs(data("part_1.json"), part_1)

        res = self.get_tmpdir()
        h = self.get_tmpdir()
        dedup.deduplicate_concatenated(
            [data("part_0.json"), data("part_1.json")],
            [res("part_0.json"), res("part_1.json")],
            field="text",
            output_hashes=h("hashes.bin"),
        )

        with open(res("part_0.json")) as o:
            results_0 = [json.loads(l) for l in o.readlines()]
        expected_0 = [
            dict(
                text=text("Hello", "_World", "I'm so original"),
                original_nlines=3,
                nlines=3,
                text_hash=[
                    str_hash(w)
                    for w in ["hello", "_world", "i'm so original"]
                ],
            )
        ]
        assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

        with open(res("part_1.json")) as o:
            results_1 = [json.loads(l) for l in o.readlines()]
        expected_1 = [
            dict(
                text=text("Good morning", "I'm originaler"),
                original_nlines=3,
                nlines=2,
                text_hash=[
                    str_hash("good morning"), 0,
                    str_hash("i'm originaler")
                ],
            )
        ]

        assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)

        words = [w for part in [part_0, part_1] for doc in part for w in doc]
        expected = {str_hash(s.lower()): s.startswith("_") for s in words}
        self.assertEqual(expected, load_hashes(h("hashes.bin")))
Example #5
    def do(self, document: dict) -> Optional[str]:
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        all_sentences = [
            s
            for l in content.split("\n")
            if l
            for s in self.splitter.split(text=l)
        ]
        unique_sentences = []
        for s in all_sentences:
            if not s:
                continue
            h = dedup.str_hash(s)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(s)

        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        res = filter(lambda pp_s: self.threshold > pp_s[0] > 0,
                     zip(scores, unique_sentences))
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
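
The do method above assumes several attributes on its class that are not shown. A hypothetical skeleton of the surrounding class follows, with the attribute names taken from the method body; the class name and constructor are invented for illustration.

class SentenceFilter:  # hypothetical name, for illustration only
    def __init__(self, field: str, splitter, sp, lm, threshold: float):
        self.field = field          # JSON field holding the document text
        self.splitter = splitter    # sentence splitter exposing .split(text=...)
        self.sp = sp                # sentencepiece model (.encode_as_pieces)
        self.lm = lm                # kenlm language model (.score)
        self.threshold = threshold  # keep sentences with 0 < perplexity < threshold
        self.hashes = set()         # hashes of sentences already emitted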
Example #6
def test_dedup_from_hashes(tmp_path: Path):
    documents = [
        dict(text=text("_Hello", "World", "I'm so original")),
        dict(text=text("Good morning", "World", "I'm originaler")),
    ]
    seen = ["_hello", "i'm originaler", "world"]
    hashes = [str_hash(h) for h in seen]
    h = dedup.FlatHashSet()
    h.add(hashes)
    # Note: 'world' appears only once and won't be treated as a duplicate.
    h.add(hashes[:-1])
    h.dump(tmp_path / "hashes.bin")

    results = list(
        dedup.DuplicatesRemover("text", [tmp_path / "hashes.bin"]).map(documents)
    )
    expected = [
        dict(
            text=text("World", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[1, 2],
        ),
        dict(
            text=text("Good morning", "World"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 1],
        ),
    ]

    assert_documents_equal(expected, results, ignoring=LENGTHS)
Example #7
def test_dedup_with_dump(tmp_path: Path):
    hashes = tmp_path / "hashes.bin"
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    collector = dedup.HashesCollector(field="text", output=hashes)
    list(collector.map(documents))
    results = load_hashes(hashes)
    expected = {
        str_hash(l): l.startswith("_")
        for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    }
    assert expected == results
Example #8
def test_dedup_with_np_dump(tmp_path: Path):
    hashes = tmp_path / "hashes.bin"
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    with dedup.HashesCollector(field="text", output=hashes) as d:
        list(d.map(documents))

    results = FlatHashSet()
    results.load_np(hashes)
    expected = set(
        str_hash(l) for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    )
    assert expected == set(results.keys())
Example #9
def test_dedup_fast(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)
    parts = [data / "part_0.json", data / "part_1.json"]

    res = tmp_path / "res"
    res.mkdir()
    h = tmp_path / "hashes.bin"
    field = "text"
    jsonql.run_pipes(dedup.HashesCollector(field, output=h), file=parts)
    for part in parts:
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(
            text=text("Good morning", "I'm originaler"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

    words = [w for part in [part_0, part_1] for doc in part for w in doc]
    expected = {str_hash(s.lower()): s.startswith("_") for s in words}
    assert expected == load_hashes(h)
Example #10
    def test_dedup_with_dump(self):
        tmp = self.get_tmpdir()

        documents = [
            dict(text=text("_Hello", "_World", "I'm so original")),
            dict(text=text("_world", "I'm originaler", "_Hello")),
        ]
        list(
            dedup.deduplicate(documents,
                              field="text",
                              output_hashes=tmp("hashes.bin")))
        results = load_hashes(tmp("hashes.bin"))
        expected = {
            str_hash(l): l.startswith("_")
            for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
        }
        self.assertEqual(expected, results)
Example #11
    def test_dedup_with_np_dump(self):
        tmp = self.get_tmpdir()

        documents = [
            dict(text=text("_Hello", "_World", "I'm so original")),
            dict(text=text("_world", "I'm originaler", "_Hello")),
        ]
        with dedup.HashesCollector(field="text",
                                   output=tmp("hashes.bin")) as d:
            list(d.map(documents))

        results = FlatHashSet()
        results.load_np(tmp("hashes.bin"))
        expected = set(
            str_hash(l)
            for l in ["_hello", "_world", "i'm so original", "i'm originaler"])
        self.assertEqual(expected, set(results.keys()))
Example #12
    def test_remove_duplicates_sharded(self):
        data = self.get_tmpdir()
        part_0 = [["Hello", "_World", "I'm so original"]]
        write_docs(data("part_0.json"), part_0)
        part_1 = [["_Good morning", "_World", "I'm originaler"]]
        write_docs(data("part_1.json"), part_1)

        h = self.get_tmpdir()
        h0 = FlatHashSet()
        h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
        h0.add([str_hash("_world")])
        h0.dump(h("part_0.bin"))
        self.assertEqual(
            {
                str_hash("hello"): False,
                str_hash("_world"): True,
                str_hash("i'm so original"): False,
            },
            as_dict(h0),
        )

        h1 = FlatHashSet()
        h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
        h1.add([str_hash("_good morning")])
        h1.dump(h("part_1.bin"))
        self.assertEqual(
            {
                str_hash("_good morning"): True,
                str_hash("_world"): False,
                str_hash("i'm originaler"): False,
            },
            as_dict(h1),
        )

        res = self.get_tmpdir()
        # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
        dedup.remove_duplicates_sharded(
            files=[data("part_0.json"),
                   data("part_1.json")],
            outputs=[res("part_0.json"),
                     res("part_1.json")],
            field="text",
            hashes_dir=h(),
        )

        with open(res("part_0.json")) as o:
            lines = o.readlines()
            results_0 = list(jsonql.read_jsons(lines))
        expected_0 = [
            dict(text=text("Hello", "I'm so original"),
                 original_nlines=3,
                 nlines=2)
        ]
        assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

        with open(res("part_1.json")) as o:
            results_1 = [json.loads(l) for l in o.readlines()]
        # First pass removes "_world", second "_good morning".
        expected_1 = [
            dict(text=text("I'm originaler"), original_nlines=3, nlines=1)
        ]

        assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)