Example 1
def get_output(transformer, data, **kwargs):
    """Run `transformer` over the documents in `data` and return what it wrote."""
    buffer = io.StringIO()
    with buffer:
        # Wrap `data` in a generator so run_pipe treats it as a stream of
        # documents rather than as a list of file names.
        docs = (doc for doc in data)
        jsonql.run_pipe(transformer, kwargs, file=docs, output=buffer)
        return buffer.getvalue()
Example 2
def deduplicate_concatenated(files,
                             outputs,
                             field,
                             output_hashes,
                             finalize=True):
    """Deduplicate several files at once, using the same set of hashes for all."""
    hashes = FlatHashSet()
    # One shared kwargs dict: every file feeds the same growing hash set.
    shared_kwargs = {
        "field": field,
        "hashes": hashes,
        "add_hashes": True,
        "output_hashes": None,
        "finalize": finalize,
    }

    assert len(files) == len(outputs)
    for in_file, out_file in zip(files, outputs):
        jsonql.run_pipe(deduplicate, shared_kwargs, file=in_file, output=out_file)
        log(f"Saw {len(hashes)} hashes.")

        # Checkpoint the accumulated hashes after each processed file.
        if output_hashes:
            log(f"Dumping {len(hashes)} hashes to {output_hashes}.")
            hashes.dump(output_hashes)
Example 3
def perplexity_to_bin(file: Path, output: Path, models, tok_field: str):
    """Score every document in `file` with a language model and write the
    perplexities to `output` as a flat array of little-endian float32.

    Args:
        file: input file of json documents (possibly compressed;
            opened via jsonql.smart_open).
        output: destination binary file.
        models: language model file(s), forwarded to DocLM.
        tok_field: name of the document field holding the tokenized text.
    """
    pp_field = "perplexity"
    lm = DocLM(models, tok_field, output_field=pp_field)
    batch_size = 100_000
    batch: List[float] = []
    # NOTE: the original also accumulated up to 1M scores in a `stats` list
    # and kept a document counter `i`; neither was ever read, so both were
    # removed as dead code.
    with jsonql.smart_open(file) as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            pp = lm(doc)[pp_field]
            batch.append(pp)
            # Flush in large chunks so we never hold all scores in memory.
            if len(batch) >= batch_size:
                np.array(batch, dtype=np.float32).tofile(o)
                batch = []
        # Flush the trailing partial batch.
        if batch:
            np.array(batch, dtype=np.float32).tofile(o)


if __name__ == "__main__":
    cli = get_args()
    out_path = Path(cli["output"])
    # A ".bin" output means "dump raw perplexity floats"; anything else
    # runs the regular json-in / json-out pipe.
    if out_path.suffix != ".bin":
        jsonql.run_pipe(DocLM, cli)
    else:
        perplexity_to_bin(cli["file"], out_path, cli["models"], cli["field"])
Example 4
        )
        summ = super().summary()
        if self.threshold > 0:
            ratio = n_accepted / n_doc if n_doc else 0
            summ.append(f"Kept {n_accepted} docs over {n_doc} ({ratio :.1%})")
        summ.append(f"Found {len(cnt)} {out_field} labels: {cnt}")

        disagreement = n_disagreement / n_doc if n_doc else 0
        if disagreement:
            summ.append(f"{out_field} disagreement is at {disagreement:.1%}.")
        return summ

    def __repr__(self):
        """Debug representation showing the wrapped model."""
        return "Classifier({})".format(self.model)


def classify_and_split(file, output, pattern, **kwargs):
    """Classify each document, then route it to a file chosen by `pattern`."""
    jsonql.run_pipes(
        Classifier(**kwargs),
        jsonql.split(pattern),
        file=file,
        output=output,
    )


if __name__ == "__main__":
    args = get_args()
    # `pattern` decides whether classified docs are split across files
    # or all flow through a single pipe.
    pattern = args.get("pattern")
    if pattern:
        classify_and_split(**args)
    else:
        # Classifier() does not accept `pattern`, so drop it before
        # forwarding. Use a default: `get()` above tolerates a missing
        # key, so a bare pop() would raise KeyError in that same case.
        args.pop("pattern", None)
        jsonql.run_pipe(Classifier, args)
Example 5
def main():
    """Entry point: parse CLI arguments and run the dedup pipe over them."""
    cli_args = get_args()
    return jsonql.run_pipe(deduplicate, cli_args)