Beispiel #1
0
def insights_iter(file_path: pathlib.Path) -> Iterable[Prediction]:
    """Lazily yield one ``Prediction`` per record read from a JSONL file.

    Each record produced by ``jsonl_iter(file_path)`` is converted with
    ``dacite.from_dict``.

    :param file_path: path handed unchanged to ``jsonl_iter``.
    :return: a generator of ``Prediction`` instances, in file order.
    """
    # The dacite configuration is identical for every record, so build it
    # once instead of once per line of the file.
    config = dacite.Config(cast=[PredictionType])

    for prediction in jsonl_iter(file_path):
        yield dacite.from_dict(
            data_class=Prediction,
            data=prediction,
            config=config,
        )
Beispiel #2
0
    def stream(self) -> ProductStream:
        """Return a ``ProductStream`` over the records of ``self.jsonl_path``.

        Paths whose string form ends with ``.gz`` are read with
        ``gzip_jsonl_iter``; every other path is read with ``jsonl_iter``.
        """
        path = str(self.jsonl_path)
        # Only the reader differs between the two cases; the path argument
        # and the wrapping in ProductStream are the same.
        reader = gzip_jsonl_iter if path.endswith(".gz") else jsonl_iter
        return ProductStream(reader(path))
Beispiel #3
0
def ocr_iter(
    source: Union[str, TextIO, pathlib.Path]
) -> Iterable[Tuple[Optional[str], Dict]]:
    """Yield ``(source, ocr_json)`` pairs from several kinds of input.

    The input is dispatched as follows, in this order:

    * ``pathlib.Path``: read with ``jsonl_iter`` and delegated to
      ``ocr_content_iter``.
    * anything that is not a ``str`` (e.g. an open text file): read with
      ``jsonl_iter_fp`` and delegated to ``ocr_content_iter``.
    * a string accepted by ``is_barcode``: the product images are fetched
      and one pair is yielded per numeric image id with non-empty OCR data.
    * any other string: treated as a filesystem path. A directory is
      searched recursively for ``*.json`` files; a single ``.json`` file is
      yielded with ``None`` as its source; a ``.jsonl`` file is delegated to
      ``ocr_content_iter``. A path that does not exist prints a message and
      yields nothing.
    """
    if isinstance(source, pathlib.Path):
        yield from ocr_content_iter(jsonl_iter(source))
        return

    if not isinstance(source, str):
        # File-like object.
        yield from ocr_content_iter(jsonl_iter_fp(source))
        return

    if is_barcode(source):
        barcode: str = source
        images = fetch_images_for_ean(barcode)["product"]["images"]

        for image_id in images.keys():
            # Only purely numeric image ids are processed.
            if not image_id.isdigit():
                continue

            print("Getting OCR for image {}".format(image_id))
            ocr_json = get_json_for_image(barcode, image_id)
            image_source = get_source(image_id, barcode=barcode)
            if ocr_json:
                yield image_source, ocr_json
        return

    # Any remaining string is interpreted as a filesystem path.
    input_path = pathlib.Path(source)

    if not input_path.exists():
        print("Unrecognized input: {}".format(input_path))
        return

    if input_path.is_dir():
        for json_path in input_path.glob("**/*.json"):
            with json_path.open("rb") as json_file:
                file_source = get_source(json_path.stem,
                                         json_path=str(json_path))
                yield file_source, orjson.loads(json_file.read())
        return

    suffixes = input_path.suffixes

    if ".json" in suffixes:
        with input_path.open("rb") as json_file:
            yield None, orjson.loads(json_file.read())

    elif ".jsonl" in suffixes:
        yield from ocr_content_iter(jsonl_iter(input_path))
Beispiel #4
0
def insert_batch(data_path: pathlib.Path, model_name: str,
                 model_version: str) -> int:
    """Store the predictions of a JSONL file and return how many were added.

    For every record read from ``data_path`` an ``ImagePrediction`` row is
    created unless the ``(model_name, source_image)`` pair is already in the
    seen set or the image has no ``ImageModel`` row. Only results scoring
    above 0.1 are stored; a ``LogoAnnotation`` is additionally created for
    each stored result scoring at least 0.5.

    :param data_path: JSONL file; records are read with the keys
        ``barcode``, ``image_id`` and ``result``.
    :param model_name: model name saved on each prediction and used in the
        de-duplication key.
    :param model_version: model version saved on each prediction.
    :return: number of ``ImagePrediction`` rows created.
    """
    # One timestamp is shared by every prediction of the batch.
    timestamp = datetime.datetime.utcnow()

    logger.info("Loading seen set...")
    seen_set = get_seen_set()
    logger.info("Seen set loaded")

    inserted = 0

    for item in tqdm.tqdm(jsonl_iter(data_path)):
        source_image = generate_image_path(barcode=item["barcode"],
                                           image_id=item["image_id"])
        key = (model_name, source_image)

        if key in seen_set:
            continue

        image_instance = ImageModel.get_or_none(source_image=source_image)

        if image_instance is None:
            logger.warning("Unknown image in DB: {}".format(source_image))
            continue

        # Drop low-score detections before storing anything.
        kept = [r for r in item["result"] if r["score"] > 0.1]
        # None when no detection passed the filter.
        max_confidence = max((r["score"] for r in kept), default=None)

        inserted += 1
        image_prediction = ImagePrediction.create(
            type=TYPE,
            image=image_instance,
            timestamp=timestamp,
            model_name=model_name,
            model_version=model_version,
            data={"objects": kept},
            max_confidence=max_confidence,
        )

        # `index` is the position within the stored "objects" list.
        for index, detection in enumerate(kept):
            if detection["score"] >= 0.5:
                LogoAnnotation.create(
                    image_prediction=image_prediction,
                    index=index,
                    score=detection["score"],
                    bounding_box=detection["bounding_box"],
                )

        seen_set.add(key)

    return inserted
Beispiel #5
0
 def from_jsonl(self, file_path):
     """Import the records of a JSONL file via ``import_insights``.

     The records are read with ``jsonl_iter`` and imported with
     ``automatic=False``.
     """
     self.import_insights(jsonl_iter(file_path), automatic=False)
Beispiel #6
0
def insights_iter(file_path: pathlib.Path) -> Iterable[ProductInsights]:
    """Lazily yield a ``ProductInsights`` for each record of a JSONL file.

    Records come from ``jsonl_iter(file_path)`` and are converted one at a
    time with ``ProductInsights.from_dict``.
    """
    yield from map(ProductInsights.from_dict, jsonl_iter(file_path))
Beispiel #7
0
 def from_jsonl(self, file_path: pathlib.Path, server_domain: str):
     """Import the records of a JSONL file via ``import_insights``.

     :param file_path: JSONL file read with ``jsonl_iter``.
     :param server_domain: forwarded unchanged to ``import_insights``.

     The import is always performed with ``automatic=False``.
     """
     self.import_insights(
         jsonl_iter(file_path),
         server_domain=server_domain,
         automatic=False,
     )
import pathlib
from random import shuffle

from robotoff import settings
from robotoff.utils import dump_jsonl, jsonl_iter

# Split the category dataset of one language into validation, test and
# train files written next to the input file.
lang = "pt"
input_path: pathlib.Path = (
    settings.DATASET_DIR / "category" / "category_{}.jsonl".format(lang)
)

# Load everything in memory so the records can be shuffled in place.
items = list(jsonl_iter(input_path))
shuffle(items)

# A tenth of the records goes to validation, the next tenth to test and
# the remainder to train.
val_count = len(items) // 10
val_items = items[:val_count]
test_items = items[val_count:2 * val_count]
train_items = items[2 * val_count:]

# Output file name template paired with the records written to it.
splits = (
    ("category_{}.val.jsonl", val_items),
    ("category_{}.test.jsonl", test_items),
    ("category_{}.train.jsonl", train_items),
)

for name_template, split_items in splits:
    dump_jsonl(input_path.with_name(name_template.format(lang)),
               split_items)