Esempio n. 1
0
def extract_ocr_insights(
    ocr_url: str, insight_types: Iterable[InsightType]
) -> Dict[InsightType, ProductInsights]:
    """Extract insights of the requested types from the OCR result at a URL.

    The source image and the barcode are both derived from ``ocr_url`` (via
    ``get_source_from_ocr_url`` and ``get_barcode_from_url``) and attached to
    every ``ProductInsights`` returned.

    Args:
        ocr_url: URL of the OCR result to process.
        insight_types: insight types to run the extraction for.

    Returns:
        A mapping from insight type to its ``ProductInsights``. Types for
        which the extraction returned nothing are omitted. An empty dict is
        returned when no OCR result could be obtained for the URL.

    Raises:
        ValueError: if no barcode can be extracted from ``ocr_url``.
    """
    source_image = get_source_from_ocr_url(ocr_url)
    barcode = get_barcode_from_url(ocr_url)

    if barcode is None:
        raise ValueError("cannot extract barcode from URL: {}".format(ocr_url))

    ocr_result = get_ocr_result(ocr_url)

    if ocr_result is None:
        # Best effort: a missing OCR result is logged, not raised.
        logger.info("Error during OCR extraction: %s", ocr_url)
        return {}

    results = {}

    for insight_type in insight_types:
        insights = ocr.extract_insights(ocr_result, insight_type)

        # Only keep types that actually produced something.
        if insights:
            results[insight_type] = ProductInsights(
                barcode=barcode,
                insights=insights,
                source_image=source_image,
                type=insight_type,
            )

    return results
Esempio n. 2
0
def extract_ocr_insights(ocr_url: str) -> JSONType:
    """Download the OCR JSON at ``ocr_url`` and extract insights from it.

    Args:
        ocr_url: URL of the OCR JSON file.

    Returns:
        A dict mapping insight type name to the extracted insights, holding
        only the types for which the extraction returned something. An empty
        dict is returned when the URL answers 404 or when the JSON cannot be
        turned into an OCR result.

    Raises:
        requests.HTTPError: for any non-404 HTTP error status.
    """
    r = requests.get(ocr_url)

    if r.status_code == 404:
        logger.info("OCR JSON {} not found".format(ocr_url))
        return {}

    r.raise_for_status()

    # Reuse the response that was just validated instead of downloading the
    # same URL a second time (the second download was never status-checked).
    ocr_data: Dict = r.json()
    ocr_result = ocr.OCRResult.from_json(ocr_data)

    if ocr_result is None:
        logger.info("Error during OCR extraction: {}".format(ocr_url))
        return {}

    results = {}

    for insight_type in (InsightType.label.name,
                         InsightType.packager_code.name,
                         InsightType.product_weight.name,
                         InsightType.image_flag.name,
                         InsightType.expiration_date.name,
                         InsightType.brand.name, InsightType.store.name):
        insights = ocr.extract_insights(ocr_result, insight_type)

        # Skip types that produced no insight.
        if insights:
            results[insight_type] = insights

    return results
Esempio n. 3
0
def get_insights_from_product_name(barcode: str, product_name: str) -> Dict:
    """Run every product-name insight extraction on ``product_name``.

    Each extracted insight is tagged in place with ``"source":
    "product_name"``. The result maps insight type to a dict holding the
    insights, the barcode and the type; types that yield nothing are left
    out.
    """
    by_type = {}

    for kind in PRODUCT_NAME_INSIGHT_TYPES:
        found = ocr.extract_insights(product_name, kind)

        if not found:
            continue

        # Record where these insights came from.
        for entry in found:
            entry["source"] = "product_name"

        by_type[kind] = {
            "insights": found,
            "barcode": barcode,
            "type": kind,
        }

    return by_type
Esempio n. 4
0
def get_insights_from_product_name(
        barcode: str, product_name: str) -> Dict[InsightType, ProductInsights]:
    """Run every product-name insight extraction on ``product_name``.

    Each extracted insight gets ``data["source"]`` set to ``"product_name"``.
    The result maps insight type to a ``ProductInsights`` built from the
    insights, the barcode and the type; types that yield nothing are left
    out.
    """
    by_type = {}

    for kind in PRODUCT_NAME_INSIGHT_TYPES:
        found = ocr.extract_insights(product_name, kind)

        if not found:
            continue

        # Record where these insights came from.
        for entry in found:
            entry.data["source"] = "product_name"

        by_type[kind] = ProductInsights(
            insights=found,
            barcode=barcode,
            type=kind,
        )

    return by_type
Esempio n. 5
0
def extract_ocr_insights(ocr_url: str,
                         insight_types: Iterable[str]) -> JSONType:
    """Fetch the OCR JSON at ``ocr_url`` and extract the requested insights.

    Any HTTP error status raises through ``raise_for_status``. When the JSON
    cannot be turned into an OCR result, the failure is logged and an empty
    dict is returned. Otherwise the result maps each insight type to its
    insights, omitting the types that produced nothing.
    """
    response = http_session.get(ocr_url)
    response.raise_for_status()

    parsed = ocr.OCRResult.from_json(response.json())

    if parsed is None:
        logger.info("Error during OCR extraction: {}".format(ocr_url))
        return {}

    # Lazily pair each type with its extraction, then keep the non-empty ones.
    extracted = (
        (kind, ocr.extract_insights(parsed, kind)) for kind in insight_types
    )
    return {kind: found for kind, found in extracted if found}
Esempio n. 6
0
def run_from_ocr_archive(input_: str, insight_type: str,
                         output: Optional[str]):
    """Extract insights from an OCR archive and write them as JSON lines.

    Args:
        input_: archive location, passed as-is to ``ocr_iter``.
        insight_type: insight type to extract from every OCR result.
        output: path of the file to write to; when ``None`` the items are
            written to ``sys.stdout``.

    For every ``(source, ocr_json)`` pair yielded by ``ocr_iter`` that has a
    source, a barcode and a usable OCR result, one JSON object per line is
    written with the keys ``insights``, ``barcode``, ``type`` and ``source``.
    Entries without any insight are not written.
    """
    # ExitStack closes only what this function opened: sys.stdout must stay
    # open for the rest of the process when no output path is given.
    with contextlib.ExitStack() as stack:
        if output is not None:
            output_f = stack.enter_context(open(output, 'w'))
        else:
            output_f = sys.stdout

        for source, ocr_json in ocr_iter(input_):
            if source is None:
                continue

            barcode: Optional[str] = get_barcode_from_path(source)

            if barcode is None:
                # Reported on stderr so it never mixes with the JSON output.
                click.echo("cannot extract barcode from source "
                           "{}".format(source),
                           err=True)
                continue

            ocr_result: Optional[OCRResult] = OCRResult.from_json(ocr_json)

            if ocr_result is None:
                continue

            insights = extract_insights(ocr_result, insight_type)

            if insights:
                item = {
                    'insights': insights,
                    'barcode': barcode,
                    'type': insight_type,
                }

                if source:
                    item['source'] = source

                output_f.write(json.dumps(item) + '\n')
Esempio n. 7
0
def generate_from_ocr_archive(
    input_: Union[str, TextIO, pathlib.Path],
    insight_type: InsightType,
    keep_empty: bool = False,
) -> Iterable[ProductInsights]:
    """Yield a ``ProductInsights`` for each usable entry of an OCR archive.

    Entries yielded by ``ocr_iter`` are skipped when they have no source
    image, when no barcode can be extracted from the source path (this case
    is reported through ``click.echo`` with ``err=True``), or when the JSON
    cannot be turned into an OCR result. Entries whose extraction is empty
    are only yielded when ``keep_empty`` is true.
    """
    for image_path, raw_json in ocr_iter(input_):
        if image_path is None:
            continue

        code: Optional[str] = get_barcode_from_path(image_path)

        if code is None:
            message = "cannot extract barcode from source {}".format(image_path)
            click.echo(message, err=True)
            continue

        parsed: Optional[OCRResult] = OCRResult.from_json(raw_json)

        if parsed is None:
            continue

        found = extract_insights(parsed, insight_type)

        # Empty extractions are emitted only on explicit request.
        if keep_empty or found:
            yield ProductInsights(
                insights=found,
                barcode=code,
                type=insight_type,
                source_image=image_path,
            )