Example #1
0
def load_new_batch(fname_glob, batch):
    """Import the sheets matching ``fname_glob`` as shreds of ``batch``.

    The batch output directory is cleared and the clusters already stored
    for ``batch`` are deleted first. Every file listed by the storage backend
    is then fetched and handed to ``SheetIO``; each shred it yields is saved
    as a ``Shred`` document together with a one-member ``Cluster``. Finally
    the two compound indexes on ``Cluster`` are ensured.

    Args:
        fname_glob: Pattern handed to ``storage.list`` to pick source files.
        batch: Batch identifier. It names the output directory, prefixes
            every shred id and is stored on every created cluster.

    Raises:
        bson.errors.InvalidDocument: Re-raised after echoing the shred dict
            that could not be stored.
    """
    # Pick the storage backend from configuration.
    backend_cls = S3Storage if app.config["S3_ENABLED"] else LocalFSStorage
    storage = backend_cls(app.config)

    # Running totals of the import; nothing in the visible code reads them.
    pages_processed = 0
    shreds_created = 0

    # Start clean: drop earlier output files and clusters of this batch.
    out_dir = os.path.join(app.config["SPLIT_OUT_DIR"], "batch_%s" % batch)
    storage.clear(out_dir)
    Cluster.objects(batch=batch).delete()

    # TODO: Remove when all field names match unshred's.
    # Unshred-tag name: unshred name.
    renamed_fields = {"mask_fname": "features_fname"}
    # (model field, imported field) pairs whose values are image paths that
    # get pushed to storage.
    image_fields = [
        (target, renamed_fields.get(target, target))
        for target in ("piece_fname", "mask_fname", "piece_in_context_fname")
    ]
    unwanted_fields = ("simplified_contour", "img_roi")
    unwanted_features = (
        "on_sheet_height", "on_sheet_width", "on_sheet_angle",
        "bottommost", "topmost", "on_sheet_x", "on_sheet_y",
    )

    for src_key in storage.list(fname_glob):
        fname = storage.get_file(src_key)
        sheet_name, _ = os.path.splitext(os.path.basename(fname))

        echo("\n\nProcessing file %s from %s" % (fname, sheet_name))
        sheet = SheetIO(
            fname,
            sheet_name,
            [GeometryFeatures, ColourFeatures],
            out_dir,
            "png",
        )
        pages_processed += 1

        for raw_shred in sheet.get_shreds():
            record = raw_shred._asdict()
            record["id"] = "%s:%s_%s" % (
                batch, record["sheet"], record["name"])
            shreds_created += 1

            # Convert the opencv contour to a list of pairs.
            outline = record["simplified_contour"]
            record["contour"] = outline.reshape((len(outline), 2)).tolist()
            record["tags"] = record.pop("tags_suggestions")

            # Strip everything that must not reach the Shred document.
            for key in unwanted_fields:
                del record[key]
            for key in unwanted_features:
                del record["features"][key]

            # Push each image to storage and keep whatever put_file returns
            # under the model's field name.
            for target, source in image_fields:
                record[target] = storage.put_file(record.pop(source))

            cluster = {
                "id": record["id"],
                "users_count": 0,
                "batch": batch,
                "parents": [],
            }
            try:
                shred_obj = Shred.objects.create(**record)
                member = ClusterMember(
                    shred=shred_obj, position=[0, 0], angle=0)
                cluster["members"] = [member]
                Cluster.objects.create(**cluster)
            except bson.errors.InvalidDocument:
                # Show the offending payload before propagating the error.
                echo(record)
                raise

    for leading_field in ("users_processed", "users_skipped"):
        Cluster.ensure_index([leading_field, "users_count", "batch"])