def load_new_batch(fname_glob, batch):
    """Re-import every sheet matching ``fname_glob`` as batch ``batch``.

    The batch's output directory is cleared through the storage backend and
    all existing ``Cluster`` documents of the batch are deleted first. Each
    shred of each matching sheet is then saved as a ``Shred`` document
    together with a single-member ``Cluster`` that wraps it. Finally the
    cluster indexes are ensured.

    Args:
        fname_glob: Pattern handed to ``storage.list`` to find source sheets.
        batch: Batch identifier; used in the output directory name, as the
            prefix of shred/cluster ids and as the clusters' ``batch`` value.

    Raises:
        bson.errors.InvalidDocument: Re-raised, after echoing the offending
            shred dict, if creating the ``Shred`` or ``Cluster`` raises it.
    """
    storage_cls = S3Storage if app.config["S3_ENABLED"] else LocalFSStorage
    storage = storage_cls(app.config)

    # Running totals; nothing in this function reads them back.
    pages_processed = 0
    shreds_created = 0

    out_dir = os.path.join(app.config["SPLIT_OUT_DIR"], "batch_%s" % batch)
    storage.clear(out_dir)

    Cluster.objects(batch=batch).delete()

    # Everything below up to the loop is loop-invariant, so it is set up once
    # instead of once per sheet / per shred.
    image_path_fields = ["piece_fname", "mask_fname",
                         "piece_in_context_fname"]
    # TODO: Remove when all field names match unshred's.
    field_name_map = {
        # Unshred-tag name: unshred name.
        "mask_fname": "features_fname",
    }
    drop_fields = ['simplified_contour', 'img_roi']
    drop_features = ['on_sheet_height', 'on_sheet_width', 'on_sheet_angle',
                     'bottommost', 'topmost', 'on_sheet_x', 'on_sheet_y']
    feature_extractors = [GeometryFeatures, ColourFeatures]

    def _convert_opencv_contour(contour):
        """Converts opencv contour to a list of pairs."""
        return contour.reshape((len(contour), 2)).tolist()

    for src_key in storage.list(fname_glob):
        fname = storage.get_file(src_key)
        sheet_name = os.path.splitext(os.path.basename(fname))[0]

        echo("\n\nProcessing file %s from %s" % (fname, sheet_name))
        sheet = SheetIO(fname, sheet_name, feature_extractors, out_dir, "png")

        pages_processed += 1

        for raw_shred in sheet.get_shreds():
            shred = raw_shred._asdict()
            shred["id"] = "%s:%s_%s" % (batch, shred["sheet"], shred["name"])

            shreds_created += 1

            # The contour must be converted before 'simplified_contour' is
            # dropped together with the other import-only fields below.
            shred["contour"] = _convert_opencv_contour(
                shred["simplified_contour"])
            shred['tags'] = shred.pop('tags_suggestions')

            for field in drop_fields:
                del shred[field]

            features = shred['features']
            for field in drop_features:
                del features[field]

            # Upload each image and store the value returned by the storage
            # backend under the model's field name (which may differ from the
            # name the shred was imported with).
            for model_field_name in image_path_fields:
                import_field_name = field_name_map.get(model_field_name,
                                                       model_field_name)
                image_path = shred.pop(import_field_name)
                shred[model_field_name] = storage.put_file(image_path)

            cluster = {
                "id": shred["id"],
                "users_count": 0,
                "batch": batch,
                "parents": [],
            }

            try:
                shred_obj = Shred.objects.create(**shred)
                cluster["members"] = [
                    ClusterMember(shred=shred_obj, position=[0, 0], angle=0),
                ]
                Cluster.objects.create(**cluster)
            except bson.errors.InvalidDocument:
                # Show the dict that could not be encoded, then propagate.
                echo(shred)
                raise

    Cluster.ensure_index(["users_processed", "users_count", "batch"])
    Cluster.ensure_index(["users_skipped", "users_count", "batch"])