class Project(object):

    _storage = {}

    @classmethod
    def get_user_projects(cls, user, root):
        """Get all project names by user, this is used in multi-session mode"""
        return os.listdir(os.path.join(root, user))

    @classmethod
    def get_all_projects(cls, root):
        """Get all projects in the system, this is used in multi-session mode

        Returns {user: projects}
        """
        result = {}
        regex = r"........-....-....-....-............"  # user uuid filter
        for user in os.listdir(root):
            # keep only user dirs that match the uuid regex
            matches = re.search(regex, user)
            if matches:
                user_dir = os.path.join(root, user)
                result[user] = os.listdir(user_dir)
        return result

    @classmethod
    def get_user_by_project(cls, project_uuid, root):
        all_projects = cls.get_all_projects(root)
        for user in all_projects:
            if project_uuid in all_projects[user]:
                return user

    def __init__(self, config, name, root_dir=".", context=None):
        self.config = config
        self.name = name
        self.path = os.path.join(root_dir, self.name)
        self.ml_backends = []
        self.on_boarding = {}
        self.context = context or {}
        self.project_obj = None
        self.source_storage = None
        self.target_storage = None
        self.create_storages()

        (
            self.label_config_line,
            self.label_config_full,
            self.parsed_label_config,
            self.input_data_tags,
        ) = (None, None, None, None)  # noqa
        self.derived_input_schema, self.derived_output_schema = None, None

        self.load_label_config()
        self.load_project_and_ml_backends()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        self.converter = None
        self.load_converter()
        self.max_tasks_file_size = 250

    def get_storage(self, storage_for):
        if storage_for == "source":
            return self.source_storage
        elif storage_for == "target":
            return self.target_storage

    def get_available_storage_names(self, storage_for):
        if storage_for == "source":
            return self.get_available_source_storage_names()
        elif storage_for == "target":
            return self.get_available_target_storage_names()

    @classmethod
    def get_available_source_storages(cls):
        return ["tasks-json", "s3", "gcs"]

    @classmethod
    def get_available_target_storages(cls):
        return ["completions-dir", "s3-completions", "gcs-completions"]

    def get_available_source_storage_names(self):
        names = OrderedDict()
        nameset = set(self.get_available_source_storages())
        for name, desc in get_available_storage_names().items():
            # we don't expose configurable filesystem storage in UI to avoid security problems
            if name in nameset:
                names[name] = desc
        return names

    def get_available_target_storage_names(self):
        names = OrderedDict()
        nameset = set(self.get_available_target_storages())
        for name, desc in get_available_storage_names().items():
            # blob storages make no sense as targets
            if name in nameset:
                names[name] = desc
        return names

    def create_storages(self):
        source = self.config["source"]
        target = self.config["target"]
        self.source_storage = create_storage(source["type"], "source",
                                             source["path"], self.path, self,
                                             **source.get("params", {}))
        self.target_storage = create_storage(target["type"], "target",
                                             target["path"], self.path, self,
                                             **target.get("params", {}))
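    # Illustrative sketch only: create_storages() above expects "source" and
    # "target" entries of roughly this shape in the project config (the exact
    # values here are assumptions for demonstration, not from a real project):
    #
    #   "source": {"type": "tasks-json", "path": "tasks.json", "params": {}},
    #   "target": {"type": "completions-dir", "path": "completions", "params": {}}
    #
    # create_storage() is then called as
    # create_storage(type, name, path, self.path, self, **params),
    # where name is the literal "source" or "target".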
    def update_storage(self, storage_for, storage_kwargs):

        def _update_storage(storage_for, storage_kwargs):
            storage_name = storage_kwargs.pop("name", storage_for)
            storage_type = storage_kwargs.pop("type")
            storage_path = storage_kwargs.pop("path", None)
            # storage_path = self.config[storage_for]['path']
            storage = create_storage(storage_type, storage_name, storage_path,
                                     self.path, self, **storage_kwargs)
            self.config[storage_for] = {
                "name": storage_name,
                "type": storage_type,
                "path": storage_path,
                "params": storage_kwargs,
            }
            self._save_config()
            logger.debug('Created storage type "' + storage_type + '"')
            return storage

        if storage_for == "source":
            self.source_storage = _update_storage("source", storage_kwargs)
        elif storage_for == "target":
            self.target_storage = _update_storage("target", storage_kwargs)

        self.update_derived_input_schema()
        self.update_derived_output_schema()

    @property
    def can_manage_tasks(self):
        return self.config["source"]["type"] not in {
            "s3", "s3-completions", "gcs", "gcs-completions",
        }

    @property
    def can_manage_completions(self):
        return self.config["target"]["type"] not in {
            "s3", "s3-completions", "gcs", "gcs-completions",
        }

    @property
    def can_delete_tasks(self):
        return self.can_manage_tasks and self.can_manage_completions

    @property
    def data_types_json(self):
        return self.project_obj.data_types_json

    def load_label_config(self):
        self.label_config_full = config_comments_free(
            open(self.config["label_config"], encoding="utf8").read())
        self.label_config_line = config_line_stripped(self.label_config_full)
        self.parsed_label_config = parse_config(self.label_config_line)
        self.input_data_tags = self.get_input_data_tags(self.label_config_line)

    def update_derived_input_schema(self):
        self.derived_input_schema = set()
        for task_id, task in self.source_storage.items():
            data_keys = set(task["data"].keys())
            if not self.derived_input_schema:
                self.derived_input_schema = data_keys
            else:
                self.derived_input_schema &= data_keys
        logger.debug("Derived input schema: " + str(self.derived_input_schema))

    def update_derived_output_schema(self):
        self.derived_output_schema = {
            "from_name_to_name_type": set(),
            "labels": defaultdict(set),
        }
        # for all already completed tasks we update derived output schema for further label config validation
        for task_id, c in self.target_storage.items():
            for completion in c["completions"]:
                self._update_derived_output_schema(completion)
        logger.debug("Derived output schema: " + str(self.derived_output_schema))
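    # Illustrative sketch (hypothetical data, not from a real project): after
    # importing tasks like {"data": {"text": "hello", "meta": "x"}} and
    # {"data": {"text": "world"}}, derived_input_schema holds the intersection
    # of data keys, i.e. {"text"}. derived_output_schema collects what existing
    # completions produced, e.g.
    #   {"from_name_to_name_type": {("label", "text", "choices")},
    #    "labels": {"label": {"Positive", "Negative"}}}
    # Tag and label names here are assumptions for the example only.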
    def add_ml_backend(self, params, raise_on_error=True):
        ml_backend = MLBackend.from_params(params)
        if not ml_backend.connected and raise_on_error:
            raise ValueError('ML backend with URL: "' + str(params["url"]) +
                             '" is not connected.')
        self.ml_backends.append(ml_backend)

    def remove_ml_backend(self, name):
        # remove from memory
        remove_idx = next(
            (i for i, b in enumerate(self.ml_backends) if b.model_name == name),
            None)
        if remove_idx is None:
            raise KeyError("Can't remove ML backend with name \"" + name +
                           '": not found.')
        self.ml_backends.pop(remove_idx)

        # remove from config
        config_params = self.config.get("ml_backends", [])
        remove_idx = next(
            (i for i, b in enumerate(config_params) if b["name"] == name), None)
        if remove_idx is not None:
            config_params.pop(remove_idx)
        self.config["ml_backends"] = config_params
        self._save_config()

    def load_project_and_ml_backends(self):
        # configure project
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full,
        )

        # configure multiple machine learning backends
        self.ml_backends = []
        ml_backends_params = self.config.get("ml_backends", [])
        for ml_backend_params in ml_backends_params:
            self.add_ml_backend(ml_backend_params, raise_on_error=False)

    def load_converter(self):
        self.converter = Converter(self.parsed_label_config)

    @property
    def id(self):
        return self.project_obj.id

    @property
    def uuid(self):
        return os.path.basename(self.path)

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    @property
    def ml_backends_connected(self):
        return len(self.ml_backends) > 0

    @property
    def task_data_login(self):
        return self.project_obj.task_data_login

    @property
    def task_data_password(self):
        return self.project_obj.task_data_password

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        logger.debug("Validate label config")
        self.project_obj.validate_label_config(config_string)

        logger.debug("Get parsed config")
        parsed_config = parse_config(config_string)

        logger.debug("Validate label config on derived input schema")
        self.validate_label_config_on_derived_input_schema(parsed_config)

        logger.debug("Validate label config on derived output schema")
        self.validate_label_config_on_derived_output_schema(parsed_config)

    def _save_config(self):
        with io.open(self.config["config_path"], mode="w") as f:
            json.dump(self.config, f, indent=2)

    def update_params(self, params):
        if "ml_backend" in params:
            ml_backend_params = self._create_ml_backend_params(
                params["ml_backend"], self.name)
            self.add_ml_backend(ml_backend_params)
            self.config["ml_backends"].append(ml_backend_params)
            self._save_config()

    def update_label_config(self, new_label_config):
        label_config_file = self.config["label_config"]
        # save xml label config to file
        new_label_config = new_label_config.replace("\r\n", "\n")
        with io.open(label_config_file, mode="w", encoding="utf8") as f:
            f.write(new_label_config)

        # reload everything that depends on label config
        self.load_label_config()
        self.update_derived_output_schema()
        self.load_project_and_ml_backends()
        self.load_converter()

        # save project config state
        self.config["label_config_updated"] = True
        with io.open(self.config["config_path"], mode="w", encoding="utf8") as f:
            json.dump(self.config, f)
        logger.info(
            "Label config saved to: {path}".format(path=label_config_file))

    def _update_derived_output_schema(self, completion):
        """Update the derived output schema from a given completion.

        The output schema consists of unique (from_name, to_name, type) tuples
        and sets of unique labels derived from existing completions.

        :param completion:
        :return:
        """
        for result in completion["result"]:
            result_type = result.get("type")
            if result_type in ("relation", "rating", "pairwise"):
                continue
            if "from_name" not in result or "to_name" not in result:
                logger.error(
                    'Unexpected completion.result format: "from_name" or "to_name" not found in %r'
                    % result)
                continue
            self.derived_output_schema["from_name_to_name_type"].add(
                (result["from_name"], result["to_name"], result_type))
            for label in result["value"].get(result_type, []):
                self.derived_output_schema["labels"][result["from_name"]].add(
                    label)
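    # Illustrative example of one completion["result"] item consumed above
    # (field values are assumptions, not taken from a real project):
    #   {"from_name": "sentiment", "to_name": "text", "type": "choices",
    #    "value": {"choices": ["Positive"]}}
    # This would add ("sentiment", "text", "choices") to from_name_to_name_type
    # and "Positive" to derived_output_schema["labels"]["sentiment"].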
    def validate_label_config_on_derived_input_schema(
            self, config_string_or_parsed_config):
        """Validate the label config against the input schema (task types and
        data keys) derived from imported tasks.

        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if the config matches the already imported tasks
        """
        # check if schema exists, i.e. at least one task has been uploaded
        if not self.derived_input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter("inputs"), config.values()):
            for input_item in input_items:
                input_types.add(input_item["type"])
                input_values.add(input_item["value"])

        # check input data values: they must be in schema
        for item in input_values:
            if item not in self.derived_input_schema:
                raise ValidationError(
                    "You have already imported tasks and they are incompatible with a new config. "
                    "You've specified value=${item}, but imported tasks contain only keys: {input_schema_values}"
                    .format(item=item,
                            input_schema_values=list(
                                self.derived_input_schema)))

    def validate_label_config_on_derived_output_schema(
            self, config_string_or_parsed_config):
        """Validate the label config against the output schema (from_names,
        to_names and labeling types) derived from completions.

        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if the config matches the already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema["from_name_to_name_type"]:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add(
                (from_name, to["to_name"][0], to["type"].lower()))
        for from_name, to_name, type in output_schema["from_name_to_name_type"]:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    "name={from_name}, toName={to_name}, type={type} are expected"
                    .format(from_name=from_name, to_name=to_name, type=type))
        for from_name, expected_label_set in output_schema["labels"].items():
            if from_name not in config:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    "name=" + from_name + " is expected")
            found_labels = set(config[from_name]["labels"])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels))

    def no_tasks(self):
        return self.source_storage.empty()

    def delete_tasks(self):
        """Delete all tasks & completions from the filesystem, then reload the
        clean project.

        :return:
        """
        self.source_storage.remove_all()
        self.target_storage.remove_all()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        # delete everything on ML backend
        if self.ml_backends_connected:
            for m in self.ml_backends:
                m.clear(self)
    def next_task(self, completed_tasks_ids):
        completed_tasks_ids = set(completed_tasks_ids)
        sampling = self.config.get("sampling", "sequential")

        # Tasks are ordered ascending by their "id" fields. This is the default mode.
        task_iter = filter(lambda i: i not in completed_tasks_ids,
                           sorted(self.source_storage.ids()))
        if sampling == "sequential":
            task_id = next(task_iter, None)
            if task_id is not None:
                return self.source_storage.get(task_id)

        # Tasks are sampled with equal probabilities
        elif sampling == "uniform":
            actual_tasks_ids = list(task_iter)
            if not actual_tasks_ids:
                return None
            random.shuffle(actual_tasks_ids)
            return self.source_storage.get(actual_tasks_ids[0])

        # Task with minimum / maximum average prediction score is taken
        elif sampling.startswith("prediction-score"):
            id_score_map = {}
            for task_id, task in self.source_storage.items():
                if task_id in completed_tasks_ids:
                    continue
                if "predictions" in task and len(task["predictions"]) > 0:
                    score = sum((p["score"] for p in task["predictions"]),
                                0) / len(task["predictions"])
                    id_score_map[task_id] = score
            if not id_score_map:
                return None
            if sampling.endswith("-min"):
                best_idx = min(id_score_map, key=id_score_map.get)
            elif sampling.endswith("-max"):
                best_idx = max(id_score_map, key=id_score_map.get)
            else:
                raise NotImplementedError("Unknown sampling method " + sampling)
            return self.source_storage.get(best_idx)

        else:
            raise NotImplementedError("Unknown sampling method " + sampling)
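    # Sampling is controlled by the project config; hypothetical examples:
    #   {"sampling": "sequential"}            -> lowest not-yet-completed task id
    #   {"sampling": "uniform"}               -> random not-yet-completed task
    #   {"sampling": "prediction-score-min"}  -> task with the lowest average
    #                                            prediction score (active learning)
    # Note that the "prediction-score-*" modes only consider tasks that already
    # carry predictions.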
    def remove_task(self, task_id):
        self.source_storage.remove(task_id)
        self.delete_task_completions(task_id)
        self.update_derived_input_schema()
        self.update_derived_output_schema()

    def get_completions_ids(self):
        """List completion ids from the output_dir directory.

        :return: filenames without extensions and directories
        """
        task_ids = set(self.source_storage.ids())
        completion_ids = set(self.target_storage.ids())
        completions = completion_ids.intersection(task_ids)
        # completions = list(self.target_storage.ids())
        logger.debug("{num} completions found in {output_dir}".format(
            num=len(completions), output_dir=self.config["output_dir"]))
        return sorted(completions)

    def get_completed_at(self):
        """Get the completion time for tasks.

        :return: dict of task id -> formatted datetime string
        """
        times = {}
        for _, data in self.target_storage.items():
            id = data["id"]
            try:
                times[id] = max(data["completions"],
                                key=itemgetter("created_at"))["created_at"]
            except Exception as exc:
                times[id] = "undefined"
        return times

    def get_cancelled_status(self):
        """Get the was_cancelled (skipped) status for tasks: returns the number
        of cancelled completions per task.

        :return: dict of task id -> int
        """
        items = {}
        for _, data in self.target_storage.items():
            id = data["id"]
            try:
                # note: the "skipped" flag will be deprecated
                flag = sum([
                    completion.get("skipped", False) or
                    completion.get("was_cancelled", False)
                    for completion in data["completions"]
                ])
            except Exception as exc:
                items[id] = -1
            else:
                items[id] = flag
        return items

    def get_task_with_completions(self, task_id):
        """Get a task with its completions.

        :param task_id: task id
        :return: json dict with completions
        """
        data = self.target_storage.get(task_id)
        logger.debug("Get task " + str(task_id) + " from target storage")

        if data:
            logger.debug("Get predictions " + str(task_id) +
                         " from source storage")
            # tasks can hold the newest version of predictions, so take it from tasks
            data["predictions"] = self.source_storage.get(task_id).get(
                "predictions", [])
        return data

    def save_completion(self, task_id, completion):
        """Save a completion.

        :param task_id: task id
        :param completion: json data from the label editor
        """
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if no completions exist for it yet
        if not task:
            task = deepcopy(self.source_storage.get(task_id))
            task["completions"] = []
        else:
            task = deepcopy(task)

        # remove possible stored predictions
        task.pop("predictions", None)

        # update old completion
        updated = False
        if "id" in completion:
            for i, item in enumerate(task["completions"]):
                if item["id"] == completion["id"]:
                    task["completions"][i].update(completion)
                    updated = True

        # write new completion
        if not updated:
            completion["id"] = task["id"] * 1000 + len(task["completions"]) + 1
            task["completions"].append(completion)

        try:
            self._update_derived_output_schema(completion)
        except Exception as exc:
            logger.error(exc, exc_info=True)
            logger.debug(json.dumps(completion, indent=2))

        # save completion time
        completion["created_at"] = timestamp_now()

        # write task + completions to file
        self.target_storage.set(task_id, task)
        logger.debug("Completion for task " + str(task_id) + " saved with id=" +
                     str(completion["id"]))
        return completion["id"]

    def delete_task_completion(self, task_id, completion_id):
        """Delete one task completion by id"""
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)
        if not task:
            return False
        else:
            task = deepcopy(task)

        # remove completion from task
        for i, item in enumerate(task["completions"]):
            if item["id"] == completion_id:
                del task["completions"][i]
                break
        self.update_derived_output_schema()

        # write task + completions to file
        self.target_storage.set(task_id, task)
        logger.debug("Completion " + str(completion_id) + " removed")
        return True

    def delete_task_completions(self, task_id):
        """Delete all completions for a task"""
        self.target_storage.remove(task_id)
        self.update_derived_output_schema()

    def delete_all_completions(self):
        """Delete all completions from the project"""
        self.target_storage.remove_all()
        self.update_derived_output_schema()

    def make_predictions(self, task):
        task = deepcopy(task)
        stored_predictions = task.get("predictions")
        task["predictions"] = []
        try:
            for ml_backend in self.ml_backends:
                if not ml_backend.connected:
                    continue
                predictions = ml_backend.make_predictions(task, self)
                predictions["created_by"] = ml_backend.model_name
                predictions["created_date"] = datetime.now().isoformat()
                task["predictions"].append(predictions)
        except Exception as exc:
            logger.debug(exc, exc_info=True)
        if not task["predictions"] and stored_predictions:
            task["predictions"] = stored_predictions
        return task

    def train(self):
        completions = []
        for _, c in self.target_storage.items():
            completions.append(c)
        train_status = False
        if self.ml_backends_connected:
            for ml_backend in self.ml_backends:
                if ml_backend.connected:
                    ml_backend.train(completions, self)
                    train_status = True
        return train_status
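    # Note on completion ids in save_completion() above: a new completion id is
    # derived from the task id, e.g. (hypothetical) task 7 with one existing
    # completion gets 7 * 1000 + 1 + 1 = 7002, so up to roughly a thousand
    # completions per task keep their ids unique.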
    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter
            if tag.attrib.get("name") and tag.attrib.get("value", "").startswith("$")
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        with io.open(label_config_file, encoding="utf8") as f:
            label_config = f.read()

        task_loader = Tasks()
        if args.input_format == "json":
            return task_loader.from_json_file(input_path)
        if args.input_format == "json-dir":
            return task_loader.from_dir_with_json_files(input_path)

        input_data_tags = cls.get_input_data_tags(label_config)
        if len(input_data_tags) > 1:
            val = ",".join(tag.attrib.get("name") for tag in input_data_tags)
            print("Warning! Multiple input data tags found: " + val +
                  ". Only the first one is used.")
        elif len(input_data_tags) == 0:
            raise ValueError(
                'You\'ve specified input format "{fmt}" which requires the label config to be explicitly defined. '
                "Please specify --label-config=path/to/config.xml or use --format=json or --format=json-dir"
                .format(fmt=args.input_format))
        input_data_tag = input_data_tags[0]
        data_key = input_data_tag.attrib.get("value").lstrip("$")

        if args.input_format == "text":
            return task_loader.from_text_file(input_path, data_key)
        if args.input_format == "text-dir":
            return task_loader.from_dir_with_text_files(input_path, data_key)
        if args.input_format == "image-dir":
            return task_loader.from_dir_with_image_files(input_path, data_key)
        if args.input_format == "audio-dir":
            return task_loader.from_dir_with_audio_files(input_path, data_key)
        raise RuntimeError("Can't load tasks for input format={}".format(
            args.input_format))

    @classmethod
    def _create_ml_backend_params(cls, url, project_name=None):
        if "=http" in url:
            name, url = url.split("=", 1)
        else:
            project_name = os.path.basename(project_name or "")
            name = project_name + str(uuid4())[:4]
        if not is_url(url):
            raise ValueError('Specified string "' + url +
                             "\" doesn't look like a URL.")
        return {"url": url, "name": name}
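    # _create_ml_backend_params() accepts either a bare URL or a "name=URL"
    # pair. Hypothetical examples (values are illustrative only):
    #   "http://localhost:9090"          -> name generated as <project> + 4 uuid chars
    #   "my_model=http://localhost:9090" -> name "my_model", url "http://localhost:9090"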
    @classmethod
    def create_project_dir(cls, project_name, args):
        """Create a project directory in args.root_dir/project_name and
        initialize all required files there.

        If some files are missing, restore them from defaults.
        If config files are specified by args, copy them into the project directory.

        :param project_name:
        :param args:
        :return:
        """
        dir = cls.get_project_dir(project_name, args)
        if args.force:
            delete_dir_content(dir)
        os.makedirs(dir, exist_ok=True)

        config = (json_load(args.config_path) if args.config_path else
                  json_load(find_file("default_config.json")))

        def already_exists_error(what, path):
            raise RuntimeError(
                '{path} {what} already exists. Use "--force" option to recreate it.'
                .format(path=path, what=what))

        input_path = args.input_path or config.get("input_path")

        # save label config
        config_xml = "config.xml"
        config_xml_path = os.path.join(dir, config_xml)
        label_config_file = args.label_config or config.get("label_config")
        if label_config_file:
            copy2(label_config_file, config_xml_path)
            print(label_config_file + " label config copied to " + config_xml_path)
        else:
            if os.path.exists(config_xml_path) and not args.force:
                already_exists_error("label config", config_xml_path)
            if not input_path:
                # create default config with polygons only if input data is not set
                default_label_config = find_file(
                    "examples/image_polygons/config.xml")
                copy2(default_label_config, config_xml_path)
                print(default_label_config + " label config copied to " +
                      config_xml_path)
            else:
                with io.open(config_xml_path, mode="w") as fout:
                    fout.write("<View></View>")
                print("Empty config has been created in " + config_xml_path)

        config["label_config"] = config_xml

        if args.source:
            config["source"] = {
                "type": args.source,
                "path": args.source_path,
                "params": args.source_params,
            }
        else:
            # save tasks.json
            tasks_json = "tasks.json"
            tasks_json_path = os.path.join(dir, tasks_json)
            if input_path:
                tasks = cls._load_tasks(input_path, args, config_xml_path)
            else:
                tasks = {}
            with io.open(tasks_json_path, mode="w") as fout:
                json.dump(tasks, fout, indent=2)
            config["input_path"] = tasks_json
            config["source"] = {
                "name": "Tasks",
                "type": "tasks-json",
                "path": os.path.abspath(tasks_json_path),
            }
            logger.debug(
                "{tasks_json_path} input file with {n} tasks has been created from {input_path}"
                .format(tasks_json_path=tasks_json_path,
                        n=len(tasks),
                        input_path=input_path))

        if args.target:
            config["target"] = {
                "type": args.target,
                "path": args.target_path,
                "params": args.target_params,
            }
        else:
            completions_dir = os.path.join(dir, "completions")
            if os.path.exists(completions_dir) and not args.force:
                already_exists_error("output dir", completions_dir)
            if os.path.exists(completions_dir):
                delete_dir_content(completions_dir)
                print(completions_dir + " output dir already exists. Clearing it.")
            else:
                os.makedirs(completions_dir, exist_ok=True)
                print(completions_dir + " output dir has been created.")
            config["output_dir"] = "completions"
            config["target"] = {
                "name": "Completions",
                "type": "completions-dir",
                "path": os.path.abspath(completions_dir),
            }

        if "ml_backends" not in config or not isinstance(
                config["ml_backends"], list):
            config["ml_backends"] = []
        if args.ml_backends:
            for url in args.ml_backends:
                config["ml_backends"].append(
                    cls._create_ml_backend_params(url, project_name))

        if args.sampling:
            config["sampling"] = args.sampling
        if args.port:
            config["port"] = args.port
        if args.host:
            config["host"] = args.host
        if args.allow_serving_local_files:
            config["allow_serving_local_files"] = True
        if args.key_file and args.cert_file:
            config["protocol"] = "https://"
            config["cert"] = args.cert_file
            config["key"] = args.key_file
        if (hasattr(args, "web_gui_project_desc") and
                args.web_gui_project_desc) or args.project_desc:
            config["description"] = args.web_gui_project_desc or args.project_desc

        # create config.json
        config_json = "config.json"
        config_json_path = os.path.join(dir, config_json)
        if os.path.exists(config_json_path) and not args.force:
            already_exists_error("config", config_json_path)
        with io.open(config_json_path, mode="w") as f:
            json.dump(config, f, indent=2)

        print("")
        print("Label Studio has been successfully initialized. "
              "Check project states in " + dir)
        print("Start the server: label-studio start " + dir)
        return dir
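    # A project directory created above roughly ends up looking like this
    # (sketch, assuming local tasks and no cloud storage flags):
    #   <root_dir>/<project_name>/
    #       config.xml      # label config
    #       tasks.json      # source storage ("tasks-json")
    #       completions/    # target storage ("completions-dir")
    #       config.json     # the merged project config written at the end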
    @classmethod
    def get_config(cls, project_name, args):
        return cls._get_config(cls.get_project_dir(project_name, args))

    @classmethod
    def _get_config(cls, project_dir, args=None):
        """Get the project config from the args Namespace produced by argparse.

        :param args:
        :return:
        """
        # check if project directory exists
        if not os.path.exists(project_dir):
            project_name = args.project_name if args is not None else "<project_name>"
            raise FileNotFoundError(
                "Couldn't find directory " + project_dir +
                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' +
                project_name + " --init")

        # check config.json exists in directory
        config_path = os.path.join(project_dir, "config.json")
        if not os.path.exists(config_path):
            project_name = args.project_name if args is not None else "<project_name>"
            raise FileNotFoundError(
                "Couldn't find config file " + config_path +
                " in project directory " + project_dir +
                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' +
                project_name + " --init")

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as c:
            config = json.load(c)
        config["config_path"] = config_path
        if config.get("input_path"):
            config["input_path"] = os.path.join(os.path.dirname(config_path),
                                                config["input_path"])
        config["label_config"] = os.path.join(os.path.dirname(config_path),
                                              config["label_config"])
        if config.get("output_dir"):
            config["output_dir"] = os.path.join(os.path.dirname(config_path),
                                                config["output_dir"])
        if not config.get("source"):
            config["source"] = {
                "name": "Tasks",
                "type": "tasks-json",
                "path": os.path.abspath(config["input_path"]),
            }
        if not config.get("target"):
            config["target"] = {
                "name": "Completions",
                "type": "completions-dir",
                "path": os.path.abspath(config["output_dir"]),
            }
        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        config = cls._get_config(project_dir, args)
        return cls(config, project_name, context=context, root_dir=args.root_dir)

    @classmethod
    def get(cls, project_name, args, context):
        # If the project is stored in memory, just return it
        if project_name in cls._storage:
            return cls._storage[project_name]

        # If the project directory exists, load the project from it and update the in-memory storage
        project_dir = cls.get_project_dir(project_name, args)
        if os.path.exists(project_dir):
            project = cls._load_from_dir(project_dir, project_name, args, context)
            cls._storage[project_name] = project
            return project

        raise ProjectNotFound("Project {p} doesn't exist".format(p=project_name))

    @classmethod
    def create(cls, project_name, args, context):
        # "create" differs from "get" in that it can create a new directory with project resources
        project_dir = cls.create_project_dir(project_name, args)
        project = cls._load_from_dir(project_dir, project_name, args, context)
        cls._storage[project_name] = project
        return project

    @classmethod
    def get_or_create(cls, project_name, args, context):
        try:
            project = cls.get(project_name, args, context)
            logger.info('Get project "' + project_name + '".')
        except ProjectNotFound:
            project = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
        return project

    def update_on_boarding_state(self):
        self.on_boarding["setup"] = self.config.get("label_config_updated", False)
        self.on_boarding["import"] = not self.no_tasks()
        self.on_boarding["labeled"] = not self.target_storage.empty()
        return self.on_boarding
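    # Typical lifecycle (sketch; "args" stands for the argparse Namespace used
    # by the CLI and is assumed to carry root_dir, input_path, label_config, etc.):
    #   project = Project.get_or_create("my_project", args, context={})
    #   task = project.next_task(completed_tasks_ids=[])
    #   project.save_completion(task["id"], completion)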
    @property
    def generate_sample_task_escape(self):
        return self.project_obj.generate_sample_task_escape

    @property
    def supported_formats(self):
        return self.project_obj.supported_formats

    def serialize(self):
        """Serialize project to json dict"""
        ban_list = ("json", "dir-jsons")
        available_storages = list(
            filter(lambda i: i[0] not in ban_list,
                   get_available_storage_names().items()))
        output = {
            "project_name": self.name,
            "task_count": len(self.source_storage.ids()),
            "completion_count": len(self.get_completions_ids()),
            "config": self.config,
            "instruction": self.config["instruction"],
            "can_manage_tasks": self.can_manage_tasks,
            "can_manage_completions": self.can_manage_completions,
            "can_delete_tasks": self.can_delete_tasks,
            "target_storage": {
                "readable_path": self.target_storage.readable_path
            },
            "source_storage": {
                "readable_path": self.source_storage.readable_path
            },
            "available_storages": available_storages,
            "source_syncing": self.source_storage.is_syncing,
            "target_syncing": self.target_storage.is_syncing,
            "data_types": self.data_types,
            "label_config_line": self.label_config_line,
        }
        return output
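    # serialize() returns a plain dict consumed by the web UI, roughly of the
    # form (values are illustrative only):
    #   {"project_name": "my_project", "task_count": 10, "completion_count": 2,
    #    "config": {...}, "can_manage_tasks": True,
    #    "source_storage": {"readable_path": "tasks.json"},
    #    "target_storage": {"readable_path": "completions"}, ...}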
class Project(object): _storage = {} _allowed_extensions = { 'Text': ('.txt', ), 'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'), 'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac') } def __init__(self, config, name, context=None): self.config = config self.name = name self.on_boarding = {} self.context = context or {} self.tasks = None self.label_config_line, self.label_config_full, self.input_data_tags = None, None, None self.derived_input_schema, self.derived_output_schema = None, None self.load_tasks() self.load_label_config() self.load_derived_schemas() self.analytics = None self.load_analytics() self.project_obj, self.ml_backend = None, None self.load_project_ml_backend() self.converter = None self.load_converter() def load_tasks(self): self.tasks = {} self.derived_input_schema = set() tasks = json_load(self.config['input_path']) if len(tasks) == 0: logger.warning('No tasks loaded from ' + self.config['input_path']) return for task_id, task in tasks.items(): self.tasks[int(task_id)] = task data_keys = set(task['data'].keys()) if not self.derived_input_schema: self.derived_input_schema = data_keys else: self.derived_input_schema &= data_keys print( str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path']) def load_label_config(self): self.label_config_full = config_comments_free( open(self.config['label_config']).read()) self.label_config_line = config_line_stripped(self.label_config_full) self.input_data_tags = self.get_input_data_tags(self.label_config_line) def load_derived_schemas(self): self.derived_output_schema = { 'from_name_to_name_type': set(), 'labels': defaultdict(set) } # for all already completed tasks we update derived output schema for further label config validation for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: completions = task_with_completions['completions'] for completion in completions: self._update_derived_output_schema(completion) def load_analytics(self): collect_analytics = os.getenv('collect_analytics') if collect_analytics is None: collect_analytics = self.config.get('collect_analytics', True) collect_analytics = bool(collect_analytics) self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context) def load_project_ml_backend(self): # configure project self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full) # configure machine learning backend ml_backend_params = self.config.get('ml_backend') if ml_backend_params: self.ml_backend = MLBackend.from_params(ml_backend_params) self.project_obj.connect(self.ml_backend) def load_converter(self): self.converter = Converter(self.label_config_full) @property def id(self): return self.project_obj.id @property def data_types(self): return self.project_obj.data_types @property def label_config(self): return self.project_obj.label_config def extract_data_types(self, config): return self.project_obj.extract_data_types(config) def validate_label_config(self, config_string): self.project_obj.validate_label_config(config_string) parsed_config = parse_config(config_string) self.validate_label_config_on_derived_input_schema(parsed_config) self.validate_label_config_on_derived_output_schema(parsed_config) def update_label_config(self, new_label_config): label_config_file = self.config['label_config'] # save xml label config to file with io.open(label_config_file, mode='w') as f: 
f.write(new_label_config) # reload everything that depends on label config self.load_label_config() self.load_derived_schemas() self.load_analytics() self.load_project_ml_backend() self.load_converter() # save project config state self.config['label_config_updated'] = True with io.open(self.config['config_path'], mode='w') as f: json.dump(self.config, f) logger.info( 'Label config saved to: {path}'.format(path=label_config_file)) @classmethod def _get_single_input_value(cls, input_data_tags): if len(input_data_tags) > 1: val = ",".join(tag.attrib.get("name") for tag in input_data_tags) print('Warning! Multiple input data tags found: ' + val + '. Only first one is used.') input_data_tag = input_data_tags[0] data_key = input_data_tag.attrib.get('value').lstrip('$') return data_key def _update_derived_output_schema(self, completion): """ Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type) and list of unique labels derived from existed completions :param completion: :return: """ for result in completion['result']: result_type = result.get('type') if result_type == 'relation': continue if 'from_name' not in result or 'to_name' not in result: logger.error( 'Unexpected completion.result format: "from_name" or "to_name" not found in %r' % result) continue self.derived_output_schema['from_name_to_name_type'].add( (result['from_name'], result['to_name'], result_type)) for label in result['value'].get(result_type, []): self.derived_output_schema['labels'][result['from_name']].add( label) def validate_label_config_on_derived_input_schema( self, config_string_or_parsed_config): """ Validate label config on input schemas (tasks types and data keys) derived from imported tasks :param config_string_or_parsed_config: label config string or parsed config object :return: True if config match already imported tasks """ # check if schema exists, i.e. at least one task has been uploaded if not self.derived_input_schema: return config = config_string_or_parsed_config if isinstance(config, str): config = parse_config(config) input_types, input_values = set(), set() for input_items in map(itemgetter('inputs'), config.values()): for input_item in input_items: input_types.add(input_item['type']) input_values.add(input_item['value']) # check input data values: they must be in schema for item in input_values: if item not in self.derived_input_schema: raise ValidationError( 'You have already imported tasks and they are incompatible with a new config. ' 'You\'ve specified value=${item}, but imported tasks contain only keys: {input_schema_values}' .format(item=item, input_schema_values=list( self.derived_input_schema))) def validate_label_config_on_derived_output_schema( self, config_string_or_parsed_config): """ Validate label config on output schema (from_names, to_names and labeling types) derived from completions :param config_string_or_parsed_config: label config string or parsed config object :return: True if config match already created completions """ output_schema = self.derived_output_schema # check if schema exists, i.e. 
at least one completion has been created if not output_schema['from_name_to_name_type']: return config = config_string_or_parsed_config if isinstance(config, str): config = parse_config(config) completion_tuples = set() for from_name, to in config.items(): completion_tuples.add( (from_name, to['to_name'][0], to['type'].lower())) for from_name, to_name, type in output_schema[ 'from_name_to_name_type']: if (from_name, to_name, type) not in completion_tuples: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'name={from_name}, toName={to_name}, type={type} are expected' .format(from_name=from_name, to_name=to_name, type=type)) for from_name, expected_label_set in output_schema['labels'].items(): if from_name not in config: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'name=' + from_name + ' is expected') found_labels = set(config[from_name]['labels']) extra_labels = list(expected_label_set - found_labels) if extra_labels: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'there are labels already created for "{from_name}":\n{extra_labels}' .format(from_name=from_name, extra_labels=extra_labels)) def get_tasks(self): """ Load tasks from JSON files in input_path directory :return: file list """ return self.tasks def delete_tasks(self): """ Deletes all tasks & completions from filesystem, then reloads clean project :return: """ delete_dir_content(self.config['output_dir']) if os.path.exists(self.config['input_path']) and os.path.isfile( self.config['input_path']): with io.open(self.config['input_path'], mode='w') as f: json.dump({}, f) # delete everything on ML backend if self.ml_backend: self.ml_backend.clear(self) # reload everything related to tasks self.load_tasks() self.load_derived_schemas() def next_task(self, completed_tasks_ids): completed_tasks_ids = set(completed_tasks_ids) sampling = self.config.get('sampling', 'sequential') if sampling == 'sequential': actual_tasks = (self.tasks[task_id] for task_id in self.tasks if task_id not in completed_tasks_ids) return next(actual_tasks, None) elif sampling == 'uniform': actual_tasks_ids = [ task_id for task_id in self.tasks if task_id not in completed_tasks_ids ] if not actual_tasks_ids: return None random.shuffle(actual_tasks_ids) return self.tasks[actual_tasks_ids[0]] else: raise NotImplementedError('Unknown sampling method ' + sampling) def get_task_ids(self): """ Get task ids only :return: list of task ids """ return list(self.tasks.keys()) def get_task(self, task_id): """ Get one task :param task_id: :return: task """ try: task_id = int(task_id) except ValueError: return None return self.tasks.get(task_id) def get_completions_ids(self): """ List completion ids from output_dir directory :return: filenames without extensions and directories """ root_dir = self.config['output_dir'] os.mkdir(root_dir) if not os.path.exists(root_dir) else () files = os.listdir(root_dir) completions = [ int(os.path.splitext(f)[0]) for f in files if f.endswith('.json') ] logger.debug('{num} completions found in {output_dir}'.format( num=len(completions), output_dir=self.config["output_dir"])) return sorted(completions) def get_completed_at(self, task_ids): """ Get completed time for list of task ids :param task_ids: list of task ids :return: list of string with formatted datetime """ root_dir = self.config['output_dir'] existing_completions = 
            set(self.get_completions_ids())
        ids = existing_completions.intersection(task_ids)
        times = {i: os.path.getmtime(os.path.join(root_dir, str(i) + '.json')) for i in ids}
        times = {i: datetime.fromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S') for i, t in times.items()}
        return times

    def get_task_with_completions(self, task_id):
        """ Get task with completions

        :param task_id: task id
        :return: json dict with completion
        """
        try:
            task_id = int(task_id)  # check task_id is int (disallow escaping from output_dir)
        except ValueError:
            return None
        if 'completions' in self.tasks[task_id]:
            return self.tasks[task_id]
        filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')
        if os.path.exists(filename):
            data = json.load(open(filename))
            # tasks can hold the newest version of predictions, so take it from tasks
            data['predictions'] = self.tasks[task_id].get('predictions', [])
        else:
            data = None
        return data

    def save_completion(self, task_id, completion):
        """ Save completion

        :param task_id: task id
        :param completion: json data from label (editor)
        """
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = self.get_task(task_id)
            task['completions'] = []

        # update old completion
        updated = False
        if 'id' in completion:
            for i, item in enumerate(task['completions']):
                if item['id'] == completion['id']:
                    task['completions'][i].update(completion)
                    updated = True

        # write new completion
        if not updated:
            completion['id'] = task['id'] * 1000 + len(task['completions']) + 1
            task['completions'].append(completion)

        self._update_derived_output_schema(completion)

        # write task + completions to file
        filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')
        os.mkdir(self.config['output_dir']) if not os.path.exists(self.config['output_dir']) else ()
        json.dump(task, open(filename, 'w'), indent=4, sort_keys=True)
        return completion['id']

    def delete_completion(self, task_id):
        """ Delete completion from disk

        :param task_id: task id
        """
        filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')
        os.remove(filename)

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter
            if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$')
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        with io.open(label_config_file) as f:
            label_config = f.read()
        task_loader = Tasks()
        if args.input_format == 'json':
            return task_loader.from_json_file(input_path)
        if args.input_format == 'json-dir':
            return task_loader.from_dir_with_json_files(input_path)
        input_data_tags = cls.get_input_data_tags(label_config)
        data_key = Project._get_single_input_value(input_data_tags)
        if args.input_format == 'text':
            return task_loader.from_text_file(input_path, data_key)
        if args.input_format == 'text-dir':
            return task_loader.from_dir_with_text_files(input_path, data_key)
        if args.input_format == 'image-dir':
            return task_loader.from_dir_with_image_files(input_path, data_key)
        if args.input_format == 'audio-dir':
            return task_loader.from_dir_with_audio_files(input_path, data_key)
        raise RuntimeError('Can\'t load tasks for input format={}'.format(args.input_format))

    @classmethod
    def create_project_dir(cls, project_name, args):
        """ Create project directory in args.root_dir/project_name, and initialize there all required
files If some files are missed, restore them from defaults. If config files are specified by args, copy them in project directory :param project_name: :param args: :return: """ dir = cls.get_project_dir(project_name, args) os.makedirs(dir, exist_ok=True) config = json_load( args.config_path) if args.config_path else json_load( find_file('default_config.json')) def already_exists_error(what, path): raise RuntimeError( '{path} {what} already exists. Use "--force" option to recreate it.' .format(path=path, what=what)) input_path = args.input_path or config.get('input_path') # save label config config_xml = 'config.xml' config_xml_path = os.path.join(dir, config_xml) label_config_file = args.label_config or config.get('label_config') if label_config_file: copy2(label_config_file, config_xml_path) print(label_config_file + ' label config copied to ' + config_xml_path) else: if os.path.exists(config_xml_path) and not args.force: already_exists_error('label config', config_xml_path) if not input_path: # create default config with polygons only if input data is not set default_label_config = find_file( 'examples/image_polygons/config.xml') copy2(default_label_config, config_xml_path) print(default_label_config + ' label config copied to ' + config_xml_path) else: with io.open(config_xml_path, mode='w') as fout: fout.write('<View></View>') print('Empty config has been created in ' + config_xml_path) config['label_config'] = config_xml # save tasks.json tasks_json = 'tasks.json' tasks_json_path = os.path.join(dir, tasks_json) if input_path: tasks = cls._load_tasks(input_path, args, config_xml_path) with io.open(tasks_json_path, mode='w') as fout: json.dump(tasks, fout, indent=2) print(tasks_json_path + ' input path has been created from ' + input_path) else: if os.path.exists(tasks_json_path) and not args.force: already_exists_error('input path', tasks_json_path) with io.open(tasks_json_path, mode='w') as fout: json.dump({}, fout) print(tasks_json_path + ' input path has been created with empty tasks.') config['input_path'] = tasks_json # create completions dir completions_dir = os.path.join(dir, 'completions') if os.path.exists(completions_dir) and not args.force: already_exists_error('output dir', completions_dir) if os.path.exists(completions_dir): delete_dir_content(completions_dir) print(completions_dir + ' output dir already exists. Clear it.') else: os.makedirs(completions_dir, exist_ok=True) print(completions_dir + ' output dir has been created.') config['output_dir'] = 'completions' if args.ml_backend_url: if 'ml_backend' not in config or not isinstance( config['ml_backend'], dict): config['ml_backend'] = {} config['ml_backend']['url'] = args.ml_backend_url if args.ml_backend_name: config['ml_backend']['name'] = args.ml_backend_name else: config['ml_backend']['name'] = str(uuid4()) # create config.json config_json = 'config.json' config_json_path = os.path.join(dir, config_json) if os.path.exists(config_json_path) and not args.force: already_exists_error('config', config_json_path) with io.open(config_json_path, mode='w') as f: json.dump(config, f, indent=2) print('') print( 'Label Studio has been successfully initialized. 
Check project states in ' + dir) print('Start the server: label-studio start ' + dir) return dir @classmethod def _get_config(cls, project_dir, args): """ Get config from input args Namespace acquired by Argparser :param args: :return: """ # check if project directory exists if not os.path.exists(project_dir): raise FileNotFoundError( 'Couldn\'t find directory ' + project_dir + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init') # check config.json exists in directory config_path = os.path.join(project_dir, 'config.json') if not os.path.exists(config_path): raise FileNotFoundError( 'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + args.project_name + ' --init') config_path = os.path.abspath(config_path) with io.open(config_path) as c: config = json.load(c) config['config_path'] = config_path config['input_path'] = os.path.join(os.path.dirname(config_path), config['input_path']) config['label_config'] = os.path.join(os.path.dirname(config_path), config['label_config']) config['output_dir'] = os.path.join(os.path.dirname(config_path), config['output_dir']) return config @classmethod def _load_from_dir(cls, project_dir, project_name, args, context): config = cls._get_config(project_dir, args) return cls(config, project_name, context) @classmethod def get(cls, project_name, args, context): # If project stored in memory, just return it if project_name in cls._storage: return cls._storage[project_name] # If project directory exists, load project from directory and update in-memory storage project_dir = cls.get_project_dir(project_name, args) if os.path.exists(project_dir): project = cls._load_from_dir(project_dir, project_name, args, context) cls._storage[project_name] = project return project raise ProjectNotFound( 'Project {p} doesn\'t exist'.format(p=project_name)) @classmethod def create(cls, project_name, args, context): # "create" method differs from "get" as it can create new directory with project resources project_dir = cls.create_project_dir(project_name, args) project = cls._load_from_dir(project_dir, project_name, args, context) cls._storage[project_name] = project return project @classmethod def get_or_create(cls, project_name, args, context): try: project = cls.get(project_name, args, context) logger.info('Get project "' + project_name + '".') except ProjectNotFound: project = cls.create(project_name, args, context) logger.info('Project "' + project_name + '" created.') return project def update_on_boarding_state(self): self.on_boarding['setup'] = self.config.get('label_config_updated', False) self.on_boarding['import'] = len(self.tasks) > 0 self.on_boarding['labeled'] = len(os.listdir( self.config['output_dir'])) > 0 return self.on_boarding
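# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): next_task() above
# serves the next unlabeled task either sequentially (iteration order of the
# tasks dict) or uniformly at random, returning None when everything is
# completed.  pick_next_task() below is a hypothetical, self-contained
# re-statement of that selection logic over a plain dict, so the two sampling
# modes can be tried outside the server.
import random


def pick_next_task(tasks, completed_tasks_ids, sampling='sequential'):
    """Return the next task without a completion yet, or None when done."""
    completed_tasks_ids = set(completed_tasks_ids)
    remaining = [task_id for task_id in tasks if task_id not in completed_tasks_ids]
    if not remaining:
        return None
    if sampling == 'sequential':
        return tasks[remaining[0]]              # first task that is still open
    if sampling == 'uniform':
        return tasks[random.choice(remaining)]  # equivalent to shuffle + take first
    raise NotImplementedError('Unknown sampling method ' + sampling)


# e.g. with tasks = {0: {...}, 1: {...}} and completed_tasks_ids=[0],
# pick_next_task(tasks, [0]) returns task 1; once every id is completed it returns None.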
class Project(object): _storage = {} def __init__(self, config, name, context=None): self.config = config self.name = name self.on_boarding = {} self.context = context or {} self.tasks = None self.labeling_classes = None self.label_config_line, self.label_config_full, self.parsed_label_config, self.input_data_tags = None, None, None, None # noqa self.derived_input_schema, self.derived_output_schema = None, None self.load_tasks() self.load_label_config() self.load_derived_schemas() self.load_labeling_classes() self.analytics = None self.load_analytics() self.project_obj = None self.ml_backends = [] self.load_project_ml_backend() self.converter = None self.load_converter() self.max_tasks_file_size = 250 def load_tasks(self): self.tasks = {} self.derived_input_schema = set() tasks = json_load(self.config['input_path']) if len(tasks) == 0: logger.warning('No tasks loaded from ' + self.config['input_path']) return for task_id, task in tasks.items(): self.tasks[int(task_id)] = task data_keys = set(task['data'].keys()) if not self.derived_input_schema: self.derived_input_schema = data_keys else: self.derived_input_schema &= data_keys print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path']) def load_label_config(self): self.label_config_full = config_comments_free(open(self.config['label_config']).read()) self.label_config_line = config_line_stripped(self.label_config_full) self.parsed_label_config = parse_config(self.label_config_line) self.input_data_tags = self.get_input_data_tags(self.label_config_line) def load_derived_schemas(self): self.derived_output_schema = { 'from_name_to_name_type': set(), 'labels': defaultdict(set) } # for all already completed tasks we update derived output schema for further label config validation for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: completions = task_with_completions['completions'] for completion in completions: self._update_derived_output_schema(completion) def load_analytics(self): collect_analytics = os.getenv('collect_analytics') if collect_analytics is None: collect_analytics = self.config.get('collect_analytics', True) collect_analytics = bool(collect_analytics) self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context) def add_ml_backend(self, params, raise_on_error=True): ml_backend = MLBackend.from_params(params) if not ml_backend.connected and raise_on_error: raise ValueError('ML backend with URL: "' + str(params['url']) + '" is not connected.') self.ml_backends.append(ml_backend) def remove_ml_backend(self, name): # remove from memory remove_idx = next((i for i, b in enumerate(self.ml_backends) if b.model_name == name), None) if remove_idx is None: raise KeyError('Can\'t remove ML backend with name "' + name + '": not found.') self.ml_backends.pop(remove_idx) # remove from config config_params = self.config.get('ml_backends', []) remove_idx = next((i for i, b in enumerate(config_params) if b['name'] == name), None) if remove_idx is not None: config_params.pop(remove_idx) self.config['ml_backends'] = config_params self._save_config() def load_project_ml_backend(self): # configure project self.project_obj = ProjectObj(label_config=self.label_config_line, label_config_full=self.label_config_full) # configure multiple machine learning backends self.ml_backends = [] ml_backends_params = self.config.get('ml_backends', []) for ml_backend_params in ml_backends_params: 
self.add_ml_backend(ml_backend_params, raise_on_error=False) def load_converter(self): self.converter = Converter(self.parsed_label_config) def load_labeling_classes(self): tree = ElementTree.parse(self.config['label_config']) root = tree.getroot() self.labeling_classes = [i.attrib['value'] for i in root.iter('Label')] @property def id(self): return self.project_obj.id @property def data_types(self): return self.project_obj.data_types @property def label_config(self): return self.project_obj.label_config @property def ml_backends_connected(self): return len(self.ml_backends) > 0 @property def task_data_login(self): return self.project_obj.task_data_login @property def task_data_password(self): return self.project_obj.task_data_password def extract_data_types(self, config): return self.project_obj.extract_data_types(config) def validate_label_config(self, config_string): logger.debug('Validate label config') self.project_obj.validate_label_config(config_string) logger.debug('Get parsed config') parsed_config = parse_config(config_string) logger.debug('Validate label config on derived input schema') self.validate_label_config_on_derived_input_schema(parsed_config) logger.debug('Validate label config on derived output schema') self.validate_label_config_on_derived_output_schema(parsed_config) def _save_config(self): with io.open(self.config['config_path'], mode='w') as f: json.dump(self.config, f, indent=2) def update_params(self, params): if 'ml_backend' in params: ml_backend_params = self._create_ml_backend_params(params['ml_backend']) self.add_ml_backend(ml_backend_params) self.config['ml_backends'].append(ml_backend_params) self._save_config() def update_label_config(self, new_label_config): label_config_file = self.config['label_config'] # save xml label config to file with io.open(label_config_file, mode='w') as f: f.write(new_label_config) # reload everything that depends on label config self.load_label_config() self.load_derived_schemas() self.load_analytics() self.load_project_ml_backend() self.load_converter() # save project config state self.config['label_config_updated'] = True with io.open(self.config['config_path'], mode='w') as f: json.dump(self.config, f) logger.info('Label config saved to: {path}'.format(path=label_config_file)) def _update_derived_output_schema(self, completion): """ Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type) and list of unique labels derived from existed completions :param completion: :return: """ for result in completion['result']: result_type = result.get('type') if result_type in ('relation', 'rating', 'pairwise'): continue if 'from_name' not in result or 'to_name' not in result: logger.error('Unexpected completion.result format: "from_name" or "to_name" not found in %r' % result) continue self.derived_output_schema['from_name_to_name_type'].add(( result['from_name'], result['to_name'], result_type )) for label in result['value'].get(result_type, []): self.derived_output_schema['labels'][result['from_name']].add(label) def validate_label_config_on_derived_input_schema(self, config_string_or_parsed_config): """ Validate label config on input schemas (tasks types and data keys) derived from imported tasks :param config_string_or_parsed_config: label config string or parsed config object :return: True if config match already imported tasks """ # check if schema exists, i.e. 
at least one task has been uploaded if not self.derived_input_schema: return config = config_string_or_parsed_config if isinstance(config, str): config = parse_config(config) input_types, input_values = set(), set() for input_items in map(itemgetter('inputs'), config.values()): for input_item in input_items: input_types.add(input_item['type']) input_values.add(input_item['value']) # check input data values: they must be in schema for item in input_values: if item not in self.derived_input_schema: raise ValidationError( 'You have already imported tasks and they are incompatible with a new config. ' 'You\'ve specified value=${item}, but imported tasks contain only keys: {input_schema_values}' .format(item=item, input_schema_values=list(self.derived_input_schema))) def validate_label_config_on_derived_output_schema(self, config_string_or_parsed_config): """ Validate label config on output schema (from_names, to_names and labeling types) derived from completions :param config_string_or_parsed_config: label config string or parsed config object :return: True if config match already created completions """ output_schema = self.derived_output_schema # check if schema exists, i.e. at least one completion has been created if not output_schema['from_name_to_name_type']: return config = config_string_or_parsed_config if isinstance(config, str): config = parse_config(config) completion_tuples = set() for from_name, to in config.items(): completion_tuples.add((from_name, to['to_name'][0], to['type'].lower())) for from_name, to_name, type in output_schema['from_name_to_name_type']: if (from_name, to_name, type) not in completion_tuples: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'name={from_name}, toName={to_name}, type={type} are expected' .format(from_name=from_name, to_name=to_name, type=type) ) for from_name, expected_label_set in output_schema['labels'].items(): if from_name not in config: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'name=' + from_name + ' is expected' ) found_labels = set(config[from_name]['labels']) extra_labels = list(expected_label_set - found_labels) if extra_labels: raise ValidationError( 'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: ' 'there are labels already created for "{from_name}":\n{extra_labels}' .format(from_name=from_name, extra_labels=extra_labels) ) def get_tasks(self): """ Load tasks from JSON files in input_path directory :return: file list """ return self.tasks def delete_tasks(self): """ Deletes all tasks & completions from filesystem, then reloads clean project :return: """ delete_dir_content(self.config['output_dir']) if os.path.exists(self.config['input_path']) and os.path.isfile(self.config['input_path']): with io.open(self.config['input_path'], mode='w') as f: json.dump({}, f) # delete everything on ML backend if self.ml_backends_connected: for m in self.ml_backends: m.clear(self) # reload everything related to tasks self.load_tasks() self.load_derived_schemas() def next_task(self, completed_tasks_ids): completed_tasks_ids = set(completed_tasks_ids) sampling = self.config.get('sampling', 'sequential') if sampling == 'sequential': actual_tasks = (self.tasks[task_id] for task_id in self.tasks if task_id not in completed_tasks_ids) return next(actual_tasks, None) elif sampling == 'uniform': actual_tasks_ids = [task_id for task_id in self.tasks if task_id not in 
completed_tasks_ids] if not actual_tasks_ids: return None random.shuffle(actual_tasks_ids) return self.tasks[actual_tasks_ids[0]] else: raise NotImplementedError('Unknown sampling method ' + sampling) def get_task_ids(self): """ Get task ids only :return: list of task ids """ return list(self.tasks.keys()) def get_task(self, task_id): """ Get one task :param task_id: :return: task """ try: task_id = int(task_id) except ValueError: return None return self.tasks.get(task_id) def iter_completions(self): root_dir = self.config['output_dir'] os.mkdir(root_dir) if not os.path.exists(root_dir) else () files = os.listdir(root_dir) for f in files: if f.endswith('.json'): yield os.path.join(root_dir, f) def get_completions_ids(self): """ List completion ids from output_dir directory :return: filenames without extensions and directories """ completions = [] for f in self.iter_completions(): completions.append(int(os.path.splitext(os.path.basename(f))[0])) logger.debug('{num} completions found in {output_dir}'.format( num=len(completions), output_dir=self.config["output_dir"])) return sorted(completions) def get_completed_at(self, task_ids): """ Get completed time for list of task ids :param task_ids: list of task ids :return: list of string with formatted datetime """ root_dir = self.config['output_dir'] existing_completions = set(self.get_completions_ids()) ids = existing_completions.intersection(task_ids) times = {i: os.path.getmtime(os.path.join(root_dir, str(i) + '.json')) for i in ids} times = {i: datetime.fromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S') for i, t in times.items()} return times def get_task_with_completions(self, task_id): """ Get task with completions :param task_id: task ids :return: json dict with completion """ try: task_id = int(task_id) # check task_id is int (disallow to escape from output_dir) except ValueError: return None if 'completions' in self.tasks[task_id]: return self.tasks[task_id] filename = os.path.join(self.config['output_dir'], str(task_id) + '.json') if os.path.exists(filename): data = json.load(open(filename)) # tasks can hold the newest version of predictions, so task it from tasks data['predictions'] = self.tasks[task_id].get('predictions', []) else: data = None return data def get_area_set(self): area_set = set() number_set = set() area_set.add('ALL') number_set.add('ALL') for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: area, num = task_with_completions['data']['area'].split()[-1].split('_') area_set.add(area) # show col number_set.add(num) return list(area_set), list(number_set) def get_object_points(self): area_points_dict = defaultdict(list) for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: completions = task_with_completions['completions'] area_key = task_with_completions['data']['area'].split()[-1] for completion in completions: for result in completion['result']: points = result['value']['points'] area_points_dict[area_key].append(points) return area_points_dict def get_area_class_number(self): area_dict = {} for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: completions = task_with_completions['completions'] area_key = task_with_completions['data']['area'].split()[-1] class_defect_number = 
area_dict.get(area_key, {c: 0 for c in self.labeling_classes}) for completion in completions: for result in completion['result']: class_defect_number[result['value']['polygonlabels'][0]] += 1 area_dict[area_key] = class_defect_number return area_dict def get_class_area_number(self): class_defect_dict = {c: {} for c in self.labeling_classes} for task_id in self.get_task_ids(): task_with_completions = self.get_task_with_completions(task_id) if task_with_completions and 'completions' in task_with_completions: completions = task_with_completions['completions'] area_key = task_with_completions['data']['area'].split()[-1] for completion in completions: for result in completion['result']: c = result['value']['polygonlabels'][0] class_defect_dict[c][area_key] = class_defect_dict[c].get(area_key, 0) + 1 class_defect_dict[c]['total'] = class_defect_dict[c].get('total', 0) + 1 return class_defect_dict def save_completion(self, task_id, completion): """ Save completion :param task_id: task id :param completion: json data from label (editor) """ # try to get completions with task first task = self.get_task_with_completions(task_id) # init task if completions with task not exists if not task: task = self.get_task(task_id) task['completions'] = [] # update old completion updated = False if 'id' in completion: for i, item in enumerate(task['completions']): if item['id'] == completion['id']: task['completions'][i].update(completion) updated = True # write new completion if not updated: completion['id'] = task['id'] * 1000 + len(task['completions']) + 1 task['completions'].append(completion) try: self._update_derived_output_schema(completion) except Exception as exc: logger.error(exc, exc_info=True) logger.debug(json.dumps(completion, indent=2)) # write task + completions to file filename = os.path.join(self.config['output_dir'], str(task_id) + '.json') os.mkdir(self.config['output_dir']) if not os.path.exists(self.config['output_dir']) else () json.dump(task, open(filename, 'w'), indent=4, sort_keys=True) # ------------------------- Save Labeling images ------------------------- # # Read Image filename, root = task['data']['image'].split('?d=%2F') root = '/' + root.replace('%2F', '/') filename = os.path.split(filename)[-1] filepath = os.path.join(root, filename) objs = [[obj['value']['polygonlabels'][0], obj['value']['points']] for obj in task['completions'][0]['result']] img = cv2.imread(filepath) img_draw = np.zeros_like(img) h, w, _ = img.shape # draw and save image save_dir = os.path.join(os.path.split(root)[0], 'image-finish') save_filepath = os.path.join(save_dir, filename) if len(objs) > 0: for obj in objs: label, points = obj points = (np.array([points]) / 100 * [w, h]).astype(np.int32) img_draw = cv2.fillPoly(img_draw, points, [0, 0, 255]) img_draw = self.print_chinese_opencv(img_draw, label, (points[0, :, 0].min(), points[0, :, 1].min()), (0, 0, 255)) img = cv2.addWeighted(img, 1, img_draw, 0.7, 0) else: cv2.putText(img_draw, 'No defect', (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) img = cv2.addWeighted(img, 1, img_draw, 0.7, 0) cv2.imwrite(save_filepath, img) return completion['id'] def delete_completion(self, task_id): """ Delete completion from disk :param task_id: task id """ filename = os.path.join(self.config['output_dir'], str(task_id) + '.json') os.remove(filename) self.load_tasks() self.load_derived_schemas() def make_predictions(self, task): task = deepcopy(task) task['predictions'] = [] try: for ml_backend in self.ml_backends: if not ml_backend.connected: continue 
predictions = ml_backend.make_predictions(task, self) predictions['created_by'] = ml_backend.model_name task['predictions'].append(predictions) except Exception as exc: logger.debug(exc) return task def train(self): completions = [] for f in self.iter_completions(): completions.append(json_load(f)) train_status = False if self.ml_backends_connected: for ml_backend in self.ml_backends: if ml_backend.connected: ml_backend.train(completions, self) train_status = True return train_status @classmethod def print_chinese_opencv(cls, im, chinese, pos, color): img_PIL = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB)) font = ImageFont.truetype('NotoSansCJK-Bold.ttc', 12) fillColor = color[::-1] position = (pos[0], pos[1]-12) draw = ImageDraw.Draw(img_PIL) draw.text(position, chinese, font=font, fill=fillColor) img = cv2.cvtColor(np.asarray(img_PIL), cv2.COLOR_RGB2BGR) return img @classmethod def get_project_dir(cls, project_name, args): return os.path.join(args.root_dir, project_name) @classmethod def get_input_data_tags(cls, label_config): tag_iter = ElementTree.fromstring(label_config).iter() return [ tag for tag in tag_iter if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$') ] @classmethod def _load_tasks(cls, input_path, args, label_config_file): with io.open(label_config_file) as f: label_config = f.read() task_loader = Tasks() if args.input_format == 'json': return task_loader.from_json_file(input_path) if args.input_format == 'json-dir': return task_loader.from_dir_with_json_files(input_path) input_data_tags = cls.get_input_data_tags(label_config) if len(input_data_tags) > 1: val = ",".join(tag.attrib.get("name") for tag in input_data_tags) print('Warning! Multiple input data tags found: ' + val + '. Only first one is used.') elif len(input_data_tags) == 0: raise ValueError( 'You\'ve specified input format "{fmt}" which requires label config being explicitly defined. ' 'Please specify --label-config=path/to/config.xml or use --format=json or format=json_dir'.format( fmt=args.input_format) ) input_data_tag = input_data_tags[0] data_key = input_data_tag.attrib.get('value').lstrip('$') if args.input_format == 'text': return task_loader.from_text_file(input_path, data_key) if args.input_format == 'text-dir': return task_loader.from_dir_with_text_files(input_path, data_key) if args.input_format == 'image-dir': return task_loader.from_dir_with_image_files(input_path, data_key) if args.input_format == 'audio-dir': return task_loader.from_dir_with_audio_files(input_path, data_key) raise RuntimeError('Can\'t load tasks for input format={}'.format(args.input_format)) @classmethod def _create_ml_backend_params(cls, url): if '=http' in url: name, url = url.split('=', 1) else: name = str(uuid4())[:8] if not is_url(url): raise ValueError('Specified string "' + url + '" doesn\'t look like URL.') return {'url': url, 'name': name} @classmethod def create_project_dir(cls, project_name, args): """ Create project directory in args.root_dir/project_name, and initialize there all required files If some files are missed, restore them from defaults. If config files are specified by args, copy them in project directory :param project_name: :param args: :return: """ dir = cls.get_project_dir(project_name, args) os.makedirs(dir, exist_ok=True) config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json')) def already_exists_error(what, path): raise RuntimeError('{path} {what} already exists. 
Use "--force" option to recreate it.'.format( path=path, what=what )) input_path = args.input_path or config.get('input_path') # save label config config_xml = 'config.xml' config_xml_path = os.path.join(dir, config_xml) label_config_file = args.label_config or config.get('label_config') if label_config_file: copy2(label_config_file, config_xml_path) print(label_config_file + ' label config copied to ' + config_xml_path) else: if os.path.exists(config_xml_path) and not args.force: already_exists_error('label config', config_xml_path) if not input_path: # create default config with polygons only if input data is not set default_label_config = find_file('examples/image_polygons/config.xml') copy2(default_label_config, config_xml_path) print(default_label_config + ' label config copied to ' + config_xml_path) else: with io.open(config_xml_path, mode='w') as fout: fout.write('<View></View>') print('Empty config has been created in ' + config_xml_path) config['label_config'] = config_xml # save tasks.json tasks_json = 'tasks.json' tasks_json_path = os.path.join(dir, tasks_json) if input_path: tasks = cls._load_tasks(input_path, args, config_xml_path) with io.open(tasks_json_path, mode='w') as fout: json.dump(tasks, fout, indent=2) print('{tasks_json_path} input file with {n} tasks has been created from {input_path}'.format( tasks_json_path=tasks_json_path, n=len(tasks), input_path=input_path)) else: if os.path.exists(tasks_json_path) and not args.force: already_exists_error('input path', tasks_json_path) with io.open(tasks_json_path, mode='w') as fout: json.dump({}, fout) print(tasks_json_path + ' input path has been created with empty tasks.') config['input_path'] = tasks_json # create completions dir completions_dir = os.path.join(dir, 'completions') if os.path.exists(completions_dir) and not args.force: already_exists_error('output dir', completions_dir) if os.path.exists(completions_dir): delete_dir_content(completions_dir) print(completions_dir + ' output dir already exists. Clear it.') else: os.makedirs(completions_dir, exist_ok=True) print(completions_dir + ' output dir has been created.') config['output_dir'] = 'completions' if 'ml_backends' not in config or not isinstance(config['ml_backends'], list): config['ml_backends'] = [] if args.ml_backends: for url in args.ml_backends: config['ml_backends'].append(cls._create_ml_backend_params(url)) if args.sampling: config['sampling'] = args.sampling if args.port: config['port'] = args.port if args.host: config['host'] = args.host # create config.json config_json = 'config.json' config_json_path = os.path.join(dir, config_json) if os.path.exists(config_json_path) and not args.force: already_exists_error('config', config_json_path) with io.open(config_json_path, mode='w') as f: json.dump(config, f, indent=2) print('') print('Label Studio has been successfully initialized. 
Check project states in ' + dir) print('Start the server: label-studio start ' + dir) return dir @classmethod def get_config(cls, project_name, args): return cls._get_config(cls.get_project_dir(project_name, args)) @classmethod def _get_config(cls, project_dir, args=None): """ Get config from input args Namespace acquired by Argparser :param args: :return: """ # check if project directory exists if not os.path.exists(project_dir): project_name = args.project_name if args is not None else '<project_name>' raise FileNotFoundError( 'Couldn\'t find directory ' + project_dir + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + project_name + ' --init' ) # check config.json exists in directory config_path = os.path.join(project_dir, 'config.json') if not os.path.exists(config_path): project_name = args.project_name if args is not None else '<project_name>' raise FileNotFoundError( 'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' + project_name + ' --init' ) config_path = os.path.abspath(config_path) with io.open(config_path) as c: config = json.load(c) config['config_path'] = config_path config['input_path'] = os.path.join(os.path.dirname(config_path), config['input_path']) config['label_config'] = os.path.join(os.path.dirname(config_path), config['label_config']) config['output_dir'] = os.path.join(os.path.dirname(config_path), config['output_dir']) return config @classmethod def _load_from_dir(cls, project_dir, project_name, args, context): config = cls._get_config(project_dir, args) return cls(config, project_name, context) @classmethod def get(cls, project_name, args, context): # If project stored in memory, just return it if project_name in cls._storage: return cls._storage[project_name] # If project directory exists, load project from directory and update in-memory storage project_dir = cls.get_project_dir(project_name, args) if os.path.exists(project_dir): project = cls._load_from_dir(project_dir, project_name, args, context) cls._storage[project_name] = project return project raise ProjectNotFound('Project {p} doesn\'t exist'.format(p=project_name)) @classmethod def create(cls, project_name, args, context): # "create" method differs from "get" as it can create new directory with project resources project_dir = cls.create_project_dir(project_name, args) project = cls._load_from_dir(project_dir, project_name, args, context) cls._storage[project_name] = project return project @classmethod def get_or_create(cls, project_name, args, context): try: project = cls.get(project_name, args, context) logger.info('Get project "' + project_name + '".') except ProjectNotFound: project = cls.create(project_name, args, context) logger.info('Project "' + project_name + '" created.') return project def update_on_boarding_state(self): self.on_boarding['setup'] = self.config.get('label_config_updated', False) self.on_boarding['import'] = len(self.tasks) > 0 self.on_boarding['labeled'] = len(os.listdir(self.config['output_dir'])) > 0 return self.on_boarding
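# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): _create_ml_backend_params()
# above accepts either a bare URL or a "name=url" pair from the --ml-backends
# command line option, generating a short random name when none is given.
# parse_ml_backend_arg() below restates that parsing convention in a
# self-contained form; _looks_like_url() is a naive stand-in for the module's
# own is_url() helper and is an assumption, not the real implementation.
from uuid import uuid4


def _looks_like_url(value):
    # crude URL check used only for this sketch
    return value.startswith('http://') or value.startswith('https://')


def parse_ml_backend_arg(arg):
    """Turn 'my_model=http://localhost:9090' or 'http://localhost:9090' into params."""
    if '=http' in arg:
        name, url = arg.split('=', 1)        # explicit name given before the URL
    else:
        name, url = str(uuid4())[:8], arg    # otherwise use a short random name
    if not _looks_like_url(url):
        raise ValueError('Specified string "' + url + '" doesn\'t look like URL.')
    return {'url': url, 'name': name}


# e.g. parse_ml_backend_arg('segmenter=http://localhost:9090')
#      -> {'url': 'http://localhost:9090', 'name': 'segmenter'}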