def _validate_task(self, key, parsed_data):
    """ Validate parsed data with labeling config and task structure
    """
    is_list = isinstance(parsed_data, list)

    # we support only one task per JSON file
    if not ((is_list and len(parsed_data) == 1) or isinstance(parsed_data, dict)):
        raise TaskValidationError(
            'Error at ' + key + ':\n'
            'Cloud storages support only one task per JSON file. '
            'The task must be {} or [{}] with length = 1')

    # classic validation for one task
    validator = TaskValidator(self.project)
    try:
        new_tasks = validator.to_internal_value(parsed_data if is_list else [parsed_data])
    except TaskValidationError as e:
        # pretty-print error messages, prefixed with the storage key
        messages = e.msg_to_list()
        out = '\n'.join(key + ' :: ' + msg for msg in messages)
        raise TaskValidationError(out)

    return new_tasks[0]
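# A minimal usage sketch (the storage object and keys here are hypothetical):
# the method accepts either a single task dict or a one-element list, and
# returns the validated task.
#
#   task = storage._validate_task('bucket/task-1.json', {'data': {'text': 'hello'}})
#   task = storage._validate_task('bucket/task-2.json', [{'data': {'text': 'world'}}])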
def api_import():
    project = project_get_or_create()

    # make the uploader module compatible with Django-style requests
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()

    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest())

    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # save task file to input dir
    if os.path.isdir(project.config['input_path']):
        # tasks are in a directory: write a new file with the imported tasks
        task_dir = project.config['input_path']
        now = datetime.now()
        data = json.dumps(new_tasks, ensure_ascii=False)
        md5 = hashlib.md5(data.encode('utf-8')).hexdigest()
        name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + md5[0:8]
        path = os.path.join(task_dir, name + '.json')
        tasks = new_tasks
    else:
        # tasks are all in one file: append the new ones to it
        path = project.config['input_path']
        with open(path) as f:
            old_tasks = json.load(f)
        assert isinstance(old_tasks, list), 'Tasks from input_path must be a list'
        tasks = old_tasks + new_tasks
        logger.warning("It's recommended to use a directory as input_path: " +
                       project.config['input_path'] + ' -> ' +
                       os.path.dirname(project.config['input_path']))

    with open(path, 'w') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=4)

    # load new tasks
    project.reload()

    duration = time.time() - start
    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration
    }), status.HTTP_201_CREATED)
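# For illustration: the directory branch above writes files named
# import-<YYYY-MM-DD-HH-MM>-<first 8 md5 hex chars>.json, e.g. (hypothetical values):
#
#   import-2020-01-15-10-30-a1b2c3d4.json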
def api_import():
    """ The main API for task import; supports

        * JSON task data
        * files (as a web form; files will be hosted by this Flask server)
        * URL links to images, audio, and CSV (if you use TimeSeries in the labeling config)
    """
    # make the uploader module compatible with Django-style requests
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()

    # get tasks from request
    parsed_data, formats = uploader.load_tasks(DjangoRequest(), g.project)

    # validate tasks
    validator = TaskValidator(g.project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # get the last task id
    max_id_in_old_tasks = -1
    if not g.project.no_tasks():
        max_id_in_old_tasks = g.project.source_storage.max_id()

    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    try:
        g.project.source_storage.set_many(new_tasks.keys(), new_tasks.values())
    except NotImplementedError:
        raise NotImplementedError('Import is not supported for the current storage: ' +
                                  str(g.project.source_storage))

    # if tasks carry completions, save them implicitly to the target storage
    for i in new_tasks.keys():
        for completion in new_tasks[i].get('completions', []):
            g.project.save_completion(int(i), completion)

    # update schemas based on newly uploaded tasks
    g.project.update_derived_input_schema()
    g.project.update_derived_output_schema()

    duration = time.time() - start
    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration,
        'formats': formats,
        'new_task_ids': list(new_tasks)
    }), status.HTTP_201_CREATED)
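# A hypothetical client call against this endpoint (the route path below is an
# assumption; use whatever URL rule api_import is actually registered under):
#
#   import requests
#   resp = requests.post('http://localhost:8080/api/import',
#                        json=[{'text': 'example task'}])
#   print(resp.json())  # e.g. {'task_count': 1, 'completion_count': 0, ...}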
def api_import():
    project = project_get_or_create()

    # make the uploader module compatible with Django-style requests
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()

    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest(), project)

    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # tasks are all in one file: append the new ones to it
    path = project.config['input_path']
    with open(path) as f:
        old_tasks = json.load(f)
    max_id_in_old_tasks = max(map(int, old_tasks.keys())) if old_tasks else -1
    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    old_tasks.update(new_tasks)

    with open(path, 'w') as f:
        json.dump(old_tasks, f, ensure_ascii=False, indent=4)

    # load new tasks and everything related
    project.load_tasks()
    project.load_derived_schemas()

    duration = time.time() - start
    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration,
        'new_task_ids': list(new_tasks)
    }), status.HTTP_201_CREATED)
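# `Tasks` is not defined in these snippets. A minimal sketch of the assumed
# contract, based only on how it is called above: from_list_of_dicts takes a
# list of task dicts and a starting id, and returns a dict-like object mapping
# sequential ids to tasks (which is why old_tasks.update(new_tasks) works).
# The real implementation may differ.
class Tasks(dict):
    def from_list_of_dicts(self, task_list, start_id=0):
        # assign sequential ids starting from start_id
        for offset, task in enumerate(task_list):
            self[start_id + offset] = task
        return self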
def api_import():
    # make the uploader module compatible with Django-style requests
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()

    # get tasks from request
    parsed_data, formats = uploader.load_tasks(DjangoRequest(), g.project)

    # validate tasks
    validator = TaskValidator(g.project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # get the last task id
    max_id_in_old_tasks = -1
    if not g.project.no_tasks():
        max_id_in_old_tasks = g.project.source_storage.max_id()

    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    g.project.source_storage.set_many(new_tasks.keys(), new_tasks.values())

    # if tasks carry completions, save them implicitly to the target storage
    for i in new_tasks.keys():
        for completion in new_tasks[i].get('completions', []):
            g.project.save_completion(int(i), completion)

    # update schemas based on newly uploaded tasks
    g.project.update_derived_input_schema()
    g.project.update_derived_output_schema()

    duration = time.time() - start
    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration,
        'formats': formats,
        'new_task_ids': list(new_tasks)
    }), status.HTTP_201_CREATED)
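# Example of a task payload that would trigger the implicit completion save in
# the loop above (a hypothetical minimal shape; 'completions' holds a list of
# completion dicts):
#
#   {'data': {'text': 'example'}, 'completions': [{'result': []}]}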
class ImportState(object):
    # TODO: define SQLAlchemy declarative_base()
    _db = {}

    object_to_formats, format_to_object = read_object_formats()
    AMBIGUOUS_TASKS_LIST_FORMATS = {'csv', 'tsv', 'txt'}

    def __init__(self, filelist=(), tasks=(), project=None, **kwargs):
        super(ImportState, self).__init__(**kwargs)

        # these are actual db columns
        self.id = 0
        self.reset()

        self.project = project
        self.filelist = filelist
        self.tasks = tasks
        self.preview_size = 10

        self._validator = None
        if project and (filelist or tasks):
            self._update()

    def reset(self):
        self.project = None
        self.filelist = ()
        self.tasks = ()
        self.found_formats = {}
        self.selected_formats = None
        self.selected_objects = None
        self.columns_to_draw = []
        self.data_keys = []
        self.files_as_tasks_list = {'type': None, 'selected': True}
        self.show_files_as_tasks_list = None

    def serialize(self):
        return {
            'id': self.id,
            'project': self.project.name,
            'task_preview': self.tasks_preview,
            'columns_to_draw': self.columns_to_draw,
            'total_tasks': self.total_tasks,
            'total_completions': self.total_completions,
            'total_predictions': self.total_predictions,
            'found_formats': self.found_formats,
            'selected_formats': self.selected_formats,
            'selected_objects': self.selected_objects,
            'files_as_tasks_list': self.files_as_tasks_list,
            'show_files_as_tasks_list': self.show_files_as_tasks_list
        }

    def _get_selected_objects(self):
        objects = []
        for format in self.selected_formats:
            normalized_format = format.lower().lstrip('.')
            if self.files_as_tasks_list['selected'] and \
                    normalized_format in self.AMBIGUOUS_TASKS_LIST_FORMATS:
                objects.append(None)
            else:
                objects.append(self.format_to_object.get(normalized_format))
        return objects

    def _show_files_as_tasks_list(self):
        for format in self.selected_formats:
            norm_format = format.lower().lstrip('.')
            if norm_format in self.AMBIGUOUS_TASKS_LIST_FORMATS:
                return True
        return False

    def _generate_label_config(self):
        # TODO: this is a temp workaround to guess an initial config - we should make it prettier
        data_keys = list(self.project.derived_input_schema)
        if len(data_keys) > 1:
            # better to use Table here
            return '<View></View>'
        if len(data_keys) == 1:
            data_key = data_keys[0]
            objects = set(self.selected_objects) if self.selected_objects else [None]
            if len(objects) > 1:
                raise ValidationError('More than one data type is presented')
            object_tag = list(objects)[0]
            if not object_tag:
                return '<View></View>'
            data_key = object_tag.lower() if data_key == Settings.UPLOAD_DATA_UNDEFINED_NAME else data_key
            return '<View><{0} name="{1}" value="${2}"/></View>'.format(
                object_tag, object_tag.lower(), data_key)

    def _read_tasks(self, num_tasks=None):
        request_files = {}
        for filename in self.filelist:
            request_files[filename] = open(os.path.join(self.project.upload_dir, filename), mode='rb')
        with get_temp_dir() as tmpdir:
            files = aggregate_files(request_files, tmpdir, self.project.upload_dir)
            tasks, found_formats, data_keys = aggregate_tasks(
                files, self.project, self.selected_formats,
                self.files_as_tasks_list['selected'], num_tasks)
            for file in files.values():
                try:
                    file.close()
                except Exception:
                    pass
        return tasks, found_formats, data_keys

    def _raise_if_inconsistent_with_current_project(self):
        project_data_keys = self.project.data_keys
        if project_data_keys:
            import_data_keys = list(filter(
                lambda k: k != Settings.UPLOAD_DATA_UNDEFINED_NAME, self.data_keys))
            if import_data_keys and import_data_keys != project_data_keys:
                raise ValidationError(
                    "Import data is inconsistent with the current project:\n"
                    "imported column names {}\nare inconsistent with the common columns found in the dataset: {}"
                    .format(','.join(import_data_keys), ','.join(project_data_keys)))

    def _update(self):
        if self.filelist:
            self.tasks, found_formats, self.data_keys = self._read_tasks()
            self._raise_if_inconsistent_with_current_project()

            if not self.found_formats:
                # it's the first time we get all formats
                self.found_formats = found_formats
            if self.selected_formats is None:
                # it's the first time we get all formats
                self.selected_formats, self.selected_objects = [], []
                for format in sorted(found_formats.keys()):
                    self.selected_formats.append(format)

            self.selected_objects = self._get_selected_objects()
            self.show_files_as_tasks_list = self._show_files_as_tasks_list()

        # validate tasks
        self._validator = TaskValidator(self.project)
        self.tasks = self._validator.to_internal_value(self.tasks)

    def apply(self):
        # get the last task id
        max_id_in_old_tasks = -1
        if not self.project.no_tasks():
            max_id_in_old_tasks = self.project.source_storage.max_id()

        # now read all tasks
        # currently self._update() reads all tasks - uncomment this on change
        # all_tasks, _, _ = self._read_tasks()
        all_tasks = self.tasks

        new_tasks = Tasks().from_list_of_dicts(all_tasks, max_id_in_old_tasks + 1)
        try:
            self.project.source_storage.set_many(new_tasks.keys(), new_tasks.values())
        except NotImplementedError:
            raise NotImplementedError(
                'Import is not supported for the current storage, '
                'change the storage type in the project settings: ' + str(self.project.source_storage))

        # if tasks carry completions, save them implicitly to the target storage
        for i in new_tasks.keys():
            for completion in new_tasks[i].get('completions', []):
                self.project.save_completion(int(i), completion)

        # update schemas based on newly uploaded tasks
        self.project.update_derived_input_schema()
        self.project.update_derived_output_schema()

        if self.project.label_config_is_empty:
            generated_label_config = self._generate_label_config()
            self.project.update_label_config(generated_label_config)

        return new_tasks

    @property
    def tasks_preview(self):
        preview = []
        for task in self.tasks[:self.preview_size]:
            t = deepcopy(task['data'])
            if 'completions' in task:
                t['completions'] = task['completions']
            if 'predictions' in task:
                t['predictions'] = task['predictions']
            preview.append(t)
        return preview

    @property
    def total_tasks(self):
        return len(self.tasks)

    @property
    def total_completions(self):
        return self._validator.completion_count

    @property
    def total_predictions(self):
        return self._validator.prediction_count

    @classmethod
    def create_from_filelist(cls, filelist, project):
        # a single in-memory record is reused until a real DB table exists
        _id = 1
        if _id not in cls._db:
            i = ImportState()
            i.id = _id
            cls._db[_id] = i
        import_state = cls._db[_id]
        import_state.reset()
        import_state.filelist = filelist
        import_state.project = project
        import_state._update()
        return import_state

    @classmethod
    def create_from_data(cls, data, project):
        if isinstance(data, dict):
            tasks = [data]
        elif isinstance(data, list):
            tasks = data
        else:
            raise ValidationError('Incorrect input data type: it must be a JSON dict or list')

        _id = 1
        if _id not in cls._db:
            i = ImportState()
            i.id = _id
            cls._db[_id] = i
        import_state = cls._db[_id]
        import_state.reset()
        import_state.tasks = tasks
        import_state.project = project
        import_state._update()
        return import_state

    @classmethod
    def get_by_id(cls, id):
        return cls._db[id]

    def update(self, **import_state_interface):
        for name, value in import_state_interface.items():
            setattr(self, name, value)
        self._update()
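# Hypothetical usage of ImportState, assuming a loaded `project` object:
#
#   state = ImportState.create_from_data({'text': 'Hello world'}, project)
#   print(state.serialize()['total_tasks'])   # -> 1
#   new_tasks = state.apply()                 # writes tasks to the source storage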
def api_import():
    project = project_get_or_create()

    # make the uploader module compatible with Django-style requests
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()

    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest())

    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # save task file to input dir
    if os.path.isdir(project.config['input_path']):
        # tasks are in a directory: write a new file with the imported tasks
        task_dir = project.config['input_path']
        now = datetime.now()
        data = json.dumps(new_tasks, ensure_ascii=False)
        md5 = hashlib.md5(data.encode('utf-8')).hexdigest()
        name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + md5[0:8]
        path = os.path.join(task_dir, name + '.json')
        tasks = new_tasks
    else:
        # tasks are all in one file: append the new ones to it
        path = project.config['input_path']
        with open(path) as f:
            old_tasks = json.load(f)
        assert isinstance(old_tasks, list), 'Tasks from input_path must be a list'
        tasks = old_tasks + new_tasks

        # duplicate the tasks so each one appears `numcomps` times, distributed
        # round-robin in batches of `param` (`param` is expected to be defined
        # at module level as the batch size)
        temp = copy.deepcopy(tasks)
        tasks[:] = []
        numcomps = 3
        startingindex = 0
        count = [0] * len(temp)
        for i in range(len(temp)):
            for j in range(param):
                if j + startingindex < len(temp):
                    if count[j + startingindex] < numcomps:
                        tasks.append(temp[j + startingindex])
                        count[j + startingindex] += 1
            if len(tasks) % (numcomps * param) == 0:
                startingindex += param

        logger.warning("It's recommended to use a directory as input_path: " +
                       project.config['input_path'] + ' -> ' +
                       os.path.dirname(project.config['input_path']))

    with open(path, 'w') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=4)

    # load new tasks
    project.reload()

    duration = time.time() - start

    # group the new task numbers into batches of `param` for the annotation queue
    num_tasks = len(new_tasks)
    task_queue = make_task_queue(num_tasks)

    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration
    }), status.HTTP_201_CREATED)
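# make_task_queue is called above but not defined in these snippets. Based on the
# batching logic it replaced (sequential task numbers grouped into chunks of
# `param`, with a smaller remainder batch at the end), a minimal sketch might look
# like this; `param` is taken as a parameter here so the sketch is self-contained,
# and its default value is an assumption:
def make_task_queue(num_tasks, param=3):
    queue, batch = [], []
    for task_number in range(1, num_tasks + 1):
        batch.append(task_number)
        if len(batch) == param:
            # batch is full: push it onto the queue and start a new one
            queue.append(batch)
            batch = []
    if batch:
        # remainder batch with fewer than `param` tasks
        queue.append(batch)
    return queue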