def download_and_organize_pred_data(path_to_dataset, dataset_obj, filters=None):
    if filters is None:
        query = dl.Filters().prepare()['filter']
        filters = dl.Filters()
        filters.custom_filter = query
    if not os.path.exists(os.path.dirname(path_to_dataset)):
        os.mkdir(os.path.dirname(path_to_dataset))
    os.mkdir(path_to_dataset)
    dataset_obj.items.download(local_path=path_to_dataset, filters=filters)
    images_folder = os.path.join(path_to_dataset, 'items')
    logger.info('downloaded ' + str(len(os.listdir(images_folder))) + ' to ' + images_folder)

    # move images and annotations to a fixed format
    for path, subdirs, files in os.walk(images_folder):
        for name in files:
            filename, ext = os.path.splitext(name)
            if ext.lower() not in ['.jpg', '.jpeg', '.png']:
                continue
            img_path = os.path.join(path, name)
            new_img_path = os.path.join(path_to_dataset, name)
            os.rename(img_path, new_img_path)

    # delete sub-directories, leave only images and jsons
    for stuff in os.listdir(images_folder):
        im_path = os.path.join(images_folder, stuff)
        if os.path.isdir(im_path):
            shutil.rmtree(im_path, ignore_errors=True)
def maybe_download_pred_data(dataset_obj, val_query):
    # check if data is downloaded; if not, download it
    val_filters = dl.Filters()
    val_filters.custom_filter = val_query
    logger.info('val query: ' + str(val_query))
    logger.info('filters: ' + str(val_filters.prepare()))
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    path_to_put_data = os.path.join(parent_dir, 'data')
    if not os.path.exists(path_to_put_data):
        os.mkdir(path_to_put_data)

    if data_format == 'dataloop':
        dataset_name = dataset_obj.name
        path_to_pred_dataset = os.path.join(path_to_put_data, dataset_name, 'predict_on')
        # download d.names
        try:
            dataset_obj.items.get('/d.names').download(local_path=os.path.join(path_to_put_data, dataset_name))
        except:
            pass
        if os.path.exists(path_to_pred_dataset):
            logger.info(dataset_name + ' already exists, no need to download')
        else:
            download_and_organize_pred_data(path_to_pred_dataset, dataset_obj, val_filters)
    else:
        name = dataset_obj.directory_tree.dir_names[-2].strip('/')
        if os.path.exists(os.path.join(path_to_put_data, name)):
            logger.info(name + ' already exists, no need to download')
        else:
            dataset_obj.items.download(local_path=path_to_put_data, to_items_folder=False)
            logger.info('downloaded dataset to ' + path_to_put_data)
def main(first_project_name, second_project_name, first_dataset_name, second_dataset_name):
    """
    Copy a folder from one project/dataset to another
    :return:
    """
    import dtlpy as dl

    # get source project and dataset
    project = dl.projects.get(project_name=first_project_name)
    dataset_from = project.datasets.get(dataset_name=first_dataset_name)

    # filter to get all files of a specific folder
    filters = dl.Filters()
    filters.add(field='type', values='file')  # get only files
    filters.add(field='filename', values='/source_folder/**')  # get all items in the folder (recursive)
    pages = dataset_from.items.list(filters=filters)

    # get destination project and dataset
    project = dl.projects.get(project_name=second_project_name)
    dataset_to = project.datasets.get(dataset_name=second_dataset_name)

    # go over all items and copy each one from src to dst
    for page in pages:
        for item in page:
            # download the item (without saving to disk)
            buffer = item.download(save_locally=False)
            # give the item's name to the buffer
            buffer.name = item.name
            # upload the item
            new_item = dataset_to.items.upload(local_path=buffer, remote_path='/destination_folder')
            print(new_item.filename)
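# A minimal usage sketch for the copy function above; the project and dataset names
# here are hypothetical placeholders, not values from the original code.
if __name__ == '__main__':
    main(first_project_name='Source Project',
         second_project_name='Destination Project',
         first_dataset_name='source-dataset',
         second_dataset_name='destination-dataset')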
def __init__(self, optimal_model, ongoing_trials=None, remote=False):
    self.optimal_model = optimal_model
    self.ongoing_trials = ongoing_trials
    self.remote = remote
    self.num_available_devices = torch.cuda.device_count()
    self.home_path = optimal_model.data['home_path']
    self.dataset_name = optimal_model.data['dataset_name']
    self.package_name = 'zazuml'
    if self.remote:
        dataset_obj = get_dataset_obj(optimal_model.dataloop)
        self.project = dl.projects.get(project_id=dataset_obj.projects[0])
        self.dataset_id = dataset_obj.id
        try:
            self.train_query = optimal_model.dataloop['train_query']
        except:
            self.train_query = dl.Filters().prepare()['filter']
        try:
            # TODO: train query is still being copied
            try:
                self.val_query = deepcopy(self.train_query)
            except:
                self.val_query = dl.Filters().prepare()
            self.val_query['filter']['$and'][0]['dir'] = optimal_model.dataloop['test_dir']
        except:
            try:
                self.val_query = optimal_model.dataloop['val_query']
            except:
                self.val_query = dl.Filters().prepare()['filter']
        with open('global_configs.json', 'r') as fp:
            global_project_name = json.load(fp)['project']
        self.global_project = dl.projects.get(project_name=global_project_name)

    # TODO: don't convert here
    if self.optimal_model.name == 'yolov3':
        if self.optimal_model.data['annotation_type'] == 'coco':
            self._convert_coco_to_yolo_format()
            self.optimal_model.data['annotation_type'] = 'yolo'
def maybe_download_data(dataset_obj, train_query, val_query):
    # check if data is downloaded; if not, download it
    train_filters = dl.Filters()
    train_filters.custom_filter = train_query
    val_filters = dl.Filters()
    val_filters.custom_filter = val_query
    logger.info('train query: ' + str(train_query))
    logger.info('filters: ' + str(train_filters.prepare()))
    logger.info('val query: ' + str(val_query))
    logger.info('filters: ' + str(val_filters.prepare()))
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    path_to_put_data = os.path.join(parent_dir, 'data')
    if not os.path.exists(path_to_put_data):
        os.mkdir(path_to_put_data)

    if data_format == 'dataloop':
        dataset_name = dataset_obj.name
        path_to_dataset = os.path.join(path_to_put_data, dataset_name, 'train')
        path_to_val_dataset = os.path.join(path_to_put_data, dataset_name, 'val')
        if os.path.exists(path_to_dataset):
            logger.info(dataset_name + ' already exists, no need to download')
            if not os.path.exists(os.path.join(os.path.dirname(path_to_dataset), 'annotations')):
                convert_dataloop_to_coco(path_to_data=path_to_dataset, name='train', split_val=False)
                convert_dataloop_to_coco(path_to_data=path_to_val_dataset, name='val', split_val=False)
        else:
            download_and_organize(path_to_dataset, dataset_obj, train_filters)
            download_and_organize(path_to_val_dataset, dataset_obj, val_filters)
            convert_dataloop_to_coco(path_to_data=path_to_dataset, name='train', split_val=False)
            convert_dataloop_to_coco(path_to_data=path_to_val_dataset, name='val', split_val=False)
    else:
        name = dataset_obj.directory_tree.dir_names[-2].strip('/')
        if os.path.exists(os.path.join(path_to_put_data, name)):
            logger.info(name + ' already exists, no need to download')
        else:
            dataset_obj.items.download(local_path=path_to_put_data, to_items_folder=False)
            logger.info('downloaded dataset to ' + path_to_put_data)
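# A hedged sketch of how the train/val queries passed to maybe_download_data might be
# built with dl.Filters. The helper name and the directory values ('/train', '/val')
# are illustrative assumptions, not part of the original code.
def build_queries():
    train_filters = dl.Filters()
    train_filters.add(field='dir', values='/train')
    val_filters = dl.Filters()
    val_filters.add(field='dir', values='/val')
    # prepare() returns the DQL dict; its 'filter' part is what the queries above expect
    return train_filters.prepare()['filter'], val_filters.prepare()['filter']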
def main():
    import dtlpy as dl

    project = dl.projects.get(project_name='project_name')

    ##################
    # create trigger #
    ##################
    # create an Item trigger that will trigger the service with the given id
    # when an item is created
    trigger = project.triggers.create(service_ids=['some_service_id'],
                                      resource='Item',
                                      actions=['Created'],
                                      active=True)

    # create an Annotation trigger that will trigger the service with the given id
    # when an annotation is deleted
    trigger = project.triggers.create(service_ids=['some_service_id'],
                                      resource='Annotation',
                                      actions=['Deleted'],
                                      active=True)

    ##################
    # update trigger #
    ##################
    # if we want the trigger to fire on 'Created' actions
    trigger.actions = ['Created']
    trigger.update()

    # update trigger filters to work only on specific items:
    # the trigger will fire only on items with the string 'dog' in their name
    filters = dl.Filters()
    filters.add(field='filename', values='*dog*')
    trigger.filters = filters
    trigger.update()

    ##################
    # delete trigger #
    ##################
    trigger.delete()

    ############################
    # list of project triggers #
    ############################
    project.triggers.list()
def first_method(self, config, progress=None):
    """
    In this example we copy dataset items by query to a new dataset
    :param config: This is a json input
    :param progress: Use this to update the progress of your package
    :return:
    """
    # these lines can be removed
    assert isinstance(progress, dl.Progress)

    source_dataset = dl.datasets.get(dataset_id=config['source_dataset_id'])
    filters = dl.Filters(custom_filter=config['query'])
    new_dataset = source_dataset.project.datasets.create(dataset_name='{}-copy'.format(source_dataset.name),
                                                         labels=source_dataset.labels)
    new_dataset.items.upload(local_path=source_dataset.items.download(filters=filters))
    logger.info('Dataset copied successfully')
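# A hypothetical example of the 'config' json this method expects: a source dataset id
# plus a DQL query (built here with dl.Filters and prepare(), as elsewhere in this code).
# Both values are placeholders, not values from the original package.
example_config = {
    'source_dataset_id': '<source-dataset-id>',
    'query': dl.Filters(field='annotated', values=True).prepare()['filter']
}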
def first_method(self, dataset, progress=None):
    """
    Write your main package service here
    :param dataset: the dataset to create the task on
    :param progress: Use this to update the progress of your package
    :return:
    """
    # these lines can be removed
    assert isinstance(progress, dl.Progress)
    assert isinstance(dataset, dl.entities.Dataset)

    # items that are annotated and have at least one 'person' label
    filters = dl.Filters(field='annotated', values=True)
    filters.add_join(field='label', values='person')

    task = dataset.tasks.create(
        task_name='AutomatedTask',
        due_date=datetime.datetime.now().timestamp() + 60 * 60 * 24 * 7,
        assignee_ids=['*****@*****.**', '*****@*****.**'],
        filters=filters)  # scope the task to the filtered items
    logger.info('Task created successfully. Task name: {}'.format(task.name))
def add_dataloop_remote_annotations(self, project_name, dataset_name, filter_value, model_name):
    dlp.setenv('prod')
    project = dlp.projects.get(project_name)
    dataset = project.datasets.get(dataset_name)
    filters = dlp.Filters()
    filters.add(field='filename', values=filter_value)
    pages = dataset.items.list(filters=filters)
    self.models_alowed_ls = [model_name, 'gt']
    i_item = 0
    pool = ThreadPool(processes=32)
    for page in pages:
        for item in page:
            if item.filename.startswith('/.dataloop'):
                continue
            pool.apply_async(self._collect_annotations, kwds={'w_item': item})
            i_item += 1
    pool.close()
    pool.join()
def main():
    """
    Move items to another folder in the platform
    :return:
    """
    import dtlpy as dl

    # get project and dataset
    project = dl.projects.get(project_name='Ninja Turtles')
    dataset = project.datasets.get(dataset_name='Splinter')

    # get all items from the source folder
    filters = dl.Filters()
    filters.add(field='filename', values='/fighting/**')  # take files from the directory only (recursive)
    filters.add(field='type', values='file')  # only files
    pages = dataset.items.list(filters=filters)

    dst_folder = '/fighting_shredder'
    # iterate through items and move each one
    for page in pages:
        for item in page:
            item.move(new_path=dst_folder)
def __init__(self, configs, time, test_dataset, query):
    self.configs_input = dl.FunctionIO(type='Json', name='configs', value=configs)
    self.service = dl.services.get('zazu')
    project_name = configs['dataloop']['project']
    self.project = dl.projects.get(project_name)
    maybe_download_pred_data(dataset_obj=test_dataset, val_query=query)

    filters = dl.Filters()
    filters.custom_filter = query
    dataset_name = test_dataset.name
    path_to_dataset = os.path.join(os.getcwd(), dataset_name)
    download_and_organize(path_to_dataset=path_to_dataset, dataset_obj=test_dataset, filters=filters)

    json_file_path = os.path.join(path_to_dataset, 'json')
    self.compute = precision_recall_compute()
    self.compute.add_dataloop_local_annotations(json_file_path)
    self._circle(time)
def main():
    import dtlpy as dl

    ########
    # prep #
    ########
    project = dl.projects.get(project_name='RQL')
    dataset = project.datasets.get(dataset_name='Dataset')

    #################
    ###   Items   ###
    #################

    ##################
    # create filters #
    ##################
    filters = dl.Filters()
    # set resource - optional - default is item
    filters.resource = dl.FiltersResource.ITEM
    # add filter - only files
    filters.add(field='type', values='file')
    # add filter - only annotated items
    filters.add(field='annotated', values=True)
    # add filter - filename includes 'dog'
    filters.add(field='filename', values='*dog*')
    # -- time filters -- values must be in ISO format and in UTC (offset from local time).
    # converting using the datetime package as follows:
    import datetime, time
    timestamp = datetime.datetime(year=2019, month=10, day=27, hour=15, minute=39, second=6,
                                  tzinfo=datetime.timezone(
                                      datetime.timedelta(seconds=-time.timezone))).isoformat()
    filters.add(field='createdAt', values=timestamp, operator=dl.FiltersOperations.GREATER_THAN)

    ######################
    # get filtered items #
    ######################
    # return results sorted by ascending filename
    filters.sort_by(field='filename')
    pages = dataset.items.list(filters=filters)

    #########################
    # update filtered items #
    #########################
    # add the field 'annotatedDogsSingJune2019' with the value True to all filtered items
    # this field will be added to the items' user metadata
    # create update order
    update_values = {'annotatedDogsSingJune2019': True}
    # update
    pages = dataset.items.update(filters=filters, update_values=update_values)

    # #########################
    # # delete filtered items #
    # #########################
    # dataset.items.delete(filters=filters)

    #####################################
    # filter items by their annotations #
    #####################################
    filters = dl.Filters()
    # set resource
    filters.resource = 'items'
    # add filter - only files
    filters.add(field='type', values='file')
    # add annotation filters - only items with 'box' annotations
    filters.add_join(field='type', values='box')
    # get results
    pages = dataset.items.list(filters=filters)

    #######################
    ###   Annotations   ###
    #######################

    ##################
    # create filters #
    ##################
    filters = dl.Filters()
    # set resource
    filters.resource = dl.FiltersResource.ANNOTATION
    # add filter - only box annotations
    filters.add(field='type', values='box')
    # add filter - only 'Dog' or 'cat' labels
    filters.add(field='label', values=['Dog', 'cat'], operator=dl.FiltersOperations.IN)
    # add filter - annotated by specific creators
    filters.add(field='creator',
                values=['*****@*****.**', '*****@*****.**', '*****@*****.**'],
                operator=dl.FiltersOperations.IN)

    ############################
    # get filtered annotations #
    ############################
    # return results sorted by descending id
    filters.sort_by(field='id', value=dl.FiltersOrderByDirection.DESCENDING)
    pages = dataset.items.list(filters=filters)

    ###############################
    # update filtered annotations #
    ###############################
    # add the field 'annotation_quality' with the value 'high' to all filtered annotations
    # this field will be added to the annotations' user metadata
    # create update order
    update_values = {'annotation_quality': 'high'}
    # update
    pages = dataset.items.update(filters=filters, update_values=update_values)

    ###############################
    # delete filtered annotations #
    ###############################
    dataset.items.delete(filters=filters)
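# A small add-on sketch: the DQL a Filters object will send can be inspected with
# prepare(), which is handy when debugging queries like the ones built above.
# The field values here are illustrative.
filters = dl.Filters()
filters.add(field='type', values='file')
filters.add(field='filename', values='*dog*')
print(filters.prepare())  # prints the raw DQL dict that will be sent to the platform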
def run(self):
    # create temp path to save dataloop annotations
    local_annotations_path = os.path.join(tempfile.gettempdir(),
                                          'dataloop_annotations_{}'.format(hash(os.times())))
    if os.path.isdir(local_annotations_path):
        raise IsADirectoryError('path already exists')

    try:
        # download annotations zip to local directory
        project = dl.projects.get(project_name=self.project_name)
        dataset = project.datasets.get(dataset_name=self.dataset_name)
        dataset.items.download_annotations(dataset_name=self.dataset_name,
                                           local_path=os.path.join(local_annotations_path, '*'))

        # get labels-to-ids dictionary
        if 'labels_dict' not in self.params:
            self.params['labels_dict'] = {
                label: i_label
                for i_label, label in enumerate(list(dataset.labels.keys()))
            }

        output_annotations_path = os.path.join(self.output_directory, 'annotations')
        # create output directories
        if not os.path.isdir(self.output_directory):
            os.makedirs(self.output_directory)
        if not os.path.isdir(output_annotations_path):
            os.makedirs(output_annotations_path)

        # save labels
        with open(os.path.join(self.output_directory, 'labels.txt'), 'w') as f:
            f.write('\n'.join(['%s:%s' % (val, key)
                               for key, val in self.params['labels_dict'].items()]))

        # get all items (for width and height)
        filters = dl.Filters()
        filters.add(field='filename', values=self.remote_path)
        filters.add(field='type', values='file')
        pages = dataset.items.list(filters=filters)

        # init workers and results lists
        pool = ThreadPool(processes=32)
        i_item = -1
        num_items = pages.items_count
        self.outputs = [None for _ in range(num_items)]
        self.results = [None for _ in range(num_items)]
        self.errors = [None for _ in range(num_items)]

        for page in pages:
            for item in page:
                i_item += 1
                # create input annotations json path
                in_filepath = os.path.join(local_annotations_path, item.filename[1:])
                name, ext = os.path.splitext(in_filepath)
                in_filepath = name + '.json'
                # check if annotations file exists
                if not os.path.isfile(in_filepath):
                    self.results[i_item] = False
                    self.errors[i_item] = 'file not found: %s' % in_filepath
                    continue
                with open(in_filepath, 'r', encoding="utf8") as f:
                    data = json.load(f)
                pool.apply_async(self.threading_wrapper,
                                 kwds={'func': self.convert_single_file,
                                       'i_item': i_item,
                                       # input for "func"
                                       'output_directory': output_annotations_path,
                                       'item': item,
                                       'annotations': data['annotations'],
                                       'params': self.params})
        print('Done')
        pool.close()
        pool.join()
        pool.terminate()
        dummy = [logger.error(self.errors[i_job])
                 for i_job, suc in enumerate(self.results)
                 if suc is False]
        return self.outputs
    except:
        raise
    finally:
        # cleanup
        if os.path.isdir(local_annotations_path):
            shutil.rmtree(local_annotations_path)
# downloading images
for d, dataset_name in enumerate(args.dataset_name):
    try:
        dataset = project.datasets.get(dataset_name=dataset_name)
        logging.info("dataset {} found".format(dataset_name))
    except Exception:
        raise Exception("dataset {} not found".format(dataset_name))
    local_dest = os.path.join(args.local_dest, dataset_name)
    dataset.items.download(local_path=local_dest, annotation_options='json')

    # converting to yolo and downloading
    converter = dl.Converter()
    filters = dl.Filters()
    # filtering only boxes
    filters.resource = dl.FiltersResource.ANNOTATION
    filters.add(field='type', values='box')
    # downloading labels and converting
    converter.convert_dataset(dataset=dataset,
                              to_format='yolo',
                              local_path=local_dest,
                              annotation_filter=filters)
    os.rename(os.path.join(local_dest, "items"), os.path.join(local_dest, "images"))
    os.rename(os.path.join(local_dest, "yolo"), os.path.join(local_dest, "labels"))
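# The 'args' and 'project' used above are assumed to come from an argument parser and a
# prior dl.projects.get call; a minimal hypothetical setup might look like this
# (argument names and the project name are placeholders):
import argparse
import dtlpy as dl

parser = argparse.ArgumentParser()
parser.add_argument('--dataset_name', nargs='+', help='one or more dataset names to download')
parser.add_argument('--local_dest', default='data', help='local download root directory')
args = parser.parse_args()
project = dl.projects.get(project_name='my-project')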
def __init__(self, configs, time, test_dataset_id, query):
    logger.info('dtlpy version: ' + str(dl.__version__))
    logger.info('dtlpy info: ' + str(dl.info()))
    time = int(time)
    dl.setenv('prod')
    configs = json.loads(configs)
    query = json.loads(query)
    self.configs_input = dl.FunctionIO(type='Json', name='configs', value=configs)
    self.service = dl.services.get('zazu')
    project_name = configs['dataloop']['project']
    self.project = dl.projects.get(project_name)
    test_dataset = self.project.datasets.get(dataset_id=test_dataset_id)
    maybe_download_pred_data(dataset_obj=test_dataset, val_query=query)

    # add gt annotations
    filters = dl.Filters()
    filters.custom_filter = query
    dataset_name = test_dataset.name
    path_to_dataset = os.path.join(os.getcwd(), dataset_name)
    # only download if it doesn't already exist
    if not os.path.exists(path_to_dataset):
        download_and_organize(path_to_dataset=path_to_dataset, dataset_obj=test_dataset, filters=filters)

    json_file_path = os.path.join(path_to_dataset, 'json')
    self.model_obj = self.project.models.get(model_name='retinanet')
    self.adapter = self.model_obj.build(local_path=os.getcwd())
    logger.info('model built')

    while True:
        self.compute = precision_recall_compute()
        self.compute.add_dataloop_local_annotations(json_file_path)
        logger.info("running new execution")
        execution_obj = self.service.execute(function_name='search',
                                             execution_input=[self.configs_input],
                                             project_id='72bb623f-517f-472b-ad69-104fed8ee94a')
        while execution_obj.latest_status['status'] != 'success':
            sleep(5)
            execution_obj = dl.executions.get(execution_id=execution_obj.id)
            if execution_obj.latest_status['status'] == 'failed':
                raise Exception("plugin execution failed")
        logger.info("execution object status is successful")
        self.project.artifacts.download(package_name='zazuml',
                                        execution_id=execution_obj.id,
                                        local_path=os.getcwd())
        logs_file_name = 'timer_logs_' + str(execution_obj.id) + '.conf'
        graph_file_name = 'precision_recall_' + str(execution_obj.id) + '.png'
        self.cycle_logger = init_logging(__name__, filename=logs_file_name)
        logger.info('artifact download finished')
        logger.info(str(os.listdir('.')))

        # load new checkpoint and change to a unique name
        new_checkpoint_name = 'checkpoint_' + str(execution_obj.id) + '.pt'
        logger.info(str(os.listdir('.')))
        os.rename('checkpoint0.pt', new_checkpoint_name)
        new_model_name = new_checkpoint_name[:-3]
        logger.info(str(os.listdir('.')))
        new_checkpoint = torch.load(new_checkpoint_name, map_location=torch.device('cpu'))
        # self.model_obj = self.project.models.get(model_name=new_checkpoint['model_specs']['name'])
        # self.adapter = self.model_obj.build(local_path=os.getcwd())
        # logger.info('model built')
        self.new_home_path = new_checkpoint['model_specs']['data']['home_path']
        self._compute_predictions(checkpoint_path=new_checkpoint_name, model_name=new_model_name)

        if len(self.compute.by_model_name.keys()) < 2:
            # if the model can't predict anything then just skip it
            logger.info("model couldn't make any predictions, trying to train again")
            continue

        # if a previous best checkpoint doesn't exist there must not be a prediction service:
        # launch the prediction service with the new checkpoint and create a trigger
        if 'check0' not in [checkp.name for checkp in self.model_obj.checkpoints.list()]:
            logger.info('there is no check0, will upload the new checkpoint as check0 and '
                        'deploy the prediction service')
            new_checkpoint_obj = self.model_obj.checkpoints.upload(checkpoint_name='check0',
                                                                   local_path=new_checkpoint_name)
            logger.info('uploaded this checkpoint as the new check0: ' + new_checkpoint_name[:-3])
            self._maybe_launch_predict(new_checkpoint_obj)
            continue

        logger.info('check0 does exist')
        best_checkpoint = self.model_obj.checkpoints.get('check0')
        check0_path = best_checkpoint.download(local_path=os.getcwd())
        logger.info('downloading best checkpoint')
        logger.info(str(os.listdir('.')))
        logger.info('check0 path is: ' + str(check0_path))
        self._compute_predictions(checkpoint_path=check0_path, model_name=best_checkpoint.name)

        # compute metrics
        new_checkpoint_mAP = self.compute.get_metric(model_name=new_model_name, precision_to_recall_ratio=1.)
        best_checkpoint_mAP = self.compute.get_metric(model_name=best_checkpoint.name, precision_to_recall_ratio=1.)
        logger.info('best checkpoint: ' + str(best_checkpoint_mAP))
        logger.info('new checkpoint: ' + str(new_checkpoint_mAP))

        # if the new checkpoint performs better, switch out the prediction checkpoint
        if new_checkpoint_mAP > best_checkpoint_mAP:
            logger.info('new checkpoint is better')
            logger.info('uploading old best checkpoint under a new name')
            self.model_obj.checkpoints.upload(checkpoint_name='checkpoint_' + check0_path.split('_')[-1][:-3],
                                              local_path=check0_path)
            logger.info('deleting old best checkpoint')
            best_checkpoint.delete()
            logger.info('uploading new best checkpoint as check0')
            new_best_checkpoint_obj = self.model_obj.checkpoints.upload(checkpoint_name='check0',
                                                                        local_path=new_checkpoint_name)
            if 'predict' not in [s.name for s in dl.services.list()]:
                self._maybe_launch_predict(new_best_checkpoint_obj)
            else:
                self._update_predict_service(new_best_checkpoint_obj)
            logger.info('switched to the new checkpoint')

        self.compute.save_plot_metrics(save_path=graph_file_name)
        self.project.artifacts.upload(filepath=logs_file_name,
                                      package_name='zazuml',
                                      execution_id=execution_obj.id)
        self.project.artifacts.upload(filepath=graph_file_name,
                                      package_name='zazuml',
                                      execution_id=execution_obj.id)
        logger.info('waiting ' + str(time) + ' seconds for next execution . . . .')
        sleep(time)