def init(filename=None):
    """Configure the root logger to write to a log file and a stream handler.

    Args:
        filename: Path of the log file. When ``None`` the file
            ``./logs/<yy-mm-dd>_auto.log`` is used and ``./logs`` is created
            through ``FileUtils.createDir``.

    If the root logger already has handlers and the target file already
    exists, the call returns without touching the configuration. Otherwise
    every existing root handler is detached and replaced by one
    ``FileHandler`` and one ``StreamHandler`` sharing the same format, and
    the root level is set to ``INFO``.
    """
    rootLogger = logging.getLogger()
    if filename is None:
        FileUtils.createDir('./logs')
        filename = os.path.abspath(
            './logs/' + datetime.now().strftime('%y-%m-%d_auto') + '.log')

    if rootLogger.handlers and os.path.exists(filename):
        return

    # Iterate over a snapshot: removeHandler() mutates rootLogger.handlers,
    # so looping over the live list would skip every other handler.
    for each in list(rootLogger.handlers):
        rootLogger.removeHandler(each)

    logFormatter = logging.Formatter(
        "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s"
    )
    # setLevel() (rather than assigning .level) also invalidates the
    # logging module's cached isEnabledFor() results.
    rootLogger.setLevel(logging.INFO)

    fileHandler = logging.FileHandler(filename)
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)

    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)
def create_local(args):
    """Create the per-run CSV file containing only its header row.

    The file lives at ``./tasks/<report>/runs/<name>/<name>.csv``. Nothing is
    done when ``args.name`` is empty or the file already exists. The header
    columns come from ``args.params_report_local``. Any exception is logged
    and swallowed, so the function always returns ``None``.
    """
    try:
        if not args.name or len(args.name) <= 0:
            return
        run_dir = './tasks/' + args.report + '/runs/' + args.name
        filename = run_dir + '/' + args.name + '.csv'
        if os.path.exists(filename):
            return

        header_columns = args.params_report_local
        with open(filename, 'w') as outfile:
            # Hold the project's file lock while the header is written.
            FileUtils.lock_file(outfile)
            outfile.write(','.join(header_columns) + '\n')
            outfile.flush()
            os.fsync(outfile)
            FileUtils.unlock_file(outfile)
    except Exception as e:
        logging.error(str(e))
        logging.error(traceback.format_exception(*sys.exc_info()))
def create(args):
    """Create the report CSV ``reports/<report>.csv`` with its header row.

    Nothing is done when ``args.report`` is empty or the file already exists.
    The header is ``args.params_report`` followed by ``args.params_grid``
    (when that is not ``None``); the grid names are appended with ``+=``, so a
    list in ``args.params_report`` is extended in place. Any exception is
    logged and swallowed.
    """
    try:
        if not args.report or len(args.report) <= 0:
            return
        filename = os.path.join('reports', args.report) + '.csv'
        if os.path.exists(filename):
            return

        header_columns = args.params_report
        if args.params_grid is not None:
            header_columns += args.params_grid

        with open(filename, 'w') as outfile:
            # Hold the project's file lock while the header is written.
            FileUtils.lock_file(outfile)
            outfile.write(','.join(header_columns) + '\n')
            outfile.flush()
            os.fsync(outfile)
            FileUtils.unlock_file(outfile)
    except Exception as e:
        logging.error(str(e))
        logging.error(traceback.format_exception(*sys.exc_info()))
def process_dists(idx_start, y_each, y_list, path_embeddings, sample_count,
                  classes_size, embedding_size, triplet_similarity, mode):
    """Fill the rows of the distance memmap that belong to one class file.

    Loads ``<path_embeddings>/<y_each>.json`` / ``.mmap`` and, for every
    label in ``y_list``, writes the distance between each of those embeddings
    and the centroid row of that label into
    ``dists.mmap[idx_start:idx_start + count, label]``.

    Args:
        idx_start: First row of ``dists.mmap`` owned by this class file.
        y_each: Class id used to build the embedding file names.
        y_list: Labels whose centroid columns are filled; may contain
            repeated labels.
        path_embeddings: Directory holding the ``.json`` / ``.mmap`` files.
        sample_count: Number of rows of ``dists.mmap``.
        classes_size: Number of columns of ``dists.mmap``.
        embedding_size: Number of columns of the embedding memmaps.
        triplet_similarity, mode: Forwarded unchanged to ``get_distance``.

    Any exception is logged with its traceback and swallowed.
    """
    try:
        path_emb_json = f'{path_embeddings}/{y_each}.json'
        path_emb_mem = f'{path_embeddings}/{y_each}.mmap'
        path_dists_mem = f'{path_embeddings}/dists.mmap'

        dists_mem = np.memmap(path_dists_mem,
                              mode='r+',
                              dtype=np.float16,
                              shape=(sample_count, classes_size))

        emb_json = FileUtils.loadJSON(path_emb_json)
        emb_count = emb_json['count']
        emb_mem = np.memmap(path_emb_mem,
                            mode='r',
                            dtype=np.float16,
                            shape=(emb_count, embedding_size))

        # NOTE(review): the centroids are read from the same file as the
        # distance matrix ('dists.mmap'). This looks like a copy-paste slip
        # for a separate centroid file, but the writer is outside this
        # function, so the path is left unchanged here -- confirm and fix
        # both sides together.
        path_centroids_mem = f'{path_embeddings}/dists.mmap'
        centroids_mem = np.memmap(path_centroids_mem,
                                  mode='r',
                                  dtype=np.float16,
                                  shape=(classes_size, embedding_size))

        # Each label maps to exactly one output column, so a repeated label
        # would only recompute and rewrite the same values. Visit distinct
        # labels once, in first-seen order.
        for idx_y in dict.fromkeys(y_list):
            np_class_centroids_tiled = np.tile(centroids_mem[idx_y],
                                               (emb_count, 1))
            dists = get_distance(emb_mem, np_class_centroids_tiled,
                                 triplet_similarity, mode).tolist()
            dists_mem[idx_start:idx_start + emb_count, idx_y] = dists[:]
        #dists_mem.flush()
    except Exception as e:
        logging.error(str(e))
        exc_type, exc_value, exc_tb = sys.exc_info()
        logging.error('\n'.join(
            traceback.format_exception(exc_type, exc_value, exc_tb)))
parser.add_argument('-debug_batch_count', default=0, type=int) # 0 = release version parser.add_argument('-embedding_size', default=32, type=int) parser.add_argument('-gamma', default=0.0, type=float) parser.add_argument('-C_0', default=0.0, type=float) parser.add_argument('-C_n', default=5.0, type=float) parser.add_argument('-C_interval', default=10000, type=int) parser.add_argument('-C_start', default=0, type=int) args, args_other = parser.parse_known_args() path_sequence = f'./results/{args.sequence_name}' args.run_name += ('-' + datetime.utcnow().strftime(f'%y-%m-%d--%H-%M-%S')) path_run = f'./results/{args.sequence_name}/{args.run_name}' FileUtils.createDir(path_run) path_artifacts = f'./artifacts/{args.sequence_name}/{args.run_name}' FileUtils.createDir(path_artifacts) FileUtils.writeJSON(f'{path_run}/args.json', args.__dict__) CsvUtils2.create_global(path_sequence) CsvUtils2.create_local(path_sequence, args.run_name) summary_writer = tensorboard_utils.CustomSummaryWriter( logdir=path_run ) rootLogger = logging.getLogger() logFormatter = logging.Formatter("%(asctime)s [%(process)d] [%(thread)d] [%(levelname)s] %(message)s") rootLogger.level = logging.INFO #level
default=False, type=lambda x: (str(x).lower() == 'true'))

# parse_known_args() keeps unrecognised options in args_other; they are then
# merged into args and their names collected separately.
args, args_other = parser.parse_known_args()
args = ArgsUtils.add_other_args(args, args_other)
args_other_names = ArgsUtils.extract_other_args_names(args_other)

if args.is_restricted_cpu:
    # Imported lazily so the dependency is only needed when the flag is set.
    from cgroups import Cgroup
    # pip install cgroups
    # sudo /home/ubuntu/anaconda3/bin/user_cgroups ubuntu
    # sudo /home/evalds/.conda/envs/conda_env/bin/user_cgroups evalds

# add all testable parameters to final report header
args.params_report += args_other_names

# Output directories for reports and generated tasks.
FileUtils.createDir('./reports')
FileUtils.createDir('./tasks')
FileUtils.createDir('./tasks/' + args.report)

logging_utils = LoggingUtils(filename=os.path.join('reports', args.report + '.txt'))
ArgsUtils.log_args(args, 'taskgen.py', logging_utils)

# Defaults, overridden key by key with whatever tasks/tasks.json contains.
task_settings = {'id': 0, 'repeat_id': 0}
tasks_settings_path = os.path.join('tasks', 'tasks.json')
if os.path.exists(tasks_settings_path):
    with open(tasks_settings_path, 'r') as outfile:
        tasks_settings_loaded = json.load(outfile)
        for key in tasks_settings_loaded:
            task_settings[key] = tasks_settings_loaded[key]
'score_best', 'loss', 'loss_dqn', 'loss_inverse', 'loss_forward', 'cosine_distance'
]
# Prepend user-supplied report columns that are not already in tmp; iterating
# in reverse with insert(0, ...) keeps their original relative order.
if not args.params_report is None:
    for it in reversed(args.params_report):
        if not it in tmp:
            tmp.insert(0, it)
args.params_report = tmp
# The per-run report uses the same column list (same list object).
args.params_report_local = args.params_report

FileUtils.createDir('./tasks/' + args.report)
run_path = './tasks/' + args.report + '/runs/' + args.name
# Start from an empty run directory: remove any previous one first.
if os.path.exists(run_path):
    shutil.rmtree(run_path, ignore_errors=True)
    time.sleep(3)
    # NOTE(review): busy-waits at full CPU and never terminates if the
    # directory cannot be removed (rmtree errors are ignored above).
    while os.path.exists(run_path):
        pass
FileUtils.createDir(run_path)

logging_utils = LoggingUtils(filename=os.path.join(run_path, 'log.txt'))
is_logged_cnorm = False
ArgsUtils.log_args(args, 'main.py', logging_utils)
CsvUtils.create_local(args)
parser.add_argument('-hpc_mem', help='HPC - override mem GB', default=0, type=int)
# Boolean flag: only the (case-insensitive) string 'true' parses as True.
parser.add_argument('-is_hpc', help='is HPC qsub tasks or local tasks', default=True, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('-hpc_queue', help='hpc queue', default='batch', type=str)

# parse_known_args() keeps unrecognised options in args_other instead of failing.
args, args_other = parser.parse_known_args()

# Output directories for reports and generated tasks.
FileUtils.createDir('./reports')
FileUtils.createDir('./tasks')
#FileUtils.createDir('./tasks/' + args.report)
if args.is_hpc:
    # HPC mode additionally needs ~/tmp to exist.
    FileUtils.createDir(os.path.expanduser('~') + '/tmp')

logging_utils = LoggingUtils(name=os.path.join('reports', args.report + '.txt'))

# Default task counters; previously saved settings are loaded from tasks/tasks.json.
task_settings = {'id': 0, 'repeat_id': 0}
hpc_settings_path = os.path.join('tasks', 'tasks.json')
if os.path.exists(hpc_settings_path):
    with open(hpc_settings_path, 'r') as outfile:
        hpc_settings_loaded = json.load(outfile)
def add_hparams(path_sequence, run_name, args_dict, metrics_dict, global_step):
    """Write one row of hyper-parameters and metrics to the run and sequence CSVs.

    Two files are updated, each only if it already exists:

    * ``<path_sequence>/<run_name>.csv`` (local): the row is always appended.
    * ``<path_sequence>/sequence-<basename>.csv`` (global): the first row
      whose argument columns equal the current argument values is replaced,
      otherwise the row is appended.

    Args:
        path_sequence: Directory containing both CSV files.
        run_name: Name of the run; used for the local CSV file name.
        args_dict: Hyper-parameters. Only ``float``, ``int``, ``str`` and
            ``np.float32`` values are written; other entries are skipped.
        metrics_dict: Metrics, filtered the same way as ``args_dict``.
        global_step: Value written to the first (``step``) column.

    The input dictionaries are not modified. Commas inside values are
    replaced by ``_``. Any exception is logged and swallowed.
    """
    # np.float / np.int were plain aliases of the builtins and were removed in
    # NumPy 1.24; referencing them raised AttributeError, which the handler
    # below swallowed, silently skipping the whole update.
    scalar_types = (float, int, str, np.float32)
    try:
        path_local_csv = f'{path_sequence}/{run_name}.csv'
        path_global_csv = f'{path_sequence}/sequence-{os.path.basename(path_sequence)}.csv'

        # Work on shallow copies so the caller's dictionaries stay untouched.
        args_dict = copy.copy(args_dict)
        metrics_dict = copy.copy(metrics_dict)
        for each_dict in [args_dict, metrics_dict]:
            for key in list(each_dict.keys()):
                if not isinstance(each_dict[key], scalar_types):
                    del each_dict[key]

        for path_csv in [path_local_csv, path_global_csv]:
            if not os.path.exists(path_csv):
                continue
            with open(path_csv, 'r+') as outfile:
                FileUtils.lock_file(outfile)
                try:
                    lines_all = outfile.readlines()
                    lines_all = [
                        it.replace('\n', '').split(',')
                        for it in lines_all if ',' in it
                    ]
                    if len(lines_all) == 0 or len(lines_all[0]) < 2:
                        headers = ['step'] + list(args_dict.keys()) + list(metrics_dict.keys())
                        headers = [str(it).replace(',', '_') for it in headers]
                        lines_all.append(headers)

                    values = [global_step] + list(args_dict.values()) + list(metrics_dict.values())
                    values = [str(it).replace(',', '_') for it in values]

                    if path_csv == path_local_csv:
                        lines_all.append(values)
                    else:
                        # global: one row per distinct argument combination
                        args_values = [
                            str(it).replace(',', '_')
                            for it in args_dict.values()
                        ]
                        existing_line_idx = -1
                        for idx_line, line in enumerate(lines_all):
                            # Column 0 is the step; the argument columns
                            # follow. Slicing means a row shorter than the
                            # argument list is a mismatch, not an IndexError.
                            if len(line) > 1 and line[1:1 + len(args_values)] == args_values:
                                existing_line_idx = idx_line
                                break
                        if existing_line_idx >= 0:
                            lines_all[existing_line_idx] = values
                        else:
                            lines_all.append(values)

                    # Rewrite the whole file from the in-memory rows.
                    outfile.truncate(0)
                    outfile.seek(0)
                    outfile.flush()
                    rows = [','.join(it) for it in lines_all]
                    rows = [it for it in rows if len(it.replace('\n', '').strip()) > 0]
                    outfile.write('\n'.join(rows).strip())
                    outfile.flush()
                    os.fsync(outfile)
                finally:
                    # Release the lock even when parsing or writing fails.
                    FileUtils.unlock_file(outfile)
    except Exception as e:
        logging.exception(e)
# /simpsons/test.mmap # /simpsons/test.json parser.add_argument('-path_output', default='/Users/evalds/Downloads/simpsons_x/', type=str) # scale and squeeze images to this size parser.add_argument('-size_img', default=128, type=int) parser.add_argument('-thread_max', default=10, type=int) parser.add_argument('-test_split', default=0.2, type=float) args, args_other = parser.parse_known_args() FileUtils.createDir(args.path_output) logging_utils = LoggingUtils( f"{args.path_output}/simpsons-{datetime.now().strftime('%y-%m-%d_%H-%M-%S')}.log" ) class_names = [] last_class_name = None mmap_shape = [0, 3, args.size_img, args.size_img] logging_utils.info( f'move test samples into train to change from classification to re-identification task' ) paths_files = FileUtils.listSubFiles(args.path_input_test) for path_file in paths_files:
args_other_names = ArgsUtils.extract_other_args_names(args_other)

# Collapse the class-id collections into single space-separated strings.
if len(args.datasource_include_test_class_ids) > 0:
    args.datasource_include_test_class_ids = ' '.join(
        args.datasource_include_test_class_ids)
if len(args.datasource_exclude_train_class_ids) > 0:
    args.datasource_exclude_train_class_ids = ' '.join(
        args.datasource_exclude_train_class_ids)

if args.hpc_queue == 'inf':
    args.hpc_gpu_max_queue = 0  # for old ones disable GPU

# add all testable parameters to final report header
args.params_report += args_other_names

# Output directories for reports and generated tasks.
FileUtils.createDir('./reports')
FileUtils.createDir('./tasks')
FileUtils.createDir('./tasks/' + args.report)
if args.is_hpc:
    # HPC mode additionally needs ~/tmp to exist.
    FileUtils.createDir(os.path.expanduser('~') + '/tmp')

logging_utils = LoggingUtils(filename=os.path.join('reports', args.report + '.txt'))
ArgsUtils.log_args(args, 'taskgen.py', logging_utils)

# Default task counters; previously saved settings are loaded from tasks/tasks.json.
task_settings = {'id': 0, 'repeat_id': 0}
hpc_settings_path = os.path.join('tasks', 'tasks.json')
if os.path.exists(hpc_settings_path):
    with open(hpc_settings_path, 'r') as outfile:
        hpc_settings_loaded = json.load(outfile)
        for key in hpc_settings_loaded:
def __init__(self, args, is_test_data):
    """Load a torchvision dataset and group its samples by class.

    Args:
        args: Parsed arguments. Read here: ``path_data``,
            ``datasource_type``, ``datasource_is_grayscale``,
            ``datasource_exclude_train_class_ids``,
            ``datasource_include_test_class_ids``,
            ``datasource_size_samples``, ``batch_size`` and
            ``triplet_positives``. Written here: ``input_size``,
            ``input_features`` and, for training data,
            ``datasource_classes_train``.
        is_test_data: ``True`` loads the test split, ``False`` the train
            split.

    The dataset is constructed (with ``download=True``) while holding a
    file lock on ``<path_data>/<datasource_type>/lock``. The process exits
    if ``batch_size`` is not a multiple of, or not larger than,
    ``triplet_positives``.
    """
    super().__init__()

    self.args = args
    self.is_test_data = is_test_data

    path_data = f'{self.args.path_data}/{self.args.datasource_type}'
    FileUtils.createDir(path_data)

    lock_path = f'{path_data}/lock'
    if not os.path.exists(lock_path):
        with open(lock_path, 'w') as fp_download_lock:
            fp_download_lock.write('')
        time.sleep(1.0)

    with open(lock_path, 'r+') as fp_download_lock:
        FileUtils.lock_file(fp_download_lock)
        try:
            transform_colors = torchvision.transforms.ToTensor()
            if self.args.datasource_is_grayscale:
                transform_colors = torchvision.transforms.Compose([
                    torchvision.transforms.Grayscale(),
                    torchvision.transforms.ToTensor()
                ])

            if self.args.datasource_type == 'fassion_mnist':
                self.dataset = torchvision.datasets.FashionMNIST(
                    path_data,
                    download=True,
                    train=not is_test_data,
                    transform=torchvision.transforms.ToTensor())
            elif self.args.datasource_type == 'mnist':
                self.dataset = torchvision.datasets.MNIST(
                    path_data,
                    download=True,
                    train=not is_test_data,
                    transform=torchvision.transforms.ToTensor())
            elif self.args.datasource_type == 'cifar_10':
                self.dataset = torchvision.datasets.CIFAR10(
                    path_data,
                    download=True,
                    train=not is_test_data,
                    transform=transform_colors)
            elif self.args.datasource_type == 'cifar_100':
                self.dataset = torchvision.datasets.CIFAR100(
                    path_data,
                    download=True,
                    train=not is_test_data,
                    transform=transform_colors)
            elif self.args.datasource_type == 'emnist':
                # extended mnist https://arxiv.org/pdf/1702.05373.pdf
                self.dataset = torchvision.datasets.EMNIST(
                    path_data,
                    download=True,
                    split='balanced',
                    train=not is_test_data,
                    transform=torchvision.transforms.Compose([
                        lambda img: torchvision.transforms.functional.rotate(
                            img, -90),
                        lambda img: torchvision.transforms.functional.hflip(
                            img),
                        torchvision.transforms.ToTensor()
                    ]))
        finally:
            # Release the lock even when the download or construction fails.
            FileUtils.unlock_file(fp_download_lock)

    # Class ids are 0..max(target); groups[i] collects the samples of class i.
    self.classes = np.arange(np.array(self.dataset.targets).max() +
                             1).tolist()
    groups = [{'samples': [], 'counter': 0} for _ in self.classes]

    for img, label_idx in self.dataset:
        groups[int(label_idx)]['samples'].append(img)

    args.input_size = img.size(1)  # channels, w, h
    args.input_features = img.size(0)

    # Indices are removed in descending order so earlier deletions do not
    # shift the positions of the ones still to be removed.
    if not is_test_data:
        ids = [
            int(it) for it in self.args.datasource_exclude_train_class_ids
        ]
        ids = sorted(ids, reverse=True)
        for remove_id in ids:
            del self.classes[remove_id]
            del groups[remove_id]
    else:
        if len(self.args.datasource_include_test_class_ids):
            # Keep only the listed classes: drop everything else.
            ids = set(self.classes) - set([
                int(it)
                for it in self.args.datasource_include_test_class_ids
            ])
            ids = list(ids)
            ids = sorted(ids, reverse=True)
            for remove_id in ids:
                del self.classes[remove_id]
                del groups[remove_id]

    # np.int was an alias of the builtin int and was removed in NumPy 1.24.
    self.classes = np.array(self.classes, dtype=int)
    self.size_samples = 0
    for idx, group in enumerate(groups):
        samples = group['samples']
        self.size_samples += len(samples)
    self.groups = groups

    # for debugging purposes
    # DEBUGGING
    if self.args.datasource_size_samples > 0:
        logging.info(
            f'debugging: reduced data size {self.args.datasource_size_samples}'
        )
        self.size_samples = self.args.datasource_size_samples

    logging.info(
        f'{self.args.datasource_type} {"test" if is_test_data else "train"}: classes: {len(groups)} total triplets: {self.size_samples}'
    )

    if not is_test_data:
        self.args.datasource_classes_train = len(
            groups)  # override class count

    if self.args.batch_size % self.args.triplet_positives != 0 or self.args.batch_size <= self.args.triplet_positives:
        logging.error(
            f'batch does not accommodate triplet_positives {self.args.batch_size} {self.args.triplet_positives}'
        )
        exit()

    self.reshuffle()
def add_results(args, state):
    """Insert or update this task's row in ``reports/<report>.csv``.

    Args:
        args: Parsed arguments. ``args.report`` names the CSV,
            ``args.params_report`` is the header used when the file has to be
            created, ``args.id`` identifies the row to replace.
        state: Mapping of result values; takes precedence over ``args`` when
            a header column exists in both.

    The row is built in header order: value from ``state``, else attribute of
    ``args``, else an empty string. An existing row whose ``id`` column
    equals ``str(args.id)`` is replaced, otherwise the row is appended.
    Nothing is done when ``args.report`` is empty. Any exception is logged
    and swallowed.
    """
    try:
        if not (args.report and len(args.report) > 0):
            return

        filename = os.path.join('reports', args.report) + '.csv'
        if not os.path.exists(filename):
            os.makedirs('./reports', exist_ok=True)
            with open(filename, 'w') as outfile:
                FileUtils.lock_file(outfile)
                try:
                    outfile.write(','.join(args.params_report) + '\n')
                    outfile.flush()
                    os.fsync(outfile)
                finally:
                    FileUtils.unlock_file(outfile)

        with open(filename, 'r+') as outfile:
            FileUtils.lock_file(outfile)
            try:
                raw_lines = outfile.readlines()
                if len(raw_lines) > 0:
                    headers = raw_lines[0].strip().split(',')
                else:
                    headers = args.params_report

                # Row 0 is always the header; data rows are the remaining
                # non-empty lines that contain at least one comma.
                lines_all = [headers]
                for line in raw_lines[1:]:
                    line = line.strip()
                    if len(line) > 0 and ',' in line:
                        lines_all.append(line.split(','))

                args_vars = vars(args)
                line_new = []
                for key in headers:
                    if key in state:
                        # gather from state
                        line_new.append(str(state[key]))
                    elif key in args_vars:
                        # gather also from args
                        line_new.append(str(getattr(args, key)))
                    else:
                        # if not found empty
                        line_new.append('')

                # look for existing line to override
                part_idx_id = headers.index('id')
                is_exist = False
                try:
                    task_id = str(args.id)
                    for idx_line in range(1, len(lines_all)):
                        parts = lines_all[idx_line]
                        # A malformed row without an id column is skipped;
                        # previously it raised IndexError, aborted the search
                        # and led to a duplicate row being appended.
                        if len(parts) <= part_idx_id:
                            continue
                        if task_id == parts[part_idx_id].strip():
                            lines_all[idx_line] = line_new
                            is_exist = True
                            break
                except Exception as e:
                    logging.exception(e)
                if not is_exist:
                    lines_all.append(line_new)

                # Rewrite the whole file from the in-memory rows.
                outfile.truncate(0)
                outfile.seek(0)
                outfile.flush()
                rows = [','.join(it) for it in lines_all]
                outfile.write('\n'.join(rows))
                outfile.flush()
                os.fsync(outfile)
            finally:
                # Release the lock even when parsing or writing fails.
                FileUtils.unlock_file(outfile)
    except Exception as e:
        # logging.exception records the message together with a readable
        # traceback (the previous code logged the repr of a list of lines).
        logging.exception(e)
def calculate_accuracy(
        path_embeddings,
        meter_acc: tnt.meter.ClassErrorMeter,
        meter_auc: tnt.meter.AUCMeter,
        type='range',
        norm='l2',
        triplet_similarity='cos',
        mode='cpu',
        embedding_size=None,
        class_max_dist=None,  # precomputed
        class_centroids=None,
        y_list=None,  # precomputed
        sample_count=None,  # precomputed
        paths_embs_idx_path_pairs=None):  # precomputed
    """Feed centroid-distance based predictions into the given meters.

    Args:
        path_embeddings: Directory with one ``<class_id>.json`` /
            ``<class_id>.mmap`` pair per class; ``dists.mmap`` is created
            there as a cache of the sample-to-centroid distance matrix.
        meter_acc: Receives ``add(predicted, labels)`` per chunk.
        meter_auc: Receives ``add(max_probability, is_correct)`` per chunk.
        type: ``'range'`` scores a class when the sample is closer to its
            centroid than that class's distance threshold; any other value
            picks the single closest centroid.
        norm: ``'l2'`` normalises each centroid with ``normalize_vec``.
        triplet_similarity, mode: Forwarded unchanged to ``get_distance``.
        embedding_size: Number of columns of the embedding memmaps.
        class_max_dist, class_centroids, y_list, sample_count,
        paths_embs_idx_path_pairs: Values returned by a previous call. When
            ``class_max_dist`` is ``None`` all five are recomputed from the
            embedding files.

    Returns:
        Tuple ``(class_max_dist, class_centroids, y_list, sample_count,
        paths_embs_idx_path_pairs)`` for reuse in later calls.
    """
    paths_embs = FileUtils.listSubFiles(path_embeddings)

    # calculate centroids first
    if class_max_dist is None:
        class_centroids = {}
        class_max_dist = {}
        y_list = []
        paths_embs_idx_path_pairs = []
        sample_count = 0
        for path_emb in paths_embs:
            if not path_emb.endswith('.json'):
                continue
            y_each = int(os.path.basename(path_emb).split('.')[0])
            path_emb_json = f'{path_embeddings}/{y_each}.json'
            path_emb_mem = f'{path_embeddings}/{y_each}.mmap'
            emb_json = FileUtils.loadJSON(path_emb_json)
            emb_mem = np.memmap(path_emb_mem,
                                mode='r',
                                dtype=np.float16,
                                shape=(emb_json['count'], embedding_size))

            # Remember where this class's rows start in the global ordering.
            paths_embs_idx_path_pairs.append((sample_count, y_each))
            sample_count += emb_json['count']

            # np.int was an alias of the builtin int (removed in NumPy 1.24).
            y_list += (np.ones(
                (emb_json['count'], ), dtype=int) * y_each).tolist()

            class_centroids[y_each] = np.average(emb_mem, axis=0)
            if norm == 'l2':
                class_centroids[y_each] = normalize_vec(
                    class_centroids[y_each])

            np_class_centroids_tiled = np.tile(class_centroids[y_each],
                                               (len(emb_mem), 1))
            list_dists = get_distance(np_class_centroids_tiled, emb_mem,
                                      triplet_similarity, mode).tolist()
            list_dists = sorted(list_dists, reverse=False)
            # drop 10 top percent embeddings as they could contain noise
            list_dists = list_dists[:max(2, int(len(list_dists) * 0.9))]
            class_max_dist[y_each] = list_dists[-1]  # last largest distance

    classes_size = int(np.max(y_list)) + 1

    # store distance matrix as memmap for optimization
    path_dists_mem = f'{path_embeddings}/dists.mmap'
    is_exist_dists_mem = os.path.exists(path_dists_mem)
    dists_mem = np.memmap(path_dists_mem,
                          mode='r+' if is_exist_dists_mem else 'w+',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))
    #dists_mem.flush()

    # NOTE(review): the centroids are written to the same file as the
    # distance matrix ('dists.mmap'), so the two arrays overlap on disk. This
    # looks like a copy-paste slip for a separate centroid file, but
    # process_dists reads the same path, so it is left unchanged here --
    # confirm and fix both sides together.
    path_centroids_mem = f'{path_embeddings}/dists.mmap'
    is_exist_centroids_mem = os.path.exists(path_centroids_mem)
    centroids_mem = np.memmap(path_centroids_mem,
                              mode='r+' if is_exist_centroids_mem else 'w+',
                              dtype=np.float16,
                              shape=(classes_size, embedding_size))
    for key, value in class_centroids.items():
        centroids_mem[key] = value
    #centroids_mem.flush()

    # The distance matrix is only computed when no cached file existed.
    if not is_exist_dists_mem:
        Parallel(n_jobs=multiprocessing.cpu_count() * 2, backend='threading')(
            delayed(process_dists)(idx_start, y_each, y_list, path_embeddings,
                                   sample_count, classes_size, embedding_size,
                                   triplet_similarity, mode)
            for idx_start, y_each in paths_embs_idx_path_pairs)

    dists_mem = np.memmap(path_dists_mem,
                          mode='r',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))

    # iterate through precomputed distances to add to data to meters for mem optimization
    chunk_size = 1024
    # Step by chunk_size so chunks tile the samples without overlap. The
    # previous loop used the chunk *index* as the sample offset, which made
    # chunks overlap and left most samples unevaluated.
    for idx_chunk_start in range(0, sample_count, chunk_size):
        idx_chunk_end = min(sample_count, idx_chunk_start + chunk_size)
        chunk_each_size = idx_chunk_end - idx_chunk_start

        # np.float was an alias of the builtin float (removed in NumPy 1.24).
        if type == 'range':
            predicted = np.zeros((chunk_each_size, classes_size),
                                 dtype=float)
        else:
            predicted = np.ones(
                (chunk_each_size, classes_size), dtype=float) * 1e9

        dists = dists_mem[idx_chunk_start:idx_chunk_end]
        for idx_y in class_max_dist.keys():
            max_dist = class_max_dist[idx_y]
            if type == 'range':
                for idx_emb, dist in enumerate(dists):
                    if max_dist > dist[idx_y]:
                        predicted[idx_emb, idx_y] += 1.0
            else:
                # store for each class closest embedding with distance value
                predicted[:, idx_y] = np.minimum(predicted[:, idx_y],
                                                 dists[:, idx_y])

        if type == 'range':
            # Normalise hits to per-sample probabilities; the epsilon avoids
            # a division by zero for samples that matched no class.
            predicted = predicted / (np.sum(predicted, axis=1, keepdims=True) +
                                     1e-18)
        else:
            # TODO softmax/hardmax based accuracy
            # for each sample select closest distance
            idx_class = np.argmin(predicted, axis=1)
            predicted = np.zeros_like(predicted)  # init probabilities vector
            # for each sample set prob 100% by columns
            predicted[np.arange(predicted.shape[0]), idx_class] = 1.0

        y_chunk = np.array(y_list[idx_chunk_start:idx_chunk_end])
        meter_acc.add(predicted, y_chunk)

        # AssertionError: targets should be binary (0, 1)
        idxes_classes = np.argmax(predicted, axis=1)
        target_tp = np.array(np.equal(y_chunk, idxes_classes), dtype=int)
        meter_auc.add(np.max(predicted, axis=1), target_tp)

    return class_max_dist, class_centroids, y_list, sample_count, paths_embs_idx_path_pairs