def action_serve(argc, argv):
    """Load a trained project and serve it through the module-level `app`.

    Parses `argv`, loads the project at `args.path` into the global `prj`
    and quits with an error if loading fails or the project is not trained.
    The globals `classes` and `num_outputs` are taken from `--classes` when
    given; otherwise `num_outputs` comes from the model output shape and the
    labels from `prj.classes` (or generated `class_<i>` names when the
    project has none). With `args.profile` set, the WSGI app is wrapped in
    werkzeug's profiler middleware and debug mode is forced on.

    `argc` is not used by this function.
    """
    global prj, app, classes, num_outputs

    args = parse_args(argv)
    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()
    elif not prj.is_trained():
        log.error("no trained Keras model found for this project")
        quit()

    if args.classes is None:
        num_outputs = prj.model.output.shape[1]
        if prj.classes is None:
            classes = ["class_%d" % i for i in range(num_outputs)]
        else:
            classes = [prj.classes[i] for i in range(num_outputs)]
    else:
        # comma separated list, empty entries are dropped
        classes = [s.strip() for s in args.classes.split(',') if s.strip() != ""]
        num_outputs = len(classes)

    if args.profile:
        # werkzeug.contrib was removed in Werkzeug 1.0 and the profiler moved
        # to werkzeug.middleware.profiler; keep the old path as a fallback
        # for installations that still ship it.
        try:
            from werkzeug.middleware.profiler import ProfilerMiddleware
        except ImportError:
            from werkzeug.contrib.profiler import ProfilerMiddleware
        args.debug = True
        app.config['PROFILE'] = True
        app.wsgi_app = ProfilerMiddleware(app.wsgi_app, restrictions=[args.restrictions])

    app.run(host=args.address, port=args.port, debug=args.debug)
def action_create(argc, argv):
    """Create a new project at the path parsed from `argv`.

    Logs an error and quits when that path already exists. `argc` is unused.
    """
    options = parse_args(argv)
    already_there = os.path.exists(options.path)
    if already_there:
        log.error("path %s already exists" % options.path)
        quit()

    Project.create(options.path)
def action_view(argc, argv):
    """Load the project at `args.path` and call its `view` with `args.img_only`.

    Logs an error and quits when the project fails to load. `argc` is unused.
    """
    options = parse_args(argv)
    project = Project(options.path)

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()

    project.view(options.img_only)
def action_view(args):
    """Load the project at `args.project_path` and call its `view`.

    Logs an error and quits when the project fails to load.
    """
    project = Project(args.project_path)

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()

    project.view()
def action_view(argc, argv):
    """Load the project whose path is `argv[0]` and call its `view`.

    Calls `usage()` when no argument was given, and logs an error and quits
    when the project fails to load.
    """
    if argc < 1:
        usage()

    project = Project(argv[0])

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()

    project.view()
def action_create(args):
    """Create the directory `args.project_path` and a new project inside it.

    Logs an error and quits when the path already exists.
    """
    target = args.project_path

    already_there = os.path.exists(target)
    if already_there:
        log.error("path %s already exists" % target)
        quit()

    log.info("creating %s ..." % target)
    os.makedirs(target, exist_ok=True)
    Project.create(target)
def action_to_fdeep(argc, argv):
    """Run `convert` from the project's weights file to its fdeep path.

    Quits with an error when the project cannot be loaded or has no trained
    model. `args.no_tests` and `args.metadata` are forwarded to `convert`.
    `argc` is unused.
    """
    options = parse_args(argv)
    project = Project(options.path)

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()
    if not project.is_trained():
        log.error("no trained model found for this project")
        quit()

    convert(
        project.weights_path,
        project.fdeep_path,
        options.no_tests,
        options.metadata,
    )
def action_create(argc, argv):
    """Create the directory `argv[0]` and a new project inside it.

    Calls `usage()` unless exactly one argument was given, and logs an error
    and quits when the path already exists.
    """
    if argc != 1:
        usage()

    target = argv[0]

    already_there = os.path.exists(target)
    if already_there:
        log.error("path %s already exists" % target)
        quit()

    log.info("creating %s ..." % target)
    os.makedirs(target, exist_ok=True)
    Project.create(target)
def action_to_fdeep(args):
    """Run `convert` on the loaded project's model, writing to its fdeep path.

    Quits with an error when the project at `args.project_path` cannot be
    loaded or has no trained model.
    """
    project = Project(args.project_path)

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()
    if not project.is_trained():
        log.error("no trained Keras model found for this project")
        quit()

    log.info("converting %s to %s ...", project.weights_path, project.fdeep_path)
    convert(project.model, project.fdeep_path)
def action_serve(argc, argv):
    """Load a trained project into the global `prj` and run the global `app`.

    Quits with an error when the project cannot be loaded or is not trained.
    Host, port and debug flag come from the parsed arguments. `argc` is
    unused.
    """
    global prj, app

    options = parse_args(argv)
    prj = Project(options.path)

    load_error = prj.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()
    if not prj.is_trained():
        log.error("no trained Keras model found for this project")
        quit()

    app.run(host=options.host, port=options.port, debug=options.debug)
def action_to_tf(argc, argv):
    """Freeze the project's model and write it as `model.pb` in the project path.

    Quits with an error when the project cannot be loaded or is not trained.
    The graph is produced by `freeze_session` on the current Keras session,
    using the op names of the model outputs, and written in binary form
    (`as_text=False`). `argc` is unused.
    """
    args = parse_args(argv)
    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()
    elif not prj.is_trained():
        # message fixed: it used to read "this projec"
        log.error("no trained Keras model found for this project")
        quit()

    output_names = [out.op.name for out in prj.model.outputs]
    frozen_graph = freeze_session(K.get_session(), output_names=output_names)

    log.info("saving protobuf to %s ...", os.path.join(prj.path, 'model.pb'))
    tf.train.write_graph(frozen_graph, prj.path, "model.pb", as_text=False)
def action_train(argc, argv):
    """Train the project at `argv[0]`, preparing or reloading its dataset first.

    Calls `usage()` when no argument was given. The remaining arguments are
    parsed with `parse_args`. Quits with an error when the project cannot be
    loaded, or when no `--dataset` was given and no stored subsets exist.
    """
    if argc < 1:
        usage()

    project_path = argv[0]
    project = Project(project_path)

    load_error = project.load()
    if load_error is not None:
        log.error("error while loading project: %s", load_error)
        quit()

    options = parse_args(argv[1:])

    if options.dataset is not None:
        # an explicit dataset wins: split it and generate the subsets
        project.dataset.do_save = not options.no_save
        project.prepare(options.dataset, options.test, options.validation)
    elif project.dataset.exists():
        # nothing passed, fall back to the previously generated subsets
        project.dataset.load()
    else:
        log.error("no test/train/validation subsets found in %s, please specify a --dataset argument", project_path)
        quit()

    project.train(options.gpus)
def action_prepare(argc, argv):
    """Split `args.dataset` into the project's subsets and save them.

    Quits with an error when the project cannot be loaded or when no
    `--dataset` was given. Subsets that already exist are removed with
    `clean_dataset` before preparing again. Shuffling is enabled unless
    `args.no_shuffle` is set. `argc` is unused.
    """
    args = parse_args(argv)
    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()

    if args.dataset is None:
        # the message has no placeholder, so no extra argument may be passed:
        # the stray `args.path` used to trigger a logging formatting error
        log.error("no --dataset argument specified")
        quit()

    if prj.dataset.exists():
        log.info("removing previously generated datasets")
        clean_dataset(args.path)

    prj.dataset.do_save = True
    prj.prepare(args.dataset, args.test, args.validation, not args.no_shuffle)
def action_compare(args):
    """Evaluate two projects on one dataset and print how the second differs.

    Both projects in `args.projects_paths` are loaded and must share the same
    model input and output shapes, otherwise an error is logged and the
    program quits. The first project prepares `args.dataset` (with 0.0 test
    and validation ratios) and both models are evaluated on it through
    `accuracy_for`. For each model after the first, a table of report deltas
    and a confusion-matrix table are printed, followed by the improvement and
    regression totals. When `args.to_json` is set the collected diffs are
    dumped to that file.
    """
    metrics = {}
    projects = {
        args.projects_paths[0]: None,
        args.projects_paths[1]: None,
    }
    ref = None
    inp_shape = None
    out_shape = None

    for path in projects:
        prj = Project(path)
        err = prj.load()
        if err is not None:
            log.error("error while loading project %s: %s", path, err)
            quit()

        if inp_shape is None:
            inp_shape = prj.model.input_shape
        elif inp_shape != prj.model.input_shape:
            log.error("model %s input shape is %s, expected %s", path, prj.model.input_shape, inp_shape)
            quit()

        if out_shape is None:
            out_shape = prj.model.output_shape
        elif out_shape != prj.model.output_shape:
            log.error("model %s output shape is %s, expected %s", path, prj.model.output_shape, out_shape)
            quit()

        # the first project loaded is the reference one
        if ref is None:
            ref = prj

        projects[path] = prj
        metrics[path] = None

    ref.prepare(args.dataset, 0.0, 0.0)

    log.info("evaluating %d models on %d samples ...", len(projects), len(ref.dataset.X))
    for path, prj in projects.items():
        # TODO: Run in parallel?
        log.debug("running %s ...", path)
        metrics[path] = prj.accuracy_for(ref.dataset.X, ref.dataset.Y, repo_as_dict=True)

    prev = None
    for path, m in metrics.items():
        if prev is None:
            prev = m
            continue

        ref_repo, ref_cm = prev
        new_repo, new_cm = m
        diffs = {'report': [], 'cm': [], 'cm_stats': {}}

        # report table: only values that changed are listed
        table = [["Name", "Ref", "New", "Delta"]]
        for label, ref_run in ref_repo.items():
            for name, ref_value in ref_run.items():
                new_value = new_repo[label][name]
                if new_value != ref_value:
                    delta = new_value - ref_value
                    sign, fn = ('+', green) if delta >= 0 else ('', red)
                    diffs['report'].append({
                        'name': '%s / %s' % (label, name),
                        'delta': delta,
                    })
                    table.append([
                        "%s / %s" % (label, name),
                        "%.2f" % ref_value,
                        "%.2f" % new_value,
                        fn("%s%.2f" % (sign, delta)),
                    ])
        print("")
        print(AsciiTable(table).table)

        heads = [""] + ["class %d" % i for i in range(0, ref_cm.shape[0])]
        table = [heads]
        total = 0
        impr = 0
        regr = 0
        for i in range(0, ref_cm.shape[0]):
            row = ["class %d" % i]
            row_diffs = []
            for j in range(0, ref_cm.shape[1]):
                ref_v = ref_cm[i][j]
                new_v = new_cm[i][j]
                total = total + new_v
                delta = new_v - ref_v

                if ref_v != new_v:
                    sign = '+' if delta >= 0 else ''

                    # on the diagonal more is better, off the diagonal less is
                    if i == j:
                        fn = green if delta >= 0 else red
                    else:
                        fn = red if delta >= 0 else green

                    if fn == green:
                        impr += abs(delta)
                    else:
                        regr += abs(delta)

                    cell = fn("%d (%s%d)" % (new_v, sign, delta))
                else:
                    cell = "%d" % ref_v

                row.append(cell)
                row_diffs.append(delta)

            diffs['cm'].append(row_diffs)
            table.append(row)

        print("")
        print(AsciiTable(table).table)

        # a confusion matrix summing to zero (no samples) would otherwise
        # raise ZeroDivisionError; report 0% in that case
        denom = float(total)
        impr_perc = impr / denom * 100.0 if denom else 0.0
        regr_perc = regr / denom * 100.0 if denom else 0.0

        diffs['cm_stats'] = {
            'improvements': {
                'total': impr,
                'perc': impr_perc
            },
            'regressions': {
                'total': regr,
                'perc': regr_perc
            }
        }

        print("")
        print("Improvements: %d ( %.2f %% )" % (impr, impr_perc))
        print("Regressions : %d ( %.2f %% )" % (regr, regr_perc))

        if args.to_json is not None:
            print("")
            log.info("creating %s ...", args.to_json)
            with open(args.to_json, 'w+') as fp:
                json.dump(diffs, fp, default=default)
def action_clean(argc, argv):
    """Call `Project.clean` on `argv[0]`, forwarding the parsed `all` flag.

    Calls `usage()` when no argument was given; the arguments after the
    project path are parsed with `parse_args`.
    """
    if argc < 1:
        usage()

    options = parse_args(argv[1:])
    project_path = argv[0]
    Project.clean(project_path, options.all)
def action_explore(argc, argv):
    """Run the requested exploration steps on a project's dataset.

    Depending on the parsed flags this computes correlations, a PCA, feature
    statistics and/or a clustering (`kmeans` or `dbscan`), and shows the
    resulting views. `--all` turns every step on, including the 3D views.
    Quits with an error when no step was requested, the project cannot be
    loaded, the dataset is not flat, the clustering algorithm is unknown, or
    the clustering yields a single label.

    Updates the globals `prj`, `nrows`, `ncols`, `attributes` and, when
    `args.workers` is not 0, `n_jobs`. `argc` is unused.
    """
    global prj, nrows, ncols, attributes, n_jobs

    args = parse_args(argv)

    if args.all:
        args.pca = True
        args.correlations = True
        args.stats = True
        args.cluster = True
        args.D3 = True

    # -1 means one worker per CPU, 0 keeps the current module-level n_jobs
    if args.workers == -1:
        import multiprocessing
        n_jobs = multiprocessing.cpu_count()
    elif args.workers != 0:
        n_jobs = args.workers
    log.info("using %d workers" % n_jobs)

    if args.nclusters and not args.cluster:
        log.warning(
            "number of clusters specified but clustering won't be performed")

    if not (args.pca or args.correlations or args.stats or args.cluster):
        log.error("No exploration action was specified")
        print("")
        parse_args(["-h"])
        quit()

    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()

    prj.prepare(args.dataset, 0.0, 0.0)
    if not prj.dataset.is_flat:
        log.error("data exploration can only be applied to flat inputs")
        quit()

    X, y = prj.dataset.subsample(args.ratio)
    nrows, ncols = X.shape
    attributes = get_attributes(args.attributes, ncols)

    if args.correlations:
        log.info("computing correlations of each feature with target")
        corr = compute_correlations_with_target(X, y)
        print_target_correlation_table(corr)
        log.info("computing features crosscorrelation")
        corr = calculate_corr(X)
        print_correlation_table(corr, min_corr=0.7)
        views.correlation_matrix(prj, corr, args.img_only)

    if args.pca:
        log.info("computing pca")
        pca = calculate_pca(X)
        log.info("computing pca projection")
        views.pca_projection(prj, pca, X, y, False)
        if args.D3:
            views.pca_projection(prj, pca, X, y, args.D3)
        views.pca_explained_variance(prj, pca, args.img_only)

    if args.stats:
        log.info("computing features stats")
        print_stats_table(X)

    inertia = False
    if args.cluster:
        if args.cluster_alg == 'kmeans':
            cluster_alg = kmeans_clustering
            if not args.nclusters:
                # default k: number of distinct argmax labels in y
                args.nclusters = len(set(np.argmax(y, axis=1)))
            args.nclusters = int(args.nclusters)
            if args.nmaxclusters:
                log.info(
                    "performing inertia analysis with clusters in the range (%d, %d)"
                    % (args.nclusters, args.nmaxclusters))
                inertia = True
                n_clusters_analysis(X, args.nmaxclusters, args.nclusters)
            else:
                log.info("computing kmeans clustering with k=%d" % args.nclusters)
        elif args.cluster_alg == 'dbscan':
            cluster_alg = dbscan_clustering
            if not args.nclusters:
                args.nclusters = 2
            log.info("computing dbscan clustering with eps=%f" % args.nclusters)
            if args.nmaxclusters:
                log.warning(
                    "nmax specified but not used. Inertia analysis only available for Kmeans."
                )
        else:
            # without this branch cluster_alg stays unbound and the call
            # below fails with a NameError
            log.error("unknown clustering algorithm: %s", args.cluster_alg)
            quit()

        # the cluster plot needs a pca, compute it if the pca step was skipped
        if not args.pca and not inertia:
            log.info("computing pca to plot clusters")
            pca = calculate_pca(X)

        if not inertia:
            ca = cluster_alg(X, args.nclusters)
            if len(set(ca.labels_)) == 1:
                log.error("clustering failed. Check input parameter.")
                quit()
            views.plot_clusters(prj, pca, X, y, ca, False)
            if args.D3:
                views.plot_clusters(prj, pca, X, y, ca, args.D3)

    views.show(args.img_only)
def action_encode(argc, argv):
    """Encode one or many inputs with a project and append them to a file.

    `args.path` may be a directory (files matched by `args.filter`, searched
    in each subfolder when labeling is `auto`, otherwise directly in the
    directory), a file holding one input per line (`args.multi`), or a single
    input. Every input is paired with the label returned by `label_of`, then
    queued on a `TaskQueue` running `parse_input`; results travel through a
    multiprocessing queue to a single `appender` process that receives
    `args.output`.

    Quits with an error when `args.path` does not exist or the project at
    `args.project` cannot be loaded. `argc` is unused.
    """
    args = parse_args(argv)
    if not os.path.exists(args.path):
        log.error("%s does not exist.", args.path)
        quit()

    prj = Project(args.project)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()

    args.label = args.label.strip().lower()
    log.info("using %s labeling", 'auto' if args.label == 'auto' else 'hardcoded')

    inputs = []
    if os.path.isdir(args.path):
        in_files = []
        if args.label == 'auto':
            # the label is inferred from the dirname, so we expect
            # args.path to contain multiple subfolders
            for subfolder in glob.glob(os.path.join(args.path, "*")):
                log.info("enumerating %s ...", subfolder)
                in_filter = os.path.join(subfolder, args.filter)
                in_sub = glob.glob(in_filter)
                n_sub = len(in_sub)
                if n_sub > 0:
                    log.info("collected %d inputs from %s", n_sub, subfolder)
                    in_files.extend(in_sub)
        else:
            # grab files directly from args.path
            in_filter = os.path.join(args.path, args.filter)
            in_files.extend(glob.glob(in_filter))
            log.info("collected %d inputs from %s", len(in_files), args.path)

        log.info("labeling %d files ...", len(in_files))
        for filepath in in_files:
            if os.path.isfile(filepath):
                inputs.append((label_of(args, filepath), filepath))

    elif args.multi:
        log.info("parsing multiple inputs from %s ...", args.path)
        label = label_of(args, args.path)
        with open(args.path, 'rt') as fp:
            for line in fp:
                inputs.append((label, line))

    else:
        label = label_of(args, args.path)
        inputs.append((label, args.path))

    # one encoding queue that pushes to another queue that centralizes
    # append operations to a single writer process
    num_in = len(inputs)
    enc_q = TaskQueue('encoding', args.workers)
    res_q = multiprocessing.Queue()
    app_p = multiprocessing.Process(target=appender, args=(args.output, num_in, res_q))

    # open the output file and start waiting for lines to append
    app_p.start()
    try:
        log.info("encoding %d inputs to %s ...", num_in, args.output)
        for (y, x) in inputs:
            enc_q.add_task(parse_input, prj, x, y, res_q, args.delete)

        # wait for all inputs to be encoded
        enc_q.join()
    finally:
        # Always send the end marker, even when queuing or encoding failed:
        # otherwise the writer process would never be told to stop.
        # let the writer know there are no more inputs to read
        res_q.put(None)
        # wait for the writer to finish
        app_p.join()
def action_clean(argc, argv):
    """Call `Project.clean` on the parsed path, forwarding the `all` flag.

    `argc` is unused.
    """
    options = parse_args(argv)
    project_path = options.path
    clean_everything = options.all
    Project.clean(project_path, clean_everything)
def action_relevance(argc, argv):
    """Rank dataset attributes by how much zeroing each one changes precision.

    The project's model is evaluated once on a subsample of `args.dataset`
    as a reference, then once per attribute with that attribute zeroed (and
    restored afterwards). The delta is the reference weighted-average
    precision minus the new one; relevance is each delta as a percentage of
    the sum of all deltas. Results are printed as a table sorted by absolute
    delta, and dumped to `args.to_json` when that is set.

    Quits with an error when the project cannot be loaded or is not trained.
    `argc` is unused.
    """
    args = parse_args(argv)
    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()
    elif not prj.is_trained():
        log.error("no trained Keras model found for this project")
        quit()

    prj.prepare(args.dataset, 0.0, 0.0)

    X, y = prj.dataset.subsample(args.ratio)
    nrows, ncols = X.shape if prj.dataset.is_flat else (X[0].shape[0], len(X))
    attributes = get_attributes(args.attributes, ncols)

    log.info("computing relevance of %d attributes on %d samples ...", ncols, nrows)

    start = time.time()
    ref_accu, _ = prj.accuracy_for(X, y, repo_as_dict=True)
    elapsed = time.time() - start
    # a coarse clock can report zero elapsed time; avoid dividing by it
    speed = (1.0 / elapsed) * nrows if elapsed > 0 else float('inf')

    ref_precision = ref_accu['weighted avg']['precision']
    deltas = []
    tot = 0

    for col in range(0, ncols):
        log.info(
            "[%.2f evals/s] computing relevance for attribute [%d/%d] %s ...",
            speed, col + 1, ncols, attributes[col])

        backup = zeroize_feature(X, col, prj.dataset.is_flat)

        start = time.time()
        accu, _ = prj.accuracy_for(X, y, repo_as_dict=True)
        elapsed = time.time() - start
        speed = (1.0 / elapsed) * nrows if elapsed > 0 else float('inf')

        delta = ref_precision - accu['weighted avg']['precision']
        tot += delta
        deltas.append((col, delta))

        restore_feature(X, col, backup, prj.dataset.is_flat)

    # sort relevances by absolute value
    deltas = sorted(deltas, key=lambda x: abs(x[1]), reverse=True)

    # Positive and negative deltas can cancel out exactly; the percentages
    # are undefined then, so report 0.0 instead of raising ZeroDivisionError.
    if tot == 0 and any(d != 0.0 for _, d in deltas):
        log.warning("relevance deltas sum to zero, percentages can't be computed")

    rels = []
    num_zero = 0
    table = [("Column", "Feature", "Relevance")]

    for col, d in deltas:
        colname = attributes[col]
        rel = {"attribute": colname, "index": col, "relevance": 0.0}

        if d != 0.0:
            relevance = (d / tot) * 100.0 if tot != 0 else 0.0
            row = ("%d" % col, attributes[col], "%.2f%%" % relevance)
            # negative relevances are printed in red
            row = ["\033[31m%s\033[0m" % e for e in row] if relevance < 0.0 else row
            table.append(row)
            rel['relevance'] = relevance
        else:
            num_zero += 1

        rels.append(rel)

    print("")
    print(AsciiTable(table).table)
    print("")

    if num_zero > 0:
        log.info("%d features have 0 relevance.", num_zero)

    if args.to_json is not None:
        print("")
        log.info("creating %s ...", args.to_json)
        with open(args.to_json, 'w+') as fp:
            json.dump(rels, fp, default=default)
def action_compare(argc, argv):
    """Evaluate the projects `argv[0]` and `argv[1]` on one dataset and print the deltas.

    Calls `usage()` when fewer than four arguments were given; the arguments
    after the two paths are parsed with `parse_args`. Both models must share
    input and output shapes, otherwise an error is logged and the program
    quits. The first project prepares `args.dataset`, both are evaluated on
    it, and a report table plus a confusion matrix table are printed.
    """
    if argc < 4:
        usage()

    args = parse_args(argv[2:])
    metrics = {}
    projects = {argv[0]: None, argv[1]: None}

    ref = None
    inp_shape = None
    out_shape = None

    for path in projects:
        candidate = Project(path)
        load_error = candidate.load()
        if load_error is not None:
            log.error("error while loading project %s: %s", path, load_error)
            quit()

        if inp_shape is None:
            inp_shape = candidate.model.input_shape
        elif inp_shape != candidate.model.input_shape:
            log.error("model %s input shape is %s, expected %s", path, candidate.model.input_shape, inp_shape)
            quit()

        if out_shape is None:
            out_shape = candidate.model.output_shape
        elif out_shape != candidate.model.output_shape:
            log.error("model %s output shape is %s, expected %s", path, candidate.model.output_shape, out_shape)
            quit()

        # the first project that loads becomes the reference
        if ref is None:
            ref = candidate

        projects[path] = candidate
        metrics[path] = None

    ref.prepare(args.dataset, 0.0, 0.0)

    log.info("evaluating %d models on %d samples ...", len(projects), len(ref.dataset.X))
    for path, candidate in projects.items():
        # TODO: Run in parallel?
        log.debug("running %s ...", path)
        metrics[path] = candidate.accuracy_for(ref.dataset.X, ref.dataset.Y, repo_as_dict=True)

    baseline = None
    for current in metrics.values():
        if baseline is None:
            baseline = current
            continue

        ref_repo, ref_cm = baseline
        new_repo, new_cm = current

        # report: one line per metric whose value changed
        report_rows = [["Name", "Ref", "New", "Delta"]]
        for label, ref_run in ref_repo.items():
            for name, ref_value in ref_run.items():
                new_value = new_repo[label][name]
                if new_value == ref_value:
                    continue
                delta = new_value - ref_value
                sign = '+' if delta >= 0 else ''
                report_rows.append([
                    "%s / %s" % (label, name),
                    "%.2f" % ref_value,
                    "%.2f" % new_value,
                    "%s%.2f" % (sign, delta),
                ])

        print("")
        print("Report:")
        print(AsciiTable(report_rows).table)

        # confusion matrix: changed cells show the new value and the delta
        n_rows = ref_cm.shape[0]
        cm_rows = [[""] + ["class %d" % i for i in range(0, n_rows)]]
        for i in range(0, n_rows):
            row = ["class %d" % i]
            for j in range(0, ref_cm.shape[1]):
                ref_v = ref_cm[i][j]
                new_v = new_cm[i][j]
                if ref_v == new_v:
                    cell = "%d" % ref_v
                else:
                    delta = new_v - ref_v
                    sign = '+' if delta >= 0 else ''
                    cell = "%d (%s%d)" % (new_v, sign, delta)
                row.append(cell)
            cm_rows.append(row)

        print("")
        print("Confusion matrix:")
        print(AsciiTable(cm_rows).table)
def action_relevance(argc, argv):
    """Rank dataset attributes by relevance, one queued inference per attribute.

    The model is evaluated once on a subsample of `args.dataset` as a
    reference, then `run_inference_without` is queued for every attribute
    with the reference weighted-average value of `args.metric`. After the
    queue is joined, the global `deltas` are sorted by absolute value and
    each one is reported as a percentage of the global `tot`. Results are
    printed as a table and dumped to `args.to_json` when that is set.

    Quits with an error when the project cannot be loaded or is not trained.
    `argc` is unused.

    NOTE(review): `deltas` and `tot` are read here but never filled in;
    presumably `run_inference_without` updates them -- confirm.
    """
    global prj, deltas, tot, start, speed, nrows, ncols, attributes

    args = parse_args(argv)
    prj = Project(args.path)
    err = prj.load()
    if err is not None:
        log.error("error while loading project: %s", err)
        quit()
    elif not prj.is_trained():
        log.error("no trained Keras model found for this project")
        quit()

    prj.prepare(args.dataset, 0.0, 0.0)

    # one single worker in blocking mode = serial
    if args.workers == 0:
        args.workers = 1

    X, y = prj.dataset.subsample(args.ratio)
    nrows, ncols = X.shape if prj.dataset.is_flat else (X[0].shape[0], len(X))
    attributes = get_attributes(args.attributes, ncols)
    queue = TaskQueue('relevance', num_workers=args.workers, blocking=True)

    if args.workers == 1:
        log.info("computing relevance of %d attributes on %d samples using '%s' metric (slow mode) ...", ncols, nrows, args.metric)
    else:
        log.info("computing relevance of %d attributes on %d samples using '%s' metric (parallel with %d workers) ...", ncols, nrows, args.metric, queue.num_workers)

    start = time.time()
    ref_accu, _ = prj.accuracy_for(X, y, repo_as_dict=True)
    elapsed = time.time() - start
    # a coarse clock can report zero elapsed time; avoid dividing by it
    speed = (1.0 / elapsed) * nrows if elapsed > 0 else float('inf')

    ref_metric = ref_accu['weighted avg'][args.metric]
    for col in range(0, ncols):
        queue.add_task(
            run_inference_without,
            X, y, col, prj.dataset.is_flat,
            ref_metric,
            args.metric)

    # wait for all inferences to finish
    queue.join()

    # sort relevances by absolute value
    deltas = sorted(deltas, key=lambda x: abs(x[1]), reverse=True)

    # Positive and negative deltas can cancel out exactly; the percentages
    # are undefined then, so report 0.0 instead of raising ZeroDivisionError.
    if tot == 0 and any(d != 0.0 for _, d in deltas):
        log.warning("relevance deltas sum to zero, percentages can't be computed")

    rels = []
    num_zero = 0
    table = [("Column", "Feature", "Relevance")]

    for col, d in deltas:
        colname = attributes[col]
        rel = {
            "attribute": colname,
            "index": col,
            "relevance": 0.0
        }

        if d != 0.0:
            relevance = (d / tot) * 100.0 if tot != 0 else 0.0
            row = ("%d" % col, attributes[col], "%.2f%%" % relevance)
            # negative relevances are printed in red
            row = ["\033[31m%s\033[0m" % e for e in row] if relevance < 0.0 else row
            table.append(row)
            rel['relevance'] = relevance
        else:
            num_zero += 1

        rels.append(rel)

    print("")
    print(AsciiTable(table).table)
    print("")

    if num_zero > 0:
        log.info("%d features have 0 relevance.", num_zero)

    if args.to_json is not None:
        print("")
        log.info("creating %s ...", args.to_json)
        with open(args.to_json, 'w+') as fp:
            json.dump(rels, fp, default=default)
def action_clean(args):
    """Call `Project.clean` on `args.project_path`, forwarding `args.all`."""
    project_path = args.project_path
    clean_everything = args.all
    Project.clean(project_path, clean_everything)
def action_compare(argc, argv):
    """Evaluate two projects on one dataset and print how the second differs.

    The projects at `args.path_1` and `args.path_2` are loaded and must share
    the same model output shape, otherwise an error is logged and the
    program quits. Once the second project is loaded, a reduced copy of the
    dataset is generated and passed to `are_preparation_equal`; when that
    returns true the second project reuses copies of the first one's
    prepared data instead of preparing `args.dataset` again. Each model is
    evaluated through `accuracy_for`, then a table of report deltas and a
    confusion-matrix table are printed, followed by improvement and
    regression totals. When `args.to_json` is set the collected diffs are
    dumped to that file. `argc` is unused.
    """
    args = parse_args(argv)
    metrics = {}
    projects = {
        args.path_1: None,
        args.path_2: None,
    }
    ref = None
    out_shape = None
    is_prepared = None
    prjs = []

    for path in projects:
        prj = Project(path)
        err = prj.load()
        if err is not None:
            log.error("error while loading project %s: %s", path, err)
            quit()

        prjs.append(prj)
        if not is_prepared:
            # first project: nothing to compare against yet
            is_prepared = True
        else:
            small_dataset = generate_reduced_dataset(args.dataset)
            try:
                are_equal = are_preparation_equal(prjs, small_dataset)
            finally:
                # remove the temporary file even when the comparison fails
                log.info("deleting temporal file %s", small_dataset)
                os.remove(small_dataset)

        if out_shape is None:
            out_shape = prj.model.output_shape
        elif out_shape != prj.model.output_shape:
            log.error("model %s output shape is %s, expected %s", path, prj.model.output_shape, out_shape)
            quit()

        projects[path] = prj

    for prj, path in zip(prjs, projects):
        # NOTE(review): the project is loaded again here rather than reusing
        # the instance from `prjs`; kept as is because the instances in
        # `prjs` were handed to are_preparation_equal -- confirm whether a
        # fresh instance is required.
        prj = Project(path)
        err = prj.load()
        if err is not None:
            log.error("error while loading project %s: %s", path, err)
            quit()

        if ref is None:
            prj.prepare(args.dataset, 0, 0, False)
            ref = prj
        else:
            if are_equal:
                log.info("Projects use same prepare.py file ...")
                prj.dataset.X, prj.dataset.Y, prj.dataset.n_labels = (
                    ref.dataset.X.copy(),
                    ref.dataset.Y.copy(),
                    ref.dataset.n_labels,
                )
            else:
                log.info(
                    "Projects use different prepare.py files, reloading dataset ..."
                )
                prj.prepare(args.dataset, 0., 0., False)

        # TODO: Run in parallel?
        log.debug("running %s ...", path)
        metrics[path] = prj.accuracy_for(prj.dataset.X, prj.dataset.Y, repo_as_dict=True)

    prev = None
    for path, m in metrics.items():
        if prev is None:
            prev = m
            continue

        ref_repo, ref_cm = prev
        new_repo, new_cm = m
        diffs = {'report': [], 'cm': [], 'cm_stats': {}}

        # report table: only values that changed are listed
        table = [["Name", "Ref", "New", "Delta"]]
        for label, ref_run in ref_repo.items():
            for name, ref_value in ref_run.items():
                new_value = new_repo[label][name]
                if new_value != ref_value:
                    delta = new_value - ref_value
                    sign, fn = ('+', green) if delta >= 0 else ('', red)
                    diffs['report'].append({
                        'name': '%s / %s' % (label, name),
                        'delta': delta,
                    })
                    table.append([
                        "%s / %s" % (label, name),
                        "%.2f" % ref_value,
                        "%.2f" % new_value,
                        fn("%s%.2f" % (sign, delta)),
                    ])
        print("")
        print(AsciiTable(table).table)

        heads = [""] + ["class %d" % i for i in range(0, ref_cm.shape[0])]
        table = [heads]
        total = 0
        impr = 0
        regr = 0
        for i in range(0, ref_cm.shape[0]):
            row = ["class %d" % i]
            row_diffs = []
            for j in range(0, ref_cm.shape[1]):
                ref_v = ref_cm[i][j]
                new_v = new_cm[i][j]
                total = total + new_v
                delta = new_v - ref_v

                if ref_v != new_v:
                    sign = '+' if delta >= 0 else ''

                    # on the diagonal more is better, off the diagonal less is
                    if i == j:
                        fn = green if delta >= 0 else red
                    else:
                        fn = red if delta >= 0 else green

                    if fn == green:
                        impr += abs(delta)
                    else:
                        regr += abs(delta)

                    cell = fn("%d (%s%d)" % (new_v, sign, delta))
                else:
                    cell = "%d" % ref_v

                row.append(cell)
                row_diffs.append(delta)

            diffs['cm'].append(row_diffs)
            table.append(row)

        print("")
        print(AsciiTable(table).table)

        # a confusion matrix summing to zero (no samples) would otherwise
        # raise ZeroDivisionError; report 0% in that case
        denom = float(total)
        impr_perc = impr / denom * 100.0 if denom else 0.0
        regr_perc = regr / denom * 100.0 if denom else 0.0

        diffs['cm_stats'] = {
            'improvements': {
                'total': impr,
                'perc': impr_perc
            },
            'regressions': {
                'total': regr,
                'perc': regr_perc
            }
        }

        print("")
        print("Improvements: %d ( %.2f %% )" % (impr, impr_perc))
        print("Regressions : %d ( %.2f %% )" % (regr, regr_perc))

        if args.to_json is not None:
            print("")
            log.info("creating %s ...", args.to_json)
            with open(args.to_json, 'w+') as fp:
                json.dump(diffs, fp, default=default)