def filter_samples(blacklist, data, name):
    original_size = len(data)
    data.examples = [example for example in data if example.id not in blacklist]
    Logger.debug("{} size {} -> {}".format(name, original_size, len(data)))
def __accuracy(self, base_model, reject_threshold, valid_iter, adversarial=False):
    valid_stats = base_model.accuracy_with_reject(
        valid_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        reject_threshold,
    )
    if adversarial:
        Logger.start_scope("adversarial accuracy")
        stats = self.rename_adversary.adversarial_accuracy(
            base_model,
            valid_iter,
            [
                AdversaryBatchIter(
                    self.subtree_adversary,
                    base_model,
                    AdversaryBatchIter(
                        self.rename_adversary, base_model, valid_iter, num_samples=2
                    ),
                ),
                AdversaryBatchIter(
                    self.rename_adversary, base_model, valid_iter, num_samples=40
                ),
            ],
            threshold=reject_threshold,
            approximate=True,
        )
        Logger.end_scope()
        return stats.is_sound()

    valid_prec = valid_stats["mask_valid_noreject_acc"]
    return valid_prec == 100.0
def set_base_predictions(
    self,
    base_correct: torch.Tensor,
    base_preds: torch.Tensor,
    base_y=None,
    batch_ids=None,
):
    self.base_correct = base_correct
    self.base_preds = base_preds
    self.base_reject_mask = base_preds == self.reject_token_id
    self.base_y = base_y
    self.correct = {
        AdvType.SOUND_PRECISE: base_correct.clone(),
        AdvType.SOUND: base_correct.clone(),
        # AdvType.UNSOUND: torch.zeros_like(base_correct, dtype=torch.bool),
    }
    self.reject = {
        AdvType.SOUND_PRECISE: self.base_reject_mask.clone(),
        AdvType.SOUND: self.base_reject_mask.clone(),
        # AdvType.UNSOUND: torch.zeros_like(self.base_reject_mask, dtype=torch.bool),
    }
    self.incorrect = {AdvType.UNSOUND: ~(self.base_correct | self.base_reject_mask)}
    Logger.debug(
        "correct: {}, reject: {}, incorrect: {}, total: {}".format(
            self.base_correct.sum().item(),
            self.base_reject_mask.sum().item(),
            (~(self.base_correct | self.base_reject_mask)).sum().item(),
            self.base_correct.numel(),
        )
    )
    self.batch_ids = batch_ids
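# Illustrative, standalone sketch (hypothetical tensors, not part of the pipeline) of
# the bookkeeping in set_base_predictions: every position falls into exactly one of
# three buckets -- correct, rejected, or incorrect -- so the masks partition the batch
# and the counts logged above always sum to base_correct.numel().
def demo_prediction_partition():
    import torch

    reject_token_id = 0
    base_y = torch.tensor([3, 1, 0, 2])
    base_preds = torch.tensor([3, 2, 0, 0])  # the last two positions abstain
    base_correct = (base_preds == base_y) & (base_preds != reject_token_id)
    reject_mask = base_preds == reject_token_id
    incorrect = ~(base_correct | reject_mask)
    # each position is counted exactly once across the three masks
    assert (base_correct.int() + reject_mask.int() + incorrect.int()).eq(1).all()
    return base_correct, reject_mask, incorrect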
def filter_size(self, dataset, min_size=100):
    original_size = len(dataset)
    dataset.examples = [
        example for example in dataset if len(example.target) > min_size
    ]
    Logger.debug("Filter size {} -> {}".format(original_size, len(dataset)))
def __save_paths(args, paths, name):
    num_entries = 0
    with gzip.open(os.path.join(args.out_dir, name + ".json.gz"), "wb") as f_out:
        for path in paths:
            optimized = False
            if os.path.exists(path + ".opt"):
                optimized = True
                path = path + ".opt"
            if is_file_empty(path):
                continue
            print(num_entries, path)
            with gzip.open(path, "rb") as f:
                for entry in json.loads(f.read()):
                    if not (args.include_js or entry["filename"].endswith(".ts")):
                        continue
                    num_entries += 1
                    if not optimized:
                        entry["dependencies"] = entry["source_files"]
                        del entry["source_files"]
                    assert None not in entry["dependencies"]
                    f_out.write(json.dumps(entry).encode("utf-8"))
                    f_out.write("\n".encode("utf-8"))
    Logger.debug("{}, num files: {}".format(name, num_entries))
def save_model(model: NeuralModelBase, args, model_id):
    import torch

    checkpoint_file = os.path.join(checkpoint_dir(args), checkpoint_name(args, model_id))
    Logger.debug("Saving model to {}".format(checkpoint_file))
    torch.save(model.state_dict(), checkpoint_file)
def load_models(
    self,
    make_model,
    dataset: Dataset,
    adversary,
    subtree_adversary,
    args,
    model_id,
    max_models=None,
    last_base=True,
):
    self.models = []
    while max_models is None or len(self.models) < max_models:
        model = RobustModel(
            make_model,
            dataset,
            idx=len(self.models),
            rename_adversary=adversary,
            subtree_adversary=subtree_adversary,
            base_model=last_base and len(self.models) + 1 == max_models,
        )
        if not model.load(args, model_id):
            break
        self.models.append(model)
    Logger.debug("Loaded {} models".format(len(self.models)))
def train_nonempty_model(
    model_fn,
    dataset: Dataset,
    train_iter,
    valid_iter,
    num_epochs=10,
    max_steps=10,
    step=0.1,
):
    model = model_fn()
    train_model(model, dataset, num_epochs, train_iter, valid_iter, target_o=1.1)
    thresholds = get_rejection_thresholds(
        valid_iter, model, dataset, [0.98, 0.95, 0.9, 0.8]
    )
    thresholds = [t for t in thresholds if t.h is not None and t.size > 100]
    if not thresholds:
        # fall back to thresholds computed on the training set
        thresholds = get_rejection_thresholds(
            train_iter, model, dataset, [0.98, 0.95, 0.9, 0.8]
        )
        thresholds = [t for t in thresholds if t.h is not None and t.size > 100]
    if thresholds:
        Logger.debug("Rejection Threshold: {}".format(thresholds[0]))
        model.accuracy_with_reject(
            valid_iter, dataset.TARGET, dataset.reject_token_id, thresholds[0].h
        )
        return model, thresholds[0]
    return None, None
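# Hypothetical usage sketch for train_nonempty_model (assumes `make_model`, `dataset`,
# `train_iter`, and `valid_iter` as constructed elsewhere in this file). It returns a
# model plus the first rejection threshold covering more than 100 samples, or
# (None, None) when no such threshold exists on either the validation or train split:
#
#   model, threshold = train_nonempty_model(make_model, dataset, train_iter, valid_iter)
#   if model is not None:
#       Logger.debug("threshold h={}, covered samples={}".format(threshold.h, threshold.size))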
def print(self, dataset=None, edge_gen=None):
    Logger.debug("EdgeFilter")
    cumsum = 0
    cumsum_seen = 0
    total_seen = sum(self.seen_counts.values())
    for feature, cost, count in zip(self.valid_features, self.costs, self.counts):
        cumsum += count
        cumsum_seen += self.seen_counts[feature]
        if dataset is not None or edge_gen is not None:
            node_type_u, node_type_v, edge_type = feature.split("_")
            node_type_u = (
                dataset.TYPES.vocab.itos[int(node_type_u)]
                if dataset is not None
                else node_type_u
            )
            node_type_v = (
                dataset.TYPES.vocab.itos[int(node_type_v)]
                if dataset is not None
                else node_type_v
            )
            edge_type = (
                edge_gen.id_to_edge_type[int(edge_type)]
                if edge_gen is not None
                else edge_type
            )
            feature = "{}_{}_{}".format(node_type_u, node_type_v, edge_type)
        Logger.debug(
            "\t{:>40s} cost: {:10.0f} ({:5.2f}%), count: {:10d} ({:5.2f}%), cumsum: {:6.2f}%, seen: {:6.2f}%".format(
                feature,
                cost,
                cost * 100.0 / sum(self.costs),
                count,
                count * 100.0 / sum(self.counts),
                cumsum * 100.0 / sum(self.counts),
                (cumsum_seen * 100.0 / total_seen) if total_seen != 0 else 0,
            )
        )
def main():
    args = parse_args()
    if not args.include_values:
        # When values are not included, renaming is a no-op
        args.n_renames = 0
    if args.adv_mode != "RANDOM" or args.train_adv_mode != "RANDOM":
        args.dot_product_embedding = True
    args.tag = "{}/robust".format(args.tag)

    """ Debug Initialization """
    Logger.init(args.log_file)
    Logger.debug(" ".join(sys.argv))
    Random.seed(args.seed)

    USE_CUDA = torch.cuda.is_available() and args.use_cuda
    device = torch.device("cuda" if USE_CUDA else "cpu")

    """ Dataset Loading and Preprocessing """
    dataset = Dataset(
        args,
        include_edges=args.model
        in [Models.UGraphTransformer.name, Models.GCN.name, Models.GGNN.name],
    )
    dataset.remove_duplicates()
    masks = {"mask_valid": dataset.MASK_VALID}

    """ Training """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)

    def save_results(data):
        data = pd.concat(data)
        print(data)
        csv_path = os.path.join(checkpoint_dir(args), "results.csv")
        data.to_csv(csv_path, index=False, header=True)

    dfs = []
    for i in range(args.repeat):
        Random.seed(args.seed + i)
        if args.eval:
            df = eval(args, dataset, device, masks, max_models=args.max_models, model_id=i)
        else:
            df = robust_multi(args, dataset, device, masks, max_models=args.max_models, model_id=i)
        dfs.append(df)
        # persist (intermediate) results after every repetition
        save_results(dfs)
def get_rejection_thresholds(
    it, model: NeuralModelBase, dataset: Dataset, precision_thresholds: Iterable[float]
):
    num_bins = 1000
    # stats = [SimpleNamespace(correct=0, total=0) for _ in range(num_bins + 1)]
    num_correct = torch.zeros(num_bins)
    num_total = torch.zeros(num_bins)
    for batch in tqdm.tqdm(it, ncols=100, leave=False):
        _, best_predictions, reject_probs = model.predict_probs_with_reject(
            batch, reject_id=dataset.reject_token_id
        )
        mask = model.padding_mask(batch, mask_field="mask_valid")
        targets = batch.Y
        best_predictions = best_predictions.masked_select(mask)
        reject_probs = reject_probs.masked_select(mask).cpu()
        targets = targets.masked_select(mask)
        is_corrects = (targets == best_predictions).cpu()

        num_total.add_(torch.histc(reject_probs, bins=num_bins, min=0, max=1))
        num_correct.add_(
            torch.histc(
                reject_probs.masked_select(is_corrects), bins=num_bins, min=0, max=1
            )
        )

    def precision(stat):
        if stat.total == 0:
            return 0
        return stat.correct * 1.0 / stat.total

    thresholds = [SimpleNamespace(h=None, size=0) for _ in precision_thresholds]
    rolling_stat = SimpleNamespace(correct=0, total=0)
    for i, correct, total in zip(
        itertools.count(), num_correct.numpy(), num_total.numpy()
    ):
        for t, precision_threshold in zip(thresholds, precision_thresholds):
            if precision_threshold <= precision(rolling_stat):
                # update threshold if it's not set or the number of samples increased
                if t.h is None or t.size * 1.01 < rolling_stat.total:
                    t.h = i / float(num_bins)
                    t.size = int(rolling_stat.total)
        rolling_stat.correct += correct
        rolling_stat.total += total
    Logger.debug(
        "Thresholds: {}, sizes: {}".format(
            [t.h for t in thresholds], [t.size for t in thresholds]
        )
    )
    return thresholds
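# A self-contained sketch (illustrative only, synthetic data, not part of the
# pipeline) of the idea behind get_rejection_thresholds: bucket predictions by their
# rejection probability, then sweep the buckets in increasing order and record the
# largest cutoff whose cumulative precision still meets the requested target.
def demo_threshold_sweep():
    import torch

    num_bins = 10
    # synthetic data: 1000 predictions with random rejection scores and labels,
    # where a lower rejection score makes a correct prediction more likely
    reject_probs = torch.rand(1000)
    is_correct = torch.rand(1000) < (1.0 - reject_probs)
    totals = torch.histc(reject_probs, bins=num_bins, min=0, max=1)
    corrects = torch.histc(reject_probs.masked_select(is_correct), bins=num_bins, min=0, max=1)

    target = 0.9
    cum_correct, cum_total, best_h = 0.0, 0.0, None
    for i in range(num_bins):
        if cum_total > 0 and cum_correct / cum_total >= target:
            best_h = i / float(num_bins)  # accept predictions with score below best_h
        cum_correct += corrects[i].item()
        cum_total += totals[i].item()
    return best_h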
def load_model(model: NeuralModelBase, args, model_id):
    import torch

    checkpoint_file = os.path.join(checkpoint_dir(args), checkpoint_name(args, model_id))
    print("checkpoint_file", checkpoint_file)
    if not os.path.exists(checkpoint_file):
        return False
    Logger.debug("Loading model from {}".format(checkpoint_file))
    data = torch.load(checkpoint_file)
    model.load_state_dict(data)
    return True
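# Hypothetical round-trip sketch for save_model/load_model above (assumes `make_model`,
# `args`, and training iterators as used elsewhere in this file). Since load_model
# returns False when no checkpoint exists, callers can fall back to training:
#
#   model = make_model()
#   if not load_model(model, args, model_id=0):
#       train_model(model, dataset, args.num_epochs, train_iter, valid_iter)
#       save_model(model, args, model_id=0)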
def make_adversary(dataset: Dataset, make_iter):
    Logger.start_scope("Parsing Trees")
    trees_train_str = dataset_to_trees(dataset.dtrain, dataset.ID)
    trees_valid_str = dataset_to_trees(dataset.dvalid, dataset.ID)
    trees_test_str = dataset_to_trees(dataset.dtest, dataset.ID)
    trees_str = {**trees_train_str, **trees_valid_str, **trees_test_str}

    trees_train_num = dataset_to_trees_num(dataset.dtrain)
    trees_valid_num = dataset_to_trees_num(dataset.dvalid)
    trees_test_num = dataset_to_trees_num(dataset.dtest)
    trees_num = {**trees_train_num, **trees_valid_num, **trees_test_num}
    Logger.end_scope()

    Logger.start_scope("Indexing Trees")
    value_index = NodeValueIndex(dataset, trees_train_num)
    value_index_str = NodeValueIndexStr(dataset, trees_train_str)
    expr_gen = ExpressionGenerator(value_index_str)
    node_replacement = AdversarialNodeReplacement(value_index, dataset.fixed_value_offset)
    rules_index = node_replacement.make_rules(dataset, trees_str, trees_num)
    adversary = RenameAdversary(rules_index, dataset)
    Logger.end_scope()

    subtree_replacement = AdversarialSubtreeReplacement(expr_gen)
    subtree_rules = subtree_replacement.make_rules(dataset, trees_str, trees_num)
    subtree_adversary = SubtreeAdversary(subtree_rules, dataset, trees_str, make_iter)
    return adversary, subtree_adversary
def optimize_project(path, pool, include_js=False):
    if os.path.exists(path + ".opt"):
        return
    if is_file_empty(path):
        return
    with gzip.open(path, "rb") as f:
        entries = json.loads(f.read())
    if not include_js:
        entries = [entry for entry in entries if entry["filename"].endswith(".ts")]

    Logger.start_scope("Optimizing {}".format(path))
    Logger.debug("#Entries: {}".format(len(entries)))
    num_diffs = 0
    opt_entries = []
    for idx, entry in enumerate(pool.imap_unordered(optimize_file, entries)):
        # for idx, entry in enumerate(entries):
        #     entry = optimize_file(entry)
        sys.stderr.write("\r{}/{}".format(idx, len(entries)))
        num_diffs += entry["num_diffs"]
        opt_entries.append(entry)
    sys.stderr.write("\r{}/{}\n".format(len(entries), len(entries)))
    Logger.debug("#Diffs: {}".format(num_diffs))
    Logger.end_scope()

    print("write: ", path + ".opt")
    with gzip.open(path + ".opt", "wb") as f:
        f.write(json.dumps(opt_entries).encode("utf-8"))
def apply(
    self, it, filtered_it, mask_field="mask_valid", num_verbose=0, is_train=False
):
    num_predicted = 0
    num_shown = 0
    for batch, fbatch in zip(it, filtered_it):
        num_predicted_batch, num_shown_batch = self.apply_batch(
            batch,
            fbatch,
            mask_field=mask_field,
            num_verbose=max(0, num_verbose - num_shown),
            is_train=is_train,
        )
        num_predicted += num_predicted_batch
        num_shown += num_shown_batch
    Logger.debug("Number of predicted nodes: {}".format(num_predicted))
def init(self, in_path, out_path):
    out_path = os.path.join(out_path, self.name)
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    config_path = os.path.join(out_path, "config.json")
    if os.path.exists(config_path):
        existing_config = Config.load_from_file(config_path)
        if existing_config is not None and existing_config == self:
            Logger.debug("Dataset already preprocessed.")
            return
        else:
            Logger.debug("Configs do not match. Overwriting existing dataset.")

    DATA_LOADERS[self.loader].preprocess_dataset(in_path, out_path, self)
    self.save_to_file(config_path)
def optimize_deps(filename, deps, base_deps, ref_json, base_time):
    t = time.time()
    opt_deps = set(deps)
    removal_candidates = list(set(deps) - set(base_deps))
    random.shuffle(removal_candidates)
    opt_time = None

    queue = PriorityHeap()
    queue.add(removal_candidates)
    while len(queue) > 0:
        data = queue.pop()
        for to_remove in chunks(data, max(1, math.ceil(len(data) / 2))):
            start = time.time()
            ast_json = parse_file_server(
                filename,
                parser_name="typescript",
                data={
                    "remove_types": True,
                    "deps": sorted([d for d in opt_deps if d not in to_remove]),
                },
            )
            opt_time = time.time() - start
            assert ast_json is not None
            if ast_json == ref_json:
                print("\ttook: {}, remove: {}".format(time.time() - start, len(to_remove)))
                opt_deps.difference_update(to_remove)
            elif len(to_remove) != 1:
                print("\ttook: {}, recurse".format(time.time() - start))
                queue.add(to_remove)

    Logger.debug(
        "Original Size: #{} ({:.2f}s), Base Size: #{}, Optimized Size: #{} ({:.2f}s), Total Time: {:.2f}".format(
            len(deps),
            base_time,
            len(base_deps),
            len(opt_deps),
            opt_time,
            time.time() - t,
        )
    )
    return list(opt_deps)
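# A self-contained sketch (hypothetical names, illustrative only) of the
# divide-and-conquer minimization used by optimize_deps above: repeatedly try to drop
# half of the remaining candidates; if dropping a whole chunk breaks the predicate,
# recurse into smaller chunks. The expensive parse_file_server check is replaced here
# by an arbitrary `still_ok` predicate, which must hold for the initial set.
def demo_minimize(deps, still_ok):
    """Return a subset of `deps` for which still_ok(subset) still holds."""
    import math

    opt = set(deps)
    stack = [list(deps)]
    while stack:
        chunk = stack.pop()
        half = max(1, math.ceil(len(chunk) / 2))
        for part in (chunk[:half], chunk[half:]):
            if not part:
                continue
            if still_ok(opt - set(part)):
                opt.difference_update(part)  # the whole part was unnecessary
            elif len(part) > 1:
                stack.append(part)  # still needed as a whole? split further
    return opt

# e.g. demo_minimize(range(10), lambda s: {2, 7} <= s) keeps only {2, 7}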
def print_rejection_thresholds(it, model: NeuralModelBase, dataset: Dataset):
    num_correct = 0
    num_total = 0
    thresholds = np.arange(0.1, 1.1, 0.1)
    stats = collections.defaultdict(lambda: SimpleNamespace(correct=0, total=0))
    for batch in tqdm.tqdm(it, ncols=100, leave=False):
        _, best_predictions, reject_probs = model.predict_probs_with_reject(
            batch, reject_id=dataset.reject_token_id
        )
        mask = model.padding_mask(batch, mask_field="mask_valid")
        targets = batch.Y
        best_predictions = best_predictions.masked_select(mask)
        reject_probs = reject_probs.masked_select(mask)
        targets = targets.masked_select(mask)

        is_correct = targets == best_predictions
        num_correct += torch.sum(is_correct).item()
        num_total += targets.numel()
        for h in thresholds:
            h_mask = reject_probs <= h
            stats[h].total += torch.sum(h_mask).item()
            stats[h].correct += torch.sum(is_correct.masked_select(h_mask)).item()

    for h in thresholds:
        Logger.debug(
            "Threshold {:5.2f}: {:6d}/{:6d} ({:.2f}%)".format(
                h,
                stats[h].correct,
                stats[h].total,
                acc(stats[h].correct, stats[h].total),
            )
        )
    Logger.debug(
        "{:6d}/{:6d} ({:.2f}%)".format(num_correct, num_total, acc(num_correct, num_total))
    )
def main():
    parser = argparse.ArgumentParser(
        "Run TypeScript Type Checker on a dataset of projects"
    )
    parser.add_argument("--repos", default="data/Repos")
    parser.add_argument("--repos_cleaned", default="data/Repos-processed")
    parser.add_argument("--out_dir", default="data/out")
    # type=int so that a user-supplied value is usable as a Pool size
    parser.add_argument("--num_threads", type=int, default=12)
    parser.add_argument("--include_js", default=False, action="store_true")
    args = parser.parse_args()

    Logger.init()
    random.seed(42)
    args.repos = os.path.abspath(args.repos)
    args.repos_cleaned = os.path.abspath(args.repos_cleaned)
    args.out_dir = os.path.abspath(args.out_dir)

    paths = []
    for path in os.listdir(args.repos):
        if path == "SAP":
            continue
        for p in find_top_level_projects(os.path.join(args.repos, path)):
            paths.append((args.repos, args.repos_cleaned, p))

    if os.path.exists(args.repos_cleaned):
        shutil.rmtree(args.repos_cleaned)
    if not os.path.exists(args.repos_cleaned):
        os.makedirs(args.repos_cleaned)

    with multiprocessing.Pool(args.num_threads) as pool:
        pool.starmap(process_project, paths)

    # (optional) optimize dependencies
    # paths = glob.glob('{}/**/*.json.gz'.format(args.repos_cleaned), recursive=True)
    # with multiprocessing.Pool(args.num_threads) as pool:
    #     for path in paths:
    #         optimize_project(path, pool)

    save_dataset(args)
def __refine_model_adversarial(
    self,
    model,
    train_iter,
    valid_iter,
    adv_train_iter,
    threshold,
    min_nonabstained=500,
):
    Logger.debug("fit_adversarial")
    step = 1.0 / 4
    schedule = [
        f * 1.1 + (1 - f) * 1.02 for f in np.arange(start=1.0, stop=0.0, step=-step)
    ] + 12 * [1.02]
    num_refined_all = []
    for epoch, o in enumerate(schedule):
        model.loss_function.o = o
        Logger.debug("Epoch {}, o={}".format(epoch, o))
        num_refined = self.subtree_adversary.fit_adversarial(
            model, train_iter, adv_train_iter, threshold.h
        )
        model.accuracy_with_reject(
            valid_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            threshold.h,
        )
        # print_rejection_thresholds(valid_iter, model, self.dataset)
        thresholds = get_rejection_thresholds(
            valid_iter, model, self.dataset, [0.99, 0.98, 0.95, 0.9]
        )
        thresholds = [
            t for t in thresholds if t.h is not None and t.size > min_nonabstained * 2
        ]
        if not thresholds:
            return None
        threshold = thresholds[0]

        if num_refined == 0:
            break
        if epoch > 7 and num_refined * 3 >= sum(num_refined_all[-3:]):
            break
        num_refined_all.append(num_refined)

    thresholds = get_rejection_thresholds(
        valid_iter, model, self.dataset, [1.00, 0.99, 0.98, 0.95]
    )
    thresholds = [t for t in thresholds if t.h is not None and t.size > min_nonabstained]
    Logger.debug("Selected Threshold: {}".format(thresholds))
    assert thresholds
    return thresholds[0]
def iter_to_trees(iter) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Iter to Trees")
    trees = {}
    for batch in iter:
        batch_trees = batch_to_trees(batch)
        for tree, idx in zip(batch_trees, batch.id):
            trees[idx.item()] = tree
            sys.stderr.write("\r{}".format(len(trees)))
    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
def dataset_to_trees(dataset, ID, analyzer=None) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Dataset to Trees")
    trees = {}
    for sample in dataset:
        tree = AstTree.fromTensor(
            sample.types, sample.values, sample.depth, {"target": sample.target}
        )
        tree.analyzer = analyzer
        trees[ID.vocab.stoi[sample.id]] = tree
        sys.stderr.write("\r{}".format(len(trees)))
    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
def __train_inner(
    self,
    train_iter,
    valid_iter,
    num_epochs=10,
    train_adv_mode=AdversarialMode.RANDOM,
    min_nonabstained=500,
):
    model, threshold = train_nonempty_model(
        self.model_fn, self.dataset, train_iter, valid_iter, num_epochs=num_epochs
    )
    if model is None:
        Logger.debug("Nonempty model failed!")
        return False

    best_model = model
    best_threshold = threshold
    best_edge_filter = None
    self.__accuracy(model, threshold.h, valid_iter, adversarial=False)
    edge_filter = compute_edge_filter(
        train_iter,
        best_model,
        self.dataset,
        best_model.loss_function,
        threshold=threshold.h,
        verbose=True,
    )
    while True:
        Logger.debug("Model with #{} non-rejected predictions".format(threshold.size))
        Logger.debug("Original Edges: #{}".format(number_of_edges(train_iter)))
        train_iter = FilteredGraphIterator.from_iter(train_iter, edge_filter)
        Logger.debug("Filtered Edges: #{}".format(number_of_edges(train_iter)))
        valid_iter = FilteredGraphIterator.from_iter(valid_iter, edge_filter)

        model = self.__copy_model(model)
        threshold = self.__refine_model(
            model, train_iter, valid_iter, min_nonabstained=min_nonabstained
        )
        if threshold is None:
            break
        threshold = self.__refine_model_adversarial(
            model,
            train_iter,
            valid_iter,
            [
                self.make_rename_adversary_iter(
                    train_iter, model, train_adv_mode, num_samples=5
                ),
                self.make_adversary_iter(
                    train_iter, model, train_adv_mode, num_samples=5
                ),
            ],
            threshold,
            min_nonabstained=min_nonabstained,
        )
        if threshold is None:
            break

        best_model = model
        best_threshold = threshold
        best_edge_filter = edge_filter
        edge_filter = compute_edge_filter(
            train_iter,
            best_model,
            self.dataset,
            best_model.loss_function,
            threshold=threshold.h,
            verbose=False,
        )
        Logger.debug(
            "new edges: {} ({}), old edges: {}".format(
                len(edge_filter), len(edge_filter) * 1.04, len(best_edge_filter)
            )
        )
        if len(edge_filter) * 1.04 >= len(best_edge_filter):
            # self.accuracy(model, threshold.h, valid_iter, adversarial=True)
            break

    if best_edge_filter is None:
        Logger.debug("No Edge Filter, training base model adversarially")
        best_threshold = self.__refine_model_adversarial(
            best_model,
            train_iter,
            valid_iter,
            [
                self.make_rename_adversary_iter(train_iter, model, train_adv_mode),
                self.make_adversary_iter(train_iter, model, train_adv_mode),
            ],
            best_threshold,
            min_nonabstained=min_nonabstained,
        )

    Logger.debug("Train Accuracy:")
    train_stats = best_model.accuracy_with_reject(
        train_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        best_threshold.h,
    )
    Logger.debug("Valid Accuracy:")
    valid_stats = best_model.accuracy_with_reject(
        valid_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        best_threshold.h,
    )
    train_prec = train_stats["mask_valid_noreject_acc"]
    valid_prec = valid_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")

    self.edge_filter = best_edge_filter
    self.model = best_model
    self.threshold = best_threshold.h
    self.model.accuracy_with_reject(
        train_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        self.threshold,
    )
    return True
def solve(self, debug_info=None):
    import gurobipy as gb

    verbose = len(self.samples) > 1
    if verbose:
        Logger.debug("Number of samples: #{}".format(len(self.samples)))
    self.build_edge_types(self.samples)

    # Create optimization model
    m = gb.Model("netflow")
    timers = collections.defaultdict(Timer)
    if verbose:
        Logger.start_scope("Encoding Solver Model")
    cost = m.addVars(
        range(len(self.edge_types.values())),
        obj=1.0,
        name="cost",
        vtype=gb.GRB.INTEGER,
    )
    flows = []
    for idx, sample in enumerate(self.samples):
        timers["flow"].start()
        flow = m.addVars(
            sample.edges.keys(), name="flow_{}".format(idx), vtype=gb.GRB.INTEGER
        )
        timers["flow"].stop()
        flows.append(flow)

        # Arc-capacity constraints
        timers["cap"].start()
        m.addConstrs(
            (
                flow[i, j] <= cost[self.edge_types[e_type]]
                for (i, j), e_type in sample.edges.items()
            ),
            "cap_{}".format(idx),
        )
        timers["cap"].stop()

        # Flow-conservation constraints
        timers["node"].start()
        m.addConstrs(
            (
                flow.sum("*", j) + sample.inflow.get(j, 0) == flow.sum(j, "*")
                for j in sample.nodes
            ),
            "node_{}".format(idx),
        )
        timers["node"].stop()

    if verbose:
        for key, timer in timers.items():
            Logger.debug("{} {}".format(key, timer))
        Logger.end_scope()
        Logger.start_scope("Optimizing")

    m.write("file.lp")
    # disable logging
    m.Params.OutputFlag = 0
    m.optimize()
    if verbose:
        Logger.end_scope()

    # Print solution
    if m.status == gb.GRB.Status.OPTIMAL:
        edge_costs = collections.Counter()
        edge_counts = collections.Counter()
        for flow, sample in zip(flows, self.samples):
            solution = m.getAttr("x", flow)
            # print('\nOptimal flows:')
            for (i, j), e_type in sample.edges.items():
                if solution[i, j] > 0:
                    # print('%s -> %s: %g' % (i, j, solution[i, j]))
                    edge_costs[e_type] += solution[i, j]
                    edge_counts[e_type] += 1

        valid_features = []
        solution = m.getAttr("x", cost)
        # print('Costs')
        for idx, c in enumerate(solution):
            # print('\t{} {} -> {} {:.2f} ({:.2f}%)'.format(idx, c, solution[c],
            #       edge_costs[self.id_to_edge_type[c]],
            #       edge_costs[self.id_to_edge_type[c]] * 100.0 / sum(edge_costs.values())))
            if solution[c] > 0:
                edge_type = self.id_to_edge_type[c]
                valid_features.append(
                    (edge_type, edge_costs[edge_type], edge_counts[edge_type])
                )
        if not valid_features:
            print("valid_features", valid_features)
            print(debug_info)
            exit(0)
        return EdgeFilter(valid_features)
    else:
        print(debug_info)
        print(m.status)
        print("The model is infeasible; computing IIS")
        for sample in self.samples[:5]:
            print(sample.inflow)
            print(sample.edges)
            print(sample.nodes)
        m.computeIIS()
        if m.IISMinimal:
            print("IIS is minimal\n")
        else:
            print("IIS is not minimal\n")
        print("\nThe following constraint(s) cannot be satisfied:")
        for c in m.getConstrs():
            if c.IISConstr:
                print("%s" % c.constrName)
        exit(0)
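# A minimal standalone sketch of the encoding used in solve() above, on a toy graph
# (requires gurobipy and a Gurobi license; all names here are illustrative). There is
# one integer flow variable per edge and one shared integer cost variable per edge
# *type*; each flow is capped by the cost of its type, flow is conserved at every
# node, and the objective minimizes the total cost over edge types.
def demo_toy_flow():
    import gurobipy as gb

    m = gb.Model("toy")
    m.Params.OutputFlag = 0
    edges = {("s", "a"): "t0", ("a", "b"): "t1", ("s", "b"): "t0"}  # edge -> edge type
    inflow = {"s": -3, "b": 3}  # 3 units must travel from s to b
    cost = m.addVars(["t0", "t1"], obj=1.0, vtype=gb.GRB.INTEGER, name="cost")
    flow = m.addVars(edges.keys(), vtype=gb.GRB.INTEGER, name="flow")
    # arc capacity: a flow may only use an edge type that was "paid for"
    m.addConstrs(flow[i, j] <= cost[t] for (i, j), t in edges.items())
    # flow conservation: what enters a node (plus its inflow) must leave it
    m.addConstrs(
        flow.sum("*", j) + inflow.get(j, 0) == flow.sum(j, "*") for j in {"s", "a", "b"}
    )
    m.optimize()
    # optimal here: route directly via the t0 edge, so cost["t0"]=3, cost["t1"]=0
    return {t: cost[t].X for t in ("t0", "t1")}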
def train(
    self,
    train_iter,
    valid_iter,
    num_epochs=10,
    min_nonabstained=500,
    depth=None,
    test_iter=None,
    apply_model=True,
    # base_model=False,
    train_adv_mode=AdversarialMode.RANDOM,
    loaded=False,
    model_eval=None,
):
    if not loaded:
        if self.base_model:
            train_base_model(
                self.model,
                self.dataset,
                10,
                RobustModelBatchIter(
                    model_eval,
                    AdversaryBatchIter(
                        self.subtree_adversary,
                        self.model,
                        AdversaryBatchIter(
                            self.rename_adversary,
                            self.model,
                            train_iter,
                            num_samples=1,
                            adv_mode=train_adv_mode,
                        ),
                    ),
                ),
                [valid_iter],
                verbose=False,
            )
            success = True
        else:
            success = self.__train_inner(
                train_iter,
                valid_iter,
                num_epochs=num_epochs,
                train_adv_mode=train_adv_mode,
                min_nonabstained=min_nonabstained,
            )
        if not success:
            Logger.debug("model train failed")
            input()
            return False

    # if self.idx is not None:
    #     torch.save(self.state_dict(), '{:03d}_model.pt'.format(self.idx))

    # we cannot reuse the iterators from calling __train_inner as these are shuffled
    if self.edge_filter is not None:
        Logger.debug("Original Edges: #{}".format(number_of_edges(train_iter)))
        f_train_iter = FilteredGraphIterator.from_iter(train_iter, self.edge_filter)
        Logger.debug("Filtered Edges: #{}".format(number_of_edges(f_train_iter)))
        f_valid_iter = FilteredGraphIterator.from_iter(valid_iter, self.edge_filter)
        if test_iter is not None:
            f_test_iter = FilteredGraphIterator.from_iter(test_iter, self.edge_filter)
    else:
        f_train_iter = train_iter
        f_valid_iter = valid_iter
        if test_iter is not None:
            f_test_iter = test_iter

    Logger.debug("Train Accuracy:")
    train_stats = self.model.accuracy_with_reject(
        f_train_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        self.threshold,
    )
    Logger.debug("Valid Accuracy:")
    self.valid_stats = self.model.accuracy_with_reject(
        f_valid_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        self.threshold,
    )
    if test_iter is not None:
        Logger.debug("Test Accuracy:")
        self.model.accuracy_with_reject(
            f_test_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            self.threshold,
        )

    train_prec = train_stats["mask_valid_noreject_acc"]
    valid_prec = self.valid_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")

    if apply_model:
        self.apply(train_iter, f_train_iter, is_train=True)
        self.apply(valid_iter, f_valid_iter, is_train=True)
        if test_iter is not None:
            self.apply(test_iter, f_test_iter)
    return True
def print_stat(self):
    values = " ".join(
        "{:6.2f}%".format((100.0 * value / self.stat[0]) if self.stat[0] != 0 else 0)
        for value in self.stat
    )
    Logger.debug("nodes entering step: {}".format(values))
def train_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    weight=None,
    target_o=1.0,
):
    # model.reset_parameters()
    opt = optim.Adam(model.parameters(), lr=lr)
    Logger.start_scope("Training Model")
    o_base = len(dataset.TARGET.vocab) - 4  # 'reject', '<unk>', '<pad>'
    loss_function = RejectionCrossEntropyLoss(
        o_base,
        len(dataset.TARGET.vocab),
        dataset.reject_token_id,
        reduction="none",
        weight=weight,
    )
    model.loss_function = loss_function
    model.opt = opt

    # Anneal o_upper in three phases: decay linearly from o_base towards 1.0 over the
    # first half of the epochs, then decay from the midpoint of 1.0 and the last value
    # towards target_o, and finally keep it fixed at target_o.
    step = 1.0 / (num_epochs // 2)
    schedule = [
        f * o_base + (1 - f) * 1.0 for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [
        f * ((1.0 + schedule[-1]) / 2) + (1 - f) * target_o
        for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [target_o] * (num_epochs // 2)

    train_prec, valid_prec = None, None
    for epoch, o_upper in enumerate(schedule):
        Logger.start_scope("Epoch {}, o_upper={:.3f}".format(epoch, o_upper))
        loss_function.o = o_upper
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")
        valid_stats = model.accuracy(
            valid_iter, dataset.TARGET
        )  # , thresholds=[0.5, 0.8, 0.9, 0.95])
        valid_prec = valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

    # Logger.start_scope('Print Rejection Thresholds')
    # print_rejection_thresholds(train_iter, model, dataset)
    # print_rejection_thresholds(valid_iter, model, dataset)
    # Logger.end_scope()

    # Logger.start_scope('Get Rejection Thresholds')
    # get_rejection_thresholds(train_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
    # get_rejection_thresholds(valid_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
    # Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    # exit(0)
    return train_prec, valid_prec
def robust_multi(
    args, dataset: Dataset, device: torch.device, masks, max_models=20, model_id=0
):
    train_iter, valid_iter, test_iter = Iterators.make(
        args, Models[args.model], dataset, device, masks
    )

    def make_model():
        return Models.make(args, dataset, device, train_iter)

    adversary, subtree_adversary = make_adversary(
        dataset,
        functools.partial(
            Iterators.make_single,
            args,
            Models[args.model],
            device,
            masks,
            dataset.EDGES,
        ),
    )

    stats = collections.Counter()
    stats["mask_valid_noreject_correct"] = 0
    stats["mask_valid_noreject_predicted"] = 0
    models = []
    for idx in range(max_models):
        # the last model is trained without a threshold to predict all the remaining samples
        base_model = (idx + 1) == max_models
        if not base_model:
            continue
        Logger.debug("Training iter: {}, base_model: {}".format(idx, base_model))
        if mask_count(train_iter) == 0:
            break
        model = RobustModel(
            make_model,
            dataset,
            idx=idx,
            rename_adversary=adversary,
            subtree_adversary=subtree_adversary,
            base_model=base_model,
        )
        if model_id is not None and model.load(args, model_id):
            # TODO: refactor, the model is loaded but it still needs to be applied on the iterator
            model.train(
                train_iter,
                valid_iter,
                num_epochs=args.num_epochs,
                test_iter=test_iter,
                apply_model=True,
                # base_model=base_model,
                train_adv_mode=args.train_adv_mode,
                loaded=True,
            )
        else:
            Logger.debug("Train positions to predict: {}".format(mask_count(train_iter)))
            Logger.debug("Valid positions to predict: {}".format(mask_count(valid_iter)))
            model_eval = None
            if base_model:
                # reset iterators
                train_iter, valid_iter, test_iter = Iterators.make(
                    args, Models[args.model], dataset, device, masks
                )
                model_eval = RobustModelEval(subtree_adversary)
                model_eval.load_models(
                    make_model,
                    dataset,
                    adversary,
                    subtree_adversary,
                    args,
                    model_id,
                    max_models=max_models - 1,
                    last_base=False,
                )
                # train_iter = RobustModelBatchIter(model_eval, train_iter)
            if not model.train(
                train_iter,
                valid_iter,
                num_epochs=args.num_epochs,
                test_iter=test_iter,
                apply_model=True,
                # base_model=base_model,
                train_adv_mode=args.train_adv_mode,
                min_nonabstained=args.min_nonabstained,
                model_eval=model_eval,
            ):
                break
            if model_id is not None:
                model.save(args, model_id)
            exit(0)

        models.append(model)
        for key in stats.keys():
            stats[key] += model.valid_stats[key]
        Logger.debug(
            "Valid Accuracy: {}/{} ({:.2f}%)".format(
                stats["mask_valid_noreject_correct"],
                stats["mask_valid_noreject_predicted"],
                acc(
                    stats["mask_valid_noreject_correct"],
                    stats["mask_valid_noreject_predicted"],
                ),
            )
        )
    return eval(args, dataset, device, masks, max_models=max_models, model_id=model_id)
def compute_edge_filter(it, model, dataset, loss_function, threshold=0.5, verbose=False):
    timers = collections.defaultdict(Timer)
    edge_optimizer = EdgeOptimizer()
    for node_grads in each_node_grads(
        it, model, dataset, loss_function, threshold=threshold, max_samples=30
    ):
        if torch.any(node_grads.probs > 0.1):
            tgt_nodes = (
                torch.masked_select(node_grads.tgt_nodes, node_grads.probs > 0.1)
                .cpu()
                .numpy()
            )
            probs = (
                torch.masked_select(node_grads.probs, node_grads.probs > 0.1)
                .cpu()
                .numpy()
            )
        else:
            tgt_nodes = node_grads.tgt_nodes[:3].cpu().numpy()
            probs = node_grads.probs[:3].cpu().numpy()

        if len(tgt_nodes) == 0:
            Logger.debug(
                "Empty target nodes: src: {}, tgt_nodes: {}, {}".format(
                    node_grads.src_node, node_grads.tgt_nodes, node_grads.probs
                )
            )
            continue

        debug_info = "" if verbose else None
        timers["nodes"].start()
        depth = max(
            nx.shortest_path_length(
                node_grads.rev_tree_nx, source=node_grads.src_node, target=tgt
            )
            for tgt in tgt_nodes
        )
        nodes = [node_grads.src_node] + list(
            itertools.chain.from_iterable(
                successors
                for _, successors in nx.bfs_successors(
                    node_grads.rev_tree_nx,
                    source=node_grads.src_node,
                    depth_limit=depth,
                )
            )
        )
        assert all(tgt_node in nodes for tgt_node in tgt_nodes)
        if verbose:
            debug_info += "nodes: {}\n".format(nodes)
        timers["nodes"].stop()

        timers["edges"].start()
        edges = [
            (i, j)
            for (i, j) in node_grads.tree_nx.edges(nodes)
            if i in nodes and j in nodes
        ]
        if verbose:
            debug_info += "edges: {}\n".format(edges)
        timers["edges"].stop()

        timers["arcs"].start()
        features = EdgeFilter.edge_features(edges, node_grads.tree, debug_info=debug_info)
        arcs = {}
        for (i, j), feature in zip(edges, features):
            if i == j:
                # split self-loops into new nodes
                # Needed when using self-loops as otherwise the same node both generates and consumes inflow
                i = "{}r".format(i)
            arcs[(str(i), str(j))] = feature  # '{}_{}'.format(node_type, edge_type)
        if verbose:
            debug_info += "arcs: {}\n".format(arcs)
            debug_info += "features: {}\n".format(features)
        timers["arcs"].stop()

        # update the list of nodes with the newly generated ones
        nodes = set()
        for (i, j) in arcs.keys():
            nodes.add(i)
            nodes.add(j)
        if verbose:
            debug_info += "nodes: {}\n".format(nodes)

        tgt_nodes = [
            str(v) if v != node_grads.src_node else "{}r".format(v) for v in tgt_nodes
        ]
        inflow = {
            tgt_node: int(p * 100)
            for tgt_node, p in zip(tgt_nodes, probs)
            if tgt_node in nodes
        }
        inflow[str(node_grads.src_node)] = -sum(inflow.values())
        if verbose:
            debug_info += "inflow: {}\n".format(inflow)

        if len(arcs) == 0:
            continue
        edge_optimizer.add_sample(nodes, arcs, inflow)
        if verbose:
            edge_optimizer_tmp = EdgeOptimizer()
            edge_optimizer_tmp.add_sample(nodes, arcs, inflow)
            edge_optimizer_tmp.solve(debug_info=debug_info)

    for key, timer in timers.items():
        Logger.debug("{}: {}".format(key, timer))
    edge_filter = edge_optimizer.solve()
    edge_filter.print(dataset=dataset)  # , edge_gen=it.edge_gen)
    return edge_filter
def __init__(self, it):
    Logger.debug("Caching Batches")
    self.batches = [batch.clone() for batch in tqdm.tqdm(it)]