import collections
import gzip
import itertools
import json
import os
import sys
from types import SimpleNamespace
from typing import Dict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext

# Project-internal helpers (Logger, NeuralModelBase, Dataset, AstTree, Timer,
# AdversaryBatchIter, the adversary classes, etc.) are assumed to be imported
# from elsewhere in the repository.


def train_base_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    verbose=True,
):
    # Accept either a single validation iterator or a list of them.
    valid_iters = [valid_iter] if not isinstance(valid_iter, list) else valid_iter

    Logger.start_scope("Training Model")
    opt = optim.Adam(model.parameters(), lr=lr)
    model.opt = opt
    loss_function = nn.CrossEntropyLoss(reduction="none")
    model.loss_function = loss_function

    train_prec, valid_prec = None, None
    for epoch in range(num_epochs):
        Logger.start_scope("Epoch {}".format(epoch))
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")

        for v_iter in valid_iters:
            valid_stats = model.accuracy(v_iter, dataset.TARGET, verbose=verbose)
            valid_prec = valid_stats["mask_valid_noreject_acc"]
            Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    return train_prec, valid_prec
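# Example usage (a sketch; `model` and the train/valid iterators are assumed to
# be constructed elsewhere, e.g., with torchtext iterators over dataset.dtrain):
#
#   train_prec, valid_prec = train_base_model(
#       model, dataset, num_epochs=20, train_iter=train_iter, valid_iter=valid_iter
#   )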
def __accuracy(self, base_model, reject_threshold, valid_iter, adversarial=False):
    valid_stats = base_model.accuracy_with_reject(
        valid_iter,
        self.dataset.TARGET,
        self.dataset.reject_token_id,
        reject_threshold,
    )
    if adversarial:
        # Evaluate robustness against a combined subtree+rename adversary as
        # well as a rename-only adversary with more samples.
        Logger.start_scope("adversarial accuracy")
        stats = self.rename_adversary.adversarial_accuracy(
            base_model,
            valid_iter,
            [
                AdversaryBatchIter(
                    self.subtree_adversary,
                    base_model,
                    AdversaryBatchIter(
                        self.rename_adversary, base_model, valid_iter, num_samples=2
                    ),
                ),
                AdversaryBatchIter(
                    self.rename_adversary, base_model, valid_iter, num_samples=40
                ),
            ],
            threshold=reject_threshold,
            approximate=True,
        )
        Logger.end_scope()
        return stats.is_sound()

    valid_prec = valid_stats["mask_valid_noreject_acc"]
    return valid_prec == 100.0
def optimize_project(path, pool, include_js=False):
    if os.path.exists(path + ".opt"):
        return
    if is_file_empty(path):
        return
    with gzip.open(path, "rb") as f:
        entries = json.loads(f.read())
    if not include_js:
        entries = [entry for entry in entries if entry["filename"].endswith(".ts")]

    Logger.start_scope("Optimizing {}".format(path))
    Logger.debug("#Entries: {}".format(len(entries)))
    num_diffs = 0
    opt_entries = []
    # for idx, entry in enumerate(entries):
    #     entry = optimize_file(entry)
    for idx, entry in enumerate(pool.imap_unordered(optimize_file, entries)):
        sys.stderr.write("\r{}/{}".format(idx, len(entries)))
        num_diffs += entry["num_diffs"]
        opt_entries.append(entry)
    sys.stderr.write("\r{}/{}\n".format(len(entries), len(entries)))
    Logger.debug("#Diffs: {}".format(num_diffs))
    Logger.end_scope()

    print("write: ", path + ".opt")
    with gzip.open(path + ".opt", "wb") as f:
        f.write(json.dumps(opt_entries).encode("utf-8"))
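# Example driver (a sketch; `project_paths` is a hypothetical list of gzipped
# JSON project files produced by an earlier pipeline step):
#
#   import multiprocessing
#   with multiprocessing.Pool() as pool:
#       for path in project_paths:
#           optimize_project(path, pool)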
def make_adversary(dataset: Dataset, make_iter):
    Logger.start_scope("Parsing Trees")
    trees_train_str = dataset_to_trees(dataset.dtrain, dataset.ID)
    trees_valid_str = dataset_to_trees(dataset.dvalid, dataset.ID)
    trees_test_str = dataset_to_trees(dataset.dtest, dataset.ID)
    trees_str = {**trees_train_str, **trees_valid_str, **trees_test_str}

    trees_train_num = dataset_to_trees_num(dataset.dtrain)
    trees_valid_num = dataset_to_trees_num(dataset.dvalid)
    trees_test_num = dataset_to_trees_num(dataset.dtest)
    trees_num = {**trees_train_num, **trees_valid_num, **trees_test_num}
    Logger.end_scope()

    Logger.start_scope("Indexing Trees")
    value_index = NodeValueIndex(dataset, trees_train_num)
    value_index_str = NodeValueIndexStr(dataset, trees_train_str)
    expr_gen = ExpressionGenerator(value_index_str)
    node_replacement = AdversarialNodeReplacement(value_index, dataset.fixed_value_offset)
    rules_index = node_replacement.make_rules(dataset, trees_str, trees_num)
    adversary = RenameAdversary(rules_index, dataset)
    Logger.end_scope()

    subtree_replacement = AdversarialSubtreeReplacement(expr_gen)
    subtree_rules = subtree_replacement.make_rules(dataset, trees_str, trees_num)
    subtree_adversary = SubtreeAdversary(subtree_rules, dataset, trees_str, make_iter)

    return adversary, subtree_adversary
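# Example wiring (a sketch; mirrors how the adversaries are consumed via
# AdversaryBatchIter in __accuracy above):
#
#   rename_adversary, subtree_adversary = make_adversary(dataset, make_iter)
#   adv_iter = AdversaryBatchIter(rename_adversary, model, valid_iter, num_samples=40)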
def iter_to_trees(iter) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Iter to Trees")
    trees = {}
    for batch in iter:
        batch_trees = batch_to_trees(batch)
        for tree, idx in zip(batch_trees, batch.id):
            trees[idx.item()] = tree
            sys.stderr.write("\r{}".format(len(trees)))
    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
def dataset_to_trees(dataset, ID, analyzer=None) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Dataset to Trees")
    trees = {}
    for sample in dataset:
        tree = AstTree.fromTensor(
            sample.types, sample.values, sample.depth, {"target": sample.target}
        )
        tree.analyzer = analyzer
        trees[ID.vocab.stoi[sample.id]] = tree
        sys.stderr.write("\r{}".format(len(trees)))
    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
def train_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    weight=None,
    target_o=1.0,
):
    # model.reset_parameters()
    opt = optim.Adam(model.parameters(), lr=lr)
    Logger.start_scope("Training Model")

    # Subtract the four special tokens: 'reject', 'unsound', '<unk>', '<pad>'
    o_base = len(dataset.TARGET.vocab) - 4
    loss_function = RejectionCrossEntropyLoss(
        o_base,
        len(dataset.TARGET.vocab),
        dataset.reject_token_id,
        reduction="none",
        weight=weight,
    )
    model.loss_function = loss_function
    model.opt = opt

    # Anneal the rejection weight `o` in three phases:
    # o_base -> 1.0, then (1.0 + schedule[-1]) / 2 -> target_o, then constant.
    step = 1.0 / (num_epochs // 2)
    schedule = [
        f * o_base + (1 - f) * 1.0 for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [
        f * ((1.0 + schedule[-1]) / 2) + (1 - f) * target_o
        for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [target_o] * (num_epochs // 2)

    train_prec, valid_prec = None, None
    for epoch, o_upper in enumerate(schedule):
        Logger.start_scope("Epoch {}, o_upper={:.3f}".format(epoch, o_upper))
        loss_function.o = o_upper
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")

        valid_stats = model.accuracy(
            valid_iter, dataset.TARGET
        )  # , thresholds=[0.5, 0.8, 0.9, 0.95])
        valid_prec = valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

    # Logger.start_scope('Print Rejection Thresholds')
    # print_rejection_thresholds(train_iter, model, dataset)
    # print_rejection_thresholds(valid_iter, model, dataset)
    # Logger.end_scope()

    # Logger.start_scope('Get Rejection Thresholds')
    # get_rejection_thresholds(train_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
    # get_rejection_thresholds(valid_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
    # Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    # exit(0)
    return train_prec, valid_prec
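# Shape of the schedule above, worked out for num_epochs=8, o_base=20.0,
# target_o=1.0 (illustrative arithmetic only):
#   phase 1 (anneal o_base -> 1.0):            [20.0, 15.25, 10.5, 5.75]
#   phase 2 (anneal (1 + 5.75) / 2 -> target): [3.375, 2.781, 2.188, 1.594]
#   phase 3 (constant target_o):               [1.0, 1.0, 1.0, 1.0]
# Note the schedule has 3 * (num_epochs // 2) entries, i.e., training runs for
# roughly 1.5x num_epochs epochs.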
def solve(self, debug_info=None):
    import gurobipy as gb

    verbose = len(self.samples) > 1
    if verbose:
        Logger.debug("Number of samples: #{}".format(len(self.samples)))
    self.build_edge_types(self.samples)

    # Create optimization model
    m = gb.Model("netflow")
    timers = collections.defaultdict(Timer)
    if verbose:
        Logger.start_scope("Encoding Solver Model")

    cost = m.addVars(
        range(len(self.edge_types.values())),
        obj=1.0,
        name="cost",
        vtype=gb.GRB.INTEGER,
    )

    flows = []
    for idx, sample in enumerate(self.samples):
        timers["flow"].start()
        flow = m.addVars(sample.edges.keys(), name="flow_{}".format(idx), vtype=gb.GRB.INTEGER)
        timers["flow"].stop()
        flows.append(flow)

        # Arc-capacity constraints
        timers["cap"].start()
        m.addConstrs(
            (
                flow[i, j] <= cost[self.edge_types[e_type]]
                for (i, j), e_type in sample.edges.items()
            ),
            "cap_{}".format(idx),
        )
        timers["cap"].stop()

        # Flow-conservation constraints
        timers["node"].start()
        m.addConstrs(
            (
                flow.sum("*", j) + sample.inflow.get(j, 0) == flow.sum(j, "*")
                for j in sample.nodes
            ),
            "node_{}".format(idx),
        )
        timers["node"].stop()

    if verbose:
        for key, timer in timers.items():
            Logger.debug("{} {}".format(key, timer))
        Logger.end_scope()
        Logger.start_scope("Optimizing")

    m.write("file.lp")
    # disable logging
    m.Params.OutputFlag = 0
    m.optimize()
    if verbose:
        Logger.end_scope()

    # Print solution
    if m.status == gb.GRB.Status.OPTIMAL:
        edge_costs = collections.Counter()
        edge_counts = collections.Counter()
        for flow, sample in zip(flows, self.samples):
            solution = m.getAttr("x", flow)
            # print('\nOptimal flows:')
            for (i, j), e_type in sample.edges.items():
                if solution[i, j] > 0:
                    # print('%s -> %s: %g' % (i, j, solution[i, j]))
                    edge_costs[e_type] += solution[i, j]
                    edge_counts[e_type] += 1

        valid_features = []
        solution = m.getAttr("x", cost)
        # print('Costs')
        for idx, c in enumerate(solution):
            # print('\t{} {} -> {} {:.2f} ({:.2f}%)'.format(idx, c, solution[c],
            #     edge_costs[self.id_to_edge_type[c]],
            #     edge_costs[self.id_to_edge_type[c]] * 100.0 / sum(edge_costs.values())))
            if solution[c] > 0:
                edge_type = self.id_to_edge_type[c]
                valid_features.append((edge_type, edge_costs[edge_type], edge_counts[edge_type]))

        if not valid_features:
            print("valid_features", valid_features)
            print(debug_info)
            exit(0)
        return EdgeFilter(valid_features)
    else:
        print(debug_info)
        print(m.status)
        print("The model is infeasible; computing IIS")
        for sample in self.samples[:5]:
            print(sample.inflow)
            print(sample.edges)
            print(sample.nodes)
        m.computeIIS()
        if m.IISMinimal:
            print("IIS is minimal\n")
        else:
            print("IIS is not minimal\n")
        print("\nThe following constraint(s) cannot be satisfied:")
        for c in m.getConstrs():
            if c.IISConstr:
                print("%s" % c.constrName)
        exit(0)
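# Minimal standalone sketch of the flow encoding used in solve() above. The toy
# `nodes`/`edges`/`inflow` data is made up for illustration, and running it
# requires a Gurobi license.
def _toy_netflow_example():
    import gurobipy as gb

    nodes = [0, 1, 2]
    edges = {(0, 1): "t0", (1, 2): "t1"}  # edge -> edge type
    edge_types = {"t0": 0, "t1": 1}  # edge type -> cost-variable index
    inflow = {0: 1, 2: -1}  # one unit enters at node 0 and exits at node 2

    m = gb.Model("toy_netflow")
    m.Params.OutputFlag = 0
    # One integer cost per edge type; the objective minimizes their sum.
    cost = m.addVars(range(len(edge_types)), obj=1.0, name="cost", vtype=gb.GRB.INTEGER)
    flow = m.addVars(edges.keys(), name="flow", vtype=gb.GRB.INTEGER)
    # Capacity: each edge carries at most the cost assigned to its type.
    m.addConstrs(flow[i, j] <= cost[edge_types[t]] for (i, j), t in edges.items())
    # Conservation: external inflow plus incoming flow equals outgoing flow.
    m.addConstrs(flow.sum("*", j) + inflow.get(j, 0) == flow.sum(j, "*") for j in nodes)
    m.optimize()
    return m.getAttr("x", cost)  # expect cost[0] == cost[1] == 1.0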
def __init__(self, args, include_edges=False, dataset_eval=None):
    Logger.debug("Dataset: {}".format(args.dataset))
    config = Config.get_dataset(args.dataset)
    config.init(args.dataset_path, args.cache_dir)

    self.EDGES = ([
        "child_edges",
        # 'computed_from_edges',
        # 'next_token_edges',
        "returns_to_edges",
        "last_write_edges",
        "last_read_edges",
        "last_lexical_usage_edges",
    ] if include_edges else [])

    # Type of the AST node (e.g., Identifier, BinaryExpression, IfStatement, etc.)
    self.TYPES = torchtext.data.Field(sequential=True, include_lengths=True)
    # Value of the AST node (e.g., x, y, 5, console, etc.)
    self.VALUES = torchtext.data.Field(sequential=True)
    # Target to predict, in our case the type of the expression (e.g., int, string, string[], etc.)
    # torchtext 0.4 hardcodes unk_token='<unk>'; any other value will break the batching
    self.TARGET = torchtext.data.Field(sequential=True, unk_token="<unk>", is_target=True)
    # User-provided type annotation
    self.GOLD = torchtext.data.Field(sequential=True, unk_token="<unk>")
    # ID of the file from which the sample was generated
    self.ID = torchtext.data.Field(sequential=False, use_vocab=True)
    # Order of the sample. Used to keep batching deterministic
    self.ORDER = torchtext.data.Field(sequential=False, use_vocab=False)
    # Boolean mask that denotes for which AST nodes the prediction should be made.
    # In our case, these are nodes that have a type (e.g., all identifiers and expressions)
    self.MASK_VALID = torchtext.data.Field(sequential=True, use_vocab=False, pad_token=0, dtype=torch.uint8)
    self.MASK_GOLD = torchtext.data.Field(sequential=True, use_vocab=False, pad_token=0, dtype=torch.uint8)
    # Depth of the node in the AST (i.e., distance from the root)
    self.DEPTH = torchtext.data.Field(sequential=True, use_vocab=False, pad_token=0, dtype=torch.uint8)
    # Position of the node with respect to its parent in the AST.
    # 0 denotes the first child, 1 the second child, etc.
    # self.POS = torchtext.data.Field(sequential=True, pad_token=0)
    # Files used by TypeScript to infer the ground-truth types.
    # Useful in case modifications are applied to the file and type inference
    # needs to be executed again.
    # self.DEPENDENCIES = torchtext.data.Field(sequential=True)

    fields = {
        "ast_types": ("types", self.TYPES),
        "ast_values": ("values", self.VALUES),
        "id": ("id", self.ID),
        # 'pos': ('pos', self.POS),
        "target_full": ("target", self.TARGET),
        "mask_valid_full": ("mask_valid", self.MASK_VALID),
        # 'gold_type': ('gold', self.GOLD),
        # 'mask_gold': ('mask_gold', self.MASK_GOLD),
        "depth": ("depth", self.DEPTH),
        # 'dependencies': ('dependencies', self.DEPENDENCIES)
    }
    for edge_type in self.EDGES:
        fields[edge_type + "_src"] = (
            edge_type + "_src",
            torchtext.data.Field(
                sequential=True,
                pad_token=-1,
                include_lengths=True,
                use_vocab=False,
                dtype=torch.int16,
            ),
        )
        fields[edge_type + "_tgt"] = (
            edge_type + "_tgt",
            torchtext.data.Field(sequential=True, pad_token=-1, use_vocab=False, dtype=torch.int16),
        )

    Logger.start_scope("Reading Dataset")
    dataset_cls = (torchtext.data.TabularDataset
                   if not config.compressed else TabularCompressedDataset)
    dtrain, dvalid, dtest = dataset_cls.splits(
        path=os.path.join(args.cache_dir, args.dataset),
        train=config.train,
        validation=config.valid,
        test=config.test,
        format="json",
        fields=fields,
    )

    if args.num_samples != -1:
        dtrain.examples = dtrain.examples[:args.num_samples]
        dvalid.examples = dvalid.examples[:args.num_samples]

    if dataset_eval is not None:
        eval_ids = set(sample.id for sample in itertools.chain(
            dataset_eval.dvalid, dataset_eval.dtest))
        dvalid.examples = [sample for sample in dvalid if sample.id in eval_ids]
        dtest.examples = [sample for sample in dtest if sample.id in eval_ids]

    self.filter_size(dtrain)
    self.filter_size(dvalid)
    self.filter_size(dtest)

    types = [
        "string",
        "number",
        "boolean",
        "void",
        "() => string",
        "() => number",
        "() => boolean",
        "() => void",
        "<null>",
    ]
    self.__normalize_values(itertools.chain(dtrain, dvalid, dtest))
    Logger.end_scope()

    Logger.start_scope("Processing Dataset")
    self.TYPES.build_vocab(dtrain)
    self.VALUES.build_vocab(dtrain, min_freq=10)
    self.TARGET.build_vocab(dtrain, max_size=0)
    self.TARGET.vocab.extend(SimpleNamespace(itos=types))
    # special values denoting no prediction ('reject') and unsound predictions ('unsound')
    self.TARGET.vocab.extend(SimpleNamespace(itos=["reject", "unsound"]))

    # Used to replace types/values with predictions; the extended types are
    # made unique so that they are not mixed with the original values.
    ext_types = ["__<" + v + ">__" for v in types + ["<unk>"]]
    self.TYPES.vocab.extend(SimpleNamespace(itos=ext_types))
    self.VALUES.vocab.extend(SimpleNamespace(itos=ext_types))
    # values >= fixed_value_offset are replaced manually to denote predictions
    # from previous iterations
    self.fixed_value_offset = self.VALUES.vocab.stoi[ext_types[0]]

    self.GOLD.build_vocab(dtrain, max_size=0)
    self.GOLD.vocab.extend(SimpleNamespace(itos=types))
    # self.POS.build_vocab(dtrain)

    self.__remove_null_labels(itertools.chain(dtrain, dvalid, dtest))

    # ID is for debugging, so we build the vocab also from the test dataset
    # self.ID.build_vocab(dtrain, dvalid)
    self.ID.build_vocab(
        [sample.id for sample in itertools.chain(dtrain, dvalid, dtest)]
        + [
            "{}_mod".format(sample.id)
            for sample in itertools.chain(dtrain, dvalid, dtest)
        ]  # modified adversarial samples
    )
    # self.DEPENDENCIES.build_vocab(dtrain, dvalid)

    Logger.debug(" TYPES Vocab Size: {:6d}/{:6d}".format(
        len(self.TYPES.vocab), len(self.TYPES.vocab.freqs)))
    Logger.debug("VALUES Vocab Size: {:6d}/{:6d}".format(
        len(self.VALUES.vocab), len(self.VALUES.vocab.freqs)))
    Logger.debug("TARGET Vocab Size: {:6d}/{:6d}".format(
        len(self.TARGET.vocab), len(self.TARGET.vocab.freqs)))
    for key, value in self.TARGET.vocab.freqs.most_common(20):
        print("\t{:10s} {:10d}: {}".format(
            "[vocab]" if key in self.TARGET.vocab.stoi else "", value, key))
    Logger.end_scope()

    # Store the results
    self.dtrain = dtrain
    self.dvalid = dvalid
    self.dtest = dtest

    self.pad_token_id = self.VALUES.vocab.stoi[self.VALUES.pad_token]
    self.unk_token_id = self.VALUES.vocab.stoi[self.VALUES.unk_token]
    self.reject_token_id = self.TARGET.vocab.stoi["reject"]
    self.unsound_token_id = self.TARGET.vocab.stoi["unsound"]

    self.id_to_sample = {
        self.ID.vocab.stoi[sample.id]: sample
        for sample in itertools.chain(self.dtrain, self.dvalid, self.dtest)
    }

    self.__init_order(self.dtrain)
    self.__init_order(self.dvalid)
    self.__init_order(self.dtest)

    self.fields = [(name, field) for name, field in dtrain.fields.items()]
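# Example construction (a sketch; assumes the enclosing class is the `Dataset`
# referenced above and an argparse-style `args` object with the attributes used
# in __init__):
#
#   args = SimpleNamespace(dataset="...", dataset_path="...", cache_dir="...",
#                          num_samples=-1)
#   dataset = Dataset(args, include_edges=True)
#   print(dataset.reject_token_id, dataset.unsound_token_id)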