Example #1
def train_base_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    verbose=True,
):
    valid_iters = valid_iter if isinstance(valid_iter, list) else [valid_iter]
    Logger.start_scope("Training Model")
    opt = optim.Adam(model.parameters(), lr=lr)
    model.opt = opt
    loss_function = nn.CrossEntropyLoss(reduction="none")
    model.loss_function = loss_function

    train_prec, valid_prec = None, None
    for epoch in range(num_epochs):
        Logger.start_scope("Epoch {}".format(epoch))
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")

        for valid_iter in valid_iters:
            valid_stats = model.accuracy(valid_iter,
                                         dataset.TARGET,
                                         verbose=verbose)
            valid_prec = valid_stats["mask_valid_noreject_acc"]
            Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    return train_prec, valid_prec
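
A minimal call-site sketch; `model`, `dataset`, and the iterators are placeholders assumed to come from the surrounding pipeline, not part of this example:

# Hypothetical usage: a NeuralModelBase instance and torchtext-style
# iterators must be constructed elsewhere.
train_prec, valid_prec = train_base_model(
    model,
    dataset,
    num_epochs=10,
    train_iter=train_iter,
    valid_iter=[valid_iter],  # a single iterator also works
    lr=0.001,
    verbose=False,
)
print(f"train_prec={train_prec}, valid_prec={valid_prec}")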
Example #2
    def __accuracy(self, base_model, reject_threshold, valid_iter, adversarial=False):
        valid_stats = base_model.accuracy_with_reject(
            valid_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            reject_threshold,
        )

        if adversarial:
            Logger.start_scope("adversarial accuracy")

            stats = self.rename_adversary.adversarial_accuracy(
                base_model,
                valid_iter,
                [
                    AdversaryBatchIter(
                        self.subtree_adversary,
                        base_model,
                        AdversaryBatchIter(
                            self.rename_adversary, base_model, valid_iter, num_samples=2
                        ),
                    ),
                    AdversaryBatchIter(
                        self.rename_adversary, base_model, valid_iter, num_samples=40
                    ),
                ],
                threshold=reject_threshold,
                approximate=True,
            )
            Logger.end_scope()
            return stats.is_sound()

        valid_prec = valid_stats["mask_valid_noreject_acc"]
        return valid_prec == 100.0
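
Inside the same class, the returned boolean could gate a certification loop. A sketch under the assumption that some retraining step exists; `train_more` is a hypothetical placeholder, not part of the source:

    # Hypothetical: keep hardening the model until it is adversarially sound
    # on the given validation set. train_more is NOT part of the source; it
    # stands in for one more round of (adversarial) training.
    def harden(self, base_model, reject_threshold, valid_iter):
        while not self.__accuracy(base_model, reject_threshold, valid_iter,
                                  adversarial=True):
            self.train_more(base_model)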
Example #3
def optimize_project(path, pool, include_js=False):
    if os.path.exists(path + ".opt"):
        return
    if is_file_empty(path):
        return

    with gzip.open(path, "rb") as f:
        entries = json.loads(f.read())

    if not include_js:
        entries = [
            entry for entry in entries if entry["filename"].endswith(".ts")
        ]

    Logger.start_scope("Optimizing {}".format(path))
    Logger.debug("#Entries: {}".format(len(entries)))

    num_diffs = 0
    opt_entries = []
    # Sequential fallback, useful when debugging optimize_file:
    # for idx, entry in enumerate(entries):
    #     entry = optimize_file(entry)
    for idx, entry in enumerate(pool.imap_unordered(optimize_file, entries)):
        sys.stderr.write("\r{}/{}".format(idx, len(entries)))
        num_diffs += entry["num_diffs"]
        opt_entries.append(entry)
    sys.stderr.write("\r{}/{}\n".format(len(entries), len(entries)))
    Logger.debug("#Diffs: {}".format(num_diffs))
    Logger.end_scope()

    print("write: ", path + ".opt")
    with gzip.open(path + ".opt", "wb") as f:
        f.write(json.dumps(opt_entries).encode("utf-8"))
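
A short driver sketch, assuming optimize_file is defined at module level so the worker pool can pickle it; the paths and pool size are made up:

import multiprocessing

# Hypothetical entry point: each project file gets a compressed
# "<path>.opt" sibling; already-optimized or empty files are skipped.
if __name__ == "__main__":
    with multiprocessing.Pool(processes=8) as pool:
        for path in ["data/project_a.json.gz", "data/project_b.json.gz"]:
            optimize_project(path, pool, include_js=False)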
Example #4
def make_adversary(dataset: Dataset, make_iter):
    Logger.start_scope("Parsing Trees")
    trees_train_str = dataset_to_trees(dataset.dtrain, dataset.ID)
    trees_valid_str = dataset_to_trees(dataset.dvalid, dataset.ID)
    trees_test_str = dataset_to_trees(dataset.dtest, dataset.ID)
    trees_str = {**trees_train_str, **trees_valid_str, **trees_test_str}

    trees_train_num = dataset_to_trees_num(dataset.dtrain)
    trees_valid_num = dataset_to_trees_num(dataset.dvalid)
    trees_test_num = dataset_to_trees_num(dataset.dtest)
    trees_num = {**trees_train_num, **trees_valid_num, **trees_test_num}
    Logger.end_scope()

    Logger.start_scope("Indexing Trees")
    value_index = NodeValueIndex(dataset, trees_train_num)
    value_index_str = NodeValueIndexStr(dataset, trees_train_str)
    expr_gen = ExpressionGenerator(value_index_str)

    node_replacement = AdversarialNodeReplacement(value_index,
                                                  dataset.fixed_value_offset)
    rules_index = node_replacement.make_rules(dataset, trees_str, trees_num)
    adversary = RenameAdversary(rules_index, dataset)
    Logger.end_scope()

    subtree_replacement = AdversarialSubtreeReplacement(expr_gen)
    subtree_rules = subtree_replacement.make_rules(dataset, trees_str,
                                                   trees_num)
    subtree_adversary = SubtreeAdversary(subtree_rules, dataset, trees_str,
                                         make_iter)

    return adversary, subtree_adversary
Example #5
def iter_to_trees(batch_iter) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Iter to Trees")
    trees = {}
    for batch in batch_iter:
        batch_trees = batch_to_trees(batch)
        for tree, idx in zip(batch_trees, batch.id):
            trees[idx.item()] = tree
        sys.stderr.write("\r{}".format(len(trees)))

    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
Example #6
def dataset_to_trees(dataset, ID, analyzer=None) -> Dict[int, AstTree]:
    Logger.start_scope("Converting Dataset to Trees")
    trees = {}
    for sample in dataset:
        tree = AstTree.fromTensor(sample.types, sample.values, sample.depth,
                                  {"target": sample.target})
        tree.analyzer = analyzer
        trees[ID.vocab.stoi[sample.id]] = tree
        sys.stderr.write("\r{}".format(len(trees)))

    sys.stderr.write("\r")
    Logger.debug("# Trees: {}".format(len(trees)))
    Logger.end_scope()
    return trees
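
Both converters return the same Dict[int, AstTree] mapping keyed by sample id, so per-split results can be merged, mirroring the usage in make_adversary above (a sketch, assuming a Dataset as in the other examples):

# Hypothetical: one id -> AstTree map covering all three splits.
trees = {
    **dataset_to_trees(dataset.dtrain, dataset.ID),
    **dataset_to_trees(dataset.dvalid, dataset.ID),
    **dataset_to_trees(dataset.dtest, dataset.ID),
}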
Example #7
def train_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    weight=None,
    target_o=1.0,
):
    # model.reset_parameters()
    opt = optim.Adam(model.parameters(), lr=lr)
    Logger.start_scope("Training Model")

    o_base = len(dataset.TARGET.vocab) - 4  # 'reject', 'unsound', '<unk>', '<pad>'
    loss_function = RejectionCrossEntropyLoss(
        o_base,
        len(dataset.TARGET.vocab),
        dataset.reject_token_id,
        reduction="none",
        weight=weight,
    )
    model.loss_function = loss_function
    model.opt = opt

    step = 1.0 / (num_epochs // 2)
    schedule = [
        f * o_base + (1 - f) * 1.0 for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [
        f * ((1.0 + schedule[-1]) / 2) + (1 - f) * target_o
        for f in np.arange(start=1.0, stop=0.0, step=-step)
    ]
    schedule += [target_o] * (num_epochs // 2)

    train_prec, valid_prec = None, None
    for epoch, o_upper in enumerate(schedule):
        Logger.start_scope("Epoch {}, o_upper={:.3f}".format(epoch, o_upper))
        loss_function.o = o_upper
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")

        valid_stats = model.accuracy(
            valid_iter, dataset.TARGET
        )  # , thresholds=[0.5, 0.8, 0.9, 0.95])
        valid_prec = valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

        # Logger.start_scope('Print Rejection Thresholds')
        # print_rejection_thresholds(train_iter, model, dataset)
        # print_rejection_thresholds(valid_iter, model, dataset)
        # Logger.end_scope()

        # Logger.start_scope('Get Rejection Thresholds')
        # get_rejection_thresholds(train_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
        # get_rejection_thresholds(valid_iter, model, dataset, [1.00, 0.99, 0.95, 0.9, 0.8])
        # Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    # exit(0)
    return train_prec, valid_prec
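
The three-phase o_upper schedule is easier to see in isolation. The snippet below reproduces the same arithmetic with illustrative constants (the values of num_epochs, o_base, and target_o are made up):

import numpy as np

num_epochs, o_base, target_o = 8, 9.0, 1.0  # illustrative values
step = 1.0 / (num_epochs // 2)

# Phase 1: anneal from o_base down towards 1.0.
schedule = [f * o_base + (1 - f) * 1.0
            for f in np.arange(start=1.0, stop=0.0, step=-step)]
# Phase 2: anneal from the midpoint between 1.0 and phase 1's endpoint
# towards target_o.
schedule += [f * ((1.0 + schedule[-1]) / 2) + (1 - f) * target_o
             for f in np.arange(start=1.0, stop=0.0, step=-step)]
# Phase 3: hold target_o for the remaining epochs.
schedule += [target_o] * (num_epochs // 2)
print([round(o, 2) for o in schedule])
# -> [9.0, 7.0, 5.0, 3.0, 2.0, 1.75, 1.5, 1.25, 1.0, 1.0, 1.0, 1.0]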
Example #8
    def solve(self, debug_info=None):
        import gurobipy as gb

        verbose = len(self.samples) > 1
        if verbose:
            Logger.debug("Number of samples: #{}".format(len(self.samples)))
        self.build_edge_types(self.samples)
        # Create optimization model
        m = gb.Model("netflow")

        timers = collections.defaultdict(Timer)
        if verbose:
            Logger.start_scope("Encoding Solver Model")
        cost = m.addVars(
            range(len(self.edge_types.values())),
            obj=1.0,
            name="cost",
            vtype=gb.GRB.INTEGER,
        )
        flows = []
        for idx, sample in enumerate(self.samples):
            timers["flow"].start()
            flow = m.addVars(sample.edges.keys(),
                             name="flow_{}".format(idx),
                             vtype=gb.GRB.INTEGER)
            timers["flow"].stop()
            flows.append(flow)

            # Arc-capacity constraints
            timers["cap"].start()
            m.addConstrs(
                (flow[i, j] <= cost[self.edge_types[e_type]]
                 for (i, j), e_type in sample.edges.items()),
                "cap_{}".format(idx),
            )
            timers["cap"].stop()

            # Flow-conservation constraints
            timers["node"].start()
            m.addConstrs(
                (flow.sum("*", j) + sample.inflow.get(j, 0) == flow.sum(
                    j, "*") for j in sample.nodes),
                "node_{}".format(idx),
            )
            timers["node"].stop()

        if verbose:
            for key, timer in timers.items():
                Logger.debug("{} {}".format(key, timer))
            Logger.end_scope()

            Logger.start_scope("Optimizing")
        m.write("file.lp")
        # disable logging
        m.Params.OutputFlag = 0
        m.optimize()
        if verbose:
            Logger.end_scope()

        # Print solution
        if m.status == gb.GRB.Status.OPTIMAL:
            edge_costs = collections.Counter()
            edge_counts = collections.Counter()
            for flow, sample in zip(flows, self.samples):
                solution = m.getAttr("x", flow)
                # print('\nOptimal flows:')
                for (i, j), e_type in sample.edges.items():
                    if solution[i, j] > 0:
                        # print('%s -> %s: %g' % (i, j, solution[i, j]))
                        edge_costs[e_type] += solution[i, j]
                        edge_counts[e_type] += 1

            valid_features = []
            solution = m.getAttr("x", cost)
            # print('Costs')
            for idx, c in enumerate(solution):
                # print('\t{} {} -> {} {:.2f} ({:.2f}%)'.format(idx, c, solution[c],
                #                                   edge_costs[self.id_to_edge_type[c]],
                #                                   edge_costs[self.id_to_edge_type[c]] * 100.0 / sum(edge_costs.values()))
                #       )
                if solution[c] > 0:
                    edge_type = self.id_to_edge_type[c]
                    valid_features.append((edge_type, edge_costs[edge_type],
                                           edge_counts[edge_type]))
            if not valid_features:
                print("valid_features", valid_features)
                print(debug_info)
                exit(0)

            return EdgeFilter(valid_features)
        else:
            print(debug_info)
            print(m.status)
            print("The model is infeasible; computing IIS")

            for sample in self.samples[:5]:
                print(sample.inflow)
                print(sample.edges)
                print(sample.nodes)

            m.computeIIS()
            if m.IISMinimal:
                print("IIS is minimal\n")
            else:
                print("IIS is not minimal\n")
            print("\nThe following constraint(s) cannot be satisfied:")
            for c in m.getConstrs():
                if c.IISConstr:
                    print("%s" % c.constrName)
            exit(0)
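
The encoding is easier to follow on a toy instance. Below is a stripped-down version of the same netflow model for a single two-edge sample; the data is made up, and running it requires a gurobipy installation and license:

import gurobipy as gb

# Toy instance: two edges of the same type; one unit of inflow at node 0
# must reach node 2. The shared integer cost variable upper-bounds the flow
# on every edge of its type, so minimizing it yields the cheapest
# per-edge-type capacity that routes all the flow.
edges = {(0, 1): "t0", (1, 2): "t0"}
inflow = {0: 1, 2: -1}
nodes = [0, 1, 2]

m = gb.Model("netflow_toy")
cost = m.addVars(["t0"], obj=1.0, name="cost", vtype=gb.GRB.INTEGER)
flow = m.addVars(edges.keys(), name="flow", vtype=gb.GRB.INTEGER)
m.addConstrs((flow[i, j] <= cost[t] for (i, j), t in edges.items()), "cap")
m.addConstrs(
    (flow.sum("*", j) + inflow.get(j, 0) == flow.sum(j, "*") for j in nodes),
    "node",
)
m.Params.OutputFlag = 0
m.optimize()
print(m.getAttr("x", cost))  # -> {'t0': 1.0}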
Example #9
    def __init__(self, args, include_edges=False, dataset_eval=None):
        Logger.debug("Dataset: {}".format(args.dataset))
        config = Config.get_dataset(args.dataset)
        config.init(args.dataset_path, args.cache_dir)

        self.EDGES = ([
            "child_edges",
            # 'computed_from_edges',
            # 'next_token_edges',
            "returns_to_edges",
            "last_write_edges",
            "last_read_edges",
            "last_lexical_usage_edges",
        ] if include_edges else [])

        # Type of the AST node (e.g., Identifier, BinaryExpression, IfStatement, etc.)
        self.TYPES = torchtext.data.Field(sequential=True,
                                          include_lengths=True)
        # Value of the AST node (e.g., x, y, 5, console, etc.)
        self.VALUES = torchtext.data.Field(sequential=True)

        # Target to predict, in our case the type of the expression (e.g., int, string, string[], etc.)
        # torchtext 0.4 hardcodes unk_token='<unk>'; any other value will break the batching
        self.TARGET = torchtext.data.Field(sequential=True,
                                           unk_token="<unk>",
                                           is_target=True)
        # User-provided type annotation
        self.GOLD = torchtext.data.Field(sequential=True, unk_token="<unk>")

        # ID of the file from which the sample was generated
        self.ID = torchtext.data.Field(sequential=False, use_vocab=True)
        # Order of the sample; used to keep batching deterministic
        self.ORDER = torchtext.data.Field(sequential=False, use_vocab=False)

        # Boolean mask that denotes for which AST nodes a prediction should be made.
        # In our case, these are nodes that have a type (e.g., all identifiers and expressions).
        self.MASK_VALID = torchtext.data.Field(sequential=True,
                                               use_vocab=False,
                                               pad_token=0,
                                               dtype=torch.uint8)
        self.MASK_GOLD = torchtext.data.Field(sequential=True,
                                              use_vocab=False,
                                              pad_token=0,
                                              dtype=torch.uint8)

        # depth of the node in the AST (i.e., distance from the root)
        self.DEPTH = torchtext.data.Field(sequential=True,
                                          use_vocab=False,
                                          pad_token=0,
                                          dtype=torch.uint8)
        # position of the node with respect to the parent in the AST.
        # 0 denotes the first child, 1 is the second child, etc.
        # self.POS = torchtext.data.Field(sequential=True, pad_token=0)

        # files used by TypeScript to infer the ground-truth types
        # useful in case modifications are applied to the file and type inference needs to be executed again
        # self.DEPENDENCIES = torchtext.data.Field(sequential=True)

        fields = {
            "ast_types": ("types", self.TYPES),
            "ast_values": ("values", self.VALUES),
            "id": ("id", self.ID),
            # 'pos': ('pos', self.POS),
            "target_full": ("target", self.TARGET),
            "mask_valid_full": ("mask_valid", self.MASK_VALID),
            # 'gold_type': ('gold', self.GOLD),
            # 'mask_gold': ('mask_gold', self.MASK_GOLD),
            "depth": ("depth", self.DEPTH),
            # 'dependencies': ('dependencies', self.DEPENDENCIES)
        }

        for edge_type in self.EDGES:
            fields[edge_type + "_src"] = (
                edge_type + "_src",
                torchtext.data.Field(
                    sequential=True,
                    pad_token=-1,
                    include_lengths=True,
                    use_vocab=False,
                    dtype=torch.int16,
                ),
            )
            fields[edge_type + "_tgt"] = (
                edge_type + "_tgt",
                torchtext.data.Field(sequential=True,
                                     pad_token=-1,
                                     use_vocab=False,
                                     dtype=torch.int16),
            )

        Logger.start_scope("Reading Dataset")
        dataset_cls = (TabularCompressedDataset if config.compressed
                       else torchtext.data.TabularDataset)
        dtrain, dvalid, dtest = dataset_cls.splits(
            path=os.path.join(args.cache_dir, args.dataset),
            train=config.train,
            validation=config.valid,
            test=config.test,
            format="json",
            fields=fields,
        )

        if args.num_samples != -1:
            dtrain.examples = dtrain.examples[:args.num_samples]
            dvalid.examples = dvalid.examples[:args.num_samples]

        if dataset_eval is not None:
            eval_ids = set(sample.id for sample in itertools.chain(
                dataset_eval.dvalid, dataset_eval.dtest))
            dvalid.examples = [
                sample for sample in dvalid if sample.id in eval_ids
            ]
            dtest.examples = [
                sample for sample in dtest if sample.id in eval_ids
            ]

        self.filter_size(dtrain)
        self.filter_size(dvalid)
        self.filter_size(dtest)

        types = [
            "string",
            "number",
            "boolean",
            "void",
            "() => string",
            "() => number",
            "() => boolean",
            "() => void",
            "<null>",
        ]

        self.__normalize_values(itertools.chain(dtrain, dvalid, dtest))

        Logger.end_scope()
        Logger.start_scope("Processing Dataset")
        self.TYPES.build_vocab(dtrain)
        self.VALUES.build_vocab(dtrain, min_freq=10)
        self.TARGET.build_vocab(dtrain, max_size=0)
        self.TARGET.vocab.extend(SimpleNamespace(itos=types))
        # special values denoting no prediction ('reject') and unsound predictions ('unsound')
        self.TARGET.vocab.extend(SimpleNamespace(itos=["reject", "unsound"]))

        # used to replace types/values with predictions;
        # the markers are made unique so they are not mixed up with the original values
        ext_types = ["__<" + v + ">__" for v in types + ["<unk>"]]
        self.TYPES.vocab.extend(SimpleNamespace(itos=ext_types))
        self.VALUES.vocab.extend(SimpleNamespace(itos=ext_types))

        # values >= fixed_value_offset are replaced manually to denote predictions from previous iterations
        self.fixed_value_offset = self.VALUES.vocab.stoi[ext_types[0]]

        self.GOLD.build_vocab(dtrain, max_size=0)
        self.GOLD.vocab.extend(SimpleNamespace(itos=types))
        # self.POS.build_vocab(dtrain)

        self.__remove_null_labels(itertools.chain(dtrain, dvalid, dtest))

        # ID is used only for debugging, so we build the vocab from all splits, including test
        # self.ID.build_vocab(dtrain, dvalid)
        self.ID.build_vocab(
            [sample.id
             for sample in itertools.chain(dtrain, dvalid, dtest)] + [
                 "{}_mod".format(sample.id)
                 for sample in itertools.chain(dtrain, dvalid, dtest)
             ]  # modified adversarial samples
        )
        # self.DEPENDENCIES.build_vocab(dtrain, dvalid)
        Logger.debug(" TYPES Vocab Size: {:6d}/{:6d}".format(
            len(self.TYPES.vocab), len(self.TYPES.vocab.freqs)))
        Logger.debug("VALUES Vocab Size: {:6d}/{:6d}".format(
            len(self.VALUES.vocab), len(self.VALUES.vocab.freqs)))

        Logger.debug("TARGET Vocab Size: {:6d}/{:6d}".format(
            len(self.TARGET.vocab), len(self.TARGET.vocab.freqs)))
        for key, value in self.TARGET.vocab.freqs.most_common(20):
            print("\t{:10s} {:10d}: {}".format(
                "[vocab]" if key in self.TARGET.vocab.stoi else "", value,
                key))

        Logger.end_scope()

        # Store the results
        self.dtrain = dtrain
        self.dvalid = dvalid
        self.dtest = dtest

        self.pad_token_id = self.VALUES.vocab.stoi[self.VALUES.pad_token]
        self.unk_token_id = self.VALUES.vocab.stoi[self.VALUES.unk_token]
        self.reject_token_id = self.TARGET.vocab.stoi["reject"]
        self.unsound_token_id = self.TARGET.vocab.stoi["unsound"]

        self.id_to_sample = {
            self.ID.vocab.stoi[sample.id]: sample
            for sample in itertools.chain(self.dtrain, self.dvalid, self.dtest)
        }

        self.__init_order(self.dtrain)
        self.__init_order(self.dvalid)
        self.__init_order(self.dtest)

        self.fields = [(name, field) for name, field in dtrain.fields.items()]
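
Downstream batching is not shown in the source; a plausible sketch using legacy torchtext 0.4-style iterators (BucketIterator and the sort key are assumptions, not taken from the codebase):

# Hypothetical: batch the three splits, bucketing by AST size to limit padding.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (dataset.dtrain, dataset.dvalid, dataset.dtest),
    batch_size=32,
    sort_key=lambda sample: len(sample.types),
    sort_within_batch=True,
)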