Esempio n. 1
0
    def _unpack_model_batch_prediction(self,
                                       batch,
                                       coerce_tree=False) -> np.ndarray:
        """
        Decode the model output of one batch into link distances per essay

        Every row of an essay's ``pred_matrix`` entry is decoded with argmax;
        the winning column index minus the row index is stored as the
        distance for that row. When ``coerce_tree`` is True and the argmax
        result is not a tree (according to ``TreeBuilder.is_tree``), the essay
        is decoded again with ``run_MST`` on the rank order of the scores.

        Args:
            batch: keyword arguments forwarded to ``self.model``
            coerce_tree (bool): True to force the output to form a tree

        Returns:
            list with one distance sequence per essay (a plain list is
            returned, even though the annotation says ``np.ndarray``)
        """
        score_matrices = self.model(**batch)["pred_matrix"]

        decoded_batch = []
        for essay_scores in score_matrices:
            # plain argmax decoding: best target column for every row
            best_targets = np.argmax(tonp(essay_scores), axis=-1)
            distances = [
                target - row for row, target in enumerate(best_targets)
            ]

            # fall back to a spanning tree only when asked to and needed
            not_a_tree = not TreeBuilder(distances).is_tree()
            if not_a_tree and (coerce_tree == True):
                # the rank is used as the edge weight, and a lower rank number
                # is better, hence a MINIMUM spanning tree
                ranks = get_rank_order(np.array(tonp(essay_scores)))
                distances = run_MST(ranks, ranks, verdict="min")

            decoded_batch.append(distances)
        return decoded_batch
def save_tsv(filename, essay_code, sentences, dist, label_preds):
    """
    Write the predicted structure of one essay to a tab-separated file

    A header row is written first, followed by one row per sentence with the
    columns: essay code, unit id (1-based), text, target, relation, drop_flag.
    Sentences whose automatic component label is "non-arg comp." get empty
    target/relation columns and drop_flag "TRUE". A sentence that points to
    itself (distance 0, i.e., the root) gets empty target/relation columns.
    Every other sentence consumes the next entry of ``label_preds``.

    Args:
        filename (str): path of the output file (overwritten if it exists)
        essay_code (str): identifier written in the first column
        sentences (list[str]): sentence texts, in order
        dist (list[int]): relative distance from each sentence to its target
        label_preds (list[str]): relation labels, one per linked non-root sentence

    Raises:
        AssertionError: if not every entry of ``label_preds`` was consumed
    """
    header = ["essay code", "unit id", "text", "target", "relation", "drop_flag"]
    rep = TreeBuilder(dist)
    component_labels = rep.auto_component_labels(AC_breakdown=True)

    label_idx = 0
    # the context manager closes the file even if a row fails to be built
    with open(filename, "w") as f:
        f.write("\t".join(header) + "\n")

        for i, sentence in enumerate(sentences):
            unit_id = i + 1
            if component_labels[i] == "non-arg comp.":
                target, relation, drop_flag = "", "", "TRUE"
            else:
                target_id = unit_id + dist[i]
                if target_id == unit_id:  # points to itself, i.e., root
                    target, relation = "", ""
                else:  # not root
                    target = str(target_id)
                    relation = label_preds[label_idx]
                    label_idx += 1
                drop_flag = "FALSE"

            row = [essay_code, str(unit_id), sentence, target, relation, drop_flag]
            f.write("\t".join(row) + "\n")

    # every predicted label must have been assigned to exactly one link
    assert label_idx == len(label_preds)
Esempio n. 3
0
def structured_output_quality(links) -> (List, float, float, float, List):
    """
    Infer component labels automatically from the structure and summarise
    how tree-like the outputs are

    Depth and leaf proportion are only accumulated for essays whose links
    form a tree, and they are averaged over the number of such essays.
    All averages are 0.0 when there is nothing to average over (no essays,
    or no essay forming a tree) instead of raising ZeroDivisionError.

    Args:
        links (list): link distances per essay

    Returns:
        tuple: (component labels per essay,
                ratio of essays forming a tree,
                average depth over tree outputs,
                average leaf proportion over tree outputs,
                list of depths of the tree outputs)
    """
    component_labels = []
    all_depths = []
    n_trees = 0
    depth_sum = 0
    leaf_prop_sum = 0

    for essay_links in links:
        rep = TreeBuilder(essay_links)
        component_labels.append(rep.auto_component_labels(AC_breakdown=True))

        if rep.is_tree():
            n_trees += 1

            # evaluate this only when the output forms a tree
            depth, leaf_prop = rep.tree_depth_and_leaf_proportion()
            depth_sum += depth
            all_depths.append(depth)
            leaf_prop_sum += leaf_prop

    n_essays = len(links)
    tree_ratio = float(n_trees) / float(n_essays) if n_essays else 0.0
    avg_depth = float(depth_sum) / float(n_trees) if n_trees else 0.0
    avg_leaf_prop = float(leaf_prop_sum) / float(n_trees) if n_trees else 0.0

    return component_labels, tree_ratio, avg_depth, avg_leaf_prop, all_depths
Esempio n. 4
0
def f1_per_depth(dist_gold: List, dist_prediction: List, max_depth: int):
    """
    Find at which depth prediction mismatches happen (when the output forms a tree)

    Node depths of the gold and predicted structures are collected only for
    essays whose prediction forms a tree. A classification report over the
    flattened depths is printed, and the per-depth F1 scores are returned.

    Args:
        dist_gold (List): gold answer per essay
        dist_prediction (List): predicted answer per essay
        max_depth (int): max structure depth in the dataset

    Returns:
        list[float]: F1 score for each depth in ``range(max_depth)``; 0.0 for
        a depth that does not appear in the report
    """
    gold_all_depth = []
    pred_all_depth = []

    for i, gold in enumerate(dist_gold):
        rep_gold = TreeBuilder(gold)
        rep_pred = TreeBuilder(dist_prediction[i])

        if rep_pred.is_tree():
            gold_all_depth.append(rep_gold.node_depths())
            pred_all_depth.append(rep_pred.node_depths())

    gold_all_depth_flat = flatten_list(gold_all_depth)
    pred_all_depth_flat = flatten_list(pred_all_depth)

    print("=== Depth prediction performance when output forms a tree ===")
    print(
        classification_report(y_true=gold_all_depth_flat,
                              y_pred=pred_all_depth_flat,
                              digits=3))
    report = classification_report(y_true=gold_all_depth_flat,
                                   y_pred=pred_all_depth_flat,
                                   output_dict=True)
    f1s = []
    for depth in range(max_depth):
        try:
            f1s.append(report[str(depth)]['f1-score'])
        except KeyError:
            # this depth occurs in neither gold nor prediction
            f1s.append(0.0)

    return f1s
Esempio n. 5
0
    def __compute_loss(self, s_arc, rel_dists, seq_len):
        """
        Compute loss (average of essay-level loss)

        For every essay, the padded gold distance indices are cut to the real
        sequence length, mapped back to distances through the vocabulary
        ("rel_dist_labels" namespace) and turned into the index of the gold
        target per sentence (argmax over the rows of the adjacency matrix
        built by ``TreeBuilder``). The arc scores are cut to the same length
        and ``self.loss`` is applied per essay.

        Args:
            s_arc (torch.Tensor): arc scores, indexed as [essay, source, target]
            rel_dists (torch.Tensor): gold distance indices per essay (padded)
            seq_len (Any): number of non-padded sentences per essay

        Returns:
            (list[torch.Tensor], torch.Tensor): the non-padded arc scores per
            essay, and the mean of the essay-level losses
        """
        def dist_idx_to_dist(idx):
            return int(
                self.vocab.get_token_from_index(int(idx),
                                                namespace="rel_dist_labels"))

        batch_size = len(rel_dists)

        # gold ans
        gold_ans = []
        for b in range(batch_size):
            gold_dists = [
                dist_idx_to_dist(x)
                for x in rel_dists[b][:seq_len[b]].tolist()
            ]
            gold_matrix = torch.Tensor(TreeBuilder(gold_dists).adj_matrix)
            # index of the correct label
            target = torch.argmax(gold_matrix, dim=-1)
            # move to the configured device; unlike .cuda(), this respects a
            # non-default GPU index and is a no-op when already on that device
            target = target.to(self.torch_device)
            gold_ans.append(target)

        # pred ans (strip the padding on both axes)
        pred_ans = [
            s_arc[b, :seq_len[b], :seq_len[b]] for b in range(batch_size)
        ]

        # loss per essay, then averaged over the batch
        essay_losses = [
            self.loss(pred_ans[b], gold_ans[b]) for b in range(batch_size)
        ]
        avg_loss = torch.mean(torch.stack(essay_losses))

        return pred_ans, avg_loss
def create_pairwise_data(sentences, dist):
    """
    Build (source, target) sentence pairs for pairwise link labelling

    Sentences labelled "non-arg comp." and sentences that point to
    themselves (i.e., roots) do not produce a pair.

    Args:
        sentences (list[str])
        dist (list[int])

    Returns:
        list[tuple(str,str)]
    """
    labels = TreeBuilder(dist).auto_component_labels(AC_breakdown=True)

    pairs = []
    for idx, sentence in enumerate(sentences):
        if labels[idx] == "non-arg comp.":
            continue

        target_idx = idx + dist[idx]
        if target_idx == idx:  # points to itself, i.e., a root
            continue

        pairs.append((sentence, sentences[target_idx]))
    return pairs
Esempio n. 7
0
    def _unpack_model_batch_prediction(self,
                                       batch,
                                       coerce_tree=False) -> np.ndarray:
        """
        Decode linking and node labelling predictions for one batch

        Linking uses a constrained argmax: the candidates of a sentence are
        tried from the highest to the lowest score and the first distance
        whose target falls inside the essay is kept. Node labels use a plain
        argmax. When ``coerce_tree`` is True and the linking of an essay is
        not a tree, the linking is replaced by the result of ``run_MST``
        (maximum spanning tree over the softmax scores); the node labels are
        left as they are.

        Args:
            batch: keyword arguments forwarded to ``self.model``; its
                "seq_len" entry gives the number of sentences per essay
            coerce_tree (bool): True to force the linking to form a tree

        Returns:
            (list, list): linking distances per essay and node labels per
            essay (a tuple is returned, despite the ``np.ndarray`` annotation)
        """
        model_output = self.model(**batch)
        link_scores = tonp(model_output["pred_linking_softmax"])
        label_scores = tonp(model_output["pred_node_labelling_softmax"])

        linking_preds = []
        node_labelling_preds = []
        for es, essay_link_scores in enumerate(link_scores):
            n_sent = batch["seq_len"][es]
            essay_linking = []
            essay_labelling = []

            # s is the index of the current sentence in the essay
            for s in range(n_sent):
                # candidates ordered from the highest to the lowest score
                ranked = sorted(enumerate(essay_link_scores[s]),
                                key=lambda pair: pair[1])
                ranked.reverse()

                # constrained argmax: first candidate that stays in the essay
                # NOTE(review): if no candidate fits, pred_dist keeps the
                # value of the previous sentence (or is unbound for the
                # very first one)
                for dist_idx, _ in ranked:
                    candidate = self.dist_idx_to_dist(dist_idx)
                    if 0 <= candidate + s <= n_sent - 1:
                        pred_dist = candidate
                        break

                # plain argmax for the node label
                pred_label = self.component_idx_to_label(
                    np.argmax(label_scores[es][s]))

                essay_linking.append(pred_dist)
                essay_labelling.append(pred_label)

            # repair the linking only when requested and when it is not a tree
            not_a_tree = not TreeBuilder(essay_linking).is_tree()
            if not_a_tree and (coerce_tree == True):
                # rows[i][j] holds the score of sentence i linking to sentence j
                rows = []
                for s in range(n_sent):
                    row = [0] * n_sent
                    for dist_idx, score in enumerate(essay_link_scores[s]):
                        target = self.dist_idx_to_dist(dist_idx) + s
                        if 0 <= target <= n_sent - 1:
                            row[target] = score
                    rows.append(row)

                # MAXIMUM spanning tree: a higher softmax score is better
                attn_matrix = np.array(rows)
                essay_linking = run_MST(get_rank_order(attn_matrix),
                                        attn_matrix,
                                        verdict="max")

            linking_preds.append(essay_linking)
            node_labelling_preds.append(essay_labelling)

        return linking_preds, node_labelling_preds
Esempio n. 8
0
    def _unpack_model_batch_prediction(self,
                                       batch,
                                       coerce_tree=False) -> np.ndarray:
        """
        Decode the softmax linking predictions of one batch

        Each sentence is decoded with a constrained argmax: candidates are
        visited from the highest to the lowest score and the first distance
        whose target lies inside the essay wins. When ``coerce_tree`` is True
        and the decoded essay is not a tree, the result is replaced by the
        output of ``run_MST`` (maximum spanning tree over the softmax scores).

        Args:
            batch: keyword arguments forwarded to ``self.model``; its
                "seq_len" entry gives the number of sentences per essay
            coerce_tree (bool): True to force the output to form a tree

        Returns:
            list with one distance sequence per essay (a plain list is
            returned, even though the annotation says ``np.ndarray``)
        """
        softmax_scores = tonp(self.model(**batch)["pred_softmax"])

        batch_interpretation = []
        for es in range(len(softmax_scores)):
            essay_scores = softmax_scores[es]
            n_sent = batch["seq_len"][es]

            essay_interpretation = []
            for s in range(n_sent):  # s is the index of the current sentence
                ascending = sorted(enumerate(essay_scores[s]),
                                   key=lambda item: item[1])

                # constrained argmax, best score first
                # NOTE(review): if no candidate fits, pred_dist keeps the
                # value of the previous sentence (or is unbound for the
                # very first one)
                for dist_idx, _score in ascending[::-1]:
                    shift = self.dist_idx_to_dist(dist_idx)
                    if 0 <= shift + s <= n_sent - 1:
                        pred_dist = shift
                        break

                essay_interpretation.append(pred_dist)

            # repair only when requested and when the output is not a tree
            is_broken = not TreeBuilder(essay_interpretation).is_tree()
            if is_broken and (coerce_tree == True):
                # element [i][j] is the score of sentence i linking to
                # sentence j (j as the target)
                score_rows = []
                for s in range(n_sent):
                    targets_row = [0] * n_sent
                    for dist_idx, score in enumerate(essay_scores[s]):
                        target_pos = self.dist_idx_to_dist(dist_idx) + s
                        if 0 <= target_pos <= n_sent - 1:
                            targets_row[target_pos] = score
                    score_rows.append(targets_row)

                # MAXIMUM spanning tree, because a higher score is better
                attn_matrix = np.array(score_rows)
                essay_interpretation = run_MST(get_rank_order(attn_matrix),
                                               attn_matrix,
                                               verdict="max")

            batch_interpretation.append(essay_interpretation)

        return batch_interpretation
Esempio n. 9
0
        mc_r, non_mc_rep = essay.text_repair_stats()
        mc_repair += mc_r
        non_mc_repair += non_mc_rep

        # to get non_AC samples
        non_ac_samples.extend(essay.get_non_ACS("original", False))

        # about tree structure
        if args.original_order:
            directions_with_non_AC = essay.get_rel_distances(
                "original", include_non_arg_units=True)[0]
        else:
            directions_with_non_AC = essay.get_rel_distances(
                "reordering", include_non_arg_units=True)[0]  # reordering
        try:
            rep = TreeBuilder(
                directions_with_non_AC)  # distances between sentences
        except:
            print("Distance error", essay.essay_code)
        depth, leaf_ratio = rep.tree_depth_and_leaf_proportion()
        leaf_prop.append(leaf_ratio)

    print("> Corpus", directory)
    print("> items", len(essays))
    print("> Common Stats")
    print("  \t\t\t \tsum \tmax \tmin \tavg \tstdev")
    print_stats("# Sentences\t", n_sentences)
    print_stats("# Tokens\t", n_tokens)
    print_stats("# Arg. components", n_ACs)
    print_stats("# Non-arg. comp.", n_non_ACs)

    print("> Relations")
Esempio n. 10
0
    def run(self, ybo, product, version, release, variant="", bugurl="",
            isfinal=False, workdir=None, outputdir=None, buildarch=None, volid=None,
            domacboot=False, doupgrade=True, remove_temp=False,
            installpkgs=None,
            ssss=None,
            size=2,
            add_templates=None,
            add_template_vars=None,
            add_arch_templates=None,
            add_arch_template_vars=None,
            template_tempdir=None):
        """
        Install the runtime packages, create the runtime image and build the
        output tree with its boot images.

        The method terminates the process with ``sys.exit(1)`` when a
        precondition fails: hfsplus-tools missing while ``domacboot`` is set,
        no root privileges, selinux enabled and enforcing, ``ybo`` not being a
        ``yum.YumBase`` instance, or a volume id longer than 32 characters.

        Args:
            ybo: yum base object; its ``conf.installroot`` is the install root
            product, version, release, variant, bugurl, isfinal: product
                metadata, bundled into a ``DataHolder`` stored as ``self.product``
            workdir: work directory; a temporary directory is created if unset
            outputdir: output directory; a temporary directory is created if unset
            buildarch: build architecture; taken from ``get_buildarch(ybo)`` if unset
            volid: ISO volume label; built from the product name, version and
                base architecture if unset
            domacboot: forwarded to ``TreeBuilder``; requires hfsplus-tools
            doupgrade: also rebuild the initrds with the "upgrade" prefix
            remove_temp: remove the work directory at the end
            installpkgs: extra packages forwarded to ``RuntimeBuilder``
            ssss: forwarded unchanged to ``TreeBuilder``
            size: not referenced in this method body
            add_templates, add_template_vars: forwarded to ``RuntimeBuilder``
            add_arch_templates, add_arch_template_vars: forwarded to ``TreeBuilder``
            template_tempdir: not referenced in this method body
        """

        assert self._configured

        installpkgs = installpkgs or []

        # get lorax version
        try:
            import pylorax.version
        except ImportError:
            vernum = "devel"
        else:
            vernum = pylorax.version.num

        # mac boot images need hfsplus-tools; bail out early if it is missing
        if domacboot:
            try:
                runcmd(["rpm", "-q", "hfsplus-tools"])
            except CalledProcessError:
                logger.critical("you need to install hfsplus-tools to create mac images")
                sys.exit(1)

        # set up work directory
        self.workdir = workdir or tempfile.mkdtemp(prefix="pylorax.work.")
        if not os.path.isdir(self.workdir):
            os.makedirs(self.workdir)

        # set up log directory
        logdir = '/var/log/lorax'
        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        self.init_stream_logging()
        self.init_file_logging(logdir)

        logger.debug("version is {0}".format(vernum))
        logger.debug("using work directory {0.workdir}".format(self))
        logger.debug("using log directory {0}".format(logdir))

        # set up output directory
        self.outputdir = outputdir or tempfile.mkdtemp(prefix="pylorax.out.")
        if not os.path.isdir(self.outputdir):
            os.makedirs(self.outputdir)
        logger.debug("using output directory {0.outputdir}".format(self))

        # do we have root privileges?
        logger.info("checking for root privileges")
        if not os.geteuid() == 0:
            logger.critical("no root privileges")
            sys.exit(1)

        # is selinux disabled?
        # With selinux in enforcing mode the rpcbind package required for
        # dracut nfs module, which is in turn required by anaconda module,
        # will not get installed, because its preinstall scriptlet fails,
        # resulting in an incomplete initial ramdisk image.
        # The reason is that the scriptlet runs tools from the shadow-utils
        # package in chroot, particularly groupadd and useradd to add the
        # required rpc group and rpc user. This operation fails, because
        # the selinux context on files in the chroot, that the shadow-utils
        # tools need to access (/etc/group, /etc/passwd, /etc/shadow etc.),
        # is wrong and selinux therefore disallows access to these files.
        logger.info("checking the selinux mode")
        if selinux.is_selinux_enabled() and selinux.security_getenforce():
            logger.critical("selinux must be disabled or in Permissive mode")
            sys.exit(1)

        # do we have a proper yum base object?
        logger.info("checking yum base object")
        if not isinstance(ybo, yum.YumBase):
            logger.critical("no yum base object")
            sys.exit(1)
        self.inroot = ybo.conf.installroot
        logger.debug("using install root: {0}".format(self.inroot))

        # fall back to the architecture derived from the yum base object
        if not buildarch:
            buildarch = get_buildarch(ybo)

        logger.info("setting up build architecture")
        self.arch = ArchData(buildarch)
        for attr in ('buildarch', 'basearch', 'libdir'):
            logger.debug("self.arch.%s = %s", attr, getattr(self.arch,attr))

        logger.info("setting up build parameters")
        # from here on "product" is the DataHolder, not the name string
        product = DataHolder(name=product, version=version, release=release,
                             variant=variant, bugurl=bugurl, isfinal=isfinal)
        self.product = product
        logger.debug("product data: %s" % product)

        # NOTE: if you change isolabel, you need to change pungi to match, or
        # the pungi images won't boot.
        isolabel = volid or "{0.name} {0.version} {1.basearch}".format(self.product,
                                                                       self.arch)

        if len(isolabel) > 32:
            logger.fatal("the volume id cannot be longer than 32 characters")
            sys.exit(1)

        templatedir = self.conf.get("lorax", "sharedir")
        # NOTE: rb.root = ybo.conf.installroot (== self.inroot)
        rb = RuntimeBuilder(product=self.product, arch=self.arch,
                            yum=ybo, templatedir=templatedir,
                            installpkgs=installpkgs,
                            add_templates=add_templates,
                            add_template_vars=add_template_vars)

        logger.info("installing runtime packages")
        rb.yum.conf.skip_broken = self.conf.getboolean("yum", "skipbroken")
        rb.install()

        # write .buildstamp
        buildstamp = BuildStamp(self.product.name, self.product.version,
                                self.product.bugurl, self.product.isfinal, self.arch.buildarch)

        buildstamp.write(joinpaths(self.inroot, ".buildstamp"))

        # in debug mode, record the package lists and sizes before cleanup
        if self.debug:
            rb.writepkglists(joinpaths(logdir, "pkglists"))
            rb.writepkgsizes(joinpaths(logdir, "original-pkgsizes.txt"))

        logger.info("doing post-install configuration")
        rb.postinstall()

        # write .discinfo
        discinfo = DiscInfo(self.product.release, self.arch.basearch)
        discinfo.write(joinpaths(self.outputdir, ".discinfo"))

        logger.info("backing up installroot")
        installroot = joinpaths(self.workdir, "installroot")
        linktree(self.inroot, installroot)

        logger.info("generating kernel module metadata")
        rb.generate_module_data()

        logger.info("cleaning unneeded files")
        rb.cleanup()

        # in debug mode, record the package sizes again after cleanup
        if self.debug:
            rb.writepkgsizes(joinpaths(logdir, "final-pkgsizes.txt"))

        logger.info("creating the runtime image")
        runtime = "images/install.img"
        compression = self.conf.get("compression", "type")
        compressargs = self.conf.get("compression", "args").split()
        # add the BCJ filter arguments only when the architecture defines one
        if self.conf.getboolean("compression", "bcj"):
            if self.arch.bcj:
                compressargs += ["-Xbcj", self.arch.bcj]
            else:
                logger.info("no BCJ filter for arch %s", self.arch.basearch)
        rb.create_runtime(joinpaths(installroot,runtime),
                          compression=compression, compressargs=compressargs)

        logger.info("preparing to build output tree and boot images")
        # the tree is built from the backup copy of the install root
        treebuilder = TreeBuilder(product=self.product, arch=self.arch,
                                  inroot=installroot, outroot=self.outputdir, ssss=ssss,
                                  runtime=runtime, isolabel=isolabel,
                                  domacboot=domacboot, doupgrade=doupgrade,
                                  templatedir=templatedir,
                                  add_templates=add_arch_templates,
                                  add_template_vars=add_arch_template_vars,
                                  workdir=self.workdir)

        logger.info("rebuilding initramfs images")
        dracut_args = ["--xz", "--install", "/.buildstamp", "--no-early-microcode"]
        # anaconda_args is a separate list, so later changes to dracut_args
        # do not leak into it
        anaconda_args = dracut_args + ["--add", "anaconda pollcdrom"]

        # ppc64 cannot boot an initrd > 32MiB so remove some drivers
        if self.arch.basearch in ("ppc64", "ppc64le"):
            dracut_args.extend(["--omit-drivers", REMOVE_PPC64_DRIVERS])

            # Only omit dracut modules from the initrd so that they're kept for
            # upgrade.img
            anaconda_args.extend(["--omit", REMOVE_PPC64_MODULES])

        treebuilder.rebuild_initrds(add_args=anaconda_args)

        if doupgrade:
            # Build upgrade.img. It'd be nice if these could coexist in the same
            # image, but that would increase the size of the anaconda initramfs,
            # which worries some people (esp. PPC tftpboot). So they're separate.
            try:
                # If possible, use the 'redhat-upgrade-tool' plymouth theme
                themes = runcmd_output(['plymouth-set-default-theme', '--list'],
                                       root=installroot)
                if 'redhat-upgrade-tool' in themes.splitlines():
                    os.environ['PLYMOUTH_THEME_NAME'] = 'redhat-upgrade-tool'
            except RuntimeError:
                # best effort: a RuntimeError while listing themes is ignored
                pass
            upgrade_args = dracut_args + ["--add", "system-upgrade convertfs"]
            treebuilder.rebuild_initrds(add_args=upgrade_args, prefix="upgrade")

        logger.info("populating output tree and building boot images")
        treebuilder.build()

        # write .treeinfo file and we're done
        treeinfo = TreeInfo(self.product.name, self.product.version,
                            self.product.variant, self.arch.basearch)
        for section, data in treebuilder.treeinfo_data.items():
            treeinfo.add_section(section, data)
        treeinfo.write(joinpaths(self.outputdir, ".treeinfo"))

        # cleanup
        if remove_temp:
            remove(self.workdir)
Esempio n. 11
0
        print("  > ", it + 1, essay.essay_code, vectors.shape)

        # assertion to check whether we have included non-arg-units here
        assert (len(sentences) == len(rel_distances))
        assert (len(sentences) == len(rel_labels))

        # determine where to save the file
        if args.split:
            split_folder = check_train_or_test(split_info, essay.essay_code)
            assert (split_folder != None)
            split_folder = split_folder + "/"
        else:
            split_folder = ""  # no split information provided

        # component labels
        rep = TreeBuilder(rel_distances)
        component_labels = rep.auto_component_labels(AC_breakdown=True)

        # save to file
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".sentences", sentences)
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".vectors", vectors.tolist())
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".rel_distances", rel_distances)
        # save_content_to_file(args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".rel_labels", rel_labels)
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +