def _unpack_model_batch_prediction(self, batch, coerce_tree=False) -> np.ndarray:
    """
    Decode the model output of one batch into relative link distances.

    The model is called with the batch as keyword arguments and its
    "pred_matrix" entry is decoded essay by essay: every row is reduced
    with an argmax over the last axis, and the winning column index is
    turned into a distance by subtracting the row's own index.

    Args:
        batch (dict): keyword arguments forwarded to ``self.model``
        coerce_tree (bool): when True, an essay whose argmax decoding is
            not a tree (per ``TreeBuilder.is_tree``) is re-decoded with a
            minimum spanning tree over the rank order of its scores

    Returns:
        list: one decoded distance sequence per essay (a plain Python
        list is returned, despite the ``np.ndarray`` annotation)
    """
    pred_matrix = self.model(**batch)["pred_matrix"]

    decoded_batch = []
    for essay_idx in range(len(pred_matrix)):
        # plain argmax decoding: best-scoring target for every source row
        best_targets = np.argmax(tonp(pred_matrix[essay_idx]), axis=-1)
        distances = [
            target - position for position, target in enumerate(best_targets)
        ]

        # fall back to a spanning tree only when asked to and when needed
        structure = TreeBuilder(distances)
        if not structure.is_tree() and coerce_tree == True:
            scores = np.array(tonp(pred_matrix[essay_idx]))
            ranks = get_rank_order(scores)
            # the rank itself is the edge weight; a lower rank is better,
            # hence the MINIMUM spanning tree
            distances = run_MST(ranks, ranks, verdict="min")

        decoded_batch.append(distances)

    return decoded_batch
def save_tsv(filename, essay_code, sentences, dist, label_preds):
    """
    Write the predicted structure of one essay as a tab-separated file.

    One row is written per sentence with the columns
    "essay code", "unit id", "text", "target", "relation", "drop_flag".
    Unit ids and targets are 1-based. Sentences labelled "non-arg comp."
    get empty target/relation and drop_flag "TRUE"; a sentence pointing to
    itself (the root) gets empty target/relation and drop_flag "FALSE".

    Args:
        filename (str): path of the file to (over)write
        essay_code (str): value of the first column of every row
        sentences (list[str]): sentence texts, in order
        dist (list[int]): relative distance from each sentence to its target
        label_preds (list[str]): relation labels, one per non-root
            argumentative sentence, consumed in sentence order

    Raises:
        AssertionError: if not every entry of ``label_preds`` was consumed
    """
    header = ["essay code", "unit id", "text", "target", "relation", "drop_flag"]
    rep = TreeBuilder(dist)
    component_labels = rep.auto_component_labels(AC_breakdown=True)

    label_idx = 0
    # The context manager closes the file even when a row fails to build
    # (e.g. label_preds is too short), which the manual close() did not.
    with open(filename, "w") as f:
        f.write("\t".join(header) + "\n")
        for i, sentence in enumerate(sentences):
            unit_id = i + 1
            if component_labels[i] == "non-arg comp.":
                target_col, relation_col, drop_flag = "", "", "TRUE"
            else:
                drop_flag = "FALSE"
                target = unit_id + dist[i]
                if target == unit_id:
                    # points to itself, i.e., root: no outgoing relation
                    target_col, relation_col = "", ""
                else:
                    target_col = str(target)
                    relation_col = label_preds[label_idx]
                    label_idx += 1
            row = [essay_code, str(unit_id), sentence,
                   target_col, relation_col, drop_flag]
            f.write("\t".join(row) + "\n")

    # every predicted label must have been matched to exactly one link
    assert label_idx == len(label_preds)
def structured_output_quality(links) -> tuple:
    """
    Summarise the structural quality of predicted links.

    Component labels are inferred for every essay; depth and leaf
    proportion are only measured for essays whose links form a tree.

    Args:
        links (list): one sequence of relative distances per essay

    Returns:
        tuple: ``(component_labels, tree_ratio, avg_depth, avg_leaf_prop,
        all_depths)`` where
            component_labels (list): inferred labels, one entry per essay
            tree_ratio (float): fraction of essays that form a tree
            avg_depth (float): mean depth over the tree-forming essays
            avg_leaf_prop (float): mean leaf proportion over those essays
            all_depths (list): depth of every tree-forming essay
        The ratio is 0.0 when ``links`` is empty, and both averages are
        0.0 when no essay forms a tree (these cases used to raise
        ZeroDivisionError).
    """
    component_labels = []
    all_depths = []
    n_trees = 0
    depth_sum = 0
    leaf_prop_sum = 0

    for essay_links in links:
        rep = TreeBuilder(essay_links)
        component_labels.append(rep.auto_component_labels(AC_breakdown=True))

        # depth / leaf statistics are only evaluated on well-formed trees
        if rep.is_tree():
            n_trees += 1
            depth, leaf_prop = rep.tree_depth_and_leaf_proportion()
            depth_sum += depth
            all_depths.append(depth)
            leaf_prop_sum += leaf_prop

    n_essays = len(links)
    tree_ratio = float(n_trees) / float(n_essays) if n_essays else 0.0
    avg_depth = float(depth_sum) / float(n_trees) if n_trees else 0.0
    avg_leaf_prop = float(leaf_prop_sum) / float(n_trees) if n_trees else 0.0

    return component_labels, tree_ratio, avg_depth, avg_leaf_prop, all_depths
def f1_per_depth(dist_gold: List, dist_prediction: List, max_depth: int):
    """
    Find at which depth prediction mismatches happen (when the output forms a tree)

    Node depths of gold and predicted structures are compared as a
    classification problem; essays whose prediction is not a tree are
    left out. A text report is printed as a side effect.

    Args:
        dist_gold (List): gold answer per essay
        dist_prediction (List): predicted answer per essay
        max_depth (int): max structure depth in the dataset

    Returns:
        list: F1 score for each depth ``0 .. max_depth - 1``; 0.0 for a
        depth that does not appear in the classification report
    """
    gold_all_depth = []
    pred_all_depth = []
    for i, gold in enumerate(dist_gold):
        rep_gold = TreeBuilder(gold)
        rep_pred = TreeBuilder(dist_prediction[i])

        # depths are only comparable when the prediction is a tree
        if rep_pred.is_tree():
            gold_all_depth.append(rep_gold.node_depths())
            pred_all_depth.append(rep_pred.node_depths())

    gold_all_depth_flat = flatten_list(gold_all_depth)
    pred_all_depth_flat = flatten_list(pred_all_depth)

    print("=== Depth prediction performance when output forms a tree ===")
    print(
        classification_report(y_true=gold_all_depth_flat,
                              y_pred=pred_all_depth_flat,
                              digits=3))
    report = classification_report(y_true=gold_all_depth_flat,
                                   y_pred=pred_all_depth_flat,
                                   output_dict=True)

    f1s = []
    for depth in range(max_depth):
        try:
            # report keys are the stringified depth labels
            f1s.append(report[str(depth)]['f1-score'])
        except KeyError:
            # depth absent from both gold and prediction
            f1s.append(0.0)
    return f1s
def __compute_loss(self, s_arc, rel_dists, seq_len):
    """
    Compute loss (average of essay-level loss)

    For every essay the padded positions are cut off, the gold distance
    indices are mapped back to distances through the "rel_dist_labels"
    vocabulary namespace, and the gold target index of each sentence is
    read off the adjacency matrix built by ``TreeBuilder``.

    Args:
        s_arc (torch.Tensor): arc scores, indexed as
            ``s_arc[b, :seq_len[b], :seq_len[b]]``
        rel_dists (torch.Tensor): gold distance label indices per essay
        seq_len (Any): number of real (non-padded) sentences per essay

    Returns:
        (list[torch.Tensor], torch.Tensor): the un-padded score matrix of
        every essay, and the mean of the essay-level losses
    """

    def dist_idx_to_dist(idx):
        return int(
            self.vocab.get_token_from_index(int(idx),
                                            namespace="rel_dist_labels"))

    pred_ans = []
    essay_losses = []
    for b in range(len(rel_dists)):
        n_sentences = seq_len[b]

        # gold answer: index of the correct target for every sentence
        gold_dists = [
            dist_idx_to_dist(x) for x in rel_dists[b][:n_sentences].tolist()
        ]
        gold_matrix = torch.Tensor(TreeBuilder(gold_dists).adj_matrix)
        # .to() honours the device index of self.torch_device, whereas
        # .cuda() always picks the current default CUDA device
        target = torch.argmax(gold_matrix, dim=-1).to(self.torch_device)

        # prediction: scores restricted to the real sentences
        essay_scores = s_arc[b, :n_sentences, :n_sentences]
        pred_ans.append(essay_scores)

        # loss per essay
        essay_losses.append(self.loss(essay_scores, target))

    # loss per batch
    avg_loss = torch.mean(torch.stack(essay_losses))
    return pred_ans, avg_loss
def create_pairwise_data(sentences, dist):
    """
    Build (source, target) sentence pairs for link labelling.

    A pair is produced for every sentence that is not labelled
    "non-arg comp." and that does not point to itself (i.e., is not a
    root); the target is the sentence ``dist[i]`` positions away.

    Args:
        sentences (list[str])
        dist (list[int])

    Returns:
        list[tuple(str,str)]
    """
    labels = TreeBuilder(dist).auto_component_labels(AC_breakdown=True)

    pairs = []
    for idx, source in enumerate(sentences):
        if labels[idx] == "non-arg comp.":
            continue
        target_idx = idx + dist[idx]
        if target_idx == idx:
            # self-pointing sentence is the root: nothing to pair it with
            continue
        pairs.append((source, sentences[target_idx]))
    return pairs
def _unpack_model_batch_prediction(self, batch, coerce_tree=False) -> np.ndarray:
    """
    Decode linking and node-labelling predictions for one batch.

    Linking uses a constrained argmax: distance labels are tried from the
    highest to the lowest softmax value and the first one whose target
    falls inside the essay is kept. Node labels use a plain argmax.

    Args:
        batch (dict): keyword arguments forwarded to ``self.model``; its
            "seq_len" entry gives the number of sentences per essay
        coerce_tree (bool): when True, an essay whose decoded links are
            not a tree is re-decoded with a maximum spanning tree

    Returns:
        (list, list): per-essay link distances and per-essay node labels
        (a tuple of lists is returned, despite the ``np.ndarray``
        annotation)
    """
    out_dict = self.model(**batch)
    link_probs = tonp(out_dict["pred_linking_softmax"])
    label_probs = tonp(out_dict["pred_node_labelling_softmax"])

    linking_preds = []
    node_labelling_preds = []
    for es in range(len(link_probs)):
        n_sents = batch["seq_len"][es]
        essay_linking = []
        essay_labelling = []

        for s in range(n_sents):  # s is the index of the current sentence
            # constrained argmax for linking: best label first
            ranking = sorted(enumerate(link_probs[es][s]),
                             key=lambda pair: pair[1])[::-1]
            # NOTE(review): if no label lands inside the essay, pred_dist
            # keeps the value of the previous sentence (or is unbound for
            # the very first one) -- confirm this cannot happen
            for dist_idx, _ in ranking:
                candidate = self.dist_idx_to_dist(dist_idx)
                if 0 <= candidate + s <= n_sents - 1:
                    pred_dist = candidate
                    break

            # unconstrained argmax for labelling
            pred_label = self.component_idx_to_label(
                np.argmax(label_probs[es][s]))

            # essay-level result
            essay_linking.append(pred_dist)
            essay_labelling.append(pred_label)

        # fall back to a spanning tree when the links are not a tree
        if not TreeBuilder(essay_linking).is_tree() and coerce_tree == True:
            # element [i, j] is the probability that sentence i links to
            # sentence j (j as the target); unreachable targets stay 0
            attn_matrix = []
            for s in range(n_sents):
                row = [0] * n_sents
                for dist_idx, prob in enumerate(link_probs[es][s]):
                    target = self.dist_idx_to_dist(dist_idx) + s
                    if 0 <= target <= n_sents - 1:
                        row[target] = prob
                attn_matrix.append(row)

            # softmax probability is the weight, higher is better, hence
            # the MAXIMUM spanning tree
            attn_matrix = np.array(attn_matrix)
            essay_linking = run_MST(get_rank_order(attn_matrix),
                                    attn_matrix,
                                    verdict="max")

        # batch-level result
        linking_preds.append(essay_linking)
        node_labelling_preds.append(essay_labelling)

    return linking_preds, node_labelling_preds
def _unpack_model_batch_prediction(self, batch, coerce_tree=False) -> np.ndarray:
    """
    Decode the distance predictions of one batch.

    Every sentence gets the distance label with the highest softmax value
    among those whose target falls inside the essay (constrained argmax).

    Args:
        batch (dict): keyword arguments forwarded to ``self.model``; its
            "seq_len" entry gives the number of sentences per essay
        coerce_tree (bool): when True, an essay whose decoded distances
            are not a tree is re-decoded with a maximum spanning tree

    Returns:
        list: one decoded distance sequence per essay (a plain Python
        list is returned, despite the ``np.ndarray`` annotation)
    """
    probs = tonp(self.model(**batch)["pred_softmax"])

    decoded_batch = []
    for es in range(len(probs)):
        n_sents = batch["seq_len"][es]

        distances = []
        for s in range(n_sents):  # s is the index of the current sentence
            # labels ordered from most to least probable
            ranking = sorted(enumerate(probs[es][s]),
                             key=lambda pair: pair[1])[::-1]
            # NOTE(review): if no label lands inside the essay, pred_dist
            # keeps the value of the previous sentence (or is unbound for
            # the very first one) -- confirm this cannot happen
            for dist_idx, _ in ranking:
                candidate = self.dist_idx_to_dist(dist_idx)
                if 0 <= candidate + s <= n_sents - 1:
                    pred_dist = candidate
                    break
            distances.append(pred_dist)

        # fall back to a spanning tree when the output is not a tree
        if not TreeBuilder(distances).is_tree() and coerce_tree == True:
            # element [i, j] is the probability that sentence i links to
            # sentence j (j as the target); unreachable targets stay 0
            attn_matrix = []
            for s in range(n_sents):
                row = [0] * n_sents
                for dist_idx, prob in enumerate(probs[es][s]):
                    target = self.dist_idx_to_dist(dist_idx) + s
                    if 0 <= target <= n_sents - 1:
                        row[target] = prob
                attn_matrix.append(row)

            # softmax probability is the weight, higher is better, hence
            # the MAXIMUM spanning tree
            attn_matrix = np.array(attn_matrix)
            distances = run_MST(get_rank_order(attn_matrix),
                                attn_matrix,
                                verdict="max")

        decoded_batch.append(distances)

    return decoded_batch
mc_r, non_mc_rep = essay.text_repair_stats() mc_repair += mc_r non_mc_repair += non_mc_rep # to get non_AC samples non_ac_samples.extend(essay.get_non_ACS("original", False)) # about tree structure if args.original_order: directions_with_non_AC = essay.get_rel_distances( "original", include_non_arg_units=True)[0] else: directions_with_non_AC = essay.get_rel_distances( "reordering", include_non_arg_units=True)[0] # reordering try: rep = TreeBuilder( directions_with_non_AC) # distances between sentences except: print("Distance error", essay.essay_code) depth, leaf_ratio = rep.tree_depth_and_leaf_proportion() leaf_prop.append(leaf_ratio) print("> Corpus", directory) print("> items", len(essays)) print("> Common Stats") print(" \t\t\t \tsum \tmax \tmin \tavg \tstdev") print_stats("# Sentences\t", n_sentences) print_stats("# Tokens\t", n_tokens) print_stats("# Arg. components", n_ACs) print_stats("# Non-arg. comp.", n_non_ACs) print("> Relations")
def run(self, ybo, product, version, release, variant="", bugurl="",
        isfinal=False, workdir=None, outputdir=None, buildarch=None,
        volid=None, domacboot=False, doupgrade=True, remove_temp=False,
        installpkgs=None, ssss=None, size=2, add_templates=None,
        add_template_vars=None, add_arch_templates=None,
        add_arch_template_vars=None, template_tempdir=None):
    """
    Build the runtime image, initramfs images and output tree.

    The steps run strictly in order: environment checks, runtime package
    installation into the yum install root, backup of that root into the
    work directory, runtime image creation, initramfs rebuilds, and
    finally the output tree with its .discinfo / .treeinfo files.

    The process is terminated with ``sys.exit(1)`` when: ``domacboot`` is
    set but the hfsplus-tools rpm is not installed, the effective user is
    not root, selinux is enabled and enforcing, ``ybo`` is not a
    ``yum.YumBase``, or the ISO label is longer than 32 characters.

    Args:
        ybo: yum base object (must be a ``yum.YumBase``); its
            ``conf.installroot`` becomes ``self.inroot``
        product, version, release, variant, bugurl, isfinal: product
            description, bundled into a ``DataHolder`` stored as
            ``self.product``
        workdir: work directory; a fresh temp dir when not given
        outputdir: output directory; a fresh temp dir when not given
        buildarch: build architecture; derived from ``ybo`` when not given
        volid: ISO volume label; defaults to
            "<name> <version> <basearch>"
        domacboot: forwarded to ``TreeBuilder``; also triggers the
            hfsplus-tools check
        doupgrade: also rebuild the initrds with the "upgrade" prefix
        remove_temp: remove the work directory at the end
        installpkgs: forwarded to ``RuntimeBuilder`` (empty list when None)
        ssss: forwarded to ``TreeBuilder`` unchanged
        size: not used in this method body
        add_templates, add_template_vars: forwarded to ``RuntimeBuilder``
        add_arch_templates, add_arch_template_vars: forwarded to
            ``TreeBuilder``
        template_tempdir: not used in this method body
    """
    assert self._configured

    installpkgs = installpkgs or []

    # get lorax version
    try:
        import pylorax.version
    except ImportError:
        vernum = "devel"
    else:
        vernum = pylorax.version.num

    # mac boot images need hfsplus-tools; query rpm for it up front
    if domacboot:
        try:
            runcmd(["rpm", "-q", "hfsplus-tools"])
        except CalledProcessError:
            logger.critical("you need to install hfsplus-tools to create mac images")
            sys.exit(1)

    # set up work directory
    self.workdir = workdir or tempfile.mkdtemp(prefix="pylorax.work.")
    if not os.path.isdir(self.workdir):
        os.makedirs(self.workdir)

    # set up log directory
    logdir = '/var/log/lorax'
    if not os.path.isdir(logdir):
        os.makedirs(logdir)

    self.init_stream_logging()
    self.init_file_logging(logdir)

    logger.debug("version is {0}".format(vernum))
    logger.debug("using work directory {0.workdir}".format(self))
    logger.debug("using log directory {0}".format(logdir))

    # set up output directory
    self.outputdir = outputdir or tempfile.mkdtemp(prefix="pylorax.out.")
    if not os.path.isdir(self.outputdir):
        os.makedirs(self.outputdir)
    logger.debug("using output directory {0.outputdir}".format(self))

    # do we have root privileges?
    logger.info("checking for root privileges")
    if not os.geteuid() == 0:
        logger.critical("no root privileges")
        sys.exit(1)

    # is selinux disabled?
    # With selinux in enforcing mode the rpcbind package required for
    # dracut nfs module, which is in turn required by anaconda module,
    # will not get installed, because its preinstall scriptlet fails,
    # resulting in an incomplete initial ramdisk image.
    # The reason is that the scriptlet runs tools from the shadow-utils
    # package in chroot, particularly groupadd and useradd to add the
    # required rpc group and rpc user. This operation fails, because
    # the selinux context on files in the chroot, that the shadow-utils
    # tools need to access (/etc/group, /etc/passwd, /etc/shadow etc.),
    # is wrong and selinux therefore disallows access to these files.
    logger.info("checking the selinux mode")
    if selinux.is_selinux_enabled() and selinux.security_getenforce():
        logger.critical("selinux must be disabled or in Permissive mode")
        sys.exit(1)

    # do we have a proper yum base object?
    logger.info("checking yum base object")
    if not isinstance(ybo, yum.YumBase):
        logger.critical("no yum base object")
        sys.exit(1)
    self.inroot = ybo.conf.installroot
    logger.debug("using install root: {0}".format(self.inroot))

    if not buildarch:
        buildarch = get_buildarch(ybo)

    logger.info("setting up build architecture")
    self.arch = ArchData(buildarch)
    for attr in ('buildarch', 'basearch', 'libdir'):
        logger.debug("self.arch.%s = %s", attr, getattr(self.arch,attr))

    logger.info("setting up build parameters")
    # from here on `product` is the DataHolder, not the name passed in
    product = DataHolder(name=product, version=version, release=release,
                         variant=variant, bugurl=bugurl, isfinal=isfinal)
    self.product = product
    logger.debug("product data: %s" % product)

    # NOTE: if you change isolabel, you need to change pungi to match, or
    # the pungi images won't boot.
    isolabel = volid or "{0.name} {0.version} {1.basearch}".format(self.product,
                                                                  self.arch)

    if len(isolabel) > 32:
        logger.fatal("the volume id cannot be longer than 32 characters")
        sys.exit(1)

    templatedir = self.conf.get("lorax", "sharedir")
    # NOTE: rb.root = ybo.conf.installroot (== self.inroot)
    rb = RuntimeBuilder(product=self.product, arch=self.arch,
                        yum=ybo, templatedir=templatedir,
                        installpkgs=installpkgs,
                        add_templates=add_templates,
                        add_template_vars=add_template_vars)

    logger.info("installing runtime packages")
    rb.yum.conf.skip_broken = self.conf.getboolean("yum", "skipbroken")
    rb.install()

    # write .buildstamp
    buildstamp = BuildStamp(self.product.name, self.product.version,
                            self.product.bugurl, self.product.isfinal,
                            self.arch.buildarch)

    buildstamp.write(joinpaths(self.inroot, ".buildstamp"))

    # package lists / sizes are only dumped in debug mode
    if self.debug:
        rb.writepkglists(joinpaths(logdir, "pkglists"))
        rb.writepkgsizes(joinpaths(logdir, "original-pkgsizes.txt"))

    logger.info("doing post-install configuration")
    rb.postinstall()

    # write .discinfo
    discinfo = DiscInfo(self.product.release, self.arch.basearch)
    discinfo.write(joinpaths(self.outputdir, ".discinfo"))

    # the copy under workdir is what the runtime image and TreeBuilder
    # read from below; it is taken before rb.cleanup() runs on the root
    logger.info("backing up installroot")
    installroot = joinpaths(self.workdir, "installroot")
    linktree(self.inroot, installroot)

    logger.info("generating kernel module metadata")
    rb.generate_module_data()

    logger.info("cleaning unneeded files")
    rb.cleanup()

    if self.debug:
        rb.writepkgsizes(joinpaths(logdir, "final-pkgsizes.txt"))

    logger.info("creating the runtime image")
    runtime = "images/install.img"
    compression = self.conf.get("compression", "type")
    compressargs = self.conf.get("compression", "args").split()
    # add the BCJ filter only when enabled in the config AND the
    # architecture data provides one
    if self.conf.getboolean("compression", "bcj"):
        if self.arch.bcj:
            compressargs += ["-Xbcj", self.arch.bcj]
        else:
            logger.info("no BCJ filter for arch %s", self.arch.basearch)
    rb.create_runtime(joinpaths(installroot,runtime),
                      compression=compression, compressargs=compressargs)

    logger.info("preparing to build output tree and boot images")
    treebuilder = TreeBuilder(product=self.product, arch=self.arch,
                              inroot=installroot, outroot=self.outputdir,
                              ssss=ssss,
                              runtime=runtime, isolabel=isolabel,
                              domacboot=domacboot, doupgrade=doupgrade,
                              templatedir=templatedir,
                              add_templates=add_arch_templates,
                              add_template_vars=add_arch_template_vars,
                              workdir=self.workdir)

    logger.info("rebuilding initramfs images")
    dracut_args = ["--xz", "--install", "/.buildstamp", "--no-early-microcode"]
    # anaconda_args is a new list, so later changes to dracut_args
    # (below) do not leak into it
    anaconda_args = dracut_args + ["--add", "anaconda pollcdrom"]

    # ppc64 cannot boot an initrd > 32MiB so remove some drivers
    if self.arch.basearch in ("ppc64", "ppc64le"):
        dracut_args.extend(["--omit-drivers", REMOVE_PPC64_DRIVERS])

        # Only omit dracut modules from the initrd so that they're kept for
        # upgrade.img
        anaconda_args.extend(["--omit", REMOVE_PPC64_MODULES])

    treebuilder.rebuild_initrds(add_args=anaconda_args)

    if doupgrade:
        # Build upgrade.img. It'd be nice if these could coexist in the same
        # image, but that would increase the size of the anaconda initramfs,
        # which worries some people (esp. PPC tftpboot). So they're separate.
        try:
            # If possible, use the 'redhat-upgrade-tool' plymouth theme
            themes = runcmd_output(['plymouth-set-default-theme', '--list'],
                                   root=installroot)
            if 'redhat-upgrade-tool' in themes.splitlines():
                os.environ['PLYMOUTH_THEME_NAME'] = 'redhat-upgrade-tool'
        except RuntimeError:
            # best effort: the theme is optional, carry on without it
            pass
        upgrade_args = dracut_args + ["--add", "system-upgrade convertfs"]
        treebuilder.rebuild_initrds(add_args=upgrade_args, prefix="upgrade")

    logger.info("populating output tree and building boot images")
    treebuilder.build()

    # write .treeinfo file and we're done
    treeinfo = TreeInfo(self.product.name, self.product.version,
                        self.product.variant, self.arch.basearch)
    for section, data in treebuilder.treeinfo_data.items():
        treeinfo.add_section(section, data)
    treeinfo.write(joinpaths(self.outputdir, ".treeinfo"))

    # cleanup
    if remove_temp:
        remove(self.workdir)
print(" > ", it + 1, essay.essay_code, vectors.shape) # assertion to check whether we have included non-arg-units here assert (len(sentences) == len(rel_distances)) assert (len(sentences) == len(rel_labels)) # determine where to save the file if args.split: split_folder = check_train_or_test(split_info, essay.essay_code) assert (split_folder != None) split_folder = split_folder + "/" else: split_folder = "" # no split information provided # component labels rep = TreeBuilder(rel_distances) component_labels = rep.auto_component_labels(AC_breakdown=True) # save to file save_content_to_file( args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".sentences", sentences) save_content_to_file( args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".vectors", vectors.tolist()) save_content_to_file( args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".rel_distances", rel_distances) # save_content_to_file(args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".rel_labels", rel_labels) save_content_to_file( args.out_dir + "linking/" + split_folder.lower() +