def step_assigner(self):
    """Run the assigner and store its result; dump artefacts when logging.

    When the ``log`` parameter is truthy, the solver variables and
    constraints are written by ``Assigner.process`` and the assigner report,
    the assignment itself, its true facts and one drawing per dimension are
    written through ``self.logger``.
    """
    from dice.reason import Assigner

    assigner = Assigner(self, verbose=self.parameters["verbose"])
    logging_enabled = bool(self.parameters["log"])

    # Only ask the assigner to dump solver internals when logging is on.
    process_kwargs = {}
    if logging_enabled:
        process_kwargs["variables_path"] = self.logger.path(
            "gurobi_variables.tsv")
        process_kwargs["constraints_path"] = self.logger.path(
            "gurobi_constraints.tsv")
    self.set_assignment(assigner.process(**process_kwargs))

    if not logging_enabled:
        return

    assigner_report = assigner.report(
        self.logger.path("variables_usage.png"),
        self.logger.path("assignment_stats.png"),
        self.logger.path("clauses.tsv"),
        self.logger.path("clauses"),
    )
    assigner_report.save(self.logger.path("report_assigner.txt"))
    self.get_assignment().save(
        self.logger.path("assignment.tsv"),
        self.get_kb(),
    )
    self.get_assignment().log_true(
        self.logger.path("assignment.txt"),
        self.get_kb(),
        self.get_taxonomy(),
    )
    for d in Dimensions.iter():
        svg_name = "top-" + Dimensions.label(d, slug=True) + ".svg"
        self.get_assignment().draw(
            self.get_kb(),
            self.get_taxonomy(),
            d,
            self.logger.path(svg_name),
        )
def plot(self, path):
    """Draw the evidence of every dimension and every cue.

    One ``EvidenceDrawer`` is created per dimension (named after the
    dimension slug) and one per cue class (named after ``cls.name``); each
    drawer then renders its ``top`` and ``distrib`` outputs.

    ``path`` is the common prefix handed to every drawer.
    """
    # Drawers for the per-dimension evidence.
    drawers = [
        EvidenceDrawer(
            path + "-" + Dimensions.label(dimension, slug=True),
            {index: self[index][dimension]
             for index in self.inputs.get_kb()},
            Dimensions.label(dimension),
            self.inputs.get_kb(),
            self.inputs.get_taxonomy(),
        )
        for dimension in Dimensions.iter()
    ]
    # Drawers for the individual cues.
    drawers.extend(
        EvidenceDrawer(
            path + "-" + cue_class.name,
            {index: self.cues[cue_class][index]
             for index in self.inputs.get_kb()},
            cue_class.name,
            self.inputs.get_kb(),
            self.inputs.get_taxonomy(),
        )
        for cue_class in self.cues
    )
    for drawer in drawers:
        drawer.top()
        drawer.distrib()
def build(self, verbose=True):
    """Gather every cue, wrap the evidence of each fact, then normalize.

    Each key of ``self.cues`` is called to create a cue, which is stored
    back under that key and asked to ``gather`` from ``self.inputs``. Every
    index of the knowledge base then receives an ``EvidenceWrapper`` over
    the cues, and each dimension is normalized.
    """
    for cue_class in self.cues:
        if verbose:
            print("Gathering", cue_class.__name__)
        cue = cue_class()
        # Store before gathering: the joint cue is looked up in self.cues.
        self.cues[cue_class] = cue
        cue.gather(
            self.inputs,
            verbose=verbose,
            joint_cue=self.cues[JointCue],
        )

    for index in self.inputs.get_kb():
        self[index] = EvidenceWrapper(index, self.cues.values())

    for dimension in Dimensions.iter():
        if verbose:
            print("Normalizing", Dimensions.label(dimension), "evidence")
        self.normalize(dimension, verbose)
class TrackerFact(Fact):
    """A ``Fact`` extended with per-dimension tracking attributes.

    For every dimension the fact carries three fields: ``evidence``,
    ``assignment`` and ``confidence``. They are appended, in that order and
    dimension by dimension, to the tab-separated representation of the
    base ``Fact``.
    """

    # Column names: the base Fact columns followed by one
    # "<dimension-slug>_<field>" column per dimension and field.
    header = Fact.header + [
        "{dimension}_{field}".format(
            dimension=Dimensions.label(dimension, slug=True), field=field)
        for dimension in Dimensions.iter()
        for field in ["evidence", "assignment", "confidence"]
    ]

    def __init__(self):
        """Create a fact whose tracking fields are all ``None``."""
        Fact.__init__(self)
        self.attributes = {
            dimension: {
                field: None
                for field in ["evidence", "assignment", "confidence"]
            }
            for dimension in Dimensions.iter()
        }

    @staticmethod
    def from_fact(fact):
        """Return a new ``TrackerFact`` copying the base fields of ``fact``.

        Declared static so that it works both as ``TrackerFact.from_fact(f)``
        and when reached through an instance.
        """
        tracker_fact = TrackerFact()
        tracker_fact.index = fact.index
        tracker_fact.subject = fact.subject
        tracker_fact.property = fact.property
        tracker_fact.modality = fact.modality
        tracker_fact.score = fact.score
        tracker_fact.text = fact.text
        tracker_fact.sense = fact.sense
        tracker_fact.source = fact.source
        return tracker_fact

    def __str__(self):
        """Return the base fact line followed by the tracking fields."""
        return Fact.__str__(self) + "\t" + "\t".join(
            str(self.attributes[dimension][field])
            for dimension in Dimensions.iter()
            for field in ["evidence", "assignment", "confidence"])

    def parse(self, line):
        """Fill this fact from a tab-separated ``line``.

        The base fields are parsed by ``Fact.parse``; the tracking fields
        are read from column 8 onwards (presumably right after the base
        Fact columns -- confirm against ``Fact.header``).
        """
        Fact.parse(self, line)
        columns = line.strip().split("\t")
        position = 8
        for dimension in Dimensions.iter():
            for field in ["evidence", "assignment", "confidence"]:
                raw = columns[position]
                if field == "assignment":
                    # Booleans are serialized through str(), i.e. "True".
                    self.attributes[dimension][field] = raw == "True"
                else:
                    self.attributes[dimension][field] = float(raw)
                position += 1
def log_true(self, path, kb, taxonomy):
    """Write to ``path`` the facts whose grouped value is ``self.TRUE``.

    The file has one ``# concept`` section per key of
    ``taxonomy.relation._imap`` and, inside it, one ``## dimension``
    subsection per dimension listing ``kb[index]`` and the confidence of
    the matching variable, tab-separated.
    """
    by_index = self.group()
    # presumably maps each concept to the indices of its facts -- confirm
    concept_map = taxonomy.relation._imap
    with codecs.open(path, "w", "utf-8") as output:
        for concept in concept_map:
            output.write("# " + concept + "\n\n")
            for d in Dimensions.iter():
                output.write("## " + Dimensions.label(d) + "\n\n")
                for index in concept_map[concept]:
                    if grouped_is_true(by_index, index, d, self.TRUE):
                        line = "{}\t{}\n".format(
                            kb[index],
                            self.confidence[Variable(index, d)],
                        )
                        output.write(line)
                output.write("\n")
            output.write("\n")


def grouped_is_true(grouped, index, dimension, true_value):
    """Tell whether ``grouped[index][dimension]`` equals ``true_value``."""
    return grouped[index][dimension] == true_value
def group(self):
    """Regroup ``self.map`` by fact index.

    Returns a dict mapping each index found in ``self.map`` to a dict
    holding one value per dimension; dimensions without an entry in
    ``self.map`` keep ``self.UNKNOWN``.
    """
    grouped = {}
    for variable, value in self.map.items():
        per_dimension = grouped.get(variable.index)
        if per_dimension is None:
            # First time this index is met: start from all-unknown.
            per_dimension = {d: self.UNKNOWN for d in Dimensions.iter()}
            grouped[variable.index] = per_dimension
        per_dimension[variable.dimension] = value
    return grouped
def process(self, path, amount):
    """Sample up to ``amount`` same-subject fact pairs and write them.

    Pairs are drawn without replacement among facts sharing a subject,
    weighted by the summed absolute confidence difference over the
    dimensions, divided by the subject's fact count to the power 1.5.
    The seed used is printed, and the selection is written to ``path`` as
    a ``Pair.HEADER`` line followed by each pair's ``tsv()`` output.
    """
    seed = rd.randint(1, 2**31 - 1)

    # Fact indices grouped by subject.
    subjects = {}
    for fact in self.tracker.values():
        subjects.setdefault(fact.subject, []).append(fact.index)

    rd.seed(seed)
    print("Random seed:", seed)

    # Each unordered pair appears once, as (larger index, smaller index).
    pairs = []
    for indices in subjects.values():
        for i in indices:
            for j in indices:
                if i > j:
                    pairs.append((i, j))

    weights = []
    for i, j in pairs:
        first, second = self.tracker[i], self.tracker[j]
        group_size = len(subjects[first.subject])
        weights.append(sum([
            abs(first.attributes[d]["confidence"]
                - second.attributes[d]["confidence"]) / group_size**1.5
            for d in Dimensions.iter()
        ]))

    selection = []
    while len(selection) < amount and pairs:
        chosen = rd.choices(pairs, k=1, weights=weights)[0]
        position = pairs.index(chosen)
        selection.append(Pair(self.tracker, *chosen))
        # Drop the pair and its weight so it cannot be drawn again.
        del pairs[position]
        del weights[position]

    with open(path, "w") as output:
        output.write("\t".join(Pair.HEADER) + "\n")
        for pair in selection:
            output.write(pair.tsv())
def score(self, dimension, log=False):
    """Accumulate the loss of the annotated pairs for one dimension.

    A row of ``self.annotation`` is skipped when its value for
    ``dimension`` is closer to 3 than ``self.CONFIDENCE``, or when either
    of its facts is missing from ``self.tracker``.

    Facts are looked up under ``(source, int(index))`` -- the same key is
    used for the membership test and for the lookup, so indices read as
    strings or floats resolve consistently.

    When ``log`` is true, every pair with a non-zero loss is printed as a
    tab-separated line.

    Returns a ``(loss, count)`` tuple: the summed loss and the number of
    rows that were scored.
    """
    loss, count = 0., 0
    for _, row in self.annotation.iterrows():
        if abs(3 - row[dimension]) < self.CONFIDENCE:
            continue
        key_1 = (row["source_1"], int(row["index_1"]))
        if key_1 not in self.tracker:
            continue
        key_2 = (row["source_2"], int(row["index_2"]))
        if key_2 not in self.tracker:
            continue
        fact_1 = self.tracker[key_1]
        fact_2 = self.tracker[key_2]
        count += 1
        predicted = self.predict(fact_1, fact_2, dimension)
        gold = self.gold(row, dimension)
        this_loss = self.loss(predicted, gold)
        if log and this_loss != 0:
            print("\t".join([
                str(predicted),
                str(gold),
                str(this_loss),
                Dimensions.label(dimension),
                repr(fact_1),
                repr(fact_2),
            ]))
        loss += this_loss
    return loss, count
def report(self, filename):
    """Plot and report the number of true/false values per dimension.

    A grouped bar chart is saved to ``filename`` as PNG, and a ``Report``
    holding, for each dimension, the count of true values over the count
    of true plus false values is returned.
    """
    dimensions = list(Dimensions.iter())
    counts = {}
    for d in dimensions:
        counts[d] = {Assignment.TRUE: 0, Assignment.FALSE: 0}
    for variable, value in self.map.items():
        counts[variable.dimension][value] += 1

    trues = [counts[d][Assignment.TRUE] for d in dimensions]
    falses = [counts[d][Assignment.FALSE] for d in dimensions]
    width = .4

    def shifted(offset):
        # Bar positions, shifted by ``offset`` bar widths.
        return [i + offset * width for i in range(len(trues))]

    plt.figure(figsize=(8, 8))
    plt.bar(shifted(0), trues, width, label="True")
    plt.bar(shifted(1), falses, width, label="False")
    plt.xticks(shifted(.5), [Dimensions.label(d) for d in dimensions])
    plt.ylabel("Number of facts")
    plt.legend(loc="best")
    plt.savefig(filename, format="png")
    plt.close()

    report = Report()
    for d in dimensions:
        true_count = counts[d][Assignment.TRUE]
        false_count = counts[d][Assignment.FALSE]
        report.add_value_ratio(
            Dimensions.label(d),
            true_count,
            true_count + false_count,
        )
    return report
def __init__(self):
    """Initialise the base fact and set every tracking field to ``None``.

    ``self.attributes`` maps each dimension to a dict with the keys
    ``evidence``, ``assignment`` and ``confidence``.
    """
    Fact.__init__(self)
    fields = ("evidence", "assignment", "confidence")
    self.attributes = {}
    for dimension in Dimensions.iter():
        # A fresh dict per dimension so that the fields are not shared.
        self.attributes[dimension] = dict.fromkeys(fields)
def build_ranks(self):
    """Rank every key by decreasing evidence and decreasing confidence.

    For each dimension, ``self.evidence_rank`` and ``self.confidence_rank``
    receive the 1-based position of each key once the keys are sorted by
    the corresponding attribute, largest value first.
    """
    for dimension in Dimensions.iter():

        def by_decreasing(field):
            # Negated key: the largest value comes first.
            return sorted(
                self.keys(),
                key=lambda index: -self[index].attributes[dimension][field],
            )

        evidence_order = by_decreasing("evidence")
        confidence_order = by_decreasing("confidence")
        ranked = enumerate(zip(evidence_order, confidence_order), start=1)
        for rank, (evidence_index, confidence_index) in ranked:
            self.evidence_rank[dimension][evidence_index] = rank
            self.confidence_rank[dimension][confidence_index] = rank
def parse(self, line):
    """Fill this fact from a tab-separated ``line``.

    The base fields are handled by ``Fact.parse``. The tracking fields are
    then read from column 8 onwards, three per dimension: ``assignment`` is
    true when the cell is exactly ``"True"``, the two other fields are
    converted with ``float``.
    """
    Fact.parse(self, line)
    columns = line.strip().split("\t")
    position = 8
    for dimension in Dimensions.iter():
        for field in ("evidence", "assignment", "confidence"):
            raw = columns[position]
            if field == "assignment":
                parsed = raw == "True"
            else:
                parsed = float(raw)
            self.attributes[dimension][field] = parsed
            position += 1
def build(self, inputs):
    """Populate the tracker from the detective and assignment of ``inputs``.

    For every dimension and every fact of the knowledge base whose
    variable appears in ``assignment.map``, a ``TrackerFact`` is stored
    under the fact index (if not already there) and its ``evidence``,
    ``assignment`` and ``confidence`` fields are filled. Ranks are built
    at the end.
    """
    detective = inputs.get_detective()
    assignment = inputs.get_assignment()
    for dimension in Dimensions.iter():
        for fact in inputs.get_kb().values():
            variable = Variable(fact.index, dimension)
            if variable not in assignment.map:
                continue
            self.setdefault(fact.index, TrackerFact.from_fact(fact))
            tracked = self[fact.index].attributes[dimension]
            tracked["evidence"] = detective[fact.index][dimension]
            tracked["assignment"] = (
                assignment.map[variable] == Assignment.TRUE)
            tracked["confidence"] = assignment.confidence[variable]
    self.build_ranks()
def ground(self):
    """Add one clause per dimension for every similar pair in a concept.

    For each concept of the grounder, the similarity matrix is restricted
    to the properties of the concept's facts; every non-zero entry
    ``(i, j)`` yields, for each dimension, a call to ``self.add`` linking
    the variables of fact ``i`` and fact ``j`` with the entry as
    ``similarity_weight`` and a ``taxonomy_weight`` of 1.
    """
    kb = self.grounder.inputs.get_kb()
    similarity = self.grounder.inputs.get_similarity_matrix()
    for concept in self.grounder.concepts:
        # Position in this list == row/column in the submatrix below.
        fact_indices = list(self.grounder.concepts[concept])
        property_rows = [
            similarity.index[kb[index].property] for index in fact_indices
        ]
        submatrix = similarity.matrix[property_rows][:, property_rows]
        for i, j in zip(*submatrix.nonzero()):
            weight = submatrix[i, j]
            for dimension in Dimensions.iter():
                self.add(
                    [Variable(fact_indices[i], dimension)],
                    [Variable(fact_indices[j], dimension)],
                    similarity_weight=weight,
                    taxonomy_weight=1.,
                )
def pair_evaluator(argv):
    """pair_evaluator
    arguments:  <annotation-file> <feature> <confidence> <tracker-file>+

    Prints, per dimension and overall, ``1 - mae`` rounded to two digits
    together with the number of evaluated pairs.
    """
    from dice.evaluation import Tracker, PairEvaluator
    from dice.constants import Dimensions

    annotation_file, feature, confidence = argv[:3]
    tracker_files = argv[3:]

    # Class-level settings must be in place before the evaluator is built.
    PairEvaluator.FEATURE = feature
    PairEvaluator.CONFIDENCE = float(confidence)
    trackers = [Tracker(f) for f in tracker_files]
    evaluator = PairEvaluator(annotation_file, *trackers)

    print(" " * 8 + "\t ppref\t size")
    for dimension, results in evaluator.evaluate(details=True).items():
        ppref = round(1 - results["mae"], 2)
        if dimension == -1:
            # -1 is the key of the aggregated results.
            print("Overall \t", ppref, "\t", results["n"])
        else:
            print(Dimensions.label(dimension), "\t", ppref, "\t",
                  results["n"])
def evaluate(self, details=False, log=False):
    """Compute the mean loss per dimension and overall.

    ``self.score`` is called for every dimension. A dimension without any
    scored pair gets a mean of 0 and is reported with ``n == 0``; it does
    not contribute to the overall pair count, so it cannot dilute the
    overall mean.

    Returns the overall mean loss, or, when ``details`` is true, a dict
    mapping each dimension (and ``-1`` for the overall figures) to a dict
    with the keys ``"mae"`` and ``"n"``.
    """
    results = dict()
    total_loss, total_count = 0., 0
    for dimension in Dimensions.iter():
        loss, count = self.score(dimension, log)
        results[dimension] = {
            # Guard the division only; keep the real count in the report.
            "mae": loss / count if count else 0.,
            "n": count,
        }
        total_loss += loss
        total_count += count
    overall = total_loss / total_count if total_count else 0.
    results[-1] = {
        "mae": overall,
        "n": total_count,
    }
    if details:
        return results
    return overall
def __init__(self, path=None):
    """Create empty per-dimension rank tables, then initialise the base.

    The rank tables are created before ``KnowledgeBase.__init__`` runs,
    which receives ``path`` unchanged.
    """
    self.evidence_rank = {}
    self.confidence_rank = {}
    for d in Dimensions.iter():
        self.evidence_rank[d] = dict()
        self.confidence_rank[d] = dict()
    KnowledgeBase.__init__(self, path)
def _ground(self, fact):
    """Add one clause per dimension for ``fact``.

    Each clause is passed to ``self.add`` as a first list holding the
    variable of ``fact`` for the dimension and an empty second list.
    """
    for dimension in Dimensions.iter():
        first_literals = [Variable(fact, dimension)]
        self.add(first_literals, [])
def __str__(self):
    """Return the base fact line followed by the tracking fields.

    The fields are appended tab-separated, dimension by dimension, in the
    order ``evidence``, ``assignment``, ``confidence``.
    """
    base = Fact.__str__(self)
    cells = []
    for dimension in Dimensions.iter():
        values = self.attributes[dimension]
        for field in ("evidence", "assignment", "confidence"):
            cells.append(str(values[field]))
    return base + "\t" + "\t".join(cells)
def _ground(self, fact_x, fact_y, similarity_weight):
    """Add one clause per dimension linking ``fact_x`` and ``fact_y``.

    Each clause is passed to ``self.add`` with the variable of ``fact_x``
    in the first list, the variable of ``fact_y`` in the second list, the
    given ``similarity_weight`` and a ``taxonomy_weight`` of 1.
    """
    for dimension in Dimensions.iter():
        variable_x = Variable(fact_x, dimension)
        variable_y = Variable(fact_y, dimension)
        self.add(
            [variable_x],
            [variable_y],
            similarity_weight=similarity_weight,
            taxonomy_weight=1.,
        )
def generate(self, script, remote_subject_filter=None):
    """Fill a script template with facts sampled from ``self.tracker``.

    The script may contain:

    * declarations ``{{<n>_<dimension>_<up|down>}}``: sample fact ``n``
      from the top (``up``) or bottom (``down``) tenth of the candidates
      ranked by the dimension's confidence, by ``score``, or at random;
    * bindings ``{{bind_<a>_<b>}}``: facts ``a`` and ``b`` share a subject;
    * ``{{be_<word>}}``: replacement for the leading ``"be "`` in properties
      (the first occurrence wins, default ``"be"``);
    * placeholders ``{{s_<n>}}`` / ``{{p_<n>}}``: subject / property of
      fact ``n``.

    ``remote_subject_filter``, when given, overrides any binding and
    restricts every sampled fact to that subject.

    Returns a dict with the raw ``text``, the grammar-corrected ``html``
    (newlines replaced by ``<br>``), the sampled facts under ``samples``
    and ``self.seed`` under ``seed``.

    Raises ``ValueError`` when the very first declaration has no candidate
    fact: nothing random has happened yet at that point, so retrying could
    never succeed.
    """
    declaration_regex = re.compile(
        r"{{(\d+)_(salient|typical|plausible|remarkable|random|score)"
        r"_(up|down)}}")
    placeholder_regex = re.compile(r"{{(s|p)_(\d+)}}")
    bindings_regex = re.compile(r"{{bind_(\d+)_(\d+)}}")
    bes_regex = re.compile(r"{{be_(\w+)}}")

    bindings = bindings_regex.findall(script)
    declarations = declaration_regex.findall(script)
    be_match = bes_regex.search(script)
    be_rep = be_match.group(1) if be_match else "be"

    while True:
        facts = dict()
        selected = set()
        retry = False
        for index, dimension, direction in declarations:
            # Checking if the fact is already bound to another one
            subject_filter = None
            for pair in bindings:
                if index not in set(pair):
                    continue
                index_a, index_b = pair
                if index == index_a and index_b in facts:
                    subject_filter = facts[index_b].subject
                elif index == index_b and index_a in facts:
                    subject_filter = facts[index_a].subject
            if remote_subject_filter is not None:
                subject_filter = remote_subject_filter

            # Getting the list of new facts
            source = [
                fact for fact in self.tracker.values()
                if fact.index not in selected and " " not in fact.subject
            ]

            # Applying filter if necessary
            if subject_filter is not None:
                source = [
                    fact for fact in source
                    if fact.subject == subject_filter
                ]

            # Re-do the sampling if the current one would not have enough
            # facts
            if len(source) == 0:
                if not selected:
                    # No fact has been drawn yet: the outcome does not
                    # depend on any random choice, a retry would loop
                    # forever.
                    raise ValueError(
                        "no candidate fact for declaration " + index)
                retry = True
                break

            # Sorting facts according to the correct dimension
            if dimension == "random":
                random.shuffle(source)
                ranked = source[:]
            elif dimension == "score":
                ranked = sorted(source, key=lambda fact: fact.score)
            else:
                confidence_dimension = Dimensions.from_label(dimension)
                ranked = sorted(
                    source,
                    key=lambda fact: fact.attributes[confidence_dimension][
                        "confidence"])

            # Narrowing down selection space; fall back to the single
            # extreme fact when a tenth of the candidates is empty.
            candidates = []
            if direction == "up":
                candidates = ranked[int(.9 * len(ranked)):] or ranked[-1:]
            elif direction == "down":
                candidates = ranked[:int(.1 * len(ranked))] or ranked[:1]

            # Choosing and selecting fact
            fact = random.choice(candidates)
            selected.add(fact.index)
            facts[index] = fact

        # Reaching this points ensures that all facts have been correctly
        # sampled.
        if not retry:
            break

    out = declaration_regex.sub("", script)
    out = bindings_regex.sub("", out)
    out = bes_regex.sub("", out)

    def replacer(match):
        field, index = match.group(1), match.group(2)
        if field == "s":
            return facts[index].subject
        return facts[index].property.replace("be ", be_rep + " ")

    text = placeholder_regex.sub(replacer, out).strip()

    corrected_sentences = []
    for sentence in text.split("\n"):
        matches = self.tool.check(sentence)
        corrected_sentence = language_check.correct(sentence, matches)
        corrected_sentences.append(corrected_sentence.strip() + "\n")
    corrected = "".join(corrected_sentences)

    return {
        "text": text,
        "html": re.sub("\n", "<br>", corrected),
        "samples": facts,
        "seed": self.seed
    }
def solve(self, variables_path=None, constraints_path=None, threads=12):
    """Optimize the model and turn its solution into an ``Assignment``.

    ``variables_path`` / ``constraints_path``, when given, receive a
    tab-separated dump of the attributes listed in ``Ilp.vars_attrs`` /
    ``Ilp.cstr_attrs`` for every variable / constraint of the model.
    ``threads`` is the value given to the solver ``Threads`` parameter.

    Only the variables whose name starts with one of the letters ``PTRS``
    are assigned. Their truth value is ``x >= .5``; their confidence is
    the reduced cost when it is non-zero, and otherwise
    ``x + obj + Parameters.EVIDENCE_OFFSET``.

    Returns the assignment, which is also stored in ``self.assignment``.
    """

    def attribute_row(entity, attributes):
        # One tab-terminated cell per attribute, then a newline.
        cells = []
        for attr in attributes:
            try:
                value = entity.getAttr(attr)
            except Exception:
                # Best effort: an attribute that cannot be queried for
                # this entity is left blank.
                value = ""
            cells.append(str(value) + "\t")
        return "".join(cells) + "\n"

    # Truncate the solver log file before optimizing.
    with open(self.gurobi_log_file, "w"):
        pass
    self.model.params.Threads = threads
    self.model.optimize()
    self.assignment = Assignment(self.variables)

    if constraints_path is not None:
        with open(constraints_path, "w") as f_cstr:
            f_cstr.write("\t".join(Ilp.cstr_attrs) + "\n")
            for constraint in self.model.getConstrs():
                f_cstr.write(attribute_row(constraint, Ilp.cstr_attrs))

    gurobi_vars = self.model.getVars()

    if variables_path is not None:
        with open(variables_path, "w") as f_vars:
            f_vars.write("\t".join(Ilp.vars_attrs) + "\n")
            for gurobi_var in gurobi_vars:
                f_vars.write(attribute_row(gurobi_var, Ilp.vars_attrs))

    for gurobi_var in gurobi_vars:
        letter = gurobi_var.varName[0]
        if letter not in "PTRS":
            continue
        # The fact index sits between the two-character prefix and the
        # final character of the variable name.
        index = int(gurobi_var.varName[2:-1])
        if gurobi_var.rc != 0:
            confidence = gurobi_var.rc
        else:
            confidence = (gurobi_var.x + gurobi_var.obj
                          + Parameters.EVIDENCE_OFFSET)
        self.assignment.assign(
            Variable(index, Dimensions.from_letter(letter)),
            gurobi_var.x >= .5,
            confidence,
        )
    return self.assignment
def ground(self, include_evidence_rule=True):
    """Create the variables and ground every rule into ``self.clauses``.

    One variable is appended per (fact index, dimension). Pairwise rules
    are then grounded for every non-zero entry of the similarity matrix
    restricted to the properties used in the knowledge base, on pairs of
    facts linked through the taxonomy successors of a concept, through its
    siblings, or through the concept itself. The remaining rules ground
    themselves, the evidence rule only when ``include_evidence_rule`` is
    true.

    Returns ``(self.variables, self.clauses)``.
    """
    if self.verbose:
        print("Grounding...")
    self.variables.extend(
        Variable(index, dimension)
        for index in self.kb
        for dimension in Dimensions.iter()
    )

    # Rules grounded pair by pair, instantiated once.
    concepts_rules = (SimilarityRule(self, self.clauses), )
    subconcept_rules = (
        RulePlausibilityInheritance(self, self.clauses),
        RuleTypicalityInheritance(self, self.clauses),
        RulePlausibilityInference(self, self.clauses),
        RuleRemarkabilityInheritance(self, self.clauses),
        RuleTypicalPreventsRemarkable(self, self.clauses),
        RuleNotPlausibleImpliesRemarkable(self, self.clauses),
    )
    siblings_rules = (
        RuleNotPlausibleImpliesRemarkableSiblings(self, self.clauses),
        RuleRemarkabilitySiblings(self, self.clauses),
        RuleTypicalPreventsRemarkableSiblings(self, self.clauses),
    )
    # Rules that ground themselves, instantiated when used.
    other_rules = (
        RuleSalientImpliesPlausible,
        RuleTypicalImpliesPlausible,
        RuleTypicalAndRemarkableImplySalient,
        ExistenceRule,
    )

    kb = self.inputs.get_kb()
    similarity = self.inputs.get_similarity_matrix()

    # For each fact: the facts of its own concept, of the successors of
    # its concept in the taxonomy, and of the siblings of its concept.
    concept_links = dict()
    parent_links = dict()
    siblings_links = dict()
    for concept in self.concepts:
        concept_facts = self.concepts[concept]
        for fact_x in concept_facts:
            concept_links[fact_x] = set(concept_facts)
            successor_facts = parent_links[fact_x] = set()
            sibling_facts = siblings_links[fact_x] = set()
            if concept not in self.taxonomy.nodes:
                continue
            for child in self.taxonomy.successors(concept):
                successor_facts.update(self.concepts.get(child, list()))
            for sibling in self.taxonomy.siblings(concept):
                sibling_facts.update(self.concepts.get(sibling, list()))

    # Fact indices grouped by the similarity-matrix row of their property.
    properties = dict()
    for fact in kb.values():
        row = similarity.index[fact.property]
        properties.setdefault(row, set()).add(fact.index)

    inds = list(properties)
    submatrix = similarity.matrix[inds][:, inds]
    for i, j in zip(*submatrix.nonzero()):
        similarity_weight = submatrix[i, j]
        facts_j = properties[inds[j]]
        for fact_x in properties[inds[i]]:
            for fact_y in facts_j.intersection(parent_links[fact_x]):
                for rule in subconcept_rules:
                    rule._ground(fact_x, fact_y, 1., similarity_weight)
            for fact_y in facts_j.intersection(siblings_links[fact_x]):
                for rule in siblings_rules:
                    rule._ground(fact_x, fact_y, 1., similarity_weight)
            for fact_y in facts_j.intersection(concept_links[fact_x]):
                for rule in concepts_rules:
                    rule._ground(fact_x, fact_y, similarity_weight)

    for rule_class in other_rules:
        rule_class(self, self.clauses).ground()
    if include_evidence_rule:
        EvidenceRule(self, self.clauses).ground()
    return self.variables, self.clauses