def printMLN(path=None):
    """Render every cluster and its argument clusters as tab-separated text.

    For each cluster index in ``Clust.clusts`` a header line
    ``<clustIdx>\\t<cluster>`` is produced, followed by three lines per
    argument cluster: the argument-number counts, the argument-type counts
    (with the type rendered through ``ArgType``), and the child-cluster
    counts.

    Args:
        path: Directory to write into.  When given, the text is written to
            ``<path>/<name>.mln`` where ``<name>`` is
            ``os.path.basename(os.path.dirname(path))``.

    Returns:
        The rendered text when ``path`` is None, otherwise None.
    """
    chunks = []
    for ci in Clust.clusts:
        cl = Clust.getClust(ci)
        # Append (never overwrite) so clusters rendered earlier are kept.
        chunks.append("{}\t{}\n".format(cl._clustIdx, cl))
        for aci in cl._argClusts:
            ac = cl._argClusts[aci]
            chunks.append("\t{}: ".format(aci))
            chunks.append("\t".join(
                "{}: {}".format(k, v) for k, v in ac._argNum_cnt.items()))
            chunks.append("\n\t")
            chunks.append("\t".join(
                "{}: {}: {}".format(k, ArgType.getArgType(k).toString(), v)
                for k, v in ac._argTypeIdx_cnt.items()))
            chunks.append("\n\t")
            chunks.append("\t".join(
                "{}: {}: {}".format(k, Clust.getClust(k), v)
                for k, v in ac._chdClustIdx_cnt.items()))
            chunks.append("\n")
    out_str = "".join(chunks)

    if path is None:
        return out_str

    dst = "{}/{}.mln".format(path, os.path.basename(os.path.dirname(path)))
    with open(dst, 'w') as f:
        f.write(out_str)
    return None
def printParse(path=None):
    """Render the parse (one record per part) as tab-separated text.

    Each entry of ``Part.rootNodeId_part`` yields four lines: the root node
    id with its tree string, the part's cluster, and two lines describing
    the parent part and the connecting argument (left blank for parts that
    have no parent).

    Args:
        path: Directory to write into.  When given, the text is written to
            ``<path>/<name>.parse`` where ``<name>`` is
            ``os.path.basename(os.path.dirname(path))``.

    Returns:
        The rendered text when ``path`` is None, otherwise None.
    """
    pieces = []
    for root_id, part in Part.rootNodeId_part.items():
        pieces.append(
            "{}\t{}\n".format(root_id, part._relTreeRoot.getTreeStr()))
        pieces.append(
            "\t{}: {}\n".format(part._clustIdx,
                                Clust.getClust(part._clustIdx).toString()))
        if part._parPart is None:
            # Parentless part: keep the two parent lines, but empty.
            pieces.append("\t\n\t\n")
            continue
        link = part._parPart.getArgument(part._parArgIdx)
        pieces.append(
            "\t{}\t{}\t{}\n".format(part._parPart._relTreeRoot.getId(),
                                    part._parPart._clustIdx,
                                    Clust.getClust(part._parPart._clustIdx)))
        pieces.append(
            "\t{}: {}: {}\n".format(
                part._parPart.getArgClust(part._parArgIdx),
                link._path.getArgType(),
                ArgType.getArgType(link._path.getArgType())))
    text = "".join(pieces)

    if path is None:
        return text

    target = "{}/{}.parse".format(path,
                                  os.path.basename(os.path.dirname(path)))
    with open(target, 'w') as handle:
        handle.write(text)
    return None
def execComposePart(self, pp, cp):
    """Absorb child part ``cp`` into its parent part ``pp``.

    The child's tree is attached under the parent's tree, the parent is
    moved to the (already existing) cluster of the resulting relation type,
    the parent's remaining arguments are re-assigned to argument clusters
    of that cluster, and the child's own arguments are re-attached to the
    parent.  Finally the child part is destroyed.

    Args:
        pp: The parent part.
        cp: The child part; ``cp._parArgIdx`` must index the argument of
            ``pp`` that links to ``cp``.

    Returns:
        None.
    """
    dep = pp.getArguments()[cp._parArgIdx]._path.getDep()
    pp._relTreeRoot.addChild(dep, cp._relTreeRoot)
    nrti = RelType.getRelType(pp._relTreeRoot)
    ncl = Clust.getClust(next(iter(Clust.getClustsWithRelType(nrti))))
    nci = ncl.getId()

    def pick_arg_clust(ati):
        # Reuse an existing argument cluster for this argument type in the
        # new cluster, or create one when none is registered.
        by_type = ncl._argTypeIdx_argClustIdxs
        if ati not in by_type or len(by_type[ati]) == 0:
            return ncl.createArgClust(ati)
        return next(iter(by_type[ati]))

    # Detach the argument that pointed at the child, then unhook every
    # remaining argument before the parent changes cluster.
    pp.removeArgument(cp._parArgIdx, clust_only=True)
    for argIdx, arg in pp.getArguments().items():
        pp.unsetArgClust(argIdx)
        arg._argPart.unsetParent()

    pp.changeClust(nci, nrti, clust_only=True)

    # Re-hook the parent's remaining arguments inside the new cluster.
    # NOTE(review): setArgClustOnly / unsetArgClustOnly are not defined in
    # the visible part of this file -- confirm they exist on the part class.
    for argIdx, arg in pp.getArguments().items():
        aci = pick_arg_clust(arg._path.getArgType())
        arg._argPart.setParent(pp, argIdx)
        pp.setArgClustOnly(argIdx, aci)
    pp.setRelTypeIdx(nrti)

    # Move the child's arguments onto the parent.  getArguments() is a
    # mapping, so iterate its items; each argument is added exactly once
    # and the resulting index is used for both the argument cluster and
    # the parent link.
    for argIdx, arg in cp.getArguments().items():
        aci = pick_arg_clust(arg._path.getArgType())
        cp.unsetArgClustOnly(argIdx)
        newArgIdx = pp.addArgument(arg)
        pp.setArgClust(newArgIdx, aci)
        arg._argPart.setParent(pp, newArgIdx)

    cp.destroy()
    return None
def genString(self):
    """Build the textual form of this operation and store it in ``_str``.

    The string always starts with ``OP_<op>:``.  A suffix is appended for
    the three known operation kinds (cluster merge, role merge, compose);
    any other kind leaves just the prefix.
    """
    kind = self._op
    self._str = "OP_{}:".format(kind)

    if kind == SearchOp.OP_MERGE_CLUST:
        first = Clust.getClust(self._clustIdx1)
        second = Clust.getClust(self._clustIdx2)
        suffix = "{} == {}".format(first.toString(), second.toString())
    elif kind == SearchOp.OP_MERGE_ROLE:
        suffix = "{}:{}:{}".format(self._clustIdx,
                                   self._argIdx1,
                                   self._argIdx2)
    elif kind == SearchOp.OP_COMPOSE:
        head = Clust.getClust(self._parClustIdx)
        tail = Clust.getClust(self._chdClustIdx)
        suffix = "{} ++ {}".format(head.toString(), tail.toString())
    else:
        return

    self._str += suffix
def setArgs(self, art_id, sent_id, sent, idx):
    """Recursively attach the children of token ``idx`` as arguments.

    For every ``(dependency, child_index)`` returned by
    ``sent.get_children(idx)`` whose part has no parent yet, an
    ``Argument`` is created on this token's part, the child part is linked
    to it, the argument is assigned to an argument cluster of this token's
    cluster (created on demand), and the procedure recurses into the child.

    Args:
        art_id: Article id used to build tree-node ids.
        sent_id: Sentence id used to build tree-node ids.
        sent: Sentence object providing ``get_children``.
        idx: Index of the token whose children are processed.

    Returns:
        None.
    """
    this_part = Part.getPartByRootNodeId(
        genTreeNodeID(art_id, sent_id, idx))
    this_node = this_part.getRelTreeRoot()
    node_clust = Clust.getClust(this_part.getClustIdx())

    children = sent.get_children(idx)
    if children is None:
        return None

    for dependency, child_index in children:
        child_node_id = genTreeNodeID(art_id, sent_id, child_index)
        path = Path(dependency)
        argTypeIdx = path.getArgType()
        child_part = Part.getPartByRootNodeId(child_node_id)

        # A child that already has a parent was attached earlier; skip it.
        if child_part.getParPart() is not None:
            continue

        arg = Argument(this_node, path, child_part)
        argIdx = this_part.addArgument(arg)
        child_part.setParent(this_part, argIdx)

        argClustIdxs = node_clust.getArgClustIdxs(argTypeIdx)
        if argClustIdxs is None:
            argClustIdx = node_clust.createArgClust(argTypeIdx)
        else:
            argClustIdx = next(iter(argClustIdxs))
        this_part.setArgClust(argIdx, argClustIdx, clust_only=True)

        # Recurse through the bound method: a bare ``setArgs`` is not a
        # name in scope inside a method body.
        self.setArgs(art_id, sent_id, sent, child_index)

    return None
def createAgenda(self, verbose=False):
    """Seed the agenda from every eligible cluster.

    Walks the cluster ids returned by ``Part.getClustPartRootNodeIds()``
    and calls ``addAgendaForNewClust`` for each cluster whose type is
    ``'C'`` and which is not a stop cluster.

    Args:
        verbose: When true, print a progress line each time another tenth
            (10% .. 90%) of the cluster count has been handled; also
            forwarded to ``addAgendaForNewClust``.

    Returns:
        None.
    """
    if verbose:
        total = len(Part.getClustPartRootNodeIds())
        pending = set([x for x in range(1, 10, 1)])
        handled = 0

    for clust_id in Part.getClustPartRootNodeIds():
        clust = Clust.getClust(clust_id)
        if clust.getType() != 'C' or clust.isStop():
            continue

        self.addAgendaForNewClust(clust_id, verbose)

        if not verbose:
            continue
        handled += 1
        tenth = math.floor(handled * 10 / total)
        if tenth in pending:
            # Report each milestone only once.
            pending.remove(tenth)
            print("{}% complete.".format(tenth * 10))

    return None
def changeClustRemap(self, newClustIdx, argClustIdx_newArgClustIdx,
                     clust_only=False):
    """Move this part to another cluster, translating its argument clusters.

    Args:
        newClustIdx: Index of the cluster to move to.
        argClustIdx_newArgClustIdx: Mapping from each argument-cluster
            index currently used by this part to the index to use in the
            new cluster.
        clust_only: Forwarded to ``changeClust`` / ``setArgClust``; when
            true the old cluster is not notified about unset arguments.

    Returns:
        None.
    """
    notify_old = not clust_only
    if notify_old:
        previous_clust = Clust.getClust(self.getClustIdx())

    self.changeClust(newClustIdx, self.getRelTypeIdx(),
                     clust_only=clust_only)

    # First pass: drop every old assignment, remembering its replacement.
    replacement = {}
    for arg_idx, arg in self._args.items():
        old_aci = self._argIdx_argClustIdx.pop(arg_idx)
        self._argClustIdx_argIdxs[old_aci].remove(arg_idx)
        if not len(self._argClustIdx_argIdxs[old_aci]):
            del self._argClustIdx_argIdxs[old_aci]
        replacement[arg_idx] = argClustIdx_newArgClustIdx[old_aci]
        if notify_old:
            previous_clust.onPartUnsetArg(self, arg, old_aci)

    # Second pass: install the remapped assignments.
    for arg_idx in self._args:
        self.setArgClust(arg_idx, replacement[arg_idx],
                         clust_only=clust_only)

    return None
def setArgClust(self, argIdx, argClustIdx, clust_only=False):
    """Assign argument ``argIdx`` to argument cluster ``argClustIdx``.

    Updates both lookup tables (argument -> cluster and cluster ->
    arguments).  Nothing happens when the argument is already assigned to
    that cluster.

    Args:
        argIdx: Index of the argument on this part.
        argClustIdx: Index of the argument cluster to assign.
        clust_only: When true, the owning cluster is not notified.

    Returns:
        None.
    """
    if argIdx in self._argIdx_argClustIdx:
        previous = self.getArgClust(argIdx)
    else:
        previous = -1  # sentinel: no earlier assignment

    if previous == argClustIdx:
        return None

    self._argIdx_argClustIdx[argIdx] = argClustIdx
    self._argClustIdx_argIdxs.setdefault(argClustIdx, set()).add(argIdx)
    arg = self.getArgument(argIdx)

    notify = not clust_only
    if notify:
        owner = Clust.getClust(self._clustIdx)

    if previous < 0:
        # Fresh assignment.
        if notify:
            owner.onPartSetArg(self, arg, argClustIdx)
        return None

    # Re-assignment: release the slot held in the previous cluster.
    self._argClustIdx_argIdxs[previous].remove(argIdx)
    if not len(self._argClustIdx_argIdxs[previous]):
        del self._argClustIdx_argIdxs[previous]
    if notify:
        owner.onPartSetArg(self, arg, argClustIdx, previous)
    return None
def unsetArgClust(self, argIdx, clust_only=False):
    """Remove the argument-cluster assignment of argument ``argIdx``.

    Args:
        argIdx: Index of the argument on this part; it must currently have
            an assignment (otherwise the lookup raises ``KeyError``).
        clust_only: When true, the owning cluster is not notified.

    Returns:
        None.
    """
    released = self._argIdx_argClustIdx.pop(argIdx)
    arg = self.getArgument(argIdx)

    self._argClustIdx_argIdxs[released].remove(argIdx)
    if not len(self._argClustIdx_argIdxs[released]):
        # Last argument left that cluster: drop the empty bucket.
        del self._argClustIdx_argIdxs[released]

    if clust_only:
        return None

    owner = Clust.getClust(self.getClustIdx())
    owner.onPartUnsetArg(self, arg, released)
    return None
def setClust(self, clustIdx, clust_only=False):
    """Record this part as a member of cluster ``clustIdx``.

    The part's root node id is added to the class-level index
    ``Part.clustIdx_partRootNodeIds``.

    Args:
        clustIdx: Index of the cluster this part now belongs to.
        clust_only: When true, the cluster itself is not notified.

    Returns:
        None.
    """
    self._clustIdx = clustIdx
    root_id = self.getRelTreeRoot().getId()

    members_by_clust = Part.clustIdx_partRootNodeIds
    if clustIdx not in members_by_clust:
        members_by_clust[clustIdx] = SortedSet()
    members_by_clust[clustIdx].add(root_id)

    if clust_only:
        return None

    Clust.getClust(clustIdx).onPartSetClust(self)
    return None
def addAgendaMC(self, clustIdx1, clustIdx2, neighType):
    """Consider a merge-cluster operation for two clusters.

    Nothing is done when merging is disabled (``_skipMC``), when both
    indices are equal, or when either cluster is not of type ``'C'``.
    Otherwise a ``SearchOp`` is built with the smaller index first.  Unless
    ``moveAgendaToScore`` accepts it, the op is tracked in ``_mc_neighs``
    and promoted to ``_agendaToScore`` once the number of recorded
    neighbour types plus one reaches ``ParseParams.minMCCnt``.

    Args:
        clustIdx1: Index of one cluster.
        clustIdx2: Index of the other cluster.
        neighType: Neighbour type to record as evidence for the merge.

    Returns:
        None.
    """
    if self._skipMC or clustIdx1 == clustIdx2:
        return None

    type1 = Clust.getClust(clustIdx1).getType()
    type2 = Clust.getClust(clustIdx2).getType()
    if type2 != 'C' or type1 != 'C':
        return None

    op = SearchOp()
    op._op = SearchOp.OP_MERGE_CLUST
    op._clustIdx1 = min(clustIdx1, clustIdx2)
    op._clustIdx2 = max(clustIdx1, clustIdx2)

    if self.moveAgendaToScore(op):
        return None

    if op not in self._mc_neighs:
        self._mc_neighs[op] = set()

    if len(self._mc_neighs[op]) + 1 >= ParseParams.minMCCnt:
        # Enough evidence: schedule for scoring and stop tracking.
        self._agendaToScore.add(op)
        del self._mc_neighs[op]
    else:
        self._mc_neighs[op].add(neighType)

    return None
def initializeSent(self, ai, sj, sent, verbose=False):
    """Create the parts for one sentence and wire up its dependency tree.

    The token count (excluding the token at index 0) is added to
    ``self.numTkns``.  If the sentence has no children under index 0,
    nothing else happens.  Otherwise ``Parse.part_from_node`` is called for
    every token from index 1 on; then, for each root token, its node id is
    recorded in ``self.rootTreeNodeIds``, the root count of its cluster is
    incremented, and ``createArgs`` is run to build the parent-child
    relationships below it.

    Args:
        ai: Article id.
        sj: Sentence id.
        sent: Sentence object providing ``get_tokens``, ``get_token`` and
            ``get_children``.
        verbose: Forwarded to ``createArgs``.

    Returns:
        None.
    """
    self.numTkns += len(sent.get_tokens()) - 1

    roots = sent.get_children(0)
    if roots is None or len(roots) == 0:
        return None

    for k in range(1, len(sent.get_tokens())):
        Parse.part_from_node(ai, sj, sent, k, sent.get_token(k))

    for _, idx in roots:
        sub_node_id = genTreeNodeID(ai, sj, idx)
        self.rootTreeNodeIds.add(sub_node_id)
        node_part = Part.getPartByRootNodeId(sub_node_id)
        if node_part is None:
            continue
        ncl = Clust.getClust(node_part.getClustIdx())
        ncl.incRootCnt()
        # Pass verbose by keyword: the fifth positional parameter of
        # createArgs is ``done``, not ``verbose``.
        self.createArgs(ai, sj, sent, idx, verbose=verbose)

    return None
def scoreMCForParent(self, clustIdx1, clustIdx2):
    """Score a cluster merge by the parent argument slots both share.

    For every ``(parent cluster, argument cluster)`` pair registered in
    ``Clust.clustIdx_parArgs`` for both clusters, the score grows by
    ``ParseParams.priorNumParam`` plus ``Scorer.updateScore`` of the two
    child counts found in that argument cluster.

    Args:
        clustIdx1: Index of the first cluster.
        clustIdx2: Index of the second cluster.

    Returns:
        The accumulated score; 0 when either cluster has no registered
        parent arguments.
    """
    total = 0
    par_args = Clust.clustIdx_parArgs
    if clustIdx1 not in par_args or clustIdx2 not in par_args:
        return total

    own = par_args[clustIdx1]
    other = par_args[clustIdx2]
    for par_arg in own:
        if par_arg not in other:
            continue
        par_clust_id, arg_clust_id = par_arg
        par_clust = Clust.getClust(par_clust_id)
        if par_clust is None:
            print("ERR: ScoreMC parent cluster is null: {}, {}".format(
                clustIdx1, clustIdx2))
            continue
        slot = par_clust._argClusts[arg_clust_id]
        cnt1 = slot._chdClustIdx_cnt[clustIdx1]
        cnt2 = slot._chdClustIdx_cnt[clustIdx2]
        total += ParseParams.priorNumParam
        total += Scorer.updateScore(cnt1, cnt2)

    return total
def changeClust(self, newClustIdx, newRelTypeIdx, clust_only=False):
    """Move this part from its current cluster to ``newClustIdx``.

    Besides updating the part itself, the class-level indexes are kept in
    step: the per-cluster root-node sets, the root counts (for parentless
    parts) or the parent's child-cluster counts and the parent-argument
    table (for parts with a parent), and the table of parent/child part
    pairs keyed by cluster pair.

    Args:
        newClustIdx: Index of the destination cluster.
        newRelTypeIdx: Relation type index the part gets.
        clust_only: When true, the clusters are not notified; only the
            part's own fields and the class-level indexes are updated.

    Returns:
        None.
    """
    oldClustIdx = self.getClustIdx()
    rootID = self.getRelTreeRoot().getId()
    Part.clustIdx_partRootNodeIds[oldClustIdx].discard(rootID)

    if clust_only:
        self._relTypeIdx = newRelTypeIdx
    else:
        ocl = Clust.getClust(oldClustIdx)
        ocl.onPartUnsetClust(self)
        self.setRelTypeIdx(newRelTypeIdx)

    self.setClust(newClustIdx, clust_only=clust_only)

    parent = self.getParPart()
    if parent is None:
        # Root part: one more root in the new cluster, one fewer in the
        # old one.
        if newClustIdx in Clust.clustIdx_rootCnt:
            Clust.clustIdx_rootCnt[newClustIdx] += 1
        else:
            Clust.clustIdx_rootCnt[newClustIdx] = 1
        # Guarded so that a cluster without a recorded root count does not
        # raise here.
        if oldClustIdx in Clust.clustIdx_rootCnt:
            Clust.clustIdx_rootCnt[oldClustIdx] -= 1
        return None

    parent_clust_id = parent.getClustIdx()
    paci = parent.getArgClust(self.getParArgIdx())
    pcl = Clust.getClust(parent_clust_id)
    pac = pcl._argClusts[paci]

    # Child-cluster counts inside the parent's argument cluster.
    pac._chdClustIdx_cnt[oldClustIdx] -= 1
    if newClustIdx in pac._chdClustIdx_cnt:
        pac._chdClustIdx_cnt[newClustIdx] += 1
    else:
        pac._chdClustIdx_cnt[newClustIdx] = 1

    # Parent-argument table: (parent cluster, arg cluster) -> count.
    pa = (parent_clust_id, paci)
    Clust.clustIdx_parArgs[oldClustIdx][pa] -= 1
    if newClustIdx not in Clust.clustIdx_parArgs:
        Clust.clustIdx_parArgs[newClustIdx] = {}
    if pa in Clust.clustIdx_parArgs[newClustIdx]:
        Clust.clustIdx_parArgs[newClustIdx][pa] += 1
    else:
        Clust.clustIdx_parArgs[newClustIdx][pa] = 1

    # Part pairs keyed by (parent cluster, child cluster).
    opci = (parent_clust_id, oldClustIdx)
    npci = (parent_clust_id, newClustIdx)
    ptnid = (parent.getRelTreeRoot().getId(), rootID)
    Part.pairClustIdxs_pairPartRootNodeIds[opci].discard(ptnid)
    if len(Part.pairClustIdxs_pairPartRootNodeIds[opci]) == 0:
        del Part.pairClustIdxs_pairPartRootNodeIds[opci]
    if npci not in Part.pairClustIdxs_pairPartRootNodeIds:
        Part.pairClustIdxs_pairPartRootNodeIds[npci] = set()
    Part.pairClustIdxs_pairPartRootNodeIds[npci].add(ptnid)
    # NOTE: Part.clustIdx_pairClustIdxs is deliberately not updated here;
    # the corresponding bookkeeping is disabled in this method.

    return None
def unsetRelTypeIdx(self):
    """Tell this part's cluster that the part's relation type is unset.

    The part's own ``_relTypeIdx`` field is left untouched; only the
    cluster's ``onPartUnsetRelTypeIdx`` hook is invoked.

    Returns:
        None.
    """
    departing_type = self._relTypeIdx
    owner = Clust.getClust(self._clustIdx)
    owner.onPartUnsetRelTypeIdx(departing_type)
    return None
def scoreOpCompose(self, rcidx, acidx):
    """Score composing parent cluster ``rcidx`` with child cluster ``acidx``.

    All parent/child part pairs registered for the two clusters are
    visited.  For each pair the counts that would leave the parent ("r")
    and child ("a") clusters are tallied, together with the counts the
    composed ("ra" / "New") cluster would receive.  The score is then
    assembled from ``xlogx`` terms over the old and new counts, with
    ``ParseParams.priorNumParam`` applied per affected parameter.

    NOTE(review): ``dec_key`` / ``inc_key`` are defined elsewhere;
    presumably they decrement (starting from ``base``) / increment the
    entry for a key and return the dict -- confirm against their
    definitions.

    Args:
        rcidx: Index of the parent cluster.
        acidx: Index of the child cluster.

    Returns:
        The score, or -10000 when no part pairs are registered for the
        two clusters.
    """
    def update_score_from_dict(scr, d, orig_d):
        # For every key in d: subtract the xlogx of its original count and
        # add the xlogx of its new count, or the prior when the new count
        # is not positive.
        for key, cnt in d.items():
            origcnt = orig_d[key]
            # assert origcnt >= cnt
            scr -= xlogx(origcnt)
            if cnt > 0:
                scr += xlogx(cnt)
            else:
                scr += ParseParams.priorNumParam
        return scr
    # get parent and child root-node id numbers
    parChdNids = Part.getPairPartRootNodeIds(rcidx, acidx)
    if parChdNids is None:
        return -10000
    score = 0
    rcl = Clust.getClust(rcidx)
    acl = Clust.getClust(acidx)
    # Parent count, child count, and count of times they occur
    # together.
    rtc_new = rcl._ttlCnt
    atc_new = acl._ttlCnt
    ratc_new = 0
    raRootCnt = 0
    # Naming used below: r = parent cluster, a = child cluster,
    # ra = the composed pair; "New" tables hold counts for the composed
    # cluster, the others hold the remaining counts of the old clusters.
    parArg_cnt = dict()
    rRelTypeIdx_newcnt = dict()
    aRelTypeIdx_newcnt = dict()
    raRelTypeIdx_newcnt = dict()
    rArgClustIdx_argNum_cnt = dict()
    aArgClustIdx_argNum_cnt = dict()
    rNewArgClustIdx_argNum_cnt = dict()
    aNewArgClustIdx_argNum_cnt = dict()
    rArgClustIdx_argTypeIdx_cnt = dict()
    aArgClustIdx_argTypeIdx_cnt = dict()
    rNewArgClustIdx_argTypeIdx_cnt = dict()
    aNewArgClustIdx_argTypeIdx_cnt = dict()
    rArgClustIdx_chdClustIdx_cnt = dict()
    aArgClustIdx_chdClustIdx_cnt = dict()
    rNewArgClustIdx_chdClustIdx_cnt = dict()
    aNewArgClustIdx_chdClustIdx_cnt = dict()
    rArgClustIdx_partCnt = dict()
    aArgClustIdx_partCnt = dict()
    rNewArgClustIdx_partCnt = dict()
    aNewArgClustIdx_partCnt = dict()
    rArgClustIdx_argCnt = dict()
    aArgClustIdx_argCnt = dict()
    rNewArgClustIdx_argCnt = dict()
    aNewArgClustIdx_argCnt = dict()
    # For each parent-child pair:
    for pcnid in parChdNids:
        pp, cp = Part.getPartByRootNodeId(
            pcnid[0]), Part.getPartByRootNodeId(pcnid[1])
        rtc_new -= 1
        atc_new -= 1
        ratc_new += 1
        rrt = pp.getRelTypeIdx()
        art = cp.getRelTypeIdx()
        # Argument cluster (on the parent) that links to the child.
        raArgClustidx = pp.getArgClust(cp._parArgIdx)
        # Decrement individual relType counts and increment the combined
        # relType count for this pair
        rRelTypeIdx_newcnt = dec_key(rRelTypeIdx_newcnt, rrt, base=rcl._relTypeIdx_cnt[rrt])
        aRelTypeIdx_newcnt = dec_key(aRelTypeIdx_newcnt, art, base=acl._relTypeIdx_cnt[art])
        raRelTypeIdx_newcnt = inc_key(raRelTypeIdx_newcnt, (rrt, art))
        pp_par = pp.getParPart()
        # If the parent has a parent, increment the parArg count, otherwise
        # increment the root count.
        if pp_par is not None:
            ai = pp.getParArgIdx()
            ppi = pp_par.getClustIdx()
            aci = pp_par.getArgClust(ai)
            parArg_cnt = inc_key(parArg_cnt, (ppi, aci))
        else:
            raRootCnt += 1
        # For each argClust on the parent part, decrement the old parent
        # part count and argClust count, and increment the new ones. The
        # trick is don't copy/increment the counts for argument shared by
        # this pair.
        for arg_ci in pp._argClustIdx_argIdxs:
            an = len(pp._argClustIdx_argIdxs[arg_ci])
            ac = rcl._argClusts[arg_ci]
            rArgClustIdx_partCnt = dec_key(rArgClustIdx_partCnt, arg_ci, base=len(
                ac._partRootTreeNodeIds))
            if arg_ci not in rArgClustIdx_argNum_cnt:
                rArgClustIdx_argNum_cnt[arg_ci] = {}
            rArgClustIdx_argNum_cnt[arg_ci] = \
                dec_key(rArgClustIdx_argNum_cnt[arg_ci], an, base=ac._argNum_cnt[an])
            newArgNum = an
            if arg_ci == raArgClustidx:
                newArgNum -= 1
            if newArgNum == 0:
                continue
            if arg_ci not in rNewArgClustIdx_argNum_cnt:
                rNewArgClustIdx_argNum_cnt[arg_ci] = {}
            rNewArgClustIdx_argNum_cnt[arg_ci] = \
                inc_key(rNewArgClustIdx_argNum_cnt[arg_ci], newArgNum)
            rNewArgClustIdx_partCnt = inc_key(rNewArgClustIdx_partCnt, arg_ci)
        # Same as above, but for child part, and we don't skip anything.
        for arg_ci in cp._argClustIdx_argIdxs:
            an = len(cp._argClustIdx_argIdxs[arg_ci])
            ac = acl._argClusts[arg_ci]
            aArgClustIdx_partCnt = dec_key(aArgClustIdx_partCnt, arg_ci, base=len(
                ac._partRootTreeNodeIds))
            if arg_ci not in aArgClustIdx_argNum_cnt:
                aArgClustIdx_argNum_cnt[arg_ci] = {}
            aArgClustIdx_argNum_cnt[arg_ci] = \
                dec_key(aArgClustIdx_argNum_cnt[arg_ci], an, base=ac._argNum_cnt[an])
            if arg_ci not in aNewArgClustIdx_argNum_cnt:
                aNewArgClustIdx_argNum_cnt[arg_ci] = {}
            aNewArgClustIdx_argNum_cnt[arg_ci] = \
                inc_key(aNewArgClustIdx_argNum_cnt[arg_ci], an)
            aNewArgClustIdx_partCnt = inc_key(aNewArgClustIdx_partCnt, arg_ci)
        args = pp.getArguments()
        # For all the parent's arguments
        for ai, arg in args.items():
            arg_part = arg._argPart
            child_clust_id = arg_part._clustIdx
            aci = pp.getArgClust(ai)
            ac = rcl._argClusts[aci]
            ati = arg._path.getArgType()
            # Drop the old arguments
            rArgClustIdx_argCnt = dec_key(rArgClustIdx_argCnt, aci, base=ac._ttlArgCnt)
            if aci not in rArgClustIdx_argTypeIdx_cnt:
                rArgClustIdx_argTypeIdx_cnt[aci] = {}
            rArgClustIdx_argTypeIdx_cnt[aci] = \
                dec_key(rArgClustIdx_argTypeIdx_cnt[aci], ati, base=ac._argTypeIdx_cnt[ati])
            if aci not in rArgClustIdx_chdClustIdx_cnt:
                rArgClustIdx_chdClustIdx_cnt[aci] = {}
            rArgClustIdx_chdClustIdx_cnt[aci] = \
                dec_key(rArgClustIdx_chdClustIdx_cnt[aci], child_clust_id, base=ac._chdClustIdx_cnt[child_clust_id])
            # Add the new arguments, except for the child part we're possibly
            # absorbing
            if arg_part.getRelTreeRoot().getId() != cp.getRelTreeRoot(
                    ).getId():
                rNewArgClustIdx_argCnt = inc_key(rNewArgClustIdx_argCnt, aci)
                if aci not in rNewArgClustIdx_argTypeIdx_cnt:
                    rNewArgClustIdx_argTypeIdx_cnt[aci] = {}
                rNewArgClustIdx_argTypeIdx_cnt[aci] = \
                    inc_key(rNewArgClustIdx_argTypeIdx_cnt[aci], ati)
                if aci not in rNewArgClustIdx_chdClustIdx_cnt:
                    rNewArgClustIdx_chdClustIdx_cnt[aci] = {}
                rNewArgClustIdx_chdClustIdx_cnt[aci] = \
                    inc_key(rNewArgClustIdx_chdClustIdx_cnt[aci], child_clust_id)
        args = cp.getArguments()
        # Same bookkeeping for all of the child's arguments; none is skipped.
        for ai, arg in args.items():
            ap = arg._argPart
            cci = ap._clustIdx
            aci = cp.getArgClust(ai)
            ac = acl._argClusts[aci]
            ati = arg._path.getArgType()
            # Drop the old arguments
            aArgClustIdx_argCnt = dec_key(aArgClustIdx_argCnt, aci, base=ac._ttlArgCnt)
            if aci not in aArgClustIdx_argTypeIdx_cnt:
                aArgClustIdx_argTypeIdx_cnt[aci] = {}
            aArgClustIdx_argTypeIdx_cnt[aci] = \
                dec_key(aArgClustIdx_argTypeIdx_cnt[aci], ati, base=ac._argTypeIdx_cnt[ati])
            if aci not in aArgClustIdx_chdClustIdx_cnt:
                aArgClustIdx_chdClustIdx_cnt[aci] = dict()
            aArgClustIdx_chdClustIdx_cnt[aci] = \
                dec_key(aArgClustIdx_chdClustIdx_cnt[aci], cci, base=ac._chdClustIdx_cnt[cci])
            # Add the new arguments
            aNewArgClustIdx_argCnt = inc_key(aNewArgClustIdx_argCnt, aci)
            if aci not in aNewArgClustIdx_argTypeIdx_cnt:
                aNewArgClustIdx_argTypeIdx_cnt[aci] = {}
            aNewArgClustIdx_argTypeIdx_cnt[aci] = \
                inc_key(aNewArgClustIdx_argTypeIdx_cnt[aci], ati)
            if aci not in aNewArgClustIdx_chdClustIdx_cnt:
                aNewArgClustIdx_chdClustIdx_cnt[aci] = {}
            aNewArgClustIdx_chdClustIdx_cnt[aci] = \
                inc_key(aNewArgClustIdx_chdClustIdx_cnt[aci], cci)
    # Root-count term: only applies when some, but not all, of the parent
    # cluster's roots take part in the composition.
    if raRootCnt > 0:
        origRootCnt = Clust.clustIdx_rootCnt[rcidx]
        if origRootCnt > raRootCnt:
            score += xlogx(raRootCnt) \
                + xlogx(origRootCnt - raRootCnt) \
                - xlogx(origRootCnt)
            score -= ParseParams.priorNumParam
    # Relation-type terms for the parent cluster (old vs. new totals).
    denomor = xlogx(rcl._ttlCnt)
    denomnr = xlogx(rtc_new)
    score = update_score_from_dict(score, rRelTypeIdx_newcnt, rcl._relTypeIdx_cnt)
    score += denomor
    score -= denomnr
    # Relation-type terms for the child cluster.
    denomoa = xlogx(acl._ttlCnt)
    denomna = xlogx(atc_new)
    score = update_score_from_dict(score, aRelTypeIdx_newcnt, acl._relTypeIdx_cnt)
    score += denomoa
    score -= denomna
    # Relation-type terms for the composed cluster.
    for cnt in raRelTypeIdx_newcnt.values():
        score -= ParseParams.priorNumParam
        score += xlogx(cnt)
    denomra = xlogx(ratc_new)
    score -= denomra
    # Terms for the grandparent argument clusters that held the parent.
    for pi, cnt in parArg_cnt.items():
        pc = Clust.getClust(pi[0])
        ac = pc._argClusts[pi[1]]
        origcnt = ac._chdClustIdx_cnt[rcidx]
        if cnt == origcnt:
            continue
        score -= ParseParams.priorNumParam
        score += xlogx(cnt) + xlogx(origcnt - cnt) - xlogx(origcnt)
    # Remaining counts of every argument cluster of the parent cluster.
    for aci, ac in rcl._argClusts.items():
        origPartCnt = len(ac._partRootTreeNodeIds)
        score -= (xlogx(rcl._ttlCnt - origPartCnt) - denomor)
        if aci not in rArgClustIdx_partCnt:
            # Argument cluster untouched by the composition.
            score += (xlogx(rtc_new - origPartCnt) - denomnr)
            continue
        if rArgClustIdx_partCnt[aci] > 0:
            score += (xlogx(rtc_new - rArgClustIdx_partCnt[aci]) - denomnr)
        score = update_score_from_dict(score, rArgClustIdx_argNum_cnt[aci], ac._argNum_cnt)
        score -= 2 * (xlogx(rArgClustIdx_argCnt[aci]) - xlogx(ac._ttlArgCnt))
        score = update_score_from_dict(score, rArgClustIdx_argTypeIdx_cnt[aci], ac._argTypeIdx_cnt)
        score = update_score_from_dict(score, rArgClustIdx_chdClustIdx_cnt[aci], ac._chdClustIdx_cnt)
    # line 570 in Scorer.java
    # Remaining counts of every argument cluster of the child cluster.
    for aci, ac in acl._argClusts.items():
        origPartCnt = len(ac._partRootTreeNodeIds)
        score -= (xlogx(acl._ttlCnt - origPartCnt) - denomoa)
        if aci not in aArgClustIdx_partCnt:
            score += (xlogx(atc_new - origPartCnt) - denomna)
            continue
        if aArgClustIdx_partCnt[aci] > 0:
            score += (xlogx(atc_new - aArgClustIdx_partCnt[aci]) - denomna)
        score = update_score_from_dict(score, aArgClustIdx_argNum_cnt[aci], ac._argNum_cnt)
        score -= 2 * (xlogx(aArgClustIdx_argCnt[aci]) - xlogx(ac._ttlArgCnt))
        score = update_score_from_dict(score, aArgClustIdx_argTypeIdx_cnt[aci], ac._argTypeIdx_cnt)
        score = update_score_from_dict(score, aArgClustIdx_chdClustIdx_cnt[aci], ac._chdClustIdx_cnt)
    # Part-count and argument-number terms of the composed cluster, first
    # for slots inherited from the parent, then from the child.
    for ds in [(rNewArgClustIdx_partCnt, rNewArgClustIdx_argNum_cnt),
               (aNewArgClustIdx_partCnt, aNewArgClustIdx_argNum_cnt)]:
        for aci, partCnt in ds[0].items():
            score += xlogx(ratc_new - partCnt) - denomra
            for idx, cnt in ds[1][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
    # Argument-type and child-cluster terms of the composed cluster.
    for ds in [(rNewArgClustIdx_argCnt, rNewArgClustIdx_argTypeIdx_cnt, rNewArgClustIdx_chdClustIdx_cnt),
               (aNewArgClustIdx_argCnt, aNewArgClustIdx_argTypeIdx_cnt, aNewArgClustIdx_chdClustIdx_cnt)]:
        for aci, argCnt in ds[0].items():
            score -= 2 * xlogx(argCnt)
            for idx, cnt in ds[1][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
            for idx, cnt in ds[2][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
    return score
def execMC(self, op):
    """Execute a merge-cluster operation.

    The cluster with fewer argument clusters is merged into the other one:
    argument clusters are aligned through the scorer, unaligned ones are
    mapped by their first argument type (creating a target argument
    cluster when needed), every part of the absorbed cluster is remapped,
    and the absorbed cluster is removed.

    Args:
        op: Operation carrying ``_clustIdx1`` and ``_clustIdx2``.

    Returns:
        The id of the surviving cluster, or -1 when either cluster does
        not exist.
    """
    keeper = Clust.getClust(op._clustIdx1)
    absorbed = Clust.getClust(op._clustIdx2)
    if keeper is None or absorbed is None:
        return -1

    # The cluster with more argument clusters survives.
    if len(keeper._argClusts) < len(absorbed._argClusts):
        keeper, absorbed = absorbed, keeper

    # Score-based alignment of argument clusters (absorbed -> keeper).
    _, remap = self._parse.scorer.scoreMCForAlign(keeper, absorbed, dict())

    # Anything left unaligned is mapped through its first argument type.
    for src_aci in absorbed._argClusts:
        if src_aci in remap:
            continue
        src_arg_clust = absorbed._argClusts[src_aci]
        for arg_type in src_arg_clust._argTypeIdx_cnt:
            candidates = keeper.getArgClustIdxs(arg_type)
            if candidates is None:
                target_aci = keeper.createArgClust(arg_type)
            else:
                target_aci = next(iter(candidates))
            remap[src_aci] = target_aci
            break

    # Move every part over; iterate a snapshot because remapping edits
    # the underlying index.
    for part_id in set(Part.getPartRootNodeIds(absorbed.getId())):
        part = Part.getPartByRootNodeId(part_id)
        for arg in part.getArguments().values():
            arg._argPart.unsetParent()
        part.changeClustRemap(keeper.getId(), remap)
        for arg_idx, arg in part.getArguments().items():
            arg._argPart.setParent(part, arg_idx)

    Clust.removeClust(absorbed)
    return keeper.getId()
def scoreOpComposePart(self, pp, cp):
    """Score composing child part ``cp`` into its parent part ``pp``.

    The child's tree is attached under the parent's tree (note: this
    mutates ``pp._relTreeRoot``) to obtain the relation type of the
    composed tree.  If no cluster exists for that relation type the score
    is 0.  Otherwise the score combines log-count ratios for the parent's
    position under its own parent (or as a root) and for its argument
    assignments in the old versus the new cluster.

    Args:
        pp: The parent part.
        cp: The child part; ``cp._parArgIdx`` indexes the argument of
            ``pp`` that links to ``cp``.

    Returns:
        The score (0 when no cluster exists for the composed relation
        type).
    """
    score = 0
    rcl = Clust.getClust(pp._clustIdx)
    ptn, ctn = pp._relTreeRoot, cp._relTreeRoot

    # The dependency label must be known before the child tree can be
    # attached.
    pai = cp._parArgIdx
    dep = pp.getArguments()[pai]._path.getDep()
    ptn.addChild(dep, ctn)
    nrti = RelType.getRelType(ptn)

    rel_clusts = Clust.getClustsWithRelType(nrti)
    if rel_clusts is None:
        return score
    ncl = Clust.getClust(next(iter(rel_clusts)))
    nci = ncl._clustIdx

    ppp = pp.getParPart()
    if ppp is not None:
        ppcl = Clust.getClust(ppp.getClustIdx())
        ac = ppcl._argClusts[ppp.getArgClust(pp.getParArgIdx())]
        oc = ac._chdClustIdx_cnt[rcl._clustIdx]
        nc = ac._chdClustIdx_cnt[nci]
    else:
        # Root counts are keyed by cluster index, not by cluster object.
        oc = Clust.clustIdx_rootCnt[rcl._clustIdx]
        nc = Clust.clustIdx_rootCnt[nci]
    score += log(nc) - log(oc)

    # Remove the contribution of the parent's current argument assignments.
    for aci, ais in pp._argClustIdx_argIdxs.items():
        ac = rcl._argClusts[aci]
        score -= (log(ac._argNum_cnt[len(ais)]) - log(ac._ttlArgCnt))
        for ai in ais:
            arg = pp.getArgument(ai)
            score -= (log(ac._chdClustIdx_cnt[arg._argPart._clustIdx])
                      - log(ac._ttlArgCnt))
            score -= (log(ac._argTypeIdx_cnt[arg._path.getArgType()])
                      - log(ac._ttlArgCnt))

    # Map every remaining argument (all but the one linking to the child)
    # to an argument cluster of the new cluster.
    ai_newaci = dict()
    for ai, arg in pp._args.items():
        if ai == pai:
            continue
        ati = arg._path.getArgType()
        ai_newaci[ai] = next(iter(ncl._argTypeIdx_argClustIdxs[ati]))

    # Group the argument indices by their new argument cluster.
    newArgClustIdx_ais = dict()
    for ai, aci in ai_newaci.items():
        newArgClustIdx_ais.setdefault(aci, set()).add(ai)

    for aci, ais in newArgClustIdx_ais.items():
        ac = ncl._argClusts[aci]
        score += (log(ac._argNum_cnt[len(ais)]) - log(ac._ttlArgCnt))
        for ai in ais:
            arg = pp.getArgument(ai)
            # NOTE(review): these two terms are subtracted although they
            # belong to the *new* cluster (the argNum term above is
            # added) -- confirm the intended sign.
            score -= (log(ac._chdClustIdx_cnt[arg._argPart._clustIdx])
                      - log(ac._ttlArgCnt))
            score -= (log(ac._argTypeIdx_cnt[arg._path.getArgType()])
                      - log(ac._ttlArgCnt))

    return score
def scoreOpMC(self, op):
    """Score a merge-cluster operation.

    The score starts at minus the merge prior and is then adjusted for:
    conjunction counts of the pair, the clusters' total counts, relation
    types both clusters share, root counts, shared parent arguments, and
    the alignment of their argument clusters.

    Args:
        op: Operation carrying ``_clustIdx1`` < ``_clustIdx2``.

    Returns:
        The score of the merge.
    """
    id1, id2 = op._clustIdx1, op._clustIdx2
    assert id1 < id2

    total = 0 - ParseParams.priorMerge

    # Clusters seen in conjunction with each other are penalised.
    pair = (id1, id2)
    if pair in Clust.pairClustIdx_conjCnt:
        total -= (ParseParams.priorNumConj
                  * Clust.pairClustIdx_conjCnt[pair])

    first = Clust.getClust(id1)
    second = Clust.getClust(id2)

    # Penalty based on how frequent both clusters are overall.
    total -= Scorer.updateScore(first._ttlCnt, second._ttlCnt)

    # Reward for every relation type the two clusters have in common.
    for rel_type, cnt1 in first._relTypeIdx_cnt.items():
        if rel_type not in second._relTypeIdx_cnt:
            continue
        cnt2 = second._relTypeIdx_cnt[rel_type]
        total += Scorer.updateScore(cnt1, cnt2)
        total += ParseParams.priorNumParam

    # Reward when both clusters occur as sentence roots.
    root_cnt = Clust.clustIdx_rootCnt
    if id1 in root_cnt and id2 in root_cnt:
        total += Scorer.updateScore(root_cnt[id1], root_cnt[id2])
        total += ParseParams.priorNumParam

    # Contribution of shared parents.
    total += self.scoreMCForParent(id1, id2)

    # Align argument clusters, larger cluster first.
    if len(second._argClusts) > len(first._argClusts):
        larger, smaller = second, first
    else:
        larger, smaller = first, second
    align_score, _ = self.scoreMCForAlign(larger, smaller, dict())
    total += align_score

    return total
def createArgs(self, art_id, sent_id, sent, parent_id, done=None,
               verbose=False):
    """Recursively attach the children of a token as arguments.

    For the token ``parent_id`` the tree node, part and cluster are looked
    up.  Every child reported by ``sent.get_children(parent_id)`` that has
    a part and no parent yet gets an ``Argument`` on the parent part, is
    linked to the parent, and is assigned to an argument cluster of the
    parent's cluster (created on demand) before the procedure recurses
    into that child.

    Children that already have a parent are skipped, which also keeps the
    recursion from looping when the dependencies are malformed.

    Args:
        art_id: Article id used to build tree-node ids.
        sent_id: Sentence id used to build tree-node ids.
        sent: Sentence object providing ``get_children``.
        parent_id: Index of the token whose children are processed.
        done: Unused; kept for interface compatibility.
        verbose: When true, report skipped children on stdout.

    Returns:
        None.
    """
    parent_node_id = genTreeNodeID(art_id, sent_id, parent_id)
    parent = TreeNode.getTreeNode(parent_node_id)
    parent_part = Part.getPartByRootNodeId(parent_node_id)
    parent_clust = Clust.getClust(parent_part.getClustIdx())

    children = sent.get_children(parent_id)
    if children is None:
        return None

    for relation, child_id in children:
        child_node_id = genTreeNodeID(art_id, sent_id, child_id)
        path = Path(relation)
        arg_type_id = path.getArgType()
        child_part = Part.getPartByRootNodeId(child_node_id)

        if child_part is None:
            if verbose:
                print("Child node id {} has no part".format(
                    child_node_id))
            # Nothing to attach; skip instead of dereferencing None.
            continue

        if child_part.getParPart() is not None:
            if verbose:
                print("Child node id {} already has "
                      "parent {}".format(
                          child_node_id,
                          child_part.getParPart().getRelTreeRoot().
                          getId()))
            continue

        arg = Argument(parent, path, child_part)
        arg_id = parent_part.addArgument(arg)
        child_part.setParent(parent_part, arg_id)

        arg_clust_ids = parent_clust.getArgClustIdxs(arg_type_id)
        if arg_clust_ids is None:
            arg_clust_id = parent_clust.createArgClust(arg_type_id)
        else:
            arg_clust_id = next(iter(arg_clust_ids))
        parent_part.setArgClust(arg_id, arg_clust_id)

        # Propagate the flags by keyword so they reach the right
        # parameters at every depth.
        self.createArgs(art_id, sent_id, sent, child_id,
                        done=done, verbose=verbose)

    return None
def setRelTypeIdx(self, newRelTypeIdx):
    """Store a new relation type on this part and notify its cluster.

    Args:
        newRelTypeIdx: The relation type index to set.

    Returns:
        None.
    """
    self._relTypeIdx = newRelTypeIdx
    Clust.getClust(self._clustIdx).onPartSetRelTypeIdx(newRelTypeIdx)
    return None
def execCompose(self, op):
    """Execute a compose operation on every matching parent/child pair.

    For each registered pair of parts whose clusters are
    ``(op._parClustIdx, op._chdClustIdx)``, the child's tree is attached
    under the parent's tree, the parent is moved to the cluster of the
    resulting relation type (created on first use if absent), the
    arguments of both parts are re-attached to the parent inside that
    cluster, and the child part is destroyed.

    Args:
        op: Operation carrying ``_parClustIdx`` and ``_chdClustIdx``.

    Returns:
        The id of the composed cluster; -1 when either source cluster
        does not exist (or when no pair was processed).

    Raises:
        Exception: When more than one cluster exists for the composed
            relation type.
    """
    parClustIdx = op._parClustIdx
    chdClustIdx = op._chdClustIdx
    if (Clust.getClust(parClustIdx) is None
            or Clust.getClust(chdClustIdx) is None):
        return -1

    target = None
    target_id = -1
    pair_key = (parClustIdx, chdClustIdx)
    # Snapshot: the registry is modified while the pairs are processed.
    pairs = set(Part.pairClustIdxs_pairPartRootNodeIds[pair_key])
    destroyed = set()

    def choose_arg_clust(arg_type):
        # Reuse an argument cluster of the target cluster for this
        # argument type, or create one when none is registered.
        if arg_type not in target._argTypeIdx_argClustIdxs:
            return target.createArgClust(arg_type)
        if len(target._argTypeIdx_argClustIdxs[arg_type]) == 0:
            return target.createArgClust(arg_type)
        return next(iter(target._argTypeIdx_argClustIdxs[arg_type]))

    for parent_id, child_id in pairs:
        # Parts consumed by an earlier pair no longer exist.
        if parent_id in destroyed or child_id in destroyed:
            continue

        parent_part = Part.getPartByRootNodeId(parent_id)
        child_part = Part.getPartByRootNodeId(child_id)
        dep = parent_part.getArguments()[
            child_part._parArgIdx]._path.getDep()
        parent_part._relTreeRoot.addChild(dep, child_part._relTreeRoot)
        nrti = RelType.getRelType(parent_part._relTreeRoot)

        if target is None:
            # First pair: resolve (or create) the destination cluster.
            rel_clusts = Clust.getClustsWithRelType(nrti)
            if rel_clusts is None:
                target = Clust.getClust(Clust.createClust(nrti))
            elif len(rel_clusts) > 1:
                raise Exception
            else:
                target = Clust.getClust(next(iter(rel_clusts)))
            target_id = target.getId()

        parent_part.removeArgument(child_part._parArgIdx)

        if parent_part.getClustIdx() == target_id:
            # Already in the destination cluster: only the relation type
            # needs refreshing.
            parent_part.unsetRelTypeIdx()
            parent_part.setRelTypeIdx(nrti)
        else:
            for argIdx in parent_part.getArguments():
                parent_part.unsetArgClust(argIdx)
                parent_part.getArgument(argIdx)._argPart.unsetParent()
            parent_part.changeClust(target_id, nrti)
            for argIdx, arg in parent_part.getArguments().items():
                chosen = choose_arg_clust(arg._path.getArgType())
                arg._argPart.setParent(parent_part, argIdx)
                parent_part.setArgClust(argIdx, chosen)
            parent_part.setRelTypeIdx(nrti)

        # Connect the child part's arguments directly to the parent part.
        for argIdx, arg in child_part.getArguments().items():
            child_part.unsetArgClust(argIdx)
            chosen = choose_arg_clust(arg._path.getArgType())
            moved_idx = parent_part.addArgument(arg)
            arg._argPart.setParent(parent_part, moved_idx)
            parent_part.setArgClust(moved_idx, chosen)

        # Remove the old child part.
        destroyed.add(child_part.getRelTreeRoot().getId())
        child_part.destroy()

    del Part.pairClustIdxs_pairPartRootNodeIds[pair_key]
    return target_id