Exemple #1
0
    def printMLN(path=None):
        """Render every cluster and its argument clusters as text.

        For each cluster index in ``Clust.clusts`` a header line (cluster
        index, tab, cluster) is emitted, followed by three tab-indented
        lines per argument cluster: argument-number counts, argument-type
        counts and child-cluster counts.

        Args:
            path: Optional directory. When given, the text is written to
                ``<path>/<name>.mln`` where ``<name>`` is
                ``os.path.basename(os.path.dirname(path))``.

        Returns:
            The rendered text when ``path`` is ``None``; otherwise ``None``
            (the text has been written to disk instead).
        """
        pieces = []

        for ci in Clust.clusts:
            cl = Clust.getClust(ci)
            # Accumulate the header; assigning here would throw away every
            # cluster rendered before this one.
            pieces.append("{}\t{}\n".format(cl._clustIdx, cl))

            for aci, ac in cl._argClusts.items():
                pieces.append("\t{}: ".format(aci))
                pieces.append("\t".join(
                    "{}: {}".format(k, v)
                    for k, v in ac._argNum_cnt.items()))
                pieces.append("\n\t")
                pieces.append("\t".join(
                    "{}: {}: {}".format(k,
                                        ArgType.getArgType(k).toString(),
                                        v)
                    for k, v in ac._argTypeIdx_cnt.items()))
                pieces.append("\n\t")
                pieces.append("\t".join(
                    "{}: {}: {}".format(k, Clust.getClust(k), v)
                    for k, v in ac._chdClustIdx_cnt.items()))
                pieces.append("\n")

        out_str = "".join(pieces)

        if path is None:
            return out_str

        dst = "{}/{}.mln".format(path,
                                 os.path.basename(os.path.dirname(path)))

        with open(dst, 'w') as f:
            f.write(out_str)

        return None
Exemple #2
0
    def printParse(path=None):
        """Render every part in ``Part.rootNodeId_part`` as text.

        Each part contributes four lines: its root node id and tree string,
        its cluster index and cluster string, and two lines describing its
        parent part (left empty apart from the leading tab when the part
        has no parent).

        Args:
            path: Optional directory. When given, the text is written to
                ``<path>/<name>.parse`` where ``<name>`` is
                ``os.path.basename(os.path.dirname(path))``.

        Returns:
            The rendered text when ``path`` is ``None``; otherwise ``None``.
        """
        chunks = []

        for root_id, part in Part.rootNodeId_part.items():
            chunks.append("{}\t{}\n".format(
                root_id, part._relTreeRoot.getTreeStr()))
            chunks.append("\t{}: {}\n".format(
                part._clustIdx,
                Clust.getClust(part._clustIdx).toString()))

            if part._parPart is None:
                # No parent: keep the two parent lines present but empty.
                chunks.append("\t\n\t\n")
                continue

            parent = part._parPart
            arg = parent.getArgument(part._parArgIdx)
            chunks.append("\t{}\t{}\t{}\n".format(
                parent._relTreeRoot.getId(),
                parent._clustIdx,
                Clust.getClust(parent._clustIdx)))
            chunks.append("\t{}: {}: {}\n".format(
                parent.getArgClust(part._parArgIdx),
                arg._path.getArgType(),
                ArgType.getArgType(arg._path.getArgType())))

        text = "".join(chunks)

        if path is None:
            return text

        dst = "{}/{}.parse".format(path,
                                   os.path.basename(os.path.dirname(path)))
        with open(dst, 'w') as f:
            f.write(text)

        return None
Exemple #3
0
    def execComposePart(self, pp, cp):
        """Compose child part ``cp`` into its parent part ``pp``.

        The child's relation tree is attached under the parent's tree, the
        parent is moved to the first cluster returned for the resulting
        relation type, the parent's remaining arguments are re-assigned to
        argument clusters of that cluster, the child's arguments are
        re-attached to the parent, and finally the child part is destroyed.

        Args:
            pp: Parent part; modified in place.
            cp: Child part that is currently an argument of ``pp``.

        Returns:
            None.
        """
        dep = pp.getArguments()[cp._parArgIdx]._path.getDep()
        pp._relTreeRoot.addChild(dep, cp._relTreeRoot)
        nrti = RelType.getRelType(pp._relTreeRoot)

        ncl = Clust.getClust(next(iter(Clust.getClustsWithRelType(nrti))))
        nci = ncl.getId()

        def pick_arg_clust(arg_type):
            # Reuse the first argument cluster known for this argument type
            # in the new cluster, creating one when there is none yet.
            known = ncl._argTypeIdx_argClustIdxs
            if arg_type not in known or len(known[arg_type]) == 0:
                return ncl.createArgClust(arg_type)
            return next(iter(known[arg_type]))

        pp.removeArgument(cp._parArgIdx, clust_only=True)

        # Detach the parent's remaining arguments before switching cluster.
        for argIdx, arg in pp.getArguments().items():
            pp.unsetArgClust(argIdx)
            arg._argPart.unsetParent()

        pp.changeClust(nci, nrti, clust_only=True)

        # Re-attach them against the new cluster's argument clusters.
        for argIdx, arg in pp.getArguments().items():
            aci = pick_arg_clust(arg._path.getArgType())
            arg._argPart.setParent(pp, argIdx)
            pp.setArgClustOnly(argIdx, aci)

        pp.setRelTypeIdx(nrti)

        # Move the child's arguments over to the parent. getArguments()
        # is a mapping, so iterate its items; add each argument exactly
        # once and reuse the index it was given.
        for argIdx, arg in cp.getArguments().items():
            aci = pick_arg_clust(arg._path.getArgType())
            cp.unsetArgClustOnly(argIdx)
            newArgIdx = pp.addArgument(arg)
            pp.setArgClust(newArgIdx, aci)
            arg._argPart.setParent(pp, newArgIdx)

        cp.destroy()

        return None
Exemple #4
0
    def genString(self):
        """Build the textual label of this operation into ``self._str``.

        The label always starts with ``OP_<op>:``; merge-cluster, merge-role
        and compose operations append a description of their operands.
        Any other operation keeps just the prefix.
        """
        self._str = "OP_{}:".format(self._op)

        if self._op == SearchOp.OP_MERGE_CLUST:
            left = Clust.getClust(self._clustIdx1)
            right = Clust.getClust(self._clustIdx2)
            detail = "{} == {}".format(left.toString(), right.toString())
        elif self._op == SearchOp.OP_MERGE_ROLE:
            detail = "{}:{}:{}".format(self._clustIdx, self._argIdx1,
                                       self._argIdx2)
        elif self._op == SearchOp.OP_COMPOSE:
            par = Clust.getClust(self._parClustIdx)
            chd = Clust.getClust(self._chdClustIdx)
            detail = "{} ++ {}".format(par.toString(), chd.toString())
        else:
            return

        self._str += detail
Exemple #5
0
    def part_from_node(ai, sj, sent, k, tok):
        """Create the tree node and part for token ``k`` and assign a cluster.

        Tokens for which ``Parse.isIgnore(sent, k)`` is true are skipped.
        Otherwise the part is placed in the first cluster returned for its
        relation type, or in a newly created cluster when none is returned.

        Returns:
            None in every case.
        """
        if Parse.isIgnore(sent, k):
            return None

        node = TreeNode(genTreeNodeID(ai, sj, k), tok)
        part = Part(node)
        rel_type = part.getRelTypeIdx()
        candidates = Clust.getClustsWithRelType(rel_type)

        if candidates is None:
            target = Clust.createClust(rel_type)
        else:
            target = next(iter(candidates))

        part.setClust(target)

        return None
Exemple #6
0
    def setArgs(self, art_id, sent_id, sent, idx):
        """Recursively attach the children of token ``idx`` as arguments.

        For every ``(dependency, child_index)`` pair returned by
        ``sent.get_children(idx)`` an ``Argument`` is created on the part of
        token ``idx``, the child part gets that part as its parent, and the
        argument is assigned (with ``clust_only=True``) to the first
        argument cluster of the matching type, or to a new one. The same is
        then done for the child's own children.

        Children that have no part or that already have a parent part are
        skipped, together with their subtrees.

        Args:
            art_id: Article id used to build tree node ids.
            sent_id: Sentence id used to build tree node ids.
            sent: Sentence object providing ``get_children``.
            idx: Index of the token whose children are attached.

        Returns:
            None.
        """
        this_part = Part.getPartByRootNodeId(
            genTreeNodeID(art_id, sent_id, idx))
        this_node = this_part.getRelTreeRoot()
        node_clust = Clust.getClust(this_part.getClustIdx())
        children = sent.get_children(idx)

        if children is None:
            return None

        for dependency, child_index in children:
            child_node_id = genTreeNodeID(art_id, sent_id, child_index)
            path = Path(dependency)
            argTypeIdx = path.getArgType()
            child_part = Part.getPartByRootNodeId(child_node_id)

            # A missing part cannot be attached; a part that already has a
            # parent must not be attached twice.
            if child_part is None or child_part.getParPart() is not None:
                continue

            arg = Argument(this_node, path, child_part)
            argIdx = this_part.addArgument(arg)
            child_part.setParent(this_part, argIdx)
            argClustIdxs = node_clust.getArgClustIdxs(argTypeIdx)

            if argClustIdxs is None:
                argClustIdx = node_clust.createArgClust(argTypeIdx)
            else:
                argClustIdx = next(iter(argClustIdxs))

            this_part.setArgClust(argIdx, argClustIdx, clust_only=True)

            # Recurse through the bound method; the bare name is not in
            # scope inside a method body.
            self.setArgs(art_id, sent_id, sent, child_index)

        return None
Exemple #7
0
    def setArgClust(self, argIdx, argClustIdx, clust_only=False):
        """Assign argument ``argIdx`` to argument cluster ``argClustIdx``.

        Both lookup tables (argument -> cluster and cluster -> arguments)
        are updated; an emptied old cluster entry is dropped. Nothing
        happens when the argument is already in the requested cluster.

        Args:
            argIdx: Index of the argument within this part.
            argClustIdx: Index of the target argument cluster.
            clust_only: When true, the owning cluster is not notified via
                ``onPartSetArg``.

        Returns:
            None.
        """
        if argIdx in self._argIdx_argClustIdx:
            previous = self.getArgClust(argIdx)
        else:
            previous = -1

        if previous == argClustIdx:
            return None

        self._argIdx_argClustIdx[argIdx] = argClustIdx

        if argClustIdx not in self._argClustIdx_argIdxs:
            self._argClustIdx_argIdxs[argClustIdx] = set()

        self._argClustIdx_argIdxs[argClustIdx].add(argIdx)
        arg = self.getArgument(argIdx)
        notify = not clust_only

        if notify:
            cl = Clust.getClust(self._clustIdx)

        if previous < 0:
            # First assignment for this argument: nothing to clean up.
            if notify:
                cl.onPartSetArg(self, arg, argClustIdx)
            return None

        former_members = self._argClustIdx_argIdxs[previous]
        former_members.remove(argIdx)

        if len(former_members) == 0:
            del self._argClustIdx_argIdxs[previous]

        if notify:
            cl.onPartSetArg(self, arg, argClustIdx, previous)

        return None
Exemple #8
0
    def createAgenda(self, verbose=False):
        """Seed the agenda from every eligible cluster.

        Walks the cluster ids returned by ``Part.getClustPartRootNodeIds()``
        and calls ``addAgendaForNewClust`` for each cluster whose type is
        ``'C'`` and that is not a stop cluster.

        Args:
            verbose: When true, print a progress line each time another
                tenth (10% .. 90%) of the handled clusters is reached.

        Returns:
            None.
        """
        if verbose:
            total = len(Part.getClustPartRootNodeIds())
            pending_tenths = set(range(1, 10))
            handled = 0

        for clust_id in Part.getClustPartRootNodeIds():
            clust = Clust.getClust(clust_id)

            if clust.getType() != 'C' or clust.isStop():
                continue

            self.addAgendaForNewClust(clust_id, verbose)

            if not verbose:
                continue

            handled += 1
            tenth = math.floor(handled * 10 / total)

            if tenth in pending_tenths:
                pending_tenths.remove(tenth)
                print("{}% complete.".format(tenth * 10))

        return None
Exemple #9
0
    def changeClustRemap(self,
                         newClustIdx,
                         argClustIdx_newArgClustIdx,
                         clust_only=False):
        """Move this part to ``newClustIdx`` and remap its argument clusters.

        Args:
            newClustIdx: Index of the cluster to move into.
            argClustIdx_newArgClustIdx: Mapping from each current argument
                cluster index to the argument cluster index to use in the
                new cluster.
            clust_only: When true, the old cluster is not notified via
                ``onPartUnsetArg``; the flag is also forwarded to
                ``changeClust`` and ``setArgClust``.

        Returns:
            None.
        """
        notify = not clust_only

        if notify:
            # Must be looked up before changeClust() replaces the index.
            ocl = Clust.getClust(self.getClustIdx())

        self.changeClust(newClustIdx,
                         self.getRelTypeIdx(),
                         clust_only=clust_only)

        remapped = {}

        # First pass: drop every argument from its old argument cluster and
        # remember where it has to go.
        for ai, arg in self._args.items():
            old_aci = self._argIdx_argClustIdx.pop(ai)
            members = self._argClustIdx_argIdxs[old_aci]
            members.remove(ai)

            if len(members) == 0:
                del self._argClustIdx_argIdxs[old_aci]

            remapped[ai] = argClustIdx_newArgClustIdx[old_aci]

            if notify:
                ocl.onPartUnsetArg(self, arg, old_aci)

        # Second pass: assign the remembered targets.
        for ai in self._args:
            self.setArgClust(ai, remapped[ai], clust_only=clust_only)

        return None
Exemple #10
0
    def unsetArgClust(self, argIdx, clust_only=False):
        """Remove argument ``argIdx`` from its argument cluster.

        Args:
            argIdx: Index of the argument within this part.
            clust_only: When true, the owning cluster is not notified via
                ``onPartUnsetArg``.

        Returns:
            None.

        Raises:
            KeyError: If the argument has no argument cluster assigned.
        """
        former = self._argIdx_argClustIdx.pop(argIdx)
        arg = self.getArgument(argIdx)
        members = self._argClustIdx_argIdxs[former]
        members.remove(argIdx)

        if len(members) == 0:
            del self._argClustIdx_argIdxs[former]

        if clust_only:
            return None

        Clust.getClust(self.getClustIdx()).onPartUnsetArg(self, arg, former)

        return None
Exemple #11
0
    def setClust(self, clustIdx, clust_only=False):
        """Assign this part to cluster ``clustIdx``.

        The part's root node id is recorded under ``clustIdx`` in
        ``Part.clustIdx_partRootNodeIds``.

        Args:
            clustIdx: Index of the cluster to assign.
            clust_only: When true, the cluster is not notified via
                ``onPartSetClust``.

        Returns:
            None.
        """
        self._clustIdx = clustIdx
        root_id = self.getRelTreeRoot().getId()
        registry = Part.clustIdx_partRootNodeIds

        if clustIdx not in registry:
            registry[clustIdx] = SortedSet()

        registry[clustIdx].add(root_id)

        if clust_only:
            return None

        Clust.getClust(clustIdx).onPartSetClust(self)

        return None
Exemple #12
0
    def addAgendaMC(self, clustIdx1, clustIdx2, neighType):
        """Consider a merge-cluster operation for two clusters.

        Nothing is done when merge-cluster ops are skipped, when both
        indexes are equal, or when either cluster is not of type ``'C'``.
        Otherwise the op (with the smaller index first) is either moved by
        ``moveAgendaToScore``, queued in ``_agendaToScore`` once the number
        of recorded neighbour types plus one reaches
        ``ParseParams.minMCCnt``, or has ``neighType`` recorded for it.

        Returns:
            None.
        """
        if self._skipMC or clustIdx1 == clustIdx2:
            return None

        type1 = Clust.getClust(clustIdx1).getType()
        type2 = Clust.getClust(clustIdx2).getType()

        if type2 != 'C' or type1 != 'C':
            return None

        op = SearchOp()
        op._op = SearchOp.OP_MERGE_CLUST
        op._clustIdx1 = min(clustIdx1, clustIdx2)
        op._clustIdx2 = max(clustIdx1, clustIdx2)

        if self.moveAgendaToScore(op):
            return None

        if op not in self._mc_neighs:
            self._mc_neighs[op] = set()

        seen_types = self._mc_neighs[op]

        if len(seen_types) + 1 < ParseParams.minMCCnt:
            seen_types.add(neighType)
        else:
            self._agendaToScore.add(op)
            del self._mc_neighs[op]

        return None
Exemple #13
0
    def initializeSent(self, ai, sj, sent, verbose=False):
        '''
            Create the parts for every token of a sentence (via
            Parse.part_from_node), then, for each child of token 0 (the
            sentence roots): record its tree node id, increment the root
            count of its cluster and run createArgs() to build the
            parent-child argument structure below it.

            The token count (minus token 0) is added to self.numTkns even
            when the sentence has no roots.

            Args:
                ai: Article id used to build tree node ids.
                sj: Sentence id used to build tree node ids.
                sent: Sentence providing get_tokens / get_token /
                    get_children.
                verbose: Forwarded to createArgs().

            Returns:
                None.
        '''
        self.numTkns += len(sent.get_tokens()) - 1
        roots = sent.get_children(0)

        if not roots:
            return None

        for k in range(1, len(sent.get_tokens())):
            Parse.part_from_node(ai, sj, sent, k, sent.get_token(k))

        for _, idx in roots:
            sub_node_id = genTreeNodeID(ai, sj, idx)
            self.rootTreeNodeIds.add(sub_node_id)
            node_part = Part.getPartByRootNodeId(sub_node_id)

            if node_part is None:
                continue

            ncl = Clust.getClust(node_part.getClustIdx())
            ncl.incRootCnt()
            # Pass verbose by keyword: the fifth positional parameter of
            # createArgs is ``done``, not ``verbose``.
            self.createArgs(ai, sj, sent, idx, verbose=verbose)

        return None
Exemple #14
0
    def scoreMCForParent(self, clustIdx1, clustIdx2):
        """Score a cluster merge from the parent arguments both clusters share.

        For every (parent cluster, argument cluster) pair present for both
        clusters in ``Clust.clustIdx_parArgs``, adds
        ``ParseParams.priorNumParam`` plus ``Scorer.updateScore`` of the two
        child-cluster counts in that argument cluster.

        Returns:
            The accumulated score; 0 when either cluster has no entry in
            ``Clust.clustIdx_parArgs``.
        """
        total = 0
        par_args = Clust.clustIdx_parArgs

        if clustIdx1 not in par_args or clustIdx2 not in par_args:
            return total

        parents1 = par_args[clustIdx1]
        parents2 = par_args[clustIdx2]

        for par_arg in parents1:
            if par_arg not in parents2:
                continue

            par_clust_id, arg_clust_id = par_arg
            par_clust = Clust.getClust(par_clust_id)

            if par_clust is None:
                print("ERR: ScoreMC parent cluster is null: {}, {}".
                      format(clustIdx1, clustIdx2))
                continue

            arg_clust = par_clust._argClusts[arg_clust_id]
            cnt1 = arg_clust._chdClustIdx_cnt[clustIdx1]
            cnt2 = arg_clust._chdClustIdx_cnt[clustIdx2]
            total += ParseParams.priorNumParam
            total += Scorer.updateScore(cnt1, cnt2)

        return total
Exemple #15
0
    def createArgs(self,
                   art_id,
                   sent_id,
                   sent,
                   parent_id,
                   done=None,
                   verbose=False):
        '''
            Recursively attach the children of token ``parent_id`` as
            arguments of its part.

            For each (relation, child_id) pair returned by
            sent.get_children(parent_id), the dependency relation defines a
            Path and argument type; an Argument linking parent and child is
            added to the parent part and assigned to the first argument
            cluster of that type in the parent's cluster (created when
            missing). The same is then done for the child's own children.

            Children without a part, or that already have a parent part,
            are skipped together with their subtrees. Skipping children
            that already have a parent also stops the recursion from
            looping on malformed (cyclic) dependencies.

            Args:
                art_id: Article id used to build tree node ids.
                sent_id: Sentence id used to build tree node ids.
                sent: Sentence providing get_children.
                parent_id: Index of the token whose children are attached.
                done: Currently unused; accepted and passed along for
                    backward compatibility.
                verbose: When true, print a message for each skipped child.

            Returns:
                None.
        '''
        parent_node_id = genTreeNodeID(art_id, sent_id, parent_id)
        parent = TreeNode.getTreeNode(parent_node_id)
        parent_part = Part.getPartByRootNodeId(parent_node_id)
        parent_clust = Clust.getClust(parent_part.getClustIdx())
        children = sent.get_children(parent_id)

        if children is None:
            return None

        for relation, child_id in children:
            child_node_id = genTreeNodeID(art_id, sent_id, child_id)
            path = Path(relation)
            arg_type_id = path.getArgType()

            child_part = Part.getPartByRootNodeId(child_node_id)

            if child_part is None:
                if verbose:
                    print("Child node id {} has no part".format(
                        child_node_id))
                # Nothing to attach; going on would dereference None.
                continue

            if child_part.getParPart() is not None:
                if verbose:
                    print("Child node id {} already has "
                          "parent {}".format(
                              child_node_id,
                              child_part.getParPart().getRelTreeRoot().
                              getId()))
                continue

            arg = Argument(parent, path, child_part)
            arg_id = parent_part.addArgument(arg)
            child_part.setParent(parent_part, arg_id)

            arg_clust_ids = parent_clust.getArgClustIdxs(arg_type_id)

            if arg_clust_ids is None:
                arg_clust_id = parent_clust.createArgClust(arg_type_id)
            else:
                arg_clust_id = next(iter(arg_clust_ids))

            parent_part.setArgClust(arg_id, arg_clust_id)

            self.createArgs(art_id, sent_id, sent, child_id,
                            done=done, verbose=verbose)

        return None
Exemple #16
0
    def execMC(self, op):
        """Execute a merge-cluster operation.

        The cluster with fewer argument clusters is merged into the other
        one (ties keep ``op._clustIdx1`` as the target): argument clusters
        are aligned by ``scoreMCForAlign``, unaligned ones are mapped by
        their first argument type, every part of the absorbed cluster is
        remapped into the target, and the absorbed cluster is removed.

        Returns:
            The id of the surviving cluster, or -1 when either cluster of
            the op cannot be found.
        """
        target = Clust.getClust(op._clustIdx1)
        absorbed = Clust.getClust(op._clustIdx2)

        if target is None or absorbed is None:
            return -1

        # Always merge the cluster with fewer argument clusters into the
        # one with more.
        if len(target._argClusts) < len(absorbed._argClusts):
            target, absorbed = absorbed, target

        # Score-based alignment of argument clusters (absorbed -> target).
        _, aci2_aci1 = self._parse.scorer.scoreMCForAlign(
            target, absorbed, dict())

        # Anything left unaligned is mapped through its first argument type.
        for absorbed_aci in absorbed._argClusts:
            if absorbed_aci in aci2_aci1:
                continue

            leftover = absorbed._argClusts[absorbed_aci]

            for arg_type in leftover._argTypeIdx_cnt:
                candidates = target.getArgClustIdxs(arg_type)

                if candidates is None:
                    target_aci = target.createArgClust(arg_type)
                else:
                    target_aci = next(iter(candidates))

                aci2_aci1[absorbed_aci] = target_aci
                break

        # Snapshot the part ids first: remapping changes the live index.
        part_ids = set()
        part_ids.update(Part.getPartRootNodeIds(absorbed.getId()))

        for part_id in part_ids:
            part = Part.getPartByRootNodeId(part_id)

            for arg in part.getArguments().values():
                arg._argPart.unsetParent()

            part.changeClustRemap(target.getId(), aci2_aci1)

            for argIdx, arg in part.getArguments().items():
                arg._argPart.setParent(part, argIdx)

        Clust.removeClust(absorbed)

        return target.getId()
Exemple #17
0
    def changeClust(self, newClustIdx, newRelTypeIdx, clust_only=False):
        """Move this part from its current cluster to ``newClustIdx``.

        Besides re-registering the part, the bookkeeping that depends on
        the part's cluster is moved from the old to the new cluster index:
        root counts for parts without a parent; otherwise the parent
        argument cluster's child-cluster counts, ``Clust.clustIdx_parArgs``
        and ``Part.pairClustIdxs_pairPartRootNodeIds``.

        Args:
            newClustIdx: Index of the cluster to move into.
            newRelTypeIdx: Relation type index to set on the part.
            clust_only: When true, the old cluster is not notified and the
                relation type is set directly instead of through
                ``setRelTypeIdx``; the flag is also forwarded to
                ``setClust``.

        Returns:
            None.
        """
        oldClustIdx = self.getClustIdx()
        rootID = self.getRelTreeRoot().getId()
        Part.clustIdx_partRootNodeIds[oldClustIdx].discard(rootID)

        if clust_only:
            self._relTypeIdx = newRelTypeIdx
        else:
            ocl = Clust.getClust(oldClustIdx)
            ocl.onPartUnsetClust(self)
            self.setRelTypeIdx(newRelTypeIdx)

        self.setClust(newClustIdx, clust_only=clust_only)

        parent = self.getParPart()

        if parent is None:
            # Root part: move one root occurrence from the old cluster to
            # the new one.
            if newClustIdx in Clust.clustIdx_rootCnt:
                Clust.clustIdx_rootCnt[newClustIdx] += 1
            else:
                Clust.clustIdx_rootCnt[newClustIdx] = 1

            # The decrement belongs to the OLD cluster; applying it to the
            # new one would cancel the increment above. Guarded so that an
            # old cluster without a root-count entry does not raise.
            if oldClustIdx in Clust.clustIdx_rootCnt:
                Clust.clustIdx_rootCnt[oldClustIdx] -= 1

            return None

        parent_clust_id = parent.getClustIdx()
        paci = parent.getArgClust(self.getParArgIdx())
        pcl = Clust.getClust(parent_clust_id)
        pac = pcl._argClusts[paci]

        # Child-cluster counts inside the parent's argument cluster.
        pac._chdClustIdx_cnt[oldClustIdx] -= 1

        if newClustIdx in pac._chdClustIdx_cnt:
            pac._chdClustIdx_cnt[newClustIdx] += 1
        else:
            pac._chdClustIdx_cnt[newClustIdx] = 1

        # (parent cluster, argument cluster) occurrences per child cluster.
        pa = (parent_clust_id, paci)
        Clust.clustIdx_parArgs[oldClustIdx][pa] -= 1

        if newClustIdx not in Clust.clustIdx_parArgs:
            Clust.clustIdx_parArgs[newClustIdx] = {}

        if pa in Clust.clustIdx_parArgs[newClustIdx]:
            Clust.clustIdx_parArgs[newClustIdx][pa] += 1
        else:
            Clust.clustIdx_parArgs[newClustIdx][pa] = 1

        # (parent cluster, child cluster) -> set of (parent root, child root).
        opci = (parent_clust_id, oldClustIdx)
        npci = (parent_clust_id, newClustIdx)
        ptnid = (parent.getRelTreeRoot().getId(), rootID)

        Part.pairClustIdxs_pairPartRootNodeIds[opci].discard(ptnid)

        if len(Part.pairClustIdxs_pairPartRootNodeIds[opci]) == 0:
            del Part.pairClustIdxs_pairPartRootNodeIds[opci]

        if npci not in Part.pairClustIdxs_pairPartRootNodeIds:
            Part.pairClustIdxs_pairPartRootNodeIds[npci] = set()

        Part.pairClustIdxs_pairPartRootNodeIds[npci].add(ptnid)

        return None
Exemple #18
0
    def scoreOpMC(self, op):
        """Score a merge-cluster operation.

        The score starts at ``-ParseParams.priorMerge`` and is then
        adjusted, in this order, by: a penalty for recorded conjunction
        counts of the pair, a penalty from the clusters' total counts,
        bonuses for shared relation types, a bonus for root counts when
        both clusters have one, the shared-parent score, and the
        argument-cluster alignment score.

        Args:
            op: Operation with ``_clustIdx1 < _clustIdx2``.

        Returns:
            The resulting score.
        """
        first_id, second_id = op._clustIdx1, op._clustIdx2
        assert first_id < second_id

        score = 0 - ParseParams.priorMerge

        # Pairs recorded in Clust.pairClustIdx_conjCnt are penalised
        # proportionally to their conjunction count.
        pair = (first_id, second_id)

        if pair in Clust.pairClustIdx_conjCnt:
            score -= (ParseParams.priorNumConj
                      * Clust.pairClustIdx_conjCnt[pair])

        first = Clust.getClust(first_id)
        second = Clust.getClust(second_id)

        # Penalty computed from the two clusters' total counts.
        score -= Scorer.updateScore(first._ttlCnt, second._ttlCnt)

        # Bonus for every relation type both clusters contain.
        for reltype, cnt_first in first._relTypeIdx_cnt.items():
            if reltype not in second._relTypeIdx_cnt:
                continue

            cnt_second = second._relTypeIdx_cnt[reltype]
            score += Scorer.updateScore(cnt_first, cnt_second)
            score += ParseParams.priorNumParam

        # Bonus when both clusters have a root count.
        root_cnt = Clust.clustIdx_rootCnt

        if first_id in root_cnt and second_id in root_cnt:
            roots_first = root_cnt[first_id]
            roots_second = root_cnt[second_id]
            score += Scorer.updateScore(roots_first, roots_second)
            score += ParseParams.priorNumParam

        # Contribution of shared parents.
        score += self.scoreMCForParent(first_id, second_id)

        # Alignment is scored with the cluster that has more argument
        # clusters in first position.
        if len(second._argClusts) > len(first._argClusts):
            larger, smaller = second, first
        else:
            larger, smaller = first, second

        align_score, _ = self.scoreMCForAlign(larger, smaller, dict())
        score += align_score

        return score
Exemple #19
0
    def setRelTypeIdx(self, newRelTypeIdx):
        """Store the new relation type index and notify the part's cluster.

        Args:
            newRelTypeIdx: Relation type index to store on this part.

        Returns:
            None.
        """
        self._relTypeIdx = newRelTypeIdx
        Clust.getClust(self._clustIdx).onPartSetRelTypeIdx(newRelTypeIdx)

        return None
Exemple #20
0
    def execCompose(self, op):
        """Execute a compose operation for a (parent, child) cluster pair.

        Every (parent part, child part) pair recorded for the two clusters
        in ``Part.pairClustIdxs_pairPartRootNodeIds`` is composed: the
        child's relation tree is attached under the parent's, the parent is
        moved to the cluster of the resulting relation type, the child's
        arguments are re-attached to the parent, and the child part is
        destroyed.

        Args:
            op: Operation carrying ``_parClustIdx`` and ``_chdClustIdx``.

        Returns:
            The id of the cluster the composed parts were placed in, or -1
            when either cluster of the op cannot be found (also -1 when no
            part pair was processed).

        Raises:
            Exception: If more than one cluster exists for the composed
                relation type.
            KeyError: If no part pairs are recorded for the cluster pair.
        """
        parClustIdx = op._parClustIdx
        chdClustIdx = op._chdClustIdx
        new_clust_id = -1

        #
        # If either cluster are None, return -1
        #
        if Clust.getClust(parClustIdx) is None or Clust.getClust(
                chdClustIdx) is None:
            return -1

        new_clust = None
        parent_child_pair = (parClustIdx, chdClustIdx)
        # Work on a snapshot: the loop below mutates the global pair index.
        part_ids = set()
        part_ids.update(
            Part.pairClustIdxs_pairPartRootNodeIds[parent_child_pair])

        # Root node ids of child parts already destroyed in this call.
        deleted_parts = []

        for parent_id, child_id in part_ids:
            # Skip pairs that involve a part destroyed by an earlier pair.
            if parent_id in deleted_parts or child_id in deleted_parts:
                continue

            parent_part = Part.getPartByRootNodeId(parent_id)
            child_part = Part.getPartByRootNodeId(child_id)
            dep = parent_part.getArguments()[
                child_part._parArgIdx]._path.getDep()
            # Attach the child's tree; the parent's relation type is then
            # derived from the combined tree.
            parent_part._relTreeRoot.addChild(dep, child_part._relTreeRoot)
            nrti = RelType.getRelType(parent_part._relTreeRoot)

            if new_clust is None:
                # on first loop
                # The target cluster is resolved once and reused for all
                # remaining pairs.
                rel_clusts = Clust.getClustsWithRelType(nrti)
                if rel_clusts is None:
                    new_clust = Clust.getClust(Clust.createClust(nrti))
                elif len(rel_clusts) > 1:
                    raise Exception
                else:
                    new_clust = Clust.getClust(next(iter(rel_clusts)))

                new_clust_id = new_clust.getId()

            # The child stops being an argument of the parent.
            parent_part.removeArgument(child_part._parArgIdx)

            if parent_part.getClustIdx() != new_clust_id:
                # Detach the remaining arguments before switching cluster.
                for argIdx in parent_part.getArguments():
                    parent_part.unsetArgClust(argIdx)
                    arg = parent_part.getArgument(argIdx)
                    arg._argPart.unsetParent()

                parent_part.changeClust(new_clust_id, nrti)

                # Re-attach them against the new cluster, reusing the first
                # argument cluster of each type or creating one.
                for argIdx, arg in parent_part.getArguments().items():
                    arg_type = arg._path.getArgType()
                    arg_clust_id = -1

                    if arg_type not in new_clust._argTypeIdx_argClustIdxs:
                        arg_clust_id = new_clust.createArgClust(arg_type)
                    elif len(
                            new_clust._argTypeIdx_argClustIdxs[arg_type]) == 0:
                        arg_clust_id = new_clust.createArgClust(arg_type)
                    else:
                        arg_clust_id = next(
                            iter(new_clust._argTypeIdx_argClustIdxs[arg_type]))

                    arg._argPart.setParent(parent_part, argIdx)
                    parent_part.setArgClust(argIdx, arg_clust_id)

                parent_part.setRelTypeIdx(nrti)
            else:
                # Parent already lives in the target cluster: only its
                # relation type changes.
                parent_part.unsetRelTypeIdx()
                parent_part.setRelTypeIdx(nrti)

            #
            # Connect the child part's arguments directly to the parent part now
            #

            for argIdx, arg in child_part.getArguments().items():
                child_part.unsetArgClust(argIdx)
                arg_type = arg._path.getArgType()
                arg_clust_id = -1

                if arg_type not in new_clust._argTypeIdx_argClustIdxs:
                    arg_clust_id = new_clust.createArgClust(arg_type)
                elif len(new_clust._argTypeIdx_argClustIdxs[arg_type]) == 0:
                    arg_clust_id = new_clust.createArgClust(arg_type)
                else:
                    arg_clust_id = next(
                        iter(new_clust._argTypeIdx_argClustIdxs[arg_type]))

                newArgIdx = parent_part.addArgument(arg)
                arg._argPart.setParent(parent_part, newArgIdx)
                parent_part.setArgClust(newArgIdx, arg_clust_id)

            #
            # Remove the old child part
            #

            deleted_parts.append(child_part.getRelTreeRoot().getId())
            child_part.destroy()

        # Part.clustIdx_pairClustIdxs[parClustIdx].remove(pci)
        # Part.clustIdx_pairClustIdxs[chdClustIdx].remove(pci)
        # NOTE(review): assumes the pair entry still exists at this point;
        # confirm that destroy()/changeClust() never remove it earlier.
        del Part.pairClustIdxs_pairPartRootNodeIds[parent_child_pair]

        return new_clust_id
Exemple #21
0
    def scoreOpComposePart(self, pp, cp):
        """Score composing child part ``cp`` into parent part ``pp``.

        The child's relation tree is attached under the parent's tree (this
        mutates the parent's tree) to obtain the composed relation type. If
        no cluster exists for that type the score is 0. Otherwise the score
        compares log-counts of the parent in its current cluster against
        the first cluster returned for the composed relation type.

        Args:
            pp: Parent part.
            cp: Child part that is currently an argument of ``pp``.

        Returns:
            The score (0 when no cluster exists for the composed type).
        """
        score = 0
        rcl = Clust.getClust(pp._clustIdx)

        # The dependency label must be known before the child tree can be
        # attached under the parent tree.
        pai = cp._parArgIdx
        dep = pp.getArguments()[pai]._path.getDep()

        ptn = pp._relTreeRoot
        ptn.addChild(dep, cp._relTreeRoot)
        nrti = RelType.getRelType(ptn)

        rel_clusts = Clust.getClustsWithRelType(nrti)

        if rel_clusts is None:
            return score

        # Collections have no .next() method; take the first element
        # through the iterator protocol.
        ncl = Clust.getClust(next(iter(rel_clusts)))
        nci = ncl._clustIdx

        if pp.getParPart() is not None:
            ppp = pp.getParPart()
            ppcl = Clust.getClust(ppp.getClustIdx())
            ac = ppcl._argClusts[ppp.getArgClust(pp.getParArgIdx())]
            oc = ac._chdClustIdx_cnt[rcl._clustIdx]
            nc = ac._chdClustIdx_cnt[nci]
        else:
            # Root counts are keyed by cluster index, not by Clust object.
            oc = Clust.clustIdx_rootCnt[rcl._clustIdx]
            nc = Clust.clustIdx_rootCnt[nci]

        score += log(nc) - log(oc)

        # Remove the contribution of the parent's current argument clusters.
        for aci, ais in pp._argClustIdx_argIdxs.items():
            ac = rcl._argClusts[aci]
            score -= (log(ac._argNum_cnt[len(ais)]) - log(ac._ttlArgCnt))

            for ai in ais:
                arg = pp.getArgument(ai)
                score -= (log(ac._chdClustIdx_cnt[arg._argPart._clustIdx])
                          - log(ac._ttlArgCnt))
                score -= (log(ac._argTypeIdx_cnt[arg._path.getArgType()])
                          - log(ac._ttlArgCnt))

        # Map every remaining argument (all but the composed child) to the
        # first argument cluster of its type in the new cluster.
        ai_newaci = dict()

        for ai, arg in pp._args.items():
            if ai == pai:
                continue

            ati = arg._path.getArgType()
            ai_newaci[ai] = next(iter(ncl._argTypeIdx_argClustIdxs[ati]))

        # Group the argument indexes by their new argument cluster.
        newArgClustIdx_ais = dict()

        for ai, aci in ai_newaci.items():
            if aci not in newArgClustIdx_ais:
                newArgClustIdx_ais[aci] = set()

            # Add the single argument index, not a leftover set from the
            # earlier loop.
            newArgClustIdx_ais[aci].add(ai)

        for aci, ais in newArgClustIdx_ais.items():
            ac = ncl._argClusts[aci]
            score += (log(ac._argNum_cnt[len(ais)]) - log(ac._ttlArgCnt))

            # NOTE(review): these per-argument terms are subtracted, like
            # the ones for the old cluster above; confirm whether they
            # should be added for the new cluster instead.
            for ai in ais:
                arg = pp.getArgument(ai)
                score -= (log(ac._chdClustIdx_cnt[arg._argPart._clustIdx])
                          - log(ac._ttlArgCnt))
                score -= (log(ac._argTypeIdx_cnt[arg._path.getArgType()])
                          - log(ac._ttlArgCnt))

        return score
Exemple #22
0
    def reparse(self, aid, si):
        """Rebuild the parts of one sentence and greedily re-compose them.

        The existing parts of the sentence are unregistered, fresh parts
        are created per token, and, when the sentence has exactly one
        root, the argument structure is rebuilt and the best-scoring
        parent/child composition is applied repeatedly until no
        composition has a positive score. Cluster statistics are then
        updated for the removed and the new parts.

        Args:
            aid: Article id, used to look the article up in ``id_article``.
            si: Index of the sentence within the article.

        Returns:
            None.
        """
        a = id_article[aid]
        sent = a.sentences[si]

        roots = sent.get_children(0)

        if roots is None or len(roots) == 0:
            return None

        # Unregister the current parts of the sentence, keeping them so
        # their statistics can be removed at the end.
        old_nid_part = {}

        for ni in range(len(sent.get_tokens())):
            if Parse.isIgnore(sent, ni):
                continue
            nid = genTreeNodeID(aid, si, ni)
            np = Part.getPartByRootNodeId(nid)
            # NOTE(review): other code in this file uses
            # Part.rootNodeId_part; confirm which attribute name is right.
            del Part.rootTreeNodeId_part[nid]
            old_nid_part[nid] = np

        nid_part = {}

        for ni in range(len(sent.get_tokens())):
            if Parse.isIgnore(sent, ni):
                continue
            # NOTE(review): this expects part_from_node to accept four
            # arguments and return (part, clustIdx); verify against its
            # definition.
            part, clustIdx = Parse.part_from_node(aid, si, sent, ni)
            nid_part[genTreeNodeID(aid, si, ni)] = part
            part.setClust(clustIdx, clust_only=True)

        if len(roots) == 1:
            _, idx = next(iter(roots))
            nid = genTreeNodeID(aid, si, idx)
            np = Part.getPartByRootNodeId(nid)

            if np is not None:
                # setArgs is a method; call it through the instance.
                self.setArgs(aid, si, sent, idx)

            while True:
                rp, ap = None, None
                maxImp = 0

                for prt in nid_part.values():
                    for arg in prt.getArguments().values():
                        # Scoring and composing work on parts, so use the
                        # part held by the argument, not the argument.
                        child = arg._argPart
                        score = self.scorer.scoreOpComposePart(prt, child)

                        if score > maxImp:
                            maxImp = score
                            rp, ap = prt, child

                if maxImp <= 0:
                    break

                self.executor.execComposePart(rp, ap)
                del nid_part[ap.getRelTreeRoot().getId()]

            Clust.removePartAndUpdateStat(old_nid_part)
            Clust.updatePartStat(nid_part)

        return None
Exemple #23
0
    def unsetRelTypeIdx(self):
        """Notify this part's cluster that its relation type is being unset.

        Reads ``self._relTypeIdx`` and passes it to ``onPartUnsetRelTypeIdx``
        on the cluster found via ``Clust.getClust(self._clustIdx)``.  The
        attribute ``self._relTypeIdx`` itself is not modified here.

        Returns:
            Always ``None``.
        """
        # Capture the relation type before looking up the cluster.
        previous_rel_type = self._relTypeIdx
        Clust.getClust(self._clustIdx).onPartUnsetRelTypeIdx(previous_rel_type)

        return None
Exemple #24
0
    def scoreOpCompose(self, rcidx, acidx):
        """Score the composition of cluster ``rcidx`` with cluster ``acidx``.

        Every (parent part, child part) pair returned by
        ``Part.getPairPartRootNodeIds(rcidx, acidx)`` is walked once to build
        "old" and "new" count tables; the score is then assembled from
        ``xlogx`` terms over those tables and ``ParseParams.priorNumParam``
        adjustments.

        Naming used for the local tallies:
          * prefix ``r``  - counts taken from the ``rcidx`` cluster (parent
            side of each pair),
          * prefix ``a``  - counts taken from the ``acidx`` cluster (child
            side of each pair),
          * prefix ``ra`` - counts for the pair taken together,
          * infix ``New`` - counts accumulated from scratch for the composed
            result, as opposed to decremented copies of existing counts.

        NOTE(review): the result is presumably a log-likelihood delta as in
        the Java ``Scorer`` this was ported from - confirm against that code.

        Args:
            rcidx: Index of the parent-side cluster (resolved via
                ``Clust.getClust``).
            acidx: Index of the child-side cluster (resolved via
                ``Clust.getClust``).

        Returns:
            ``-10000`` when ``Part.getPairPartRootNodeIds`` returns ``None``;
            otherwise the accumulated numeric score.
        """
        def update_score_from_dict(scr, d, orig_d):
            """Apply the old-vs-new count terms of one count table to ``scr``.

            For each key in ``d``: subtract ``xlogx`` of the original count
            found in ``orig_d``; then add ``xlogx`` of the new count when it
            is positive, or ``ParseParams.priorNumParam`` when it is not.
            Returns the updated score.
            """
            for key, cnt in d.items():
                origcnt = orig_d[key]
                # assert origcnt >= cnt
                scr -= xlogx(origcnt)

                if cnt > 0:
                    scr += xlogx(cnt)
                else:
                    scr += ParseParams.priorNumParam

            return scr

        # get parent and child root-node id numbers
        parChdNids = Part.getPairPartRootNodeIds(rcidx, acidx)

        # No pairs to compose: return a fixed large negative score.
        if parChdNids is None:
            return -10000

        score = 0
        rcl = Clust.getClust(rcidx)
        acl = Clust.getClust(acidx)

        # Parent count, child count, and count of times they occur
        # together.
        rtc_new = rcl._ttlCnt
        atc_new = acl._ttlCnt
        ratc_new = 0
        raRootCnt = 0

        # (parent-of-parent cluster idx, arg cluster idx) -> number of pairs
        parArg_cnt = dict()

        # Relation-type counts: remaining in each cluster / new for the pair.
        rRelTypeIdx_newcnt = dict()
        aRelTypeIdx_newcnt = dict()
        raRelTypeIdx_newcnt = dict()

        # Per arg-cluster tables, each mapping arg cluster idx -> {key: count}
        # (or -> count for the *_partCnt / *_argCnt tables).
        rArgClustIdx_argNum_cnt = dict()
        aArgClustIdx_argNum_cnt = dict()
        rNewArgClustIdx_argNum_cnt = dict()
        aNewArgClustIdx_argNum_cnt = dict()

        rArgClustIdx_argTypeIdx_cnt = dict()
        aArgClustIdx_argTypeIdx_cnt = dict()
        rNewArgClustIdx_argTypeIdx_cnt = dict()
        aNewArgClustIdx_argTypeIdx_cnt = dict()

        rArgClustIdx_chdClustIdx_cnt = dict()
        aArgClustIdx_chdClustIdx_cnt = dict()
        rNewArgClustIdx_chdClustIdx_cnt = dict()
        aNewArgClustIdx_chdClustIdx_cnt = dict()

        rArgClustIdx_partCnt = dict()
        aArgClustIdx_partCnt = dict()
        rNewArgClustIdx_partCnt = dict()
        aNewArgClustIdx_partCnt = dict()

        rArgClustIdx_argCnt = dict()
        aArgClustIdx_argCnt = dict()
        rNewArgClustIdx_argCnt = dict()
        aNewArgClustIdx_argCnt = dict()

        # NOTE(review): dec_key / inc_key are defined elsewhere; the usage
        # below assumes dec_key seeds a missing key from ``base`` before
        # decrementing and inc_key seeds a missing key before incrementing,
        # both returning the dict - confirm against their definitions.

        # For each parent-child pair:
        for pcnid in parChdNids:
            pp, cp = Part.getPartByRootNodeId(
                pcnid[0]), Part.getPartByRootNodeId(pcnid[1])

            # One part leaves each original cluster, one joins the pair.
            rtc_new -= 1
            atc_new -= 1
            ratc_new += 1

            rrt = pp.getRelTypeIdx()
            art = cp.getRelTypeIdx()
            # Arg cluster of the parent through which the child is attached.
            raArgClustidx = pp.getArgClust(cp._parArgIdx)

            # Decrement individual relType counts and increment the combined
            # relType count for this pair
            rRelTypeIdx_newcnt = dec_key(rRelTypeIdx_newcnt,
                                         rrt,
                                         base=rcl._relTypeIdx_cnt[rrt])

            aRelTypeIdx_newcnt = dec_key(aRelTypeIdx_newcnt,
                                         art,
                                         base=acl._relTypeIdx_cnt[art])

            raRelTypeIdx_newcnt = inc_key(raRelTypeIdx_newcnt, (rrt, art))

            pp_par = pp.getParPart()

            # If the parent has a parent, increment the parArg count, otherwise
            # increment the root count.
            if pp_par is not None:
                ai = pp.getParArgIdx()
                ppi = pp_par.getClustIdx()
                aci = pp_par.getArgClust(ai)

                parArg_cnt = inc_key(parArg_cnt, (ppi, aci))
            else:
                raRootCnt += 1

            # For each argClust on the parent part, decrement the old parent
            # part count and argClust count, and increment the new ones. The
            # trick is don't copy/increment the counts for argument shared by
            # this pair.
            for arg_ci in pp._argClustIdx_argIdxs:
                an = len(pp._argClustIdx_argIdxs[arg_ci])
                ac = rcl._argClusts[arg_ci]

                rArgClustIdx_partCnt = dec_key(rArgClustIdx_partCnt,
                                               arg_ci,
                                               base=len(
                                                   ac._partRootTreeNodeIds))

                if arg_ci not in rArgClustIdx_argNum_cnt:
                    rArgClustIdx_argNum_cnt[arg_ci] = {}

                rArgClustIdx_argNum_cnt[arg_ci] = \
                    dec_key(rArgClustIdx_argNum_cnt[arg_ci],
                            an,
                            base=ac._argNum_cnt[an])

                newArgNum = an

                # The argument pointing at the child is absorbed by the
                # composition, so it does not count toward the new arg number.
                if arg_ci == raArgClustidx:
                    newArgNum -= 1

                # No arguments left in this arg cluster for the new part.
                if newArgNum == 0:
                    continue

                if arg_ci not in rNewArgClustIdx_argNum_cnt:
                    rNewArgClustIdx_argNum_cnt[arg_ci] = {}

                rNewArgClustIdx_argNum_cnt[arg_ci] = \
                    inc_key(rNewArgClustIdx_argNum_cnt[arg_ci], newArgNum)

                rNewArgClustIdx_partCnt = inc_key(rNewArgClustIdx_partCnt,
                                                  arg_ci)

            # Same as above, but for child part, and we don't skip anything.
            for arg_ci in cp._argClustIdx_argIdxs:
                an = len(cp._argClustIdx_argIdxs[arg_ci])
                ac = acl._argClusts[arg_ci]

                aArgClustIdx_partCnt = dec_key(aArgClustIdx_partCnt,
                                               arg_ci,
                                               base=len(
                                                   ac._partRootTreeNodeIds))

                if arg_ci not in aArgClustIdx_argNum_cnt:
                    aArgClustIdx_argNum_cnt[arg_ci] = {}

                aArgClustIdx_argNum_cnt[arg_ci] = \
                    dec_key(aArgClustIdx_argNum_cnt[arg_ci],
                            an,
                            base=ac._argNum_cnt[an])

                if arg_ci not in aNewArgClustIdx_argNum_cnt:
                    aNewArgClustIdx_argNum_cnt[arg_ci] = {}

                aNewArgClustIdx_argNum_cnt[arg_ci] = \
                    inc_key(aNewArgClustIdx_argNum_cnt[arg_ci], an)

                aNewArgClustIdx_partCnt = inc_key(aNewArgClustIdx_partCnt,
                                                  arg_ci)

            args = pp.getArguments()

            # For all the parent's arguments
            for ai, arg in args.items():
                arg_part = arg._argPart
                child_clust_id = arg_part._clustIdx
                aci = pp.getArgClust(ai)
                ac = rcl._argClusts[aci]
                ati = arg._path.getArgType()

                # Drop the old arguments

                rArgClustIdx_argCnt = dec_key(rArgClustIdx_argCnt,
                                              aci,
                                              base=ac._ttlArgCnt)

                if aci not in rArgClustIdx_argTypeIdx_cnt:
                    rArgClustIdx_argTypeIdx_cnt[aci] = {}

                rArgClustIdx_argTypeIdx_cnt[aci] = \
                    dec_key(rArgClustIdx_argTypeIdx_cnt[aci],
                            ati,
                            base=ac._argTypeIdx_cnt[ati])

                if aci not in rArgClustIdx_chdClustIdx_cnt:
                    rArgClustIdx_chdClustIdx_cnt[aci] = {}

                rArgClustIdx_chdClustIdx_cnt[aci] = \
                    dec_key(rArgClustIdx_chdClustIdx_cnt[aci],
                            child_clust_id,
                            base=ac._chdClustIdx_cnt[child_clust_id])

                # Add the new arguments, except for the child part we're possibly
                # absorbing

                if arg_part.getRelTreeRoot().getId() != cp.getRelTreeRoot(
                ).getId():
                    rNewArgClustIdx_argCnt = inc_key(rNewArgClustIdx_argCnt,
                                                     aci)

                    if aci not in rNewArgClustIdx_argTypeIdx_cnt:
                        rNewArgClustIdx_argTypeIdx_cnt[aci] = {}

                    rNewArgClustIdx_argTypeIdx_cnt[aci] = \
                        inc_key(rNewArgClustIdx_argTypeIdx_cnt[aci], ati)

                    if aci not in rNewArgClustIdx_chdClustIdx_cnt:
                        rNewArgClustIdx_chdClustIdx_cnt[aci] = {}

                    rNewArgClustIdx_chdClustIdx_cnt[aci] = \
                        inc_key(rNewArgClustIdx_chdClustIdx_cnt[aci], child_clust_id)

            args = cp.getArguments()

            # For all the child's arguments (none are skipped)
            for ai, arg in args.items():
                ap = arg._argPart
                cci = ap._clustIdx
                aci = cp.getArgClust(ai)
                ac = acl._argClusts[aci]
                ati = arg._path.getArgType()

                # Drop the old arguments

                aArgClustIdx_argCnt = dec_key(aArgClustIdx_argCnt,
                                              aci,
                                              base=ac._ttlArgCnt)

                if aci not in aArgClustIdx_argTypeIdx_cnt:
                    aArgClustIdx_argTypeIdx_cnt[aci] = {}

                aArgClustIdx_argTypeIdx_cnt[aci] = \
                    dec_key(aArgClustIdx_argTypeIdx_cnt[aci],
                            ati,
                            base=ac._argTypeIdx_cnt[ati])

                if aci not in aArgClustIdx_chdClustIdx_cnt:
                    aArgClustIdx_chdClustIdx_cnt[aci] = dict()

                aArgClustIdx_chdClustIdx_cnt[aci] = \
                    dec_key(aArgClustIdx_chdClustIdx_cnt[aci],
                            cci,
                            base=ac._chdClustIdx_cnt[cci])

                # Add the new arguments

                aNewArgClustIdx_argCnt = inc_key(aNewArgClustIdx_argCnt, aci)

                if aci not in aNewArgClustIdx_argTypeIdx_cnt:
                    aNewArgClustIdx_argTypeIdx_cnt[aci] = {}

                aNewArgClustIdx_argTypeIdx_cnt[aci] = \
                    inc_key(aNewArgClustIdx_argTypeIdx_cnt[aci], ati)

                if aci not in aNewArgClustIdx_chdClustIdx_cnt:
                    aNewArgClustIdx_chdClustIdx_cnt[aci] = {}

                aNewArgClustIdx_chdClustIdx_cnt[aci] = \
                    inc_key(aNewArgClustIdx_chdClustIdx_cnt[aci], cci)

        # Root-count term: only contributes when some, but not all, of the
        # parent cluster's root occurrences belong to the composed pairs.
        if raRootCnt > 0:
            origRootCnt = Clust.clustIdx_rootCnt[rcidx]

            if origRootCnt > raRootCnt:
                score +=  xlogx(raRootCnt) \
                        + xlogx(origRootCnt - raRootCnt) \
                        - xlogx(origRootCnt)
                score -= ParseParams.priorNumParam

        # Relation-type terms for the parent cluster: old total (denomor)
        # versus total after removing the paired parts (denomnr).
        denomor = xlogx(rcl._ttlCnt)
        denomnr = xlogx(rtc_new)

        score = update_score_from_dict(score, rRelTypeIdx_newcnt,
                                       rcl._relTypeIdx_cnt)

        score += denomor
        score -= denomnr

        # Same for the child cluster (denomoa = old, denomna = new).
        denomoa = xlogx(acl._ttlCnt)
        denomna = xlogx(atc_new)

        score = update_score_from_dict(score, aRelTypeIdx_newcnt,
                                       acl._relTypeIdx_cnt)

        score += denomoa
        score -= denomna

        # Relation-type terms for the combined pairs.
        for cnt in raRelTypeIdx_newcnt.values():
            score -= ParseParams.priorNumParam
            score += xlogx(cnt)

        denomra = xlogx(ratc_new)
        score -= denomra

        # Terms for the arg clusters of the parents' own parents, keyed by
        # (parent-of-parent cluster idx, arg cluster idx).
        for pi, cnt in parArg_cnt.items():
            pc = Clust.getClust(pi[0])
            ac = pc._argClusts[pi[1]]
            origcnt = ac._chdClustIdx_cnt[rcidx]

            # Every occurrence moves, so nothing is split: no contribution.
            if cnt == origcnt:
                continue

            score -= ParseParams.priorNumParam
            score += xlogx(cnt) + xlogx(origcnt - cnt) - xlogx(origcnt)

        # Old-vs-remaining terms for each arg cluster of the parent cluster.
        for aci, ac in rcl._argClusts.items():
            origPartCnt = len(ac._partRootTreeNodeIds)
            score -= (xlogx(rcl._ttlCnt - origPartCnt) - denomor)

            # Arg cluster untouched by any paired part: only the cluster
            # total changed.
            if aci not in rArgClustIdx_partCnt:
                score += (xlogx(rtc_new - origPartCnt) - denomnr)
                continue

            if rArgClustIdx_partCnt[aci] > 0:
                score += (xlogx(rtc_new - rArgClustIdx_partCnt[aci]) - denomnr)

            score = update_score_from_dict(score, rArgClustIdx_argNum_cnt[aci],
                                           ac._argNum_cnt)

            score -= 2 * (xlogx(rArgClustIdx_argCnt[aci]) -
                          xlogx(ac._ttlArgCnt))

            score = update_score_from_dict(score,
                                           rArgClustIdx_argTypeIdx_cnt[aci],
                                           ac._argTypeIdx_cnt)

            score = update_score_from_dict(score,
                                           rArgClustIdx_chdClustIdx_cnt[aci],
                                           ac._chdClustIdx_cnt)

        # line 570 in Scorer.java

        # Old-vs-remaining terms for each arg cluster of the child cluster.
        for aci, ac in acl._argClusts.items():
            origPartCnt = len(ac._partRootTreeNodeIds)
            score -= (xlogx(acl._ttlCnt - origPartCnt) - denomoa)

            if aci not in aArgClustIdx_partCnt:
                score += (xlogx(atc_new - origPartCnt) - denomna)
                continue

            if aArgClustIdx_partCnt[aci] > 0:
                score += (xlogx(atc_new - aArgClustIdx_partCnt[aci]) - denomna)

            score = update_score_from_dict(score, aArgClustIdx_argNum_cnt[aci],
                                           ac._argNum_cnt)

            score -= 2 * (xlogx(aArgClustIdx_argCnt[aci]) -
                          xlogx(ac._ttlArgCnt))

            score = update_score_from_dict(score,
                                           aArgClustIdx_argTypeIdx_cnt[aci],
                                           ac._argTypeIdx_cnt)
            score = update_score_from_dict(score,
                                           aArgClustIdx_chdClustIdx_cnt[aci],
                                           ac._chdClustIdx_cnt)

        # New part-count and arg-number terms, first for the tables built
        # from the parent parts, then for those built from the child parts.
        for ds in [(rNewArgClustIdx_partCnt, rNewArgClustIdx_argNum_cnt),
                   (aNewArgClustIdx_partCnt, aNewArgClustIdx_argNum_cnt)]:
            for aci, partCnt in ds[0].items():
                score += xlogx(ratc_new - partCnt) - denomra

                for idx, cnt in ds[1][aci].items():
                    score += xlogx(cnt)
                    score -= ParseParams.priorNumParam

        # New argument-count, arg-type and child-cluster terms, again for the
        # parent-derived tables and then the child-derived tables.
        for ds in [(rNewArgClustIdx_argCnt, rNewArgClustIdx_argTypeIdx_cnt,
                    rNewArgClustIdx_chdClustIdx_cnt),
                   (aNewArgClustIdx_argCnt, aNewArgClustIdx_argTypeIdx_cnt,
                    aNewArgClustIdx_chdClustIdx_cnt)]:
            for aci, argCnt in ds[0].items():
                score -= 2 * xlogx(argCnt)

                for idx, cnt in ds[1][aci].items():
                    score += xlogx(cnt)
                    score -= ParseParams.priorNumParam

                for idx, cnt in ds[2][aci].items():
                    score += xlogx(cnt)
                    score -= ParseParams.priorNumParam

        return score