Example #1
    # EM is a method of the IBM Model 1 base aligner; Task and the
    # tProbability/_updateCount hooks come from the surrounding package.
    def EM(self, dataset, iterations, modelName="IBM1Base", index=0):
        task = Task("Aligner", modelName + str(iterations))
        self.logger.info("Starting Training Process")
        self.logger.info("Training size: " + str(len(dataset)))
        start_time = time.time()

        for iteration in range(iterations):
            self._beginningOfIteration()
            self.logger.info("Starting Iteration " + str(iteration))
            counter = 0

            for item in dataset:
                f, e = item[0:2]
                counter += 1
                task.progress(modelName + " iter %d, %d of %d" % (
                    iteration,
                    counter,
                    len(dataset),
                ))
                # E-step: normaliser z over the target words, then
                # fractional counts via _updateCount
                for fWord in f:
                    z = 0
                    for eWord in e:
                        z += self.tProbability(fWord, eWord)
                    for eWord in e:
                        self._updateCount(fWord, eWord, z, index)

            self._updateEndOfIteration()

        end_time = time.time()
        self.logger.info("Training Complete, total time(seconds): %f" %
                         (end_time - start_time, ))
        self.endOfEM()
        return
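
The inner loop of EM above is the IBM Model 1 E-step: for each source word fWord it first computes the normaliser z as the sum of t(f, e) over the target sentence, then lets _updateCount distribute the fractional count t(f, e) / z across the target words. A minimal self-contained sketch of that arithmetic, assuming a toy translation table (all names and numbers below are illustrative, not part of the class above):

from collections import defaultdict

# Toy translation table t[(f, e)]; unseen pairs get a small floor value.
t = {("la", "the"): 0.7, ("la", "house"): 0.05,
     ("maison", "the"): 0.1, ("maison", "house"): 0.6}
count = defaultdict(float)   # fractional counts c(f, e)
total = defaultdict(float)   # per-target-word totals

fSent, eSent = ["la", "maison"], ["the", "house"]
for fWord in fSent:
    # Normaliser z = sum over the target words, as in the EM loop above
    z = sum(t.get((fWord, eWord), 1e-12) for eWord in eSent)
    for eWord in eSent:
        c = t.get((fWord, eWord), 1e-12) / z
        count[(fWord, eWord)] += c
        total[eWord] += c

# M-step: renormalise the counts into a new translation table
newT = {fe: count[fe] / total[fe[1]] for fe in count}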
Example #2
# Standard-library and numpy imports used below; Base, Task, AlignerIBM1
# and evaluate come from the surrounding aligner package.
import logging
import sys
from collections import defaultdict
from copy import deepcopy
from math import log

import numpy as np


class AlignmentModel(Base):
    def __init__(self):
        self.modelName = "HMMWithAlignmentType"
        self.version = "0.1b"
        self.logger = logging.getLogger('HMM')
        self.p0H = 0.3
        self.nullEmissionProb = 0.000005
        self.smoothFactor = 0.1
        self.task = None
        self.evaluate = evaluate
        self.fe = ()

        self.s = defaultdict(list)
        self.sTag = defaultdict(list)
        self.index = 0
        self.typeList = []
        self.typeIndex = {}
        self.typeDist = []
        self.lambd = 1 - 1e-20
        self.lambda1 = 0.9999999999
        self.lambda2 = 9.999900827395436E-11
        self.lambda3 = 1.000000082740371E-15

        self.loadTypeDist = {
            "SEM": .401,
            "FUN": .264,
            "PDE": .004,
            "CDE": .004,
            "MDE": .012,
            "GIS": .205,
            "GIF": .031,
            "COI": .008,
            "TIN": .003,
            "NTR": .086,
            "MTA": .002
        }

        self.modelComponents = [
            "t", "pi", "a", "eLengthSet", "s", "sTag", "typeList", "typeIndex",
            "typeDist", "lambd", "lambda1", "lambda2", "lambda3"
        ]
        Base.__init__(self)
        return

    def _beginningOfIteration(self, dataset):
        self.lenDataset = len(dataset)
        self.c_feh = defaultdict(
            lambda: [0.0 for h in range(len(self.typeList))])
        return

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        for i in range(len(f)):
            for j in range(len(e)):
                tmpGamma = alpha[i][j] * beta[i][j] / alphaScale[i]
                gamma[i][j] = tmpGamma
                c_feh = self.c_feh[(f[i][self.index], e[j][self.index])]
                for h in range(len(self.typeList)):
                    c_feh[h] += tmpGamma * self.sProbability(f[i], e[j], h)

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # Update a
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(len(a)):
                for j in range(len(a[prev_j])):
                    a[prev_j][j] = 0.0
        for Len in self.eLengthSet:
            for prev_j in range(Len):
                deltaSum = 0.0
                for j in range(Len):
                    deltaSum += delta[Len][prev_j][j]
                for j in range(Len):
                    self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
                        (deltaSum + 1e-37)

        # Update pi
        for i in range(maxE):
            self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)

        # Update t
        gammaEWord = defaultdict(float)
        for f, e in gammaBiword:
            gammaEWord[e] += gammaBiword[(f, e)]
        self.t.clear()
        for f, e in gammaBiword:
            self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)

        s = self.s if self.index == 0 else self.sTag
        for (f, e) in self.c_feh:
            c_feh = self.c_feh[(f, e)]
            sTmp = s[(f, e)]
            gammaTmp = gammaBiword[(f, e)]
            for h in range(len(self.typeList)):
                sTmp[h] = c_feh[h] / gammaTmp
        self.fe = ()
        return

    def endOfBaumWelch(self):
        # Smoothing for target sentences of unencountered length
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][j] *= 1 - self.p0H
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][j] = a[prev_j][j]
        return

    def sProbability(self, f, e, h):
        fWord, fTag = f
        eWord, eTag = e
        # Cache the s/sTag table lookups for the current word pair
        if self.fe != (f, e):
            self.fe, sKey, sTagKey = (f, e), (fWord, eWord), (fTag, eTag)
            self.sTmp = self.s[sKey] if sKey in self.s else None
            self.sTagTmp = self.sTag[sTagKey] if sTagKey in self.sTag else None
        sTmp = self.sTmp[h] if self.sTmp else 0
        sTagTmp = self.sTagTmp[h] if self.sTagTmp else 0
        if self.index == 0:
            p1 = (1 - self.lambd) * self.typeDist[h] + self.lambd * sTmp
            p2 = (1 - self.lambd) * self.typeDist[h] + self.lambd * sTagTmp
            p3 = self.typeDist[h]
            return self.lambda1 * p1 + self.lambda2 * p2 + self.lambda3 * p3
        else:
            return (1 - self.lambd) * self.typeDist[h] + self.lambd * sTagTmp
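
    # Note on the constants above: in IEEE double precision 1 - 1e-20
    # rounds to exactly 1.0, so (1 - self.lambd) is 0 and p1, p2 reduce
    # to sTmp and sTagTmp; the typeDist prior contributes only through
    # the tiny lambda2 and lambda3 weights.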

    def trainWithIndex(self, dataset, iterations, index):
        self.index = index
        alignerIBM1 = AlignerIBM1()
        alignerIBM1.initialiseBiwordCount(dataset, index)
        alignerIBM1.EM(dataset, iterations, 'IBM1', index)
        self.task.progress("IBM model Trained")
        self.logger.info("IBM model Trained")

        self.logger.info("Initialising HMM")
        self.initialiseBiwordCount(dataset, index)
        if self.index == 1:
            self.sTag = self.calculateS(dataset, self.fe_count, index)
        else:
            self.s = self.calculateS(dataset, self.fe_count, index)
        self.t = alignerIBM1.t
        self.logger.info("HMM Initialised, start training")
        self.baumWelch(dataset, iterations=iterations, index=index)
        self.task.progress("HMM finalising")
        return

    def train(self, dataset, iterations=5):
        self.task = Task("Aligner", "HMMOI" + str(iterations))
        self.logger.info("Loading alignment type distribution")
        self.initialiseAlignTypeDist(dataset, self.loadTypeDist)
        self.logger.info("Alignment type distribution loaded")

        self.task.progress("Stage 1 Training With POS Tags")
        self.logger.info("Stage 1 Training With POS Tags")
        self.trainWithIndex(dataset, iterations, 1)

        self.task.progress("Stage 1 Training With FORM")
        self.logger.info("Stage 1 Training With FORM")
        self.trainWithIndex(dataset, iterations, 0)

        self.logger.info("Training Complete")
        self.task = None
        return

    def logViterbi(self, f, e):
        fLen, eLen = len(f), len(e)
        e = deepcopy(e)
        for i in range(eLen):
            e.append(("null", "null"))

        score = np.zeros((fLen, eLen * 2, len(self.typeList)))
        prev_j = np.zeros((fLen, eLen * 2, len(self.typeList)))
        prev_h = np.zeros((fLen, eLen * 2, len(self.typeList)))

        for j in range(len(e)):
            tPr = log(self.tProbability(f[0], e[j]))
            for h in range(len(self.typeList)):
                score[0][j][h] = log(self.sProbability(f[0], e[j], h)) + tPr
                if j < len(self.pi) and self.pi[j] != 0:
                    score[0][j][h] += log(self.pi[j])
                else:
                    score[0][j][h] = -sys.maxsize - 1

        for i in range(1, fLen):
            for j in range(len(e)):
                maxScore = -sys.maxsize - 1
                jPrevBest = -sys.maxsize - 1
                hPrevBest = 0
                tPr = log(self.tProbability(f[i], e[j]))
                for jPrev in range(len(e)):
                    aPrPreLog = self.aProbability(jPrev, j, eLen)
                    if aPrPreLog == 0:
                        continue
                    aPr = log(aPrPreLog)
                    for h in range(len(self.typeList)):
                        temp = score[i - 1][jPrev][h] + aPr + tPr
                        if temp > maxScore:
                            maxScore = temp
                            jPrevBest = jPrev
                            hPrevBest = h

                for h in range(len(self.typeList)):
                    s = self.sProbability(f[i], e[j], h)
                    if s != 0:
                        temp_s = log(s)
                        score[i][j][h] = maxScore + temp_s
                        prev_j[i][j][h] = jPrevBest
                        prev_h[i][j][h] = hPrevBest

        maxScore = -sys.maxsize - 1
        best_j = best_h = 0
        for j in range(len(e)):
            for h in range(len(self.typeList)):
                if score[fLen - 1][j][h] > maxScore:
                    maxScore = score[fLen - 1][j][h]
                    best_j, best_h = j, h

        trace = [
            (best_j + 1, best_h),
        ]

        j, h = best_j, best_h
        i = fLen - 1

        while i > 0:
            j, h = int(prev_j[i][j][h]), int(prev_h[i][j][h])
            trace = [(j + 1, h)] + trace
            i = i - 1
        return trace
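
logViterbi above runs a log-space Viterbi search with two backpointer tables, one for the previous target position and one for the previous alignment type. The same recurrence on a plain toy HMM, stripped of the alignment types (all probabilities below are illustrative):

from math import log

pi = [0.6, 0.4]                      # initial state distribution
a = [[0.7, 0.3], [0.4, 0.6]]         # transition probabilities
emit = [[0.9, 0.2], [0.1, 0.8],      # emit[i][j] = P(obs i | state j)
        [0.5, 0.5]]

T, N = len(emit), len(pi)
score = [[0.0] * N for _ in range(T)]
back = [[0] * N for _ in range(T)]

for j in range(N):
    score[0][j] = log(pi[j]) + log(emit[0][j])
for i in range(1, T):
    for j in range(N):
        best = max(range(N), key=lambda k: score[i - 1][k] + log(a[k][j]))
        back[i][j] = best
        score[i][j] = score[i - 1][best] + log(a[best][j]) + log(emit[i][j])

# Backtrace from the best final state, as in the while-loop above
j = max(range(N), key=lambda k: score[T - 1][k])
path = [j]
for i in range(T - 1, 0, -1):
    j = back[i][j]
    path.insert(0, j)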
Example #3
# Standard-library and numpy imports used below; Base and Task come
# from the surrounding aligner package.
import logging
import sys
import time
from collections import defaultdict
from copy import deepcopy
from math import log

import numpy as np


class AlignmentModelBase(Base):
    def __init__(self):
        if "nullEmissionProb" not in vars(self):
            self.nullEmissionProb = 0.000005
        if "task" not in vars(self):
            self.task = None

        if "t" not in vars(self):
            self.t = defaultdict(float)
        if "eLengthSet" not in vars(self):
            self.eLengthSet = defaultdict(int)
        if "a" not in vars(self):
            self.a = [[[]]]
        if "pi" not in vars(self):
            self.pi = []

        if "logger" not in vars(self):
            self.logger = logging.getLogger('HMMBASE')
        if "modelComponents" not in vars(self):
            self.modelComponents = ["t", "pi", "a", "eLengthSet"]
        Base.__init__(self)
        return

    def initialiseParameter(self, Len):
        doubleLen = 2 * Len
        tmp = 1.0 / Len
        for z in range(Len):
            for y in range(Len):
                for x in range(Len + 1):
                    self.a[x][z][y] = tmp
        tmp = 1.0 / doubleLen
        for x in range(Len):
            self.pi[x] = tmp
        return

    def forwardBackward(self, f, e, tSmall, a):
        alpha = [[0.0 for x in range(len(e))] for y in range(len(f))]
        alphaScale = [0.0 for x in range(len(f))]
        alphaSum = 0

        for j in range(len(e)):
            alpha[0][j] = self.pi[j] * tSmall[0][j]
            alphaSum += alpha[0][j]

        alphaScale[0] = 1 / alphaSum
        for j in range(len(e)):
            alpha[0][j] *= alphaScale[0]

        for i in range(1, len(f)):
            alphaSum = 0
            for j in range(len(e)):
                total = 0
                for prev_j in range(len(e)):
                    total += alpha[i - 1][prev_j] * a[prev_j][j]
                alpha[i][j] = tSmall[i][j] * total
                alphaSum += alpha[i][j]

            alphaScale[i] = 1.0 / alphaSum
            for j in range(len(e)):
                alpha[i][j] = alphaScale[i] * alpha[i][j]

        beta = [[0.0 for x in range(len(e))] for y in range(len(f))]
        for j in range(len(e)):
            beta[len(f) - 1][j] = alphaScale[len(f) - 1]

        for i in range(len(f) - 2, -1, -1):
            for j in range(len(e)):
                total = 0
                for next_j in range(len(e)):
                    total += (beta[i + 1][next_j] * a[j][next_j] *
                              tSmall[i + 1][next_j])
                beta[i][j] = alphaScale[i] * total
        return alpha, alphaScale, beta
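
    # Note: alpha is rescaled at every source position, so each row of
    # alpha sums to 1 and the sentence log-likelihood can be recovered
    # as -sum(log(alphaScale[i])), which is what baumWelch does below.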

    def maxTargetSentenceLength(self, dataset):
        maxLength = 0
        eLengthSet = defaultdict(int)
        for (f, e, alignment) in dataset:
            tempLength = len(e)
            if tempLength > maxLength:
                maxLength = tempLength
            eLengthSet[tempLength] += 1
        return (maxLength, eLengthSet)

    def baumWelch(self, dataset, iterations=5, index=0):
        if not self.task:
            self.task = Task("Aligner", "HMMBaumWelchOI" + str(iterations))
        self.logger.info("Starting Training Process")
        self.logger.info("Training size: " + str(len(dataset)))
        startTime = time.time()

        maxE, self.eLengthSet = self.maxTargetSentenceLength(dataset)
        self.logger.info("Maximum Target sentence length: " + str(maxE))

        self.a = [[[0.0 for x in range(maxE * 2)] for y in range(maxE * 2)]
                  for z in range(maxE + 1)]
        self.pi = [0.0 for x in range(maxE * 2)]

        for iteration in range(iterations):
            self.logger.info("BaumWelch Iteration " + str(iteration))

            logLikelihood = 0

            gamma = [[0.0 for x in range(maxE)] for y in range(maxE * 2)]
            gammaBiword = defaultdict(float)
            gammaSum_0 = [0.0 for x in range(maxE)]
            delta = [[[0.0 for x in range(maxE)] for y in range(maxE)]
                     for z in range(maxE + 1)]

            self._beginningOfIteration(dataset)

            counter = 0
            for (f, e, alignment) in dataset:
                self.task.progress("BaumWelch iter %d, %d of %d" %
                                   (iteration, counter, len(dataset),))
                counter += 1
                if iteration == 0:
                    self.initialiseParameter(len(e))

                fLen, eLen = len(f), len(e)
                a = self.a[eLen]
                tSmall = [[self.t[(f[i][index], e[j][index])]
                           for j in range(eLen)]
                          for i in range(fLen)]

                alpha, alphaScale, beta = self.forwardBackward(f, e, tSmall, a)

                # Update logLikelihood
                for i in range(fLen):
                    logLikelihood -= log(alphaScale[i])

                # Setting gamma
                self._updateGamma(f, e, gamma, alpha, beta, alphaScale)

                for i in range(fLen):
                    for j in range(eLen):
                        gammaBiword[(f[i][index], e[j][index])] += gamma[i][j]
                for j in range(eLen):
                    gammaSum_0[j] += gamma[0][j]

                # Update delta
                c = [0.0 for i in range(eLen * 2)]
                for i in range(1, fLen):
                    for prev_j in range(eLen):
                        for j in range(eLen):
                            c[eLen - 1 + j - prev_j] += (alpha[i - 1][prev_j] *
                                                         beta[i][j] *
                                                         a[prev_j][j] *
                                                         tSmall[i][j])

                for prev_j in range(eLen):
                    for j in range(eLen):
                        delta[eLen][prev_j][j] += c[eLen - 1 + j - prev_j]
            # end of loop over dataset

            self.logger.info("likelihood " + str(logLikelihood))
            # M-Step
            self._updateEndOfIteration(maxE, delta, gammaSum_0, gammaBiword)

        self.endOfBaumWelch()
        endTime = time.time()
        self.logger.info("Training Complete, total time(seconds): %f" %
                         (endTime - startTime,))
        return

    def _beginningOfIteration(self, dataset):
        # self.lenDataset = len(dataset)
        # return
        raise NotImplementedError

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        # for i in range(len(f)):
        #     for j in range(len(e)):
        #         gamma[i][j] = alpha[i][j] * beta[i][j] / alphaScale[i]
        raise NotImplementedError

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # self.t.clear()
        # for Len in self.eLengthSet:
        #     for prev_j in range(Len):
        #         deltaSum = 0.0
        #         for j in range(Len):
        #             deltaSum += delta[Len][prev_j][j]
        #         for j in range(Len):
        #             self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
        #                 (deltaSum + 1e-37)

        # for i in range(maxE):
        #     self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)

        # gammaEWord = defaultdict(float)
        # for f, e in gammaBiword:
        #     gammaEWord[e] += gammaBiword[(f, e)]
        # for f, e in gammaBiword:
        #     self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)
        # return
        raise NotImplementedError

    def endOfBaumWelch(self):
        # Apply final smoothing here
        raise NotImplementedError

    def tProbability(self, f, e, index=0):
        # Fall back to a uniform distribution over a fixed vocabulary
        # of v word types when the pair is unseen
        v = 163303
        if (f[index], e[index]) in self.t:
            return self.t[(f[index], e[index])]
        if e[index] == "null":
            return self.nullEmissionProb
        return 1.0 / v

    def aProbability(self, prev_j, j, targetLength):
        if targetLength in self.eLengthSet:
            return self.a[targetLength][prev_j][j]
        return 1.0 / targetLength

    def logViterbi(self, f, e):
        e = deepcopy(e)
        fLen, eLen = len(f), len(e)
        for i in range(eLen):
            e.append(("null", "null"))
        score = np.zeros((fLen, eLen * 2))
        prev_j = np.zeros((fLen, eLen * 2))

        for i in range(fLen):
            for j in range(eLen * 2):
                score[i][j] = log(self.tProbability(f[i], e[j]))
                if i == 0:
                    if j < len(self.pi) and self.pi[j] != 0:
                        score[i][j] += log(self.pi[j])
                    else:
                        score[i][j] = -sys.maxsize - 1
                else:
                    # Find the best alignment for f[i-1]
                    maxScore = -sys.maxsize - 1
                    bestPrev_j = -sys.maxsize - 1
                    for jPrev in range(eLen * 2):
                        aPr = self.aProbability(jPrev, j, eLen)
                        if aPr == 0:
                            continue
                        temp = score[i - 1][jPrev] + log(aPr)
                        if temp > maxScore:
                            maxScore = temp
                            bestPrev_j = jPrev

                    score[i][j] += maxScore
                    prev_j[i][j] = bestPrev_j

        maxScore = -sys.maxsize - 1
        best_j = 0
        for j in range(eLen * 2):
            if score[fLen - 1][j] > maxScore:
                maxScore = score[fLen - 1][j]
                best_j = j

        trace = [(best_j + 1, )]
        i = fLen - 1
        j = best_j

        while i > 0:
            j = int(prev_j[i][j])
            trace = [(j + 1, )] + trace
            i = i - 1
        return trace

    def decodeSentence(self, sentence):
        f, e, alignment = sentence
        sentenceAlignment = []
        bestAlign = self.logViterbi(f, e)

        for i in range(len(bestAlign)):
            if bestAlign[i][0] <= len(e):
                if len(bestAlign[i]) > 1 and "typeList" in vars(self):
                    sentenceAlignment.append(
                        (i + 1, bestAlign[i][0],
                         self.typeList[bestAlign[i][1]]))
                else:
                    sentenceAlignment.append((i + 1, bestAlign[i][0]))
        return sentenceAlignment
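
forwardBackward in the base class rescales alpha at every source position to avoid numeric underflow. A self-contained check of that scheme on a toy 2-state HMM, verifying that each scaled row of alpha sums to 1 and that the sentence log-likelihood equals -sum(log(alphaScale)), exactly as baumWelch accumulates it (all numbers illustrative):

from math import log

pi = [0.6, 0.4]
a = [[0.7, 0.3], [0.4, 0.6]]
tSmall = [[0.9, 0.2], [0.1, 0.8], [0.5, 0.5]]  # emission terms per position

alpha = [pi[j] * tSmall[0][j] for j in range(2)]
scale = [1.0 / sum(alpha)]
alpha = [x * scale[0] for x in alpha]

for i in range(1, len(tSmall)):
    alpha = [tSmall[i][j] * sum(alpha[k] * a[k][j] for k in range(2))
             for j in range(2)]
    scale.append(1.0 / sum(alpha))
    alpha = [x * scale[-1] for x in alpha]
    assert abs(sum(alpha) - 1.0) < 1e-12   # each scaled row is normalised

logLikelihood = -sum(log(s) for s in scale)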
Example #4
# Imports used below; Base, Task, AlignerIBM1 and evaluate come from
# the surrounding aligner package.
import logging
from collections import defaultdict


class AlignmentModel(Base):
    def __init__(self):
        self.modelName = "HMM"
        self.version = "0.1b"
        self.logger = logging.getLogger('HMM')
        self.p0H = 0.3
        self.nullEmissionProb = 0.000005
        self.smoothFactor = 0.1
        self.task = None
        self.evaluate = evaluate

        self.modelComponents = ["t", "pi", "a", "eLengthSet"]
        Base.__init__(self)
        return

    def _beginningOfIteration(self, dataset):
        self.lenDataset = len(dataset)
        return

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        for i in range(len(f)):
            for j in range(len(e)):
                gamma[i][j] = alpha[i][j] * beta[i][j] / alphaScale[i]

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # Update a
        for Len in self.eLengthSet:
            for prev_j in range(Len):
                deltaSum = 0.0
                for j in range(Len):
                    deltaSum += delta[Len][prev_j][j]
                for j in range(Len):
                    self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
                        (deltaSum + 1e-37)

        # Update pi
        for i in range(maxE):
            self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)

        # Update t
        gammaEWord = defaultdict(float)
        for f, e in gammaBiword:
            gammaEWord[e] += gammaBiword[(f, e)]
        self.t.clear()
        for f, e in gammaBiword:
            self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)
        return

    def endOfBaumWelch(self):
        # Smoothing for target sentences of unencountered length
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][j] *= 1 - self.p0H
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][j] = a[prev_j][j]
        return

    def train(self, dataset, iterations):
        self.task = Task("Aligner", "HMMOI" + str(iterations))
        self.task.progress("Training IBM model 1")
        self.logger.info("Training IBM model 1")
        alignerIBM1 = AlignerIBM1()
        alignerIBM1.train(dataset, iterations)
        self.t = alignerIBM1.t
        self.task.progress("IBM model Trained")
        self.logger.info("IBM model Trained")
        self.baumWelch(dataset, iterations=iterations)
        self.task.progress("finalising")
        self.task = None
        return
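
endOfBaumWelch in Examples 2 and 4 extends each length-Len transition table to 2 * Len states, reserving probability p0H for the jump into, and for staying in, the corresponding null state. A toy check that every extended row still sums to 1 after the two smoothing passes (values illustrative):

p0H, Len = 0.3, 2
a = [[0.0] * (2 * Len) for _ in range(2 * Len)]
a[0][:Len], a[1][:Len] = [0.6, 0.4], [0.5, 0.5]

# Same two passes as endOfBaumWelch above
for prev_j in range(Len):
    for j in range(Len):
        a[prev_j][j] *= 1 - p0H
for prev_j in range(Len):
    for j in range(Len):
        a[prev_j][prev_j + Len] = p0H
        a[prev_j + Len][prev_j + Len] = p0H
        a[prev_j + Len][j] = a[prev_j][j]

for row in a:
    assert abs(sum(row) - 1.0) < 1e-12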