def main(self, conceptMap, symbolMap, symbolDt, dcpts, dpmfs, symbolName, outDir):
    """Derive the lower-order symbol models from the 5-gram model and save them.

    The joint table over (C1, C2, C3, C4) and the conditional table
    ``<symbolName>GivenC1C2C3C4`` are loaded through ``gmtk``, multiplied
    into a joint table over (symbol, C1..C4), and then repeatedly
    marginalized and re-conditionalized to obtain the models given
    C1C2C3, C1C2 and C1, plus a unigram and a "zerogram" model.  Every
    model is written through the matching ``gmtk.save*`` function.

    Args:
        conceptMap: argument for ``LexMap().read``; the resulting map must
            contain the key ``"_SINK_"``.
        symbolMap: argument for ``LexMap().read``; the resulting map must
            contain the keys ``"_sink_"`` and ``"_empty_"``.
        symbolDt: argument for ``gmtk.readDt``.
        dcpts: argument for ``gmtk.readDcpt`` (table ``jointProbC1C2C3C4``).
        dpmfs: argument for ``gmtk.readDpmf``
            (table ``<symbolName>GivenC1C2C3C4``).
        symbolName: prefix used in the names of all loaded and saved models.
        outDir: first argument of every ``gmtk.save*`` call
            (presumably the output directory -- confirm against gmtk).
    """
    conceptMap = LexMap().read(conceptMap)
    symbolMap = LexMap().read(symbolMap)

    sinkSymbol = int(symbolMap["_sink_"])
    sinkConcept = int(conceptMap["_SINK_"])

    nConcepts = len(conceptMap)
    nSymbols = len(symbolMap)

    # Load the trained tables.
    stackDt = gmtk.readDt(symbolDt)
    stackJoint = gmtk.combineDtDcpt1(
        stackDt, gmtk.readDcpt(dcpts, "jointProbC1C2C3C4"))

    fullName = "%sGivenC1C2C3C4" % symbolName
    symbolGivenStack = gmtk.combineDtDcpt2(
        stackDt, gmtk.readDpmf(dpmfs, fullName))

    # Joint table over (symbol, C1, C2, C3, C4).
    joint = symbolGivenStack.multiple([1, 2, 3, 4], stackJoint)

    gmtk.saveDpmfsProbs(
        outDir,
        fullName,
        len(symbolGivenStack.vectSubList([1, 2, 3, 4])),
        nSymbols,
        symbolGivenStack,
    )

    # Models conditioned on C1C2C3 and then on C1C2: keep the symbol
    # (index 0) plus the first `depth` concepts, then conditionalize on
    # those concepts.  Fresh lists are passed to every call, exactly as the
    # literal lists were in the unrolled form.
    for depth in (3, 2):
        parents = tuple(range(1, depth + 1))
        joint = joint.marginalize([0] + list(parents))
        conditional = joint.conditionalize(list(parents))
        modelName = "%sGiven%s" % (
            symbolName, "".join("C%d" % i for i in parents))
        gmtk.saveDpmfsProbs(
            outDir,
            modelName,
            len(conditional.vectSubList(list(parents))),
            nSymbols,
            conditional,
        )

    # Bigram: symbol given C1.
    bigramJoint = joint.marginalize([0, 1])
    symbolGivenC1 = bigramJoint.conditionalize([1])

    # When conditioning on _SINK_, only the _sink_ word may be generated;
    # otherwise the _SINK_ concept would show up in the stack.
    # TODO: Turn it into validator
    symbolGivenC1.setValue([sinkSymbol, sinkConcept], 1)

    gmtk.saveDcptBigram(
        outDir, "%sGivenC1" % symbolName, nSymbols, nConcepts, symbolGivenC1)

    # Unigram: only the _unseen_ word should be decodable here, so the
    # probability of generating _empty_ is forced to zero ...
    unigram = bigramJoint.marginalize([0])
    unigram.setValue([int(symbolMap["_empty_"])], 0)
    # ... and the table is renormalized to sum to one.
    unigram = unigram.normJoint()

    gmtk.saveDcptUnigram(outDir, "%sUnigram" % symbolName, nSymbols, unigram)
    gmtk.saveDcptUnseen(outDir, "%sZerogram" % symbolName, nSymbols, symbolMap)
# int(conceptMap["_EMPTY_"]), # int(conceptMap["_EMPTY_"]), # int(conceptMap["_EMPTY_"])], 0) ##concept1DtC2C3C4 = gmtk.readDt(concept1DtFileName) ##concept1DpmfC2C3C4 = gmtk.readDpmf(dpmfsFileName, "concept1GivenC2C3C4") ##assert toolkit.testProb2(concept1DpmfC2C3C4), "Sum of probabilities should be always 1." ## ##concept1GivenC2C3C4 = gmtk.combineDtDcpt2(concept1DtC2C3C4, concept1DpmfC2C3C4) concept1GivenC2C3C4 = jointProbC1C2C3C4.conditionalize([1, 2, 3]) ##if verbose: ## print "Orig : Probs %4d : Parents %4d" % (len(concept1GivenC2C3C4.vectList()), len(concept1GivenC2C3C4.vectSubList([1, 2, 3]))) ## print "Comp : Probs %4d : Parents %4d" % (len(concept1GivenC2C3C4X.vectList()), len(concept1GivenC2C3C4X.vectSubList([1, 2, 3]))) gmtk.saveDpmfsProbs(dirOut, "concept1GivenC2C3C4", len(concept1GivenC2C3C4.vectSubList([1, 2, 3])), conceptCard, concept1GivenC2C3C4) ##gmtk.saveDpmfsProbs(dirOut, "concept1GivenC2C3C4X", len(concept1GivenC2C3C4X.vectSubList([1, 2, 3])), conceptCard, concept1GivenC2C3C4X) ############################################################################## # save trigrams jointProbC1C2C3 = jointProbC1C2C3C4.marginalize([0, 1, 2]) concept1GivenC2C3 = jointProbC1C2C3.conditionalize([1, 2]) gmtk.saveDpmfsProbs(dirOut, "concept1GivenC2C3", len(concept1GivenC2C3.vectSubList([1, 2])), conceptCard, concept1GivenC2C3) ############################################################################## # save bigrams as CPT jointProbC1C2 = jointProbC1C2C3.marginalize([0, 1]) concept1GivenC2 = jointProbC1C2.conditionalize([1]) # in case of conditioning by _SINK_ I have to enable to generate _SINK_ concept only
if table.getSafeValue(index) >= EPSILON: new_value = table.getSafeValue(index)/sum table.setValue(index, new_value) return table def storeResults(self, outDir, symName, (dt4, dt3, dt2), word_C, constFile=None, constAppend=True): wordCard = len(self.symMap) # Save 5-grams NAME = '%sGivenC1C2C3C4' % symName numberOfSpmfs = gmtk.saveDt(outDir, NAME, dt4, 4) gmtk.saveCollection(outDir, NAME, numberOfSpmfs) gmtk.saveSpmfs(outDir, NAME, numberOfSpmfs, wordCard) number = len(word_C.vectSubList([1, 2, 3, 4])) gmtk.saveDpmfsProbs(outDir, NAME, number, wordCard, word_C) if constFile: if constAppend: const_fw = file(constFile, 'a') else: const_fw = file(writeConst, 'w') try: const_fw.write("\n% the cardinality should be CONCEPT_CARD^DEPTH_OF_STACK, but I know that the stack values are sparse\n") const_fw.write("#define JOINT_C1C2C3C4_CARD %d\n" % numberOfSpmfs) finally: const_fw.close() # save 4-grams NAME = '%sGivenC1C2C3' % symName numberOfSpmfs = gmtk.saveDt(outDir, NAME, dt3, 3)
def main(self, conceptMap, symbolMap, symbolDt, dcpts, dpmfs, symbolName, outDir):
    """Back off the symbol 5-gram model down to a unigram and save each level.

    Loads ``jointProbC1C2C3C4`` and ``<symbolName>GivenC1C2C3C4`` through
    ``gmtk``, builds the joint table over (symbol, C1..C4), and from it
    derives and saves the models given C1C2C3C4, C1C2C3, C1C2 and C1,
    followed by the unigram and the "zerogram" models.

    Args:
        conceptMap: passed to ``LexMap().read``; must yield a ``"_SINK_"`` key.
        symbolMap: passed to ``LexMap().read``; must yield ``"_sink_"`` and
            ``"_empty_"`` keys.
        symbolDt: passed to ``gmtk.readDt``.
        dcpts: passed to ``gmtk.readDcpt``.
        dpmfs: passed to ``gmtk.readDpmf``.
        symbolName: name prefix of every model read or written.
        outDir: handed to every ``gmtk.save*`` call as its first argument.
    """
    conceptMap = LexMap().read(conceptMap)
    symbolMap = LexMap().read(symbolMap)

    wordSink = int(symbolMap['_sink_'])
    conceptSink = int(conceptMap['_SINK_'])

    conceptCount = len(conceptMap)
    symbolCount = len(symbolMap)

    # ---- load ------------------------------------------------------------
    dt = gmtk.readDt(symbolDt)
    dcpt = gmtk.readDcpt(dcpts, "jointProbC1C2C3C4")
    pConcepts = gmtk.combineDtDcpt1(dt, dcpt)

    dpmf = gmtk.readDpmf(dpmfs, "%sGivenC1C2C3C4" % symbolName)
    pSymGiven4 = gmtk.combineDtDcpt2(dt, dpmf)

    pJoint4 = pSymGiven4.multiple([1, 2, 3, 4], pConcepts)

    # ---- symbol given C1C2C3C4 -------------------------------------------
    parentCount4 = len(pSymGiven4.vectSubList([1, 2, 3, 4]))
    gmtk.saveDpmfsProbs(outDir, "%sGivenC1C2C3C4" % symbolName,
                        parentCount4, symbolCount, pSymGiven4)

    # ---- symbol given C1C2C3 ---------------------------------------------
    pJoint3 = pJoint4.marginalize([0, 1, 2, 3])
    pSymGiven3 = pJoint3.conditionalize([1, 2, 3])
    parentCount3 = len(pSymGiven3.vectSubList([1, 2, 3]))
    gmtk.saveDpmfsProbs(outDir, "%sGivenC1C2C3" % symbolName,
                        parentCount3, symbolCount, pSymGiven3)

    # ---- symbol given C1C2 -----------------------------------------------
    pJoint2 = pJoint3.marginalize([0, 1, 2])
    pSymGiven2 = pJoint2.conditionalize([1, 2])
    parentCount2 = len(pSymGiven2.vectSubList([1, 2]))
    gmtk.saveDpmfsProbs(outDir, "%sGivenC1C2" % symbolName,
                        parentCount2, symbolCount, pSymGiven2)

    # ---- symbol given C1 -------------------------------------------------
    pJoint1 = pJoint2.marginalize([0, 1])
    pSymGiven1 = pJoint1.conditionalize([1])
    # Under the _SINK_ concept only the _sink_ word may be generated,
    # otherwise the _SINK_ concept would be visible in the stack.
    # TODO: Turn it into validator
    pSymGiven1.setValue([wordSink, conceptSink], 1)
    gmtk.saveDcptBigram(outDir, "%sGivenC1" % symbolName,
                        symbolCount, conceptCount, pSymGiven1)

    # ---- unigram ---------------------------------------------------------
    pSym = pJoint1.marginalize([0])
    # Only the _unseen_ word must be decodable, so generating _empty_ gets
    # probability zero.
    pSym.setValue([int(symbolMap["_empty_"])], 0)
    # Renormalize so the probabilities sum to one again.
    pSym = pSym.normJoint()
    gmtk.saveDcptUnigram(outDir, "%sUnigram" % symbolName, symbolCount, pSym)

    # ---- zerogram --------------------------------------------------------
    gmtk.saveDcptUnseen(outDir, "%sZerogram" % symbolName, symbolCount, symbolMap)
pushGivenC1C2C3C4 = gmtk.combineDtDcpt2(jointDtC1C2C3C4, pushDpmfC1C2C3C4) # # jointProbPC1C2C3C4 = pushGivenC1C2C3C4 * jointProbC1C2C3C4 # jointProbPC1C2C3C4 = pushGivenC1C2C3C4.multiple([1, 2, 3, 4], jointProbC1C2C3C4) ############################################################################## # save pentagrams pushGivenC1C2C3C4 = pushGivenC1C2C3C4.insertPenalty(0, penalty, pushCard) gmtk.saveDpmfsProbs(dirOut, "pushGivenC1C2C3C4", len(pushGivenC1C2C3C4.vectSubList([1, 2, 3, 4])), pushCard, pushGivenC1C2C3C4, -penalty) ############################################################################## # save quatrograms jointProbPC1C2C3 = jointProbPC1C2C3C4.marginalize([0, 1, 2, 3]) pushGivenC1C2C3 = jointProbPC1C2C3.conditionalize([1, 2, 3]) pushGivenC1C2C3 = pushGivenC1C2C3.insertPenalty(0, penalty, pushCard) gmtk.saveDpmfsProbs(dirOut, "pushGivenC1C2C3", len(pushGivenC1C2C3.vectSubList([1, 2, 3])), pushCard, pushGivenC1C2C3, -penalty) ############################################################################## # save trigrams jointProbPC1C2 = jointProbPC1C2C3.marginalize([0, 1, 2])
popDpmfC1C2C3C4 = gmtk.readDpmf(dpmfsFileName, "popGivenC1C2C3C4") assert toolkit.testProb2(popDpmfC1C2C3C4), "Sum of probabilities should be always 1." popGivenC1C2C3C4 = gmtk.combineDtDcpt2(jointDtC1C2C3C4, popDpmfC1C2C3C4) # # jointProbPC1C2C3C4 = popGivenC1C2C3C4 * jointProbC1C2C3C4 # jointProbPC1C2C3C4 = popGivenC1C2C3C4.multiple([1, 2, 3, 4], jointProbC1C2C3C4) ############################################################################## # save pentagrams popGivenC1C2C3C4 = popGivenC1C2C3C4.insertPenalty(0, penalty, popCard) gmtk.saveDpmfsProbs(dirOut, "popGivenC1C2C3C4", len(popGivenC1C2C3C4.vectSubList([1, 2, 3, 4])), popCard, popGivenC1C2C3C4, -penalty) ############################################################################## # save quatrograms jointProbPC1C2C3 = jointProbPC1C2C3C4.marginalize([0, 1, 2, 3]) popGivenC1C2C3 = jointProbPC1C2C3.conditionalize([1, 2, 3]) popGivenC1C2C3 = popGivenC1C2C3.insertPenalty(0, penalty, popCard) gmtk.saveDpmfsProbs(dirOut, "popGivenC1C2C3", len(popGivenC1C2C3.vectSubList([1, 2, 3])), popCard, popGivenC1C2C3, -penalty) ############################################################################## # save trigrams jointProbPC1C2 = jointProbPC1C2C3.marginalize([0, 1, 2]) popGivenC1C2 = jointProbPC1C2.conditionalize([1, 2]) popGivenC1C2 = popGivenC1C2.insertPenalty(0, penalty, popCard)
def storeResults(self, outDir, symName, dts, word_C, constFile=None, constAppend=True):
    """Save the 5-gram symbol model and optionally record its cardinality.

    Writes the ``<symName>GivenC1C2C3C4`` decision tree, collection, sparse
    PMFs and dense PMF probabilities through ``gmtk``.  If ``constFile`` is
    given, a ``#define JOINT_C1C2C3C4_CARD`` line holding the value returned
    by ``gmtk.saveDt`` is written to it.

    Args:
        outDir: first argument of every ``gmtk.save*`` call.
        symName: prefix of the saved model name.
        dts: sequence of exactly three items ``(dt4, dt3, dt2)``.  This used
            to be a Python-2-only tuple parameter; it is now unpacked in the
            body, so positional callers are unaffected.  Only ``dt4`` is used
            in the part of the method shown here.
        word_C: table whose ``vectSubList([1, 2, 3, 4])`` length and contents
            are passed to ``gmtk.saveDpmfsProbs``.
        constFile: path of the constants file, or a falsy value to skip
            writing it.
        constAppend: append to ``constFile`` when true, overwrite it otherwise.

    Raises:
        ValueError / TypeError: if ``dts`` cannot be unpacked into three items.
    """
    dt4, dt3, dt2 = dts

    wordCard = len(self.symMap)

    # Save 5-grams
    NAME = '%sGivenC1C2C3C4' % symName
    numberOfSpmfs = gmtk.saveDt(outDir, NAME, dt4, 4)
    gmtk.saveCollection(outDir, NAME, numberOfSpmfs)
    gmtk.saveSpmfs(outDir, NAME, numberOfSpmfs, wordCard)
    number = len(word_C.vectSubList([1, 2, 3, 4]))
    gmtk.saveDpmfsProbs(outDir, NAME, number, wordCard, word_C)

    if constFile:
        # The overwrite branch used to open the undefined name `writeConst`,
        # raising NameError whenever constAppend was false; both modes must
        # open `constFile`.
        mode = 'a' if constAppend else 'w'
        with open(constFile, mode) as const_fw:
            # The leading "%" is literal text in the constants file; this
            # string is never used as a format string.
            const_fw.write(
                "\n% the cardinality should be CONCEPT_CARD^DEPTH_OF_STACK, but I know that the stack values are sparse\n"
            )
            const_fw.write("#define JOINT_C1C2C3C4_CARD %d\n" % numberOfSpmfs)