# --- c2-gram backoff over force-aligned *.cmb files -------------------------
# Accumulates word / concept n-gram counts from every .cmb file, optionally
# translates word-indexed c2 grams into concept space, then builds the reduced
# gram stacks c2Grams4 -> c2Grams3 -> c2Grams2 via reduceGrams.
# NOTE(review): this chunk was recovered from a collapsed single line; the
# original indentation (and hence the exact extent of the `if not text:` body)
# is not visible here — reconstructed with only the translate() call guarded.
lst = glob.glob(dirCmb + "/*.cmb")
##lst = lst[:300]  # debug toggle: limit the number of processed files
lst.sort()
conceptMap = LexMap().read(conceptFileName)
wordMap = LexMap().read(wordFileName)
rConceptMap = LexMap().read(conceptFileName).reverse()
rWordMap = LexMap().read(wordFileName).reverse()
wordGrams = {}
c1Grams = {}
c2Grams = {}
for fileName in lst:
    bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams)
# number of stacks [c1, c2, c3, c4] is lower than for training
# because during force alignment was not decoded many _DUMMY_
# concepts
if not text:
    c2Grams = translate(c2Grams, wordMap, conceptMap)
c2Grams4 = c2Grams
c2Grams3 = reduceGrams(c2Grams4)
c2Grams2 = reduceGrams(c2Grams3)
if verbose:
    print("Number of c2Grams4: %d" % len(c2Grams4))
else: lst = glob.glob(dirCmb + "/*.cmb") ##lst = lst[:300] lst.sort() conceptMap = LexMap().read(conceptFileName) wordMap = LexMap().read(wordFileName) rConceptMap = LexMap().read(conceptFileName).reverse() rWordMap = LexMap().read(wordFileName).reverse() wordGrams = {} c1Grams = {} c2Grams = {} for fileName in lst: bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams) c1Grams[(u'_EMPTY_', u'_EMPTY_', u'_EMPTY_', u'_EMPTY_')] = 999999 # number of stacks [c1, c2, c3, c4] is lower than for training # because during force alingnment was not decoded many _DUMMY_ # concepts if not text: c1Grams = translate(c1Grams, wordMap, conceptMap) c1Grams4 = c1Grams c1Grams3 = reduceGrams(c1Grams4) c1Grams2 = reduceGrams(c1Grams3) c1Grams1 = reduceGrams(c1Grams2)
# --- word-history backoff over *.hddn files ---------------------------------
# Accumulates grams from hidden-observation files (fileType="hddn"), seeds the
# _EMPTY_ history sentinel, and builds the reduced c1 gram stacks
# c1Grams4 -> c1Grams3 -> c1Grams2 -> c1Grams1.
print("Start word history backoff-ing")
print("-------------------------------------------------")
lst = glob.glob(dirHO + "/*.hddn")
##lst = lst[:300]  # debug toggle: limit the number of processed files
lst.sort()
# fixed: was the Python 2 statement `print len(lst)`, a syntax error under
# Python 3; the call form below works on both 2 and 3
print(len(lst))
wordGrams = {}
c1Grams = {}
c2Grams = {}
for fileName in lst:
    bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams, fileType="hddn")
# sentinel: the all-_EMPTY_ history gets an effectively infinite count
c1Grams[(u"_EMPTY_", u"_EMPTY_", u"_EMPTY_", u"_EMPTY_")] = 999999
# number of stacks [c1, c2, c3, c4] is lower than for training
# because during force alignment was not decoded many _DUMMY_
# concepts
c1Grams4 = c1Grams
c1Grams3 = reduceGrams(c1Grams4)
c1Grams2 = reduceGrams(c1Grams3)
c1Grams1 = reduceGrams(c1Grams2)
print("Number of c1Grams4: %d" % len(c1Grams4))
print("Number of c1Grams3: %d" % len(c1Grams3))
# --- word-history backoff over *.hddn files (single-quote variant) ----------
# NOTE(review): this chunk duplicates the preceding backoff pass except for
# quote style — presumably one copy should be removed; kept as-is pending
# confirmation of which version is live.
print("Start word history backoff-ing")
print("-------------------------------------------------")
lst = glob.glob(dirHO + "/*.hddn")
##lst = lst[:300]  # debug toggle: limit the number of processed files
lst.sort()
# fixed: was the Python 2 statement `print len(lst)`, a syntax error under
# Python 3; the call form below works on both 2 and 3
print(len(lst))
wordGrams = {}
c1Grams = {}
c2Grams = {}
for fileName in lst:
    bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams, fileType='hddn')
# sentinel: the all-_EMPTY_ history gets an effectively infinite count
c1Grams[(u'_EMPTY_', u'_EMPTY_', u'_EMPTY_', u'_EMPTY_')] = 999999
# number of stacks [c1, c2, c3, c4] is lower than for training
# because during force alignment was not decoded many _DUMMY_
# concepts
c1Grams4 = c1Grams
c1Grams3 = reduceGrams(c1Grams4)
c1Grams2 = reduceGrams(c1Grams3)
c1Grams1 = reduceGrams(c1Grams2)
print("Number of c1Grams4: %d" % len(c1Grams4))
print("Number of c1Grams3: %d" % len(c1Grams3))