lst = glob.glob(dirCmb + "/*.cmb")
    ##lst = lst[:300]
    lst.sort()


    conceptMap = LexMap().read(conceptFileName)
    wordMap = LexMap().read(wordFileName)
    rConceptMap = LexMap().read(conceptFileName).reverse()
    rWordMap = LexMap().read(wordFileName).reverse()

    wordGrams = {}
    c1Grams = {}
    c2Grams = {}

    for fileName in lst:
        bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams)


    # number of stacks [c1, c2, c3, c4] is lower than for training
    # because during force alingnment was not decoded many _DUMMY_
    # concepts

    if not text:
        c2Grams = translate(c2Grams, wordMap, conceptMap)
        
    c2Grams4 = c2Grams
    c2Grams3 = reduceGrams(c2Grams4)
    c2Grams2 = reduceGrams(c2Grams3)

    if verbose:
        print("Number of c2Grams4: %d" % len(c2Grams4))
else:
    # Branch: same collection pass as above, but backs off the c1
    # (first-level concept) grams instead of c2, and seeds an _EMPTY_
    # 4-gram with a very large count before reducing.
    lst = glob.glob(dirCmb + "/*.cmb")
    ##lst = lst[:300]
    lst.sort()

    # Forward and reverse lexical maps for concepts and words.
    # (LexMap is a project class; .reverse() presumably inverts the
    # id<->symbol mapping — TODO confirm against LexMap's definition.)
    conceptMap = LexMap().read(conceptFileName)
    wordMap = LexMap().read(wordFileName)
    rConceptMap = LexMap().read(conceptFileName).reverse()
    rWordMap = LexMap().read(wordFileName).reverse()

    # Accumulators filled in-place by bucketing.getWordGrams() below.
    wordGrams = {}
    c1Grams = {}
    c2Grams = {}

    for fileName in lst:
        bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams)

    # Seed the all-_EMPTY_ history with a huge count so it survives the
    # backoff/reduction steps (999999 is effectively "always present").
    c1Grams[(u'_EMPTY_', u'_EMPTY_', u'_EMPTY_', u'_EMPTY_')] = 999999

    # number of stacks [c1, c2, c3, c4] is lower than for training
    # because during force alingnment was not decoded many _DUMMY_
    # concepts

    # When not operating on plain text, remap gram keys through the
    # word/concept lexical maps.
    if not text:
        c1Grams = translate(c1Grams, wordMap, conceptMap)

    # Successive backoff: 4-grams -> 3-grams -> 2-grams -> 1-grams.
    c1Grams4 = c1Grams
    c1Grams3 = reduceGrams(c1Grams4)
    c1Grams2 = reduceGrams(c1Grams3)
    c1Grams1 = reduceGrams(c1Grams2)
# Word-history backoff pass: collect n-gram tables from hidden-observation
# (*.hddn) files and reduce the c1 grams from 4-grams down to 1-grams.
print("Start word history backoff-ing")
print("-------------------------------------------------")

lst = glob.glob(dirHO + "/*.hddn")
##lst = lst[:300]
lst.sort()

# FIX: `print len(lst)` is Python 2 statement syntax and a SyntaxError
# under Python 3; the surrounding code already uses call-style print.
print(len(lst))

# Accumulators filled in-place by bucketing.getWordGrams() below.
wordGrams = {}
c1Grams = {}
c2Grams = {}

for fileName in lst:
    bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams, fileType="hddn")


# Seed the all-_EMPTY_ history with a huge count so it survives the
# backoff/reduction steps (999999 is effectively "always present").
c1Grams[(u"_EMPTY_", u"_EMPTY_", u"_EMPTY_", u"_EMPTY_")] = 999999

# number of stacks [c1, c2, c3, c4] is lower than for training
# because during force alingnment was not decoded many _DUMMY_
# concepts

# Successive backoff: 4-grams -> 3-grams -> 2-grams -> 1-grams.
c1Grams4 = c1Grams
c1Grams3 = reduceGrams(c1Grams4)
c1Grams2 = reduceGrams(c1Grams3)
c1Grams1 = reduceGrams(c1Grams2)

print("Number of c1Grams4: %d" % len(c1Grams4))
print("Number of c1Grams3: %d" % len(c1Grams3))
# --- Example #4 (score: 0) ---
# Word-history backoff pass (variant of the chunk above): collect n-gram
# tables from hidden-observation (*.hddn) files and reduce the c1 grams
# from 4-grams down to 1-grams.
print("Start word history backoff-ing")
print("-------------------------------------------------")

lst = glob.glob(dirHO + "/*.hddn")
##lst = lst[:300]
lst.sort()

# FIX: `print len(lst)` is Python 2 statement syntax and a SyntaxError
# under Python 3; the surrounding code already uses call-style print.
print(len(lst))

# Accumulators filled in-place by bucketing.getWordGrams() below.
wordGrams = {}
c1Grams = {}
c2Grams = {}

for fileName in lst:
    bucketing.getWordGrams(fileName, wordGrams, c1Grams, c2Grams, fileType='hddn')


# Seed the all-_EMPTY_ history with a huge count so it survives the
# backoff/reduction steps (999999 is effectively "always present").
c1Grams[(u'_EMPTY_', u'_EMPTY_', u'_EMPTY_', u'_EMPTY_')] = 999999

# number of stacks [c1, c2, c3, c4] is lower than for training
# because during force alingnment was not decoded many _DUMMY_
# concepts

# Successive backoff: 4-grams -> 3-grams -> 2-grams -> 1-grams.
c1Grams4 = c1Grams
c1Grams3 = reduceGrams(c1Grams4)
c1Grams2 = reduceGrams(c1Grams3)
c1Grams1 = reduceGrams(c1Grams2)

print("Number of c1Grams4: %d" % len(c1Grams4))
print("Number of c1Grams3: %d" % len(c1Grams3))