Python wordTokenize Examples

Programming Language: Python

Namespace/Package Name: helper

Method/Function: wordTokenize

Examples at hotexamples.com: 4

Python wordTokenize - 4 examples found. These are the top-rated real-world Python examples of helper.wordTokenize, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: mergedRNN3.py Project: jamesmf/recipeGenerator

def recipesToWords(vocab, word_indices, recipes, recs):
    """Encode sliding word-level snippets of each recipe as index rows.

    Relies on names defined outside this function: ``maxlen``, ``step``,
    ``batchSize``, ``np`` and ``helper.wordTokenize``.

    Args:
        vocab: collection of known words; only these words are encoded.
        word_indices: mapping from word to its integer index.
        recipes: recipe strings.
        recs: for each recipe, the substring at whose first occurrence the
            recipe is cut off (the substring itself is kept). If it is not
            found, ``str.find`` returns -1 and the recipe is cut to
            ``len(recs[k]) - 1`` characters.

    Returns:
        A float array of shape ``(len(first_recipe_snippets) * batchSize,
        maxlen)``. Row ``i * batchSize + j`` holds snippet ``i`` of recipe
        ``j``, right-aligned and left-padded with zeros; words outside
        ``vocab`` stay 0. Rows for recipes that have fewer snippets than
        the first recipe are left as zeros.

    Raises:
        IndexError: if ``recipes`` is empty, or if it holds fewer than
            ``batchSize`` recipes while the first one yields a snippet.
    """
    # Set membership is O(1); scanning the vocab list for every token of
    # every snippet is not. Assumes tokens are hashable (strings).
    known_words = set(vocab)

    Xword = []
    for recnumber, recipe in enumerate(recipes):
        rec = recs[recnumber]
        recipe = recipe[0:recipe.find(rec) + len(rec)]

        # Tokenize every prefix ending at a window boundary and drop its
        # last token (presumably a possibly incomplete word -- confirm
        # against helper.wordTokenize).
        word_snips = [
            helper.wordTokenize(recipe[:i + maxlen])[:-1]
            for i in range(0, len(recipe) - maxlen, step)
        ]

        #iterate over snippets within one recipe
        xword = np.zeros((len(word_snips), maxlen))
        for i, wordsnip in enumerate(word_snips):
            #keep at most the last maxlen tokens of the snippet
            wordsnip = wordsnip[-maxlen:]
            #shorter snippets are shifted right, leaving zero padding in front
            diff = max(0, maxlen - len(wordsnip))

            #store the vocabulary index of each known word at its timestep
            for t, word in enumerate(wordsnip):
                if word in known_words:
                    xword[i, t + diff] = word_indices[word]

        Xword.append(xword)

    n_snips = len(Xword[0])
    XwordOut = np.zeros((n_snips * batchSize, maxlen))

    #interleave the recipes: snippet i of recipes 0..batchSize-1, then i+1, ...
    ind = 0
    for i in range(n_snips):
        for j in range(batchSize):
            if i < len(Xword[j]):
                XwordOut[ind, :] = Xword[j][i]
            ind += 1

    return XwordOut

Example #2

Show file

File: mergedRNNs.py Project: jamesmf/recipeGenerator

def recipesToWords(vocab,word_indices,recipes,recs):
    """Turn each recipe into rows of word indices, one row per text window.

    Uses the module-level ``maxlen``, ``step`` and ``batchSize`` settings
    as well as ``np`` and ``helper.wordTokenize``.

    Args:
        vocab: collection of words that may be encoded.
        word_indices: mapping word -> integer index.
        recipes: list of recipe strings.
        recs: per-recipe marker substring; each recipe is truncated right
            after the first occurrence of its marker. When the marker is
            missing ``str.find`` yields -1, so the recipe is truncated to
            ``len(marker) - 1`` characters.

    Returns:
        Array of shape ``(n * batchSize, maxlen)`` where ``n`` is the number
        of windows of the first recipe. Row ``i * batchSize + j`` is window
        ``i`` of recipe ``j``: the indices of its last ``maxlen`` tokens,
        right-aligned, with 0 for padding and for words not in ``vocab``.
        A row stays all-zero when recipe ``j`` has no window ``i``.

    Raises:
        IndexError: when ``recipes`` is empty, or when there are fewer than
            ``batchSize`` recipes and the first one has at least one window.
    """
    # Build the lookup set once; testing against the vocab list inside the
    # innermost loop is linear in the vocabulary size. Assumes tokens are
    # hashable (strings).
    vocab_lookup = set(vocab)

    Xword   = []
    for recnumber, recipe in enumerate(recipes):
        marker      = recs[recnumber]
        cut         = recipe.find(marker) + len(marker)
        recipe      = recipe[0:cut]

        # One tokenized prefix per window start; the trailing token is
        # discarded (presumably it can be a partial word -- TODO confirm).
        word_snips  = []
        for i in range(0, len(recipe) - maxlen, step):
            word_snips.append(helper.wordTokenize(recipe[: i + maxlen])[:-1])

        #iterate over snippets within one recipe
        xword   = np.zeros((len(word_snips),maxlen))
        for i, wordsnip in enumerate(word_snips):
            #only the last maxlen tokens fit into a row
            wordsnip    = wordsnip[-maxlen:]
            #offset that right-aligns snippets shorter than maxlen
            diff        = max(0, maxlen - len(wordsnip))

            #write the index of every known word at its timestep
            for t, word in enumerate(wordsnip):
                if word in vocab_lookup:
                    xword[i,t+diff] = word_indices[word]

        Xword.append(xword)

    rows_per_recipe = len(Xword[0])
    XwordOut   = np.zeros((rows_per_recipe*batchSize,maxlen))

    #interleave: window i of every recipe in the batch, then window i+1, ...
    ind     = 0
    for i in range(rows_per_recipe):
        for j in range(batchSize):
            if i < len(Xword[j]):
                XwordOut[ind,:] = Xword[j][i]
            ind+=1

    return XwordOut

Example #3

Show file

File: mergedRNN3.py Project: jamesmf/recipeGenerator

#read in the text file
path = "../allrecipes.txt"
# A context manager closes the file handle as soon as the text is read.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Recipes are separated by "$$$$"; the separator is re-appended to each piece.
recipes = [r + "$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:", len(recipes))

#define the character vocabulary
chars = list(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

#define the word vocabulary
# Only words occurring more than word_thr times are kept.
word_thr = 2
toks = helper.wordTokenize(text)
counts = Counter(toks)
vocab = [word for word, count in counts.most_common() if count > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
# The original printed only the label; report the actual token count.
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Word indices start at 1, leaving 0 unused by any vocabulary word.
word_indices = {c: i + 1 for i, c in enumerate(vocab)}
indices_word = {i + 1: c for i, c in enumerate(vocab)}

maxlen = 30
step = 1
stripLen = 200
probStart = 0.1
numComps = 50

Example #4

Show file

File: mergedRNNs.py Project: jamesmf/recipeGenerator

#read in the text file
path    = "../allrecipes.txt"
# Read inside a with-block so the file is closed instead of leaked.
with open(path) as recipe_file:
    text = recipe_file.read().lower()
# Split on the "$$$$" separator and put it back at the end of every piece.
recipes = [r+"$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:",len(recipes))

#define the character vocabulary
chars = list(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

#define the word vocabulary
# A word enters the vocabulary only if it occurs more than word_thr times.
word_thr= 2
toks    = helper.wordTokenize(text)
counts  = Counter(toks)
vocab   = [word for word, count in counts.most_common() if count > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
# Previously this line printed the label without any value.
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Indices are 1-based, so 0 is never assigned to a vocabulary word.
word_indices = {c: i+1 for i, c in enumerate(vocab)}
indices_word = {i+1: c for i, c in enumerate(vocab)}

maxlen   = 30
step     = 1
stripLen = 100
probStart= 0.1