Ejemplo n.º 1
0
def recipesToWords(vocab, word_indices, recipes, recs):
    """Encode recipes as right-aligned rows of word indices, interleaved by batch slot.

    Each recipe is first cut just after the first occurrence of its
    counterpart in ``recs``. For every offset ``i`` in
    ``range(0, len(recipe) - maxlen, step)`` the character prefix
    ``recipe[:i + maxlen]`` is tokenized with ``helper.wordTokenize`` and the
    final token is dropped (presumably because the cut can fall mid-word --
    confirm against ``helper``). The last ``maxlen`` tokens of that snippet
    are written, right-aligned, into one row as the integer indices found in
    ``word_indices``. Tokens outside ``vocab`` and unused leading positions
    stay 0. The rows are plain index sequences, not one-hot vectors.

    Relies on the module-level names ``maxlen``, ``step``, ``batchSize``,
    ``helper`` and ``np``.

    Args:
        vocab: Iterable of known (hashable) words.
        word_indices: Mapping from word to integer index.
        recipes: Sequence of recipe strings.
        recs: Sequence parallel to ``recipes``; ``recs[k]`` is the substring
            after which ``recipes[k]`` is truncated.

    Returns:
        A float array of shape ``(len(first recipe's snippets) * batchSize,
        maxlen)``. Row ``i * batchSize + j`` holds snippet ``i`` of recipe
        ``j``; it is all zeros when recipe ``j`` has fewer snippets.

    Raises:
        IndexError: If ``recipes`` is empty, if ``recs`` is shorter than
            ``recipes``, or (when the first recipe yields at least one
            snippet) if there are fewer than ``batchSize`` recipes.
        KeyError: If a word is in ``vocab`` but missing from ``word_indices``.
    """
    # Hash-based membership: vocab may be a plain list, and scanning it for
    # every token made the innermost loop linear in the vocabulary size.
    known_words = set(vocab)

    Xword = []
    for recnumber, recipe in enumerate(recipes):
        marker = recs[recnumber]
        # NOTE(review): find() returns -1 when the marker is absent, which
        # silently truncates the recipe to len(marker) - 1 characters --
        # confirm that callers guarantee a match.
        recipe = recipe[:recipe.find(marker) + len(marker)]

        # One tokenized snippet per character offset.
        word_snips = [
            helper.wordTokenize(recipe[:i + maxlen])[:-1]
            for i in range(0, len(recipe) - maxlen, step)
        ]

        xword = np.zeros((len(word_snips), maxlen))
        for i, wordsnip in enumerate(word_snips):
            if len(wordsnip) >= maxlen:
                # Keep only the most recent maxlen tokens.
                wordsnip = wordsnip[-maxlen:]
                diff = 0
            else:
                # Short snippets are padded on the left with zeros.
                diff = maxlen - len(wordsnip)

            for t, word in enumerate(wordsnip):
                if word in known_words:
                    xword[i, t + diff] = word_indices[word]

        Xword.append(xword)

    # The first recipe alone determines how many snippet steps are emitted.
    n_snips = len(Xword[0])
    XwordOut = np.zeros((n_snips * batchSize, maxlen))
    for i in range(n_snips):
        for j in range(batchSize):
            if i < len(Xword[j]):
                XwordOut[i * batchSize + j, :] = Xword[j][i]

    return XwordOut
Ejemplo n.º 2
0
def _encodeSnippet(tokens, known_words, word_indices, width):
    """Return a length-``width`` row with the indices of the last ``width`` tokens, right-aligned."""
    row = np.zeros(width)
    if len(tokens) >= width:
        tokens = tokens[-width:]
        offset = 0
    else:
        # Fewer tokens than slots: leave the leading positions at zero.
        offset = width - len(tokens)
    for t, word in enumerate(tokens):
        if word in known_words:
            row[t + offset] = word_indices[word]
    return row


def recipesToWords(vocab,word_indices,recipes,recs):
    """Turn recipes into word-index rows, interleaved across ``batchSize`` recipes.

    For recipe ``k`` the text is truncated just past the first occurrence of
    ``recs[k]``. Then, for each offset ``i`` in
    ``range(0, len(recipe) - maxlen, step)``, the prefix
    ``recipe[:i + maxlen]`` is tokenized with ``helper.wordTokenize`` and its
    last token is discarded. The trailing ``maxlen`` tokens of each snippet
    are stored right-aligned as integer indices from ``word_indices``. Words
    not in ``vocab`` and padding positions are 0. No one-hot encoding is
    performed.

    Uses the module-level names ``maxlen``, ``step``, ``batchSize``,
    ``helper`` and ``np``.

    Args:
        vocab: Iterable of known (hashable) words.
        word_indices: Mapping from word to integer index.
        recipes: Sequence of recipe strings.
        recs: Sequence parallel to ``recipes``; ``recs[k]`` marks where
            ``recipes[k]`` is cut.

    Returns:
        A float array with ``len(first recipe's snippets) * batchSize`` rows
        and ``maxlen`` columns. Row ``i * batchSize + j`` is snippet ``i`` of
        recipe ``j``, or zeros if that recipe has no such snippet.

    Raises:
        IndexError: If ``recipes`` is empty, if ``recs`` is shorter than
            ``recipes``, or (when the first recipe yields at least one
            snippet) if there are fewer than ``batchSize`` recipes.
        KeyError: If a word is in ``vocab`` but missing from ``word_indices``.
    """
    # Build the membership set once; testing `word in vocab` against a list
    # for every token costs a full scan of the vocabulary each time.
    known_words = set(vocab)

    Xword = []
    for recnumber, recipe in enumerate(recipes):
        rec = recs[recnumber]
        # NOTE(review): if rec does not occur in recipe, find() yields -1 and
        # the slice below keeps only len(rec) - 1 characters -- verify that
        # callers always pass a matching pair.
        cut = recipe.find(rec) + len(rec)
        recipe = recipe[0:cut]

        word_snips = [
            helper.wordTokenize(recipe[: i + maxlen])[:-1]
            for i in range(0, len(recipe) - maxlen, step)
        ]

        # One row per snippet for this recipe.
        xword = np.zeros((len(word_snips), maxlen))
        for i, wordsnip in enumerate(word_snips):
            xword[i] = _encodeSnippet(wordsnip, known_words, word_indices, maxlen)

        Xword.append(xword)

    # The number of snippet steps is taken from the first recipe only.
    steps = len(Xword[0])
    XwordOut = np.zeros((steps * batchSize, maxlen))
    for i in range(steps):
        base = i * batchSize
        for j in range(batchSize):
            if i < len(Xword[j]):
                XwordOut[base + j, :] = Xword[j][i]

    return XwordOut
Ejemplo n.º 3
0
#read in the text file
path = "../allrecipes.txt"
# Context manager so the file handle is closed as soon as the text is read.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Every chunk gets the "$$$$" delimiter re-appended, including whatever
# follows the last delimiter in the file.
recipes = [r + "$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:", len(recipes))

#define the character vocabulary
# Sorted so the character <-> index mapping is the same on every run; plain
# set iteration order for strings can differ between interpreter sessions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

#define the word vocabulary
# Only tokens occurring more than word_thr times are kept.
word_thr = 2
toks = helper.wordTokenize(text)
counts = Counter(toks)
vocab = [word for word, count in counts.most_common() if count > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
# The token count was missing from this print call.
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Word indices start at 1, leaving 0 unused by any vocabulary word.
word_indices = {w: i + 1 for i, w in enumerate(vocab)}
indices_word = {i + 1: w for i, w in enumerate(vocab)}

maxlen = 30
step = 1
stripLen = 200
probStart = 0.1
numComps = 50
Ejemplo n.º 4
0
#read in the text file
path    = "../allrecipes.txt"
# Read inside a with-block so the file is closed right after loading.
with open(path) as recipe_file:
    text = recipe_file.read().lower()
# Each piece produced by the split has the "$$$$" delimiter appended again,
# including the piece after the final delimiter.
recipes = [r+"$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:",len(recipes))

#define the character vocabulary
# Sorting makes the character/index tables reproducible across runs; the
# iteration order of a set of strings is not stable between sessions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

#define the word vocabulary
# Keep only tokens that occur more than word_thr times.
word_thr= 2
toks    = helper.wordTokenize(text)
counts  = Counter(toks)
vocab   = [tok for tok, freq in counts.most_common() if freq > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
# This print call previously had a label but no value.
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Indices are 1-based, so no vocabulary word maps to 0.
word_indices = {w: i+1 for i, w in enumerate(vocab)}
indices_word = {i+1: w for i, w in enumerate(vocab)}

maxlen   = 30
step     = 1
stripLen = 100
probStart= 0.1