def recipesToWords(vocab, word_indices, recipes, recs):
    """Encode recipe prefixes as rows of word indices, interleaved by batch slot.

    Reads the module-level settings ``maxlen``, ``step`` and ``batchSize``
    and tokenizes with ``helper.wordTokenize``.

    For each recipe, the text is cut off right after the first occurrence
    of its entry in ``recs``. Then, for every character offset ``i`` in
    ``range(0, len(recipe) - maxlen, step)``, the prefix
    ``recipe[:i + maxlen]`` is tokenized, its final token is dropped, and
    the last ``maxlen`` remaining tokens are written right-aligned into a
    row of width ``maxlen``. Each in-vocabulary token is stored as
    ``word_indices[token]``; out-of-vocabulary tokens and the unused
    left-hand slots stay 0.

    Args:
        vocab: Collection of known words (must be hashable items).
        word_indices: Mapping from word to its integer index.
        recipes: Recipe strings; at most the first ``batchSize`` are
            copied into the result.
        recs: One marker string per recipe, indexed like ``recipes``.

    Returns:
        A float array of shape ``(n * batchSize, maxlen)`` where ``n`` is
        the number of snippets of the FIRST recipe. Row
        ``i * batchSize + j`` holds snippet ``i`` of recipe ``j``. Rows
        for recipes that have fewer than ``i + 1`` snippets, or for batch
        slots with no recipe at all, are left as zeros. Snippets beyond
        ``n`` in longer recipes are ignored. An empty ``recipes`` yields
        an array of shape ``(0, maxlen)``.

    Raises:
        KeyError: If a word is in ``vocab`` but missing from
            ``word_indices``.
    """
    # Build the membership set once: testing against a list vocabulary
    # would rescan the whole list for every single token.
    known_words = set(vocab)

    Xword = []
    for recnumber, recipe in enumerate(recipes):
        marker = recs[recnumber]
        # NOTE(review): if the marker does not occur in the recipe, find()
        # returns -1 and the recipe is silently cut to len(marker) - 1
        # characters. Confirm callers always pass a marker that is present.
        ind1 = recipe.find(marker)
        recipe = recipe[0:ind1 + len(marker)]

        # One output row per sliding-window offset within this recipe.
        starts = range(0, len(recipe) - maxlen, step)
        xword = np.zeros((len(starts), maxlen))
        for row, start in enumerate(starts):
            # Tokenize everything up to the end of the window and drop the
            # last token (presumably because the cut may fall mid-word).
            tokens = helper.wordTokenize(recipe[:start + maxlen])[:-1]
            # Keep only the most recent maxlen tokens, right-aligned.
            tokens = tokens[-maxlen:]
            pad = maxlen - len(tokens)
            for t, word in enumerate(tokens):
                if word in known_words:
                    xword[row, t + pad] = word_indices[word]
        Xword.append(xword)

    if not Xword:
        # No recipes at all: return an empty batch instead of failing on
        # Xword[0].
        return np.zeros((0, maxlen))

    n_rows = len(Xword[0])
    XwordOut = np.zeros((n_rows * batchSize, maxlen))
    # Interleave: recipe j occupies rows j, j + batchSize, j + 2*batchSize...
    # Slicing Xword to batchSize also covers the case of fewer recipes than
    # batch slots; the missing slots simply stay zero.
    for j, xword in enumerate(Xword[:batchSize]):
        n = min(len(xword), n_rows)
        XwordOut[j:j + n * batchSize:batchSize] = xword[:n]
    return XwordOut
def recipesToWords(vocab, word_indices, recipes, recs):
    """Encode recipe prefixes as rows of word indices, interleaved by batch slot.

    Reads the module-level settings ``maxlen``, ``step`` and ``batchSize``
    and tokenizes with ``helper.wordTokenize``.

    For each recipe, the text is cut off right after the first occurrence
    of its entry in ``recs``. Then, for every character offset ``i`` in
    ``range(0, len(recipe) - maxlen, step)``, the prefix
    ``recipe[:i + maxlen]`` is tokenized, its final token is dropped, and
    the last ``maxlen`` remaining tokens are written right-aligned into a
    row of width ``maxlen``. Each in-vocabulary token is stored as
    ``word_indices[token]``; out-of-vocabulary tokens and the unused
    left-hand slots stay 0.

    Args:
        vocab: Collection of known words (must be hashable items).
        word_indices: Mapping from word to its integer index.
        recipes: Recipe strings; at most the first ``batchSize`` are
            copied into the result.
        recs: One marker string per recipe, indexed like ``recipes``.

    Returns:
        A float array of shape ``(n * batchSize, maxlen)`` where ``n`` is
        the number of snippets of the FIRST recipe. Row
        ``i * batchSize + j`` holds snippet ``i`` of recipe ``j``. Rows
        for recipes that have fewer than ``i + 1`` snippets, or for batch
        slots with no recipe at all, are left as zeros. Snippets beyond
        ``n`` in longer recipes are ignored. An empty ``recipes`` yields
        an array of shape ``(0, maxlen)``.

    Raises:
        KeyError: If a word is in ``vocab`` but missing from
            ``word_indices``.
    """
    # Build the membership set once: testing against a list vocabulary
    # would rescan the whole list for every single token.
    known_words = set(vocab)

    Xword = []
    for recnumber, recipe in enumerate(recipes):
        marker = recs[recnumber]
        # NOTE(review): if the marker does not occur in the recipe, find()
        # returns -1 and the recipe is silently cut to len(marker) - 1
        # characters. Confirm callers always pass a marker that is present.
        ind1 = recipe.find(marker)
        recipe = recipe[0:ind1 + len(marker)]

        # One output row per sliding-window offset within this recipe.
        starts = range(0, len(recipe) - maxlen, step)
        xword = np.zeros((len(starts), maxlen))
        for row, start in enumerate(starts):
            # Tokenize everything up to the end of the window and drop the
            # last token (presumably because the cut may fall mid-word).
            tokens = helper.wordTokenize(recipe[:start + maxlen])[:-1]
            # Keep only the most recent maxlen tokens, right-aligned.
            tokens = tokens[-maxlen:]
            pad = maxlen - len(tokens)
            for t, word in enumerate(tokens):
                if word in known_words:
                    xword[row, t + pad] = word_indices[word]
        Xword.append(xword)

    if not Xword:
        # No recipes at all: return an empty batch instead of failing on
        # Xword[0].
        return np.zeros((0, maxlen))

    n_rows = len(Xword[0])
    XwordOut = np.zeros((n_rows * batchSize, maxlen))
    # Interleave: recipe j occupies rows j, j + batchSize, j + 2*batchSize...
    # Slicing Xword to batchSize also covers the case of fewer recipes than
    # batch slots; the missing slots simply stay zero.
    for j, xword in enumerate(Xword[:batchSize]):
        n = min(len(xword), n_rows)
        XwordOut[j:j + n * batchSize:batchSize] = xword[:n]
    return XwordOut
# Read in the text file. The context manager closes the handle as soon as
# the contents are loaded instead of leaving it to the garbage collector.
path = "../allrecipes.txt"
with open(path) as corpus_file:
    text = corpus_file.read().lower()

# Recipes are separated by "$$$$"; the delimiter is re-appended so each
# recipe string ends with its own end marker.
recipes = [r + "$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:", len(recipes))

# Define the character vocabulary. Sorting gives a reproducible
# char <-> index mapping; iterating a bare set of strings can produce a
# different order on every interpreter run.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Define the word vocabulary: keep only tokens that occur more than
# word_thr times in the corpus, most frequent first.
word_thr = 2
toks = helper.wordTokenize(text)
counts = Counter(toks)
vocab = [word for word, count in counts.most_common() if count > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Word indices start at 1, so index 0 never maps to a word.
word_indices = {c: i + 1 for i, c in enumerate(vocab)}
indices_word = {i + 1: c for i, c in enumerate(vocab)}

# Hyperparameters used by the rest of the script.
maxlen = 30
step = 1
stripLen = 200
probStart = 0.1
numComps = 50
# Read in the text file. The context manager closes the handle as soon as
# the contents are loaded instead of leaving it to the garbage collector.
path = "../allrecipes.txt"
with open(path) as corpus_file:
    text = corpus_file.read().lower()

# Recipes are separated by "$$$$"; the delimiter is re-appended so each
# recipe string ends with its own end marker.
recipes = [r + "$$$$" for r in text.split("$$$$")]
np.random.shuffle(recipes)
print("number of recipes:", len(recipes))

# Define the character vocabulary. Sorting gives a reproducible
# char <-> index mapping; iterating a bare set of strings can produce a
# different order on every interpreter run.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Define the word vocabulary: keep only tokens that occur more than
# word_thr times in the corpus, most frequent first.
word_thr = 2
toks = helper.wordTokenize(text)
counts = Counter(toks)
vocab = [word for word, count in counts.most_common() if count > word_thr]
vocSize = len(vocab)
print('corpus length (characters):', len(text))
print('corpus length (tokens)', len(toks))
print('vocab size:', vocSize)

# Word indices start at 1, so index 0 never maps to a word.
word_indices = {c: i + 1 for i, c in enumerate(vocab)}
indices_word = {i + 1: c for i, c in enumerate(vocab)}

# Hyperparameters used by the rest of the script.
maxlen = 30
step = 1
stripLen = 100
probStart = 0.1