def data(self, size, model):
    """Yield one ``(xs, ys)`` matrix pair per sentence from the document stream.

    Documents are pulled from ``docstream()`` (each item it produces is
    iterated as a list of documents). Every document is split with
    ``self.get_sents``; tokens found in ``self.stopwords`` are removed and
    sentences left with fewer than 4 tokens are skipped.

    For each remaining sentence two zero matrices of shape
    ``(len(self.vocab), len(sentence))`` are allocated and filled column by
    column, one column per token position:

    * ``model == 'cbow'``: ``xs`` column = ``get_binvec(context)``,
      ``ys`` column = ``get_onehot(token)``.
    * ``model == 'skipgram'``: ``xs`` column = ``get_onehot(token)``,
      ``ys`` column = ``get_binvec(context)``.

    Any other ``model`` value yields nothing.

    Args:
        size: Maximum number of documents to consume before the generator
            finishes.
        model: ``'cbow'`` or ``'skipgram'``.

    Yields:
        Tuple ``(xs, ys)`` of NumPy arrays, one pair per kept sentence.
    """
    # NOTE(review): the original source had lost its indentation; the yield
    # is placed after the per-position loop (one pair per sentence) because
    # xs/ys are allocated with one column per token -- confirm with callers.
    docs_seen = 0
    for doclist in docstream():
        for doc in doclist:
            if docs_seen >= size:
                # A bare return ends the generator. Raising StopIteration
                # here would surface as RuntimeError (PEP 479, Python 3.7+).
                return
            docs_seen += 1

            for raw_sentence in self.get_sents(doc):
                sen = [w for w in raw_sentence if w not in self.stopwords]
                if len(sen) < 4:
                    continue

                xs = np.zeros((len(self.vocab), len(sen)))
                ys = np.zeros((len(self.vocab), len(sen)))

                if model == 'cbow':
                    # Context vector is the input, centre word the target.
                    for pos, word in enumerate(sen):
                        context = self.get_context(pos, sen)
                        xs[:, pos] = self.get_binvec(context)
                        ys[:, pos] = self.get_onehot(word)
                    yield xs, ys
                elif model == 'skipgram':
                    # Centre word is the input, context vector the target.
                    for pos, word in enumerate(sen):
                        context = self.get_context(pos, sen)
                        xs[:, pos] = self.get_onehot(word)
                        ys[:, pos] = self.get_binvec(context)
                    yield xs, ys
def ns_data(self, size, model):
    """Yield one ``(x, y)`` pair per token position from the document stream.

    Documents are pulled from ``docstream()`` (each item it produces is
    iterated as a list of documents). Every document is split with
    ``self.get_sents``; tokens found in ``self.stopwords`` are removed and
    sentences left with fewer than 4 tokens are skipped.

    For every token position of each remaining sentence:

    * ``model == 'cbow'``: ``x = get_binvec(context)``,
      ``y = get_onehot(token)``.
    * ``model == 'skipgram'``: ``x = get_onehot(token)``,
      ``y = get_binvec(context)``.

    Any other ``model`` value yields nothing.

    Args:
        size: Maximum number of documents to consume before the generator
            finishes.
        model: ``'cbow'`` or ``'skipgram'``.

    Yields:
        Tuple ``(x, y)`` as returned by ``self.get_binvec`` and
        ``self.get_onehot``, one pair per token position.
    """
    # NOTE(review): the original source had lost its indentation; the yield
    # is placed inside the per-position loop because x/y are rebuilt for
    # every token and would otherwise be discarded -- confirm with callers.
    docs_seen = 0
    for doclist in docstream():
        for doc in doclist:
            if docs_seen >= size:
                # A bare return ends the generator. Raising StopIteration
                # here would surface as RuntimeError (PEP 479, Python 3.7+).
                return
            docs_seen += 1

            for raw_sentence in self.get_sents(doc):
                sen = [w for w in raw_sentence if w not in self.stopwords]
                if len(sen) < 4:
                    continue

                if model == 'cbow':
                    # Context vector is the input, centre word the target.
                    for pos, word in enumerate(sen):
                        context = self.get_context(pos, sen)
                        x = self.get_binvec(context)
                        y = self.get_onehot(word)
                        yield x, y
                elif model == 'skipgram':
                    # Centre word is the input, context vector the target.
                    for pos, word in enumerate(sen):
                        context = self.get_context(pos, sen)
                        x = self.get_onehot(word)
                        y = self.get_binvec(context)
                        yield x, y