def addChannel(self, paths):
    # Build a Corpus from the selected files and hand it to the source board.
    c = Corpus()
    for path in paths:
        print(path)
        with open(path) as f:
            text = f.read()
        name = path.split('/')[-1]
        d = Document(name, text)
        c.add_document(d)
    self.sourceboard.addChannel(c)
    # Refresh the display and let wx recompute the layout.
    self.refresh()
    self.frame.fSizer.Layout()
    self.frame.Fit()
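# The Corpus and Document classes used above are not shown in this snippet.
# A minimal sketch of the interface the method appears to assume, inferred
# from the call sites rather than taken from the real implementation:
class Document:
    def __init__(self, name, text):
        self.name = name
        self.text = text


class Corpus:
    def __init__(self):
        self.documents = []

    def add_document(self, doc):
        self.documents.append(doc)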
import pandas as pd
from sklearn.cluster import KMeans

# read_bookmarks_from_file, create_document_from_url, vectorize_corpus, and
# Corpus are defined elsewhere in this project.


def parse_bookmark_list(dl):
    # Extract (title, url) pairs from the <dt> entries of a bookmark <dl> tag.
    title_urls = []
    for dt in dl.find_all('dt'):
        title_urls.append({
            'url': dt.find('a')['href'],
            'title': dt.find('a').text
        })
    return title_urls


if __name__ == '__main__':
    # Cluster the URLs in the "Unsorted" bookmark folder with k-means.
    bookmarks = read_bookmarks_from_file('test_bookmarks.html')
    urls = [bm['url'] for bm in bookmarks['Unsorted']]

    corp = Corpus()
    for u in urls:
        doc = create_document_from_url(u)
        corp.add_document(doc)

    vec = vectorize_corpus(corp)
    km = KMeans(n_clusters=4)
    km.fit(vec)
    clusters = km.labels_.tolist()

    bookmarks = {'url': urls, 'cluster': clusters}
    frame = pd.DataFrame(bookmarks, index=[clusters], columns=['url', 'cluster'])
    print(frame.sort_values(by=['cluster']))
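# vectorize_corpus is not shown in this snippet. Below is a minimal sketch of
# what such a function might look like, assuming the Corpus exposes a
# .documents list and each document carries a .text attribute (both of these
# are assumptions), using scikit-learn's TfidfVectorizer to produce a feature
# matrix that KMeans.fit() can consume directly.
from sklearn.feature_extraction.text import TfidfVectorizer


def vectorize_corpus_sketch(corpus):
    # One TF-IDF row per document; stop words removed to reduce noise.
    texts = [doc.text for doc in corpus.documents]
    return TfidfVectorizer(stop_words='english').fit_transform(texts)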
import numpy as np
from collections import defaultdict

# LexiconBuilder, Lexicon, Corpus, and document_reader come from elsewhere
# in this project.


def sigmoid(x):
    # Logistic function used for both positive and negative sample probabilities.
    return 1.0 / (1.0 + np.exp(-x))


class SGNS:
    """Skip-gram with negative sampling (word2vec-style) trainer."""

    def __init__(self, args):
        # First pass over the corpus: count words to build the lexicon.
        lexicon_builder = LexiconBuilder()
        print('building lexicon...')
        count = 0
        for line in document_reader(args.corpus_dir):
            count += 1
            if count % 100000 == 0:
                print('read {} lines'.format(count))
            lexicon_builder.register_corpus(line)
        self.lexicon = lexicon_builder.generate_lexicon(
            args.lexicon_size, args.negative_sample_power)

        # Second pass: convert each line to word ids and store it.
        self.corpus = Corpus()
        print('loading corpus...')
        count = 0
        for line in document_reader(args.corpus_dir):
            count += 1
            if count % 100000 == 0:
                print('read {} lines'.format(count))
            line_ids = [self.lexicon.word2id(word) for word in line]
            if not args.train_oov_token:
                line_ids = [i for i in line_ids if i != Lexicon.OOV_WORD_ID]
            self.corpus.add_document(line_ids)

        self.args = args
        self.learning_rate = args.init_learning_rate
        self.minibatch_count = 0
        # Input (v) and output (u) embeddings, initialized uniformly in [-0.5, 0.5).
        self.v = np.random.rand(args.lexicon_size, args.vector_dim) - 0.5
        self.u = np.random.rand(args.lexicon_size, args.vector_dim) - 0.5

    def train_minibatch(self):
        v_update = defaultdict(lambda: np.zeros(self.args.vector_dim))
        u_update = defaultdict(lambda: np.zeros(self.args.vector_dim))
        objective = 0
        for _ in range(self.args.minibatch_size):
            # Positive example: push the center and context vectors together.
            center_word, target_word = self.corpus.sample_word_pair(
                self.args.window_size)
            vp, up = self.v[center_word], self.u[target_word]
            prob = sigmoid(np.dot(vp, up))
            v_update[center_word] += self.learning_rate * (1 - prob) * up
            u_update[target_word] += self.learning_rate * (1 - prob) * vp
            objective += np.log(prob)
            # Negative examples: push the center vector away from sampled words.
            for __ in range(self.args.negative_sample_size):
                negative_word = self.lexicon.sample_word()
                un = self.u[negative_word]
                prob = sigmoid(np.dot(vp, un))
                v_update[center_word] -= self.learning_rate * prob * un
                u_update[negative_word] -= self.learning_rate * prob * vp
                objective += np.log(1 - prob)
        self.update_vectors(v_update, u_update)
        print("batch: %d lr: %.4f objective: %.4f" % (
            self.minibatch_count, self.learning_rate,
            objective / self.args.minibatch_size))
        self.minibatch_count += 1
        self.learning_rate *= self.args.learning_rate_decay

    def update_vectors(self, v_update, u_update):
        # Apply the accumulated (sparse) gradient updates.
        for i, dv in v_update.items():
            self.v[i] += dv
        for i, du in u_update.items():
            self.u[i] += du

    def train(self):
        for _ in range(self.args.num_minibatches):
            self.train_minibatch()

    def save(self, output_file):
        # Write vectors in word2vec text format: a header line, then one word
        # per line followed by its vector components.
        with open(output_file, 'w') as fp:
            fp.write('{} {}\n'.format(self.args.lexicon_size,
                                      self.args.vector_dim))
            for i in range(self.args.lexicon_size):
                # Average the input and output embeddings for the final vector.
                vec = 0.5 * (self.v[i] + self.u[i])
                fp.write('{} {}\n'.format(
                    self.lexicon.id2word(i),
                    ' '.join(str(x) for x in vec)))
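# A minimal sketch of how the class above might be driven. The option names
# mirror the args.* attributes referenced in SGNS; the defaults and the
# --output_file flag are illustrative assumptions, not values from the
# original project.
import argparse


def build_arg_parser():
    p = argparse.ArgumentParser(
        description='Train skip-gram embeddings with negative sampling.')
    p.add_argument('--corpus_dir', required=True)
    p.add_argument('--lexicon_size', type=int, default=30000)
    p.add_argument('--vector_dim', type=int, default=100)
    p.add_argument('--negative_sample_power', type=float, default=0.75)
    p.add_argument('--negative_sample_size', type=int, default=5)
    p.add_argument('--window_size', type=int, default=5)
    p.add_argument('--minibatch_size', type=int, default=128)
    p.add_argument('--num_minibatches', type=int, default=10000)
    p.add_argument('--init_learning_rate', type=float, default=0.025)
    p.add_argument('--learning_rate_decay', type=float, default=0.9999)
    p.add_argument('--train_oov_token', action='store_true')
    p.add_argument('--output_file', default='vectors.txt')
    return p


if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    model = SGNS(args)
    model.train()
    model.save(args.output_file)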