Example 1
	def addChannel(self, paths):
		# Read each selected file into a Document and collect them in a Corpus.
		c = Corpus()
		for path in paths:
			print(path)
			with open(path) as f:
				text = f.read()
				name = path.split('/')[-1]
				d = Document(name, text)
				c.add_document(d)
		# Add the corpus as a new channel and re-layout the frame.
		self.sourceboard.addChannel(c)
		self.refresh()
		self.frame.fSizer.Layout()
		self.frame.Fit()
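Corpus and Document come from elsewhere in the project and are not shown here. A minimal sketch of the interface this handler assumes (the real classes likely carry more state and behavior):

class Document:
    # A named piece of text.
    def __init__(self, name, text):
        self.name = name
        self.text = text


class Corpus:
    # A growable collection of Document objects.
    def __init__(self):
        self.documents = []

    def add_document(self, document):
        self.documents.append(document)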
Example 2
import pandas as pd
from sklearn.cluster import KMeans


def parse_bookmark_list(dl):
    # Collect the title and URL of every <dt> entry in a bookmark <dl> list.
    title_urls = []
    for dt in dl.find_all('dt'):
        anchor = dt.find('a')
        title_urls.append({
            'url': anchor['href'],
            'title': anchor.text
        })
    return title_urls


if __name__ == '__main__':
    bookmarks = read_bookmarks_from_file('test_bookmarks.html')
    urls = [bm['url'] for bm in bookmarks['Unsorted']]

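    # Fetch each bookmarked URL and build a document corpus from the pages.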
    corp = Corpus()
    for u in urls:
        doc = create_document_from_url(u)
        corp.add_document(doc)

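    # Vectorize the corpus, then cluster the document vectors with k-means.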
    vec = vectorize_corpus(corp)

    km = KMeans(n_clusters=4)
    km.fit(vec)

    clusters = km.labels_.tolist()
    # Pair each URL with its cluster label and print them sorted by cluster.
    results = {'url': urls, 'cluster': clusters}
    frame = pd.DataFrame(results, index=clusters, columns=['url', 'cluster'])
    print(frame.sort_values(by=['cluster']))
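read_bookmarks_from_file, create_document_from_url, and vectorize_corpus are assumed by this example but not shown. As one hypothetical reading, read_bookmarks_from_file could parse a Netscape-format bookmarks export with BeautifulSoup and map each folder heading to the entries of the list that follows it:

from bs4 import BeautifulSoup


def read_bookmarks_from_file(path):
    # Hypothetical sketch: map each folder name (an <h3> heading) to the
    # parsed entries of the <dl> list that follows it, reusing
    # parse_bookmark_list from the example above.
    with open(path) as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    folders = {}
    for h3 in soup.find_all('h3'):
        dl = h3.find_next('dl')
        if dl is not None:
            folders[h3.text] = parse_bookmark_list(dl)
    return folders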
Example 3
import numpy as np
from collections import defaultdict


class SGNS:
    def __init__(self, args):
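        # First pass over the corpus: count word frequencies to build the lexicon.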
        lexicon_builder = LexiconBuilder()
        print('building lexicon...')
        count = 0
        for line in document_reader(args.corpus_dir):
            count += 1
            if count % 100000 == 0:
                print('read {} lines'.format(count))
            lexicon_builder.register_corpus(line)
        self.lexicon = lexicon_builder.generate_lexicon(
            args.lexicon_size, args.negative_sample_power)
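        # Second pass: convert each line to word ids and store it as a document.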
        self.corpus = Corpus()
        print('loading corpus...')
        count = 0
        for line in document_reader(args.corpus_dir):
            count += 1
            if count % 100000 == 0:
                print('read {} lines'.format(count))
            line_ids = [self.lexicon.word2id(word) for word in line]
            if not args.train_oov_token:
                line_ids = [i for i in line_ids if i != Lexicon.OOV_WORD_ID]
            self.corpus.add_document(line_ids)
        self.args = args
        self.learning_rate = args.init_learning_rate
        self.minibatch_count = 0

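        # Center (v) and context (u) embeddings, initialized uniformly in [-0.5, 0.5).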
        self.v = np.random.rand(args.lexicon_size, args.vector_dim) - 0.5
        self.u = np.random.rand(args.lexicon_size, args.vector_dim) - 0.5

    def train_minibatch(self):
        v_update = defaultdict(lambda: np.zeros(self.args.vector_dim))
        u_update = defaultdict(lambda: np.zeros(self.args.vector_dim))
        objective = 0
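        # For each sampled (center, context) pair, take a gradient step that
        # raises sigmoid(v.u) for observed pairs and lowers it for sampled
        # negative words (the skip-gram negative-sampling objective).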
        for _ in range(self.args.minibatch_size):
            center_word, target_word = self.corpus.sample_word_pair(
                self.args.window_size)
            vp, up = self.v[center_word], self.u[target_word]
            prob = sigmoid(np.dot(vp, up))
            v_update[center_word] += self.learning_rate * (1 - prob) * up
            u_update[target_word] += self.learning_rate * (1 - prob) * vp
            objective += np.log(prob)
            for __ in range(self.args.negative_sample_size):
                negative_word = self.lexicon.sample_word()
                un = self.u[negative_word]
                prob = sigmoid(np.dot(vp, un))
                v_update[center_word] -= self.learning_rate * prob * un
                u_update[negative_word] -= self.learning_rate * prob * vp
                objective += np.log(1 - prob)
        self.update_vectors(v_update, u_update)
        print('batch: %d lr: %.4f objective: %.4f' %
              (self.minibatch_count, self.learning_rate,
               objective / self.args.minibatch_size))
        self.minibatch_count += 1
        self.learning_rate *= self.args.learning_rate_decay

    def update_vectors(self, v_update, u_update):
        for i, dv in v_update.items():
            self.v[i] += dv
        for i, du in u_update.items():
            self.u[i] += du

    def train(self):
        for _ in range(self.args.num_minibatches):
            self.train_minibatch()

    def save(self, output_file):
        # Write vectors in word2vec text format: a "vocab_size vector_dim"
        # header, then one word and its vector per line. Each word's vector
        # is the average of its center (v) and context (u) embeddings.
        with open(output_file, 'w') as fp:
            fp.write('{} {}\n'.format(self.args.lexicon_size,
                                      self.args.vector_dim))
            for i in range(self.args.lexicon_size):
                vec = 0.5 * (self.v[i] + self.u[i])
                fp.write('{} {}\n'.format(self.lexicon.id2word(i),
                                          ' '.join(str(x) for x in vec)))
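The sigmoid and document_reader helpers are imported from elsewhere in the project. A minimal sketch, assuming sigmoid is the standard logistic function and document_reader yields one whitespace-tokenized line at a time:

import os

import numpy as np


def sigmoid(x):
    # Standard logistic function; here it plays the role of
    # P(pair observed) in the SGNS objective.
    return 1.0 / (1.0 + np.exp(-x))


def document_reader(corpus_dir):
    # Assumed behavior: yield every line of every file under corpus_dir
    # as a list of tokens.
    for name in sorted(os.listdir(corpus_dir)):
        with open(os.path.join(corpus_dir, name)) as f:
            for line in f:
                yield line.split()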