def __tokenize_sentence(self, processed):
    """
    Tokenize a preprocessed sentence and store the result on self.tokens.

    Removes sentence-level punctuation such as the comma (,) but keeps
    intra-word punctuation such as the dash (-) in e.g. 'morning-after'.
    Function only for internal usage.

    :param processed: a preprocessed sentence (output of the preprocessing step)
    """
    self.tokens = Preprocessor.get_processed_tokens(processed)
def get_idf_array(self):
    """
    Use an external corpus to get IDF scores for cluster centroid calculations.

    Builds a binary document/word incidence matrix over the chosen NLTK corpus
    (Brown by default, Reuters when self.args.corpus == 'R'), then computes
    idf = log10(n / (docs_containing_word + 1)) per word id.

    :return: numpy array of idf values, indexed by WordMap word id
             (also stored on self.idf_array)
    """
    corpus = brown
    if self.args.corpus == 'R':
        corpus = reuters
    num_words = Vectors().num_unique_words
    doc_ids = corpus.fileids()
    n = len(doc_ids)  # number of documents in corpus
    docs_word_matrix = np.zeros([n, num_words])
    for doc_idx, doc_id in enumerate(doc_ids):
        words_in_doc = set()
        # iterate the corpus reader lazily; no need to materialize a list
        for s in corpus.sents(doc_id):
            s = ' '.join(s)
            proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
            if proc_s:
                words_in_doc.update(proc_s)
        for word in words_in_doc:
            word_idx = WordMap.id_of(word)
            # compare against None, not truthiness: a word mapped to id 0 is
            # valid and must not be skipped (presumably id_of returns None
            # for unknown words -- TODO confirm against WordMap)
            if word_idx is not None:
                docs_word_matrix[doc_idx, word_idx] = 1
    docs_per_word = np.sum(docs_word_matrix, axis=0)
    # add one to the denominator to avoid divide-by-zero for unseen words
    self.idf_array = np.log10(np.divide(n, docs_per_word + 1))
    return self.idf_array