def filter_phi(phi_path=constant.lda_phi, phi_after_path=constant.lda_phi_after, phi_threadhold=8):
    """Keep only the largest entries of each phi column and write the result.

    The file at ``phi_path`` is read as a whitespace-separated matrix of
    floats, one row per line (the local names call rows "topics" and columns
    "words"). For every column the ``phi_threadhold`` largest values are kept
    and every other entry is set to 0; an entry equal to one of the kept
    values is kept as well, so ties can leave more than ``phi_threadhold``
    non-zero entries in a column. The filtered matrix is written to
    ``phi_after_path``, each value followed by one space, one row per line.

    Args:
        phi_path: path of the input matrix file.
        phi_after_path: path of the output file (overwritten).
        phi_threadhold: how many of the largest values to keep per column.

    Raises:
        ValueError: if a token in the input cannot be parsed as a float.
    """
    # First pass: only measure the matrix so the file is never held in memory
    # twice. The first row fixes the column count. split() without arguments
    # is used here as well as below, so both passes tokenize the same way.
    word_count = 0
    topic_count = 0
    with open(phi_path) as origin:
        for line in origin:
            if topic_count == 0:
                word_count = len(line.split())
            topic_count += 1

    if topic_count == 0:
        # Empty input: nothing to filter, but still produce the output file.
        with open(phi_after_path, 'w'):
            pass
        return

    # Values are stored in (and later read back from) the matrix so that the
    # text written by str() below is formatted as the matrix element type.
    phi = sparse.lil_matrix((topic_count, word_count), dtype=float)
    with open(phi_path) as origin:
        for i, line in enumerate(origin):
            for j, token in enumerate(line.split()):
                phi[i, j] = float(token)

    topic_indices = range(topic_count)
    word_indices = range(word_count)

    for j in word_indices:
        column = [phi[i, j] for i in topic_indices]
        kept = sorted(column, reverse=True)[0:phi_threadhold]
        # `kept` is a prefix of the descending sort, so "value is one of the
        # kept values" is the same as "value >= the smallest kept value".
        # An empty `kept` means nothing survives in this column.
        for i, value in enumerate(column):
            if not kept or value < kept[-1]:
                phi[i, j] = 0

    with open(phi_after_path, 'w') as out:
        for i in topic_indices:
            out.write(''.join(str(phi[i, j]) + ' ' for j in word_indices) + '\n')
def theta_sperator(theta_after_path=constant.lda_theta_after, sperator_path=constant.lda_fcn):
    """Split a theta matrix file into one file per column and return the matrix.

    The file at ``theta_after_path`` is read as one row per line with values
    separated by single spaces. For every column index ``j`` below
    ``constant.lda_topic_count`` a file named ``sperator_path + str(j)`` is
    written; it holds one line ``"<doc> <value>"`` for each row whose value
    in that column is non-zero, where ``<doc>`` is looked up by row index in
    ``constant.file.get_docmap(True)``.

    Args:
        theta_after_path: path of the input matrix file.
        sperator_path: path prefix of the per-column output files.

    Returns:
        The ``sparse.lil_matrix`` of shape (doc count, topic count) that was
        read from the input file.
    """
    n_topics = constant.lda_topic_count
    n_docs = constant.file.get_doc_count()
    theta = sparse.lil_matrix((n_docs, n_topics), dtype=float)

    with open(theta_after_path) as source:
        for doc_idx, raw_line in enumerate(source):
            weights = [float(token) for token in raw_line.strip().split(' ')]
            for topic_idx, weight in enumerate(weights):
                theta[doc_idx, topic_idx] = weight

    doc_names = constant.file.get_docmap(True)
    for topic_idx in xrange(n_topics):
        with open(sperator_path + str(topic_idx), 'w') as sink:
            for doc_idx in xrange(n_docs):
                weight = theta[doc_idx, topic_idx]
                if weight == 0:
                    # Zero entries are left out of the per-column file.
                    continue
                sink.write(doc_names[doc_idx] + ' ' + str(weight) + '\n')
    return theta
def print_model(model):
    """Write the model's theta and phi matrices to their text files.

    ``model.doc_topic_`` goes to ``constant.lda_theta`` and
    ``model.topic_word_`` goes to ``constant.lda_phi``. Each matrix row
    becomes one line, with every value followed by a single space.

    Args:
        model: object exposing ``doc_topic_`` and ``topic_word_`` as iterables
            of rows (presumably a fitted LDA model; confirm with the caller).
    """
    def dump(matrix, path):
        # One output line per row; every value is followed by a space.
        with open(path, 'w') as out:
            for row in matrix:
                out.write(''.join(str(value) + ' ' for value in row))
                out.write('\n')

    # Output theta
    dump(model.doc_topic_, constant.lda_theta)
    # Output phi
    dump(model.topic_word_, constant.lda_phi)
def print_listmatrix(matrix, file_name, need_title_count=True):
    """Write a matrix to ``file_name`` in a sparse ``index:value`` text format.

    Each row becomes one line that lists only its non-zero entries as
    ``"<column>:<value> "`` (note the trailing space). The column count is
    taken from the first row; every row is read up to that many columns.
    While writing, the fraction of rows processed is echoed to stdout,
    followed by a carriage return so the console line is overwritten.

    Args:
        matrix: sequence of rows supporting ``len()`` and integer indexing
            (e.g. a list of lists). An empty matrix is written as 0 columns.
        file_name: path of the output file (overwritten).
        need_title_count: when true, the first line of the file is
            ``"<rows>*<cols>"``.

    Raises:
        IndexError: if a row is shorter than the first row.
    """
    # Local import keeps this function self-contained; sys.stdout.write emits
    # the same bytes as the former Python-2-only `print ... ,` statement.
    import sys

    row = len(matrix)
    # An empty matrix has no first row to measure.
    col = len(matrix[0]) if row else 0
    col_indices = range(col)  # built once, reused for every row

    with open(file_name, 'w') as out:
        if need_title_count:
            out.write(str(row) + '*' + str(col) + '\n')
        for i in range(row):
            sys.stdout.write(str(float(i) / row) + '\r')
            current = matrix[i]
            entries = []
            for j in col_indices:
                val = current[j]
                if val != 0:
                    entries.append(str(j) + ':' + str(val) + ' ')
            out.write(''.join(entries) + '\n')

    # Drops only this function's reference; the matrix itself is freed only
    # if the caller holds no other reference to it.
    del matrix
    gc.collect()
def __print_info():
    """Write the word map and the corpus summary files.

    Every distinct word found in ``constant.file.get_relation()`` (an
    iterable of word sequences) is given an integer id in order of first
    appearance, starting at 0. ``constant.data_wordmap`` then receives the
    number of distinct words on its first line, followed by one
    ``"<word> <id>"`` line per word in the dict's iteration order.
    ``constant.data_other`` receives a JSON object with the keys ``ndocs``
    and ``nwords``.
    """
    relation = constant.file.get_relation()
    word_map = {}
    for line in relation:
        for word in line:
            if word not in word_map:
                # Ids are handed out densely, so the current size of the map
                # is always the next unused id.
                word_map[word] = len(word_map)

    # Output the word map
    with open(constant.data_wordmap, 'w') as out:
        out.write(str(len(word_map)) + '\n')
        for word, index in word_map.items():
            out.write(word + ' ' + str(index) + '\n')

    # Output the other information
    with open(constant.data_other, 'w') as out:
        # NOTE(review): nwords comes from constant.file.get_wordmap(), not
        # from the word_map built above; confirm the two are meant to agree.
        other = {}
        other['ndocs'] = len(constant.file.get_docmap())
        other['nwords'] = len(constant.file.get_wordmap())
        out.write(json.dumps(other, indent=4))