def main(tfidf=True):
    """Build the author-by-term and year-by-term matrices from the bib data.

    Args:
        tfidf: When true, both matrices are passed through
            ``count_transform`` (presumably a TF-IDF style reweighting --
            confirm against that helper).

    NOTE(review): in the code visible here the resulting matrices are
    neither returned nor used afterwards; the function may continue beyond
    this snippet.
    """
    records = get_bib_data()

    # Each helper hands back four values; only the matrices are used below.
    author_term, authors, author_terms, author_counts = get_author_by_term_mat(
        records, tfreq=5, afreq=10
    )
    year_term, years, year_terms, year_counts = get_year_by_term_mat(
        records, freq=5
    )

    if not tfidf:
        return

    author_term = count_transform(author_term)
    year_term = count_transform(year_term)
def main(url_file, use_tfidf=True):
    """Cluster sites by their word usage and save a hierarchical-cluster plot.

    Args:
        url_file: Passed straight to ``make_data.get_sites_words``
            (presumably a file listing the site URLs -- confirm).
        use_tfidf: When true, the site-by-word matrix is run through
            ``count_transform`` before clustering; otherwise the raw matrix
            is clustered as-is.
    """
    word_counts, site_names, site_urls = make_data.get_sites_words(url_file)
    site_word_mat, vocabulary = make_data.make_site_by_word_mat(
        word_counts, site_names, freq=5, percent=0.7
    )

    features = count_transform(site_word_mat) if use_tfidf else site_word_mat

    # NOTE(review): labels are hard-coded; they presumably must line up with
    # the row order of the site-by-word matrix -- verify against url_file.
    site_labels = [
        'Normal Deviate',
        'MLTheory',
        'CNET',
        'BBC',
        'CNN',
        'JP',
        'CNN-Tech',
        'TechReview',
        'NYT-Tech',
        'Time-World',
        'Mark-Reid',
    ]

    # NOTE(review): the figure name mentions "tfidf" regardless of use_tfidf.
    clustering(
        features,
        site_labels,
        algo='hcluster',
        figname='hcluster_site_by_word_tfidf.png',
    )