コード例 #1
0
def main(tfidf=True):
    """Build the author-by-term and year-by-term matrices from the bibliography data.

    Both matrices are derived from the same ``get_bib_data()`` result. When
    ``tfidf`` is truthy, each matrix is replaced by the output of
    ``count_transform``.

    Args:
        tfidf: If truthy, pass both matrices through ``count_transform``
            (presumably a TF-IDF re-weighting, judging by the flag name --
            confirm against that helper).
    """
    records = get_bib_data()

    # Author matrix: the helper is given a term threshold (tfreq) and an
    # author threshold (afreq); their exact meaning lives in the helper.
    at_mat, authors, term_list1, authors_cnt = get_author_by_term_mat(
        records, tfreq=5, afreq=10
    )
    # Year matrix: built from the same records with a single threshold.
    yt_mat, years, term_list2, years_cnt = get_year_by_term_mat(records, freq=5)

    if tfidf:
        at_mat, yt_mat = count_transform(at_mat), count_transform(yt_mat)
コード例 #2
0
def main(url_file, use_tfidf=True):
    """Cluster sites by their word usage and save the resulting figure.

    Reads the site/word data for ``url_file`` through ``make_data``, builds a
    site-by-word matrix, optionally transforms it with ``count_transform``,
    and hands it to ``clustering`` with ``algo='hcluster'``.

    Args:
        url_file: Passed unchanged to ``make_data.get_sites_words``.
        use_tfidf: If truthy, cluster the ``count_transform`` output instead
            of the raw site-by-word matrix.
    """
    # Hard-coded display names handed to clustering().
    # NOTE(review): presumably these must line up with the row order of the
    # matrix -- confirm against make_data.
    site_labels = [
        'Normal Deviate', 'MLTheory', 'CNET', 'BBC', 'CNN', 'JP',
        'CNN-Tech', 'TechReview', 'NYT-Tech', 'Time-World', 'Mark-Reid',
    ]

    word_cnt, sites, site_urls = make_data.get_sites_words(url_file)
    sw_mat, word_list = make_data.make_site_by_word_mat(
        word_cnt, sites, freq=5, percent=0.7
    )

    features = count_transform(sw_mat) if use_tfidf else sw_mat

    # NOTE(review): the figure name says "tfidf" even when use_tfidf is False.
    clustering(
        features,
        site_labels,
        algo='hcluster',
        figname='hcluster_site_by_word_tfidf.png',
    )