Example #1
0
def tfidf_counts(lyrics=clean_lyrics, n_samples=20, max_feats=MXF):
    '''
    Print and return the terms with the highest mean TF-IDF weight.

    A single TfidfVectorizer is fit on ``lyrics``; each term's weight is
    the mean of its column in the resulting TF-IDF matrix. The terms are
    ranked by that mean in descending order and the top ``n_samples`` rows
    are printed as a pipe-format table and returned.

    Parameters
    ----------
    lyrics : documents passed straight to ``TfidfVectorizer.fit_transform``
        (presumably an iterable of text strings -- confirm against the
        caller). Defaults to the module-level ``clean_lyrics``, bound when
        this function is defined.
    n_samples : int
        Number of top-ranked rows to print and return.
    max_feats : int or None
        Forwarded as ``max_features`` to the vectorizer. Defaults to the
        module-level ``MXF``, bound when this function is defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``weight``, indexed by rank starting at 1,
        limited to the first ``n_samples`` rows.
    '''
    # Fit once and reuse the same vectorizer for both the matrix and the
    # vocabulary, so the column order of the weights is guaranteed to match
    # the order of the feature names (and the corpus is not processed twice).
    vectorizer = TfidfVectorizer(max_features=max_feats)
    tfidf_weights = vectorizer.fit_transform(lyrics)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older installations.
    feature_names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names_getter is None:
        feature_names_getter = vectorizer.get_feature_names
    words = list(feature_names_getter())

    # Mean over axis 0 collapses the rows, leaving one value per term.
    weights = np.asarray(tfidf_weights.mean(axis=0)).ravel().tolist()

    weights_df = pd.DataFrame({'word': words, 'weight': weights})
    sort_df = weights_df.sort_values(by='weight',
                                     ascending=False).reset_index(drop=True)
    # Use a 1-based index so it reads as a rank in the printed table.
    sort_df.index = np.arange(1, len(sort_df) + 1)

    top_rows = sort_df.head(n_samples)
    print(
        tabulate(top_rows,
                 headers=weights_df.columns,
                 tablefmt='pipe'))
    return top_rows