def main(fpath, out_folder, use_idf=True):
    """Vectorize the songs in ``fpath`` at several filter levels and dump them.

    For each ``bottom_filter`` threshold in (0, 0.05, 0.1, 0.5) a sub-folder
    named after the threshold is created under ``out_folder`` and two text
    files are written into it:

    * ``tags``  -- one line per matrix row that has non-zero entries; each
      entry is written as ``col:value`` (value with 6 decimals) followed by
      a space.  Rows are separated by a newline and there is no trailing
      newline.  A matrix without non-zero entries yields an empty file.
    * ``vocab`` -- one ``term term_id`` pair per line.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
        out_folder: Directory that receives one sub-folder per threshold.
            ``os.mkdir`` does not create missing parents, so it must exist.
        use_idf: Forwarded to ``vectorize_songs`` as ``use_idf``.

    Raises:
        OSError: From ``os.mkdir`` when a threshold sub-folder already
            exists or ``out_folder`` is missing.
    """
    for min_filter in 0, 0.05, 0.1, 0.5:
        folder = os.path.join(out_folder, str(min_filter))
        os.mkdir(folder)

        out_fpath_tags = os.path.join(folder, 'tags')
        out_fpath_vocab = os.path.join(folder, 'vocab')

        # NOTE(review): doc_mat is presumably a scipy sparse matrix and
        # vocabulary a term -> column-id mapping; confirm in vectorize_songs.
        doc_mat, vocabulary = vectorize_songs(fpath, use_idf=use_idf,
                                              bottom_filter=min_filter)

        with open(out_fpath_tags, 'w') as tags_file:
            rows, cols = doc_mat.nonzero()

            # A None sentinel (instead of rows[0]) keeps an all-zero matrix
            # from raising IndexError; the output for non-empty matrices is
            # unchanged.
            last_row = None
            for row, col in zip(rows, cols):
                if last_row is not None and row != last_row:
                    # Entries must arrive grouped by increasing row index,
                    # otherwise one song would be split over several lines.
                    assert row > last_row
                    print(file=tags_file)
                last_row = row
                print('%d:%.6f' % (col, doc_mat[row, col]),
                      file=tags_file, end=' ')

        with open(out_fpath_vocab, 'w') as vocab_file:
            for term in vocabulary:
                print(term, vocabulary[term], file=vocab_file)
def main(fpath):
    """Plot the CCDF of the number of tags per song.

    The document matrix returned by ``vectorize_songs(fpath)`` is reduced to
    the count of non-zero entries in each row.  A percentile score of those
    counts and the matrix shape are printed, then the complementary CDF of
    the counts is shown on log-log axes.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
    """
    doc_mat = vectorize_songs(fpath)[0]
    rows = doc_mat.nonzero()[0]

    # Materialise the counts: on Python 3 ``.values()`` is a dict view, which
    # array-based consumers do not treat as a 1-D sequence of numbers.
    to_plot = list(Counter(rows).values())

    # NOTE(review): ecdf is presumably returning (sorted values, cumulative
    # probabilities) with cdf_y as an array -- confirm against its definition.
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): if ``stats`` is scipy.stats, the second argument is a
    # percentile in [0, 100], so 0.1 is the 0.1th percentile -- confirm that
    # this (and not the 10th) is intended.
    print(stats.scoreatpercentile(to_plot, 0.1))
    print(doc_mat.shape)

    ax = plt.gca()
    ax.set_yscale("log")
    ax.set_xscale("log")
    plt.plot(x, ccdf_y, "bo")
    plt.xlabel("Number Tags per Song (x)")
    plt.ylabel("Prob(Num. Tags per Song > x)")
    plt.title("CCDF of Tags per Song")
    plt.show()
def main(fpath):
    """Plot the CCDF of tag popularity (number of songs carrying each tag).

    The document matrix returned by ``vectorize_songs(fpath)`` is reduced to
    the count of non-zero entries in each column.  A percentile score of
    those counts and the matrix shape are printed, then the complementary
    CDF of the counts is shown on log-log axes.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
    """
    doc_mat = vectorize_songs(fpath)[0]
    cols = doc_mat.nonzero()[1]

    # Materialise the counts: on Python 3 ``.values()`` is a dict view, which
    # array-based consumers do not treat as a 1-D sequence of numbers.
    to_plot = list(Counter(cols).values())

    # NOTE(review): ecdf is presumably returning (sorted values, cumulative
    # probabilities) with cdf_y as an array -- confirm against its definition.
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): if ``stats`` is scipy.stats, the second argument is a
    # percentile in [0, 100], so 0.5 is the 0.5th percentile -- confirm that
    # this (and not the median) is intended.
    print(stats.scoreatpercentile(to_plot, 0.5))
    print(doc_mat.shape)

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xscale('log')
    plt.plot(x, ccdf_y, 'bo')
    plt.xlabel('Number of songs with tag (x)')
    plt.ylabel('Prob(Num. Songs with Tag > x)')
    plt.title('CCDF of Tag Popularity')
    plt.show()