# Example 1
def read_lyrics(lyrics_dir='lyrics_en', artist=None, album=None,
                print_stats=False, language='en-us', lookback=15):
    """
    Read lyrics and compute Rhyme factor (riimikerroin) for each artist.

    Walks lyrics_dir/<artist>/<album>/*.txt, builds one Lyrics object per
    song and prints three rankings to stdout: the longest rhymes found, the
    ten songs with the highest average rhyme length, and all analyzed artists
    ordered by their mean average rhyme length.

    Input:
        lyrics_dir  Path to the directory containing the lyrics.
        artist      Name of the artist directory under lyrics_dir (if this is
                    not provided, all artists are analyzed).
        album       Name of the album directory under lyrics_dir/artist/
                    (if this is not provided, all albums are analyzed, in the
                    order given by sort_albums_by_year).
        print_stats Whether we print summary statistics for each individual
                    song (passed through to Lyrics).
        language    Use either Finnish (fi), American English (en-us),
                    or English (en).
        lookback    How many previous words are checked for rhymes. For
                    Finnish I've used 10 and for English 15.

    Returns nothing; every result is printed.

    Note: print is always called with a single pre-formatted string so the
    output is the same under Python 2 and Python 3.
    """
    if artist is not None:
        artists = [artist]
    else:
        artists = os.listdir(lyrics_dir)

    artist_scores = []
    song_scores = []
    song_names = []
    uniq_words = []
    # Min-heap of the tuples returned by Lyrics.get_longest_rhyme(); keeping
    # it capped at max_rhymes entries leaves the max_rhymes largest ones.
    longest_rhymes = []
    max_rhymes = 5
    # Vocabulary is compared on a fixed-size prefix so that artists with more
    # lyrics do not automatically get a larger count.
    min_w = 20000
    # Compiled once instead of once per song.
    non_word = re.compile(u'[^\\wåäö]+')

    for a in artists:
        print("Analyzing artist: %s" % a)
        rls = []
        all_words = []
        if album is not None:
            albums = [album]
        else:
            albums = os.listdir(os.path.join(lyrics_dir, a))
            albums = sort_albums_by_year(albums)

        for al in albums:
            album_dir = os.path.join(lyrics_dir, a, al)
            # Only the .txt files
            songs = [s for s in os.listdir(album_dir)
                     if len(s) >= 4 and s[-4:] == '.txt']
            for song in songs:
                file_name = os.path.join(album_dir, song)
                lyrics = Lyrics(file_name, print_stats=print_stats,
                                language=language, lookback=lookback)
                rl = lyrics.get_avg_rhyme_length()
                rls.append(rl)
                song_scores.append(rl)
                song_names.append(file_name)

                # Insert each song's longest rhyme exactly once; once the
                # heap is full, the smallest entry is evicted.
                longest = lyrics.get_longest_rhyme()
                if len(longest_rhymes) < max_rhymes:
                    heapq.heappush(longest_rhymes, longest)
                else:
                    heapq.heappushpop(longest_rhymes, longest)

                if language == 'fi':
                    all_words += lyrics.text.split()
                else:
                    text = non_word.sub(' ', lyrics.text_orig.lower())
                    all_words += text.split()

        # Compute the number of unique words the artist has used; -1 marks an
        # artist with fewer than min_w words in total.
        if len(all_words) >= min_w:
            uniq_words.append(len(set(all_words[:min_w])))
        else:
            uniq_words.append(-1)
        # An artist without any song gets NaN (what np.mean would return for
        # an empty array, minus the RuntimeWarning).
        mean_rl = np.mean(np.array(rls)) if rls else float('nan')
        artist_scores.append(mean_rl)

    # Sort the artists based on their avg rhyme lengths
    artist_scores = np.array(artist_scores)
    artists = np.array(artists)
    uniq_words = np.array(uniq_words)
    order = np.argsort(artist_scores)[::-1]
    artists = artists[order]
    # Kept aligned with `artists`; not part of the printed report.
    uniq_words = uniq_words[order]
    artist_scores = artist_scores[order]

    print("\nBest rhymes")
    while longest_rhymes:
        _length, rhyme = heapq.heappop(longest_rhymes)
        print(rhyme)

    print("\nBest songs:")
    song_scores = np.array(song_scores)
    song_order = np.argsort(song_scores)[::-1]
    for idx in song_order[:10]:
        print('%.3f\t%s' % (song_scores[idx], song_names[idx]))

    print("\nBest artists:")
    for rank, (score, name) in enumerate(zip(artist_scores, artists), 1):
        # Directory names use underscores in place of spaces.
        print('%d.\t%.3f\t%s' % (rank, score, name.replace('_', ' ')))
# Example 2
def read_lyrics(lyrics_dir='lyrics',
                print_stats=False, language='en-us', lookback=30):
    """
    Read lyrics and write the rhyme statistics of every song to a CSV file.

    Walks lyrics_dir/<artist>/*.txt, builds one Lyrics object per song and
    writes one row per song to 'raplyzer_out.csv' in the current working
    directory (the file is overwritten at the start of every call).

    Input:
        lyrics_dir  Path to the directory containing the lyrics, with one
                    sub-directory per artist.
        print_stats Whether we print summary statistics for each individual
                    song (passed through to Lyrics).
        language    Use either Finnish (fi), American English (en-us),
                    or English (en).
        lookback    How many previous words are checked for rhymes.

    Output columns: Artist, Song, Longest Rhyme Length, Average Rhyme.
    A song whose file could not be analyzed is written with -1 in both
    numeric columns.

    Returns nothing.
    """
    out_path = 'raplyzer_out.csv'
    # One shared dialect so the header and the data rows end lines the same
    # way.
    csv_options = dict(delimiter=',', lineterminator='\n',
                       quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # Set up CSV file to add the stats of each song to.
    # Binary mode is what the Python 2 csv module expects.
    with open(out_path, 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, **csv_options)
        csvwriter.writerow(["Artist", "Song", "Longest Rhyme Length",
                            "Average Rhyme"])

    for a in os.listdir(lyrics_dir):
        print("Analyzing artist: %s" % a)

        songs = os.listdir(lyrics_dir + '/' + a)
        songs = [s for s in songs if len(s) > 4 and s[-4:] == '.txt']
        for song in songs:
            file_name = lyrics_dir + '/' + a + '/' + song
            try:
                lyrics = Lyrics(file_name, print_stats=print_stats,
                                language=language, lookback=lookback)
                long_r = lyrics.get_longest_rhyme()
                avg_r = lyrics.get_avg_rhyme_length()
                print("\n%s -- %s" % (a, song))
            except Exception:
                # Exception reading the file, scrap it and move on
                print('Exception reading file  %s' % file_name)
                print('\tException: %s' % sys.exc_info()[0])
                long_r = (-1, "")
                avg_r = -1

            # Add the statistics to the csv file. The file is reopened in
            # append mode for every song, so rows already written survive a
            # later crash.
            # NOTE(review): failed songs are written too, using the -1
            # placeholders assigned above - confirm this is the intended
            # behaviour.
            with open(out_path, 'ab') as csvfile:
                csvwriter = csv.writer(csvfile, **csv_options)
                csvwriter.writerow([a, song, long_r[0], avg_r])
# Example 3
def read_lyrics(lyrics_dir='lyrics_en', artist=None, album=None,
                print_stats=False, language='en-us', lookback=15):
    """
    Read lyrics and compute Rhyme factor (riimikerroin) for each artist.

    Walks lyrics_dir/<artist>/<album>/*.txt, builds one Lyrics object per
    song and prints three rankings to stdout: the longest rhymes found, the
    ten songs with the highest average rhyme length, and all analyzed artists
    ordered by their mean average rhyme length.

    Input:
        lyrics_dir  Path to the directory containing the lyrics.
        artist      Name of the artist directory under lyrics_dir (if this is
                    not provided, all artists are analyzed).
        album       Name of the album directory under lyrics_dir/artist/
                    (if this is not provided, all albums are analyzed, in the
                    order given by sort_albums_by_year).
        print_stats Whether we print summary statistics for each individual
                    song (passed through to Lyrics).
        language    Use either Finnish (fi), American English (en-us),
                    or English (en).
        lookback    How many previous words are checked for rhymes. For
                    Finnish I've used 10 and for English 15.

    Returns nothing; every result is printed.

    Note: print is always called with a single pre-formatted string so the
    output is the same under Python 2 and Python 3.
    """
    if artist is not None:
        artists = [artist]
    else:
        artists = os.listdir(lyrics_dir)

    artist_scores = []
    song_scores = []
    song_names = []
    uniq_words = []
    # Min-heap of the tuples returned by Lyrics.get_longest_rhyme(); keeping
    # it capped at max_rhymes entries leaves the max_rhymes largest ones.
    longest_rhymes = []
    max_rhymes = 5
    # Vocabulary is compared on a fixed-size prefix so that artists with more
    # lyrics do not automatically get a larger count.
    min_w = 20000
    # Compiled once instead of once per song.
    non_word = re.compile(u'[^\\wåäö]+')

    for a in artists:
        print("Analyzing artist: %s" % a)
        rls = []
        all_words = []
        if album is not None:
            albums = [album]
        else:
            albums = os.listdir(os.path.join(lyrics_dir, a))
            albums = sort_albums_by_year(albums)

        for al in albums:
            album_dir = os.path.join(lyrics_dir, a, al)
            # Only the .txt files
            songs = [s for s in os.listdir(album_dir)
                     if len(s) >= 4 and s[-4:] == '.txt']
            for song in songs:
                file_name = os.path.join(album_dir, song)
                lyrics = Lyrics(file_name, print_stats=print_stats,
                                language=language, lookback=lookback)
                rl = lyrics.get_avg_rhyme_length()
                rls.append(rl)
                song_scores.append(rl)
                song_names.append(file_name)

                # Insert each song's longest rhyme exactly once; once the
                # heap is full, the smallest entry is evicted.
                longest = lyrics.get_longest_rhyme()
                if len(longest_rhymes) < max_rhymes:
                    heapq.heappush(longest_rhymes, longest)
                else:
                    heapq.heappushpop(longest_rhymes, longest)

                if language == 'fi':
                    all_words += lyrics.text.split()
                else:
                    text = non_word.sub(' ', lyrics.text_orig.lower())
                    all_words += text.split()

        # Compute the number of unique words the artist has used; -1 marks an
        # artist with fewer than min_w words in total.
        if len(all_words) >= min_w:
            uniq_words.append(len(set(all_words[:min_w])))
        else:
            uniq_words.append(-1)
        # An artist without any song gets NaN (what np.mean would return for
        # an empty array, minus the RuntimeWarning).
        mean_rl = np.mean(np.array(rls)) if rls else float('nan')
        artist_scores.append(mean_rl)

    # Sort the artists based on their avg rhyme lengths
    artist_scores = np.array(artist_scores)
    artists = np.array(artists)
    uniq_words = np.array(uniq_words)
    order = np.argsort(artist_scores)[::-1]
    artists = artists[order]
    # Kept aligned with `artists`; not part of the printed report.
    uniq_words = uniq_words[order]
    artist_scores = artist_scores[order]

    print("\nBest rhymes")
    while longest_rhymes:
        _length, rhyme = heapq.heappop(longest_rhymes)
        print(rhyme)

    print("\nBest songs:")
    song_scores = np.array(song_scores)
    song_order = np.argsort(song_scores)[::-1]
    for idx in song_order[:10]:
        print('%.3f\t%s' % (song_scores[idx], song_names[idx]))

    print("\nBest artists:")
    for rank, (score, name) in enumerate(zip(artist_scores, artists), 1):
        # Directory names use underscores in place of spaces.
        print('%d.\t%.3f\t%s' % (rank, score, name.replace('_', ' ')))