Example #1
import json
import os

import nltk

from w8m8 import progressbar


def read_data(args):
    with open(args.file) as f:
        data = json.load(f)
    analyzer = {genre: {'chars': [], 'words': [], 'unique': []} for genre in args.genres}

    cntr = 0
    # Analyze length of song/number of unique words
    for idx, (artist, song, genre) in data.items():
        progressbar((int(idx)+1)/len(data))
        if genre not in args.genres:
            continue
        file_path = os.path.join('lyrics', '{}~{}'.format(artist.replace('/', ''), song.replace('/', '')))
        if os.path.exists(file_path):
            with open(file_path) as lyrics_file:
                lyrics = lyrics_file.read()
            tokenized_lyrics = nltk.wordpunct_tokenize(lyrics)

            analyzer[genre]['chars'].append(len(lyrics))
            analyzer[genre]['words'].append(len(tokenized_lyrics))
            analyzer[genre]['unique'].append(len(set(tokenized_lyrics)))

            cntr += 1
    print()
    print('Analyzed {} songs'.format(cntr))

    hist_data = []
    bin_data = []
    for genre in args.genres:
        hist_data.append(analyzer[genre][args.type])
        bin_data += analyzer[genre][args.type]

    return hist_data, bin_data
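
A minimal usage sketch for read_data (the flag names and the plotting code below are assumptions inferred from how args and the return values are used above, not part of the project):

import argparse

import matplotlib.pyplot as plt
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument('--file', default='billboard.json')
parser.add_argument('--genres', nargs='+', default=['pop', 'rock'])
parser.add_argument('--type', choices=['chars', 'words', 'unique'], default='words')
args = parser.parse_args()

hist_data, bin_data = read_data(args)
# bin_data pools all genres, so every genre shares the same bin edges
bins = np.histogram_bin_edges(bin_data, bins=50)
plt.hist(hist_data, bins=bins, label=args.genres)
plt.xlabel('{} per song'.format(args.type))
plt.legend()
plt.show()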
Example #2
import json
import os

from w8m8 import progressbar


def analyze_file(args):
    '''
    Takes a project file and goes through all lyrics in it.
    If a lyrics file doesn't exist, or if the lyrics are in a language
    other than English, the entry is removed from the JSON file.
    '''
    with open(args.file) as f:
        data = json.load(f)
    print()
    print('Analyzing file: "{}"'.format(args.file))
    print('Number of songs: {}'.format(len(data)))
    print()

    failed = []
    new_data = {}
    valid_cntr = 0
    for idx, (artist, song, genre) in data.items():
        file_path = os.path.join(
            args.folder_name, '{}~{}'.format(artist.replace('/', ''),
                                             song.replace('/', '')))
        progressbar((int(idx) + 1) / len(data))
        if os.path.exists(file_path):
            with open(file_path) as lyrics_file:
                lyrics = lyrics_file.read()

            unique_words = set(lyrics.replace('\n', ' ').split(' '))
            if 'instrumental' in lyrics.lower():
                if len(unique_words) < 5:
                    failed.append(['instrumental', artist + song, lyrics])
                    continue
            if len(lyrics) < 100:
                failed.append(['fewer than 100 chars', artist + song, lyrics])
                continue

            language = get_language(lyrics)
            if language != 'english':
                failed.append(['non-english', artist + song, lyrics])
                continue

            new_data[valid_cntr] = [artist, song, genre]
            valid_cntr += 1
        else:
            failed.append(['file not found', artist + song, ''])

    failed.sort()

    if failed:
        print()
        print('Songs failed, removed: {}'.format(len(failed)))
    for reason, name, lyrics in failed:
        print('  "{}": "{}"'.format(reason, name))

    print()
    print('Number of songs before: {}'.format(len(data)))
    print('Number of songs after : {}'.format(len(new_data)))

    with open(args.output_file, 'w') as f:
        json.dump(new_data, f, indent=4, sort_keys=True)
    print('Dumped in file: "{}"'.format(args.output_file))
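
The get_language helper isn't shown in these examples; a minimal sketch of one common approach, scoring each NLTK stopword list against the lyrics (an assumption, not necessarily the project's actual implementation):

import nltk
from nltk.corpus import stopwords  # requires nltk.download('stopwords')


def get_language(text):
    # Guess the language whose stopword list overlaps the text the most
    words = set(nltk.wordpunct_tokenize(text.lower()))
    scores = {language: len(words & set(stopwords.words(language)))
              for language in stopwords.fileids()}
    return max(scores, key=scores.get)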
Example #3
import ast
import json
import os

from w8m8 import progressbar


def get_lyrics_from_url():
    '''
    From the file "db_file", download and store all lyrics in the folder "folder_name"
    '''
    with open(args.db_file) as f:
        data = json.load(f)
    num_songs = len(data)
    print('File: {} with: {} songs'.format(args.db_file, num_songs))
    print()
    print()
    cntr = 0
    failed = []
    for key, url in data.items():
        artist, song = ast.literal_eval(key)  # key is str((artist, song))
        cntr += 1
        progressbar(cntr / num_songs, '"{}"'.format(artist),
                    '"{}"'.format(song))
        print('\033[F {} found, {} failed, '.format(cntr - len(failed),
                                                    len(failed)))

        song_path = os.path.join(
            args.folder_name,
            artist.replace('/', '') + '~' + song.replace('/', ''))
        # File already processed
        if os.path.exists(song_path):
            continue

        # No valid url
        if url in ['', 'fail', 'manual', 'none']:
            failed.append([url, key])
            continue

        lyrics = get_lyrics_url(url)
        with open(song_path, 'w') as f:
            f.write(lyrics)

    print()
    for code, name in failed[:50]:
        print('Failed: {}: {}'.format(code, name))
    if len(failed) > 50:
        print('Showing only 50 first, {} in total'.format(len(failed)))
    print('Number of failed: {}'.format(len(failed)))
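
The get_lyrics_url helper isn't shown; a minimal sketch assuming the genius.com markup of the time, where lyrics sat in a div with class "lyrics" (an assumption about the site's HTML, which changes over time):

import requests
from bs4 import BeautifulSoup


def get_lyrics_url(url):
    # Fetch the page and pull the plain text out of the lyrics container
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    lyrics_div = soup.find('div', {'class': 'lyrics'})
    return lyrics_div.get_text().strip() if lyrics_div else ''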
Example #4
import requests

import w8m8


def scrape(n, corpus=None):
    # Avoid a mutable default argument: a default list shared between calls
    # would keep growing across invocations
    if corpus is None:
        corpus = []

    matches_url = 'https://api.opendota.com/api/explorer?sql=select%20match_id,radiant_win,duration,avg_mmr%20from%20public_matches%20where%20duration%20%3E%20900%20and%20lobby_type%20=%207%20and%20avg_mmr%20%3E%203000%20order%20by%20match_id%20desc%20limit%2010000'

    response = requests.get(matches_url)
    data = response.json()
    c = 0
    for i, row in enumerate(data['rows']):
        try:
            chat = get_chatlog(row['match_id'])
        except Exception:
            chat = {}

        if chat:
            corpus.append((row['match_id'], row['radiant_win'],
                           row['duration'], row['avg_mmr'], chat))
            c += 1
        w8m8.progressbar(c / n)
        if c == n:
            break

    return corpus
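
The get_chatlog helper isn't shown; a minimal sketch against OpenDota's match endpoint (the presence and exact shape of the 'chat' field in the response is an assumption about the API):

import requests


def get_chatlog(match_id):
    # Full match details; parsed matches include a 'chat' list of chat events
    response = requests.get(
        'https://api.opendota.com/api/matches/{}'.format(match_id))
    response.raise_for_status()
    return response.json().get('chat') or {}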
Example #5
import ast
import json

from w8m8 import progressbar


def find_url_for_songs():
    '''
    Use the genius API to find urls for the songs in "db_file"
    '''
    fix_failed = 'fix_failed' if args.fix_failed else ''

    with open(args.db_file) as f:
        data = json.load(f)
    num_songs = len(data)
    print('File: {} with: {} songs'.format(args.db_file, num_songs))
    print()
    print()
    cntr = 0
    failed = []

    try:
        for key, url in data.items():
            artist, song = ast.literal_eval(key)  # key is str((artist, song))
            cntr += 1
            print('\033[F {} found, {} failed, '.format(
                cntr - len(failed), len(failed)))
            progressbar(cntr / num_songs, '"{}"'.format(artist),
                        '"{}"'.format(song))
            if (url in ['manual', 'none']) or (url == 'fail'
                                               and not args.fix_failed):
                failed.append([artist, song])
                continue

            if 'genius.com' in url:
                continue

            # Strip featured artists so the search query only names the main act
            artist_no_feature = artist
            if args.ignore_feat:
                for separator in (' Featuring', ' With', ' &'):
                    if separator in artist:
                        artist_no_feature = artist[:artist.index(separator)]
                        break
            url, q_artist, q_song = get_url_from_name(
                artist_no_feature, song, fix_failed, args.lev)
            if url in ['fail', 'manual']:
                failed.append([artist, song])

            data[key] = url
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # Report the error but fall through so the data gathered so far is dumped
        print(e)

    # Dump data
    with open(args.db_file, 'w') as f:
        json.dump(data, f, indent=4)

    print()
    for fail in failed[:50]:
        print('Url not found:', fail)
    if len(failed) > 50:
        print('Showing only 50 first, {} in total'.format(len(failed)))
    print()
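
The get_url_from_name helper isn't shown; a minimal sketch against the Genius search API, accepting a hit when both names fall within a Levenshtein distance bound (the meaning of lev, the unused fix_failed parameter, the token handling, and the 'fail' return convention are assumptions inferred from the caller above):

import os

import Levenshtein  # pip install python-Levenshtein
import requests


def get_url_from_name(artist, song, fix_failed, lev):
    # fix_failed is accepted for signature parity; its role isn't shown here.
    # The API token is assumed to live in an environment variable.
    headers = {'Authorization': 'Bearer {}'.format(os.environ['GENIUS_TOKEN'])}
    response = requests.get('https://api.genius.com/search',
                            params={'q': '{} {}'.format(artist, song)},
                            headers=headers)
    for hit in response.json()['response']['hits']:
        result = hit['result']
        q_artist = result['primary_artist']['name']
        q_song = result['title']
        # Accept the first hit where both artist and title are close enough
        if (Levenshtein.distance(artist.lower(), q_artist.lower()) <= lev and
                Levenshtein.distance(song.lower(), q_song.lower()) <= lev):
            return result['url'], q_artist, q_song
    return 'fail', '', ''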
Example #6
import json
from collections import Counter

import requests
from bs4 import BeautifulSoup

from w8m8 import progressbar


def scrape_billboard(billboard_file='billboard-links.json', db_file_name='url-db.json', genre_file_name='billboard.json'):
    '''
    From the 'billboard_file' (which contains urls)
    Scrape all songs (artist, song) and store in 'db_file_name'
    Scrape all songs (artist, song, genre) and store in 'genre_file_name'
    Will not overwrite old entries in 'db_file_name'
    '''
    print('Scraping billboard webpage')
    with open(billboard_file) as f:
        billboard_links = json.load(f)

    try:
        with open(db_file_name) as f:
            url_data_db = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        input('Missing or broken file "{}", continue with new file?'.format(db_file_name))
        url_data_db = {}

    print()
    num_urls = sum(len(urls) for urls in billboard_links.values())
    genre_data = {}
    cntr, url_cntr = 0, 0
    failed_links = []
    working_archive = set()
    # Billboard served two year-end chart markups over time; each entry maps to
    # [row container class, title class, artist class] (stored tag names unused)
    chart_items = {
        False: [['ye-chart-item__text', 'div'], ['ye-chart-item__title', 'div'], ['ye-chart-item__artist', 'div']],
        True: [['ye-chart__item-text', 'div'], ['ye-chart__item-title', 'h1'], ['ye-chart__item-subtitle', 'h2']],
    }
    genre_distribution = []
    for genre, urls in billboard_links.items():
        for url in urls:
            url_cntr += 1

            page = requests.get(url)
            soup = BeautifulSoup(page.text, 'html.parser')

            with open('billboard-pages/' + url.replace('/', '|') + '.html', 'w') as f:
                f.write(page.text)
            print('\033[F\033[K"{}": {}'.format(genre, url))

            class_base = chart_items[True]
            items = soup.find_all(class_=class_base[0][0])
            if not items:
                class_base = chart_items[False]
                items = soup.find_all(class_=class_base[0][0])
            if not items:
                failed_links.append([genre, url])
            for row in items:
                if 'archive' in url:
                    working_archive.add(url)
                song = row.find(class_=class_base[1][0]).text.strip()
                artist = row.find(class_=class_base[2][0]).text.strip()

                artist = artist.replace(u'\u200b', '')
                song = song.replace(u'\u200b', '')
                key = str((artist, song))
                if key not in url_data_db:
                    url_data_db[key] = ''

                genre_data[str(cntr)] = [artist, song, genre]
                genre_distribution.append(genre)
                cntr += 1
            progressbar(url_cntr/num_urls, 'Number of songs: {} Failed: {}'.format(cntr, len(failed_links)))
    print()
    print()
    for genre, link in failed_links:
        print('Failed: Genre: "{}", url: {}'.format(genre, link))
    print()
    for archive in working_archive:
        print('Archive: {}'.format(archive))
    print('Manually added "adult pop songs 2012"')

    print()
    print()
    with open(db_file_name, 'w') as f:
        json.dump(url_data_db, f, indent=4)
    print('Db file saved to: "{}"'.format(db_file_name))

    with open(genre_file_name, 'w') as f:
        json.dump(genre_data, f, indent=4)
    print('Genre file saved to: "{}"'.format(genre_file_name))
    print()
    print()
    for genre, cnt in Counter(genre_distribution).items():
        print('  Genre: {} {}'.format(genre, cnt))
    print()
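
The shape of billboard-links.json isn't shown; the scraper only needs a genre-to-URL-list mapping, so the file plausibly looks like the one produced below (illustrative URLs, not taken from the project):

import json

billboard_links = {
    'pop': [
        'https://www.billboard.com/charts/year-end/2017/pop-songs',
        'https://www.billboard.com/charts/year-end/2016/pop-songs',
    ],
    'country': [
        'https://www.billboard.com/charts/year-end/2017/country-songs',
    ],
}
with open('billboard-links.json', 'w') as f:
    json.dump(billboard_links, f, indent=4)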
Example #7
# Answer #1: 1642
# Answer #2: 33601318

import sys

sys.path.append('..')
from w8m8 import progressbar

step_count = 301

# Part 1: build the full circular buffer and read the value after 2017
cur_idx = 0
code = [0]
for i in range(2017):
    # Step forward, then insert the next value just after the current position
    cur_idx = (cur_idx + step_count) % (i+1) + 1
    code.insert(cur_idx, i+1)

print('Answer #1: {}'.format(code[code.index(2017)+1]))

# Part 2: zero always stays at index 0, so the answer is whatever value was
# most recently inserted at index 1; track that without building the full list
cur_idx = 0
best = -1
n_iter = 50000000
for i in range(n_iter):
    cur_idx = (cur_idx + step_count) % (i+1) + 1
    if cur_idx == 1:
        best = i + 1
    if i % 100000 == 0:
        progressbar(i/n_iter)
print('\033[KAnswer #2: {}'.format(best))