Example #1
0
def get_artist_df():
    """Return the artist DataFrame, reading it from CSV on a cache miss.

    The CSV stores member_list/group_list as bracketed, comma-separated
    strings; the converters strip the brackets and split them back into
    Python lists.  The loaded frame is cached under "artist_df".
    """
    cached = cache.get("artist_df")
    if cached is not None:
        return cached

    # '[a, b]' -> ['a', 'b']: drop the surrounding brackets, split on ', '.
    def parse_list(raw):
        return raw[1:-1].split(', ')

    artist_df = pd.read_csv(
        path.join('.', 'data', 'artists.csv'),
        header=0,
        low_memory=False,
        converters={'member_list': parse_list, 'group_list': parse_list},
    )
    cache.set("artist_df", artist_df)
    return artist_df
Example #2
0
def get_similar(release_id):
    """Return (and cache) the nine releases most similar to ``release_id``.

    Ranks every release in the cached collection by cosine similarity to the
    chosen release, removes the queried artist's other albums, de-duplicates
    so no other artist appears twice, and caches the result under "similar".
    """
    df = cache.get("df")
    # Get the df index that corresponds to the release_id argument
    idx = cache.get("indices")[release_id]

    # Get the release's corresponding artist_id, so we can filter it out of the recommendations
    artist_id = df[df['release_id'] == release_id]['artist_id'].iloc[0]

    # Pairwise similarity scores of all albums against our chosen release,
    # as a Series aligned with df's 0..n-1 index.
    sim_series = pd.Series(cache.get("cosine_sim")[idx], name='similarity')

    # Merge the series of similarity scores into the DataFrame containing the full collection
    similar = df.merge(sim_series,
                       how='left',
                       left_index=True,
                       right_index=True)

    # Sort by similarity score in descending order
    similar.sort_values(by=['similarity'], inplace=True, ascending=False)

    # Filter out any additional albums by the same artist
    similar = similar[similar['artist_id'] != artist_id]

    similar.reset_index(drop=True, inplace=True)

    # De-duplicate artists among the top results.  The scan is bounded by
    # len(similar): the original `while i < 9` raised IndexError on
    # collections with fewer than nine remaining rows.
    i = 1
    while i < 9 and i < len(similar):
        if similar.iloc[i].artist_id in similar[0:i].artist_id.values:
            similar.drop(similar.index[i], inplace=True)
        else:
            i += 1

    # Return the top nine most similar albums
    similar = similar[0:9]
    cache.set("similar", similar)

    return similar
Example #3
0
def analysis(df):
    """Compute and cache summary statistics for the collection DataFrame.

    Caches three objects: "genre_df" (genre frequencies, descending),
    "style_df" (top-ten styles as a tidy two-column frame), and "top_ten"
    (the ten most-collected artists).  Returns nothing.
    """
    # Count genres by folding each release's genre list into one Counter.
    genre_counts = Counter()
    for genres in df['genres']:
        genre_counts.update(genres)

    genre_df = pd.DataFrame.from_dict(genre_counts,
                                      orient='index',
                                      columns=['count'])
    genre_df.sort_values(by=['count'], inplace=True, ascending=False)
    cache.set("genre_df", genre_df)

    # Same flattening for styles, then keep only the ten most common and
    # reshape into columns ['style', 'count'].
    style_counts = Counter()
    for styles in df['styles']:
        style_counts.update(styles)

    style_df = pd.DataFrame.from_dict(style_counts,
                                      orient='index',
                                      columns=['count'])
    style_df.sort_values(by=['count'], inplace=True, ascending=False)
    style_df = style_df[0:10]
    style_df.reset_index(inplace=True)
    style_df.rename(columns={'index': 'style'}, inplace=True)
    cache.set("style_df", style_df)

    # Ten most frequently occurring artists in the collection.
    cache.set("top_ten", df['artist_name'].value_counts().head(10))
Example #4
0
def transform_data(release_dict):
    """Build the feature DataFrame and cosine-similarity matrix.

    Converts the user's release dictionary into a DataFrame, cleans its text
    attributes, merges in artist metadata, vectorizes a combined word "soup",
    and caches "indices", "df", and "cosine_sim".  Finishes by running
    analysis() over the frame.
    """
    # DataFrame keyed by release id; move the key out of the index into its
    # own 'release_id' column.
    df = pd.DataFrame.from_dict(release_dict, orient="index")
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'release_id'}, inplace=True)

    # Reverse lookup: release_id -> positional index in df.
    cache.set("indices", pd.Series(df.index, index=df.release_id))

    # Clean the genre, style, and descriptor attributes.
    for col in ('genres', 'styles', 'descriptors'):
        df[col] = df[col].apply(clean_string)

    # Merge in the artist_df so each release carries band-member info (where
    # present).  `df.index` is evaluated before the assignment, so the merge
    # result is re-indexed with the pre-merge index, as before.
    df = df.merge(get_artist_df(), how="left",
                  on="artist_id").set_index(df.index)
    df.fillna('', inplace=True)

    # The word soup that we will pass into the CountVectorizer.
    df['soup'] = df.apply(create_soup, axis=1)

    # Vectorize each release's soup and build the pairwise cosine-similarity
    # matrix reflecting the likeness between every pair of releases.
    count_matrix = CountVectorizer().fit_transform(df['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    cache.set("df", df)
    cache.set("cosine_sim", cosine_sim)

    analysis(df)
Example #5
0
def parse_collection(response):
    """Parse one page of a Discogs collection response into release_dict.

    Only vinyl releases are kept.  If the API reports further pages, prints
    the sync progress, waits out the rate limit, and recurses with the
    "next" URL the API provides.  The accumulated release_dict is cached
    after each page.
    """
    # Parse the JSON body once; the original called response.json() twice,
    # re-decoding the whole payload just to read the pagination block.
    payload = response.json()

    # Iterate through every release (r) in the API's response
    for r in payload['releases']:
        release_id = r['id']

        descriptors = ""

        # field_id 5 holds the user's free-form descriptor tags; releases
        # with no 'notes' key raise KeyError, treated as "no descriptors".
        try:
            for entry in r['notes']:
                if entry['field_id'] == 5:
                    descriptors = entry['value'].split(', ')
                    if 'male vocals' in descriptors:
                        descriptors.remove('male vocals')
                    descriptors = [
                        descriptor.title() for descriptor in descriptors
                    ]
        except KeyError:
            pass

        # Drill to the release's 'basic_information' field
        r = r['basic_information']

        format_list = [i['name'] for i in r['formats']]

        if 'Vinyl' in format_list:
            release_dict[release_id] = {
                "release_name": r['title'],
                "artist_id": r['artists'][0]['id'],
                "artist_name": r['artists'][0]['name'],
                "year": r['year'],
                # list(...) copies directly; the original used identity
                # comprehensions [i for i in ...] to the same effect.
                "genres": list(r['genres']),
                "styles": list(r['styles']),
                "resource_url": r['resource_url'],
                "cover_url": r['cover_image'],
                "descriptors": descriptors
            }

    pagination = payload['pagination']

    # If there are more pages to be read, print the sync status, then
    # recursively call parse_collection, passing in the "next" URL already
    # generated by Discogs' API.
    if pagination['page'] < pagination['pages']:
        print(f"{len(release_dict)} of {pagination['items']} items synced...")

        rate_limit(response)

        parse_collection(
            requests.get(pagination['urls']['next'], headers=headers))

    cache.set("release_dict", release_dict)
Example #6
0
def reset_dict():
    """Invalidate the cached release dictionary and empty the in-memory one."""
    cache.set("release_dict", None)
    release_dict.clear()
Example #7
0
def reset_df():
    """Invalidate the cached collection DataFrame after resetting the release dict."""
    reset_dict()
    cache.set("df", None)