def fetch_playlist(playlist_id, allow_every=False):
    """Fetch a Spotify playlist and store it, its new tracks, and their artists.

    Returns True if the playlist is successfully added to the DB, False otherwise.
    """
    try:
        results = sp.playlist(playlist_id)
    except Exception:
        print(f'Invalid PID: {playlist_id}')
        return False

    description, name, owner = results['description'], results['name'], results['owner']['id']
    total_tracks = results['tracks']['total']
    desc_lemmas, desc_lang = lemmatize(description, return_lang=True)
    name_lemmas, name_lang = lemmatize(name, return_lang=True)

    valid_tracks = get_valid_tracks(sp, results['tracks'])
    playlist_length = len(valid_tracks)
    if not allow_every and (playlist_length > 2000 or playlist_length <= 1
                            or desc_lang != 'en' or name_lang != 'en'):
        print(f'{playlist_id} rejected. Invalid # of tracks or non-English.')
        return False

    tracks_to_add = list()  # playlist tracks that don't exist in the DB yet
    artists_to_add = set()
    playlist_tids = list()  # playlist tracks that already exist or are initialized without error

    for tid, track in valid_tracks.items():
        if not db.track_exists(tid):
            try:
                track_data = initialize_track(track, playlist_id)
                artist_ids = track_data['artist_ids']
                for artist_id in artist_ids:
                    if not db.artist_exists(artist_id):
                        artists_to_add.add(artist_id)
                tracks_to_add.append(track_data)
                playlist_tids.append(tid)
            except Exception:
                print(f'Error initializing track {tid}')
                continue
        else:
            playlist_tids.append(tid)
            db.add_pid_to_track(tid, playlist_id)

    tracks_to_add = add_audio_features(tracks_to_add)
    playlist_to_add = dict(_id=playlist_id,
                           name=name,
                           name_lemmas=name_lemmas,
                           owner=owner,
                           last_updated=datetime.datetime.now(),
                           description=description,
                           description_lemmas=desc_lemmas,
                           tids=playlist_tids)
    artists_to_add = fetch_artist_info(artists_to_add)

    db.insert_artists(artists_to_add)
    db.insert_tracks(tracks_to_add)
    db.insert_playlist(playlist_to_add)
    return True
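
# Hedged usage sketch: how fetch_playlist might be driven over a batch of candidate
# playlist ids. It assumes the module-level spotipy client `sp`, the `db` wrapper,
# and the helpers used above are already configured; this helper is illustrative and
# not prescribed by the original code.
def fetch_playlists(playlist_ids, allow_every=False):
    """Fetch each playlist id in turn and report how many were stored."""
    added = 0
    for pid in playlist_ids:
        if fetch_playlist(pid, allow_every=allow_every):
            added += 1
    print(f'Stored {added} of {len(playlist_ids)} playlists.')
    return added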
def initialize_track(track, playlist_id):
    track_data = dict()
    track_info = track['track']
    track_data['name'] = track_info['name']
    track_data['name_lemmas'] = lemmatize(track_data['name'])
    track_data['_id'] = track_info['id']
    track_data['explicit'] = track_info['explicit']
    track_data['duration'] = track_info['duration_ms']
    track_data['artist_id'] = track_info['artists'][0]['id']
    track_data['pids'] = [playlist_id]
    return track_data
def get_track_frequencies(search_words, tid_subset=None):
    """Map each lemma of search_words to a Counter of track ids, counting how many
    matching playlists (by name lemma) contain each track."""
    word_track_frequencies = dict()  # lemma -> Counter of track ids
    lemmas = lemmatize(search_words)
    for lemma in lemmas:
        frequencies = Counter()
        playlists = playlists_db.find({'name_lemmas': lemma})
        for playlist in playlists:
            tids = playlist['tids']
            for tid in tids:
                # count the track if no subset was given, or if it is in the subset
                if tid_subset and tid in tid_subset:
                    frequencies[tid] += 1
                elif not tid_subset:
                    frequencies[tid] += 1
        word_track_frequencies[lemma] = frequencies
    return word_track_frequencies
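
# Hedged usage sketch: one way the per-lemma counters returned by
# get_track_frequencies could be collapsed into a single ranking. Summing counts
# across lemmas is an assumption for illustration, not something the original
# code prescribes.
def rank_tracks(search_words, tid_subset=None, top_n=10):
    """Return the top_n (tid, score) pairs summed over all lemmas of search_words."""
    combined = Counter()
    for frequencies in get_track_frequencies(search_words, tid_subset).values():
        combined.update(frequencies)
    return combined.most_common(top_n)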
def initialize_track(track, playlist_id=None, for_user=False):
    track_data = dict()
    track_info = track['track']
    track_data['name'] = track_info['name']
    track_data['name_lemmas'] = lemmatize(track_data['name'])
    track_data['_id'] = track_info['id']
    track_data['album_name'] = track_info['album']['name']
    track_data['album_id'] = track_info['album']['id']
    track_data['explicit'] = track_info['explicit']
    track_data['duration'] = track_info['duration_ms']
    track_data['artist_ids'] = [artist['id'] for artist in track_info['artists']]
    track_data['pids'] = list()
    if for_user:
        track_data['added_at'] = track['added_at']
    else:
        track_data['pids'].append(playlist_id)
    return track_data
def add_playlist(playlist_id):
    """Fetch a playlist and insert it, its new tracks, and their artists into the DB."""
    if playlist_id == '' or playlist_id is None:
        return None
    try:
        results = sp.user_playlist(playlist_id=playlist_id, user=None)
    except Exception:
        print(f'cannot get playlist {playlist_id}')
        return None

    description = results['description']
    name = results['name']
    owner = results['owner']['id']
    total_tracks = results['tracks']['total']
    desc_lemmas, desc_lang = lemmatize(description, return_lang=True)
    name_lemmas, name_lang = lemmatize(name, return_lang=True)

    playlist_tracks = parse_tracks(results, sp)
    playlist_length = len(playlist_tracks)
    if (playlist_length > 1000 or playlist_length < 5
            or desc_lang != 'en' or name_lang != 'en'):
        print(f'Too many/few tracks or not English: {playlist_id}')
        return None

    tracks_to_add = list()   # tracks not yet in the DB
    existing_tids = set()    # tracks already in the DB
    artists_to_add = set()
    for tid, track in playlist_tracks.items():
        if not db.track_exists(tid):
            try:
                track_data = initialize_track(track, playlist_id)
                artist_id = track_data['artist_id']
                if not db.artist_exists(artist_id):
                    artists_to_add.add(artist_id)
                tracks_to_add.append(track_data)
            except Exception:
                print(f'Error initializing track {tid}')
                continue
        else:
            existing_tids.add(tid)
            tracks_db.update_one({'_id': tid}, {'$push': {'pids': playlist_id}})

    tracks_to_add = add_audio_features(tracks_to_add)
    playlist_to_add = dict(_id=playlist_id,
                           name=name,
                           name_lemmas=name_lemmas,
                           owner=owner,
                           description=description,
                           description_lemmas=desc_lemmas,
                           tids=list())

    artists_to_add = list(artists_to_add)
    artists_to_add = add_genres(artists_to_add)
    if artists_to_add:
        try:
            artists_db.insert_many(artists_to_add)
        except BulkWriteError:
            # fall back to one-by-one inserts so a single bad document doesn't abort the batch
            for artist in artists_to_add:
                try:
                    artists_db.insert_one(artist)
                except Exception:
                    print(f'cannot insert artist {artist}')

    if tracks_to_add:
        try:
            tracks_db.insert_many(tracks_to_add)
            tids = [track['_id'] for track in tracks_to_add]
            playlist_to_add['tids'].extend(tids)
        except Exception:
            for track in tracks_to_add:
                try:
                    tracks_db.insert_one(track)
                    playlist_to_add['tids'].append(track['_id'])
                except Exception:
                    print(f'cannot insert track {track}')

    playlist_to_add['tids'].extend(list(existing_tids))
    playlists_db.insert_one(playlist_to_add)
def extract_tokens(
        text,
        tokenizing_function,
        noise_tokens=set(),
        gram_list=[],
        keep_hashtags_and_mentions=True,
        make_lowercase=True,
        do_lemmatize=True,
        remove_possessive=True,
        do_arabic_stemming=True,
):
    """
    Extract any requested ngrams as tokens. Ngrams are generated from the raw text,
    with punctuation (hopefully), URLs, and single characters removed, and with
    # and @ prefixes stripped.
    """
    text = HTMLParser.HTMLParser().unescape(text)
    tempTokens = [
        unicode(token) for token in tokenizing_function(text)
        if token[:4] != 'http' and len(token) > 1
    ]
    if keep_hashtags_and_mentions:
        tempTokens = [
            token[1:]
            if token[0] == '#' or token[0] == '@' and len(token) > 1 else token
            for token in tempTokens
        ]
    if make_lowercase:
        words = [token.lower() for token in tempTokens]
    else:
        words = tempTokens
    # remove commas or words containing commas, numbers
    words = [
        token for token in words
        if token not in noise_tokens and len(token) >= 1 and token.find(',') < 0
        and not token.translate(remove_punctuation_map).isdigit()
    ]
    # remove initial, terminal, or paired quotation marks from word boundaries
    quotes = u'\'"`‘“’”'
    words = [
        token[1:] if token[0] in quotes and
        (len(token) == 1 or token[1] in string.letters) else token
        for token in words
    ]
    words = [
        token[:-1] if len(token) > 0 and token[-1] in quotes
        and token[0] in string.letters else token for token in words
    ]
    if remove_possessive:
        # remove simple English possessive
        words = [POSSESSIVE_REGEX.sub("", token) for token in words]
    tokens = words
    # If the tweet is in English, lemmatize tokens using WordNet;
    # if the tweet is in Arabic, stem them instead.
    if do_lemmatize:
        for i in range(len(tokens)):
            temp = lemmatize(tokens[i])
            if temp is not None:
                tokens[i] = temp
    if do_arabic_stemming:
        for i in range(len(tokens)):
            if len(arabic_regex.findall(tokens[i])) > 0:
                tokens[i] = arabic_stemmer().stem(tokens[i])
    tokens = [t for t in tokens if len(t) > 0]
    for gramSize in gram_list:
        new_grams = getNGrams(tempTokens, gramSize)
        # drop any n-grams that do not contain at least two non-stopwords
        for gram in new_grams:
            drop_gram = True
            valid_count = 0
            for token in gram.split(' '):
                if token not in noise_tokens:
                    valid_count += 1
                if valid_count > 1:
                    drop_gram = False
                    break
            if not drop_gram:
                tokens.append(gram)
    tokens = [t for t in tokens if len(t) > 0 and t not in noise_tokens]
    if make_lowercase:
        tokens = [t.lower() for t in tokens]
    return tokens
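
# Hedged usage sketch: extract_tokens takes a tokenizing_function as its second
# argument. A bare whitespace split stands in here for whatever tweet tokenizer
# the real pipeline passes; the wrapper and its defaults are illustrative only.
# (Note the function above targets Python 2: unicode, string.letters, HTMLParser.)
def simple_tokenize(text):
    """Whitespace tokenizer standing in for the pipeline's real tokenizer."""
    return text.split()


def tokenize_tweet(text, noise_tokens=set()):
    """Unigram plus bigram tokens from a tweet, using the whitespace tokenizer."""
    return extract_tokens(text,
                          simple_tokenize,
                          noise_tokens=noise_tokens,
                          gram_list=[2])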