def update_similarity(urls: list, sheet_name: str, start_row: int, stop_row: int):
    """Compute similarity scores for rows [start_row, stop_row] of a sheet and
    write them back to column N in batches of 25.

    :param urls: list of spreadsheet URLs; only urls[0] is used.
    :param sheet_name: worksheet title inside that spreadsheet.
    :param start_row: first DataFrame index label to process (inclusive).
    :param stop_row: last DataFrame index label to process (inclusive).
    """
    gsheet_id = get_gsheet_id_from_url(url=urls[0])
    df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    # Empty durations would break int() inside similarity(); default them to "0".
    # Plain assignment instead of inplace=True avoids pandas' chained-assignment
    # warning on the column view.
    df["DurationMs"] = df["DurationMs"].replace({"": "0"})
    df = df.loc[start_row:stop_row]
    row_index = df.index
    # BUG FIX: the original read row_index.n_estimators_start /
    # row_index.n_estimators_stop — attributes that do not exist on a pandas
    # (Range)Index, apparently the residue of a bad project-wide rename.
    # Use the index bounds themselves.
    start = row_index.start
    stop = row_index.stop
    step = 25  # rows per update_value() batch
    for i in range(start, stop, step):
        stop_range = min(i + step, stop)
        batch = []
        for j in range(i, stop_range):
            score = similarity(
                track_title=df.track_title.loc[j],
                youtube_url=df.SourceURI.loc[j],
                formatid=df.FormatID.loc[j],
                duration=df.DurationMs.loc[j],
            ).get('similarity')
            batch.append([score])
        # +2: sheet rows are 1-based and row 1 is the header row.
        grid_range = f"{sheet_name}!N{i + 2}"
        update_value(list_result=batch, grid_range_to_update=grid_range, gsheet_id=gsheet_id)
def check_youtube_url_mp3(gsheet_id: str):
    """Print the MP3-sheet rows that already satisfy the validity rules.

    A row is considered valid when it has a track_id and either
    Memo == 'added' with a 43-character URL and Type in {c, d, z}, or
    Memo == 'not found' with empty url_to_add and Type.
    Column layout comes from sheet_type.MP3_SHEET_NAME.
    """
    sheet_info = sheet_type.MP3_SHEET_NAME
    sheet_name = sheet_info.get('sheet_name')
    wanted_columns = sheet_info.get('column_name') + ['Type']
    # Lower-case every cell so comparisons below are case-insensitive.
    frame = get_df_from_speadsheet(gsheet_id, sheet_name).applymap(str.lower)
    # Normalize historical header spellings.
    frame.columns = frame.columns.str.replace('TrackId', 'track_id')
    frame.columns = frame.columns.str.replace('MP3_link', 'Mp3_link')
    subset = frame[wanted_columns].copy()
    subset['len'] = subset['url_to_add'].apply(len)
    has_track = subset['track_id'] != ''
    added_ok = (
        (subset['Memo'] == 'added')
        & (subset['len'] == 43)
        & subset['Type'].isin(["c", "d", "z"])
    )
    not_found_ok = (
        (subset['Memo'] == 'not found')
        & (subset['url_to_add'] == '')
        & (subset['Type'] == '')
    )
    valid_rows = subset[has_track & (added_ok | not_found_ok)]
    print(valid_rows)
def check_artist_image(gsheet_id: str):
    """Return upper-cased Artist_uuid values of invalid 'Artist_image' rows.

    Accepted rows (any one of):
        Memo 'ok'    with empty url_to_add
        Memo 'added' with non-empty url_to_add
        Assignee 'no need to check' (always accepted)
    Everything else is reported as invalid.
    """
    frame = get_df_from_speadsheet(gsheet_id, 'Artist_image').applymap(str.lower)
    cols = frame[['Artist_uuid', 'Memo', 'url_to_add', 'Assignee']]
    has_uuid = cols['Artist_uuid'] != ''
    ok_rule = has_uuid & (cols['Memo'] == 'ok') & (cols['url_to_add'] == '')
    added_rule = has_uuid & (cols['Memo'] == 'added') & (cols['url_to_add'] != '')
    skip_rule = cols['Assignee'] == 'no need to check'
    invalid = cols[~(ok_rule | added_rule | skip_rule)]
    return invalid.Artist_uuid.str.upper()
def check_youtube_url_mp3(gsheet_id: str):
    """Return upper-cased track_id values of invalid 'MP_3' sheet rows.

    Accepted rows (any one of):
        Memo 'added'     with a 43-character url_to_add and Type in {c, d, z}
        Memo 'not found' with url_to_add and Type both the literal 'none'
        Assignee 'no need to check' (always accepted)
    Everything else is reported as invalid.
    """
    frame = get_df_from_speadsheet(gsheet_id, 'MP_3').applymap(str.lower)
    frame['len'] = frame['url_to_add'].apply(len)
    cols = frame[['track_id', 'Memo', 'url_to_add', 'len', 'Type', 'Assignee']]
    has_track = cols['track_id'] != ''
    added_rule = (
        has_track
        & (cols['Memo'] == 'added')
        & (cols['len'] == 43)
        & cols['Type'].isin(["c", "d", "z"])
    )
    not_found_rule = (
        has_track
        & (cols['Memo'] == 'not found')
        & (cols['url_to_add'] == 'none')
        & (cols['Type'] == 'none')
    )
    skip_rule = cols['Assignee'] == 'no need to check'
    invalid = cols[~(added_rule | not_found_rule | skip_rule)]
    return invalid.track_id.str.upper()
def extract_artist_page_similarity(artist_names: list, urls: list, sheet_name: str, sheet_info: dict = None):
    """Load (or create) the given worksheet and feed it to the per-artist
    datasource extraction.

    :param artist_names: artist names to extract datasources for.
    :param urls: list of spreadsheet URLs; only urls[0] is used.
    :param sheet_name: target worksheet title.
    :param sheet_info: sheet-config dict carrying the 'fomatid' key
        (key is misspelled 'fomatid' project-wide).
        BUG FIX: the original body read a name `sheet_info` that was never
        defined in this scope, so every call raised NameError; it is now an
        explicit parameter. TODO confirm with callers which sheet_type.*
        dict was intended here.
    """
    if sheet_info is None:
        raise ValueError("sheet_info dict (with a 'fomatid' key) is required")
    gsheet_id = get_gsheet_id_from_url(url=urls[0])
    sheet_titles = get_list_of_sheet_title(gsheet_id=gsheet_id)
    formatid = sheet_info.get('fomatid')
    if sheet_name in sheet_titles:
        df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    else:
        # Sheet does not exist yet: start from an empty frame.
        df = pd.DataFrame()
    get_df_datasource_by_artist_and_formatid(artist_names=artist_names, formatid=formatid,
                                             df=df, gsheet_id=gsheet_id, sheet_name=sheet_name)
def check_album_wiki(gsheet_id: str):
    '''
    Return the (upper-cased) Album_uuid values of rows in the 'Album_wiki'
    sheet that FAIL validation.

    Accepted row patterns (cells are lower-cased before comparison):

        AlbumUUID   Memo        URL_to_add                   Content_to_add
        not null    ok          null                         null
        not null    added       https://en.wikipedia.org/%   not null
        not null    not found   none                         none
        not null    not ok      https://en.wikipedia.org/%   not null
        not null    not ok      none                         none
        (any)       Assignee == 'no need to check' is always accepted

    :param gsheet_id: Google-spreadsheet id containing the 'Album_wiki' sheet.
    :return: pandas Series of upper-cased Album_uuid strings for invalid rows.
    '''
    sheet_name = 'Album_wiki'
    # applymap(str.lower): all comparisons below are case-insensitive.
    # NOTE(review): str.lower raises TypeError on non-string cells — presumably
    # the sheet is read as all-strings; confirm in get_df_from_speadsheet.
    original_df = get_df_from_speadsheet(gsheet_id, sheet_name).applymap(str.lower)
    album_wiki = original_df[['Album_uuid', 'Memo', 'url_to_add', 'Content_to_add', 'Assignee']]
    # Keep rows that match NONE of the accepted patterns (note the leading ~).
    check_album_wiki = album_wiki[~
        ((
            (album_wiki['Album_uuid'] != '') &
            (album_wiki['Memo'] == 'ok') &
            (album_wiki['url_to_add'] == '') &
            (album_wiki['Content_to_add'] == '')
        ) | (
            (album_wiki['Album_uuid'] != '') &
            (album_wiki['Memo'] == 'added') &
            # str[:25] prefix check — 'https://en.wikipedia.org/' is 25 chars.
            (album_wiki['url_to_add'].str[:25] == 'https://en.wikipedia.org/') &
            (album_wiki['Content_to_add'] != '')
        ) | (
            (album_wiki['Album_uuid'] != '') &
            (album_wiki['Memo'] == 'not found') &
            (album_wiki['url_to_add'] == 'none') &
            (album_wiki['Content_to_add'] == 'none')
        ) | (
            (album_wiki['Album_uuid'] != '') &
            (album_wiki['Memo'] == 'not ok') &
            (album_wiki['url_to_add'].str[:25] == 'https://en.wikipedia.org/') &
            (album_wiki['Content_to_add'] != '')
        ) | (
            (album_wiki['Album_uuid'] != '') &
            (album_wiki['Memo'] == 'not ok') &
            (album_wiki['url_to_add'] == 'none') &
            (album_wiki['Content_to_add'] == 'none')
        ) | (
            (album_wiki['Assignee'] == 'no need to check')
        ))
    ]
    return check_album_wiki.Album_uuid.str.upper()
def check_version(gsheet_id: str):
    '''
    Return upper-cased track_id values from the 'Version_done' sheet whose
    rows FAIL validation.

    A row is accepted only when BOTH its remix columns AND its live columns
    match one of their accepted patterns:

        TrackID    Remix_url     Remix_artist
        not null   length = 43   not null
        not null   null          null

        TrackID    Live_url      Live_venue   Live_year
        not null   length = 43   not null     0, or between 1950 and 2030
        not null   null          null         0

    :param gsheet_id: Google-spreadsheet id containing 'Version_done'.
    :return: pandas Series of upper-cased track_id strings for invalid rows.
    '''
    sheet_name = 'Version_done'
    # Lower-case every cell so comparisons are case-insensitive.
    original_df = get_df_from_speadsheet(gsheet_id, sheet_name).applymap(str.lower)
    original_df['len_remix_url'] = original_df['Remix_url'].apply(lambda x: len(x))
    original_df['len_live_url'] = original_df['Live_url'].apply(lambda x: len(x))
    # Coerce Live_year to a nullable int; blanks and garbage become 0.
    original_df['Live_year'] = pd.to_numeric(original_df.Live_year, errors='coerce').astype('Int64').fillna(0)
    youtube_url_version = original_df[
        ['track_id', 'Remix_url', 'Remix_artist', 'Live_url', 'Live_venue', 'len_remix_url', 'len_live_url',
         'Live_year']]
    # (remix side OK) AND (live side OK); rows failing either side pass ~ and
    # are returned as invalid.
    check_version = youtube_url_version[~
        (((
            (youtube_url_version['track_id'] != '') &
            (youtube_url_version['len_remix_url'] == 43) &
            (youtube_url_version['Remix_artist'] != '')
        ) | (
            (youtube_url_version['track_id'] != '') &
            (youtube_url_version['Remix_url'] == '') &
            (youtube_url_version['Remix_artist'] == '')
        )) & ((
            (youtube_url_version['track_id'] != '') &
            (youtube_url_version['len_live_url'] == 43) &
            (youtube_url_version['Live_venue'] != '') &
            # Live_year 0 means "not provided"; otherwise it must be plausible.
            ((youtube_url_version['Live_year'] == 0) |
             ((1950 <= original_df['Live_year']) & (original_df['Live_year'] <= 2030)))
        ) | (
            (youtube_url_version['track_id'] != '') &
            (youtube_url_version['Live_url'] == '') &
            (youtube_url_version['Live_venue'] == '') &
            (youtube_url_version['Live_year'] == 0)
        )))
    ]
    return check_version.track_id.str.upper()
def check_youtube_url_mp4(gsheet_id: str):
    """Return upper-cased track_id values of invalid 'MP_4' sheet rows.

    Accepted rows (any one of):
        Memo 'ok'        with empty url_to_add
        Memo 'added'     with a 43-character url_to_add
        Memo 'not found' with url_to_add the literal 'none'
        Memo 'not ok'    with a 43-character url_to_add, or with 'none'
        Assignee 'no need to check' (always accepted)
    Everything else is reported as invalid.
    """
    frame = get_df_from_speadsheet(gsheet_id, 'MP_4').applymap(str.lower)
    frame['len'] = frame['url_to_add'].apply(len)
    cols = frame[['track_id', 'Memo', 'url_to_add', 'len', 'Assignee']]
    has_track = cols['track_id'] != ''
    ok_rule = has_track & (cols['Memo'] == 'ok') & (cols['url_to_add'] == '')
    added_rule = has_track & (cols['Memo'] == 'added') & (cols['len'] == 43)
    not_found_rule = has_track & (cols['Memo'] == 'not found') & (cols['url_to_add'] == 'none')
    not_ok_url_rule = has_track & (cols['Memo'] == 'not ok') & (cols['len'] == 43)
    not_ok_none_rule = has_track & (cols['Memo'] == 'not ok') & (cols['url_to_add'] == 'none')
    skip_rule = cols['Assignee'] == 'no need to check'
    invalid = cols[~(ok_rule | added_rule | not_found_rule | not_ok_url_rule | not_ok_none_rule | skip_rule)]
    return invalid.track_id.str.upper()
def process_file(self, sheet_info: dict):
    """Load one worksheet as a DataFrame with canonicalized column names.

    :param sheet_info: dict with 'sheet_name' and 'column_name' keys (see the
        *_SHEET_NAME dicts built in __init__). The original annotation said
        ``str`` but the body only ever calls dict methods on it.
    :return: DataFrame restricted to the requested (renamed) columns, or
        None (after printing a notice) when the sheet is absent.
    """
    sheet_name = sheet_info.get('sheet_name')
    column_names = sheet_info.get('column_name')
    # BUG FIX: check that the sheet exists BEFORE fetching it — the original
    # fetched first, so a missing sheet failed inside get_df_from_speadsheet
    # and the graceful "not have" branch below was unreachable in practice.
    if sheet_name not in get_list_of_sheet_title(gsheet_id=self.gsheet_id):
        print(f"sheet_name: {sheet_name} not have")
        return None
    df = get_df_from_speadsheet(gsheet_id=self.gsheet_id, sheet_name=sheet_name)
    df.columns = [name.lower() for name in df.columns]
    # Canonical spellings for historically inconsistent headers, applied as
    # substring replacements in a fixed order (order matters: e.g.
    # 'artist_url_to_add' must be rewritten before 'artist_uuid' would turn
    # its prefix into 'uuid'). Identity pairs from the original, such as
    # 'track_id' -> 'track_id', were dropped as no-ops.
    replacements = [
        ('track id', 'track_id'),
        ('trackid', 'track_id'),
        ('s12', 'memo'),
        ('a12', 'memo'),
        ('mp3link', 'mp3_link'),
        ('mp4link', 'mp4_link'),
        ('url to add', 'url_to_add'),
        ('artist_url_to_add', 'url_to_add'),
        ('artist_uuid', 'uuid'),
        ('objectid', 'uuid'),
        ('album_uuid', 'uuid'),
        # NOTE(review): 'content tomadd' looks like a typo for 'content to add'
        # — kept as-is to preserve behavior; confirm against real sheets.
        ('content tomadd', 'content_to_add'),
        ('albumtitle', 'album_title'),
        ('albumartist', 'album_artist'),
        ('itunes_album_url', 'itune_album_url'),
        ('itunes_album_link', 'itune_album_url'),
        ('albumurl', 'sportify_album_url'),
    ]
    for old, new in replacements:
        df.columns = df.columns.str.replace(old, new)
    # Keep only the requested columns that actually exist on this sheet.
    present = [name for name in column_names if name in df.columns]
    return df[present]
def similarity(track_title: str, youtube_url: str, formatid: str, duration) -> dict:
    """Score how well a YouTube video's title matches *track_title*.

    Fetches a keyword list from the 'Similarity' sheet; when a keyword appears
    in only one of the two titles, the score is forced to 0, otherwise a
    token-set ratio is computed.

    :param track_title: expected track title (lower-cased for comparison).
    :param youtube_url: video URL whose title/duration info is fetched.
    :param formatid: data-source format id; MP3-full gets a duration gate.
    :param duration: expected duration — presumably milliseconds given the
        10000 threshold below; TODO confirm units.
    :return: the youtube-info dict with an added 'similarity' key.
    """
    special_characters = \
        get_df_from_speadsheet(gsheet_id='1W1TlNDXqZTMAaAFofrorqaEo6bfX7GjwnhWMXcq70xA', sheet_name='Similarity')[
            'Keywords'].tolist()
    track_title = track_title.lower()
    get_youtube_info = get_youtube_title_and_youtube_uploader_from_youtube_url(
        youtube_url)
    get_youtube_title = get_youtube_info['youtube_title'].lower()
    # get_youtube_uploader = get_youtube_info['uploader'].lower()
    get_youtube_duration = get_youtube_info['duration']
    abs_duration = abs(int(duration) - int(get_youtube_duration))
    # Duration gate: a full MP3 whose duration differs by more than 10000
    # (ms? — see docstring note) can never match.
    if abs_duration > 10000 and formatid == DataSourceFormatMaster.FORMAT_ID_MP3_FULL:
        token_set_ratio = 0
    else:
        # Classify which title (if any) contains a special keyword first.
        result = "type 3"  # type 3: no keyword in either title
        for special_character in special_characters:
            if special_character in track_title:
                result = "type 1"  # keyword found in the track title
                break
            elif special_character in get_youtube_title:
                result = "type 2"  # keyword only in the youtube title
                break
            else:
                pass
        # NOTE: the branches below reuse `special_character`, relying on the
        # loop variable leaking out of the for-loop. It is only bound when the
        # loop iterated, which is guaranteed whenever result != "type 3".
        if result == "type 1":
            # Keyword in track title: the youtube title must contain it too,
            # otherwise the titles are considered incomparable.
            if special_character in get_youtube_title:
                token_set_ratio = get_token_set_ratio(get_youtube_title, track_title)
            else:
                token_set_ratio = 0
        elif result == "type 2":
            # NOTE(review): result == "type 2" implies the keyword was NOT in
            # track_title (the if-branch above checked it first), so this inner
            # condition is always False and type-2 rows always score 0.
            if special_character in track_title:
                token_set_ratio = get_token_set_ratio(get_youtube_title, track_title)
            else:
                token_set_ratio = 0
        else:
            token_set_ratio = get_token_set_ratio(get_youtube_title, track_title)
    get_youtube_info['similarity'] = token_set_ratio
    youtube_info = get_youtube_info
    return youtube_info
def check_box_S_11_validate(gsheet_id: str):
    """Validate the first 10 rows of the S_11 sheet.

    Step 1: normalize header spellings and take the first 10 rows.
    Step 2: format check — itune_album_url must be 'not found' or start with
            the iTunes album prefix; offending album titles are returned.
    Step 3: otherwise derive the itunes id/region, validate the id, and
            compute a title-similarity ratio per row.
    Step 4: write the derived columns back to the sheet starting at M1.

    :param gsheet_id: Google-spreadsheet id containing the S_11 sheet.
    :return: list of upper-cased album titles failing the format check, or
        None after the sheet has been updated.
    """
    sheet_info = sheet_type.S_11
    sheet_name = sheet_info.get('sheet_name')
    column_name = sheet_info.get('column_name')
    S_11_df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    # Canonicalize header spellings.
    S_11_df.columns = S_11_df.columns.str.replace('Release_date', 'release_date')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumTitle', 'album_title')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumArtist', 'album_artist')
    S_11_df.columns = S_11_df.columns.str.replace('Itunes_Album_URL', 'itune_album_url')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumURL', 'sportify_album_url')
    S_11_df = S_11_df[column_name].head(10)
    # Step 2: check validate format
    check_format_album_wiki = S_11_df[~((S_11_df['itune_album_url'] == 'not found') |
                                        (S_11_df['itune_album_url'].str[:32] == 'https://music.apple.com/us/album'))]
    S_11_format_validate = check_format_album_wiki.album_title.str.upper().to_numpy().tolist()
    if S_11_format_validate:
        print(check_format_album_wiki)
        return S_11_format_validate
    # Step 3: check validate itune_url
    S_11_df['itune_id'] = S_11_df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x != 'not found' else 'None')
    S_11_df['region'] = S_11_df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[1] if x != 'not found' else 'None')
    S_11_df['checking_validate_itune'] = S_11_df['itune_id'].apply(
        lambda x: check_validate_itune(x) if x != 'None' else 'None')
    S_11_df['token_set_ratio'] = S_11_df.apply(
        lambda x: get_max_ratio(itune_album_id=x['itune_id'], input_album_title=x['album_title'])
        if x['itune_id'] != 'None' else 'None', axis=1)
    # Step 4: update value — transfer the DataFrame to a 2D list with a
    # header row prepended.
    column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
    updated_df = S_11_df[column_name]
    list_result = updated_df.values.tolist()
    # BUG FIX: the original called list_result.insert_column(0, column_name);
    # Python lists have no insert_column method (AttributeError). list.insert
    # prepends the header row as intended.
    list_result.insert(0, column_name)
    range_to_update = f"{sheet_name}!M1"
    update_value(list_result, range_to_update, gsheet_id)
    # validate_value type: object, int, category... NOT DATETIME
def __init__(self, url: str):
    """Build per-sheet configuration dicts for whichever known worksheets
    exist in the spreadsheet at *url*.

    Each attribute (MP3_SHEET_NAME, MP4_SHEET_NAME, VERSION_SHEET_NAME,
    ARTIST_IMAGE, ALBUM_IMAGE, *_WIKI, S_11, C_11) is only set when the
    corresponding sheet is present, so callers must tolerate missing
    attributes. 'fomatid' is the project-wide (misspelled) key name.

    :param url: full Google-spreadsheet URL.
    """
    gsheet_id = get_gsheet_id_from_url(url=url)
    list_of_sheet_title = get_list_of_sheet_title(gsheet_id=gsheet_id)
    # NOTE(review): identical call repeated — sheet_names equals
    # list_of_sheet_title; kept as-is (doc-only change).
    sheet_names = get_list_of_sheet_title(gsheet_id=gsheet_id)
    if "MP_3" in sheet_names:
        self.MP3_SHEET_NAME = {"sheet_name": "MP_3",
                               "fomatid": DataSourceFormatMaster.FORMAT_ID_MP3_FULL,
                               "column_name": ["track_id", "memo", "mp3_link", "url_to_add", "type",
                                               "checking_mp3", "already_existed", "is_released", "assignee"]}
    if "MP_4" in sheet_names:
        self.MP4_SHEET_NAME = {"sheet_name": "MP_4",
                               "fomatid": DataSourceFormatMaster.FORMAT_ID_MP4_FULL,
                               "column_name": ["track_id", "memo", "mp4_link", "url_to_add",
                                               "checking_mp4", "already_existed", "is_released", "verified",
                                               "assignee"]}
    if "Version_done" in sheet_names:
        # Version sheet carries two format ids: remix and live.
        self.VERSION_SHEET_NAME = {"sheet_name": "Version_done",
                                   "fomatid": [DataSourceFormatMaster.FORMAT_ID_MP4_REMIX,
                                               DataSourceFormatMaster.FORMAT_ID_MP4_LIVE],
                                   "column_name": ["track_id", "remix_url", "remix_artist",
                                                   "live_url", "live_venue", "live_year"]}
    # Artist-image config: prefer the "<name> cant upload" sheet, unless it
    # only contains the success marker row; fall back to "image", then
    # "Artist_image".
    if f"{SheetNames.ARTIST_IMAGE} cant upload" in list_of_sheet_title:
        # The marker row below is written by the upload job to signal that
        # everything uploaded successfully (runtime string — do not edit).
        if get_df_from_speadsheet(gsheet_id, f"{SheetNames.ARTIST_IMAGE} cant upload").values.tolist() == [
                ['Upload thành công 100% nhé các em ^ - ^']]:
            pass
        else:
            sheet_name = f"{SheetNames.ARTIST_IMAGE} cant upload"
            self.ARTIST_IMAGE = {"sheet_name": f"{sheet_name}",
                                 "column_name": ["uuid", "memo", "url_to_add"],
                                 "object_type": ObjectType.ARTIST}
    elif "image" in list_of_sheet_title:
        sheet_name = "image"
        self.ARTIST_IMAGE = {"sheet_name": f"{sheet_name}",
                             "column_name": ["uuid", "memo", "url_to_add"],
                             "object_type": ObjectType.ARTIST}
    elif "Artist_image" in list_of_sheet_title:
        sheet_name = "Artist_image"
        self.ARTIST_IMAGE = {"sheet_name": f"{sheet_name}",
                             "column_name": ["uuid", "memo", "url_to_add"],
                             "object_type": ObjectType.ARTIST}
    else:
        pass
    if "Album_image" in sheet_names:
        self.ALBUM_IMAGE = {"sheet_name": "Album_image",
                            "column_name": ["uuid", "memo", "url_to_add"],
                            "object_type": ObjectType.ALBUM}
    if "Artist_wiki" in sheet_names:
        self.ARTIST_WIKI = {"sheet_name": "Artist_wiki",
                            "column_name": ["uuid", "memo", "url_to_add", "content_to_add"],
                            "table_name": "artists"}
    if "Album_wiki" in sheet_names:
        self.ALBUM_WIKI = {"sheet_name": "Album_wiki",
                           "column_name": ["uuid", "memo", "url_to_add", "content_to_add"],
                           "table_name": "albums"}
    if "Track_wiki" in sheet_names:
        self.TRACK_WIKI = {"sheet_name": "Track_wiki",
                           "column_name": ["id", "memo", "url_to_add", "content_to_add"],
                           "table_name": "tracks"}
    if "S_11" in sheet_names:
        self.S_11 = {"sheet_name": "S_11",
                     "column_name": ["release_date", "album_title", "album_artist",
                                     "itune_album_url", "sportify_album_url"]}
    if "Youtube collect_experiment" in sheet_names:
        self.C_11 = {"sheet_name": "Youtube collect_experiment",
                     "column_name": ["pre_valid", "p.i.c", "itune_album_url", "official_music_video_2",
                                     "artist_name", "year", "live_concert_name_place", "track_title/track_num",
                                     "contribution_link", "content type", "pointlogsid", "itune_id", "region",
                                     "checking_validate_itune", "06_id", "06_status", "e5_id", "e5_status",
                                     "track_title", "track_id", "similarity", "d9", "d9_status"]}
track_title) get_youtube_info['similarity'] = token_set_ratio youtube_info = get_youtube_info return youtube_info if __name__ == "__main__": # https://docs.google.com/spreadsheets/d/1aRhZ7NQAfhud3jjR5aboCZ3Ew8u2Y0SqGqUQYwcUnBs/edit#gid=98817891 start_time = time.time() pd.set_option("display.max_rows", None, "display.max_columns", 50, 'display.width', 1000) # https://docs.google.com/spreadsheets/d/1eO8J2qqjxgRVnc3b1EWGskVHYc1baUAmDzdqT6hIdRg/edit#gid=926860952 gsheet_id = '1eO8J2qqjxgRVnc3b1EWGskVHYc1baUAmDzdqT6hIdRg' sheet_name = 'mp_3_3' df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name) df["DurationMs"].replace({"": "0"}, inplace=True) df = df.loc[8148:9000] row_index = df.index start = row_index.n_estimators_start stop = row_index.n_estimators_stop step = 25 for i in range(start, stop, step): x = i + step if x <= stop: stop_range = x else: stop_range = stop f = [] for j in range(i, stop_range): track_title = df.track_title.loc[j]
import time

import pandas as pd

from google_spreadsheet_api.function import get_df_from_speadsheet, get_gsheet_name, update_value
from data_process.crawlingtask import crawl_itunes_album
from core import query_path

if __name__ == "__main__":
    # Ad-hoc driver: fetch the MP_3 sheet, print the first 10 rows, and
    # report wall-clock time.
    start_time = time.time()
    # Show every row and up to 50 columns when printing frames.
    pd.set_option("display.max_rows", None, "display.max_columns", 50, 'display.width', 1000)
    # Previous one-off job, kept for reference: generated a crawl task per
    # itunes id/region row and appended each to the query file.
    # with open(query_path, "a+") as f:
    #     df = get_df_from_speadsheet(gsheet_id="1HUnal5ZfTngeSlKVCLH0kTnz0ZMa_RwvDv5uTqJSM6Q", sheet_name="joy")
    #     gsheet_name = get_gsheet_name(gsheet_id="1HUnal5ZfTngeSlKVCLH0kTnz0ZMa_RwvDv5uTqJSM6Q")
    #     sheet_name = "joy"
    #     pic = f"{gsheet_name}_{sheet_name}"
    #     row_index = df.index
    #     for i in row_index:
    #         itune_id = df.id.loc[i]
    #         region = df["region"].loc[i]
    #         crawling_task = crawl_itunes_album(ituneid=itune_id, pic=pic, region=region)
    #         print(crawling_task)
    #         f.write(crawling_task)
    df = get_df_from_speadsheet(
        gsheet_id="1J0tfInOX5VFnC0QM2CVnPb2i4YblF1EVFoZAOtEslHo", sheet_name="MP_3")
    print(df.head(10))
    print("--- %s seconds ---" % (time.time() - start_time))