def update_c11_check_box(original_df: object, pre_valid: str):
    original_df['url'] = original_df.apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='url')
        if x['pre_valid'] == pre_valid else 'None',
        axis=1)
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid) else x['itune_id'],
        axis=1)
    original_df['region'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[1]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid) else x['region'],
        axis=1)

    original_df['checking_validate_itune'] = original_df.apply(
        lambda x: check_validate_itune(itune_album_id=x['itune_id'], itune_region=x['region'])
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['checking_validate_itune'],
        axis=1)

    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='sheet_name')
    url = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='url')
    grid_range_to_update = f"{sheet_name}!AJ2"
    # Convert the DataFrame to a 2D list for the Sheets update call.
    list_result = original_df[['itune_id', 'region', 'checking_validate_itune']].values.tolist()
    update_value(list_result=list_result, grid_range_to_update=grid_range_to_update,
                 gsheet_id=get_gsheet_id_from_url(url=url))
Example #2
def check_validate(original_df: object):
    # Takes the DataFrame as a parameter; the original referenced a
    # module-level `original_df` that was undefined in this scope.
    original_df['itune_id'] = original_df['Itunes_Album_URL'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[0])
    original_df['region'] = original_df['Itunes_Album_URL'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[1])
    original_df['checking_validate_itune'] = original_df['itune_id'].apply(
        lambda x: check_validate_itune(x))
    original_df['token_set_ratio'] = original_df.apply(
        lambda x: get_max_ratio(itune_album_id=x['itune_id'],
                                input_album_title=x.AlbumTitle),
        axis=1)
    print(original_df)
Example #3
    def c11_filter(self):
        df = self.original_file
        if c11_checkbox(original_df=df, pre_valid=self.pre_valid):
            filter_df = df[
                (df['pre_valid'] == self.pre_valid)
                & (df['itune_album_url'] != '')
                & (~df['content type'].str.contains('REJECT'))].reset_index()
            filter_df['itune_id'] = filter_df['itune_album_url'].apply(
                lambda x: get_itune_id_region_from_itune_url(url=x)[0])
            filter_df['region'] = filter_df['itune_album_url'].apply(
                lambda x: get_itune_id_region_from_itune_url(url=x)[1])
            filter_df = filter_df.drop_duplicates(
                subset=['itune_id', 'gsheet_info'],
                keep='first').reset_index()
            return filter_df
Example #4
    def s11_filter(self):
        df = self.original_file
        # Run the checkbox validation once and only filter when it passes;
        # the original called s11_checkbox a second, redundant time.
        if s11_checkbox(df=df):
            filter_df = df[(df['itune_album_url'] != 'not found')
                           & (df['itune_album_url'] != '')].drop_duplicates(
                               subset=['itune_album_url', 'gsheet_info'],
                               keep='first').reset_index()

            filter_df['itune_id'] = filter_df['itune_album_url'].apply(
                lambda x: get_itune_id_region_from_itune_url(url=x)[0])
            filter_df['region'] = filter_df['itune_album_url'].apply(
                lambda x: get_itune_id_region_from_itune_url(url=x)[1])
            return filter_df
def check_box_S_11_validate(gsheet_id: str):
    '''
    S_11 = {"sheet_name": "S_11",
            "column_name": ["release_date", "album_title", "album_artist", "itune_album_url", "sportify_album_url"]}
    '''

    sheet_info = sheet_type.S_11
    sheet_name = sheet_info.get('sheet_name')
    column_name = sheet_info.get('column_name')
    S_11_df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    # Normalize the sheet's header names to the snake_case names used below.
    S_11_df = S_11_df.rename(columns={
        'Release_date': 'release_date',
        'AlbumTitle': 'album_title',
        'AlbumArtist': 'album_artist',
        'Itunes_Album_URL': 'itune_album_url',
        'AlbumURL': 'sportify_album_url',
    })
    S_11_df = S_11_df[column_name].head(10)

    # Step 2: validate the URL format

    check_format_album_wiki = S_11_df[~((S_11_df['itune_album_url'] == 'not found')
                                        | (S_11_df['itune_album_url'].str[:32] == 'https://music.apple.com/us/album'))]
    S_11_format_validate = check_format_album_wiki.album_title.str.upper().to_numpy().tolist()
    if S_11_format_validate:
        print(check_format_album_wiki)
        return S_11_format_validate
    # Step 3: validate the iTunes URLs
    else:
        S_11_df['itune_id'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x != 'not found' else 'None')
        S_11_df['region'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[1] if x != 'not found' else 'None')
        S_11_df['checking_validate_itune'] = S_11_df['itune_id'].apply(
            lambda x: check_validate_itune(x) if x != 'None' else 'None')
        S_11_df['token_set_ratio'] = S_11_df.apply(
            lambda x: get_max_ratio(itune_album_id=x['itune_id'], input_album_title=x['album_title'])
            if x['itune_id'] != 'None' else 'None',
            axis=1)

        # Step 4: update values
        column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
        updated_df = S_11_df[column_name]

        # Convert the DataFrame to a 2D list and prepend the header row.
        list_result = updated_df.values.tolist()
        list_result.insert(0, column_name)  # plain list.insert; lists have no insert_column method
        range_to_update = f"{sheet_name}!M1"
        update_value(list_result, range_to_update,
                     gsheet_id)  # uploaded values may be object, int, category... NOT datetime
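The trailing comment warns that uploaded values may be object, int, or category but not datetime, likely because the Sheets values payload is JSON and raw Timestamp objects are not JSON-serializable. A small, hedged guard one could run before building list_result (the helper name is hypothetical):

import pandas as pd

def stringify_datetimes(df: pd.DataFrame) -> pd.DataFrame:
    # Render datetime columns as ISO date strings so .values.tolist()
    # yields JSON-serializable rows for the Sheets update call.
    out = df.copy()
    for col in out.select_dtypes(include=['datetime64[ns]']).columns:
        out[col] = out[col].dt.strftime('%Y-%m-%d')
    return out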
def update_s11_check_box(df: object):
    gsheet_infos = list(set(df.gsheet_info.tolist()))
    df['url'] = df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    df['itune_id'] = df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x != 'not found' else 'None')
    df['region'] = df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[1] if x != 'not found' else 'None')
    df['checking_validate_itune'] = df['itune_id'].apply(
        lambda x: check_validate_itune(x) if x != 'None' else 'None')
    df['token_set_ratio'] = df.apply(
        lambda x: get_max_ratio(itune_album_id=x['itune_id'],
                                input_album_title=x['album_title']) if x['itune_id'] != 'None' else 'None',
        axis=1)
    # Update data
    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='url')
        df_to_upload = df[df['url'] == url].reset_index()
        # print(df_to_upload)
        column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
        updated_df = df_to_upload[column_name]
        update_value_at_last_column(df_to_update=updated_df, gsheet_id=get_gsheet_id_from_url(url),
                                    sheet_name=SheetNames.S_11)
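Since df already carries a url column at this point, the per-URL upload loop above can be expressed equivalently with groupby; a sketch of that variant, assuming the same helpers as the snippet:

column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
for url, group in df.groupby('url'):
    update_value_at_last_column(
        df_to_update=group[column_name].reset_index(drop=True),
        gsheet_id=get_gsheet_id_from_url(url),
        sheet_name=SheetNames.S_11)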
def c11_checkbox(original_df: object, pre_valid: str = None):
    df = original_df[original_df['pre_valid'] == pre_valid].reset_index()
    df['itune_id'] = df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x not in (
            'None', '', 'not found', 'non', 'nan', 'Itunes_Album_Link') else 'None')
    df['url'] = df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    # A row is valid if it is a non-REJECT row with an Apple Music URL,
    # or a REJECT row with an empty URL; everything else fails the check.
    check_format_s11 = df[~(
        ((~df['content type'].str.contains('REJECT'))
         & (df['itune_album_url'].str[:24] == 'https://music.apple.com/'))
        | ((df['itune_album_url'] == '')
           & (df['content type'].str.contains('REJECT'))))]
    if check_format_s11.empty:
        print(Fore.LIGHTYELLOW_EX + "Pass check box" + Style.RESET_ALL)
        return True
    else:
        print(Fore.LIGHTYELLOW_EX + "Not pass check box" + Style.RESET_ALL)
        print(check_format_s11.head(10))
        return False
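A self-contained toy run of just that acceptance mask, using stand-in rows:

import pandas as pd

df = pd.DataFrame({
    'content type': ['AUDIO', 'REJECT', 'AUDIO'],
    'itune_album_url': ['https://music.apple.com/us/album/x/1', '', 'not found'],
})
ok = (
    ((~df['content type'].str.contains('REJECT'))
     & (df['itune_album_url'].str[:24] == 'https://music.apple.com/'))
    | ((df['itune_album_url'] == '')
       & (df['content type'].str.contains('REJECT')))
)
print(df[~ok])  # only the 'not found' row fails the check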
def checking_s11_crawler_status(df: object):
    original_df = df.copy()
    original_df['itune_id'] = original_df['itune_album_url'].apply(
        lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x not in (
            'None', '', 'not found', 'non', 'nan', 'Itunes_Album_Link') else 'None')
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))

    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                     key='gsheet_name')
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                    key='sheet_name')
        PIC_taskdetail = f"{gsheet_name}_{sheet_name}"
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        original_df_split = original_df[original_df['url'] == url].reset_index()
        count = 0
        while count < 300:  # poll up to 300 times, 10 s apart
            checking_accuracy_result = get_df_from_query(
                get_s11_crawlingtask_info(pic=PIC_taskdetail))
            checking_accuracy_result['itune_album_id'] = (
                checking_accuracy_result['itune_album_id'].apply(
                    lambda x: x.strip('"')))

            # Rows whose 06/e5 crawls are not both complete yet.
            result = checking_accuracy_result[
                ~((checking_accuracy_result['06_status'] == 'complete')
                  & (checking_accuracy_result['e5_status'] == 'complete')) |
                (checking_accuracy_result['06_status'] == 'incomplete') |
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['e5_status'] == 'incomplete'))]

            if result.empty:
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} has been fully crawled"
                    + Style.RESET_ALL)
                data_merge = pd.merge(original_df_split,
                                      checking_accuracy_result,
                                      how='left',
                                      left_on='itune_id',
                                      right_on='itune_album_id',
                                      validate='m:1').fillna(value='None')
                print(data_merge)
                # update data to gsheet

                data_updated = data_merge[checking_accuracy_result.columns]
                update_value_at_last_column(
                    df_to_update=data_updated,
                    gsheet_id=get_gsheet_id_from_url(url=url),
                    sheet_name=sheet_name)

                # update data report:
                data_report = data_merge[~(
                    ((data_merge['itune_album_url'].isin(['not found', '']))
                     & (data_merge['06_status'] == 'None')
                     & (data_merge['e5_status'] == 'None')) |
                    ((~data_merge['itune_album_url'].isin(['not found', '']))
                     & (data_merge['06_status'] == 'complete')
                     & (data_merge['e5_status'] == 'complete')))]
                if data_report.empty:
                    print(Fore.LIGHTYELLOW_EX + "Accuracy: ok\nStatus: ok" +
                          Style.RESET_ALL)
                else:
                    print(Fore.LIGHTYELLOW_EX +
                          "Accuracy: not ok\nStatus: not ok" +
                          Style.RESET_ALL)
                    columns_data_report = ['itune_id'] + list(
                        checking_accuracy_result.columns)
                    data_report = data_report[columns_data_report]
                    print(data_report)

                break
            else:
                count += 1
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} hasn't been fully crawled yet"
                    + Style.RESET_ALL)
                time.sleep(10)
                print(count, "-----", result)
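Both crawler-status checkers implement the same polling skeleton: query the task table, keep the rows that are not yet complete, and retry every 10 seconds for at most 300 attempts. Pulled out for clarity (fetch_incomplete is a hypothetical stand-in for the query-and-filter step; it should return an empty DataFrame once crawling is done):

import time

def wait_until_crawled(fetch_incomplete, max_attempts=300, delay_s=10):
    # Returns True once no incomplete rows remain, False if attempts run out.
    for attempt in range(1, max_attempts + 1):
        if fetch_incomplete().empty:
            return True
        print(attempt, "----- still crawling")
        time.sleep(delay_s)
    return False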
def checking_c11_crawler_status(original_df: object, pre_valid: str = None):
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if x['itune_album_url'] not in (
            'None', '', 'not found', 'non', 'nan', 'Itunes_Album_Link')
        else x['itune_id'],
        axis=1)
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                     key='gsheet_name')
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                    key='sheet_name')
        PIC_taskdetail = f"{gsheet_name}_{sheet_name}_{pre_valid}"
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        original_df_split = original_df[original_df['url'] == url].reset_index()
        count = 0
        while count < 300:  # poll up to 300 times, 10 s apart
            checking_accuracy_result = get_df_from_query(
                get_s11_crawlingtask_info(pic=PIC_taskdetail))
            checking_accuracy_result['itune_album_id'] = (
                checking_accuracy_result['itune_album_id'].apply(
                    lambda x: x.strip('"')))
            result = checking_accuracy_result[~(
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['E5_status'] == 'complete')) |
                (checking_accuracy_result['06_status'] == 'incomplete') |
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['E5_status'] == 'incomplete')))]
            if result.empty:
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} has been fully crawled"
                    + Style.RESET_ALL)

                data_merge = pd.merge(original_df_split,
                                      checking_accuracy_result,
                                      how='left',
                                      left_on='itune_id',
                                      right_on='itune_album_id',
                                      validate='m:1').fillna(value='None')
                data_merge['06_id_x'] = data_merge.apply(
                    lambda x: x['06_id_y']
                    if x['pre_valid'] == pre_valid else x['06_id_x'],
                    axis=1)
                data_merge['06_status_x'] = data_merge.apply(
                    lambda x: x['06_status_y']
                    if x['pre_valid'] == pre_valid else x['06_status_x'],
                    axis=1)
                data_merge['e5_id'] = data_merge.apply(
                    lambda x: x['E5_id']
                    if x['pre_valid'] == pre_valid else x['e5_id'],
                    axis=1)
                data_merge['e5_status'] = data_merge.apply(
                    lambda x: x['E5_status']
                    if x['pre_valid'] == pre_valid else x['e5_status'],
                    axis=1)
                data_merge.columns = data_merge.columns.str.replace(
                    '06_id_x', '06_id')
                data_merge.columns = data_merge.columns.str.replace(
                    '06_status_x', '06_status')
                data_merge = data_merge[original_df_split.columns]

                # update data report:
                data_report = data_merge[data_merge['pre_valid'] == pre_valid]

                data_report = data_report[~(
                    ((data_report['itune_album_url'].isin(['not found', '']))
                     & (data_report['06_status'] == 'None')
                     & (data_report['e5_status'] == 'None'))
                    |
                    ((~data_report['itune_album_url'].isin(['not found', '']))
                     & (data_report['06_status'] == 'complete')
                     & (data_report['e5_status'] == 'complete')))]
                if data_report.empty:
                    print(Fore.LIGHTYELLOW_EX + "Accuracy: ok\nStatus: ok" +
                          Style.RESET_ALL)
                    row_num = data_merge.index
                    for i in row_num:
                        if data_merge['pre_valid'].loc[i] == pre_valid:
                            itune_album_id = data_merge['itune_id'].loc[i]
                            seq = data_merge['track_title/track_num'].loc[i]
                            format_id = get_format_id_from_content_type(
                                content_type=data_merge['content type'].loc[i])
                            youtube_url = data_merge['contribution_link'].loc[i]
                            db_track = get_track_title_track_artist_by_ituneid_and_seq(
                                itune_album_id=itune_album_id, seq=seq)
                            if db_track:
                                track_title = db_track.title
                                track_id = db_track.id
                                track_duration = db_track.duration_ms
                                track_similarity = similarity(
                                    track_title=track_title,
                                    youtube_url=youtube_url,
                                    formatid=format_id,
                                    duration=track_duration).get('similarity')
                            else:
                                track_title = 'not found'
                                track_id = 'not found'
                                track_similarity = 'not found'
                            data_merge.loc[i, 'track_title'] = track_title
                            data_merge.loc[i, 'track_id'] = track_id
                            data_merge.loc[i, 'similarity'] = track_similarity
                    updated_columns = [
                        '06_id', '06_status', 'e5_id', 'e5_status',
                        'track_title', 'track_id', 'similarity'
                    ]
                    print(data_merge[updated_columns])
                else:
                    print(Fore.LIGHTYELLOW_EX +
                          "Accuracy: not ok\nStatus: not ok" +
                          Style.RESET_ALL)
                    updated_columns = [
                        '06_id', '06_status', 'e5_id', 'e5_status'
                    ]
                # update data to gsheet
                data_updated = data_merge[updated_columns]
                grid_range_to_update = f"{sheet_name}!AM2"
                # Convert the DataFrame to a 2D list for the Sheets update call.
                list_result = data_updated.values.tolist()
                update_value(list_result=list_result,
                             grid_range_to_update=grid_range_to_update,
                             gsheet_id=get_gsheet_id_from_url(url=url))
                break
            else:
                count += 1
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} hasn't been fully crawled yet"
                    + Style.RESET_ALL)
                time.sleep(10)
                print(count, "-----", result)