Code Example #1
 def unique_users(self, result_loc_, date_, state_):
     """
     Query druid for unique users by district over a month for a state
     :param result_loc_: pathlib.Path object to store resultant CSV
     :param date_: datetime object to pass for query and path
     :param state_: the state to be used in query
     :return: None
     """
     slug_ = result_loc_.name
     year = date_.year
     month = date_.month
     if month != 1:
         start_date = datetime(year, month - 1, 1)
     else:
         start_date = datetime(year - 1, 12, 1)
     query = Template(district_devices_monthly.init())
     query = query.substitute(
         app=self.config['context']['pdata']['id']['app'],
         portal=self.config['context']['pdata']['id']['portal'],
         state=state_,
         start_date=start_date.strftime('%Y-%m-%dT00:00:00+00:00'),
         end_date=date_.strftime('%Y-%m-%dT00:00:00+00:00'))
     url = "{}druid/v2/".format(self.druid_hostname)
     headers = {'Content-Type': "application/json"}
     response = requests.request("POST", url, data=query, headers=headers)
     if response.status_code == 200:
         if len(response.json()) == 0:
             return
         data = []
         for record in response.json():
             data.append(record['event'])
         df = pd.DataFrame(data).fillna('Unknown')
         df.to_csv(result_loc_.parent.joinpath(
             date_.strftime("%Y-%m-%d"), "{}_monthly.csv".format(slug_)),
                   index=False)
         post_data_to_blob(result_loc_.parent.joinpath(
             date_.strftime("%Y-%m-%d"), "{}_monthly.csv".format(slug_)),
                           backup=True)
         df['District'] = df.get('District',
                                 pd.Series(index=df.index, name='District'))
         df['Unique Devices'] = df['Unique Devices'].astype(int)
         df = verify_state_district(
             result_loc_.parent.joinpath(date_.strftime('%Y-%m-%d')),
             state_, df)
         df = df[['District', 'Unique Devices']]
         df = df.groupby('District').sum().reset_index()
         df.to_csv(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"),
             index=False)
         create_json(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"))
         post_data_to_blob(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"))
     else:
         with open(result_loc_.parent.joinpath('error_log.log'), 'a') as f:
             f.write(state_ + ' summary ' + str(response.status_code) +
                     ' ' + response.text)
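
A minimal, standalone sketch of the query pattern used above: substitute values into a string.Template body and POST it to the Druid broker. The template text, field names, and endpoint here are illustrative placeholders, not the actual district_devices_monthly template.

from datetime import datetime
from string import Template

import requests

# Hypothetical stand-in for the district_devices_monthly template used above.
QUERY_TEMPLATE = Template(
    '{"queryType": "groupBy", "dataSource": "telemetry-rollup", '
    '"filter": {"type": "selector", "dimension": "state", "value": "$state"}, '
    '"intervals": ["$start_date/$end_date"], "granularity": "all", '
    '"dimensions": ["district"]}')


def post_druid_query(druid_host, state, start_date, end_date):
    """Substitute values into the templated JSON body, POST it to the Druid broker
    and return the parsed rows (raises for non-200 responses)."""
    query = QUERY_TEMPLATE.substitute(
        state=state,
        start_date=start_date.strftime('%Y-%m-%dT00:00:00+00:00'),
        end_date=end_date.strftime('%Y-%m-%dT00:00:00+00:00'))
    response = requests.post("{}druid/v2/".format(druid_host), data=query,
                             headers={'Content-Type': 'application/json'})
    response.raise_for_status()
    return response.json()


# Example call (assumes a reachable broker):
# rows = post_druid_query("http://localhost:8082/", "Kerala",
#                         datetime(2020, 1, 1), datetime(2020, 2, 1))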
Code Example #2
    def generate_reports(self, from_time, to_time):
        ecg_data = self.get_monitoring_data(from_time, to_time)

        findspark.init()
        spark = SparkSession.builder.appName("ECGLearning").master(
            "local[*]").getOrCreate()
        spark.conf.set('spark.sql.session.timeZone', 'Asia/Kolkata')
        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)

        # Create data frame
        ecg_data_rdd = spark.sparkContext.parallelize(ecg_data)
        schema = StructType([
            StructField('time', IntegerType(), True),
            StructField('tps', StringType(), True)
        ])
        tps_df = spark.createDataFrame(ecg_data_rdd, schema)
        tps_df = tps_df.withColumn("tps", tps_df["tps"].cast("float"))
        tps_df = tps_df.withColumn("tps", F.ceil(tps_df["tps"]))
        tps_df = tps_df.withColumn(
            "time", F.from_unixtime(tps_df["time"], "yyyy/MM/dd HH:mm:ss"))

        # Downloading the current file from blob container
        get_data_from_blob(
            Path(self.write_path).joinpath('public', self.csv_file_name))
        current_blob_df = spark.read.csv(os.path.join(self.write_path,
                                                      'public',
                                                      self.csv_file_name),
                                         header=True)
        current_blob_df = current_blob_df.withColumn(
            "tps", current_blob_df["tps"].cast("int"))
        current_blob_df = current_blob_df.union(tps_df)
        current_blob_df = current_blob_df.dropDuplicates(["time"])
        current_blob_df = current_blob_df.sort("time")

        # Keep a rolling 7-day window by dropping the oldest day's data
        current_blob_df = self.remove_last_day(current_blob_df)

        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)
        current_blob_df.toPandas().to_csv(os.path.join(self.write_path,
                                                       'public',
                                                       self.csv_file_name),
                                          index=False)
        create_json(
            os.path.join(self.write_path, 'public', self.csv_file_name), True)

        # Uploading updated data to Azure blob container
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.csv_file_name))
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.json_file_name))

        spark.stop()
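
A self-contained sketch of the Spark steps above, using two hard-coded (epoch seconds, tps) rows in place of the get_monitoring_data output; it assumes a local Spark installation.

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName("ECGLearningSketch").master("local[*]").getOrCreate()

# Two sample (epoch seconds, tps-as-string) rows standing in for the monitoring data.
ecg_data = [(1577836800, "3.2"), (1577836860, "4.7")]
schema = StructType([
    StructField('time', IntegerType(), True),
    StructField('tps', StringType(), True),
])
tps_df = spark.createDataFrame(spark.sparkContext.parallelize(ecg_data), schema)

# Same column treatment as above: cast tps to float, round it up, format the epoch time.
tps_df = tps_df.withColumn("tps", F.ceil(tps_df["tps"].cast("float")))
tps_df = tps_df.withColumn("time", F.from_unixtime(tps_df["time"], "yyyy/MM/dd HH:mm:ss"))
tps_df.show(truncate=False)
spark.stop()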
Code Example #3
 def combine_creation_reports(self, result_loc_, date_):
     """
     Combine weekly and overall numbers.
     :param result_loc_: pathlib.Path object to store the resultant csv at.
     :param date_: datetime object to use in query as well as path
     :return: None
     """
     tenant_name_mapping = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'tenant_info.csv'))
     _start_date = date_ - timedelta(days=7)
     week = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'week.csv'))
     overall = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'overall.csv'))
     while True:
         try:
             week['total'] = week['draft'] + week['live'] + week['review']
             break
         except KeyError as ke:
             week[ke.args[0]] = 0
     while True:
         try:
             overall['total'] = overall['draft'] + overall['live'] + overall['review']
             break
         except KeyError as ke:
             overall[ke.args[0]] = 0
     week = week.set_index('tenant')
     overall = overall.set_index('tenant')
     week_transpose = week.transpose()
     overall_transpose = overall.transpose()
     for ind_, row_ in tenant_name_mapping.iterrows():
         try:
             week_numbers = week_transpose[row_['id']]
             overall_numbers = overall_transpose[row_['id']]
             final_df = pd.concat([week_numbers, overall_numbers], axis=1)
             final_df.index.name = 'Content Status'
             final_df.columns = ['Week starting {}'.format(_start_date.strftime('%d %B')),
                                 'As on {}'.format(date_.strftime('%d %B'))]
             final_df.to_csv(
                 result_loc_.joinpath(date_.strftime('%Y-%m-%d'), '{}_Content_Status.csv'.format(row_['slug'])))
             final_df.index.name = 'Status'
             final_df.columns = ['Status over last week: starting {}'.format(_start_date.strftime('%d %B')),
                                 'Status from the beginning']
             result_loc_.parent.joinpath('portal_dashboards', row_['slug']).mkdir(exist_ok=True)
             final_df.to_csv(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
             create_json(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
             post_data_to_blob(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
         except KeyError as ke:
             print(row_['id'], ke)
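
A small pandas sketch of the backfill-and-combine pattern above: missing draft/live/review columns are zero-filled before totalling, then the weekly and overall transposes are joined per tenant. The toy frames and the tenant id 't1' are invented for illustration.

import pandas as pd


def ensure_status_columns(df):
    """Zero-fill any missing draft/live/review column before totalling,
    mirroring the KeyError loops in the method above."""
    for col in ('draft', 'live', 'review'):
        if col not in df.columns:
            df[col] = 0
    df['total'] = df['draft'] + df['live'] + df['review']
    return df


week = ensure_status_columns(pd.DataFrame({'tenant': ['t1'], 'draft': [3], 'live': [5]}))
overall = ensure_status_columns(pd.DataFrame({'tenant': ['t1'], 'draft': [10], 'live': [40], 'review': [2]}))

# Join the weekly and overall numbers for one tenant, as the report above does per slug.
final_df = pd.concat([week.set_index('tenant').transpose()['t1'],
                      overall.set_index('tenant').transpose()['t1']], axis=1)
final_df.columns = ['This week', 'Overall']
print(final_df)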
Code Example #4
    def init(self):
        start_time_sec = int(round(time.time()))
        print("START:CMO Dashboard")
        data_store_location = self.data_store_location.joinpath('portal_dashboards')
        data_store_location.mkdir(exist_ok=True)
        analysis_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        data_store_location.joinpath('public').mkdir(exist_ok=True)
        get_data_from_blob(data_store_location.joinpath('overall', 'daily_metrics.csv'))
        self.data_wrangling(result_loc_=data_store_location.joinpath('overall', 'daily_metrics.csv'), date_=analysis_date)
        create_json(data_store_location.joinpath('public', 'cmo_dashboard.csv'), last_update=True)
        post_data_to_blob(data_store_location.joinpath('public', 'cmo_dashboard.csv'))
        get_tenant_info(result_loc_=data_store_location.parent.joinpath('textbook_reports'), org_search_=self.org_search,
                        date_=analysis_date)
        board_slug = pd.read_csv(
            data_store_location.parent.joinpath('textbook_reports', analysis_date.strftime('%Y-%m-%d'), 'tenant_info.csv'))
        slug_list = board_slug['slug'].unique().tolist()
        for slug in slug_list:
            try:
                get_data_from_blob(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'))
                self.data_wrangling(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'), date_=analysis_date)
                create_json(read_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'), last_update=True)
                post_data_to_blob(result_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'))
            except Exception:
                pass
        print("END:CMO Dashboard")

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [
            {
                "metric": "timeTakenSecs",
                "value": time_taken
            },
            {
                "metric": "date",
                "value": analysis_date.strftime("%Y-%m-%d")
            }
        ]
        push_metric_event(metrics, "CMO Dashboard")
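
A stripped-down sketch of the timing and metric-payload pattern wrapped around the dashboard run above; the pipeline's push_metric_event receives this payload, so the sketch only prints it.

import time
from datetime import datetime

start_time_sec = int(round(time.time()))
# ... the dashboard generation steps above would run here ...
end_time_sec = int(round(time.time()))

metrics = [
    {"metric": "timeTakenSecs", "value": end_time_sec - start_time_sec},
    {"metric": "date", "value": datetime.now().strftime("%Y-%m-%d")},
]
print(metrics)  # the pipeline would call push_metric_event(metrics, "CMO Dashboard") instead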
Code Example #5
 def dce_aggregates(self, result_loc_, slug, df):
     """
     generate charts from DCE textbook data.
     :param result_loc_: pathlib.Path object with path to store resultant CSVs.
     :param slug: slug name for channel
     :param df: DCE textbook dataframe for the channel
     :return: None
     """
     qr_linked = df[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum()
     qr_linked.index = ['QR Code With Content', 'QR Code Without Content']
     qr_linked = pd.DataFrame(qr_linked).reset_index()
     qr_linked.columns = ['Status', 'Count']
     qr_linked.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                           'dce_qr_content_status.csv'),
                      index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status.csv'))
     qr_linked_by_grade = df.groupby('Grade')[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linked_by_grade.columns = [
         'Grade', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linked_by_grade = self.grade_fix(qr_linked_by_grade)
     qr_linked_by_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'dce_qr_content_status_grade.csv'),
                               index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_grade.csv'))
     qr_linked_by_subject = df.groupby('Subject')[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linked_by_subject.columns = [
         'Subject', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linked_by_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'dce_qr_content_status_subject.csv'),
                                 index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_subject.csv'))
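
A toy-data sketch of the two roll-ups computed above: overall linked/unlinked QR counts, and the same counts grouped by grade. The column names match the DCE report; the rows are invented.

import pandas as pd

df = pd.DataFrame({
    'Grade': ['Class 1', 'Class 1', 'Class 2'],
    'Number of QR codes with atleast 1 linked content': [4, 2, 7],
    'Number of QR codes with no linked content': [1, 3, 0],
})

# Overall linked vs unlinked counts.
qr_linked = df[['Number of QR codes with atleast 1 linked content',
                'Number of QR codes with no linked content']].sum()
qr_linked.index = ['QR Code With Content', 'QR Code Without Content']
qr_linked = pd.DataFrame(qr_linked).reset_index()
qr_linked.columns = ['Status', 'Count']

# The same counts split by grade.
qr_linked_by_grade = df.groupby('Grade')[['Number of QR codes with atleast 1 linked content',
                                          'Number of QR codes with no linked content']].sum().reset_index()
qr_linked_by_grade.columns = ['Grade', 'QR Codes with content', 'QR Codes without content']
print(qr_linked, qr_linked_by_grade, sep='\n\n')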
Code Example #6
 def merge_metrics(self, result_loc_, date_):
     """
     merge all the metrics
     :param result_loc_: pathlib.Path object to store resultant CSV at.
     :param date_: datetime object to be used in path
     :return: None
     """
     slug_ = result_loc_.name
     result_loc_.parent.parent.parent.joinpath("portal_dashboards").mkdir(
         exist_ok=True)
     last_sunday = datetime.strftime(date_ - timedelta(days=1), '%d/%m/%Y')
     try:
         devices_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_unique_devices.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         devices_df = pd.DataFrame([],
                                   columns=[
                                       'District', 'Platform',
                                       'Unique Devices'
                                   ]).set_index(['District', 'Platform'])
     try:
         plays_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_content_plays.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         plays_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of Content Plays'
                                 ]).set_index(['District', 'Platform'])
     try:
         scans_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_qr_scans.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         scans_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of QR Scans'
                                 ]).set_index(['District', 'Platform'])
     district_df = devices_df.join(scans_df, how='outer').join(
         plays_df, how='outer').reset_index().pivot(index='District',
                                                    columns='Platform')
     district_df = district_df.join(district_df.sum(level=0, axis=1))
     district_df.columns = [
         col[0] + ' on ' +
         col[1].split('.')[-1] if isinstance(col, tuple) else 'Total ' + col
         for col in district_df.columns
     ]
     district_df['Data as on Last day (Sunday) of the week'] = last_sunday
     district_df = district_df.reset_index()
     district_df.index = [
         pd.to_datetime(
             district_df['Data as on Last day (Sunday) of the week'],
             format='%d/%m/%Y'), district_df['District']
     ]
     for c in [
             'Unique Devices on portal', 'Unique Devices on app',
             'Total Unique Devices', 'Number of QR Scans on portal',
             'Number of QR Scans on app', 'Total Number of QR Scans',
             'Number of Content Plays on portal',
             'Number of Content Plays on app',
             'Total Number of Content Plays'
     ]:
         if c not in district_df.columns:
             district_df[c] = 0
     try:
         get_data_from_blob(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = pd.read_csv(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = blob_data[
             blob_data['Data as on Last day (Sunday) of the week'] !=
             last_sunday]
         blob_data.index = [
             pd.to_datetime(
                 blob_data['Data as on Last day (Sunday) of the week'],
                 format='%d/%m/%Y'), blob_data['District']
         ]
     except AzureMissingResourceHttpError:
         blob_data = pd.DataFrame()
     except FileNotFoundError:
         blob_data = pd.DataFrame()
     district_df = pd.concat([blob_data, district_df], sort=True)
     district_df = district_df.sort_index().drop_duplicates(
         subset=['Data as on Last day (Sunday) of the week', 'District'],
         keep='last').fillna(0)
     district_df = district_df[[
         'Data as on Last day (Sunday) of the week', 'District',
         'Unique Devices on app', 'Unique Devices on portal',
         'Total Unique Devices', 'Number of QR Scans on app',
         'Number of QR Scans on portal', 'Total Number of QR Scans',
         'Number of Content Plays on app',
         'Number of Content Plays on portal',
         'Total Number of Content Plays'
     ]]
     district_df.to_csv(result_loc_.parent.parent.parent.joinpath(
         "portal_dashboards", slug_, "aggregated_district_data.csv"),
                        index=False)
     create_json(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
     post_data_to_blob(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
Code Example #7
 def get_overall_report(result_loc_, druid_rollup_, date_, config):
     """
     Query Druid Rollup monthwise for content and platform level play session counts and time_spent.
     :param result_loc_: pathlib.Path() object to store resultant data at
     :param druid_rollup_: Druid broker ip and port for rollup data
     :param date_: execution date for report
     :param config: diksha configurables
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))
     content_model['channel'] = content_model['channel'].astype(str)
     # content_model['mimeType'] = content_model['mimeType'].apply(mime_type)
     content_model = content_model[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'contentType', 'identifier', 'name', 'creator', 'mimeType',
         'createdOn', 'lastPublishedOn', 'tb_id', 'tb_name',
         'me_totalRatings', 'me_averageRating'
     ]]
     content_model.set_index('identifier', inplace=True)
     result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
     start_date = datetime(2019, 6, 1)
     while start_date < date_:
         if datetime(start_date.year + int(start_date.month / 12),
                     (start_date.month % 12) + 1, 1) < date_:
             end_date = datetime(
                 start_date.year + int(start_date.month / 12),
                 (start_date.month % 12) + 1, 1)
         else:
             end_date = date_
         get_content_plays(result_loc_=result_loc_.joinpath(
             date_.strftime('%Y-%m-%d')),
                           start_date_=start_date,
                           end_date_=end_date,
                           druid_=druid_rollup_,
                           config_=config,
                           version_='v1')
         start_date = end_date
     spark = SparkSession.builder.appName('content_consumption').master(
         "local[*]").getOrCreate()
     content_plays = spark.read.csv(str(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_plays_*.csv')),
                                    header=True)
     content_plays = content_plays.groupby(
         fn.col('object_id'), fn.col('dimensions_pdata_id')).agg(
             fn.sum('Number of plays').alias('Number of plays'),
             fn.sum('Total time spent').alias(
                 'Total time spent')).toPandas()
     spark.stop()
     content_plays = content_plays.pivot(
         index='object_id',
         columns='dimensions_pdata_id',
         values=['Number of plays', 'Total time spent'])
     col_list = []
     for i in content_plays.columns:
         col_list.append(i[0] + ' on ' + i[1].split('.')[-1].title())
     content_plays.columns = col_list
     content_plays.fillna(0, inplace=True)
     content_plays['Total No of Plays (App and Portal)'] = content_plays[
         'Number of plays on App'] + content_plays[
             'Number of plays on Portal']
     content_plays['Average Play Time in mins on App'] = round(
         content_plays['Total time spent on App'] /
         (content_plays['Number of plays on App'] * 60), 2)
     content_plays['Average Play Time in mins on Portal'] = round(
         content_plays['Total time spent on Portal'] /
         (content_plays['Number of plays on Portal'] * 60), 2)
     content_plays['Average Play Time in mins (On App and Portal)'] = round(
         (content_plays['Total time spent on App'] +
          content_plays['Total time spent on Portal']) /
         ((content_plays['Number of plays on App'] +
           content_plays['Number of plays on Portal']) * 60), 2)
     content_plays.drop(
         ['Total time spent on App', 'Total time spent on Portal'],
         axis=1,
         inplace=True)
     overall = content_model.join(content_plays).reset_index()
     overall = overall[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'identifier', 'name', 'mimeType', 'createdOn', 'creator',
         'lastPublishedOn', 'tb_id', 'tb_name', 'me_averageRating',
         'me_totalRatings', 'Number of plays on App',
         'Number of plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]]
     overall.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Linked Textbook Id(s)',
         'Linked Textbook Name(s)', 'Average Rating(out of 5)',
         'Total No of Ratings', 'Number of Plays on App',
         'Number of Plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]
     overall['Content ID'] = overall['Content ID'].str.replace('.img', '')
     overall['Created On'] = overall['Created On'].fillna('T').apply(
         lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     overall['Last Published On'] = overall['Last Published On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     overall.fillna(
         {
             'Board': 'Unknown',
             'Medium': 'Unknown',
             'Grade': 'Unknown',
             'Subject': 'Unknown',
             'Creator (User Name)': '',
             'Linked Textbook Id(s)': '',
             'Linked Textbook Name(s)': '',
             'Number of Plays on App': 0,
             'Number of Plays on Portal': 0,
             'Total No of Plays (App and Portal)': 0,
             'Average Play Time in mins on App': 0,
             'Average Play Time in mins on Portal': 0,
             'Average Play Time in mins (On App and Portal)': 0
         },
         inplace=True)
     overall.sort_values(inplace=True,
                         ascending=[1, 1, 1, 1, 1, 0],
                         by=[
                             'channel', 'Board', 'Medium', 'Grade',
                             'Subject', 'Total No of Plays (App and Portal)'
                         ])
     for channel in overall.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
         except KeyError:
             continue
         content_aggregates = overall[overall['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_aggregated.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregated.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregated.csv'))
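
A standalone sketch of the month-window walk performed above before each Druid rollup query; the dates are examples.

from datetime import datetime


def month_windows(start, stop):
    """Yield (window_start, window_end) month boundaries, clipping the final window
    at stop, matching the loop above that issues one query per month."""
    current = start
    while current < stop:
        next_month = datetime(current.year + current.month // 12,
                              current.month % 12 + 1, 1)
        end = next_month if next_month < stop else stop
        yield current, end
        current = end


for window in month_windows(datetime(2019, 6, 1), datetime(2019, 9, 15)):
    print(window)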
Code Example #8
 def get_weekly_report(result_loc_, druid_rollup_, date_, config):
     """
     Query druid rollups for weekly content and platform level play session counts and time_spent.
     :param result_loc_: pathlib.Path() object to store resultant CSVs at
     :param druid_rollup_: druid broker ip and port for rollup data
     :param date_: execution date for report
     :param config: diksha configurables
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))
     content_model['channel'] = content_model['channel'].astype(str)
     # content_model['mimeType'] = content_model['mimeType'].apply(mime_type)
     content_model = content_model[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'contentType', 'identifier', 'name', 'creator', 'mimeType',
         'createdOn', 'lastPublishedOn', 'tb_id', 'tb_name',
         'me_totalRatings', 'me_averageRating'
     ]]
     content_model.set_index('identifier', inplace=True)
     result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
     start_date = date_ - timedelta(days=7)
     get_content_plays(result_loc_=result_loc_.joinpath(
         date_.strftime('%Y-%m-%d')),
                       start_date_=start_date,
                       end_date_=date_,
                       druid_=druid_rollup_,
                       config_=config,
                       version_='v2')
     content_plays = pd.read_csv(
         result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_plays_{}.csv'.format(date_.strftime('%Y-%m-%d'))))
     content_plays = content_plays.groupby([
         'object_id', 'dimensions_pdata_id'
     ])[['Number of plays', 'Total time spent']].sum().reset_index()
     content_plays = content_plays.pivot(
         index='object_id',
         columns='dimensions_pdata_id',
         values=['Number of plays', 'Total time spent'])
     col_list = []
     for i in content_plays.columns:
         col_list.append(i[0] + ' on ' + i[1].split('.')[-1].title())
     content_plays.columns = col_list
     content_plays.fillna(0, inplace=True)
     content_plays['Total No of Plays (App and Portal)'] = content_plays[
         'Number of plays on App'] + content_plays[
             'Number of plays on Portal']
     content_plays['Average Play Time in mins on App'] = round(
         content_plays['Total time spent on App'] /
         (content_plays['Number of plays on App'] * 60), 2)
     content_plays['Average Play Time in mins on Portal'] = round(
         content_plays['Total time spent on Portal'] /
         (content_plays['Number of plays on Portal'] * 60), 2)
     content_plays['Average Play Time in mins (On App and Portal)'] = round(
         (content_plays['Total time spent on App'] +
          content_plays['Total time spent on Portal']) /
         ((content_plays['Number of plays on App'] +
           content_plays['Number of plays on Portal']) * 60), 2)
     content_plays.drop(
         ['Total time spent on App', 'Total time spent on Portal'],
         axis=1,
         inplace=True)
     weekly = content_model.join(content_plays).reset_index()
     weekly = weekly[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'identifier', 'name', 'mimeType', 'createdOn', 'creator',
         'lastPublishedOn', 'tb_id', 'tb_name', 'me_averageRating',
         'me_totalRatings', 'Number of plays on App',
         'Number of plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]]
     weekly.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Linked Textbook Id(s)',
         'Linked Textbook Name(s)', 'Average Rating(out of 5)',
         'Total No of Ratings', 'Number of Plays on App',
         'Number of Plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]
     weekly['Content ID'] = weekly['Content ID'].str.replace('.img', '')
     weekly['Created On'] = weekly['Created On'].fillna('T').apply(
         lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     weekly['Last Published On'] = weekly['Last Published On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     weekly.fillna(
         {
             'Board': 'Unknown',
             'Medium': 'Unknown',
             'Grade': 'Unknown',
             'Subject': 'Unknown',
             'Creator (User Name)': '',
             'Linked Textbook Id(s)': '',
             'Linked Textbook Name(s)': '',
             'Number of Plays on App': 0,
             'Number of Plays on Portal': 0,
             'Total No of Plays (App and Portal)': 0,
             'Average Play Time in mins on App': 0,
             'Average Play Time in mins on Portal': 0,
             'Average Play Time in mins (On App and Portal)': 0
         },
         inplace=True)
     weekly.sort_values(inplace=True,
                        ascending=[1, 1, 1, 1, 1, 0],
                        by=[
                            'channel', 'Board', 'Medium', 'Grade',
                            'Subject', 'Total No of Plays (App and Portal)'
                        ])
     weekly['Last Date of the week'] = date_.strftime('%d-%m-%Y')
     for channel in weekly.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
         except KeyError:
             continue
         content_aggregates = weekly[weekly['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_consumption_lastweek.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath(
                 'portal_dashboards', slug,
                 'content_consumption_lastweek.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath(
                 'portal_dashboards', slug,
                 'content_consumption_lastweek.csv'))
         content_aggregates.to_csv(result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_consumption_lastweek_{}.csv'.format(slug)),
                                   index=False,
                                   encoding='utf-8-sig')
         post_data_to_blob(result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_consumption_lastweek_{}.csv'.format(slug)),
                           backup=True)
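
A toy-data sketch of the pivot and derived play-time metrics computed in both rollup reports above; 'App' and 'Portal' stand in for the pdata ids after .split('.')[-1].title(), and only the App average is shown.

import pandas as pd

plays = pd.DataFrame({
    'object_id': ['c1', 'c1', 'c2'],
    'dimensions_pdata_id': ['prod.diksha.app', 'prod.diksha.portal', 'prod.diksha.app'],
    'Number of plays': [10, 5, 2],
    'Total time spent': [1200.0, 300.0, 90.0],
})

# Pivot the per-platform rows into columns, then flatten the column index.
wide = plays.pivot(index='object_id', columns='dimensions_pdata_id',
                   values=['Number of plays', 'Total time spent'])
wide.columns = [metric + ' on ' + pdata.split('.')[-1].title() for metric, pdata in wide.columns]
wide = wide.fillna(0)

# Derived metrics as above; the / 60 assumes 'Total time spent' is recorded in seconds.
wide['Total No of Plays (App and Portal)'] = (wide['Number of plays on App']
                                              + wide['Number of plays on Portal'])
wide['Average Play Time in mins on App'] = round(
    wide['Total time spent on App'] / (wide['Number of plays on App'] * 60), 2)
print(wide)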
Code Example #9
    def generate_report(self, result_loc_, date_):
        board_slug = pd.read_csv(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'tenant_info.csv'))[['id', 'slug']]

        df = pd.read_csv(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'content_model_snapshot.csv'))[[
                                     'board', 'medium', 'gradeLevel',
                                     'subject', 'identifier', 'name', 'status',
                                     'createdOn', 'creator', 'lastPublishedOn',
                                     'lastUpdatedOn', 'channel',
                                     'lastSubmittedOn'
                                 ]]

        if 'createdOn' not in df.columns:
            df['createdOn'] = 'T'
        if 'lastSubmittedOn' not in df.columns:
            df['lastSubmittedOn'] = 'T'
        if 'lastPublishedOn' not in df.columns:
            df['lastPublishedOn'] = 'T'

        df['createdOn'] = df['createdOn'].fillna('T').apply(self.date_format)

        review = df[df['status'] == 'Review']
        review['lastSubmittedOn'] = review['lastSubmittedOn'].fillna(
            'T').apply(self.date_format)
        review.rename(
            columns={'lastSubmittedOn': 'Pending in current status since'},
            inplace=True)

        only_draft = df[(df['status'] == 'Draft')
                        & (df['lastPublishedOn'].isna())]
        only_draft.loc[:, 'Pending in current status since'] = only_draft.loc[:, 'createdOn']

        published = df[(df['status'] == 'Unlisted') | \
                       (df['status'] == 'Live') | \
                       ((df['status'] == 'Draft') & (df['lastPublishedOn'].notna()))]
        published['status'] = pd.np.where(published['status'] == 'Unlisted',
                                          'Limited Sharing',
                                          published['status'])
        published['lastPublishedOn'] = published['lastPublishedOn'].fillna(
            'T').apply(self.date_format)
        published.rename(
            columns={'lastPublishedOn': 'Pending in current status since'},
            inplace=True)

        result_df = pd.concat([review, only_draft, published])
        result_df['gradeSort'] = result_df['gradeLevel'].apply(self.grade_sort)

        result_df = result_df.sort_values(
            by=['board', 'medium', 'gradeSort', 'subject', 'name'],
            ascending=[False, True, True, True, True])

        result_df = result_df.fillna('Unknown')

        result_df = result_df[[
            'board', 'medium', 'gradeLevel', 'subject', 'identifier', 'name',
            'status', 'createdOn', 'Pending in current status since',
            'creator', 'channel'
        ]]

        result_df.to_csv(result_loc_.joinpath(
            date_.strftime('%Y-%m-%d'), 'Content_Creation_Status_Overall.csv'),
                         index=False,
                         encoding='utf-8')
        # create_json(result_loc_.joinpath('Content_Creation_Status.csv'))
        post_data_to_blob(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'Content_Creation_Status_Overall.csv'), True)

        for index, bs_value in board_slug.iterrows():
            channel, slug = bs_value.values
            print(slug)

            channel_df = result_df[result_df['channel'] == channel]
            channel_df = channel_df[[
                'board', 'medium', 'gradeLevel', 'subject', 'identifier',
                'name', 'status', 'createdOn',
                'Pending in current status since', 'creator'
            ]]
            channel_df.columns = [
                'Board', 'Medium', 'Grade', 'Subject', 'Content Id',
                'Content name', 'Status', 'Created On',
                'Pending in current status since', 'Created By'
            ]

            os.makedirs(result_loc_.joinpath(slug), exist_ok=True)
            channel_df.to_csv(result_loc_.joinpath(
                slug, 'Content_Creation_Status.csv'),
                              index=False,
                              encoding='utf-8')
            create_json(
                result_loc_.joinpath(slug, 'Content_Creation_Status.csv'))
            post_data_to_blob(
                result_loc_.joinpath(slug, 'Content_Creation_Status.csv'))
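
A sketch of the status partitioning above on invented rows; np.where stands in for the pd.np.where alias used above (pd.np is no longer available in recent pandas), but the Unlisted-to-'Limited Sharing' relabelling is the same.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'identifier': ['c1', 'c2', 'c3', 'c4'],
    'status': ['Review', 'Draft', 'Unlisted', 'Draft'],
    'lastPublishedOn': [None, None, '2020-01-05T10:00:00', '2020-02-01T09:00:00'],
})

review = df[df['status'] == 'Review']
only_draft = df[(df['status'] == 'Draft') & (df['lastPublishedOn'].isna())]
published = df[(df['status'] == 'Unlisted') | (df['status'] == 'Live')
               | ((df['status'] == 'Draft') & (df['lastPublishedOn'].notna()))].copy()

# Relabel limited-sharing content.
published['status'] = np.where(published['status'] == 'Unlisted',
                               'Limited Sharing', published['status'])
print(pd.concat([review, only_draft, published]))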
Code Example #10
    def daily_metrics(self, read_loc_, date_):
        """
        merge the three metrics
        :param read_loc_: pathlib.Path object to read CSV from.
        :param date_: datetime object to use in path
        :return: None
        """
        try:
            board_slug = pd.read_csv(
                self.data_store_location.joinpath('textbook_reports',
                                                  date_.strftime('%Y-%m-%d'),
                                                  'tenant_info.csv'))[['id', 'slug']]
            board_slug.set_index('id', inplace=True)
        except Exception:
            raise Exception('Board Slug Error!')
        try:
            scans_df = pd.read_csv(
                read_loc_.joinpath('dialcode_scans',
                                   date_.strftime('%Y-%m-%d'),
                                   'dial_scans.csv')).fillna('')
            scans_df = scans_df.pivot(index='dialcode_channel',
                                      columns='failed_flag',
                                      values='count').reset_index().fillna(0)
            scans_df = scans_df.join(
                board_slug, on='dialcode_channel',
                how='left')[['slug', 'Failed QR Scans', 'Successful QR Scans']]
            scans_df['Total QR scans'] = scans_df[
                'Successful QR Scans'] + scans_df['Failed QR Scans']
            scans_df['Percentage (%) of Failed QR Scans'] = scans_df[
                'Failed QR Scans'] * 100 / scans_df['Total QR scans']
            unmapped = scans_df[scans_df.slug.isna()]['Total QR scans'][0]
            scans_df.dropna(subset=['slug'], inplace=True)
        except Exception as e:
            raise Exception('Scans Error! :: {}'.format(str(e)))
        try:
            downloads_df = pd.read_csv(
                read_loc_.joinpath('downloads', date_.strftime('%Y-%m-%d'),
                                   'downloads.csv'))
            downloads_df = downloads_df.fillna('').join(
                board_slug, on='channel',
                how='left')[['count', 'slug']].dropna(subset=['slug'])
            downloads_df.columns = ['Total Content Downloads', 'slug']
        except Exception:
            raise Exception('Downloads Error!')
        try:
            app_df = pd.read_csv(
                read_loc_.joinpath('play', date_.strftime('%Y-%m-%d'),
                                   'app_sessions.csv'))
            app_df = app_df[[
                'Total App Sessions', 'Total Devices on App',
                'Total Time on App (in hours)'
            ]]
            plays_df = pd.read_csv(read_loc_.joinpath(
                'play', date_.strftime('%Y-%m-%d'), 'plays.csv'),
                                   header=[0, 1],
                                   dtype={0: str})

            # Making the channel column as index with string type since the csv is in multiindex format
            plays_df.set_index(plays_df.columns[0], inplace=True)
            plays_df.index.names = ['channel']
            plays_df = plays_df[1:]

            plays_df = plays_df.reset_index().join(board_slug,
                                                   on='channel',
                                                   how='left')
            plays_df['Total Content Plays on App'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Total Content Plays',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Total Content Plays on Portal'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Total Content Plays',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df[
                'Total Devices that played content on App'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['app']),
                    pd.Series(
                        index=plays_df.index,
                        name=('Total Devices that played content',
                              self.config['context']['pdata']['id']['app'])))
            plays_df[
                'Total Devices that played content on Portal'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['portal']),
                    pd.Series(
                        index=plays_df.index,
                        name=(
                            'Total Devices that played content',
                            self.config['context']['pdata']['id']['portal'])))
            plays_df['Content Play Time on App (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Content Play Time (in hours)',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Content Play Time on Portal (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Content Play Time (in hours)',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df = plays_df[[
                'Total Content Plays on App', 'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)', 'slug'
            ]].dropna(subset=['slug'])
        except Exception as e:
            raise Exception('App and Plays Error! :: {}'.format(str(e)))
        try:
            daily_metrics_df = scans_df.join(
                downloads_df.set_index('slug'), on='slug',
                how='outer').reset_index(drop=True).join(
                    plays_df.set_index('slug'),
                    on='slug',
                    how='outer',
                    rsuffix='_plays').fillna(0)
            daily_metrics_df['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
        except Exception:
            raise Exception('Daily Metrics Error!')
        try:
            overall = daily_metrics_df[[
                'Successful QR Scans', 'Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)'
            ]].sum().astype(int)
            overall['Total App Sessions'] = app_df['Total App Sessions'].loc[0]
            overall['Total Devices on App'] = app_df[
                'Total Devices on App'].loc[0]
            overall['Total Time on App (in hours)'] = app_df[
                'Total Time on App (in hours)'].loc[0]
            overall['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
            overall['Unmapped QR Scans'] = unmapped
            overall[
                'Total QR scans'] = overall['Successful QR Scans'] + overall[
                    'Failed QR Scans'] + overall['Unmapped QR Scans']
            overall['Percentage (%) of Failed QR Scans'] = '%.2f' % (
                overall['Failed QR Scans'] * 100 / overall['Total QR scans'])
            overall['Percentage (%) of Unmapped QR Scans'] = '%.2f' % (
                overall['Unmapped QR Scans'] * 100 / overall['Total QR scans'])
            overall['Total Content Plays'] = overall[
                'Total Content Plays on App'] + overall[
                    'Total Content Plays on Portal']
            overall['Total Devices that played content'] = overall[
                'Total Devices that played content on App'] + overall[
                    'Total Devices that played content on Portal']
            overall['Total Content Play Time (in hours)'] = overall[
                'Content Play Time on App (in hours)'] + overall[
                    'Content Play Time on Portal (in hours)']
            overall = overall[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            read_loc_.joinpath('portal_dashboards',
                               'overall').mkdir(exist_ok=True)
            read_loc_.joinpath('portal_dashboards',
                               'mhrd').mkdir(exist_ok=True)
            try:
                get_data_from_blob(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
                blob_data = pd.read_csv(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
            except Exception:
                blob_data = pd.DataFrame()
            blob_data = blob_data.append(pd.DataFrame(overall).transpose(),
                                         sort=False).fillna('')
            blob_data.index = pd.to_datetime(blob_data.Date, format='%d-%m-%Y')
            blob_data.drop_duplicates('Date', inplace=True, keep='last')
            blob_data.sort_index(inplace=True)
            # can remove after first run
            blob_data = blob_data[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            blob_data.to_csv(read_loc_.joinpath('portal_dashboards', 'overall',
                                                'daily_metrics.csv'),
                             index=False)
            create_json(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
            post_data_to_blob(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
        except Exception:
            raise Exception('Overall Metrics Error!')
        try:
            daily_metrics_df['Total Content Plays'] = daily_metrics_df[
                'Total Content Plays on App'] + daily_metrics_df[
                    'Total Content Plays on Portal']
            daily_metrics_df['Total Devices that played content'] = daily_metrics_df[
                                                                        'Total Devices that played content on App'] + \
                                                                    daily_metrics_df[
                                                                        'Total Devices that played content on Portal']
            daily_metrics_df['Total Content Play Time (in hours)'] = daily_metrics_df[
                                                                         'Content Play Time on App (in hours)'] + \
                                                                     daily_metrics_df[
                                                                         'Content Play Time on Portal (in hours)']
            daily_metrics_df.set_index(['slug'], inplace=True)
            daily_metrics_df = daily_metrics_df[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            for slug, value in daily_metrics_df.iterrows():
                if slug != '':
                    read_loc_.joinpath('portal_dashboards',
                                       slug).mkdir(exist_ok=True)
                    for key, val in value.items():
                        if key not in [
                                'Date', 'Percentage (%) of Failed QR Scans'
                        ]:
                            value[key] = int(val)
                        elif key == 'Percentage (%) of Failed QR Scans':
                            value[key] = '%.2f' % val
                    try:
                        get_data_from_blob(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                        blob_data = pd.read_csv(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                    except Exception:
                        blob_data = pd.DataFrame()
                    blob_data = blob_data.append(
                        pd.DataFrame(value).transpose(), sort=False).fillna('')
                    blob_data.index = pd.to_datetime(blob_data.Date,
                                                     format='%d-%m-%Y')
                    blob_data.drop_duplicates('Date',
                                              inplace=True,
                                              keep='last')
                    blob_data.sort_index(inplace=True)
                    # can remove after first run
                    blob_data = blob_data[[
                        'Date', 'Total QR scans', 'Successful QR Scans',
                        'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                        'Total Content Downloads',
                        'Total Content Plays on App',
                        'Total Devices that played content on App',
                        'Content Play Time on App (in hours)',
                        'Total Content Plays on Portal',
                        'Total Devices that played content on Portal',
                        'Content Play Time on Portal (in hours)',
                        'Total Content Plays',
                        'Total Devices that played content',
                        'Total Content Play Time (in hours)'
                    ]]
                    blob_data.to_csv(read_loc_.joinpath(
                        'portal_dashboards', slug, 'daily_metrics.csv'),
                                     index=False)
                    create_json(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
                    post_data_to_blob(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
        except Exception:
            raise Exception('State Metrics Error!')
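
A sketch of the merge performed before every daily_metrics.csv upload above: append the new row, drop older rows carrying the same Date, and keep the frame sorted by date. pd.concat replaces the DataFrame.append call used above (removed in pandas 2.x); the data is invented.

import pandas as pd


def merge_daily_metrics(existing, today_row):
    """Append today's row, drop older rows with the same Date and return the
    frame sorted chronologically, as done above before each upload."""
    merged = pd.concat([existing, today_row], sort=False).fillna('')
    merged.index = pd.to_datetime(merged['Date'], format='%d-%m-%Y')
    merged = merged.drop_duplicates('Date', keep='last')
    return merged.sort_index()


history = pd.DataFrame({'Date': ['01-01-2020'], 'Total QR scans': [100]})
today = pd.DataFrame({'Date': ['02-01-2020'], 'Total QR scans': [120]})
print(merge_daily_metrics(history, today))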
Code Example #11
    def generate_report(self):
        execution_date_str = datetime.strptime(self.execution_date,
                                               "%d/%m/%Y").strftime('%Y-%m-%d')
        week_last_date = (datetime.strptime(self.execution_date, "%d/%m/%Y") -
                          timedelta(1)).strftime('%d/%m/%Y')

        board_slug = pd.read_csv(
            self.data_store_location.joinpath(
                'textbook_reports', execution_date_str,
                'tenant_info.csv'))[['id', 'slug']]
        board_slug.set_index('slug', inplace=True)

        scans_df = pd.read_csv(
            self.data_store_location.joinpath('textbook_reports',
                                              execution_date_str,
                                              'weekly_dialcode_counts.csv'))
        scans_df["edata_filters_dialcodes"] = scans_df[
            "edata_filters_dialcodes"].str.upper().str.strip()
        scans_df = scans_df.groupby("edata_filters_dialcodes").agg({
            "Total Scans":
            "sum"
        }).reset_index()

        tb_dial_df = pd.read_csv(
            self.data_store_location.joinpath('tb_metadata',
                                              execution_date_str,
                                              'qr_code_state.csv'))
        tb_dial_df["QR"] = tb_dial_df["QR"].str.upper().str.strip()

        tb_dial_scans_df = pd.merge(scans_df,
                                    tb_dial_df,
                                    left_on="edata_filters_dialcodes",
                                    right_on="QR")
        tb_dial_scans_df['Index'] = tb_dial_scans_df['Index'].str.split(
            '.').str[0].astype(int)

        tb_dial_scans_df.groupby(["channel", "TB_ID", "Index"]).agg({
            "Total Scans":
            "sum"
        }).reset_index()
        tb_dial_scans_df['weighted_scans'] = tb_dial_scans_df[
            'Index'] * tb_dial_scans_df['Total Scans']

        weighted_avg_df = tb_dial_scans_df.groupby("channel").agg({
            "Total Scans":
            "sum",
            "weighted_scans":
            "sum"
        })

        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_scans'] / weighted_avg_df['Total Scans']
        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_average'].round(1)
        weighted_avg_df = weighted_avg_df.reset_index()[[
            'channel', 'weighted_average'
        ]]
        weighted_avg_df.rename(columns={"weighted_average": "Index"},
                               inplace=True)
        weighted_avg_df['Date'] = week_last_date

        for slug, board_value in board_slug.iterrows():
            print(slug)
            try:
                get_data_from_blob(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
                blob_data = pd.read_csv(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
            except Exception:
                blob_data = pd.DataFrame(columns=["Date", "Index"])

            current_channel_df = weighted_avg_df[weighted_avg_df['channel'] ==
                                                 board_value.id][[
                                                     "Date", "Index"
                                                 ]]

            blob_data = pd.concat([blob_data, current_channel_df])
            blob_data.drop_duplicates(subset=['Date'],
                                      keep='last',
                                      inplace=True)
            blob_data.to_csv(self.data_store_location.joinpath(
                'portal_dashboards', slug, 'gps_learning.csv'),
                             index=False)
            create_json(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
            post_data_to_blob(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
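
The core of the report above is a scan-weighted chapter index per channel. A minimal sketch of that calculation on made-up data (the column names mirror the report; the channels and values are illustrative):

import pandas as pd

# Hypothetical per-QR rows after merging scan counts with textbook metadata.
tb_dial_scans_df = pd.DataFrame({
    'channel': ['ch1', 'ch1', 'ch2'],
    'Index': [1, 4, 2],            # chapter position within the textbook
    'Total Scans': [10, 20, 5],    # QR scans attributed to that chapter
})

# Weight each chapter's scans by its position, then average per channel.
tb_dial_scans_df['weighted_scans'] = tb_dial_scans_df['Index'] * tb_dial_scans_df['Total Scans']
weighted_avg_df = tb_dial_scans_df.groupby('channel').agg({
    'Total Scans': 'sum',
    'weighted_scans': 'sum'
})
weighted_avg_df['Index'] = (weighted_avg_df['weighted_scans'] /
                            weighted_avg_df['Total Scans']).round(1)
print(weighted_avg_df.reset_index()[['channel', 'Index']])
# ch1 -> (1*10 + 4*20) / 30 = 3.0, ch2 -> 2.0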
Code example #12
    def generate_reports(self, result_loc_, content_search_,
                         content_hierarchy_, date_):
        """
        generate the overall ETB and DCE reports at textbook and detailed levels
        :param hostname:IP and port to query the list of textbooks and hierarchy
        :param result_loc_: location to store data
        :return: None
        """
        board_slug = pd.read_csv(
            self.data_store_location.joinpath(
                'textbook_reports', date_.strftime('%Y-%m-%d'),
                'tenant_info.csv'))[['id', 'slug']]
        board_slug.set_index('id', inplace=True)

        importer = DictImporter()
        dialcode_etb = []
        textbook_etb = []
        dialcode_dce = []
        textbook_dce = []
        scans_df = pd.read_csv(
            result_loc_.joinpath('textbook_reports', 'dialcode_counts.csv'))
        scans_df = scans_df.groupby(
            'edata_filters_dialcodes')['Total Scans'].sum()
        tb_url = "{}v3/search".format(content_search_)
        payload = """{
                "request": {
                    "filters": {
                        "contentType": ["Textbook"],
                        "status": ["Live", "Review", "Draft"]
                    },
                    "sort_by": {"createdOn":"desc"},
                    "limit": 10000
                }
            }"""
        tb_headers = {
            'content-type': "application/json; charset=utf-8",
            'cache-control': "no-cache"
        }
        retry_count = 0
        while retry_count < 5:
            try:
                response = requests.request("POST",
                                            tb_url,
                                            data=payload,
                                            headers=tb_headers)
                textbooks = pd.DataFrame(
                    response.json()['result']['content'])[[
                        'identifier', 'createdFor', 'createdOn',
                        'lastUpdatedOn', 'board', 'medium', 'gradeLevel',
                        'subject', 'name', 'status', 'channel'
                    ]]
                textbooks[textbooks.duplicated(
                    subset=['identifier', 'status'])].to_csv(
                        result_loc_.joinpath('textbook_reports',
                                             date_.strftime('%Y-%m-%d'),
                                             'duplicate_tb.csv'),
                        index=False)
                textbooks.drop_duplicates(subset=['identifier', 'status'],
                                          inplace=True)
                textbooks['gradeLevel'] = textbooks['gradeLevel'].apply(
                    lambda x: ['Unknown'] if type(x) == float else x)
                textbooks.fillna({'createdFor': ' '}, inplace=True)
                textbooks.fillna('Unknown', inplace=True)
                textbooks['grade'] = textbooks['gradeLevel'].apply(
                    lambda grade: ', '.join(
                        [y if y == 'KG' else y.title() for y in grade]))
                textbooks.to_csv(result_loc_.joinpath(
                    'textbook_reports', date_.strftime('%Y-%m-%d'),
                    'tb_list.csv'),
                                 index=False)
                break
            except requests.exceptions.ConnectionError:
                print("Retry {} for textbook list".format(retry_count + 1))
                retry_count += 1
                time.sleep(10)
        else:
            print("Max retries reached...")
            return
        counter = 0
        skipped_tbs = []
        for ind_, row_ in textbooks.iterrows():
            counter += 1
            print('Running for {} out of {}: {}% ({} sec/it)'.format(
                counter, textbooks.shape[0],
                '%.2f' % (counter * 100 / textbooks.shape[0]),
                '%.2f' % ((datetime.now() - self.start_time).total_seconds() /
                          counter)))
            if isinstance(row_['gradeLevel'], list) and len(
                    row_['gradeLevel']) == 0:
                row_['gradeLevel'].append(' ')
            if row_['status'] == 'Live':
                url = "{}learning-service/content/v3/hierarchy/{}".format(
                    content_hierarchy_, row_['identifier'])
            else:
                url = "{}learning-service/content/v3/hierarchy/{}?mode=edit".format(
                    content_hierarchy_, row_['identifier'])
            retry_count = 0
            while retry_count < 5:
                try:
                    response = requests.get(url)
                    tb = response.json()['result']['content']
                    tree_obj = self.parse_etb(tb, row_)
                    root = importer.import_(tree_obj)
                    self.etb_dialcode(row_, (root, ) + root.descendants,
                                      dialcode_etb)
                    self.etb_textbook(row_, root, textbook_etb)
                    if row_['status'] == 'Live':
                        chapters = findall(
                            root, filter_=lambda node: node.depth == 1)
                        for i in range(len(chapters)):
                            term = 'T1' if i <= (len(chapters) / 2) else 'T2'
                            chapters[i].term = term
                            for descendant in chapters[i].descendants:
                                descendant.term = term
                        root.term = 'T1'
                        dialcode_wo_content = findall(
                            root,
                            filter_=lambda node: node.dialcode != '' and node.
                            leafNodesCount == 0)
                        self.dce_dialcode(row_, dialcode_wo_content,
                                          dialcode_dce)
                        self.dce_textbook(row_, root, textbook_dce)
                    break
                except requests.exceptions.ConnectionError:
                    retry_count += 1
                    print("ConnectionError: Retry {} for textbook {}".format(
                        retry_count, row_['identifier']))
                    time.sleep(10)
                except KeyError:
                    with open(
                            result_loc_.joinpath('textbook_reports',
                                                 date_.strftime('%Y-%m-%d'),
                                                 'etb_error_log.log'),
                            'a') as f:
                        f.write(
                            "KeyError: Resource not found for textbook {} in {}\n"
                            .format(row_['identifier'], row_['status']))
                    break
            else:
                # All retries failed without a successful break: record the textbook as skipped.
                print("Max retries reached...")
                skipped_tbs.append(row_)
                continue

        etb_dc = pd.DataFrame(dialcode_etb)
        etb_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_dialcode_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'ETB_dialcode_data_pre.csv'),
                          backup=True)
        etb_tb = pd.DataFrame(textbook_etb).fillna('')
        etb_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_textbook_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'ETB_textbook_data_pre.csv'),
                          backup=True)
        dce_dc = pd.DataFrame(dialcode_dce)
        dce_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_dialcode_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'DCE_dialcode_data_pre.csv'),
                          backup=True)
        dce_tb = pd.DataFrame(textbook_dce).fillna('')
        dce_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_textbook_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'DCE_textbook_data_pre.csv'),
                          backup=True)
        channels = set()
        for frame in (etb_dc, etb_tb, dce_dc, dce_tb):
            for c in frame.channel.unique():
                if c in board_slug.index:
                    channels.add(c)
        channels = list(channels)
        etb_dc = etb_dc.join(scans_df, on='QR Code', how='left').fillna('')
        etb_dc.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        etb_dc = etb_dc[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Textbook Status', 'Type of Node', 'Level 1 Name',
            'Level 2 Name', 'Level 3 Name', 'Level 4 Name', 'Level 5 Name',
            'QR Code', 'Total Scans', 'Number of contents'
        ]]
        etb_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_dialcode_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        etb_tb.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        etb_tb = etb_tb[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Textbook Status', 'Created On',
            'Last Updated On', 'Total content linked',
            'Total QR codes linked to content',
            'Total number of QR codes with no linked content',
            'Total number of leaf nodes',
            'Number of leaf nodes with no content', 'With QR codes'
        ]]
        etb_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_textbook_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        dce_dc = dce_dc.join(scans_df, on='QR Code', how='left').fillna('')
        dce_dc.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        dce_dc = dce_dc[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Level 1 Name', 'Level 2 Name', 'Level 3 Name',
            'Level 4 Name', 'Level 5 Name', 'QR Code', 'Total Scans', 'Term'
        ]]
        dce_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_dialcode_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        dce_tb.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        dce_tb = dce_tb[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Created On', 'Last Updated On',
            'Total number of QR codes',
            'Number of QR codes with atleast 1 linked content',
            'Number of QR codes with no linked content',
            'Term 1 QR Codes with no linked content',
            'Term 2 QR Codes with no linked content'
        ]]
        dce_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_textbook_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        for channel in channels:
            slug = board_slug.loc[channel]['slug']
            df_etb_dc = etb_dc[etb_dc['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            etb_dc_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'ETB_dialcode_data.csv')
            df_etb_dc.drop('channel', axis=1).to_csv(etb_dc_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(etb_dc_path)
            post_data_to_blob(etb_dc_path)
            df_etb_tb = etb_tb[etb_tb['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            etb_tb_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'ETB_textbook_data.csv')
            self.etb_aggregates(result_loc_, slug, df_etb_tb)
            df_etb_tb.drop(['channel', 'With QR codes'],
                           axis=1).to_csv(etb_tb_path,
                                          index=False,
                                          encoding='utf-8-sig')
            create_json(etb_tb_path)
            post_data_to_blob(etb_tb_path)
            df_dce_dc = dce_dc[dce_dc['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            dce_dc_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'DCE_dialcode_data.csv')
            df_dce_dc.drop('channel', axis=1).to_csv(dce_dc_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(dce_dc_path)
            post_data_to_blob(dce_dc_path)
            df_dce_tb = dce_tb[dce_tb['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            dce_tb_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'DCE_textbook_data.csv')
            try:
                self.dce_aggregates(result_loc_, slug, df_dce_tb)
            except IndexError:
                pass
            df_dce_tb.drop('channel', axis=1).to_csv(dce_tb_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(dce_tb_path)
            post_data_to_blob(dce_tb_path)
        if skipped_tbs:
            with open(
                    result_loc_.joinpath('textbook_reports',
                                         date_.strftime('%Y-%m-%d'),
                                         'etb_error_log.log'), 'a') as f:
                for tb_id in skipped_tbs:
                    f.write(
                        'ConnectionError: Failed to fetch Hierarchy for {} in {} state.\n'
                        .format(tb_id['identifier'], tb_id['status']))
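
Both fetches in the method above (the textbook search and the per-textbook hierarchy call) lean on Python's `while ... else`: the `else` branch runs only when the loop exhausts its retry budget without hitting `break`. A stripped-down sketch of that retry idiom, with a stand-in `fetch()` that fails twice before succeeding:

import time

def fetch(attempt):
    # Stand-in for the HTTP request; raises on the first two attempts.
    if attempt < 2:
        raise ConnectionError('transient failure')
    return {'result': 'ok'}

retry_count = 0
while retry_count < 5:
    try:
        response = fetch(retry_count)
        break                       # success: the while's else branch is skipped
    except ConnectionError:
        retry_count += 1
        print('Retry {}'.format(retry_count))
        time.sleep(0)               # the report code waits 10 seconds here
else:
    # Runs only if all 5 attempts failed, i.e. the loop ended without break.
    print('Max retries reached...')
    response = None

print(response)                     # {'result': 'ok'} after two retries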
Code example #13
 def etb_aggregates(self, result_loc_, slug, df):
     """
     generate charts from ETB data
     :param result_loc_: pathlib.Path object for resultant CSVs.
     :param slug: slug for the channel
     :param df: ETB textbook dataframe for channel
     :return: None
     """
     textbook_status = pd.DataFrame(
         df['Textbook Status'].value_counts()).reindex(
             ['Live', 'Review', 'Draft']).reset_index().fillna(0)
     textbook_status.columns = ['Status', 'Count']
     textbook_status.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                                 'etb_textbook_status.csv'),
                            index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status.csv'))
     textbook_status_grade = pd.DataFrame(
         df.groupby(['Grade', 'Textbook Status'
                     ])['Textbook ID'].count()).reset_index().pivot(
                         index='Grade',
                         columns='Textbook Status').fillna(0).reset_index()
     columns = ['Grade']
     for column in textbook_status_grade.columns[1:]:
         columns.append(column[1])
     textbook_status_grade.columns = columns
     textbook_status_grade = self.grade_fix(textbook_status_grade)
     statuses = ['Live', 'Review', 'Draft']
     column_order = ['Class']
     for status in statuses:
         if status not in textbook_status_grade.columns:
             textbook_status_grade[status] = 0
     textbook_status_grade = textbook_status_grade[column_order + statuses]
     textbook_status_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_textbook_status_grade.csv'),
                                  index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_grade.csv'))
     textbook_status_subject = pd.DataFrame(
         df.groupby(['Subject', 'Textbook Status'
                     ])['Textbook ID'].count()).reset_index().pivot(
                         index='Subject',
                         columns='Textbook Status').fillna(0).reset_index()
     columns = ['Subject']
     for column in textbook_status_subject.columns[1:]:
         columns.append(column[1])
     textbook_status_subject.columns = columns
     column_order = ['Subject']
     for status in statuses:
         if status not in textbook_status_subject.columns:
             textbook_status_subject[status] = 0
     textbook_status_subject = textbook_status_subject[column_order +
                                                       statuses]
     textbook_status_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_textbook_status_subject.csv'),
                                    index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_subject.csv'))
     qr_counts = pd.DataFrame(
         df.groupby([
             'channel', 'With QR codes'
         ])['Textbook ID'].count()).reset_index().drop('channel', axis=1)
     qr_counts.columns = ['Status', 'Count']
     qr_counts.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                           'etb_qr_count.csv'),
                      index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_count.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_count.csv'))
     qr_linkage = df[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum()
     qr_linkage.index = ['QR Code With Content', 'QR Code Without Content']
     qr_linkage = pd.DataFrame(qr_linkage).reset_index()
     qr_linkage.columns = ['Status', 'Count']
     qr_linkage.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                            'etb_qr_content_status.csv'),
                       index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status.csv'))
     qr_linkage_grade = df.groupby('Grade')[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linkage_grade.columns = [
         'Grade', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linkage_grade = self.grade_fix(qr_linkage_grade)
     qr_linkage_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_qr_content_status_grade.csv'),
                             index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_grade.csv'))
     qr_linkage_subject = df.groupby('Subject')[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum().reset_index()
      qr_linkage_subject.columns = [
          'Subject', 'QR Codes with content', 'QR Codes without content'
      ]
     qr_linkage_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_qr_content_status_subject.csv'),
                               index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_subject.csv'))
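
The grade- and subject-wise status tables above rely on a `pivot` whose value column is implicit, which leaves a two-level MultiIndex on the columns; the small loop then keeps only the status level as the column name. A self-contained sketch of that step on made-up data:

import pandas as pd

df = pd.DataFrame({
    'Grade': ['Class 1', 'Class 1', 'Class 2'],
    'Textbook Status': ['Live', 'Draft', 'Live'],
    'Textbook ID': ['tb1', 'tb2', 'tb3'],
})

textbook_status_grade = pd.DataFrame(
    df.groupby(['Grade', 'Textbook Status'])['Textbook ID'].count()
).reset_index().pivot(index='Grade',
                      columns='Textbook Status').fillna(0).reset_index()

# Columns are now tuples like ('Textbook ID', 'Live'); keep only the status level.
columns = ['Grade']
for column in textbook_status_grade.columns[1:]:
    columns.append(column[1])
textbook_status_grade.columns = columns
print(textbook_status_grade)
# one 'Grade' column plus one count column per status (here: Draft, Live)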
Code example #14
 def get_weekly_plays(self, result_loc_, date_, cassandra_, keyspace_):
     """
     query cassandra table for 1 week of content play and timespent.
     :param result_loc_: local path to store resultant csv
     :param date_: datetime object to pass to file path
     :param cassandra_: ip of the cassandra cluster
     :param keyspace_: keyspace in which we are working
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     cluster = Cluster([cassandra_])
     session = cluster.connect()
     start_date = date_ - timedelta(days=7)
     fetch_query = Template("""
     SELECT content_id, period, pdata_id, metric FROM $keyspace.content_aggregates WHERE 
     period >= $start_date AND 
     period < $end_date
     ALLOW FILTERING
     """)
     result = session.execute(
         fetch_query.substitute(keyspace=keyspace_,
                                start_date=start_date.strftime('%Y%m%d'),
                                end_date=date_.strftime('%Y%m%d')))
     df_dict = {}
     for row in result:
          if row.content_id not in df_dict:
              df_dict[row.content_id] = {
                  'identifier': row.content_id,
                  'Number of Plays on App': 0,
                  'Number of Plays on Portal': 0,
                  'Timespent on App': 0,
                  'Timespent on Portal': 0
              }
         pdata_id = 'App' if row.pdata_id == self.config['context']['pdata']['id']['app'] else 'Portal' if \
             row.pdata_id == self.config['context']['pdata']['id']['portal'] else 'error'
         df_dict[row.content_id]['Number of Plays on ' +
                                 pdata_id] += row.metric['plays']
          df_dict[row.content_id]['Timespent on ' +
                                  pdata_id] += row.metric['timespent']
      df = pd.DataFrame(list(df_dict.values()))
     df['Total No of Plays (App and Portal)'] = df[
         'Number of Plays on App'] + df['Number of Plays on Portal']
     df['Average Play Time in mins on App'] = round(
         df['Timespent on App'] / (60 * df['Number of Plays on App']), 2)
     df['Average Play Time in mins on Portal'] = round(
         df['Timespent on Portal'] / (60 * df['Number of Plays on Portal']),
         2)
     df['Average Play Time in mins (On App and Portal)'] = round(
         (df['Timespent on App'] + df['Timespent on Portal']) /
         (60 * df['Total No of Plays (App and Portal)']), 2)
     df = df[[
         'identifier', 'Total No of Plays (App and Portal)',
         'Number of Plays on App', 'Number of Plays on Portal',
         'Average Play Time in mins (On App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal'
     ]]
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))[[
                                  'channel', 'board', 'medium',
                                  'gradeLevel', 'subject', 'identifier',
                                  'name', 'mimeType', 'createdOn',
                                  'creator', 'lastPublishedOn',
                                  'me_averageRating'
                              ]]
     content_model["creator"] = content_model["creator"].str.replace(
         "null", "")
     content_model['channel'] = content_model['channel'].astype(str)
     content_model['mimeType'] = content_model['mimeType'].apply(
         self.mime_type)
     content_model.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Average Rating(out of 5)'
     ]
     content_model['Content ID'] = content_model['Content ID'].str.replace(
         ".img", "")
     content_model['Created On'] = content_model['Created On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     content_model['Last Published On'] = content_model[
         'Last Published On'].fillna('T').apply(
             lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     # content_model['Last Updated On'] = content_model['Last Updated On'].fillna('T').apply(
     #     lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     df = content_model.join(df.set_index('identifier'),
                             on='Content ID',
                             how='left')
     df['Last Date of the week'] = (date_ -
                                    timedelta(days=1)).strftime('%d-%m-%Y')
     df['Total No of Plays (App and Portal)'] = df[
         'Total No of Plays (App and Portal)'].fillna(0)
     df['Number of Plays on App'] = df['Number of Plays on App'].fillna(0)
     df['Number of Plays on Portal'] = df[
         'Number of Plays on Portal'].fillna(0)
     df['Average Play Time in mins (On App and Portal)'] = df[
         'Average Play Time in mins (On App and Portal)'].fillna(0)
     df['Average Play Time in mins on App'] = df[
         'Average Play Time in mins on App'].fillna(0)
     df['Average Play Time in mins on Portal'] = df[
         'Average Play Time in mins on Portal'].fillna(0)
     df = df.fillna('Unknown')
     df.sort_values(inplace=True,
                    ascending=[1, 1, 1, 1, 1, 0],
                    by=[
                        'channel', 'Board', 'Medium', 'Grade', 'Subject',
                        'Total No of Plays (App and Portal)'
                    ])
     df.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                    'weekly_plays.csv'),
               index=False)
     post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                            'weekly_plays.csv'),
                       backup=True)
     for channel in df.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
             print(slug)
         except KeyError:
             continue
          content_aggregates = df[df['channel'] == channel].drop(['channel'],
                                                                 axis=1)
         try:
             get_data_from_blob(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
             blob_data = pd.read_csv(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
          except (AzureMissingResourceHttpError, FileNotFoundError):
              blob_data = pd.DataFrame()
          content_aggregates = pd.concat(
              [content_aggregates, blob_data]).drop_duplicates(
                  subset=['Content ID', 'Last Date of the week'],
                  keep='first')
         content_aggregates = content_aggregates[[
             'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
             'Content Name', 'Mime Type', 'Created On',
             'Creator (User Name)', 'Last Published On',
             'Total No of Plays (App and Portal)', 'Number of Plays on App',
             'Number of Plays on Portal',
             'Average Play Time in mins (On App and Portal)',
             'Average Play Time in mins on App',
             'Average Play Time in mins on Portal',
             'Average Rating(out of 5)', 'Last Date of the week'
         ]]
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_aggregates.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))
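
For reference, a quick sketch of what the `string.Template` query at the top of this method expands to once the keyspace and the one-week window are substituted (the keyspace name and the run date here are illustrative):

from datetime import datetime, timedelta
from string import Template

fetch_query = Template("""
SELECT content_id, period, pdata_id, metric FROM $keyspace.content_aggregates WHERE
period >= $start_date AND
period < $end_date
ALLOW FILTERING
""")

date_ = datetime(2020, 3, 9)                 # hypothetical run date
start_date = date_ - timedelta(days=7)
cql = fetch_query.substitute(keyspace='content_db',   # assumed keyspace name
                             start_date=start_date.strftime('%Y%m%d'),
                             end_date=date_.strftime('%Y%m%d'))
print(cql)
# SELECT content_id, period, pdata_id, metric FROM content_db.content_aggregates WHERE
# period >= 20200302 AND
# period < 20200309
# ALLOW FILTERING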