def get_dataverse_count(self, **extra_filters):
    """Return the Dataverse count -- a single number.

    Harvested Dataverses are left out of the count unless
    ``self.include_harvested`` is set.

    :param extra_filters: additional Django lookups merged into the date
        filter parameters (a lookup with the same key wins over the date
        filter)
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found earlier; otherwise the ``StatsResult`` built from an
        ``OrderedDict`` holding ``count`` and ``count_string`` plus the
        SQL text of the query
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    filter_params = self.get_date_filter_params()
    filter_params.update(extra_filters)

    q = Dataverse.objects.filter(**filter_params)
    if not self.include_harvested:
        # exclude() accepts only Q objects positionally, so the harvested
        # ids must be supplied through an "__in" lookup.
        q = q.exclude(dvobject__id__in=self.get_harvested_dataverse_ids())

    sql_query = str(q.query)

    data_dict = OrderedDict()
    data_dict['count'] = q.count()
    data_dict['count_string'] = "{:,}".format(data_dict['count'])

    return StatsResult.build_success_result(data_dict, sql_query)
def get_easy_file_downloads_by_month(self, **extra_filters):
    """Format the monthly file download counts into a list of records.

    Each record holds the 'yyyy_mm' string, the month's count, a running
    total, the year/month numbers and, when available, the month names.

    :param extra_filters: forwarded unchanged to
        ``get_easy_file_downloads_counts()`` and
        ``get_easy_file_downloads_running_total()``
    :return: a ``StatsResult`` success result with ``total_downloads``,
        ``record_count`` and ``records``; or an error result when the year
        or month part of a 'yyyy_mm' value is not an integer
    """
    file_counts_by_month = self.get_easy_file_downloads_counts(**extra_filters)
    running_total = self.get_easy_file_downloads_running_total(**extra_filters)
    noncumulative = self.noncumulative

    formatted_records = []  # move from a queryset to a []
    for d in file_counts_by_month:
        yyyy_mm = d['yyyy_mm']

        # Only conversion failures are reported as error results; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        try:
            year = int(yyyy_mm[:4])
        except (TypeError, ValueError):
            return StatsResult.build_error_result(
                "in converting %s (year) into an integer (in get_easy_file_downloads_by_month)" % yyyy_mm[:4])
        try:
            month = int(yyyy_mm[5:7])
        except (TypeError, ValueError):
            return StatsResult.build_error_result(
                "in converting %s (month) into an integer (in get_easy_file_downloads_by_month)" % yyyy_mm[5:7])

        fmt_rec = OrderedDict()
        fmt_rec['yyyy_mm'] = yyyy_mm[:7]
        fmt_rec['count'] = d['count']

        # The running total is always accumulated (it feeds
        # 'total_downloads'); in noncumulative mode each record reports
        # only its own count.
        running_total += d['count']
        if noncumulative:
            fmt_rec['running_total'] = d['count']
        else:
            fmt_rec['running_total'] = running_total

        # Add year and month numbers
        fmt_rec['year_num'] = year
        fmt_rec['month_num'] = month

        # Add month names
        month_name_found, month_name_short = get_month_name_abbreviation(month)
        if month_name_found:
            _, fmt_rec['month_name'] = get_month_name(month)
            fmt_rec['month_name_short'] = month_name_short
        else:
            logging.warning(
                "no month name found for month %d (get_easy_file_downloads_by_month)" % month)

        formatted_records.append(fmt_rec)

    data_dict = OrderedDict()
    data_dict['total_downloads'] = running_total
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, None)
def get_easy_counts_by_month(self, ds_counts_by_month, running_total, noncumulative):
    """Format monthly counts into a list of records.

    :param ds_counts_by_month: iterable of mappings with a 'yyyy_mm'
        string (year in characters 0-3, month in characters 5-6) and a
        'count'
    :param running_total: starting value that each month's count is added
        to
    :param noncumulative: when true, each record's 'running_total' is the
        month's own count instead of the accumulated total
    :return: a ``StatsResult`` success result with ``record_count`` and
        ``records``; or an error result when the year or month part of a
        'yyyy_mm' value is not an integer
    """
    formatted_records = []
    for d in ds_counts_by_month:
        yyyy_mm = d['yyyy_mm']

        # Only conversion failures become error results; a bare except
        # would also swallow KeyboardInterrupt/SystemExit.
        try:
            year = int(yyyy_mm[:4])
        except (TypeError, ValueError):
            return StatsResult.build_error_result(
                "in converting %s (year) into an integer (in get_easy_dataset_count_by_month)" % yyyy_mm[:4])
        try:
            month = int(yyyy_mm[5:7])
        except (TypeError, ValueError):
            return StatsResult.build_error_result(
                "in converting %s (month) into an integer (in get_easy_dataset_count_by_month)" % yyyy_mm[5:7])

        fmt_dict = OrderedDict()
        fmt_dict['yyyy_mm'] = yyyy_mm[:7]
        fmt_dict['count'] = d['count']

        # running total
        running_total += d['count']
        if noncumulative:
            fmt_dict['running_total'] = d['count']
        else:
            fmt_dict['running_total'] = running_total

        # Add year and month numbers
        fmt_dict['year_num'] = year
        fmt_dict['month_num'] = month

        # Add month names
        month_name_found, month_name_short = get_month_name_abbreviation(month)
        if month_name_found:
            _, fmt_dict['month_name'] = get_month_name(month)
            fmt_dict['month_name_short'] = month_name_short
        else:
            logging.warning(
                "no month name found for month %d (get_easy_dataset_count_by_month)" % month)

        formatted_records.append(fmt_dict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, None)
def view_file_extensions_within_type(self, file_type=None):
    """Count file extensions, optionally limited to one Datafile contenttype.

    :param file_type: a ``Datafile.contenttype`` value; when ``None`` the
        labels of *all* FileMetadata rows are used -- this could be a lot
    :return: a ``StatsResult`` success result whose records are ordered by
        count, highest first; each record carries the extension, its count,
        the total count and a percent string
    """
    if file_type is None:
        # Retrieve list of **all** file names -- this could be too much!
        name_pairs = FileMetadata.objects.distinct(
            'datafile__id', 'label'
        ).values_list('datafile__id', 'label')
    else:
        # Retrieve ids of Datafiles filtered by "contenttype"
        ids = Datafile.objects.filter(contenttype=file_type).values_list(
            'dvobject__id', flat=True)

        # Retrieve the names of these Datafiles via the FileMetadata object
        name_pairs = FileMetadata.objects.filter(
            datafile__in=ids
        ).distinct('datafile__id', 'label'
        ).values_list('datafile__id', 'label')

    # Tally the extension of every file name: {file extension: count}
    extension_counts = {}
    for info in name_pairs:
        ext = splitext(info[1])[-1]
        extension_counts[ext] = extension_counts.get(ext, 0) + 1

    # Sort the counts in descending order -- highest count first
    ext_pairs = sorted(extension_counts.items(),
                       key=lambda pair: pair[1],
                       reverse=True)

    # A float total keeps the divisions below from truncating on Python 2
    total_count = float(sum(pair[1] for pair in ext_pairs))

    ext_list = []
    for ext, cnt in ext_pairs:
        d = OrderedDict(extension=ext)
        d['count'] = cnt
        d['total_count'] = int(total_count)
        d['percent_string'] = '{0:.3%}'.format(cnt / total_count)
        ext_list.append(d)

    data_dict = OrderedDict(number_unique_extensions=len(ext_pairs))
    data_dict['total_file_count'] = int(total_count)
    data_dict['record_count'] = len(ext_list)
    data_dict['records'] = ext_list
    data_dict['all_dv_files'] = Datafile.objects.all().count()

    # With no Datafiles at all the ratio is undefined; report 0 rather
    # than raising ZeroDivisionError.
    if data_dict['all_dv_files']:
        ratio = total_count / data_dict['all_dv_files']
    else:
        ratio = 0.0
    data_dict['percent_unknown'] = '{0:.3%}'.format(ratio)

    return StatsResult.build_success_result(data_dict)
def get_stats_result(self, request):
    """Build the StatsResult for the Dataset whose id is in the URL kwargs.

    Reads 'ds_id' from ``self.kwargs``. A missing id gives a 400 error
    result; no latest dataset version gives a 404 error result; otherwise
    the serialized latest version is returned as a success result.
    """
    dataset_id = self.kwargs.get('ds_id')
    if dataset_id is None:
        return StatsResult.build_error_result("No Dataset id specified", 400)

    # Only the latest version of the Dataset is serialized
    latest_version = get_latest_dataset_version(dataset_id)
    if latest_version is not None:
        return StatsResult.build_success_result(
            DatasetSerializer(latest_version).as_json())

    return StatsResult.build_error_result(
        'No published Dataset with id: %s' % dataset_id, 404)
def view_file_extensions_within_type(self, file_type=None):
    """View extensions for files based on their "Datafile.contenttype" value.

    :param file_type: contenttype used to narrow the Datafiles; ``None``
        means every FileMetadata label is examined
    :return: a ``StatsResult`` success result holding one record per
        distinct extension (count, total count, percent string), ordered
        by descending count, plus overall totals
    """
    file_labels = FileMetadata.objects
    if file_type is not None:
        # Restrict to the Datafiles that have the requested contenttype
        datafile_ids = Datafile.objects.filter(
            contenttype=file_type
        ).values_list('dvobject__id', flat=True)
        file_labels = file_labels.filter(datafile__in=datafile_ids)
    file_labels = file_labels.distinct(
        'datafile__id', 'label'
    ).values_list('datafile__id', 'label')

    # Convert the file names to extensions and count each extension
    extension_counts = {}  # {file extension: count}
    for _datafile_id, label in file_labels:
        extension = splitext(label)[-1]
        extension_counts[extension] = extension_counts.get(extension, 0) + 1

    # Highest count first
    ext_pairs = sorted(extension_counts.items(),
                       key=lambda pair: pair[1],
                       reverse=True)

    # Float so that the percentage divisions are true divisions
    total_count = sum(pair[1] for pair in ext_pairs) + 0.0

    records = []
    for extension, ext_count in ext_pairs:
        record = OrderedDict(extension=extension)
        record['count'] = ext_count
        record['total_count'] = int(total_count)
        record['percent_string'] = '{0:.3%}'.format(ext_count / total_count)
        records.append(record)

    all_dv_files = Datafile.objects.all().count()

    data_dict = OrderedDict(number_unique_extensions=len(ext_pairs))
    data_dict['total_file_count'] = int(total_count)
    data_dict['record_count'] = len(records)
    data_dict['records'] = records
    data_dict['all_dv_files'] = all_dv_files
    # Guard the ratio: an empty Datafile table used to raise
    # ZeroDivisionError here.
    data_dict['percent_unknown'] = '{0:.3%}'.format(
        total_count / all_dv_files if all_dv_files else 0.0)

    return StatsResult.build_success_result(data_dict)
def get_stats_result(self, request):
    """Build the StatsResult for the published Dataverse with a given alias.

    Reads 'alias' from ``self.kwargs``. A missing alias gives a 400 error
    result; no Dataverse with that alias and a non-null publication date
    gives a 404 error result; otherwise the serialized Dataverse is
    returned as a success result.
    """
    alias = self.kwargs.get('alias')
    if alias is None:
        return StatsResult.build_error_result("No Dataverse 'alias' specified", 400)

    lookup = dict(alias=alias, dvobject__publicationdate__isnull=False)
    try:
        selected_dv = Dataverse.objects.select_related('dvobject').get(**lookup)
    except Dataverse.DoesNotExist:
        return StatsResult.build_error_result(
            'No published Dataverse with alias: %s' % alias, 404)
    else:
        return StatsResult.build_success_result(
            DataverseSerializer(selected_dv).as_json())
def get_stats_result(self, request):
    """Return the StatsResult describing one Dataset, looked up by id.

    The id comes from the 'ds_id' entry of ``self.kwargs``. Error results
    are returned with status 400 (no id given) or 404 (no latest dataset
    version found for the id).
    """
    ds_id = self.kwargs.get('ds_id', None)

    if ds_id is not None:
        # Get the latest version
        version = get_latest_dataset_version(ds_id)
        if version is None:
            message = 'No published Dataset with id: %s' % ds_id
            return StatsResult.build_error_result(message, 404)

        serialized = DatasetSerializer(version).as_json()
        return StatsResult.build_success_result(serialized)

    return StatsResult.build_error_result("No Dataset id specified", 400)
def get_dataset_category_counts(self, **extra_filters):
    """Dataset counts by subject/category, with each category's share.

    :param extra_filters: forwarded to
        ``get_dataverse_dataset_subject_counts()``; not used when
        ``EASY_STATISTICS`` is set
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found earlier; otherwise a ``StatsResult`` success result with
        ``record_count`` and ``records`` (category, count, total_count,
        percent_string, percent_number)
    """
    # Was an error found earlier?
    if self.was_error_found():
        return self.get_error_msg_return()

    if EASY_STATISTICS:
        ds_values = self.get_easy_dataset_category_counts()
    else:
        ds_values = self.get_dataverse_dataset_subject_counts(**extra_filters)

    # -----------------------------
    # Iterate through the vocab values,
    # process the totals, calculate percentage
    # -----------------------------
    # Float total so the percentage is a true division on Python 2
    total_count = sum([rec['cnt'] for rec in ds_values]) + 0.00

    formatted_records = []  # move from a queryset to a []
    for info in ds_values:
        rec = OrderedDict()
        rec['category'] = info['category']

        # count
        rec['count'] = info['cnt']
        rec['total_count'] = int(total_count)

        # percent -- guarded so that records whose counts are all zero
        # do not raise ZeroDivisionError
        if total_count > 0:
            float_percent = info['cnt'] / total_count
        else:
            float_percent = 0.0
        rec['percent_string'] = '{0:.1%}'.format(float_percent)
        rec['percent_number'] = float("%.3f" % (float_percent))

        formatted_records.append(rec)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict)
def get_total_file_downloads(self, **extra_filters):
    """Get the total file download count.

    :param extra_filters: additional Django lookups for the
        GuestBookResponse query. The ``INCLUDE_PRE_DV4_DOWNLOADS`` key is
        not a lookup: its presence (whatever its value) means responses
        without a ``responsetime`` are counted too.
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found; otherwise a ``StatsResult`` success result with ``count``
        and ``count_string`` plus the SQL text of the query
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    filter_params = self.get_date_filter_params(date_var_name='responsetime')
    filter_params.update(self.get_download_type_filter())

    # Narrow down to specific Dataverses
    filter_params.update(self.get_dataverse_params_for_guestbook())
    if self.was_error_found():
        return self.get_error_msg_return()

    # Pull the flag out before merging the remaining lookups. pop() avoids
    # deleting from the dict while iterating over it, which raises
    # RuntimeError on Python 3.
    count_pre_dv4_downloads = INCLUDE_PRE_DV4_DOWNLOADS in extra_filters
    extra_filters.pop(INCLUDE_PRE_DV4_DOWNLOADS, None)
    filter_params.update(extra_filters)

    if count_pre_dv4_downloads:
        exclude_params = {}
    else:
        exclude_params = dict(responsetime__isnull=True)

    q = GuestBookResponse.objects.exclude(**exclude_params).filter(**filter_params)

    sql_query = str(q.query)

    data_dict = OrderedDict()
    data_dict['count'] = q.count()
    data_dict['count_string'] = "{:,}".format(data_dict['count'])

    return StatsResult.build_success_result(data_dict, sql_query)
def get_total_file_downloads(self, **extra_filters):
    """Get the total file download count from GuestBookResponse rows.

    :param extra_filters: extra Django lookups added to the query filters.
        If the ``INCLUDE_PRE_DV4_DOWNLOADS`` key is present it is treated
        as a flag (not a lookup) that keeps rows with a null
        ``responsetime`` in the count.
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found; otherwise a ``StatsResult`` success result with ``count``
        and ``count_string`` plus the SQL text of the query
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    filter_params = self.get_date_filter_params(date_var_name='responsetime')
    filter_params.update(self.get_download_type_filter())

    # Narrow down to specific Dataverses
    filter_params.update(self.get_dataverse_params_for_guestbook())
    if self.was_error_found():
        return self.get_error_msg_return()

    # Add extra filters, if they exist. Iterate over a snapshot of the
    # items: deleting from a dict while iterating its live view raises
    # RuntimeError on Python 3.
    count_pre_dv4_downloads = False
    for key, value in list(extra_filters.items()):
        if key == INCLUDE_PRE_DV4_DOWNLOADS:
            # A flag, not a query lookup -- skip it
            count_pre_dv4_downloads = True
            del extra_filters[key]
        else:
            filter_params[key] = value

    # Undated responses are only kept when the flag was given
    exclude_params = {} if count_pre_dv4_downloads else dict(responsetime__isnull=True)

    q = GuestBookResponse.objects.exclude(**exclude_params).filter(**filter_params)

    sql_query = str(q.query)

    data_dict = OrderedDict()
    data_dict['count'] = q.count()
    data_dict['count_string'] = "{:,}".format(data_dict['count'])

    return StatsResult.build_success_result(data_dict, sql_query)
def get_dataset_count(self, **extra_filters):
    """Count the Datasets matching the date filters and any extra lookups.

    Returns the value of ``self.get_error_msg_return()`` if an error was
    found earlier; otherwise a StatsResult success result holding 'count',
    'count_string' (with thousands separators) and the query's SQL text.
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    lookups = self.get_date_filter_params()
    for name in extra_filters:
        lookups[name] = extra_filters[name]

    datasets = Dataset.objects.filter(**lookups)
    sql_text = str(datasets.query)
    num_datasets = datasets.count()

    result = OrderedDict()
    result['count'] = num_datasets
    result['count_string'] = "{:,}".format(num_datasets)

    return StatsResult.build_success_result(result, sql_text)
def get_datafile_count(self, **extra_filters):
    """Count the Datafiles matching the date filters and any extra lookups.

    Returns the value of ``self.get_error_msg_return()`` if an error was
    found earlier; otherwise a StatsResult success result holding 'count',
    'count_string' (with thousands separators) and the query's SQL text.
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Date filters first; extra lookups override entries with the same key
    lookups = self.get_date_filter_params()
    for name in extra_filters:
        lookups[name] = extra_filters[name]

    datafiles = Datafile.objects.filter(**lookups)
    sql_text = str(datafiles.query)
    num_datafiles = datafiles.count()

    result = OrderedDict()
    result['count'] = num_datafiles
    result['count_string'] = "{:,}".format(num_datafiles)

    return StatsResult.build_success_result(result, sql_text)
def get_stats_result(self, request):
    """Build the StatsResult for a Dataset looked up by persistent id.

    Reads 'persistentId' from the request's GET parameters. A missing
    parameter gives a 400 error result. A 404 error result is returned
    when no Dataset is found, when its dvobject has no publication date,
    or when no latest dataset version exists.
    """
    persistent_id = request.GET.get('persistentId')
    if persistent_id is None:
        return StatsResult.build_error_result(
            "No Dataset persistent id specified", 400)

    ds = Dataset.get_dataset_by_persistent_id(persistent_id)
    err_404 = 'No published dataset found for persistentId: %s' % persistent_id

    if ds is not None and ds.dvobject.publicationdate:
        # Published: serialize the latest version, if there is one
        dataset_version = get_latest_dataset_version(ds.dvobject.id)
        if dataset_version is not None:
            return StatsResult.build_success_result(
                DatasetSerializer(dataset_version).as_json())

    return StatsResult.build_error_result(err_404, 404)
def get_stats_result(self, request):
    """Return the StatsResult for the Dataset named by 'persistentId'.

    The persistent id is taken from ``request.GET``. Without it a 400
    error result is returned; an unknown, unpublished or version-less
    Dataset yields a 404 error result.
    """
    pid = request.GET.get('persistentId', None)
    if pid is None:
        return StatsResult.build_error_result("No Dataset persistent id specified", 400)

    dataset = Dataset.get_dataset_by_persistent_id(pid)
    not_found_msg = 'No published dataset found for persistentId: %s' % pid

    if dataset is None:
        return StatsResult.build_error_result(not_found_msg, 404)
    if not dataset.dvobject.publicationdate:
        return StatsResult.build_error_result(not_found_msg, 404)

    # Get the latest version
    latest = get_latest_dataset_version(dataset.dvobject.id)
    if latest is None:
        return StatsResult.build_error_result(not_found_msg, 404)

    as_json = DatasetSerializer(latest).as_json()
    return StatsResult.build_success_result(as_json)
def get_dataverse_affiliation_counts(self, **extra_filters):
    """Return Dataverse counts by affiliation.

    Example of the records returned:

        [
            {
                "affiliation": "University of Oxford",
                "affiliation_count": 2,
                "total_count": 191,
                "percent_string": "1.0%"
            },
            ...
        ]

    :param extra_filters: additional Django lookups merged into the date
        filter parameters
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found earlier; otherwise a ``StatsResult`` success result with
        ``record_count`` and ``records`` plus the SQL text of the query
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Retrieve the date parameters, then add any extra lookups
    filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    filter_params.update(extra_filters)

    dataverse_counts_by_affil = (
        Dataverse.objects.select_related('dvobject')
        .filter(**filter_params)
        .values('affiliation')
        .order_by('affiliation')
        .annotate(affiliation_count=models.Count('affiliation'))
        .order_by('-affiliation_count'))

    # Get SQL query string
    sql_query = str(dataverse_counts_by_affil.query)

    # Total across all affiliations; float so the percentage below is a
    # true division on Python 2
    total_count = sum([rec.get('affiliation_count', 0)
                       for rec in dataverse_counts_by_affil]) + 0.0

    # Format the records, adding 'total_count' and 'percent_string'
    formatted_records = []
    for rec in dataverse_counts_by_affil:
        affiliation_count = rec.get('affiliation_count', 0)
        if affiliation_count <= 0:
            # Groups with a zero count are left out of the result
            continue

        fmt_dict = OrderedDict()

        affil_str = rec.get('affiliation', None)
        if affil_str is not None:
            affil_str = affil_str.encode('utf-8')

        fmt_dict['affiliation'] = affil_str
        fmt_dict['affiliation_count'] = affiliation_count

        if total_count > 0:
            float_percent = affiliation_count / total_count
            fmt_dict['total_count'] = int(total_count)
            fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)
        else:
            fmt_dict['total_count'] = 0
            fmt_dict['percent_string'] = '0%'

        formatted_records.append(fmt_dict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_dataverse_counts_by_type(self, exclude_uncategorized=True, **extra_filters):
    """Count Dataverses per 'dataversetype' and report each type's share.

    When ``exclude_uncategorized`` is true, Dataverses whose type equals
    ``DATAVERSE_TYPE_UNCATEGORIZED`` are left out of the query.

    Example of the records returned:

        [
            {
                "dataversetype": "RESEARCH_PROJECTS",
                "dataversetype_label": "RESEARCH PROJECTS",
                "type_count": 85,
                "total_count": 356,
                "percent_string": "23.9%"
            },
            ...
        ]

    Returns the value of ``self.get_error_msg_return()`` if an error was
    found earlier; otherwise a StatsResult success result with
    'record_count' and 'records' plus the query's SQL text.
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Date lookups first; extra lookups override entries with the same key
    lookups = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    for name in extra_filters:
        lookups[name] = extra_filters[name]

    exclusions = {}
    if exclude_uncategorized:
        exclusions['dataversetype'] = DATAVERSE_TYPE_UNCATEGORIZED

    counts_by_type = Dataverse.objects.select_related('dvobject')
    counts_by_type = counts_by_type.filter(**lookups).exclude(**exclusions)
    counts_by_type = counts_by_type.values('dataversetype').order_by('dataversetype')
    counts_by_type = counts_by_type.annotate(
        type_count=models.Count('dataversetype')).order_by('-type_count')

    sql_text = str(counts_by_type.query)

    # Float total so the share is computed with true division
    grand_total = sum([row.get('type_count', 0) for row in counts_by_type]) + 0.0

    records = []
    for row in counts_by_type:
        type_name = row['dataversetype']
        type_count = row.get('type_count', 0)

        entry = OrderedDict()
        entry['dataversetype'] = type_name
        entry['dataversetype_label'] = type_name.replace('_', ' ')
        entry['type_count'] = type_count

        if grand_total > 0:
            entry['total_count'] = int(grand_total)
            entry['percent_string'] = '{0:.1%}'.format(type_count / grand_total)
        else:
            entry['total_count'] = 0
            entry['percent_string'] = '0%'

        records.append(entry)

    result = OrderedDict()
    result['record_count'] = len(records)
    result['records'] = records

    return StatsResult.build_success_result(result, sql_text)
def get_dataverse_counts_by_month(self, date_param=DVOBJECT_CREATEDATE_ATTR, **extra_filters):
    """Return Dataverse counts by month, with a running total.

    Harvested Dataverses are left out unless ``self.include_harvested``
    is set.

    :param date_param: name of the date field used for the null-date
        exclusion and for the month truncation
    :param extra_filters: additional Django lookups merged into the date
        filter parameters; also forwarded to
        ``get_dataverse_count_start_point()``
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found earlier; otherwise a ``StatsResult`` success result with
        ``record_count``, ``total_count`` and ``records`` plus the SQL
        text of the query
    """
    # Was an error found earlier?
    if self.was_error_found():
        return self.get_error_msg_return()

    # -----------------------------------
    # (1) Build query filters
    # -----------------------------------
    filter_params = self.get_date_filter_params()
    filter_params.update(extra_filters)

    # -----------------------------------
    # (2) Construct query
    # -----------------------------------
    # Exclude records where dates are null
    # - e.g. a record may not have a publication date
    dv_counts_by_month = Dataverse.objects.select_related('dvobject').exclude(
        **{'%s__isnull' % date_param: True})

    # Harvested Dataverses get their own exclude() call: lookups given to
    # a single exclude() are ANDed inside the NOT, which would only drop
    # rows that are harvested *and* have a null date.
    if not self.include_harvested:
        dv_counts_by_month = dv_counts_by_month.exclude(
            dvobject__id__in=self.get_harvested_dataverse_ids())

    dv_counts_by_month = dv_counts_by_month.filter(**filter_params)

    # annotate query adding "yyyy_mm" and "count"
    dv_counts_by_month = (
        dv_counts_by_month
        .annotate(yyyy_mm=TruncYearMonth('%s' % date_param))
        .values('yyyy_mm')
        .annotate(count=models.Count('dvobject_id'))
        .values('yyyy_mm', 'count')
        .order_by('%syyyy_mm' % self.time_sort))

    # -----------------------------------
    # (2a) Get SQL query string
    # -----------------------------------
    sql_query = str(dv_counts_by_month.query)

    # -----------------------------------
    # (3) Format results
    # -----------------------------------
    # hold the running total count
    running_total = self.get_dataverse_count_start_point(**extra_filters)
    formatted_records = []  # move from a queryset to a []

    for d in dv_counts_by_month:
        month_start = d['yyyy_mm']

        rec_fmt = OrderedDict()

        # change the datetime object to a string
        rec_fmt['yyyy_mm'] = month_start.strftime('%Y-%m')
        rec_fmt['count'] = d['count']

        # running total
        running_total += d['count']
        rec_fmt['running_total'] = running_total

        # Add year and month numbers
        rec_fmt['year_num'] = month_start.year
        rec_fmt['month_num'] = month_start.month

        # Add month names; they are omitted when no name is found
        month_name_found, month_name_short = get_month_name_abbreviation(month_start.month)
        if month_name_found:
            _, rec_fmt['month_name'] = get_month_name(month_start.month)
            rec_fmt['month_name_short'] = month_name_short

        formatted_records.append(rec_fmt)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['total_count'] = running_total
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_file_counts_per_dataset_latest_versions(self, **extra_filters):
    """
    Get binning stats for the number of files in each Dataset.
    For the counts, only use the LATEST DatasetVersion

    The per-version file counts are loaded into a pandas DataFrame, cut
    into bins of width ``self.bin_size`` and returned as one record per
    bin (bin, count, sort_key, bin_start_inclusive, bin_end, bin_str).

    :param extra_filters: forwarded to ``get_dataset_version_ids()``
    :return: a StatsResult success result with 'record_count' and 'records'
    """
    # Get the correct DatasetVersion ids as a filter parameter
    #
    latest_dsv_ids = self.get_dataset_version_ids(**extra_filters)
    filter_params = dict(datasetversion__id__in=latest_dsv_ids)

    # Make query: one row per DatasetVersion id with its Datafile count,
    # highest count first
    #
    ds_version_counts = FileMetadata.objects.filter(**filter_params\
                ).annotate(dsv_id=F('datasetversion__id'),\
                ).values('dsv_id',\
                ).annotate(cnt=models.Count('datafile__id')\
                ).values('dsv_id', 'cnt'\
                ).order_by('-cnt')

    # Convert to Dataframe
    #
    df = pd.DataFrame(list(ds_version_counts), columns = ['dsv_id', 'cnt'])

    # Get the list of bins
    # NOTE(review): "high_num = high_num=" is a chained assignment (same
    #   value as a single assignment), and bin_size is added once here and
    #   once more in the call below -- confirm the double padding is intended
    #
    high_num = high_num=df['cnt'].max() + self.bin_size
    bins = self.get_bin_list(step=self.bin_size, low_num=0, high_num=high_num+self.bin_size)

    # Add a new column, assigning each file count to a bin
    #
    df['bin_label'] = pd.cut(df['cnt'], bins)

    # Count the occurrence of each bin
    #
    bin_count_series = pd.value_counts(df['bin_label'])

    # Make the Series into a new DataFrame
    #
    df_bins = pd.DataFrame(dict(bin=bin_count_series.index,\
                        count=bin_count_series.values))

    # Add a sort key
    # (0, 20] -> 0
    # (20, 30] -> 20
    # etc
    # NOTE(review): the slicing below treats each bin label as a string
    #   such as "(0, 20]" -- presumably true for the pandas version in use;
    #   verify before upgrading pandas
    df_bins['sort_key'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[0]))
    df_bins['bin_start_inclusive'] = df_bins['sort_key']
    df_bins['bin_end'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[1]))

    # Add a formatted string
    # (0, 20] -> 0 to 20
    # (20, 30] -> 20 to 30
    # etc
    df_bins['bin_str'] = df_bins['bin'].apply(lambda x: x[1:-1].replace(', ', ' to '))

    # Sort the bins
    # NOTE(review): DataFrame.sort() is not available in newer pandas
    #   releases (sort_values() is the replacement) -- confirm the pinned
    #   pandas version
    #
    df_bins = df_bins.sort('sort_key')

    #msgt(df_bins)

    # If appropriate, skip empty bins, e.g. remove 0 counts
    #
    if self.skip_empty_bins:
        df_bins = df_bins.query('count != 0')

    #msg(df_bins)

    # Return as python dict
    #   bit expensive (round trip through JSON) but want OrderedDict records
    formatted_records_json = df_bins.to_json(orient='records')
    formatted_records = json.loads(formatted_records_json, object_pairs_hook=OrderedDict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict)
def get_datafile_content_type_counts(self, **extra_filters):
    """Count Datafiles per 'contenttype' and report each type's share.

    Example of the records returned:

        [
            {
                "contenttype": "text/tab-separated-values",
                "short_content_type": "tab-separated-values",
                "type_count": 187,
                "total_count": 1584,
                "percent_string": "11.8%"
            },
            ...
        ]

    Returns the value of ``self.get_error_msg_return()`` if an error was
    found earlier; otherwise a StatsResult success result with
    'record_count' and 'records' plus the query's SQL text. No records
    are produced when the total count is zero.
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Date lookups first; extra lookups override entries with the same key
    lookups = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    for name in extra_filters:
        lookups[name] = extra_filters[name]

    counts_by_type = Datafile.objects.select_related('dvobject')
    counts_by_type = counts_by_type.filter(**lookups)
    counts_by_type = counts_by_type.values('contenttype').order_by('contenttype')
    counts_by_type = counts_by_type.annotate(
        type_count=models.Count('contenttype')).order_by('-type_count')

    sql_text = str(counts_by_type.query)

    # Total over all content types, as a float for true division
    grand_total = sum([row.get('type_count', 0) for row in counts_by_type]) + 0.0

    records = []
    for row in counts_by_type:
        if not grand_total > 0:
            continue

        content_type = row['contenttype']

        entry = OrderedDict()
        entry['contenttype'] = content_type

        # Short content type: everything after the first '/', or the
        # whole value when there is no '/'
        _head, slash, tail = content_type.partition('/')
        entry['short_content_type'] = tail if slash else content_type

        entry['type_count'] = row.get('type_count', 0)
        share = entry['type_count'] / grand_total
        entry['total_count'] = int(grand_total)
        entry['percent_string'] = '{0:.1%}'.format(share)

        records.append(entry)

    result = OrderedDict()
    result['record_count'] = len(records)
    result['records'] = records

    return StatsResult.build_success_result(result, sql_text)
def get_datafile_content_type_counts(self, **extra_filters):
    """Return Datafile counts grouped by 'contenttype'.

    Each record holds the content type, a short form of it (the part
    after the first '/'), the count for that type, the overall total and
    a percent string, e.g.:

        {
            "contenttype": "image/jpeg",
            "short_content_type": "jpeg",
            "type_count": 182,
            "total_count": 1584,
            "percent_string": "11.5%"
        }

    Returns the value of ``self.get_error_msg_return()`` if an error was
    found earlier; otherwise a StatsResult success result with
    'record_count' and 'records' plus the query's SQL text.
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Retrieve the date parameters and add the extra filters
    params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    if extra_filters:
        for key in extra_filters:
            params[key] = extra_filters[key]

    type_rows = (
        Datafile.objects.select_related('dvobject')
        .filter(**params)
        .values('contenttype')
        .order_by('contenttype')
        .annotate(type_count=models.Count('contenttype'))
        .order_by('-type_count'))

    query_text = str(type_rows.query)

    # Count all Datafiles covered by the query (float for true division)
    file_total = sum([r.get('type_count', 0) for r in type_rows])
    file_total = file_total + 0.0

    def _short_type(content_type):
        """Drop the leading 'major/' part of a content type, if any."""
        pieces = content_type.split('/')
        if len(pieces) > 1:
            return '/'.join(pieces[1:])
        return content_type

    out_records = []
    for r in type_rows:
        if file_total > 0:
            item = OrderedDict()
            item['contenttype'] = r['contenttype']
            item['short_content_type'] = _short_type(r['contenttype'])
            item['type_count'] = r.get('type_count', 0)

            fraction = item['type_count'] / file_total
            item['total_count'] = int(file_total)
            item['percent_string'] = '{0:.1%}'.format(fraction)

            out_records.append(item)

    payload = OrderedDict()
    payload['record_count'] = len(out_records)
    payload['records'] = out_records

    return StatsResult.build_success_result(payload, query_text)
def get_file_downloads_by_month(self, **extra_filters):
    """Using the GuestBookResponse object, find the number of file
    downloads per month.

    :param extra_filters: additional Django lookups for the query. The
        ``INCLUDE_PRE_DV4_DOWNLOADS`` key is not a lookup: its presence
        (whatever its value) makes the running total start from
        ``get_file_download_start_point_include_undated()`` instead of
        ``get_file_download_start_point()``. The remaining lookups are
        forwarded to whichever start-point method is used.
    :return: the value of ``self.get_error_msg_return()`` if an error was
        found; otherwise a ``StatsResult`` success result with
        ``total_downloads``, ``record_count`` and ``records`` plus the SQL
        text of the query
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    filter_params = self.get_date_filter_params(date_var_name='responsetime')
    filter_params.update(self.get_download_type_filter())

    # Narrow down to specific Dataverses
    filter_params.update(self.get_dataverse_params_for_guestbook())
    if self.was_error_found():
        return self.get_error_msg_return()

    # Pull the flag out before merging the remaining lookups. pop() avoids
    # deleting from the dict while iterating over it, which raises
    # RuntimeError on Python 3.
    count_pre_dv4_downloads = INCLUDE_PRE_DV4_DOWNLOADS in extra_filters
    extra_filters.pop(INCLUDE_PRE_DV4_DOWNLOADS, None)
    filter_params.update(extra_filters)

    # Rows without a responsetime cannot be assigned to a month
    file_counts_by_month = (
        GuestBookResponse.objects.exclude(responsetime__isnull=True)
        .filter(**filter_params)
        .annotate(yyyy_mm=TruncYearMonth('responsetime'))
        .values('yyyy_mm')
        .annotate(count=models.Count('id'))
        .values('yyyy_mm', 'count')
        .order_by('%syyyy_mm' % self.time_sort))

    sql_query = str(file_counts_by_month.query)

    if count_pre_dv4_downloads:
        file_running_total = self.get_file_download_start_point_include_undated(**extra_filters)
    else:
        file_running_total = self.get_file_download_start_point(**extra_filters)

    formatted_records = []  # move from a queryset to a []
    for d in file_counts_by_month:
        month_start = d['yyyy_mm']

        fmt_rec = OrderedDict()
        fmt_rec['yyyy_mm'] = month_start.strftime('%Y-%m')
        fmt_rec['count'] = d['count']

        file_running_total += d['count']
        fmt_rec['running_total'] = file_running_total

        # Add year and month numbers
        fmt_rec['year_num'] = month_start.year
        fmt_rec['month_num'] = month_start.month

        # Add month names; they are omitted when no name is found
        month_name_found, month_name_short = get_month_name_abbreviation(month_start.month)
        if month_name_found:
            _, fmt_rec['month_name'] = get_month_name(month_start.month)
            fmt_rec['month_name_short'] = month_name_short

        formatted_records.append(fmt_rec)

    data_dict = OrderedDict()
    data_dict['total_downloads'] = file_running_total
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_file_downloads_by_month(self, **extra_filters):
    """
    Using the GuestBookResponse object, find the number of file
    downloads per month.

    Every keyword argument becomes an extra queryset filter, except
    ``INCLUDE_PRE_DV4_DOWNLOADS``. The mere presence of that key (its
    value is not inspected) switches the starting value of the running
    total to ``get_file_download_start_point_include_undated``; the key
    itself is dropped before any filtering happens.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` is called
    with an OrderedDict (``total_downloads``, ``record_count``,
    ``records``) and the SQL text of the monthly aggregation.
    """
    # NOTE(review): an earlier definition with this same name precedes
    # this one in the file; if both sit in the same class body, this
    # later definition is the one Python binds.
    if self.was_error_found():
        return self.get_error_msg_return()

    filter_params = self.get_date_filter_params(date_var_name='responsetime')
    filter_params.update(self.get_download_type_filter())

    # Narrow down to specific Dataverses
    filter_params.update(self.get_dataverse_params_for_guestbook())
    if self.was_error_found():
        return self.get_error_msg_return()

    # Split the switch out of the filters without mutating the dict while
    # iterating over it (that pattern raises RuntimeError on Python 3).
    count_pre_dv4_downloads = False
    if INCLUDE_PRE_DV4_DOWNLOADS in extra_filters:
        count_pre_dv4_downloads = True
        del extra_filters[INCLUDE_PRE_DV4_DOWNLOADS]

    for k, v in extra_filters.items():
        filter_params[k] = v

    file_counts_by_month = (
        GuestBookResponse.objects
        .exclude(responsetime__isnull=True)
        .filter(**filter_params)
        .annotate(yyyy_mm=TruncYearMonth('responsetime'))
        .values('yyyy_mm')
        .annotate(count=models.Count('id'))
        .values('yyyy_mm', 'count')
        .order_by('%syyyy_mm' % self.time_sort)
    )

    sql_query = str(file_counts_by_month.query)

    # Starting point for the cumulative count
    if count_pre_dv4_downloads:
        file_running_total = \
            self.get_file_download_start_point_include_undated(
                **extra_filters)
    else:
        file_running_total = self.get_file_download_start_point(
            **extra_filters)

    formatted_records = []  # move from a queryset to a []
    for d in file_counts_by_month:
        year_month = d['yyyy_mm']
        month_num = year_month.month

        fmt_rec = OrderedDict()
        fmt_rec['yyyy_mm'] = year_month.strftime('%Y-%m')
        fmt_rec['count'] = d['count']

        file_running_total += d['count']
        fmt_rec['running_total'] = file_running_total

        # Add year and month numbers
        fmt_rec['year_num'] = year_month.year
        fmt_rec['month_num'] = month_num

        # Add month name
        month_name_found, month_name_short = get_month_name_abbreviation(
            month_num)
        if month_name_found:
            assume_month_name_found, fmt_rec['month_name'] = \
                get_month_name(month_num)
            fmt_rec['month_name_short'] = month_name_short
        else:
            # Keep the record, but leave a trace that names are missing.
            logging.warning(
                "no month name found for month %s"
                " (get_file_downloads_by_month)", month_num)

        formatted_records.append(fmt_rec)

    data_dict = OrderedDict()
    data_dict['total_downloads'] = file_running_total
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_file_count_by_month(self, date_param=DVOBJECT_CREATEDATE_ATTR, **extra_filters):
    """
    File counts (and summed file sizes) by month.

    ``date_param`` names the date field used for the monthly grouping.
    When it is not the create-date attribute, rows where that field is
    null are excluded. ``extra_filters`` are added to the queryset
    filters and also passed to ``get_file_count_start_point``.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` for an
    OrderedDict with ``record_count``, ``records``, ``total_bytes`` and
    ``total_bytes_str``, plus the SQL string of the query.
    """
    # Was an error found earlier?
    #
    if self.was_error_found():
        return self.get_error_msg_return()

    # -----------------------------------
    # (1) Build query filters
    # -----------------------------------

    # Exclude records where dates are null
    #   - e.g. a record may not have a publication date
    if date_param == DVOBJECT_CREATEDATE_ATTR:
        exclude_params = {}
    else:
        exclude_params = {'%s__isnull' % date_param: True}

    # Retrieve the date parameters
    #
    filter_params = self.get_date_filter_params()

    # Add extra filters from kwargs
    #
    filter_params.update(extra_filters)

    # -----------------------------------
    # (2) Construct query
    # -----------------------------------

    # add exclude filters and date filters
    #
    file_counts_by_month = (
        Datafile.objects.select_related('dvobject')
        .exclude(**exclude_params)
        .filter(**filter_params)
    )

    # annotate query adding "yyyy_mm", "count" and "bytes"
    #
    file_counts_by_month = (
        file_counts_by_month
        .annotate(yyyy_mm=TruncYearMonth('%s' % date_param))
        .values('yyyy_mm')
        .annotate(count=models.Count('dvobject_id'))
        .annotate(bytes=models.Sum('filesize'))
        .values('yyyy_mm', 'count', 'bytes')
        .order_by('%syyyy_mm' % self.time_sort)
    )

    sql_query = str(file_counts_by_month.query)

    # -----------------------------------
    # (3) Format results
    # -----------------------------------
    # hold the running total count
    running_total = self.get_file_count_start_point(**extra_filters)
    total_bytes = 0

    formatted_records = []  # move from a queryset to a []
    for d in file_counts_by_month:
        month_start = d['yyyy_mm']

        # Sum() yields None when every filesize in the month is NULL;
        # treat that as 0 so the totals below do not raise TypeError.
        month_bytes = d['bytes'] or 0

        fmt_rec = OrderedDict()
        fmt_rec['yyyy_mm'] = month_start.strftime('%Y-%m')
        fmt_rec['count'] = d['count']
        fmt_rec['bytes'] = month_bytes
        fmt_rec['bytes_str'] = comma_sep_number(month_bytes)
        total_bytes += month_bytes

        # running total
        running_total += d['count']
        fmt_rec['running_total'] = running_total

        # Add year and month numbers
        fmt_rec['year_num'] = month_start.year
        fmt_rec['month_num'] = month_start.month

        # Add month name
        month_name_found, month_name_short = get_month_name_abbreviation(
            month_start.month)
        if month_name_found:
            assume_month_name_found, fmt_rec['month_name'] = \
                get_month_name(month_start.month)
            fmt_rec['month_name_short'] = month_name_short
        else:
            # The record is still emitted, just without month names.
            logging.warning(
                "no month name found for month %s"
                " (get_file_count_by_month)", month_start.month)

        # Add formatted record
        formatted_records.append(fmt_rec)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records
    data_dict['total_bytes'] = total_bytes
    data_dict['total_bytes_str'] = comma_sep_number(total_bytes)

    return StatsResult.build_success_result(data_dict, sql_query)
def get_dataverse_affiliation_counts(self, **extra_filters):
    """
    Return Dataverse counts by affiliation.

    ``extra_filters`` are added to the date filters applied to the
    Dataverse queryset. Only affiliations with a count above zero are
    reported.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` for an
    OrderedDict with ``record_count`` and ``records``, plus the SQL
    string of the query. Each record looks like:

        {
            "affiliation": "University of Oxford",
            "affiliation_count": 2,
            "total_count": 191,
            "percent_string": "1.0%"
        }
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Retrieve the date parameters
    #
    filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    filter_params.update(extra_filters)

    dataverse_counts_by_affil = (
        Dataverse.objects.select_related('dvobject')
        .filter(**filter_params)
        .values('affiliation')
        .order_by('affiliation')
        .annotate(affiliation_count=models.Count('affiliation'))
        .order_by('-affiliation_count')
    )

    # -----------------------------------
    # Get SQL query string
    # -----------------------------------
    sql_query = str(dataverse_counts_by_affil.query)

    # Count all dataverses; kept as a float so the percentage division
    # below is a true division on Python 2 as well.
    #
    total_count = float(sum(
        rec.get('affiliation_count', 0)
        for rec in dataverse_counts_by_affil))

    # Format the records, adding 'total_count' and 'percent_string'
    # to each one
    #
    formatted_records = []
    for rec in dataverse_counts_by_affil:
        affil_count = rec.get('affiliation_count', 0)
        if affil_count <= 0:
            continue

        fmt_dict = OrderedDict()

        affil_str = rec.get('affiliation', None)
        if affil_str is not None:
            # NOTE(review): on Python 3 this produces a bytes object
            # rather than text - confirm before porting.
            affil_str = affil_str.encode('utf-8')

        fmt_dict['affiliation'] = affil_str
        fmt_dict['affiliation_count'] = affil_count

        if total_count > 0:
            float_percent = affil_count / total_count
            fmt_dict['total_count'] = int(total_count)
            fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)
        else:
            fmt_dict['total_count'] = 0
            fmt_dict['percent_string'] = '0%'

        formatted_records.append(fmt_dict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_dataset_subject_counts(self, **extra_filters):
    """
    Dataset counts by subject.

    Looks up the 'subject' DatasetFieldType of the 'citation' metadata
    block, takes one DatasetVersion per matching Dataset (the first row
    per dataset when ordered by descending id), and counts the
    controlled vocabulary values attached to those versions.

    ``extra_filters`` are added to the date filters applied to the
    Dataset queryset.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged, ``StatsResult.build_error_result`` when the subject field
    type does not exist, and otherwise
    ``StatsResult.build_success_result`` for an OrderedDict with
    ``record_count`` and ``records``. Each record holds ``subject``,
    ``count``, ``total_count``, ``percent_string`` and
    ``percent_number``.
    """
    # Was an error found earlier?
    #
    if self.was_error_found():
        return self.get_error_msg_return()

    # -----------------------------------
    # (1) Build query filters
    # -----------------------------------

    # Retrieve the date parameters, then add extra filters from kwargs
    filter_params = self.get_date_filter_params()
    filter_params.update(extra_filters)

    # -----------------------------
    # Get the DatasetFieldType for subject
    # -----------------------------
    search_attrs = dict(name='subject',
                        required=True,
                        metadatablock__name='citation')
    try:
        ds_field_type = DatasetFieldType.objects.get(**search_attrs)
    except DatasetFieldType.DoesNotExist:
        # Report through StatsResult like the other stats methods do,
        # instead of returning a bare (False, message) tuple.
        return StatsResult.build_error_result(
            'DatasetFieldType for Citation subject not found.'
            ' (kwargs: %s)' % search_attrs)

    # -----------------------------
    # Retrieve Dataset ids by time and published/unpublished
    # -----------------------------
    dataset_ids = (
        Dataset.objects.select_related('dvobject')
        .filter(**filter_params)
        .values_list('dvobject__id', flat=True)
    )

    # -----------------------------
    # Get latest DatasetVersion ids
    # -----------------------------
    id_info_list = (
        DatasetVersion.objects
        .filter(dataset__in=dataset_ids)
        .values('id', 'dataset_id', 'versionnumber',
                'minorversionnumber')
        .order_by('dataset_id', '-id', '-versionnumber',
                  '-minorversionnumber')
    )

    # -----------------------------
    # Iterate through and keep the first DatasetVersion id seen for
    # each dataset - given the ordering above, that is the latest one
    # -----------------------------
    dsv_ids = []
    last_dataset_id = None
    for idx, info in enumerate(id_info_list):
        if idx == 0 or info['dataset_id'] != last_dataset_id:
            dsv_ids.append(info['id'])
        last_dataset_id = info['dataset_id']

    # -----------------------------
    # Get the DatasetField ids
    # -----------------------------
    search_attrs2 = dict(datasetversion__id__in=dsv_ids,
                         datasetfieldtype__id=ds_field_type.id)
    ds_field_ids = (
        DatasetField.objects.select_related('datasetfieldtype')
        .filter(**search_attrs2)
        .values_list('id', flat=True)
    )

    # -----------------------------
    # Finally, get the ControlledVocabularyValues
    # -----------------------------
    ds_values = (
        DatasetFieldControlledVocabularyValue.objects
        .select_related('controlledvocabularyvalues')
        .filter(datasetfield__in=ds_field_ids)
        .annotate(subject=F('controlledvocabularyvalues__strvalue'))
        .values('subject')
        .annotate(cnt=models.Count('controlledvocabularyvalues__id'))
        .values('subject', 'cnt')
        .order_by('-cnt')
    )

    # -----------------------------
    # Iterate through the vocab values,
    # process the totals, calculate percentage
    # -----------------------------
    # Float total so the division is a true division on Python 2 too.
    total_count = float(sum(rec['cnt'] for rec in ds_values))

    formatted_records = []  # move from a queryset to a []
    for info in ds_values:
        # Guard against a zero total so this can never divide by zero.
        if total_count > 0:
            float_percent = info['cnt'] / total_count
        else:
            float_percent = 0.0

        rec = OrderedDict()
        rec['subject'] = info['subject']
        rec['count'] = info['cnt']
        rec['total_count'] = int(total_count)
        rec['percent_string'] = '{0:.1%}'.format(float_percent)
        rec['percent_number'] = float("%.3f" % (float_percent))

        formatted_records.append(rec)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict)
def get_dataset_count_by_month(self, date_param=DVOBJECT_CREATEDATE_ATTR, **extra_filters):
    """
    Return dataset counts by month.

    ``date_param`` names the date field used for the monthly grouping.
    When it is not the create-date attribute, rows where that field is
    null are excluded. ``extra_filters`` are added to the queryset
    filters and also passed to ``get_dataset_count_start_point``, which
    seeds the running total.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` for an
    OrderedDict with ``record_count`` and ``records``, plus the SQL
    string of the query.
    """
    # Was an error found earlier?
    #
    if self.was_error_found():
        return self.get_error_msg_return()

    # -----------------------------------
    # (1) Build query filters
    # -----------------------------------

    # Exclude records where dates are null
    #   - e.g. a record may not have a publication date
    if date_param == DVOBJECT_CREATEDATE_ATTR:
        exclude_params = {}
    else:
        exclude_params = {'%s__isnull' % date_param: True}

    # Retrieve the date parameters
    #
    filter_params = self.get_date_filter_params()

    # Add extra filters from kwargs
    #
    filter_params.update(extra_filters)

    # -----------------------------------
    # (2) Construct query
    # -----------------------------------

    # add exclude filters and date filters
    #
    ds_counts_by_month = (
        Dataset.objects.select_related('dvobject')
        .exclude(**exclude_params)
        .filter(**filter_params)
    )

    # annotate query adding "yyyy_mm" and "count"
    #
    ds_counts_by_month = (
        ds_counts_by_month
        .annotate(yyyy_mm=TruncYearMonth('%s' % date_param))
        .values('yyyy_mm')
        .annotate(count=models.Count('dvobject_id'))
        .values('yyyy_mm', 'count')
        .order_by('%syyyy_mm' % self.time_sort)
    )

    # store query string
    sql_query = str(ds_counts_by_month.query)

    # -----------------------------------
    # (3) Format results
    # -----------------------------------
    # hold the running total count
    running_total = self.get_dataset_count_start_point(**extra_filters)

    formatted_records = []  # move from a queryset to a []
    for d in ds_counts_by_month:
        month_start = d['yyyy_mm']

        fmt_dict = OrderedDict()
        fmt_dict['yyyy_mm'] = month_start.strftime('%Y-%m')
        fmt_dict['count'] = d['count']

        # running total
        running_total += d['count']
        fmt_dict['running_total'] = running_total

        # Add year and month numbers
        fmt_dict['year_num'] = month_start.year
        fmt_dict['month_num'] = month_start.month

        # Add month name
        month_name_found, month_name_short = get_month_name_abbreviation(
            month_start.month)
        if month_name_found:
            assume_month_name_found, fmt_dict['month_name'] = \
                get_month_name(month_start.month)
            fmt_dict['month_name_short'] = month_name_short
        else:
            # The record is still emitted, just without month names.
            logging.warning(
                "no month name found for month %s"
                " (get_dataset_count_by_month)", month_start.month)

        # Add formatted record
        formatted_records.append(fmt_dict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_published_dataverses_without_content(self, **extra_filters):
    """
    For curation purposes: a list of all published dataverses that do
    not contain any datasets/content, oldest first.

    Based on @sekmiller's SQL query.

    A Dataverse is left out when its id appears as the owner of a
    Dataset or of another Dataverse, or in either of the two linking
    tables queried below.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` for an
    OrderedDict with ``count`` and ``records``, plus the SQL string of
    the final Dataverse query.
    """
    # NOTE(review): extra_filters is accepted but never applied here.

    # Was an error found earlier?
    #
    if self.was_error_found():
        return self.get_error_msg_return()

    # -----------------------------------
    # Retrieve the date parameters - distinguish by create date
    # -----------------------------------
    filter_params = self.get_date_filter_params()
    filter_params.update(self.get_is_published_filter_param())

    # -----------------------------------
    # Retrieve ids of Dataverses to ~exclude~
    # -----------------------------------

    # Owner ids of every Dataset and of every Dataverse that has an
    # owner, i.e. the Dataverses that contain something
    #
    id_set1 = (
        DvObject.objects
        .filter(Q(dtype=DTYPE_DATASET) |
                Q(dtype=DTYPE_DATAVERSE, owner__isnull=False))
        .distinct('owner__id')
        .values_list('owner__id', flat=True)
    )

    # Dataverses that link to datasets
    #
    id_set2 = (
        DatasetLinkingDataverse.objects
        .distinct('linkingdataverse__id')
        .values_list('linkingdataverse__id', flat=True)
    )

    # Dataverses that link to Dataverses
    #
    id_set3 = (
        DataverseLinkingDataverse.objects
        .distinct('dataverse__id')
        .values_list('dataverse__id', flat=True)
    )

    # Combine the ids into a single set
    #
    dv_ids_to_exclude = set(id_set1).union(id_set2, id_set3)

    # Retrieve published Dataverses that aren't in the set above.
    # Only one order_by is kept: a later order_by() call replaces any
    # earlier ordering, so an earlier create-date ordering had no effect.
    #
    dv_info_list = (
        Dataverse.objects.select_related('dvobject')
        .exclude(dvobject__id__in=dv_ids_to_exclude)
        .filter(**filter_params)
        .annotate(dv_id=F('dvobject__id'),
                  create_date=F(DVOBJECT_CREATEDATE_ATTR),
                  pub_date=F('dvobject__publicationdate'))
        .values('dv_id', 'name', 'alias',
                'create_date', 'pub_date',
                'affiliation')
        .order_by('create_date', 'name')
    )

    # Take the SQL from the queryset built above; the name `q` that was
    # referenced here before is not defined in this method.
    sql_query = str(dv_info_list.query)

    records = []
    for dv_info in dv_info_list:
        single_rec = OrderedDict()
        single_rec['id'] = dv_info['dv_id']
        single_rec['name'] = dv_info['name']
        single_rec['alias'] = dv_info['alias']
        single_rec['url'] = '%s/dataverse/%s' % (
            settings.DATAVERSE_INSTALLATION_URL, dv_info['alias'])
        single_rec['affiliation'] = dv_info['affiliation']
        single_rec['publication_date'] = dv_info['pub_date'].strftime(
            TIMESTAMP_MASK)
        single_rec['create_date'] = dv_info['create_date'].strftime(
            TIMESTAMP_MASK)

        records.append(single_rec)

    data_dict = OrderedDict()
    data_dict['count'] = len(records)
    data_dict['records'] = records

    return StatsResult.build_success_result(data_dict, sql_query)
def get_file_counts_per_dataset_latest_versions(self, **extra_filters):
    """
    Get binning stats for the number of files in each Dataset.
    For the counts, only use the LATEST DatasetVersion.

    ``extra_filters`` are passed to ``get_dataset_version_ids``, which
    supplies the DatasetVersion ids to count files for. Bin width comes
    from ``self.bin_size``; empty bins are dropped when
    ``self.skip_empty_bins`` is set.

    Returns ``StatsResult.build_success_result`` for an OrderedDict with
    ``record_count`` and ``records`` (one record per bin). When there
    are no file counts at all, ``records`` is an empty list.
    """
    # Get the correct DatasetVersion ids as a filter parameter
    #
    latest_dsv_ids = self.get_dataset_version_ids(**extra_filters)
    filter_params = dict(datasetversion__id__in=latest_dsv_ids)

    # Make query
    #
    ds_version_counts = (
        FileMetadata.objects
        .filter(**filter_params)
        .annotate(dsv_id=F('datasetversion__id'))
        .values('dsv_id')
        .annotate(cnt=models.Count('datafile__id'))
        .values('dsv_id', 'cnt')
        .order_by('-cnt')
    )

    # Convert to Dataframe
    #
    df = pd.DataFrame(list(ds_version_counts), columns=['dsv_id', 'cnt'])

    # Nothing to bin: max() of an empty column is NaN, which cannot be
    # turned into bin edges, so return an empty result instead.
    if len(df.index) == 0:
        data_dict = OrderedDict()
        data_dict['record_count'] = 0
        data_dict['records'] = []
        return StatsResult.build_success_result(data_dict)

    # Get the list of bins; the upper bound ends up padded by two bin
    # widths beyond the largest count.
    #
    high_num = df['cnt'].max() + self.bin_size
    bins = self.get_bin_list(step=self.bin_size,
                             low_num=0,
                             high_num=high_num + self.bin_size)

    # Add a new column, assigning each file count to a bin
    #
    df['bin_label'] = pd.cut(df['cnt'], bins)

    # Count the occurrence of each bin
    #
    bin_count_series = pd.value_counts(df['bin_label'])

    # Make the Series into a new DataFrame
    #
    df_bins = pd.DataFrame(dict(bin=bin_count_series.index,
                                count=bin_count_series.values))

    # Add a sort key
    # (0, 20] -> 0
    # (20, 30] -> 20
    # etc
    # NOTE(review): slicing the label assumes bin labels are strings
    # such as "(0, 20]" - confirm for the pandas version in use.
    df_bins['sort_key'] = df_bins['bin'].apply(
        lambda x: int(x[1:-1].split(',')[0]))
    df_bins['bin_start_inclusive'] = df_bins['sort_key']
    df_bins['bin_end'] = df_bins['bin'].apply(
        lambda x: int(x[1:-1].split(',')[1]))

    # Add a formatted string
    # (0, 20] -> 0 to 20
    # (20, 30] -> 20 to 30
    # etc
    df_bins['bin_str'] = df_bins['bin'].apply(
        lambda x: x[1:-1].replace(', ', ' to '))

    # Sort the bins. DataFrame.sort() was removed in pandas 0.20, so
    # prefer sort_values() whenever the installed pandas provides it.
    #
    sort_frame = getattr(df_bins, 'sort_values', None) or df_bins.sort
    df_bins = sort_frame('sort_key')

    # If appropriate, skip empty bins, e.g. remove 0 counts
    #
    if self.skip_empty_bins:
        df_bins = df_bins.query('count != 0')

    # Return as python dict
    #   - bit expensive but want orderedDict
    formatted_records_json = df_bins.to_json(orient='records')
    formatted_records = json.loads(formatted_records_json,
                                   object_pairs_hook=OrderedDict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict)
def get_dataset_size_counts(self, **extra_filters):
    """
    Get binning stats for the byte size of each Dataset.

    File sizes are summed per owning dataset and the sums are binned
    using ``self.bin_size_bytes`` as the bin width. ``extra_filters``
    are applied directly to the Datafile queryset. Empty bins are
    dropped when ``self.skip_empty_bins`` is set.

    Returns ``StatsResult.build_success_result`` for an OrderedDict with
    ``record_count``, ``dataset_count``, ``total_bytes_used``,
    ``total_bytes_used_comma``, ``total_bytes_used_abbrev`` and
    ``records`` (one record per bin). When there is nothing to bin, the
    counts are zero and ``records`` is an empty list.
    """
    filter_params = {}
    filter_params.update(extra_filters)

    # Make query
    #
    dataset_file_sizes = (
        Datafile.objects
        .filter(**filter_params)
        .annotate(ds_id=F('dvobject__owner__id'))
        .values('ds_id')
        .annotate(cnt=models.Count('dvobject__id'),
                  ds_size=Sum('filesize'))
        .values('ds_id', 'cnt', 'ds_size')
        .order_by('ds_size')
    )

    # Convert to Dataframe
    #
    df = pd.DataFrame(list(dataset_file_sizes),
                      columns=['ds_id', 'cnt', 'ds_size'])

    # Nothing to bin: with no rows (or no known sizes) max() is NaN,
    # which cannot be turned into bin edges. Return an empty result.
    largest_size = df['ds_size'].max() if len(df.index) else None
    if largest_size is None or pd.isnull(largest_size):
        data_dict = OrderedDict()
        data_dict['record_count'] = 0
        data_dict['dataset_count'] = 0
        data_dict['total_bytes_used'] = 0
        data_dict['total_bytes_used_comma'] = comma_sep_number(0)
        data_dict['total_bytes_used_abbrev'] = sizeof_fmt(0)
        data_dict['records'] = []
        return StatsResult.build_success_result(data_dict)

    total_bytes_used = df['ds_size'].sum()

    # Get the list of bins; the upper bound ends up padded by two bin
    # widths beyond the largest dataset size.
    #
    high_num = largest_size + self.bin_size_bytes
    bins = self.get_bin_list(step=self.bin_size_bytes,
                             low_num=0,
                             high_num=high_num + self.bin_size_bytes)

    # Add a new column, assigning each dataset size to a bin
    #
    df['bin_label'] = pd.cut(df['ds_size'], bins)

    # Count the occurrence of each bin
    #
    bin_count_series = pd.value_counts(df['bin_label'])

    # Make the Series into a new DataFrame
    #
    df_bins = pd.DataFrame(dict(bin=bin_count_series.index,
                                count=bin_count_series.values))

    total_dataset_count = df_bins['count'].sum()

    # Add a sort key
    # (0, 20] -> 0
    # (20, 30] -> 20
    # etc
    # NOTE(review): slicing the label assumes bin labels are strings
    # such as "(0, 20]" - confirm for the pandas version in use.
    df_bins['sort_key'] = df_bins['bin'].apply(
        lambda x: int(x[1:-1].split(',')[0]))

    if total_dataset_count > 0:
        df_bins['percentage_of_datasets'] = df_bins['count'].apply(
            lambda x: "{0:.4f}%".format(
                100 * x/float(total_dataset_count)))

    df_bins['bin_start_inclusive'] = df_bins['sort_key']
    df_bins['bin_start_inclusive_commas'] = \
        df_bins['bin_start_inclusive'].apply(
            lambda x: comma_sep_number(x))
    df_bins['bin_start_inclusive_abbrev'] = \
        df_bins['bin_start_inclusive'].apply(lambda x: sizeof_fmt(x))

    df_bins['bin_end'] = df_bins['bin'].apply(
        lambda x: int(x[1:-1].split(',')[1]))
    df_bins['bin_end_commas'] = df_bins['bin_end'].apply(
        lambda x: comma_sep_number(x))
    df_bins['bin_end_abbrev'] = df_bins['bin_end'].apply(
        lambda x: sizeof_fmt(x))

    df_bins['bin_str'] = df_bins['bin_start_inclusive_abbrev'].str.cat(
        df_bins['bin_end_abbrev'].values.astype(str), sep=' to ')

    # Sort the bins. DataFrame.sort() was removed in pandas 0.20, so
    # prefer sort_values() whenever the installed pandas provides it.
    #
    sort_frame = getattr(df_bins, 'sort_values', None) or df_bins.sort
    df_bins = sort_frame('sort_key')

    # If appropriate, skip empty bins, e.g. remove 0 counts
    #
    if self.skip_empty_bins:
        df_bins = df_bins.query('count != 0')

    # Return as python dict
    #   - bit expensive but want orderedDict
    formatted_records_json = df_bins.to_json(orient='records')
    formatted_records = json.loads(formatted_records_json,
                                   object_pairs_hook=OrderedDict)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    # Plain ints rather than numpy scalars, so the result dict holds
    # only built-in types.
    data_dict['dataset_count'] = int(total_dataset_count)
    data_dict['total_bytes_used'] = int(total_bytes_used)
    data_dict['total_bytes_used_comma'] = comma_sep_number(
        int(total_bytes_used))
    data_dict['total_bytes_used_abbrev'] = sizeof_fmt(total_bytes_used)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict)
def get_dataverse_counts_by_type(self, exclude_uncategorized=True, **extra_filters):
    """
    Return dataverse counts grouped by 'dataversetype'.

    When ``exclude_uncategorized`` is true, Dataverses whose type equals
    ``DATAVERSE_TYPE_UNCATEGORIZED`` are left out. ``extra_filters`` are
    added to the create-date filters applied to the queryset.

    Returns ``self.get_error_msg_return()`` when an error has been
    flagged; otherwise ``StatsResult.build_success_result`` for an
    OrderedDict with ``record_count`` and ``records``, plus the SQL
    string of the query. Records are ordered by descending count and
    look like:

        {
            "dataversetype": "RESEARCH_PROJECTS",
            "dataversetype_label": "RESEARCH PROJECTS",
            "type_count": 85,
            "total_count": 356,
            "percent_string": "23.9%"
        }
    """
    if self.was_error_found():
        return self.get_error_msg_return()

    # Date filters first, then whatever the caller supplied on top
    query_filters = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)
    for key in extra_filters:
        query_filters[key] = extra_filters[key]

    # Optionally leave out the uncategorized Dataverses
    excluded = {}
    if exclude_uncategorized:
        excluded['dataversetype'] = DATAVERSE_TYPE_UNCATEGORIZED

    dataverse_counts_by_type = (
        Dataverse.objects.select_related('dvobject')
        .filter(**query_filters)
        .exclude(**excluded)
        .values('dataversetype')
        .order_by('dataversetype')
        .annotate(type_count=models.Count('dataversetype'))
        .order_by('-type_count')
    )

    # SQL text of the grouped query
    sql_query = str(dataverse_counts_by_type.query)

    # Grand total over all types, as a float for the percentage maths
    grand_total = 0
    for row in dataverse_counts_by_type:
        grand_total += row.get('type_count', 0)
    grand_total = grand_total + 0.0

    # One output record per type, each carrying the grand total and
    # its share of it
    formatted_records = []
    for row in dataverse_counts_by_type:
        type_name = row['dataversetype']
        type_count = row.get('type_count', 0)

        entry = OrderedDict()
        entry['dataversetype'] = type_name
        entry['dataversetype_label'] = type_name.replace('_', ' ')
        entry['type_count'] = type_count

        if grand_total > 0:
            entry['total_count'] = int(grand_total)
            entry['percent_string'] = '{0:.1%}'.format(
                type_count / grand_total)
        else:
            entry['total_count'] = 0
            entry['percent_string'] = '0%'

        formatted_records.append(entry)

    data_dict = OrderedDict()
    data_dict['record_count'] = len(formatted_records)
    data_dict['records'] = formatted_records

    return StatsResult.build_success_result(data_dict, sql_query)