Beispiel #1
0
def view_dataverse_list(request, output_format='xlsx', **kwargs):
    """Return a list of Dataverses as a downloadable .xlsx file.

    request -- Django HttpRequest (not read beyond the view signature)
    output_format -- only 'xlsx' is supported; any other value returns
                     an error HttpResponse
    kwargs -- 'published_only' (default True) restricts the query to
              published Dataverses
    """
    filter_params = {}

    # Restrict to published Dataverses unless explicitly disabled
    published_only = kwargs.get('published_only', True)
    if published_only:
        filter_params.update(query_helper.get_is_published_filter_param())

    vals = [
        'id', 'name', 'alias', 'dataversetype', 'createdate', 'publicationdate'
    ]

    # Promote dvobject fields to top-level values so .values(*vals) works
    dlist = Dataverse.objects.select_related('dvobject'
                ).filter(**filter_params
                ).annotate(id=F('dvobject__id'),
                    createdate=F('dvobject__createdate'),
                    publicationdate=F('dvobject__publicationdate'),
                ).values(*vals
                ).order_by('alias')

    df = pd.DataFrame(list(dlist), columns=vals)

    # NOTE(review): base URL is hard-coded to dataverse.harvard.edu
    df['dataverse_url'] = df['alias'].apply(
        lambda x: 'https://dataverse.harvard.edu/dataverse/%s' % x)
    vals.append('dataverse_url')

    if output_format == 'xlsx':
        excel_string_io = StringIO.StringIO()

        pd_writer = pd.ExcelWriter(excel_string_io, engine='xlsxwriter')

        df.to_excel(pd_writer, index=False, sheet_name='metrics', columns=vals)

        pd_writer.save()

        excel_string_io.seek(0)
        workbook = excel_string_io.getvalue()

        # Bug fix: StringIO.getvalue() returns a string, never None, so
        # the original "if workbook is None" guard was dead code.
        # Check for an empty buffer instead.
        if not workbook:
            return HttpResponse(
                'Sorry! An error occurred trying to create an Excel spreadsheet.'
            )

        xlsx_fname = 'dataverses_%s.xlsx' % get_timestamp_for_filename()

        response = HttpResponse(
            workbook,
            content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        # Prompt the browser to download rather than display
        response[
            'Content-Disposition'] = 'attachment; filename=%s' % xlsx_fname

        return response

    return HttpResponse('Sorry.  This format is not recognized: %s' %
                        output_format)
Beispiel #2
0
def view_single_dataverse_by_alias(request, alias):
    """Render a JSON representation of one published Dataverse.

    Raises Http404 if no published Dataverse matches *alias*.
    """
    # Build the lookup: alias match plus the "is published" restriction
    lookup = dict(alias=alias)
    lookup.update(query_helper.get_is_published_filter_param())

    try:
        dataverse = Dataverse.objects.select_related('dvobject').get(**lookup)
    except Dataverse.DoesNotExist:
        raise Http404

    return _view_single_dataverse(request, dataverse)
Beispiel #3
0
def view_dataverse_list(request, output_format='xlsx', **kwargs):
    """Return a list of Dataverses as a downloadable .xlsx file.

    request -- Django HttpRequest (not read beyond the view signature)
    output_format -- only 'xlsx' is supported; any other value returns
                     an error HttpResponse
    kwargs -- 'published_only' (default True) restricts the query to
              published Dataverses
    """
    filter_params = {}

    # Restrict to published Dataverses unless explicitly disabled
    published_only = kwargs.get('published_only', True)
    if published_only:
        filter_params.update(query_helper.get_is_published_filter_param())

    vals = ['id', 'name', 'alias', 'dataversetype', 'createdate', 'publicationdate']

    # Promote dvobject fields to top-level values so .values(*vals) works
    dlist = Dataverse.objects.select_related('dvobject'
                ).filter(**filter_params
                ).annotate(id=F('dvobject__id'),
                    createdate=F('dvobject__createdate'),
                    publicationdate=F('dvobject__publicationdate'),
                ).values(*vals
                ).order_by('alias')

    df = pd.DataFrame(list(dlist), columns=vals)

    # NOTE(review): base URL is hard-coded to dataverse.harvard.edu
    df['dataverse_url'] = df['alias'].apply(
        lambda x: 'https://dataverse.harvard.edu/dataverse/%s' % x)
    vals.append('dataverse_url')

    if output_format == 'xlsx':
        excel_string_io = StringIO.StringIO()

        pd_writer = pd.ExcelWriter(excel_string_io, engine='xlsxwriter')

        df.to_excel(pd_writer, index=False, sheet_name='metrics', columns=vals)

        pd_writer.save()

        excel_string_io.seek(0)
        workbook = excel_string_io.getvalue()

        # Bug fix: StringIO.getvalue() returns a string, never None, so
        # the original "if workbook is None" guard was dead code.
        # Check for an empty buffer instead.
        if not workbook:
            return HttpResponse('Sorry! An error occurred trying to create an Excel spreadsheet.')

        xlsx_fname = 'dataverses_%s.xlsx' % get_timestamp_for_filename()

        response = HttpResponse(
            workbook,
            content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        # Prompt the browser to download rather than display
        response['Content-Disposition'] = 'attachment; filename=%s' % xlsx_fname

        return response

    return HttpResponse('Sorry.  This format is not recognized: %s' % output_format)
Beispiel #4
0
def view_file_content_types(request):
    """JSON list of the distinct content types of published Datafiles.

    GET params:
      as_excel -- if present, return an .xlsx download instead of JSON
      pretty -- if present (and as_excel absent), return indented JSON
                wrapped in <pre> tags
    """
    is_pretty = False
    as_excel = False
    if request.GET.get('as_excel') is not None:
        as_excel = True
    elif request.GET.get('pretty') is not None:
        is_pretty = True

    is_published_param = query_helper.get_is_published_filter_param()

    # Distinct content types across published files, alphabetized
    content_types = Datafile.objects.select_related('dvobject'
                        ).filter(**is_published_param
                        ).values_list('contenttype', flat=True
                        ).distinct().order_by('contenttype')

    d = OrderedDict()
    d['content_types'] = list(content_types)

    if as_excel:
        # Bug fix: 'd' always contains the 'content_types' key here, so
        # the original "len(d) == 0" test could never be True.  Check
        # the actual result list instead.
        if not d['content_types']:
            return HttpResponse('nothing found')
        # https://stackoverflow.com/questions/35267585/django-pandas-to-http-response-download-file
        df = pd.read_json(json.dumps(d))
        excel_file = StringIO.StringIO()
        xlwriter = pd.ExcelWriter(excel_file, engine='xlsxwriter')
        df.to_excel(xlwriter, 'content_types', index=False)
        xlwriter.save()
        xlwriter.close()
        excel_file.seek(0)
        # set the mime type so that the browser knows what to do with the file
        response = HttpResponse(
            excel_file.read(),
            content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        # set the file name in the Content-Disposition header
        response['Content-Disposition'] = 'attachment; filename=dv_content_types.xlsx'

        return response

    d['description'] = 'Content types of published Dataverse files'

    if is_pretty:
        json_str = '<pre>%s</pre>' % json.dumps(d, indent=4)
        return HttpResponse(json_str)
    else:
        json_str = json.dumps(d)
        return HttpResponse(json_str,
                            content_type="application/json")
Beispiel #5
0
def get_latest_dataset_version(dataset_id):
    """Return the latest *published* DatasetVersion for a dataset id.

    Returns None when no published Dataset matches *dataset_id*, or
    (implicitly, via .first()) when it has no RELEASED version.
    """
    published_filter = query_helper.get_is_published_filter_param()

    ds = Dataset.objects.select_related('dvobject') \
            .filter(dvobject__id=dataset_id) \
            .filter(**published_filter) \
            .first()

    if ds is None:
        return None

    # Highest id among RELEASED versions == most recent published version
    return DatasetVersion.objects \
            .select_related('dataset') \
            .filter(dataset=ds, versionstate=VERSION_STATE_RELEASED) \
            .order_by('-id') \
            .first()
Beispiel #6
0
def get_table_rows(datafile_id):
    """Download a published Datafile and return its preview rows.

    Returns a 3-tuple (success, payload, http_status) where payload is
    either the preview row data or an error message string.
    """
    if not datafile_id:
        return False, "No file id specified", 400

    published_filter = query_helper.get_is_published_filter_param()

    # Locate the published file with the requested id
    datafile = Datafile.objects.select_related('dvobject') \
                    .filter(**published_filter) \
                    .filter(dvobject__id=datafile_id) \
                    .first()
    if datafile is None:
        return False, "No published file found with id: %s" % datafile_id, 404

    access_url = DatafileUtil.get_file_access_url(datafile_id)

    temp_filepath, file_ext = temp_file_helper.download_file(access_url)
    if temp_filepath is None:
        return False, "Failed to download file", 400

    previewer = TabularPreviewer(temp_filepath, **dict(file_ext=file_ext))
    if previewer.has_error():
        return False, previewer.error_message, 400

    data_rows = previewer.get_data_rows()
    if data_rows is None:
        return False, previewer.error_message, 500

    # Rows in hand--delete the downloaded temp file.
    # (Future: cache or persist preview rows to the db.)
    temp_file_helper.make_sure_file_deleted(temp_filepath)

    return True, data_rows, 200
Beispiel #7
0
def get_table_rows(datafile_id):
    """Fetch a published Datafile and produce its tabular preview rows.

    Returns (success, payload, http_status); payload is row data on
    success or an error message string on failure.
    """
    if not datafile_id:
        return False, "No file id specified", 400

    # Published file lookup
    published_filter = query_helper.get_is_published_filter_param()
    qs = Datafile.objects.select_related('dvobject').filter(**published_filter)
    datafile = qs.filter(dvobject__id=datafile_id).first()

    if datafile is None:
        return False, "No published file found with id: %s" % datafile_id, 404

    # Download the file contents to a temp location
    access_url = DatafileUtil.get_file_access_url(datafile_id)
    temp_filepath, file_ext = temp_file_helper.download_file(access_url)
    if temp_filepath is None:
        return False, "Failed to download file", 400

    # Build the previewer and extract rows
    previewer = TabularPreviewer(temp_filepath, **dict(file_ext=file_ext))
    if previewer.has_error():
        return False, previewer.error_message, 400

    data_rows = previewer.get_data_rows()
    if data_rows is None:
        return False, previewer.error_message, 500

    # Rows extracted--clean up the downloaded temp file.
    # (Future: cache or persist preview rows to the db.)
    temp_file_helper.make_sure_file_deleted(temp_filepath)

    return True, data_rows, 200
    def make_json_files(self):

        # Set publication status
        #
        filters = {}
        if self.published_only:
            filters.update(query_helper.get_is_published_filter_param())

        # Query for dataset ids
        #
        ds_id_query = Dataset.objects.filter(**filters\
                            ).annotate(ds_id=F('dvobject__id')\
                            ).values_list('ds_id', flat=True\
                            ).order_by('ds_id')

        # Iterate through dataset ids
        #
        #start_time = datetime.now()
        start_time = int(time.time())  # epoch seconds

        cnt = 0
        no_versions_found_list = [45900]

        for ds_id in ds_id_query:
            cnt += 1
            msgt('(%d) Checking dataset id %s' % (cnt, ds_id))
            if ds_id < self.dataset_start_id:
                msg('skipping...(start at dataset id: %d)' %
                    self.dataset_start_id)
                continue

            # Create file name
            #
            fname = 'ds_%s.json' % (str(ds_id).zfill(8))
            full_fname = join(OUTPUT_DIR, fname)

            # Should we overwrite the existing file?
            #
            if isfile(full_fname) and not self.overwrite_existing_files:
                msg('skipping...file already exists')
                continue

            dataset_version = get_latest_dataset_version(ds_id)

            if dataset_version is None:
                msg("Could not find dataset_version!")
                no_versions_found_list.append(ds_id)
                continue

            dataset_as_json = DatasetSerializer(dataset_version).as_json()

            open(full_fname, 'w').write(json.dumps(dataset_as_json, indent=4))
            msg('File written: %s' % full_fname)

            if cnt % 500 == 0:
                self.show_elapsed_time(start_time)
            #if cnt > 10:
            #    self.show_elapsed_time(start_time)
            #    break

        self.show_elapsed_time(start_time)
        print 'no_versions_found_list: %s' % no_versions_found_list
 def get_is_published_filter_param(self, dvobject_var_name='dvobject'):
     """Return Django filter kwargs that select published dvobjects.

     A dvobject with a publication date is considered published.
     Delegates to query_helper.get_is_published_filter_param, passing
     dvobject_var_name through as the queryset variable prefix.
     """
     return query_helper.get_is_published_filter_param(dvobject_var_name)
Beispiel #10
0
def view_file_list_by_type(request):
    """Give a list of published files based on the content type.

    GET params:
      contenttype -- e.g. ?contenttype=text/tab-separated-values
                     (default: CONTENT_TYPE_TABULAR)
      as_excel -- if present, return an .xlsx download instead of JSON
      pretty -- if present (and as_excel absent), return indented JSON
                wrapped in <pre> tags
    """
    is_pretty = False
    as_excel = False
    if request.GET.get('as_excel') is not None:
        as_excel = True
    elif request.GET.get('pretty') is not None:
        is_pretty = True

    contenttype = request.GET.get('contenttype', CONTENT_TYPE_TABULAR)

    # Non-restricted, fully-ingested files with a checksum and content
    query_params = dict(contenttype=contenttype,
                        filesize__gt=0,
                        checksumvalue__isnull=False,
                        restricted=False,
                        ingeststatus=INGEST_STATUS_NONE)

    query_params.update(query_helper.get_is_published_filter_param())

    dfiles = Datafile.objects.select_related('dvobject', 'dvobject__owner'
                    ).filter(**query_params
                    ).order_by('contenttype')

    # Fix: loop variable renamed from 'df' -- it shadowed the name used
    # for the pandas DataFrame created in the as_excel branch below
    flist = [Datafile.to_json(dfile)
             for dfile in dfiles
             if dfile is not None and dfile.id]

    d = OrderedDict()
    d['file_list'] = flist

    if as_excel:
        # https://stackoverflow.com/questions/35267585/django-pandas-to-http-response-download-file
        if len(flist) == 0:
            return HttpResponse('no published files for contenttype: %s' % contenttype)

        df = pd.DataFrame(flist, columns=flist[0].keys())
        excel_file = StringIO.StringIO()
        xlwriter = pd.ExcelWriter(excel_file, engine='xlsxwriter')
        df.to_excel(xlwriter, 'file_list', index=False)
        xlwriter.save()
        xlwriter.close()
        excel_file.seek(0)
        # set the mime type so that the browser knows what to do with the file
        response = HttpResponse(
            excel_file.read(),
            content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        # set the file name in the Content-Disposition header
        response['Content-Disposition'] = 'attachment; filename=dv_file_list.xlsx'

        return response

    d['description'] = 'Files with content type %s. Count: %d' % (contenttype, len(flist))

    if is_pretty:
        json_str = '<pre>%s</pre>' % json.dumps(d, indent=4)
        return HttpResponse(json_str)
    else:
        json_str = json.dumps(d)
        return HttpResponse(json_str,
                            content_type="application/json")
Beispiel #11
0
    def make_json_files(self):

        # Set publication status
        #
        filters = {}
        if self.published_only:
            filters.update(query_helper.get_is_published_filter_param())

        # Query for dataset ids
        #
        ds_id_query = Dataset.objects.filter(**filters\
                            ).annotate(ds_id=F('dvobject__id')\
                            ).values_list('ds_id', flat=True\
                            ).order_by('ds_id')

        # Iterate through dataset ids
        #
        #start_time = datetime.now()
        start_time = int(time.time()) # epoch seconds

        cnt = 0
        no_versions_found_list = [45900]

        for ds_id in ds_id_query:
            cnt += 1
            msgt('(%d) Checking dataset id %s' % (cnt, ds_id))
            if ds_id < self.dataset_start_id:
                msg('skipping...(start at dataset id: %d)' % self.dataset_start_id)
                continue

            # Create file name
            #
            fname = 'ds_%s.json' % (str(ds_id).zfill(8))
            full_fname = join(OUTPUT_DIR, fname)

            # Should we overwrite the existing file?
            #
            if isfile(full_fname) and not self.overwrite_existing_files:
                msg('skipping...file already exists')
                continue

            dataset_version = get_latest_dataset_version(ds_id)

            if dataset_version is None:
                msg("Could not find dataset_version!")
                no_versions_found_list.append(ds_id)
                continue

            dataset_as_json = DatasetSerializer(dataset_version).as_json()

            open(full_fname, 'w').write(json.dumps(dataset_as_json, indent=4))
            msg('File written: %s' % full_fname)

            if cnt % 500 == 0:
                self.show_elapsed_time(start_time)
            #if cnt > 10:
            #    self.show_elapsed_time(start_time)
            #    break

        self.show_elapsed_time(start_time)
        print 'no_versions_found_list: %s' % no_versions_found_list