Example #1
0
    def get(self, request, document_type_code: str, *_args, **_kwargs):
        """
        Query documents of one document type and return them as JSON, CSV or XLSX.

        The request is driven entirely by ``request.GET``:

        * ``project_ids``, ``saved_filters`` - integer lists passed to the query;
        * ``columns`` - column codes to return; when ``associated_text`` is true, the
          annotation columns belonging to the requested columns are appended;
        * ``fmt`` - output format, ``self.FMT_JSON`` when absent;
        * ``offset`` / ``limit`` - paging; a negative offset or a non-positive limit is dropped;
        * ``order_by`` - parsed with ``parse_order_by``;
        * every parameter starting with ``self.URL_PARAM_PREFIX_FILTER`` - a column filter
          keyed by the rest of the parameter name;
        * ``save_filter`` - when true and projects are given, the current columns, filters and
          ordering are stored as the user's grid config for each project;
        * ``return_reviewed``, ``return_total``, ``return_data``, ``ignore_errors`` - passed
          through to ``query_documents``.

        :param request: incoming request; ``GET`` and ``user`` are read from it
        :param document_type_code: ``DocumentType.code`` of the type to query
        :return: the response; an ``APIRequestError`` is rendered with ``to_response()`` and any
                 other exception is wrapped into an ``APIRequestError`` with status 500
        """
        start = time.time()
        try:
            document_type = DocumentType.objects.get(code=document_type_code)

            project_ids = as_int_list(request.GET, 'project_ids')  # type: List[int]

            columns = as_str_list(request.GET, 'columns')

            include_annotations = as_bool(request.GET, 'associated_text')
            if include_annotations and columns:
                # Append the annotation column of every requested field.
                # The suffix has to be removed as a whole: str.rstrip() treats its argument as a
                # set of characters and would also eat trailing characters of the field code itself.
                suffix = FIELD_CODE_ANNOTATION_SUFFIX
                requested_columns = set(columns)
                annotation_codes = []
                for annotation_column in get_annotation_columns(document_type):
                    code = annotation_column.field_code
                    base_code = code[:-len(suffix)] if suffix and code.endswith(suffix) else code
                    if base_code in requested_columns:
                        annotation_codes.append(code)
                columns += annotation_codes

            fmt = request.GET.get('fmt') or self.FMT_JSON

            offset = as_int(request.GET, 'offset', None)
            if offset is not None and offset < 0:
                offset = None

            limit = as_int(request.GET, 'limit', None)
            if limit is not None and limit <= 0:
                limit = None

            # For JSON output we limit the number of returned documents because we don't use a streaming
            # response for JSON and want to keep it fast.
            if fmt == self.FMT_JSON and self.MAX_RETURNED_DOCUMENTS_JSON is not None \
                    and (limit is None or limit > self.MAX_RETURNED_DOCUMENTS_JSON):
                limit = self.MAX_RETURNED_DOCUMENTS_JSON

            saved_filters = as_int_list(request.GET, 'saved_filters')  # type: List[int]

            # Column filters arrive as individual GET parameters marked with a common prefix.
            column_filters = list()
            for param, value in request.GET.items():  # type: str, str
                if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                    column_filters.append((param[len(self.URL_PARAM_PREFIX_FILTER):], value))

            order_by = request.GET.get('order_by') or None  # type: str
            order_by = parse_order_by(order_by) if order_by else None

            save_filter = as_bool(request.GET, 'save_filter', False)  # type: bool

            return_reviewed = as_bool(request.GET, 'return_reviewed', False)
            return_total = as_bool(request.GET, 'return_total', True)
            return_data = as_bool(request.GET, 'return_data', True)
            ignore_errors = as_bool(request.GET, 'ignore_errors', True)

            if project_ids and save_filter:
                column_filters_dict = {c: f for c, f in column_filters}
                for project_id in project_ids:
                    # Create the new grid config, then delete the user's other configs of this kind
                    # for the same document type and project - both steps in one transaction.
                    with transaction.atomic():
                        obj = SavedFilter.objects.create(user=request.user,
                                                         document_type=document_type,
                                                         filter_type=FT_USER_DOC_GRID_CONFIG,
                                                         project_id=project_id,
                                                         columns=columns,
                                                         column_filters=column_filters_dict,
                                                         title=None,
                                                         order_by=[(column, direction.value)
                                                                   for
                                                                   column, direction in
                                                                   order_by] if order_by
                                                         else None
                                                         )
                        SavedFilter.objects.filter(user=request.user,
                                                   document_type=document_type,
                                                   filter_type=FT_USER_DOC_GRID_CONFIG,
                                                   project_id=project_id) \
                            .exclude(pk=obj.pk) \
                            .delete()

            query_results = query_documents(requester=request.user,
                                            document_type=document_type,
                                            project_ids=project_ids,
                                            column_names=columns,
                                            saved_filter_ids=saved_filters,
                                            column_filters=column_filters,
                                            order_by=order_by,
                                            offset=offset,
                                            limit=limit,
                                            return_documents=return_data,
                                            return_reviewed_count=return_reviewed,
                                            return_total_count=return_total,
                                            ignore_errors=ignore_errors,
                                            include_annotation_fields=True)  # type: DocumentQueryResults

            # An export without document rows makes no sense - reject it explicitly.
            if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
                raise APIRequestError('Export to csv/xlsx requested with return_data=false')

            if fmt == self.FMT_CSV:
                return _query_results_to_csv(query_results)
            elif fmt == self.FMT_XLSX:
                return _query_results_to_xlsx(query_results)
            else:
                # No results object: answer with the elapsed time only.
                if query_results is None:
                    return Response({'time': time.time() - start})
                return _query_results_to_json(query_results, time.time() - start)
        except APIRequestError as e:
            return e.to_response()
        except Exception as e:
            return APIRequestError(message='Unable to process request', caused_by=e, http_status_code=500).to_response()
    def get(self, request, document_type_code: str, *_args, **_kwargs):
        """
        Query documents of one document type and return them as JSON or as a CSV download.

        The request is driven entirely by ``request.GET``:

        * ``project_ids``, ``saved_filters`` - integer lists passed to the query;
        * ``columns`` - column codes to return;
        * ``fmt`` - output format, ``self.FMT_JSON`` when absent; ``csv`` (any letter case)
          produces a streamed CSV attachment;
        * ``offset`` / ``limit`` - paging; a negative offset or a non-positive limit is dropped;
        * ``order_by`` - parsed with ``parse_order_by``;
        * every parameter starting with ``self.URL_PARAM_PREFIX_FILTER`` - a column filter
          keyed by the rest of the parameter name;
        * ``save_filter`` - when true and projects are given, the current columns, filters and
          ordering are stored as the user's grid config for each project;
        * ``return_reviewed``, ``return_total``, ``return_data``, ``ignore_errors`` - passed
          through to ``get_documents``.

        :param request: incoming request; ``GET`` and ``user`` are read from it
        :param document_type_code: ``DocumentType.code`` of the type to query
        :return: the response; an ``APIRequestError`` is rendered with ``to_response()`` and any
                 other exception is wrapped into an ``APIRequestError`` with status 500
        """
        start = time.time()
        try:
            document_type = DocumentType.objects.get(code=document_type_code)

            project_ids = as_int_list(request.GET,
                                      'project_ids')  # type: List[int]

            columns = as_str_list(request.GET, 'columns')

            fmt = request.GET.get('fmt') or self.FMT_JSON

            offset = as_int(request.GET, 'offset', None)
            if offset is not None and offset < 0:
                offset = None

            limit = as_int(request.GET, 'limit', None)
            if limit is not None and limit <= 0:
                limit = None

            # For JSON output we limit the number of returned documents because we don't use a streaming
            # response for JSON and want to keep it fast.
            # A cap of None means "no cap"; without this guard ``limit > None`` raises TypeError.
            max_json_documents = self.MAX_RETURNED_DOCUMENTS_JSON
            if fmt == self.FMT_JSON and max_json_documents is not None \
                    and (limit is None or limit > max_json_documents):
                limit = max_json_documents

            saved_filters = as_int_list(request.GET,
                                        'saved_filters')  # type: List[int]

            # Column filters arrive as individual GET parameters marked with a common prefix.
            column_filters = list()
            for param, value in request.GET.items():  # type: str, str
                if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                    column_filters.append(
                        (param[len(self.URL_PARAM_PREFIX_FILTER):], value))

            order_by = request.GET.get('order_by') or None  # type: str
            order_by = parse_order_by(order_by) if order_by else None

            save_filter = as_bool(request.GET, 'save_filter',
                                  False)  # type: bool

            return_reviewed = as_bool(request.GET, 'return_reviewed', False)
            return_total = as_bool(request.GET, 'return_total', True)
            return_data = as_bool(request.GET, 'return_data', True)
            ignore_errors = as_bool(request.GET, 'ignore_errors', True)

            if project_ids and save_filter:
                column_filters_dict = dict(column_filters)
                saved_order_by = [(column, direction.value)
                                  for column, direction in order_by] if order_by else None
                for project_id in project_ids:
                    # The keyword arguments identify the grid config and are also used when a new
                    # row is created, so ``defaults`` only needs the values that actually change.
                    SavedFilter.objects.update_or_create(
                        user=request.user,
                        document_type=document_type,
                        filter_type=FT_USER_DOC_GRID_CONFIG,
                        project_id=project_id,
                        defaults={
                            'columns': columns,
                            'column_filters': column_filters_dict,
                            'title': None,
                            'order_by': saved_order_by,
                        })
            query_results = get_documents(
                requester=request.user,
                document_type=document_type,
                project_ids=project_ids,
                column_names=columns,
                saved_filter_ids=saved_filters,
                column_filters=column_filters,
                order_by=order_by,
                offset=offset,
                limit=limit,
                return_documents=return_data,
                return_reviewed_count=return_reviewed,
                return_total_count=return_total,
                ignore_errors=ignore_errors)  # type: DocumentQueryResults

            if fmt.lower() == 'csv':
                if not return_data:
                    raise APIRequestError(
                        'Export to csv requested with return_data=false')
                else:
                    # NOTE(review): the JSON branch below allows query_results to be None; if that can
                    # happen here too, the attribute access raises and the request ends as a 500.
                    resp = StreamingHttpResponse(csv_gen(
                        query_results.column_codes, query_results.fetch(),
                        query_results.column_titles),
                                                 content_type='text/csv')
                    resp[
                        'Content-Disposition'] = 'attachment; filename="export.csv"'
                    return resp
            else:
                # No results object: answer with the elapsed time only.
                if query_results is None:
                    return Response({'time': time.time() - start})

                # As we limit the number of returned documents for JSON we can keep response in non-streaming form.
                return Response(
                    _query_results_to_json(query_results,
                                           time.time() - start))

                # Switch to StreamingHttpResponse if/when we really need to return very big json output.
                # _query_results_to_json() returns dict with document items backed with a generator.
                # But on local tests for small number of documents the streaming json output works two times
                # slower than non-streaming response. CSV works the same fast.
                # return StreamingHttpResponse(json_gen(_query_results_to_json(query_results, time.time() - start)),
                #       content_type='application/json')
        except APIRequestError as e:
            return e.to_response()
        except Exception as e:
            return APIRequestError(message='Unable to process request',
                                   caused_by=e,
                                   http_status_code=500).to_response()
Example #3
0
    def get(self, request, document_type_code: str, *_args, **_kwargs):
        """
        Query documents of one document type and return them as JSON, CSV or XLSX.

        Besides the requested page of documents, the JSON/export result object is enriched with
        per-assignee statistics (``assignees``) and the number of documents of this type in the
        selected projects before any filtering (``unfiltered_count``).

        The request is driven entirely by ``request.GET``:

        * ``project_ids``, ``saved_filters`` - integer lists passed to the query;
        * ``columns`` - column codes to return; when ``associated_text`` is true, the
          annotation columns belonging to the requested columns are appended;
        * ``fmt`` - output format, ``self.FMT_JSON`` when absent; ``as_zip=true`` is passed to
          the CSV/XLSX exporters;
        * ``offset`` / ``limit`` - paging; a negative offset or a non-positive limit is dropped;
        * ``order_by`` - parsed with ``parse_order_by``;
        * every parameter starting with ``self.URL_PARAM_PREFIX_FILTER`` - a column filter
          keyed by the rest of the parameter name; the same kind of entries may also be packed
          into a single ``filters`` parameter holding a dict literal;
        * ``save_filter`` - when true and projects are given, the current columns, filters and
          ordering are stored as the user's grid config for each project;
        * ``return_reviewed``, ``return_total``, ``return_data``, ``ignore_errors`` - passed
          through to ``query_documents``.

        :param request: incoming request; ``GET`` and ``user`` are read from it
        :param document_type_code: ``DocumentType.code`` of the type to query
        :return: the response; an ``APIRequestError`` is rendered with ``to_response()`` and any
                 other exception is wrapped into an ``APIRequestError`` with status 500
        """
        start = time.time()
        try:
            document_type = DocumentType.objects.get(code=document_type_code)

            project_ids = as_int_list(request.GET,
                                      'project_ids')  # type: List[int]

            columns = as_str_list(request.GET, 'columns')

            include_annotations = as_bool(request.GET, 'associated_text')
            if include_annotations and columns:
                # Append the annotation column of every requested field.
                # The suffix has to be removed as a whole: str.rstrip() treats its argument as a
                # set of characters and would also eat trailing characters of the field code itself.
                suffix = FIELD_CODE_ANNOTATION_SUFFIX
                requested_columns = set(columns)
                annotation_codes = []
                for annotation_column in get_annotation_columns(document_type):
                    code = annotation_column.field_code
                    base_code = code[:-len(suffix)] if suffix and code.endswith(suffix) else code
                    if base_code in requested_columns:
                        annotation_codes.append(code)
                columns += annotation_codes

            fmt = request.GET.get('fmt') or self.FMT_JSON
            as_zip = request.GET.get('as_zip') == 'true'

            offset = as_int(request.GET, 'offset', None)
            if offset is not None and offset < 0:
                offset = None

            limit = as_int(request.GET, 'limit', None)
            if limit is not None and limit <= 0:
                limit = None

            # For JSON output we limit the number of returned documents because we don't use a streaming
            # response for JSON and want to keep it fast.
            if fmt == self.FMT_JSON and self.MAX_RETURNED_DOCUMENTS_JSON is not None \
                    and (limit is None or limit > self.MAX_RETURNED_DOCUMENTS_JSON):
                limit = self.MAX_RETURNED_DOCUMENTS_JSON

            saved_filters = as_int_list(request.GET,
                                        'saved_filters')  # type: List[int]

            # Column filters arrive as individual GET parameters marked with a common prefix.
            column_filters = list()
            for param, value in request.GET.items():  # type: str, str
                if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                    column_filters.append(
                        (param[len(self.URL_PARAM_PREFIX_FILTER):], value))

            # Filters may also be packed into one "filters" parameter. Its value is parsed as a
            # Python literal and must evaluate to a dict whose keys carry the same prefix.
            # NOTE(review): this is client-supplied text; a malformed or non-dict value raises
            # here and is reported by the generic handler below as a 500.
            filter_query_string = request.GET.get('filters')
            if filter_query_string:
                for param, value in ast.literal_eval(
                        filter_query_string).items():  # type: str, str
                    if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                        column_filters.append(
                            (param[len(self.URL_PARAM_PREFIX_FILTER):], value))

            order_by = request.GET.get('order_by') or None  # type: str
            order_by = parse_order_by(order_by) if order_by else None

            save_filter = as_bool(request.GET, 'save_filter',
                                  False)  # type: bool

            return_reviewed = as_bool(request.GET, 'return_reviewed', False)
            return_total = as_bool(request.GET, 'return_total', True)
            return_data = as_bool(request.GET, 'return_data', True)
            ignore_errors = as_bool(request.GET, 'ignore_errors', True)

            if project_ids and save_filter:
                column_filters_dict = {c: f for c, f in column_filters}
                for project_id in project_ids:
                    # Create the new grid config, then delete the user's other grid configs for
                    # this project (of any document type) - both steps in one transaction.
                    with transaction.atomic():
                        obj = SavedFilter.objects.create(
                            user=request.user,
                            document_type=document_type,
                            filter_type=FT_USER_DOC_GRID_CONFIG,
                            project_id=project_id,
                            columns=columns,
                            column_filters=column_filters_dict,
                            title=None,
                            order_by=[(column, direction.value)
                                      for column, direction in order_by]
                            if order_by else None)
                        SavedFilter.objects.filter(user=request.user,
                                                   filter_type=FT_USER_DOC_GRID_CONFIG,
                                                   project_id=project_id) \
                            .exclude(pk=obj.pk) \
                            .delete()

            # show_unprocessed = as_bool(request.GET, 'show_unprocessed', False)
            # if show_unprocessed is False:
            #     column_filters.append((FIELD_CODE_DOC_PROCESSED, 'true'))

            # Number of documents of this type (in the selected projects, if any) before filtering.
            total_documents_query = Document.objects.filter(
                document_type=document_type)
            if project_ids:
                total_documents_query = total_documents_query.filter(
                    project_id__in=project_ids)
            total_documents_of_type = total_documents_query.count()

            # When specific columns are requested, always query the document id and name as well.
            columns_to_query = columns
            if columns_to_query:
                columns_to_query = leave_unique_values(
                    ['document_id', 'document_name'] + columns)

            query_results = query_documents(
                requester=request.user,
                document_type=document_type,
                project_ids=project_ids,
                column_names=columns_to_query,  # columns,
                saved_filter_ids=saved_filters,
                column_filters=column_filters,
                order_by=order_by,
                offset=offset,
                limit=limit,
                return_documents=return_data,
                return_reviewed_count=return_reviewed,
                return_total_count=return_total,
                ignore_errors=ignore_errors,
                include_annotation_fields=True)  # type: DocumentQueryResults

            if query_results is None:
                # NOTE(review): the error is raised only when return_data is false; an export
                # request with return_data=true falls through to the JSON timing response.
                if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
                    raise APIRequestError('Empty data, nothing to export')
                return Response({'time': time.time() - start})

            # get assignees stats: a second query with the same filters but only the assignee
            # columns; no offset/limit/ordering is passed to it.
            assignees_query_results = query_documents(
                requester=request.user,
                document_type=document_type,
                project_ids=project_ids,
                column_names=['document_id', 'assignee_name', 'assignee_id'],
                saved_filter_ids=saved_filters,
                column_filters=column_filters,
                return_documents=True,
                return_reviewed_count=False,
                include_annotation_fields=include_annotations
            )  # type: DocumentQueryResults

            query_results.assignees = []
            if assignees_query_results is not None:
                df = pd.DataFrame(assignees_query_results.fetch_dicts())
                if not df.empty:
                    # One row per assignee: the list of their document ids and how many there are.
                    df = df.groupby(['assignee_id', 'assignee_name'])\
                        .agg({'document_id': [('document_ids', lambda x: list(x)), ('documents_count', 'count')]})
                    if not df.empty:
                        # Drop the outer 'document_id' column level, keeping the two named aggregates.
                        df.columns = df.columns.droplevel()
                        df = df.reset_index()
                        df['assignee_id'] = df['assignee_id'].astype(int)
                        query_results.assignees = df.to_dict('records')

            query_results.unfiltered_count = total_documents_of_type

            # An export without document rows makes no sense - reject it explicitly.
            if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
                raise APIRequestError(
                    'Export to csv/xlsx requested with return_data=false')

            if fmt == self.FMT_CSV:
                return query_results.to_csv(as_zip=as_zip)
            elif fmt == self.FMT_XLSX:
                return query_results.to_xlsx(as_zip=as_zip)
            else:
                query_dict = query_results.to_json(time_start=start)
                if columns and 'items' in query_dict:
                    # 'document_id' was added to the query above; if the client did not request
                    # it, hand it to expand_items (presumably to be stripped - confirm there).
                    columns_to_remove = []
                    if 'document_id' not in columns:
                        columns_to_remove.append('document_id')
                    query_dict['items'] = self.expand_items(
                        query_dict['items'], columns_to_remove)
                return Response(query_dict)
        except APIRequestError as e:
            return e.to_response()
        except Exception as e:
            return APIRequestError(message='Unable to process request',
                                   caused_by=e,
                                   http_status_code=500).to_response()