def test_query_by_domain(self):
    """Check that ``FormES().domain()`` only counts forms shipped for that domain."""
    first_domain = 'test1-{}'.format(self.test_id)
    second_domain = 'test2-{}'.format(self.test_id)

    # Two forms go to the first domain, one to the second.
    forms = 2 * [TestFormMetadata(domain=first_domain)]
    forms += 1 * [TestFormMetadata(domain=second_domain)]
    self._ship_forms_to_es(forms)

    first_total = FormES().domain(first_domain).run().total
    self.assertEqual(2, first_total)
    second_total = FormES().domain(second_domain).run().total
    self.assertEqual(1, second_total)
def get_all_user_ids_submitted(domain, app_ids=None):
    """Return the ``form.meta.userID`` values that appear on forms in ``domain``.

    The ids are the bucket keys of a terms aggregation on ``form.meta.userID``;
    no hits are fetched (``size(0)``).

    :param domain: domain whose forms are aggregated
    :param app_ids: optional value passed to the query's ``app`` filter; the
        filter is only applied when this is truthy
    :return: list of the aggregation's bucket keys
    """
    query = (
        FormES()
        .domain(domain)
        .aggregation(TermsAggregation("user_id", "form.meta.userID"))
        .size(0)
    )
    if app_ids:
        query = query.app(app_ids)
    # Materialise the keys so callers get a real list rather than a dict view.
    return list(query.run().aggregations.user_id.buckets_dict.keys())
def get_last_submission_time_for_user(domain, user_id, datespan):
    """Return the date of the latest ``form.meta.timeEnd`` for a user within ``datespan``.

    Only the first hit of a query sorted by ``form.meta.timeEnd`` descending is
    inspected. Returns ``None`` when there is no hit or its ``timeEnd`` is falsy.
    """
    query = FormES().domain(domain).user_id([user_id])
    query = query.completed(
        gte=datespan.startdate.date(),
        lte=datespan.enddate.date(),
    )
    query = query.sort("form.meta.timeEnd", desc=True).size(1)

    hits = query.run().hits
    if not hits:
        return None
    time_end = hits[0]['form']['meta']['timeEnd']
    if not time_end:
        return None
    return string_to_datetime(time_end).date()
def test_form_soft_deletion(self):
    """Check that a soft-deleted form is no longer returned by ``FormES``."""
    form, _metadata = self._create_form_and_sync_to_es()

    # The form is in the index to begin with.
    before = FormES().run()
    self.assertEqual(1, before.total)

    # Soft delete it inside the form-change processing context, then refresh
    # the index so the next query sees the change.
    with self.process_form_changes:
        FormAccessors(self.domain).soft_delete_forms([form.form_id])
    self.elasticsearch.indices.refresh(XFORM_INDEX_INFO.index)

    # The form is gone from the default query.
    after = FormES().run()
    self.assertEqual(0, after.total)
def _get_form_counts_by_user(domain, datespan, is_submission_time):
    """Return the user aggregation's ``counts_by_bucket()`` for forms in ``domain``.

    :param domain: domain whose forms are counted
    :param datespan: object with ``startdate`` / ``enddate`` datetimes; their
        ``.date()`` values bound the range (``gte`` / ``lte``)
    :param is_submission_time: when truthy the range is applied with
        ``submitted()``, otherwise with ``completed()``
    :return: result of ``aggregations.user.counts_by_bucket()``
    """
    date_range = {
        'gte': datespan.startdate.date(),
        'lte': datespan.enddate.date(),
    }
    form_query = FormES().domain(domain)
    if is_submission_time:
        form_query = form_query.submitted(**date_range)
    else:
        form_query = form_query.completed(**date_range)
    # Only the aggregation is read, so request no hits at all.
    form_query = form_query.user_aggregation().size(0)
    return form_query.run().aggregations.user.counts_by_bucket()
def get_users_with_forms(domain, user_ids):
    """Return the subset of ``user_ids`` (as a set) with a truthy form count in ``domain``.

    One ``count()`` query is issued per user id.
    """
    return {
        user_id
        for user_id in user_ids
        if FormES().domain(domain).user_id(user_id).count()
    }
def _get_form_counts_by_date(domain, user_ids, datespan, timezone, is_submission_time):
    """Return ``{iso_date: doc_count}`` from a date histogram over the users' forms.

    Forms matching any xmlns in ``SYSTEM_FORM_XMLNS_MAP`` are excluded. The
    range and the histogram use submission time when ``is_submission_time`` is
    truthy, completion time otherwise.
    """
    query = FormES().domain(domain).user_id(user_ids)
    for system_xmlns in SYSTEM_FORM_XMLNS_MAP.keys():
        query = query.filter(filters.NOT(xmlns_filter(system_xmlns)))

    date_range = {
        'gte': datespan.startdate.date(),
        'lte': datespan.enddate.date(),
    }
    if is_submission_time:
        query = query.submitted(**date_range).submitted_histogram(timezone.zone)
    else:
        query = query.completed(**date_range).completed_histogram(timezone.zone)

    buckets = query.size(0).run().aggregations.date_histogram.buckets_list

    # Convert timestamp from millis -> seconds -> aware datetime.
    # ES bucket key is an epoch timestamp relative to the timezone specified,
    # so pass timezone into fromtimestamp() to create an accurate datetime,
    # otherwise will be treated as UTC.
    counts = {}
    for bucket in buckets:
        day = datetime.fromtimestamp(bucket.key // 1000, timezone).date()
        counts[day.isoformat()] = bucket.doc_count
    return counts
def test_form_soft_deletion(self):
    """Check that a soft-deleted form is no longer returned by ``FormES``."""
    form, _metadata = self._create_form_and_sync_to_es()

    # The form is in the index to begin with.
    before = FormES().run()
    self.assertEqual(1, before.total)

    # Soft delete it with both pillow contexts active (xform pillow outermost),
    # then refresh the index so the next query sees the change.
    with process_pillow_changes('xform-pillow', {'skip_ucr': True}), \
            process_pillow_changes('DefaultChangeFeedPillow'):
        FormAccessors(self.domain).soft_delete_forms([form.form_id])
    self.elasticsearch.indices.refresh(XFORM_INDEX_INFO.index)

    # The form is gone from the default query.
    after = FormES().run()
    self.assertEqual(0, after.total)
def resave_es_forms_with_unknown_user_type(user_id):
    """Resave each form FormES returns for ``user_id`` with ``UNKNOWN_USER_TYPE``.

    The query is scrolled for ``(domain, _id)`` pairs; each form is loaded via
    ``FormAccessors`` and handed to ``resave_form``.
    """
    unknown_type_forms = FormES().user_type(UNKNOWN_USER_TYPE).user_id(user_id)
    id_pairs = unknown_type_forms.values_list('domain', '_id', scroll=True)
    for domain, form_id in id_pairs:
        resave_form(domain, FormAccessors(domain).get_form(form_id))
def _received_on_query(domain, desc=False):
    """Build a FormES query for ``domain`` limited to and sorted by ``received_on``.

    ``desc`` is forwarded to ``sort`` to choose the sort direction.
    """
    query = FormES().fields(['received_on'])
    query = query.domain(domain)
    return query.sort('received_on', desc=desc)
def get_last_forms_by_app(user_id):
    """
    gets the last form submission for each app for a given user id
    :param user_id: id of a couch user
    :return: last form submission for every app that user has submitted
    """
    # Top hit per app bucket, ordered by received_on descending.
    last_form_per_app = TopHitsAggregation(
        'top_hits_last_form_submissions',
        'received_on',
        is_ascending=False,
    )
    per_app = TermsAggregation('app_id', 'app_id').aggregation(last_form_per_app)
    query = FormES().user_id(user_id).aggregation(per_app).size(0)

    app_buckets = query.run().aggregations.app_id.buckets_dict
    return [
        bucket.top_hits_last_form_submissions.hits[0]
        for _app_id, bucket in app_buckets.items()
    ]
def _get_es_modified_dates_for_forms(form_ids):
    """Map each form id found by FormES to ``(received_on datetime, doc_type, domain)``.

    The query is built with ``for_export=True`` and with default filters removed.
    """
    query = FormES(for_export=True).remove_default_filters().form_ids(form_ids)
    rows = query.values_list('_id', 'received_on', 'doc_type', 'domain')

    modified_dates = {}
    for form_id, received_on, doc_type, domain in rows:
        modified_dates[form_id] = (
            iso_string_to_datetime(received_on),
            doc_type,
            domain,
        )
    return modified_dates
def get_paged_forms_by_type(domain, doc_types, start=0, size=10):
    """Return one page of forms in ``domain`` filtered by lower-cased ``doc_types``.

    The ``is_xform_instance`` and ``has_user`` default filters are removed and
    results are sorted by ``received_on`` descending. ``start`` and ``size``
    select the page. Returns a ``PagedResult`` carrying the query's ``total``
    and ``hits``.
    """
    query = FormES().domain(domain)
    for default_filter in ('is_xform_instance', 'has_user'):
        query = query.remove_default_filter(default_filter)
    lowered_types = [doc_type.lower() for doc_type in doc_types]
    query = query.doc_type(lowered_types).sort("received_on", desc=True)

    page = query.start(start).size(size).run()
    return PagedResult(total=page.total, hits=page.hits)
def get_form_counts_for_domains(domains):
    """Return the domain aggregation's ``counts_by_bucket()`` for forms in ``domains``.

    Forms are filtered with a term filter on ``domain``; no hits are fetched.
    """
    query = FormES().filter(filters.term('domain', domains))
    query = query.domain_aggregation().size(0)
    aggregations = query.run().aggregations
    return aggregations.domain.counts_by_bucket()
def dehydrate(self, bundle):
    """Attach form-count extras to the bundle when requested, then defer to the parent.

    When ``_safe_bool(bundle, 'extras')`` is truthy, ``bundle.data['extras']``
    is set to a dict of totals for this user's forms in the request's domain:

    * ``submitted_last_30`` / ``completed_last_30``: the 30 days up to now
    * ``submitted_last_month`` / ``completed_last_month``: the previous
      calendar month, from its first day up to but excluding the first day
      of the current month

    :param bundle: resource bundle; ``bundle.request.domain`` and
        ``bundle.obj._id`` are read
    :return: whatever ``UserResource``'s parent ``dehydrate`` returns
    """
    if _safe_bool(bundle, 'extras'):
        now = datetime.datetime.utcnow()
        thirty_days_ago = now - datetime.timedelta(days=30)
        first_of_this_month = datetime.datetime(now.year, now.month, 1)
        # Step back one day into the previous month, then snap to its first day.
        first_of_last_month = (
            first_of_this_month - datetime.timedelta(days=1)
        ).replace(day=1)

        form_es_base = (
            FormES()
            .domain(bundle.request.domain)
            .user_id([bundle.obj._id])
        )

        def total(query):
            # Only the total is needed, so fetch no hits.
            return query.size(0).run().total

        bundle.data['extras'] = {
            'submitted_last_30': total(
                form_es_base.submitted(gte=thirty_days_ago, lte=now)),
            'completed_last_30': total(
                form_es_base.completed(gte=thirty_days_ago, lte=now)),
            # lt rather than lte: a form stamped exactly at midnight on the
            # first of this month does not belong to last month.
            'submitted_last_month': total(
                form_es_base.submitted(gte=first_of_last_month, lt=first_of_this_month)),
            'completed_last_month': total(
                form_es_base.completed(gte=first_of_last_month, lt=first_of_this_month)),
        }
    return super(UserResource, self).dehydrate(bundle)
def get_form_duration_stats_for_users(domain, app_id, xmlns, user_ids, startdate,
                                      enddate, by_submission_time=True):
    """Gets the form duration stats for a group of users

    Returns the ``result`` of an extended-stats aggregation whose script
    subtracts ``form.meta.timeStart`` from ``form.meta.timeEnd``. The date
    range is ``gte=startdate`` and ``lt=enddate``, applied to submission time
    when ``by_submission_time`` is truthy and to completion time otherwise.
    The ``app`` filter is added only when ``app_id`` is truthy.
    """
    if by_submission_time:
        date_filter_fn = submitted_filter
    else:
        date_filter_fn = completed_filter

    query = FormES().domain(domain).user_ids_handle_unknown(user_ids)
    query = query.remove_default_filter('has_user').xmlns(xmlns)
    query = query.filter(date_filter_fn(gte=startdate, lt=enddate))

    duration_stats = ExtendedStatsAggregation(
        'duration_stats',
        'form.meta.timeStart',
        script="doc['form.meta.timeEnd'].value - doc['form.meta.timeStart'].value",
    )
    query = query.aggregation(duration_stats).size(0)

    if app_id:
        query = query.app(app_id)
    return query.run().aggregations.duration_stats.result
def _assert_form_is_in_es(self, form):
    """Assert FormES holds exactly one form and that it matches ``form``.

    The single hit's domain, xmlns and doc_type are compared with
    ``self.domain``, ``form.xmlns`` and ``'XFormInstance'``.
    """
    es_results = FormES().run()
    self.assertEqual(1, es_results.total)

    indexed = es_results.hits[0]
    expected_by_key = (
        ('domain', self.domain),
        ('xmlns', form.xmlns),
        ('doc_type', 'XFormInstance'),
    )
    for key, expected in expected_by_key:
        self.assertEqual(expected, indexed[key])
def _get_form_counts_by_date(domain, user_ids, datespan, timezone, is_submission_time):
    """Return ``{iso_date: doc_count}`` from a date histogram over the users' forms.

    Forms matching any xmlns in ``SYSTEM_FORM_XMLNS_MAP`` are excluded.

    :param domain: domain whose forms are counted
    :param user_ids: value passed to the query's ``user_id`` filter
    :param datespan: object with ``startdate`` / ``enddate`` datetimes; their
        ``.date()`` values bound the range (``gte`` / ``lte``)
    :param timezone: timezone object; ``timezone.zone`` is given to the
        histogram and the object itself is used as the ``tz`` for converting
        bucket keys (presumably a pytz timezone -- confirm with callers)
    :param is_submission_time: when truthy use submission time for the range
        and histogram, otherwise completion time
    :return: dict mapping ISO date strings to bucket document counts
    """
    form_query = FormES().domain(domain).user_id(user_ids)
    for xmlns in SYSTEM_FORM_XMLNS_MAP.keys():
        form_query = form_query.filter(filters.NOT(xmlns_filter(xmlns)))

    date_range = {
        'gte': datespan.startdate.date(),
        'lte': datespan.enddate.date(),
    }
    if is_submission_time:
        form_query = form_query.submitted(**date_range).submitted_histogram(timezone.zone)
    else:
        form_query = form_query.completed(**date_range).completed_histogram(timezone.zone)
    form_query = form_query.size(0)

    buckets = form_query.run().aggregations.date_histogram.buckets_list

    # Bucket keys are epoch milliseconds while fromtimestamp() wants seconds,
    # hence the division by 1000. The histogram was bucketed in `timezone`,
    # so the key must be converted in that same timezone; without `tz` the
    # machine's local zone is used and the calendar date can come out wrong.
    return {
        datetime.fromtimestamp(bucket.key // 1000, timezone).date().isoformat(): bucket.doc_count
        for bucket in buckets
    }
def get_last_submission_time_for_users(domain, user_ids, datespan,
                                       es_instance_alias=ES_DEFAULT_INSTANCE):
    """Map each user id to the date of its latest ``form.meta.timeEnd`` in ``datespan``.

    One top hit per ``form.meta.userID`` bucket is read, ordered by
    ``form.meta.timeEnd`` descending. A falsy ``timeEnd`` maps to ``None``.
    ``es_instance_alias`` is forwarded to the ``FormES`` constructor.
    """
    latest_per_user = TopHitsAggregation(
        'top_hits_last_form_submissions',
        'form.meta.timeEnd',
        is_ascending=False,
        include='form.meta.timeEnd',
    )
    per_user = TermsAggregation('user_id', 'form.meta.userID').aggregation(latest_per_user)

    query = FormES(es_instance_alias=es_instance_alias).domain(domain).user_id(user_ids)
    query = query.completed(
        gte=datespan.startdate.date(),
        lte=datespan.enddate.date(),
    )
    query = query.aggregation(per_user).size(0)

    user_buckets = query.run().aggregations.user_id.buckets_dict
    last_submissions = {}
    for user_id, bucket in user_buckets.items():
        top_hit = bucket.top_hits_last_form_submissions.hits[0]
        time_end = top_hit['form']['meta']['timeEnd']
        last_submissions[user_id] = (
            string_to_datetime(time_end).date() if time_end else None
        )
    return last_submissions
def get_last_form_submission_for_xmlns(domain, xmlns):
    """Return the form with the latest ``received_on`` for ``xmlns`` in ``domain``.

    :param domain: domain to search
    :param xmlns: value passed to the query's ``xmlns`` filter
    :return: the first hit of the query sorted by ``received_on`` descending,
        or ``None`` when there are no hits
    """
    query = (
        FormES()
        .domain(domain)
        .xmlns(xmlns)
        .sort('received_on', desc=True)
        .size(1)
    )
    # Run once and reuse the hits: a second run() costs another round trip
    # and could come back empty, making the [0] lookup fail.
    hits = query.run().hits
    return hits[0] if hits else None
def get_username_in_last_form_user_id_submitted(domain, user_id):
    """Return ``form.meta.username`` from the user's form with the latest ``received_on``.

    Returns ``None`` when the query has no hits or the hit's meta has no
    ``username`` key.
    """
    query = FormES().domain(domain).user_id(user_id)
    query = query.sort('received_on', desc=True)
    query = query.source(['form.meta.username']).size(1)

    hits = query.run().hits
    if not hits:
        return None
    return hits[0]['form']['meta'].get('username', None)
def get_form_ids_having_multimedia(domain, app_id, xmlns, startdate, enddate,
                                   user_types=None, group=None):
    """Return the ids of matching forms that have a non-``text/xml`` attachment.

    :param domain: domain to search
    :param app_id: value passed to the query's ``app`` filter
    :param xmlns: value passed to the query's ``xmlns`` filter
    :param startdate: lower bound (``gte``) for ``submitted``
    :param enddate: upper bound (``lte``) for ``submitted``
    :param user_types: optional user type filter, applied when truthy
    :param group: optional group id; when truthy, forms are restricted to the
        ``users`` listed on that group's GroupES document
    :return: set of form ``_id`` values; empty when ``group`` is given but
        GroupES returns no document for it
    """
    query = (
        FormES()
        .domain(domain)
        .app(app_id)
        .xmlns(xmlns)
        .submitted(gte=startdate, lte=enddate)
        .remove_default_filter("has_user")
        .source(['_id', 'external_blobs'])
    )
    if user_types:
        query = query.user_type(user_types)
    if group:
        results = (
            GroupES().domain(domain).group_ids([group]).source(['users'])
        ).run().hits
        assert len(results) <= 1
        if not results:
            # No such group document: there are no users to match, so no forms.
            return set()
        query = query.user_id(results[0]['users'])

    form_ids = set()
    for form in query.scroll():
        try:
            # any() stops at the first non-XML attachment.
            has_multimedia = any(
                attachment['content_type'] != "text/xml"
                for attachment in _get_attachment_dicts_from_form(form)
            )
        except AttributeError:
            # Best effort: a form whose attachments cannot be read is skipped.
            continue
        if has_multimedia:
            form_ids.add(form['_id'])
    return form_ids
def send_unknown_user_type_stats():
    """Report two gauges about ``UNKNOWN_USER_TYPE``: user count and form count.

    The user count comes from ``_get_unknown_user_type_user_ids_approx_count()``
    and the form count from a FormES ``count()``. Both gauges use ``MPM_MAX``.
    """
    unknown_user_count = _get_unknown_user_type_user_ids_approx_count()
    metrics_gauge(
        'commcare.fix_user_types.unknown_user_count',
        unknown_user_count,
        multiprocess_mode=MPM_MAX,
    )

    unknown_form_count = FormES().user_type(UNKNOWN_USER_TYPE).count()
    metrics_gauge(
        'commcare.fix_user_types.unknown_user_form_count',
        unknown_form_count,
        multiprocess_mode=MPM_MAX,
    )
def get_last_submission_time_for_users(domain, user_ids, datespan, for_export=False):
    """Map each user id to the date of its latest ``received_on`` within ``datespan``.

    One top hit per ``form.meta.userID`` bucket is read, ordered by
    ``received_on`` descending. A falsy ``received_on`` maps to ``None``.
    ``for_export`` is forwarded to the ``FormES`` constructor.
    """
    latest_per_user = TopHitsAggregation(
        'top_hits_last_form_submissions',
        'received_on',
        is_ascending=False,
        include='received_on',
    )
    per_user = TermsAggregation('user_id', 'form.meta.userID').aggregation(latest_per_user)

    query = FormES(for_export=for_export).domain(domain).user_id(user_ids)
    query = query.submitted(
        gte=datespan.startdate.date(),
        lte=datespan.enddate.date(),
    )
    query = query.aggregation(per_user).size(0)

    user_buckets = query.run().aggregations.user_id.buckets_dict
    last_submissions = {}
    for user_id, bucket in user_buckets.items():
        received_on = bucket.top_hits_last_form_submissions.hits[0]['received_on']
        last_submissions[user_id] = (
            string_to_datetime(received_on).date() if received_on else None
        )
    return last_submissions
def resave_es_forms_with_unknown_user_type(user_id):
    """Resave each form FormES returns for ``user_id`` with ``UNKNOWN_USER_TYPE``.

    The query is scrolled for ``(domain, _id)`` pairs; each form is loaded via
    ``XFormInstance.objects.get_form`` and handed to ``resave_form``.
    """
    unknown_type_forms = FormES().user_type(UNKNOWN_USER_TYPE).user_id(user_id)
    id_pairs = unknown_type_forms.values_list('domain', '_id', scroll=True)
    for domain, form_id in id_pairs:
        resave_form(domain, XFormInstance.objects.get_form(form_id, domain))
def get_form_ids_missing_from_elasticsearch(all_form_ids):
    """Return, as a list, the ids in ``all_form_ids`` that FormES does not return.

    Ids are looked up in chunks of 500. Asserts that FormES never returns an
    id that was not asked for.
    """
    missing = set()
    for chunk in chunked(all_form_ids, 500):
        requested = set(chunk)
        found = set(FormES().doc_id(requested).get_ids())
        missing |= requested - found
        assert found - requested == set()
    return list(missing)
def get_all_user_ids_submitted(domain, app_ids=None):
    """Return a list of the ``form.meta.userID`` values on forms in ``domain``.

    The ids are the bucket keys of a terms aggregation; no hits are fetched.
    The ``app`` filter is added only when ``app_ids`` is truthy.
    """
    by_user = TermsAggregation('user_id', 'form.meta.userID')
    query = FormES().domain(domain).aggregation(by_user).size(0)
    if app_ids:
        query = query.app(app_ids)
    user_buckets = query.run().aggregations.user_id.buckets_dict
    return list(user_buckets)
def _get_es_modified_dates_for_forms(form_ids):
    """Map each form id found by FormES to ``(received_on datetime, doc_type, domain)``.

    The query targets ``ES_EXPORT_INSTANCE`` with default filters removed.
    """
    query = FormES(es_instance_alias=ES_EXPORT_INSTANCE).remove_default_filters()
    rows = query.form_ids(form_ids).values_list('_id', 'received_on', 'doc_type', 'domain')

    modified_dates = {}
    for form_id, received_on, doc_type, domain in rows:
        modified_dates[form_id] = (
            iso_string_to_datetime(received_on),
            doc_type,
            domain,
        )
    return modified_dates
def get_form_name_from_last_submission_for_xmlns(domain, xmlns):
    """Return ``form.@name`` from the latest-received form for ``xmlns`` that has one.

    Only forms with a non-null ``form.@name`` are considered. Returns ``None``
    when the query has no hits.
    """
    query = FormES().domain(domain).xmlns(xmlns)
    query = query.sort('received_on', desc=True)
    query = query.source(['form.@name']).size(1).non_null('form.@name')

    hits = query.run().hits
    if not hits:
        return None
    return hits[0]['form']['@name']
def get_form_ids_having_multimedia(domain, app_id, xmlns, datespan, user_types):
    """Return the set of ``_id`` values of matching forms that have attachments.

    Forms are filtered by domain, app and xmlns, and by submission between
    ``datespan.startdate`` and one day after ``datespan.enddate``. The
    ``has_user`` default filter is removed, and ``user_types`` is applied when
    truthy. Attachment selection is delegated to ``_forms_with_attachments``.
    """
    upper_bound = datespan.enddate + timedelta(days=1)
    query = FormES().domain(domain).app(app_id).xmlns(xmlns)
    query = query.submitted(gte=datespan.startdate, lte=upper_bound)
    query = query.remove_default_filter("has_user")
    if user_types:
        query = query.user_type(user_types)

    form_ids = set()
    for form in _forms_with_attachments(query):
        form_ids.add(form['_id'])
    return form_ids
def get_form_duration_stats_by_user(
        domain,
        app_id,
        xmlns,
        user_ids,
        startdate,
        enddate,
        by_submission_time=True):
    """Gets stats on the duration of a selected form grouped by users

    :param domain: domain to search
    :param app_id: optional app filter, applied when truthy
    :param xmlns: value passed to the query's ``xmlns`` filter
    :param user_ids: user ids passed to ``user_ids_handle_unknown``; if it
        contains ``None``, stats for forms missing ``form.meta.userID`` are
        also collected
    :param startdate: lower bound (``gte``) of the date filter
    :param enddate: upper bound (``lt``) of the date filter
    :param by_submission_time: filter on submission time when truthy,
        completion time otherwise
    :return: dict mapping each user id bucket key to its ``duration_stats``
        result, plus ``MISSING_KEY`` when ``None`` was in ``user_ids``
    """
    date_filter_fn = submitted_filter if by_submission_time else completed_filter
    missing_users = None in user_ids

    def duration_stats():
        # A fresh aggregation per use; the script subtracts timeStart from timeEnd.
        return ExtendedStatsAggregation(
            'duration_stats',
            'form.meta.timeStart',
            script="doc['form.meta.timeEnd'].value - doc['form.meta.timeStart'].value",
        )

    query = (
        FormES()
        .domain(domain)
        .user_ids_handle_unknown(user_ids)
        .remove_default_filter('has_user')
        .xmlns(xmlns)
        .filter(date_filter_fn(gte=startdate, lt=enddate))
        .aggregation(
            TermsAggregation('user_id', 'form.meta.userID').aggregation(duration_stats())
        )
        .size(0)
    )

    if app_id:
        query = query.app(app_id)

    if missing_users:
        query = query.aggregation(
            MissingAggregation('missing_user_id', 'form.meta.userID').aggregation(duration_stats())
        )

    result = {}
    aggregations = query.run().aggregations

    if missing_users:
        result[MISSING_KEY] = aggregations.missing_user_id.bucket.duration_stats.result

    # .items() instead of the Python 2-only .iteritems().
    for user_id, bucket in aggregations.user_id.buckets_dict.items():
        result[user_id] = bucket.duration_stats.result
    return result
def _resolve_from_template(self, template, query_context):
    """Count the context user's forms for the template's form within its time range.

    Only templates of type ``'form'`` are handled (asserted). The xmlns is read
    from ``Form.get_form(template.source_id)`` and the date bounds come from
    ``get_daterange_start_end_dates(template.time_range)``.
    """
    # todo: support other types and options
    assert template.type == 'form'
    startdate, enddate = get_daterange_start_end_dates(template.time_range)
    form_xmlns = Form.get_form(template.source_id).xmlns

    query = FormES().user_id(query_context.user._id).xmlns([form_xmlns])
    query = query.submitted(gte=startdate, lte=enddate)
    return query.size(0).count()
def _get_form_counts_by_user(domain, datespan, is_submission_time, user_ids=None):
    """Return the user aggregation's ``counts_by_bucket()`` for forms in ``domain``.

    Forms matching ``SYSTEM_FORM_XMLNS`` are excluded. The datespan's start and
    end dates bound submission time when ``is_submission_time`` is truthy,
    completion time otherwise. ``user_ids`` is applied when truthy.
    """
    not_system_form = filters.NOT(xmlns_filter(SYSTEM_FORM_XMLNS))
    query = FormES().domain(domain).filter(not_system_form)

    apply_date_range = query.submitted if is_submission_time else query.completed
    query = apply_date_range(
        gte=datespan.startdate.date(),
        lte=datespan.enddate.date(),
    )

    if user_ids:
        query = query.user_id(user_ids)

    query = query.user_aggregation().size(0)
    return query.run().aggregations.user.counts_by_bucket()
def _get_form_counts_by_user(domain, datespan, is_submission_time, user_ids=None, export=False):
    """Return the user aggregation's ``counts_by_bucket()`` for forms in ``domain``.

    Forms matching any xmlns in ``SYSTEM_FORM_XMLNS_MAP`` are excluded. The
    datespan's start and end dates bound submission time when
    ``is_submission_time`` is truthy, completion time otherwise. ``user_ids``
    is applied when truthy. ``export`` selects ``ES_EXPORT_INSTANCE`` over
    ``ES_DEFAULT_INSTANCE``.
    """
    if export:
        es_instance = ES_EXPORT_INSTANCE
    else:
        es_instance = ES_DEFAULT_INSTANCE

    query = FormES(es_instance_alias=es_instance).domain(domain)
    for system_xmlns in SYSTEM_FORM_XMLNS_MAP.keys():
        query = query.filter(filters.NOT(xmlns_filter(system_xmlns)))

    apply_date_range = query.submitted if is_submission_time else query.completed
    query = apply_date_range(
        gte=datespan.startdate.date(),
        lte=datespan.enddate.date(),
    )

    if user_ids:
        query = query.user_id(user_ids)

    query = query.user_aggregation().size(0)
    return query.run().aggregations.user.counts_by_bucket()
def _get_form_counts_by_date(domain, user_ids, datespan, timezone, is_submission_time):
    """Return ``{iso_date: doc_count}`` from a date histogram over the users' forms.

    Forms matching ``SYSTEM_FORM_XMLNS`` are excluded.

    :param domain: domain whose forms are counted
    :param user_ids: value passed to the query's ``user_id`` filter
    :param datespan: object with ``startdate`` / ``enddate`` datetimes; their
        ``.date()`` values bound the range (``gte`` / ``lte``)
    :param timezone: timezone object; ``timezone.zone`` is given to the
        histogram and the object itself is used as the ``tz`` for converting
        bucket keys (presumably a pytz timezone -- confirm with callers)
    :param is_submission_time: when truthy use submission time for the range
        and histogram, otherwise completion time
    :return: dict mapping ISO date strings to bucket document counts
    """
    form_query = (
        FormES()
        .domain(domain)
        .user_id(user_ids)
        .filter(filters.NOT(xmlns_filter(SYSTEM_FORM_XMLNS)))
    )

    date_range = {
        'gte': datespan.startdate.date(),
        'lte': datespan.enddate.date(),
    }
    if is_submission_time:
        form_query = form_query.submitted(**date_range).submitted_histogram(timezone.zone)
    else:
        form_query = form_query.completed(**date_range).completed_histogram(timezone.zone)
    form_query = form_query.size(0)

    buckets = form_query.run().aggregations.date_histogram.buckets_list

    # Bucket keys are epoch milliseconds while fromtimestamp() wants seconds,
    # hence the integer division by 1000. The histogram was bucketed in
    # `timezone`, so the key must be converted in that same timezone; without
    # `tz` the machine's local zone is used and the date can come out wrong.
    return {
        datetime.fromtimestamp(bucket.key // 1000, timezone).date().isoformat(): bucket.doc_count
        for bucket in buckets
    }
def get_last_form_submission_for_xmlns(domain, xmlns):
    """Return the form with the latest ``received_on`` for ``xmlns`` in ``domain``.

    :param domain: domain to search
    :param xmlns: value passed to the query's ``xmlns`` filter
    :return: the first hit of the query sorted by ``received_on`` descending,
        or ``None`` when there are no hits
    """
    query = FormES().domain(domain).xmlns(xmlns).sort("received_on", desc=True).size(1)
    # Run once and reuse the hits: a second run() costs another round trip
    # and could come back empty, making the [0] lookup fail.
    hits = query.run().hits
    if hits:
        return hits[0]
    return None
def test_query_completed_date(self):
    """Check ``completed()`` range bounds against forms completed on two dates.

    Two forms are completed at ``early`` and one at ``later``; each bound
    (gt / gte / lt / lte, and gt+lt combined) is checked for its total.
    """
    domain = 'test-completed-{}'.format(self.test_id)
    early = datetime.datetime(2015, 12, 5)
    later = datetime.datetime(2015, 12, 8)
    one_day = datetime.timedelta(days=1)

    forms = 2 * [TestFormMetadata(domain=domain, time_end=early)]
    forms += 1 * [TestFormMetadata(domain=domain, time_end=later)]
    self._ship_forms_to_es(forms)

    base_qs = FormES().domain(domain)
    self.assertEqual(3, base_qs.run().total)

    expectations = [
        # test gt/gte
        (3, {'gt': early - one_day}),
        (3, {'gte': early}),
        (1, {'gt': early}),
        (1, {'gte': later}),
        (0, {'gt': later}),
        # test lt/lte
        (3, {'lt': later + one_day}),
        (3, {'lte': later}),
        (2, {'lt': later}),
        (2, {'lte': early}),
        (0, {'lt': early}),
        # test both
        (0, {'gt': early, 'lt': later}),
    ]
    for expected_total, bounds in expectations:
        self.assertEqual(expected_total, base_qs.completed(**bounds).run().total)