def get_sms_only_domain_stats_data(domains, datespan, interval, datefield="date_created"):
    """
    Build histogram data for domains that appear in SMS results but not in
    form results.

    The SMS-only domains created inside ``datespan`` are bucketed on
    ``datefield`` at ``interval``; the count of SMS-only domains created
    before ``datespan.startdate`` is handed to ``format_return_data`` along
    with the histogram.
    """
    sms_query = SMSES().domain(domains).terms_aggregation("domain", "domains").size(0)
    form_query = FormES().domain(domains).terms_aggregation("domain", "domains").size(0)

    # Domain keys seen in SMS results minus those seen in form results.
    domains_with_sms = set(sms_query.run().aggregations.domains.keys)
    domains_with_forms = set(form_query.run().aggregations.domains.keys)
    sms_only = domains_with_sms - domains_with_forms

    in_range_query = DomainES().in_domains(sms_only)
    in_range_query = in_range_query.created(gte=datespan.startdate, lte=datespan.enddate)
    in_range_query = in_range_query.date_histogram("date", datefield, interval).size(0)
    histogram = in_range_query.run().aggregations.date.as_facet_result()

    earlier_query = DomainES().in_domains(sms_only).created(lt=datespan.startdate).size(0)
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def _domains_over_x_forms(num_forms=200, domains=None):
    """
    Return the set of domain bucket keys whose ``doc_count`` in the form
    domain aggregation is strictly greater than ``num_forms``.

    When ``domains`` is truthy the form query is restricted to it.
    """
    query = FormES().domain_aggregation().size(0)
    if domains:
        query = query.domain(domains)

    matching = set()
    for bucket in query.run().aggregations.domain.buckets_list:
        if bucket.doc_count > num_forms:
            matching.add(bucket.key)
    return matching
def _domains_over_x_forms(num_forms=200, domains=None):
    """
    Return the set of domain facet terms whose ``count`` in the form domain
    facet is strictly greater than ``num_forms``.

    When ``domains`` is truthy the form query is restricted to it.
    """
    query = FormES().domain_facet().size(0)
    if domains:
        query = query.domain(domains)

    matching = set()
    for entry in query.run().facet('domain', 'terms'):
        if entry['count'] > num_forms:
            matching.add(entry['term'])
    return matching
def get_domain_device_breakdown_es(domain_name, monthspan):
    """
    Aggregate a domain's forms by ``form.meta.deviceID``.

    Only forms submitted from ``monthspan.startdate`` (inclusive) up to
    ``monthspan.computed_enddate`` (exclusive) are queried. Returns the
    ``counts_by_bucket()`` result of the ``device_id`` aggregation.
    """
    by_device = TermsAggregation('device_id', 'form.meta.deviceID')
    in_month = FormES().domain(domain_name).submitted(
        gte=monthspan.startdate,
        lt=monthspan.computed_enddate,
    )
    result = in_month.aggregation(by_device).size(0).run()
    return result.aggregations.device_id.counts_by_bucket()
def get_domain_device_breakdown_es(domain_name, monthspan):
    """
    Aggregate a domain's forms by ``form.meta.deviceID`` using a ``FormES``
    built with ``for_export=True``.

    Only forms submitted from ``monthspan.startdate`` (inclusive) up to
    ``monthspan.computed_enddate`` (exclusive) are queried. Returns the
    ``counts_by_bucket()`` result of the ``device_id`` aggregation.
    """
    by_device = TermsAggregation('device_id', 'form.meta.deviceID')
    in_month = FormES(for_export=True).domain(domain_name).submitted(
        gte=monthspan.startdate,
        lt=monthspan.computed_enddate,
    )
    result = in_month.aggregation(by_device).size(0).run()
    return result.aggregations.device_id.counts_by_bucket()
def get_sms_only_domain_stats_data(domains, datespan, interval, datefield="date_created"):
    """
    Build histogram data for domains that appear in the SMS domain facet but
    not in the form domain facet.

    The SMS-only domains created inside ``datespan`` are bucketed on
    ``datefield`` at ``interval``; the count of SMS-only domains created
    before ``datespan.startdate`` is handed to ``format_return_data`` along
    with the histogram.
    """
    sms_query = (
        SMSES()
        .domain(domains)
        .terms_facet("domain", "domains", size=DOMAIN_COUNT_UPPER_BOUND)
        .size(0)
    )
    form_query = (
        FormES()
        .domain(domains)
        .terms_facet("domain", "domains", size=DOMAIN_COUNT_UPPER_BOUND)
        .size(0)
    )

    # Facet terms seen for SMS minus facet terms seen for forms.
    domains_with_sms = set()
    for entry in sms_query.run().facet("domains", "terms"):
        domains_with_sms.add(entry["term"])
    domains_with_forms = set()
    for entry in form_query.run().facet("domains", "terms"):
        domains_with_forms.add(entry["term"])
    sms_only = domains_with_sms - domains_with_forms

    in_range_query = DomainES().in_domains(sms_only)
    in_range_query = in_range_query.created(gte=datespan.startdate, lte=datespan.enddate)
    in_range_query = in_range_query.date_histogram("date", datefield, interval).size(0)
    histogram = in_range_query.run().facet("date", "entries")

    earlier_query = DomainES().in_domains(sms_only).created(lt=datespan.startdate).size(0)
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def _domains_over_x_forms(num_forms=200, domains=None):
    """
    Collect the domain facet terms whose ``count`` exceeds ``num_forms``.

    The form query is narrowed to ``domains`` only when that argument is
    truthy. Returns a set of facet terms.
    """
    form_query = FormES().domain_facet().size(0)
    if domains:
        form_query = form_query.domain(domains)

    facet_entries = form_query.run().facet("domain", "terms")
    over_threshold = filter(lambda entry: entry["count"] > num_forms, facet_entries)
    return {entry["term"] for entry in over_threshold}
def get_domain_device_breakdown_es(domain_name, monthspan):
    """
    Aggregate a domain's forms by ``form.meta.deviceID``, querying the ES
    instance aliased by ``ES_EXPORT_INSTANCE``.

    Only forms submitted from ``monthspan.startdate`` (inclusive) up to
    ``monthspan.computed_enddate`` (exclusive) are queried. Returns the
    ``counts_by_bucket()`` result of the ``device_id`` aggregation.
    """
    by_device = TermsAggregation('device_id', 'form.meta.deviceID')
    in_month = FormES(es_instance_alias=ES_EXPORT_INSTANCE).domain(domain_name).submitted(
        gte=monthspan.startdate,
        lt=monthspan.computed_enddate,
    )
    result = in_month.aggregation(by_device).size(0).run()
    return result.aggregations.device_id.counts_by_bucket()
def test_not_filter_edge_case(self):
    """NOT(OR(domain == 'd', app_id == 'a')) is expected to match only 'doc3'."""
    self._setup_data()
    base_query = FormES().remove_default_filters()
    domain_or_app = filters.OR(
        filters.term('domain', 'd'),
        filters.term('app_id', 'a'),
    )
    negated_query = base_query.filter(filters.NOT(domain_or_app))
    found_ids = negated_query.run().doc_ids
    self.assertEqual(found_ids, ['doc3'])
def total_distinct_users(domain):
    """
    Count the users that have submitted a form in ``domain``.

    User-aggregation keys listed in ``WEIRD_USER_IDS`` are discarded, and
    only keys that also appear in ``CouchUser.ids_by_domain(domain)`` are
    counted.
    """
    form_query = FormES().domain(domain).user_aggregation()

    submitter_ids = set()
    for user_id in form_query.run().aggregations.user.keys:
        if user_id not in WEIRD_USER_IDS:
            submitter_ids.add(user_id)

    known_ids = set(CouchUser.ids_by_domain(domain))
    return len(submitter_ids & known_ids)
def es_query_from_get_params(search_params, domain, reserved_query_params=None, doc_type='form'):
    """
    Translate request-style search parameters into a raw ES query.

    ``doc_type`` must be ``'form'`` or ``'case'`` and selects ``FormES`` or
    ``CaseES``. Default filters are removed and the query is limited to
    ``domain``. Parameters that are reserved, or whose name ends with
    ``__full``, are not turned into filters. Each entry of
    ``query_param_consumers`` is given the remaining parameters; whatever is
    still in the mapping afterwards becomes a lowercase term filter.

    Raises ``Http400`` when a consumer raises ``DateTimeError``.
    Returns ``query.raw_query``.
    """
    # doc_type can be form or case
    assert doc_type in ['form', 'case']
    if doc_type == 'form':
        es = FormES()
    else:
        es = CaseES()
    query = es.remove_default_filters().domain(domain)

    if doc_type == 'form':
        live_forms = filters.term('doc_type', 'xforminstance')
        if 'include_archived' in search_params:
            archived_forms = filters.term('doc_type', 'xformarchived')
            query = query.filter(filters.OR(live_forms, archived_forms))
        else:
            query = query.filter(live_forms)

    if '_search' in search_params:
        # This is an undocumented use case by the Data export tool and one
        # custom project. Validate that the passed-in param is one of these
        # two expected shapes.
        raw_search = json.loads(search_params['_search'])
        query = query.filter(_validate_and_get_es_filter(raw_search))

    # filters are actually going to be a more common case
    reserved_query_params = RESERVED_QUERY_PARAMS | set(reserved_query_params or [])
    query_params = {}
    for param, value in search_params.items():
        if param in reserved_query_params or param.endswith('__full'):
            continue
        query_params[param] = value

    for consumer in query_param_consumers:
        try:
            payload_filter = consumer.consume_params(query_params)
        except DateTimeError as e:
            raise Http400("Bad query parameter: {}".format(str(e)))
        if payload_filter:
            query = query.filter(payload_filter)

    # add unconsumed filters
    for param, value in query_params.items():
        # Assume these fields are analyzed in ES, so convert to lowercase.
        # Any fields that are not analyzed in ES should be handled by the
        # ``query_param_consumers`` above.
        query = query.filter(filters.term(param, value.lower()))

    return query.raw_query
def get_app_submission_breakdown_es(domain_name, monthspan, user_ids=None):
    """
    Run a nested terms aggregation (app_id, device_id, user_id, username)
    over a domain's forms submitted from ``monthspan.startdate`` (inclusive)
    up to ``monthspan.computed_enddate`` (exclusive).

    When ``user_ids`` is not ``None`` the query is additionally restricted
    with ``user_id(user_ids)``. Queries the ES instance aliased by
    ``ES_EXPORT_INSTANCE`` and returns ``NestedTermAggregationsHelper.get_data()``.
    """
    # takes > 1 m to load at 50k worker scale
    field_by_name = (
        ('app_id', 'app_id'),
        ('device_id', 'form.meta.deviceID'),
        ('user_id', 'form.meta.userID'),
        ('username', 'form.meta.username'),
    )
    terms = [AggregationTerm(name, field) for name, field in field_by_name]

    base_query = FormES(es_instance_alias=ES_EXPORT_INSTANCE).domain(domain_name)
    base_query = base_query.submitted(
        gte=monthspan.startdate,
        lt=monthspan.computed_enddate,
    )
    if user_ids is not None:
        base_query = base_query.user_id(user_ids)

    helper = NestedTermAggregationsHelper(base_query=base_query, terms=terms)
    return helper.get_data()
def test_ids_query(self):
    """An ids query for 'doc1' and 'doc2' is expected to return exactly those ids."""
    self._setup_data()
    ids = ['doc1', 'doc2']
    query = FormES().remove_default_filters().ids_query(ids).exclude_source()
    found_ids = query.run().doc_ids
    self.assertEqual(found_ids, ids)
def active_mobile_users(domain, *args):
    """
    Count users seen in form or SMS user facets over the last 30 days.

    Form results are restricted to the IDs from ``get_mobile_users(domain)``.
    If the string ``'inactive'`` is among ``args``, the return value is
    instead ``len(user_ids)`` minus that count.
    """
    now = datetime.utcnow()
    then = now - timedelta(days=30)
    user_ids = get_mobile_users(domain)

    form_query = (
        FormES()
        .domain(domain)
        .user_facet(size=USER_COUNT_UPPER_BOUND)
        .submitted(gte=then)
        .user_id(user_ids)
        .size(0)
    )
    form_users = set()
    for entry in form_query.run().facets.user.result:
        form_users.add(entry['term'])

    sms_query = (
        SMSES()
        .user_facet(size=USER_COUNT_UPPER_BOUND)
        .to_commcare_user()
        .domain(domain)
        .received(gte=then)
        .size(0)
    )
    sms_users = set()
    for entry in sms_query.run().facets.user.result:
        sms_users.add(entry['term'])

    num_users = len(form_users | sms_users)
    if 'inactive' in args:
        return len(user_ids) - num_users
    return num_users
def handle(self, *args, **options):
    """
    Check blob bucket information for attachments of SQL-backend forms
    submitted from 2016-10-14 (inclusive) to 2016-10-20 (exclusive).

    For each attachment without a ``blob_bucket``:

    * if the blob exists in the bucket computed with ``remove_dashes=False``,
      that bucket is written to the attachment;
    * otherwise the bucket computed with ``remove_dashes=True`` is checked,
      and an error is logged if the blob is not there either.

    All messages go through the module-level ``logger`` so that success and
    error lines share the same logger configuration.
    """
    possible_bad_forms = (
        FormES()
        .submitted(
            gte=datetime.date(2016, 10, 14),
            lt=datetime.date(2016, 10, 20),
        )
        .filter(filters.term('backend_id', 'sql'))
        .source('_id')
    ).run().hits
    form_ids = [form['_id'] for form in possible_bad_forms]

    blob_db = get_blob_db()
    for form_id in form_ids:
        form = FormAccessorSQL.get_form(form_id)
        for attachment in form.get_attachments():
            if attachment.blob_bucket:
                # Bucket already recorded; nothing to repair.
                continue

            attach_id = str(attachment.attachment_id)
            bucket = attachment.blobdb_bucket(remove_dashes=False)
            if blob_db.exists(attachment.blob_id, bucket):
                FormAccessorSQL.write_blob_bucket(attachment, bucket)
                logger.info("%s overwritten blob_bucket_succesfully", attach_id)
                continue

            # This is the default and what we want long term;
            # verify it exists.
            bucket = attachment.blobdb_bucket(remove_dashes=True)
            if not blob_db.exists(attachment.blob_id, bucket):
                logger.error("%s does not exist in either expected bucket", attach_id)
def get_user_type_filters(histo_type, user_type_mobile, require_submissions):
    """
    Build a ``{'terms': {...}}`` filter dict for the given histogram type.

    * ``'forms'``: filter ``form.meta.userID`` on ``get_user_ids(user_type_mobile)``.
    * ``'users_all'``: filter ``_id`` on those user IDs, intersected with the
      users seen in form or incoming-SMS facets when ``require_submissions``
      is truthy.
    * anything else: the ``terms`` dict is left empty.
    """
    terms = {}
    if histo_type == 'forms':
        terms["form.meta.userID"] = list(get_user_ids(user_type_mobile))
    elif histo_type == 'users_all':
        existing_users = get_user_ids(user_type_mobile)
        if not require_submissions:
            filtered_real_users = existing_users
        else:
            LARGE_NUMBER = 1000 * 1000 * 10
            form_facet = FormES().user_facet().size(0).run().facets.user.result
            real_form_users = {user_count['term'] for user_count in form_facet}
            sms_query = (
                SMSES()
                .terms_facet('couch_recipient', 'user', LARGE_NUMBER)
                .incoming_messages()
                .size(0)
            )
            sms_facet = sms_query.run().facets.user.result
            real_sms_users = {user_count['term'] for user_count in sms_facet}
            submitters = real_form_users | real_sms_users
            filtered_real_users = existing_users & submitters
        terms['_id'] = list(filtered_real_users)
    return {'terms': terms}
def get_active_countries_stats_data(domains, datespan, interval, datefield='received_on'):
    """
    Returns list of timestamps and how many countries were active in the 30
    days before the timestamp.

    For every timestamp produced by ``daterange`` over ``datespan``, the
    domains (out of ``domains``) with forms submitted in the preceding 30
    days are looked up, and the number of ``countries`` facet terms across
    those domains is recorded. Timestamps with a zero count are omitted.

    ``datefield`` is accepted but not read by this function.
    """
    histo_data = []
    for timestamp in daterange(interval, datespan.startdate, datespan.enddate):
        t = timestamp
        f = timestamp - relativedelta(days=30)
        form_query = (
            FormES()
            .domain(domains)
            .terms_facet('domain', 'domains', size=LARGE_ES_NUMBER)
            .submitted(gte=f, lte=t)
            .size(0)
        )
        # Keep the per-window result in its own name. Rebinding ``domains``
        # here would make every later window filter on the previous window's
        # active domains instead of the caller's original domain list.
        active_domains = [
            x['term'] for x in form_query.run().facet('domains', "terms")
        ]
        countries = (
            DomainES()
            .in_domains(active_domains)
            .terms_facet('countries', 'countries', size=LARGE_ES_NUMBER)
        )
        c = len(countries.run().facet('countries', 'terms'))
        if c > 0:
            histo_data.append(get_data_point(c, timestamp))
    return format_return_data(histo_data, 0, datespan)
def get_active_users_data(domains, datespan, interval, datefield='date',
                          additional_params_es=None, include_forms=False):
    """
    Returns list of timestamps and how many users of SMS were active in the
    30 days before each timestamp.

    :param domains: domains passed to the SMS and form queries
    :param datespan: object providing ``startdate`` and ``enddate``
    :param interval: step passed to ``daterange``
    :param datefield: accepted but not read by this function
    :param additional_params_es: optional extra params; when truthy they are
        applied to the SMS query through ``add_params_to_query``. Defaults to
        ``None`` (rather than a shared mutable ``{}``); any falsy value
        behaves the same.
    :param include_forms: when truthy, users from the form user aggregation
        (restricted to ``get_mobile_users(domains)``) are counted as well
    :return: result of ``format_return_data``; timestamps with a zero count
        are omitted
    """
    histo_data = []
    # The mobile user list is only needed for the form query, so skip the
    # lookup entirely when forms are not included.
    mobile_users = get_mobile_users(domains) if include_forms else None
    for timestamp in daterange(interval, datespan.startdate, datespan.enddate):
        t = timestamp
        f = timestamp - relativedelta(days=30)
        sms_query = get_sms_query(f, t, 'users', 'couch_recipient', domains)
        if additional_params_es:
            sms_query = add_params_to_query(sms_query, additional_params_es)
        users = set(sms_query.incoming_messages().run().aggregations.users.keys)
        if include_forms:
            form_query = (
                FormES()
                .domain(domains)
                .user_aggregation()
                .submitted(gte=f, lte=t)
                .user_id(mobile_users)
                .size(0)
            )
            users |= set(form_query.run().aggregations.user.keys)
        c = len(users)
        if c > 0:
            histo_data.append(get_data_point(c, timestamp))
    return format_return_data(histo_data, 0, datespan)
def get_forms_for_users(domain, user_ids, start, end):
    """
    Scroll over a domain's forms submitted between ``start`` and ``end``
    (both inclusive) by any of ``user_ids``.

    Only ``form.meta.userID``, ``form.case`` and ``form.@xmlns`` are requested
    as source fields. Queries the ES instance aliased by
    ``ES_EXPORT_INSTANCE`` and returns the result of ``query.scroll()``.
    """
    source_fields = ['form.meta.userID', 'form.case', 'form.@xmlns']
    query = FormES(es_instance_alias=ES_EXPORT_INSTANCE).domain(domain)
    query = query.submitted(gte=start, lte=end)
    query = query.user_id(user_ids).source(source_fields)
    return query.scroll()
def j2me_forms_in_last(domain, days):
    """
    Return the total of the ``j2me_submissions`` form query for ``domain``
    over the last ``days`` days (``days`` is converted with ``int``).
    """
    window = timedelta(days=int(days))
    then = datetime.utcnow() - window
    query = FormES().domain(domain).j2me_submissions(gte=then).size(0)
    return query.run().total
def get_300th_form_submission_received(domain):
    """
    Return the ``received_on`` value, converted with
    ``iso_string_to_datetime``, of the single form found at ``start(300)``
    when the domain's forms are sorted by ``received_on``.

    Returns ``None`` when the query yields no hit.
    """
    # NOTE(review): if ``start`` is a zero-based offset, this selects the
    # 301st form rather than the 300th -- confirm against ESQuery.start.
    query = FormES().domain(domain).start(300).size(1)
    query = query.sort('received_on').fields(['received_on'])
    hits = query.run().hits
    if hits:
        return iso_string_to_datetime(hits[0]['received_on'])
    return None
def get_sms_only_domain_stats_data(domains, datespan, interval, datefield='date_created'):
    """
    Build histogram data for domains that appear in SMS results but not in
    form results.

    The SMS-only domains created inside ``datespan`` are bucketed on
    ``datefield`` at ``interval``; the count of SMS-only domains created
    before ``datespan.startdate`` is handed to ``format_return_data`` along
    with the histogram.
    """
    sms_query = SMSES().domain(domains).terms_aggregation('domain', 'domains').size(0)
    form_query = FormES().domain(domains).terms_aggregation('domain', 'domains').size(0)

    # Domain keys seen in SMS results minus those seen in form results.
    sms_keys = sms_query.run().aggregations.domains.keys
    form_keys = form_query.run().aggregations.domains.keys
    sms_only = set(sms_keys).difference(form_keys)

    created_in_span = (
        DomainES()
        .in_domains(sms_only)
        .created(gte=datespan.startdate, lte=datespan.enddate)
        .date_histogram('date', datefield, interval)
        .size(0)
    )
    histogram = created_in_span.run().aggregations.date.as_facet_result()

    created_earlier = (
        DomainES()
        .in_domains(sms_only)
        .created(lt=datespan.startdate)
        .size(0)
    )
    earlier_total = created_earlier.run().total

    return format_return_data(histogram, earlier_total, datespan)
def active_mobile_users(domain, *args):
    """
    Count users seen in form or incoming-SMS user aggregations over the
    last 30 days.

    Form results are restricted to the IDs from ``get_mobile_users(domain)``.
    If the string ``'inactive'`` is among ``args``, the return value is
    instead ``len(user_ids)`` minus that count.
    """
    now = datetime.utcnow()
    then = now - timedelta(days=30)
    user_ids = get_mobile_users(domain)

    form_query = FormES().domain(domain).user_aggregation()
    form_query = form_query.submitted(gte=then).user_id(user_ids).size(0)
    form_users = set(form_query.run().aggregations.user.keys)

    sms_query = SMSES().incoming_messages().user_aggregation().to_commcare_user()
    sms_query = sms_query.domain(domain).received(gte=then).size(0)
    sms_users = set(sms_query.run().aggregations.user.keys)

    num_users = len(form_users.union(sms_users))
    if 'inactive' in args:
        return len(user_ids) - num_users
    return num_users
def get_submitted_users():
    """
    Return the union of user facet terms from all forms and the
    ``couch_recipient`` facet terms from incoming SMS.
    """
    form_query = FormES().user_facet(size=USER_COUNT_UPPER_BOUND).size(0)
    real_form_users = set()
    for user_count in form_query.run().facets.user.result:
        real_form_users.add(user_count['term'])

    sms_query = (
        SMSES()
        .terms_facet('couch_recipient', 'user', USER_COUNT_UPPER_BOUND)
        .incoming_messages()
        .size(0)
    )
    real_sms_users = set()
    for user_count in sms_query.run().facets.user.result:
        real_sms_users.add(user_count['term'])

    return real_form_users.union(real_sms_users)
def get_active_users_data(domains, datespan, interval, datefield='date',
                          additional_params_es=None, include_forms=False):
    """
    Returns list of timestamps and how many users of SMS were active in the
    30 days before each timestamp.

    :param domains: domains passed to the SMS and form queries
    :param datespan: object providing ``startdate`` and ``enddate``
    :param interval: step passed to ``daterange``
    :param datefield: accepted but not read by this function
    :param additional_params_es: optional extra params; when truthy they are
        applied to the SMS query through ``add_params_to_query``. Defaults to
        ``None`` (rather than a shared mutable ``{}``); any falsy value
        behaves the same.
    :param include_forms: when truthy, users from the form user facet
        (restricted to ``get_mobile_users(domains)``) are counted as well
    :return: result of ``format_return_data``; timestamps with a zero count
        are omitted
    """
    histo_data = []
    # The mobile user list is only needed for the form query, so skip the
    # lookup entirely when forms are not included.
    mobile_users = get_mobile_users(domains) if include_forms else None
    for timestamp in daterange(interval, datespan.startdate, datespan.enddate):
        t = timestamp
        f = timestamp - relativedelta(days=30)
        sms_query = get_sms_query(f, t, 'users', 'couch_recipient', domains,
                                  USER_COUNT_UPPER_BOUND)
        if additional_params_es:
            sms_query = add_params_to_query(sms_query, additional_params_es)
        users = {u['term'] for u in sms_query.run().facet('users', "terms")}
        if include_forms:
            form_query = (
                FormES()
                .domain(domains)
                .user_facet(size=USER_COUNT_UPPER_BOUND)
                .submitted(gte=f, lte=t)
                .user_id(mobile_users)
                .size(0)
            )
            users |= {u['term'] for u in form_query.run().facets.user.result}
        c = len(users)
        if c > 0:
            histo_data.append(get_data_point(c, timestamp))
    return format_return_data(histo_data, 0, datespan)
def get_active_countries_stats_data(domains, datespan, interval, datefield='received_on'):
    """
    Build data points of how many ``countries`` aggregation keys exist
    across the domains that had forms submitted in the 30 days before each
    timestamp.

    Timestamps come from ``daterange`` over ``datespan``; those with a zero
    count are omitted. ``datefield`` is accepted but not read here.
    """
    data_points = []
    for window_end in daterange(interval, datespan.startdate, datespan.enddate):
        window_start = window_end - relativedelta(days=30)

        form_query = FormES().domain(domains).terms_aggregation('domain', 'domains')
        form_query = form_query.submitted(gte=window_start, lte=window_end).size(0)
        active_domains = form_query.run().aggregations.domains.keys

        country_query = DomainES().in_domains(active_domains)
        country_query = country_query.terms_aggregation('countries', 'countries').size(0)
        country_count = len(country_query.run().aggregations.countries.keys)

        if country_count > 0:
            data_points.append(get_data_point(country_count, window_end))
    return format_return_data(data_points, 0, datespan)
def get_sms_only_domain_stats_data(domains, datespan, interval, datefield='date_created'):
    """
    Build histogram data for domains that appear in the SMS domain facet but
    not in the form domain facet.

    The SMS-only domains created inside ``datespan`` are bucketed on
    ``datefield`` at ``interval``; the count of SMS-only domains created
    before ``datespan.startdate`` is handed to ``format_return_data`` along
    with the histogram.
    """
    sms_query = SMSES().domain(domains)
    sms_query = sms_query.terms_facet('domain', 'domains', size=LARGE_ES_NUMBER).size(0)
    form_query = FormES().domain(domains)
    form_query = form_query.terms_facet('domain', 'domains', size=LARGE_ES_NUMBER).size(0)

    # Facet terms seen for SMS minus facet terms seen for forms.
    sms_terms = [entry['term'] for entry in sms_query.run().facet('domains', 'terms')]
    form_terms = [entry['term'] for entry in form_query.run().facet('domains', 'terms')]
    sms_only = set(sms_terms) - set(form_terms)

    created_in_span = (
        DomainES()
        .in_domains(sms_only)
        .created(gte=datespan.startdate, lte=datespan.enddate)
        .date_histogram('date', datefield, interval)
        .size(0)
    )
    histogram = created_in_span.run().facet('date', 'entries')

    created_earlier = (
        DomainES()
        .in_domains(sms_only)
        .created(lt=datespan.startdate)
        .size(0)
    )
    earlier_total = created_earlier.run().total

    return format_return_data(histogram, earlier_total, datespan)
def active_mobile_users(domain, start, end, *args):
    """
    Gather mobile users and their form / SMS activity for ``domain.name``
    from ``start`` (inclusive) up to ``end`` (exclusive).

    Returns a 3-tuple:

    * the set of IDs from ``get_mobile_users(domain.name)``;
    * ``counts_by_bucket()`` of the form user aggregation, restricted to
      those IDs;
    * the set of keys of the incoming-SMS user aggregation.

    Both queries use the ES instance aliased by ``ES_EXPORT_INSTANCE``.
    """
    domain_name = domain.name
    user_ids = get_mobile_users(domain_name)

    form_query = FormES(es_instance_alias=ES_EXPORT_INSTANCE).domain(domain_name)
    form_query = form_query.user_aggregation().submitted(gte=start, lt=end)
    form_query = form_query.user_id(user_ids).size(0)
    form_users = form_query.run().aggregations.user.counts_by_bucket()

    sms_query = SMSES(es_instance_alias=ES_EXPORT_INSTANCE).incoming_messages()
    sms_query = sms_query.user_aggregation().to_commcare_user()
    sms_query = sms_query.domain(domain_name).received(gte=start, lt=end).size(0)
    sms_users = set(sms_query.run().aggregations.user.keys)

    return set(user_ids), form_users, sms_users
def get_submitted_users():
    """
    Return the union of user aggregation keys from all forms and the
    ``couch_recipient`` aggregation keys from incoming SMS.
    """
    form_query = FormES().user_aggregation().size(0)
    form_user_ids = set(form_query.run().aggregations.user.keys)

    sms_query = SMSES().terms_aggregation('couch_recipient', 'user')
    sms_query = sms_query.incoming_messages().size(0)
    sms_user_ids = set(sms_query.run().aggregations.user.keys)

    return form_user_ids.union(sms_user_ids)
def commtrack_form_submissions(domains, datespan, interval, datefield='received_on'):
    """
    Build histogram data of forms submitted by the users from
    ``get_mobile_users(domains)``.

    Forms submitted inside ``datespan`` are bucketed on ``datefield`` at
    ``interval``; the total submitted before ``datespan.startdate`` is handed
    to ``format_return_data`` along with the histogram.
    """
    mobile_workers = get_mobile_users(domains)

    in_span_query = FormES().domain(domains)
    in_span_query = in_span_query.submitted(gte=datespan.startdate, lte=datespan.enddate)
    in_span_query = in_span_query.date_histogram('date', datefield, interval)
    in_span_query = in_span_query.user_id(mobile_workers).size(0)
    histogram = in_span_query.run().aggregations.date.as_facet_result()

    earlier_query = FormES().domain(domains).submitted(lt=datespan.startdate)
    earlier_query = earlier_query.user_id(mobile_workers).size(0)
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def get_active_countries_stats_data(domains, datespan, interval, datefield="received_on"):
    """
    Build data points of how many ``countries`` aggregation keys exist
    across the domains that had forms submitted in the 30 days before each
    timestamp.

    Timestamps come from ``daterange`` over ``datespan``; those with a zero
    count are omitted. ``datefield`` is accepted but not read here.
    """
    data_points = []
    for window_end in daterange(interval, datespan.startdate, datespan.enddate):
        window_start = window_end - relativedelta(days=30)

        form_query = (
            FormES()
            .domain(domains)
            .terms_aggregation("domain", "domains")
            .submitted(gte=window_start, lte=window_end)
            .size(0)
        )
        active_domains = form_query.run().aggregations.domains.keys

        country_query = (
            DomainES()
            .in_domains(active_domains)
            .terms_aggregation("countries", "countries")
            .size(0)
        )
        country_count = len(country_query.run().aggregations.countries.keys)

        if country_count > 0:
            data_points.append(get_data_point(country_count, window_end))
    return format_return_data(data_points, 0, datespan)
def commtrack_form_submissions(domains, datespan, interval, datefield="received_on"):
    """
    Build histogram data of forms submitted by the users from
    ``get_mobile_users(domains)``.

    Forms submitted inside ``datespan`` are bucketed on ``datefield`` at
    ``interval``; the total submitted before ``datespan.startdate`` is handed
    to ``format_return_data`` along with the histogram.
    """
    mobile_workers = get_mobile_users(domains)

    in_span_query = FormES().domain(domains)
    in_span_query = in_span_query.submitted(gte=datespan.startdate, lte=datespan.enddate)
    in_span_query = in_span_query.date_histogram("date", datefield, interval)
    in_span_query = in_span_query.user_id(mobile_workers).size(0)
    histogram = in_span_query.run().aggregations.date.as_facet_result()

    earlier_query = (
        FormES()
        .domain(domains)
        .submitted(lt=datespan.startdate)
        .user_id(mobile_workers)
        .size(0)
    )
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def commtrack_form_submissions(domains, datespan, interval, datefield="received_on"):
    """
    Build histogram data of forms submitted by mobile users.

    The user IDs are the ``_id`` of every raw hit of a ``UserES`` query for
    mobile users with inactive ones shown. Forms submitted inside
    ``datespan`` are bucketed on ``datefield`` at ``interval``; the total
    submitted before ``datespan.startdate`` is handed to
    ``format_return_data`` along with the histogram.
    """
    user_query = UserES().fields([]).mobile_users().show_inactive()
    mobile_workers = []
    for hit in user_query.run().raw_hits:
        mobile_workers.append(hit["_id"])

    in_span_query = FormES().domain(domains)
    in_span_query = in_span_query.submitted(gte=datespan.startdate, lte=datespan.enddate)
    in_span_query = in_span_query.date_histogram("date", datefield, interval)
    in_span_query = in_span_query.user_id(mobile_workers).size(0)
    histogram = in_span_query.run().facet("date", "entries")

    earlier_query = (
        FormES()
        .domain(domains)
        .submitted(lt=datespan.startdate)
        .user_id(mobile_workers)
        .size(0)
    )
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def db_comparisons(request):
    """
    Return a JSON response comparing document counts across Couch, ES and
    SQL for users, domains, forms and cases.

    Each entry of the response carries:

    * ``description``: label of the compared document kind;
    * ``couch_docs``: reduced value of the configured Couch view;
    * ``es_docs``: ``total`` of the configured ES query;
    * ``sql_rows``: the SQL row count, or ``'n/a'`` when no SQL count is
      configured (``None``). A genuine count of ``0`` is reported as ``0``.

    ``request`` is not read by this function.
    """
    comparison_config = [
        {
            'description': 'Users (base_doc is "CouchUser")',
            'couch_db': CommCareUser.get_db(),
            'view_name': 'users/by_username',
            'es_query': UserES().remove_default_filter('active').remove_default_filter(
                'mobile_worker').size(0),
            'sql_rows': User.objects.count(),
        },
        {
            'description': 'Domains (doc_type is "Domain")',
            'couch_db': Domain.get_db(),
            'view_name': 'domain/by_status',
            'es_query': DomainES().size(0),
            'sql_rows': None,
        },
        {
            'description': 'Forms (doc_type is "XFormInstance")',
            'couch_db': XFormInstance.get_db(),
            'view_name': 'couchforms/by_xmlns',
            'es_query': FormES().remove_default_filter('has_xmlns').remove_default_filter(
                'has_user').size(0),
            'sql_rows': FormData.objects.count(),
        },
        {
            'description': 'Cases (doc_type is "CommCareCase")',
            'couch_db': CommCareCase.get_db(),
            'view_name': 'case/by_owner',
            'es_query': CaseES().size(0),
            'sql_rows': None,
        },
    ]

    comparisons = []
    for comp in comparison_config:
        sql_rows = comp['sql_rows']
        comparisons.append({
            'description': comp['description'],
            'couch_docs': comp['couch_db'].view(
                comp['view_name'],
                reduce=True,
            ).one()['value'],
            'es_docs': comp['es_query'].run().total,
            # Test against None explicitly: an empty SQL table has a real
            # count of 0, which must not be displayed as 'n/a'.
            'sql_rows': sql_rows if sql_rows is not None else 'n/a',
        })
    return json_response(comparisons)
def get_forms_for_users(domain, user_ids, start, end):
    """
    Group a domain's forms, submitted between ``start`` and ``end`` (both
    inclusive) by any of ``user_ids``, under a terms aggregation on
    ``form.meta.userID``.

    Each user bucket carries a ``top_hits_user_submissions`` top-hits
    sub-aggregation (size 1000000) including ``form.case`` and
    ``form.@xmlns``. Returns the ``buckets_dict`` of the ``user_id``
    aggregation.
    """
    form_query = FormES().domain(domain).submitted(gte=start, lte=end)
    form_query = form_query.user_id(user_ids)

    top_hits = TopHitsAggregation(
        name='top_hits_user_submissions',
        size=1000000,
        include=['form.case', 'form.@xmlns'],
    )
    per_user = TermsAggregation('user_id', 'form.meta.userID').aggregation(top_hits)

    result = form_query.aggregation(per_user).size(0).run()
    return result.aggregations.user_id.buckets_dict
def commtrack_form_submissions(domains, datespan, interval, datefield='received_on'):
    """
    Build histogram data of forms submitted by mobile users.

    The user IDs are the ``_id`` of every raw hit of a ``UserES`` query for
    mobile users with inactive ones shown. Forms submitted inside
    ``datespan`` are bucketed on ``datefield`` at ``interval``; the total
    submitted before ``datespan.startdate`` is handed to
    ``format_return_data`` along with the histogram.
    """
    user_hits = UserES().fields([]).mobile_users().show_inactive().run().raw_hits
    mobile_workers = list(map(lambda hit: hit['_id'], user_hits))

    in_span_query = (
        FormES()
        .domain(domains)
        .submitted(gte=datespan.startdate, lte=datespan.enddate)
        .date_histogram('date', datefield, interval)
        .user_id(mobile_workers)
        .size(0)
    )
    histogram = in_span_query.run().facet('date', 'entries')

    earlier_query = (
        FormES()
        .domain(domains)
        .submitted(lt=datespan.startdate)
        .user_id(mobile_workers)
        .size(0)
    )
    earlier_total = earlier_query.run().total

    return format_return_data(histogram, earlier_total, datespan)
def get_app_submission_breakdown_es(domain_name, monthspan):
    """
    Run a nested terms aggregation (app_id, device_id, user_id, username)
    over a domain's forms submitted from ``monthspan.startdate`` (inclusive)
    up to ``monthspan.computed_enddate`` (exclusive).

    Returns ``NestedTermAggregationsHelper.get_data()``.
    """
    field_by_name = (
        ('app_id', 'app_id'),
        ('device_id', 'form.meta.deviceID'),
        ('user_id', 'form.meta.userID'),
        ('username', 'form.meta.username'),
    )
    terms = [AggregationTerm(name, field) for name, field in field_by_name]

    base_query = FormES().domain(domain_name).submitted(
        gte=monthspan.startdate,
        lt=monthspan.computed_enddate,
    )
    helper = NestedTermAggregationsHelper(base_query=base_query, terms=terms)
    return helper.get_data()