def intake_user(api_type, api_version):
    form = IntakeUserForm(request.form)
    # TODO: Change this to validate_or_400 after writing intake tests
    # to confirm it's proper behavior.
    if not form.validate():
        abort(400, "Request data validation failed with the following "
                   "errors: \n%s" % "\n".join(
                       "%s - %s" % (field, ",".join(errors))
                       for field, errors in form.errors.items()))

    user_id = hash_value(form.user_id.data, pre_hashed=form._pre_hashed)
    existing_user = IntakeUser.query(IntakeUser.api_type == api_type,
                                     IntakeUser.user_id == user_id,
                                     IntakeUser.org == g.consumer.org).get()

    user_fields = {}
    user_fields.update(form.data)
    # Remove fields that are not to be persisted to the IntakeUser itself.
    for field in list(user_fields.keys()):
        if not hasattr(IntakeUser, field):
            del user_fields[field]

    updated_user = IntakeUser.create_or_update(user_fields, g.consumer.key,
                                               g.consumer.org, api_type,
                                               existing_user=existing_user,
                                               pre_hashed=form._pre_hashed)
    return jsonify(user_id=form.user_id.data, is_new=(existing_user is None))
def organization_management(organization_key):
    org = ndb.Key(urlsafe=organization_key).get()
    intake_user_count = IntakeUser.query(IntakeUser.org == org.key).count()

    most_recent_update = IntakeUser.query().order(-IntakeUser.updated).get()
    if most_recent_update:
        most_recent_update = most_recent_update.updated.strftime(
            "%a %b %d %H:%M:%S %Y")

    pii_stats = org_pii_stats(org.key)
    ordered_transaction_stats = org_transaction_stats(org.key)

    ordered_pii_stats = OrderedDict()
    ordered_pii_stats["total"] = intake_user_count
    for field, value in sorted(pii_stats.items(), key=lambda i: i[1],
                               reverse=True):
        ordered_pii_stats[field] = value

    if request.method == "POST":
        form = OrganizationForm(request.form)
        if form.validate():
            org.credit = form.credit.data
            org.org_type = form.org_type.data
            org.put()
            flash("Organization %s successfully updated." % org.full_name)
    else:
        form = OrganizationForm(credit=org.credit, org_type=org.org_type)

    return render_template("admin/organization.html", org=org, form=form,
                           intake_user_count=intake_user_count,
                           most_recent_update=most_recent_update,
                           pii_stats=ordered_pii_stats,
                           transaction_stats=ordered_transaction_stats)
def check_on_data():
    api_user_count = IntakeUser.query(IntakeUser.api_type == "api").count()
    sandbox_user_count = IntakeUser.query(
        IntakeUser.api_type == "sandbox").count()
    total_user_count = IntakeUser.query().count()
    return render_template("admin/check_on_data.html",
                           api_user_count=api_user_count,
                           sandbox_user_count=sandbox_user_count,
                           total_user_count=total_user_count)
def org_pii_stats(organization_key):
    org = organization_key.get()
    pii_stats = {}
    for pii_field in PII_FIELDS:
        if hasattr(IntakeUser, pii_field):
            # Note: "!= None" builds an ndb inequality filter here;
            # "is not None" would not produce a query filter.
            count = IntakeUser.query(IntakeUser.org == org.key).filter(
                getattr(IntakeUser, pii_field) != None).count()
        else:
            count = 0
        pii_stats[pii_field] = count
    return pii_stats
def compute_org_value(organization_key):
    """
    Task that computes the current value of our data for the given
    organization. This task can take a long time and should be run on
    a backend.
    """
    org = ndb.Key(urlsafe=organization_key).get()

    vs = ValueSummary.query(ValueSummary.organization == org.key).get()
    if not vs:
        vs = ValueSummary(organization=org.key)
    vs.processed_users = 0
    vs.total_users = IntakeUser.query(IntakeUser.org == org.key).count()
    vs.is_running = True
    vs.overlap_counts = {}
    vs.org_name = org.full_name
    vs.put()

    overlap_counts = defaultdict(int)
    for intake_user in IntakeUser.query(IntakeUser.org == org.key):
        #logging.info(intake_user.pii_dict())
        user_sets = find_users(intake_user.pii_dict(), pre_hashed=True)
        #logging.info("USER SETS:")
        #logging.info(user_sets)
        lrg_summary = generate_lrg_summary(user_sets, [org.key])
        #logging.info("LRG SUMMARY:")
        #logging.info(lrg_summary)
        if lrg_summary:
            overlap_counts[lrg_summary["marketplace_memberships"]] += 1
        vs.processed_users += 1
        vs.put()

    vs.is_running = False
    vs.overlap_counts = overlap_counts
    vs.put()
    return jsonify(success=True, processed_users=vs.processed_users)
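
# The ValueSummary model is not part of this excerpt. A minimal sketch of
# what it might look like, inferred only from the fields compute_org_value
# reads and writes; the property types are assumptions, not the real model.
#
# class ValueSummary(ndb.Model):
#     organization = ndb.KeyProperty(kind="Organization")
#     org_name = ndb.StringProperty()
#     processed_users = ndb.IntegerProperty(default=0)
#     total_users = ndb.IntegerProperty(default=0)
#     is_running = ndb.BooleanProperty(default=False)
#     overlap_counts = ndb.JsonProperty(default={})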
def data_quality():
    if request.method == "POST":
        BATCH_SIZE = 200
        key_batch = []
        for intake_user_key in IntakeUser.query().iter(keys_only=True):
            key_batch.append(intake_user_key.urlsafe())
            if len(key_batch) == BATCH_SIZE:
                _enqueue_rehashing(key_batch)
                key_batch = []
        if key_batch:
            _enqueue_rehashing(key_batch)
        return redirect(url_for("data_quality"))

    return render_template("admin/data_quality.html")
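
# _enqueue_rehashing is referenced above but not shown in this excerpt. A
# minimal sketch, assuming it mirrors the taskqueue pattern used by
# enqueue_task below and that a "rehash_users" handler exists (both the
# helper body and the endpoint name are assumptions):
#
# def _enqueue_rehashing(key_batch):
#     # Hand a batch of urlsafe IntakeUser keys to a task queue worker.
#     taskqueue.add(url=url_for("rehash_users"),
#                   payload=json.dumps(key_batch))
#     logging.info("Enqueued %d users for rehashing." % len(key_batch))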
def org_transaction_stats(organization_key):
    org = organization_key.get()
    transaction_stats = OrderedDict()
    UPPER_BOUND = 10
    for i in range(UPPER_BOUND + 1):
        count_q = IntakeUser.query(IntakeUser.org == org.key)
        if i < UPPER_BOUND:
            count_q = count_q.filter(IntakeUser.transaction_count == i)
        else:
            count_q = count_q.filter(IntakeUser.transaction_count >= i)
        count = count_q.count()

        key_name = unicode(i)
        if i >= UPPER_BOUND:
            key_name += "+"
        transaction_stats[key_name] = count
    return transaction_stats
def run_analysis(org_key=None):
    """
    Task that deletes all existing CombinedUsers and regenerates them from
    the underlying IntakeUsers. This task can take a long time to run, so
    it should be run on a dedicated instance.
    """
    DELETE_BATCH = 500
    ANALYZE_BATCH = 50

    # Clear out the existing combined users
    cu_query = CombinedUser.query()
    if org_key:
        org_key = ndb.Key(urlsafe=org_key)
        cu_query = cu_query.filter(CombinedUser.orgs == org_key)
    while True:
        results, cursor, more = cu_query.fetch_page(DELETE_BATCH,
                                                    keys_only=True)
        ndb.delete_multi(results)
        if not more:
            break

    # Analyze all the intake users, on a per-organization basis
    if org_key:
        org_keys = [org_key]
    else:
        org_keys = Organization.query().iter(keys_only=True)

    for org_key in org_keys:
        counter_key = "analysis::run_count::%s" % org_key.urlsafe()
        memcache.set(key=counter_key, value=0)
        iu_query = IntakeUser.query(IntakeUser.org == org_key)
        for iu_key in iu_query.iter(keys_only=True):
            memcache.incr(counter_key)
            #deferred.defer(analyze_user, intake_user_key=iu_key)
            analyze_user(iu_key)
        generate_csv(org_key.get().name)

    return "Great Success"
def enqueue_task(task_func):
    BATCH_SIZE = 5 if DEVELOPMENT else 200
    key_batch = []
    total_size = 0
    for intake_user_key in IntakeUser.query().iter(keys_only=True):
        key_batch.append(intake_user_key.urlsafe())
        total_size += 1
        if len(key_batch) == BATCH_SIZE:
            taskqueue.add(url=url_for(task_func),
                          payload=json.dumps(key_batch))
            logging.info("Successfully enqueued %d users for task %s." %
                         (len(key_batch), task_func))
            key_batch = []
    if key_batch:
        taskqueue.add(url=url_for(task_func), payload=json.dumps(key_batch))
        logging.info("Successfully enqueued %d users for task %s." %
                     (len(key_batch), task_func))
    return jsonify(total_size=total_size)
def analysis():
    orgs = Organization.query().fetch(50)
    orgs_info = {}
    for org in orgs:
        total_users = IntakeUser.query(IntakeUser.org == org.key).count()
        if total_users == 0:
            continue
        orgs_info[org.name] = {}
        counter_key = "analysis::run_count::%s" % org.key.urlsafe()
        orgs_info[org.name]["analyzed_users"] = memcache.get(counter_key)
        orgs_info[org.name]["total_users"] = total_users
        orgs_info[org.name]["org_key"] = org.key.urlsafe()

    if request.method == "POST":
        org_key = request.form.get("org_key", None)
        taskqueue.add(url=url_for("run_analysis", org_key=org_key),
                      target="analyzer")
        return redirect(url_for("analysis"))

    return render_template("admin/analysis.html", orgs_info=orgs_info)
def find_matching_users(pii_fields, api_type, org_key=None,
                        previous_matches=None):
    """
    Takes a dict of pii fields and finds all matching IntakeUsers. Each
    match comes with a list of which PII fields were hits.

    Args:
        pii_fields: A dictionary of pii field -> [set or list of values].
        api_type: Which api to search, such as "api" or "sandbox"
        org_key: Organization to which the query should be limited
        previous_matches: A set of keys of all IntakeUser objects matched
            up to this point.

    Returns:
        A list of dictionaries of IntakeUser keys to MatchingIntakeUsers
        with the dict of the most direct matches at the top.
    """
    if not previous_matches:
        previous_matches = set()

    # Scrub the PII - we don't accept empty values. Rebuild each value list
    # rather than removing items from it while iterating over it.
    for field, values in pii_fields.items():
        pii_fields[field] = [value for value in values
                             if value and unicode(value).strip()]

    #logging.info("FIND USER FROM PII:")
    #logging.info(pii_fields)

    # TODO this should be parallelized, doing it synchronously is dumb.
    # TODO we currently limit PII matches to 100 entities. This seems sane,
    #      and keeps the DB from blowing up on bad queries. But, we might
    #      want something more sophisticated in place at some point.
    direct_matches = {}
    for field, values in pii_fields.items():
        if hasattr(IntakeUser, field) and values:
            iu_query = IntakeUser.query(IntakeUser.api_type == api_type)
            if org_key:
                iu_query = iu_query.filter(IntakeUser.org == org_key)
            iu_query = iu_query.filter(getattr(IntakeUser, field).IN(values))
            matching_users = iu_query.fetch(100)
            for matching_user in matching_users:
                if matching_user.key not in previous_matches:
                    if matching_user.key in direct_matches:
                        direct_matches[matching_user.key].fields.append(field)
                    else:
                        miu = MatchingIntakeUser(matching_user, [field])
                        direct_matches[matching_user.key] = miu

    # Update our ongoing set of matches
    previous_matches.update(direct_matches.keys())

    # We have a set of all the IntakeUsers who matched on the initial data.
    # Next see if our matching IntakeUsers have yielded new PII
    # not present in the original query.
    new_pii = defaultdict(list)
    for matching_user in (v.user for v in direct_matches.values()):
        for field in PII_FIELDS:
            field_value = getattr(matching_user, field)
            if (field_value and
                    field_value not in pii_fields.get(field, [])):
                new_pii[field].append(field_value)

    if new_pii:
        indirect_matches = find_matching_users(
            new_pii, api_type, org_key=org_key,
            previous_matches=previous_matches)
    else:
        indirect_matches = None

    rv = [direct_matches]
    if indirect_matches:
        rv.extend(indirect_matches)
    return rv
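
# Example of how the return value of find_matching_users might be consumed
# (a hypothetical caller, not part of this module): the first dict holds the
# direct matches, and each later dict holds progressively more indirect
# matches discovered through new PII from earlier layers. The "email" field
# used here is only an illustrative PII field name.
#
# match_layers = find_matching_users({"email": ["a@example.com"]}, "api")
# for depth, layer in enumerate(match_layers):
#     for iu_key, miu in layer.items():
#         logging.info("depth=%d user=%s matched on %s" %
#                      (depth, iu_key.urlsafe(), miu.fields))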