Esempio n. 1
0
def intake_user(api_type, api_version):
    """Validate the posted intake form and create/update an IntakeUser.

    Args:
        api_type: Which API the request targets (e.g. "api" or "sandbox").
        api_version: API version string (not used in this handler).

    Returns:
        JSON with the submitted user_id and is_new (True when no existing
        user matched).

    Aborts with 400 when form validation fails.
    """
    form = IntakeUserForm(request.form)
    # TODO: Change this to validate_or_400 after writing intake tests
    #       to confirm it's proper behavior.
    if not form.validate():
        abort(400, "Request data validation failed with the following errors: \n%s" %
                        "\n".join("%s - %s" % (field, ",".join(errors)) for field, errors in form.errors.items()))

    user_id = hash_value(form.user_id.data, pre_hashed=form._pre_hashed)
    existing_user = IntakeUser.query(IntakeUser.api_type == api_type,
                                     IntakeUser.user_id == user_id,
                                     IntakeUser.org == g.consumer.org).get()

    user_fields = {}
    user_fields.update(form.data)

    # Remove fields that are not to be persisted to the IntakeUser itself.
    # Iterate over a snapshot of the keys: deleting entries while iterating
    # the live key view raises RuntimeError on Python 3 and is fragile even
    # where keys() returns a list.
    for field in list(user_fields.keys()):
        if not hasattr(IntakeUser, field):
            del user_fields[field]

    updated_user = IntakeUser.create_or_update(user_fields,
                                               g.consumer.key,
                                               g.consumer.org,
                                               api_type,
                                               existing_user=existing_user,
                                               pre_hashed=form._pre_hashed)

    return jsonify(user_id=form.user_id.data, is_new=(existing_user is None))
Esempio n. 2
0
def intake_user(api_type, api_version):
    """Validate the posted intake form and create/update an IntakeUser.

    Args:
        api_type: Which API the request targets (e.g. "api" or "sandbox").
        api_version: API version string (not used in this handler).

    Returns:
        JSON with the submitted user_id and is_new (True when no existing
        user matched).

    Aborts with 400 when form validation fails.
    """
    form = IntakeUserForm(request.form)
    # TODO: Change this to validate_or_400 after writing intake tests
    #       to confirm it's proper behavior.
    if not form.validate():
        abort(
            400,
            "Request data validation failed with the following errors: \n%s" %
            "\n".join("%s - %s" % (field, ",".join(errors))
                      for field, errors in form.errors.items()))

    user_id = hash_value(form.user_id.data, pre_hashed=form._pre_hashed)
    existing_user = IntakeUser.query(IntakeUser.api_type == api_type,
                                     IntakeUser.user_id == user_id,
                                     IntakeUser.org == g.consumer.org).get()

    user_fields = {}
    user_fields.update(form.data)

    # Remove fields that are not to be persisted to the IntakeUser itself.
    # Iterate over a snapshot of the keys: deleting entries while iterating
    # the live key view raises RuntimeError on Python 3 and is fragile even
    # where keys() returns a list.
    for field in list(user_fields.keys()):
        if not hasattr(IntakeUser, field):
            del user_fields[field]

    updated_user = IntakeUser.create_or_update(user_fields,
                                               g.consumer.key,
                                               g.consumer.org,
                                               api_type,
                                               existing_user=existing_user,
                                               pre_hashed=form._pre_hashed)

    return jsonify(user_id=form.user_id.data, is_new=(existing_user is None))
Esempio n. 3
0
def organization_management(organization_key):
    """Admin view for one organization: stats plus a credit/type edit form."""
    org = ndb.Key(urlsafe=organization_key).get()
    intake_user_count = IntakeUser.query(IntakeUser.org == org.key).count()

    # NOTE(review): this query has no org filter, so it reports the most
    # recently updated IntakeUser across ALL orgs — confirm that is intended.
    latest = IntakeUser.query().order(-IntakeUser.updated).get()
    most_recent_update = (latest.updated.strftime("%a %b %d %H:%M:%S %Y")
                          if latest else latest)

    pii_stats = org_pii_stats(org.key)
    ordered_transaction_stats = org_transaction_stats(org.key)

    # Present PII stats by descending count, with the overall total first.
    ordered_pii_stats = OrderedDict()
    ordered_pii_stats["total"] = intake_user_count
    by_count_desc = sorted(pii_stats.items(),
                           key=lambda item: item[1],
                           reverse=True)
    ordered_pii_stats.update(by_count_desc)

    if request.method == "POST":
        form = OrganizationForm(request.form)
        if form.validate():
            org.credit = form.credit.data
            org.org_type = form.org_type.data
            org.put()
            flash("Organization %s successfully updated." % org.full_name)
    else:
        # GET: pre-populate the form with the org's current values.
        form = OrganizationForm(credit=org.credit,
                                org_type=org.org_type)

    return render_template("admin/organization.html", org=org, form=form,
                           intake_user_count=intake_user_count,
                           most_recent_update=most_recent_update,
                           pii_stats=ordered_pii_stats,
                           transaction_stats=ordered_transaction_stats)
Esempio n. 4
0
def check_on_data():
    """Render the admin data-check page with IntakeUser counts by API type."""
    counts = {
        "api_user_count": IntakeUser.query(IntakeUser.api_type == "api").count(),
        "sandbox_user_count": IntakeUser.query(IntakeUser.api_type == "sandbox").count(),
        "total_user_count": IntakeUser.query().count(),
    }
    return render_template("admin/check_on_data.html", **counts)
Esempio n. 5
0
def check_on_data():
    """Render the admin data-check page with IntakeUser counts by API type."""
    def count_users(api_type=None):
        # None means "no filter": count every IntakeUser.
        if api_type is None:
            query = IntakeUser.query()
        else:
            query = IntakeUser.query(IntakeUser.api_type == api_type)
        return query.count()

    return render_template("admin/check_on_data.html",
                           api_user_count=count_users("api"),
                           sandbox_user_count=count_users("sandbox"),
                           total_user_count=count_users())
Esempio n. 6
0
def org_pii_stats(organization_key):
    """Count, per PII field, how many of the org's IntakeUsers have it set."""
    org = organization_key.get()

    def count_with_field(field_name):
        # "!= None" is NDB query syntax for "property is set"; it must not
        # be rewritten as "is not None".
        prop = getattr(IntakeUser, field_name)
        return IntakeUser.query(IntakeUser.org == org.key).filter(
            prop != None).count()

    # Fields missing from the IntakeUser model are reported as 0.
    stats = {}
    for pii_field in PII_FIELDS:
        if hasattr(IntakeUser, pii_field):
            stats[pii_field] = count_with_field(pii_field)
        else:
            stats[pii_field] = 0

    return stats
Esempio n. 7
0
def org_pii_stats(organization_key):
    """Return {pii_field: count} of the org's IntakeUsers with that field set.

    Fields listed in PII_FIELDS that do not exist on the IntakeUser model
    are reported with a count of 0.
    """
    org = organization_key.get()
    pii_stats = {}
    for pii_field in PII_FIELDS:
        if hasattr(IntakeUser, pii_field):
            # "!= None" is deliberate NDB query syntax ("property is set"),
            # not a comparison to rewrite as "is not None".
            count = IntakeUser.query(IntakeUser.org == org.key).filter(
                                getattr(IntakeUser, pii_field) != None).count()
        else:
            count = 0

        pii_stats[pii_field] = count

    return pii_stats
Esempio n. 8
0
def compute_org_value(organization_key):
    """
    Task that computes the current value of our data for the given organization.
    This task can take a long time and should be run on a backend.

    Args:
        organization_key: urlsafe ndb key string of the Organization.

    Returns:
        JSON with a success flag and the number of users processed.
    """
    org = ndb.Key(urlsafe=organization_key).get()
    # Reuse the org's existing ValueSummary, or start a fresh one.
    vs = ValueSummary.query(ValueSummary.organization == org.key).get()
    if not vs:
        vs = ValueSummary(organization=org.key)

    # Reset the summary and persist it up-front so observers can see the
    # run is in progress before the (potentially long) scan below.
    vs.processed_users = 0
    vs.total_users = IntakeUser.query(IntakeUser.org == org.key).count()
    vs.is_running = True
    vs.overlap_counts = {}
    vs.org_name = org.full_name

    vs.put()

    overlap_counts = defaultdict(int)
    for intake_user in IntakeUser.query(IntakeUser.org == org.key):
        #logging.info(intake_user.pii_dict())
        user_sets = find_users(intake_user.pii_dict(), pre_hashed=True)
        #logging.info("USER SETS:")
        #logging.info(user_sets)
        lrg_summary = generate_lrg_summary(user_sets, [org.key])
        #logging.info("LRG SUMMARY:")
        #logging.info(lrg_summary)
        if lrg_summary:
            overlap_counts[lrg_summary["marketplace_memberships"]] += 1
        # NOTE(review): one datastore write per user keeps progress visible
        # to pollers, but is write-heavy for large orgs.
        vs.processed_users += 1
        vs.put()

    vs.is_running = False
    vs.overlap_counts = overlap_counts
    vs.put()

    return jsonify(success=True, processed_users=vs.processed_users)
Esempio n. 9
0
def compute_org_value(organization_key):
    """
    Task that computes the current value of our data for the given organization.
    This task can take a long time and should be run on a backend.
    """
    org = ndb.Key(urlsafe=organization_key).get()

    # Reuse the org's existing ValueSummary if one exists.
    summary = ValueSummary.query(ValueSummary.organization == org.key).get()
    if not summary:
        summary = ValueSummary(organization=org.key)

    # Reset the summary and mark it in-progress before the long scan.
    summary.processed_users = 0
    summary.total_users = IntakeUser.query(IntakeUser.org == org.key).count()
    summary.is_running = True
    summary.overlap_counts = {}
    summary.org_name = org.full_name
    summary.put()

    overlap_counts = defaultdict(int)
    for intake_user in IntakeUser.query(IntakeUser.org == org.key):
        user_sets = find_users(intake_user.pii_dict(), pre_hashed=True)
        lrg_summary = generate_lrg_summary(user_sets, [org.key])
        if lrg_summary:
            overlap_counts[lrg_summary["marketplace_memberships"]] += 1
        # Persist progress after every user so pollers can watch the run.
        summary.processed_users += 1
        summary.put()

    summary.is_running = False
    summary.overlap_counts = overlap_counts
    summary.put()

    return jsonify(success=True, processed_users=summary.processed_users)
Esempio n. 10
0
def organization_management(organization_key):
    """Admin view for one organization: stats display plus a form that
    updates the org's credit and org_type on POST.

    Args:
        organization_key: urlsafe ndb key string of the Organization.
    """
    org = ndb.Key(urlsafe=organization_key).get()
    intake_user_count = IntakeUser.query(IntakeUser.org == org.key).count()
    # NOTE(review): this query has no org filter, so it reports the most
    # recently updated IntakeUser across ALL orgs — confirm that is intended.
    most_recent_update = IntakeUser.query().order(-IntakeUser.updated).get()

    if most_recent_update:
        most_recent_update = most_recent_update.updated.strftime(
            "%a %b %d %H:%M:%S %Y")

    pii_stats = org_pii_stats(org.key)
    ordered_transaction_stats = org_transaction_stats(org.key)

    # Present PII stats ordered by descending count, with the total first.
    ordered_pii_stats = OrderedDict()
    ordered_pii_stats["total"] = intake_user_count
    for field, value in sorted(pii_stats.items(),
                               key=lambda i: i[1],
                               reverse=True):
        ordered_pii_stats[field] = value

    if request.method == "POST":
        form = OrganizationForm(request.form)
        if form.validate():
            org.credit = form.credit.data
            org.org_type = form.org_type.data
            org.put()
            flash("Organization %s successfully updated." % org.full_name)
    else:
        # GET: pre-populate the form with the org's current values.
        form = OrganizationForm(credit=org.credit, org_type=org.org_type)

    return render_template("admin/organization.html",
                           org=org,
                           form=form,
                           intake_user_count=intake_user_count,
                           most_recent_update=most_recent_update,
                           pii_stats=ordered_pii_stats,
                           transaction_stats=ordered_transaction_stats)
Esempio n. 11
0
def data_quality():
    """Admin page; a POST enqueues rehashing tasks over all IntakeUsers."""
    if request.method == "POST":
        batch_limit = 200
        pending = []
        # keys_only avoids loading full entities just to read their keys.
        for user_key in IntakeUser.query().iter(keys_only=True):
            pending.append(user_key.urlsafe())
            if len(pending) == batch_limit:
                _enqueue_rehashing(pending)
                pending = []

        # Flush the final, possibly partial, batch.
        if pending:
            _enqueue_rehashing(pending)

        # Post/Redirect/Get back to this page.
        return redirect(url_for("data_quality"))

    return render_template("admin/data_quality.html")
Esempio n. 12
0
def data_quality():
    """Admin page; a POST enqueues rehashing tasks for every IntakeUser in
    batches of 200, then redirects back here (Post/Redirect/Get)."""
    if request.method == "POST":
        BATCH_SIZE = 200
        key_batch = []
        # keys_only avoids loading full entities just to read their keys.
        for intake_user_key in IntakeUser.query().iter(keys_only=True):
            key_batch.append(intake_user_key.urlsafe())
            if len(key_batch) == BATCH_SIZE:
                _enqueue_rehashing(key_batch)
                key_batch = []

        # Flush the final, possibly partial, batch.
        if key_batch:
            _enqueue_rehashing(key_batch)

        return redirect(url_for("data_quality"))

    return render_template("admin/data_quality.html")
Esempio n. 13
0
def org_transaction_stats(organization_key):
    """Histogram of IntakeUser.transaction_count for one organization.

    Buckets "0".."9" hold exact counts; the final "10+" bucket aggregates
    everything at or above UPPER_BOUND.
    """
    org = organization_key.get()
    stats = OrderedDict()
    UPPER_BOUND = 10
    for bucket in range(UPPER_BOUND + 1):
        query = IntakeUser.query(IntakeUser.org == org.key)
        is_overflow = bucket >= UPPER_BOUND
        if is_overflow:
            # Last bucket is open-ended.
            query = query.filter(IntakeUser.transaction_count >= bucket)
        else:
            query = query.filter(IntakeUser.transaction_count == bucket)

        label = unicode(bucket) + ("+" if is_overflow else "")
        stats[label] = query.count()

    return stats
Esempio n. 14
0
def org_transaction_stats(organization_key):
    """Histogram of IntakeUser.transaction_count for one organization.

    Buckets "0".."9" hold exact counts; the final "10+" bucket aggregates
    all users at or above UPPER_BOUND transactions.
    """
    org = organization_key.get()
    transaction_stats = OrderedDict()
    UPPER_BOUND = 10
    for i in range(UPPER_BOUND + 1):
        count_q = IntakeUser.query(IntakeUser.org == org.key)
        if i < UPPER_BOUND:
            count_q = count_q.filter(IntakeUser.transaction_count == i)
        else:
            # Last bucket is open-ended: everything >= UPPER_BOUND.
            count_q = count_q.filter(IntakeUser.transaction_count >= i)
        count = count_q.count()

        key_name = unicode(i)
        if i >= UPPER_BOUND:
            key_name += "+"

        transaction_stats[key_name] = count

    return transaction_stats
Esempio n. 15
0
def run_analysis(org_key=None):
    """
    Task that deletes all existing CombinedUsers and regenerates them from the
    underlying IntakeUsers. This task can take a long time to run, so it should
    be run on a dedicated instance.

    Args:
        org_key: Optional urlsafe Organization key; when given, processing is
                 limited to that organization.
    """
    DELETE_BATCH = 500

    # Clear out the existing combined users.
    cu_query = CombinedUser.query()
    if org_key:
        org_key = ndb.Key(urlsafe=org_key)
        cu_query = cu_query.filter(CombinedUser.orgs == org_key)

    # No cursor is needed here: each pass deletes the page it just fetched,
    # so the next fetch_page() starts from the new front of the result set.
    while True:
        keys, _, more = cu_query.fetch_page(DELETE_BATCH, keys_only=True)
        ndb.delete_multi(keys)
        if not more:
            break

    # Analyze all the intake users, on a per-organization basis.
    if org_key:
        org_keys = [org_key]
    else:
        org_keys = Organization.query().iter(keys_only=True)

    for org_key in org_keys:
        # Per-org progress counter polled by the admin "analysis" page.
        counter_key = "analysis::run_count::%s" % org_key.urlsafe()
        memcache.set(key=counter_key, value=0)

        iu_query = IntakeUser.query(IntakeUser.org == org_key)
        for iu_key in iu_query.iter(keys_only=True):
            memcache.incr(counter_key)
            analyze_user(iu_key)

        generate_csv(org_key.get().name)

    return "Great Success"
Esempio n. 16
0
def enqueue_task(task_func):
    """Enqueue every IntakeUser key, in batches, to the given task endpoint.

    Args:
        task_func: Endpoint name resolved through url_for() for the task URL.

    Returns:
        JSON with total_size, the number of user keys enqueued.
    """
    def _flush(batch):
        # One taskqueue task per batch; the payload is a JSON list of
        # urlsafe IntakeUser keys.
        taskqueue.add(url=url_for(task_func),
                      payload=json.dumps(batch))
        logging.info("Successfully enqueued %d users for task %s." %
                     (len(batch), task_func))

    # Small batches in development keep local payloads manageable.
    BATCH_SIZE = 5 if DEVELOPMENT else 200
    key_batch = []
    total_size = 0
    for intake_user_key in IntakeUser.query().iter(keys_only=True):
        key_batch.append(intake_user_key.urlsafe())
        total_size += 1
        if len(key_batch) == BATCH_SIZE:
            _flush(key_batch)
            key_batch = []

    # Flush the final, possibly partial, batch.
    if key_batch:
        _flush(key_batch)

    return jsonify(total_size=total_size)
Esempio n. 17
0
def analysis():
    """Admin page listing per-org analysis progress; POST starts a run."""
    orgs_info = {}
    for org in Organization.query().fetch(50):
        total_users = IntakeUser.query(IntakeUser.org == org.key).count()
        if total_users == 0:
            # Orgs with no intake data are omitted from the page.
            continue
        # Progress counter maintained in memcache by run_analysis().
        counter_key = "analysis::run_count::%s" % org.key.urlsafe()
        orgs_info[org.name] = {
            "analyzed_users": memcache.get(counter_key),
            "total_users": total_users,
            "org_key": org.key.urlsafe(),
        }

    if request.method == "POST":
        org_key = request.form.get("org_key", None)
        taskqueue.add(url=url_for("run_analysis", org_key=org_key),
                      target="analyzer")
        return redirect(url_for("analysis"))

    return render_template("admin/analysis.html", orgs_info=orgs_info)
Esempio n. 18
0
def run_analysis(org_key=None):
    """
    Task that deletes all existing CombinedUsers and regenerates them from the
    underlying IntakeUsers. This task can take a long time to run, so it should
    be run on a dedicated instance.

    Args:
        org_key: Optional urlsafe Organization key; when given, processing is
                 limited to that organization.
    """
    DELETE_BATCH = 500
    ANALYZE_BATCH = 50  # NOTE(review): unused in this function.

    # Clear out the existing combined users
    cu_query = CombinedUser.query()
    if org_key:
        org_key = ndb.Key(urlsafe=org_key)
        cu_query = cu_query.filter(CombinedUser.orgs == org_key)

    # The cursor is unused: each iteration deletes the page it just fetched,
    # so the next fetch_page() starts at the new front of the result set.
    while True:
        results, cursor, more = cu_query.fetch_page(DELETE_BATCH, keys_only=True)
        ndb.delete_multi(results)
        if not more:
            break

    # Analyze all the intake users, on a per-organization basis
    if org_key:
        org_keys = [org_key]
    else:
        org_keys = Organization.query().iter(keys_only=True)

    for org_key in org_keys:
        # Per-org progress counter polled by the admin "analysis" page.
        counter_key = "analysis::run_count::%s" % org_key.urlsafe()
        memcache.set(key=counter_key, value=0)

        iu_query = IntakeUser.query(IntakeUser.org == org_key)
        for iu_key in iu_query.iter(keys_only=True):
            memcache.incr(counter_key)
            #deferred.defer(analyze_user, intake_user_key=iu_key)
            analyze_user(iu_key)

        generate_csv(org_key.get().name)

    return "Great Success"
Esempio n. 19
0
def analysis():
    """Admin page listing per-organization analysis progress.

    GET renders analyzed vs. total user counts per org; POST enqueues a
    run_analysis task on the "analyzer" backend and redirects back here.
    """
    orgs = Organization.query().fetch(50)

    orgs_info = {}
    for org in orgs:
        total_users = IntakeUser.query(IntakeUser.org == org.key).count()
        if total_users == 0:
            # Orgs with no intake data are omitted from the page.
            continue
        orgs_info[org.name] = {}
        # Progress counter maintained in memcache by run_analysis().
        counter_key = "analysis::run_count::%s" % org.key.urlsafe()
        orgs_info[org.name]["analyzed_users"] = memcache.get(counter_key)
        orgs_info[org.name]["total_users"] = total_users
        orgs_info[org.name]["org_key"] = org.key.urlsafe()

    if request.method == "POST":
        org_key = request.form.get("org_key", None)
        taskqueue.add(url=url_for("run_analysis", org_key=org_key),
                      target="analyzer")
        return redirect(url_for("analysis"))

    return render_template("admin/analysis.html",
                            orgs_info=orgs_info)
Esempio n. 20
0
def enqueue_task(task_func):
    """Enqueue every IntakeUser key, in batches, to the given task endpoint.

    Args:
        task_func: Endpoint name resolved through url_for() for the task URL.

    Returns:
        JSON with total_size, the number of user keys enqueued.
    """
    # Small batches in development keep local task payloads manageable.
    BATCH_SIZE = 5 if DEVELOPMENT else 200
    key_batch = []
    total_size = 0
    for intake_user_key in IntakeUser.query().iter(keys_only=True):
        key_batch.append(intake_user_key.urlsafe())
        total_size += 1
        if len(key_batch) == BATCH_SIZE:
            # Payload is a JSON list of urlsafe IntakeUser keys.
            taskqueue.add(url=url_for(task_func),
                          payload=json.dumps(key_batch))

            logging.info("Successfully enqueued %d users for task %s." %
                         (len(key_batch), task_func))
            key_batch = []

    # Flush the final, possibly partial, batch.
    if key_batch:
        taskqueue.add(url=url_for(task_func), payload=json.dumps(key_batch))

        logging.info("Successfully enqueued %d users for task %s." %
                     (len(key_batch), task_func))

    return jsonify(total_size=total_size)
Esempio n. 21
0
def find_matching_users(pii_fields,
                        api_type,
                        org_key=None,
                        previous_matches=None):
    """
    Takes a dict of pii fields and finds all matching IntakeUsers.
    Each match comes with a list of which PII fields were hits.

    Args:
        pii_fields: A dictionary of pii field -> [set or list of values].
        api_type: Which api to search, such as "api" or "sandbox"
        org_key: Organization to which the query should be limited
        previous_matches: A set of keys of all IntakeUser objects
                          matched up to this point.
    Returns:
        A list of dictionaries of IntakeUser keys to
        MatchingIntakeUsers with the dict of the most direct matches at the top.
    """
    if not previous_matches:
        previous_matches = set()

    # Scrub the PII - we don't accept empty values.
    # Rebuild each value list rather than calling remove() while iterating:
    # mutating a collection during iteration skips elements, so consecutive
    # empty values could survive the scrub (and it breaks on sets).
    for field, values in pii_fields.items():
        pii_fields[field] = [value for value in values
                             if value and unicode(value).strip()]

    # TODO this should be parallelized, doing it synchronously is dumb.
    # TODO we currently limit PII matches to 100 entities. This seems sane,
    #      and keeps the DB from blowing up on bad queries. But, we might
    #      want something more sophisticated in place at some point.
    direct_matches = {}
    for field, values in pii_fields.items():
        if hasattr(IntakeUser, field) and values:
            iu_query = IntakeUser.query(IntakeUser.api_type == api_type)
            if org_key:
                iu_query = iu_query.filter(IntakeUser.org == org_key)
            iu_query = iu_query.filter(getattr(IntakeUser, field).IN(values))
            matching_users = iu_query.fetch(100)

            for matching_user in matching_users:
                if matching_user.key not in previous_matches:
                    if matching_user.key in direct_matches:
                        # Already matched on another field; record this one too.
                        direct_matches[matching_user.key].fields.append(field)
                    else:
                        miu = MatchingIntakeUser(matching_user, [field])
                        direct_matches[matching_user.key] = miu

    # Update our ongoing set of matches so recursion doesn't re-match them.
    previous_matches.update(direct_matches.keys())

    # We have a set of all the IntakeUsers who matched on the initial data.
    # Next see if our matching IntakeUsers have yielded new PII
    # not present in the original query.
    new_pii = defaultdict(list)
    for matching_user in (v.user for v in direct_matches.values()):
        for field in PII_FIELDS:
            field_value = getattr(matching_user, field)
            if (field_value and field_value not in pii_fields.get(field, [])):
                new_pii[field].append(field_value)

    if new_pii:
        # Recurse on the newly discovered PII to find indirect matches.
        indirect_matches = find_matching_users(
            new_pii,
            api_type,
            org_key=org_key,
            previous_matches=previous_matches)
    else:
        indirect_matches = None

    rv = [direct_matches]
    if indirect_matches:
        rv.extend(indirect_matches)

    return rv
Esempio n. 22
0
def find_matching_users(pii_fields, api_type, org_key=None, previous_matches=None):
    """
    Takes a dict of pii fields and finds all matching IntakeUsers.
    Each match comes with a list of which PII fields were hits.

    Args:
        pii_fields: A dictionary of pii field -> [set or list of values].
        api_type: Which api to search, such as "api" or "sandbox"
        org_key: Organization to which the query should be limited
        previous_matches: A set of keys of all IntakeUser objects
                          matched up to this point.
    Returns:
        A list of dictionaries of IntakeUser keys to
        MatchingIntakeUsers with the dict of the most direct matches at the top.
    """
    if not previous_matches:
        previous_matches = set()

    # Scrub the PII - we don't accept empty values.
    # Rebuild each value list rather than calling remove() while iterating:
    # mutating a collection during iteration skips elements, so consecutive
    # empty values could survive the scrub (and it breaks on sets).
    for field, values in pii_fields.items():
        pii_fields[field] = [value for value in values
                             if value and unicode(value).strip()]

    # TODO this should be parallelized, doing it synchronously is dumb.
    # TODO we currently limit PII matches to 100 entities. This seems sane,
    #      and keeps the DB from blowing up on bad queries. But, we might
    #      want something more sophisticated in place at some point.
    direct_matches = {}
    for field, values in pii_fields.items():
        if hasattr(IntakeUser, field) and values:
            iu_query = IntakeUser.query(IntakeUser.api_type == api_type)
            if org_key:
                iu_query = iu_query.filter(IntakeUser.org == org_key)
            iu_query = iu_query.filter(getattr(IntakeUser, field).IN(values))
            matching_users = iu_query.fetch(100)

            for matching_user in matching_users:
                if matching_user.key not in previous_matches:
                    if matching_user.key in direct_matches:
                        # Already matched on another field; record this one too.
                        direct_matches[matching_user.key].fields.append(field)
                    else:
                        miu = MatchingIntakeUser(matching_user, [field])
                        direct_matches[matching_user.key] = miu

    # Update our ongoing set of matches so recursion doesn't re-match them.
    previous_matches.update(direct_matches.keys())

    # We have a set of all the IntakeUsers who matched on the initial data.
    # Next see if our matching IntakeUsers have yielded new PII
    # not present in the original query.
    new_pii = defaultdict(list)
    for matching_user in (v.user for v in direct_matches.values()):
        for field in PII_FIELDS:
            field_value = getattr(matching_user, field)
            if (field_value and field_value not in pii_fields.get(field, [])):
                new_pii[field].append(field_value)

    if new_pii:
        # Recurse on the newly discovered PII to find indirect matches.
        indirect_matches = find_matching_users(new_pii,
                                               api_type,
                                               org_key=org_key,
                                               previous_matches=previous_matches)
    else:
        indirect_matches = None

    rv = [direct_matches]
    if indirect_matches:
        rv.extend(indirect_matches)

    return rv