def test_hashing():
    """Check cryptography.hash_value against hand-computed HMAC-SHA256 digests.

    Every sample is first reduced to a byte string (unicode is UTF-8 encoded,
    anything else goes through str()).  Two expectations are then verified:

    * pre_hashed=False: the value is HMAC'd with the public salt and the
      resulting hex digest is HMAC'd again with the internal salt.
    * pre_hashed=True: only the internal-salt HMAC is applied.
    """
    samples = (
        "This is a test value to hash",
        u"Now a uni\u0107ode string",
        1231,
        datetime.date.today(),
    )

    def sha256_hmac(key, message):
        # Hex digest of an HMAC-SHA256 over `message` keyed with `key`.
        return hmac.new(key, msg=message, digestmod=hashlib.sha256).hexdigest()

    for sample in samples:
        if isinstance(sample, unicode):
            raw = sample.encode("utf-8")
        else:
            raw = str(sample)

        # Raw input: public hash first, then the internal hash on top of it.
        public_digest = sha256_hmac(cryptography._get_public_salt(), raw)
        expected = sha256_hmac(cryptography._get_salt(), public_digest)
        assert expected == cryptography.hash_value(raw, pre_hashed=False)

        # Pre-hashed input: the public step is skipped.
        expected = sha256_hmac(cryptography._get_salt(), raw)
        assert expected == cryptography.hash_value(raw, pre_hashed=True)
def intake_user(api_type, api_version):
    """Create or update the IntakeUser described by the submitted form.

    The request form is validated, any existing IntakeUser with the same API
    type, hashed user id and consumer org is looked up, and the submitted
    fields are persisted through IntakeUser.create_or_update.

    Args:
        api_type: API type; used for the lookup and handed to
            create_or_update.
        api_version: API version; not used inside this function.

    Returns:
        A JSON response carrying the submitted (un-hashed) ``user_id`` and an
        ``is_new`` flag that is true when no matching IntakeUser existed.

    Aborts with HTTP 400 (listing the per-field errors) when form validation
    fails.
    """
    form = IntakeUserForm(request.form)
    # TODO: Change this to validate_or_400 after writing intake tests
    # to confirm it's proper behavior.
    if not form.validate():
        error_lines = "\n".join(
            "%s - %s" % (field, ",".join(errors))
            for field, errors in form.errors.items())
        abort(400,
              "Request data validation failed with the following errors: \n%s"
              % error_lines)

    user_id = hash_value(form.user_id.data, pre_hashed=form._pre_hashed)
    existing_user = IntakeUser.query(IntakeUser.api_type == api_type,
                                     IntakeUser.user_id == user_id,
                                     IntakeUser.org == g.consumer.org).get()

    # Keep only the fields that are persisted on the IntakeUser itself.
    # A filtered copy is built instead of deleting from the dict while
    # iterating over its keys, which raises RuntimeError on Python 3.
    user_fields = {}
    for field, value in form.data.items():
        if hasattr(IntakeUser, field):
            user_fields[field] = value

    # Called for its side effect (the entity is persisted); the returned
    # model is not needed here.
    IntakeUser.create_or_update(user_fields, g.consumer.key, g.consumer.org,
                                api_type, existing_user=existing_user,
                                pre_hashed=form._pre_hashed)

    # Identity check: `== None` would go through the model's __eq__.
    return jsonify(user_id=form.user_id.data, is_new=(existing_user is None))
def intake_user(api_type, api_version):
    """Create or update the IntakeUser described by the submitted form.

    The request form is validated, any existing IntakeUser with the same API
    type, hashed user id and consumer org is looked up, and the submitted
    fields are persisted through IntakeUser.create_or_update.

    Args:
        api_type: API type; used for the lookup and handed to
            create_or_update.
        api_version: API version; not used inside this function.

    Returns:
        A JSON response carrying the submitted (un-hashed) ``user_id`` and an
        ``is_new`` flag that is true when no matching IntakeUser existed.

    Aborts with HTTP 400 (listing the per-field errors) when form validation
    fails.
    """
    form = IntakeUserForm(request.form)
    # TODO: Change this to validate_or_400 after writing intake tests
    # to confirm it's proper behavior.
    if not form.validate():
        error_lines = "\n".join(
            "%s - %s" % (field, ",".join(errors))
            for field, errors in form.errors.items())
        abort(400,
              "Request data validation failed with the following errors: \n%s"
              % error_lines)

    user_id = hash_value(form.user_id.data, pre_hashed=form._pre_hashed)
    existing_user = IntakeUser.query(IntakeUser.api_type == api_type,
                                     IntakeUser.user_id == user_id,
                                     IntakeUser.org == g.consumer.org).get()

    # Keep only the fields that are persisted on the IntakeUser itself.
    # A filtered copy is built instead of deleting from the dict while
    # iterating over its keys, which raises RuntimeError on Python 3.
    user_fields = {}
    for field, value in form.data.items():
        if hasattr(IntakeUser, field):
            user_fields[field] = value

    # Called for its side effect (the entity is persisted); the returned
    # model is not needed here.
    IntakeUser.create_or_update(user_fields, g.consumer.key, g.consumer.org,
                                api_type, existing_user=existing_user,
                                pre_hashed=form._pre_hashed)

    # Identity check: `== None` would go through the model's __eq__.
    return jsonify(user_id=form.user_id.data, is_new=(existing_user is None))
def _rehash_pii(intake_user_key):
    """Recompute hashed fields from their encrypted "_enc" counterparts.

    For every property of the fetched entity whose name ends in "_enc", the
    property with the same name minus that suffix is overwritten with
    hash_value() of the decrypted (and, for strings, stripped) value.  When
    the encrypted value is empty, or decrypts to something falsy, the hashed
    property is set to None instead.  The entity is saved once at the end.

    Args:
        intake_user_key: key whose get() returns the entity to rehash.
    """
    entity = intake_user_key.get()
    suffix = "_enc"

    for prop_name, ciphertext in entity.to_dict().items():
        if not prop_name.endswith(suffix):
            continue

        rehashed = None
        if ciphertext:
            plaintext = decrypt_value(ciphertext)
            if isinstance(plaintext, (str, unicode)):
                # Whitespace-only values collapse to "" and are stored as None.
                plaintext = plaintext.strip()
            if plaintext:
                rehashed = hash_value(plaintext)

        # Target property is the "_enc" name with the 4-character suffix removed.
        setattr(entity, prop_name[:-4], rehashed)

    entity.put()
def check_intakeuser(intake_user, user_data, developer_key, org_key, pre_hashed=False):
    """Assert that a stored IntakeUser matches the submitted test data.

    Args:
        intake_user: the stored entity under test.
        user_data: dict of field name -> submitted value.  Values of fields
            whose name starts with "date" are parsed as "%Y-%m-%d" dates
            before being compared.
        developer_key: expected value of intake_user.developer.
        org_key: expected value of intake_user.org.
        pre_hashed: whether the data was submitted pre-hashed.  If so the
            "<field>_enc" attributes are expected to be None; otherwise they
            must decrypt back to the submitted value.
    """
    # Fields stored as submitted: no hashing, no "_enc" companion.
    plain_fields = ("date_joined", "date_banned", "reason_banned",
                    "review_count", "transaction_count",
                    "positive_review_percentage")

    eq_(intake_user.developer, developer_key)
    eq_(intake_user.org, org_key)

    for field, expected in user_data.items():
        if field.startswith("date"):
            expected = datetime.datetime.strptime(expected, "%Y-%m-%d").date()

        if field in plain_fields:
            eq_(getattr(intake_user, field), expected)
            continue

        # PII field: check the encrypted copy first, then the stored hash.
        encrypted = getattr(intake_user, field + "_enc")
        if pre_hashed:
            eq_(encrypted, None)
        else:
            eq_(cryptography.decrypt_value(encrypted), expected)

        expected_hash = cryptography.hash_value(expected, pre_hashed=pre_hashed)
        eq_(getattr(intake_user, field), expected_hash)
def find_users(pii_fields, api_type, org_key=None, pre_hashed=False):
    """
    Find users matching the given PII, grouped into "user sets".

    A user set is the result of one find_matching_users() search on a single
    PII field/value.  Such sets can be non-intersecting groups of users tied
    to different fields of the given PII: the innocent result of sparse
    information, an indicator of fraud, or a sign that the PII belongs to
    multiple individuals.  Users inside one set may also carry conflicting
    PII values, e.g. because a user gave different contact information to
    different organizations, or possibly as a sign of fraud.

    A field/value pair that already appears in the aggregated PII of an
    earlier user set is not searched again.

    Args:
        pii_fields: A dictionary of pii_field -> value.  Falsy values are
            ignored.
        api_type: passed through to find_matching_users.
        org_key: passed through to find_matching_users as org_key.
        pre_hashed: when true the values are used as given; otherwise each
            value is run through hash_value() before searching.
            NOTE(review): unlike create_or_update, no hash_value(...,
            pre_hashed=True) is applied here - confirm callers pass values
            in the form that is actually stored.

    Returns:
        A list of user sets.  Each user set is a list of lists of
        MatchingIntakeUsers, each sub-list representing a level of matching
        directness.
    """
    search_terms = {
        name: (raw if pre_hashed else hash_value(raw))
        for name, raw in pii_fields.items()
        if raw
    }

    seen_pii = defaultdict(set)
    user_sets = []

    for field, value in search_terms.items():
        # Membership is tested on the dict first so that the defaultdict
        # does not grow an empty entry just because we looked.
        already_covered = field in seen_pii and value in seen_pii[field]
        if already_covered:
            continue

        match_levels = find_matching_users({field: [value]}, api_type,
                                           org_key=org_key)
        # Only the matches themselves are needed, not the keys they were
        # stored under.
        user_set = [level.values() for level in match_levels]
        user_sets.append(user_set)

        # Remember every PII value carried by the users just found.
        found_users = [match.user for level in user_set for match in level]
        for pii_name, pii_values in aggregate_pii(found_users).items():
            seen_pii[pii_name].update(pii_values)

    return user_sets
def find_users(pii_fields, api_type, org_key=None, pre_hashed=False):
    """
    Search for users by PII, one field at a time, and return the "user sets".

    Each searched field/value yields one user set via find_matching_users().
    Separate sets are groups of users tied to different fields of the given
    PII; that can stem from sparse information, from the PII belonging to
    several individuals, or from fraud.  Within a set, users may disagree on
    some PII values (different contact details given to different
    organizations, or possibly fraud).

    Values already present in the aggregated PII of a previously found user
    set are skipped rather than searched a second time.

    Args:
        pii_fields: A dictionary of pii_field -> value; entries with a falsy
            value are dropped.
        api_type: forwarded to find_matching_users.
        org_key: forwarded to find_matching_users as org_key.
        pre_hashed: if false, every value is hashed with hash_value() before
            the search; if true, values are searched exactly as supplied.
            NOTE(review): create_or_update still calls hash_value(...,
            pre_hashed=True) on pre-hashed input, this function does not -
            verify against callers that this is intended.

    Returns:
        A list of user sets.  Each user set is a list of lists of
        MatchingIntakeUsers, each sub-list representing a level of matching
        directness.
    """
    hashed_pii = {}
    for pii_name, pii_value in pii_fields.items():
        if not pii_value:
            continue
        if pre_hashed:
            hashed_pii[pii_name] = pii_value
        else:
            hashed_pii[pii_name] = hash_value(pii_value)

    known_pii = defaultdict(set)
    results = []

    for pii_name, pii_value in hashed_pii.items():
        if pii_name in known_pii and pii_value in known_pii[pii_name]:
            # An earlier user set already contains this value, so a new
            # search would not add anything.
            continue

        query = {pii_name: [pii_value]}
        matches = find_matching_users(query, api_type, org_key=org_key)

        # Drop the dictionary keys; only the matching users are kept.
        user_set = [match_dict.values() for match_dict in matches]
        results.append(user_set)

        users = [miu.user for match_level in user_set for miu in match_level]
        aggregated = aggregate_pii(users)
        for agg_name, agg_values in aggregated.items():
            known_pii[agg_name].update(agg_values)

    return results
def create_or_update(cls, fields, dev_key, org_key, api_type, existing_user=None, pre_hashed=False):
    """
    Build a new IntakeUser from `fields`, or update `existing_user`, and
    save it.  The resulting model IS persisted to the database.

    'user_id' and every name in PII_FIELDS are stored hashed.  When the
    data is raw (not pre-hashed) an encrypted copy is stored as well, under
    "<field>_enc".  String PII made up only of whitespace is skipped.  All
    other fields are stored as given.

    Args:
        cls: unused; a new model is always created as IntakeUser().
            (presumably this is a classmethod - the decorator is outside
            the visible source)
        fields: dictionary of field -> value
        dev_key: entity key for the dev who uploaded this user; only set
            on newly created users
        org_key: entity key for the org this user belongs to; only set on
            newly created users
        api_type: API type; only set on newly created users
        existing_user: an existing intake user model to be updated, or a
            falsy value to create a new one
        pre_hashed: bool indicating whether the PII has already been hashed.

    Returns:
        An intake user model that has been persisted to the DB.
    """
    intake_user = existing_user
    if not intake_user:
        intake_user = IntakeUser()
        intake_user.org = org_key
        intake_user.developer = dev_key
        intake_user.api_type = api_type

    # Collect everything first so the model is only touched after all
    # hashing/encryption has been done.
    pending = {}
    for name, value in fields.items():
        if name in (('user_id',) + PII_FIELDS):
            is_text = isinstance(value, (str, unicode))
            if is_text and not value.strip():
                # don't even save pure whitespace PII
                continue
            if not pre_hashed:
                # Only raw data gets an encrypted copy; keeping a copy of
                # an already hashed value would be pointless.
                pending[name + "_enc"] = encrypt_value(value)
            value = hash_value(value, pre_hashed=pre_hashed)
        pending[name] = value

    for attr_name, attr_value in pending.items():
        setattr(intake_user, attr_name, attr_value)

    intake_user.put()
    return intake_user
def test_submit():
    """Submit users through two developers and verify what gets stored.

    Steps: a first submit through developer 1, the same data through
    developer 2 (stored as a second IntakeUser), an update through
    developer 1 (not reported as new), and a second user id through
    developer 2.
    """
    signed_url = "http://localhost/api/v1/submit/user"
    post_path = "/api/v1/submit/user"

    def submit(consumer, payload):
        # Build the request for `payload` with `consumer` and post it.
        req = create_request(consumer, signed_url, "POST", urlencode(payload))
        return testapp.post(post_path, req.to_postdata())

    grant_submit()

    # Submit through developer 1
    dev1 = models.Developer.query(
        models.Developer.consumer_key == "valid_key1").get()
    consumer1 = oauth.Consumer(key=dev1.consumer_key,
                               secret=dev1.consumer_secret)
    response = submit(consumer1, test_data)
    assert response.status_int == 200
    assert response.json["user_id"] == u"1"
    assert response.json["is_new"]
    assert models.IntakeUser.query().count() == 1
    stored = models.IntakeUser.query().get()
    check_intakeuser(stored, test_data, dev1.key, dev1.org)

    # Submit through developer 2
    dev2 = models.Developer.query(
        models.Developer.consumer_key == "valid_key2").get()
    consumer2 = oauth.Consumer(key=dev2.consumer_key,
                               secret=dev2.consumer_secret)
    response = submit(consumer2, test_data)
    assert response.status_int == 200
    assert response.json["user_id"] == u"1"
    assert response.json["is_new"]
    assert models.IntakeUser.query().count() == 2
    stored = models.IntakeUser.query(
        models.IntakeUser.org == dev2.org).get()
    check_intakeuser(stored, test_data, dev2.key, dev2.org)

    # Submit an update through developer 1
    test_data2 = dict(test_data)
    test_data2["transaction_count"] = 1056
    test_data2["twitter_id"] = "1234567"
    response = submit(consumer1, test_data2)
    assert response.status_int == 200
    assert response.json["user_id"] == u"1"
    assert not response.json["is_new"]
    stored = models.IntakeUser.query(
        models.IntakeUser.org == dev1.org).get()
    check_intakeuser(stored, test_data2, dev1.key, dev1.org)

    # Submit a second through developer 2
    test_data3 = dict(test_data)
    test_data3["user_id"] = u"2"
    response = submit(consumer2, test_data3)
    assert response.status_int == 200
    assert response.json["user_id"] == u"2"
    assert response.json["is_new"]
    stored = models.IntakeUser.query(
        models.IntakeUser.user_id == cryptography.hash_value(test_data3["user_id"]),
        models.IntakeUser.developer == dev2.key).get()
    check_intakeuser(stored, test_data3, dev2.key, dev2.org)