Esempio n. 1
0
    def testHashLongStrings(self):
        # Very long string values: both are produced by
        # get_long_string(<fragments>, 5000) from fragments that mix escaped
        # double quotes, single quotes, braces and whitespace runs.
        fragments_a = [
            ' \\"hello\\" ',
            '\n\n \t\t \n\t \t\n',
            'email: \\"[email protected]\\"',
            '\'\'\'\'',
            "'\\\"'",
            "{inside: \\\"of a string\\\"}",
        ]
        fragments_b = [
            'This is a very string.  \n\n \t',
            '\n\t',
            '{\\"name\\"}',
            ' \\ END',
            '\\"string\\"',
            "'string'",
            "\\\"'quotes'\\\"",
            "open quotes: \\\" '",
        ]
        value_a = get_long_string(fragments_a, 5000)
        value_b = get_long_string(fragments_b, 5000)

        # Input document: both values embedded as quoted JSON strings.
        document = '{"key1" : "%s", "key2" : "%s"}' % (value_a, value_b)

        # Expected output: every quoted value is replaced by its hash
        # (computed with salt 0), without the surrounding quotes.
        expected = '{"key1" : %s, "key2" : %s}' % (
            anonymize.hash_string(value_a, 0, True),
            anonymize.hash_string(value_b, 0, True),
        )

        actual = self.s.sanitize(document, 0)
        self.assertEqual(expected, actual)
Esempio n. 2
0
 def testHashLongStrings(self):
     # Very long strings: each value comes from
     # get_long_string(<fragments>, 5000); the fragments contain escaped
     # quotes, single quotes, braces and runs of whitespace.
     first_fragments = [
         ' \\"hello\\" ',
         '\n\n \t\t \n\t \t\n',
         'email: \\"[email protected]\\"',
         '\'\'\'\'',
         "'\\\"'",
         "{inside: \\\"of a string\\\"}",
     ]
     second_fragments = [
         'This is a very string.  \n\n \t',
         '\n\t',
         '{\\"name\\"}',
         ' \\ END',
         '\\"string\\"',
         "'string'",
         "\\\"'quotes'\\\"",
         "open quotes: \\\" '",
     ]
     first_value = get_long_string(first_fragments, 5000)
     second_value = get_long_string(second_fragments, 5000)

     # What the sanitizer should emit: the hash (salt 0) of each value in
     # place of the quoted string.
     first_hash = anonymize.hash_string(first_value, 0, True)
     second_hash = anonymize.hash_string(second_value, 0, True)
     expected = '{"key1" : %s, "key2" : %s}' % (first_hash, second_hash)

     # What the sanitizer is given: the raw values as quoted JSON strings.
     document = '{"key1" : "%s", "key2" : "%s"}' % (first_value,
                                                    second_value)

     self.assertEqual(expected, self.s.sanitize(document, 0))
Esempio n. 3
0
    def testHashMultiline(self):
        # Short strings: same fragments as the long-string test, but built
        # with get_long_string(<fragments>, 1).
        first_fragments = [
            ' \\"hello\\" ',
            '\n\n \t\t \n\t \t\n',
            'email: \\"[email protected]\\"',
            '\'\'\'\'',
            "'\\\"'",
            "{inside: \\\"of a string\\\"}",
        ]
        second_fragments = [
            'This is not a long string.  \n\n \t',
            '\n\t',
            '{\\"name\\"}',
            ' \\ END',
            '\\"string\\"',
            "'string'",
            "\\\"'quotes'\\\"",
            "open quotes: \\\" '",
        ]
        first_value = get_long_string(first_fragments, 1)
        second_value = get_long_string(second_fragments, 1)

        first_hash = anonymize.hash_string(first_value, 0, True)
        second_hash = anonymize.hash_string(second_value, 0, True)

        # Input: the raw values as quoted JSON strings.
        document = '{"key1" : "%s", "key2" : "%s"}' % (first_value,
                                                       second_value)
        # Expected: each quoted value replaced by its hash (salt 0).
        expected = '{"key1" : %s, "key2" : %s}' % (first_hash, second_hash)

        actual = self.s.sanitize(document, 0)
        self.assertEqual(expected, actual)
Esempio n. 4
0
 def infer_salt(self, candidate_hashes, known_collections):
     """Brute-force the salt that was used to hash the collection names.

     This is a ridiculous hack. Let's hope the salt is 0. But even if not,
     every salt from 0 up to and including max_salt is tried until hashing
     one of the known collection names yields a value that is present in
     candidate_hashes.

     candidate_hashes  -- sized container of hashes, probed with ``in``
     known_collections -- sized iterable of plaintext collection names

     Returns the first salt that produces a match, or None when no salt in
     the searched range matches.
     """
     max_salt = 100000000
     # Progress is logged once per percent of the search space. Floor
     # division keeps the interval an int on both Python 2 and Python 3.
     progress_step = max_salt // 100

     LOG.info("Trying to brute-force the salt 0-%d [numCollections=%d / numHashes=%d]",
              max_salt, len(known_collections), len(candidate_hashes))

     # The col names are hashed with quotes around them (see
     # get_hash_string). Materialize a real list: on Python 3 map() returns
     # a one-shot iterator that would be exhausted after the first salt, so
     # every later salt would silently test nothing.
     col_names = [self.get_hash_string(name) for name in known_collections]

     salt = 0
     while salt <= max_salt:
         if salt > 0 and salt % progress_step == 0:
             LOG.info("SEARCH: salt=%d [%.1f%%]", salt,
                      (salt / float(max_salt)) * 100)
         for known_col in col_names:
             # hash_string is imported from anonymize.py
             hashed = anonymize.hash_string(known_col, salt)
             if hashed in candidate_hashes:
                 # Emit a blank line before the result; print("") does so
                 # on both Python 2 and Python 3.
                 print("")
                 LOG.info("SUCCESS! %s hashes to a known value. SALT: %d",
                          known_col, salt)
                 return salt
         salt += 1

     # The failure notice is only emitted in debug mode, as before.
     if self.debug:
         print("")
         LOG.warning("FAIL. The salt value is unknown")
     return None
Esempio n. 5
0
 def infer_salt(self, candidate_hashes, known_collections):
     """Brute-force the salt that was used to hash the collection names.

     This is a ridiculous hack. Let's hope the salt is 0. But even if not,
     every salt from 0 up to and including max_salt is tried until hashing
     one of the known collection names yields a value that is present in
     candidate_hashes.

     candidate_hashes  -- sized container of hashes, probed with ``in``
     known_collections -- sized iterable of plaintext collection names

     Returns the first salt that produces a match, or None when no salt in
     the searched range matches.
     """
     max_salt = 100000000
     # Progress is logged once per percent of the search space. Floor
     # division keeps the interval an int on both Python 2 and Python 3.
     progress_step = max_salt // 100

     LOG.info("Trying to brute-force the salt 0-%d [numCollections=%d / numHashes=%d]",
              max_salt, len(known_collections), len(candidate_hashes))

     # The col names are hashed with quotes around them (see
     # get_hash_string). Materialize a real list: on Python 3 map() returns
     # a one-shot iterator that would be exhausted after the first salt, so
     # every later salt would silently test nothing.
     col_names = [self.get_hash_string(name) for name in known_collections]

     salt = 0
     while salt <= max_salt:
         if salt > 0 and salt % progress_step == 0:
             LOG.info("SEARCH: salt=%d [%.1f%%]", salt, (salt / float(max_salt))*100)
         for known_col in col_names:
             # hash_string is imported from anonymize.py
             hashed = anonymize.hash_string(known_col, salt)
             if hashed in candidate_hashes:
                 # Emit a blank line before the result; print("") does so
                 # on both Python 2 and Python 3.
                 print("")
                 LOG.info("SUCCESS! %s hashes to a known value. SALT: %d", known_col, salt)
                 return salt
         salt += 1

     # The failure notice is only emitted in debug mode, as before.
     if self.debug:
         print("")
         LOG.warning("FAIL. The salt value is unknown")
     return None
Esempio n. 6
0
 def testHashStringMany(self):
     # Many strings in json: the same key/value pair is expanded by
     # get_long_string(<pair>, 4000) into the document body. The shared
     # fixture sanitizer (self.s) is exercised, as in the sibling tests.
     text = 'string with \\\"escaped quotes\\\"'
     hashed_text = anonymize.hash_string(text, 0, True)

     # Input: the value appears as a quoted string in every pair.
     long_json = "{" + get_long_string(['"key" : "%s", ' % text], 4000) + "}"
     # Expected: every quoted value replaced by its hash (salt 0).
     expected_result = "{" + get_long_string(['"key" : %s, ' % hashed_text], 4000) + "}"

     real_result = self.s.sanitize(long_json, 0)
     self.assertEqual(expected_result, real_result)
Esempio n. 7
0
    def testHashMultiline(self):
        # Short strings: the fragment lists below are passed through
        # get_long_string(<fragments>, 1) to build the two JSON values.
        parts_one = [
            ' \\"hello\\" ',
            '\n\n \t\t \n\t \t\n',
            'email: \\"[email protected]\\"',
            '\'\'\'\'',
            "'\\\"'",
            "{inside: \\\"of a string\\\"}",
        ]
        parts_two = [
            'This is not a long string.  \n\n \t',
            '\n\t',
            '{\\"name\\"}',
            ' \\ END',
            '\\"string\\"',
            "'string'",
            "\\\"'quotes'\\\"",
            "open quotes: \\\" '",
        ]
        value_one = get_long_string(parts_one, 1)
        value_two = get_long_string(parts_two, 1)

        # Expected output: the hash (salt 0) of each value replaces the
        # quoted string.
        expected = '{"key1" : %s, "key2" : %s}' % (
            anonymize.hash_string(value_one, 0, True),
            anonymize.hash_string(value_two, 0, True),
        )

        # Input document with the raw values as quoted JSON strings.
        document = '{"key1" : "%s", "key2" : "%s"}' % (value_one, value_two)

        self.assertEqual(expected, self.s.sanitize(document, 0))
Esempio n. 8
0
    def postProcess(self):
        """Process the operations to fix the collection names used in aggregate queries.

        Steps:
          1. Bail out (with a warning) when no plaintext collection names
             are known or when the salt search is disabled.
          2. Collect the candidate hashes and brute-force the hashing salt
             via infer_salt().
          3. Build a hash --> collection name mapping for that salt and hand
             it to fix_collection_names().

        Returns None in every case.
        """
        if not self.known_collections:
            LOG.warning(
                "No plaintext collections were found in operations. Unable to perform post-processing"
            )
            return

        if self.no_salt_search:
            LOG.warning("Skipping post-processing")
            return

        if self.debug:
            LOG.debug(
                "Performing post processing on %s sessions with %d operations",
                self.getSessionCount(), self.getOpCount())
            LOG.debug("-- Aggregate Collection Names --")
            LOG.debug("Encountered %d collection names in plaintext.",
                      len(self.known_collections))
            LOG.debug(pformat(self.known_collections))

        # Find the hashes that may correspond to collection names
        candidate_hashes = self.get_candidate_hashes()

        # HACK: Figure out what salt was used so that we can match
        #       them with our known collection names
        salt = self.infer_salt(candidate_hashes, self.known_collections)
        if salt is None:
            LOG.warning(
                "Failed to find string hashing salt. Unable to fix aggregate collection names"
            )
            return

        # Now for the given salt value, populate a mapping from
        # hashes to collection names
        LOG.debug("Pre-computing hashes for all known collection names...")
        hashed_collections = {}  # hash --> collection name
        for col_name in self.known_collections:
            # Compute the hash input once via the instance method and reuse
            # it for both the hash and the debug output.
            hash_str = self.get_hash_string(col_name)
            hashed = anonymize.hash_string(hash_str, salt)
            hashed_collections[hashed] = col_name
            if self.debug:
                LOG.debug("hash: %s / col_name: %s / hash_str: %s",
                          hashed, col_name, hash_str)
        ## FOR

        # Now use our hash xref to fix the collection names in all aggregate operations
        self.fix_collection_names(hashed_collections)
Esempio n. 9
0
    def testHashStringMany(self):
        # Many strings in json: the same key/value pair is expanded by
        # get_long_string(<pair>, 4000) into the document body. The shared
        # fixture sanitizer (self.s) is exercised, as in the sibling tests.
        text = 'string with \\\"escaped quotes\\\"'
        hashed_text = anonymize.hash_string(text, 0, True)

        # Input: the value appears as a quoted string in every pair.
        long_json = "{" + get_long_string(['"key" : "%s", ' % text],
                                          4000) + "}"
        # Expected: every quoted value replaced by its hash (salt 0).
        expected_result = "{" + get_long_string(['"key" : %s, ' % hashed_text],
                                                4000) + "}"

        real_result = self.s.sanitize(long_json, 0)
        self.assertEqual(expected_result, real_result)
Esempio n. 10
0
    def postProcess(self):
        """Process the operations to fix the collection names used in aggregate queries.

        Steps:
          1. Bail out (with a warning) when no plaintext collection names
             are known or when the salt search is disabled.
          2. Collect the candidate hashes and brute-force the hashing salt
             via infer_salt().
          3. Build a hash --> collection name mapping for that salt and hand
             it to fix_collection_names().

        Returns None in every case.
        """
        if not self.known_collections:
            LOG.warning("No plaintext collections were found in operations. Unable to perform post-processing")
            return

        if self.no_salt_search:
            LOG.warning("Skipping post-processing")
            return

        if self.debug:
            LOG.debug("Performing post processing on %s sessions with %d operations",
                      self.getSessionCount(), self.getOpCount())
            LOG.debug("-- Aggregate Collection Names --")
            LOG.debug("Encountered %d collection names in plaintext.", len(self.known_collections))
            LOG.debug(pformat(self.known_collections))

        # Find the hashes that may correspond to collection names
        candidate_hashes = self.get_candidate_hashes()

        # HACK: Figure out what salt was used so that we can match
        #       them with our known collection names
        salt = self.infer_salt(candidate_hashes, self.known_collections)
        if salt is None:
            LOG.warning("Failed to find string hashing salt. Unable to fix aggregate collection names")
            return

        # Now for the given salt value, populate a mapping from
        # hashes to collection names
        LOG.debug("Pre-computing hashes for all known collection names...")
        hashed_collections = {} # hash --> collection name
        for col_name in self.known_collections:
            # Compute the hash input once via the instance method and reuse
            # it for both the hash and the debug output.
            hash_str = self.get_hash_string(col_name)
            hashed = anonymize.hash_string(hash_str, salt)
            hashed_collections[hashed] = col_name
            if self.debug:
                LOG.debug("hash: %s / col_name: %s / hash_str: %s", hashed, col_name, hash_str)
        ## FOR

        # Now use our hash xref to fix the collection names in all aggregate operations
        self.fix_collection_names(hashed_collections)
Esempio n. 11
0
 def testHashStringSimple(self):
     # A lone double-quoted string should be sanitized into the hash
     # (salt 0) of its unquoted contents.
     quoted_text = "\"THIS SHOULD BE SIMPLY HASHED\""
     expected_hash = anonymize.hash_string("THIS SHOULD BE SIMPLY HASHED", 0, True)
     self.assertEqual(expected_hash, self.s.sanitize(quoted_text, 0))
Esempio n. 12
0
 def testHashStringSimple(self):
     # Simplest case: the whole input is one double-quoted string, and the
     # sanitizer output must equal the hash (salt 0) of the bare text.
     expected = anonymize.hash_string("THIS SHOULD BE SIMPLY HASHED", 0, True)
     sanitizer_input = "\"THIS SHOULD BE SIMPLY HASHED\""
     actual = self.s.sanitize(sanitizer_input, 0)
     self.assertEqual(expected, actual)