Exemple #1
0
    def obfuscate_wiki_entry(self, line, user_profile):
        """
        Obfuscate the free-text values of one tab-separated wiki revision line.

        The line is stripped of trailing '\\r'/'\\n' characters, decoded from UTF-8 and split on tabs;
        the resulting columns are used to build an ArticleRevisionRecord.  When `user_profile` is
        provided and the record's user_id is not 'NULL', the matching profile's name is handed to
        the obfuscator as a hint.

        Returns the columns re-joined with tabs and encoded as UTF-8, with the obfuscated content
        stored in column 12 and the obfuscated user_message stored in column 2.
        """
        columns = line.rstrip('\r\n').decode('utf8').split('\t')
        revision = ArticleRevisionRecord(*columns)

        # Without a profile map there is nothing to look up, which is treated like a 'NULL' user.
        author_id = revision.user_id if user_profile is not None else 'NULL'
        hints = {}
        if author_id != 'NULL':
            profile = user_profile.get(author_id)
            if profile is not None:
                hints['name'] = [profile.name]
            else:
                log.error("Missing profile entry for user_id %s", author_id)

        # The second value in each tuple is the column's own name (presumably seen on a header row).
        if revision.ip_address not in ('NULL', 'ip_address'):
            log.warning("Found non-NULL IP address")
        if revision.automatic_log not in ('', 'automatic_log'):
            log.warning(u"Found non-zero-length automatic_log: %s", revision.automatic_log)

        def scrub(encoded_text):
            """Decode a backslash-encoded value, obfuscate it, and encode it again."""
            plain_text = backslash_decode_value(encoded_text)
            return backslash_encode_value(self.obfuscator.obfuscate_text(plain_text, hints))

        # The record's values can't be reset, so the cleaned text goes back into the column list.
        columns[12] = scrub(revision.content)
        columns[2] = scrub(revision.user_message)
        return u"\t".join(columns).encode('utf-8')
    def filter_row(self, row):
        """
        Clean one wiki revision row in place and return it.

        Columns 2 (user_message), 3 (automatic_log) and 4 (ip_address) are blanked.  Column 5
        (user_id) is replaced by self.remap_id() of its integer value unless it is 'NULL'.
        Column 12 is UTF-8 decoded, backslash-decoded, obfuscated, and written back
        backslash-encoded and UTF-8 encoded.
        """
        raw_user_id = row[5]
        has_user = raw_user_id != 'NULL'

        # Collect whatever is known about the author so the obfuscator can look for it.
        hints = {}
        if has_user:
            numeric_id = int(raw_user_id)
            hints['user_id'] = [numeric_id]
            try:
                profile = self.user_by_id[numeric_id]
                for key in ('username', 'name'):
                    if key in profile:
                        hints[key] = [profile[key]]
            except KeyError:
                log.error("Unable to find wiki user_id: %s in the user_by_id map", numeric_id)

        # Blank out user_message, automatic_log and ip_address.
        for index in (2, 3, 4):
            row[index] = ''

        # A 'NULL' user_id is preserved as-is; anything else is remapped.
        if has_user:
            row[5] = self.remap_id(numeric_id)

        decoded_content = backslash_decode_value(row[12].decode('utf8'))
        scrubbed_content = self.obfuscator.obfuscate_text(decoded_content, hints)
        row[12] = backslash_encode_value(scrubbed_content).encode('utf8')

        return row
Exemple #3
0
    def filter_row(self, row):
        """
        Remove or obfuscate the sensitive columns of a wiki revision row.

        The row is modified in place and also returned:
          * columns 2, 3 and 4 (user_message, automatic_log, ip_address) become empty strings;
          * column 5 (user_id) is passed through self.remap_id() unless it holds 'NULL';
          * column 12 is obfuscated, using the user's id plus any 'username' and 'name' found in
            self.user_by_id as hints.
        """
        author = row[5]
        author_is_null = author == 'NULL'
        author_details = {}

        if not author_is_null:
            author = int(author)
            author_details['user_id'] = [author]
            try:
                entry = self.user_by_id[author]
                author_details.update(
                    (field, [entry[field]]) for field in ('username', 'name') if field in entry
                )
            except KeyError:
                log.error("Unable to find wiki user_id: %s in the user_by_id map", author)

        row[2] = row[3] = row[4] = ''  # user_message, automatic_log, ip_address

        # Only a real user_id is remapped; 'NULL' stays untouched.
        if not author_is_null:
            row[5] = self.remap_id(author)

        row[12] = backslash_encode_value(
            self.obfuscator.obfuscate_text(
                backslash_decode_value(row[12].decode('utf8')),
                author_details,
            )
        ).encode('utf8')

        return row
 def get_raw_event(self, event_line):
     """
     Return the event parsed from `event_line` as backslash-encoded JSON with sorted keys.

     When eventlog.get_event_data() returns a value for the parsed event, that value replaces
     the event's 'event' entry before the event is serialized.
     """
     parsed = eventlog.parse_json_event(event_line)
     payload = eventlog.get_event_data(parsed)
     if payload is not None:
         parsed['event'] = payload
     return backslash_encode_value(json.dumps(parsed, sort_keys=True))
 def get_raw_event(self, event_line):
     """
     Serialize the event found on `event_line` to a single backslash-encoded JSON string.

     The event's 'event' entry is overwritten with the result of eventlog.get_event_data()
     whenever that result is not None.  Keys are sorted in the JSON output.
     """
     record = eventlog.parse_json_event(event_line)

     extracted = eventlog.get_event_data(record)
     if extracted is not None:
         record['event'] = extracted

     serialized = json.dumps(record, sort_keys=True)
     return backslash_encode_value(serialized)
    def obfuscate_wiki_entry(self, line, user_profile):
        """
        Return `line` with its wiki content and user message obfuscated.

        `line` is a UTF-8 encoded, tab-separated row whose columns are used to construct an
        ArticleRevisionRecord.  `user_profile` may be None; otherwise it is queried with the
        record's user_id (unless that is 'NULL') and the profile name found is given to the
        obfuscator as a hint.  The result is the tab-joined row, UTF-8 encoded.
        """
        cells = line.rstrip('\r\n').decode('utf8').split('\t')
        entry = ArticleRevisionRecord(*cells)

        name_hints = {}
        if user_profile is not None:
            wiki_user_id = entry.user_id
            if wiki_user_id != 'NULL':
                found = user_profile.get(wiki_user_id)
                if found is not None:
                    name_hints['name'] = [found.name]
                else:
                    log.error("Missing profile entry for user_id %s", wiki_user_id)

        # These columns are expected to be empty; a value equal to the column's own name is
        # also tolerated (presumably a header row).
        ip_is_expected = entry.ip_address == 'NULL' or entry.ip_address == 'ip_address'
        if not ip_is_expected:
            log.warning("Found non-NULL IP address")
        log_is_expected = entry.automatic_log == '' or entry.automatic_log == 'automatic_log'
        if not log_is_expected:
            log.warning(u"Found non-zero-length automatic_log: %s", entry.automatic_log)

        # The record's values can't be reset, so write the cleaned text into the original cells.
        for position, encoded_text in ((12, entry.content), (2, entry.user_message)):
            plain_text = backslash_decode_value(encoded_text)
            cleaned_text = self.obfuscator.obfuscate_text(plain_text, name_hints)
            cells[position] = backslash_encode_value(cleaned_text)

        return u"\t".join(cells).encode('utf-8')
Exemple #7
0
 def mapper(self, line):
     """
     Yield a single ((date, category, type, source, exported), 1) pair for a usable event.

     Nothing is yielded when self.get_event_and_date_string() returns None, when the event's
     source, type or date is None, or when the event type starts with a slash.  An event whose
     (source, type) pair is not in self.known_events gets the category 'unknown' and
     exported=False.
     """
     parsed = self.get_event_and_date_string(line)
     if parsed is None:
         return
     event, event_date = parsed

     event_type = event.get('event_type')
     event_source = event.get('event_source')

     # All three values are required to build the output key.
     if any(item is None for item in (event_source, event_type, event_date)):
         return

     # Events whose type begins with a slash are ignored.
     if event_type.startswith('/'):
         return

     lookup_key = (event_source, event_type)
     exported = lookup_key in self.known_events
     event_category = self.known_events[lookup_key] if exported else 'unknown'

     # Encode only after the None check above, so that the emitted event_type has no
     # embedded newlines and such.
     encoded_type = backslash_encode_value(unicode(event_type))
     yield (event_date, event_category, encoded_type, event_source, exported), 1
 def mapper(self, line):
     """
     Map an event line to a count of one, keyed by date, category, type, source and export flag.

     Lines are skipped (nothing is yielded) if self.get_event_and_date_string() gives None, if
     any of event_source, event_type or event_date is None, or if event_type begins with '/'.
     The category comes from self.known_events, falling back to 'unknown' for pairs that are
     not present there; `exported` records whether the pair was present.
     """
     result = self.get_event_and_date_string(line)
     if result is None:
         return

     event, event_date = result
     source = event.get('event_source')
     kind = event.get('event_type')

     # Skip incomplete events, and events whose type begins with a slash.  The None checks
     # come first so that startswith() is only called on a real value.
     if source is None or kind is None or event_date is None or kind.startswith('/'):
         return

     known_key = (source, kind)
     if known_key not in self.known_events:
         category, exported = 'unknown', False
     else:
         category, exported = self.known_events[known_key], True

     # Make sure the emitted type doesn't have embedded newlines and such.
     kind = backslash_encode_value(unicode(kind))
     yield (event_date, category, kind, source, exported), 1
 def get_raw_event(self, event_line):
     """Return the event parsed from `event_line` as backslash-encoded JSON with sorted keys."""
     parsed = eventlog.parse_json_event(event_line)
     return backslash_encode_value(json.dumps(parsed, sort_keys=True))
 def test_encoding(self, text, expected_result):
     """Check that backslash_encode_value() turns `text` into `expected_result`."""
     # assertEqual replaces the deprecated assertEquals alias, which newer unittest versions drop.
     self.assertEqual(obfuscate_util.backslash_encode_value(text),
                      expected_result)
 def test_encoding_round_trip(self, text):
     """Check that decoding an encoded `text` gives back the original `text`."""
     # assertEqual replaces the deprecated assertEquals alias, which newer unittest versions drop.
     self.assertEqual(
         text,
         obfuscate_util.backslash_decode_value(
             obfuscate_util.backslash_encode_value(text)))
Exemple #12
0
    def _add_entry(self, record_dict, record_key, record_field, label, obj):
        """
        Add the `obj` to the `record_key` entry of `record_dict`, performing appropriate conversion based on `record_field`.

        For strings, the entry is truncated, if necessary, so we should rarely see truncation errors.  Also, null characters
        are escaped so that they won't fail when being loaded into BigQuery.

        For timestamps, parsing is done using ciso8601.

        Errors are logged, but are not fatal.  In such cases, the value is simply not set.  (It's only fatal, then, if the
        value was required.)  This covers values of the wrong type (e.g. None for an integer field) and numeric
        overflow as well as unparseable values.

        Arguments:
            record_dict: dictionary that receives the converted value.
            record_key: key under which the value is stored in `record_dict`.
            record_field: field object whose class selects the conversion to apply.
            label: name used for the value in most error messages.
            obj: the raw value to convert.
        """
        # Failures that a conversion can raise for a bad input value.  They are all handled the
        # same way: log the problem and carry on.
        cast_errors = (TypeError, ValueError, OverflowError)

        if isinstance(record_field, StringField):
            if obj is None:
                # TODO: this should really check to see if the record_field is nullable.
                value = None
            else:
                value = backslash_encode_value(unicode(obj))
                if '\x00' in value:
                    value = value.replace('\x00', '\\0')
                # Avoid validation errors later due to length by truncating here.
                field_length = record_field.length
                value_length = len(value)
                if value_length > field_length:
                    log.error(
                        "Record value length (%d) exceeds max length (%d) for field %s: %r",
                        value_length, field_length, record_key, value)
                    if field_length >= 4:
                        value = u"{}...".format(value[:field_length - 4])
                    else:
                        # Too short for the ellipsis form: slicing with a zero or negative index
                        # would produce a value that still exceeds the field, so cut hard instead.
                        value = value[:max(field_length, 0)]
            record_dict[record_key] = value
        elif isinstance(record_field, IntegerField):
            try:
                record_dict[record_key] = int(obj)
            except cast_errors:
                log.error('Unable to cast value to int for %s: %r', label, obj)
        elif isinstance(record_field, BooleanField):
            try:
                record_dict[record_key] = bool(obj)
            except cast_errors:
                log.error('Unable to cast value to bool for %s: %r', label,
                          obj)
        elif isinstance(record_field, FloatField):
            try:
                record_dict[record_key] = float(obj)
            except cast_errors:
                log.error('Unable to cast value to float for %s: %r', label,
                          obj)
        elif isinstance(record_field, DateTimeField):
            # Stays None when `obj` is None or cannot be parsed.
            datetime_obj = None
            try:
                if obj is not None:
                    datetime_obj = ciso8601.parse_datetime(obj)
                    if datetime_obj.tzinfo:
                        datetime_obj = datetime_obj.astimezone(pytz.utc)
            except cast_errors:
                log.error('Unable to cast value to datetime for %s: %r', label,
                          obj)

            # Because it's not enough just to create a datetime object, also perform
            # validation here.
            if datetime_obj is not None:
                validation_errors = self.date_time_field_for_validating.validate(
                    datetime_obj)
                if len(validation_errors) > 0:
                    log.error(
                        'Invalid assigment of value %r to field "%s": %s',
                        datetime_obj, label, ', '.join(validation_errors))
                    datetime_obj = None

            # Unlike the other conversions, the datetime entry is always written, possibly as None.
            record_dict[record_key] = datetime_obj
        else:
            # Any other field type receives the value unchanged.
            record_dict[record_key] = obj
 def test_encoding(self, text, expected_result):
     """Verify that encoding `text` produces exactly `expected_result`."""
     # Use assertEqual: assertEquals is a deprecated alias that newer unittest versions remove.
     encoded = obfuscate_util.backslash_encode_value(text)
     self.assertEqual(encoded, expected_result)
 def test_encoding_round_trip(self, text):
     """Verify that encoding and then decoding `text` returns it unchanged."""
     # Use assertEqual: assertEquals is a deprecated alias that newer unittest versions remove.
     encoded = obfuscate_util.backslash_encode_value(text)
     self.assertEqual(text, obfuscate_util.backslash_decode_value(encoded))
 def get_raw_event(self, event_line):
     """Parse `event_line`, dump the event as key-sorted JSON, and backslash-encode the result."""
     return backslash_encode_value(
         json.dumps(eventlog.parse_json_event(event_line), sort_keys=True)
     )