    def test_source_from_filename(self) -> None:
        """Test getting the data source name from the filename."""
        self.assertEqual(
            flatten_base.source_from_filename(
                'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'
            ), 'CP_Quack-echo-2020-08-23-06-01-02')

        self.assertEqual(
            flatten_base.source_from_filename(
                'gs://firehook-scans/http/CP_Quack-http-2020-09-13-01-02-07/results.json'
            ), 'CP_Quack-http-2020-09-13-01-02-07')
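
    # A minimal sketch of what flatten_base.source_from_filename could look
    # like, inferred only from the assertions above (hypothetical; the real
    # implementation may differ): the source is the name of the directory
    # that contains the results file.
    #
    #   import pathlib
    #
    #   def source_from_filename(filepath: str) -> str:
    #       # 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'
    #       # -> 'CP_Quack-echo-2020-08-23-06-01-02'
    #       return pathlib.PurePosixPath(filepath).parent.name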

    def _process_hyperquack_v1(self, filename: str, scan: Any,
                               random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Echo/Discard/HTTP/S data in HyperQuack V1 format.

        https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv1.rst

        Args:
          filename: a filepath string
          scan: a loaded json object containing the parsed content of the line
          random_measurement_id: a hex id identifying this individual measurement

        Yields:
          Rows
        """
        for index, result in enumerate(scan.get('Results', [])):
            date = result['StartTime'][:10]

            sent_domain = _extract_domain_from_sent_field(result['Sent'])
            is_control = flatten_base.is_control_url(sent_domain)
            # Due to a bug, the sent field sometimes isn't populated
            # when the measurement failed because of a network timeout.
            if not sent_domain:
                # Control measurements come at the end, and are not counted as retries.
                is_control = index > scan['Retries']
                if is_control:
                    domain = ""
                else:
                    domain = scan['Keyword']
            else:
                domain = sent_domain

            row = {
                'domain': domain,
                'category': self.category_matcher.get_category(
                    domain, is_control),
                'ip': scan['Server'],
                'date': date,
                'start_time': result['StartTime'],
                'end_time': result['EndTime'],
                'anomaly': scan['Blocked'],
                'success': result['Success'],
                'stateful_block': scan['StatefulBlock'],
                'is_control': is_control,
                'controls_failed': scan['FailSanity'],
                'measurement_id': random_measurement_id,
                'source': flatten_base.source_from_filename(filename),
            }

            if 'Received' in result:
                received = result['Received']
                received_fields = flatten_base.parse_received_data(
                    self.blockpage_matcher, received, scan['Blocked'])
                row.update(received_fields)

            if 'Error' in result:
                row['error'] = result['Error']

            yield row
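
    # Usage sketch for the method above (hypothetical input; the values are
    # invented but follow the HyperQuack v1 keys this method reads, and the
    # 'Sent' payload is whatever _extract_domain_from_sent_field can parse):
    #
    #   scan = {
    #       'Server': '1.2.3.4',
    #       'Keyword': 'example.com',
    #       'Retries': 1,
    #       'Blocked': False,
    #       'FailSanity': False,
    #       'StatefulBlock': False,
    #       'Results': [{
    #           'Sent': 'example.com',
    #           'Success': True,
    #           'StartTime': '2020-08-23T06:01:02Z',
    #           'EndTime': '2020-08-23T06:01:03Z',
    #       }],
    #   }
    #   rows = list(self._process_hyperquack_v1(filename, scan, measurement_id))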

    def _process_satellite_v2(self, scan: Any, filepath: str,
                              random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Satellite v2 data.

        Args:
          scan: a loaded json object containing the parsed content of the line
          filepath: path like "<path>/<filename>.json.gz"
          random_measurement_id: a hex id identifying this individual measurement

        Yields:
          Rows
        """
        is_control_domain = flatten_base.is_control_url(scan['test_url'])

        row = {
            'domain': scan['test_url'],
            'is_control': is_control_domain,
            'category': self.category_matcher.get_category(
                scan['test_url'], is_control_domain),
            'ip': scan['vp'],
            'is_control_ip': scan['vp'] in CONTROL_IPS,
            'country': scan.get('location', {}).get('country_code'),
            'date': scan['start_time'][:10],
            'start_time': format_timestamp(scan['start_time']),
            'end_time': format_timestamp(scan['end_time']),
            'error': scan.get('error', None),
            'anomaly': scan['anomaly'],
            'success': not scan['connect_error'],
            'received': None,
            'measurement_id': random_measurement_id,
            'source': flatten_base.source_from_filename(filepath),
        }

        scan_date = datetime.date.fromisoformat(row['date'])
        if scan_date < SATELLITE_V2_2_START_DATE:
            yield from _process_satellite_v2p1(row, scan)
        else:
            yield from _process_satellite_v2p2(row, scan)
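
    # The branch above splits on the scan date: rows before the Satellite
    # v2.2 rollout go through _process_satellite_v2p1, later ones through
    # _process_satellite_v2p2. A sketch of the cutoff constant this relies
    # on (defined elsewhere in the pipeline; the date below is only a
    # placeholder, not the real rollout date):
    #
    #   SATELLITE_V2_2_START_DATE = datetime.date(2021, 3, 1)  # placeholder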

    def _data_to_load(self,
                      gcs: GCSFileSystem,
                      scan_type: str,
                      incremental_load: bool,
                      table_name: str,
                      start_date: Optional[datetime.date] = None,
                      end_date: Optional[datetime.date] = None) -> List[str]:
        """Select the right files to read.

        Args:
          gcs: GCSFileSystem object
          scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
          incremental_load: boolean. If true, only read the latest new data
          table_name: dataset.table name like 'base.scan_echo'
          start_date: date object, only files after or at this date will be read
          end_date: date object, only files at or before this date will be read

        Returns:
          A List of filename strings, e.g.
            ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
             'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
        """
        if incremental_load:
            full_table_name = self._get_full_table_name(table_name)
            existing_sources = _get_existing_datasources(full_table_name)
        else:
            existing_sources = []

        if scan_type == satellite.SCAN_TYPE_SATELLITE:
            files_to_load = flatten_satellite.SATELLITE_FILES
        else:
            files_to_load = SCAN_FILES

        # Filepaths like 'gs://firehook-scans/echo/**/*'
        files_regex = f'{self.bucket}{scan_type}/**/*'
        file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

        filepaths = [metadata.path for metadata in file_metadata]
        file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

        filtered_filenames = [
            filepath for (filepath, file_size) in zip(filepaths, file_sizes)
            if (_between_dates(filepath, start_date, end_date)
                and _filename_matches(filepath, files_to_load)
                and flatten_base.source_from_filename(filepath) not in
                existing_sources and file_size > EMPTY_GZIPPED_FILE_SIZE)
        ]

        return filtered_filenames
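
    # A sketch of the _between_dates helper used in the filter above
    # (hypothetical; the real helper is defined elsewhere in this module).
    # It assumes the scan date is embedded in the filepath, as in
    # 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'.
    #
    #   import re
    #
    #   def _between_dates(filepath: str,
    #                      start_date: Optional[datetime.date],
    #                      end_date: Optional[datetime.date]) -> bool:
    #       match = re.search(r'(\d{4}-\d{2}-\d{2})', filepath)
    #       if not match:
    #           return False
    #       file_date = datetime.date.fromisoformat(match.group(1))
    #       return ((start_date is None or start_date <= file_date) and
    #               (end_date is None or file_date <= end_date))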

    def _process_hyperquack_v2(self, filename: str, scan: Any,
                               random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Echo/Discard/HTTP/S data in HyperQuack V2 format.

        https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv2.rst

        Args:
          filename: a filepath string
          scan: a loaded json object containing the parsed content of the line
          random_measurement_id: a hex id identifying this individual measurement

        Yields:
          Rows
        """
        for response in scan.get('response', []):
            date = response['start_time'][:10]
            domain: str = response.get('control_url', scan['test_url'])
            is_control = 'control_url' in response

            row = {
                'domain': domain,
                'category': self.category_matcher.get_category(
                    domain, is_control),
                'ip': scan['vp'],
                'date': date,
                'start_time': response['start_time'],
                'end_time': response['end_time'],
                'anomaly': scan['anomaly'],
                'success': response['matches_template'],
                'stateful_block': scan['stateful_block'],
                'is_control': is_control,
                'controls_failed': scan.get('controls_failed', None),
                'measurement_id': random_measurement_id,
                'source': flatten_base.source_from_filename(filename),
            }

            if 'response' in response:
                received = response['response']
                received_fields = flatten_base.parse_received_data(
                    self.blockpage_matcher, received, scan['anomaly'])
                row.update(received_fields)

            if 'error' in response:
                row['error'] = response['error']

            yield row
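
    # Usage sketch for the method above (hypothetical input; invented values
    # following the HyperQuack v2 keys this method reads):
    #
    #   scan = {
    #       'vp': '1.2.3.4',
    #       'test_url': 'example.com',
    #       'anomaly': False,
    #       'stateful_block': False,
    #       'response': [{
    #           'start_time': '2021-04-26T06:01:02Z',
    #           'end_time': '2021-04-26T06:01:03Z',
    #           'matches_template': True,
    #       }],
    #   }
    #   rows = list(self._process_hyperquack_v2(filename, scan, measurement_id))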

    def _process_satellite_v1(self, date: str, scan: Any, filepath: str,
                              random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Satellite v1 data.

        Args:
          date: a date string YYYY-mm-DD
          scan: a loaded json object containing the parsed content of the line
          filepath: one of
            <path>/answers_control.json
            <path>/interference.json
            also potentially .gz versions of those files
          random_measurement_id: a hex id identifying this individual measurement

        Yields:
          Rows
        """
        filename = pathlib.PurePosixPath(filepath).name
        if '.gz' in pathlib.PurePosixPath(filename).suffixes:
            filename = pathlib.PurePosixPath(filename).stem
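
        # The normalization above strips one trailing .gz so that a gzipped
        # control file still matches SATELLITE_ANSWERS_CONTROL_FILE below.
        # A worked example (results shown as comments):
        #
        #   path = pathlib.PurePosixPath('<path>/answers_control.json.gz')
        #   path.name                               # 'answers_control.json.gz'
        #   path.suffixes                           # ['.json', '.gz']
        #   pathlib.PurePosixPath(path.name).stem   # 'answers_control.json'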

        row = {
            'domain': scan['query'],
            'is_control': False,  # v1 doesn't have domain controls
            'category': self.category_matcher.get_category(
                scan['query'], False),
            'ip': scan.get('resolver', scan.get('ip')),
            'is_control_ip': filename == SATELLITE_ANSWERS_CONTROL_FILE,
            'date': date,
            'error': scan.get('error', None),
            'anomaly': not scan['passed'] if 'passed' in scan else None,
            'success': 'error' not in scan,
            'received': None,
            'rcode': ['0'] if 'error' not in scan else ['-1'],
            'measurement_id': random_measurement_id,
            'source': flatten_base.source_from_filename(filepath),
        }

        if isinstance(row['error'], dict):
            row['error'] = json.dumps(row['error'])

        received_ips = scan.get('answers')
        yield from _process_received_ips(row, received_ips)

    def _process_satellite_blockpages(self, scan: Any,
                                      filepath: str) -> Iterator[Row]:
        """Process a line of Satellite blockpage data.

        Args:
          scan: a loaded json object containing the parsed content of the line
          filepath: a filepath string

        Yields:
          Rows, usually two: one for the fetched HTTP data and one for the
          HTTPS data.
        """
        row = {
            'domain': scan['keyword'],
            'ip': scan['ip'],
            'date': scan['start_time'][:10],
            'start_time': format_timestamp(scan['start_time']),
            'end_time': format_timestamp(scan['end_time']),
            'success': scan['fetched'],
            'source': flatten_base.source_from_filename(filepath),
        }

        http = {
            'https': False,
        }
        http.update(row)
        received_fields = flatten_base.parse_received_data(
            self.blockpage_matcher, scan.get('http', ''), True)
        http.update(received_fields)
        yield http

        https = {
            'https': True,
        }
        https.update(row)
        received_fields = flatten_base.parse_received_data(
            self.blockpage_matcher, scan.get('https', ''), True)
        https.update(received_fields)
        yield https
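
    # Usage sketch for the method above (hypothetical input; invented values
    # following the blockpage keys this method reads):
    #
    #   scan = {
    #       'keyword': 'example.com',
    #       'ip': '1.2.3.4',
    #       'fetched': True,
    #       'start_time': '2021-04-26T06:01:02Z',
    #       'end_time': '2021-04-26T06:01:03Z',
    #       'http': '...page body...',
    #       'https': '...page body...',
    #   }
    #   http_row, https_row = self._process_satellite_blockpages(scan, filepath)
    #   http_row['https'], https_row['https']  # (False, True)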

    def _process_satellite_v2_control(
            self, scan: Any, filepath: str,
            random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Satellite v2 IP control data.

        Args:
          scan: a loaded json object containing the parsed content of the line
          filepath: path like "<path>/<filename>.json.gz"
          random_measurement_id: a hex id identifying this individual measurement

        Yields:
          Rows
        """
        responses = scan.get('response', [])
        if responses:
            # An overall satellite v2 measurement
            # always contains some non-control trial domains
            is_control_domain = False

            row = {
                'domain': scan['test_url'],
                'is_control': is_control_domain,
                'category': self.category_matcher.get_category(
                    scan['test_url'], is_control_domain),
                'ip': scan['vp'],
                'is_control_ip': True,
                'date': responses[0]['start_time'][:10],
                'start_time': format_timestamp(responses[0]['start_time']),
                'end_time': format_timestamp(responses[-1]['end_time']),
                'anomaly': None,
                'success': not scan['connect_error'],
                'controls_failed': not scan['passed_control'],
                'rcode': [str(response['rcode']) for response in responses],
                'measurement_id': random_measurement_id,
                'source': flatten_base.source_from_filename(filepath),
            }

            errors = [
                response['error'] for response in responses
                if response['error'] and response['error'] != 'null'
            ]
            row['error'] = ' | '.join(errors) if errors else None
            for response in responses:
                if response['url'] == row['domain']:
                    # Check response for test domain
                    if response['rcode'] == 0 and response['has_type_a']:
                        # Valid answers
                        row['has_type_a'] = True
                        # Separate into one answer IP per row for tagging
                        for ip in response['response']:
                            row['received'] = {'ip': ip}
                            yield row.copy()