Ejemplo n.º 1
0
    def test_flatten_measurement_invalid_json(self) -> None:
        """Test that an unparseable line logs a warning and yields no rows."""
        line = 'invalid json'

        with self.assertLogs(level='WARNING') as cm:
            rows = list(
                beam_tables._flatten_measurement('test_filename.json', line))

        # Checked after the context exits: if nothing was logged, assertLogs
        # itself fails with a clear message instead of cm.output[0] raising
        # an IndexError inside the block.
        self.assertEqual(
            cm.output[0], 'WARNING:root:JSONDecodeError: '
            'Expecting value: line 1 column 1 (char 0)\n'
            'Filename: test_filename.json\ninvalid json\n')

        # Comparing against [] (rather than len() == 0) shows any unexpected
        # rows in the failure message.
        self.assertEqual(rows, [])
Ejemplo n.º 2
0
    def test_flatten_measurement_http_success(self) -> None:
        """Test parsing an example successful HTTP measurement.

        Not all measurements receive any data/errors. In that case the
        received_ and error fields should not exist in the flattened row,
        and will end up Null in bigquery.
        """
        line = """{
      "Server":"170.248.33.11",
      "Keyword":"scribd.com",
      "Retries":0,
      "Results":[
        {
         "Sent":"scribd.com",
         "Success":true,
         "StartTime":"2020-11-09T01:10:47.826486107-05:00",
         "EndTime":"2020-11-09T01:10:47.84869292-05:00"
        }
      ],
      "Blocked":false,
      "FailSanity":false,
      "StatefulBlock":false
    }"""

        # Note: no 'received_*' or 'error' keys, since the input has neither
        # a "Received" nor an "Error" field.
        expected_row: beam_tables.Row = {
            'domain': 'scribd.com',
            'ip': '170.248.33.11',
            'date': '2020-11-09',
            'start_time': '2020-11-09T01:10:47.826486107-05:00',
            'end_time': '2020-11-09T01:10:47.84869292-05:00',
            'retries': 0,
            'sent': 'scribd.com',
            'blocked': False,
            'success': True,
            'fail_sanity': False,
            'stateful_block': False,
            'measurement_id': '',
            'source': 'CP_Quack-http-2020-11-09-01-02-08',
        }

        filename = 'gs://firehook-scans/http/CP_Quack-http-2020-11-09-01-02-08/results.json'

        rows = list(beam_tables._flatten_measurement(filename, line))
        # The input has a single entry in "Results", so a single row is
        # expected. Asserting the count catches extra rows and turns an empty
        # result into a test failure rather than an IndexError.
        self.assertEqual(len(rows), 1)

        row = rows[0]
        # We can't test the measurement id because it's random
        row['measurement_id'] = ''
        self.assertEqual(row, expected_row)
Ejemplo n.º 3
0
    def test_flatten_measurement_https(self) -> None:
        """Test parsing an unsuccessful HTTPS measurement.

        The input carries a full "Received" object (status line, headers,
        body and TLS details); each of those is expected to surface as a
        'received_*' field in the flattened row.
        """
        # yapf: disable
        # Base64 certificate from the scan. It appears both in the JSON input
        # and in the expected row, so it is defined once and spliced into
        # both. The value must be one unbroken string: a physical line break
        # inside it is a syntax error in a Python literal and an invalid
        # control character inside a JSON string.
        cert = (
            'MIIHLzCCBhegAwIBAgIQDCECYKFMPekAAAAAVNFY9jANBgkqhkiG9w0BAQsFADCBujELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDE0IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEuMCwGA1UEAxMlRW50cnVzdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEwxTTAeFw0yMDA1MTIxMTIzMDNaFw0yMTA1MTIxMTUzMDJaMIHCMQswCQYDVQQGEwJMQjEPMA0GA1UEBxMGQmVpcnV0MRMwEQYLKwYBBAGCNzwCAQMTAkxCMRcwFQYLKwYBBAGCNzwCAQETBkJlaXJ1dDEWMBQGA1UEChMNQmFuayBBdWRpIFNBTDEdMBsGA1UEDxMUUHJpdmF0ZSBPcmdhbml6YXRpb24xDjAMBgNVBAsTBUJBU0FMMQ4wDAYDVQQFEwUxMTM0NzEdMBsGA1UEAxMUam9icy5iYW5rYXVkaS5jb20ubGIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC37LFk2A2Q8xxahyjhOkul8O9Nv5FFp0NkL4qIy2fTUbsz1uWOqQKo0jDS6Inwtb+i84//znY7ed7Uu5LfbPk0Biefkl4ke0d9LZ3fu7y0iQWWUqKGn4YAPDGv3R0y/47XlhHhDaR+D0z7SbmYHx2NQI7fj6iEfEB90PvPhrdDEKHypNoXa5PwOuGSoU0l+yGmuvF5N7/hr82y987pLRjMdJaszs5EM//C+eiyL9mTA8gvOOf3ZHYQ4ITsJpA9I2Q0E6fDQhGS8SDW2ktdZ7z2TIOQsyMuXJKbBeXCgKyjnaX5UWDis8Hpj43CI8Kge32qsqaTKbjf3Mb66nqHrwSdAgMBAAGjggMlMIIDITA5BgNVHREEMjAwghRqb2JzLmJhbmthdWRpLmNvbS5sYoIYd3d3LmpvYnMuYmFua2F1ZGkuY29tLmxiMIIBfQYKKwYBBAHWeQIEAgSCAW0EggFpAWcAdQBVgdTCFpA2AUrqC5tXPFPwwOQ4eHAlCBcvo6odBxPTDAAAAXIIuy40AAAEAwBGMEQCIEByP85HYDmBb/4WK0B6s5L66Owim+Hzf3jiPYvzhw5eAiBsT1ZEn5PuJfBZ9a9Y/TzJ8K9Qx+3+pyJATsPglI4z3AB2AJQgvB6O1Y1siHMfgosiLA3R2k1ebE+UPWHbTi9YTaLCAAABcgi7LlQAAAQDAEcwRQIgOgyG1ORFwA+sDB3cD4fCu25ahSyMi/4d+xvrP+STJxgCIQDXm1WBzc+gQlU/PhpVti+e4j+2MouWIBBvjw3k0/HTtgB2APZclC/RdzAiFFQYCDCUVo7jTRMZM7/fDC8gC8xO8WTjAAABcgi7LqAAAAQDAEcwRQIgaiMkFpZwGZ5Iac/cfTL8v6TbPHUIeSVjTnB1Z2m9gsoCIQCJr+wqJ0UF+FYhxq9ChDfn1Ukg3uVQePrv4WoWNYjOZzAOBgNVHQ8BAf8EBAMCBaAwHQYDVR0lBBYwFAYIKwYBBQUHAwEGCCsGAQUFBwMCMGgGCCsGAQUFBwEBBFwwWjAjBggrBgEFBQcwAYYXaHR0cDovL29jc3AuZW50cnVzdC5uZXQwMwYIKwYBBQUHMAKGJ2h0dHA6Ly9haWEuZW50cnVzdC5uZXQvbDFtLWNoYWluMjU2LmNlcjAzBgNVHR8ELDAqMCigJqAkhiJodHRwOi8vY3JsLmVudHJ1c3QubmV0L2xldmVsMW0uY3JsMEoGA1UdIARDMEEwNgYKYIZIAYb6bAoBAjAoMCYGCCsGAQUFBwIBFhpodHRwOi8vd3d3LmVudHJ1c3QubmV0L3JwYT'
            'AHBgVngQwBATAfBgNVHSMEGDAWgBTD99C1KjCtrw2RIXA5VN28iXDHOjAdBgNVHQ4EFgQUt5uewiz6lN1FGnoOCX/soGsCwoIwCQYDVR0TBAIwADANBgkqhkiG9w0BAQsFAAOCAQEArlnXiyOefAVaQd0jfxtGwzAed4c8EidlBaoebJACR4zlAIFG0r0pXbdHkLZnCkMCL7XvoV+Y27c1I/Tfcket6qr4gDuKKnbUZIdgg8LGU2OklVEfLv1LJi3+tRuGGCfKpzHWoL1FW+3T6YEETGeb1qZrGBE7Its/4WfVAwaBHynSrvdjeQTYuYP8XsvehhfI5PNQbfV3KIH+sOF7sg80C2sIEyxwD+VEfRGeV6nEhJGJdlibAWfNOwQAyRQcGoiVIdLoa9um9UAUugjktJJ/Dk74YyxIf3aX1yjqTANVIuBgSotC8FvUNTmAALL7Ug8fqvJ9sPQhxIataKh/JdrDCQ=='
        )

        # '__CERT__' is a placeholder that is swapped for the certificate
        # right after the literal, keeping the JSON text itself readable.
        line = """{
      "Server":"213.175.166.157",
      "Keyword":"www.arabhra.org",
      "Retries":2,
      "Results":[
        {
         "Sent":"www.arabhra.org",
         "Received":{
           "status_line":"302 Found",
           "headers":{
             "Content-Language":["en"],
             "Content-Type":["text/html; charset=iso-8859-1"],
             "Date":["Fri, 06 Nov 2020 20:24:21 GMT"],
             "Location":[
               "https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp"
             ],
             "Set-Cookie":[
               "BIGipServer~IFRMS-WEB~IFRMS-OHS-HTTPS=rd7o00000000000000000000ffffc0a8fde4o4443; expires=Fri, 06-Nov-2020 21:24:21 GMT; path=/; Httponly; Secure",
               "TS016c74f4=01671efb9a1a400535e215d6f76498a5887425fed793ca942baa75f16076e60e1350988222922fa06fc16f53ef016d9ecd38535fcabf14861525811a7c3459e91086df326f; Path=/"
             ],
             "X-Frame-Options":["SAMEORIGIN"]
            },
            "body":"\\u003c!DOCTYPE HTML PUBLIC \\\"-//IETF//DTD HTML 2.0//EN\\\"\\u003e\\n\\u003cHTML\\u003e\\u003cHEAD\\u003e\\n\\u003cTITLE\\u003e302 Found\\u003c/TITLE\\u003e\\n\\u003c/HEAD\\u003e\\u003cBODY\\u003e\\n\\u003cH1\\u003eFound\\u003c/H1\\u003e\\nThe document has moved \\u003cA HREF=\\\"https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp\\\"\\u003ehere\\u003c/A\\u003e.\\u003cP\\u003e\\n\\u003c/BODY\\u003e\\u003c/HTML\\u003e\\n",
            "tls":{
              "version":771,
              "cipher_suite":49199,
              "cert":"__CERT__"
            }
          },
          "Success":false,
          "Error":"Incorrect web response: status lines don't match",
          "StartTime":"2020-11-06T15:24:21.124508839-05:00",
          "EndTime":"2020-11-06T15:24:21.812075476-05:00"
        }
      ],
      "Blocked":false,
      "FailSanity":false,
      "StatefulBlock":false
    }
    """.replace('__CERT__', cert)
        filename = 'gs://firehook-scans/http/CP_Quack-https-2020-11-06-15-15-31/results.json'

        expected_row: beam_tables.Row = {
            'domain': 'www.arabhra.org',
            'ip': '213.175.166.157',
            'date': '2020-11-06',
            'start_time': '2020-11-06T15:24:21.124508839-05:00',
            'end_time': '2020-11-06T15:24:21.812075476-05:00',
            'retries': 2,
            'sent': 'www.arabhra.org',
            'received_status': '302 Found',
            # The received_body field in the json has a lot of unicode escapes
            # but the interpreted string in the output should not.
            'received_body': '<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<HTML><HEAD>\n<TITLE>302 Found</TITLE>\n</HEAD><BODY>\n<H1>Found</H1>\nThe document has moved <A HREF=\"https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp\">here</A>.<P>\n</BODY></HTML>\n',
            'received_tls_version': 771,
            'received_tls_cipher_suite': 49199,
            'received_tls_cert': cert,
            # One "Name: value" entry per header value, so the two Set-Cookie
            # values become two separate entries.
            'received_headers': [
                'Content-Language: en',
                'Content-Type: text/html; charset=iso-8859-1',
                'Date: Fri, 06 Nov 2020 20:24:21 GMT',
                'Location: https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp',
                'Set-Cookie: BIGipServer~IFRMS-WEB~IFRMS-OHS-HTTPS=rd7o00000000000000000000ffffc0a8fde4o4443; expires=Fri, 06-Nov-2020 21:24:21 GMT; path=/; Httponly; Secure',
                'Set-Cookie: TS016c74f4=01671efb9a1a400535e215d6f76498a5887425fed793ca942baa75f16076e60e1350988222922fa06fc16f53ef016d9ecd38535fcabf14861525811a7c3459e91086df326f; Path=/',
                'X-Frame-Options: SAMEORIGIN',
            ],
            'error': 'Incorrect web response: status lines don\'t match',
            'blocked': False,
            'success': False,
            'fail_sanity': False,
            'stateful_block': False,
            'measurement_id': '',
            'source': 'CP_Quack-https-2020-11-06-15-15-31',
        }
        # yapf: enable

        rows = list(beam_tables._flatten_measurement(filename, line))
        # The input has a single entry in "Results", so a single row is
        # expected. Asserting the count catches extra rows and turns an empty
        # result into a test failure rather than an IndexError.
        self.assertEqual(len(rows), 1)

        row = rows[0]
        # We can't test the measurement id because it's random
        row['measurement_id'] = ''

        self.assertEqual(row, expected_row)
Ejemplo n.º 4
0
    def test_flatten_measurement_http(self) -> None:
        """Test parsing an unsuccessful HTTP measurement.

        The input carries a "Received" object with a status line, headers and
        body plus an "Error"; these are expected to surface as the
        'received_*' and 'error' fields of the flattened row.
        """
        line = """{
      "Server":"184.50.171.225",
      "Keyword":"www.csmonitor.com",
      "Retries":0,
      "Results":[
        {
         "Sent":"www.csmonitor.com",
         "Received":{
            "status_line":"301 Moved Permanently",
            "headers":{
               "Content-Length":["0"],
               "Date":["Sun, 13 Sep 2020 05:10:58 GMT"],
               "Location":["https://www.csmonitor.com/"],
               "Server":["HTTP Proxy/1.0"]
            },
            "body":"test body"
         },
         "Success":false,
         "Error":"Incorrect web response: status lines don't match",
         "StartTime":"2020-09-13T01:10:57.499263112-04:00",
         "EndTime":"2020-09-13T01:10:58.077524926-04:00"
        }
      ],
      "Blocked":true,
      "FailSanity":false,
      "StatefulBlock":false
    }"""

        # yapf: disable
        expected_row: beam_tables.Row = {
            'domain': 'www.csmonitor.com',
            'ip': '184.50.171.225',
            'date': '2020-09-13',
            'start_time': '2020-09-13T01:10:57.499263112-04:00',
            'end_time': '2020-09-13T01:10:58.077524926-04:00',
            'retries': 0,
            'sent': 'www.csmonitor.com',
            'received_status': '301 Moved Permanently',
            'received_body': 'test body',
            'received_headers': [
                'Content-Length: 0',
                'Date: Sun, 13 Sep 2020 05:10:58 GMT',
                'Location: https://www.csmonitor.com/',
                'Server: HTTP Proxy/1.0',
            ],
            'error': 'Incorrect web response: status lines don\'t match',
            'blocked': True,
            'success': False,
            'fail_sanity': False,
            'stateful_block': False,
            'measurement_id': '',
            'source': 'CP_Quack-http-2020-09-13-01-02-07',
        }
        # yapf: enable
        filename = 'gs://firehook-scans/http/CP_Quack-http-2020-09-13-01-02-07/results.json'

        rows = list(beam_tables._flatten_measurement(filename, line))
        # The input has a single entry in "Results", so a single row is
        # expected. Asserting the count catches extra rows and turns an empty
        # result into a test failure rather than an IndexError.
        self.assertEqual(len(rows), 1)

        row = rows[0]
        # We can't test the measurement id because it's random
        row['measurement_id'] = ''
        self.assertEqual(row, expected_row)
Ejemplo n.º 5
0
    def test_flatten_measurement_echo(self) -> None:
        """Test parsing an example Echo measurement.

        The input carries two entries in "Results"; each one is expected to
        come back as its own row, and both rows must share a measurement id.
        """
        filename = 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'
        line = """{
      "Server":"1.2.3.4",
      "Keyword":"www.example.com",
      "Retries":1,
      "Results":[
        {
          "Sent":"GET / HTTP/1.1 Host: www.example.com",
          "Received":"HTTP/1.1 403 Forbidden",
          "Success":false,
          "Error":"Incorrect echo response",
          "StartTime":"2020-09-20T07:45:09.643770291-04:00",
          "EndTime":"2020-09-20T07:45:10.088851843-04:00"
        },
        {
          "Sent":"GET / HTTP/1.1 Host: www.example.com",
          "Received": "HTTP/1.1 503 Service Unavailable",
          "Success":false,
          "Error":"Incorrect echo response",
          "StartTime":"2020-09-20T07:45:16.170427683-04:00",
          "EndTime":"2020-09-20T07:45:16.662093893-04:00"
        }
      ],
      "Blocked":true,
      "FailSanity":false,
      "StatefulBlock":false
    }"""

        # yapf: disable
        # The two rows differ only in their timestamps and received status.
        expected_rows: List[beam_tables.Row] = [
            {
                'domain': 'www.example.com',
                'ip': '1.2.3.4',
                'date': '2020-09-20',
                'start_time': '2020-09-20T07:45:09.643770291-04:00',
                'end_time': '2020-09-20T07:45:10.088851843-04:00',
                'retries': 1,
                'sent': 'GET / HTTP/1.1 Host: www.example.com',
                'received_status': 'HTTP/1.1 403 Forbidden',
                'error': 'Incorrect echo response',
                'blocked': True,
                'success': False,
                'fail_sanity': False,
                'stateful_block': False,
                'measurement_id': '',
                'source': 'CP_Quack-echo-2020-08-23-06-01-02',
            },
            {
                'domain': 'www.example.com',
                'ip': '1.2.3.4',
                'date': '2020-09-20',
                'start_time': '2020-09-20T07:45:16.170427683-04:00',
                'end_time': '2020-09-20T07:45:16.662093893-04:00',
                'retries': 1,
                'sent': 'GET / HTTP/1.1 Host: www.example.com',
                'received_status': 'HTTP/1.1 503 Service Unavailable',
                'error': 'Incorrect echo response',
                'blocked': True,
                'success': False,
                'fail_sanity': False,
                'stateful_block': False,
                'measurement_id': '',
                'source': 'CP_Quack-echo-2020-08-23-06-01-02',
            },
        ]
        # yapf: enable

        rows = list(beam_tables._flatten_measurement(filename, line))
        self.assertEqual(len(rows), 2)

        first, second = rows
        # Both rows come from one measurement, so their ids must match
        self.assertEqual(first['measurement_id'], second['measurement_id'])
        # The ids are randomly generated, so blank them out before
        # comparing against the full expected rows.
        for row in rows:
            row['measurement_id'] = ''

        self.assertListEqual(rows, expected_rows)