def test_generate_reports_with_chunking_and_copying_header(self):
    """Chunked generate_reports must prepend the header line to every chunk."""
    chunk_size = 1000
    # The test is meaningless unless the data actually spans several chunks.
    self.assertTrue(chunk_size < len(csv_test_data))

    template = Report(harmonization=HARM)
    template.add("feed.name", "test_generate_reports_with_chunking_and_header")
    observation_time = template["time.observation"]
    original_header = io.BytesIO(csv_test_data).readline()

    decoded_chunks = [original_header]
    reports = generate_reports(template, io.BytesIO(csv_test_data),
                               chunk_size=chunk_size, copy_header_line=True)
    for report in reports:
        self.assertEqual(report["feed.name"],
                         "test_generate_reports_with_chunking_and_header")
        self.assertEqual(report["time.observation"], observation_time)
        payload = io.BytesIO(base64.b64decode(report["raw"]))
        # Every chunk must repeat the original header line verbatim.
        self.assertEqual(original_header, payload.readline())
        decoded_chunks.append(payload.read())
    # Dropping the repeated headers and re-joining restores the input.
    self.assertEqual(b"".join(decoded_chunks), csv_test_data)
def process(self):
    """Scan the configured directory and emit each matching file as report(s).

    Files whose name matches ``*<postfix>`` are read, split into report
    chunks and sent.  When ``delete_file`` is set, a file that cannot be
    removed afterwards stops the bot to avoid re-reading it.
    """
    self.logger.debug("Started looking for files.")
    if not os.path.isdir(self.parameters.path):
        return
    directory = os.path.abspath(self.parameters.path)
    pattern = '*' + self.parameters.postfix
    # Iterate over all regular files in the directory.
    for entry in os.listdir(directory):
        filename = os.path.join(directory, entry)
        if not os.path.isfile(filename):
            continue
        if not fnmatch.fnmatch(entry, pattern):
            continue
        self.logger.info("Processing file %r.", filename)
        template = self.new_report()
        template.add("feed.url", "file://localhost%s" % filename)
        with open(filename, 'rb') as fh:
            for report in generate_reports(template, fh, self.chunk_size,
                                           self.chunk_replicate_header):
                self.send_message(report)
        if self.parameters.delete_file:
            try:
                os.remove(filename)
                self.logger.debug("Deleted file: %r.", filename)
            except PermissionError:
                self.logger.error("Could not delete file %r.", filename)
                self.logger.info("Maybe I don't have sufficient rights on that file?")
                self.logger.error("Stopping now, to prevent reading this file again.")
                self.stop()
def process(self):
    """Scan the configured directory and emit each matching file as report(s).

    Files whose name matches ``*<postfix>`` are read, split into report
    chunks and sent with ``feed.url`` and ``extra.file_name`` set.  When
    ``delete_file`` is set, a file that cannot be removed afterwards
    stops the bot to avoid re-reading it.
    """
    self.logger.debug("Started looking for files.")
    if not os.path.isdir(self.parameters.path):
        return
    directory = os.path.abspath(self.parameters.path)
    pattern = '*' + self.parameters.postfix
    # Iterate over all regular files in the directory.
    for entry in os.listdir(directory):
        filename = os.path.join(directory, entry)
        if not os.path.isfile(filename):
            continue
        if not fnmatch.fnmatch(entry, pattern):
            continue
        self.logger.info("Processing file %r.", filename)
        template = self.new_report()
        template.add("feed.url", "file://localhost%s" % filename)
        template.add("extra.file_name", entry)
        with open(filename, 'rb') as fh:
            for report in generate_reports(template, fh, self.chunk_size,
                                           self.chunk_replicate_header):
                self.send_message(report)
        if self.parameters.delete_file:
            try:
                os.remove(filename)
                self.logger.debug("Deleted file: %r.", filename)
            except PermissionError:
                self.logger.error("Could not delete file %r.", filename)
                self.logger.info("Maybe I don't have sufficient rights on that file?")
                self.logger.error("Stopping now, to prevent reading this file again.")
                self.stop()
def process_message(self, uid, message):
    """Download report URLs found in *message* and emit the payloads.

    Each plain-text body part is searched for ``url_regex``; the first
    match per part is fetched via the shared session and split into
    reports.

    Returns True when the message may be marked as seen, False otherwise
    (unless ``error_procedure`` is 'pass', which marks it seen anyway).
    """
    erroneous = False  # If errors occurred this will be set to true.
    seen = False

    for body in message.body['plain']:
        match = re.search(self.parameters.url_regex,
                          str(body.decode('utf-8') if isinstance(body, bytes) else body))
        if match:
            url = match.group()
            # strip leading and trailing spaces, newlines and
            # carriage returns
            url = url.strip()
            self.logger.info("Downloading report from %r.", url)
            try:
                resp = self.session.get(url=url)
            except requests.exceptions.Timeout:
                # Lazy %-args: let logging interpolate instead of eager '%'.
                self.logger.error("Request timed out %i times in a row. ",
                                  self.http_timeout_max_tries)
                erroneous = True
                # The download timed out too often, leave the Loop.
                continue

            if resp.status_code // 100 != 2:
                self.logger.error('HTTP response status code was %s.',
                                  resp.status_code)
                erroneous = True
                continue

            if not resp.content:
                # Fixed typo in log message: 'reponse' -> 'response'.
                self.logger.warning('Got empty response from server.')
            else:
                self.logger.info("Report downloaded.")
                template = self.new_report()
                template["feed.url"] = url
                template["extra.email_subject"] = message.subject
                template["extra.email_from"] = ','.join(x['email'] for x in message.sent_from)
                template["extra.email_message_id"] = message.message_id
                template["extra.file_name"] = file_name_from_response(resp)
                for report in generate_reports(template, io.BytesIO(resp.content),
                                               self.chunk_size,
                                               self.chunk_replicate_header):
                    self.send_message(report)
            seen = True

    if not erroneous:
        self.logger.info("Email report read.")
    else:
        if self.parameters.error_procedure == 'pass':
            seen = True
        else:
            self.logger.error("Email report read with above errors, the report was not processed.")
    return seen
def test_generate_reports_no_chunking(self):
    """Without chunking, generate_reports yields exactly one full report."""
    template = Report(harmonization=HARM)
    template.add("feed.name", "test_generate_reports_no_chunking")
    # Unpacking asserts that exactly one report is produced.
    report, = generate_reports(template, io.BytesIO(csv_test_data),
                               chunk_size=None, copy_header_line=False)
    self.assertEqual(report["feed.name"], "test_generate_reports_no_chunking")
    # The raw payload round-trips through base64 unchanged.
    self.assertEqual(base64.b64decode(report["raw"]), csv_test_data)
def process(self):
    """Poll the IMAP mailbox and collect reports from URLs in unread mail.

    Iterates over unread messages in the configured folder, filters by
    ``subject_regex``, downloads the first ``url_regex`` match of each
    plain-text body part and emits the payload as report(s).  Relevant
    messages are marked as seen.

    Raises ValueError on a non-2xx HTTP response.
    """
    mailbox = imbox.Imbox(self.parameters.mail_host,
                          self.parameters.mail_user,
                          self.parameters.mail_password,
                          self.parameters.mail_ssl)
    emails = mailbox.messages(folder=self.parameters.folder, unread=True)

    if emails:
        for uid, message in emails:
            # Raw string literal: '\s' in a normal string is an invalid
            # escape sequence (DeprecationWarning since Python 3.6).
            if (self.parameters.subject_regex and
                    not re.search(self.parameters.subject_regex,
                                  re.sub(r"\r\n\s", " ", message.subject))):
                continue

            for body in message.body['plain']:
                match = re.search(self.parameters.url_regex, str(body))
                if match:
                    url = match.group()
                    # strip leading and trailing spaces, newlines and
                    # carriage returns
                    url = url.strip()
                    self.logger.info("Downloading report from %r.", url)
                    resp = requests.get(url=url, auth=self.auth,
                                        proxies=self.proxy,
                                        headers=self.http_header,
                                        verify=self.http_verify_cert,
                                        cert=self.ssl_client_cert,
                                        timeout=self.http_timeout)

                    if resp.status_code // 100 != 2:
                        raise ValueError('HTTP response status code was {}.'
                                         ''.format(resp.status_code))

                    self.logger.info("Report downloaded.")

                    template = self.new_report()

                    for report in generate_reports(template,
                                                   io.BytesIO(resp.content),
                                                   self.chunk_size,
                                                   self.chunk_replicate_header):
                        self.send_message(report)

                    # Only mark read if message relevant to this instance,
                    # so other instances watching this mailbox will still
                    # check it.
                    mailbox.mark_seen(uid)
                    self.logger.info("Email report read.")
    mailbox.logout()
def test_generate_reports_with_chunking_no_header(self):
    """Chunked generate_reports without header copying must re-join to the input."""
    template = Report(harmonization=HARM)
    template.add("feed.name", "test_generate_reports_with_chunking")
    chunk_size = 1000
    # The test is meaningless unless the data actually spans several chunks.
    self.assertTrue(chunk_size < len(csv_test_data))

    pieces = []
    reports = generate_reports(template, io.BytesIO(csv_test_data),
                               chunk_size=chunk_size, copy_header_line=False)
    for report in reports:
        self.assertEqual(report["feed.name"],
                         "test_generate_reports_with_chunking")
        pieces.append(base64.b64decode(report["raw"]))
    # Concatenating all decoded chunks restores the original data.
    self.assertEqual(b"".join(pieces), csv_test_data)
def process(self):
    """Poll the IMAP mailbox and collect reports from URLs in unread mail.

    Downloads are retried up to ``http_timeout_max_tries`` times.  A
    message whose downloads all failed is left unread so the next run
    can retry it.

    Raises ValueError on a non-2xx HTTP response.
    """
    mailbox = self.connect_mailbox()
    emails = mailbox.messages(folder=self.parameters.folder, unread=True)

    if emails:
        for uid, message in emails:
            if (self.parameters.subject_regex and
                    not re.search(self.parameters.subject_regex,
                                  re.sub(r"\r\n\s", " ", message.subject))):
                continue

            erroneous = False  # If errors occurred this will be set to true.

            for body in message.body['plain']:
                match = re.search(self.parameters.url_regex, str(body))
                if match:
                    url = match.group()
                    # strip leading and trailing spaces, newlines and
                    # carriage returns
                    url = url.strip()
                    self.logger.info("Downloading report from %r.", url)
                    timeoutretries = 0
                    resp = None
                    while timeoutretries < self.http_timeout_max_tries and resp is None:
                        try:
                            resp = requests.get(url=url, auth=self.auth,
                                                proxies=self.proxy,
                                                headers=self.http_header,
                                                verify=self.http_verify_cert,
                                                cert=self.ssl_client_cert,
                                                timeout=self.http_timeout_sec)
                        except requests.exceptions.Timeout:
                            timeoutretries += 1
                            # logger.warn is a deprecated alias of warning().
                            self.logger.warning("Timeout whilst downloading the report.")

                    if resp is None and timeoutretries >= self.http_timeout_max_tries:
                        # Lazy %-args instead of eager '%' interpolation.
                        self.logger.error("Request timed out %i times in a row. ",
                                          timeoutretries)
                        erroneous = True
                        # The download timed out too often, leave the Loop.
                        continue

                    if resp.status_code // 100 != 2:
                        raise ValueError('HTTP response status code was {}.'
                                         ''.format(resp.status_code))

                    self.logger.info("Report downloaded.")

                    template = self.new_report()

                    for report in generate_reports(template,
                                                   io.BytesIO(resp.content),
                                                   self.chunk_size,
                                                   self.chunk_replicate_header):
                        self.send_message(report)

                    # Only mark read if message relevant to this instance,
                    # so other instances watching this mailbox will still
                    # check it.
                    try:
                        mailbox.mark_seen(uid)
                    except imaplib.IMAP4.abort:
                        # Bug fix: imaplib has no module-level 'abort';
                        # the exception is defined on the IMAP4 class, so
                        # 'except imaplib.abort' raised AttributeError.
                        # Disconnect, see https://github.com/certtools/intelmq/issues/852
                        mailbox = self.connect_mailbox()
                        mailbox.mark_seen(uid)

            if not erroneous:
                self.logger.info("Email report read.")
            else:
                self.logger.error("Email report read with errors, the report was not processed.")
    mailbox.logout()
def process(self):
    """Poll the IMAP mailbox and collect reports from URLs in unread mail.

    Downloads are retried up to ``http_timeout_max_tries`` times.  A
    message whose downloads all failed is still marked seen here (this
    variant has no reconnect handling); errors are logged.

    Raises ValueError on a non-2xx HTTP response.
    """
    mailbox = imbox.Imbox(self.parameters.mail_host,
                          self.parameters.mail_user,
                          self.parameters.mail_password,
                          self.parameters.mail_ssl)
    emails = mailbox.messages(folder=self.parameters.folder, unread=True)

    if emails:
        for uid, message in emails:
            # Raw string literal: '\s' in a normal string is an invalid
            # escape sequence (DeprecationWarning since Python 3.6).
            if (self.parameters.subject_regex and
                    not re.search(self.parameters.subject_regex,
                                  re.sub(r"\r\n\s", " ", message.subject))):
                continue

            erroneous = False  # If errors occurred this will be set to true.

            for body in message.body['plain']:
                match = re.search(self.parameters.url_regex, str(body))
                if match:
                    url = match.group()
                    # strip leading and trailing spaces, newlines and
                    # carriage returns
                    url = url.strip()
                    self.logger.info("Downloading report from %r.", url)
                    timeoutretries = 0
                    resp = None
                    while timeoutretries < self.http_timeout_max_tries and resp is None:
                        try:
                            resp = requests.get(url=url, auth=self.auth,
                                                proxies=self.proxy,
                                                headers=self.http_header,
                                                verify=self.http_verify_cert,
                                                cert=self.ssl_client_cert,
                                                timeout=self.http_timeout_sec)
                        except requests.exceptions.Timeout:
                            timeoutretries += 1
                            # logger.warn is a deprecated alias of warning().
                            self.logger.warning("Timeout whilst downloading the report.")

                    if resp is None and timeoutretries >= self.http_timeout_max_tries:
                        # Lazy %-args instead of eager '%' interpolation.
                        self.logger.error("Request timed out %i times in a row. ",
                                          timeoutretries)
                        erroneous = True
                        # The download timed out too often, leave the Loop.
                        continue

                    if resp.status_code // 100 != 2:
                        raise ValueError('HTTP response status code was {}.'
                                         ''.format(resp.status_code))

                    self.logger.info("Report downloaded.")

                    template = self.new_report()

                    for report in generate_reports(template,
                                                   io.BytesIO(resp.content),
                                                   self.chunk_size,
                                                   self.chunk_replicate_header):
                        self.send_message(report)

                    # Only mark read if message relevant to this instance,
                    # so other instances watching this mailbox will still
                    # check it.
                    mailbox.mark_seen(uid)

            if not erroneous:
                self.logger.info("Email report read.")
            else:
                self.logger.error("Email report read with errors, the report was not processed.")
    mailbox.logout()
def process(self):
    """Poll the IMAP mailbox and collect reports from URLs in unread mail.

    Optionally filters by sender/recipient (``sent_from``/``sent_to``)
    and subject regex.  Downloads are retried up to
    ``http_timeout_max_tries`` times; a message whose downloads all
    failed is left unread so the next run can retry it.

    Raises ValueError on a non-2xx HTTP response.
    """
    mailbox = self.connect_mailbox()
    emails = mailbox.messages(folder=self.parameters.folder, unread=True,
                              sent_to=getattr(self.parameters, "sent_to", None),
                              sent_from=getattr(self.parameters, "sent_from", None))

    if emails:
        for uid, message in emails:
            if (self.parameters.subject_regex and
                    not re.search(self.parameters.subject_regex,
                                  re.sub(r"\r\n\s", " ", message.subject))):
                self.logger.debug("Message with date %s skipped because subject %r does not match.",
                                  message.date, message.subject)
                continue

            erroneous = False  # If errors occurred this will be set to true.

            for body in message.body['plain']:
                match = re.search(self.parameters.url_regex, str(body))
                if match:
                    url = match.group()
                    # strip leading and trailing spaces, newlines and
                    # carriage returns
                    url = url.strip()
                    self.logger.info("Downloading report from %r.", url)
                    timeoutretries = 0
                    resp = None
                    while timeoutretries < self.http_timeout_max_tries and resp is None:
                        try:
                            resp = requests.get(url=url, auth=self.auth,
                                                proxies=self.proxy,
                                                headers=self.http_header,
                                                verify=self.http_verify_cert,
                                                cert=self.ssl_client_cert,
                                                timeout=self.http_timeout_sec)
                        except requests.exceptions.Timeout:
                            timeoutretries += 1
                            # logger.warn is a deprecated alias of warning().
                            self.logger.warning("Timeout whilst downloading the report.")

                    if resp is None and timeoutretries >= self.http_timeout_max_tries:
                        # Lazy %-args instead of eager '%' interpolation.
                        self.logger.error("Request timed out %i times in a row. ",
                                          timeoutretries)
                        erroneous = True
                        # The download timed out too often, leave the Loop.
                        continue

                    if resp.status_code // 100 != 2:
                        raise ValueError('HTTP response status code was {}.'
                                         ''.format(resp.status_code))

                    if not resp.content:
                        # Fixed typo in log message: 'reponse' -> 'response'.
                        self.logger.warning('Got empty response from server.')
                    else:
                        self.logger.info("Report downloaded.")

                        template = self.new_report()

                        for report in generate_reports(template,
                                                       io.BytesIO(resp.content),
                                                       self.chunk_size,
                                                       self.chunk_replicate_header):
                            self.send_message(report)

                    # Only mark read if message relevant to this instance,
                    # so other instances watching this mailbox will still
                    # check it.
                    try:
                        mailbox.mark_seen(uid)
                    except imaplib.IMAP4.abort:
                        # Bug fix: imaplib has no module-level 'abort';
                        # the exception is defined on the IMAP4 class, so
                        # 'except imaplib.abort' raised AttributeError.
                        # Disconnect, see https://github.com/certtools/intelmq/issues/852
                        mailbox = self.connect_mailbox()
                        mailbox.mark_seen(uid)

            if not erroneous:
                self.logger.info("Email report read.")
            else:
                self.logger.error("Email report read with errors, the report was not processed.")
    else:
        self.logger.debug("No unread mails to check.")
    mailbox.logout()
def process_message(self, uid, message):
    """Download report URLs found in *message* and emit the payloads.

    Each plain-text body part is searched for ``url_regex``; the first
    match per part is fetched with up to ``http_timeout_max_tries``
    attempts and split into reports.

    Returns True when the message may be marked as seen, False otherwise
    (unless ``error_procedure`` is 'pass', which marks it seen anyway).
    """
    erroneous = False  # If errors occurred this will be set to true.
    seen = False

    for body in message.body['plain']:
        match = re.search(
            self.parameters.url_regex,
            str(body.decode('utf-8') if isinstance(body, bytes) else body))
        if match:
            url = match.group()
            # strip leading and trailing spaces, newlines and
            # carriage returns
            url = url.strip()
            self.logger.info("Downloading report from %r.", url)
            timeoutretries = 0
            resp = None
            while timeoutretries < self.http_timeout_max_tries and resp is None:
                try:
                    resp = requests.get(url=url, auth=self.auth,
                                        proxies=self.proxy,
                                        headers=self.http_header,
                                        verify=self.http_verify_cert,
                                        cert=self.ssl_client_cert,
                                        timeout=self.http_timeout_sec)
                except requests.exceptions.Timeout:
                    timeoutretries += 1
                    # logger.warn is a deprecated alias of warning().
                    self.logger.warning("Timeout whilst downloading the report.")

            if resp is None and timeoutretries >= self.http_timeout_max_tries:
                # Lazy %-args instead of eager '%' interpolation.
                self.logger.error("Request timed out %i times in a row. ",
                                  timeoutretries)
                erroneous = True
                # The download timed out too often, leave the Loop.
                continue

            if resp.status_code // 100 != 2:
                self.logger.error('HTTP response status code was %s.',
                                  resp.status_code)
                erroneous = True
                continue

            if not resp.content:
                # Fixed typo in log message: 'reponse' -> 'response'.
                self.logger.warning('Got empty response from server.')
            else:
                self.logger.info("Report downloaded.")
                template = self.new_report()
                for report in generate_reports(template,
                                               io.BytesIO(resp.content),
                                               self.chunk_size,
                                               self.chunk_replicate_header):
                    self.send_message(report)
            seen = True

    if not erroneous:
        self.logger.info("Email report read.")
    else:
        if self.parameters.error_procedure == 'pass':
            seen = True
        else:
            self.logger.error(
                "Email report read with above errors, the report was not processed."
            )
    return seen
def process_message(self, uid, message):
    """Download report URLs found in *message* and emit the payloads.

    Each plain-text body part is searched for ``url_regex``; the first
    match per part is fetched with up to ``http_timeout_max_tries``
    attempts and split into reports.

    Returns True when the message may be marked as seen, False otherwise
    (unless ``error_procedure`` is 'pass', which marks it seen anyway).
    """
    erroneous = False  # If errors occurred this will be set to true.
    seen = False

    for body in message.body['plain']:
        match = re.search(self.parameters.url_regex,
                          str(body.decode('utf-8') if isinstance(body, bytes) else body))
        if match:
            url = match.group()
            # strip leading and trailing spaces, newlines and
            # carriage returns
            url = url.strip()
            self.logger.info("Downloading report from %r.", url)
            timeoutretries = 0
            resp = None
            while timeoutretries < self.http_timeout_max_tries and resp is None:
                try:
                    resp = requests.get(url=url, auth=self.auth,
                                        proxies=self.proxy,
                                        headers=self.http_header,
                                        verify=self.http_verify_cert,
                                        cert=self.ssl_client_cert,
                                        timeout=self.http_timeout_sec)
                except requests.exceptions.Timeout:
                    timeoutretries += 1
                    # logger.warn is a deprecated alias of warning().
                    self.logger.warning("Timeout whilst downloading the report.")

            if resp is None and timeoutretries >= self.http_timeout_max_tries:
                # Lazy %-args instead of eager '%' interpolation.
                self.logger.error("Request timed out %i times in a row. ",
                                  timeoutretries)
                erroneous = True
                # The download timed out too often, leave the Loop.
                continue

            if resp.status_code // 100 != 2:
                self.logger.error('HTTP response status code was %s.',
                                  resp.status_code)
                erroneous = True
                continue

            if not resp.content:
                # Fixed typo in log message: 'reponse' -> 'response'.
                self.logger.warning('Got empty response from server.')
            else:
                self.logger.info("Report downloaded.")
                template = self.new_report()
                for report in generate_reports(template,
                                               io.BytesIO(resp.content),
                                               self.chunk_size,
                                               self.chunk_replicate_header):
                    self.send_message(report)
            seen = True

    if not erroneous:
        self.logger.info("Email report read.")
    else:
        if self.parameters.error_procedure == 'pass':
            seen = True
        else:
            self.logger.error("Email report read with above errors, the report was not processed.")
    return seen