def _extract_zip_file(self):
    """
    Extracts the eligible members of the ZIP file at self._zip_path into
    self._extract_dir and appends one Attachment per extracted file to
    self._new_attachments.

    Members rejected by self._is_file_eligible and directory entries are
    skipped. Files detected as text are read as UTF-8 and have their
    newlines / tabs replaced with spaces; everything else is stored as
    binary content.
    """
    extracted_files = []

    with ZipFile(self._zip_path) as zip_file:
        for member in zip_file.infolist():
            if not self._is_file_eligible(member.filename):
                continue
            # Directory entries can't be opened as files further below
            if member.is_dir():
                continue
            # extract() sanitises the member name (absolute paths, ".."),
            # so the path it returns is the only reliable location of the
            # extracted file - don't rebuild it with os.path.join().
            extracted_path = zip_file.extract(member, self._extract_dir)
            extracted_files.append(extracted_path)

    for extracted_path in extracted_files:
        mime = self._mime_type.guess_type(extracted_path)[0]
        file_format = Attachment.guess_format_by_mime_type(mime)
        file_in_zip = os.path.split(extracted_path)[1]

        if file_format == AttachmentFormat.text:
            with open(extracted_path, "r", encoding="utf-8") as extracted_file:
                file_content = extracted_file.read()
            file_content = file_content.replace("\n", " ")
            file_content = file_content.replace("\t", " ")
            unzip_attachment = Attachment(p_name=file_in_zip,
                                          p_format=AttachmentFormat.text,
                                          p_text_content=file_content)
        else:
            with open(extracted_path, "rb") as extracted_file:
                file_content = extracted_file.read()
            unzip_attachment = Attachment(p_name=file_in_zip,
                                          p_format=AttachmentFormat.binary,
                                          p_binary_content=file_content)

        self._new_attachments.append(unzip_attachment)
def download_links_in_html_as_attachments(self,
                                          p_html: str,
                                          p_extensions: List[str],
                                          p_timeout: float = 30.0):
    """
    Scans the given HTML, finds links, downloads the files
    and saves them as attachments.

    Only URL's containing "." + one of p_extensions (case insensitive)
    are downloaded. Depending on the file extension, the download is
    stored either as a text or as a binary attachment in self.attachments.

    p_timeout is the timeout (in seconds) passed to requests for each
    download, so that an unresponsive server can't block forever.
    """
    # Build clean HTML
    if not p_html:
        return

    clean_html = p_html.replace("\r", "").replace("\n", "")
    html_tag_pos = clean_html.lower().find("<html")
    if html_tag_pos < 0:
        return
    clean_html = clean_html[html_tag_pos:]

    # Extract URL's
    extractor = URLExtract()
    urls = extractor.find_urls(clean_html)

    # Download as necessary
    low_extensions = ["." + extension.lower() for extension in p_extensions]

    for url in urls:
        low_url = url.lower()
        if not any(low_extension in low_url for low_extension in low_extensions):
            continue

        if "urldefense.com" in url:
            # The real address is embedded after the last "http"
            real_http_pos = low_url.rfind("http")
            clean_url = url[real_http_pos:].replace("__", "")
        else:
            clean_url = url

        # endswith() is safe on an empty string, unlike clean_url[-1]
        if clean_url.endswith("/"):
            clean_url = clean_url[:-1]
        if not clean_url:
            continue

        filename = os.path.basename(clean_url)
        extension = os.path.splitext(filename)[1].replace(".", "")
        file_format = Attachment.guess_format_by_file_extension(extension)

        # NOTE(review): the HTTP status is not checked, so an error page
        # would be stored as an attachment - confirm if that is intended.
        response = requests.get(clean_url,
                                allow_redirects=True,
                                timeout=p_timeout)

        if file_format == AttachmentFormat.text:
            downloaded_attachment = Attachment(
                p_name=filename,
                p_format=AttachmentFormat.text,
                p_text_content=response.text)
        else:
            downloaded_attachment = Attachment(
                p_name=filename,
                p_format=AttachmentFormat.binary,
                p_binary_content=response.content)

        self.attachments.append(downloaded_attachment)
def _pull(self) -> List[AbstractPassenger]:
    """
    Reads the inbox of self.account (ordered by '-datetime_received') and
    returns one Email passenger per inbox item.

    Attachments whose name or content can't be read, or is None, are
    skipped. The attachment format is guessed from the content type when
    present, otherwise from the file name. If self._email_decorator is
    set, it is called with the Exchange item and the new passenger.
    """
    output = []

    for item in self.account.inbox.all().order_by('-datetime_received'):  # pylint: disable=E1101
        email_passenger = Email(
            p_external_id=item.message_id,
            p_internal_id=uuid1(),
            p_source_system=AbstractExchange._SOURCE_SYSTEM,
            p_attachments=[],
            p_puller_module=self.__module__,
            p_pull_datetime=datetime.now(),
            p_passenger_module=self.email_module)

        for item_attachment in item.attachments:
            # Reading these attributes may fail for some attachments;
            # such attachments are deliberately skipped (best effort).
            try:
                attachment_name = item_attachment.name
                attachment_content = item_attachment.content
            except Exception:  # pylint: disable=W0703
                continue

            if attachment_name is None or attachment_content is None:
                continue

            content_type = item_attachment.content_type
            if content_type is None or content_type == "":
                attachment_format = Attachment.guess_format_by_file_name(
                    attachment_name)
            else:
                attachment_format = Attachment.guess_format_by_mime_type(
                    content_type)

            if attachment_format == AttachmentFormat.text:
                # str(b"abc")[2:] would give "abc'" (repr leftovers and
                # escaped newlines), so decode the bytes properly instead.
                # NOTE(review): UTF-8 is assumed; undecodable bytes are
                # replaced rather than raising.
                if isinstance(attachment_content, (bytes, bytearray)):
                    text_content = attachment_content.decode("utf-8",
                                                             errors="replace")
                else:
                    text_content = str(attachment_content)

                passenger_attachment = Attachment(
                    p_name=attachment_name,
                    p_format=AttachmentFormat.text,
                    p_text_content=text_content)
            else:
                passenger_attachment = Attachment(
                    p_name=attachment_name,
                    p_format=AttachmentFormat.binary,
                    p_binary_content=attachment_content)

            email_passenger.attachments.append(passenger_attachment)

        if self._email_decorator is not None:
            self._email_decorator(item, email_passenger)

        self.log.append_text("Got mail from Exchange: " + email_passenger.id_text)
        output.append(email_passenger)

    return output
def pull(self) -> List[DemoPassenger1]:
    """
    Simulates a pull from an imaginary source system.

    Returns two demo passengers; only the first one carries a text
    attachment. Each passenger is logged after it is collected.
    """
    def _new_passenger(external_id: str, dataset: str) -> DemoPassenger1:
        # Builds a demo passenger with the values shared by both samples
        passenger = DemoPassenger1()
        passenger.external_id = external_id
        passenger.dataset = dataset
        passenger.source_system = "DEMO_SYSTEM"
        passenger.puller_module = self.__module__
        return passenger

    pulled = []

    # First passenger: has a sample text file attached
    first = _new_passenger("ID_1_1", "Puller 1 pulled first DemoPassenger1")
    sample_file = Attachment(p_name="puller1_file1.txt",
                             p_format=AttachmentFormat.text,
                             p_text_content="Lorem Ipsum")
    first.attachments.append(sample_file)
    pulled.append(first)
    self.log.append_text("Got passenger " + first.id_text)

    # Second passenger: no attachments
    second = _new_passenger("ID_1_2", "Puller 1 pulled second DemoPassenger1")
    pulled.append(second)
    self.log.append_text("Got passenger " + second.id_text)

    return pulled
def _download(self, url, timeout: float = 30.0):
    """
    Downloads the file behind the given URL and appends it to
    self._attachments.

    The attachment is named after the last path segment of the URL. Its
    file extension decides whether the response is stored as text or as
    binary content.

    timeout is the timeout (in seconds) passed to requests, so that an
    unresponsive server can't block forever.
    """
    filename = os.path.basename(url)
    extension = os.path.splitext(filename)[1].replace(".", "")
    file_format = Attachment.guess_format_by_file_extension(extension)

    # NOTE(review): the HTTP status is not checked, so an error page
    # would be stored as an attachment - confirm if that is intended.
    response = requests.get(url, allow_redirects=True, timeout=timeout)

    if file_format == AttachmentFormat.text:
        downloaded_attachment = Attachment(p_name=filename,
                                           p_format=AttachmentFormat.text,
                                           p_text_content=response.text)
    else:
        downloaded_attachment = Attachment(
            p_name=filename,
            p_format=AttachmentFormat.binary,
            p_binary_content=response.content)

    self._attachments.append(downloaded_attachment)
def pull(self) -> List[DemoBinaryPassenger]:
    """
    Simulates a pull from an imaginary source system.

    Returns a single demo passenger carrying the sample binary file
    as its only attachment.
    """
    binary_passenger = DemoBinaryPassenger()
    binary_passenger.external_id = "ID_BIN_1"
    binary_passenger.source_system = "DEMO_SYSTEM"
    binary_passenger.puller_module = self.__module__

    sample_attachment = Attachment(
        p_name=DemoBinaryPuller._BIN_FILE_NAME,
        p_format=AttachmentFormat.binary,
        p_binary_content=DemoBinaryPuller._get_sample_binary())
    binary_passenger.attachments.append(sample_attachment)

    pulled = [binary_passenger]
    self.log.append_text(f"Got passenger {binary_passenger.id_text}")
    return pulled
def pull(self) -> List[AbstractPassenger]:
    """
    Pulls the E-Mails of every account in self.exchange_accounts.

    Each pulled E-Mail gets an extra (artificial) attachment containing
    the alias of its account as JSON, and is then added to the result.
    """
    collected = []

    for account in self.exchange_accounts:
        for email in account.puller.pull():
            # Tag the E-Mail with the alias of the account it came from
            alias_json = json.dumps({"alias": account.alias})
            alias_attachment = Attachment(
                p_name=AbstractMultiExchange._ARTIFICIAL_ATTACHMENT_FILE,
                p_text_content=alias_json)
            email.attachments.append(alias_attachment)
            collected.append(email)

    return collected
def pull(self) -> List[DemoPassenger1]:
    """
    Fake operation.

    Returns a single demo passenger carrying one sample text attachment.
    """
    demo_passenger = DemoPassenger1()
    demo_passenger.external_id = "ID_3_1"
    demo_passenger.dataset = "Puller 3 pulled first DemoPassenger1"
    demo_passenger.source_system = "DEMO_SYSTEM"
    demo_passenger.puller_module = self.__module__

    sample_file = Attachment(p_name="puller3_file1.txt",
                             p_format=AttachmentFormat.text,
                             p_text_content="Lorem Ipsum")
    demo_passenger.attachments.append(sample_file)

    pulled = [demo_passenger]
    self.log.append_text(f"Got passenger {demo_passenger.id_text}")
    return pulled
def _get_attachment_obj(self,
                        p_internal_id: str,
                        p_attachment_json: dict) -> Attachment:
    """
    Builds an Attachment from its JSON description ("name" and "format"
    keys) and fills its content by reading the attachment file from disk.

    Raises AttachmentError (invalid_format) if the format is neither
    text nor binary.
    """
    attachment_name = p_attachment_json["name"]
    attachment_format = AttachmentFormat[p_attachment_json["format"]]
    attachment = Attachment(p_name=attachment_name,
                            p_format=attachment_format)

    disk_path = self._path.get_attachment_file_path(p_internal_id,
                                                    attachment.name)
    self._log.append_text(f"Reading attachment from disk: {disk_path}")

    if attachment.format == AttachmentFormat.text:
        with open(disk_path, "r", encoding="utf-8") as source:
            attachment.text_content = source.read()
        return attachment

    if attachment.format == AttachmentFormat.binary:
        with open(disk_path, "rb") as source:
            attachment.binary_content = source.read()
        return attachment

    raise AttachmentError(AttachmentError.ErrorCode.invalid_format,
                          attachment.format)
def _pull(self) -> List[AbstractPassenger]:  # pylint: disable=R0912
    """
    Reads the inbox of self.account (ordered by '-datetime_received') and
    returns the E-Mails as Email passengers.

    Attachments whose name or content can't be read, or is None, are
    skipped. The attachment format is guessed from the content type when
    present, otherwise from the file name.

    If self._email_decorator is set, it is called with the Exchange item
    and the new passenger; a truthy return value eliminates the E-Mail
    from the output. An error while processing one item is logged and
    doesn't prevent the remaining items from being processed.
    """
    output = []

    for item in self.account.inbox.all().order_by('-datetime_received'):  # pylint: disable=E1101
        try:
            summary = AbstractExchange._get_exchange_item_summary(item)
            self.log.append_text(f"Encountered Exchange E-Mail: {summary}")

            email_passenger = Email(
                p_external_id=item.message_id,
                p_internal_id=uuid1(),
                p_source_system=AbstractExchange._SOURCE_SYSTEM,
                p_attachments=[],
                p_puller_module=self.__module__,
                p_pull_datetime=datetime.now(),
                p_passenger_module=self.email_module)

            for item_attachment in item.attachments:
                # Reading these attributes may fail for some attachments;
                # such attachments are deliberately skipped (best effort).
                try:
                    attachment_name = item_attachment.name
                    attachment_content = item_attachment.content
                except Exception:  # pylint: disable=W0703
                    continue

                if attachment_name is None or attachment_content is None:
                    continue

                content_type = item_attachment.content_type
                if content_type is None or content_type == "":
                    attachment_format = Attachment.guess_format_by_file_name(
                        attachment_name)
                else:
                    attachment_format = Attachment.guess_format_by_mime_type(
                        content_type)

                if attachment_format == AttachmentFormat.text:
                    # str(b"abc")[2:] would give "abc'" (repr leftovers
                    # and escaped newlines), so decode the bytes instead.
                    # NOTE(review): UTF-8 is assumed; undecodable bytes
                    # are replaced rather than raising.
                    if isinstance(attachment_content, (bytes, bytearray)):
                        text_content = attachment_content.decode(
                            "utf-8", errors="replace")
                    else:
                        text_content = str(attachment_content)

                    passenger_attachment = Attachment(
                        p_name=attachment_name,
                        p_format=AttachmentFormat.text,
                        p_text_content=text_content)
                else:
                    passenger_attachment = Attachment(
                        p_name=attachment_name,
                        p_format=AttachmentFormat.binary,
                        p_binary_content=attachment_content)

                email_passenger.attachments.append(passenger_attachment)

            self.log.append_text("Got Exchange mail " + email_passenger.id_text)

            # A decorator returning None counts as "don't ignore"
            ignore_email = False
            if self._email_decorator is not None:
                ignore_email = bool(
                    self._email_decorator(item, email_passenger))

            if ignore_email:
                self.log.append_text("E-Mail eliminated by decorator")
            else:
                output.append(email_passenger)
        except Exception as error:
            self.log.append_entry(
                LogEntry(p_message=f"Error: {str(error)}",
                         p_type=MessageType.error))

    return output
def unzip_attachments(self):
    """
    Finds .ZIP attachments, and turns them into regular
    attachments within the same object.

    Each ZIP attachment is written to a temporary directory and
    extracted there. Every extracted file becomes a new attachment:
    files detected as text are read as UTF-8 with newlines / tabs
    replaced by spaces, everything else is stored as binary content.
    Afterwards, the ZIP attachments are removed from self.attachments
    and the new attachments are appended.
    """
    mime_type = mimetypes.MimeTypes()
    deletable_indices = []
    new_attachments = []

    tmp_dir = AbstractPassenger._TMP_ZIP_DIR
    extract_dir = os.path.join(tmp_dir,
                               AbstractPassenger._TMP_ZIP_EXTRACT_DIR)
    zip_path = os.path.join(tmp_dir, AbstractPassenger._TMP_ZIP_FILE)

    for attachment_index, attachment in enumerate(self.attachments):
        if not attachment.name.upper().endswith(".ZIP"):
            continue

        # Start from an empty temporary directory
        shutil.rmtree(tmp_dir, ignore_errors=True)
        os.makedirs(tmp_dir, exist_ok=True)

        try:
            with open(zip_path, "wb") as zip_file:
                zip_file.write(attachment.binary_content)

            extracted_files = []
            with ZipFile(zip_path) as zip_file:
                for member in zip_file.infolist():
                    # Directory entries can't be opened as files below
                    if member.is_dir():
                        continue
                    # extract() sanitises the member name (absolute
                    # paths, ".."), so the path it returns is the only
                    # reliable location of the extracted file.
                    extracted_files.append(
                        zip_file.extract(member, extract_dir))

            for extracted_path in extracted_files:
                mime = mime_type.guess_type(extracted_path)[0]
                file_format = Attachment.guess_format_by_mime_type(mime)
                file_in_zip = os.path.split(extracted_path)[1]

                if file_format == AttachmentFormat.text:
                    with open(extracted_path, "r", encoding="utf-8") as extracted_file:
                        file_content = extracted_file.read()
                    file_content = file_content.replace("\n", " ")
                    file_content = file_content.replace("\t", " ")
                    unzip_attachment = Attachment(
                        p_name=file_in_zip,
                        p_format=AttachmentFormat.text,
                        p_text_content=file_content)
                else:
                    with open(extracted_path, "rb") as extracted_file:
                        file_content = extracted_file.read()
                    unzip_attachment = Attachment(
                        p_name=file_in_zip,
                        p_format=AttachmentFormat.binary,
                        p_binary_content=file_content)

                new_attachments.append(unzip_attachment)
        finally:
            # Don't leave temporary files behind, even if extraction failed
            shutil.rmtree(tmp_dir, ignore_errors=True)

        deletable_indices.append(attachment_index)

    # Pop from the end so that the remaining indices stay valid
    for deletable_index in reversed(deletable_indices):
        self.attachments.pop(deletable_index)

    self.attachments.extend(new_attachments)