Example #1
0
    def _extract_zip_file(self):
        extracted_files = []
        with ZipFile(self._zip_path) as zip_file:
            files_in_zip = zip_file.namelist()
            for file_in_zip in files_in_zip:
                if not self._is_file_eligible(file_in_zip):
                    continue
                extract_path = os.path.join(self._extract_dir, file_in_zip)
                zip_file.extract(file_in_zip, self._extract_dir)
                extracted_files.append(extract_path)

        for extracted_path in extracted_files:
            mime = self._mime_type.guess_type(extracted_path)[0]
            file_format = Attachment.guess_format_by_mime_type(mime)
            file_in_zip = os.path.split(extracted_path)[1]

            if file_format == AttachmentFormat.text:
                with open(extracted_path, "r",
                          encoding="utf-8") as extracted_file:
                    file_content = extracted_file.read()
                    file_content = file_content.replace("\n", " ")
                    file_content = file_content.replace("\t", " ")
                unzip_attachment = Attachment(p_name=file_in_zip,
                                              p_format=AttachmentFormat.text,
                                              p_text_content=file_content)
            else:
                with open(extracted_path, "rb") as extracted_file:
                    file_content = extracted_file.read()
                unzip_attachment = Attachment(p_name=file_in_zip,
                                              p_format=AttachmentFormat.binary,
                                              p_binary_content=file_content)

            self._new_attachments.append(unzip_attachment)
    def download_links_in_html_as_attachments(self, p_html: str,
                                              p_extensions: List[str]):
        """ Downloads the files linked in the given HTML and adds them
        to the attachments of this object.

        Nothing happens when the HTML is empty or contains no "<html"
        tag. Otherwise, every URL found from that tag onwards is
        downloaded if it contains one of the given extensions (compared
        case-insensitively, prefixed with a dot). Links containing
        urldefense.com are cut down to the part starting at their last
        "http", with double underscores removed. Depending on the format
        guessed from the file extension, the response is stored as a
        text or as a binary attachment.
        """
        if not p_html:
            return

        # Work on a single line and ignore whatever precedes <html
        single_line_html = p_html.replace("\r", "").replace("\n", "")
        html_start = single_line_html.lower().find("<html")
        if html_start < 0:
            return
        found_urls = URLExtract().find_urls(single_line_html[html_start:])

        for url in found_urls:
            low_url = url.lower()
            if not any(("." + extension.lower()) in low_url
                       for extension in p_extensions):
                continue

            # Unwrap the real target of urldefense.com links
            clean_url = url
            if "urldefense.com" in url:
                clean_url = url[low_url.rfind("http"):].replace("__", "")
            if clean_url.endswith("/"):
                clean_url = clean_url[:-1]

            filename = os.path.basename(clean_url)
            extension = os.path.splitext(filename)[1].replace(".", "")
            file_format = Attachment.guess_format_by_file_extension(extension)

            # NOTE(review): no timeout is passed to the request
            response = requests.get(clean_url, allow_redirects=True)

            if file_format == AttachmentFormat.text:
                content_args = {"p_format": AttachmentFormat.text,
                                "p_text_content": response.text}
            else:
                content_args = {"p_format": AttachmentFormat.binary,
                                "p_binary_content": response.content}

            self.attachments.append(
                Attachment(p_name=filename, **content_args))
    def _pull(self) -> List[AbstractPassenger]:
        """ Reads all items of the account inbox, ordered by
        '-datetime_received', and returns one Email passenger per item.

        Attachments whose name or content can't be read, or is None,
        are left out. The format of an attachment is guessed from its
        content type, or from its file name if no content type is
        given. If an e-mail decorator is set, it is called with the
        inbox item and the new passenger.
        """
        pulled_emails = []
        inbox_query = self.account.inbox.all()  # pylint: disable=E1101

        for inbox_item in inbox_query.order_by('-datetime_received'):
            email_passenger = Email(
                p_external_id=inbox_item.message_id,
                p_internal_id=uuid1(),
                p_source_system=AbstractExchange._SOURCE_SYSTEM,
                p_attachments=[],
                p_puller_module=self.__module__,
                p_pull_datetime=datetime.now(),
                p_passenger_module=self.email_module)

            for mail_file in inbox_item.attachments:
                try:
                    # Reading the attributes may fail; such attachments
                    # are skipped
                    _probe = (mail_file.name, mail_file.content)
                except Exception:  # pylint: disable=W0703
                    continue

                if mail_file.name is None or mail_file.content is None:
                    continue

                has_content_type = (mail_file.content_type is not None
                                    and mail_file.content_type != "")
                if has_content_type:
                    guessed_format = Attachment.guess_format_by_mime_type(
                        mail_file.content_type)
                else:
                    guessed_format = Attachment.guess_format_by_file_name(
                        mail_file.name)

                if guessed_format == AttachmentFormat.text:
                    # NOTE(review): if content is bytes, str() yields its
                    # repr (b'...') and the slice only drops the leading
                    # two characters - confirm this is the intended text
                    new_attachment = Attachment(
                        p_name=mail_file.name,
                        p_format=AttachmentFormat.text,
                        p_text_content=str(mail_file.content)[2:])
                else:
                    new_attachment = Attachment(
                        p_name=mail_file.name,
                        p_format=AttachmentFormat.binary,
                        p_binary_content=mail_file.content)

                email_passenger.attachments.append(new_attachment)

            if self._email_decorator is not None:
                self._email_decorator(inbox_item, email_passenger)
            log_text = "Got mail from Exchange: " + email_passenger.id_text
            self.log.append_text(log_text)
            pulled_emails.append(email_passenger)

        return pulled_emails
Example #4
0
    def pull(self) -> List[DemoPassenger1]:
        """ Returns two hard coded demo passengers; only the first one
        carries a text attachment. Each passenger is logged. """
        # External ID, dataset and the attachment flag of each passenger
        blueprints = (
            ("ID_1_1", "Puller 1 pulled first DemoPassenger1", True),
            ("ID_1_2", "Puller 1 pulled second DemoPassenger1", False),
        )

        pulled = []
        for external_id, dataset, has_attachment in blueprints:
            passenger = DemoPassenger1()
            passenger.external_id = external_id
            passenger.dataset = dataset
            passenger.source_system = "DEMO_SYSTEM"
            passenger.puller_module = self.__module__
            if has_attachment:
                demo_file = Attachment(p_name="puller1_file1.txt",
                                       p_format=AttachmentFormat.text,
                                       p_text_content="Lorem Ipsum")
                passenger.attachments.append(demo_file)
            pulled.append(passenger)
            self.log.append_text("Got passenger " + passenger.id_text)

        return pulled
Example #5
0
    def _download(self, url):
        """ Downloads the given URL and appends the response to
        self._attachments.

        The attachment is named after the last path segment of the URL.
        Its format is guessed from the file extension: text attachments
        get the response text, anything else the raw response content.
        """
        filename = os.path.basename(url)
        extension = os.path.splitext(filename)[1].replace(".", "")
        file_format = Attachment.guess_format_by_file_extension(extension)

        # NOTE(review): no timeout is passed to the request
        response = requests.get(url, allow_redirects=True)

        if file_format == AttachmentFormat.text:
            content_args = {"p_format": AttachmentFormat.text,
                            "p_text_content": response.text}
        else:
            content_args = {"p_format": AttachmentFormat.binary,
                            "p_binary_content": response.content}

        self._attachments.append(Attachment(p_name=filename, **content_args))
Example #6
0
    def _get_attachment_obj(self, p_internal_id: str,
                            p_attachment_json: dict) -> Attachment:
        """ Builds an attachment from its JSON description and loads
        its content from disk.

        The JSON entries "name" and "format" are used; "format" must be
        the name of an AttachmentFormat member. Text attachments are
        read as UTF-8, binary attachments as raw bytes. Any other format
        raises an AttachmentError with the invalid_format error code.
        """
        attachment_name = p_attachment_json["name"]
        attachment_format = AttachmentFormat[p_attachment_json["format"]]
        attachment = Attachment(p_name=attachment_name,
                                p_format=attachment_format)

        disk_path = self._path.get_attachment_file_path(
            p_internal_id, attachment.name)
        self._log.append_text(f"Reading attachment from disk: {disk_path}")

        readable_formats = (AttachmentFormat.text, AttachmentFormat.binary)
        if attachment.format not in readable_formats:
            raise AttachmentError(AttachmentError.ErrorCode.invalid_format,
                                  attachment.format)

        if attachment.format == AttachmentFormat.text:
            with open(disk_path, "r", encoding="utf-8") as text_file:
                attachment.text_content = text_file.read()
        else:
            with open(disk_path, "rb") as bin_file:
                attachment.binary_content = bin_file.read()

        return attachment
Example #7
0
 def pull(self) -> List[DemoBinaryPassenger]:
     """ Returns a single hard coded demo passenger, which carries the
     sample binary file as its only added attachment. """
     passenger = DemoBinaryPassenger()
     passenger.external_id = "ID_BIN_1"
     passenger.source_system = "DEMO_SYSTEM"
     passenger.puller_module = self.__module__

     sample_file = Attachment(
         p_name=DemoBinaryPuller._BIN_FILE_NAME,
         p_format=AttachmentFormat.binary,
         p_binary_content=DemoBinaryPuller._get_sample_binary())
     passenger.attachments.append(sample_file)

     pulled = [passenger]
     self.log.append_text(f"Got passenger {passenger.id_text}")
     return pulled
 def pull(self) -> List[AbstractPassenger]:
     """ Pulls the e-mails of every Exchange account and returns them
     in one list.

     Each e-mail gets an additional attachment, whose text content is
     a JSON object holding the alias of the account it was pulled from.
     """
     collected = []
     for account in self.exchange_accounts:
         for email in account.puller.pull():
             alias_json = json.dumps({"alias": account.alias})
             alias_attachment = Attachment(
                 p_name=AbstractMultiExchange._ARTIFICIAL_ATTACHMENT_FILE,
                 p_text_content=alias_json)
             email.attachments.append(alias_attachment)
             collected.append(email)
     return collected
Example #9
0
    def pull(self) -> List[DemoPassenger1]:
        """ Returns a single hard coded demo passenger with one text
        attachment, and logs it. """
        passenger = DemoPassenger1()
        passenger.external_id = "ID_3_1"
        passenger.dataset = "Puller 3 pulled first DemoPassenger1"
        passenger.source_system = "DEMO_SYSTEM"
        passenger.puller_module = self.__module__

        demo_file = Attachment(p_name="puller3_file1.txt",
                               p_format=AttachmentFormat.text,
                               p_text_content="Lorem Ipsum")
        passenger.attachments.append(demo_file)

        pulled = [passenger]
        self.log.append_text(f"Got passenger {passenger.id_text}")
        return pulled
Example #10
0
    def _pull(self) -> List[AbstractPassenger]:  # pylint: disable=R0912
        """ Reads all items of the account inbox, ordered by
        '-datetime_received', and returns them as Email passengers.

        Attachments whose name or content can't be read, or is None,
        are left out. The format of an attachment is guessed from its
        content type, or from its file name if no content type is
        given. If an e-mail decorator is set and returns a truthy value,
        the e-mail is left out of the result. An exception raised while
        handling an item is logged as an error and the next item is
        processed.
        """
        pulled_emails = []
        inbox_query = self.account.inbox.all()  # pylint: disable=E1101

        for inbox_item in inbox_query.order_by('-datetime_received'):
            try:
                summary = AbstractExchange._get_exchange_item_summary(
                    inbox_item)
                self.log.append_text(f"Encountered Exchange E-Mail: {summary}")

                email_passenger = Email(
                    p_external_id=inbox_item.message_id,
                    p_internal_id=uuid1(),
                    p_source_system=AbstractExchange._SOURCE_SYSTEM,
                    p_attachments=[],
                    p_puller_module=self.__module__,
                    p_pull_datetime=datetime.now(),
                    p_passenger_module=self.email_module)

                for mail_file in inbox_item.attachments:
                    try:
                        # Reading the attributes may fail; such
                        # attachments are skipped
                        _probe = (mail_file.name, mail_file.content)
                    except Exception:  # pylint: disable=W0703
                        continue

                    if mail_file.name is None or mail_file.content is None:
                        continue

                    has_content_type = (mail_file.content_type is not None
                                        and mail_file.content_type != "")
                    if has_content_type:
                        guessed_format = Attachment.guess_format_by_mime_type(
                            mail_file.content_type)
                    else:
                        guessed_format = Attachment.guess_format_by_file_name(
                            mail_file.name)

                    if guessed_format == AttachmentFormat.text:
                        # NOTE(review): if content is bytes, str() yields
                        # its repr (b'...') and the slice only drops the
                        # leading two characters - confirm this is the
                        # intended text
                        new_attachment = Attachment(
                            p_name=mail_file.name,
                            p_format=AttachmentFormat.text,
                            p_text_content=str(mail_file.content)[2:])
                    else:
                        new_attachment = Attachment(
                            p_name=mail_file.name,
                            p_format=AttachmentFormat.binary,
                            p_binary_content=mail_file.content)

                    email_passenger.attachments.append(new_attachment)

                log_text = "Got Exchange mail " + email_passenger.id_text
                self.log.append_text(log_text)

                # A decorator verdict of None counts as "keep the e-mail"
                ignore_email = False
                if self._email_decorator is not None:
                    verdict = self._email_decorator(inbox_item,
                                                    email_passenger)
                    if verdict is not None:
                        ignore_email = verdict

                if ignore_email:
                    self.log.append_text("E-Mail eliminated by decorator")
                else:
                    pulled_emails.append(email_passenger)

            except Exception as error:
                error_entry = LogEntry(p_message=f"Error: {str(error)}",
                                       p_type=MessageType.error)
                self.log.append_entry(error_entry)

        return pulled_emails
    def unzip_attachments(self):
        """ Finds .ZIP attachments, and turns them into regular
        attachments within the same object.

        Each attachment whose name ends with ".zip" (case-insensitive)
        is written to a temporary directory and extracted there. Every
        extracted file becomes a new attachment: files whose guessed
        format is text are read as UTF-8 with line feeds and tabs
        replaced by spaces, anything else is kept as binary content.
        Directory entries of the archive are skipped because they have
        no content to read. Finally, the ZIP attachments are removed
        from self.attachments and the extracted ones are appended.
        """
        mime_type = mimetypes.MimeTypes()
        deletable_indices = []
        new_attachments = []

        for attachment_index, attachment in enumerate(self.attachments):
            if not attachment.name.upper().endswith(".ZIP"):
                continue

            tmp_dir = AbstractPassenger._TMP_ZIP_DIR
            # Start from an empty temporary directory
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.makedirs(tmp_dir, exist_ok=True)

            try:
                extract_dir = os.path.join(
                    tmp_dir, AbstractPassenger._TMP_ZIP_EXTRACT_DIR)
                zip_path = os.path.join(tmp_dir,
                                        AbstractPassenger._TMP_ZIP_FILE)
                with open(zip_path, "wb") as zip_file:
                    zip_file.write(attachment.binary_content)

                extracted_files = []
                with ZipFile(zip_path) as zip_file:
                    for file_in_zip in zip_file.namelist():
                        if file_in_zip.endswith("/"):
                            # Directory entry: opening it as a file
                            # would fail
                            continue
                        # extract() returns the path it really wrote
                        # to, which can differ from a plain join because
                        # the member name is sanitized during extraction
                        extracted_files.append(
                            zip_file.extract(file_in_zip, extract_dir))

                for extracted_path in extracted_files:
                    mime = mime_type.guess_type(extracted_path)[0]
                    file_format = Attachment.guess_format_by_mime_type(mime)
                    file_in_zip = os.path.basename(extracted_path)

                    if file_format == AttachmentFormat.text:
                        with open(extracted_path, "r",
                                  encoding="utf-8") as extracted_file:
                            file_content = extracted_file.read()
                        # Text content is flattened into a single line
                        file_content = file_content.replace("\n", " ")
                        file_content = file_content.replace("\t", " ")
                        unzip_attachment = Attachment(
                            p_name=file_in_zip,
                            p_format=AttachmentFormat.text,
                            p_text_content=file_content)
                    else:
                        with open(extracted_path, "rb") as extracted_file:
                            file_content = extracted_file.read()
                        unzip_attachment = Attachment(
                            p_name=file_in_zip,
                            p_format=AttachmentFormat.binary,
                            p_binary_content=file_content)

                    new_attachments.append(unzip_attachment)
            finally:
                # Remove the temporary files even if extraction failed
                shutil.rmtree(tmp_dir, ignore_errors=True)

            deletable_indices.append(attachment_index)

        # Indices were collected in ascending order; delete from the end
        # so that the remaining indices stay valid
        for deletable_index in reversed(deletable_indices):
            self.attachments.pop(deletable_index)

        self.attachments.extend(new_attachments)