def _update(self, provider, update, test=False):
    """Fetch the messages matching the provider's IMAP filter and parse them.

    :param provider: ingest provider dict; its ``config`` entry supplies
        ``mailbox`` and ``filter``.
    :param update: not read by this method.
    :param test: when true, messages are fetched but neither parsed nor
        flagged as seen.
    :return: list holding the result of ``parser.parse`` for every message
        that was parsed successfully.
    :raises IngestEmailError: when the mailbox cannot be selected, the search
        is rejected, or any other exception occurs (wrapped via ``emailError``).
    """
    config = provider.get("config", {})
    new_items = []

    try:
        imap = self.authenticate(provider, config)

        try:
            status, _ = imap.select(config.get("mailbox", None), readonly=False)
            if status != "OK":
                raise IngestEmailError.emailMailboxError()

            try:
                # SEARCH needs at least one criterion: an empty filter stored in
                # the config must fall back to the default as well, not only a
                # missing one.
                status, found = imap.search(None, config.get("filter") or "(UNSEEN)")
                if status != "OK":
                    raise IngestEmailError.emailFilterError()

                for num in found[0].split():
                    status, message = imap.fetch(num, "(RFC822)")
                    if status != "OK" or test:
                        continue
                    try:
                        parser = self.get_feed_parser(provider, message)
                        new_items.append(parser.parse(message, provider))
                        imap.store(num, "+FLAGS", "\\Seen")
                    except IngestEmailError:
                        # a message that fails to parse is skipped and is not
                        # flagged as seen
                        continue
            finally:
                imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
def _update(self, provider):
    """Read the messages matching the provider's filter from its IMAP mailbox.

    :param provider: ingest provider dict; ``config`` supplies ``server``,
        ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
    :return: list holding the result of ``parser.parse`` for every message
        parsed successfully; empty when the mailbox cannot be selected or the
        search does not return ``OK``.
    :raises IngestEmailError: on login failure (``emailLoginError``) or on any
        other exception (wrapped via ``emailError``).
    """
    config = provider.get('config', {})
    server = config.get('server', '')
    port = int(config.get('port', 993))
    # Bound before any IMAP call so the final ``return`` is valid even when
    # select/search do not answer 'OK'.
    new_items = []

    try:
        imap = imaplib.IMAP4_SSL(host=server, port=port)
        try:
            imap.login(config.get('user', None), config.get('password', None))
        except imaplib.IMAP4.error as ex:
            # Not authenticated: release the socket, then report the caught
            # exception instance (not the exception class).
            imap.shutdown()
            raise IngestEmailError.emailLoginError(ex, provider)

        try:
            status, _ = imap.select(config.get('mailbox', None), readonly=False)
            if status == 'OK':
                try:
                    status, found = imap.search(None, config.get('filter', '(UNSEEN)'))
                    if status == 'OK':
                        for num in found[0].split():
                            status, message = imap.fetch(num, '(RFC822)')
                            if status != 'OK':
                                continue
                            try:
                                parser = self.get_feed_parser(provider, message)
                                new_items.append(parser.parse(message, provider))
                                imap.store(num, '+FLAGS', '\\Seen')
                            except IngestEmailError:
                                # skip the message; it is not flagged as seen
                                continue
                finally:
                    # a mailbox was selected, so close it even on errors
                    imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
def _update(self, provider, update, test=False):
    """Connect to the provider's IMAP server and ingest the matching messages.

    :param provider: ingest provider dict; ``config`` supplies ``server``,
        ``port``, ``user``, ``password``, ``mailbox``, ``filter`` and
        ``attachment``.
    :param update: not read by this method.
    :param test: when true, messages are fetched but neither parsed nor
        flagged as seen.
    :return: list holding the result of ``parser.parse`` for every message
        parsed successfully.
    :raises IngestEmailError: host, login, mailbox or filter errors, or any
        other exception wrapped via ``emailError``.
    """
    config = provider.get('config', {})
    server = config.get('server', '')
    port = int(config.get('port', 993))
    new_items = []

    try:
        try:
            # NOTE: this sets the default timeout for sockets created afterwards,
            # not only for this connection.
            socket.setdefaulttimeout(app.config.get('EMAIL_TIMEOUT', 10))
            imap = imaplib.IMAP4_SSL(host=server, port=port)
        except (socket.gaierror, OSError) as e:
            raise IngestEmailError.emailHostError(exception=e, provider=provider)

        try:
            imap.login(config.get('user', None), config.get('password', None))
        except imaplib.IMAP4.error as e:
            # Not authenticated: release the socket, then report the caught
            # exception instance (not the exception class).
            imap.shutdown()
            raise IngestEmailError.emailLoginError(e, provider)

        try:
            status, _ = imap.select(config.get('mailbox', None), readonly=False)
            if status != 'OK':
                raise IngestEmailError.emailMailboxError()

            try:
                # SEARCH needs at least one criterion, so an empty filter falls
                # back to the default too.
                status, found = imap.search(None, config.get('filter') or '(UNSEEN)')
                if status != 'OK':
                    raise IngestEmailError.emailFilterError()

                for num in found[0].split():
                    status, message = imap.fetch(num, '(RFC822)')
                    if status != 'OK' or test:
                        continue
                    try:
                        parser = self.get_feed_parser(provider, message)
                        item = parser.parse(message, provider)
                        if config.get('attachment'):
                            self.save_attachment(message, item)
                        new_items.append(item)
                        imap.store(num, '+FLAGS', '\\Seen')
                    except IngestEmailError:
                        # skip the message; it is not flagged as seen
                        continue
            finally:
                imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
class EmailFeedingService(FeedingService):
    """
    Feeding Service class which can read the article(s) from a configured mail box.
    """

    NAME = 'email'

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description()
    ]

    label = 'Email'

    def _update(self, provider, update):
        """Read the messages matching the provider's filter from its IMAP mailbox.

        :param provider: ingest provider dict; ``config`` supplies ``server``,
            ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
        :param update: not read by this method.
        :return: list holding the result of ``parser.parse`` for every message
            parsed successfully; empty when the mailbox cannot be selected or
            the search does not return ``OK``.
        :raises IngestEmailError: on login failure (``emailLoginError``) or on
            any other exception (wrapped via ``emailError``).
        """
        config = provider.get('config', {})
        server = config.get('server', '')
        port = int(config.get('port', 993))
        # Bound before any IMAP call so the final ``return`` is valid even when
        # select/search do not answer 'OK'.
        new_items = []

        try:
            imap = imaplib.IMAP4_SSL(host=server, port=port)
            try:
                imap.login(config.get('user', None), config.get('password', None))
            except imaplib.IMAP4.error as ex:
                # Not authenticated: release the socket, then report the caught
                # exception instance (not the exception class).
                imap.shutdown()
                raise IngestEmailError.emailLoginError(ex, provider)

            try:
                status, _ = imap.select(config.get('mailbox', None), readonly=False)
                if status == 'OK':
                    try:
                        status, found = imap.search(None, config.get('filter', '(UNSEEN)'))
                        if status == 'OK':
                            for num in found[0].split():
                                status, message = imap.fetch(num, '(RFC822)')
                                if status != 'OK':
                                    continue
                                try:
                                    parser = self.get_feed_parser(provider, message)
                                    new_items.append(parser.parse(message, provider))
                                    imap.store(num, '+FLAGS', '\\Seen')
                                except IngestEmailError:
                                    # skip the message; it is not flagged as seen
                                    continue
                    finally:
                        # a mailbox was selected, so close it even on errors
                        imap.close()
            finally:
                imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)

        return new_items

    def prepare_href(self, href, mimetype=None):
        """Return ``url_for_media(href, mimetype)`` for the given media reference."""
        return url_for_media(href, mimetype)
def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
    """Open an IMAP-over-SSL connection and log in with the configured account.

    :param provider: ingest provider dict, attached to any raised error.
    :param config: provider configuration; ``server``, ``port``, ``user`` and
        ``password`` are read from it.
    :return: the logged-in ``imaplib.IMAP4_SSL`` connection.
    :raises IngestEmailError: ``emailHostError`` when the connection cannot be
        established, ``emailLoginError`` when the login is rejected.
    """
    server = config.get("server", "")
    port = int(config.get("port", 993))

    try:
        # NOTE: this sets the default timeout for sockets created afterwards,
        # not only for this connection.
        socket.setdefaulttimeout(app.config.get("EMAIL_TIMEOUT", 10))
        imap = imaplib.IMAP4_SSL(host=server, port=port)
    except (socket.gaierror, OSError) as e:
        raise IngestEmailError.emailHostError(exception=e, provider=provider)

    try:
        imap.login(config.get("user", None), config.get("password", None))
    except imaplib.IMAP4.error as e:
        # The caller never receives this connection, so release the socket here
        # and report the caught exception instance (not the exception class).
        imap.shutdown()
        raise IngestEmailError.emailLoginError(e, provider)

    return imap
def _update(self, provider, update, test=False):
    """Connect to the provider's IMAP server and ingest the matching messages.

    :param provider: ingest provider dict; ``config`` supplies ``server``,
        ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
    :param update: not read by this method.
    :param test: when true, messages are fetched but neither parsed nor
        flagged as seen.
    :return: list holding the result of ``parser.parse`` for every message
        parsed successfully.
    :raises IngestEmailError: host, login, mailbox or filter errors, or any
        other exception wrapped via ``emailError``.
    """
    config = provider.get('config', {})
    server = config.get('server', '')
    port = int(config.get('port', 993))
    new_items = []

    try:
        try:
            # NOTE: this sets the default timeout for sockets created afterwards,
            # not only for this connection.
            socket.setdefaulttimeout(app.config.get('EMAIL_TIMEOUT', 10))
            imap = imaplib.IMAP4_SSL(host=server, port=port)
        except (socket.gaierror, OSError) as e:
            raise IngestEmailError.emailHostError(exception=e, provider=provider)

        try:
            imap.login(config.get('user', None), config.get('password', None))
        except imaplib.IMAP4.error as e:
            # Not authenticated: release the socket, then report the caught
            # exception instance (not the exception class).
            imap.shutdown()
            raise IngestEmailError.emailLoginError(e, provider)

        try:
            status, _ = imap.select(config.get('mailbox', None), readonly=False)
            if status != 'OK':
                raise IngestEmailError.emailMailboxError()

            try:
                # SEARCH needs at least one criterion, so an empty filter falls
                # back to the default too.
                status, found = imap.search(None, config.get('filter') or '(UNSEEN)')
                if status != 'OK':
                    raise IngestEmailError.emailFilterError()

                for num in found[0].split():
                    status, message = imap.fetch(num, '(RFC822)')
                    if status != 'OK' or test:
                        continue
                    try:
                        parser = self.get_feed_parser(provider, message)
                        new_items.append(parser.parse(message, provider))
                        imap.store(num, '+FLAGS', '\\Seen')
                    except IngestEmailError:
                        # skip the message; it is not flagged as seen
                        continue
            finally:
                imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
class EmailReaderService(IngestService):
    """Ingest service that reads messages from an IMAP mailbox and parses them
    with an ``rfc822Parser``."""

    PROVIDER = 'email'

    ERRORS = [IngestEmailError.emailError().get_error_description(),
              IngestEmailError.emailLoginError().get_error_description()]

    def __init__(self):
        self.parser = rfc822Parser()

    def _update(self, provider):
        """Read the messages matching the provider's filter from its IMAP mailbox.

        :param provider: ingest provider dict; ``config`` supplies ``server``,
            ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
        :return: list holding the result of ``self.parser.parse_email`` for
            every message parsed successfully; empty when the mailbox cannot be
            selected or the search does not return ``OK``.
        :raises IngestEmailError: on login failure (``emailLoginError``) or on
            any other exception (wrapped via ``emailError``).
        """
        config = provider.get('config', {})
        server = config.get('server', '')
        port = int(config.get('port', 993))
        # Bound before any IMAP call so the final ``return`` is valid even when
        # select/search do not answer 'OK'.
        new_items = []

        try:
            imap = imaplib.IMAP4_SSL(host=server, port=port)
            try:
                imap.login(config.get('user', None), config.get('password', None))
            except imaplib.IMAP4.error as ex:
                # Not authenticated: release the socket, then report the caught
                # exception instance (not the exception class).
                imap.shutdown()
                raise IngestEmailError.emailLoginError(ex, provider)

            try:
                status, _ = imap.select(config.get('mailbox', None), readonly=False)
                if status == 'OK':
                    try:
                        # NOTE(review): a missing filter is passed on as None;
                        # confirm whether a default criterion is wanted here.
                        status, found = imap.search(None, config.get('filter', None))
                        if status == 'OK':
                            for num in found[0].split():
                                status, message = imap.fetch(num, '(RFC822)')
                                if status != 'OK':
                                    continue
                                try:
                                    new_items.append(self.parser.parse_email(message, provider))
                                except IngestEmailError:
                                    # a message that fails to parse is skipped
                                    continue
                    finally:
                        # a mailbox was selected, so close it even on errors
                        imap.close()
            finally:
                imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)

        return new_items

    def prepare_href(self, href):
        """Return ``url_for_media(href)`` for the given media reference."""
        return url_for_media(href)
def _update(self, provider, update, test=False):
    """Fetch the messages matching the provider's IMAP filter and parse them.

    :param provider: ingest provider dict; its ``config`` entry supplies
        ``mailbox`` and ``filter``.
    :param update: not read by this method.
    :param test: when true, messages are fetched but neither parsed nor
        flagged as seen.
    :return: list holding the result of ``parser.parse`` for every message
        parsed successfully.
    :raises IngestEmailError: when the mailbox cannot be selected, the search
        is rejected, or any other exception occurs (wrapped via ``emailError``).
    """
    settings = provider.get("config", {})
    new_items = []

    try:
        imap = self.authenticate(provider, settings)

        try:
            mailbox = settings.get("mailbox", None)
            status, _ = imap.select(mailbox, readonly=False)
            if status != "OK":
                raise IngestEmailError.emailMailboxError()

            try:
                # at least one criterion must be set
                # (see the imaplib documentation of IMAP4.search)
                criteria = settings.get("filter") or "(UNSEEN)"
                status, found = imap.search(None, criteria)
                if status != "OK":
                    raise IngestEmailError.emailFilterError()

                for num in found[0].split():
                    status, message = imap.fetch(num, "(RFC822)")
                    if status != "OK" or test:
                        continue
                    try:
                        parser = self.get_feed_parser(provider, message)
                        parsed_items = parser.parse(message, provider)
                        self.parse_extra(imap, num, parsed_items)
                        new_items.append(parsed_items)
                        imap.store(num, "+FLAGS", "\\Seen")
                    except IngestEmailError:
                        # skip the message; it is not flagged as seen
                        continue
            finally:
                imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
    """Open an IMAP connection to Gmail authenticated with the stored OAuth2 token.

    :param provider: ingest provider dict; ``url_id`` is the id of the stored
        token and ``name`` is used in the log message.
    :param config: not read by this method.
    :return: the authenticated ``imaplib.IMAP4_SSL`` connection.
    :raises IngestEmailError: ``notConfiguredError`` when no token is stored
        for the provider.
    """
    oauth2_token_service = superdesk.get_resource_service("oauth2_token")
    token = oauth2_token_service.find_one(req=None, _id=provider["url_id"])
    if token is None:
        raise IngestEmailError.notConfiguredError(
            ValueError(l_("You need to log in first")), provider=provider)

    # Refresh a token that expires within the next 600 seconds. This is done
    # before connecting so a failing refresh cannot leave an open socket behind.
    if token["expires_at"].timestamp() < time.time() + 600:
        logger.info("Refreshing token for {provider_name}".format(
            provider_name=provider["name"]))
        token = oauth.refresh_google_token(token["_id"])

    auth_string = "user={email}\x01auth=Bearer {token}\x01\x01".format(
        email=token["email"], token=token["access_token"])

    imap = imaplib.IMAP4_SSL("imap.gmail.com")
    try:
        imap.authenticate("XOAUTH2", lambda __: auth_string.encode())
    except imaplib.IMAP4.error:
        # the caller never receives this connection, so release it here
        imap.shutdown()
        raise
    return imap
def parse_email(self, data, provider):
    """Build ingest items from a fetched IMAP message.

    One item is created for the message body (HTML when present, otherwise the
    plain text). Every named image attachment other than GIF/PNG is stored via
    ``self.parser_app.media.put`` and becomes a picture item; when there is at
    least one such picture a composite item referencing body and pictures is
    added as well.

    :param data: response parts as returned by the IMAP fetch; only tuple
        entries are parsed, their second element being the raw message bytes.
    :param provider: ingest provider, attached to a raised error.
    :return: list of items: pictures first, then the composite (if any), then
        the body item.
    :raises IngestEmailError: ``emailParseError`` wrapping any exception.
    """
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item['type'] = 'text'
        item['versioncreated'] = utcnow()

        comp_item = None

        # a list to keep the references to the attachments
        refs = []

        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            item['original_creator'] = self.parse_header(msg['from'])
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(
                    email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            # this will loop through all the available multiparts in mail
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        # if we don't know the charset just have a go!
                        if part.get_content_charset() is None:
                            text_body = body.decode()
                        else:
                            charset = part.get_content_charset()
                            text_body = body.decode(charset)
                    except Exception:
                        # logger.exception records the active exception itself;
                        # passing it as an extra positional argument would
                        # break the log formatting (no placeholder for it).
                        logger.exception(
                            "Exception parsing text body for {0} from {1}"
                            .format(item['headline'], item['original_creator']))
                    continue

                if part.get_content_type() == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        if part.get_content_charset() is None:
                            html_body = body.decode()
                        else:
                            charset = part.get_content_charset()
                            html_body = body.decode(charset)
                        html_body = self.safe_html(html_body)
                    except Exception:
                        logger.exception(
                            "Exception parsing text html for {0} from {1}"
                            .format(item['headline'], item['original_creator']))
                    continue

                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue

                image = part.get_payload(decode=True)
                content = io.BytesIO(image)
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                if content_type in ('image/gif', 'image/png'):
                    continue
                content.seek(0)
                image_id = self.parser_app.media.put(
                    content, filename=fileName, content_type=content_type,
                    metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = dict()
                    comp_item['type'] = 'composite'
                    comp_item['guid'] = generate_guid(type=GUID_TAG)
                    comp_item['versioncreated'] = utcnow()
                    comp_item['groups'] = []
                    comp_item['headline'] = item['headline']

                    # create a reference to the item that stores the body of the email
                    item_ref = {}
                    item_ref['guid'] = item['guid']
                    item_ref['residRef'] = item['guid']
                    item_ref['headline'] = item['headline']
                    item_ref['location'] = 'ingest'
                    item_ref['itemClass'] = 'icls:text'
                    refs.append(item_ref)

                media_item = dict()
                media_item['guid'] = generate_guid(type=GUID_TAG)
                media_item['versioncreated'] = utcnow()
                media_item['type'] = 'picture'
                media_item['renditions'] = renditions
                media_item['mimetype'] = content_type
                media_item['filemeta'] = metadata
                media_item['slugline'] = fileName
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                new_items.append(media_item)

                # add a reference to this item in the composite item
                media_ref = {}
                media_ref['guid'] = media_item['guid']
                media_ref['residRef'] = media_item['guid']
                media_ref['headline'] = fileName
                media_ref['location'] = 'ingest'
                media_ref['itemClass'] = 'icls:picture'
                refs.append(media_ref)

        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'

        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {}
            grefs['refs'] = [{'idRef': 'main'}]
            grefs['id'] = 'root'
            grefs['role'] = 'grpRole:NEP'
            comp_item['groups'].append(grefs)

            grefs = {}
            grefs['refs'] = refs
            grefs['id'] = 'main'
            grefs['role'] = 'grpRole:Main'
            comp_item['groups'].append(grefs)

            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def _update(self, provider, update):
    """Ingest events from the file attachments of the matching messages.

    Every message matching the configured filter is fetched; each of its parts
    that has a ``Content-Disposition`` header and a file name is handed to the
    feed parser.

    :param provider: ingest provider dict; ``config`` supplies ``server``,
        ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
    :param update: not read by this method.
    :return: list of parser results, one per parsed attachment; empty when the
        mailbox cannot be selected or the search does not return ``OK``.
    :raises IngestEmailError: on login failure (``emailLoginError``) or on any
        other exception (wrapped via ``emailError``).
    """
    config = provider.get('config', {})
    server = config.get('server', '')
    port = int(config.get('port', 993))
    # Bound before any IMAP call so the final ``return`` is valid even when
    # select/search do not answer 'OK'.
    new_items = []

    try:
        imap = imaplib.IMAP4_SSL(host=server, port=port)
        try:
            imap.login(config.get('user', None), config.get('password', None))
        except imaplib.IMAP4.error as ex:
            # Not authenticated: release the socket, then report the caught
            # exception instance (not the exception class).
            imap.shutdown()
            raise IngestEmailError.emailLoginError(ex, provider)

        try:
            status, _ = imap.select(config.get('mailbox', None), readonly=False)
            if status == 'OK':
                try:
                    status, found = imap.search(None, config.get('filter', '(UNSEEN)'))
                    if status == 'OK':
                        for num in found[0].split():
                            status, message = imap.fetch(num, '(RFC822)')
                            if status != 'OK':
                                continue
                            try:
                                logger.info('Ingesting events from email')
                                parser = self.get_feed_parser(provider, message)
                                for response_part in message:
                                    if not isinstance(response_part, tuple):
                                        continue
                                    if isinstance(response_part[1], bytes):
                                        msg = email.message_from_bytes(response_part[1])
                                    else:
                                        msg = email.message_from_string(response_part[1])
                                    # this will loop through all the available multiparts in email
                                    for part in msg.walk():
                                        # parse attached files only
                                        if part.get('Content-Disposition') is None:
                                            continue
                                        fileName = part.get_filename()
                                        if not fileName:
                                            continue
                                        attachment = part.get_payload(decode=True)
                                        content = io.BytesIO(attachment)
                                        res = process_file_from_stream(
                                            content, part.get_content_type())
                                        file_name, content_type, metadata = res
                                        logger.info(
                                            'Ingesting events with {} parser'
                                            .format(parser.__class__.__name__))
                                        # A default is required: without it a parser lacking
                                        # ``parse_email`` raises AttributeError and the
                                        # ``parse`` fallback below is never reached.
                                        if getattr(parser, 'parse_email', None):
                                            try:
                                                new_items.append(
                                                    parser.parse_email(
                                                        content, content_type, provider))
                                            # NOTE(review): confirm ``parseMessageError`` is an
                                            # exception class and not a factory method.
                                            except ParserError.parseMessageError:
                                                continue
                                        else:
                                            new_items.append(parser.parse(message, provider))
                                imap.store(num, '+FLAGS', '\\Seen')
                            except IngestEmailError:
                                # skip the message; it is not flagged as seen
                                continue
                finally:
                    # a mailbox was selected, so close it even on errors
                    imap.close()
        finally:
            imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)

    return new_items
class EventEmailFeedingService(FeedingService):
    """
    Feeding Service class which can read the article(s) from a configured mail box.
    """

    NAME = 'event_email'

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description()
    ]

    label = 'Event email'

    # Configuration fields of the provider; ``errors`` maps an error code to
    # the message shown for that field.
    fields = [{
        'id': 'server',
        'type': 'text',
        'label': 'Email Server',
        'placeholder': 'Email Server',
        'required': True,
        'errors': {
            6003: 'Server not found.',
            6002: 'Unexpected server response'
        }
    }, {
        'id': 'port',
        'type': 'text',
        'label': 'Email Server Port',
        'placeholder': 'Email Server Port',
        'required': True,
        'default': '993'
    }, {
        'id': 'user',
        'type': 'text',
        'label': 'User',
        'placeholder': 'User',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True,
        'errors': {
            6000: 'Authentication error.'
        }
    }, {
        'id': 'mailbox',
        'type': 'text',
        'label': 'Mailbox',
        'placeholder': 'Mailbox',
        'required': True,
        # NOTE(review): same text as the password error; confirm that 6004 is
        # really meant to read 'Authentication error.' for the mailbox field.
        'errors': {
            6004: 'Authentication error.'
        }
    }, {
        'id': 'formatted',
        'type': 'boolean',
        'label': 'Formatted Email Parser',
        'required': True
    }, {
        'id': 'filter',
        'type': 'text',
        'label': 'Filter',
        'placeholder': 'Filter',
        'required': True
    }]

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    def _update(self, provider, update):
        """Ingest events from the file attachments of the matching messages.

        Every message matching the configured filter is fetched; each of its
        parts that has a ``Content-Disposition`` header and a file name is
        handed to the feed parser.

        :param provider: ingest provider dict; ``config`` supplies ``server``,
            ``port``, ``user``, ``password``, ``mailbox`` and ``filter``.
        :param update: not read by this method.
        :return: list of parser results, one per parsed attachment; empty when
            the mailbox cannot be selected or the search does not return ``OK``.
        :raises IngestEmailError: on login failure (``emailLoginError``) or on
            any other exception (wrapped via ``emailError``).
        """
        config = provider.get('config', {})
        server = config.get('server', '')
        port = int(config.get('port', 993))
        # Bound before any IMAP call so the final ``return`` is valid even when
        # select/search do not answer 'OK'.
        new_items = []

        try:
            imap = imaplib.IMAP4_SSL(host=server, port=port)
            try:
                imap.login(config.get('user', None), config.get('password', None))
            except imaplib.IMAP4.error as ex:
                # Not authenticated: release the socket, then report the caught
                # exception instance (not the exception class).
                imap.shutdown()
                raise IngestEmailError.emailLoginError(ex, provider)

            try:
                status, _ = imap.select(config.get('mailbox', None), readonly=False)
                if status == 'OK':
                    try:
                        status, found = imap.search(None, config.get('filter', '(UNSEEN)'))
                        if status == 'OK':
                            for num in found[0].split():
                                status, message = imap.fetch(num, '(RFC822)')
                                if status != 'OK':
                                    continue
                                try:
                                    logger.info('Ingesting events from email')
                                    parser = self.get_feed_parser(provider, message)
                                    for response_part in message:
                                        if not isinstance(response_part, tuple):
                                            continue
                                        if isinstance(response_part[1], bytes):
                                            msg = email.message_from_bytes(response_part[1])
                                        else:
                                            msg = email.message_from_string(response_part[1])
                                        # this will loop through all the available multiparts in email
                                        for part in msg.walk():
                                            # parse attached files only
                                            if part.get('Content-Disposition') is None:
                                                continue
                                            fileName = part.get_filename()
                                            if not fileName:
                                                continue
                                            attachment = part.get_payload(decode=True)
                                            content = io.BytesIO(attachment)
                                            res = process_file_from_stream(
                                                content, part.get_content_type())
                                            file_name, content_type, metadata = res
                                            logger.info(
                                                'Ingesting events with {} parser'
                                                .format(parser.__class__.__name__))
                                            # A default is required: without it a parser lacking
                                            # ``parse_email`` raises AttributeError and the
                                            # ``parse`` fallback below is never reached.
                                            if getattr(parser, 'parse_email', None):
                                                try:
                                                    new_items.append(
                                                        parser.parse_email(
                                                            content, content_type, provider))
                                                # NOTE(review): confirm ``parseMessageError`` is an
                                                # exception class and not a factory method.
                                                except ParserError.parseMessageError:
                                                    continue
                                            else:
                                                new_items.append(parser.parse(message, provider))
                                    imap.store(num, '+FLAGS', '\\Seen')
                                except IngestEmailError:
                                    # skip the message; it is not flagged as seen
                                    continue
                    finally:
                        # a mailbox was selected, so close it even on errors
                        imap.close()
            finally:
                imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)

        return new_items

    def prepare_href(self, href, mimetype=None):
        """Return ``url_for_media(href, mimetype)`` for the given media reference."""
        return url_for_media(href, mimetype)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a notification.
    The first ``text/plain`` part of the message is expected to hold a JSON object whose
    values are lists; the first element of each list is used.

    :param data: response parts as returned by the IMAP fetch; only tuple entries are parsed
    :param provider: ingest provider, attached to a raised error
    :return: A list of 1 item, or an empty list when the subject is not
        ``Formatted Editorial Story``
    :raises IngestEmailError: ``emailParseError`` wrapping any exception, including the
        ``UserNotRegisteredException`` raised when no user matches the sender
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item["versioncreated"] = utcnow()

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            # Check that the subject line matches what we expect, ignore it if not
            if self.parse_header(msg["subject"]) != "Formatted Editorial Story":
                return []

            item["guid"] = msg["Message-ID"]
            date_tuple = email.utils.parsedate_tz(msg["Date"])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(
                    email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone("utc"))
                item["firstcreated"] = dt

            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue

                body = part.get_payload(decode=True)
                # if we don't know the charset just have a go!
                # NOTE(review): as written the second replace swaps a space for a space;
                # confirm which character sequence is meant to be normalised.
                if part.get_content_charset() is None:
                    json_str = body.decode().replace("\r\n", "").replace(" ", " ")
                else:
                    charset = part.get_content_charset()
                    json_str = body.decode(charset).replace("\r\n", "").replace(" ", " ")

                mail_item = {k: v[0] for k, v in json.loads(json_str).items()}

                self._expand_category(item, mail_item)

                sender = mail_item.get("Username", mail_item.get("Email Address", ""))

                item["original_source"] = sender
                item["headline"] = mail_item.get("Headline", "")
                item["abstract"] = mail_item.get("Abstract", "")
                item["slugline"] = mail_item.get("Slugline", "")
                item["body_html"] = (
                    "<p>" + mail_item.get("Body", "").replace("\n", "</p><p>") + "</p>")

                default_source = app.config.get("DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES")
                city = mail_item.get("Dateline", "")
                cities = app.locators.find_cities()
                located = [c for c in cities if c["city"].lower() == city.lower()]
                item.setdefault("dateline", {})
                # fall back to an ad-hoc location when the city is not a known locator
                item["dateline"]["located"] = (
                    located[0] if located else {
                        "city_code": city,
                        "city": city,
                        "tz": "UTC",
                        "dateline": "city"
                    })
                item["dateline"]["source"] = default_source
                item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                    item["dateline"]["located"],
                    get_date(item["firstcreated"]),
                    source=default_source)

                if mail_item.get("Priority") != "":
                    if mail_item.get("Priority", "3").isdigit():
                        item["priority"] = int(mail_item.get("Priority", "3"))
                    else:
                        # a priority given by name is looked up in the vocabulary
                        priority_map = superdesk.get_resource_service(
                            "vocabularies").find_one(req=None, _id="priority")
                        priorities = [
                            x for x in priority_map.get("items", [])
                            if x["name"].upper() == mail_item.get("Priority", "").upper()
                        ]
                        if priorities:
                            item["priority"] = int(priorities[0].get("qcode", "3"))
                        else:
                            item["priority"] = 3
                if mail_item.get("News Value") != "":
                    item["urgency"] = int(mail_item.get("News Value", "3"))

                # We expect the username passed corresponds to a superdesk user.
                # The address comes from the mail and is matched literally: without
                # escaping, characters such as '.' or '+' act as regex operators.
                query = {
                    "email": re.compile(
                        "^{}$".format(re.escape(str(sender))),
                        re.IGNORECASE,
                    )
                }
                user = superdesk.get_resource_service("users").find_one(req=None, **query)
                if not user:
                    logger.error("Failed to find user for email {}".format(sender))
                    raise UserNotRegisteredException()
                item["original_creator"] = user.get("_id")
                if BYLINE in user and user.get(BYLINE, ""):
                    item["byline"] = user.get(BYLINE)
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                # (escaped for the same reason as the email address above)
                query = {
                    "name": re.compile(
                        "^{}$".format(re.escape(str(mail_item.get("Desk", "")))),
                        re.IGNORECASE)
                }
                desk = superdesk.get_resource_service("desks").find_one(req=None, **query)
                if desk:
                    item["task"] = {
                        "desk": desk.get("_id"),
                        "stage": desk.get("incoming_stage")
                    }

                if "Place" in mail_item:
                    locator_map = superdesk.get_resource_service(
                        "vocabularies").find_one(req=None, _id="locators")
                    # the result is always a list, possibly empty; it is stored as is
                    item["place"] = [
                        x for x in locator_map.get("items", [])
                        if x["qcode"] == mail_item.get("Place", "").upper()
                    ]

                if mail_item.get("Legal flag", "") == "LEGAL":
                    item["flags"] = {"marked_for_legal": True}

                # only the first text/plain part is used
                break

        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, data, provider=None):
    """Build ingest items from a fetched IMAP message.

    When the provider config has ``formatted`` set the message is handed to
    ``_parse_formatted_email``. Otherwise one item is created for the body (HTML when
    present, else the plain text wrapped in ``<pre>``). Every named image attachment
    other than GIF/PNG is stored via ``self.parser_app.media.put`` and becomes a picture
    item; when there is at least one such picture a composite item referencing body and
    pictures is added as well.

    :param data: response parts as returned by the IMAP fetch; only tuple entries are parsed
    :param provider: ingest provider dict (may be ``None``)
    :return: list of items: pictures first, then the composite (if any), then the body item
    :raises IngestEmailError: ``emailParseError`` wrapping any exception
    """
    # ``provider`` defaults to None, so guard the config lookup
    config = (provider or {}).get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()

        comp_item = None

        # a list to keep the references to the attachments
        refs = []

        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            field_from = self.parse_header(msg['from'])
            item['original_source'] = field_from
            try:
                addresses = email_regex.findall(field_from)
                if addresses:
                    email_address = addresses[0]
                    user = get_resource_service('users').get_user_by_email(email_address)
                    item['original_creator'] = user[eve.utils.config.ID_FIELD]
            except UserNotRegisteredException:
                # an unknown sender simply leaves original_creator unset
                pass
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(
                    email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            # this will loop through all the available multiparts in mail
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        # if we don't know the charset just have a go!
                        if part.get_content_charset() is None:
                            text_body = body.decode()
                        else:
                            charset = part.get_content_charset()
                            text_body = body.decode(charset)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing text body for {0} from {1}: {2}".format(
                                item['headline'], field_from, ex))
                    continue

                if part.get_content_type() == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        if part.get_content_charset() is None:
                            html_body = body.decode()
                        else:
                            charset = part.get_content_charset()
                            html_body = body.decode(charset)
                        html_body = self.safe_html(html_body)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing html body for {0} from {1}: {2}".format(
                                item['headline'], field_from, ex))
                    continue

                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue

                image = part.get_payload(decode=True)
                content = io.BytesIO(image)
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                if content_type in ('image/gif', 'image/png'):
                    continue
                content.seek(0)
                image_id = self.parser_app.media.put(content, filename=fileName,
                                                     content_type=content_type,
                                                     metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = dict()
                    comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                    comp_item['guid'] = generate_guid(type=GUID_TAG)
                    comp_item['versioncreated'] = utcnow()
                    comp_item['groups'] = []
                    comp_item['headline'] = item['headline']
                    comp_item['original_source'] = item['original_source']
                    if 'original_creator' in item:
                        comp_item['original_creator'] = item['original_creator']

                    # create a reference to the item that stores the body of the email
                    item_ref = {'guid': item['guid'], 'residRef': item['guid'],
                                'headline': item['headline'], 'location': 'ingest',
                                'itemClass': 'icls:text',
                                'original_source': item['original_source']}
                    if 'original_creator' in item:
                        item_ref['original_creator'] = item['original_creator']
                    refs.append(item_ref)

                media_item = dict()
                media_item['guid'] = generate_guid(type=GUID_TAG)
                media_item['versioncreated'] = utcnow()
                media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                media_item['renditions'] = renditions
                media_item['mimetype'] = content_type
                set_filemeta(media_item, metadata)
                media_item['slugline'] = fileName
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                media_item['original_source'] = item['original_source']
                if 'original_creator' in item:
                    media_item['original_creator'] = item['original_creator']
                new_items.append(media_item)

                # add a reference to this item in the composite item
                media_ref = {'guid': media_item['guid'], 'residRef': media_item['guid'],
                             'headline': fileName, 'location': 'ingest',
                             'itemClass': 'icls:picture',
                             'original_source': item['original_source']}
                if 'original_creator' in item:
                    media_ref['original_creator'] = item['original_creator']
                refs.append(media_ref)

        if html_body is not None:
            item['body_html'] = html_body
        else:
            # A message without a decodable text part leaves text_body as None;
            # concatenating None would raise and reject the whole message.
            # NOTE(review): the text is inserted unescaped into the markup.
            item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED

        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'}
            comp_item['groups'].append(grefs)

            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)

            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """
    Passed an email that was constructed as a notification from a google form submission
    it constructs an item.

    The google form submits to a google sheet, this sheet creates the email as a notification.
    The first ``text/plain`` part of the message is expected to hold a JSON object whose
    values are lists; the first element of each list is used.

    :param data: response parts as returned by the IMAP fetch; only tuple entries are parsed
    :param provider: ingest provider, attached to a raised error
    :return: A list of 1 item, or an empty list when the subject is not
        ``Formatted Editorial Story``
    :raises IngestEmailError: ``emailParseError`` wrapping any exception, including the
        ``UserNotRegisteredException`` raised when no user matches the sender
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            # Check that the subject line matches what we expect, ignore it if not
            if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                return []

            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(
                    email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue

                body = part.get_payload(decode=True)
                # if we don't know the charset just have a go!
                # NOTE(review): as written the second replace swaps a space for a space;
                # confirm which character sequence is meant to be normalised.
                if part.get_content_charset() is None:
                    json_str = body.decode().replace('\r\n', '').replace(' ', ' ')
                else:
                    charset = part.get_content_charset()
                    json_str = body.decode(charset).replace('\r\n', '').replace(' ', ' ')

                mail_item = {k: v[0] for k, v in json.loads(json_str).items()}

                self._expand_category(item, mail_item)

                username = mail_item.get('Username')

                item['original_source'] = username
                item['headline'] = mail_item.get('Headline')
                item['abstract'] = mail_item.get('Abstract')
                item['slugline'] = mail_item.get('Slugline')
                # a missing Body yields an empty body instead of failing on None
                item['body_html'] = (mail_item.get('Body') or '').replace('\n', '<br />')

                # a missing field is treated like an empty one; int(None) would raise
                priority = mail_item.get('Priority')
                if priority not in (None, ''):
                    item['priority'] = int(priority)

                urgency = mail_item.get('Urgency')
                if urgency not in (None, ''):
                    item['urgency'] = int(urgency)

                # We expect the username passed corresponds to a superdesk user.
                # The address comes from the mail and is matched literally: without
                # escaping, characters such as '.' or '+' act as regex operators.
                query = {'email': re.compile('^{}$'.format(re.escape(str(username))),
                                             re.IGNORECASE)}
                user = superdesk.get_resource_service('users').find_one(req=None, **query)
                if not user:
                    logger.error('Failed to find user for email {}'.format(username))
                    raise UserNotRegisteredException()
                item['original_creator'] = user.get('_id')
                item['byline'] = user.get(BYLINE, user.get('display_name'))
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                desk = superdesk.get_resource_service('desks').find_one(
                    req=None, name=mail_item.get('Desk'))
                if desk:
                    item['task'] = {'desk': desk.get('_id'),
                                    'stage': desk.get('incoming_stage')}

                # only the first text/plain part is used
                break

        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
# Copyright 2013, 2014 Sourcefabric z.u. and contributors. # # For the full copyright and license information, please see the # AUTHORS and LICENSE files distributed with this source code, or # at https://www.sourcefabric.org/superdesk/license import imaplib from .ingest_service import IngestService from superdesk.io import register_provider from superdesk.upload import url_for_media from superdesk.errors import IngestEmailError from superdesk.io.rfc822 import rfc822Parser PROVIDER = 'email' errors = [IngestEmailError.emailError().get_error_description(), IngestEmailError.emailLoginError().get_error_description()] class EmailReaderService(IngestService): def __init__(self): self.parser = rfc822Parser() def _update(self, provider): config = provider.get('config', {}) server = config.get('server', '') port = int(config.get('port', 993)) try: imap = imaplib.IMAP4_SSL(host=server, port=port)
def parse(self, data, provider=None):
    """Turn the response parts of a fetched message into ingest items.

    One text item is built for the message body (the HTML body when one was
    decoded, otherwise the plain text wrapped in ``<pre>``). Every accepted
    image attachment is stored through ``self.parser_app.media`` and becomes
    a picture item; when at least one picture exists a composite item that
    references the text item and the pictures is added as well.

    :param data: iterable of response parts; only tuple entries are parsed,
        their second element being the raw message bytes
    :param provider: ingest provider dictionary (may be ``None``); when its
        ``config`` has a truthy ``formatted`` flag the work is delegated to
        ``_parse_formatted_email``
    :return: list of item dictionaries, the text item last
    :raises IngestEmailError: wrapping any exception raised while parsing
    """
    # tolerate the declared default of provider=None instead of failing on .get
    config = (provider or {}).get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            field_from = self.parse_header(msg['from'])
            item['original_source'] = field_from
            try:
                addresses = email_regex.findall(field_from)
                if addresses:
                    user = get_resource_service('users').get_user_by_email(addresses[0])
                    item['original_creator'] = user[eve.utils.config.ID_FIELD]
            except UserNotRegisteredException:
                # unknown senders are accepted, just without an original_creator
                pass
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            # this will loop through all the available multiparts in mail
            for part in msg.walk():
                part_type = part.get_content_type()
                if part_type == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        # if we don't know the charset just have a go!
                        charset = part.get_content_charset()
                        text_body = body.decode() if charset is None else body.decode(charset)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing text body for {0} from {1}: {2}"
                            .format(item['headline'], field_from, ex))
                    continue
                if part_type == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        decoded = body.decode() if charset is None else body.decode(charset)
                        # assign only after sanitising, so a failing sanitiser
                        # never leaves raw markup in html_body
                        html_body = self.safe_html(decoded)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing html body for {0} from {1}: {2}"
                            .format(item['headline'], field_from, ex))
                    continue
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue
                image = part.get_payload(decode=True)
                content = io.BytesIO(image)
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                if content_type == 'image/gif' or content_type == 'image/png':
                    continue
                content.seek(0)
                image_id = self.parser_app.media.put(
                    content, filename=fileName, content_type=content_type, metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = dict()
                    comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                    comp_item['guid'] = generate_guid(type=GUID_TAG)
                    comp_item['versioncreated'] = utcnow()
                    comp_item['groups'] = []
                    comp_item['headline'] = item['headline']
                    comp_item['original_source'] = item['original_source']
                    if 'original_creator' in item:
                        comp_item['original_creator'] = item['original_creator']

                    # create a reference to the item that stores the body of the email
                    item_ref = {
                        'guid': item['guid'],
                        'residRef': item['guid'],
                        'headline': item['headline'],
                        'location': 'ingest',
                        'itemClass': 'icls:text',
                        'original_source': item['original_source']
                    }
                    if 'original_creator' in item:
                        item_ref['original_creator'] = item['original_creator']
                    refs.append(item_ref)

                media_item = dict()
                media_item['guid'] = generate_guid(type=GUID_TAG)
                media_item['versioncreated'] = utcnow()
                media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                media_item['renditions'] = renditions
                media_item['mimetype'] = content_type
                set_filemeta(media_item, metadata)
                media_item['slugline'] = fileName
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                media_item['original_source'] = item['original_source']
                if 'original_creator' in item:
                    media_item['original_creator'] = item['original_creator']
                new_items.append(media_item)

                # add a reference to this item in the composite item
                media_ref = {
                    'guid': media_item['guid'],
                    'residRef': media_item['guid'],
                    'headline': fileName,
                    'location': 'ingest',
                    'itemClass': 'icls:picture',
                    'original_source': item['original_source']
                }
                if 'original_creator' in item:
                    media_ref['original_creator'] = item['original_creator']
                refs.append(media_ref)

        if html_body is not None:
            item['body_html'] = html_body
        else:
            # text_body stays None when no text part could be decoded;
            # fall back to an empty body rather than failing on str + None
            item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED

        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {
                'refs': [{'idRef': 'main'}],
                'id': 'root',
                'role': 'grpRole:NEP'
            }
            comp_item['groups'].append(grefs)

            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)

            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
class EmailFeedingService(FeedingService):
    """
    Feeding Service class which can read the article(s) from a configured mail box.
    """

    NAME = "email"

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description(),
    ]

    label = "Email"

    # description of the provider configuration fields
    fields = [
        {
            "id": "server",
            "type": "text",
            "label": l_("Email Server"),
            "placeholder": "Email Server",
            "required": True,
            "errors": {
                6003: "Server not found.",
                6002: "Unexpected server response"
            },
        },
        {
            "id": "port",
            "type": "text",
            "label": l_("Email Server Port"),
            "placeholder": "Email Server Port",
            "required": True,
            "default": "993",
        },
        {
            "id": "user",
            "type": "text",
            "label": l_("User"),
            "placeholder": "User",
            "required": True
        },
        {
            "id": "password",
            "type": "password",
            "label": l_("Password"),
            "placeholder": "Password",
            "required": True,
            "errors": {
                6000: "Authentication error."
            },
        },
        {
            "id": "mailbox",
            "type": "text",
            "label": l_("Mailbox"),
            "placeholder": "Mailbox",
            "required": True,
            "errors": {
                6004: "Authentication error."
            },
        },
        {
            "id": "formatted",
            "type": "boolean",
            "label": l_("Formatted Email Parser"),
            "required": True
        },
        {
            "id": "filter",
            "type": "text",
            "label": l_("Filter"),
            "placeholder": "Filter",
            "required": False
        },
    ]

    def _test(self, provider):
        """Run ``_update`` in test mode: messages are fetched but not parsed."""
        self._update(provider, update=None, test=True)

    def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
        """Open an IMAP over SSL connection and log in.

        :param provider: ingest provider, attached to any raised error
        :param config: provider configuration; reads ``server``, ``port``
            (default 993), ``user`` and ``password``
        :return: the logged-in connection
        :raises IngestEmailError: ``emailHostError`` when the connection
            cannot be opened, ``emailLoginError`` when the login is rejected
        """
        server = config.get("server", "")
        port = int(config.get("port", 993))
        try:
            # NOTE(review): this changes the process-wide default socket timeout
            socket.setdefaulttimeout(app.config.get("EMAIL_TIMEOUT", 10))
            imap = imaplib.IMAP4_SSL(host=server, port=port)
        except (socket.gaierror, OSError) as e:
            raise IngestEmailError.emailHostError(exception=e, provider=provider)

        try:
            imap.login(config.get("user", None), config.get("password", None))
        except imaplib.IMAP4.error as e:
            # the connection is already open at this point: release it
            # (best effort) instead of leaking the socket
            try:
                imap.shutdown()
            except OSError:
                pass
            # report the caught exception itself, not the exception class
            raise IngestEmailError.emailLoginError(e, provider)

        return imap

    def _update(self, provider, update, test=False):
        """Fetch the messages matching the configured filter and parse them.

        Every successfully parsed message is flagged ``\\Seen``; a message
        whose parsing raises ``IngestEmailError`` is skipped and left
        unflagged. In test mode messages are fetched but neither parsed nor
        flagged.

        :param provider: ingest provider dictionary
        :param update: not used by this method
        :param test: when true only connectivity, mailbox and filter are
            exercised
        :return: list with one entry (the parser result) per parsed message
        :raises IngestEmailError: for mailbox/filter failures, or wrapping
            any other exception
        """
        config = provider.get("config", {})
        new_items = []

        try:
            imap = self.authenticate(provider, config)

            try:
                rv, data = imap.select(config.get("mailbox", None), readonly=False)
                if rv != "OK":
                    raise IngestEmailError.emailMailboxError()
                try:
                    rv, found = imap.search(None, config.get("filter", "(UNSEEN)"))
                    if rv != "OK":
                        raise IngestEmailError.emailFilterError()
                    for num in found[0].split():
                        rv, data = imap.fetch(num, "(RFC822)")
                        if rv != "OK" or test:
                            continue
                        try:
                            parser = self.get_feed_parser(provider, data)
                            new_items.append(parser.parse(data, provider))
                            rv, data = imap.store(num, "+FLAGS", "\\Seen")
                        except IngestEmailError:
                            continue
                finally:
                    # a mailbox was selected, so it has to be closed again
                    imap.close()
            finally:
                imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)
        return new_items

    def prepare_href(self, href, mimetype=None):
        """Return the media URL for ``href`` as built by ``url_for_media``."""
        return url_for_media(href, mimetype)
def parse(self, data, provider=None):
    """Turn the response parts of a fetched message into ingest items.

    One text item is built for the message body (the sanitised HTML body
    when there is a non-empty one, otherwise the plain text wrapped in
    ``<pre>``). Every accepted image attachment is stored through
    ``self.parser_app.media`` and becomes a picture item; when at least one
    picture exists a composite item referencing the text item and the
    pictures is added as well.

    :param data: iterable of response parts; only tuple entries are parsed,
        their second element being the raw message bytes
    :param provider: ingest provider dictionary (may be ``None``); when its
        ``config`` has a truthy ``formatted`` flag the work is delegated to
        ``_parse_formatted_email``
    :return: list of item dictionaries, the text item last
    :raises IngestEmailError: wrapping any exception raised while parsing
    """
    # tolerate the declared default of provider=None instead of failing on .get
    config = (provider or {}).get("config", {})
    # If the channel is configured to process structured email generated from a google form
    if config.get("formatted", False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item["versioncreated"] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            item["headline"] = self.parse_header(msg["subject"])
            field_from = self.parse_header(msg["from"])
            item["original_source"] = field_from
            try:
                addresses = email_regex.findall(field_from)
                if addresses:
                    user = get_resource_service("users").get_user_by_email(addresses[0])
                    item["original_creator"] = user[eve.utils.config.ID_FIELD]
            except UserNotRegisteredException:
                # unknown senders are accepted, just without an original_creator
                pass
            item["guid"] = msg["Message-ID"]
            date_tuple = email.utils.parsedate_tz(msg["Date"])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone("utc"))
                item["firstcreated"] = dt

            # this will loop through all the available multiparts in mail
            for part in msg.walk():
                part_type = part.get_content_type()
                if part_type == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        # if we don't know the charset just have a go!
                        charset = part.get_content_charset()
                        text_body = body.decode() if charset is None else body.decode(charset)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing text body for {0} from {1}: {2}"
                            .format(item["headline"], field_from, ex))
                    continue
                if part_type == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        decoded = body.decode() if charset is None else body.decode(charset)
                        # assign only after sanitising, so a failing sanitiser
                        # never leaves raw markup in html_body
                        html_body = sanitize_html(decoded)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing html body for {0} from {1}: {2}"
                            .format(item["headline"], field_from, ex))
                    continue
                if part.get_content_maintype() == "multipart":
                    continue
                if part.get("Content-Disposition") is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != "image":
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue
                image = part.get_payload(decode=True)
                content = io.BytesIO(image)
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                if content_type == "image/gif" or content_type == "image/png":
                    continue
                content.seek(0)
                image_id = self.parser_app.media.put(
                    content, filename=fileName, content_type=content_type, metadata=metadata)
                renditions = {"baseImage": {"href": image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = dict()
                    comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                    comp_item["guid"] = generate_guid(type=GUID_TAG)
                    comp_item["versioncreated"] = utcnow()
                    comp_item["groups"] = []
                    comp_item["headline"] = item["headline"]
                    comp_item["original_source"] = item["original_source"]
                    if "original_creator" in item:
                        comp_item["original_creator"] = item["original_creator"]

                    # create a reference to the item that stores the body of the email
                    item_ref = {
                        "guid": item["guid"],
                        "residRef": item["guid"],
                        "headline": item["headline"],
                        "location": "ingest",
                        "itemClass": "icls:text",
                        "original_source": item["original_source"],
                    }
                    if "original_creator" in item:
                        item_ref["original_creator"] = item["original_creator"]
                    refs.append(item_ref)

                media_item = dict()
                media_item["guid"] = generate_guid(type=GUID_TAG)
                media_item["versioncreated"] = utcnow()
                media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                media_item["renditions"] = renditions
                media_item["mimetype"] = content_type
                set_filemeta(media_item, metadata)
                media_item["slugline"] = fileName
                if text_body is not None:
                    media_item["body_html"] = text_body
                media_item["headline"] = item["headline"]
                media_item["original_source"] = item["original_source"]
                if "original_creator" in item:
                    media_item["original_creator"] = item["original_creator"]
                new_items.append(media_item)

                # add a reference to this item in the composite item
                media_ref = {
                    "guid": media_item["guid"],
                    "residRef": media_item["guid"],
                    "headline": fileName,
                    "location": "ingest",
                    "itemClass": "icls:picture",
                    "original_source": item["original_source"],
                }
                if "original_creator" in item:
                    media_ref["original_creator"] = item["original_creator"]
                refs.append(media_ref)

        if html_body:
            item["body_html"] = html_body
        else:
            # text_body stays None when no text part could be decoded;
            # fall back to an empty body rather than failing on str + None
            item["body_html"] = "<pre>" + (text_body or "") + "</pre>"
            item[FORMAT] = FORMATS.PRESERVED

        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {
                "refs": [{"idRef": "main"}],
                "id": "root",
                "role": "grpRole:NEP"
            }
            comp_item["groups"].append(grefs)

            grefs = {"refs": refs, "id": "main", "role": "grpRole:Main"}
            comp_item["groups"].append(grefs)

            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
class GMailFeedingService(EmailFeedingService):
    """
    Feeding Service class which can read the article(s) from a configured mail box.
    """

    NAME = "gmail"

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description(),
    ]

    label = "Gmail"

    # description of the provider configuration fields; the "url" of the two
    # url_request fields is filled in by init_app
    fields = [
        {
            "id": "email",
            "type": "text",
            "label": l_("email"),
            "readonly": True,
            "show_expression": "provider.config['email'] != null",
        },
        {
            "id": "log_in_url",
            "type": "url_request",
            "label": l_("Log-in with GMail"),
            # provider._id != null                  provider has to be saved before trying to log in
            # provider.config['email'] == null      do not display log-in button if logged-in already
            "show_expression": "provider._id != null && provider.config['email'] == null",
        },
        {
            "id": "log_out_url",
            "type": "url_request",
            "label": l_("Log-out"),
            # provider.config['email'] != null      only display log-out button if already logged in
            "show_expression": "provider.config['email'] != null",
        },
        {
            "id": "mailbox",
            "type": "text",
            "label": l_("Mailbox"),
            "default_value": "INBOX",
            "placeholder": l_("Mailbox"),
            "required": True,
            "errors": {
                6004: "Authentication error."
            },
        },
        {
            "id": "filter",
            "type": "text",
            "label": l_("Filter"),
            "placeholder": "Filter",
            "required": False
        },
    ]

    @classmethod
    def init_app(cls, app):
        """Set the log-in and log-out URLs of ``fields`` from ``SERVER_URL``."""
        # we need to access config to set the URL, so we do it here
        field = next(f for f in cls.fields if f["id"] == "log_in_url")
        field["url"] = join(app.config["SERVER_URL"], "login", "google", "{PROVIDER_ID}")
        field = next(f for f in cls.fields if f["id"] == "log_out_url")
        field["url"] = join(app.config["SERVER_URL"], "logout", "google", "{PROVIDER_ID}")

    def _test(self, provider):
        """Run ``_update`` in test mode."""
        self._update(provider, update=None, test=True)

    def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
        """Connect to ``imap.gmail.com`` and authenticate with XOAUTH2.

        The token is looked up in the ``oauth2_token`` resource by the
        provider id and refreshed first when it expires in less than 600
        seconds.

        :param provider: ingest provider; reads ``_id`` and ``name``
        :param config: provider configuration (not read here)
        :return: the authenticated connection
        :raises IngestEmailError: ``notConfiguredError`` when no token is
            stored for the provider
        """
        oauth2_token_service = superdesk.get_resource_service("oauth2_token")
        token = oauth2_token_service.find_one(req=None, _id=ObjectId(provider["_id"]))
        if token is None:
            raise IngestEmailError.notConfiguredError(
                ValueError(l_("You need to log in first")), provider=provider)
        imap = imaplib.IMAP4_SSL("imap.gmail.com")

        if token["expires_at"].timestamp() < time.time() + 600:
            logger.info("Refreshing token for {provider_name}".format(provider_name=provider["name"]))
            token = oauth.refresh_google_token(token["_id"])

        auth_string = "user={email}\x01auth=Bearer {token}\x01\x01".format(
            email=token["email"], token=token["access_token"])
        imap.authenticate("XOAUTH2", lambda __: auth_string.encode())
        return imap

    def parse_extra(self, imap: imaplib.IMAP4_SSL, num: str, parsed_items: List[dict]) -> None:
        """Add GMail labels to parsed_items

        Each label is appended to the ``subject`` list of every parsed item
        with the scheme ``gmail_label``. Any failure is logged and otherwise
        ignored, the items are then left as they are.

        :param imap: connection used to fetch the labels
        :param num: number of the message to fetch the labels for
        :param parsed_items: items to update in place
        """
        try:
            # we use GMail IMAP Extensions
            # https://developers.google.com/gmail/imap/imap-extensions#access_to_gmail_labels_x-gm-labels
            _, data = imap.fetch(num, "(X-GM-LABELS)")
            # it seems that there is nothing to help parsing in standard lib
            # thus we use some regex to get our labels
            data_bytes = data[0]
            if not isinstance(data_bytes, bytes):
                raise ValueError(f"Unexpected data type: {type(data_bytes)}")
            data_str = data_bytes.decode("utf-7")
            match_labels_str = RE_LABELS_STR.search(data_str)
            if match_labels_str is None:
                # "!r" is the repr conversion; ":r" would be an invalid
                # format spec and hide this message behind a formatting error
                raise ValueError(
                    f"Can't find the expected label string in data: {data_str!r}"
                )
            labels_str = match_labels_str.group(1)
            labels = [
                (m.group("quoted") or m.group("unquoted")).replace('\\"', '"')
                for m in RE_LABEL.finditer(labels_str)
            ]
            for parsed_item in parsed_items:
                subjects = parsed_item.setdefault("subject", [])
                for label in labels:
                    subjects.append({
                        "name": label,
                        "qcode": label,
                        "scheme": "gmail_label"
                    })
        except Exception:
            # labels are optional: log and carry on
            logger.exception("Can't retrieve GMail labels")
class GMailFeedingService(EmailFeedingService):
    """
    Feeding Service which reads the article(s) from a GMail mail box, using
    an OAuth2 token instead of a user name and password.
    """

    NAME = "gmail"

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description(),
    ]

    label = "Gmail"

    # provider configuration fields; init_app adds the "url" of the
    # url_request entry
    fields = [
        {"type": "url_request", "label": l_("Log-in with GMail")},
        {
            "id": "email",
            "type": "string",
            "label": l_("email"),
            "readonly": True,
            "placeholder": l_(
                "This field will be automatically filled once you've logged using log-in button above"
            ),
        },
        {
            "id": "mailbox",
            "type": "text",
            "label": l_("Mailbox"),
            "default_value": "INBOX",
            "placeholder": l_("Mailbox"),
            "required": True,
            "errors": {6004: "Authentication error."},
        },
        {
            "id": "filter",
            "type": "text",
            "label": l_("Filter"),
            "placeholder": "Filter",
            "required": False,
        },
    ]

    @classmethod
    def init_app(cls, app):
        """Fill in the log-in URL, which can only be built from the app config."""
        login_field = next(entry for entry in cls.fields if entry["type"] == "url_request")
        server_url = app.config["SERVER_URL"]
        login_field["url"] = join(server_url, "login", "google", "{URL_ID}")

    def _test(self, provider):
        """Run ``_update`` in test mode."""
        self._update(provider, update=None, test=True)

    def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
        """Return a connection to ``imap.gmail.com`` authenticated with XOAUTH2.

        The token stored under the provider's ``url_id`` is used; it is
        refreshed beforehand when it expires within the next 600 seconds.

        :raises IngestEmailError: ``notConfiguredError`` when there is no
            stored token
        """
        tokens = superdesk.get_resource_service("oauth2_token")
        token = tokens.find_one(req=None, _id=provider["url_id"])
        if token is None:
            not_logged_in = ValueError(l_("You need to log in first"))
            raise IngestEmailError.notConfiguredError(not_logged_in, provider=provider)

        connection = imaplib.IMAP4_SSL("imap.gmail.com")

        refresh_deadline = time.time() + 600
        if token["expires_at"].timestamp() < refresh_deadline:
            message = "Refreshing token for {provider_name}".format(provider_name=provider["name"])
            logger.info(message)
            token = oauth.refresh_google_token(token["_id"])

        auth_string = "user={email}\x01auth=Bearer {token}\x01\x01".format(
            email=token["email"],
            token=token["access_token"],
        )
        connection.authenticate("XOAUTH2", lambda __: auth_string.encode())
        return connection
def parse_email(self, data, provider):
    """Turn the response parts of a fetched message into ingest items.

    One text item is built for the message body (the HTML body when one was
    decoded, otherwise the plain text, typed ``preformatted``). Every
    accepted image attachment is stored through ``self.parser_app.media``
    and becomes a picture item; when at least one picture exists a composite
    item referencing the text item and the pictures is added as well.

    :param data: iterable of response parts; only tuple entries are parsed,
        their second element being the raw message bytes
    :param provider: ingest provider, attached to the raised error
    :return: list of item dictionaries, the text item last
    :raises IngestEmailError: wrapping any exception raised while parsing
    """
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item['type'] = 'text'
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            item['original_creator'] = self.parse_header(msg['from'])
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            # this will loop through all the available multiparts in mail
            for part in msg.walk():
                part_type = part.get_content_type()
                if part_type == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        # if we don't know the charset just have a go!
                        charset = part.get_content_charset()
                        text_body = body.decode() if charset is None else body.decode(charset)
                    except Exception as ex:
                        # the exception is part of the formatted message: an
                        # extra positional argument without a placeholder
                        # makes the logging call itself fail to format
                        logger.exception(
                            "Exception parsing text body for {0} from {1}: {2}"
                            .format(item['headline'], item['original_creator'], ex))
                    continue
                if part_type == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        decoded = body.decode() if charset is None else body.decode(charset)
                        # assign only after sanitising, so a failing sanitiser
                        # never leaves raw markup in html_body
                        html_body = self.safe_html(decoded)
                    except Exception as ex:
                        logger.exception(
                            "Exception parsing text html for {0} from {1}: {2}"
                            .format(item['headline'], item['original_creator'], ex))
                    continue
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue
                image = part.get_payload(decode=True)
                content = io.BytesIO(image)
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                if content_type == 'image/gif' or content_type == 'image/png':
                    continue
                content.seek(0)
                image_id = self.parser_app.media.put(content, filename=fileName,
                                                     content_type=content_type, metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = dict()
                    comp_item['type'] = 'composite'
                    comp_item['guid'] = generate_guid(type=GUID_TAG)
                    comp_item['versioncreated'] = utcnow()
                    comp_item['groups'] = []
                    comp_item['headline'] = item['headline']

                    # create a reference to the item that stores the body of the email
                    refs.append({
                        'guid': item['guid'],
                        'residRef': item['guid'],
                        'headline': item['headline'],
                        'location': 'ingest',
                        'itemClass': 'icls:text',
                    })

                media_item = dict()
                media_item['guid'] = generate_guid(type=GUID_TAG)
                media_item['versioncreated'] = utcnow()
                media_item['type'] = 'picture'
                media_item['renditions'] = renditions
                media_item['mimetype'] = content_type
                media_item['filemeta'] = metadata
                media_item['slugline'] = fileName
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                new_items.append(media_item)

                # add a reference to this item in the composite item
                refs.append({
                    'guid': media_item['guid'],
                    'residRef': media_item['guid'],
                    'headline': fileName,
                    'location': 'ingest',
                    'itemClass': 'icls:picture',
                })

        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'

        # if there is composite item then add the main group and references
        if comp_item:
            comp_item['groups'].append({
                'refs': [{'idRef': 'main'}],
                'id': 'root',
                'role': 'grpRole:NEP',
            })
            comp_item['groups'].append({
                'refs': refs,
                'id': 'main',
                'role': 'grpRole:Main',
            })
            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a notification.
    The first ``text/plain`` part is read as a JSON object whose values are
    lists; the first element of each list is used.

    :param data: iterable of response parts; only tuple entries are parsed
    :param provider: ingest provider, attached to the raised error
    :return: A list of 1 item, or an empty list when the subject is not
        'Formatted Editorial Story'
    :raises IngestEmailError: wrapping any exception raised while parsing,
        including ``UserNotRegisteredException`` for an unknown sender
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            # Check that the subject line matches what we expect, ignore it if not
            if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                return []

            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                body = part.get_payload(decode=True)
                # if we don't know the charset just have a go!
                charset = part.get_content_charset()
                decoded = body.decode() if charset is None else body.decode(charset)
                # NOTE(review): as written the second replace swaps a space
                # for a space (a no-op) - presumably it was meant to collapse
                # double spaces; confirm before changing
                json_str = decoded.replace('\r\n', '').replace(' ', ' ')

                mail_item = {k: v[0] for k, v in json.loads(json_str).items()}

                self._expand_category(item, mail_item)

                item['original_source'] = mail_item.get('Username', '')
                item['headline'] = mail_item.get('Headline', '')
                item['abstract'] = mail_item.get('Abstract', '')
                item['slugline'] = mail_item.get('Slugline', '')
                # NOTE(review): the body is placed into the markup unescaped
                item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'

                default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                city = mail_item.get('Dateline', '')
                city_lower = city.lower()
                located = [c for c in app.locators.find_cities() if c['city'].lower() == city_lower]
                item.setdefault('dateline', {})
                item['dateline']['located'] = located[0] if located else {
                    'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                item['dateline']['source'] = default_source
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    item['dateline']['located'], get_date(item['firstcreated']), source=default_source)

                if mail_item.get('Priority') != '':
                    priority = mail_item.get('Priority', '3')
                    if priority.isdigit():
                        item['priority'] = int(priority)
                    else:
                        # not a number: look the name up in the priority vocabulary
                        priority_map = superdesk.get_resource_service('vocabularies').find_one(
                            req=None, _id='priority')
                        wanted = priority.upper()
                        # a missing vocabulary is treated like one without a match
                        priorities = [x for x in (priority_map or {}).get('items', [])
                                      if x['name'].upper() == wanted]
                        item['priority'] = int(priorities[0].get('qcode', '3')) if priorities else 3

                if mail_item.get('News Value') != '':
                    item['urgency'] = int(mail_item.get('News Value', '3'))

                # We expect the username passed corresponds to a superdesk user
                username = mail_item.get('Username')
                user = None
                if username:
                    # the address comes from the email, so it is escaped before
                    # being used as a pattern ('.' and '+' would otherwise act
                    # as regex operators)
                    query = {'email': re.compile('^{}$'.format(re.escape(username)), re.IGNORECASE)}
                    user = superdesk.get_resource_service('users').find_one(req=None, **query)
                if not user:
                    logger.error('Failed to find user for email {}'.format(username))
                    raise UserNotRegisteredException()
                item['original_creator'] = user.get('_id')
                if BYLINE in user and user.get(BYLINE, ''):
                    item['byline'] = user.get(BYLINE)
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                desk_name = mail_item.get('Desk') or ''
                query = {'name': re.compile('^{}$'.format(re.escape(desk_name)), re.IGNORECASE)}
                desk = superdesk.get_resource_service('desks').find_one(req=None, **query)
                if desk:
                    item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                if 'Place' in mail_item:
                    locator_map = superdesk.get_resource_service('vocabularies').find_one(
                        req=None, _id='locators')
                    place_code = (mail_item.get('Place') or '').upper()
                    # set even when nothing matches (then an empty list)
                    item['place'] = [x for x in (locator_map or {}).get('items', [])
                                     if x['qcode'] == place_code]

                if mail_item.get('Legal flag', '') == 'LEGAL':
                    item['flags'] = {'marked_for_legal': True}

                # only the first text/plain part is used
                break

        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a notification.
    The first ``text/plain`` part is read as a JSON object whose values are
    lists; the first element of each list is used.

    :param data: iterable of response parts; only tuple entries are parsed
    :param provider: ingest provider, attached to the raised error
    :return: A list of 1 item, or an empty list when the subject is not
        'Formatted Editorial Story'
    :raises IngestEmailError: wrapping any exception raised while parsing,
        including ``UserNotRegisteredException`` for an unknown sender
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            # Check that the subject line matches what we expect, ignore it if not
            if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                return []

            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(
                    email.utils.mktime_tz(date_tuple))
                dt = dt.replace(tzinfo=timezone('utc'))
                item['firstcreated'] = dt

            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                body = part.get_payload(decode=True)
                # if we don't know the charset just have a go!
                charset = part.get_content_charset()
                if charset is None:
                    decoded = body.decode()
                else:
                    decoded = body.decode(charset)
                # NOTE(review): as written the second replace swaps a space
                # for a space (a no-op) - presumably it was meant to collapse
                # double spaces; confirm before changing
                json_str = decoded.replace('\r\n', '').replace(' ', ' ')

                mail_item = {
                    k: v[0] for k, v in json.loads(json_str).items()
                }

                self._expand_category(item, mail_item)

                item['original_source'] = mail_item.get('Username', '')
                item['headline'] = mail_item.get('Headline', '')
                item['abstract'] = mail_item.get('Abstract', '')
                item['slugline'] = mail_item.get('Slugline', '')
                # NOTE(review): the body is placed into the markup unescaped
                item['body_html'] = '<p>' + mail_item.get(
                    'Body', '').replace('\n', '</p><p>') + '</p>'

                default_source = app.config.get(
                    'DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                city = mail_item.get('Dateline', '')
                city_lower = city.lower()
                located = [
                    c for c in app.locators.find_cities()
                    if c['city'].lower() == city_lower
                ]
                item.setdefault('dateline', {})
                if located:
                    item['dateline']['located'] = located[0]
                else:
                    item['dateline']['located'] = {
                        'city_code': city,
                        'city': city,
                        'tz': 'UTC',
                        'dateline': 'city'
                    }
                item['dateline']['source'] = default_source
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    item['dateline']['located'],
                    get_date(item['firstcreated']),
                    source=default_source)

                if mail_item.get('Priority') != '':
                    priority = mail_item.get('Priority', '3')
                    if priority.isdigit():
                        item['priority'] = int(priority)
                    else:
                        # not a number: look the name up in the priority vocabulary
                        priority_map = superdesk.get_resource_service(
                            'vocabularies').find_one(req=None, _id='priority')
                        wanted = priority.upper()
                        # a missing vocabulary is treated like one without a match
                        priorities = [
                            x for x in (priority_map or {}).get('items', [])
                            if x['name'].upper() == wanted
                        ]
                        if priorities:
                            item['priority'] = int(
                                priorities[0].get('qcode', '3'))
                        else:
                            item['priority'] = 3

                if mail_item.get('News Value') != '':
                    item['urgency'] = int(mail_item.get('News Value', '3'))

                # We expect the username passed corresponds to a superdesk user
                username = mail_item.get('Username')
                user = None
                if username:
                    # the address comes from the email, so it is escaped before
                    # being used as a pattern ('.' and '+' would otherwise act
                    # as regex operators)
                    query = {
                        'email': re.compile(
                            '^{}$'.format(re.escape(username)),
                            re.IGNORECASE)
                    }
                    user = superdesk.get_resource_service(
                        'users').find_one(req=None, **query)
                if not user:
                    logger.error(
                        'Failed to find user for email {}'.format(username))
                    raise UserNotRegisteredException()
                item['original_creator'] = user.get('_id')
                if BYLINE in user and user.get(BYLINE, ''):
                    item['byline'] = user.get(BYLINE)
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                desk_name = mail_item.get('Desk') or ''
                query = {
                    'name': re.compile(
                        '^{}$'.format(re.escape(desk_name)), re.IGNORECASE)
                }
                desk = superdesk.get_resource_service(
                    'desks').find_one(req=None, **query)
                if desk:
                    item['task'] = {
                        'desk': desk.get('_id'),
                        'stage': desk.get('incoming_stage')
                    }

                if 'Place' in mail_item:
                    locator_map = superdesk.get_resource_service(
                        'vocabularies').find_one(req=None, _id='locators')
                    place_code = (mail_item.get('Place') or '').upper()
                    # set even when nothing matches (then an empty list)
                    item['place'] = [
                        x for x in (locator_map or {}).get('items', [])
                        if x['qcode'] == place_code
                    ]

                if mail_item.get('Legal flag', '') == 'LEGAL':
                    item['flags'] = {'marked_for_legal': True}

                # only the first text/plain part is used
                break

        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """
    Build an item from an email that was generated as a notification of a
    google form submission.

    The google form submits to a google sheet, and the sheet creates the
    email as a notification. The text/plain part of the mail is expected to
    hold a JSON object whose values are lists; the first element of each
    list is used.

    :param data: iterable of IMAP response parts; only tuple entries are
        read, and element [1] of each is parsed as the raw message bytes
    :param provider: ingest provider, passed on to the raised parse error
    :return: a list of 1 item, or an empty list when the subject line is not
        'Formatted Editorial Story'
    :raises IngestEmailError: (emailParseError) wrapping any exception raised
        while parsing, including the UserNotRegisteredException raised when
        no user matches the submitted email address
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            # Check that the subject line matches what we expect, ignore it if not
            if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                return []

            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                item['firstcreated'] = dt.replace(tzinfo=timezone('utc'))

            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue

                body = part.get_payload(decode=True)
                # if we don't know the charset just have a go!
                charset = part.get_content_charset()
                text = body.decode() if charset is None else body.decode(charset)
                # NOTE(review): the reviewed copy of this file had its whitespace
                # collapsed; the second replace is taken to turn two spaces into
                # one (a one-space to one-space replace would do nothing) -
                # confirm against the file.
                json_str = text.replace('\r\n', '').replace('  ', ' ')

                mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())

                self._expand_category(item, mail_item)

                username = mail_item.get('Username')
                item['original_source'] = username
                item['headline'] = mail_item.get('Headline')
                item['abstract'] = mail_item.get('Abstract')
                item['slugline'] = mail_item.get('Slugline')
                # A submission without a body yields an empty body instead of
                # failing on None.replace().
                item['body_html'] = mail_item.get('Body', '').replace('\n', '<br />')

                # An absent field is treated like an empty one; a present but
                # non-numeric value still raises and is reported as a parse error.
                priority = mail_item.get('Priority')
                if priority not in (None, ''):
                    item['priority'] = int(priority)
                urgency = mail_item.get('Urgency')
                if urgency not in (None, ''):
                    item['urgency'] = int(urgency)

                # We expect the username passed corresponds to a superdesk user.
                # The address comes from the mail, so it is escaped before being
                # embedded in the pattern: otherwise '+' or '.' in an address
                # would be read as regex syntax and the lookup would miss or
                # over-match.
                query = {
                    'email': re.compile('^{}$'.format(re.escape(str(username))), re.IGNORECASE)
                }
                user = superdesk.get_resource_service('users').find_one(req=None, **query)
                if not user:
                    logger.error('Failed to find user for email {}'.format(username))
                    raise UserNotRegisteredException()

                item['original_creator'] = user.get('_id')
                item['byline'] = user.get(BYLINE, user.get('display_name'))
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                desk = superdesk.get_resource_service('desks').find_one(
                    req=None, name=mail_item.get('Desk'))
                if desk:
                    item['task'] = {
                        'desk': desk.get('_id'),
                        'stage': desk.get('incoming_stage')
                    }

                # only the first text/plain part of a message is used
                break

        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
class EmailFeedingService(FeedingService):
    """
    Feeding Service class which can read the article(s) from a configured mail box.
    """

    NAME = 'email'

    ERRORS = [
        IngestEmailError.emailError().get_error_description(),
        IngestEmailError.emailLoginError().get_error_description()
    ]

    label = 'Email'

    # Configuration fields of the provider. The integer keys under 'errors'
    # are error codes mapped to the message shown for that field.
    fields = [
        {
            'id': 'server', 'type': 'text', 'label': 'Email Server',
            'placeholder': 'Email Server', 'required': True,
            'errors': {6003: 'Server not found.', 6002: 'Unexpected server response'}
        },
        {
            'id': 'port', 'type': 'text', 'label': 'Email Server Port',
            'placeholder': 'Email Server Port', 'required': True, 'default': '993'
        },
        {
            'id': 'user', 'type': 'text', 'label': 'User',
            'placeholder': 'User', 'required': True
        },
        {
            'id': 'password', 'type': 'password', 'label': 'Password',
            'placeholder': 'Password', 'required': True,
            'errors': {6000: 'Authentication error.'}
        },
        {
            # NOTE(review): this message duplicates the password field's text;
            # presumably 6004 is the mailbox error - confirm the intended wording.
            'id': 'mailbox', 'type': 'text', 'label': 'Mailbox',
            'placeholder': 'Mailbox', 'required': True,
            'errors': {6004: 'Authentication error.'}
        },
        {
            'id': 'formatted', 'type': 'boolean', 'label': 'Formatted Email Parser',
            'required': True
        },
        {
            'id': 'filter', 'type': 'text', 'label': 'Filter',
            'placeholder': 'Filter', 'required': True
        }
    ]

    def _test(self, provider):
        """
        Check the provider configuration by running an update in test mode.

        :param provider: ingest provider whose 'config' is checked
        :raises IngestEmailError: as raised by :meth:`_update`
        """
        self._update(provider, update=None, test=True)

    def _update(self, provider, update, test=False):
        """
        Read the messages matching the configured filter from the configured mailbox.

        Connects over IMAP/SSL, logs in, selects the mailbox and runs the
        search filter (default '(UNSEEN)'). Unless ``test`` is set, every
        matching message is fetched, parsed with the parser returned by
        ``get_feed_parser`` and flagged as seen.

        :param provider: ingest provider; its 'config' supplies server, port,
            user, password, mailbox and filter
        :param update: not used by this method
        :param test: when True only connection, login, mailbox and filter are
            checked; no message is fetched
        :return: list of the parse results, one entry per ingested message
        :raises IngestEmailError: emailHostError when the connection fails,
            emailLoginError when login is rejected, emailMailboxError /
            emailFilterError when select / search do not answer 'OK',
            emailError for any other exception
        """
        config = provider.get('config', {})
        server = config.get('server', '')
        port = int(config.get('port', 993))
        new_items = []

        try:
            try:
                # NOTE: this sets the default timeout for every socket created
                # afterwards in this process, not only for this connection.
                socket.setdefaulttimeout(app.config.get('EMAIL_TIMEOUT', 10))
                imap = imaplib.IMAP4_SSL(host=server, port=port)
            except (socket.gaierror, OSError) as e:
                raise IngestEmailError.emailHostError(exception=e, provider=provider)

            try:
                # Login happens inside the logout guard so that a rejected
                # login does not leave the connection open.
                try:
                    imap.login(config.get('user', None), config.get('password', None))
                except imaplib.IMAP4.error as e:
                    # hand over the caught instance so the server's message is kept
                    raise IngestEmailError.emailLoginError(e, provider)

                rv, data = imap.select(config.get('mailbox', None), readonly=False)
                if rv != 'OK':
                    raise IngestEmailError.emailMailboxError()

                try:
                    rv, data = imap.search(None, config.get('filter', '(UNSEEN)'))
                    if rv != 'OK':
                        raise IngestEmailError.emailFilterError()

                    # Fetching RFC822 from a read-write mailbox implicitly sets
                    # \Seen (RFC 3501), so a test run must not fetch: it would
                    # hide unseen mail from the next real ingest.
                    message_numbers = [] if test else data[0].split()
                    for num in message_numbers:
                        rv, data = imap.fetch(num, '(RFC822)')
                        if rv != 'OK':
                            continue
                        try:
                            parser = self.get_feed_parser(provider, data)
                            new_items.append(parser.parse(data, provider))
                            rv, data = imap.store(num, '+FLAGS', '\\Seen')
                        except IngestEmailError:
                            # a message that cannot be parsed is skipped
                            continue
                finally:
                    imap.close()
            finally:
                imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)

        return new_items

    def prepare_href(self, href, mimetype=None):
        """
        Return the media URL for ``href`` as built by ``url_for_media``.

        :param href: media reference passed to ``url_for_media``
        :param mimetype: optional mimetype passed to ``url_for_media``
        """
        return url_for_media(href, mimetype)
# # For the full copyright and license information, please see the # AUTHORS and LICENSE files distributed with this source code, or # at https://www.sourcefabric.org/superdesk/license import imaplib from .ingest_service import IngestService from superdesk.io import register_provider from superdesk.upload import url_for_media from superdesk.errors import IngestEmailError from superdesk.io.rfc822 import rfc822Parser PROVIDER = 'email' errors = [ IngestEmailError.emailError().get_error_description(), IngestEmailError.emailLoginError().get_error_description() ] class EmailReaderService(IngestService): def __init__(self): self.parser = rfc822Parser() def _update(self, provider): config = provider.get('config', {}) server = config.get('server', '') port = int(config.get('port', 993)) try: imap = imaplib.IMAP4_SSL(host=server, port=port)