def get_main_content(self, msg: email.message.EmailMessage):
    """Return (main_content, content_type, charset) extracted from *msg*.

    On success the best body part chosen by ``EmailMessage.get_body()`` is
    decoded and returned.  When decoding fails — which in practice happens
    for mails whose charset declaration is broken — the text part is
    returned raw, without decoding, and the fallback placeholders are used
    if no text part exists at all.
    """
    try:
        body_part = msg.get_body()
        if body_part is None:
            # get_body() returns None when no candidate part exists;
            # make that explicit instead of relying on an AttributeError.
            raise ValueError('no body part found')
        main_content = body_part.get_content()
        format_ = body_part.get_content_type()
        charset = body_part.get_content_charset()
    except Exception as error:
        print(error)
        # Fallback placeholders used when nothing salvageable is found.
        main_content = '解析失敗'
        format_ = '不明'
        charset = '不明'
        # get_body() fails when the mail declares a bogus charset, so
        # return the text part as-is, without decoding it.
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                format_ = part.get_content_type()
                main_content = str(part.get_payload())
                charset = part.get_content_charset()
    return main_content, format_, charset
def filter_email(eml: email.message.EmailMessage) -> str:
    """Fetch and return the prettified HTML of the first URL found in the
    plaintext body of *eml*.

    Raises:
        EmailParseException: the message has no text/plain body part.
        LinkNotFoundException: the body contains no recognizable URL.
    """
    # Get plaintext part.  get_body() expects a *sequence* of subtypes;
    # the original bare string "plain" only matched by accidental
    # substring comparison.
    body = eml.get_body(('plain',))
    if body is None:
        raise EmailParseException()
    # Find a link in the body
    link_re = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    urls = re.findall(link_re, body.get_content())
    if not urls:
        raise LinkNotFoundException()
    url: str = urls[0][0]
    # Some sites require browser-like headers and refuse bare requests.
    headers = {
        'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    }
    # Download the contents at that URL with urllib
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        webpage = response.read()
    # Parse with BeautifulSoup and return the formatted document.
    # (The trailing `return url` in the original was unreachable dead code.)
    bs = BeautifulSoup(webpage, features='lxml')
    return bs.prettify()
def process_message(uid, msg: email.message.EmailMessage):
    """Summarize *msg* into a plain dict of headers, dates and extracted links.

    Every field is extracted best-effort: anything that cannot be parsed
    degrades to None (or [] for links) instead of failing the whole message.
    Narrowed the original bare ``except:`` clauses to ``except Exception:``
    so SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    raw_date = None
    local_date = None
    try:
        raw_date = msg.get('Date')
        # Convert the RFC 2822 date string to a local datetime.
        date_tuple = email.utils.parsedate_tz(raw_date)
        if date_tuple:
            local_date = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple))
    except Exception:
        raw_date = None
        local_date = None

    links = []
    try:
        body = msg.get_body(('html', 'plain'))
        if body:
            if body.get_content_type() == 'text/plain':
                links = links_from_plaintext(body.get_content())
            elif body.get_content_type() == 'text/html':
                links = links_from_html(body.get_content())
    except Exception:
        # Link extraction is optional; an unparsable body yields no links.
        pass

    def _header(name):
        # Best-effort header access: a malformed header yields None.
        try:
            return msg.get(name)
        except Exception:
            return None

    return {
        'uid': uid,
        'to': _header('To'),
        'from': _header('From'),
        'subject': _header('Subject'),
        'raw_date': raw_date,
        'local_date': local_date,
        'message-id': _header('Message-ID'),
        'links': links,
        'num-links': len(links),
    }