def handle_attachment(line, target, message): """ Mac versions put "Attachment converted", Windows (Lite) has "Attachment Converted". Next comes a system-dependent path to the attachment binary. On mac version, separated by colons, starts with volume, but omits path elements between: Eudora Folder:Attachments Folder. Windows versions have a full DOS path name to the binary (Lite version uses 8-char filenames) This replaces that filepath with a file URI to the file in the attachments_dirs directories. This has no direct effect in Kmail, but sometimes Pine can open the file (so long as there aren't any spaces in the filepath). At least it makes more sense than leaving the old filepath. """ global attachments_listed, attachments_found, attachments_missing, attachments_dirs global paths_found, paths_missing global missing_attachments, found_attachments global mac_mismatches attachments_listed = attachments_listed + 1 # Mac 1.3.1 has e.g. (Type: 'PDF ' Creator: 'CARO') # Mac 3.1 has e.g (PDF /CARO) (00000645) if re_quoted_attachment.match(line): attachment_desc = re_quoted_attachment.sub('\\1', line) elif re_attachment.match(line): attachment_desc = re_attachment.sub('\\1', line) else: # If we're dealing with attachments recorded by the # X-Attachments header, line will be a single, naked # attachment desc, with no Attachment Converted # surroundings attachment_desc = line if attachment_desc.find('"') != -1: print "**>>**", attachment_desc attachment_desc = strip_linesep(attachment_desc) # some of John's attachment names have an odd OutboundG4: # prefix which is not present in the filenames on disk.. if attachment_desc.find('OutboundG4:') != -1: attachment_desc = attachment_desc.replace('OutboundG4:', '') name = '' # if has :\, must be windows etc = '' if re_dos_path_beginning.match(attachment_desc): desc_list = attachment_desc.split("\\") # DOS backslashes name = desc_list.pop().strip() # pop off last portion of name orig_path = "/".join(desc_list) if name[-1] == '"': name = name[:-1] elif re_mac_info.match(line): name = re_mac_info.sub('\\1', line) etc = re_mac_info.sub('\\2', line).strip() dlist = name.split(":") # Mac path delim name = dlist.pop().strip() # pop off last portion of name orig_path = "/".join(dlist) else: # EudoraLog.log.warn( "FAILED to convert attachment: \'" # + attachment_desc + "\'" ) name = attachment_desc orig_path = attachment_desc if len(name) <= 0: return filename = None for adir in attachments_dirs: if not filename or not os.path.exists(filename): filename = os.path.join(target, adir, name) if not os.path.isabs(target): filename = os.path.join(os.environ['HOME'], filename) # Trim NULL bytes from filenames (found in old Eudora 2.x mailboxes) filename = filename.replace(b'\x00', '') if not os.path.exists(filename): if name.startswith('OutboundG4:'): name = name[11:] print "**** Hey, name is now %s" % (name, ) filename = os.path.join(target, attachments_dir, name) # our user has attachments that have / characters in # the file name, but when they got copied over to # unix, the / chars were taken out, if it would help. if not os.path.exists(filename): if name.find('/') != -1: name = name.replace('/', '') filename = os.path.join(target, adir, name) # our user also has attachments that have _ characters # in the file name where the file on disk has spaces. # translate that as well, if it would help. if not os.path.exists(filename): if name.find('_') != -1: name = name.replace('_', ' ') filename = os.path.join(target, adir, name) # our user actually also has attachments that have # space characters in the file name where the file on # disk has underscores. if we didn't find the match # after our last transform, try the rever if not os.path.exists(filename): if name.find(' ') != -1: name = name.replace(' ', '_') filename = os.path.join(target, adir, name) # in our user's attachments, we have some files named # akin to 'filename.ppt 1' and so forth. we're going # to trim anything after the first whitespace # character after the first . in the filename cleaner_match = re_filename_cleaner.match(filename) if cleaner_match: filename = cleaner_match.group(1) # Trim any NULL bytes we might have pulled in filename = filename.replace(b'\x00', '') mimeinfo = mimetypes.guess_type(filename) if not os.path.exists(filename): cleaner_match = re_filename_cleaner.match(filename.replace('_', ' ')) if cleaner_match and os.path.exists(cleaner_match.group(1)): filename = cleaner_match.group(1) if not mimeinfo[0]: (mimetype, mimesubtype) = ('application', 'octet-stream') else: (mimetype, mimesubtype) = mimeinfo[0].split('/') if os.path.isfile(filename): fp = open(filename, 'rb') try: if mimetype == 'application' or mimetype == 'video': msg = MIMEApplication(fp.read(), _subtype=mimesubtype) elif mimetype == 'image': msg = MIMEImage(fp.read(), _subtype=mimesubtype) elif mimetype == 'text': msg = MIMEText(fp.read(), _subtype=mimesubtype) elif mimetype == 'audio': msg = MIMEAudio(fp.read(), _subtype=mimesubtype) else: EudoraLog.log.error( "Unrecognized mime type '%s' while processing attachment '%s'" % (mimeinfo[0], filename)) return finally: fp.close() msg.add_header('Content-Disposition', 'attachment', filename=name) message.attach(msg) attachments_found = attachments_found + 1 # EudoraLog.log.warn(" SUCCEEDED finding attachment: \'" + attachment_desc + "\', name = \'" + name + "\'") if orig_path in paths_found: paths_found[orig_path] = paths_found[orig_path] + 1 else: paths_found[orig_path] = 1 if not EudoraLog.log.mbx_name() in found_attachments: found_attachments[EudoraLog.log.mbx_name()] = [] found_attachments[EudoraLog.log.mbx_name()].append( (attachment_desc, filename)) else: attachments_missing = attachments_missing + 1 if not EudoraLog.log.mbx_name() in missing_attachments: missing_attachments[EudoraLog.log.mbx_name()] = [] missing_attachments[EudoraLog.log.mbx_name()].append(attachment_desc) # EudoraLog.log.warn(" FAILED to find attachment: \'" + attachment_desc + "\'" ) if re_mangled_mac.search(filename): print "Mac pattern: %s" % (filename, ) mac_mismatches.append(filename) if orig_path in paths_missing: paths_missing[orig_path] = paths_missing[orig_path] + 1 else: paths_missing[orig_path] = 1
def extract_pieces(msg_lines, msg_offset, mbx, inner_mesg=False): """Takes four parameters. The first is a list of line strings containing the headers and body of a message from a Eudora MBX file. The second is the offset of the first character in the first line in the msg_lines list within the MBX file we're processing. The third is the name of the MBX file we're reading. The fourth, inner_mesg, is a boolean controlling whether or not the pieces were are extracting are a top-level message in the MBX file. If inner_mesg is true, we will carry out our processing under the assumption that we are handling an attached message carried in an message/rfc822 segment. Returns a tuple (header, body, attachments, embeddeds, mbx) containing a Header object, a body String containing the body of the message, a list of attachment definition tuples, a list of embedded definition tuples, and the name of the MBX file we're processing.""" global toc_info, replies global target headers = Header() body = [] attachments = [] embeddeds = [] in_headers = True found_rfc822_inner_mesg = False is_html = False if not inner_mesg: headers.add('From ', msg_lines[0][5:].strip()) for line in msg_lines: if in_headers: if re_initial_whitespace.match(line): # Header "folding" (RFC 2822 3.2.3) headers.appendToLast(line) elif len(line.strip()) != 0: # Message header headers.add_line(line) attachment_matcher = re_x_attachment.match(line) if attachment_matcher: files = attachment_matcher.group(1) attach_list = re.split(';\s*', files) for attachment in attach_list: attachments.append((attachment, target)) else: # End of message headers. # scrub the header lines we've scanned if not inner_mesg: headers.clean(toc_info, msg_offset, replies) in_headers = False content_type = headers.getValue('Content-Type:') if content_type and content_type.lower() == 'message/rfc822': found_rfc822_inner_mesg = True print "+", elif found_rfc822_inner_mesg: # We're processing a message/rfc822 message, # and so we don't want to process attachments # at this level. Instead, we want to properly # extract all body lines for later processing body.append(strip_linesep(line) + "\n") else: # We're in the body of the text and we need to # handle attachments if not is_html and re_xhtml.search(line) or re_normal_html.search( line): is_html = True if attachments_dirs and re_attachment.search(line): # remove the newline that # Eudora inserts before the # 'Attachment Converted' line. if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'): body.pop() #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype) attachments.append((line, target)) else: embedded_matcher = re_embedded.match(line) if embedded_matcher: filename = embedded_matcher.group(1) embeddeds.append(filename) else: orig_line = line if scrub_xflowed: line = re.sub(re_xflowed, '', line) line = re.sub(re_xhtml, '', line) line = re.sub(re_pete_stuff, '', line) if orig_line == line or line != '': body.append(strip_linesep(line) + "\n") return (headers, body, attachments, embeddeds, mbx, is_html)
message_count = message_count + 1 newmailbox.add(message) except TypeError: print str(headers) print message.get_content_type() traceback.print_exc(file=sys.stdout) EudoraLog.msg_no = EudoraLog.msg_no + 1 msg_offset = last_file_position msg_lines = [] if not line: break msg_lines.append(strip_linesep(line) + "\n") last_file_position = INPUT.tell() EudoraLog.line_no += 1 # Check if the file isn't empty and any messages have been processed. if EudoraLog.line_no == 0: EudoraLog.log.warn('empty file') elif EudoraLog.msg_no == 0: EudoraLog.log.error('no messages (not a Eudora mailbox file?)') if True: print print "\nMissing path count:" for (path, count) in paths_missing.iteritems():
def handle_attachment( line, target, message ): """ Mac versions put "Attachment converted", Windows (Lite) has "Attachment Converted". Next comes a system-dependent path to the attachment binary. On mac version, separated by colons, starts with volume, but omits path elements between: Eudora Folder:Attachments Folder. Windows versions have a full DOS path name to the binary (Lite version uses 8-char filenames) This replaces that filepath with a file URI to the file in the attachments_dirs directories. This has no direct effect in Kmail, but sometimes Pine can open the file (so long as there aren't any spaces in the filepath). At least it makes more sense than leaving the old filepath. """ global attachments_listed, attachments_found, attachments_missing, attachments_dirs global paths_found, paths_missing global missing_attachments, found_attachments global mac_mismatches attachments_listed = attachments_listed + 1 # Mac 1.3.1 has e.g. (Type: 'PDF ' Creator: 'CARO') # Mac 3.1 has e.g (PDF /CARO) (00000645) if re_quoted_attachment.match(line): attachment_desc = re_quoted_attachment.sub( '\\1', line ) elif re_attachment.match(line): attachment_desc = re_attachment.sub( '\\1', line ) else: # If we're dealing with attachments recorded by the # X-Attachments header, line will be a single, naked # attachment desc, with no Attachment Converted # surroundings attachment_desc = line if attachment_desc.find('"') != -1: print "**>>**", attachment_desc attachment_desc = strip_linesep(attachment_desc) # some of John's attachment names have an odd OutboundG4: # prefix which is not present in the filenames on disk.. if attachment_desc.find('OutboundG4:') != -1: attachment_desc = attachment_desc.replace('OutboundG4:', '') name = '' # if has :\, must be windows etc = '' if re_dos_path_beginning.match( attachment_desc ): desc_list = attachment_desc.split( "\\" ) # DOS backslashes name = desc_list.pop().strip() # pop off last portion of name orig_path = "/".join(desc_list) if name[-1] == '"': name = name[:-1] elif re_mac_info.match( line ): name = re_mac_info.sub( '\\1', line ) etc = re_mac_info.sub( '\\2', line ).strip() dlist = name.split( ":" ) # Mac path delim name = dlist.pop().strip() # pop off last portion of name orig_path = "/".join(dlist) else: # EudoraLog.log.warn( "FAILED to convert attachment: \'" # + attachment_desc + "\'" ) name = attachment_desc orig_path = attachment_desc if len( name ) <= 0: return filename = None for adir in attachments_dirs: if not filename or not os.path.exists(filename): filename = os.path.join( target, adir, name ) if not os.path.isabs( target ): filename = os.path.join( os.environ['HOME'], filename ) if not os.path.exists(filename): if name.startswith('OutboundG4:'): name = name[11:] print "**** Hey, name is now %s" % (name, ) filename = os.path.join(target, attachments_dir, name) # our user has attachments that have / characters in # the file name, but when they got copied over to # unix, the / chars were taken out, if it would help. if not os.path.exists(filename): if name.find('/') != -1: name=name.replace('/','') filename = os.path.join(target, adir, name) # our user also has attachments that have _ characters # in the file name where the file on disk has spaces. # translate that as well, if it would help. if not os.path.exists(filename): if name.find('_') != -1: name = name.replace('_', ' ') filename = os.path.join(target, adir, name) # our user actually also has attachments that have # space characters in the file name where the file on # disk has underscores. if we didn't find the match # after our last transform, try the rever if not os.path.exists(filename): if name.find(' ') != -1: name = name.replace(' ', '_') filename = os.path.join(target, adir, name) # in our user's attachments, we have some files named # akin to 'filename.ppt 1' and so forth. we're going # to trim anything after the first whitespace # character after the first . in the filename cleaner_match = re_filename_cleaner.match( filename ) if cleaner_match: filename = cleaner_match.group(1) mimeinfo = mimetypes.guess_type(filename) if not os.path.exists(filename): cleaner_match = re_filename_cleaner.match(filename.replace('_', ' ')) if cleaner_match and os.path.exists(cleaner_match.group(1)): filename = cleaner_match.group(1) if not mimeinfo[0]: (mimetype, mimesubtype) = ('application', 'octet-stream') else: (mimetype, mimesubtype) = mimeinfo[0].split('/') if os.path.isfile(filename): fp = open(filename, 'rb') try: if mimetype == 'application' or mimetype == 'video': msg = MIMEApplication(fp.read(), _subtype=mimesubtype) elif mimetype == 'image': msg = MIMEImage(fp.read(), _subtype=mimesubtype) elif mimetype == 'text': msg = MIMEText(fp.read(), _subtype=mimesubtype) elif mimetype == 'audio': msg = MIMEAudio(fp.read(), _subtype=mimesubtype) else: EudoraLog.log.error("Unrecognized mime type '%s' while processing attachment '%s'" % (mimeinfo[0], filename)) return finally: fp.close() msg.add_header('Content-Disposition', 'attachment', filename=name) message.attach(msg) attachments_found = attachments_found + 1 # EudoraLog.log.warn(" SUCCEEDED finding attachment: \'" + attachment_desc + "\', name = \'" + name + "\'") if orig_path in paths_found: paths_found[orig_path] = paths_found[orig_path] + 1 else: paths_found[orig_path] = 1 if not EudoraLog.log.mbx_name() in found_attachments: found_attachments[EudoraLog.log.mbx_name()] = [] found_attachments[EudoraLog.log.mbx_name()].append((attachment_desc, filename)) else: attachments_missing = attachments_missing + 1 if not EudoraLog.log.mbx_name() in missing_attachments: missing_attachments[EudoraLog.log.mbx_name()] = [] missing_attachments[EudoraLog.log.mbx_name()].append(attachment_desc) # EudoraLog.log.warn(" FAILED to find attachment: \'" + attachment_desc + "\'" ) if re_mangled_mac.search(filename): print "Mac pattern: %s" % (filename, ) mac_mismatches.append(filename) if orig_path in paths_missing: paths_missing[orig_path] = paths_missing[orig_path] + 1 else: paths_missing[orig_path] = 1
def extract_pieces( msg_lines, msg_offset, mbx, inner_mesg=False ): """Takes four parameters. The first is a list of line strings containing the headers and body of a message from a Eudora MBX file. The second is the offset of the first character in the first line in the msg_lines list within the MBX file we're processing. The third is the name of the MBX file we're reading. The fourth, inner_mesg, is a boolean controlling whether or not the pieces were are extracting are a top-level message in the MBX file. If inner_mesg is true, we will carry out our processing under the assumption that we are handling an attached message carried in an message/rfc822 segment. Returns a tuple (header, body, attachments, embeddeds, mbx) containing a Header object, a body String containing the body of the message, a list of attachment definition tuples, a list of embedded definition tuples, and the name of the MBX file we're processing.""" global toc_info, replies global target headers = Header() body = [] attachments = [] embeddeds = [] in_headers = True found_rfc822_inner_mesg = False is_html = False if not inner_mesg: headers.add( 'From ', msg_lines[0][5:].strip() ) for line in msg_lines: if in_headers: if re_initial_whitespace.match( line ): # Header "folding" (RFC 2822 3.2.3) headers.appendToLast( line ) elif len( line.strip() ) != 0: # Message header headers.add_line(line) attachment_matcher = re_x_attachment.match( line ) if attachment_matcher: files = attachment_matcher.group(1) attach_list = re.split(';\s*', files) for attachment in attach_list: attachments.append( (attachment, target) ) else: # End of message headers. # scrub the header lines we've scanned if not inner_mesg: headers.clean(toc_info, msg_offset, replies) in_headers = False content_type = headers.getValue('Content-Type:') if content_type and content_type.lower() == 'message/rfc822': found_rfc822_inner_mesg = True print "+", elif found_rfc822_inner_mesg: # We're processing a message/rfc822 message, # and so we don't want to process attachments # at this level. Instead, we want to properly # extract all body lines for later processing body.append(strip_linesep(line) + "\n") else: # We're in the body of the text and we need to # handle attachments if not is_html and re_xhtml.search( line ) or re_normal_html.search( line ): is_html = True if attachments_dirs and re_attachment.search( line ): # remove the newline that # Eudora inserts before the # 'Attachment Converted' line. if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'): body.pop() #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype) attachments.append( (line, target) ) else: embedded_matcher = re_embedded.match ( line ) if embedded_matcher: filename = embedded_matcher.group(1) embeddeds.append( filename ) else: orig_line = line if scrub_xflowed: line = re.sub(re_xflowed, '', line) line = re.sub(re_xhtml, '', line) line = re.sub(re_pete_stuff, '', line) if orig_line == line or line != '': body.append(strip_linesep(line) + "\n") return ( headers, body, attachments, embeddeds, mbx, is_html )
message_count = message_count + 1 newmailbox.add(message) except TypeError: print str(headers) print message.get_content_type() traceback.print_exc(file=sys.stdout) EudoraLog.msg_no = EudoraLog.msg_no + 1 msg_offset = last_file_position msg_lines = [] if not line: break msg_lines.append(strip_linesep(line) + "\n") last_file_position = INPUT.tell() EudoraLog.line_no += 1 # Check if the file isn't empty and any messages have been processed. if EudoraLog.line_no == 0: EudoraLog.log.warn( 'empty file' ) elif EudoraLog.msg_no == 0: EudoraLog.log.error( 'no messages (not a Eudora mailbox file?)' ) if True: print print "\nMissing path count:" for (path, count) in paths_missing.iteritems():