Esempio n. 1
0
def handle_attachment(line, target, message):
    """
	Mac versions put "Attachment converted", Windows (Lite) has
	"Attachment Converted". 

	Next comes a system-dependent path to the attachment binary.
	On mac version, separated by colons, starts with volume, but omits
	path elements between:

	Eudora Folder:Attachments Folder. 

	Windows versions have a full DOS path name to the binary
	(Lite version uses 8-char filenames)
	
	This replaces that filepath with a file URI to the file in the
	attachments_dirs directories.  This has no direct effect in Kmail, but 
	sometimes Pine can open the file (so long as there aren't any 
	spaces in the filepath).  At least it makes more sense than
	leaving the old filepath.
	"""

    global attachments_listed, attachments_found, attachments_missing, attachments_dirs
    global paths_found, paths_missing
    global missing_attachments, found_attachments
    global mac_mismatches

    attachments_listed = attachments_listed + 1

    # Mac 1.3.1 has e.g. (Type: 'PDF ' Creator: 'CARO')
    # Mac 3.1 has e.g (PDF /CARO) (00000645)

    if re_quoted_attachment.match(line):
        attachment_desc = re_quoted_attachment.sub('\\1', line)
    elif re_attachment.match(line):
        attachment_desc = re_attachment.sub('\\1', line)
    else:
        # If we're dealing with attachments recorded by the
        # X-Attachments header, line will be a single, naked
        # attachment desc, with no Attachment Converted
        # surroundings

        attachment_desc = line

    if attachment_desc.find('"') != -1:
        print "**>>**", attachment_desc

    attachment_desc = strip_linesep(attachment_desc)

    # some of John's attachment names have an odd OutboundG4:
    # prefix which is not present in the filenames on disk..

    if attachment_desc.find('OutboundG4:') != -1:
        attachment_desc = attachment_desc.replace('OutboundG4:', '')

    name = ''
    # if has :\, must be windows
    etc = ''
    if re_dos_path_beginning.match(attachment_desc):
        desc_list = attachment_desc.split("\\")  # DOS backslashes
        name = desc_list.pop().strip()  # pop off last portion of name
        orig_path = "/".join(desc_list)
        if name[-1] == '"':
            name = name[:-1]
    elif re_mac_info.match(line):
        name = re_mac_info.sub('\\1', line)
        etc = re_mac_info.sub('\\2', line).strip()
        dlist = name.split(":")  # Mac path delim
        name = dlist.pop().strip()  # pop off last portion of name
        orig_path = "/".join(dlist)
    else:
        #		EudoraLog.log.warn( "FAILED to convert attachment: \'"
        #				    + attachment_desc + "\'" )
        name = attachment_desc
        orig_path = attachment_desc

    if len(name) <= 0:
        return

    filename = None

    for adir in attachments_dirs:
        if not filename or not os.path.exists(filename):
            filename = os.path.join(target, adir, name)
            if not os.path.isabs(target):
                filename = os.path.join(os.environ['HOME'], filename)

# Trim NULL bytes from filenames (found in old Eudora 2.x mailboxes)
            filename = filename.replace(b'\x00', '')

            if not os.path.exists(filename):
                if name.startswith('OutboundG4:'):
                    name = name[11:]
                    print "**** Hey, name is now %s" % (name, )
                    filename = os.path.join(target, attachments_dir, name)

            # our user has attachments that have / characters in
            # the file name, but when they got copied over to
            # unix, the / chars were taken out, if it would help.

            if not os.path.exists(filename):
                if name.find('/') != -1:
                    name = name.replace('/', '')
                    filename = os.path.join(target, adir, name)

            # our user also has attachments that have _ characters
            # in the file name where the file on disk has spaces.
            # translate that as well, if it would help.

            if not os.path.exists(filename):
                if name.find('_') != -1:
                    name = name.replace('_', ' ')
                    filename = os.path.join(target, adir, name)

            # our user actually also has attachments that have
            # space characters in the file name where the file on
            # disk has underscores.  if we didn't find the match
            # after our last transform, try the rever

            if not os.path.exists(filename):
                if name.find(' ') != -1:
                    name = name.replace(' ', '_')
                    filename = os.path.join(target, adir, name)

    # in our user's attachments, we have some files named
    # akin to 'filename.ppt 1' and so forth.  we're going
    # to trim anything after the first whitespace
    # character after the first . in the filename

    cleaner_match = re_filename_cleaner.match(filename)

    if cleaner_match:
        filename = cleaner_match.group(1)
        # Trim any NULL bytes we might have pulled in
        filename = filename.replace(b'\x00', '')

    mimeinfo = mimetypes.guess_type(filename)

    if not os.path.exists(filename):
        cleaner_match = re_filename_cleaner.match(filename.replace('_', ' '))

        if cleaner_match and os.path.exists(cleaner_match.group(1)):
            filename = cleaner_match.group(1)

    if not mimeinfo[0]:
        (mimetype, mimesubtype) = ('application', 'octet-stream')
    else:
        (mimetype, mimesubtype) = mimeinfo[0].split('/')

    if os.path.isfile(filename):
        fp = open(filename, 'rb')

        try:
            if mimetype == 'application' or mimetype == 'video':
                msg = MIMEApplication(fp.read(), _subtype=mimesubtype)
            elif mimetype == 'image':
                msg = MIMEImage(fp.read(), _subtype=mimesubtype)
            elif mimetype == 'text':
                msg = MIMEText(fp.read(), _subtype=mimesubtype)
            elif mimetype == 'audio':
                msg = MIMEAudio(fp.read(), _subtype=mimesubtype)
            else:
                EudoraLog.log.error(
                    "Unrecognized mime type '%s' while processing attachment '%s'"
                    % (mimeinfo[0], filename))
                return
        finally:
            fp.close()

        msg.add_header('Content-Disposition', 'attachment', filename=name)

        message.attach(msg)

        attachments_found = attachments_found + 1

        #		EudoraLog.log.warn(" SUCCEEDED finding attachment: \'" + attachment_desc + "\', name = \'" + name + "\'")
        if orig_path in paths_found:
            paths_found[orig_path] = paths_found[orig_path] + 1
        else:
            paths_found[orig_path] = 1

        if not EudoraLog.log.mbx_name() in found_attachments:
            found_attachments[EudoraLog.log.mbx_name()] = []
        found_attachments[EudoraLog.log.mbx_name()].append(
            (attachment_desc, filename))
    else:
        attachments_missing = attachments_missing + 1

        if not EudoraLog.log.mbx_name() in missing_attachments:
            missing_attachments[EudoraLog.log.mbx_name()] = []
        missing_attachments[EudoraLog.log.mbx_name()].append(attachment_desc)

        #		EudoraLog.log.warn(" FAILED to find attachment: \'" + attachment_desc + "\'" )

        if re_mangled_mac.search(filename):
            print "Mac pattern: %s" % (filename, )
            mac_mismatches.append(filename)

        if orig_path in paths_missing:
            paths_missing[orig_path] = paths_missing[orig_path] + 1
        else:
            paths_missing[orig_path] = 1
Esempio n. 2
0
def extract_pieces(msg_lines, msg_offset, mbx, inner_mesg=False):
    """Takes four parameters.  The first is a list of line strings
	containing the headers and body of a message from a Eudora MBX
	file.  The second is the offset of the first character in the
	first line in the msg_lines list within the MBX file we're
	processing.  The third is the name of the MBX file we're
	reading.  The fourth, inner_mesg, is a boolean controlling
	whether or not the pieces were are extracting are a top-level
	message in the MBX file.  If inner_mesg is true, we will carry
	out our processing under the assumption that we are handling
	an attached message carried in an message/rfc822 segment.
	
	Returns a tuple (header, body, attachments, embeddeds, mbx)
	containing a Header object, a body String containing the body
	of the message, a list of attachment definition tuples, a list
	of embedded definition tuples, and the name of the MBX file
	we're processing."""

    global toc_info, replies
    global target

    headers = Header()
    body = []
    attachments = []
    embeddeds = []

    in_headers = True
    found_rfc822_inner_mesg = False
    is_html = False

    if not inner_mesg:
        headers.add('From ', msg_lines[0][5:].strip())

    for line in msg_lines:
        if in_headers:
            if re_initial_whitespace.match(line):
                # Header "folding" (RFC 2822 3.2.3)
                headers.appendToLast(line)
            elif len(line.strip()) != 0:
                # Message header
                headers.add_line(line)

                attachment_matcher = re_x_attachment.match(line)

                if attachment_matcher:
                    files = attachment_matcher.group(1)
                    attach_list = re.split(';\s*', files)

                    for attachment in attach_list:
                        attachments.append((attachment, target))
            else:
                # End of message headers.

                # scrub the header lines we've scanned

                if not inner_mesg:
                    headers.clean(toc_info, msg_offset, replies)

                in_headers = False

                content_type = headers.getValue('Content-Type:')

                if content_type and content_type.lower() == 'message/rfc822':
                    found_rfc822_inner_mesg = True
                    print "+",
        elif found_rfc822_inner_mesg:
            # We're processing a message/rfc822 message,
            # and so we don't want to process attachments
            # at this level.  Instead, we want to properly
            # extract all body lines for later processing

            body.append(strip_linesep(line) + "\n")
        else:
            # We're in the body of the text and we need to
            # handle attachments

            if not is_html and re_xhtml.search(line) or re_normal_html.search(
                    line):
                is_html = True

            if attachments_dirs and re_attachment.search(line):
                # remove the newline that
                # Eudora inserts before the
                # 'Attachment Converted' line.

                if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'):
                    body.pop()

                #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype)
                attachments.append((line, target))
            else:
                embedded_matcher = re_embedded.match(line)

                if embedded_matcher:
                    filename = embedded_matcher.group(1)
                    embeddeds.append(filename)
                else:
                    orig_line = line

                    if scrub_xflowed:
                        line = re.sub(re_xflowed, '', line)
                        line = re.sub(re_xhtml, '', line)
                        line = re.sub(re_pete_stuff, '', line)

                    if orig_line == line or line != '':
                        body.append(strip_linesep(line) + "\n")

    return (headers, body, attachments, embeddeds, mbx, is_html)
Esempio n. 3
0
                message_count = message_count + 1
                newmailbox.add(message)
            except TypeError:
                print str(headers)
                print message.get_content_type()
                traceback.print_exc(file=sys.stdout)

            EudoraLog.msg_no = EudoraLog.msg_no + 1
            msg_offset = last_file_position

            msg_lines = []

        if not line:
            break

        msg_lines.append(strip_linesep(line) + "\n")
        last_file_position = INPUT.tell()
        EudoraLog.line_no += 1

    # Check if the file isn't empty and any messages have been processed.
    if EudoraLog.line_no == 0:
        EudoraLog.log.warn('empty file')
    elif EudoraLog.msg_no == 0:
        EudoraLog.log.error('no messages (not a Eudora mailbox file?)')

    if True:
        print

        print "\nMissing path count:"

        for (path, count) in paths_missing.iteritems():
Esempio n. 4
0
def handle_attachment( line, target, message ):
	"""
	Mac versions put "Attachment converted", Windows (Lite) has
	"Attachment Converted". 

	Next comes a system-dependent path to the attachment binary.
	On mac version, separated by colons, starts with volume, but omits
	path elements between:

	Eudora Folder:Attachments Folder. 

	Windows versions have a full DOS path name to the binary
	(Lite version uses 8-char filenames)
	
	This replaces that filepath with a file URI to the file in the
	attachments_dirs directories.  This has no direct effect in Kmail, but 
	sometimes Pine can open the file (so long as there aren't any 
	spaces in the filepath).  At least it makes more sense than
	leaving the old filepath.
	"""

	global attachments_listed, attachments_found, attachments_missing, attachments_dirs
	global paths_found, paths_missing
	global missing_attachments, found_attachments
	global mac_mismatches

	attachments_listed = attachments_listed + 1

	# Mac 1.3.1 has e.g. (Type: 'PDF ' Creator: 'CARO')
	# Mac 3.1 has e.g (PDF /CARO) (00000645)

	if re_quoted_attachment.match(line):
		attachment_desc = re_quoted_attachment.sub( '\\1', line )
	elif re_attachment.match(line):
		attachment_desc = re_attachment.sub( '\\1', line )
	else:
		# If we're dealing with attachments recorded by the
		# X-Attachments header, line will be a single, naked
		# attachment desc, with no Attachment Converted
		# surroundings

		attachment_desc = line

	if attachment_desc.find('"') != -1:
		print "**>>**", attachment_desc

	attachment_desc = strip_linesep(attachment_desc)

	# some of John's attachment names have an odd OutboundG4:
	# prefix which is not present in the filenames on disk..

	if attachment_desc.find('OutboundG4:') != -1:
		attachment_desc = attachment_desc.replace('OutboundG4:', '')

	name = ''
	# if has :\, must be windows
	etc = ''
	if re_dos_path_beginning.match( attachment_desc ):
		desc_list = attachment_desc.split( "\\" ) # DOS backslashes
		name = desc_list.pop().strip()	# pop off last portion of name
		orig_path = "/".join(desc_list)
		if name[-1] == '"':
			name = name[:-1]
	elif re_mac_info.match( line ):
		name = re_mac_info.sub( '\\1', line )
		etc = re_mac_info.sub( '\\2', line ).strip() 
		dlist = name.split( ":" ) # Mac path delim
		name = dlist.pop().strip()	# pop off last portion of name
		orig_path = "/".join(dlist)
	else:
#		EudoraLog.log.warn( "FAILED to convert attachment: \'"
#				    + attachment_desc + "\'" )
		name = attachment_desc
		orig_path = attachment_desc

	if len( name ) <= 0:
		return

	filename = None

	for adir in attachments_dirs:
		if not filename or not os.path.exists(filename):
			filename = os.path.join( target, adir, name )
			if not os.path.isabs( target ):
				filename = os.path.join( os.environ['HOME'], filename )

			if not os.path.exists(filename):
				if name.startswith('OutboundG4:'):
					name = name[11:]
					print "**** Hey, name is now %s" % (name, )
					filename = os.path.join(target, attachments_dir, name)

			# our user has attachments that have / characters in
			# the file name, but when they got copied over to
			# unix, the / chars were taken out, if it would help.

			if not os.path.exists(filename):
				if name.find('/') != -1:
					name=name.replace('/','')
					filename = os.path.join(target, adir, name)

			# our user also has attachments that have _ characters
			# in the file name where the file on disk has spaces.
			# translate that as well, if it would help.

			if not os.path.exists(filename):
				if name.find('_') != -1:
					name = name.replace('_', ' ')
					filename = os.path.join(target, adir, name)

			# our user actually also has attachments that have
			# space characters in the file name where the file on
			# disk has underscores.  if we didn't find the match
			# after our last transform, try the rever

			if not os.path.exists(filename):
				if name.find(' ') != -1:
					name = name.replace(' ', '_')
					filename = os.path.join(target, adir, name)

	# in our user's attachments, we have some files named
	# akin to 'filename.ppt 1' and so forth.  we're going
	# to trim anything after the first whitespace
	# character after the first . in the filename

	cleaner_match = re_filename_cleaner.match( filename )

	if cleaner_match:
		filename = cleaner_match.group(1)

	mimeinfo = mimetypes.guess_type(filename)

	if not os.path.exists(filename):
		cleaner_match = re_filename_cleaner.match(filename.replace('_', ' '))

		if cleaner_match and os.path.exists(cleaner_match.group(1)):
			filename = cleaner_match.group(1)

	if not mimeinfo[0]:
		(mimetype, mimesubtype) = ('application', 'octet-stream')
	else:
		(mimetype, mimesubtype) = mimeinfo[0].split('/')

	if os.path.isfile(filename):
		fp = open(filename, 'rb')

		try:
			if mimetype == 'application' or mimetype == 'video':
				msg = MIMEApplication(fp.read(), _subtype=mimesubtype)
			elif mimetype == 'image':
				msg = MIMEImage(fp.read(), _subtype=mimesubtype)
			elif mimetype == 'text':
				msg = MIMEText(fp.read(), _subtype=mimesubtype)
			elif mimetype == 'audio':
				msg = MIMEAudio(fp.read(), _subtype=mimesubtype)
			else:
				EudoraLog.log.error("Unrecognized mime type '%s' while processing attachment '%s'" % (mimeinfo[0], filename))
				return
		finally:
			fp.close()

		msg.add_header('Content-Disposition', 'attachment', filename=name)

		message.attach(msg)

		attachments_found = attachments_found + 1

#		EudoraLog.log.warn(" SUCCEEDED finding attachment: \'" + attachment_desc + "\', name = \'" + name + "\'")
		if orig_path in paths_found:
			paths_found[orig_path] = paths_found[orig_path] + 1
		else:
			paths_found[orig_path] = 1

		if not EudoraLog.log.mbx_name() in found_attachments:
			found_attachments[EudoraLog.log.mbx_name()] = []
		found_attachments[EudoraLog.log.mbx_name()].append((attachment_desc, filename))
	else:
		attachments_missing = attachments_missing + 1

		if not EudoraLog.log.mbx_name() in missing_attachments:
			missing_attachments[EudoraLog.log.mbx_name()] = []
		missing_attachments[EudoraLog.log.mbx_name()].append(attachment_desc)

#		EudoraLog.log.warn(" FAILED to find attachment: \'" + attachment_desc + "\'" )

		if re_mangled_mac.search(filename):
			print "Mac pattern: %s" % (filename, )
			mac_mismatches.append(filename)

		if orig_path in paths_missing:
			paths_missing[orig_path] = paths_missing[orig_path] + 1
		else:
			paths_missing[orig_path] = 1
Esempio n. 5
0
def extract_pieces( msg_lines, msg_offset, mbx, inner_mesg=False ):
	"""Takes four parameters.  The first is a list of line strings
	containing the headers and body of a message from a Eudora MBX
	file.  The second is the offset of the first character in the
	first line in the msg_lines list within the MBX file we're
	processing.  The third is the name of the MBX file we're
	reading.  The fourth, inner_mesg, is a boolean controlling
	whether or not the pieces were are extracting are a top-level
	message in the MBX file.  If inner_mesg is true, we will carry
	out our processing under the assumption that we are handling
	an attached message carried in an message/rfc822 segment.
	
	Returns a tuple (header, body, attachments, embeddeds, mbx)
	containing a Header object, a body String containing the body
	of the message, a list of attachment definition tuples, a list
	of embedded definition tuples, and the name of the MBX file
	we're processing."""

	global toc_info, replies
	global target

	headers = Header()
	body = []
	attachments = []
	embeddeds = []

	in_headers = True
	found_rfc822_inner_mesg = False
	is_html = False

	if not inner_mesg:
		headers.add( 'From ', msg_lines[0][5:].strip() )

	for line in msg_lines:
		if in_headers:
			if re_initial_whitespace.match( line ):
				# Header "folding" (RFC 2822 3.2.3)
				headers.appendToLast( line )
			elif len( line.strip() ) != 0:
				# Message header
				headers.add_line(line)

				attachment_matcher = re_x_attachment.match( line )

				if attachment_matcher:
					files = attachment_matcher.group(1)
					attach_list = re.split(';\s*', files)

					for attachment in attach_list:
						attachments.append( (attachment, target) )
			else:
				# End of message headers.

				# scrub the header lines we've scanned

				if not inner_mesg:
					headers.clean(toc_info, msg_offset, replies)

				in_headers = False

				content_type = headers.getValue('Content-Type:')

				if content_type and content_type.lower() == 'message/rfc822':
					found_rfc822_inner_mesg = True
					print "+",
		elif found_rfc822_inner_mesg:
			# We're processing a message/rfc822 message,
			# and so we don't want to process attachments
			# at this level.  Instead, we want to properly
			# extract all body lines for later processing

			body.append(strip_linesep(line) + "\n")
		else:
			# We're in the body of the text and we need to
			# handle attachments

			if not is_html and re_xhtml.search( line ) or re_normal_html.search( line ):
				is_html = True

			if attachments_dirs and re_attachment.search( line ):
				# remove the newline that
				# Eudora inserts before the
				# 'Attachment Converted' line.

				if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'):
					body.pop()

				#EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype)
				attachments.append( (line, target) )
			else:
				embedded_matcher = re_embedded.match ( line )
					
				if embedded_matcher:
					filename = embedded_matcher.group(1)
					embeddeds.append( filename )
				else:
					orig_line = line

					if scrub_xflowed:
						line = re.sub(re_xflowed, '', line)
						line = re.sub(re_xhtml, '', line)
						line = re.sub(re_pete_stuff, '', line)

					if orig_line == line or line != '':
						body.append(strip_linesep(line) + "\n")

	return ( headers, body, attachments, embeddeds, mbx, is_html )
Esempio n. 6
0
				message_count = message_count + 1
				newmailbox.add(message)
			except TypeError:
				print str(headers)
				print message.get_content_type()
				traceback.print_exc(file=sys.stdout)

			EudoraLog.msg_no = EudoraLog.msg_no + 1
			msg_offset = last_file_position

			msg_lines = []

		if not line:
			break

		msg_lines.append(strip_linesep(line) + "\n")
		last_file_position = INPUT.tell()
		EudoraLog.line_no += 1

	# Check if the file isn't empty and any messages have been processed.
	if EudoraLog.line_no == 0:
		EudoraLog.log.warn( 'empty file' )
	elif EudoraLog.msg_no == 0:
		EudoraLog.log.error( 'no messages (not a Eudora mailbox file?)' )

	if True:
		print

		print "\nMissing path count:"

		for (path, count) in paths_missing.iteritems():