# Verify the corpus (category)
    if sys.argv[1] == "ham":
        corpus = "1"
    elif sys.argv[1] == "spam":
        corpus = "2"
    else:
        print("Usage: " + sys.argv[0] + " {spam|ham} output_filename")
        quit(1)

    output_filename = sys.argv[2]
else:
    print("Usage: " + sys.argv[0] + " {spam|ham} output_filename")
    quit(1)

# Initialize the feature extractor
extractor = EmailFeatureExtractor(vector_file)

# Iterate through each line in stdin
for line in sys.stdin:
    message_filename = line.strip()

    # See if the file exists
    if not os.path.exists(message_filename):
        print("Error: \"" + message_filename + "\" does not exist")
        continue

    # Open the file
    fp = open(message_filename, "rb")

    # Parse the file as an email message
    try:
Exemple #2
0
	def handle(self):
		"""Serve a single spamd-style request read from ``self.rfile``.

		Steps, in order:

		1. Read the request line and validate the command.  Malformed lines
		   and unknown commands get a ``USAGE`` response; ``PING`` and
		   ``SKIP`` get a short ``EX_OK`` reply written to ``self.wfile``.
		2. Read headers up to the first blank line, keeping only
		   ``Content-length``.
		3. Copy that many body bytes into a temporary file under
		   ``self.tmp_dir`` and parse it as an email message.
		4. Extract features with ``EmailFeatureExtractor(self.vectors)`` and
		   write them as one ``label index:value ...`` line to a second
		   temporary file.
		5. Run ``self.mallet classify-svmlight`` with ``self.filter_model``
		   on that file and compare the two probabilities it prints.
		6. Send the verdict through ``self.sendResponse`` and append one
		   ``ham|spam|recipient|subject`` line to the filter log.

		Side effects: sets ``self.command`` and ``self.length``; writes to
		``self.wfile`` (directly or through ``self.sendResponse``) and to
		``/var/log/spam_filter/filter.log``.
		"""
		valid_commands = ("CHECK", "SYMBOLS", "REPORT", "REPORT_IFSPAM",
			"SKIP", "PING", "PROCESS", "TELL")

		# Get the request line (first command)
		request_line = str(self.rfile.readline().strip(), 'iso-8859-1')

		# Separate the words; a well-formed line is "<COMMAND> <PROTOCOL>".
		# self.command is assigned before any response goes out, as before.
		words = request_line.split()
		self.command = words[0] if words else ""

		# Reject short/empty request lines and unknown commands alike
		# instead of failing with an IndexError.
		if len(words) < 2 or self.command not in valid_commands:
			self.sendResponse("USAGE", True, 100, "")
			return

		# See if we are dealing with a PING or a SKIP
		if self.command in ("PING", "SKIP"):
			# Send a short OK response
			self.wfile.write(b'SPAMD/1.1 0 EX_OK\r\n\r\n')
			return

		# Get the headers; (we only care about Content-length)
		self.length = 0
		while True:
			raw_header = str(self.rfile.readline().strip(), 'iso-8859-1')
			if not raw_header:
				break
			header = raw_header.split(':', 2)
			if header[0].strip().lower() != 'content-length':
				continue
			try:
				self.length = int(header[1].strip())
			except (IndexError, ValueError):
				# Missing or non-numeric length: treat as a bad request.
				self.sendResponse("USAGE", True, 100, "")
				return

		block_size = 4096

		# The context manager guarantees the temporary file is closed (and
		# therefore deleted) on every exit path, including early returns.
		with tempfile.NamedTemporaryFile(dir=self.tmp_dir) as temp_file:
			# Read the specified number of bytes into the temporary file
			while self.length > 0:
				chunk = self.rfile.read(min(block_size, self.length))
				if not chunk:
					# Peer closed the connection before sending everything.
					break
				temp_file.write(chunk)
				# Count what was actually read, not the nominal block size.
				self.length -= len(chunk)

			# Flush the file to disk and rewind so it can be parsed through
			# the same handle (no second open by name is needed).
			temp_file.flush()
			temp_file.seek(0)

			# Parse the email message
			try:
				parsed = email.message_from_binary_file(temp_file)
			except Exception:
				self.sendResponse("OK", True, 100,
					"Message could not be parsed as a valid email")
				return

			# Extract information from the message for logging purposes;
			# either header may be absent, in which case the value is None.
			recipient = parsed["X-Envelope-To"]
			subject = parsed["Subject"]

			# Extract the features
			extractor = EmailFeatureExtractor(self.vectors)
			vector_count = extractor.vectorCount()
			features = extractor.extract(parsed)

		# Build the single input line: corpus identifier (ham for now)
		# followed by " index:value" pairs for the known feature numbers.
		pairs = "".join(
			" " + str(feature_number) + ":" + str(features[feature_number])
			for feature_number in sorted(features.keys())
			if feature_number <= vector_count)

		# Write out the features to another temporary file and classify it;
		# leaving the with-block closes and deletes the file.
		with tempfile.NamedTemporaryFile(dir=self.tmp_dir) as features_file:
			features_file.write(b'1' + pairs.encode('iso-8859-1') + b'\n')
			features_file.flush()

			# Call the classifier on the features
			command = [self.mallet, 'classify-svmlight', '--input',
				features_file.name, '--classifier', self.filter_model,
				'--output', '-']
			try:
				output = subprocess.check_output(command,
					stderr=subprocess.STDOUT)
			except Exception:
				# Best effort: any classifier failure is a temporary failure.
				self.sendResponse("TEMPFAIL", False, 0, "")
				return

		# Interpret the probabilities.
		# NOTE(review): assumes fields 2 and 4 of the classifier output are
		# the ham and spam probabilities respectively - confirm against the
		# classifier's output format.
		output_words = output.strip().split()
		try:
			ham_probability = float(output_words[2]) * 100
			spam_probability = float(output_words[4]) * 100
		except (IndexError, ValueError):
			# Unexpected classifier output; report it like a failed run.
			self.sendResponse("TEMPFAIL", False, 0, "")
			return

		# Determine if the message is spam (ties count as ham)
		is_spam = ham_probability < spam_probability
		if is_spam:
			verdict = "The content filter has determined your email is spam"
		else:
			verdict = "The content filter has determined your email is not spam"

		# Send the results
		self.sendResponse("OK", is_spam, spam_probability, verdict)

		# Log the entry.  Every field goes through str() because the headers
		# may be None (missing) or a non-str header object.
		log_fields = (ham_probability, spam_probability, recipient, subject)
		with open("/var/log/spam_filter/filter.log", "a") as log_file:
			log_file.write("|".join(str(field) for field in log_fields) + "\n")
    if sys.argv[1] == "ham":
        corpus = "1"
    elif sys.argv[1] == "spam":
        corpus = "2"
    else:
        print("Usage: " + sys.argv[0] + " {spam|ham} output_filename")
        quit(1)

    output_filename = sys.argv[2]
else:
    print("Usage: " + sys.argv[0] + " {spam|ham} output_filename")
    quit(1);


# Initialize the feature extractor
extractor = EmailFeatureExtractor(vector_file)

# Iterate through each line in stdin
for line in sys.stdin:
    message_filename = line.strip()

    # See if the file exists
    if not os.path.exists(message_filename):
        print("Error: \"" + message_filename + "\" does not exist")
        continue

    # Open the file
    fp = open(message_filename, "rb")

    # Parse the file as an email message
    try: