# Example 1: Naive Bayes spam/ham email trainer
class Trainer(object):

	# Initializes the object
	# @param self Trainer object
	# @param directory The directory that contains the training folders
	# @param spam The sub directory for the spam class
	# @param ham The sub directory for the ham class.
	# @param limit The number of emails to be scanned
	def __init__(self, directory = 'data/corpus2', spam = 'spam', ham = 'ham', limit = 1000):
		self.spam_path = os.path.join(os.getcwd(), directory, spam)
		self.ham_path = os.path.join(os.getcwd(), directory, ham)
		self.limit = limit
		self.classifier = NaiveBayes()

	# A wrapper for the  train_classifier function.
	# @param self The trainer object
	# @param verbose Depending on verbosity information will be printed
	# @return The classifier object
	def train(self, verbose = False):
		self.train_classifier(self.spam_path,'spam', verbose)
		self.train_classifier(self.ham_path,'ham', verbose)

		return self.classifier

	# Converts a document into tokens and extracts features as mentioned in README.md
	# @param self The Trainer object
	# @param text The text to be scanned
	def extract_features(self, text):
		features  = []
		tokens  = text.split()
		porter = stem.porter.PorterStemmer()
		tokens = [token for token in tokens if token not in stopwords.words('english')]
		link = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
		for token in tokens:
			if len(token.translate(None,string.punctuation)) < 3:
				continue
			if "." + token in mimetypes.types_map.keys():
				features.append('ATTACHMENT')
			elif token.isdigit():
				features.append('NUMBER')
			elif token.upper() == token:
				features.append('ALL_CAPS')
				features.append(porter.stem(token.translate(None, string.punctuation)).lower())
			elif link.match(token):
				features.append('LINK')
			else:
				features.append(porter.stem(token.translate(None, string.punctuation)).lower())

		return features


	# The function that does the actual classfication
	# @param path The path of the data to be trained
	# @param label The label underwhich the data is classified
	# @param verbose the verbsoity of statistics printed
	def train_classifier(self, path, label, verbose):

		limit = len(os.listdir(path)) < self.limit and len(os.listdir(path)) or self.limit
		if verbose:
			print colored("Training %d emails in %s class" %(limit, label),'green')

		os.chdir(path)
		for email in os.listdir(path)[:self.limit]:
			if verbose and verbose > 1:
				print colored("Working on file %s" %(email),'green')
			email_file = open(email, 'r')
			email_text = email_file.read()
			try:
				email_text = bs4.UnicodeDammit.detwingle(email_text).decode('utf-8')
			except:
				print colored("Skipping file %s because of bad coding"%(email),'red')
				continue
			email_file.close()
			email_text = email_text.encode('ascii', 'ignore')
			features = self.extract_features(email_text)
			self.classifier.train(features, label)
		print colored(self.classifier,'green')