Beispiel #1
0
	def add_doc(self, doc_id = '', doc_class='', doc_terms=[], do_padding = False):
		"""Register one training document in the vector space.

		Each term is appended to the vocabulary (if unseen) and its count is
		bumped in this document's row; the row is then stored in self.matrix.
		When do_padding is True, all existing rows are re-padded to the new
		vocabulary length right away.
		"""
		row = SuperList()
		for term in doc_terms:
			# Grow the vocabulary only for unseen terms, then bump this
			# term's slot in the document's row.
			self.terms.unique_append(term)
			row.insert_after_padding(self.terms.index(term))
		self.matrix.append({'id': doc_id, 'class': doc_class, 'terms': row})
		if do_padding:
			self.do_padding()
Beispiel #2
0
	def add_query(self, query_id = '', query_class='n/a', query_terms=[]):
		"""Register one testing query in self.queries.

		The query vector is zero-padded to the current vocabulary length,
		then each known term bumps its slot. Terms never seen during
		training have no column in the vector space; they are only tallied
		in 'new_terms_count'.
		"""
		my_query_terms = SuperList()
		my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
		new_terms_count = 0
		for term in query_terms:
			try:
				my_query_terms.insert_after_padding(self.terms.index(term))
			except Exception:
				# Term not obtained in the training phase: it cannot be
				# represented in the vector, so just count it.
				# (Narrowed from a bare except, which would also swallow
				# KeyboardInterrupt/SystemExit.)
				new_terms_count += 1
		self.queries.append({'id': query_id, 'class': query_class, 'terms': my_query_terms, 'new_terms_count': new_terms_count})
Beispiel #3
0
	def add_query(self, query_id = '', query_class='n/a', query_terms=[]):
		"""Register one testing query, storing its log_tf-weighted vector.

		Terms never seen during training are silently skipped (they have no
		column in the vector space).
		"""
		my_query_terms = SuperList()
		my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
		for term in query_terms:
			try:
				my_query_terms.insert_after_padding(self.terms.index(term))
			except Exception:
				# Term not obtained in the training phase, ignore it.
				# (Narrowed from a bare except, which would also swallow
				# KeyboardInterrupt/SystemExit.)
				pass
		# BUG FIX: add_vectors() builds and returns a NEW vector; it does not
		# mutate its argument. The return value was previously discarded, so
		# the intended conversion of my_query_terms to log_tf values never
		# happened. Capture it instead.
		my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a = True)
		self.queries.append({'id': query_id, 'class': query_class, 'terms': my_query_terms})
Beispiel #4
0
	def add_vectors(self, a=[], b=[], log_tf_a = True, log_tf_b = True):
		if not b:
			b = SuperList()
			b.do_padding(new_len=a.__len__(), padding_data=0)
		elif a.__len__() != b.__len__():
			if self.verbose: print "add_vectors:", a.__len__(), "!=", b.__len__()
			raise Exception
		sum_vector = SuperList()
		for i in range(0,a.__len__()):
			sum_vector.append(self.log_tf(a[i], do_nothing = not log_tf_a) + self.log_tf(b[i], do_nothing = not log_tf_b))
		return sum_vector
Beispiel #5
0
	def compare_queries(self, testing=True):
		return_value = []
		queries_count = 0
		if self.verbose: 
			print "\nCalculating for %d queries" % len(self.queries)
		# Before doing any comparisons we need to convert the matrix to log_tf
		# Moved the below line to calculate_training_data()
		#self.matrix_to_log_tf()
		for query in self.queries:
			if self.verbose: 
				queries_count += 1
				if queries_count % (len(self.queries)/5) == 0:
					print "- %d querues has been processed" % queries_count 
			top_k_classes = SuperList()
			for doc in self.matrix:
				q_distance = self.calculate_vectors_distance(query['terms'], doc['terms'])
				item = {"class": doc['class'], "distance": q_distance}
				top_k_classes.populate_in_reverse_order(item, self._greater_than)
			if self.distance_metric == "euclid":
				top_k_classes.reverse()
			return_value.append((query["class"], self.get_top_class(nearest_docs=top_k_classes, query_class=query["class"])[0]))
		return return_value
Beispiel #6
0
	def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
		self.verbose = verbose
		self.fold = fold
		self.config = config
		self.config_data = config.get_configuration()
		self.distance_metric = self.config_data['distance_metric']
		# Set k=0 now, let kNN reset it later on
		self.k = 0
		# confugure the evaluation module
		self.ev = ev
		self.terms = SuperList()
		self.matrix = []
		self.queries = []
		if self.verbose: 
			print "\nInitialization for fold %s done!" % fold
Beispiel #7
0
class Index:	
	''' 
	Index is our main class, will inherit others for each IR algorithms from it.
	Its two main data-structures are:
	* terms: This is a simple list of all terms in all training documents
	* matrix: This is our vector space, where terms, documents & classes are mapped to each other
		matrix = [{'id': 'document1',
					'class': 'spam',
					'terms': [1,0,1,0,0,1]
					}]
	* queries: should look exactly like matrix
		queries = [{'id': 'query1',
				'class': 'spam', # In testing: This is the known class, else "n/a".
				'terms': [1,0,1,1,0,1]
					}]
	'''

	# The initialization functions, we set verbose=True for debugging 
	def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
		self.verbose = verbose
		self.fold = fold
		self.config = config
		self.config_data = config.get_configuration()
		self.distance_metric = self.config_data['distance_metric']
		# Set k=0 now, let kNN reset it later on
		self.k = 0
		# confugure the evaluation module
		self.ev = ev
		self.terms = SuperList()
		self.matrix = []
		self.queries = []
		if self.verbose: 
			print "\nInitialization for fold %s done!" % fold

	# Index[key] returns a list of occurences of term (key) in all documents
	def __getitem__(self, key):
		try:
			index = self.terms.index(key)
			return [doc['terms'][index] for doc in self.matrix]
		except:
			if self.verbose: print sys.exc_info()
			raise KeyError

	# Gives some stats about the our training-set
	def diagnose(self):
		print "Diagnose:", self.__class__
		print "- Number of Documents:", len(self.matrix)
		print "- Number of Terms:", len(self.terms)
		#for doc in self.matrix:
		#	print doc['id'], sum(doc['terms'])
		#print "-- Terms:", self.terms

	# To align the length of all rows in matrix after new docs/terms are added to it
	def do_padding(self):
		for doc in self.matrix:
			doc['terms'].do_padding(new_len=len(self.terms), padding_data=0)
		for query in self.queries:
			query['terms'].do_padding(new_len=len(self.terms), padding_data=0)

	# We better keep matrix without log_tf at first, in case we need to do Feature Selection
	# In case of Rocchio we do the log_tf on the fly when calculating the proto_classes
	# Whereas in kNN we might need to call this function
	def matrix_to_log_tf(self):
		for doc in self.matrix:
			doc['terms'] = self.vector_log_tf(doc['terms'])

	# To be used for debugging reasons, displays index and matrix
	def display_idx(self):
		print self.terms
		for doc in self.matrix:
			print doc['id'], doc['class'], doc['terms']

	# Coverts a scalar value to its log_tf (1 + log_10(value) OR zero)
	def log_tf(self, value, do_nothing=False):
		val = float(value)
		if not do_nothing:
			val = 1 + math.log10(val) if val != 0 else float(0)
		return val
	
	# Coverts a vector value to its log_tf (1 + log_10(value) OR zero)
	def vector_log_tf(self, a=[], do_nothing=False):
		new_vector = SuperList()
		for i in range(0,a.__len__()):
			new_vector.append(self.log_tf(value=a[i], do_nothing=do_nothing))
		return new_vector

	# Divides each item in a vector (list) by a scalar number
	def divide_vector(self, vector=[], scalar=1):
		result = SuperList()
		for item in vector:
			result.append(float(item)/scalar)
		return result

	# Add to vectors (lists) to each other and return the resulting vector
	# For each one of them, we can either convert its items into log_tf before addition or not
	def add_vectors(self, a=[], b=[], log_tf_a = True, log_tf_b = True):
		if not b:
			b = SuperList()
			b.do_padding(new_len=a.__len__(), padding_data=0)
		elif a.__len__() != b.__len__():
			if self.verbose: print "add_vectors:", a.__len__(), "!=", b.__len__()
			raise Exception
		sum_vector = SuperList()
		for i in range(0,a.__len__()):
			sum_vector.append(self.log_tf(a[i], do_nothing = not log_tf_a) + self.log_tf(b[i], do_nothing = not log_tf_b))
		return sum_vector

	# Calculates the cosine of the angles between two vectors (lists)
	def cos_vectors(self, a=[], b=[]):
		if a.__len__() != b.__len__():
			if self.verbose: print "cos_vectors:", a.__len__(), "!=", b.__len__()
			raise Exception
		norm_a_sqrd = norm_b_sqrd = 0
		numerator = 0
		for i in range(0,a.__len__()):
			numerator = numerator + a[i]*b[i]
			# Do not use math.pow(), time consuming!
			norm_a_sqrd = norm_a_sqrd + (a[i]*a[i]) 
			norm_b_sqrd = norm_b_sqrd + (b[i]*b[i])
		# In some cases, when one vector is all zeros, division by zero happens
		# Normally this happens when training on small training-set
		# And all vocabulary in query is first time to be seen.
		try:
		 	return_value = numerator / (math.sqrt(norm_a_sqrd) * math.sqrt(norm_b_sqrd))
		except:
			return_value = 0
		return return_value

	# Calculate Euclidean distance between two vectors (lists)
	def euclid_vectors(self, a=[], b=[]):
		if a.__len__() != b.__len__():
			if self.verbose: print "euclid_vectors:", a.__len__(), "!=", b.__len__()
			raise Exception
		euclid_sqrd = 0
		for i in range(0,a.__len__()):
			euclid_sqrd += math.pow((a[i] - b[i]), 2)
		return math.sqrt(euclid_sqrd)

	# Calculate distance between two vectors (lists)
	def calculate_vectors_distance(self, a=[], b=[]):
		if self.distance_metric == "cos":
			return self.cos_vectors(a, b)
		elif self.distance_metric == "euclid":
			return self.euclid_vectors(a, b)

	# We call this each time we are training on a new document
	# It is given the document's doc_class and a list of the parsed doc_terms from it
	# Since each time we get a new documet, we also might get new terms in our terms and matrix list
	# So, if do_padding=True: We extend and pad all old rows in matrix to match the new length of terms now
	# Otherwise, we might be postponing this padding process after we finish adding all docs for processing reasons
	def add_doc(self, doc_id = '', doc_class='', doc_terms=[], do_padding = False):
		my_doc_terms = SuperList()
		for term in doc_terms:
			self.terms.unique_append(term)
			my_doc_terms.insert_after_padding(self.terms.index(term))
		self.matrix.append({'id': doc_id, 'class': doc_class, 'terms': my_doc_terms})
		if do_padding:
			self.do_padding()

	# We call this each time we are training on a new query
	# It is given the document's query_class and a list of the parsed query_terms from it
	# No padding here, since terms in query not learnt during training will be ignored
	def add_query(self, query_id = '', query_class='n/a', query_terms=[]):
		my_query_terms = SuperList()
		my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
		for term in query_terms:
			try:
				my_query_terms.insert_after_padding(self.terms.index(term))
			except:
				# Term not obtaied in traing phase, ignore it
				pass
		# Calling add_vectors to convert my_query_terms to log_tf values
		self.add_vectors(a=my_query_terms, log_tf_a = True)
		self.queries.append({'id': query_id, 'class': query_class, 'terms': my_query_terms})

	# This is where each classifier may do any calculations after loading traing data
	# We will leave it for each child class to overwrite it on its own way, or ignore it
	# We may add the Feature Selection here, for example: Maximum Information Gain
	# Hence, make sure all child classes call their parent's method before overwriting
	def calculate_training_data(self):
		pass
Beispiel #8
0
	def _non_zero_indices(self, l):
		"""Return a SuperList of the positions in l whose value is non-zero."""
		indices = SuperList()
		for pos, val in enumerate(l):
			if val != 0:
				indices.append(pos)
		return indices
Beispiel #9
0
	def divide_vector(self, vector=[], scalar=1):
		"""Divide every item of vector by scalar; returns a new SuperList of floats."""
		quotients = SuperList()
		for value in vector:
			# float() guards against integer division on whole-number items.
			quotients.append(float(value) / scalar)
		return quotients
Beispiel #10
0
	def vector_log_tf(self, a=[], do_nothing=False):
		"""Apply self.log_tf() item-wise to a; returns a new SuperList.

		With do_nothing=True the items are passed through log_tf() unchanged
		(aside from whatever log_tf does in that mode).
		"""
		converted = SuperList()
		for item in a:
			converted.append(self.log_tf(value=item, do_nothing=do_nothing))
		return converted