Esempio n. 1
0
	def add_vectors(self, a=[], b=[], log_tf_a = True, log_tf_b = True):
		if not b:
			b = SuperList()
			b.do_padding(new_len=a.__len__(), padding_data=0)
		elif a.__len__() != b.__len__():
			if self.verbose: print "add_vectors:", a.__len__(), "!=", b.__len__()
			raise Exception
		sum_vector = SuperList()
		for i in range(0,a.__len__()):
			sum_vector.append(self.log_tf(a[i], do_nothing = not log_tf_a) + self.log_tf(b[i], do_nothing = not log_tf_b))
		return sum_vector
Esempio n. 2
0
 def compare_queries(self, testing=True):
     return_value = []
     queries_count = 0
     if self.verbose:
         print "\nCalculating for %d queries" % len(self.queries)
     # Before doing any comparisons we need to convert the matrix to log_tf
     # Moved the below line to calculate_training_data()
     #self.matrix_to_log_tf()
     for query in self.queries:
         if self.verbose:
             queries_count += 1
             if queries_count % (len(self.queries) / 5) == 0:
                 print "- %d querues has been processed" % queries_count
         top_k_classes = SuperList()
         for doc in self.matrix:
             q_distance = self.calculate_vectors_distance(
                 query['terms'], doc['terms'])
             item = {"class": doc['class'], "distance": q_distance}
             top_k_classes.populate_in_reverse_order(
                 item, self._greater_than)
         if self.distance_metric == "euclid":
             top_k_classes.reverse()
         return_value.append(
             (query["class"],
              self.get_top_class(nearest_docs=top_k_classes,
                                 query_class=query["class"])[0]))
     return return_value
Esempio n. 3
0
 def calculate_proto_classes(self):
     """Build one centroid (proto-class) vector per document class.

     First pass: sum the log_tf vectors of all documents belonging to each
     class.  Second pass: divide each sum by the class's document count to
     obtain the mean vector (Rocchio-style centroid), stored back into
     self.proto_classes[class]['log_tf'].
     """
     for doc in self.matrix:
         # dict.has_key() was removed in Python 3; `in` works everywhere.
         if doc['class'] in self.proto_classes:
             # Updating values of existing proto-class with new doc; we only
             # log_tf the newly added vector (the stored sum is already scaled).
             sum_vector = self.add_vectors(
                 a=self.proto_classes[doc['class']]['log_tf'],
                 b=doc['terms'],
                 log_tf_a=False,
                 log_tf_b=True)
             self.proto_classes[doc['class']] = {
                 'log_tf':
                 sum_vector,
                 'docs_count':
                 self.proto_classes[doc['class']]['docs_count'] + 1
             }
         else:
             # First time to deal with the class; notice add_vectors will
             # convert to log_tf by default.
             sum_vector = self.add_vectors(a=doc['terms'], log_tf_a=True)
             self.proto_classes[doc['class']] = {
                 'log_tf': sum_vector,
                 'docs_count': 1
             }
     for p_class in self.proto_classes.keys():
         # Calculate centroid (proto-class) mean values
         self.proto_classes[p_class]['log_tf'] = self.divide_vector(
             self.proto_classes[p_class]['log_tf'],
             self.proto_classes[p_class]['docs_count'])
Esempio n. 4
0
	def add_doc(self, doc_id = '', doc_class='', doc_terms=[], do_padding = False):
		"""Register a training document.

		Each term is appended to the global vocabulary (if unseen) and its
		slot in the document's frequency vector is bumped; the finished row
		is appended to the term-document matrix.  Padding of all rows to the
		vocabulary length is optional via do_padding.
		"""
		term_vector = SuperList()
		for term in doc_terms:
			# Grow the vocabulary if needed, then count this occurrence at
			# the term's vocabulary index.
			self.terms.unique_append(term)
			term_vector.insert_after_padding(self.terms.index(term))
		row = {'id': doc_id, 'class': doc_class, 'terms': term_vector}
		self.matrix.append(row)
		if do_padding:
			self.do_padding()
Esempio n. 5
0
	def add_query(self, query_id = '', query_class='n/a', query_terms=[]):
		"""Register a query: build its term-frequency vector over the
		training vocabulary and count terms never seen during training.

		The stored query dict carries 'new_terms_count' alongside the
		frequency vector so callers can see how much of the query was
		out-of-vocabulary.
		"""
		my_query_terms = SuperList()
		my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
		new_terms_count = 0
		for term in query_terms:
			try:
				my_query_terms.insert_after_padding(self.terms.index(term))
			except ValueError:
				# Term was not obtained in the training phase: list.index
				# raises ValueError.  (The original bare `except:` would also
				# have swallowed KeyboardInterrupt/SystemExit.)
				new_terms_count += 1
		self.queries.append({'id': query_id, 'class': query_class, 'terms': my_query_terms, 'new_terms_count': new_terms_count})
Esempio n. 6
0
	def add_query(self, query_id = '', query_class='n/a', query_terms=[]):
		"""Register a query: build its term-frequency vector over the
		training vocabulary, log_tf-scale it, and store it.

		Query terms not obtained in the training phase are ignored.
		"""
		my_query_terms = SuperList()
		my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
		for term in query_terms:
			try:
				my_query_terms.insert_after_padding(self.terms.index(term))
			except ValueError:
				# Term not obtained in training phase, ignore it.  list.index
				# raises ValueError; a bare `except:` would also swallow
				# KeyboardInterrupt/SystemExit.
				pass
		# add_vectors RETURNS a new log_tf-scaled vector rather than mutating
		# its argument; the original code discarded the return value, so
		# queries were stored un-scaled despite the comment's stated intent.
		my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a = True)
		self.queries.append({'id': query_id, 'class': query_class, 'terms': my_query_terms})
Esempio n. 7
0
	def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
		"""Set up one cross-validation fold of the classifier.

		verbose -- when True, progress messages are printed.
		fold    -- fold identifier.  NOTE(review): the verbose message below
		           calls int(fold), so the "n/a" default would raise
		           ValueError when verbose=True — confirm callers always
		           pass a numeric fold.
		config  -- configuration object exposing get_configuration().
		ev      -- evaluation module/object, stored as-is.
		"""
		self.verbose = verbose
		self.fold = fold
		self.config = config
		# Cache the parsed configuration and pull the distance metric out of it.
		self.config_data = config.get_configuration()
		self.distance_metric = self.config_data['distance_metric']
		# Set k=0 now, let kNN reset it later on
		self.k = 0
		# configure the evaluation module
		self.ev = ev
		# Vocabulary, term-document matrix and query list start empty; they
		# are filled by add_doc() / add_query().
		self.terms = SuperList()
		self.matrix = []
		self.queries = []
		if self.verbose: 
			print "\nInitialization for fold %d done!" % int(fold)
Esempio n. 8
0
	def add_doc(self, doc_id = '', doc_class='', doc_terms=[], do_padding = True):
		"""Add a training document to the per-class Bayes frequency matrix.

		In m_variate (multivariate/Bernoulli) mode, duplicate terms in the
		document are collapsed first so each term counts at most once per
		document.  The class's docs_count is incremented exactly once.
		"""
		# If multivariate, remove multiple occurrences of terms in the document.
		if self.mode == 'm_variate':
			doc_terms = list(set(doc_terms))
		# Initialise the class entry up front.  The original version created
		# it inside the term loop, so a document with an empty term list
		# raised KeyError on the docs_count update below.  Also, `in`
		# replaces dict.has_key(), which was removed in Python 3.
		if doc_class not in self.m_matrix:
			self.m_matrix[doc_class] = {'freq': SuperList(), 'total': 0, 'docs_count': 0}
		for term in doc_terms:
			self.terms.unique_append(term)
			self.m_matrix[doc_class]['freq'].insert_after_padding(index=self.terms.index(term))
		self.m_matrix[doc_class]['docs_count'] += 1
		if do_padding:
			self.do_padding()
Esempio n. 9
0
	def _non_zero_indices(self, l):
		"""Return a SuperList of the positions in `l` that hold non-zero values."""
		indices = SuperList()
		for position, value in enumerate(l):
			if value != 0:
				indices.append(position)
		return indices
Esempio n. 10
0
	def divide_vector(self, vector=[], scalar=1):
		"""Return a new SuperList with each component of `vector` divided by
		`scalar`; the explicit float() cast forces true division."""
		quotient = SuperList()
		for component in vector:
			quotient.append(float(component) / scalar)
		return quotient
Esempio n. 11
0
	def vector_log_tf(self, a=[], do_nothing=False):
		"""Return a new SuperList built by applying self.log_tf to every
		element of `a`; the do_nothing flag is forwarded per element."""
		scaled = SuperList()
		for element in a:
			scaled.append(self.log_tf(value=element, do_nothing=do_nothing))
		return scaled