# Variant of add_query() that, instead of silently ignoring query terms not
# seen during training, counts them in new_terms_count
def add_query(self, query_id='', query_class='n/a', query_terms=[]):
    my_query_terms = SuperList()
    my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
    new_terms_count = 0
    for term in query_terms:
        try:
            my_query_terms.insert_after_padding(self.terms.index(term))
        except ValueError:
            # Term not seen in the training phase
            new_terms_count += 1
    self.queries.append({'id': query_id,
                         'class': query_class,
                         'terms': my_query_terms,
                         'new_terms_count': new_terms_count})
# Runs each query against all training documents and returns a list of
# (known_class, predicted_class) tuples.
# Note: get_top_class() and _greater_than() are expected to be provided by
# the classifier child class (e.g. kNN).
def compare_queries(self, testing=True):
    return_value = []
    queries_count = 0
    if self.verbose:
        print "\nCalculating for %d queries" % len(self.queries)
    # The matrix must be converted to log_tf before doing any comparisons;
    # this call was moved to calculate_training_data():
    #self.matrix_to_log_tf()
    for query in self.queries:
        if self.verbose:
            queries_count += 1
            progress_step = max(1, len(self.queries) / 5)
            if queries_count % progress_step == 0:
                print "- %d queries have been processed" % queries_count
        top_k_classes = SuperList()
        for doc in self.matrix:
            q_distance = self.calculate_vectors_distance(query['terms'],
                                                         doc['terms'])
            item = {"class": doc['class'], "distance": q_distance}
            top_k_classes.populate_in_reverse_order(item, self._greater_than)
        # Cosine is a similarity (bigger means closer), while Euclidean is a
        # distance (smaller means closer), so flip the order for Euclidean
        if self.distance_metric == "euclid":
            top_k_classes.reverse()
        return_value.append((query["class"],
                             self.get_top_class(nearest_docs=top_k_classes,
                                                query_class=query["class"])[0]))
    return return_value
import math
import sys


class Index:
    '''
    Index is our main class; the classes for the individual IR algorithms
    will inherit from it. Its main data structures are:
    * terms: A simple list of all terms seen in all training documents
    * matrix: Our vector space, where terms, documents & classes are mapped
      to each other
      matrix = [{'id': 'document1',
                 'class': 'spam',
                 'terms': [1,0,1,0,0,1]
                }]
    * queries: Looks exactly like matrix
      queries = [{'id': 'query1',
                  'class': 'spam',  # In testing this is the known class, else "n/a"
                  'terms': [1,0,1,1,0,1]
                 }]
    '''

    # The initialization function; set verbose=True for debugging
    def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
        self.verbose = verbose
        self.fold = fold
        self.config = config
        self.config_data = config.get_configuration()
        self.distance_metric = self.config_data['distance_metric']
        # Set k=0 now, let kNN reset it later on
        self.k = 0
        # Configure the evaluation module
        self.ev = ev
        self.terms = SuperList()
        self.matrix = []
        self.queries = []
        if self.verbose:
            print "\nInitialization for fold %s done!" % fold

    # Index[key] returns a list of occurrences of term (key) in all documents
    def __getitem__(self, key):
        try:
            index = self.terms.index(key)
            return [doc['terms'][index] for doc in self.matrix]
        except ValueError:
            if self.verbose:
                print sys.exc_info()
            raise KeyError

    # Prints some stats about our training set
    def diagnose(self):
        print "Diagnose:", self.__class__
        print "- Number of Documents:", len(self.matrix)
        print "- Number of Terms:", len(self.terms)
        #for doc in self.matrix:
        #    print doc['id'], sum(doc['terms'])
        #print "-- Terms:", self.terms

    # Aligns the length of all rows in matrix after new docs/terms are added
    def do_padding(self):
        for doc in self.matrix:
            doc['terms'].do_padding(new_len=len(self.terms), padding_data=0)
        for query in self.queries:
            query['terms'].do_padding(new_len=len(self.terms), padding_data=0)

    # We keep matrix without log_tf at first, in case we need to do Feature
    # Selection. In Rocchio we apply log_tf on the fly when calculating the
    # proto-classes, whereas in kNN we may need to call this function.
    def matrix_to_log_tf(self):
        for doc in self.matrix:
            doc['terms'] = self.vector_log_tf(doc['terms'])

    # For debugging: displays the index and matrix
    def display_idx(self):
        print self.terms
        for doc in self.matrix:
            print doc['id'], doc['class'], doc['terms']

    # Converts a scalar value to its log_tf (1 + log10(value), or zero)
    def log_tf(self, value, do_nothing=False):
        val = float(value)
        if not do_nothing:
            val = 1 + math.log10(val) if val != 0 else float(0)
        return val

    # Converts each item of a vector to its log_tf (1 + log10(value), or zero)
    def vector_log_tf(self, a=[], do_nothing=False):
        new_vector = SuperList()
        for i in range(len(a)):
            new_vector.append(self.log_tf(value=a[i], do_nothing=do_nothing))
        return new_vector

    # Divides each item in a vector (list) by a scalar number
    def divide_vector(self, vector=[], scalar=1):
        result = SuperList()
        for item in vector:
            result.append(float(item) / scalar)
        return result

    # Adds two vectors (lists) and returns the resulting vector.
    # Each of them can optionally have its items converted to log_tf
    # before the addition.
    def add_vectors(self, a=[], b=[], log_tf_a=True, log_tf_b=True):
        if not b:
            b = SuperList()
            b.do_padding(new_len=len(a), padding_data=0)
        elif len(a) != len(b):
            if self.verbose:
                print "add_vectors:", len(a), "!=", len(b)
            raise Exception
        sum_vector = SuperList()
        for i in range(len(a)):
            sum_vector.append(self.log_tf(a[i], do_nothing=not log_tf_a) +
                              self.log_tf(b[i], do_nothing=not log_tf_b))
        return sum_vector

    # Calculates the cosine of the angle between two vectors (lists)
    def cos_vectors(self, a=[], b=[]):
        if len(a) != len(b):
            if self.verbose:
                print "cos_vectors:", len(a), "!=", len(b)
            raise Exception
        norm_a_sqrd = norm_b_sqrd = 0
        numerator = 0
        for i in range(len(a)):
            numerator = numerator + a[i] * b[i]
            # Do not use math.pow(), it is time consuming!
            norm_a_sqrd = norm_a_sqrd + (a[i] * a[i])
            norm_b_sqrd = norm_b_sqrd + (b[i] * b[i])
        # When one vector is all zeros, a division by zero happens.
        # This typically occurs when training on a small training set,
        # so every term in the query is being seen for the first time.
        try:
            return_value = numerator / (math.sqrt(norm_a_sqrd) * math.sqrt(norm_b_sqrd))
        except ZeroDivisionError:
            return_value = 0
        return return_value

    # Calculates the Euclidean distance between two vectors (lists)
    def euclid_vectors(self, a=[], b=[]):
        if len(a) != len(b):
            if self.verbose:
                print "euclid_vectors:", len(a), "!=", len(b)
            raise Exception
        euclid_sqrd = 0
        for i in range(len(a)):
            euclid_sqrd += (a[i] - b[i]) * (a[i] - b[i])
        return math.sqrt(euclid_sqrd)

    # Calculates the distance between two vectors (lists) using the
    # configured metric
    def calculate_vectors_distance(self, a=[], b=[]):
        if self.distance_metric == "cos":
            return self.cos_vectors(a, b)
        elif self.distance_metric == "euclid":
            return self.euclid_vectors(a, b)

    # Called each time we train on a new document, given the document's
    # doc_class and a list of the doc_terms parsed from it.
    # Each new document may introduce new terms into our terms list and
    # matrix, so if do_padding=True we extend and pad all old rows in matrix
    # to match the new length of terms; otherwise we may postpone the padding
    # until all documents have been added, for performance reasons.
    def add_doc(self, doc_id='', doc_class='', doc_terms=[], do_padding=False):
        my_doc_terms = SuperList()
        for term in doc_terms:
            self.terms.unique_append(term)
            my_doc_terms.insert_after_padding(self.terms.index(term))
        self.matrix.append({'id': doc_id,
                            'class': doc_class,
                            'terms': my_doc_terms})
        if do_padding:
            self.do_padding()

    # Called each time we test on a new query, given the query's query_class
    # and a list of the query_terms parsed from it.
    # No padding of matrix here, since query terms not learnt during
    # training are ignored.
    def add_query(self, query_id='', query_class='n/a', query_terms=[]):
        my_query_terms = SuperList()
        my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
        for term in query_terms:
            try:
                my_query_terms.insert_after_padding(self.terms.index(term))
            except ValueError:
                # Term not seen in the training phase, ignore it
                pass
        # Call add_vectors() to convert my_query_terms to log_tf values
        my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a=True)
        self.queries.append({'id': query_id,
                             'class': query_class,
                             'terms': my_query_terms})

    # This is where each classifier may do any calculations after loading
    # the training data. We leave it to each child class to override in its
    # own way, or to ignore it. Feature Selection (e.g. Maximum Information
    # Gain) could be added here, so make sure all child classes call their
    # parent's method before overriding.
    def calculate_training_data(self):
        pass
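# --------------------------------------------------------------------------
# The Index class above depends on a SuperList container that is not part of
# this section. The sketch below is a minimal, assumed implementation of its
# interface, inferred from how Index uses it; the real SuperList elsewhere in
# the project may differ.
class SuperList(list):

    # Appends an item only if it is not already in the list
    def unique_append(self, item):
        if item not in self:
            self.append(item)

    # Extends the list with padding_data until it reaches new_len
    def do_padding(self, new_len=0, padding_data=0):
        while len(self) < new_len:
            self.append(padding_data)

    # Pads with zeros up to the given index, then increments the counter
    # there; repeated calls build a term-frequency vector incrementally
    def insert_after_padding(self, index):
        self.do_padding(new_len=index + 1, padding_data=0)
        self[index] += 1

    # Inserts an item so the list stays ordered according to compare(),
    # e.g. by descending similarity; used to rank neighbours for kNN
    def populate_in_reverse_order(self, item, compare):
        for i in range(len(self)):
            if compare(item, self[i]):
                self.insert(i, item)
                return
        self.append(item)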
# Returns the indices of all non-zero items in a vector (list)
def _non_zero_indices(self, l):
    ret = SuperList()
    for i in range(len(l)):
        if l[i] != 0:
            ret.append(i)
    return ret
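# --------------------------------------------------------------------------
# A runnable usage sketch of the Index workflow above. StubConfig is a
# stand-in for the project's real configuration object, and ev=None replaces
# the evaluation module; both are assumptions made for illustration only.
class StubConfig(object):
    def get_configuration(self):
        return {'distance_metric': 'cos'}

idx = Index(verbose=False, fold="1", config=StubConfig(), ev=None)
# Training: pad only once, after the last document, to avoid repeated work
idx.add_doc(doc_id='d1', doc_class='spam', doc_terms=['buy', 'now', 'buy'])
idx.add_doc(doc_id='d2', doc_class='ham', doc_terms=['meet', 'at', 'noon'],
            do_padding=True)
idx.diagnose()                 # 2 documents, 5 terms
print idx['buy']               # occurrences of 'buy' per document: [2, 0]
# Testing: terms unseen during training, such as 'cheap', are ignored
idx.add_query(query_id='q1', query_class='spam', query_terms=['buy', 'cheap'])
print idx.queries[0]['terms']  # log_tf query vector: [1.0, 0.0, 0.0, 0.0, 0.0]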