Example #1
 def add_doc(self, doc_id='', doc_class='', doc_terms=[], do_padding=False):
     my_doc_terms = SuperList()
     for term in doc_terms:
         # Grow the shared vocabulary only if the term is new, then
         # count this occurrence at the term's index in the doc vector
         self.terms.unique_append(term)
         my_doc_terms.insert_after_padding(self.terms.index(term))
     self.matrix.append({
         'id': doc_id,
         'class': doc_class,
         'terms': my_doc_terms
     })
     if do_padding:
         self.do_padding()
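SuperList itself does not appear in these snippets. Below is a minimal sketch of the behaviour add_doc seems to assume from it; the method semantics are inferred purely from the call sites in these examples, so the project's real SuperList may differ.

# Hypothetical stand-in for SuperList, inferred from how the examples
# call it; not the project's actual implementation.
class SuperList(list):
    def unique_append(self, item):
        # Append only if the item is not already present
        if item not in self:
            self.append(item)

    def insert_after_padding(self, index):
        # Zero-pad the list until `index` exists, then count one
        # occurrence at that position
        while len(self) <= index:
            self.append(0)
        self[index] += 1

    def do_padding(self, new_len=0, padding_data=0):
        # Right-pad with `padding_data` up to length `new_len`
        while len(self) < new_len:
            self.append(padding_data)

With that sketch, add_doc(doc_id='d1', doc_class='spam', doc_terms=['free', 'win', 'free']) would leave self.terms as ['free', 'win'] and the document's vector as [2, 1].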
Example #2
 def add_vectors(self, a=[], b=[], log_tf_a=True, log_tf_b=True):
     if not b:
         # If b is omitted, treat it as a zero vector of a's length
         b = SuperList()
         b.do_padding(new_len=len(a), padding_data=0)
     elif len(a) != len(b):
         if self.verbose:
             print "add_vectors:", len(a), "!=", len(b)
         raise Exception("add_vectors: vectors have different lengths")
     sum_vector = SuperList()
     for i in range(len(a)):
         sum_vector.append(
             self.log_tf(a[i], do_nothing=not log_tf_a) +
             self.log_tf(b[i], do_nothing=not log_tf_b))
     return sum_vector
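As a quick worked example of what add_vectors computes (the log_tf rule, 1 + log10(value) with zero mapped to zero, is defined further down): suppose a is already log_tf weighted and b holds raw counts, which is exactly how calculate_proto_classes calls it. The numbers below are made up for illustration.

a = [1.0, 2.0, 0.0]   # already log_tf weighted, so pass log_tf_a=False
b = [10, 1, 0]        # raw counts, converted on the fly with log_tf_b=True
# log_tf(10) = 1 + log10(10) = 2.0, log_tf(1) = 1.0, log_tf(0) = 0.0
# so add_vectors(a=a, b=b, log_tf_a=False, log_tf_b=True)
# returns [1.0 + 2.0, 2.0 + 1.0, 0.0 + 0.0] == [3.0, 3.0, 0.0]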
Example #3
 def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
     self.verbose = verbose
     self.fold = fold
     self.config = config
     self.config_data = config.get_configuration()
     self.distance_metric = self.config_data['distance_metric']
     # Set k=0 now, let kNN reset it later on
     self.k = 0
     # configure the evaluation module
     self.ev = ev
     self.terms = SuperList()
     self.matrix = []
     self.queries = []
     if self.verbose:
         print "\nInitialization for fold %s done!" % fold
Example #4
 def calculate_proto_classes(self):
     sum_vector = SuperList()
     for doc in self.matrix:
         if doc['class'] in self.proto_classes:
             # Update the existing proto-class with the new doc;
             # only the newly added vector gets the log_tf conversion
             sum_vector = self.add_vectors(
                 a=self.proto_classes[doc['class']]['log_tf'],
                 b=doc['terms'],
                 log_tf_a=False,
                 log_tf_b=True)
             self.proto_classes[doc['class']] = {
                 'log_tf': sum_vector,
                 'docs_count': self.proto_classes[doc['class']]['docs_count'] + 1
             }
         else:
             # First time we see this class; note that add_vectors
             # converts to log_tf by default
             sum_vector = self.add_vectors(a=doc['terms'], log_tf_a=True)
             self.proto_classes[doc['class']] = {
                 'log_tf': sum_vector,
                 'docs_count': 1
             }
     for p_class in self.proto_classes:
         # Calculate the centroid (proto-class) as the mean of its doc vectors
         self.proto_classes[p_class]['log_tf'] = self.divide_vector(
             self.proto_classes[p_class]['log_tf'],
             self.proto_classes[p_class]['docs_count'])
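A worked sketch of the centroid arithmetic, with made-up numbers for a single 'spam' proto-class built from two documents:

# Hypothetical raw count vectors for two 'spam' docs; add_vectors
# log_tf-weights each one as it arrives.
doc1 = [10, 1, 0]   # log_tf -> [2.0, 1.0, 0.0]
doc2 = [1, 0, 1]    # log_tf -> [1.0, 0.0, 1.0]
# After both docs: 'log_tf' sum == [3.0, 1.0, 1.0], 'docs_count' == 2
# The final divide_vector pass then yields the centroid:
# [3.0 / 2, 1.0 / 2, 1.0 / 2] == [1.5, 0.5, 0.5]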
Example #5
 def compare_queries(self, testing=True):
     return_value = []
     queries_count = 0
     if self.verbose:
         print "\nCalculating for %d queries" % len(self.queries)
     # Before doing any comparisons we need to convert the matrix to log_tf
     # Moved the below line to calculate_training_data()
     #self.matrix_to_log_tf()
     for query in self.queries:
         if self.verbose:
             queries_count += 1
             # Report progress roughly every fifth of the query set
             if queries_count % max(1, len(self.queries) / 5) == 0:
                 print "- %d queries have been processed" % queries_count
         top_k_classes = SuperList()
         for doc in self.matrix:
             q_distance = self.calculate_vectors_distance(
                 query['terms'], doc['terms'])
             item = {"class": doc['class'], "distance": q_distance}
             top_k_classes.populate_in_reverse_order(
                 item, self._greater_than)
         # Cosine is a similarity (higher = closer), whereas Euclidean is a
         # distance (lower = closer), so the ordering is reversed for euclid
         if self.distance_metric == "euclid":
             top_k_classes.reverse()
         return_value.append(
             (query["class"],
              self.get_top_class(nearest_docs=top_k_classes,
                                 query_class=query["class"])[0]))
     return return_value
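A small numeric illustration (values made up) of why only the euclid branch reverses the list: populate_in_reverse_order keeps items sorted largest-first via _greater_than, which is already nearest-first for cosine similarity but farthest-first for Euclidean distance.

cos_scores = [0.95, 0.40, 0.10]   # largest-first == nearest-first, no reversal
euclid_dists = [7.2, 3.1, 0.4]    # largest-first puts the FARTHEST doc first
euclid_dists.reverse()            # -> [0.4, 3.1, 7.2], nearest-first again
print "nearest euclid distance: %s" % euclid_dists[0]   # 0.4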
Example #6
 def add_doc(self, doc_id='', doc_class='', doc_terms=[], do_padding=True):
     # If multivariate, remove duplicate occurrences of terms in the document
     #print "Bayes >> add_doc", doc_terms
     if self.mode == 'm_variate':
         doc_terms = list(set(doc_terms))
     #print doc_terms
     # In case this is the first time we see this class; initializing here
     # also keeps docs_count safe when doc_terms happens to be empty
     if doc_class not in self.m_matrix:
         self.m_matrix[doc_class] = {
             'freq': SuperList(),
             'total': 0,
             'docs_count': 0
         }
     for term in doc_terms:
         self.terms.unique_append(term)
         self.m_matrix[doc_class]['freq'].insert_after_padding(
             index=self.terms.index(term))
     self.m_matrix[doc_class]['docs_count'] += 1
     if do_padding:
         self.do_padding()
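The m_variate branch corresponds to the multivariate Bernoulli flavour of Naive Bayes, where only the presence or absence of a term in a document matters, so repeated terms are collapsed before counting. A quick sketch of the effect:

doc_terms = ['free', 'win', 'free', 'free']
# Multinomial mode would count 'free' three times; m_variate mode
# collapses the document to term presence first:
doc_terms = list(set(doc_terms))               # set order is not guaranteed
print "deduplicated: %s" % sorted(doc_terms)   # ['free', 'win']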
Example #7
 def add_query(self, query_id='', query_class='n/a', query_terms=[]):
     my_query_terms = SuperList()
     my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
     new_terms_count = 0
     for term in query_terms:
         try:
             my_query_terms.insert_after_padding(self.terms.index(term))
         except ValueError:
             # Term not seen in the training phase
             new_terms_count += 1
     self.queries.append({
         'id': query_id,
         'class': query_class,
         'terms': my_query_terms,
         'new_terms_count': new_terms_count
     })
Example #8
 def add_query(self, query_id='', query_class='n/a', query_terms=[]):
     my_query_terms = SuperList()
     my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
     for term in query_terms:
         try:
             my_query_terms.insert_after_padding(self.terms.index(term))
         except ValueError:
             # Term not seen in the training phase, ignore it
             pass
     # Convert my_query_terms to log_tf values; add_vectors returns a new
     # vector rather than mutating its argument, so keep the result
     my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a=True)
     self.queries.append({
         'id': query_id,
         'class': query_class,
         'terms': my_query_terms
     })
Example #9
 def _non_zero_indices(self, l):
     # Returns the indices of all non-zero entries in the list
     ret = SuperList()
     for i, value in enumerate(l):
         if value != 0:
             ret.append(i)
     return ret
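For instance, on a sparse row this helper returns only the dimensions that carry any weight:

# Equivalent one-off check of what _non_zero_indices produces
row = [0, 3, 0, 1]
print "non-zero indices: %s" % [i for i, v in enumerate(row) if v != 0]   # [1, 3]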
Example #10
import math
import sys
# SuperList, a list subclass with the padding helpers used throughout,
# is assumed to be importable from the project's own modules.

class Index:
    '''
    Index is our main class; the classes for the individual IR algorithms inherit from it.
    Its main data structures are:
    * terms: a simple list of all terms in all training documents
    * matrix: our vector space, where terms, documents & classes are mapped to each other
        matrix = [{'id': 'document1',
                   'class': 'spam',
                   'terms': [1,0,1,0,0,1]
                  }]
    * queries: should look exactly like matrix
        queries = [{'id': 'query1',
                    'class': 'spam', # In testing this is the known class, else "n/a".
                    'terms': [1,0,1,1,0,1]
                   }]
    '''

    # The initialization function; we set verbose=True for debugging
    def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
        self.verbose = verbose
        self.fold = fold
        self.config = config
        self.config_data = config.get_configuration()
        self.distance_metric = self.config_data['distance_metric']
        # Set k=0 now, let kNN reset it later on
        self.k = 0
        # configure the evaluation module
        self.ev = ev
        self.terms = SuperList()
        self.matrix = []
        self.queries = []
        if self.verbose:
            print "\nInitialization for fold %s done!" % fold

    # Index[key] returns a list of occurrences of the term (key) in all documents
    def __getitem__(self, key):
        try:
            index = self.terms.index(key)
            return [doc['terms'][index] for doc in self.matrix]
        except ValueError:
            if self.verbose: print sys.exc_info()
            raise KeyError(key)

    # Gives some stats about our training set
    def diagnose(self):
        print "Diagnose:", self.__class__
        print "- Number of Documents:", len(self.matrix)
        print "- Number of Terms:", len(self.terms)
        #for doc in self.matrix:
        #	print doc['id'], sum(doc['terms'])
        #print "-- Terms:", self.terms

    # To align the length of all rows in matrix after new docs/terms are added to it
    def do_padding(self):
        for doc in self.matrix:
            doc['terms'].do_padding(new_len=len(self.terms), padding_data=0)
        for query in self.queries:
            query['terms'].do_padding(new_len=len(self.terms), padding_data=0)

    # We keep the matrix without log_tf at first, in case we need to do Feature Selection
    # In the case of Rocchio we do the log_tf on the fly when calculating the proto_classes
    # Whereas in kNN we might need to call this function explicitly
    def matrix_to_log_tf(self):
        for doc in self.matrix:
            doc['terms'] = self.vector_log_tf(doc['terms'])

    # For debugging purposes; displays the index and matrix
    def display_idx(self):
        print self.terms
        for doc in self.matrix:
            print doc['id'], doc['class'], doc['terms']

    # Converts a scalar value to its log_tf (1 + log_10(value), or zero)
    def log_tf(self, value, do_nothing=False):
        val = float(value)
        if not do_nothing:
            val = 1 + math.log10(val) if val != 0 else float(0)
        return val

    # Converts a vector to its log_tf values (1 + log_10(value), or zero)
    def vector_log_tf(self, a=[], do_nothing=False):
        new_vector = SuperList()
        for i in range(len(a)):
            new_vector.append(self.log_tf(value=a[i], do_nothing=do_nothing))
        return new_vector

    # Divides each item in a vector (list) by a scalar number
    def divide_vector(self, vector=[], scalar=1):
        result = SuperList()
        for item in vector:
            result.append(float(item) / scalar)
        return result

    # Adds two vectors (lists) together and returns the resulting vector
    # Each operand can optionally have its items converted to log_tf before the addition
    def add_vectors(self, a=[], b=[], log_tf_a=True, log_tf_b=True):
        if not b:
            # If b is omitted, treat it as a zero vector of a's length
            b = SuperList()
            b.do_padding(new_len=len(a), padding_data=0)
        elif len(a) != len(b):
            if self.verbose:
                print "add_vectors:", len(a), "!=", len(b)
            raise Exception("add_vectors: vectors have different lengths")
        sum_vector = SuperList()
        for i in range(len(a)):
            sum_vector.append(
                self.log_tf(a[i], do_nothing=not log_tf_a) +
                self.log_tf(b[i], do_nothing=not log_tf_b))
        return sum_vector

    # Calculates the cosine of the angle between two vectors (lists)
    def cos_vectors(self, a=[], b=[]):
        if len(a) != len(b):
            if self.verbose:
                print "cos_vectors:", len(a), "!=", len(b)
            raise Exception("cos_vectors: vectors have different lengths")
        norm_a_sqrd = norm_b_sqrd = 0
        numerator = 0
        for i in range(len(a)):
            numerator = numerator + a[i] * b[i]
            # Do not use math.pow(), it is time consuming!
            norm_a_sqrd = norm_a_sqrd + (a[i] * a[i])
            norm_b_sqrd = norm_b_sqrd + (b[i] * b[i])
        # When one vector is all zeros, a division by zero happens.
        # Normally this occurs when training on a small training set
        # and every term in the query is being seen for the first time.
        try:
            return_value = numerator / (math.sqrt(norm_a_sqrd) *
                                        math.sqrt(norm_b_sqrd))
        except ZeroDivisionError:
            return_value = 0
        return return_value

    # Calculates the Euclidean distance between two vectors (lists)
    def euclid_vectors(self, a=[], b=[]):
        if len(a) != len(b):
            if self.verbose:
                print "euclid_vectors:", len(a), "!=", len(b)
            raise Exception("euclid_vectors: vectors have different lengths")
        euclid_sqrd = 0
        for i in range(len(a)):
            euclid_sqrd += math.pow((a[i] - b[i]), 2)
        return math.sqrt(euclid_sqrd)

    # Calculates the distance between two vectors (lists) using the configured metric
    def calculate_vectors_distance(self, a=[], b=[]):
        if self.distance_metric == "cos":
            return self.cos_vectors(a, b)
        elif self.distance_metric == "euclid":
            return self.euclid_vectors(a, b)

    # We call this each time we train on a new document
    # It is given the document's doc_class and a list of the doc_terms parsed from it
    # Each new document may also bring new terms into our terms list and matrix
    # So, if do_padding=True, we extend and pad all old rows in the matrix to match the new length of terms
    # Otherwise, we may postpone this padding until all docs have been added, for performance reasons
    def add_doc(self, doc_id='', doc_class='', doc_terms=[], do_padding=False):
        my_doc_terms = SuperList()
        for term in doc_terms:
            self.terms.unique_append(term)
            my_doc_terms.insert_after_padding(self.terms.index(term))
        self.matrix.append({
            'id': doc_id,
            'class': doc_class,
            'terms': my_doc_terms
        })
        if do_padding:
            self.do_padding()

    # We call this each time we add a new query
    # It is given the query's query_class and a list of the query_terms parsed from it
    # No padding here, since terms in the query that were not seen during training are ignored
    def add_query(self, query_id='', query_class='n/a', query_terms=[]):
        my_query_terms = SuperList()
        my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
        for term in query_terms:
            try:
                my_query_terms.insert_after_padding(self.terms.index(term))
            except ValueError:
                # Term not seen in the training phase, ignore it
                pass
        # Convert my_query_terms to log_tf values; add_vectors returns a new
        # vector rather than mutating its argument, so keep the result
        my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a=True)
        self.queries.append({
            'id': query_id,
            'class': query_class,
            'terms': my_query_terms
        })

    # This is where each classifier may do any calculations after loading the training data
    # We leave it for each child class to override in its own way, or to ignore it
    # We may add Feature Selection here, for example: Maximum Information Gain
    # Hence, make sure all child classes call their parent's method before overriding
    def calculate_training_data(self):
        pass
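Putting the pieces together, a minimal end-to-end flow against this class might look like the sketch below. FakeConfig is a hypothetical stand-in providing only what __init__ reads via get_configuration(); the real config module is not shown in these examples.

# Hypothetical config object exposing just the key __init__ consumes
class FakeConfig:
    def get_configuration(self):
        return {'distance_metric': 'cos'}

idx = Index(verbose=False, fold=1, config=FakeConfig(), ev=object)
idx.add_doc(doc_id='d1', doc_class='spam', doc_terms=['free', 'win', 'free'])
idx.add_doc(doc_id='d2', doc_class='ham', doc_terms=['meeting', 'notes'],
            do_padding=True)   # pad earlier rows to the final vocabulary size
idx.add_query(query_id='q1', query_class='spam', query_terms=['free', 'cash'])
print "terms: %s" % idx.terms                  # ['free', 'win', 'meeting', 'notes']
print "d1 row: %s" % idx.matrix[0]['terms']    # [2, 1, 0, 0]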
Example #11
 def divide_vector(self, vector=[], scalar=1):
     result = SuperList()
     for item in vector:
         result.append(float(item) / scalar)
     return result
Example #12
 def vector_log_tf(self, a=[], do_nothing=False):
     new_vector = SuperList()
     for i in range(len(a)):
         new_vector.append(self.log_tf(value=a[i], do_nothing=do_nothing))
     return new_vector
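As a quick sanity check of the weighting rule (1 + log10(value), with zero special-cased to zero):

# vector_log_tf applies log_tf element-wise:
#   1   -> 1 + log10(1)   == 1.0
#   10  -> 1 + log10(10)  == 2.0
#   0   -> 0.0 (zero is special-cased in log_tf)
#   100 -> 1 + log10(100) == 3.0
# so vector_log_tf([1, 10, 0, 100]) == [1.0, 2.0, 0.0, 3.0]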