Exemple #1
0
class Matrix:
    ''' Term/document matrix.
        terms: vocabulary, one unique term per matrix column.
        docs: matrix rows, a list of dictionaries of the form
              { 'id': document identifier,
                'class': document class label,
                'terms': list of per-term values, aligned with self.terms
              }
    '''

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # One dictionary per loaded document (matrix row)
        self.docs = []

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms

    def __str__(self):
        summary = ['Matrix:',
                   ' * Vocabulary read: %d' % len(self.terms),
                   ' * Documents read: %d' % len(self.docs)]
        return '\n'.join(summary)

    def _relation_name(self, filename):
        ''' Derives the ARFF relation name from filename: the base name of
            the file up to its first dot. The directory part is dropped, so
            paths such as './out.arff' no longer yield an empty name.
        '''
        # Local import: the file's top-level imports are not visible here
        import os
        base = os.path.basename(filename)
        return base.split('.')[0] or base

    def _write_lines(self, filename, *line_groups):
        ''' Writes every line of every group to filename, each followed by
            a newline. The with-block closes the file even if a write fails.
        '''
        with open(filename, 'w') as fd:
            for group in line_groups:
                for line in group:
                    fd.write('%s\n' % line)

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a delimited text file, one document per line.
            filename: path of the file to (over)write
            delimiter: field separator
            header: if True, first line is 'id', 'class' and all terms
            Ids, classes and values are converted with str(), so they
            do not have to be strings.
        '''
        head = []
        if header:
            columns = ['id', 'class'] + [str(term) for term in self.terms]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(doc['id']), str(doc['class'])] +
                               [str(value) for value in doc['terms']])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_arff(self, filename, delimiter=',',):
        ''' Dumps matrix to an ARFF file.
            Attributes are written in this order: 'ID', one NUMERIC
            attribute per term, then 'class'. Each data line follows
            the same order.
        '''
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                "@ATTRIBUTE 'ID' NUMERIC"]
        head.extend("@ATTRIBUTE '%s' NUMERIC" % (term,) for term in self.terms)
        head.extend(['@ATTRIBUTE class NUMERIC', '', '@DATA'])
        rows = (delimiter.join([str(doc['id'])] +
                               [str(value) for value in doc['terms']] +
                               [str(doc['class'])])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_transposed(self, filename, delimiter='\t', header=True):
        ''' Dumps the transposed matrix: one line per term,
            one column per document.
            header: if True, first line is 'terms' followed by document ids
        '''
        head = []
        if header:
            columns = ['terms'] + [str(doc['id']) for doc in self.docs]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(term)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def dump_transposed_arff(self, filename):
        ''' Dumps the transposed matrix to an ARFF file: a STRING attribute
            'terms' followed by one NUMERIC attribute per document id.
        '''
        delimiter = ','
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                '@ATTRIBUTE terms STRING']
        head.extend('@ATTRIBUTE "%s" NUMERIC' % (doc['id'],)
                    for doc in self.docs)
        head.extend(['', '@DATA'])
        rows = (delimiter.join(['"%s"' % (term,)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def prune(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
            Returns False (and changes nothing) if prune_map is empty or
            its length does not match the vocabulary; otherwise None.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Walk backwards so that popping never shifts an index
        # that still has to be visited.
        for i in reversed(range(len(prune_map))):
            if prune_map[i] == 0:
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def freq_levels(self, threshold=3):
        ''' Creates a list of 0's and 1's, one entry per term,
            where 1 means the term has a non-zero value in at least
            threshold documents.
        '''
        freq_map = [0] * len(self.terms)
        for i in range(len(self.terms)):
            doc_count = sum(1 for doc in self.docs if doc['terms'][i] != 0)
            if doc_count >= threshold:
                freq_map[i] = 1
        return freq_map

    def __contains__(self, term):
        'Checks if certain term is loaded'
        return term in self.terms

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if term not in self:
            return SuperList()
        # Look the column up once instead of once per document
        column = self.terms.index(term)
        return SuperList([doc['terms'][column] for doc in self.docs])

    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in list,
            otherwise, returns -1
        '''
        if term not in self:
            return -1
        return self.terms.index(term)

    def do_padding(self):
        ''' Align the length of all rows in matrix
            Each time we see a new term, list of terms is expanded,
            and the matrix row for such document is of same length too.
            But what about rows added earlier for previous documents?
            So, this method aligns all previously added rows,
            to match the current length of the terms list.
            Every row is checked (not just the first and last one),
            and an empty matrix is simply left alone.
        '''
        width = len(self.terms)
        for doc in self.docs:
            # Rows already at full width are skipped; presumably expand()
            # would be a no-op for them anyway.
            if len(doc['terms']) != width:
                doc['terms'].expand(new_len=width)

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values, in place.
            do_idf: if False, convert to tf only
            tf is log_tf(value); idf is N / df, where N is the number of
            documents and df the number of documents with a non-zero
            value for the term. A term with df == 0 gets idf 0.
            Rows are expected to be padded to the vocabulary length.
        '''
        n_docs = len(self.docs)
        n_terms = len(self.terms)
        idf = [1] * n_terms
        if do_idf:
            # Document frequencies are taken before any cell is rewritten
            for idx in range(n_terms):
                df = sum(1 for doc in self.docs if doc['terms'][idx] != 0)
                idf[idx] = float(n_docs) / df if df else 0.0
        for doc in self.docs:
            row = doc['terms']
            for idx in range(n_terms):
                row[idx] = log_tf(row[idx]) * idf[idx]

    def add_doc(self, doc_id='', doc_class='', doc_terms=None,
                frequency=False, do_padding=False):
        ''' Add new document to our matrix:
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples (term, value) or plain terms.
            frequency: If true, term occurrences is incremented by one.
                        Else, occurrences is only 0 or 1 (a la Bernoulli)
            do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in (doc_terms or []):
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
                continue
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False,):
        ''' Converts query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except (ValueError, LookupError):
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        'Returns a Stats object built from this matrix'
        return Stats(self)
Exemple #2
0
class Matrix:
    ''' Term/document matrix with optional term whitelist. '''

    def __init__(self, whitelist=None):
        ''' Initialize our matrix.
            whitelist: If not empty, discard any terms not in whitelist,
                       when adding new terms via add_doc()
            terms: We will populate this with our vocabulary of terms
            docs: This is our actual 2D matrix terms/docs.
                  A list of the following dictionary,
                  { 'id': Unique ID to each document,
                    'class': In case of labeled data, doc class label,
                    'terms': list of 1's and 0's, i.e. term Frequencies.
                  }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # One dictionary per loaded document (matrix row)
        self.docs = []
        # A fresh list per instance: a shared default list would leak
        # whitelist changes from one matrix into every other one.
        self.whitelist = [] if whitelist is None else whitelist

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=None):
        '''Returns list of all unique terms if threshold_map not given.
           Otherwise, only return terms whose threshold_map entry is 1.
           A threshold_map of the wrong length yields an empty list.
        '''
        if not threshold_map:
            return self.terms
        if len(threshold_map) != len(self.terms):
            return []
        return [term for term, keep in zip(self.terms, threshold_map)
                if keep == 1]

    def __str__(self):
        summary = ['Matrix:',
                   ' * Vocabulary read: %d' % len(self.terms),
                   ' * Documents read: %d' % len(self.docs)]
        return '\n'.join(summary)

    def _relation_name(self, filename):
        ''' Derives the ARFF relation name from filename: the base name of
            the file up to its first dot. The directory part is dropped, so
            paths such as './out.arff' no longer yield an empty name.
        '''
        # Local import: the file's top-level imports are not visible here
        import os
        base = os.path.basename(filename)
        return base.split('.')[0] or base

    def _write_lines(self, filename, *line_groups):
        ''' Writes every line of every group to filename, each followed by
            a newline. The with-block closes the file even if a write fails.
        '''
        with open(filename, 'w') as fd:
            for group in line_groups:
                for line in group:
                    fd.write('%s\n' % line)

    def _term_name(self, term):
        'Returns the term itself, or its first item for (term, value) tuples'
        return term[0] if isinstance(term, tuple) else term

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies, one "term<delimiter>freq" line per term.
            freqs: list of frequencies aligned with self.vocabulary()
            header: if True, first line is 'term' and 'freq'
        '''
        head = [delimiter.join(['term', 'freq'])] if header else []
        rows = (delimiter.join([str(term), str(freqs[i])])
                for i, term in enumerate(self.vocabulary()))
        self._write_lines(filename, head, rows)

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a delimited text file, one document per line.
            filename: path of the file to (over)write
            delimiter: field separator
            header: if True, first line is 'id', 'class' and all terms
            Ids, classes and values are converted with str(), so they
            do not have to be strings.
        '''
        head = []
        if header:
            columns = ['id', 'class'] + [str(term) for term in self.terms]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(doc['id']), str(doc['class'])] +
                               [str(value) for value in doc['terms']])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to an ARFF file.
            Attributes are written in this order: 'ARFFID', one NUMERIC
            attribute per term, then 'ClassLabel' of type clstype.
            Each data line follows the same order.
        '''
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                "@ATTRIBUTE 'ARFFID' NUMERIC"]
        head.extend("@ATTRIBUTE '%s' NUMERIC" % (term,) for term in self.terms)
        head.extend(["@ATTRIBUTE 'ClassLabel' " + clstype, '', '@DATA'])
        rows = (delimiter.join([str(doc['id'])] +
                               [str(value) for value in doc['terms']] +
                               [str(doc['class'])])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_transposed(self, filename, delimiter='\t', header=True):
        ''' Dumps the transposed matrix: one line per term,
            one column per document.
            header: if True, first line is 'terms' followed by document ids
        '''
        head = []
        if header:
            columns = ['terms'] + [str(doc['id']) for doc in self.docs]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(term)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def dump_transposed_arff(self, filename):
        ''' Dumps the transposed matrix to an ARFF file: a STRING attribute
            'terms' followed by one NUMERIC attribute per document id.
        '''
        delimiter = ','
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                '@ATTRIBUTE terms STRING']
        head.extend('@ATTRIBUTE "%s" NUMERIC' % (doc['id'],)
                    for doc in self.docs)
        head.extend(['', '@DATA'])
        rows = (delimiter.join(['"%s"' % (term,)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def prune_old(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
            Returns False (and changes nothing) if prune_map is empty or
            its length does not match the vocabulary; otherwise None.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Walk backwards so that popping never shifts an index
        # that still has to be visited.
        for i in reversed(range(len(prune_map))):
            if prune_map[i] == 0:
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def prune(self, prune_map, show_progress=True):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 1, then keep it, otherwise remove it.
            The surviving terms keep their relative order, so lists that
            were aligned with the vocabulary can be filtered the same way.
            show_progress: print progress messages while pruning
            Returns False (and changes nothing) if prune_map is empty or
            its length does not match the vocabulary; otherwise None.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Column indexes to keep, in ascending order
        kept = [i for i, flag in enumerate(prune_map) if flag == 1]
        if show_progress:
            print('  Pruning terms list ...')
        self.terms = SuperList([self.terms[i] for i in kept])
        p = None
        if show_progress:
            print('  Pruning documents ...')
            # The progress reporter is only needed when output is wanted
            p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            row = doc['terms']
            doc['terms'] = SuperList([row[i] for i in kept])
            if show_progress:
                p.show(message='  Pruning progress:')

    def freq_levels(self, threshold=3):
        ''' Creates two lists:
            threshold_map is a list of 0's and 1's,
            where 1 means term's freq >= threshold
            freq_map is a list of terms frequencies, i.e. the sum of
            the term's non-zero values over all documents.
            Returns the tuple (threshold_map, freq_map).
        '''
        freq_map = []
        for i in range(len(self.terms)):
            values = (doc['terms'][i] for doc in self.docs)
            freq_map.append(sum(value for value in values if value != 0))
        threshold_map = [1 if freq >= threshold else 0 for freq in freq_map]
        return (threshold_map, freq_map)

    def __contains__(self, term):
        'Checks if certain term is loaded'
        return term in self.terms

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if term not in self:
            return SuperList()
        # Look the column up once instead of once per document
        column = self.terms.index(term)
        return SuperList([doc['terms'][column] for doc in self.docs])

    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in list,
            otherwise, returns -1
        '''
        if term not in self:
            return -1
        return self.terms.index(term)

    def do_padding(self):
        ''' Align the length of all rows in matrix
            Each time we see a new term, list of terms is expanded,
            and the matrix row for such document is of same length too.
            But what about rows added earlier for previous documents?
            So, this method aligns all previously added rows,
            to match the current length of the terms list.
            Every row is checked (not just the first and last one),
            and an empty matrix is simply left alone.
        '''
        width = len(self.terms)
        for doc in self.docs:
            # Rows already at full width are skipped; presumably expand()
            # would be a no-op for them anyway.
            if len(doc['terms']) != width:
                doc['terms'].expand(new_len=width)

    def _log_tf(self, value):
        'Returns 1 + log10(value), or 0.0 when value is zero'
        val = float(value)
        if val == 0:
            return float(0)
        return 1 + math.log10(val)

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values, in place.
            do_idf: if False, convert to tf only
            tf is _log_tf(value); idf is log10(N / df), where N is the
            number of documents and df the number of documents with a
            positive value for the term. A term with df == 0 gets idf 0
            instead of raising ZeroDivisionError.
            Rows are expected to be padded to the vocabulary length.
        '''
        n_docs = len(self.docs)
        n_terms = len(self.terms)
        idf = None
        if do_idf:
            # Document frequencies, taken before any cell is rewritten
            df = [0] * n_terms
            for doc in self.docs:
                row = doc['terms']
                for idx in range(n_terms):
                    if row[idx] > 0:
                        df[idx] += 1
            # idf depends on the term only, so compute it once per term
            idf = [math.log10(float(n_docs) / count) if count else 0.0
                   for count in df]
        for doc in self.docs:
            row = doc['terms']
            for idx in range(n_terms):
                tf = self._log_tf(row[idx])
                row[idx] = tf * idf[idx] if do_idf else tf

    def add_doc(self, doc_id='', doc_class='', doc_terms=None,
                frequency=False, do_padding=False,
                unique_ids=False, stopwords=None):
        ''' Add new document to our matrix:
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; string and frequencies
            frequency: If true, term occurrences is incremented by one.
                        Else, occurrences is only 0 or 1 (a la Bernoulli)
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When true, if two documents are added with same id,
                        then their terms are summed up into only one record.
            stopwords: If not empty, ignore those stop words in doc_terms
            For (term, value) tuples, the whitelist and stopwords are
            matched against the term as well as against the whole tuple.
        '''
        doc_terms = doc_terms or []
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms
                         if t in self.whitelist
                         or self._term_name(t) in self.whitelist]
        # Discard anything in stopwords if not empty
        if stopwords:
            doc_terms = [t for t in doc_terms
                         if t not in stopwords
                         and self._term_name(t) not in stopwords]
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
                continue
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # the row is filled with one zero per known term.
        if not my_doc_terms:
            my_doc_terms = SuperList([float(0)] * len(self.vocabulary()))

        merged = False
        if unique_ids:
            # Sum the new row into every existing record with the same id
            for doc in self.docs:
                if doc['id'] == doc_id:
                    doc['terms'].add(my_doc_terms)
                    merged = True
        if not merged:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False,):
        ''' Converts query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except (ValueError, LookupError):
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        'Returns a Stats object built from this matrix'
        return Stats(self)
Exemple #3
0
class Matrix:
    ''' Term/document matrix with optional term whitelist. '''

    def __init__(self, whitelist=None):
        ''' Initialize our matrix.
            whitelist: If not empty, discard any terms not in whitelist,
                       when adding new terms via add_doc()
            terms: We will populate this with our vocabulary of terms
            docs: This is our actual 2D matrix terms/docs.
                  A list of the following dictionary,
                  { 'id': Unique ID to each document,
                    'class': In case of labeled data, doc class label,
                    'terms': list of 1's and 0's, i.e. term Frequencies.
                  }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # One dictionary per loaded document (matrix row)
        self.docs = []
        # A fresh list per instance: a shared default list would leak
        # whitelist changes from one matrix into every other one.
        self.whitelist = [] if whitelist is None else whitelist

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=None):
        '''Returns list of all unique terms if threshold_map not given.
           Otherwise, only return terms whose threshold_map entry is 1.
           A threshold_map of the wrong length yields an empty list.
        '''
        if not threshold_map:
            return self.terms
        if len(threshold_map) != len(self.terms):
            return []
        return [term for term, keep in zip(self.terms, threshold_map)
                if keep == 1]

    def __str__(self):
        summary = ['Matrix:',
                   ' * Vocabulary read: %d' % len(self.terms),
                   ' * Documents read: %d' % len(self.docs)]
        return '\n'.join(summary)

    def _relation_name(self, filename):
        ''' Derives the ARFF relation name from filename: the base name of
            the file up to its first dot. The directory part is dropped, so
            paths such as './out.arff' no longer yield an empty name.
        '''
        # Local import: the file's top-level imports are not visible here
        import os
        base = os.path.basename(filename)
        return base.split('.')[0] or base

    def _write_lines(self, filename, *line_groups):
        ''' Writes every line of every group to filename, each followed by
            a newline. The with-block closes the file even if a write fails.
        '''
        with open(filename, 'w') as fd:
            for group in line_groups:
                for line in group:
                    fd.write('%s\n' % line)

    def _term_name(self, term):
        'Returns the term itself, or its first item for (term, value) tuples'
        return term[0] if isinstance(term, tuple) else term

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies, one "term<delimiter>freq" line per term.
            freqs: list of frequencies aligned with self.vocabulary()
            header: if True, first line is 'term' and 'freq'
        '''
        head = [delimiter.join(['term', 'freq'])] if header else []
        rows = (delimiter.join([str(term), str(freqs[i])])
                for i, term in enumerate(self.vocabulary()))
        self._write_lines(filename, head, rows)

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a delimited text file, one document per line.
            filename: path of the file to (over)write
            delimiter: field separator
            header: if True, first line is 'id', 'class' and all terms
            Ids, classes and values are converted with str(), so they
            do not have to be strings.
        '''
        head = []
        if header:
            columns = ['id', 'class'] + [str(term) for term in self.terms]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(doc['id']), str(doc['class'])] +
                               [str(value) for value in doc['terms']])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to an ARFF file.
            Attributes are written in this order: 'ID', one NUMERIC
            attribute per term, then 'ClassLabel' of type clstype.
            Each data line follows the same order.
        '''
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                "@ATTRIBUTE 'ID' NUMERIC"]
        head.extend("@ATTRIBUTE '%s' NUMERIC" % (term,) for term in self.terms)
        head.extend(["@ATTRIBUTE 'ClassLabel' " + clstype, '', '@DATA'])
        rows = (delimiter.join([str(doc['id'])] +
                               [str(value) for value in doc['terms']] +
                               [str(doc['class'])])
                for doc in self.docs)
        self._write_lines(filename, head, rows)

    def dump_transposed(self, filename, delimiter='\t', header=True):
        ''' Dumps the transposed matrix: one line per term,
            one column per document.
            header: if True, first line is 'terms' followed by document ids
        '''
        head = []
        if header:
            columns = ['terms'] + [str(doc['id']) for doc in self.docs]
            head.append(delimiter.join(columns))
        rows = (delimiter.join([str(term)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def dump_transposed_arff(self, filename):
        ''' Dumps the transposed matrix to an ARFF file: a STRING attribute
            'terms' followed by one NUMERIC attribute per document id.
        '''
        delimiter = ','
        head = ['@RELATION %s' % self._relation_name(filename),
                '',
                '@ATTRIBUTE terms STRING']
        head.extend('@ATTRIBUTE "%s" NUMERIC' % (doc['id'],)
                    for doc in self.docs)
        head.extend(['', '@DATA'])
        rows = (delimiter.join(['"%s"' % (term,)] +
                               [str(doc['terms'][idx]) for doc in self.docs])
                for idx, term in enumerate(self.terms))
        self._write_lines(filename, head, rows)

    def prune_old(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
            Returns False (and changes nothing) if prune_map is empty or
            its length does not match the vocabulary; otherwise None.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Walk backwards so that popping never shifts an index
        # that still has to be visited.
        for i in reversed(range(len(prune_map))):
            if prune_map[i] == 0:
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def prune(self, prune_map, show_progress=True):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 1, then keep it, otherwise remove it.
            The surviving terms keep their relative order, so lists that
            were aligned with the vocabulary can be filtered the same way.
            show_progress: print progress messages while pruning
            Returns False (and changes nothing) if prune_map is empty or
            its length does not match the vocabulary; otherwise None.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Column indexes to keep, in ascending order
        kept = [i for i, flag in enumerate(prune_map) if flag == 1]
        if show_progress:
            print('  Pruning terms list ...')
        self.terms = SuperList([self.terms[i] for i in kept])
        p = None
        if show_progress:
            print('  Pruning documents ...')
            # The progress reporter is only needed when output is wanted
            p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            row = doc['terms']
            doc['terms'] = SuperList([row[i] for i in kept])
            if show_progress:
                p.show(message='  Pruning progress:')

    def freq_levels(self, threshold=3):
        ''' Creates two lists:
            threshold_map is a list of 0's and 1's,
            where 1 means term's freq >= threshold
            freq_map is a list of terms frequencies, i.e. the sum of
            the term's non-zero values over all documents.
            Returns the tuple (threshold_map, freq_map).
        '''
        freq_map = []
        for i in range(len(self.terms)):
            values = (doc['terms'][i] for doc in self.docs)
            freq_map.append(sum(value for value in values if value != 0))
        threshold_map = [1 if freq >= threshold else 0 for freq in freq_map]
        return (threshold_map, freq_map)

    def __contains__(self, term):
        'Checks if certain term is loaded'
        return term in self.terms

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if term not in self:
            return SuperList()
        # Look the column up once instead of once per document
        column = self.terms.index(term)
        return SuperList([doc['terms'][column] for doc in self.docs])

    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in list,
            otherwise, returns -1
        '''
        if term not in self:
            return -1
        return self.terms.index(term)

    def do_padding(self):
        ''' Align the length of all rows in matrix
            Each time we see a new term, list of terms is expanded,
            and the matrix row for such document is of same length too.
            But what about rows added earlier for previous documents?
            So, this method aligns all previously added rows,
            to match the current length of the terms list.
            Every row is checked (not just the first and last one),
            and an empty matrix is simply left alone.
        '''
        width = len(self.terms)
        for doc in self.docs:
            # Rows already at full width are skipped; presumably expand()
            # would be a no-op for them anyway.
            if len(doc['terms']) != width:
                doc['terms'].expand(new_len=width)

    def _log_tf(self, value):
        'Returns 1 + log10(value), or 0.0 when value is zero'
        val = float(value)
        if val == 0:
            return float(0)
        return 1 + math.log10(val)

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values, in place.
            do_idf: if False, convert to tf only
            tf is _log_tf(value); idf is log10(N / df), where N is the
            number of documents and df the number of documents with a
            positive value for the term. A term with df == 0 gets idf 0
            instead of raising ZeroDivisionError.
            Rows are expected to be padded to the vocabulary length.
        '''
        n_docs = len(self.docs)
        n_terms = len(self.terms)
        idf = None
        if do_idf:
            # Document frequencies, taken before any cell is rewritten
            df = [0] * n_terms
            for doc in self.docs:
                row = doc['terms']
                for idx in range(n_terms):
                    if row[idx] > 0:
                        df[idx] += 1
            # idf depends on the term only, so compute it once per term
            idf = [math.log10(float(n_docs) / count) if count else 0.0
                   for count in df]
        for doc in self.docs:
            row = doc['terms']
            for idx in range(n_terms):
                tf = self._log_tf(row[idx])
                row[idx] = tf * idf[idx] if do_idf else tf

    def add_doc(self, doc_id='', doc_class='', doc_terms=None,
                frequency=False, do_padding=False):
        ''' Add new document to our matrix:
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; string and frequencies
            frequency: If true, term occurrences is incremented by one.
                        Else, occurrences is only 0 or 1 (a la Bernoulli)
            do_padding: Boolean. Check do_padding() for more info.
            For (term, value) tuples, the whitelist is matched against
            the term as well as against the whole tuple.
        '''
        doc_terms = doc_terms or []
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms
                         if t in self.whitelist
                         or self._term_name(t) in self.whitelist]
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
                continue
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # the row is filled with one zero per known term.
        if not my_doc_terms:
            my_doc_terms = SuperList([float(0)] * len(self.vocabulary()))
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False,):
        ''' Converts query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except (ValueError, LookupError):
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        'Returns a Stats object built from this matrix'
        return Stats(self)
Exemple #4
0
class Matrix:
    ''' Term-document matrix.
        Every added document becomes one entry in self.docs, and self.terms
        holds the vocabulary that gives the columns of each row their order.
    '''

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # Per-class summary of terms, keyed by document class
        self.classes = {}
        # One dict per document, with keys 'id', 'class' and 'terms'
        self.docs = []

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms

    def __str__(self):
        s  = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def __contains__(self, term):
        'Checks if a certain term is loaded'
        return term in self.terms

    def to_be_deleted__getitem__(self, term):
        ''' Returns occurrences of term in all documents.
            Unknown terms give an empty SuperList.
            Every row is indexed at the term's column, so this assumes
            the rows have already been padded (see do_padding()).
        '''
        if term not in self:
            return SuperList()
        term_idx = self.terms.index(term)
        col = [doc['terms'][term_idx] for doc in self.docs]
        return SuperList(col)

    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in list,
            otherwise, returns -1
        '''
        if term not in self:
            return -1
        return self.terms.index(term)

    def do_padding(self):
        ''' Align the length of all rows in matrix
            Each time we see a new term, list of terms is expanded,
            and the matrix row for such document is of same length too.
            But what about rows added earlier for previous documents?
            So, this method aligns all previously added rows,
            to match the current length of the terms list.
            Does nothing when no documents are loaded.
        '''
        vocab_size = len(self.terms)
        for doc in self.docs:
            # Compare the row itself (not the document dict, whose length
            # is always its number of keys) against the vocabulary size,
            # and only touch rows that are actually short.
            if len(doc['terms']) < vocab_size:
                doc['terms'].expand(new_len=vocab_size)

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values, in place.
            do_idf: if False, convert to tf only
            tf is log_tf() of the cell; idf is N / df, where N is the
            number of documents and df is the nonzero_count() of the
            term's column.
        '''
        N = len(self)
        doc_freq = []
        if do_idf:
            # Document frequencies are taken once, up front, so they are
            # not affected by the cells being rewritten below.
            rows = [doc['terms'] for doc in self.docs]
            for idx in range(len(self.terms)):
                # Rows that were never padded up to this column simply
                # do not contribute a cell.
                column = SuperList([row[idx] for row in rows
                                    if idx < len(row)])
                doc_freq.append(column.nonzero_count())
        for doc in self.docs:
            row = doc['terms']
            for idx, cell in enumerate(row):
                tf = log_tf(cell)
                # A column with no non-zero cell has nothing to scale;
                # fall back to 1 instead of dividing by zero.
                if do_idf and doc_freq[idx]:
                    idf = float(N) / doc_freq[idx]
                else:
                    idf = 1
                row[idx] = tf * idf

    def add_doc(self, doc_id = '', doc_class='', doc_terms=None,
                frequency=False, do_padding=False):
        ''' Add new document to our matrix:
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       None (the default) means no terms.
            frequency: If true, term occurrences is incremented by one.
                        Else, occurrences is only 0 or 1 (a la Bernoulli)
            do_padding: Boolean. Check do_padding() for more info.
        '''
        if doc_terms is None:
            doc_terms = []
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx,1)
            else:
                my_doc_terms.insert_after_padding(term_idx,1)
        self.docs.append({  'id': doc_id,
                            'class': doc_class,
                            'terms': my_doc_terms})
        # Update the per-class summary with this document's terms.
        if doc_class in self.classes:
            self.classes[doc_class].add(my_doc_terms)
        else:
            # Keep a separate copy: the summary is updated in place by
            # add() above, and must not share storage with the row of the
            # first document seen for this class.
            self.classes[doc_class] = SuperList(my_doc_terms)
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False,):
        ''' Converts query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: if True, each occurrence of a term adds one to its
                       cell (multinomial); otherwise the cell is just set
                       to 1 (multivariate).
            Returns a SuperList expanded to the length of self.terms.
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip.
                # Only the lookup failure is caught (assumes self.terms.index
                # behaves like list.index), so KeyboardInterrupt and
                # unrelated errors are no longer silently swallowed.
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx,1)
            else:
                my_query_vector.insert_after_padding(term_idx,1)
        return my_query_vector

    def get_stats(self):
        'Returns a Stats object constructed from this matrix'
        return Stats(self)