Example No. 1
 def test_percentage_greater_than(self):
     sl = SuperList()
     for i in range(100):
         sl.append(100 - i)
     self.assertEqual(sl.percentage_greater_than(-1), 100)
     self.assertEqual(sl.percentage_greater_than(11), 90)
     self.assertEqual(sl.percentage_greater_than(101), 0)
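The assertions above pin down the semantics: with the values 1..100 in the list, percentage_greater_than(11) must return 90, so values equal to the threshold are counted on the "greater" side. A minimal sketch consistent with these tests follows; the real SuperList implementation is not shown in these examples, so treat this as an assumption:

class SuperList(list):
    def percentage_greater_than(self, value):
        # Percentage of stored values >= value; the assertions above imply
        # that ties with the threshold count as "greater".
        count = sum(1 for x in self if x >= value)
        return count * 100.0 / len(self)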
Example No. 2
    def add_doc(self,
                doc_id='',
                doc_class='',
                doc_terms=[],
                frequency=False,
                do_padding=False,
                unique_ids=False,
                meta_data={}):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, URL, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; (string, frequency) pairs.
            frequency: If True, term occurrences are incremented by one.
                        Else, occurrences are only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When True, if two documents are added with the same id,
                        then their terms are summed up into only one record.
            meta_data: More fields to add to the document, for your own use.
        '''
        if not doc_terms:
            raise ValueError('doc_terms cannot be empty')
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords if not empty
        if self.blacklist:
            doc_terms = [t for t in doc_terms if t not in self.blacklist]
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # We add at least one zero in the list of my_doc_terms
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        doc_data = {'id': doc_id, 'class': doc_class, 'terms': my_doc_terms}

        for key in meta_data:
            doc_data[key] = meta_data[key]

        if unique_ids:
            self.docs.add_unique(doc_data)
        else:
            self.docs.append(doc_data)

        if do_padding:
            self.do_padding()
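A hypothetical call illustrating the parameters above (mx, its whitelist and blacklist attributes, and a docs collection with add_unique() are assumed from the surrounding class):

mx.add_doc(doc_id='doc1.txt',
           doc_class='spam',
           doc_terms=['buy', 'now', 'buy'],
           frequency=True,           # repeated terms increment the count
           do_padding=True,          # realign older rows to the grown vocabulary
           unique_ids=True,          # merge with an existing 'doc1.txt' record
           meta_data={'lang': 'en'})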
Example No. 3
 def test_percentage_lower_than(self):
     sl = SuperList()
     for i in range(100):
         sl.append(100 - i)
     self.assertEqual(sl.percentage_lower_than(-1), 0)
     self.assertEqual(sl.percentage_lower_than(11), 10)
     self.assertEqual(sl.percentage_lower_than(33.3), 33)
     self.assertEqual(sl.percentage_lower_than(101), 100)
     sl.append(4)
     self.assertTrue(sl.percentage_lower_than(5) > 4.95
                     and sl.percentage_lower_than(5) < 4.96)
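Here percentage_lower_than is strictly "less than": after appending 4, five of the 101 values fall below 5, and 5 * 100.0 / 101 ~ 4.9505, which is exactly what the final assertion brackets. A matching sketch, under the same assumption that SuperList subclasses list:

class SuperList(list):
    def percentage_lower_than(self, value):
        # Percentage of stored values strictly below value.
        count = sum(1 for x in self if x < value)
        return count * 100.0 / len(self)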
Example No. 4
 def __init__(self, matrix):
     self.mx = matrix
     self.N  = 0
     self.classes = {}
     self.terms = SuperList()       
     for c in self.mx.classes:
         self.classes[c] = {}
         self.classes[c]['terms'] = self.mx.classes[c]
         self.classes[c]['total'] = sum(self.classes[c]['terms'])
         self.terms.add(self.classes[c]['terms'])
         self.N += self.classes[c]['total']
     self.mi_terms = []
Example No. 5
 def prune(self, prune_map, show_progress=True):
     ''' Helper method to remove terms (fields) of our matrix
         prune_map is a list of 0's and 1's of same length as self.terms.
         For each term, if 0, then remove it, otherwise keep it.
     '''
     if not(prune_map) or len(prune_map) != len(self.terms):
         return False
     if show_progress:
         print('  Pruning terms list ...')
     new_terms =  SuperList()
     for i in range(len(prune_map)-1,-1,-1):
         if prune_map[i] == 1:
             #print self.terms[i]
             new_terms.append(self.terms[i])
     self.terms = new_terms
     if show_progress:
         print('  Pruning documents ...')
     p = Progress(n=len(self), percent=10)
     for doc in self.docs:
         new_doc_terms =  SuperList()
         for i in range(len(prune_map)-1,-1,-1):
             if prune_map[i] == 1:
                 new_doc_terms.append(doc['terms'][i])
         doc['terms'] = new_doc_terms
         if show_progress:
             p.show(message='  Pruning progress:')
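A hypothetical way to build a prune_map: freq_levels() (see Example No. 30 below) returns a 0/1 threshold map over self.terms, which is exactly the shape prune() expects. Note that because both loops above walk the map from the end, the surviving terms and columns come out in reversed, but mutually consistent, order:

threshold_map, freq_map = mx.freq_levels(threshold=3)
mx.prune(threshold_map, show_progress=False)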
Example No. 6
 def load(self, filename, delimiter='\t', header=True):
     ''' Loads matrix from a CSV/TSV file
     '''
     with open(filename, 'r') as fd:
         header_line = fd.readline()
         header_data = header_line.strip().split(delimiter)
         # First 2 columns are id and class
         self.terms = SuperList(header_data[2:])
         for line in fd:
             # Split the line into fields; indexing the raw string would
             # only pick out single characters, not columns.
             fields = line.strip().split(delimiter)
             doc_data = {
                 'id': fields[0],
                 'class': fields[1],
                 'terms': SuperList(fields[2:])
             }
             self.docs.append(doc_data)
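A sketch of the intended round trip, assuming load() and the dump() of Example No. 24 live on the same Matrix class (the file name is illustrative):

mx.dump('matrix.tsv', delimiter='\t', header=True)
mx2 = Matrix()
mx2.load('matrix.tsv', delimiter='\t')
assert len(mx2) == len(mx)   # same number of documents after the round trip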
Example No. 7
 def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
             frequency=False, do_padding=False):
     ''' Add a new document to our matrix:
         doc_id: Identifier for the document, e.g. file name, URL, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be tuples; (string, value) pairs.
         frequency: If True, term occurrences are incremented by one.
                     Else, occurrences are only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Check do_padding() for more info.
     '''
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     for term in doc_terms:
         if type(term) == tuple:
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx,term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx,1)
             else:
                 my_doc_terms.insert_after_padding(term_idx,1)
     self.docs.append({  'id': doc_id, 
                         'class': doc_class, 
                         'terms': my_doc_terms})
     # Update list of document classes if new class seen.
     #self.classes.unique_append(doc_class)
     #if self.classes.has_key(doc_class):
     #else:
     #    self.classes[doc_class].add(my_doc_terms)
     #    self.classes[doc_class] = my_doc_terms
     if do_padding: 
         self.do_padding()
Example No. 8
 def __init__(self, whitelist=[]):
     ''' Initialize our matrix.
         whitelist: If not empty, discard any terms not in whitelist,
                    when adding new terms via add_doc()
         terms: We will populate this with our vocabulary of terms
         docs: This is our actual 2D matrix terms/docs.
               A list of the following dictionary,
               { 'id': Unique ID to each document, 
                 'class': In case of labeled data, doc class label, 
                 'terms': list of 1's and 0's, i.e. term Frequencies.
               }
     '''
     # List of unique terms (vocabulary)
     self.terms = SuperList()
     # List of document classes and terms summary
     #self.classes = {}
     self.docs = []
     self.whitelist = whitelist
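A hypothetical construction showing the whitelist in effect, paired with the whitelist-aware add_doc() of Example No. 9:

mx = Matrix(whitelist=['buy', 'now', 'free'])
mx.add_doc(doc_id='d1', doc_terms=['buy', 'stuff', 'now'], do_padding=True)
print(mx)   # only whitelisted terms enter the vocabulary

Passing an explicit list, as here, also sidesteps the classic Python pitfall of the mutable default argument (whitelist=[]) being shared across calls.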
Example No. 9
 def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
             frequency=False, do_padding=False):
     ''' Add a new document to our matrix:
         doc_id: Identifier for the document, e.g. file name, URL, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be tuples; (string, frequency) pairs.
         frequency: If True, term occurrences are incremented by one.
                     Else, occurrences are only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Check do_padding() for more info.
     '''
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if t in self.whitelist]
     for term in doc_terms:
         if type(term) == tuple:
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx,term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx,1)
             else:
                 my_doc_terms.insert_after_padding(term_idx,1)
     # In the rare event when whitelisting causes an empty doc_terms list
     # We add at least one zero in the list of my_doc_terms
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)
     self.docs.append({  'id': doc_id, 
                         'class': doc_class, 
                         'terms': my_doc_terms})
     # Update list of document classes if new class seen.
     #self.classes.unique_append(doc_class)
     #if self.classes.has_key(doc_class):
     #else:
     #    self.classes[doc_class].add(my_doc_terms)
     #    self.classes[doc_class] = my_doc_terms
     if do_padding: 
         self.do_padding()
Example No. 10
 def tf_idf(self, do_idf=True):
     ''' Converts matrix to tf.idf values
         do_idf: if False, convert to tf only
     '''        
     N = len(self.docs)
     df = SuperList([0] * len(self.terms))
     for doc in self.docs:
         row = SuperList([0] * len(self.terms))
         for idx in range(len(self.terms)):
             if doc['terms'][idx] > 0:
                 row[idx] = 1
         df.add(row)
     
     for doc in self.docs:
         for idx in range(len(self.terms)):
             tf = self._log_tf(doc['terms'][idx])
             idf = math.log10(float(N) / df[idx])
             if do_idf:
                 doc['terms'][idx] = tf * idf
             else:
                 doc['terms'][idx] = tf
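The _log_tf() helper used here appears in Example No. 30: it maps a raw count to 1 + log10(count) and leaves zeros at zero. A short worked example of the weighting, written as a standalone function for illustration and assuming a corpus of N=100 documents:

import math

def log_tf(count):
    # Logarithmic term frequency: 0 stays 0, otherwise 1 + log10(count).
    return 1 + math.log10(count) if count else 0.0

tf = log_tf(10)              # 1 + log10(10) = 2.0
idf = math.log10(100.0 / 4)  # term occurs in df=4 of N=100 docs: ~1.3979
weight = tf * idf            # ~2.7959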
Example No. 11
 def tf_idf(self, do_idf=True):
     ''' Converts matrix to tf.idf values
         do_idf: if False, convert to tf only
     '''        
     N = len(self.docs)
     df = SuperList([0] * len(self.terms))
     for doc in self.docs:
         row = SuperList([0] * len(self.terms))
         for idx in range(len(self.terms)):
             if doc['terms'][idx] > 0:
                 row[idx] = 1
         df.add(row)
     
     for doc in self.docs:
         for idx in range(len(self.terms)):
             tf = self._log_tf(doc['terms'][idx])
             idf = math.log10(float(N) / df[idx])
             if do_idf:
                 doc['terms'][idx] = tf * idf
             else:
                 doc['terms'][idx] = tf
Example No. 12
 def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
             frequency=False, do_padding=False):
     ''' Add a new document to our matrix:
         doc_id: Identifier for the document, e.g. file name, URL, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
         frequency: If True, term occurrences are incremented by one.
                     Else, occurrences are only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Check do_padding() for more info.
     '''
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     for term in doc_terms:
         term_idx = self.terms.unique_append(term)
         #my_doc_terms.insert_after_padding(self.terms.index(term))
         if frequency:
             my_doc_terms.increment_after_padding(term_idx,1)
         else:
             my_doc_terms.insert_after_padding(term_idx,1)
     self.docs.append({  'id': doc_id, 
                         'class': doc_class, 
                         'terms': my_doc_terms})
     # Update list of document classes if new class seen.
     #self.classes.unique_append(doc_class)
     if doc_class in self.classes:
         self.classes[doc_class].add(my_doc_terms)
     else:
         self.classes[doc_class] = my_doc_terms
     if do_padding: 
         self.do_padding()
Example No. 13
 def prune(self, prune_map, show_progress=True):
     ''' Helper method to remove terms (fields) of our matrix
         prune_map is a list of 0's and 1's of same length as self.terms.
         For each term, if 0, then remove it, otherwise keep it.
     '''
     if not(prune_map) or len(prune_map) != len(self.terms):
         return False
     if show_progress:
         print('  Pruning terms list ...')
     new_terms =  SuperList()
     for i in range(len(prune_map)-1,-1,-1):
         if prune_map[i] == 1:
             #print self.terms[i]
             new_terms.append(self.terms[i])
     self.terms = new_terms
     if show_progress:
         print('  Pruning documents ...')
     p = Progress(n=len(self), percent=10)
     for doc in self.docs:
         new_doc_terms =  SuperList()
         for i in range(len(prune_map)-1,-1,-1):
             if prune_map[i] == 1:
                 new_doc_terms.append(doc['terms'][i])
         doc['terms'] = new_doc_terms
         if show_progress:
             p.show(message='  Pruning progress:')
Example No. 14
 def test_percentage(self):
     sl = SuperList()
     for i in range(100):
         sl.append(100 - i)
     self.assertEqual(sl.percentage(0), 1)
     self.assertEqual(sl.percentage(90), 91)
     self.assertEqual(sl.percentage(100), 100)
     self.assertEqual(sl.percentage(101), 100)
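These assertions read percentage(p) as "the value at the p-th percentile" of the stored numbers: with 1..100 in the list, percentage(0) is the minimum and anything at or above 100 clamps to the maximum. A sketch consistent with that reading, again assuming SuperList subclasses list:

class SuperList(list):
    def percentage(self, p):
        # Value at the p-th percentile, clamped to the last element.
        srtd = sorted(self)
        idx = min(int(len(srtd) * p / 100.0), len(srtd) - 1)
        return srtd[idx]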
Example No. 15
 def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
             frequency=False, do_padding=False, stopwords=[]):
     ''' Add a new document to our matrix:
         doc_id: Identifier for the document, e.g. file name, URL, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be tuples; (string, frequency) pairs.
         frequency: If True, term occurrences are incremented by one.
                     Else, occurrences are only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Not used in this variant.
         stopwords: If not empty, ignore those stop words in doc_terms.
     '''
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if t in self.whitelist]
     # Discard anything in stopwords if not empty
     if stopwords:
         doc_terms = [t for t in doc_terms if t not in stopwords]
     for term in doc_terms:
         if type(term) == tuple:
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx,term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx,1)
             else:
                 my_doc_terms.insert_after_padding(term_idx,1)
     #self.docs.append({  'id': doc_id, 
     #                    'class': doc_class, 
     #                    'terms': my_doc_terms})
     found = 0
     for doc in self.docs:
         if doc['class'] == doc_class:
             doc['terms'].add(my_doc_terms)
             found = 1
     if not found:        
         self.docs.append({'id': doc_id, 
                           'class': doc_class, 
                           'terms': my_doc_terms}) 
     if do_padding: 
         self.do_padding()  
Example No. 16
 def load(self, filename, delimiter='\t', header=True):
     ''' Loads matrix from a CSV/TSV file
     '''
     with open(filename, 'r') as fd:
         header_line = fd.readline()
         header_data = header_line.strip().split(delimiter)
         # First 2 columns are id and class
         self.terms = SuperList(header_data[2:])
         for line in fd:
             # Split the line into fields; indexing the raw string would
             # only pick out single characters, not columns.
             fields = line.strip().split(delimiter)
             doc_data = {
                 'id': fields[0],
                 'class': fields[1],
                 'terms': SuperList(fields[2:])
             }
             self.docs.append(doc_data)
Example No. 17
 def __init__(self, whitelist=[]):
     ''' Initialize our matrix.
         whitelist: If not empty, discard any terms not in whitelist,
                    when adding new terms via add_doc()
         terms: We will populate this with our vocabulary of terms
         docs: This is our actual 2D matrix terms/docs.
               A list of the following dictionary,
               { 'id': Unique ID to each document, 
                 'class': In case of labeled data, doc class label, 
                 'terms': list of 1's and 0's, i.e. term Frequencies.
               }
     '''
     # List of unique terms (vocabulary)
     self.terms = SuperList()
     # List of document classes and terms summary
     #self.classes = {}
     self.docs = []
     self.whitelist = whitelist
Example No. 18
 def query_to_vector(self, q_terms, frequency=False):
     ''' Converts a query to a list aligned with our self.terms.
         Terms not seen before will be ignored.
         q_terms: list of query terms
         frequency: return a multinomial or multivariate list?
     '''
     my_query_vector = SuperList()
     my_query_vector.expand(new_len=len(self.terms))
     for term in q_terms:
         try:
             term_idx = self.terms.index(term)
         except ValueError:
             # Term not seen before, skip
             continue
         if frequency:
             my_query_vector.increment_after_padding(term_idx, 1)
         else:
             my_query_vector.insert_after_padding(term_idx, 1)
     return my_query_vector
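Hypothetical usage, assuming mx is a populated Matrix as in the full-class examples below; the returned vector is aligned with the vocabulary and unseen terms are silently dropped:

q = mx.query_to_vector(['buy', 'now', 'unseen-term'], frequency=False)
assert len(q) == len(mx.vocabulary())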
Example No. 19
 def add_doc(self, doc_id="", doc_class="", doc_terms=[], frequency=False, do_padding=False):
     """ Add new document to our matrix:
         doc_id: Identifier for the document, eg. file name, url, etc. 
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be typles; string and frequencies
         frequency: If true, term occurences is incremented by one.
                     Else, occurences is only 0 or 1 (a la Bernoulli)
         do_padding: Boolean. Check do_padding() for more info.
     """
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if t in self.whitelist]
     for term in doc_terms:
         if type(term) == tuple:
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx, 1)
             else:
                 my_doc_terms.insert_after_padding(term_idx, 1)
     # In the rare event when whitelisting causes an empty doc_terms list
     # We add at least one zero in the list of my_doc_terms
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)
     self.docs.append({"id": doc_id, "class": doc_class, "terms": my_doc_terms})
     # Update list of document classes if new class seen.
     # self.classes.unique_append(doc_class)
     # if self.classes.has_key(doc_class):
     # else:
     #    self.classes[doc_class].add(my_doc_terms)
     #    self.classes[doc_class] = my_doc_terms
     if do_padding:
         self.do_padding()
Example No. 20
 def query_to_vector(self, q_terms, frequency=False):
     ''' Converts a query to a list aligned with our self.terms.
         Terms not seen before will be ignored.
         q_terms: list of query terms
         frequency: return a multinomial or multivariate list?
     '''
     my_query_vector = SuperList()
     my_query_vector.expand(new_len=len(self.terms))
     for term in q_terms:
         try:
             term_idx = self.terms.index(term)
         except ValueError:
             # Term not seen before, skip
             continue
         if frequency:
             my_query_vector.increment_after_padding(term_idx, 1)
         else:
             my_query_vector.insert_after_padding(term_idx, 1)
     return my_query_vector
Example No. 21
class DistributionAnalyzer:

    def __init__(self):
        self.list = SuperList()

    def add(self, value):
        self.list.append(value)

    def get_size(self):
        return len(self.list)

    def print_percentage(self, percentage):
        value = self.list.percentage(percentage)
        print(str(percentage) + "% under " + str(value))

    def print_default_percentages(self):
        self.print_percentage(50)
        self.print_percentage(75)
        self.print_percentage(90)
        self.print_percentage(99)
        self.print_percentage(100)

    def find_percentage_lower_than(self, value):
        p = self.list.percentage_lower_than(value)
        print("%.2f%% of values are < %s" % (p, value))

    def find_percentage_greater_than(self, value):
        p = self.list.percentage_greater_than(value)
        print("%.2f%% of values are > %s" % (p, value))

    def find_percentage_between(self, low, high):
        low, high = min(low, high), max(low, high)
        a = self.list.percentage_greater_than(high)
        b = self.list.percentage_lower_than(low)
        result = 100.0 - (a + b)
        print("%.2f of values are in [ %s, %s ]" % (result, low, high))
Example No. 22
class Stats:

    def __init__(self, matrix):
        self.mx = matrix
        self.N  = 0
        self.classes = {}
        self.terms = SuperList()       
        for c in self.mx.classes:
            self.classes[c] = {}
            self.classes[c]['terms'] = self.mx.classes[c]
            self.classes[c]['total'] = sum(self.classes[c]['terms'])
            self.terms.add(self.classes[c]['terms'])
            self.N += self.classes[c]['total']
        self.mi_terms = []
        
    def __str__(self):
        s  = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s
        
    def getN(self):
        ''' Get total number of terms, counting their frequencies too.
            Notice: This is not the same as len(vocabulary)
        '''
        return self.N
        
    def get_terms_freq(self, normalized=False):
        ''' Returns a 2D matrix of vocabulary terms and their occurrences.
            If normalized is True, divide by the total number of terms.
        '''
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq] 
            
    def pr_term(self, t):
        ' Get probability of term t '
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.terms[i]) / self.N

    def pr_class(self, c):
        ' Get probability of class c '
        return float(self.classes[c]['total']) / self.N
        
    def pr_joint(self, t, c):
        'Get joint probability between term t and class c'
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.classes[c]['terms'][i]) / self.N
        
    def mi(self):
        for t in self.mx.vocabulary():
            mi = 0
            for c in self.classes:
                try:
                    mi += self.pr_joint(t, c) * math.log10(self.pr_joint(t, c) / (self.pr_term(t) * self.pr_class(c)))
                except (ValueError, ZeroDivisionError):
                    # log(0) or a zero denominator: set mi = 0
                    mi = 0
            self.mi_terms.append(mi)
        print(self.classes)
        print(self.mi_terms)
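Hypothetical usage, assuming mx is a Matrix variant that maintains a classes dict mapping each label to a SuperList of per-term totals (as in Example No. 25):

stats = mx.get_stats()
print(stats)                      # vocabulary size and total term count
print(stats.pr_class('spam'))     # P(c), estimated from term totals
terms, freqs = stats.get_terms_freq(normalized=True)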
Example No. 23
 def __init__(self):
     # List of unique terms (vocabulary)
     self.terms = SuperList()
     # List of document classes and terms summary
     #self.classes = {}
     self.docs = []
Example No. 24
class Matrix:

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []

    def __len__(self):
        'Returns the number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms
            
    def __str__(self):
        s  = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter +  doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            fd.write('%s\n' % line)
        fd.close()
    
    def dump_arff(self, filename, delimiter=',',):
        ''' Dumps matrix to a file
        '''
        fd = open(filename,'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE class NUMERIC\n'
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            line = line + delimiter +  str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()
        
        
    def dump_transposed(self, filename, delimiter='\t', header=True):
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
    
    def dump_transposed_arff(self, filename):
        fd = open(filename,'w')
        # Let's first print file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
        
    def prune(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
        '''
        if not(prune_map) or len(prune_map) != len(self.terms):
            return False
        for i in range(len(prune_map)-1,-1,-1):
            if prune_map[i] == 0:
                #print self.terms[i]
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)
     
    def freq_levels(self, threshold=3):
        ''' Creates a list of 0's and 1's,
            where 1 means term's freq >= threshold
        '''
        freq_map = [0] * len(self.terms)
        for i in range(0,len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    val += 1 
            if val >= threshold:
                freq_map[i] = 1
        return freq_map         
        
    def __contains__(self, term):
        'Checks whether a certain term is loaded'
        return self.terms.__contains__(term)        

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)
        
    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in the list,
            otherwise returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)
    
    def do_padding(self):
        ''' Align the length of all rows in the matrix.
            Each time we see a new term, the list of terms is expanded,
            and the matrix row for that document has the same length too.
            But what about rows added earlier for previous documents?
            This method aligns all previously added rows
            to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values
            do_idf: if False, convert to tf only
        '''
        N = len(self)
        for doc in self.docs:
            for idx in range(len(doc['terms'])):
                # df: number of documents in which this term occurs
                df = self.to_be_deleted__getitem__(self.terms[idx]).nonzero_count()
                tf = log_tf(doc['terms'][idx])
                idf = float(N) / df if do_idf else 1
                doc['terms'][idx] = tf * idf
 
    def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
                frequency=False, do_padding=False):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, URL, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; (string, value) pairs.
            frequency: If True, term occurrences are incremented by one.
                        Else, occurrences are only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx,term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx,1)
                else:
                    my_doc_terms.insert_after_padding(term_idx,1)
        self.docs.append({  'id': doc_id, 
                            'class': doc_class, 
                            'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        #if self.classes.has_key(doc_class):
        #else:
        #    self.classes[doc_class].add(my_doc_terms)
        #    self.classes[doc_class] = my_doc_terms
        if do_padding: 
            self.do_padding()
        

    def query_to_vector(self, q_terms, frequency=False):
        ''' Converts a query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector
        
    def get_stats(self):
        return Stats(self)
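A sketch of the three dump layouts this class supports (file names are illustrative):

mx.dump('matrix.tsv', delimiter='\t', header=True)   # documents as rows
mx.dump_arff('matrix.arff')                          # Weka ARFF, documents as rows
mx.dump_transposed('matrix_t.tsv')                   # terms as rows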
Example No. 25
class Matrix:

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        self.classes = {}
        self.docs = []

    def __len__(self):
        'Returns the number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms
            
    def __str__(self):
        s  = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def __contains__(self, term):
        'Checks whether a certain term is loaded'
        return self.terms.__contains__(term)        

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)
        
    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in the list,
            otherwise returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)
    
    def do_padding(self):
        ''' Align the length of all rows in the matrix.
            Each time we see a new term, the list of terms is expanded,
            and the matrix row for that document has the same length too.
            But what about rows added earlier for previous documents?
            This method aligns all previously added rows
            to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values
            do_idf: if False, convert to tf only
        '''
        N = len(self)
        for doc in self.docs:
            for idx in range(len(doc['terms'])):
                # df: number of documents in which this term occurs
                df = self.to_be_deleted__getitem__(self.terms[idx]).nonzero_count()
                tf = log_tf(doc['terms'][idx])
                idf = float(N) / df if do_idf else 1
                doc['terms'][idx] = tf * idf
 
    def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
                frequency=False, do_padding=False):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, URL, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
            frequency: If True, term occurrences are incremented by one.
                        Else, occurrences are only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            term_idx = self.terms.unique_append(term)
            #my_doc_terms.insert_after_padding(self.terms.index(term))
            if frequency:
                my_doc_terms.increment_after_padding(term_idx,1)
            else:
                my_doc_terms.insert_after_padding(term_idx,1)
        self.docs.append({  'id': doc_id, 
                            'class': doc_class, 
                            'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        if doc_class in self.classes:
            self.classes[doc_class].add(my_doc_terms)
        else:
            self.classes[doc_class] = my_doc_terms
        if do_padding: 
            self.do_padding()
        

    def query_to_vector(self, q_terms, frequency=False):
        ''' Converts a query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector
        
    def get_stats(self):
        return Stats(self)
Example No. 26
 def __init__(self):
     # List of unique terms (vocabulary)
     self.terms = SuperList()
     # List of document classes and terms summary
     self.classes = {}
     self.docs = []
Example No. 27
 def __init__(self):
     self.list = SuperList()
Example No. 28
 def to_be_deleted__getitem__(self, term):
     'Returns occurrences of term in all documents'
     if not term in self:
         return SuperList()
     col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
     return SuperList(col)
Example No. 29
    def add_doc(self, doc_id='', 
                doc_class='', 
                doc_terms=[], 
                frequency=False, 
                do_padding=False, 
                unique_ids=False,
                meta_data={}):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, URL, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; (string, frequency) pairs.
            frequency: If True, term occurrences are incremented by one.
                        Else, occurrences are only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When True, if two documents are added with the same id,
                        then their terms are summed up into only one record.
            meta_data: More fields to add to the document, for your own use.
        '''
        if not doc_terms:
            raise ValueError('doc_terms cannot be empty')
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords if not empty
        if self.blacklist: 
            doc_terms = [t for t in doc_terms if t not in self.blacklist]
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx,term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx,1)
                else:
                    my_doc_terms.insert_after_padding(term_idx,1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # We add at least one zero in the list of my_doc_terms
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)
         
        doc_data = {
            'id': doc_id, 
            'class': doc_class, 
            'terms': my_doc_terms
        }

        for key in meta_data:
            doc_data[key] = meta_data[key]

        if unique_ids:
            self.docs.add_unique(doc_data)              
        else:
            self.docs.append(doc_data)

        if do_padding: 
            self.do_padding()
Example No. 30
class Matrix:

    def __init__(self, whitelist=[]):
        ''' Initialize our matrix.
            whitelist: If not empty, discard any terms not in whitelist,
                       when adding new terms via add_doc()
            terms: We will populate this with our vocabulary of terms
            docs: This is our actual 2D matrix terms/docs.
                  A list of the following dictionary,
                  { 'id': Unique ID to each document, 
                    'class': In case of labeled data, doc class label, 
                    'terms': list of 1's and 0's, i.e. term Frequencies.
                  }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []
        self.whitelist = whitelist

    def __len__(self):
        'Returns the number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=[]):
        '''Returns list of all unique terms if threshold_map not given.
           Otherwise, only return terms above threshold.        
        '''
        if not threshold_map:
            return self.terms
        elif len(threshold_map) == len(self.terms):
            vlist = []
            for i in range(len(self.terms)):
                if threshold_map[i] == 1:
                   vlist.append(self.terms[i])
            return vlist 
        else:
            return []
            
            
    def __str__(self):
        s  = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'term'
        header_line = header_line + delimiter + 'freq'
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        terms = self.vocabulary()
        for i in range(len(terms)):
            line = terms[i]
            line = line + delimiter + str(freqs[i])
            fd.write('%s\n' % line)
        fd.close()
        
    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter +  doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            fd.write('%s\n' % line)
        fd.close()
    
    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to a file
        '''
        fd = open(filename,'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE \'ClassLabel\' ' + clstype + '\n'
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            line = line + delimiter +  str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()
        
        
    def dump_transposed(self, filename, delimiter='\t', header=True):
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
    
    def dump_transposed_arff(self, filename):
        fd = open(filename,'w')
        # Let's first print file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
        
    def prune_old(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
        '''
        if not(prune_map) or len(prune_map) != len(self.terms):
            return False
        for i in range(len(prune_map)-1,-1,-1):
            if prune_map[i] == 0:
                #print self.terms[i]
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)
                    
    def prune(self, prune_map, show_progress=True):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
        '''
        if not(prune_map) or len(prune_map) != len(self.terms):
            return False
        if show_progress:
            print('  Pruning terms list ...')
        new_terms =  SuperList()
        for i in range(len(prune_map)-1,-1,-1):
            if prune_map[i] == 1:
                #print self.terms[i]
                new_terms.append(self.terms[i])
        self.terms = new_terms
        if show_progress:
            print('  Pruning documents ...')
        p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            new_doc_terms =  SuperList()
            for i in range(len(prune_map)-1,-1,-1):
                if prune_map[i] == 1:
                    new_doc_terms.append(doc['terms'][i])
            doc['terms'] = new_doc_terms
            if show_progress:
                p.show(message='  Pruning progress:')
                     
    def freq_levels(self, threshold=3):
        ''' Creates two lists:
            threshold_map is a list of 0's and 1's,
            where 1 means the term's freq >= threshold.
            freq_map is a list of term frequencies.
        '''
        threshold_map = [0] * len(self.terms)
        freq_map = [0] * len(self.terms)
        for i in range(0,len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    #val += 1 
                    val += doc['terms'][i]
            if val >= threshold:
                threshold_map[i] = 1
            freq_map[i] = val
        return (threshold_map, freq_map)         
        
    def __contains__(self, term):
        'Checks whether a certain term is loaded'
        return self.terms.__contains__(term)        

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)
        
    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in the list,
            otherwise returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)
    
    def do_padding(self):
        ''' Align the length of all rows in the matrix.
            Each time we see a new term, the list of terms is expanded,
            and the matrix row for that document has the same length too.
            But what about rows added earlier for previous documents?
            This method aligns all previously added rows
            to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def _log_tf(self, value):
        val = float(value)
        val = 1 + math.log10(val) if val != 0 else float(0)
        return val

    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values
            do_idf: if False, convert to tf only
        '''        
        N = len(self.docs)
        df = SuperList([0] * len(self.terms))
        for doc in self.docs:
            row = SuperList([0] * len(self.terms))
            for idx in range(len(self.terms)):
                if doc['terms'][idx] > 0:
                    row[idx] = 1
            df.add(row)
        
        for doc in self.docs:
            for idx in range(len(self.terms)):
                tf = self._log_tf(doc['terms'][idx])
                idf = math.log10(float(N) / df[idx])
                if do_idf:
                    doc['terms'][idx] = tf * idf
                else:
                    doc['terms'][idx] = tf

 
    def add_doc(self, doc_id = '', doc_class='', doc_terms=[], 
                frequency=False, do_padding=False):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, URL, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; (string, frequency) pairs.
            frequency: If True, term occurrences are incremented by one.
                        Else, occurrences are only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx,term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx,1)
                else:
                    my_doc_terms.insert_after_padding(term_idx,1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # We add at least one zero in the list of my_doc_terms
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)
        self.docs.append({  'id': doc_id, 
                            'class': doc_class, 
                            'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        #if self.classes.has_key(doc_class):
        #else:
        #    self.classes[doc_class].add(my_doc_terms)
        #    self.classes[doc_class] = my_doc_terms
        if do_padding: 
            self.do_padding()
        

    def query_to_vector(self, q_terms, frequency=False):
        ''' Converts a query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector
        
    def get_stats(self):
        return Stats(self)
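Hypothetical usage tying freq_levels(), vocabulary() and dump_tf() together: build a 0/1 map of terms whose total frequency reaches the threshold, then keep the surviving terms and dump the full frequency table:

threshold_map, freq_map = mx.freq_levels(threshold=5)
frequent_terms = mx.vocabulary(threshold_map)   # only terms with freq >= 5
mx.dump_tf('term_freqs.tsv', freq_map)          # full term/frequency table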
Example No. 31
class Matrix:

    def __init__(self, whitelist=[]):
        ''' Initialize our matrix.
            whitelist: If not empty, discard any terms not in whitelist,
                       when adding new terms via add_doc()
            terms: We will populate this with our vocabulary of terms
            docs: This is our actual 2D matrix terms/docs.
                  A list of the following dictionary,
                  { 'id': Unique ID to each document, 
                    'class': In case of labeled data, doc class label, 
                    'terms': list of 1's and 0's, i.e. term Frequencies.
                  }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []
        self.whitelist = whitelist

    def __len__(self):
        'Returns the number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=[]):
        '''Returns list of all unique terms if threshold_map not given.
           Otherwise, only return terms above threshold.        
        '''
        if not threshold_map:
            return self.terms
        elif len(threshold_map) == len(self.terms):
            vlist = []
            for i in range(len(self.terms)):
                if threshold_map[i] == 1:
                   vlist.append(self.terms[i])
            return vlist 
        else:
            return []
            
            
    def __str__(self):
        s  = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'term'
        header_line = header_line + delimiter + 'freq'
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        terms = self.vocabulary()
        for i in range(len(terms)):
            line = terms[i]
            line = line + delimiter + str(freqs[i])
            fd.write('%s\n' % line)
        fd.close()
        
    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter +  doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            fd.write('%s\n' % line)
        fd.close()
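
        # The dumped file looks roughly like this (illustrative terms
        # and ids; <TAB> marks the default delimiter):
        #   id<TAB>class<TAB>apple<TAB>banana
        #   d1<TAB>spam<TAB>1<TAB>0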
    
    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to an ARFF (Weka) file
        '''
        fd = open(filename,'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ARFFID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE \'ClassLabel\' ' + clstype + '\n'
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term) 
            line = line + delimiter +  str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()
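
        # Rough shape of the ARFF produced (illustrative vocabulary):
        #   @RELATION mymatrix
        #   @ATTRIBUTE 'ARFFID' NUMERIC
        #   @ATTRIBUTE 'apple' NUMERIC
        #   @ATTRIBUTE 'ClassLabel' NUMERIC
        #
        #   @DATA
        #   1,1,0
        # Note: doc ids are written into the NUMERIC 'ARFFID' column
        # as-is, so numeric ids are the safe choice here.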
        
        
    def dump_transposed(self, filename, delimiter='\t', header=True):
        ''' Dumps the transposed matrix: one row per term, one column per doc
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
    
    def dump_transposed_arff(self, filename):
        ''' Dumps the transposed matrix to an ARFF (Weka) file
        '''
        fd = open(filename,'w')
        # Let's first print file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx]) 
            fd.write('%s\n' % line)
            idx += 1
        fd.close()
        
    def prune_old(self, prune_map):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
        '''
        if not(prune_map) or len(prune_map) != len(self.terms):
            return False
        for i in range(len(prune_map)-1, -1, -1):
            if prune_map[i] == 0:
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)
                    
    def prune(self, prune_map, show_progress=True):
        ''' Helper method to remove terms (fields) of our matrix
            prune_map is a list of 0's and 1's of same length as self.terms.
            For each term, if 0, then remove it, otherwise keep it.
        '''
        if not(prune_map) or len(prune_map) != len(self.terms):
            return False
        if show_progress:
            print '  Pruning terms list ...'
        # Iterate forward so the pruned vocabulary keeps its original order
        new_terms = SuperList()
        for i in range(len(prune_map)):
            if prune_map[i] == 1:
                new_terms.append(self.terms[i])
        self.terms = new_terms
        if show_progress:
            print '  Pruning documents ...'
        p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            new_doc_terms = SuperList()
            for i in range(len(prune_map)):
                if prune_map[i] == 1:
                    new_doc_terms.append(doc['terms'][i])
            doc['terms'] = new_doc_terms
            if show_progress:
                p.show(message='  Pruning progress:')
                     
    def freq_levels(self, threshold=3):
        ''' Creates two lists:
            threshold_map is a list of 0's and 1's,
            where 1 means the term's frequency >= threshold
            freq_map is a list of term frequencies
        '''
        threshold_map = [0] * len(self.terms)
        freq_map = [0] * len(self.terms)
        for i in range(0, len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    # Summing counts gives collection frequency;
                    # 'val += 1' here would give document frequency instead.
                    val += doc['terms'][i]
            if val >= threshold:
                threshold_map[i] = 1
            freq_map[i] = val
        return (threshold_map, freq_map)
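
    # A typical pruning pass (illustrative threshold): drop terms whose
    # collection frequency is below 5, then shrink every row to match:
    #   threshold_map, freq_map = mx.freq_levels(threshold=5)
    #   mx.prune(threshold_map)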
        
    def __contains__(self, term):
        'Checks whether a certain term is loaded'
        return term in self.terms

    def to_be_deleted__getitem__(self, term):
        'Deprecated: returns occurrences of term across all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)
        
    def __getitem__(self, term):
        ''' If term exists in terms, returns its position in the list,
            otherwise returns -1
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)
    
    def do_padding(self):
        ''' Align the lengths of all rows in the matrix.
            Each time we see a new term, the list of terms is expanded,
            and the row for the document that introduced it grows too.
            But rows added earlier for previous documents stay short,
            so this method aligns all previously added rows
            to match the current length of the terms list.
        '''
        if not self.docs:
            return
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))
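
    # Illustrative: if doc A was added while 2 terms were known, and a
    # later doc B introduced a 3rd term, A's row has length 2 and B's
    # has length 3; do_padding() expands A's row with a trailing zero.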

    def _log_tf(self, value):
        val = float(value)
        val = 1 + math.log10(val) if val != 0 else float(0)
        return val
		
    def tf_idf(self, do_idf=True):
        ''' Converts matrix to tf.idf values
            do_idf: if False, convert to tf only
        '''        
        N = len(self.docs)
        df = SuperList([0] * len(self.terms))
        for doc in self.docs:
            row = SuperList([0] * len(self.terms))
            for idx in range(len(self.terms)):
                if doc['terms'][idx] > 0:
                    row[idx] = 1
            df.add(row)
        
        for doc in self.docs:
            for idx in range(len(self.terms)):
                tf = self._log_tf(doc['terms'][idx])
                idf = math.log10(float(N) / df[idx])
                if do_idf:
                    doc['terms'][idx] = tf * idf
                else:
                    doc['terms'][idx] = tf
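
    # Worked example (illustrative numbers): with N == 10 documents and
    # a term appearing in 2 of them (df == 2), with a raw count of 3 in
    # the current document:
    #   tf  = 1 + log10(3)  ~= 1.477
    #   idf = log10(10 / 2) ~= 0.699
    #   tf * idf            ~= 1.033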

 
    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False,
                unique_ids=False, stopwords=[]):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples of (string, frequency).
            frequency: If True, term occurrence counts are incremented.
                       Else, occurrences are recorded as 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When True, if two documents are added with the same id,
                        their terms are summed up into a single record.
            stopwords: If not empty, ignore these stop words in doc_terms.
        '''
        # Update the list of terms if a new term is seen,
        # and add a document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords if not empty
        if stopwords:
            doc_terms = [t for t in doc_terms if t not in stopwords]
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event that whitelisting empties the doc_terms list,
        # we add at least one zero to my_doc_terms
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        if unique_ids:
            found = False
            for doc in self.docs:
                if doc['id'] == doc_id:
                    doc['terms'].add(my_doc_terms)
                    found = True
                    break
            if not found:
                self.docs.append({'id': doc_id,
                                  'class': doc_class,
                                  'terms': my_doc_terms})
        else:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #if self.classes.has_key(doc_class):
        #    self.classes[doc_class].add(my_doc_terms)
        #else:
        #    self.classes[doc_class] = my_doc_terms
        if do_padding: 
            self.do_padding()
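
        # Illustrative: with unique_ids=True, adding the same id twice
        # sums the term vectors into one row:
        #   mx.add_doc(doc_id='d1', doc_terms=['a'], frequency=True,
        #              do_padding=True, unique_ids=True)
        #   mx.add_doc(doc_id='d1', doc_terms=['a'], frequency=True,
        #              do_padding=True, unique_ids=True)
        #   # the 'd1' row now counts 'a' twice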
        

    def query_to_vector(self, q_terms, frequency=False):
        ''' Converts a query to a list aligned with our self.terms.
            Terms not seen before will be ignored.
            q_terms: list of query terms
            frequency: if True, return term counts (multinomial);
                       otherwise 0/1 values (multivariate)
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip it
                continue
            if frequency:
                my_query_vector.increment_after_padding(term_idx,1)
            else:
                my_query_vector.insert_after_padding(term_idx,1)
        return my_query_vector
        
    def get_stats(self):
        return Stats(self)
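
Putting the pieces of this Matrix class together, a minimal end-to-end
sketch (hypothetical documents; assumes SuperList, Progress, Stats and
the math module are importable, as in the original module):

    mx = Matrix()
    mx.add_doc(doc_id='d1', doc_class='spam',
               doc_terms=['buy', 'now'], frequency=True, do_padding=True)
    mx.add_doc(doc_id='d2', doc_class='ham',
               doc_terms=['hello', 'now'], frequency=True, do_padding=True)
    print mx                        # vocabulary and document counts
    mx.tf_idf()                     # re-weight counts in place
    q = mx.query_to_vector(['buy'])
    mx.dump('matrix.txt')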
Example No. 32
0
    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False,
                unique_ids=False, stopwords=[]):
        ''' Add a new document to our matrix:
            doc_id: Identifier for the document, e.g. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples of (string, frequency).
            frequency: If True, term occurrence counts are incremented.
                       Else, occurrences are recorded as 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When True, if two documents are added with the same id,
                        their terms are summed up into a single record.
            stopwords: If not empty, ignore these stop words in doc_terms.
        '''
        # Update the list of terms if a new term is seen,
        # and add a document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in the whitelist, if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords, if not empty
        if stopwords:
            doc_terms = [t for t in doc_terms if t not in stopwords]
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event that whitelisting empties the doc_terms list,
        # we add at least one zero to my_doc_terms
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        if unique_ids:
            found = False
            for doc in self.docs:
                if doc['id'] == doc_id:
                    doc['terms'].add(my_doc_terms)
                    found = True
                    break
            if not found:
                self.docs.append({'id': doc_id,
                                  'class': doc_class,
                                  'terms': my_doc_terms})
        else:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #if self.classes.has_key(doc_class):
        #    self.classes[doc_class].add(my_doc_terms)
        #else:
        #    self.classes[doc_class] = my_doc_terms
        if do_padding:
            self.do_padding()
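
A minimal sketch of the stopwords filter in this variant (hypothetical
terms; assumes a Matrix-like object exposing this add_doc):

    mx.add_doc(doc_id='d3', doc_terms=['the', 'cat', 'sat'],
               frequency=True, do_padding=True, stopwords=['the'])
    # only 'cat' and 'sat' reach the vocabulary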