def tf_idf(self, do_idf=True):
    '''
    Convert the raw term counts of every document, in place, to tf or
    tf.idf weights.

    do_idf: if True (default), each weight becomes
            _log_tf(count) * log10(N / df), where N is the number of
            documents and df the number of documents whose count for
            that term is > 0.
            If False, each weight becomes _log_tf(count) only.

    A term that occurs in no document (df == 0) has no defined idf; it
    is given an idf of 0 instead of raising ZeroDivisionError.

    Returns None; doc['terms'] of each entry in self.docs is overwritten.
    '''
    num_terms = len(self.terms)

    idf = None
    if do_idf:
        # Document frequency: in how many documents does each term occur?
        doc_freq = [0] * num_terms
        for doc in self.docs:
            counts = doc['terms']
            for idx in range(num_terms):
                if counts[idx] > 0:
                    doc_freq[idx] += 1

        # The idf depends only on the term, so compute it once per term
        # rather than once per (document, term) pair.
        num_docs = float(len(self.docs))
        idf = [math.log10(num_docs / freq) if freq else 0.0
               for freq in doc_freq]

    for doc in self.docs:
        weights = doc['terms']
        for idx in range(num_terms):
            tf = self._log_tf(weights[idx])
            weights[idx] = tf * idf[idx] if do_idf else tf
class Stats:
    '''
    Term/class frequency statistics computed from a matrix.

    For every class in matrix.classes the per-term counts and their sum
    are stored; self.terms holds the per-term counts accumulated over all
    classes and self.N the grand total of all counts.
    '''

    def __init__(self, matrix):
        self.mx = matrix
        self.N = 0
        self.classes = {}
        self.terms = SuperList()
        for c in self.mx.classes:
            class_terms = self.mx.classes[c]
            class_total = sum(class_terms)
            self.classes[c] = {'terms': class_terms, 'total': class_total}
            self.terms.add(class_terms)
            self.N += class_total
        # Filled by mi(): one score per term of self.mx.vocabulary().
        self.mi_terms = []

    def __str__(self):
        s = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s

    def getN(self):
        '''
        Get total number of terms, counting their frequencies too.
        Notice: This is not the same as len(vocabulary)
        '''
        return self.N

    def get_terms_freq(self, normalized=False):
        '''
        Returns a 2-item list [terms, frequencies] of the vocabulary
        terms and their occurrences.
        If normalized is True, the frequencies are divided by the total
        number of terms (self.N).
        '''
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq]

    def pr_term(self, t):
        ' Get probability of term t (0 if the matrix does not know t) '
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.terms[i]) / self.N

    def pr_class(self, c):
        ' Get probability of class c '
        return float(self.classes[c]['total']) / self.N

    def pr_joint(self, t, c):
        ' Get joint probability between term t and class c '
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.classes[c]['terms'][i]) / self.N

    def mi(self):
        '''
        Compute, for every term of the vocabulary, the sum over classes of
            P(t, c) * log10( P(t, c) / (P(t) * P(c)) )
        and store the scores in self.mi_terms, in vocabulary order.

        A (term, class) pair with zero joint probability contributes 0
        to the sum (the limit of p * log(p) as p -> 0); it does not
        reset what other classes already contributed.

        self.mi_terms is rebuilt on every call, so calling mi() twice
        does not duplicate entries.

        Returns self.mi_terms.
        '''
        scores = []
        for t in self.mx.vocabulary():
            score = 0
            for c in self.classes:
                try:
                    joint = self.pr_joint(t, c)
                    if joint <= 0:
                        # log(0) is undefined; the contribution is 0.
                        continue
                    score += joint * math.log10(
                        joint / (self.pr_term(t) * self.pr_class(c)))
                except (ZeroDivisionError, ValueError):
                    # Empty corpus (N == 0) or inconsistent counts:
                    # treat this pair as contributing nothing.
                    continue
            scores.append(score)
        self.mi_terms = scores
        # Diagnostic output kept from the original implementation;
        # the parenthesised form prints identically on Python 2 and 3.
        print(self.classes)
        print(self.mi_terms)
        return self.mi_terms