def test_percentage_greater_than(self):
    sl = SuperList()
    for i in range(100):
        sl.append(100 - i)
    self.assertEqual(sl.percentage_greater_than(-1), 100)
    self.assertEqual(sl.percentage_greater_than(11), 90)
    self.assertEqual(sl.percentage_greater_than(101), 0)
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False,
            unique_ids=False, meta_data={}):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When True, if two documents are added with the same id,
                their terms are summed up into only one record.
    meta_data: More fields to add to the document, for your own use.
    '''
    if not doc_terms:
        raise ValueError('doc_terms cannot be empty')
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in the blacklist, if it is not empty
    if self.blacklist:
        doc_terms = [t for t in doc_terms if t not in self.blacklist]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # we add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    doc_data = {'id': doc_id, 'class': doc_class, 'terms': my_doc_terms}
    for key in meta_data:
        doc_data[key] = meta_data[key]
    if unique_ids:
        self.docs.add_unique(doc_data)
    else:
        self.docs.append(doc_data)
    if do_padding:
        self.do_padding()
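# Usage sketch (illustrative, not part of the original module): assumes a
# Matrix whose __init__ sets the whitelist/blacklist attributes referenced
# above; the ids, classes, and terms below are made up.
mx = Matrix()
mx.add_doc(doc_id='doc1.txt', doc_class='spam',
           doc_terms=['free', 'offer', 'free'],
           frequency=True,            # repeated terms increment the count
           do_padding=True,           # realign earlier rows to the vocabulary
           meta_data={'lang': 'en'})  # stored as an extra field on the row
# Tuple terms carry their own counts: ('free', 2) adds 2 in one step.
mx.add_doc(doc_id='doc2.txt', doc_class='ham',
           doc_terms=[('meeting', 1), ('free', 2)],
           do_padding=True)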
def test_percentage_lower_than(self):
    sl = SuperList()
    for i in range(100):
        sl.append(100 - i)
    self.assertEqual(sl.percentage_lower_than(-1), 0)
    self.assertEqual(sl.percentage_lower_than(11), 10)
    self.assertEqual(sl.percentage_lower_than(33.3), 33)
    self.assertEqual(sl.percentage_lower_than(101), 100)
    sl.append(4)
    self.assertTrue(sl.percentage_lower_than(5) > 4.95 and
                    sl.percentage_lower_than(5) < 4.96)
def __init__(self, matrix):
    self.mx = matrix
    self.N = 0
    self.classes = {}
    self.terms = SuperList()
    for c in self.mx.classes:
        self.classes[c] = {}
        self.classes[c]['terms'] = self.mx.classes[c]
        self.classes[c]['total'] = sum(self.classes[c]['terms'])
        self.terms.add(self.classes[c]['terms'])
        self.N += self.classes[c]['total']
    self.mi_terms = []
def prune(self, prune_map, show_progress=True):
    '''
    Helper method to remove terms (fields) from our matrix.
    prune_map is a list of 0's and 1's of the same length as self.terms.
    For each term, if 0, remove it; otherwise keep it.
    '''
    if not prune_map or len(prune_map) != len(self.terms):
        return False
    if show_progress:
        print ' Pruning terms list ...'
    # Iterate forward so the kept terms preserve their original order.
    new_terms = SuperList()
    for i in range(len(prune_map)):
        if prune_map[i] == 1:
            #print self.terms[i]
            new_terms.append(self.terms[i])
    self.terms = new_terms
    if show_progress:
        print ' Pruning documents ...'
        p = Progress(n=len(self), percent=10)
    for doc in self.docs:
        new_doc_terms = SuperList()
        for i in range(len(prune_map)):
            if prune_map[i] == 1:
                new_doc_terms.append(doc['terms'][i])
        doc['terms'] = new_doc_terms
        if show_progress:
            p.show(message=' Pruning progress:')
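# Usage sketch (illustrative): prune_map would typically come from a
# frequency filter such as freq_levels() (not shown in this snippet);
# 'mx' is a hypothetical Matrix instance.
# prune_map = [1, 0, 1]                    # one flag per vocabulary term
# mx.prune(prune_map, show_progress=False) # keeps terms 0 and 2 only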
def load(self, filename, delimiter='\t', header=True):
    ''' Loads matrix from a CSV/TSV file '''
    with open(filename, 'r') as fd:
        header_line = fd.readline()
        header_data = header_line.strip().split(delimiter)
        # First 2 columns are id and class
        self.terms = SuperList(header_data[2:])
        for line in fd.readlines():
            # Split each data line the same way as the header;
            # indexing the raw string would only pick single characters.
            fields = line.strip().split(delimiter)
            doc_data = {
                'id': fields[0],
                'class': fields[1],
                'terms': SuperList(fields[2:])
            }
            self.docs.append(doc_data)
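# Input sketch for load() (illustrative): a delimited file whose header row
# is 'id', 'class', then one column per term, e.g. with delimiter='\t':
#
#   id    class   free   offer   meeting
#   d1    spam    2      1       0
#   d2    ham     1      0       1
#
# mx.load('matrix.tsv')   # 'matrix.tsv' is a hypothetical path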
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, value).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update list of document classes if new class seen.
    #self.classes.unique_append(doc_class)
    #if self.classes.has_key(doc_class):
    #    self.classes[doc_class].add(my_doc_terms)
    #else:
    #    self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def __init__(self, whitelist=[]):
    '''
    Initialize our matrix.
    whitelist: If not empty, discard any terms not in the whitelist
               when adding new terms via add_doc().
    terms: We will populate this with our vocabulary of terms.
    docs: This is our actual 2D terms/docs matrix; a list of
          dictionaries of the form:
          {
            'id': Unique ID for each document,
            'class': In case of labeled data, the doc class label,
            'terms': list of 1's and 0's, i.e. term frequencies.
          }
    '''
    # List of unique terms (vocabulary)
    self.terms = SuperList()
    # List of document classes and terms summary
    #self.classes = {}
    self.docs = []
    self.whitelist = whitelist
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # we add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update list of document classes if new class seen.
    #self.classes.unique_append(doc_class)
    #if self.classes.has_key(doc_class):
    #    self.classes[doc_class].add(my_doc_terms)
    #else:
    #    self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def tf_idf(self, do_idf=True):
    '''
    Converts matrix to tf.idf values
    do_idf: if False, convert to tf only
    '''
    N = len(self.docs)
    # Document frequency: in how many documents each term occurs.
    df = SuperList([0] * len(self.terms))
    for doc in self.docs:
        row = SuperList([0] * len(self.terms))
        for idx in range(len(self.terms)):
            if doc['terms'][idx] > 0:
                row[idx] = 1
        df.add(row)
    for doc in self.docs:
        for idx in range(len(self.terms)):
            tf = self._log_tf(doc['terms'][idx])
            idf = math.log10(float(N) / df[idx])
            if do_idf:
                doc['terms'][idx] = tf * idf
            else:
                doc['terms'][idx] = tf
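# Worked example (illustrative numbers): with N = 10 documents, a term that
# occurs 3 times in a document and appears in df = 2 documents gets
#   tf     = 1 + log10(3)   ~= 1.477
#   idf    = log10(10 / 2)  ~= 0.699
#   weight = tf * idf       ~= 1.033
# A term present in every document gets idf = log10(N/N) = 0.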
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    for term in doc_terms:
        term_idx = self.terms.unique_append(term)
        #my_doc_terms.insert_after_padding(self.terms.index(term))
        if frequency:
            my_doc_terms.increment_after_padding(term_idx, 1)
        else:
            my_doc_terms.insert_after_padding(term_idx, 1)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update list of document classes if new class seen.
    #self.classes.unique_append(doc_class)
    if self.classes.has_key(doc_class):
        self.classes[doc_class].add(my_doc_terms)
    else:
        self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def test_percentage(self):
    sl = SuperList()
    for i in range(100):
        sl.append(100 - i)
    self.assertEqual(sl.percentage(0), 1)
    self.assertEqual(sl.percentage(90), 91)
    self.assertEqual(sl.percentage(100), 100)
    self.assertEqual(sl.percentage(101), 100)
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False, stopwords=[]):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Unused here.
    stopwords: If not empty, ignore those stop words in doc_terms.
    '''
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in stopwords, if not empty
    if stopwords:
        doc_terms = [t for t in doc_terms if t not in stopwords]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    #self.docs.append({'id': doc_id,
    #                  'class': doc_class,
    #                  'terms': my_doc_terms})
    # Merge into an existing row of the same class, if any.
    found = 0
    for doc in self.docs:
        if doc['class'] == doc_class:
            doc['terms'].add(my_doc_terms)
            found = 1
    if not found:
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def query_to_vector(self, q_terms, frequency=False):
    '''
    Converts a query to a list aligned with our self.terms.
    Terms not seen before will be ignored.
    q_terms: list of query terms
    frequency: return a multinomial or multivariate list?
    '''
    my_query_vector = SuperList()
    my_query_vector.expand(new_len=len(self.terms))
    for term in q_terms:
        try:
            term_idx = self.terms.index(term)
        except ValueError:
            # Term not seen before, skip
            continue
        #print term, self.terms.index(term)
        if frequency:
            my_query_vector.increment_after_padding(term_idx, 1)
        else:
            my_query_vector.insert_after_padding(term_idx, 1)
    return my_query_vector
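# Usage sketch (illustrative): the returned vector is always aligned with
# the indexed vocabulary; unseen query terms simply contribute nothing.
# q = mx.query_to_vector(['free', 'never-indexed-term'], frequency=False)
# assert len(q) == len(mx.vocabulary())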
def add_doc(self, doc_id="", doc_class="", doc_terms=[], frequency=False, do_padding=False): """ Add new document to our matrix: doc_id: Identifier for the document, eg. file name, url, etc. doc_class: You might need this in classification. doc_terms: List of terms you got after tokenizing the document. Terms can be typles; string and frequencies frequency: If true, term occurences is incremented by one. Else, occurences is only 0 or 1 (a la Bernoulli) do_padding: Boolean. Check do_padding() for more info. """ # Update list of terms if new term seen. # And document (row) with its associated data. my_doc_terms = SuperList() # Discard anything not in whitelist if it is not empty if self.whitelist: doc_terms = [t for t in doc_terms if t in self.whitelist] for term in doc_terms: if type(term) == tuple: term_idx = self.terms.unique_append(term[0]) my_doc_terms.increment_after_padding(term_idx, term[1]) else: term_idx = self.terms.unique_append(term) if frequency: my_doc_terms.increment_after_padding(term_idx, 1) else: my_doc_terms.insert_after_padding(term_idx, 1) # In the rare event when whitelisting causes an empty doc_terms list # We add at least one zero in the list of my_doc_terms if not my_doc_terms: zeros = [float(0)] * len(self.vocabulary()) my_doc_terms = SuperList(zeros) self.docs.append({"id": doc_id, "class": doc_class, "terms": my_doc_terms}) # Update list of document classes if new class seen. # self.classes.unique_append(doc_class) # if self.classes.has_key(doc_class): # else: # self.classes[doc_class].add(my_doc_terms) # self.classes[doc_class] = my_doc_terms if do_padding: self.do_padding()
class DistributionAnalyzer:

    def __init__(self):
        self.list = SuperList()

    def add(self, value):
        self.list.append(value)

    def get_size(self):
        return len(self.list)

    def print_percentage(self, percentage):
        value = self.list.percentage(percentage)
        print(str(percentage) + "% under " + str(value))

    def print_default_percentages(self):
        self.print_percentage(50)
        self.print_percentage(75)
        self.print_percentage(90)
        self.print_percentage(99)
        self.print_percentage(100)

    def find_percentage_lower_than(self, value):
        p = self.list.percentage_lower_than(value)
        print("%.2f%% of values are < %s" % (p, value))

    def find_percentage_greater_than(self, value):
        p = self.list.percentage_greater_than(value)
        print("%.2f%% of values are > %s" % (p, value))

    def find_percentage_between(self, low, high):
        low, high = min(low, high), max(low, high)
        a = self.list.percentage_greater_than(high)
        b = self.list.percentage_lower_than(low)
        result = 100.0 - (a + b)
        print("%.2f%% of values are in [ %s, %s ]" % (result, low, high))
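# Usage sketch (illustrative data; assumes SuperList implements the
# percentage_* methods exercised by the tests above):
analyzer = DistributionAnalyzer()
for latency_ms in [12, 15, 18, 22, 35, 41, 80, 120, 300, 950]:
    analyzer.add(latency_ms)
analyzer.print_default_percentages()       # values under the 50/75/90/99/100th percentiles
analyzer.find_percentage_lower_than(50)    # share of values below 50
analyzer.find_percentage_between(20, 100)  # share of values inside [20, 100]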
class Stats:

    def __init__(self, matrix):
        self.mx = matrix
        self.N = 0
        self.classes = {}
        self.terms = SuperList()
        for c in self.mx.classes:
            self.classes[c] = {}
            self.classes[c]['terms'] = self.mx.classes[c]
            self.classes[c]['total'] = sum(self.classes[c]['terms'])
            self.terms.add(self.classes[c]['terms'])
            self.N += self.classes[c]['total']
        self.mi_terms = []

    def __str__(self):
        s = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s

    def getN(self):
        '''
        Get total number of terms, counting their frequencies too.
        Notice: this is not the same as len(vocabulary).
        '''
        return self.N

    def get_terms_freq(self, normalized=False):
        '''
        Returns a 2D matrix of vocabulary terms and their occurrences.
        If normalized is True, divide by the total number of terms.
        '''
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq]

    def pr_term(self, t):
        ' Get probability of term t '
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.terms[i]) / self.N

    def pr_class(self, c):
        ' Get probability of class c '
        return float(self.classes[c]['total']) / self.N

    def pr_joint(self, t, c):
        ' Get joint probability between term t and class c '
        i = self.mx[t]
        if i == -1:
            return 0
        return float(self.classes[c]['terms'][i]) / self.N

    def mi(self):
        for t in self.mx.vocabulary():
            mi = 0
            for c in self.classes:
                try:
                    mi += self.pr_joint(t, c) * math.log10(
                        self.pr_joint(t, c) /
                        (self.pr_term(t) * self.pr_class(c)))
                except:
                    # Oh, log(0), let's set mi = 0
                    mi = 0
            self.mi_terms.append(mi)
        print self.classes
        print self.mi_terms
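# Note on Stats.mi() above (for reference): per term t it accumulates the
# mutual information
#   MI(t) = sum_c P(t, c) * log10( P(t, c) / (P(t) * P(c)) )
# with P(t, c) = pr_joint(t, c), P(t) = pr_term(t), and P(c) = pr_class(c);
# the bare except resets MI to 0 whenever a zero probability makes the
# logarithm (or the division) undefined.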
def __init__(self):
    # List of unique terms (vocabulary)
    self.terms = SuperList()
    # List of document classes and terms summary
    #self.classes = {}
    self.docs = []
class Matrix:

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms

    def __str__(self):
        s = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file '''
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter + doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            fd.write('%s\n' % line)
        fd.close()

    def dump_arff(self, filename, delimiter=','):
        ''' Dumps matrix to an ARFF file '''
        fd = open(filename, 'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE class NUMERIC\n'
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            line = line + delimiter + str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()

    def dump_transposed(self, filename, delimiter='\t', header=True):
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def dump_transposed_arff(self, filename):
        fd = open(filename, 'w')
        # Let's first print the file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def prune(self, prune_map):
        '''
        Helper method to remove terms (fields) from our matrix.
        prune_map is a list of 0's and 1's of the same length as self.terms.
        For each term, if 0, remove it; otherwise keep it.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Pop in reverse so earlier indices stay valid while removing.
        for i in range(len(prune_map)-1, -1, -1):
            if prune_map[i] == 0:
                #print self.terms[i]
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def freq_levels(self, threshold=3):
        '''
        Creates a list of 0's and 1's,
        where 1 means the term's freq >= threshold.
        '''
        freq_map = [0] * len(self.terms)
        for i in range(0, len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    val += 1
            if val >= threshold:
                freq_map[i] = 1
        return freq_map

    def __contains__(self, term):
        'Checks if a certain term is loaded'
        return self.terms.__contains__(term)

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)

    def __getitem__(self, term):
        '''
        If term exists in terms, returns its position in the list;
        otherwise, returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)

    def do_padding(self):
        '''
        Align the length of all rows in the matrix.
        Each time we see a new term, the list of terms is expanded,
        and the matrix row for that document has the same length too.
        But what about rows added earlier for previous documents?
        So, this method aligns all previously added rows
        to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def tf_idf(self, do_idf=True):
        '''
        Converts matrix to tf.idf values
        do_idf: if False, convert to tf only
        '''
        N = len(self)
        for doc in self.docs:
            for idx in range(len(doc['terms'])):
                # Document frequency: in how many documents the term occurs.
                # (__getitem__ returns an index, so build the column here.)
                col = SuperList([d['terms'][idx] for d in self.docs])
                df = col.nonzero_count()
                tf = log_tf(doc['terms'][idx])
                idf = float(N) / df if do_idf else 1
                doc['terms'][idx] = tf * idf

    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False):
        '''
        Add a new document to our matrix:
        doc_id: Identifier for the document, e.g. file name, URL, etc.
        doc_class: You might need this in classification.
        doc_terms: List of terms you got after tokenizing the document.
                   Terms can be tuples of (string, value).
        frequency: If True, term occurrences are incremented by one.
                   Else, occurrences are only 0 or 1 (a la Bernoulli).
        do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update the list of terms if a new term is seen,
        # and build the document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        #if self.classes.has_key(doc_class):
        #    self.classes[doc_class].add(my_doc_terms)
        #else:
        #    self.classes[doc_class] = my_doc_terms
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False):
        '''
        Converts a query to a list aligned with our self.terms.
        Terms not seen before will be ignored.
        q_terms: list of query terms
        frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            #print term, self.terms.index(term)
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        return Stats(self)
class Matrix:

    def __init__(self):
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        self.classes = {}
        self.docs = []

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self):
        'Returns list of unique terms'
        return self.terms

    def __str__(self):
        s = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def __contains__(self, term):
        'Checks if a certain term is loaded'
        return self.terms.__contains__(term)

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)

    def __getitem__(self, term):
        '''
        If term exists in terms, returns its position in the list;
        otherwise, returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)

    def do_padding(self):
        '''
        Align the length of all rows in the matrix.
        Each time we see a new term, the list of terms is expanded,
        and the matrix row for that document has the same length too.
        But what about rows added earlier for previous documents?
        So, this method aligns all previously added rows
        to match the current length of the terms list.
        '''
        # Compare the term rows themselves, not the doc dictionaries.
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def tf_idf(self, do_idf=True):
        '''
        Converts matrix to tf.idf values
        do_idf: if False, convert to tf only
        '''
        N = len(self)
        for doc in self.docs:
            for idx in range(len(doc['terms'])):
                # Document frequency: in how many documents the term occurs.
                # (__getitem__ returns an index, so build the column here.)
                col = SuperList([d['terms'][idx] for d in self.docs])
                df = col.nonzero_count()
                tf = log_tf(doc['terms'][idx])
                idf = float(N) / df if do_idf else 1
                doc['terms'][idx] = tf * idf

    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False):
        '''
        Add a new document to our matrix:
        doc_id: Identifier for the document, e.g. file name, URL, etc.
        doc_class: You might need this in classification.
        doc_terms: List of terms you got after tokenizing the document.
        frequency: If True, term occurrences are incremented by one.
                   Else, occurrences are only 0 or 1 (a la Bernoulli).
        do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update the list of terms if a new term is seen,
        # and build the document (row) with its associated data.
        my_doc_terms = SuperList()
        for term in doc_terms:
            term_idx = self.terms.unique_append(term)
            #my_doc_terms.insert_after_padding(self.terms.index(term))
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        if self.classes.has_key(doc_class):
            self.classes[doc_class].add(my_doc_terms)
        else:
            self.classes[doc_class] = my_doc_terms
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False):
        '''
        Converts a query to a list aligned with our self.terms.
        Terms not seen before will be ignored.
        q_terms: list of query terms
        frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            #print term, self.terms.index(term)
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        return Stats(self)
def __init__(self):
    # List of unique terms (vocabulary)
    self.terms = SuperList()
    # List of document classes and terms summary
    self.classes = {}
    self.docs = []
def __init__(self):
    self.list = SuperList()
def to_be_deleted__getitem__(self, term):
    'Returns occurrences of term in all documents'
    if not term in self:
        return SuperList()
    col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
    return SuperList(col)
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False,
            unique_ids=False, meta_data={}):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When True, if two documents are added with the same id,
                their terms are summed up into only one record.
    meta_data: More fields to add to the document, for your own use.
    '''
    if not doc_terms:
        raise ValueError('doc_terms cannot be empty')
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in the blacklist, if it is not empty
    if self.blacklist:
        doc_terms = [t for t in doc_terms if t not in self.blacklist]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # we add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    doc_data = {
        'id': doc_id,
        'class': doc_class,
        'terms': my_doc_terms
    }
    for key in meta_data:
        doc_data[key] = meta_data[key]
    if unique_ids:
        self.docs.add_unique(doc_data)
    else:
        self.docs.append(doc_data)
    if do_padding:
        self.do_padding()
class Matrix:

    def __init__(self, whitelist=[]):
        '''
        Initialize our matrix.
        whitelist: If not empty, discard any terms not in the whitelist
                   when adding new terms via add_doc().
        terms: We will populate this with our vocabulary of terms.
        docs: This is our actual 2D terms/docs matrix; a list of
              dictionaries of the form:
              {
                'id': Unique ID for each document,
                'class': In case of labeled data, the doc class label,
                'terms': list of 1's and 0's, i.e. term frequencies.
              }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []
        self.whitelist = whitelist

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=[]):
        '''
        Returns the list of all unique terms if threshold_map is not given.
        Otherwise, only returns terms above the threshold.
        '''
        if not threshold_map:
            return self.terms
        elif len(threshold_map) == len(self.terms):
            vlist = []
            for i in range(len(self.terms)):
                if threshold_map[i] == 1:
                    vlist.append(self.terms[i])
            return vlist
        else:
            return []

    def __str__(self):
        s = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies '''
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'term'
        header_line = header_line + delimiter + 'freq'
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        terms = self.vocabulary()
        for i in range(len(terms)):
            line = terms[i]
            line = line + delimiter + str(freqs[i])
            fd.write('%s\n' % line)
        fd.close()

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file '''
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter + doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            fd.write('%s\n' % line)
        fd.close()

    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to an ARFF file '''
        fd = open(filename, 'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE \'ClassLabel\' ' + clstype + '\n'
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            line = line + delimiter + str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()

    def dump_transposed(self, filename, delimiter='\t', header=True):
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def dump_transposed_arff(self, filename):
        fd = open(filename, 'w')
        # Let's first print the file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def prune_old(self, prune_map):
        '''
        Helper method to remove terms (fields) from our matrix.
        prune_map is a list of 0's and 1's of the same length as self.terms.
        For each term, if 0, remove it; otherwise keep it.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Pop in reverse so earlier indices stay valid while removing.
        for i in range(len(prune_map)-1, -1, -1):
            if prune_map[i] == 0:
                #print self.terms[i]
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def prune(self, prune_map, show_progress=True):
        '''
        Helper method to remove terms (fields) from our matrix.
        prune_map is a list of 0's and 1's of the same length as self.terms.
        For each term, if 0, remove it; otherwise keep it.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        if show_progress:
            print ' Pruning terms list ...'
        # Iterate forward so the kept terms preserve their original order.
        new_terms = SuperList()
        for i in range(len(prune_map)):
            if prune_map[i] == 1:
                #print self.terms[i]
                new_terms.append(self.terms[i])
        self.terms = new_terms
        if show_progress:
            print ' Pruning documents ...'
            p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            new_doc_terms = SuperList()
            for i in range(len(prune_map)):
                if prune_map[i] == 1:
                    new_doc_terms.append(doc['terms'][i])
            doc['terms'] = new_doc_terms
            if show_progress:
                p.show(message=' Pruning progress:')

    def freq_levels(self, threshold=3):
        '''
        Creates two lists:
        threshold_map is a list of 0's and 1's,
        where 1 means the term's freq >= threshold.
        freq_map is a list of term frequencies.
        '''
        threshold_map = [0] * len(self.terms)
        freq_map = [0] * len(self.terms)
        for i in range(0, len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    #val += 1
                    val += doc['terms'][i]
            if val >= threshold:
                threshold_map[i] = 1
            freq_map[i] = val
        return (threshold_map, freq_map)

    def __contains__(self, term):
        'Checks if a certain term is loaded'
        return self.terms.__contains__(term)

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)

    def __getitem__(self, term):
        '''
        If term exists in terms, returns its position in the list;
        otherwise, returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)

    def do_padding(self):
        '''
        Align the length of all rows in the matrix.
        Each time we see a new term, the list of terms is expanded,
        and the matrix row for that document has the same length too.
        But what about rows added earlier for previous documents?
        So, this method aligns all previously added rows
        to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def _log_tf(self, value):
        val = float(value)
        val = 1 + math.log10(val) if val != 0 else float(0)
        return val

    def tf_idf(self, do_idf=True):
        '''
        Converts matrix to tf.idf values
        do_idf: if False, convert to tf only
        '''
        N = len(self.docs)
        # Document frequency: in how many documents each term occurs.
        df = SuperList([0] * len(self.terms))
        for doc in self.docs:
            row = SuperList([0] * len(self.terms))
            for idx in range(len(self.terms)):
                if doc['terms'][idx] > 0:
                    row[idx] = 1
            df.add(row)
        for doc in self.docs:
            for idx in range(len(self.terms)):
                tf = self._log_tf(doc['terms'][idx])
                idf = math.log10(float(N) / df[idx])
                if do_idf:
                    doc['terms'][idx] = tf * idf
                else:
                    doc['terms'][idx] = tf

    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False):
        '''
        Add a new document to our matrix:
        doc_id: Identifier for the document, e.g. file name, URL, etc.
        doc_class: You might need this in classification.
        doc_terms: List of terms you got after tokenizing the document.
                   Terms can be tuples of (string, frequency).
        frequency: If True, term occurrences are incremented by one.
                   Else, occurrences are only 0 or 1 (a la Bernoulli).
        do_padding: Boolean. Check do_padding() for more info.
        '''
        # Update the list of terms if a new term is seen,
        # and build the document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in the whitelist, if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event that whitelisting leaves doc_terms empty,
        # we add at least one zero to my_doc_terms.
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        #if self.classes.has_key(doc_class):
        #    self.classes[doc_class].add(my_doc_terms)
        #else:
        #    self.classes[doc_class] = my_doc_terms
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False):
        '''
        Converts a query to a list aligned with our self.terms.
        Terms not seen before will be ignored.
        q_terms: list of query terms
        frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            #print term, self.terms.index(term)
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        return Stats(self)
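# Usage sketch (illustrative): with a whitelist, terms outside it never
# enter the matrix; the ids, classes, and file names below are made up.
mx = Matrix(whitelist=['free', 'offer', 'meeting'])
mx.add_doc(doc_id='d1', doc_class='spam',
           doc_terms=['free', 'lunch', 'offer'],  # 'lunch' is discarded
           frequency=True, do_padding=True)
mx.add_doc(doc_id='d2', doc_class='ham',
           doc_terms=['meeting', 'free'],
           frequency=True, do_padding=True)
mx.tf_idf()          # convert raw counts to log tf * idf in place
mx.dump('mx.tsv')    # hypothetical output path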
class Matrix:

    def __init__(self, whitelist=[]):
        '''
        Initialize our matrix.
        whitelist: If not empty, discard any terms not in the whitelist
                   when adding new terms via add_doc().
        terms: We will populate this with our vocabulary of terms.
        docs: This is our actual 2D terms/docs matrix; a list of
              dictionaries of the form:
              {
                'id': Unique ID for each document,
                'class': In case of labeled data, the doc class label,
                'terms': list of 1's and 0's, i.e. term frequencies.
              }
        '''
        # List of unique terms (vocabulary)
        self.terms = SuperList()
        # List of document classes and terms summary
        #self.classes = {}
        self.docs = []
        self.whitelist = whitelist

    def __len__(self):
        'Returns number of loaded documents'
        return len(self.docs)

    def vocabulary(self, threshold_map=[]):
        '''
        Returns the list of all unique terms if threshold_map is not given.
        Otherwise, only returns terms above the threshold.
        '''
        if not threshold_map:
            return self.terms
        elif len(threshold_map) == len(self.terms):
            vlist = []
            for i in range(len(self.terms)):
                if threshold_map[i] == 1:
                    vlist.append(self.terms[i])
            return vlist
        else:
            return []

    def __str__(self):
        s = 'Matrix:'
        s += '\n * Vocabulary read: %d' % len(self.terms)
        s += '\n * Documents read: %d' % len(self.docs)
        return s

    def dump_tf(self, filename, freqs, delimiter='\t', header=True):
        ''' Dumps term frequencies '''
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'term'
        header_line = header_line + delimiter + 'freq'
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        terms = self.vocabulary()
        for i in range(len(terms)):
            line = terms[i]
            line = line + delimiter + str(freqs[i])
            fd.write('%s\n' % line)
        fd.close()

    def dump(self, filename, delimiter='\t', header=True):
        ''' Dumps matrix to a file '''
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'id'
        header_line = header_line + delimiter + 'class'
        for term in self.terms:
            header_line = header_line + delimiter + term
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        for doc in self.docs:
            line = doc['id']
            line = line + delimiter + doc['class']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            fd.write('%s\n' % line)
        fd.close()

    def dump_arff(self, filename, delimiter=',', clstype='NUMERIC'):
        ''' Dumps matrix to an ARFF file '''
        fd = open(filename, 'w')
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE \'ARFFID\' NUMERIC\n'
        for term in self.terms:
            header = header + '@ATTRIBUTE \'' + term + '\' NUMERIC\n'
        header = header + '@ATTRIBUTE \'ClassLabel\' ' + clstype + '\n'
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        for doc in self.docs:
            line = doc['id']
            for term in doc['terms']:
                line = line + delimiter + str(term)
            line = line + delimiter + str(doc['class'])
            fd.write('%s\n' % line)
        fd.close()

    def dump_transposed(self, filename, delimiter='\t', header=True):
        fd = open(filename, 'w')
        # Let's first print the file header
        header_line = 'terms'
        for doc in self.docs:
            header_line = header_line + delimiter + doc['id']
        if header:
            fd.write('%s\n' % header_line)
        # Now we print data lines
        idx = 0
        for term in self.terms:
            line = term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def dump_transposed_arff(self, filename):
        fd = open(filename, 'w')
        # Let's first print the file header
        header = '@RELATION %s\n\n' % filename.split('.')[0]
        header = header + '@ATTRIBUTE terms STRING\n'
        for doc in self.docs:
            header = header + '@ATTRIBUTE "%s" NUMERIC\n' % doc['id']
        fd.write('%s\n' % header)
        # Now we print data lines
        fd.write('@DATA\n')
        idx = 0
        delimiter = ','
        for term in self.terms:
            line = '"%s"' % term
            for doc in self.docs:
                line = line + delimiter + str(doc['terms'][idx])
            fd.write('%s\n' % line)
            idx += 1
        fd.close()

    def prune_old(self, prune_map):
        '''
        Helper method to remove terms (fields) from our matrix.
        prune_map is a list of 0's and 1's of the same length as self.terms.
        For each term, if 0, remove it; otherwise keep it.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        # Pop in reverse so earlier indices stay valid while removing.
        for i in range(len(prune_map)-1, -1, -1):
            if prune_map[i] == 0:
                #print self.terms[i]
                self.terms.pop(i)
                for doc in self.docs:
                    doc['terms'].pop(i)

    def prune(self, prune_map, show_progress=True):
        '''
        Helper method to remove terms (fields) from our matrix.
        prune_map is a list of 0's and 1's of the same length as self.terms.
        For each term, if 0, remove it; otherwise keep it.
        '''
        if not prune_map or len(prune_map) != len(self.terms):
            return False
        if show_progress:
            print ' Pruning terms list ...'
        # Iterate forward so the kept terms preserve their original order.
        new_terms = SuperList()
        for i in range(len(prune_map)):
            if prune_map[i] == 1:
                #print self.terms[i]
                new_terms.append(self.terms[i])
        self.terms = new_terms
        if show_progress:
            print ' Pruning documents ...'
            p = Progress(n=len(self), percent=10)
        for doc in self.docs:
            new_doc_terms = SuperList()
            for i in range(len(prune_map)):
                if prune_map[i] == 1:
                    new_doc_terms.append(doc['terms'][i])
            doc['terms'] = new_doc_terms
            if show_progress:
                p.show(message=' Pruning progress:')

    def freq_levels(self, threshold=3):
        '''
        Creates two lists:
        threshold_map is a list of 0's and 1's,
        where 1 means the term's freq >= threshold.
        freq_map is a list of term frequencies.
        '''
        threshold_map = [0] * len(self.terms)
        freq_map = [0] * len(self.terms)
        for i in range(0, len(self.terms)):
            val = 0
            for doc in self.docs:
                if doc['terms'][i] != 0:
                    #val += 1
                    val += doc['terms'][i]
            if val >= threshold:
                threshold_map[i] = 1
            freq_map[i] = val
        return (threshold_map, freq_map)

    def __contains__(self, term):
        'Checks if a certain term is loaded'
        return self.terms.__contains__(term)

    def to_be_deleted__getitem__(self, term):
        'Returns occurrences of term in all documents'
        if not term in self:
            return SuperList()
        col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
        return SuperList(col)

    def __getitem__(self, term):
        '''
        If term exists in terms, returns its position in the list;
        otherwise, returns -1.
        '''
        if not term in self:
            return -1
        else:
            return self.terms.index(term)

    def do_padding(self):
        '''
        Align the length of all rows in the matrix.
        Each time we see a new term, the list of terms is expanded,
        and the matrix row for that document has the same length too.
        But what about rows added earlier for previous documents?
        So, this method aligns all previously added rows
        to match the current length of the terms list.
        '''
        if len(self.docs[-1]['terms']) == len(self.docs[0]['terms']):
            return
        for doc in self.docs:
            doc['terms'].expand(new_len=len(self.terms))
        #for cls in self.classes:
        #    self.classes[cls].expand(new_len=len(self.terms))

    def _log_tf(self, value):
        val = float(value)
        val = 1 + math.log10(val) if val != 0 else float(0)
        return val

    def tf_idf(self, do_idf=True):
        '''
        Converts matrix to tf.idf values
        do_idf: if False, convert to tf only
        '''
        N = len(self.docs)
        # Document frequency: in how many documents each term occurs.
        df = SuperList([0] * len(self.terms))
        for doc in self.docs:
            row = SuperList([0] * len(self.terms))
            for idx in range(len(self.terms)):
                if doc['terms'][idx] > 0:
                    row[idx] = 1
            df.add(row)
        for doc in self.docs:
            for idx in range(len(self.terms)):
                tf = self._log_tf(doc['terms'][idx])
                idf = math.log10(float(N) / df[idx])
                if do_idf:
                    doc['terms'][idx] = tf * idf
                else:
                    doc['terms'][idx] = tf

    def add_doc(self, doc_id='', doc_class='', doc_terms=[],
                frequency=False, do_padding=False,
                unique_ids=False, stopwords=[]):
        '''
        Add a new document to our matrix:
        doc_id: Identifier for the document, e.g. file name, URL, etc.
        doc_class: You might need this in classification.
        doc_terms: List of terms you got after tokenizing the document.
                   Terms can be tuples of (string, frequency).
        frequency: If True, term occurrences are incremented by one.
                   Else, occurrences are only 0 or 1 (a la Bernoulli).
        do_padding: Boolean. Check do_padding() for more info.
        unique_ids: When True, if two documents are added with the same id,
                    their terms are summed up into only one record.
        stopwords: If not empty, ignore those stop words in doc_terms.
        '''
        # Update the list of terms if a new term is seen,
        # and build the document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in the whitelist, if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords, if not empty
        if stopwords:
            doc_terms = [t for t in doc_terms if t not in stopwords]
        for term in doc_terms:
            if type(term) == tuple:
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event that whitelisting leaves doc_terms empty,
        # we add at least one zero to my_doc_terms.
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)
        if unique_ids:
            # Merge into an existing row with the same id, if any.
            found = 0
            for doc in self.docs:
                if doc['id'] == doc_id:
                    doc['terms'].add(my_doc_terms)
                    found = 1
            if not found:
                self.docs.append({'id': doc_id,
                                  'class': doc_class,
                                  'terms': my_doc_terms})
        else:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
        # Update list of document classes if new class seen.
        #self.classes.unique_append(doc_class)
        #if self.classes.has_key(doc_class):
        #    self.classes[doc_class].add(my_doc_terms)
        #else:
        #    self.classes[doc_class] = my_doc_terms
        if do_padding:
            self.do_padding()

    def query_to_vector(self, q_terms, frequency=False):
        '''
        Converts a query to a list aligned with our self.terms.
        Terms not seen before will be ignored.
        q_terms: list of query terms
        frequency: return a multinomial or multivariate list?
        '''
        my_query_vector = SuperList()
        my_query_vector.expand(new_len=len(self.terms))
        for term in q_terms:
            try:
                term_idx = self.terms.index(term)
            except ValueError:
                # Term not seen before, skip
                continue
            #print term, self.terms.index(term)
            if frequency:
                my_query_vector.increment_after_padding(term_idx, 1)
            else:
                my_query_vector.insert_after_padding(term_idx, 1)
        return my_query_vector

    def get_stats(self):
        return Stats(self)
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False,
            unique_ids=False, stopwords=[]):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, URL, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency).
    frequency: If True, term occurrences are incremented by one.
               Else, occurrences are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When True, if two documents are added with the same id,
                their terms are summed up into only one record.
    stopwords: If not empty, ignore those stop words in doc_terms.
    '''
    # Update the list of terms if a new term is seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in stopwords, if not empty
    if stopwords:
        doc_terms = [t for t in doc_terms if t not in stopwords]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # we add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    if unique_ids:
        # Merge into an existing row with the same id, if any.
        found = 0
        for doc in self.docs:
            if doc['id'] == doc_id:
                doc['terms'].add(my_doc_terms)
                found = 1
        if not found:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
    else:
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
    # Update list of document classes if new class seen.
    #self.classes.unique_append(doc_class)
    #if self.classes.has_key(doc_class):
    #    self.classes[doc_class].add(my_doc_terms)
    #else:
    #    self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
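# Usage sketch (illustrative): with unique_ids=True, two documents sharing
# an id are merged; this assumes SuperList.add() sums aligned vectors.
mx.add_doc(doc_id='same-id', doc_class='ham',
           doc_terms=['alpha', 'beta'], frequency=True,
           unique_ids=True, do_padding=True)
mx.add_doc(doc_id='same-id', doc_class='ham',
           doc_terms=['beta'], frequency=True,
           unique_ids=True, do_padding=True)
# The single row for 'same-id' now counts 'alpha' once and 'beta' twice.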