def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    # Update list of terms if new term seen,
    # and add a document (row) with its associated data.
    my_doc_terms = SuperList()
    for term in doc_terms:
        term_idx = self.terms.unique_append(term)
        if frequency:
            # Multinomial counting: each occurrence bumps the count.
            my_doc_terms.increment_after_padding(term_idx, 1)
        else:
            # Bernoulli counting: presence only, 0 or 1.
            my_doc_terms.insert_after_padding(term_idx, 1)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update list of document classes if new class seen.
    # dict.has_key() was removed in Python 3; use the `in` operator instead.
    if doc_class in self.classes:
        self.classes[doc_class].add(my_doc_terms)
    else:
        # NOTE(review): the same SuperList object is stored both in
        # self.docs and self.classes here, so later .add() calls for this
        # class also mutate this document's row — confirm this aliasing
        # is intended before changing it.
        self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, value) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    # Update list of terms if new term seen,
    # and add a document (row) with its associated data.
    my_doc_terms = SuperList()
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, value) pair: use the supplied value directly.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                # Multinomial counting: each occurrence bumps the count.
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                # Bernoulli counting: presence only, 0 or 1.
                my_doc_terms.insert_after_padding(term_idx, 1)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False,
            unique_ids=False, meta_data=None):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When true, if two documents are added with same id,
                then their terms are summed up into only one record.
    meta_data: More fields to add to the document, for your own use.
    Raises ValueError if doc_terms is empty (or None).
    '''
    # None sentinels avoid the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    meta_data = {} if meta_data is None else meta_data
    if not doc_terms:
        raise ValueError('doc_terms cannot be empty')
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in blacklist (stopwords) if not empty
    if self.blacklist:
        doc_terms = [t for t in doc_terms if t not in self.blacklist]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event when whitelisting causes an empty doc_terms list
    # we add at least one zero in the list of my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    doc_data = {'id': doc_id,
                'class': doc_class,
                'terms': my_doc_terms}
    # NOTE(review): meta_data keys named 'id', 'class' or 'terms' would
    # silently overwrite the fields above — confirm callers never do that.
    doc_data.update(meta_data)
    if unique_ids:
        self.docs.add_unique(doc_data)
    else:
        self.docs.append(doc_data)
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False, stopwords=None):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Useless here
    stopwords: If not empty, ignore those stop words in doc_terms
    Documents sharing the same class are merged into one row.
    '''
    # None sentinels avoid the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    stopwords = [] if stopwords is None else stopwords
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in stopwords if not empty
    if stopwords:
        doc_terms = [t for t in doc_terms if t not in stopwords]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # Merge into the existing row for this class if one exists;
    # classes are unique here, so we can stop at the first match.
    found = False
    for doc in self.docs:
        if doc['class'] == doc_class:
            doc['terms'].add(my_doc_terms)
            found = True
            break
    if not found:
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event when whitelisting causes an empty doc_terms list
    # we add at least one zero in the list of my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def query_to_vector(self, q_terms, frequency=False):
    '''
    Converts query to a list alligned with our self.terms.
    Terms not seen before will be ignored.
    q_terms: list of query terms
    frequency: return a multinomial or multivariate list?
    '''
    my_query_vector = SuperList()
    # Pre-size the vector to match the vocabulary length.
    my_query_vector.expand(new_len=len(self.terms))
    for term in q_terms:
        try:
            term_idx = self.terms.index(term)
        # Catch only what list.index raises; a bare except would also
        # swallow KeyboardInterrupt/SystemExit and hide real bugs.
        except ValueError:
            # Term not seen before, skip
            continue
        if frequency:
            # Multinomial counting: each occurrence bumps the count.
            my_query_vector.increment_after_padding(term_idx, 1)
        else:
            # Multivariate (Bernoulli): presence only, 0 or 1.
            my_query_vector.insert_after_padding(term_idx, 1)
    return my_query_vector
def add_doc(self, doc_id="", doc_class="", doc_terms=None,
            frequency=False, do_padding=False):
    """
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    """
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event when whitelisting causes an empty doc_terms list
    # we add at least one zero in the list of my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    self.docs.append({"id": doc_id, "class": doc_class, "terms": my_doc_terms})
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False,
            unique_ids=False, stopwords=None):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When true, if two documents are added with same id,
                then their terms are summed up into only one record.
    stopwords: If not empty, ignore those stop words in doc_terms
    '''
    # None sentinels avoid the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    stopwords = [] if stopwords is None else stopwords
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in stopwords if not empty
    if stopwords:
        doc_terms = [t for t in doc_terms if t not in stopwords]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event when whitelisting causes an empty doc_terms list
    # we add at least one zero in the list of my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    if unique_ids:
        # Merge into the existing row for this id if one exists;
        # ids are unique under this mode, so stop at the first match.
        found = False
        for doc in self.docs:
            if doc['id'] == doc_id:
                doc['terms'].add(my_doc_terms)
                found = True
                break
        if not found:
            self.docs.append({'id': doc_id,
                              'class': doc_class,
                              'terms': my_doc_terms})
    else:
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=None,
            frequency=False, do_padding=False,
            unique_ids=False, meta_data=None):
    '''
    Add new document to our matrix:
    doc_id: Identifier for the document, eg. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples of (string, frequency) or plain strings.
    frequency: If true, term occurences is incremented by one.
               Else, occurences is only 0 or 1 (a la Bernoulli)
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When true, if two documents are added with same id,
                then their terms are summed up into only one record.
    meta_data: More fields to add to the document, for your own use.
    Raises ValueError if doc_terms is empty (or None).
    '''
    # None sentinels avoid the shared-mutable-default-argument pitfall.
    doc_terms = [] if doc_terms is None else doc_terms
    meta_data = {} if meta_data is None else meta_data
    if not doc_terms:
        raise ValueError('doc_terms cannot be empty')
    # Update list of terms if new term seen,
    # and build the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in whitelist if it is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in blacklist (stopwords) if not empty
    if self.blacklist:
        doc_terms = [t for t in doc_terms if t not in self.blacklist]
    for term in doc_terms:
        # isinstance() is the idiomatic type check (handles subclasses too).
        if isinstance(term, tuple):
            # Pre-counted (term, frequency) pair: use the supplied count.
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event when whitelisting causes an empty doc_terms list
    # we add at least one zero in the list of my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    doc_data = {'id': doc_id,
                'class': doc_class,
                'terms': my_doc_terms}
    # NOTE(review): meta_data keys named 'id', 'class' or 'terms' would
    # silently overwrite the fields above — confirm callers never do that.
    doc_data.update(meta_data)
    if unique_ids:
        self.docs.add_unique(doc_data)
    else:
        self.docs.append(doc_data)
    if do_padding:
        self.do_padding()