def clean_stem_query(self):
    """Rewrite ``self.query`` in place as a string of stemmed terms.

    Punctuation (``. , : ; - ! ? " '``) is replaced by spaces, the text is
    split on whitespace and every token is lower-cased.  Tokens accepted by
    ``stringcheck.check`` are stemmed with ``self.stemmer.stem_word``; the
    rest are dropped.  Each kept stem is followed by one space, so a
    non-empty result always ends with a trailing space.

    Processing is best-effort: a token whose handling raises is skipped and,
    when ``self.debug`` is set, a diagnostic line is printed.
    """
    q = ""
    for token in re.sub(r"[.,:;\-!?\"']", " ", self.query).split():
        try:
            lower = token.lower()
            if stringcheck.check(lower):
                # Concatenate inside the ``try`` on purpose: mixing byte and
                # unicode strings can fail here, and such a token must be
                # skipped rather than abort the whole query.
                q += self.stemmer.stem_word(lower) + " "
        except Exception:
            # Still best-effort, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            if self.debug:
                print("Probable unicode error in stemming query")
    self.query = q
    if self.debug:
        # One pre-formatted argument prints identically on Python 2 and 3.
        print("STEMMED QUERY: %s" % (self.query,))
def title_indexer(self, title, doc_id, index=True):
    """Add the terms of a title to the title index, or remove them from it.

    The title is stripped of punctuation (``. , : ; - ! ? " '``), split on
    whitespace and lower-cased token by token.  Tokens accepted by
    ``stringcheck.check`` are stemmed with ``self.stem`` and then either
    registered or unregistered for ``doc_id``.

    :param title: raw title text.
    :param doc_id: document identifier handed to the ``term_*_title*`` helpers.
    :param index: when true the terms (and their positions) are added;
        otherwise the terms and their postings are removed.

    The recorded position is the token's index in the punctuation-stripped,
    whitespace-split title, so tokens rejected by ``stringcheck.check``
    still consume a position.

    Processing is best-effort: a token whose handling raises is skipped and,
    when ``self.debug`` is set, a diagnostic line is printed.
    """
    tokens = re.sub(r"[.,:;\-!?\"']", " ", title).split()
    for position, token in enumerate(tokens):
        lower = token.lower()
        try:
            if not stringcheck.check(lower):
                continue
            item = self.stem(lower)
            if index:
                self.term_add_doc_id_title(item, doc_id)
                self.term_add_doc_id_title_posting(item, doc_id, position)
            else:
                self.term_remove_doc_id_title(item, doc_id)
                self.term_remove_doc_id_title_posting(item, doc_id)
        except Exception:
            # Still best-effort, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            if self.debug:
                print("Probable unicode error")
def extract_features(self, **kwargs):
    """Compute tf-idf style features for the current document.

    Keyword arguments:
        export: callable applied to each of the two result lists
            (default ``json.dumps``).
        tfidf_threshold_absolute: a feature is copied to the second result
            only when its score is strictly greater than this value
            (default ``0.0000001``).
        features_limit: upper bound on the number of leading tokens that
            are scored (default ``500``).
        rnd: number of decimals the exported score is rounded to
            (default ``4``).
        doc: raw text; it is tokenized only when ``self.sanitized_text`` is
            empty.

    Returns:
        ``(export(all_features), export(filtered_features))`` where every
        feature is the tuple ``(term, str(rounded score), token index)``.
        One feature is produced per token occurrence, not per unique term.

    Raises:
        Exception: when ``self.sanitized_text`` is empty and no ``doc`` was
            supplied.

    ``self.clear()`` is called before returning.
    """
    export_value = kwargs.get('export', json.dumps)
    tfidf_threshold_absolute = kwargs.get('tfidf_threshold_absolute', 0.0000001)
    features_limit = kwargs.get('features_limit', 500)
    rnd = kwargs.get('rnd', 4)
    doc = kwargs.get('doc', None)

    # Just in case, check whether the document has to be (re-)tokenized.
    if not self.sanitized_text:
        if doc is None:
            raise Exception(" No document given !! ")
        tokens = re.sub(r"[.,:;!\-?\"']", " ", doc).split()
        for position, token in enumerate(tokens):
            lower = token.lower()
            try:
                if stringcheck.check(lower):
                    item = self.stem(lower)
                    self.update_pos(item, position)
                    self.sanitized_text.append(item)
            except Exception:
                # Still best-effort, but no longer a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not swallowed.
                if self.debug:
                    print("Probable unicode error")
    # Keep the cached length in step with the token list.
    self.doc_len = len(self.sanitized_text)

    # Only the second element of each entry returned by get_dfs is used.
    idfs = [entry[1] for entry in self.get_dfs(self.sanitized_text)]
    tfidf_tuple_list = []
    adapt_features = []
    # The bound is 0 for an empty document, so the division is never reached
    # with doc_len == 0.
    for i in range(min(self.doc_len, len(idfs), features_limit)):
        term = self.sanitized_text[i]
        # NOTE(review): on Python 2 this truncates if the get_dfs weights
        # are ints -- confirm they are floats.
        tfidf = len(self.pos[term]) * idfs[i] / self.doc_len
        tup = (term, str(round(tfidf, rnd)), i)
        tfidf_tuple_list.append(tup)
        if tfidf > tfidf_threshold_absolute:
            adapt_features.append(tup)
    self.clear()
    return export_value(tfidf_tuple_list), export_value(adapt_features)
def content_indexer(self, doc, doc_id, index=True):
    """Tokenize a document body and add it to, or remove it from, the index.

    The text is stripped of punctuation (``. , : ; ! - ? " '``), split on
    whitespace and lower-cased token by token.  Tokens accepted by
    ``stringcheck.check`` are stemmed with ``self.stem``, recorded through
    ``self.update_pos`` and appended to ``self.sanitized_text``.
    ``self.doc_len`` is then set to the number of collected terms.

    :param doc: raw document text.
    :param doc_id: document identifier handed to the ``term_*`` helpers.
    :param index: when true every term in ``self.pos`` is added with its
        relative frequency (``len(posting) / doc_len``) and its
        comma-joined posting; otherwise every term and its posting are
        removed for ``doc_id``.

    Tokenization is best-effort: a token whose handling raises is skipped
    and, when ``self.debug`` is set, a diagnostic line is printed.
    """
    tokens = re.sub(r"[.,:;!\-?\"']", " ", doc).split()
    for position, token in enumerate(tokens):
        lower = token.lower()
        try:
            if stringcheck.check(lower):
                item = self.stem(lower)
                self.update_pos(item, position)
                self.sanitized_text.append(item)
        except Exception:
            # Still best-effort, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            if self.debug:
                print("Probable unicode error")
    self.doc_len = len(self.sanitized_text)

    if index:
        # ``items()`` is available on both Python 2 and Python 3 mappings.
        for term, posting in self.pos.items():
            self.term_add_doc_id(term, doc_id, float(len(posting)) / self.doc_len)
            # assumes update_pos stores positions as strings -- TODO confirm
            self.term_add_doc_id_posting(term, doc_id, ",".join(posting))
    else:
        # Remove from the index; the posting itself is not needed here.
        for term in self.pos:
            self.term_remove_doc_id(term, doc_id)
            self.term_remove_doc_id_posting(term, doc_id)