Example #1
    def clean_stem_query(self):
        # Tokenize the query on punctuation, lowercase each token, filter with
        # the external stringcheck module, and stem what survives
        # (assumes `import re` at module level).
        q = []
        for token in re.sub(r"[.,:;\-!?\"']", " ", self.query).split():
            try:
                lower = token.lower()
                if stringcheck.check(lower):
                    q.append(self.stemmer.stem_word(lower))
            except Exception:  # most likely a unicode/encoding error
                if self.debug:
                    print("Probable unicode error in stemming query")

        self.query = " ".join(q)
        if self.debug:
            print("STEMMED QUERY:", self.query)
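For reference, here is a minimal standalone sketch of the same tokenize-lowercase-filter-stem pipeline. The check and stem_word helpers below are hypothetical stand-ins for the external stringcheck module and the project's stemmer, not the real implementations:

import re

def check(token):
    # stand-in for stringcheck.check: accept plain alphabetic ASCII tokens (assumption)
    return token.isascii() and token.isalpha()

def stem_word(token):
    # crude stand-in for the real stemmer: strip a trailing "s" (assumption)
    return token[:-1] if token.endswith("s") else token

query = "Cats, dogs; and - birds!"
q = []
for token in re.sub(r"[.,:;\-!?\"']", " ", query).split():
    lower = token.lower()
    if check(lower):
        q.append(stem_word(lower))

print(" ".join(q))  # -> cat dog and bird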
Example #2
    def title_indexer(self, title, doc_id, index=True):
        # Tokenize the title; add (or remove) each stemmed term and its
        # position for doc_id in the title index, depending on `index`.
        for i, token in enumerate(re.sub(r"[.,:;\-!?\"']", " ", title).split()):
            lower = token.lower()
            try:  # skip tokens that raise encoding errors
                if stringcheck.check(lower):
                    item = self.stem(lower)

                    if index:
                        self.term_add_doc_id_title(item, doc_id)
                        self.term_add_doc_id_title_posting(item, doc_id, i)
                    else:
                        self.term_remove_doc_id_title(item, doc_id)
                        self.term_remove_doc_id_title_posting(item, doc_id)

            except Exception:
                if self.debug:
                    print("Probable unicode error")
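Assuming an instance of the indexing class is at hand (the name indexer below is hypothetical), the same method both indexes and de-indexes a title, toggled by the index flag:

indexer.title_indexer("The Old Man and the Sea", 42)               # add to the title index
indexer.title_indexer("The Old Man and the Sea", 42, index=False)  # remove it again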
Example #3
    def extract_features(self, **kwargs):
        '''
        Extracts vital info from the current document,
        up to features_limit terms in length,
        using an absolute tf-idf threshold to filter the top of them.
        '''
        export_value = kwargs.get('export', json.dumps)
        tfidf_threshold_absolute = kwargs.get('tfidf_threshold_absolute', 1e-7)
        features_limit = kwargs.get('features_limit', 500)
        rnd = kwargs.get('rnd', 4)
        doc = kwargs.get('doc', None)

        # just in case, check whether we have to re-tokenize the doc
        if not len(self.sanitized_text):
            if doc is None:
                raise ValueError("No document given!")

            for i, token in enumerate(re.sub(r"[.,:;!\-?\"']", " ", doc).split()):
                lower = token.lower()
                try:
                    if stringcheck.check(lower):
                        item = self.stem(lower)
                        self.update_pos(item, i)
                        self.sanitized_text.append(item)
                except Exception:  # most likely a unicode/encoding error
                    if self.debug:
                        print("Probable unicode error")

            self.doc_len = len(self.sanitized_text)

        idfs = [pair[1] for pair in self.get_dfs(self.sanitized_text)]

        tfidf_tuple_list = []
        adapt_features = []

        for i in range(min(self.doc_len, len(idfs), features_limit)):
            # tf-idf: the term's occurrence count scaled by its idf,
            # normalized by the document length
            tfidf = len(self.pos[self.sanitized_text[i]]) * idfs[i] / self.doc_len
            tup = (self.sanitized_text[i], str(round(tfidf, rnd)), i)
            tfidf_tuple_list.append(tup)

            if tfidf > tfidf_threshold_absolute:
                adapt_features.append(tup)

        self.clear()
        return export_value(tfidf_tuple_list), export_value(adapt_features)
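The score computed in the loop above is plain tf-idf: the term's position count divided by the document length, scaled by its idf. A worked example with assumed numbers:

# Assumed numbers, for illustration only
doc_len = 100                 # tokens that survived sanitization
positions = [3, 17, 42, 88]   # the term occurred 4 times
idf = 2.5                     # inverse document frequency of the term

tfidf = len(positions) * idf / doc_len   # 4 * 2.5 / 100 = 0.1
# 0.1 > 1e-7 (the default tfidf_threshold_absolute), so the term would be
# appended to adapt_features as well as to tfidf_tuple_list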
Example #4
    def content_indexer(self, doc, doc_id, index=True):
        # Tokenize the document body, stem the surviving tokens,
        # and record each term's positions for doc_id.
        for i, token in enumerate(re.sub(r"[.,:;!\-?\"']", " ", doc).split()):
            lower = token.lower()
            try:  # skip tokens that raise encoding errors
                if stringcheck.check(lower):
                    item = self.stem(lower)
                    self.update_pos(item, i)
                    self.sanitized_text.append(item)
            except Exception:
                if self.debug:
                    print("Probable unicode error")

        self.doc_len = len(self.sanitized_text)

        if index:
            for term, posting in self.pos.items():
                # store the normalized term frequency and the joined position list
                self.term_add_doc_id(term, doc_id, len(posting) / self.doc_len)
                self.term_add_doc_id_posting(term, doc_id, ",".join(posting))

        else:  # remove from the index
            for term, posting in self.pos.items():
                self.term_remove_doc_id(term, doc_id)
                self.term_remove_doc_id_posting(term, doc_id)
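Since the code joins postings with ",".join(posting), the positions in self.pos are presumably stored as strings. An illustrative shape of the structures after indexing "to be or not to be", assuming every token passes stringcheck and stems to itself (hypothetical state, for illustration only):

pos = {"to": ["0", "4"], "be": ["1", "5"], "or": ["2"], "not": ["3"]}
doc_len = 6
# for "be": tf = len(pos["be"]) / doc_len = 2 / 6 ≈ 0.333
# stored posting string: ",".join(pos["be"]) == "1,5"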