Python check Examples

Programming Language: Python

Namespace/Package Name: stringcheck

Method/Function: check

Examples at hotexamples.com: 4

Python check - 4 examples found. These are the top rated real world Python examples of stringcheck.check extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: query_handler.py Project: navtej/pyredise

 def clean_stem_query(self):
     q = ""
     for token in re.sub(r"[.,:;\-!?\"']", " ", self.query).split():
         try: 
             lower = token.lower()
             if stringcheck.check(lower):
                 q += self.stemmer.stem_word(lower) + " "         
         except: 
             if self.debug: print "Probable unicode error in stemming query"  
             
     self.query = q    
     if self.debug: print "STEMMED QUERY:", self.query

Example #2

Show file

File: corpus_handler.py Project: laurenproctor/pyredise

    def title_indexer(self, title, doc_id, index=True):

        for i, token in enumerate(re.sub(r"[.,:;\-!?\"']", " ", title).split()):
            lower = token.lower()
            try: # no encoding errors
                if stringcheck.check(lower):
                    item = self.stem(lower)
                    
                    if index: 
                        self.term_add_doc_id_title(item, doc_id)
                        self.term_add_doc_id_title_posting(item, doc_id, i)
                    else: 
                        self.term_remove_doc_id_title(item, doc_id)
                        self.term_remove_doc_id_title_posting(item, doc_id)
                    
            except: 
                if self.debug: print "Probable unicode error"

Example #3

Show file

File: corpus_handler.py Project: laurenproctor/pyredise

    def extract_features(self, **kwargs):
        '''
        Extracts vital info from current document
        Up to features_limit in length
        Using a tfidf threshold to filter the top of them
        '''
        export_value = kwargs.get('export', json.dumps)
        tfidf_threshold_absolute = kwargs.get('tfidf_threshold_absolute', 0.0000001)
        features_limit = kwargs.get('features_limit', 500)
        rnd = kwargs.get('rnd', 4)
        doc = kwargs.get('doc', None)
        
        # just in case, we chech if we have to re-tokenize the doc

        if not len(self.sanitized_text):
            if doc is None: 
                raise Exception, " No document given !! "

            for i, token in enumerate(re.sub(r"[.,:;!\-?\"']", " ", doc).split()):
                lower = token.lower()
                try: 
                    if stringcheck.check(lower):
                        item = self.stem(lower)
                        self.update_pos(item, i)
                        self.sanitized_text.append(item)
                except: 
                    if self.debug: print "Probable unicode error"  
                                
            self.doc_len = len(self.sanitized_text)    
        
        idfs = [i[1] for i in self.get_dfs(self.sanitized_text)]

        tfidf_tuple_list = []
        adapt_features = []
        
        for i in xrange(min(self.doc_len, len(idfs), features_limit)):
            tfidf = len(self.pos[self.sanitized_text[i]]) * idfs[i] / self.doc_len
            tup = (self.sanitized_text[i], str(round(tfidf , rnd)), i)
            tfidf_tuple_list.append(tup)
            
            if tfidf > tfidf_threshold_absolute: adapt_features.append(tup)
        
        self.clear()
        return export_value(tfidf_tuple_list), export_value(adapt_features)

Example #4

Show file

File: corpus_handler.py Project: laurenproctor/pyredise

    def content_indexer(self, doc, doc_id,  index=True):

        for i, token in enumerate(re.sub(r"[.,:;!\-?\"']", " ", doc).split()):
            lower = token.lower()
            try: # no encoding errors
                if stringcheck.check(lower):
                    item = self.stem(lower)
                    self.update_pos(item, i)
                    self.sanitized_text.append(item)
            except: 
                if self.debug: print "Probable unicode error"  
                
        self.doc_len = len(self.sanitized_text)  
        
        if index:
            for term, posting in self.pos.iteritems():     
                self.term_add_doc_id(term,  doc_id, float(len(posting))/self.doc_len )   
                self.term_add_doc_id_posting(term,  doc_id, ",".join(posting) )   
                
        else: # remove from index                  
            for term, posting in self.pos.iteritems():     
                self.term_remove_doc_id(term, doc_id)   
                self.term_remove_doc_id_posting(term, doc_id)