Example #1
0
class LexicalNormalizer(object):
    """Normalize the lexical items of a document.

    In-vocabulary items pass through unchanged; out-of-vocabulary items are
    replaced by a list of candidate corrections produced by the candidate
    generator.
    """

    # Dotted logger name scoped to this module and class.
    cname = __name__ + '.LexicalNormalizer'

    def __init__(self, vocabulary_set):
        # In-vocabulary term detector built from the supplied vocabulary.
        self.detector = IVTermDetectorBasicFactory.create_detector(vocabulary_set)

        # One metric-space analyzer per vocabulary resource, all registered
        # on a single candidate generator.
        generator = CandidateGenerator()
        for vocab_resource in vocabulary_set:
            generator.add_analyzer("metric-space",
                                   MetricSpaceAnalyzerFactory.create_analyzer(vocab_resource))
        self.cand_generator = generator

        # Document preprocessor plus a class-scoped logger.
        self.preprocessor = BasicDocumentPreprocessor()
        self.logger = logging.getLogger(LexicalNormalizer.cname)

    def _generate_alternatives(self, doc_items):
        """Map each item to itself (if in-vocabulary) or to its candidates."""
        return [
            item if self.detector.detect_term(item)
            else self.cand_generator.generate_candidates(item)
            for item in doc_items
        ]

    def normalize_document(self, doc):
        """Preprocess *doc* and return the per-item normalization alternatives."""
        return self._generate_alternatives(self.preprocessor.process(doc))
Example #2
0
 def __init__(self, vocabulary_set):
     """Wire up detector, candidate generator, preprocessor and logger."""
     # In-vocabulary term detector derived from the vocabulary.
     self.detector = IVTermDetectorBasicFactory.create_detector(vocabulary_set)

     # Register a metric-space analyzer for every vocabulary resource on
     # one shared candidate generator.
     generator = CandidateGenerator()
     for vocab_resource in vocabulary_set:
         generator.add_analyzer("metric-space",
                                MetricSpaceAnalyzerFactory.create_analyzer(vocab_resource))
     self.cand_generator = generator

     # Document preprocessor and the class-scoped logger.
     self.preprocessor = BasicDocumentPreprocessor()
     self.logger = logging.getLogger(LexicalNormalizer.cname)