class LexicalNormalizer(object):
    """Normalize a document lexically: keep in-vocabulary terms as-is and
    expand out-of-vocabulary terms into sets of correction candidates.

    Collaborators (all project-local):
      - an IV-term detector built from the vocabulary set,
      - a CandidateGenerator holding one metric-space analyzer per resource,
      - a BasicDocumentPreprocessor that tokenizes/cleans raw documents.
    """

    # Fully-qualified name used as the logger channel for this class.
    cname = __name__ + '.LexicalNormalizer'

    def __init__(self, vocabulary_set):
        """Build detector, candidate generator, preprocessor and logger.

        :param vocabulary_set: iterable of vocabulary resources; each one
            feeds the IV detector and backs one metric-space analyzer.
        """
        # In-vocabulary term detector over the whole vocabulary set.
        self.detector = IVTermDetectorBasicFactory.create_detector(vocabulary_set)

        # One metric-space analyzer per vocabulary resource, all registered
        # on a single candidate generator.
        generator = CandidateGenerator()
        for vocab in vocabulary_set:
            analyzer = MetricSpaceAnalyzerFactory.create_analyzer(vocab)
            generator.add_analyzer("metric-space", analyzer)
        self.cand_generator = generator

        # Document preprocessor and class-scoped logger.
        self.preprocessor = BasicDocumentPreprocessor()
        self.logger = logging.getLogger(LexicalNormalizer.cname)

    def _generate_alternatives(self, doc_items):
        """Return a list parallel to *doc_items*: each in-vocabulary item is
        kept unchanged; each OOV item is replaced by its candidate set
        (whatever ``generate_candidates`` returns for it).
        """
        return [
            item if self.detector.detect_term(item)
            else self.cand_generator.generate_candidates(item)
            for item in doc_items
        ]

    def normalize_document(self, doc):
        """Preprocess *doc* and return per-item normalization alternatives."""
        processed = self.preprocessor.process(doc)
        return self._generate_alternatives(processed)
def __init__(self, vocabulary_set):
    """Wire up the normalizer's collaborators from *vocabulary_set*.

    :param vocabulary_set: iterable of vocabulary resources; each resource
        contributes to the IV-term detector and gets its own
        metric-space analyzer on the candidate generator.
    """
    # Preprocessor and logger carry no per-vocabulary state.
    self.preprocessor = BasicDocumentPreprocessor()
    self.logger = logging.getLogger(LexicalNormalizer.cname)

    # Detector for in-vocabulary terms, built from the full vocabulary set.
    self.detector = IVTermDetectorBasicFactory.create_detector(vocabulary_set)

    # Register one metric-space analyzer per vocabulary resource.
    self.cand_generator = CandidateGenerator()
    for resource in vocabulary_set:
        self.cand_generator.add_analyzer(
            "metric-space",
            MetricSpaceAnalyzerFactory.create_analyzer(resource),
        )