def __init__(self,
             words: Dict[str, str] = None,
             dictionary: str = None,
             normalization: LyricsNormalizationParams = None):
    """Build the hyphenation lookup from a mapping and/or a dictionary file.

    Args:
        words: Optional mapping from a word to its hyphenated form. The
            mapping is copied, so entries loaded from ``dictionary`` are
            not written into the caller's dict.
        dictionary: Optional path to a text file holding one
            whitespace-separated ``word hyphenation`` pair per line.
            Blank lines are ignored. Entries from the file override
            entries in ``words`` that share the same key.
        normalization: Optional normalization parameters. When given, a
            copy of them with ``lyrics_normalization`` set to
            ``LyricsNormalization.SYLLABLES`` is applied to both fields of
            every entry read from ``dictionary``. When omitted, file
            entries are stored exactly as read. Entries passed through
            ``words`` are never normalized here.

    Raises:
        ValueError: If a non-blank line of ``dictionary`` does not contain
            exactly two whitespace-separated fields.
        Exception: If no entries are available after loading.
    """
    super().__init__()
    # Copy so that loading the file below never mutates the caller's dict.
    self.words = dict(words) if words else {}

    # The processor stays None when no normalization was requested; the
    # loop below then keeps the file entries untouched instead of failing
    # on an unbound local.
    processor = None
    if normalization:
        # Work on a copy so the caller's params object is not modified.
        syllable_params = LyricsNormalizationParams(
            **normalization.to_dict())
        syllable_params.lyrics_normalization = LyricsNormalization.SYLLABLES
        processor = LyricsNormalizationProcessor(syllable_params)

    if dictionary:
        with open(dictionary) as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"{dictionary}:{line_no}: expected "
                        f"'<word> <hyphenation>', got {line.rstrip()!r}")
                word, hyphen = fields
                if processor is not None:
                    word = processor.apply(word)
                    hyphen = processor.apply(hyphen)
                self.words[word] = hyphen

    if not self.words:
        raise Exception(
            "Empty dictionary for hyphenation. Either pass the hyphenation directly or as a file"
        )
class Predictor(AlgorithmPredictor):
    """Predictor that collects the texts of documents similar to one document.

    The document to compare is selected by ``settings.params.documentId``.
    """

    @staticmethod
    def meta():
        """Return the ``Meta`` class this predictor is registered under."""
        return Meta

    def __init__(self, settings: AlgorithmPredictorSettings):
        """Store the target document id and create the helper objects.

        Args:
            settings: Predictor settings; ``settings.params.documentId``
                identifies the document whose text is compared.
        """
        super().__init__(settings)
        self.document_id = settings.params.documentId
        self.document_similar_tester = SimilarDocumentChecker()
        word_level_params = LyricsNormalizationParams(
            LyricsNormalization.WORDS)
        self.text_normalizer = LyricsNormalizationProcessor(word_level_params)

    @classmethod
    def unprocessed(cls, page: DatabasePage) -> bool:
        """Report every page as unprocessed (always returns ``True``)."""
        return True

    def predict(
        self,
        pages: List[DatabasePage],
        callback: Optional[PredictionCallback] = None,
    ) -> AlgorithmPredictionResultGenerator:
        """Yield one ``Result`` holding the texts of the best-matching documents.

        The book is taken from the first entry of ``pages``; the remaining
        pages and ``callback`` are not used. The configured document's text
        is normalized, split on single spaces and handed to the similarity
        checker. The texts of the entries returned by ``most_common(5)``
        on the checker's result are collected in that order.

        Args:
            pages: Pages of the book to work on; must not be empty.
            callback: Unused.

        Yields:
            A single ``Result`` wrapping the list of collected texts.
        """
        book = pages[0].book
        book_documents = DatabaseBookDocuments().load(book)
        document: Document = (
            book_documents.database_documents.get_document_by_id(
                self.document_id))

        normalized_text = self.text_normalizer.apply(
            document.get_text_of_document(book))
        # Split on single spaces only, so consecutive spaces produce empty
        # tokens exactly as the checker has always received them.
        tokens = normalized_text.split(' ')

        checker = self.document_similar_tester
        similarity = checker.check_word_based_similarity(tokens)
        similar_texts = [
            checker.document_dict[key].get_text()
            for key, _hits in similarity.most_common(5)
        ]
        yield Result(similar_texts)