def __init__(self, export_path):
    """Build the word lookup tables and load the document database from *export_path*.

    Any failure during population is logged and swallowed, leaving the
    instance with empty lookup tables.
    """
    self.word_dict = defaultdict(list)
    self.bigram_word_dict = defaultdict(list)
    # Two normalizers: one splitting lyrics into syllables, one into words.
    syllable_params = LyricsNormalizationParams(LyricsNormalization.SYLLABLES)
    word_params = LyricsNormalizationParams(
        LyricsNormalization.WORDS,
        True,
        False,
        True,
        True,
    )
    self.text_normalizer1 = LyricsNormalizationProcessor(syllable_params)
    self.text_normalizer2 = LyricsNormalizationProcessor(word_params)
    try:
        self.populate(export_path)
    except Exception as e:
        logger.error("Could not Load Document Database")
        logger.exception(e)
def __init__(self):
    """Initialize empty lookup structures, then populate them from the database.

    A population failure is logged and swallowed so construction never raises.
    """
    self.word_dict = defaultdict(list)
    self.document_dict = {}
    self.document_path = {}
    word_params = LyricsNormalizationParams(LyricsNormalization.WORDS)
    self.text_normalizer = LyricsNormalizationProcessor(word_params)
    try:
        self.populate()
    except Exception as e:
        logger.error("Could not Load Document Database")
        logger.exception(e)
def extract_gt_prediction(self, full_predictions: List[PredictionResult]):
    """Collect ground-truth and predicted text lines from *full_predictions*.

    The ground truth is taken from each text line's sentence and normalized
    into syllables; the prediction is the line's hyphenated text.

    :param full_predictions: prediction results, one per page/document.
    :return: a pair ``(gt, pred)`` of flat lists of strings, aligned by index.
    """
    from omr.dataset.dataset import LyricsNormalizationProcessor, LyricsNormalizationParams, LyricsNormalization
    lnp = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.SYLLABLES))

    def flatten(nested):
        # Linear-time flatten; the previous `sum(x, [])` was O(n^2) because
        # each `+` copies the whole accumulated list.
        return [item for sub in nested for item in sub]

    pred = [[tl.hyphenated for tl in p.text_lines] for p in full_predictions]
    gt = [[lnp.apply(tl.line.operation.text_line.sentence.text())
           for tl in p.text_lines]
          for p in full_predictions]
    return flatten(gt), flatten(pred)
def __init__(self, words: Dict[str, str] = None, dictionary: str = None, normalization: LyricsNormalizationParams = None):
    """Build the hyphenation word table.

    :param words: optional mapping of word -> hyphenated word.
    :param dictionary: optional path to a whitespace-separated file with one
        "word hyphenation" pair per line.
    :param normalization: optional normalization parameters; when given, both
        word and hyphenation read from *dictionary* are normalized with a
        syllable-level processor.
    :raises Exception: if after loading no words are available.
    """
    super().__init__()
    self.words = words if words else {}
    # BUG FIX: `p` was only bound inside the `if normalization:` branch, so
    # passing a dictionary without a normalization raised NameError at `if p:`.
    p = None
    if normalization:
        normalization = LyricsNormalizationParams(**normalization.to_dict())
        normalization.lyrics_normalization = LyricsNormalization.SYLLABLES
        p = LyricsNormalizationProcessor(normalization)
    if dictionary:
        with open(dictionary) as f:
            for line in f:
                word, hyphen = line.split()
                if p:
                    word = p.apply(word)
                    hyphen = p.apply(hyphen)
                self.words[word] = hyphen
    if len(self.words) == 0:
        raise Exception(
            "Empty dictionary for hyphenation. Either pass the hyphenation directly or as a file"
        )
def __init__(self, settings: AlgorithmPredictorSettings):
    """Set up document lookup, text normalization, and the OCR predictor.

    The OCR predictor is built from the default fraktur Calamari model with
    the default CTC decoder.
    """
    super().__init__(settings)
    self.document_id = settings.params.documentId
    self.document_text = settings.params.documentText
    self.document_similar_tester = SimilarDocumentChecker()
    self.text_normalizer = LyricsNormalizationProcessor(
        LyricsNormalizationParams(LyricsNormalization.WORDS))

    meta = Step.meta(AlgorithmTypes.OCR_CALAMARI)
    from ommr4all.settings import BASE_DIR
    default_model_path = BASE_DIR + '/internal_storage/default_models/fraktur/text_calamari/'
    model = Model(MetaId.from_custom_path(default_model_path, meta.type()))
    # Separate settings object for the OCR predictor; do not reuse the
    # constructor argument.
    ocr_settings = AlgorithmPredictorSettings(model=model, )
    ocr_settings.params.ctcDecoder.params.type = CTCDecoderParams.CTC_DEFAULT
    self.ocr_predictor = meta.create_predictor(ocr_settings)
class Predictor(AlgorithmPredictor):
    """Yields the texts of the documents most similar to a given document.

    Similarity is word-overlap based, computed by ``SimilarDocumentChecker``.
    """

    @staticmethod
    def meta():
        return Meta

    def __init__(self, settings: AlgorithmPredictorSettings):
        super().__init__(settings)
        self.document_id = settings.params.documentId
        self.document_similar_tester = SimilarDocumentChecker()
        self.text_normalizer = LyricsNormalizationProcessor(
            LyricsNormalizationParams(LyricsNormalization.WORDS))

    @classmethod
    def unprocessed(cls, page: DatabasePage) -> bool:
        # Every page is eligible; there is no per-page processed marker.
        return True

    def predict(self, pages: List[DatabasePage],
                callback: Optional[PredictionCallback] = None) -> AlgorithmPredictionResultGenerator:
        """Yield a single Result holding the texts of the 5 most similar documents."""
        book = pages[0].book
        documents = DatabaseBookDocuments().load(book)
        document: Document = documents.database_documents.get_document_by_id(
            self.document_id)
        normalized = self.text_normalizer.apply(document.get_text_of_document(book))
        words = normalized.split(' ')
        similarity = self.document_similar_tester.check_word_based_similarity(words)
        texts = [
            self.document_similar_tester.document_dict[doc_key].get_text()
            for doc_key, _ in similarity.most_common(5)
        ]
        yield Result(texts)
def chars_only(s: str):
    # Normalize *s* using the dataset's configured lyrics normalization.
    # NOTE(review): relies on `self` from an enclosing scope — presumably this
    # is a closure defined inside a method of a class exposing
    # `self.args.global_args.dataset_params`; confirm at the definition site.
    return LyricsNormalizationProcessor(self.args.global_args.dataset_params.lyrics_normalization).apply(s)
def __init__(self, settings: AlgorithmPredictorSettings):
    """Set up the similarity checker and a word-level text normalizer."""
    super().__init__(settings)
    params = settings.params
    self.document_id = params.documentId
    self.document_similar_tester = SimilarDocumentChecker()
    word_normalization = LyricsNormalizationParams(LyricsNormalization.WORDS)
    self.text_normalizer = LyricsNormalizationProcessor(word_normalization)
def check_word_based_similarity(self, sentence):
    # Count, per document id, how many words of *sentence* occur in that
    # document's word index. Returns a collections.Counter keyed by document
    # id; duplicate words in *sentence* each contribute their document set.
    documents = []
    for x in sentence:
        if x in self.word_dict:
            documents += (set(self.word_dict[x]))
    from collections import Counter
    count = Counter(documents)
    return count


if __name__ == "__main__":
    # Ad-hoc smoke test: index all documents, then query a sample sentence
    # and print the ten best-matching document ids with their counts.
    # NOTE(review): `documents_gen`, `load_json`, `populate`,
    # `populate_look_up_dict` and the two-argument
    # `check_word_based_similarity(sentence, word_dict)` called below appear
    # to be module-level helpers not visible in this chunk — confirm they
    # exist and that the latter is distinct from the method above.
    word_dict = defaultdict(list)
    document_dict = {}
    document_path = {}
    document_meta = {}
    text_normalizer = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.WORDS))
    for x in documents_gen():
        b = load_json(x.document_meta)
        a = populate(x.data)
        document_meta[x.document_id] = b["dokumenten_id"]
        text = a.get_text(text_normalizer).split(" ")
        populate_look_up_dict(text, x.document_id, word_dict)
        document_dict[x.document_id] = a
        document_path[x.document_id] = x
    counter = check_word_based_similarity(["lux", "aduenit", "ueneranda", "lux", "in", "chrois"], word_dict)
    for key, count in counter.most_common(10):
        print(key)
        print(count)