コード例 #1
0
    def __init__(self, export_path):
        """Build word and word-bigram lookup tables and populate them from *export_path*.

        Args:
            export_path: location of the exported document data handed to
                ``self.populate`` — format not visible here, TODO confirm.
        """
        # Maps a word / word bigram to the list of documents containing it.
        self.word_dict = defaultdict(list)
        self.bigram_word_dict = defaultdict(list)

        # Normalizer that splits lyrics into syllables.
        self.text_normalizer1 = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.SYLLABLES))
        # Word-level normalizer. The four positional booleans are flags of
        # LyricsNormalizationParams — their names are not visible in this file;
        # NOTE(review): confirm against the LyricsNormalizationParams definition.
        self.text_normalizer2 = LyricsNormalizationProcessor(
            LyricsNormalizationParams(LyricsNormalization.WORDS,
                                      True,
                                      False,
                                      True,
                                      True,
                                      ))

        try:
            self.populate(export_path)
        except Exception as e:
            # Best effort: a failed populate leaves the dictionaries empty
            # instead of aborting construction.
            logger.error("Could not Load Document Database")
            logger.exception(e)
コード例 #2
0
 def __init__(self):
     """Initialize empty document lookup tables and populate them.

     Populate failures are logged but not re-raised, so construction
     always succeeds (possibly with empty tables).
     """
     # Maps a word to the list of documents containing it.
     self.word_dict = defaultdict(list)
     # Maps a document id to its document object / filesystem path.
     self.document_dict = {}
     self.document_path = {}
     # Word-level lyrics normalizer used when indexing document text.
     self.text_normalizer = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.WORDS))
     try:
         self.populate()
     except Exception as e:
         logger.error("Could not Load Document Database")
         logger.exception(e)
コード例 #3
0
    def extract_gt_prediction(self, full_predictions: List[PredictionResult]):
        """Collect ground truth and predicted hyphenations from prediction results.

        Ground-truth sentences are normalized to syllables so they are
        comparable with the hyphenated predictions.

        Args:
            full_predictions: per-page prediction results, each carrying
                its text lines.

        Returns:
            A pair ``(gt, pred)`` of flat lists of strings, one entry per
            text line across all predictions.
        """
        from itertools import chain
        from omr.dataset.dataset import LyricsNormalizationProcessor, LyricsNormalizationParams, LyricsNormalization
        lnp = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.SYLLABLES))

        pred = [[tl.hyphenated for tl in p.text_lines] for p in full_predictions]
        gt = [[lnp.apply(tl.line.operation.text_line.sentence.text())
               for tl in p.text_lines] for p in full_predictions]

        # chain.from_iterable flattens in O(n); the previous sum(x, [])
        # idiom rebuilds the list on every step (quadratic).
        return list(chain.from_iterable(gt)), list(chain.from_iterable(pred))
コード例 #4
0
    def __init__(self,
                 words: Dict[str, str] = None,
                 dictionary: str = None,
                 normalization: LyricsNormalizationParams = None):
        """Build the hyphenation word table.

        Args:
            words: direct mapping of word -> hyphenated form.
            dictionary: path to a whitespace-separated file with one
                ``word hyphenation`` pair per line.
            normalization: optional normalization params; when given, a
                syllable-level copy is applied to dictionary entries.

        Raises:
            Exception: if neither source yields any words.
        """
        super().__init__()
        self.words = words if words else {}
        # Optional normalizer; stays None when no params were supplied.
        # BUG FIX: previously `p` was only bound inside the `if normalization:`
        # branch, so loading a dictionary without normalization raised NameError.
        p = None
        if normalization:
            # Work on a copy so the caller's params object is not mutated.
            normalization = LyricsNormalizationParams(
                **normalization.to_dict())
            normalization.lyrics_normalization = LyricsNormalization.SYLLABLES
            p = LyricsNormalizationProcessor(normalization)
        if dictionary:
            with open(dictionary) as f:
                for line in f:
                    word, hyphen = line.split()
                    if p:
                        word = p.apply(word)
                        hyphen = p.apply(hyphen)
                    self.words[word] = hyphen

        if len(self.words) == 0:
            raise Exception(
                "Empty dictionary for hyphenation. Either pass the hyphenation directly or as a file"
            )
コード例 #5
0
    def __init__(self, settings: AlgorithmPredictorSettings):
        """Set up document similarity checking and a default Calamari OCR predictor.

        Args:
            settings: predictor settings; ``params.documentId`` and
                ``params.documentText`` identify the document to work on.
        """
        super().__init__(settings)
        self.document_id = settings.params.documentId
        self.document_text = settings.params.documentText

        self.document_similar_tester = SimilarDocumentChecker()
        # Word-level lyrics normalizer for comparing document texts.
        self.text_normalizer = LyricsNormalizationProcessor(
            LyricsNormalizationParams(LyricsNormalization.WORDS))
        meta = Step.meta(AlgorithmTypes.OCR_CALAMARI)
        from ommr4all.settings import BASE_DIR
        # Load the bundled default Fraktur text-recognition model.
        model = Model(
            MetaId.from_custom_path(
                BASE_DIR +
                '/internal_storage/default_models/fraktur/text_calamari/',
                meta.type()))
        # NOTE(review): this rebinds the `settings` parameter for the OCR
        # sub-predictor only; the original settings were already passed to
        # super().__init__ above — confirm this shadowing is intentional.
        settings = AlgorithmPredictorSettings(model=model, )
        settings.params.ctcDecoder.params.type = CTCDecoderParams.CTC_DEFAULT
        self.ocr_predictor = meta.create_predictor(settings)
コード例 #6
0
class Predictor(AlgorithmPredictor):
    """Suggests texts for a document by word-overlap similarity lookup."""

    @staticmethod
    def meta():
        return Meta

    def __init__(self, settings: AlgorithmPredictorSettings):
        """Store the target document id and build the similarity checker.

        Args:
            settings: predictor settings; ``params.documentId`` identifies
                the document to find similar texts for.
        """
        super().__init__(settings)
        self.document_id = settings.params.documentId
        self.document_similar_tester = SimilarDocumentChecker()
        # Word-level normalizer so the query text matches the checker's index.
        self.text_normalizer = LyricsNormalizationProcessor(
            LyricsNormalizationParams(LyricsNormalization.WORDS))

    @classmethod
    def unprocessed(cls, page: DatabasePage) -> bool:
        # Always run: there is no cached result that would allow skipping.
        return True

    def predict(
        self,
        pages: List[DatabasePage],
        callback: Optional[PredictionCallback] = None
    ) -> AlgorithmPredictorSettings if False else AlgorithmPredictionResultGenerator:
        """Yield one Result containing the texts of the 5 most similar documents."""
        book = pages[0].book
        documents = DatabaseBookDocuments().load(book)
        document: Document = documents.database_documents.get_document_by_id(
            self.document_id)
        text = document.get_text_of_document(book)
        words = self.text_normalizer.apply(text).split(' ')
        # Counter: document id -> number of shared words with this document.
        counts = self.document_similar_tester.check_word_based_similarity(words)
        # Texts of the five best-matching documents (renamed from the original
        # `count`, which was shadowed by its own loop variable).
        texts = [
            self.document_similar_tester.document_dict[key].get_text()
            for key, _ in counts.most_common(5)
        ]

        yield Result(texts)
コード例 #7
0
 def chars_only(s: str):
     """Normalize *s* with the dataset's configured lyrics normalization.

     NOTE(review): this references ``self`` although it is not declared as a
     method — presumably it is a nested closure inside a method of a class
     holding ``self.args``; confirm against the enclosing definition.
     """
     return LyricsNormalizationProcessor(self.args.global_args.dataset_params.lyrics_normalization).apply(s)
コード例 #8
0
 def __init__(self, settings: AlgorithmPredictorSettings):
     """Store the target document id and build the similarity checker.

     Args:
         settings: predictor settings; ``params.documentId`` identifies
             the document whose similar texts are requested.
     """
     super().__init__(settings)
     self.document_id = settings.params.documentId
     self.document_similar_tester = SimilarDocumentChecker()
     # Word-level normalizer so query text matches the checker's word index.
     self.text_normalizer = LyricsNormalizationProcessor(
         LyricsNormalizationParams(LyricsNormalization.WORDS))
コード例 #9
0
    def check_word_based_similarity(self, sentence):
        """Count, per document, how many words of *sentence* it contains.

        Each occurrence of a query word contributes at most one count per
        document (the per-word document list is deduplicated), but repeated
        query words contribute repeatedly — matching the original behavior.

        Args:
            sentence: iterable of (already normalized) words.

        Returns:
            collections.Counter mapping document id -> match count.
        """
        from collections import Counter
        counts = Counter()
        for word in sentence:
            if word in self.word_dict:
                # set() dedups the document list for this word so a document
                # is counted at most once per query word occurrence.
                counts.update(set(self.word_dict[word]))
        return counts


if __name__ == "__main__":
    # Demo / manual test: build the word index over all documents and query
    # it with a fixed sample sentence.
    # Maps a word to the list of document ids containing it.
    word_dict = defaultdict(list)
    document_dict = {}
    document_path = {}
    document_meta = {}
    # Word-level normalizer applied to every document's text before indexing.
    text_normalizer = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.WORDS))

    # NOTE(review): documents_gen, load_json, populate and
    # populate_look_up_dict are defined elsewhere in this module/project —
    # their contracts are not visible here.
    for x in documents_gen():
        b = load_json(x.document_meta)
        a = populate(x.data)

        document_meta[x.document_id] = b["dokumenten_id"]
        text = a.get_text(text_normalizer).split(" ")
        # Register every word of this document in the inverted index.
        populate_look_up_dict(text, x.document_id, word_dict)
        document_dict[x.document_id] = a
        document_path[x.document_id] = x

    # Query with a sample sentence; presumably a module-level two-argument
    # variant of check_word_based_similarity — confirm it exists in this file.
    counter = check_word_based_similarity(["lux", "aduenit", "ueneranda", "lux", "in", "chrois"], word_dict)
    for key, count in counter.most_common(10):
        print(key)
        print(count)