def __call__(self, chunk):
    """Return a copy of ``chunk`` followed by per-token printable chunks.

    Chunks that are not Japanese ``TextChunk`` instances are only duplicated.
    For Japanese text the copy is made non-printable and every token produced
    by ``self.tokenize`` contributes its first element as a silent, printable
    chunk; when a token has a second element it is added right after, wrapped
    in parentheses.

    :param chunk: the chunk to process.
    :return: list of chunks, starting with the duplicated input.
    """
    from src.Sequencer import TextChunk

    duplicate = self._duplicate_chunk(chunk)
    if not isinstance(duplicate, TextChunk) or duplicate.language != 'japanese':
        return [duplicate]

    # The tokens below replace the printed form of the original text.
    duplicate.printable = False
    pieces = [duplicate]
    for token in self.tokenize(duplicate.text):
        pieces.append(
            TextChunk(text=token[0], language='japanese', audible=False,
                      printable=True, final=True))
        if len(token) > 1:
            # presumably token[1] is the reading of token[0] -- TODO confirm
            pieces.append(
                TextChunk(text=f' ({token[1]}) ', language='japanese',
                          audible=False, printable=True, final=True))
    return pieces
def test_promote(self):
    """Smoke test: promoting a ``TextChunk`` to ``SpeechChunk`` must not raise."""
    from src.Sequencer import TextChunk, SpeechChunk

    text_chunk = TextChunk(
        text='財布の中に何もありません',
        language='japanese',
        audible=True,
        printable=True,
        final=False,
    )
    # No assertions on the outcome; only the call itself is exercised.
    text_chunk.promote(SpeechChunk, volume=75)
def __call__(self, chunk):
    """Split a text chunk into runs of a single language.

    The text is scanned character by character with ``self._get_language``.
    Characters for which it returns ``None`` stay attached to the run they
    appear in. A new run starts whenever a character's language differs from
    the language of the current run. Consecutive runs are separated by a
    ``'silence'`` jingle.

    For a ``SpeechChunk`` each run is produced with ``evolve`` so the other
    attributes are carried over; the voice is kept only when the run's
    language equals the language of the incoming chunk. Otherwise a plain
    ``TextChunk`` with just text and language is created.

    Non-text chunks are passed through as a duplicate. Text in which no
    language can be detected is emitted as one run in the chunk's own
    language instead of being dropped. Empty text yields an empty list.

    :param chunk: the chunk to split.
    :return: list of chunks (runs interleaved with silence jingles).
    """
    from src.Sequencer import TextChunk, SpeechChunk, JingleChunk

    if not isinstance(chunk, TextChunk):
        return [self._duplicate_chunk(chunk)]

    # Pass 1: collect (language, text) runs.
    runs = []
    current_language = None
    buffer = ''
    for char in chunk.text:
        language = self._get_language(char)
        if language is not None:
            if current_language is not None and language != current_language:
                runs.append((current_language, buffer))
                buffer = ''
            current_language = language
        buffer += char

    # Flush the tail after the loop so that single-character text, text whose
    # language only shows up on the last character, and a trailing character
    # in a new language are all handled uniformly.
    if buffer:
        if current_language is None:
            # Nothing detectable: keep the language the chunk came with.
            current_language = chunk.language
        runs.append((current_language, buffer))

    # Pass 2: turn the runs into chunks.
    is_speech = isinstance(chunk, SpeechChunk)
    result = []
    for index, (language, text) in enumerate(runs):
        if index:
            result.append(JingleChunk(jingle='silence'))
        if is_speech:
            voice = chunk.voice if chunk.language == language else None
            result.append(
                evolve(chunk, text=text, language=language, voice=voice))
        else:
            result.append(TextChunk(text=text, language=language))
    return result
def __call__(self, chunk):
    """Return a finalized copy of ``chunk`` plus its tokenized text.

    The copy is marked ``final``. The result of ``self.tokenize`` on its text
    is appended as a silent, printable Japanese ``TextChunk``.

    :param chunk: the chunk to process.
    :return: two-element list ``[copy, tokenized_chunk]``.
    """
    finalized = self._duplicate_chunk(chunk)
    finalized.final = True

    tokenized_text = self.tokenize(finalized.text)
    annotation = TextChunk(
        text=tokenized_text,
        language='japanese',
        audible=False,
        printable=True,
        final=True,
    )
    return [finalized, annotation]
def test_abbr(self):
    """The processed text must contain the expanded 'smb.' and 'smth.'."""
    from src.Sequencer import TextChunk, ChunkProcessor
    from src.filter.ExpandContractions import ExpandContractions
    from src.filter.StubFinalizer import StubFinalizer

    source = TextChunk(
        text='to listen up to smb. and smth.',
        language='english',
        audible=True,
        printable=True,
        final=False,
    )
    processor = ChunkProcessor(
        filters=[ExpandContractions(), StubFinalizer()])
    processed = processor.apply_filters(source)

    expanded_text = processed[0].text
    assert all(
        word in expanded_text for word in ('somebody', 'something'))
def search_excerpt(self, app_config, foreign_name, word):
    """Build a chunk sequence presenting an excerpt for ``word``.

    :param app_config: mapping; ``app_config['phraseExamples']`` must contain
        ``foreign_name`` for a lookup to happen.
    :param foreign_name: name of the foreign language.
    :param word: a single word; anything containing a space is rejected.
    :return: ``[excerpt jingle, silence, excerpt text, long silence]`` or
        ``None`` when no lookup is done or nothing is found.
    """
    if ' ' in word or foreign_name not in app_config['phraseExamples']:
        return None

    excerpt = self.get_excerpt(word, foreign_name)
    if excerpt is None:
        # Fall back to the lemmatized (stemmed) forms of the word.
        for lemma in self.lemmatize(word):
            excerpt = self.get_excerpt(lemma, foreign_name)
            if excerpt:
                break

    if not excerpt:
        return None

    return [
        JingleChunk(jingle='excerpt'),
        JingleChunk(jingle='silence'),
        TextChunk(text=excerpt, language=foreign_name),
        JingleChunk(jingle='silence_long'),
    ]
def test_split_mixed_languages(self):
    """Mixed English/Russian/Japanese text splits into three language runs.

    The runs are expected at even positions of the result, with a
    ``JingleChunk`` between each pair.
    """
    from src.Sequencer import TextChunk, ChunkProcessor, JingleChunk
    from src.filter.AddVoice import AddVoice
    from src.filter.SplitMixedLanguages import SplitMixedLanguages
    from src.filter.StubFinalizer import StubFinalizer

    mixed = TextChunk(
        text='hi there привет こんにちは',
        language='english',
        audible=True,
        printable=True,
        final=False,
    )
    processor = ChunkProcessor(
        filters=[SplitMixedLanguages(), AddVoice(), StubFinalizer()])
    result = processor.apply_filters(mixed)

    # ``None`` marks a position that must hold a jingle.
    expected = ['english', None, 'russian', None, 'japanese']
    for position, language in enumerate(expected):
        if language is None:
            self.assertIsInstance(result[position], JingleChunk)
        else:
            assert result[position].language == language
def __call__(self, chunk):
    """Return a copy of ``chunk``, optionally followed by a spelled-out form.

    When ``self._needs_process(text, language)`` is true, the copy is followed
    by an intro (long silence, ``'by_letter'`` jingle, silence) and then, for
    each character of the text, a short silence plus either a ``'space'``
    jingle (whitespace) or an audible, non-printable English chunk holding
    that single character.

    :param chunk: the chunk to process.
    :return: list of chunks, starting with the duplicated input.
    """
    from src.Sequencer import JingleChunk, TextChunk

    duplicate = self._duplicate_chunk(chunk)
    output = [duplicate]
    if not self._needs_process(duplicate.text, duplicate.language):
        return output

    for intro in ('silence_long', 'by_letter', 'silence'):
        output.append(JingleChunk(jingle=intro, printable=False))

    for character in duplicate.text:
        output.append(JingleChunk(jingle='silence_short', printable=False))
        if character.isspace():
            spelled = JingleChunk(jingle='space', printable=False)
        else:
            spelled = TextChunk(text=character, language='english',
                                audible=True, printable=False, final=True)
        output.append(spelled)
    return output
def __call__(self, chunk):
    """Return a copy of ``chunk`` followed by bracketed kanji explanations.

    Chunks that are not Japanese ``TextChunk`` instances are only duplicated.
    Otherwise, for every ``(kanji, ons, kuns, explanation)`` entry returned by
    ``self._get_explanations`` the output contains, in order: the printed
    kanji, a spoken ``'on'`` label with its readings, a spoken ``'koon'``
    label with its readings, a ``'definition'`` jingle and the explanation in
    English. The whole group is wrapped in printed ``[`` / ``]`` and followed
    by a long silence.

    :param chunk: the chunk to process.
    :return: list of chunks, starting with the duplicated input.
    """
    from src.Sequencer import TextChunk, JingleChunk

    def printed_only(text, **extra):
        # Shown on screen but never spoken.
        return TextChunk(text=text, audible=False, printable=True,
                         final=True, **extra)

    def spoken_label(text):
        # Spoken English label that is not shown on screen.
        return TextChunk(text=text, language='english', audible=True,
                         printable=False, final=True)

    def pause():
        return JingleChunk(jingle='silence')

    chunk = self._duplicate_chunk(chunk)
    result = [chunk]
    is_japanese_text = (
        isinstance(chunk, TextChunk) and chunk.language == 'japanese')
    if not is_japanese_text:
        return result

    explanations = self._get_explanations(chunk.text)
    result.append(printed_only('['))
    for kanji, ons, kuns, explanation in explanations:
        result.append(printed_only(kanji, language='japanese'))
        for label, readings in (('on', ons), ('koon', kuns)):
            result += [spoken_label(label), pause()]
            for reading in readings:
                result += [
                    TextChunk(text=reading, language='japanese',
                              audible=True, printable=True, final=True),
                    pause(),
                    printed_only('、'),
                ]
        result += [
            JingleChunk(jingle='definition'),
            TextChunk(text=explanation, language='english', audible=True,
                      printable=True, final=True),
        ]
    result.append(printed_only(']'))
    result.append(
        JingleChunk(jingle='silence_long', audible=False, printable=True,
                    final=True))
    return result
def example_sequence(foreign, native, pair):
    """Build the chunk sequence for one example pair.

    :param foreign: language assigned to ``pair[0]``.
    :param native: language assigned to ``pair[1]``.
    :param pair: indexable holding the two example strings; both are
        stripped of surrounding whitespace.
    :return: ``[first text, silence jingle, second text]``.
    """
    sequence = [TextChunk(text=pair[0].strip(), language=foreign)]
    sequence.append(JingleChunk(jingle='silence'))
    sequence.append(TextChunk(text=pair[1].strip(), language=native))
    return sequence
def chunk_factory(*, language, text):
    """Create a ``TextChunk`` from keyword-only ``language`` and ``text``."""
    new_chunk = TextChunk(language=language, text=text)
    return new_chunk
def test_chunk_preprocessing(self):
    """Smoke test: run several filters end to end; nothing is asserted.

    Each case pushes one ``TextChunk`` through a processor made of a single
    filter followed by ``StubFinalizer``. The test passes as long as no call
    raises.
    """
    from src.Sequencer import ChunkProcessor, TextChunk
    from src.filter.AddFurigana import AddFurigana
    from src.filter.ExplainJapaneseSentences import ExplainJapaneseSentences
    from src.filter.ExplainKanji import ExplainKanji
    from src.filter.PronounceByLetter import PronounceByLetter
    from src.filter.SimilarKanji import jp_reverse
    from src.filter.TidyUpEnglish import TidyUpEnglish
    from src.filter.StubFinalizer import StubFinalizer
    from src.filter.TidyUpText import TidyUpText

    def run_case(text, language, audible, filter_class):
        source = TextChunk(text=text, language=language, audible=audible,
                           printable=True, final=False)
        processor = ChunkProcessor(
            filters=[filter_class(), StubFinalizer()])
        return processor.apply_filters(source)

    run_case('"some guy\'s bad text {', 'english', True, TidyUpEnglish)
    run_case('知りません', 'japanese', False, ExplainKanji)
    run_case('faux pas', 'english', True, PronounceByLetter)
    run_case('財布の中に何もありません', 'japanese', True,
             ExplainJapaneseSentences)
    run_case('財布の中に何もありません', 'japanese', True, AddFurigana)

    jp_reverse('知')

    run_case('/in brackets/some_bad_formatting{going}here()', 'japanese',
             True, TidyUpText)