def test_is_single_level_true_not_empty(self):
    smf = SyncMapFragment()
    child = Tree(value=smf)
    tree = Tree()
    tree.add_child(child)
    syn = SyncMap(tree=tree)
    self.assertTrue(syn.is_single_level)
def _level_time_map_to_tree(self, text_file, time_map, tree=None, add_head_tail=True):
    """
    Convert a level time map into a Tree of SyncMapFragments.

    The time map is a list of pairs ``[start_time, end_time]``,
    of length equal to number of fragments + 2,
    where the two extra elements are for the HEAD (first) and TAIL (last).

    :param text_file: the text file object
    :type  text_file: :class:`~aeneas.textfile.TextFile`
    :param list time_map: the time map
    :param tree: the tree; if ``None``, a new Tree will be built
    :type  tree: :class:`~aeneas.tree.Tree`
    :rtype: :class:`~aeneas.tree.Tree`
    """
    if tree is None:
        tree = Tree()
    if add_head_tail:
        fragments = (
            [TextFragment(u"HEAD", self.task.configuration["language"], [u""])] +
            text_file.fragments +
            [TextFragment(u"TAIL", self.task.configuration["language"], [u""])]
        )
        i = 0
    else:
        fragments = text_file.fragments
        i = 1
    for fragment in fragments:
        interval = time_map[i]
        sm_frag = SyncMapFragment(fragment, interval[0], interval[1])
        tree.add_child(Tree(value=sm_frag))
        i += 1
    return tree
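# A worked example of the indexing above; the interval values are illustrative
# only and are not taken from the library. For a text file with two fragments
# and add_head_tail=True, the time map holds len(fragments) + 2 intervals.
example_time_map = [
    (0.0, 0.5),   # HEAD: consumed only when add_head_tail=True (i starts at 0)
    (0.5, 2.0),   # first text fragment
    (2.0, 3.5),   # second text fragment
    (3.5, 4.0),   # TAIL
]
# With add_head_tail=False the loop starts at i = 1, so the two real fragments
# are paired with example_time_map[1] and example_time_map[2], and the HEAD
# and TAIL slots are never read.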
def test_fragments_tree_not_empty(self):
    smf = SyncMapFragment()
    child = Tree(value=smf)
    tree = Tree()
    tree.add_child(child)
    syn = SyncMap(tree=tree)
    self.assertEqual(len(syn.fragments_tree), 1)
def test_is_single_level_false(self):
    smf2 = SyncMapFragment()
    child2 = Tree(value=smf2)
    smf = SyncMapFragment()
    child = Tree(value=smf)
    child.add_child(child2)
    tree = Tree()
    tree.add_child(child)
    syn = SyncMap(tree=tree)
    self.assertFalse(syn.is_single_level)
def __init__(self, tree=None, rconf=None, logger=None):
    if (tree is not None) and (not isinstance(tree, Tree)):
        raise TypeError(u"tree is not an instance of Tree")
    super(SyncMap, self).__init__(rconf=rconf, logger=logger)
    if tree is None:
        tree = Tree()
    self.fragments_tree = tree
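# Minimal usage sketch for the constructor above, based only on the tests in
# this collection; the import paths mirror the :class: references in the
# docstrings and should be treated as assumptions about the package layout.
from aeneas.syncmap import SyncMap
from aeneas.syncmap.fragment import SyncMapFragment
from aeneas.tree import Tree

tree = Tree()
tree.add_child(Tree(value=SyncMapFragment()))
syn = SyncMap(tree=tree)          # wraps the given tree
print(len(syn.fragments_tree))    # 1
empty = SyncMap()                 # builds an empty Tree internally
# SyncMap(tree="not a tree")      # would raise TypeError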
def test_has_zero_length_leaves(self):
    params = [
        ([("0.000", "0.000"), ("0.000", "0.000")], True),
        ([("0.000", "0.000"), ("0.000", "1.000")], True),
        ([("0.000", "1.000"), ("1.000", "1.000")], True),
        ([("0.000", "1.000"), ("1.000", "2.000")], False),
        ([("0.000", "0.000"), ("1.000", "1.000")], True),
        ([("0.000", "0.000"), ("1.000", "2.000")], True),
        ([("0.000", "1.000"), ("2.000", "2.000")], True),
        ([("0.000", "1.000"), ("2.000", "3.000")], False),
    ]
    for l, exp in params:
        tree = Tree()
        for b, e in l:
            interval = TimeInterval(begin=TimeValue(b), end=TimeValue(e))
            smf = SyncMapFragment(interval=interval)
            child = Tree(value=smf)
            tree.add_child(child, as_last=True)
        syn = SyncMap(tree=tree)
        self.assertEqual(syn.has_zero_length_leaves, exp)
def test_leaves_are_consistent(self):
    params = [
        ([("0.000", "0.000"), ("0.000", "0.000")], True),
        ([("0.000", "0.000"), ("0.000", "1.000")], True),
        ([("0.000", "1.000"), ("1.000", "1.000")], True),
        ([("0.000", "1.000"), ("1.000", "2.000")], True),
        ([("0.000", "0.000"), ("1.000", "1.000")], True),
        ([("0.000", "0.000"), ("1.000", "2.000")], True),
        ([("0.000", "1.000"), ("2.000", "2.000")], True),
        ([("0.000", "1.000"), ("2.000", "3.000")], True),
        ([("0.000", "1.000"), ("1.000", "1.000"), ("1.000", "2.000")], True),
        ([("0.000", "1.000"), ("1.000", "1.000"), ("2.000", "2.000")], True),
        ([("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "1.500")], True),
        ([("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "1.750")], True),
        ([("0.000", "1.000"), ("1.040", "2.000")], True),
        ([("0.000", "1.000"), ("0.000", "0.500")], False),
        ([("0.000", "1.000"), ("0.000", "1.000")], False),
        ([("0.000", "1.000"), ("0.000", "1.500")], False),
        ([("0.000", "1.000"), ("0.500", "0.500")], False),
        ([("0.000", "1.000"), ("0.500", "0.750")], False),
        ([("0.000", "1.000"), ("0.500", "1.000")], False),
        ([("0.000", "1.000"), ("0.500", "1.500")], False),
        ([("0.000", "1.000"), ("2.000", "2.000"), ("1.500", "2.500")], False),
        ([("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "2.500")], False),
        ([("0.000", "1.000"), ("0.960", "2.000")], False),
    ]
    for l, exp in params:
        tree = Tree()
        for b, e in l:
            interval = TimeInterval(begin=TimeValue(b), end=TimeValue(e))
            smf = SyncMapFragment(interval=interval)
            child = Tree(value=smf)
            tree.add_child(child, as_last=True)
        syn = SyncMap(tree=tree)
        self.assertEqual(syn.leaves_are_consistent, exp)
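# The expected values in the table above follow one rule: the leaves are
# consistent as long as no new interval cuts into an interval already present
# (sharing an endpoint, or a zero-length point outside existing intervals, is
# fine). A minimal sketch, assuming TimeValue and TimeInterval are importable
# from aeneas.exacttiming:
from aeneas.exacttiming import TimeInterval, TimeValue

a = TimeInterval(begin=TimeValue("0.000"), end=TimeValue("1.000"))
b = TimeInterval(begin=TimeValue("1.000"), end=TimeValue("2.000"))  # adjacent to a: consistent
c = TimeInterval(begin=TimeValue("0.500"), end=TimeValue("1.500"))  # overlaps a: inconsistent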
def test_add_child(self):
    root = Tree(value="root")
    child1 = Tree(value="child1")
    child2 = Tree(value="child2")
    root.add_child(child1)
    root.add_child(child2)
    self.assertEqual(len(root), 2)
    self.assertEqual(root.level, 0)
    self.assertEqual(root.height, 2)
    self.assertTrue(root.is_root)
    self.assertFalse(root.is_leaf)
    self.assertEqual(root.children, [child1, child2])
    self.assertEqual(root.vchildren, ["child1", "child2"])
    self.assertEqual(root.leaves, [child1, child2])
    self.assertEqual(root.vleaves, ["child1", "child2"])
    for node in [child1, child2]:
        self.assertEqual(len(node), 0)
        self.assertEqual(node.level, 1)
        self.assertTrue(node.is_leaf)
        self.assertFalse(node.is_root)
        self.assertEqual(node.children, [])
        self.assertEqual(node.vchildren, [])
def _select_levels(self, tree):
    """
    Select the correct levels in the tree,
    reading the ``os_task_file_levels``
    parameter in the Task configuration.

    If ``None`` or invalid, return the current sync map tree unchanged.
    Otherwise, return only the levels appearing in it.

    :param tree: a Tree of SyncMapFragments
    :type  tree: :class:`~aeneas.tree.Tree`
    :rtype: :class:`~aeneas.tree.Tree`
    """
    levels = self.task.configuration["o_levels"]
    self.log([u"Levels: '%s'", levels])
    if (levels is None) or (len(levels) < 1):
        return tree
    try:
        levels = [int(l) for l in levels if int(l) > 0]
        self.log([u"Converted levels: %s", levels])
    except ValueError:
        self.log_warn(u"Cannot convert levels to list of int, returning unchanged")
        return tree
    # remove head and tail nodes
    head = tree.vchildren[0]
    tail = tree.vchildren[-1]
    tree.remove_child(0)
    tree.remove_child(-1)
    # keep only the selected levels
    tree.keep_levels(levels)
    # add head and tail back
    tree.add_child(Tree(value=head), as_last=False)
    tree.add_child(Tree(value=tail), as_last=True)
    # return the new tree
    return tree
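# Sketch of the level-selection idea on a bare Tree, using only the Tree
# methods referenced above (add_child, keep_levels, vchildren); the two-level
# shape and the exact post-condition are assumptions for illustration.
from aeneas.tree import Tree

root = Tree()
for p in range(2):
    par = Tree(value=u"p%d" % p)                           # level 1 (e.g., paragraphs)
    root.add_child(par)
    for s in range(2):
        par.add_child(Tree(value=u"p%ds%d" % (p, s)))      # level 2 (e.g., sentences)

root.keep_levels([2])
# Expectation: only level-2 nodes survive and become direct children of the
# root, which is the shape _select_levels relies on before re-attaching the
# HEAD and TAIL fragments.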
def add_fragment(self, fragment, as_last=True):
    """
    Add the given sync map fragment,
    as the first or last child of the root node
    of the sync map tree.

    :param fragment: the sync map fragment to be added
    :type  fragment: :class:`~aeneas.syncmap.fragment.SyncMapFragment`
    :param bool as_last: if ``True``, append fragment; otherwise prepend it
    :raises: TypeError: if ``fragment`` is ``None`` or it is not an instance of
                        :class:`~aeneas.syncmap.fragment.SyncMapFragment`
    """
    if not isinstance(fragment, SyncMapFragment):
        self.log_exc(u"fragment is not an instance of SyncMapFragment", None, True, TypeError)
    self.fragments_tree.add_child(Tree(value=fragment), as_last=as_last)
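# Usage sketch for add_fragment, mirroring how the tests in this collection
# build fragments; import paths are assumptions based on the docstring
# references.
from aeneas.exacttiming import TimeInterval, TimeValue
from aeneas.syncmap import SyncMap
from aeneas.syncmap.fragment import SyncMapFragment

syn = SyncMap()
first = SyncMapFragment(interval=TimeInterval(begin=TimeValue("0.000"), end=TimeValue("1.000")))
second = SyncMapFragment(interval=TimeInterval(begin=TimeValue("1.000"), end=TimeValue("2.000")))
syn.add_fragment(first)                   # appended as the last child
syn.add_fragment(second, as_last=False)   # prepended as the first child
# syn.add_fragment("not a fragment")      # would raise TypeError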
def test_empty(self):
    root = Tree()
    self.assertEqual(len(root), 0)
    self.assertEqual(root.level, 0)
    self.assertEqual(root.height, 1)
    self.assertIsNone(root.value)
    self.assertTrue(root.is_root)
    self.assertTrue(root.is_leaf)
    self.assertTrue(root.is_empty)
    self.assertTrue(root.is_pleasant)
    self.assertEqual(root.children, [])
    self.assertEqual(root.subtree, [root])
    self.assertEqual(root.leaves, [root])
    self.assertEqual(root.vleaves, [None])
    self.assertEqual(root.leaves_not_empty, [])
    self.assertEqual(root.vleaves_not_empty, [])
def append_fragment_list_to_sync_root(self, sync_root):
    """
    Append the sync map fragment list
    to the given node from a sync map tree.

    :param sync_root: the root of the sync map tree to which the new nodes should be appended
    :type  sync_root: :class:`~aeneas.tree.Tree`
    """
    if not isinstance(sync_root, Tree):
        self.log_exc(u"sync_root is not a Tree object", None, True, TypeError)
    self.log(u"Appending fragment list to sync root...")
    for fragment in self.smflist:
        sync_root.add_child(Tree(value=fragment))
    self.log(u"Appending fragment list to sync root... done")
def _execute_single_level_task(self):
    """ Execute a single-level task """
    self.log(u"Executing single level task...")
    try:
        # load audio file, extract MFCCs from real wave, clear audio file
        self._step_begin(u"extract MFCC real wave")
        real_wave_mfcc = self._extract_mfcc(
            file_path=self.task.audio_file_path_absolute,
            file_format=None,
        )
        self._step_end()

        # compute head and/or tail and set it
        self._step_begin(u"compute head tail")
        (head_length, process_length, tail_length) = self._compute_head_process_tail(real_wave_mfcc)
        real_wave_mfcc.set_head_middle_tail(head_length, process_length, tail_length)
        self._step_end()

        # compute alignment, outputting a tree of time intervals
        self._set_synthesizer()
        sync_root = Tree()
        self._execute_inner(
            real_wave_mfcc,
            self.task.text_file,
            sync_root=sync_root,
            force_aba_auto=False,
            log=True,
            leaf_level=True,
        )
        self._clear_cache_synthesizer()

        # create syncmap and add it to task
        self._step_begin(u"create sync map")
        self._create_sync_map(sync_root=sync_root)
        self._step_end()

        # log total
        self._step_total()
        self.log(u"Executing single level task... done")
    except Exception as exc:
        self._step_failure(exc)
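# The method above is normally reached through the public API rather than
# called directly. A sketch of the documented aeneas workflow; the file paths
# are placeholders and the exact configuration string is an assumption.
from aeneas.executetask import ExecuteTask
from aeneas.task import Task

config = u"task_language=eng|is_text_type=plain|os_task_file_format=json"
task = Task(config_string=config)
task.audio_file_path_absolute = u"/path/to/audio.mp3"
task.text_file_path_absolute = u"/path/to/text.txt"
ExecuteTask(task).execute()    # single-level for plain text, multi-level for mplain/munparsed
print(task.sync_map)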
def test_set_parent(self):
    root = Tree(value="root")
    new_root = Tree(value="newroot")
    root.parent = new_root
    self.assertIsNotNone(root.parent)
    self.assertFalse(root.is_root)
def create_tree1(self, soon=True):
    root = Tree(value="root")
    c1 = Tree(value="c1")
    c11 = Tree(value="c11")
    c111 = Tree(value="c111")
    c1111 = Tree(value="c1111")
    c1112 = Tree(value="c1112")
    c1113 = Tree(value="c1113")
    if soon:
        root.add_child(c1)
        c1.add_child(c11)
        c11.add_child(c111)
        c111.add_child(c1111)
        c111.add_child(c1112)
        c111.add_child(c1113)
    else:
        c111.add_child(c1111)
        c111.add_child(c1112)
        c111.add_child(c1113)
        c11.add_child(c111)
        c1.add_child(c11)
        root.add_child(c1)
    return (root, c1, c11, c111, c1111, c1112, c1113)
def test_fragments_tree_empty(self):
    tree = Tree()
    syn = SyncMap(tree=tree)
    self.assertEqual(len(syn.fragments_tree), 0)
def test_unicode(self):
    root = Tree(value="root")
    s = root.__unicode__()
    self.assertIsNotNone(s)
def test_add_child_not_tree(self):
    root = Tree(value="root")
    with self.assertRaises(TypeError):
        root.add_child("bad child")
def test_str(self):
    root = Tree(value="root")
    s = root.__str__()
    self.assertIsNotNone(s)
def test_parent(self):
    root = Tree(value="root")
    self.assertIsNone(root.parent)
    self.assertTrue(root.is_root)
def create_tree2(self):
    root = Tree(value="r")
    c1 = Tree(value="c1")
    c2 = Tree(value="c2")
    c3 = Tree(value="c3")
    c4 = Tree(value="c4")
    c11 = Tree(value="c11")
    c12 = Tree(value="c12")
    c13 = Tree(value="c13")
    c21 = Tree(value="c21")
    c22 = Tree(value="c22")
    c23 = Tree(value="c23")
    c24 = Tree(value="c24")
    c25 = Tree(value="c25")
    c231 = Tree(value="c231")
    c232 = Tree(value="c232")
    root.add_child(c1)
    root.add_child(c2)
    root.add_child(c3)
    root.add_child(c4)
    c1.add_child(c11)
    c1.add_child(c12)
    c1.add_child(c13)
    c2.add_child(c21)
    c2.add_child(c22)
    c2.add_child(c23)
    c2.add_child(c24)
    c2.add_child(c25)
    c23.add_child(c231)
    c23.add_child(c232)
    return (root, c1, c11, c12, c13, c2, c21, c22, c23, c231, c232, c24, c25, c3, c4)
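# A compact sketch of the Tree properties these fixtures exercise, using only
# attributes that appear in the assertions above (level, height, leaves,
# vleaves, subtree); the expected values follow the conventions shown in the
# tests.
from aeneas.tree import Tree

root = Tree(value="root")
child = Tree(value="child")
grandchild = Tree(value="grandchild")
root.add_child(child)
child.add_child(grandchild)

print(root.height)         # 3: three levels, root included
print(grandchild.level)    # 2: depth below the root (root is level 0)
print(root.vleaves)        # ["grandchild"]: values of the leaf nodes
print(len(root.subtree))   # 3: all nodes of the subtree, root included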
def _execute_multi_level_task(self):
    """ Execute a multi-level task """
    self.log(u"Executing multi level task...")

    self.log(u"Saving rconf...")
    # save original rconf
    orig_rconf = self.rconf.clone()
    # clone rconfs and set granularity
    # TODO the following code assumes 3 levels: generalize this
    level_rconfs = [
        None,
        self.rconf.clone(),
        self.rconf.clone(),
        self.rconf.clone()
    ]
    level_mfccs = [None, None, None, None]
    force_aba_autos = [None, False, False, True]
    for i in range(1, len(level_rconfs)):
        level_rconfs[i].set_granularity(i)
        self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
        self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
        self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
        level_rconfs[i].set_tts(i)
        self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
        self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
    self.log(u"Saving rconf... done")

    try:
        self.log(u"Creating AudioFile object...")
        audio_file = self._load_audio_file()
        self.log(u"Creating AudioFile object... done")

        # extract MFCC for each level
        for i in range(1, len(level_rconfs)):
            self._step_begin(u"extract MFCC real wave level %d" % i)
            if (i == 1) or \
                    (level_rconfs[i].mws != level_rconfs[i - 1].mws) or \
                    (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                self.rconf = level_rconfs[i]
                level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
            else:
                self.log(u"Keeping MFCC real wave from previous level")
                level_mfccs[i] = level_mfccs[i - 1]
            self._step_end()

        self.log(u"Clearing AudioFile object...")
        self.rconf = level_rconfs[1]
        self._clear_audio_file(audio_file)
        self.log(u"Clearing AudioFile object... done")

        # compute head tail for the entire real wave (level 1)
        self._step_begin(u"compute head tail")
        (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
        level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
        self._step_end()

        # compute alignment at each level
        sync_root = Tree()
        sync_roots = [sync_root]
        text_files = [self.task.text_file]
        number_levels = len(level_rconfs)
        for i in range(1, number_levels):
            self._step_begin(u"compute alignment level %d" % i)
            self.rconf = level_rconfs[i]
            text_files, sync_roots = self._execute_level(
                level=i,
                audio_file_mfcc=level_mfccs[i],
                text_files=text_files,
                sync_roots=sync_roots,
                force_aba_auto=force_aba_autos[i],
            )
            self._step_end()

        # restore original rconf, and create syncmap and add it to task
        self._step_begin(u"create sync map")
        self.rconf = orig_rconf
        self._create_sync_map(sync_root=sync_root)
        self._step_end()

        self._step_total()
        self.log(u"Executing multi level task... done")
    except Exception as exc:
        self._step_failure(exc)
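# Sketch of the per-level configuration cloning used above, assuming
# RuntimeConfiguration is importable from aeneas.runtimeconfiguration; only
# methods and attributes referenced in this function are used.
from aeneas.runtimeconfiguration import RuntimeConfiguration

base = RuntimeConfiguration()
level_rconfs = [None] + [base.clone() for _ in range(3)]   # the code assumes 3 levels
for i in range(1, len(level_rconfs)):
    level_rconfs[i].set_granularity(i)    # 1 = coarsest level, 3 = finest
    print(i, level_rconfs[i].mwl, level_rconfs[i].mws)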
class TextFile(Loggable):
    """
    A tree of text fragments, representing a text file.

    :param string file_path: the path to the text file.
                             If not ``None`` (and also ``file_format`` is not ``None``),
                             the file will be read immediately.
    :param file_format: the format of the text file
    :type  file_format: :class:`~aeneas.textfile.TextFileFormat`
    :param dict parameters: additional parameters used to parse the text file
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: OSError: if ``file_path`` cannot be read
    :raises: TypeError: if ``parameters`` is not an instance of ``dict``
    :raises: ValueError: if ``file_format`` value is not allowed
    """

    DEFAULT_ID_FORMAT = u"f%06d"

    TAG = u"TextFile"

    def __init__(self, file_path=None, file_format=None, parameters=None, rconf=None, logger=None):
        super(TextFile, self).__init__(rconf=rconf, logger=logger)
        self.file_path = file_path
        self.file_format = file_format
        self.parameters = {} if parameters is None else parameters
        self.fragments_tree = Tree()
        if (self.file_path is not None) and (self.file_format is not None):
            self._read_from_file()

    def __len__(self):
        return len(self.fragments)

    def __unicode__(self):
        msg = []
        if self.fragments_tree is not None:
            for node in self.fragments_tree.pre:
                if not node.is_empty:
                    indent = u" " * 2 * (node.level - 1)
                    msg.append(u"%s%s" % (indent, node.value.__unicode__()))
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def fragments_tree(self):
        """
        Return the current tree of fragments.

        :rtype: :class:`~aeneas.tree.Tree`
        """
        return self.__fragments_tree

    @fragments_tree.setter
    def fragments_tree(self, fragments_tree):
        self.__fragments_tree = fragments_tree

    @property
    def children_not_empty(self):
        """
        Return the direct not empty children of the root of the fragments tree,
        as ``TextFile`` objects.

        :rtype: list of :class:`~aeneas.textfile.TextFile`
        """
        children = []
        for child_node in self.fragments_tree.children_not_empty:
            child_text_file = self.get_subtree(child_node)
            child_text_file.set_language(child_node.value.language)
            children.append(child_text_file)
        return children

    @property
    def file_path(self):
        """
        The path of the text file.

        :rtype: string
        """
        return self.__file_path

    @file_path.setter
    def file_path(self, file_path):
        if (file_path is not None) and (not gf.file_can_be_read(file_path)):
            self.log_exc(u"Text file '%s' cannot be read" % (file_path), None, True, OSError)
        self.__file_path = file_path

    @property
    def file_format(self):
        """
        The format of the text file.

        :rtype: :class:`~aeneas.textfile.TextFileFormat`
        """
        return self.__file_format

    @file_format.setter
    def file_format(self, file_format):
        if (file_format is not None) and (file_format not in TextFileFormat.ALLOWED_VALUES):
            self.log_exc(u"Text file format '%s' is not allowed" % (file_format), None, True, ValueError)
        self.__file_format = file_format

    @property
    def parameters(self):
        """
        Additional parameters used to parse the text file.

        :rtype: dict
        """
        return self.__parameters

    @parameters.setter
    def parameters(self, parameters):
        if (parameters is not None) and (not isinstance(parameters, dict)):
            self.log_exc(u"parameters is not an instance of dict", None, True, TypeError)
        self.__parameters = parameters

    @property
    def chars(self):
        """
        Return the number of characters of the text file,
        not counting line or fragment separators.

        :rtype: int
        """
        return sum([fragment.chars for fragment in self.fragments])

    @property
    def characters(self):
        """
        The number of characters in this text file.

        :rtype: int
        """
        chars = 0
        for fragment in self.fragments:
            chars += fragment.characters
        return chars

    @property
    def fragments(self):
        """
        The current list of text fragments
        which are the children of the root node
        of the text file tree.

        :rtype: list of :class:`~aeneas.textfile.TextFragment`
        """
        return self.fragments_tree.vchildren_not_empty

    def add_fragment(self, fragment, as_last=True):
        """
        Add the given text fragment as the first or last child
        of the root node of the text file tree.

        :param fragment: the text fragment to be added
        :type  fragment: :class:`~aeneas.textfile.TextFragment`
        :param bool as_last: if ``True`` append fragment, otherwise prepend it
        """
        if not isinstance(fragment, TextFragment):
            self.log_exc(u"fragment is not an instance of TextFragment", None, True, TypeError)
        self.fragments_tree.add_child(Tree(value=fragment), as_last=as_last)

    def get_subtree(self, root):
        """
        Return a new :class:`~aeneas.textfile.TextFile` object,
        rooted at the given node ``root``.

        :param root: the root node
        :type  root: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        if not isinstance(root, Tree):
            self.log_exc(u"root is not an instance of Tree", None, True, TypeError)
        new_text_file = TextFile()
        new_text_file.fragments_tree = root
        return new_text_file

    def get_slice(self, start=None, end=None):
        """
        Return a new list of text fragments,
        indexed from start (included) to end (excluded).

        :param int start: the start index, included
        :param int end: the end index, excluded
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        if start is not None:
            start = min(max(0, start), len(self) - 1)
        else:
            start = 0
        if end is not None:
            end = min(max(0, end), len(self))
            end = max(end, start + 1)
        else:
            end = len(self)
        new_text = TextFile()
        for fragment in self.fragments[start:end]:
            new_text.add_fragment(fragment)
        return new_text

    def set_language(self, language):
        """
        Set the given language for all the text fragments.

        :param language: the language of the text fragments
        :type  language: :class:`~aeneas.language.Language`
        """
        self.log([u"Setting language: '%s'", language])
        for fragment in self.fragments:
            fragment.language = language

    def clear(self):
        """
        Clear the text file, removing all the current fragments.
        """
        self.log(u"Clearing text fragments")
        self.fragments_tree = Tree()

    def read_from_list(self, lines):
        """
        Read text fragments from a given list of strings::

            [fragment_1, fragment_2, ..., fragment_n]

        :param list lines: the text fragments
        """
        self.log(u"Reading text fragments from list")
        self._read_plain(lines)

    def read_from_list_with_ids(self, lines):
        """
        Read text fragments from a given list of tuples::

            [(id_1, text_1), (id_2, text_2), ..., (id_n, text_n)].

        :param list lines: the list of ``[id, text]`` fragments (see above)
        """
        self.log(u"Reading text fragments from list with ids")
        self._create_text_fragments([(line[0], [line[1]]) for line in lines])

    def _read_from_file(self):
        """
        Read text fragments from file.
        """
        # test if we can read the given file
        if not gf.file_can_be_read(self.file_path):
            self.log_exc(u"File '%s' cannot be read" % (self.file_path), None, True, OSError)
        if self.file_format not in TextFileFormat.ALLOWED_VALUES:
            self.log_exc(u"Text file format '%s' is not supported." % (self.file_format), None, True, ValueError)
        # read the contents of the file
        self.log([u"Reading contents of file '%s'", self.file_path])
        with io.open(self.file_path, "r", encoding="utf-8") as text_file:
            lines = text_file.readlines()
        # clear text fragments
        self.clear()
        # parse the contents
        map_read_function = {
            TextFileFormat.MPLAIN: self._read_mplain,
            TextFileFormat.MUNPARSED: self._read_munparsed,
            TextFileFormat.PARSED: self._read_parsed,
            TextFileFormat.PLAIN: self._read_plain,
            TextFileFormat.SUBTITLES: self._read_subtitles,
            TextFileFormat.UNPARSED: self._read_unparsed
        }
        map_read_function[self.file_format](lines)
        # log the number of fragments
        self.log([u"Parsed %d fragments", len(self.fragments)])

    def _mplain_word_separator(self):
        """
        Get the word separator to split words in mplain format.

        :rtype: string
        """
        word_separator = gf.safe_get(self.parameters, gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR, u" ")
        if (word_separator is None) or (word_separator == "space"):
            return u" "
        elif word_separator == "equal":
            return u"="
        elif word_separator == "pipe":
            return u"|"
        elif word_separator == "tab":
            return u"\u0009"
        return word_separator

    def _read_mplain(self, lines):
        """
        Read text fragments from a multilevel format text file.

        :param list lines: the lines of the subtitles text file
        """
        self.log(u"Parsing fragments from subtitles text format")
        word_separator = self._mplain_word_separator()
        self.log([u"Word separator is: '%s'", word_separator])
        lines = [line.strip() for line in lines]
        pairs = []
        i = 1
        current = 0
        tree = Tree()
        while current < len(lines):
            line_text = lines[current]
            if len(line_text) > 0:
                sentences = [line_text]
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    sentences.append(lines[following])
                    following += 1
                # here sentences holds the sentences for this paragraph

                # create paragraph node
                paragraph_identifier = u"p%06d" % i
                paragraph_lines = [u" ".join(sentences)]
                paragraph_fragment = TextFragment(
                    identifier=paragraph_identifier,
                    lines=paragraph_lines,
                    filtered_lines=paragraph_lines
                )
                paragraph_node = Tree(value=paragraph_fragment)
                tree.add_child(paragraph_node)
                self.log([u"Paragraph %s", paragraph_identifier])

                # create sentences nodes
                j = 1
                for s in sentences:
                    sentence_identifier = paragraph_identifier + u"s%06d" % j
                    sentence_lines = [s]
                    sentence_fragment = TextFragment(
                        identifier=sentence_identifier,
                        lines=sentence_lines,
                        filtered_lines=sentence_lines
                    )
                    sentence_node = Tree(value=sentence_fragment)
                    paragraph_node.add_child(sentence_node)
                    j += 1
                    self.log([u"  Sentence %s", sentence_identifier])

                    # create words nodes
                    k = 1
                    for w in [w for w in s.split(word_separator) if len(w) > 0]:
                        word_identifier = sentence_identifier + u"w%06d" % k
                        word_lines = [w]
                        word_fragment = TextFragment(
                            identifier=word_identifier,
                            lines=word_lines,
                            filtered_lines=word_lines
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        k += 1
                        self.log([u"    Word %s", word_identifier])

                # keep iterating
                current = following
                i += 1
            current += 1
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def nodes_at_level(root, level):
            """ Return a dict with the bs4 filter parameters """
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            indent = u" " * 2 * (level - 1)
            self.log([u"%sRegex for %s: '%s'", indent, attribute_name, regex_string])
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.findAll(attrs={attribute_name: regex})

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract according to class_regex and id_regex
        text_from_id = {}
        ids = []
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id: '%s'", l1_id])
                l1_text = []
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id: '%s'", l2_id])
                    l2_text = []
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id: '%s'", l3_id])
                        self.log([u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(
                            identifier=l3_id,
                            lines=[l3_text],
                            filtered_lines=[l3_text]
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text]
                    )
                    self.log([u"  Found l2 node with text: '%s'" % sentence_text])
                if has_word:
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text]
                    )
                    tree.add_child(paragraph_node)
                    self.log([u"Found l1 node with text: '%s'" % paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # append to fragments
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_subtitles(self, lines):
        """
        Read text fragments from a subtitles format text file.

        :param list lines: the lines of the subtitles text file
        :raises: ValueError: if the id regex is not valid
        """
        self.log(u"Parsing fragments from subtitles text format")
        id_format = self._get_id_format()
        lines = [line.strip() for line in lines]
        pairs = []
        i = 1
        current = 0
        while current < len(lines):
            line_text = lines[current]
            if len(line_text) > 0:
                fragment_lines = [line_text]
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    fragment_lines.append(lines[following])
                    following += 1
                identifier = id_format % i
                pairs.append((identifier, fragment_lines))
                current = following
                i += 1
            current += 1
        self._create_text_fragments(pairs)

    def _read_parsed(self, lines):
        """
        Read text fragments from a parsed format text file.

        :param list lines: the lines of the parsed text file
        :param dict parameters: additional parameters for parsing
                                (e.g., class/id regex strings)
        """
        self.log(u"Parsing fragments from parsed text format")
        pairs = []
        for line in lines:
            pieces = line.split(gc.PARSED_TEXT_SEPARATOR)
            if len(pieces) == 2:
                identifier = pieces[0].strip()
                text = pieces[1].strip()
                if len(identifier) > 0:
                    pairs.append((identifier, [text]))
        self._create_text_fragments(pairs)

    def _read_plain(self, lines):
        """
        Read text fragments from a plain format text file.

        :param list lines: the lines of the plain text file
        :param dict parameters: additional parameters for parsing
                                (e.g., class/id regex strings)
        :raises: ValueError: if the id regex is not valid
        """
        self.log(u"Parsing fragments from plain text format")
        id_format = self._get_id_format()
        lines = [line.strip() for line in lines]
        pairs = []
        i = 1
        for line in lines:
            identifier = id_format % i
            text = line.strip()
            pairs.append((identifier, [text]))
            i += 1
        self._create_text_fragments(pairs)

    def _read_unparsed(self, lines):
        """
        Read text fragments from an unparsed format text file.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def filter_attributes():
            """ Return a dict with the bs4 filter parameters """
            attributes = {}
            for attribute_name, filter_name in [
                ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX)
            ]:
                if filter_name in self.parameters:
                    regex_string = self.parameters[filter_name]
                    if regex_string is not None:
                        self.log([u"Regex for %s: '%s'", attribute_name, regex_string])
                        regex = re.compile(r".*\b" + regex_string + r"\b.*")
                        attributes[attribute_name] = regex
            return attributes

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract according to class_regex and id_regex
        text_from_id = {}
        ids = []
        filter_attributes = filter_attributes()
        self.log([u"Finding elements matching attributes '%s'", filter_attributes])
        nodes = soup.findAll(attrs=filter_attributes)
        for node in nodes:
            try:
                f_id = gf.safe_unicode(node["id"])
                f_text = gf.safe_unicode(node.text)
                text_from_id[f_id] = f_text
                ids.append(f_id)
            except KeyError:
                self.log_warn(u"KeyError while parsing a node")
        # sort by ID as requested
        id_sort = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", id_sort])
        sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)
        # append to fragments
        self.log(u"Appending fragments")
        self._create_text_fragments([(key, [text_from_id[key]]) for key in sorted_ids])

    def _get_id_format(self):
        """ Return the id regex from the parameters """
        id_format = gf.safe_get(
            self.parameters,
            gc.PPN_TASK_OS_FILE_ID_REGEX,
            self.DEFAULT_ID_FORMAT,
            can_return_none=False
        )
        try:
            identifier = id_format % 1
        except (TypeError, ValueError) as exc:
            self.log_exc(u"String '%s' is not a valid id format" % (id_format), exc, True, ValueError)
        return id_format

    def _create_text_fragments(self, pairs):
        """
        Create text fragment objects and append them to this list.

        :param list pairs: a list of pairs, each pair being (id, [line_1, ..., line_n])
        """
        self.log(u"Creating TextFragment objects")
        text_filter = self._build_text_filter()
        for pair in pairs:
            self.add_fragment(
                TextFragment(
                    identifier=pair[0],
                    lines=pair[1],
                    filtered_lines=text_filter.apply_filter(pair[1])
                )
            )

    def _build_text_filter(self):
        """ Build a suitable TextFilter object. """
        text_filter = TextFilter(logger=self.logger)
        self.log(u"Created TextFilter object")
        for key, cls, param_name in [
            (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, TextFilterIgnoreRegex, "regex"),
            (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, TextFilterTransliterate, "map_file_path")
        ]:
            cls_name = cls.__name__
            param_value = gf.safe_get(self.parameters, key, None)
            if param_value is not None:
                self.log([u"Creating %s object...", cls_name])
                params = {
                    param_name: param_value,
                    "logger": self.logger
                }
                try:
                    inner_filter = cls(**params)
                    text_filter.add_filter(inner_filter)
                    self.log([u"Creating %s object... done", cls_name])
                except ValueError as exc:
                    self.log_exc(u"Creating %s object failed" % (cls_name), exc, False, None)
        return text_filter
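# Minimal usage sketch for the class above, building a TextFile in memory
# instead of reading it from disk; the import path mirrors the docstring
# references and is an assumption.
from aeneas.textfile import TextFile, TextFragment

tf = TextFile()
tf.read_from_list([u"Hello world.", u"A second fragment."])
print(len(tf))     # 2, via the plain-text reader and DEFAULT_ID_FORMAT ids
print(tf.chars)    # total characters, separators excluded

tf.add_fragment(
    TextFragment(identifier=u"f000003", lines=[u"Third."], filtered_lines=[u"Third."]),
    as_last=True,
)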
def _execute_multi_level_task(self):
    """ Execute a multi-level task """
    self.log(u"Executing multi level task...")

    self.log(u"Saving rconf...")
    # save original rconf
    orig_rconf = self.rconf.clone()
    # clone rconfs and set granularity
    level_rconfs = [
        None,
        self.rconf.clone(),
        self.rconf.clone(),
        self.rconf.clone()
    ]
    level_mfccs = [None, None, None, None]
    for i in range(1, len(level_rconfs)):
        level_rconfs[i].set_granularity(i)
        self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
    self.log(u"Saving rconf... done")

    try:
        self.log(u"Creating AudioFile object...")
        audio_file = self._load_audio_file()
        self.log(u"Creating AudioFile object... done")

        # extract MFCC for each level
        for i in range(1, len(level_rconfs)):
            self._step_begin(u"extract MFCC real wave level %d" % i)
            if (i == 1) or \
                    (level_rconfs[i].mws != level_rconfs[i - 1].mws) or \
                    (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                self.rconf = level_rconfs[i]
                level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
            else:
                self.log(u"Keeping MFCC real wave from previous level")
                level_mfccs[i] = level_mfccs[i - 1]
            self._step_end()

        self.log(u"Clearing AudioFile object...")
        self.rconf = level_rconfs[1]
        self._clear_audio_file(audio_file)
        self.log(u"Clearing AudioFile object... done")

        # compute head tail for the entire real wave (level 1)
        self._step_begin(u"compute head tail")
        (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
        level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
        self._step_end()

        # compute alignment at each level
        tree = Tree()
        sync_roots = [tree]
        text_files = [self.task.text_file]
        aht = [None, True, False, False]
        aba = [None, True, True, False]
        for i in range(1, len(level_rconfs)):
            self._step_begin(u"compute alignment level %d" % i)
            text_files, sync_roots = self._execute_level(
                i,
                level_rconfs[i],
                level_mfccs[i],
                text_files,
                sync_roots,
                aht[i],
                aba[i]
            )
            self._step_end()

        self._step_begin(u"select levels")
        tree = self._select_levels(tree)
        self._step_end()

        self._step_begin(u"create sync map")
        self.rconf = orig_rconf
        self.task.sync_map = self._create_syncmap(tree)
        self._step_end()

        self._step_begin(u"check zero duration")
        self._check_no_zero(level_rconfs[-1].mws)
        self._step_end()

        self._step_total()
        self.log(u"Executing multi level task... done")
    except Exception as exc:
        self._step_failure(exc)
def test_value(self):
    root = Tree(value="root")
    self.assertIsNotNone(root.value)
    self.assertFalse(root.is_empty)
    self.assertEqual(root.vleaves, ["root"])