def test_is_single_level_true_not_empty(self):
     """A sync map whose root has a single leaf child is single level."""
     root = Tree()
     root.add_child(Tree(value=SyncMapFragment()))
     sync_map = SyncMap(tree=root)
     self.assertTrue(sync_map.is_single_level)
Exemple #2
0
    def _level_time_map_to_tree(self, text_file, time_map, tree=None, add_head_tail=True):
        """
        Build (or extend) a Tree holding one SyncMapFragment child
        per text fragment, timed according to the given time map.

        The time map is a list of pairs ``[start_time, end_time]``
        with one entry per fragment plus two extra entries:
        the first one for the HEAD and the last one for the TAIL.

        :param text_file: the text file object
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param list time_map: the time map
        :param tree: the tree; if ``None``, a new Tree will be built
        :type  tree: :class:`~aeneas.tree.Tree`
        :param bool add_head_tail: if ``True``, synthetic ``HEAD`` and ``TAIL``
                                   fragments are placed around the text fragments
                                   and the time map is read from its first entry;
                                   otherwise only the text fragments are added and
                                   the first (HEAD) entry of the time map is skipped
        :rtype: :class:`~aeneas.tree.Tree`
        """
        target = Tree() if tree is None else tree
        if add_head_tail:
            head = TextFragment(u"HEAD", self.task.configuration["language"], [u""])
            tail = TextFragment(u"TAIL", self.task.configuration["language"], [u""])
            timed_fragments = [head] + text_file.fragments + [tail]
            first_index = 0
        else:
            timed_fragments = text_file.fragments
            # entry 0 of the time map belongs to the HEAD, which is not added here
            first_index = 1
        for index, fragment in enumerate(timed_fragments, first_index):
            span = time_map[index]
            node = Tree(value=SyncMapFragment(fragment, span[0], span[1]))
            target.add_child(node)
        return target
 def test_fragments_tree_not_empty(self):
     """The fragments tree built on a one-child root has length 1."""
     root = Tree()
     root.add_child(Tree(value=SyncMapFragment()))
     sync_map = SyncMap(tree=root)
     self.assertEqual(len(sync_map.fragments_tree), 1)
Exemple #4
0
 def __init__(self,
              file_path=None,
              file_format=None,
              parameters=None,
              rconf=None,
              logger=None):
     """
     Store the given settings, start from an empty fragments tree and,
     when both ``file_path`` and ``file_format`` are given,
     read the file immediately.

     :param file_path: the path to the text file
     :param file_format: the format of the text file
     :param parameters: additional parsing parameters;
                        ``None`` is replaced by an empty dict
     :param rconf: a runtime configuration, passed to the parent class
     :param logger: the logger object, passed to the parent class
     """
     super(TextFile, self).__init__(rconf=rconf, logger=logger)
     self.file_path = file_path
     self.file_format = file_format
     if parameters is None:
         parameters = {}
     self.parameters = parameters
     self.fragments_tree = Tree()
     # nothing to read unless both the path and the format are known
     if (self.file_path is None) or (self.file_format is None):
         return
     self._read_from_file()
 def test_is_single_level_false(self):
     """A sync map whose root has a child with its own child is not single level."""
     grandchild = Tree(value=SyncMapFragment())
     child = Tree(value=SyncMapFragment())
     child.add_child(grandchild)
     root = Tree()
     root.add_child(child)
     sync_map = SyncMap(tree=root)
     self.assertFalse(sync_map.is_single_level)
Exemple #6
0
 def __init__(self, tree=None, rconf=None, logger=None):
     """
     Create a sync map on top of the given tree of fragments.

     :param tree: the tree to use; if ``None``, a new empty Tree is created
     :param rconf: a runtime configuration, passed to the parent class
     :param logger: the logger object, passed to the parent class
     :raises: TypeError: if ``tree`` is neither ``None`` nor a Tree
     """
     # validate before touching the parent class, as the type check comes first
     if not (tree is None or isinstance(tree, Tree)):
         raise TypeError(u"tree is not an instance of Tree")
     super(SyncMap, self).__init__(rconf=rconf, logger=logger)
     self.fragments_tree = Tree() if tree is None else tree
 def test_has_zero_length_leaves(self):
     """
     Check ``has_zero_length_leaves`` on two-leaf sync maps:
     each case lists the (begin, end) of the leaves and the expected value.
     """
     cases = [
         ([("0.000", "0.000"), ("0.000", "0.000")], True),
         ([("0.000", "0.000"), ("0.000", "1.000")], True),
         ([("0.000", "1.000"), ("1.000", "1.000")], True),
         ([("0.000", "1.000"), ("1.000", "2.000")], False),
         ([("0.000", "0.000"), ("1.000", "1.000")], True),
         ([("0.000", "0.000"), ("1.000", "2.000")], True),
         ([("0.000", "1.000"), ("2.000", "2.000")], True),
         ([("0.000", "1.000"), ("2.000", "3.000")], False),
     ]
     for spans, expected in cases:
         root = Tree()
         for begin, end in spans:
             fragment = SyncMapFragment(
                 interval=TimeInterval(begin=TimeValue(begin), end=TimeValue(end)))
             root.add_child(Tree(value=fragment), as_last=True)
         sync_map = SyncMap(tree=root)
         self.assertEqual(sync_map.has_zero_length_leaves, expected)
Exemple #8
0
    def _level_time_map_to_tree(self,
                                text_file,
                                time_map,
                                tree=None,
                                add_head_tail=True):
        """
        Turn a level time map into children of a Tree,
        each child holding a SyncMapFragment.

        The time map is a list of pairs ``[start_time, end_time]``;
        its length is the number of fragments + 2, because
        its first element is for the HEAD and its last one for the TAIL.

        :param text_file: the text file object
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param list time_map: the time map
        :param tree: the tree; if ``None``, a new Tree will be built
        :type  tree: :class:`~aeneas.tree.Tree`
        :param bool add_head_tail: if ``True``, ``HEAD`` and ``TAIL`` fragments
                                   are created and added before and after the
                                   text fragments; if ``False``, the HEAD entry
                                   of the time map is skipped and only the text
                                   fragments are added
        :rtype: :class:`~aeneas.tree.Tree`
        """
        result = Tree() if tree is None else tree
        if not add_head_tail:
            # skip the time map entry reserved for the HEAD
            offset = 1
            to_add = text_file.fragments
        else:
            offset = 0
            head = TextFragment(u"HEAD", self.task.configuration["language"],
                                [u""])
            tail = TextFragment(u"TAIL", self.task.configuration["language"],
                                [u""])
            to_add = [head] + text_file.fragments + [tail]
        for position, fragment in enumerate(to_add, offset):
            pair = time_map[position]
            result.add_child(
                Tree(value=SyncMapFragment(fragment, pair[0], pair[1])))
        return result
 def test_leaves_are_consistent(self):
     """
     Check ``leaves_are_consistent`` on sync maps built from lists of
     (begin, end) leaves, first those expected to be consistent,
     then those expected not to be.
     """
     consistent = [
         [("0.000", "0.000"), ("0.000", "0.000")],
         [("0.000", "0.000"), ("0.000", "1.000")],
         [("0.000", "1.000"), ("1.000", "1.000")],
         [("0.000", "1.000"), ("1.000", "2.000")],
         [("0.000", "0.000"), ("1.000", "1.000")],
         [("0.000", "0.000"), ("1.000", "2.000")],
         [("0.000", "1.000"), ("2.000", "2.000")],
         [("0.000", "1.000"), ("2.000", "3.000")],
         [("0.000", "1.000"), ("1.000", "1.000"), ("1.000", "2.000")],
         [("0.000", "1.000"), ("1.000", "1.000"), ("2.000", "2.000")],
         [("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "1.500")],
         [("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "1.750")],
         [("0.000", "1.000"), ("1.040", "2.000")],
     ]
     inconsistent = [
         [("0.000", "1.000"), ("0.000", "0.500")],
         [("0.000", "1.000"), ("0.000", "1.000")],
         [("0.000", "1.000"), ("0.000", "1.500")],
         [("0.000", "1.000"), ("0.500", "0.500")],
         [("0.000", "1.000"), ("0.500", "0.750")],
         [("0.000", "1.000"), ("0.500", "1.000")],
         [("0.000", "1.000"), ("0.500", "1.500")],
         [("0.000", "1.000"), ("2.000", "2.000"), ("1.500", "2.500")],
         [("0.000", "1.000"), ("2.000", "3.000"), ("1.500", "2.500")],
         [("0.000", "1.000"), ("0.960", "2.000")],
     ]
     cases = [(spans, True) for spans in consistent]
     cases += [(spans, False) for spans in inconsistent]
     for spans, expected in cases:
         root = Tree()
         for begin, end in spans:
             fragment = SyncMapFragment(
                 interval=TimeInterval(begin=TimeValue(begin), end=TimeValue(end)))
             root.add_child(Tree(value=fragment), as_last=True)
         sync_map = SyncMap(tree=root)
         self.assertEqual(sync_map.leaves_are_consistent, expected)
Exemple #10
0
 def test_add_child(self):
     """Adding two children to a root updates the root and leaves the children as leaves."""
     root = Tree(value="root")
     kids = [Tree(value="child1"), Tree(value="child2")]
     for kid in kids:
         root.add_child(kid)
     # the root sees both children
     self.assertEqual(len(root), 2)
     self.assertEqual(root.level, 0)
     self.assertEqual(root.height, 2)
     self.assertTrue(root.is_root)
     self.assertFalse(root.is_leaf)
     self.assertEqual(root.children, kids)
     self.assertEqual(root.vchildren, ["child1", "child2"])
     self.assertEqual(root.leaves, kids)
     self.assertEqual(root.vleaves, ["child1", "child2"])
     # each child is a level-1 leaf without children
     for kid in kids:
         self.assertEqual(len(kid), 0)
         self.assertEqual(kid.level, 1)
         self.assertTrue(kid.is_leaf)
         self.assertFalse(kid.is_root)
         self.assertEqual(kid.children, [])
         self.assertEqual(kid.vchildren, [])
Exemple #11
0
    def add_fragment(self, fragment, as_last=True):
        """
        Wrap the given text fragment in a new node and attach it
        as the first or last child of the root of the text file tree.

        A fragment that is not a TextFragment is reported
        through ``log_exc`` with ``TypeError``.

        :param fragment: the text fragment to be added
        :type  fragment: :class:`~aeneas.textfile.TextFragment`
        :param bool as_last: if ``True`` append fragment, otherwise prepend it
        """
        is_text_fragment = isinstance(fragment, TextFragment)
        if not is_text_fragment:
            self.log_exc(u"fragment is not an instance of TextFragment", None,
                         True, TypeError)
        node = Tree(value=fragment)
        self.fragments_tree.add_child(node, as_last=as_last)
Exemple #12
0
    def _select_levels(self, tree):
        """
        Select the correct levels in the tree,
        reading the ``o_levels`` entry of the Task configuration.

        If the levels are ``None``, empty, or cannot be converted
        to integers, return the given tree unchanged.
        Otherwise keep only the requested (strictly positive) levels;
        the first and last children of the root (the head and tail nodes)
        are detached before the selection and re-attached afterwards.

        :param tree: a Tree of SyncMapFragments
        :type  tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.tree.Tree`
        """
        levels = self.task.configuration["o_levels"]
        self.log([u"Levels: '%s'", levels])
        if (levels is None) or (len(levels) < 1):
            return tree
        try:
            # convert each entry exactly once
            converted = [int(level) for level in levels]
        except (TypeError, ValueError):
            # TypeError covers entries such as None, which int() rejects
            # with a different exception than malformed strings
            self.log_warn(
                u"Cannot convert levels to list of int, returning unchanged")
            return tree
        levels = [level for level in converted if level > 0]
        self.log([u"Converted levels: %s", levels])
        # remove head and tail nodes
        head = tree.vchildren[0]
        tail = tree.vchildren[-1]
        tree.remove_child(0)
        tree.remove_child(-1)
        # keep only the selected levels
        tree.keep_levels(levels)
        # add head and tail back
        tree.add_child(Tree(value=head), as_last=False)
        tree.add_child(Tree(value=tail), as_last=True)
        # return the new tree
        return tree
Exemple #13
0
 def __init__(
         self,
         file_path=None,
         file_format=None,
         parameters=None,
         rconf=None,
         logger=None
     ):
     """
     Set up the text file with an empty fragments tree;
     the file is read right away only if both
     ``file_path`` and ``file_format`` are not ``None``.

     :param file_path: the path to the text file
     :param file_format: the format of the text file
     :param parameters: additional parsing parameters;
                        an empty dict is used if ``None``
     :param rconf: a runtime configuration, passed to the parent class
     :param logger: the logger object, passed to the parent class
     """
     super(TextFile, self).__init__(rconf=rconf, logger=logger)
     self.file_path = file_path
     self.file_format = file_format
     self.parameters = parameters if parameters is not None else {}
     self.fragments_tree = Tree()
     can_read = (self.file_path is not None) and (self.file_format is not None)
     if can_read:
         self._read_from_file()
Exemple #14
0
    def add_fragment(self, fragment, as_last=True):
        """
        Wrap the given sync map fragment in a new node and attach it
        as the first or last child of the root node of the sync map tree.

        :param fragment: the sync map fragment to be added
        :type  fragment: :class:`~aeneas.syncmap.fragment.SyncMapFragment`
        :param bool as_last: if ``True``, append fragment; otherwise prepend it
        :raises: TypeError: if ``fragment`` is ``None`` or
                            it is not an instance of :class:`~aeneas.syncmap.fragment.SyncMapFragment`
        """
        is_sync_map_fragment = isinstance(fragment, SyncMapFragment)
        if not is_sync_map_fragment:
            self.log_exc(u"fragment is not an instance of SyncMapFragment",
                         None, True, TypeError)
        node = Tree(value=fragment)
        self.fragments_tree.add_child(node, as_last=as_last)
Exemple #15
0
 def test_empty(self):
     """A Tree built without arguments is an empty, childless root that is its own only leaf."""
     lone = Tree()
     # size and position
     self.assertEqual(len(lone), 0)
     self.assertEqual(lone.level, 0)
     self.assertEqual(lone.height, 1)
     # value and flags
     self.assertIsNone(lone.value)
     self.assertTrue(lone.is_root)
     self.assertTrue(lone.is_leaf)
     self.assertTrue(lone.is_empty)
     self.assertTrue(lone.is_pleasant)
     # views over the (empty) structure
     self.assertEqual(lone.children, [])
     self.assertEqual(lone.subtree, [lone])
     self.assertEqual(lone.leaves, [lone])
     self.assertEqual(lone.vleaves, [None])
     self.assertEqual(lone.leaves_not_empty, [])
     self.assertEqual(lone.vleaves_not_empty, [])
Exemple #16
0
    def append_fragment_list_to_sync_root(self, sync_root):
        """
        Add every fragment of ``self.smflist``, each wrapped in a new node,
        as a child of the given node of a sync map tree.

        A ``sync_root`` that is not a Tree is reported
        through ``log_exc`` with ``TypeError``.

        :param sync_root: the root of the sync map tree to which the new nodes should be appended
        :type  sync_root: :class:`~aeneas.tree.Tree`
        """
        is_tree = isinstance(sync_root, Tree)
        if not is_tree:
            self.log_exc(u"sync_root is not a Tree object", None, True,
                         TypeError)

        self.log(u"Appending fragment list to sync root...")
        for smf in self.smflist:
            node = Tree(value=smf)
            sync_root.add_child(node)
        self.log(u"Appending fragment list to sync root... done")
Exemple #17
0
    def _execute_single_level_task(self):
        """
        Execute a single-level task.

        The steps are: extract the MFCCs of the real wave, compute and set
        the head/process/tail lengths, compute the alignment into a new
        sync map tree, and create the sync map from that tree.
        Any exception raised along the way is passed to ``_step_failure``.
        """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute,
                file_format=None,
            )
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment, outputting a tree of time intervals
            # (sync_root starts empty and is passed to _execute_inner to be filled)
            self._set_synthesizer()
            sync_root = Tree()
            self._execute_inner(real_wave_mfcc,
                                self.task.text_file,
                                sync_root=sync_root,
                                force_aba_auto=False,
                                log=True,
                                leaf_level=True)
            self._clear_cache_synthesizer()

            # create syncmap and add it to task
            self._step_begin(u"create sync map")
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            # every failure, whatever the step, is handled in one place
            self._step_failure(exc)
Exemple #18
0
 def test_add_child(self):
     """Adding two children to a root updates the root and leaves the children as leaves."""
     root = Tree(value="root")
     kids = [Tree(value="child1"), Tree(value="child2")]
     for kid in kids:
         root.add_child(kid)
     # the root sees both children
     self.assertEqual(len(root), 2)
     self.assertEqual(root.level, 0)
     self.assertEqual(root.height, 2)
     self.assertTrue(root.is_root)
     self.assertFalse(root.is_leaf)
     self.assertEqual(root.children, kids)
     self.assertEqual(root.vchildren, ["child1", "child2"])
     self.assertEqual(root.leaves, kids)
     self.assertEqual(root.vleaves, ["child1", "child2"])
     # each child is a level-1 leaf without children
     for kid in kids:
         self.assertEqual(len(kid), 0)
         self.assertEqual(kid.level, 1)
         self.assertTrue(kid.is_leaf)
         self.assertFalse(kid.is_root)
         self.assertEqual(kid.children, [])
         self.assertEqual(kid.vchildren, [])
Exemple #19
0
 def test_set_parent(self):
     """A node that has been assigned a parent is no longer a root."""
     adopter = Tree(value="newroot")
     node = Tree(value="root")
     node.parent = adopter
     self.assertIsNotNone(node.parent)
     self.assertFalse(node.is_root)
Exemple #20
0
    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements are then selected level by level (l1, l2, l3) by matching
        their ``id`` attribute against the regex stored in ``self.parameters``
        for that level. The resulting tree of l1 / l2 / l3 nodes is stored
        in ``self.fragments_tree``; an l1 node under which no l3 node
        is found is skipped.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def nodes_at_level(root, level):
            """
            Return the nodes found under ``root`` whose ``id`` attribute
            matches the regex configured for the given level (1, 2 or 3).
            """
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            indent = u" " * 2 * (level - 1)
            self.log([
                u"%sRegex for %s: '%s'", indent, attribute_name, regex_string
            ])
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.findAll(attrs={attribute_name: regex})

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract according to the id regex of each level
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            # becomes True as soon as one l3 node is found under this l1 node
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    # the sentence node is attached now and given its value
                    # once the text of all its words has been collected
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log(
                            [u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(identifier=l3_id,
                                                     lines=[l3_text],
                                                     filtered_lines=[l3_text])
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text])
                    # pass the text as a log argument, as the other log calls
                    # do, instead of pre-formatting it into the format string
                    self.log(
                        [u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text])
                    tree.add_child(paragraph_node)
                    self.log(
                        [u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                # a failed key lookup while handling this l1 node
                # only drops the node: parsing goes on with the next one
                self.log_warn(u"KeyError while parsing a l1 node")
        # append to fragments
        self.log(u"Storing tree")
        self.fragments_tree = tree
Exemple #21
0
    def _read_mplain(self, lines):
        """
        Read text fragments from a multilevel format text file.

        After stripping each line, every run of consecutive non-empty lines
        is a paragraph, every line of the run is a sentence, and every
        non-empty piece obtained by splitting a sentence on the word
        separator is a word. The resulting paragraph / sentence / word tree
        is stored in ``self.fragments_tree``.

        :param list lines: the lines of the multilevel text file
        """
        # NOTE(review): this message says "subtitles" although this method
        # parses the multilevel format; the text is left as is because
        # it is emitted at runtime
        self.log(u"Parsing fragments from subtitles text format")
        word_separator = self._mplain_word_separator()
        self.log([u"Word separator is: '%s'", word_separator])
        lines = [line.strip() for line in lines]
        i = 1
        current = 0
        tree = Tree()
        while current < len(lines):
            line_text = lines[current]
            if len(line_text) > 0:
                sentences = [line_text]
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    sentences.append(lines[following])
                    following += 1

                # here sentences holds the sentences for this paragraph

                # create paragraph node
                paragraph_identifier = u"p%06d" % i
                paragraph_lines = [u" ".join(sentences)]
                paragraph_fragment = TextFragment(
                    identifier=paragraph_identifier,
                    lines=paragraph_lines,
                    filtered_lines=paragraph_lines)
                paragraph_node = Tree(value=paragraph_fragment)
                tree.add_child(paragraph_node)
                self.log([u"Paragraph %s", paragraph_identifier])

                # create sentences nodes
                for j, sentence in enumerate(sentences, 1):
                    sentence_identifier = paragraph_identifier + u"s%06d" % j
                    sentence_lines = [sentence]
                    sentence_fragment = TextFragment(
                        identifier=sentence_identifier,
                        lines=sentence_lines,
                        filtered_lines=sentence_lines)
                    sentence_node = Tree(value=sentence_fragment)
                    paragraph_node.add_child(sentence_node)
                    self.log([u"  Sentence %s", sentence_identifier])

                    # create words nodes, ignoring the empty pieces
                    # produced by consecutive separators
                    words = [
                        w for w in sentence.split(word_separator)
                        if len(w) > 0
                    ]
                    for k, word in enumerate(words, 1):
                        word_identifier = sentence_identifier + u"w%06d" % k
                        word_lines = [word]
                        word_fragment = TextFragment(
                            identifier=word_identifier,
                            lines=word_lines,
                            filtered_lines=word_lines)
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        self.log([u"    Word %s", word_identifier])

                # keep iterating: following is either the empty line closing
                # this paragraph or the end of the input
                current = following
                i += 1
            current += 1
        self.log(u"Storing tree")
        self.fragments_tree = tree
Exemple #22
0
 def create_tree1(self, soon=True):
     """
     Build the chain root -> c1 -> c11 -> c111, with c111 having the
     three children c1111, c1112 and c1113, and return all the nodes.

     :param bool soon: if ``True``, link the nodes from the root downwards;
                       otherwise attach the leaves first and link upwards
     :rtype: tuple
     """
     root, c1, c11, c111, c1111, c1112, c1113 = [
         Tree(value=name)
         for name in ("root", "c1", "c11", "c111", "c1111", "c1112", "c1113")
     ]
     spine = [(root, c1), (c1, c11), (c11, c111)]
     leaves = [(c111, c1111), (c111, c1112), (c111, c1113)]
     links = spine + leaves if soon else leaves + spine[::-1]
     for parent, child in links:
         parent.add_child(child)
     return (root, c1, c11, c111, c1111, c1112, c1113)
Exemple #23
0
 def test_set_parent(self):
     """A node that has been assigned a parent is no longer a root."""
     adopter = Tree(value="newroot")
     node = Tree(value="root")
     node.parent = adopter
     self.assertIsNotNone(node.parent)
     self.assertFalse(node.is_root)
 def test_fragments_tree_empty(self):
     """The fragments tree of a sync map built on an empty Tree has length 0."""
     sync_map = SyncMap(tree=Tree())
     self.assertEqual(len(sync_map.fragments_tree), 0)
Exemple #25
0
 def test_unicode(self):
     """``__unicode__()`` of a node with a value does not return ``None``."""
     node = Tree(value="root")
     self.assertIsNotNone(node.__unicode__())
Exemple #26
0
 def test_add_child_not_tree(self):
     """Adding a child that is not a Tree raises TypeError."""
     parent = Tree(value="root")
     self.assertRaises(TypeError, parent.add_child, "bad child")
Exemple #27
0
 def create_tree1(self, soon=True):
     """
     Build the chain root -> c1 -> c11 -> c111, with c111 having the
     three children c1111, c1112 and c1113, and return all the nodes.

     :param bool soon: if ``True``, link the nodes from the root downwards;
                       otherwise attach the leaves first and link upwards
     :rtype: tuple
     """
     root, c1, c11, c111, c1111, c1112, c1113 = [
         Tree(value=name)
         for name in ("root", "c1", "c11", "c111", "c1111", "c1112", "c1113")
     ]
     spine = [(root, c1), (c1, c11), (c11, c111)]
     leaves = [(c111, c1111), (c111, c1112), (c111, c1113)]
     links = spine + leaves if soon else leaves + spine[::-1]
     for parent, child in links:
         parent.add_child(child)
     return (root, c1, c11, c111, c1111, c1112, c1113)
Exemple #28
0
 def test_str(self):
     """``__str__()`` of a node with a value does not return ``None``."""
     node = Tree(value="root")
     self.assertIsNotNone(node.__str__())
Exemple #29
0
 def test_str(self):
     """``__str__()`` of a node with a value does not return ``None``."""
     node = Tree(value="root")
     self.assertIsNotNone(node.__str__())
Exemple #30
0
 def test_unicode(self):
     """``__unicode__()`` of a node with a value does not return ``None``."""
     node = Tree(value="root")
     self.assertIsNotNone(node.__unicode__())
Exemple #31
0
    def _read_mplain(self, lines):
        """
        Read text fragments from a multilevel format text file.

        After stripping each line, every run of consecutive non-empty lines
        is a paragraph, every line of the run is a sentence, and every
        non-empty piece obtained by splitting a sentence on the word
        separator is a word. The resulting paragraph / sentence / word tree
        is stored in ``self.fragments_tree``.

        :param list lines: the lines of the multilevel text file
        """
        # NOTE(review): this message says "subtitles" although this method
        # parses the multilevel format; the text is left as is because
        # it is emitted at runtime
        self.log(u"Parsing fragments from subtitles text format")
        word_separator = self._mplain_word_separator()
        self.log([u"Word separator is: '%s'", word_separator])
        lines = [line.strip() for line in lines]
        i = 1
        current = 0
        tree = Tree()
        while current < len(lines):
            line_text = lines[current]
            if len(line_text) > 0:
                sentences = [line_text]
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    sentences.append(lines[following])
                    following += 1

                # here sentences holds the sentences for this paragraph

                # create paragraph node
                paragraph_identifier = u"p%06d" % i
                paragraph_lines = [u" ".join(sentences)]
                paragraph_fragment = TextFragment(
                    identifier=paragraph_identifier,
                    lines=paragraph_lines,
                    filtered_lines=paragraph_lines
                )
                paragraph_node = Tree(value=paragraph_fragment)
                tree.add_child(paragraph_node)
                self.log([u"Paragraph %s", paragraph_identifier])

                # create sentences nodes
                for j, sentence in enumerate(sentences, 1):
                    sentence_identifier = paragraph_identifier + u"s%06d" % j
                    sentence_lines = [sentence]
                    sentence_fragment = TextFragment(
                        identifier=sentence_identifier,
                        lines=sentence_lines,
                        filtered_lines=sentence_lines
                    )
                    sentence_node = Tree(value=sentence_fragment)
                    paragraph_node.add_child(sentence_node)
                    self.log([u"  Sentence %s", sentence_identifier])

                    # create words nodes, ignoring the empty pieces
                    # produced by consecutive separators
                    words = [w for w in sentence.split(word_separator) if len(w) > 0]
                    for k, word in enumerate(words, 1):
                        word_identifier = sentence_identifier + u"w%06d" % k
                        word_lines = [word]
                        word_fragment = TextFragment(
                            identifier=word_identifier,
                            lines=word_lines,
                            filtered_lines=word_lines
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        self.log([u"    Word %s", word_identifier])

                # keep iterating: following is either the empty line closing
                # this paragraph or the end of the input
                current = following
                i += 1
            current += 1
        self.log(u"Storing tree")
        self.fragments_tree = tree
Exemple #32
0
 def test_add_child_not_tree(self):
     """Adding a child that is not a Tree raises TypeError."""
     parent = Tree(value="root")
     self.assertRaises(TypeError, parent.add_child, "bad child")
Exemple #33
0
 def test_parent(self):
     """A freshly created node has no parent and is a root."""
     orphan = Tree(value="root")
     self.assertIsNone(orphan.parent)
     self.assertTrue(orphan.is_root)
Exemple #34
0
    def create_tree2(self):
        """
        Build a tree where the root ``r`` has children c1..c4,
        c1 has children c11..c13, c2 has children c21..c25,
        and c23 has children c231 and c232; return all the nodes.

        :rtype: tuple
        """
        def make(*values):
            """ Return one new Tree per given value """
            return [Tree(value=value) for value in values]

        root = Tree(value="r")
        c1, c2, c3, c4 = make("c1", "c2", "c3", "c4")
        c11, c12, c13 = make("c11", "c12", "c13")
        c21, c22, c23, c24, c25 = make("c21", "c22", "c23", "c24", "c25")
        c231, c232 = make("c231", "c232")

        # (parent, children) pairs, attached in this order
        family = [
            (root, [c1, c2, c3, c4]),
            (c1, [c11, c12, c13]),
            (c2, [c21, c22, c23, c24, c25]),
            (c23, [c231, c232]),
        ]
        for parent, children in family:
            for child in children:
                parent.add_child(child)
        return (
            root, c1, c11, c12, c13,
            c2, c21, c22, c23, c231, c232, c24, c25,
            c3, c4
        )
Exemple #35
0
 def clear(self):
     """
     Clear the text file, removing all the current fragments.

     The current fragments tree is replaced by a new, empty Tree.
     """
     self.log(u"Clearing text fragments")
     # a brand new tree is assigned: the previous one is not modified
     self.fragments_tree = Tree()
Exemple #36
0
    def _execute_multi_level_task(self):
        """
        Execute a multi-level task.

        One clone of the runtime configuration is prepared per level
        (three levels are assumed). The MFCCs of the real wave are extracted
        for each level, reusing those of the previous level when ``mws`` and
        ``mwl`` did not change. The head/process/tail lengths are computed on
        the level 1 MFCCs, the alignment is computed level by level, and
        finally the sync map is created from the resulting tree.

        ``self.rconf`` is replaced by the per-level clones while the task
        runs; the original runtime configuration is put back before the
        sync map is created and, in any case, when this method exits,
        even if a step failed.
        Any exception raised by a step is passed to ``_step_failure``.
        """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # TODO the following code assumes 3 levels: generalize this
        # (index 0 is a placeholder, so that index i refers to level i)
        level_rconfs = [
            None,
            self.rconf.clone(),
            self.rconf.clone(),
            self.rconf.clone()
        ]
        level_mfccs = [None, None, None, None]
        force_aba_autos = [None, False, False, True]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")
        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (
                        level_rconfs[i].mws != level_rconfs[i - 1].mws) or (
                            level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment at each level
            sync_root = Tree()
            sync_roots = [sync_root]
            text_files = [self.task.text_file]
            number_levels = len(level_rconfs)
            for i in range(1, number_levels):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    level=i,
                    audio_file_mfcc=level_mfccs[i],
                    text_files=text_files,
                    sync_roots=sync_roots,
                    force_aba_auto=force_aba_autos[i],
                )
                self._step_end()

            # restore original rconf, and create syncmap and add it to task
            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            self._step_failure(exc)
        finally:
            # do not leave a per-level rconf installed if a step failed
            # before the original one was restored
            self.rconf = orig_rconf
Exemple #37
0
    def create_tree2(self):
        """
        Build a tree where the root ``r`` has children c1..c4,
        c1 has children c11..c13, c2 has children c21..c25,
        and c23 has children c231 and c232; return all the nodes.

        :rtype: tuple
        """
        def make(*values):
            """ Return one new Tree per given value """
            return [Tree(value=value) for value in values]

        root = Tree(value="r")
        c1, c2, c3, c4 = make("c1", "c2", "c3", "c4")
        c11, c12, c13 = make("c11", "c12", "c13")
        c21, c22, c23, c24, c25 = make("c21", "c22", "c23", "c24", "c25")
        c231, c232 = make("c231", "c232")

        # (parent, children) pairs, attached in this order
        family = [
            (root, [c1, c2, c3, c4]),
            (c1, [c11, c12, c13]),
            (c2, [c21, c22, c23, c24, c25]),
            (c23, [c231, c232]),
        ]
        for parent, children in family:
            for child in children:
                parent.add_child(child)
        return (
            root, c1, c11, c12, c13,
            c2, c21, c22, c23, c231, c232, c24, c25,
            c3, c4
        )
Exemple #38
0
class TextFile(Loggable):
    """
    A tree of text fragments, representing a text file.

    :param string file_path: the path to the text file.
                             If not ``None`` (and also ``file_format`` is not ``None``),
                             the file will be read immediately.
    :param file_format: the format of the text file
    :type  file_format: :class:`~aeneas.textfile.TextFileFormat`
    :param dict parameters: additional parameters used to parse the text file
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: OSError: if ``file_path`` cannot be read
    :raises: TypeError: if ``parameters`` is not an instance of ``dict``
    :raises: ValueError: if ``file_format`` value is not allowed
    """

    DEFAULT_ID_FORMAT = u"f%06d"

    TAG = u"TextFile"

    def __init__(
            self,
            file_path=None,
            file_format=None,
            parameters=None,
            rconf=None,
            logger=None
        ):
        """
        Store the given settings and, when both ``file_path`` and
        ``file_format`` are given, read the file immediately.
        """
        super(TextFile, self).__init__(rconf=rconf, logger=logger)
        # these assignments go through the validating property setters
        self.file_path = file_path
        self.file_format = file_format
        if parameters is None:
            parameters = {}
        self.parameters = parameters
        self.fragments_tree = Tree()
        # without both a path and a format there is nothing to parse yet
        if (self.file_path is None) or (self.file_format is None):
            return
        self._read_from_file()

    def __len__(self):
        return len(self.fragments)

    def __unicode__(self):
        msg = []
        if self.fragments_tree is not None:
            for node in self.fragments_tree.pre:
                if not node.is_empty:
                    indent = u" " * 2 * (node.level - 1)
                    msg.append(u"%s%s" % (indent, node.value.__unicode__()))
        return u"\n".join(msg)

    def __str__(self):
        """
        Return the ``__unicode__`` representation
        converted with ``gf.safe_str``.
        """
        as_unicode = self.__unicode__()
        return gf.safe_str(as_unicode)

    @property
    def fragments_tree(self):
        """
        Return the current tree of fragments.

        The setter stores the given object as-is,
        without validation or copying.

        :rtype: :class:`~aeneas.tree.Tree`
        """
        return self.__fragments_tree
    @fragments_tree.setter
    def fragments_tree(self, fragments_tree):
        # plain assignment: the caller is trusted to pass a Tree
        self.__fragments_tree = fragments_tree

    @property
    def children_not_empty(self):
        """
        Return one ``TextFile`` per node listed by
        the ``children_not_empty`` attribute of the fragments tree.

        Each returned object is rooted at the corresponding node
        (see :func:`get_subtree`) and has the language of that node's
        fragment propagated to its own fragments.

        :rtype: list of :class:`~aeneas.textfile.TextFile`
        """
        def as_text_file(node):
            """ Wrap a node and push its language down to its fragments """
            text_file = self.get_subtree(node)
            text_file.set_language(node.value.language)
            return text_file

        return [
            as_text_file(node)
            for node in self.fragments_tree.children_not_empty
        ]

    @property
    def chars(self):
        """
        Return the number of characters of the text file,
        not counting line or fragment separators.

        :rtype: int
        """
        return sum([fragment.chars for fragment in self.fragments])

    @property
    def file_path(self):
        """
        The path of the text file.

        :rtype: string
        """
        return self.__file_path
    @file_path.setter
    def file_path(self, file_path):
        if (file_path is not None) and (not gf.file_can_be_read(file_path)):
            self.log_exc(u"Text file '%s' cannot be read" % (file_path), None, True, OSError)
        self.__file_path = file_path

    @property
    def file_format(self):
        """
        The format of the text file.

        :rtype: :class:`~aeneas.textfile.TextFileFormat`
        """
        return self.__file_format
    @file_format.setter
    def file_format(self, file_format):
        if (file_format is not None) and (file_format not in TextFileFormat.ALLOWED_VALUES):
            self.log_exc(u"Text file format '%s' is not allowed" % (file_format), None, True, ValueError)
        self.__file_format = file_format

    @property
    def parameters(self):
        """
        Additional parameters used to parse the text file.

        :rtype: dict
        """
        return self.__parameters
    @parameters.setter
    def parameters(self, parameters):
        if (parameters is not None) and (not isinstance(parameters, dict)):
            self.log_exc(u"parameters is not an instance of dict", None, True, TypeError)
        self.__parameters = parameters

    @property
    def characters(self):
        """
        The number of characters in this text file.

        :rtype: int
        """
        chars = 0
        for fragment in self.fragments:
            chars += fragment.characters
        return chars

    @property
    def fragments(self):
        """
        The current list of text fragments
        which are the children of the root node
        of the text file tree.

        This is whatever the ``vchildren_not_empty`` attribute of the
        fragments tree returns (judging by its name, the values of the
        not empty children of the root -- confirm against the Tree class).

        :rtype: list of :class:`~aeneas.textfile.TextFragment`
        """
        return self.fragments_tree.vchildren_not_empty

    def add_fragment(self, fragment, as_last=True):
        """
        Wrap the given text fragment in a new tree node and add that node
        as a child of the root of the text file tree.

        A ``fragment`` that is not a ``TextFragment`` is reported
        through ``log_exc`` with ``TypeError``.

        :param fragment: the text fragment to be added
        :type  fragment: :class:`~aeneas.textfile.TextFragment`
        :param bool as_last: if ``True`` append fragment, otherwise prepend it
        """
        is_fragment = isinstance(fragment, TextFragment)
        if not is_fragment:
            self.log_exc(u"fragment is not an instance of TextFragment", None, True, TypeError)
        node = Tree(value=fragment)
        self.fragments_tree.add_child(node, as_last=as_last)

    def get_subtree(self, root):
        """
        Return a new :class:`~aeneas.textfile.TextFile` object
        whose fragments tree is the given node ``root``.

        The node is used as-is (it is not copied), and the new object
        is created with default arguments.
        A ``root`` that is not a ``Tree`` is reported through ``log_exc``
        with ``TypeError``.

        :param root: the root node
        :type  root: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        is_tree = isinstance(root, Tree)
        if not is_tree:
            self.log_exc(u"root is not an instance of Tree", None, True, TypeError)
        subtree_file = TextFile()
        subtree_file.fragments_tree = root
        return subtree_file

    def get_slice(self, start=None, end=None):
        """
        Return a new text file holding the fragments of this one
        indexed from start (included) to end (excluded).

        A given ``start`` is clamped to ``[0, len(self) - 1]``;
        a given ``end`` is clamped to ``[0, len(self)]`` and then raised
        to at least ``start + 1``.
        Omitted bounds default to the first fragment and to
        the end of the list, respectively.
        The fragment objects are shared with this text file, not copied.

        :param int start: the start index, included
        :param int end: the end index, excluded
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        size = len(self)
        if start is None:
            start = 0
        else:
            start = min(max(0, start), size - 1)
        if end is None:
            end = size
        else:
            clamped = min(max(0, end), size)
            end = max(clamped, start + 1)
        sliced = TextFile()
        for fragment in self.fragments[start:end]:
            sliced.add_fragment(fragment)
        return sliced

    def set_language(self, language):
        """
        Set the given language for all the text fragments.

        :param language: the language of the text fragments
        :type  language: :class:`~aeneas.language.Language`
        """
        self.log([u"Setting language: '%s'", language])
        for fragment in self.fragments:
            fragment.language = language

    def clear(self):
        """
        Clear the text file, removing all the current fragments.

        The fragments tree is replaced by a brand new ``Tree`` object;
        the previous tree object itself is not modified.
        """
        self.log(u"Clearing text fragments")
        self.fragments_tree = Tree()

    def read_from_list(self, lines):
        """
        Read text fragments from a given list of strings::

            [fragment_1, fragment_2, ..., fragment_n]

        The list is handed to :func:`_read_plain`, so each string is
        stripped and receives an identifier generated from the id format.
        The current fragments are not cleared first.

        :param list lines: the text fragments
        """
        self.log(u"Reading text fragments from list")
        self._read_plain(lines)

    def read_from_list_with_ids(self, lines):
        """
        Read text fragments from a given list of tuples::

            [(id_1, text_1), (id_2, text_2), ..., (id_n, text_n)].

        Only the first two items of each element are used;
        every text becomes a single-line fragment.

        :param list lines: the list of ``[id, text]`` fragments (see above)
        """
        self.log(u"Reading text fragments from list with ids")
        pairs = []
        for line in lines:
            identifier, text = line[0], line[1]
            pairs.append((identifier, [text]))
        self._create_text_fragments(pairs)

    def _read_from_file(self):
        """
        Read text fragments from the file at ``file_path``,
        parsing it according to ``file_format``.

        The file is decoded as UTF-8, the current fragments are cleared,
        and the lines are handed to the reader matching the format.
        An unreadable file is reported through ``log_exc`` with ``OSError``,
        a format outside ``TextFileFormat.ALLOWED_VALUES``
        with ``ValueError``.
        """
        path = self.file_path
        file_format = self.file_format

        # validate before touching the current fragments
        if not gf.file_can_be_read(path):
            self.log_exc(u"File '%s' cannot be read" % (path), None, True, OSError)
        if file_format not in TextFileFormat.ALLOWED_VALUES:
            self.log_exc(u"Text file format '%s' is not supported." % (file_format), None, True, ValueError)

        # read the contents of the file
        self.log([u"Reading contents of file '%s'", path])
        with io.open(path, "r", encoding="utf-8") as text_file:
            lines = text_file.readlines()

        # drop whatever was loaded before
        self.clear()

        # dispatch on the format
        readers = {
            TextFileFormat.MPLAIN: self._read_mplain,
            TextFileFormat.MUNPARSED: self._read_munparsed,
            TextFileFormat.PARSED: self._read_parsed,
            TextFileFormat.PLAIN: self._read_plain,
            TextFileFormat.SUBTITLES: self._read_subtitles,
            TextFileFormat.UNPARSED: self._read_unparsed
        }
        reader = readers[file_format]
        reader(lines)

        # log the number of fragments
        self.log([u"Parsed %d fragments", len(self.fragments)])

    def _mplain_word_separator(self):
        """
        Get the word separator to split words in mplain format.

        The configured value may be one of the aliases ``space``,
        ``equal``, ``pipe`` or ``tab``, which are translated to the
        corresponding character; ``None`` means a space;
        any other value is returned unchanged.

        :rtype: string
        """
        configured = gf.safe_get(self.parameters, gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR, u" ")
        if configured is None:
            return u" "
        for alias, separator in (
                ("space", u" "),
                ("equal", u"="),
                ("pipe", u"|"),
                ("tab", u"\u0009"),
        ):
            if configured == alias:
                return separator
        # not an alias: use the configured string literally
        return configured

    def _read_mplain(self, lines):
        """
        Read text fragments from a multilevel plain (mplain) format text file.

        Every line is stripped first. A run of consecutive non-empty lines
        is a paragraph, each line of the run is a sentence, and each
        sentence is split into words on the separator returned by
        :func:`_mplain_word_separator` (empty words are discarded).
        The resulting paragraph/sentence/word tree replaces the current
        fragments tree.

        Identifiers are 1-based and nested, for example
        ``p000001``, ``p000001s000001`` and ``p000001s000001w000001``.

        Note that ``filtered_lines`` is set to the same text as ``lines``:
        this reader does not apply any text filter.

        :param list lines: the lines of the mplain text file
        """
        self.log(u"Parsing fragments from mplain text format")
        word_separator = self._mplain_word_separator()
        self.log([u"Word separator is: '%s'", word_separator])
        lines = [line.strip() for line in lines]
        tree = Tree()
        paragraph_index = 1
        current = 0
        while current < len(lines):
            if len(lines[current]) > 0:
                # collect the run of non-empty lines starting at current
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    following += 1
                sentences = lines[current:following]

                # create paragraph node
                paragraph_identifier = u"p%06d" % paragraph_index
                paragraph_lines = [u" ".join(sentences)]
                paragraph_fragment = TextFragment(
                    identifier=paragraph_identifier,
                    lines=paragraph_lines,
                    filtered_lines=paragraph_lines
                )
                paragraph_node = Tree(value=paragraph_fragment)
                tree.add_child(paragraph_node)
                self.log([u"Paragraph %s", paragraph_identifier])

                # create sentences nodes
                for j, sentence in enumerate(sentences, 1):
                    sentence_identifier = paragraph_identifier + u"s%06d" % j
                    sentence_lines = [sentence]
                    sentence_fragment = TextFragment(
                        identifier=sentence_identifier,
                        lines=sentence_lines,
                        filtered_lines=sentence_lines
                    )
                    sentence_node = Tree(value=sentence_fragment)
                    paragraph_node.add_child(sentence_node)
                    self.log([u"  Sentence %s", sentence_identifier])

                    # create words nodes, skipping the empty strings
                    # produced by repeated separators
                    words = [w for w in sentence.split(word_separator) if len(w) > 0]
                    for k, word in enumerate(words, 1):
                        word_identifier = sentence_identifier + u"w%06d" % k
                        word_lines = [word]
                        word_fragment = TextFragment(
                            identifier=word_identifier,
                            lines=word_lines,
                            filtered_lines=word_lines
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        self.log([u"    Word %s", word_identifier])

                # following points at the blank line closing the paragraph
                # (or at the end of the input); the increment below skips it
                current = following
                paragraph_index += 1
            current += 1
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements are selected on three nested levels by matching their
        ``id`` attribute against the regex stored in ``parameters`` under
        the L1, L2 and L3 munparsed id regex keys.
        Each L3 element becomes a word fragment; the text of an L2 fragment
        is the space-joined text of its words, and the text of an L1
        fragment is the space-joined text of its L2 fragments.
        An L1 element containing no L3 element at all is skipped.
        The resulting tree replaces the current fragments tree.

        Note that ``filtered_lines`` is set to the same text as ``lines``:
        this reader does not apply any text filter.

        :param list lines: the lines of the munparsed text file
        """
        # parameter key holding the id regex of each level (index 0 unused)
        level_to_regex_key = [
            None,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
        ]

        def nodes_at_level(root, level):
            """
            Return the elements under ``root`` whose ``id`` attribute
            matches the regex configured for the given level (1, 2 or 3).
            """
            attribute_name = "id"
            regex_string = self.parameters[level_to_regex_key[level]]
            indent = u" " * 2 * (level - 1)
            self.log([u"%sRegex for %s: '%s'", indent, attribute_name, regex_string])
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.findAll(attrs={attribute_name: regex})
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            has_word = False
            # NOTE the KeyError handler below also fires when the L2 or L3
            #      regex key is missing from the parameters
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log([u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(
                            identifier=l3_id,
                            lines=[l3_text],
                            filtered_lines=[l3_text]
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    # the sentence value can only be set once its words
                    # have been collected
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text]
                    )
                    # template and argument are passed separately, like the
                    # other log calls, so the text is never used as a format
                    self.log([u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text]
                    )
                    tree.add_child(paragraph_node)
                    self.log([u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # append to fragments
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_subtitles(self, lines):
        """
        Read text fragments from a subtitles format text file.

        Every line is stripped; each run of consecutive non-empty lines
        becomes one multi-line fragment, and empty lines only act as
        separators. Identifiers are generated from the id format,
        numbering the fragments from 1.

        :param list lines: the lines of the subtitles text file
        :raises: ValueError: if the id regex is not valid
        """
        self.log(u"Parsing fragments from subtitles text format")
        id_format = self._get_id_format()
        pairs = []
        block = []
        for line in lines:
            text = line.strip()
            if text:
                block.append(text)
            elif block:
                # an empty line closes the fragment being accumulated
                pairs.append((id_format % (len(pairs) + 1), block))
                block = []
        if block:
            # the input ended without a trailing empty line
            pairs.append((id_format % (len(pairs) + 1), block))
        self._create_text_fragments(pairs)

    def _read_parsed(self, lines):
        """
        Read text fragments from a parsed format text file.

        Each line is split on ``gc.PARSED_TEXT_SEPARATOR``.
        Only lines yielding exactly two pieces (identifier and text)
        with a non-empty stripped identifier produce a fragment;
        all the other lines are silently skipped.

        :param list lines: the lines of the parsed text file
        """
        self.log(u"Parsing fragments from parsed text format")
        pairs = []
        for line in lines:
            pieces = line.split(gc.PARSED_TEXT_SEPARATOR)
            if len(pieces) != 2:
                continue
            identifier, text = [piece.strip() for piece in pieces]
            if identifier:
                pairs.append((identifier, [text]))
        self._create_text_fragments(pairs)

    def _read_plain(self, lines):
        """
        Read text fragments from a plain format text file.

        Every line, stripped, becomes one single-line fragment;
        empty lines are not skipped.
        Identifiers are generated from the id format,
        numbering the lines from 1.

        :param list lines: the lines of the plain text file
        :raises: ValueError: if the id regex is not valid
        """
        self.log(u"Parsing fragments from plain text format")
        id_format = self._get_id_format()
        pairs = [
            (id_format % index, [line.strip()])
            for index, line in enumerate(lines, 1)
        ]
        self._create_text_fragments(pairs)

    def _read_unparsed(self, lines):
        """
        Read text fragments from an unparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements are selected by matching their ``class`` and/or ``id``
        attributes against the regexes found in ``parameters``
        (a missing or ``None`` regex adds no constraint).
        Each selected element with an ``id`` yields a fragment holding
        the text of the element; the fragments are ordered by
        the configured id sorting algorithm (unsorted by default).

        :param list lines: the lines of the unparsed text file
        """
        def build_filter_attributes():
            """ Return a dict with the bs4 filter parameters """
            attributes = {}
            candidates = [
                ("class", gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX)
            ]
            for attribute_name, filter_name in candidates:
                if filter_name not in self.parameters:
                    continue
                regex_string = self.parameters[filter_name]
                if regex_string is None:
                    continue
                self.log([u"Regex for %s: '%s'", attribute_name, regex_string])
                attributes[attribute_name] = re.compile(r".*\b" + regex_string + r"\b.*")
            return attributes
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # extract according to class_regex and id_regex
        attributes = build_filter_attributes()
        self.log([u"Finding elements matching attributes '%s'", attributes])
        ids = []
        text_from_id = {}
        for node in soup.findAll(attrs=attributes):
            try:
                f_id = gf.safe_unicode(node["id"])
                f_text = gf.safe_unicode(node.text)
                text_from_id[f_id] = f_text
                ids.append(f_id)
            except KeyError:
                # the element matched the filter but carries no id attribute
                self.log_warn(u"KeyError while parsing a node")

        # sort by ID as requested
        id_sort = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", id_sort])
        sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)

        # append to fragments
        self.log(u"Appending fragments")
        pairs = [(key, [text_from_id[key]]) for key in sorted_ids]
        self._create_text_fragments(pairs)

    def _get_id_format(self):
        """
        Return the id format string from the parameters,
        falling back to ``DEFAULT_ID_FORMAT``.

        The format is probed by applying it to the integer ``1``;
        a format that cannot be applied is reported through ``log_exc``
        with ``ValueError``.

        :rtype: string
        """
        id_format = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_TASK_OS_FILE_ID_REGEX,
            default_value=self.DEFAULT_ID_FORMAT,
            can_return_none=False
        )
        try:
            # only the success of the formatting matters, not its result
            id_format % 1
        except (TypeError, ValueError) as exc:
            self.log_exc(u"String '%s' is not a valid id format" % (id_format), exc, True, ValueError)
        return id_format

    def _create_text_fragments(self, pairs):
        """
        Create text fragment objects and append them to this text file.

        A single text filter, built by :func:`_build_text_filter`,
        is applied to the lines of every fragment to obtain
        its filtered lines.

        :param list pairs: a list of pairs, each pair being (id, [line_1, ..., line_n])
        """
        self.log(u"Creating TextFragment objects")
        text_filter = self._build_text_filter()
        for pair in pairs:
            identifier, fragment_lines = pair[0], pair[1]
            fragment = TextFragment(
                identifier=identifier,
                lines=fragment_lines,
                filtered_lines=text_filter.apply_filter(fragment_lines)
            )
            self.add_fragment(fragment)

    def _build_text_filter(self):
        """
        Build a suitable TextFilter object.

        An ignore-regex filter and a transliteration filter are added,
        in this order, when the corresponding parameter is set.
        A filter whose construction raises ``ValueError`` is reported
        through ``log_exc`` and left out.

        :rtype: TextFilter
        """
        text_filter = TextFilter(logger=self.logger)
        self.log(u"Created TextFilter object")
        # (parameter key, filter class, name of its constructor argument)
        filter_specs = (
            (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, TextFilterIgnoreRegex, "regex"),
            (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, TextFilterTransliterate, "map_file_path"),
        )
        for key, cls, param_name in filter_specs:
            cls_name = cls.__name__
            param_value = gf.safe_get(self.parameters, key, None)
            if param_value is None:
                continue
            self.log([u"Creating %s object...", cls_name])
            try:
                inner_filter = cls(**{param_name: param_value, "logger": self.logger})
                text_filter.add_filter(inner_filter)
                self.log([u"Creating %s object... done", cls_name])
            except ValueError as exc:
                self.log_exc(u"Creating %s object failed" % (cls_name), exc, False, None)
        return text_filter
Exemple #39
0
class TextFile(Loggable):
    """
    A tree of text fragments, representing a text file.

    :param string file_path: the path to the text file.
                             If not ``None`` (and also ``file_format`` is not ``None``),
                             the file will be read immediately.
    :param file_format: the format of the text file
    :type  file_format: :class:`~aeneas.textfile.TextFileFormat`
    :param dict parameters: additional parameters used to parse the text file
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: OSError: if ``file_path`` cannot be read
    :raises: TypeError: if ``parameters`` is not an instance of ``dict``
    :raises: ValueError: if ``file_format`` value is not allowed
    """

    DEFAULT_ID_FORMAT = u"f%06d"

    TAG = u"TextFile"

    def __init__(self,
                 file_path=None,
                 file_format=None,
                 parameters=None,
                 rconf=None,
                 logger=None):
        """
        Store the given settings and, when both ``file_path`` and
        ``file_format`` are given, read the file immediately.
        """
        super(TextFile, self).__init__(rconf=rconf, logger=logger)
        # these assignments go through the validating property setters
        self.file_path = file_path
        self.file_format = file_format
        if parameters is None:
            parameters = {}
        self.parameters = parameters
        self.fragments_tree = Tree()
        # without both a path and a format there is nothing to parse yet
        if (self.file_path is None) or (self.file_format is None):
            return
        self._read_from_file()

    def __len__(self):
        return len(self.fragments)

    def __unicode__(self):
        msg = []
        if self.fragments_tree is not None:
            for node in self.fragments_tree.pre:
                if not node.is_empty:
                    indent = u" " * 2 * (node.level - 1)
                    msg.append(u"%s%s" % (indent, node.value.__unicode__()))
        return u"\n".join(msg)

    def __str__(self):
        """
        Return the ``__unicode__`` representation
        converted with ``gf.safe_str``.
        """
        as_unicode = self.__unicode__()
        return gf.safe_str(as_unicode)

    @property
    def fragments_tree(self):
        """
        Return the current tree of fragments.

        The setter stores the given object as-is,
        without validation or copying.

        :rtype: :class:`~aeneas.tree.Tree`
        """
        return self.__fragments_tree

    @fragments_tree.setter
    def fragments_tree(self, fragments_tree):
        # plain assignment: the caller is trusted to pass a Tree
        self.__fragments_tree = fragments_tree

    @property
    def children_not_empty(self):
        """
        Return one ``TextFile`` per node listed by
        the ``children_not_empty`` attribute of the fragments tree.

        Each returned object is rooted at the corresponding node
        (see :func:`get_subtree`) and has the language of that node's
        fragment propagated to its own fragments.

        :rtype: list of :class:`~aeneas.textfile.TextFile`
        """
        def as_text_file(node):
            """ Wrap a node and push its language down to its fragments """
            text_file = self.get_subtree(node)
            text_file.set_language(node.value.language)
            return text_file

        return [
            as_text_file(node)
            for node in self.fragments_tree.children_not_empty
        ]

    @property
    def file_path(self):
        """
        The path of the text file.

        :rtype: string
        """
        return self.__file_path

    @file_path.setter
    def file_path(self, file_path):
        if (file_path is not None) and (not gf.file_can_be_read(file_path)):
            self.log_exc(u"Text file '%s' cannot be read" % (file_path), None,
                         True, OSError)
        self.__file_path = file_path

    @property
    def file_format(self):
        """
        The format of the text file.

        :rtype: :class:`~aeneas.textfile.TextFileFormat`
        """
        return self.__file_format

    @file_format.setter
    def file_format(self, file_format):
        if (file_format
                is not None) and (file_format
                                  not in TextFileFormat.ALLOWED_VALUES):
            self.log_exc(
                u"Text file format '%s' is not allowed" % (file_format), None,
                True, ValueError)
        self.__file_format = file_format

    @property
    def parameters(self):
        """
        Additional parameters used to parse the text file.

        :rtype: dict
        """
        return self.__parameters

    @parameters.setter
    def parameters(self, parameters):
        if (parameters is not None) and (not isinstance(parameters, dict)):
            self.log_exc(u"parameters is not an instance of dict", None, True,
                         TypeError)
        self.__parameters = parameters

    @property
    def chars(self):
        """
        Return the number of characters of the text file,
        not counting line or fragment separators.

        :rtype: int
        """
        return sum([fragment.chars for fragment in self.fragments])

    @property
    def characters(self):
        """
        The number of characters in this text file.

        :rtype: int
        """
        chars = 0
        for fragment in self.fragments:
            chars += fragment.characters
        return chars

    @property
    def fragments(self):
        """
        The current list of text fragments
        which are the children of the root node
        of the text file tree.

        This is whatever the ``vchildren_not_empty`` attribute of the
        fragments tree returns (judging by its name, the values of the
        not empty children of the root -- confirm against the Tree class).

        :rtype: list of :class:`~aeneas.textfile.TextFragment`
        """
        return self.fragments_tree.vchildren_not_empty

    def add_fragment(self, fragment, as_last=True):
        """
        Wrap the given text fragment in a new tree node and add that node
        as a child of the root of the text file tree.

        A ``fragment`` that is not a ``TextFragment`` is reported
        through ``log_exc`` with ``TypeError``.

        :param fragment: the text fragment to be added
        :type  fragment: :class:`~aeneas.textfile.TextFragment`
        :param bool as_last: if ``True`` append fragment, otherwise prepend it
        """
        is_fragment = isinstance(fragment, TextFragment)
        if not is_fragment:
            self.log_exc(u"fragment is not an instance of TextFragment", None,
                         True, TypeError)
        node = Tree(value=fragment)
        self.fragments_tree.add_child(node, as_last=as_last)

    def get_subtree(self, root):
        """
        Return a new :class:`~aeneas.textfile.TextFile` object
        whose fragments tree is the given node ``root``.

        The node is used as-is (it is not copied), and the new object
        is created with default arguments.
        A ``root`` that is not a ``Tree`` is reported through ``log_exc``
        with ``TypeError``.

        :param root: the root node
        :type  root: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        is_tree = isinstance(root, Tree)
        if not is_tree:
            self.log_exc(u"root is not an instance of Tree", None, True,
                         TypeError)
        subtree_file = TextFile()
        subtree_file.fragments_tree = root
        return subtree_file

    def get_slice(self, start=None, end=None):
        """
        Return a new text file holding the fragments of this one
        indexed from start (included) to end (excluded).

        A given ``start`` is clamped to ``[0, len(self) - 1]``;
        a given ``end`` is clamped to ``[0, len(self)]`` and then raised
        to at least ``start + 1``.
        Omitted bounds default to the first fragment and to
        the end of the list, respectively.
        The fragment objects are shared with this text file, not copied.

        :param int start: the start index, included
        :param int end: the end index, excluded
        :rtype: :class:`~aeneas.textfile.TextFile`
        """
        size = len(self)
        if start is None:
            start = 0
        else:
            start = min(max(0, start), size - 1)
        if end is None:
            end = size
        else:
            clamped = min(max(0, end), size)
            end = max(clamped, start + 1)
        sliced = TextFile()
        for fragment in self.fragments[start:end]:
            sliced.add_fragment(fragment)
        return sliced

    def set_language(self, language):
        """
        Set the given language for all the text fragments.

        :param language: the language of the text fragments
        :type  language: :class:`~aeneas.language.Language`
        """
        self.log([u"Setting language: '%s'", language])
        for fragment in self.fragments:
            fragment.language = language

    def clear(self):
        """
        Clear the text file, removing all the current fragments.

        The fragments tree is replaced by a brand new ``Tree`` object;
        the previous tree object itself is not modified.
        """
        self.log(u"Clearing text fragments")
        self.fragments_tree = Tree()

    def read_from_list(self, lines):
        """
        Load text fragments from an in-memory list of strings,
        one fragment per string::

            [fragment_1, fragment_2, ..., fragment_n]

        The strings are handled exactly like the lines of a plain format file.

        :param list lines: the text fragments
        """
        self.log(u"Reading text fragments from list")
        plain_lines = lines
        self._read_plain(plain_lines)

    def read_from_list_with_ids(self, lines):
        """
        Load text fragments from an in-memory list of ``(id, text)`` pairs::

            [(id_1, text_1), (id_2, text_2), ..., (id_n, text_n)].

        Each pair becomes one fragment whose only line is ``text``.

        :param list lines: the list of ``[id, text]`` fragments (see above)
        """
        self.log(u"Reading text fragments from list with ids")
        pairs = []
        for entry in lines:
            # wrap the text into a single-element list of lines
            pairs.append((entry[0], [entry[1]]))
        self._create_text_fragments(pairs)

    def _read_from_file(self):
        """
        Load the text fragments from ``self.file_path``,
        parsing it according to ``self.file_format``.

        An unreadable file is reported through ``log_exc`` as a critical
        ``OSError``, an unsupported format as a critical ``ValueError``.
        The file is decoded as UTF-8 and the current fragments are cleared
        before the format-specific reader is invoked.
        """
        # the file must be readable
        if not gf.file_can_be_read(self.file_path):
            unreadable_message = u"File '%s' cannot be read" % (self.file_path)
            self.log_exc(unreadable_message, None, True, OSError)

        # the format must be one of the supported ones
        if self.file_format not in TextFileFormat.ALLOWED_VALUES:
            unsupported_message = u"Text file format '%s' is not supported." % (self.file_format)
            self.log_exc(unsupported_message, None, True, ValueError)

        # load all the lines of the file
        self.log([u"Reading contents of file '%s'", self.file_path])
        with io.open(self.file_path, "r", encoding="utf-8") as input_file:
            lines = input_file.readlines()

        # start from an empty fragments tree
        self.clear()

        # dispatch to the reader associated with the file format
        readers = {
            TextFileFormat.MPLAIN: self._read_mplain,
            TextFileFormat.MUNPARSED: self._read_munparsed,
            TextFileFormat.PARSED: self._read_parsed,
            TextFileFormat.PLAIN: self._read_plain,
            TextFileFormat.SUBTITLES: self._read_subtitles,
            TextFileFormat.UNPARSED: self._read_unparsed
        }
        reader = readers[self.file_format]
        reader(lines)

        # report how many fragments have been read
        fragment_count = len(self.fragments)
        self.log([u"Parsed %d fragments", fragment_count])

    def _mplain_word_separator(self):
        """
        Return the string used to split a sentence into words in mplain format.

        The value is read from the parameters (defaulting to a space).
        The symbolic names ``space``, ``equal``, ``pipe`` and ``tab``
        are translated into the corresponding character;
        any other value is returned unchanged.

        :rtype: string
        """
        configured = gf.safe_get(
            self.parameters,
            gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR,
            u" "
        )
        if configured is None:
            return u" "
        # symbolic names, checked in the same order as they are documented
        named_separators = (
            ("space", u" "),
            ("equal", u"="),
            ("pipe", u"|"),
            ("tab", u"\u0009"),
        )
        for name, character in named_separators:
            if configured == name:
                return character
        return configured

    def _read_mplain(self, lines):
        """
        Read text fragments from a multilevel plain (mplain) format text file.

        After stripping each line, every maximal run of consecutive
        non-empty lines is a paragraph, each line of the run is a sentence,
        and each sentence is split into words on the configured word separator
        (empty words are discarded).
        The resulting three-level tree (paragraph / sentence / word)
        replaces ``self.fragments_tree``.

        Identifiers are 1-based and built by concatenation:
        ``p%06d`` for paragraphs, then ``s%06d`` for sentences,
        then ``w%06d`` for words.

        :param list lines: the lines of the mplain text file
        """
        self.log(u"Parsing fragments from mplain text format")
        word_separator = self._mplain_word_separator()
        self.log([u"Word separator is: '%s'", word_separator])
        lines = [line.strip() for line in lines]
        paragraph_index = 1
        current = 0
        tree = Tree()
        while current < len(lines):
            if len(lines[current]) > 0:
                # extend the run until the next empty line (or the end of the file)
                following = current + 1
                while (following < len(lines)) and (len(lines[following]) > 0):
                    following += 1

                # the sentences of this paragraph are the lines of the run
                sentences = lines[current:following]

                # create paragraph node
                paragraph_identifier = u"p%06d" % paragraph_index
                paragraph_lines = [u" ".join(sentences)]
                paragraph_fragment = TextFragment(
                    identifier=paragraph_identifier,
                    lines=paragraph_lines,
                    filtered_lines=paragraph_lines)
                paragraph_node = Tree(value=paragraph_fragment)
                tree.add_child(paragraph_node)
                self.log([u"Paragraph %s", paragraph_identifier])

                # create sentences nodes
                for sentence_index, sentence in enumerate(sentences, 1):
                    sentence_identifier = paragraph_identifier + u"s%06d" % sentence_index
                    sentence_lines = [sentence]
                    sentence_fragment = TextFragment(
                        identifier=sentence_identifier,
                        lines=sentence_lines,
                        filtered_lines=sentence_lines)
                    sentence_node = Tree(value=sentence_fragment)
                    paragraph_node.add_child(sentence_node)
                    self.log([u"  Sentence %s", sentence_identifier])

                    # create words nodes, ignoring the empty strings
                    # produced by consecutive separators
                    words = [
                        w for w in sentence.split(word_separator) if len(w) > 0
                    ]
                    for word_index, word in enumerate(words, 1):
                        word_identifier = sentence_identifier + u"w%06d" % word_index
                        word_lines = [word]
                        word_fragment = TextFragment(
                            identifier=word_identifier,
                            lines=word_lines,
                            filtered_lines=word_lines)
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        self.log([u"    Word %s", word_identifier])

                # resume from the line that ended the run;
                # the increment below then steps over it
                current = following
                paragraph_index += 1
            current += 1
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements are selected by matching their ``id`` attribute
        against the level 1, 2 and 3 id regexes found in ``self.parameters``,
        and a three-level tree of text fragments is built:
        each level 3 node keeps its own text, while the text of a
        level 2 (respectively, level 1) node is the space-joined text
        of its level 3 (respectively, level 2) descendants.
        Level 1 nodes without any level 3 descendant are skipped.
        The resulting tree replaces ``self.fragments_tree``.

        :param list lines: the lines of the munparsed text file
        """
        from bs4 import BeautifulSoup

        def nodes_at_level(root, level):
            """
            Return the descendants of ``root`` whose ``id`` attribute
            matches the id regex configured for the given ``level`` (1, 2, or 3).
            """
            # index 0 is a placeholder, so that the index equals the level
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            indent = u" " * 2 * (level - 1)
            self.log([
                u"%sRegex for %s: '%s'", indent, attribute_name, regex_string
            ])
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.find_all(attrs={attribute_name: regex})

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract the three levels according to the id regexes
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log(
                            [u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(identifier=l3_id,
                                                     lines=[l3_text],
                                                     filtered_lines=[l3_text])
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    # the sentence value is known only after visiting its words
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text])
                    self.log(
                        [u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text])
                    tree.add_child(paragraph_node)
                    self.log(
                        [u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # replace the current fragments with the new tree
        self.log(u"Storing tree")
        self.fragments_tree = tree

    def _read_subtitles(self, lines):
        """
        Read text fragments from a subtitles format text file.

        After stripping each line, every maximal run of consecutive
        non-empty lines becomes one fragment having those lines as its text.
        Fragments are numbered from 1, and their identifiers
        are obtained by applying the id format to that number.

        :param list lines: the lines of the subtitles text file
        """
        self.log(u"Parsing fragments from subtitles text format")
        id_format = self._get_id_format()
        stripped = [line.strip() for line in lines]
        pairs = []
        pending = []
        # the trailing empty string acts as a sentinel
        # closing the last run of non-empty lines, if any
        for text in stripped + [u""]:
            if len(text) > 0:
                pending.append(text)
            elif len(pending) > 0:
                identifier = id_format % (len(pairs) + 1)
                pairs.append((identifier, pending))
                pending = []
        self._create_text_fragments(pairs)

    def _read_parsed(self, lines):
        """
        Read text fragments from a parsed format text file.

        Each line is split on ``gc.PARSED_TEXT_SEPARATOR``.
        Only the lines yielding exactly two pieces, the first of which
        is a non-empty identifier after stripping, produce a fragment;
        all the other lines are ignored.

        :param list lines: the lines of the parsed text file
        """
        self.log(u"Parsing fragments from parsed text format")
        pairs = []
        for line in lines:
            fields = line.split(gc.PARSED_TEXT_SEPARATOR)
            if len(fields) != 2:
                continue
            identifier = fields[0].strip()
            text = fields[1].strip()
            if len(identifier) == 0:
                continue
            pairs.append((identifier, [text]))
        self._create_text_fragments(pairs)

    def _read_plain(self, lines):
        """
        Read text fragments from a plain format text file.

        Every line, stripped, becomes one fragment (empty lines included).
        Fragments are numbered from 1, and their identifiers
        are obtained by applying the id format to that number.

        :param list lines: the lines of the plain text file
        """
        self.log(u"Parsing fragments from plain text format")
        id_format = self._get_id_format()
        pairs = [
            (id_format % number, [line.strip()])
            for number, line in enumerate(lines, 1)
        ]
        self._create_text_fragments(pairs)

    def _read_unparsed(self, lines):
        """
        Read text fragments from an unparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        The elements matching the class and/or id regexes
        found in ``self.parameters`` are collected,
        their ids are ordered with the requested id sorting algorithm,
        and one fragment per id is created from the element text.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def build_attribute_filters():
            """ Return a dict with the bs4 filter parameters """
            candidates = (
                ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX),
            )
            compiled = {}
            for attribute_name, parameter_key in candidates:
                if parameter_key not in self.parameters:
                    continue
                regex_string = self.parameters[parameter_key]
                if regex_string is None:
                    continue
                self.log([
                    u"Regex for %s: '%s'", attribute_name, regex_string
                ])
                compiled[attribute_name] = re.compile(
                    r".*\b" + regex_string + r"\b.*")
            return compiled

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # build the soup object from the whole text
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # select the elements according to the class and id regexes
        text_from_id = {}
        ids = []
        attribute_filters = build_attribute_filters()
        self.log(
            [u"Finding elements matching attributes '%s'", attribute_filters])
        for node in soup.findAll(attrs=attribute_filters):
            try:
                node_id = gf.safe_unicode(node["id"])
                node_text = gf.safe_unicode(node.text)
                text_from_id[node_id] = node_text
                ids.append(node_id)
            except KeyError:
                self.log_warn(u"KeyError while parsing a node")

        # order the ids with the requested algorithm
        id_sort = gf.safe_get(dictionary=self.parameters,
                              key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
                              default_value=IDSortingAlgorithm.UNSORTED,
                              can_return_none=False)
        self.log([u"Sorting text fragments using '%s'", id_sort])
        sorter = IDSortingAlgorithm(id_sort)
        sorted_ids = sorter.sort(ids)

        # create the fragments, following the sorted order
        self.log(u"Appending fragments")
        pairs = [(key, [text_from_id[key]]) for key in sorted_ids]
        self._create_text_fragments(pairs)

    def _get_id_format(self):
        """
        Return the id format string read from the parameters,
        falling back to ``self.DEFAULT_ID_FORMAT``.

        The format is probed by applying it to the integer ``1``;
        a failure is reported through ``log_exc`` as a critical ``ValueError``.
        """
        id_format = gf.safe_get(
            self.parameters,
            gc.PPN_TASK_OS_FILE_ID_REGEX,
            self.DEFAULT_ID_FORMAT,
            can_return_none=False
        )
        try:
            # only the side effect matters: the result is discarded
            id_format % 1
        except (TypeError, ValueError) as exc:
            invalid_message = u"String '%s' is not a valid id format" % (id_format)
            self.log_exc(invalid_message, exc, True, ValueError)
        return id_format

    def _create_text_fragments(self, pairs):
        """
        Turn each given pair into a TextFragment object
        and add it to this text file.

        The filtered lines of each fragment are obtained by applying
        the text filter built from the parameters to the original lines.

        :param list pairs: a list of pairs, each pair being (id, [line_1, ..., line_n])
        """
        self.log(u"Creating TextFragment objects")
        text_filter = self._build_text_filter()
        for pair in pairs:
            identifier = pair[0]
            raw_lines = pair[1]
            fragment = TextFragment(
                identifier=identifier,
                lines=raw_lines,
                filtered_lines=text_filter.apply_filter(raw_lines)
            )
            self.add_fragment(fragment)

    def _build_text_filter(self):
        """
        Create a TextFilter object and add to it one inner filter
        for each filtering parameter having a non-``None`` value
        (ignore regex, transliteration map).

        An inner filter whose creation raises ``ValueError`` is not added;
        the failure is reported through ``log_exc`` as non-critical.
        """
        text_filter = TextFilter(logger=self.logger)
        self.log(u"Created TextFilter object")
        # (parameter key, filter class, name of the constructor argument)
        filter_specs = (
            (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, TextFilterIgnoreRegex, "regex"),
            (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, TextFilterTransliterate, "map_file_path"),
        )
        for key, cls, param_name in filter_specs:
            cls_name = cls.__name__
            param_value = gf.safe_get(self.parameters, key, None)
            if param_value is None:
                continue
            self.log([u"Creating %s object...", cls_name])
            constructor_args = {param_name: param_value, "logger": self.logger}
            try:
                text_filter.add_filter(cls(**constructor_args))
                self.log([u"Creating %s object... done", cls_name])
            except ValueError as exc:
                failure_message = u"Creating %s object failed" % (cls_name)
                self.log_exc(failure_message, exc, False, None)
        return text_filter
Exemple #40
0
 def clear(self):
     """
     Drop every fragment currently held by this text file
     by replacing the fragments tree with a new, empty one.
     """
     self.log(u"Clearing text fragments")
     empty_tree = Tree()
     self.fragments_tree = empty_tree
Exemple #41
0
    def _execute_multi_level_task(self):
        """
        Execute a multi-level task.

        The steps are: clone one runtime configuration per level (1, 2, 3)
        and set its granularity; load the audio file and extract the MFCCs
        for each level; compute head/process/tail lengths on the level 1 MFCCs;
        compute the alignment level by level; select the levels;
        create the sync map and store it in ``self.task.sync_map``;
        finally, check for zero-duration fragments.

        ``self.rconf`` is replaced by the per-level configurations
        while working, and set back to the original one
        just before creating the sync map.
        Any exception raised by the steps is passed to ``_step_failure``.
        """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # (index 0 is a placeholder, so that the list index equals the level number)
        level_rconfs = [
            None,
            self.rconf.clone(),
            self.rconf.clone(),
            self.rconf.clone()
        ]
        # MFCCs for each level, filled below; same indexing as level_rconfs
        level_mfccs = [None, None, None, None]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
        self.log(u"Saving rconf... done")

        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                # extract again only for level 1, or when mws or mwl
                # differ from those of the previous level
                if (i == 1) or (
                        level_rconfs[i].mws != level_rconfs[i - 1].mws) or (
                            level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    # the level rconf is made current before extracting
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    # same object as the previous level, not a copy
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            # the level 1 rconf is made current before clearing the audio file
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment at each level
            # each call receives the text files and sync roots
            # returned by the call for the previous level
            tree = Tree()
            sync_roots = [tree]
            text_files = [self.task.text_file]
            # per-level boolean flags forwarded to _execute_level (index 0 unused)
            # NOTE(review): presumably "add head/tail" (aht) and
            # "adjust boundaries" (aba) -- confirm against _execute_level
            aht = [None, True, False, False]
            aba = [None, True, True, False]
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"compute alignment level %d" % i)
                text_files, sync_roots = self._execute_level(
                    i, level_rconfs[i], level_mfccs[i], text_files, sync_roots,
                    aht[i], aba[i])
                self._step_end()

            self._step_begin(u"select levels")
            tree = self._select_levels(tree)
            self._step_end()

            self._step_begin(u"create sync map")
            # the original rconf is made current again before creating the sync map
            self.rconf = orig_rconf
            self.task.sync_map = self._create_syncmap(tree)
            self._step_end()

            self._step_begin(u"check zero duration")
            # uses the mws of the last level
            self._check_no_zero(level_rconfs[-1].mws)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            # NOTE(review): if the failure happens before the "create sync map" step,
            # self.rconf is not set back to orig_rconf here -- confirm this is intended
            self._step_failure(exc)
Exemple #42
0
    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements are selected by matching their ``id`` attribute
        against the level 1, 2 and 3 id regexes found in ``self.parameters``,
        and a three-level tree of text fragments is built:
        each level 3 node keeps its own text, while the text of a
        level 2 (respectively, level 1) node is the space-joined text
        of its level 3 (respectively, level 2) descendants.
        Level 1 nodes without any level 3 descendant are skipped.
        The resulting tree replaces ``self.fragments_tree``.

        :param list lines: the lines of the munparsed text file
        """
        def nodes_at_level(root, level):
            """
            Return the descendants of ``root`` whose ``id`` attribute
            matches the id regex configured for the given ``level`` (1, 2, or 3).
            """
            # index 0 is a placeholder, so that the index equals the level
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            indent = u" " * 2 * (level - 1)
            self.log([u"%sRegex for %s: '%s'", indent, attribute_name, regex_string])
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.find_all(attrs={attribute_name: regex})
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract the three levels according to the id regexes
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log([u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(
                            identifier=l3_id,
                            lines=[l3_text],
                            filtered_lines=[l3_text]
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    # the sentence value is known only after visiting its words
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text]
                    )
                    self.log([u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text]
                    )
                    tree.add_child(paragraph_node)
                    self.log([u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # replace the current fragments with the new tree
        self.log(u"Storing tree")
        self.fragments_tree = tree
Exemple #43
0
 def test_value(self):
     # a tree built with a value must expose that value, must not be empty,
     # and must list the value as its only element of ``vleaves``
     root = Tree(value="root")
     self.assertIsNotNone(root.value)
     self.assertFalse(root.is_empty)
     self.assertEqual(root.vleaves, ["root"])