Beispiel #1
0
 def _build_text_filter(self):
     """
     Assemble the TextFilter used for this text file.

     An empty ``TextFilter`` is created first. Then, for each optional
     parameter (ignore regex, transliteration map) whose value in
     ``self.parameters`` is not ``None``, the matching inner filter is
     instantiated and added to it. A ``ValueError`` raised while
     creating or adding an inner filter is handed to ``self.log_exc``.

     :rtype: TextFilter
     """
     composite = TextFilter(logger=self.logger)
     self.log(u"Created TextFilter object")
     # each entry: (parameter key, inner filter class, constructor keyword)
     optional_filters = (
         (
             gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX,
             TextFilterIgnoreRegex,
             "regex",
         ),
         (
             gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP,
             TextFilterTransliterate,
             "map_file_path",
         ),
     )
     for parameter_key, filter_class, keyword in optional_filters:
         filter_name = filter_class.__name__
         value = gf.safe_get(self.parameters, parameter_key, None)
         if value is None:
             # parameter not set: no inner filter of this kind
             continue
         self.log([u"Creating %s object...", filter_name])
         try:
             composite.add_filter(
                 filter_class(logger=self.logger, **{keyword: value}))
             self.log([u"Creating %s object... done", filter_name])
         except ValueError as exc:
             failure = u"Creating %s object failed" % (filter_name)
             self.log_exc(failure, exc, False, None)
     return composite
Beispiel #2
0
 def _build_text_filter(self):
     """
     Create the TextFilter object and populate it with inner filters.

     The ignore-regex and transliteration-map parameters are looked up
     in ``self.parameters``; for each one that is not ``None`` the
     corresponding filter class is instantiated (receiving the value
     and ``self.logger``) and added to the returned ``TextFilter``.
     ``ValueError`` exceptions are passed to ``self.log_exc``.

     :rtype: TextFilter
     """
     result = TextFilter(logger=self.logger)
     self.log(u"Created TextFilter object")

     # parameter key -> (filter class, name of its constructor argument)
     candidates = [
         (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, TextFilterIgnoreRegex, "regex"),
         (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, TextFilterTransliterate, "map_file_path"),
     ]
     for key, filter_cls, argument_name in candidates:
         class_name = filter_cls.__name__
         argument_value = gf.safe_get(self.parameters, key, None)
         if argument_value is None:
             continue
         self.log([u"Creating %s object...", class_name])
         constructor_arguments = {"logger": self.logger}
         constructor_arguments[argument_name] = argument_value
         try:
             created = filter_cls(**constructor_arguments)
             result.add_filter(created)
             self.log([u"Creating %s object... done", class_name])
         except ValueError as exc:
             self.log_exc(
                 u"Creating %s object failed" % (class_name),
                 exc,
                 False,
                 None
             )
     return result
Beispiel #3
0
    def _read_unparsed(self, lines):
        """
        Extract text fragments from the lines of an unparsed text file.

        The lines are joined with newlines and parsed with BeautifulSoup
        (``lxml`` parser). Elements are selected using the ``class`` and
        ``id`` regexes found in ``self.parameters``; the id and text of
        each selected element are collected, the ids are ordered with
        the requested ``IDSortingAlgorithm`` and the resulting
        (id, [text]) pairs are passed to ``self._create_text_fragments``.

        :param list lines: the lines of the unparsed text file
        """
        def build_attribute_filters():
            """ Map attribute names to the compiled regexes given to bs4 """
            compiled = {}
            for attribute_name, parameter_key in (
                    ("class", gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX),
                    ("id", gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX),
            ):
                if parameter_key not in self.parameters:
                    continue
                pattern = self.parameters[parameter_key]
                if pattern is None:
                    continue
                self.log([u"Regex for %s: '%s'", attribute_name, pattern])
                # the user regex must match a whole word inside the attribute
                compiled[attribute_name] = re.compile(r".*\b" + pattern + r"\b.*")
            return compiled

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # build the soup from the whole text
        self.log(u"Creating soup")
        markup = "\n".join(lines)
        soup = BeautifulSoup(markup, "lxml")

        # select the elements matching the class/id regexes
        attribute_filters = build_attribute_filters()
        self.log([u"Finding elements matching attributes '%s'", attribute_filters])
        collected_ids = []
        text_by_id = {}
        for element in soup.findAll(attrs=attribute_filters):
            try:
                element_id = gf.safe_unicode(element["id"])
                element_text = gf.safe_unicode(element.text)
                text_by_id[element_id] = element_text
                collected_ids.append(element_id)
            except KeyError:
                # raised e.g. by element["id"] when the element has no id
                self.log_warn(u"KeyError while parsing a node")

        # order the ids with the requested algorithm
        sorting_name = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", sorting_name])
        ordered_ids = IDSortingAlgorithm(sorting_name).sort(collected_ids)

        # create the fragments: each one has a single line of text
        self.log(u"Appending fragments")
        pairs = []
        for fragment_id in ordered_ids:
            pairs.append((fragment_id, [text_by_id[fragment_id]]))
        self._create_text_fragments(pairs)
Beispiel #4
0
 def test_safe_get(self):
     # each case: (dictionary, key, default value, expected result)
     cases = (
         (None, None, u"default", u"default"),
         (None, u"key", u"default", u"default"),
         ({}, None, u"default", u"default"),
         ({}, u"key", u"default", u"default"),
         ([], u"key", u"default", u"default"),
         ({u"key": u"value"}, None, u"default", u"default"),
         ({u"key": u"value"}, u"key", u"default", u"value"),
     )
     for dictionary, key, default, expected in cases:
         actual = gf.safe_get(dictionary, key, default)
         self.assertEqual(actual, expected)
Beispiel #5
0
 def _get_id_format(self):
     """
     Return the id format string taken from the parameters.

     The value is looked up in ``self.parameters`` under
     ``gc.PPN_TASK_OS_FILE_ID_REGEX``, with ``self.DEFAULT_ID_FORMAT``
     as the default. Despite the name of the parameter key, the value
     is used as a ``%``-style format string, not as a regex: it is
     checked here by formatting the integer ``1`` with it.

     :rtype: the (unmodified) id format
     :raises: ValueError: requested from ``self.log_exc`` if the trial
              formatting raises ``TypeError`` or ``ValueError``
     """
     id_format = gf.safe_get(self.parameters,
                             gc.PPN_TASK_OS_FILE_ID_REGEX,
                             self.DEFAULT_ID_FORMAT,
                             can_return_none=False)
     try:
         # trial formatting only: the result itself is not needed
         _ = id_format % 1
     except (TypeError, ValueError) as exc:
         self.log_exc(u"String '%s' is not a valid id format" % (id_format),
                      exc, True, ValueError)
     return id_format
Beispiel #6
0
 def _get_id_format(self):
     """
     Fetch and validate the format string used to build identifiers.

     The format is read from ``self.parameters`` (key
     ``gc.PPN_TASK_OS_FILE_ID_REGEX``, default
     ``self.DEFAULT_ID_FORMAT``). Although the key is named "regex",
     the value is applied with the ``%`` operator, so it is validated
     by formatting the integer ``1``.

     :rtype: the (unmodified) id format
     :raises: ValueError: requested from ``self.log_exc`` if the trial
              formatting raises ``TypeError`` or ``ValueError``
     """
     id_format = gf.safe_get(
         self.parameters,
         gc.PPN_TASK_OS_FILE_ID_REGEX,
         self.DEFAULT_ID_FORMAT,
         can_return_none=False
     )
     try:
         # only checking that the format accepts an integer;
         # the formatted value is discarded
         _ = id_format % 1
     except (TypeError, ValueError) as exc:
         self.log_exc(u"String '%s' is not a valid id format" % (id_format), exc, True, ValueError)
     return id_format
Beispiel #7
0
    def read(self, sync_map_format, input_file_path, parameters=None):
        """
        Load sync map fragments from a file and add them to this sync map.

        The file is decoded as UTF-8 and its text is handed to the reader
        class registered for ``sync_map_format`` in
        ``SyncMapFormat.CODE_TO_CLASS``. If ``parameters`` contains a
        language, it is then assigned to the text fragment of every
        fragment of this sync map.

        This method has no return value; invalid arguments are reported
        through ``self.log_exc`` with the exception types listed below.

        :param sync_map_format: the format of the sync map
        :type  sync_map_format: :class:`~aeneas.syncmap.SyncMapFormat`
        :param string input_file_path: the path to the input file to read
        :param dict parameters: additional parameters (e.g., for ``SMIL`` input)
        :raises: ValueError: if ``sync_map_format`` is ``None`` or it is not an allowed value
        :raises: OSError: if ``input_file_path`` cannot be read
        """
        # validate the arguments
        if sync_map_format is None:
            self.log_exc(u"Sync map format is None", None, True, ValueError)
        if sync_map_format not in SyncMapFormat.CODE_TO_CLASS:
            not_allowed = u"Sync map format '%s' is not allowed" % (sync_map_format)
            self.log_exc(not_allowed, None, True, ValueError)
        if not gf.file_can_be_read(input_file_path):
            unreadable = u"Cannot read sync map file '%s'. Wrong permissions?" % (input_file_path)
            self.log_exc(unreadable, None, True, OSError)

        for template, value in (
                (u"Input format:     '%s'", sync_map_format),
                (u"Input path:       '%s'", input_file_path),
                (u"Input parameters: '%s'", parameters),
        ):
            self.log([template, value])

        # instantiate the reader registered for this format
        reader_class = SyncMapFormat.CODE_TO_CLASS[sync_map_format]
        reader = reader_class(
            variant=sync_map_format,
            parameters=parameters,
            rconf=self.rconf,
            logger=self.logger
        )

        # read the whole file, then let the reader fill this sync map
        self.log(u"Reading input file...")
        with io.open(input_file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
        reader.parse(input_text=contents, syncmap=self)
        self.log(u"Reading input file... done")

        # overwrite the language of all the fragments, if requested
        forced_language = gf.safe_get(parameters, gc.PPN_SYNCMAP_LANGUAGE, None)
        if forced_language is None:
            return
        self.log([u"Overwriting language to '%s'", forced_language])
        for fragment in self.fragments:
            fragment.text_fragment.language = forced_language
Beispiel #8
0
    def _mplain_word_separator(self):
        """
        Resolve the word separator used to split words in mplain format.

        The value is read from ``self.parameters`` (a single space is
        passed to ``gf.safe_get`` as the default). ``None`` and the
        symbolic name ``space`` give a space; ``equal``, ``pipe`` and
        ``tab`` give ``=``, ``|`` and the tab character respectively;
        any other value is returned unchanged.

        :rtype: string
        """
        configured = gf.safe_get(
            self.parameters,
            gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR,
            u" "
        )
        if configured is None:
            return u" "
        # symbolic name -> actual separator
        symbolic_names = {
            "space": u" ",
            "equal": u"=",
            "pipe": u"|",
            "tab": u"\u0009",
        }
        return symbolic_names.get(configured, configured)
 def test_safe_get(self):
     populated = {u"key": u"value"}
     # each row: (dictionary, key, default value, expected result)
     table = [
         (None, None, u"default", u"default"),
         (None, u"key", u"default", u"default"),
         ({}, None, u"default", u"default"),
         ({}, u"key", u"default", u"default"),
         ([], u"key", u"default", u"default"),
         (populated, None, u"default", u"default"),
         (populated, u"key", u"default", u"value"),
     ]
     for dictionary, key, default, expected in table:
         self.assertEqual(gf.safe_get(dictionary, key, default), expected)
Beispiel #10
0
    def read(self, sync_map_format, input_file_path, parameters=None):
        """
        Parse the given file in the specified format and add the
        resulting fragments to the current (this) sync map.

        The reader class is taken from ``SyncMapFormat.CODE_TO_CLASS``;
        the file is read as UTF-8 text. When ``parameters`` specifies a
        language, the language of the text fragment of each fragment in
        ``self.fragments`` is overwritten with it.

        Nothing is returned; argument errors are reported via
        ``self.log_exc`` with the exception types listed below.

        :param sync_map_format: the format of the sync map
        :type  sync_map_format: :class:`~aeneas.syncmap.SyncMapFormat`
        :param string input_file_path: the path to the input file to read
        :param dict parameters: additional parameters (e.g., for ``SMIL`` input)
        :raises: ValueError: if ``sync_map_format`` is ``None`` or it is not an allowed value
        :raises: OSError: if ``input_file_path`` cannot be read
        """
        known_formats = SyncMapFormat.CODE_TO_CLASS
        if sync_map_format is None:
            self.log_exc(u"Sync map format is None", None, True, ValueError)
        if sync_map_format not in known_formats:
            self.log_exc(
                u"Sync map format '%s' is not allowed" % (sync_map_format),
                None,
                True,
                ValueError
            )
        if not gf.file_can_be_read(input_file_path):
            self.log_exc(
                u"Cannot read sync map file '%s'. Wrong permissions?" % (input_file_path),
                None,
                True,
                OSError
            )

        self.log([u"Input format:     '%s'", sync_map_format])
        self.log([u"Input path:       '%s'", input_file_path])
        self.log([u"Input parameters: '%s'", parameters])

        # build the format-specific reader
        build_reader = known_formats[sync_map_format]
        format_reader = build_reader(
            variant=sync_map_format,
            parameters=parameters,
            rconf=self.rconf,
            logger=self.logger
        )

        # load the text and let the reader populate this sync map
        self.log(u"Reading input file...")
        with io.open(input_file_path, "r", encoding="utf-8") as source:
            text = source.read()
        format_reader.parse(input_text=text, syncmap=self)
        self.log(u"Reading input file... done")

        # apply the language override, if any
        override = gf.safe_get(parameters, gc.PPN_SYNCMAP_LANGUAGE, None)
        if override is not None:
            self.log([u"Overwriting language to '%s'", override])
            for sync_fragment in self.fragments:
                sync_fragment.text_fragment.language = override
Beispiel #11
0
    def _mplain_word_separator(self):
        """
        Return the string used to separate words in mplain format.

        The configured value comes from ``self.parameters`` (a single
        space is passed to ``gf.safe_get`` as the default). The symbolic
        names ``space``, ``equal``, ``pipe`` and ``tab`` are mapped to
        the corresponding character, ``None`` is treated like ``space``,
        and any other value is returned as it is.

        :rtype: string
        """
        separator = gf.safe_get(
            self.parameters,
            gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR,
            u" "
        )
        if separator is None:
            return u" "
        # (symbolic name, actual separator), checked in order
        for symbolic_name, character in (
                ("space", u" "),
                ("equal", u"="),
                ("pipe", u"|"),
                ("tab", u"\u0009"),
        ):
            if separator == symbolic_name:
                return character
        return separator
Beispiel #12
0
    def _read_unparsed(self, lines):
        """
        Build text fragments from the lines of an unparsed text file.

        The joined lines are parsed with BeautifulSoup (``lxml`` parser)
        and the elements matching the ``class`` / ``id`` regexes found in
        ``self.parameters`` are selected. Their ids are ordered with the
        requested ``IDSortingAlgorithm`` and the resulting (id, [text])
        pairs are passed to ``self._create_text_fragments``.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def compile_attribute_regexes():
            """ Return the dict of compiled regexes used as bs4 ``attrs`` """
            regex_by_attribute = {}
            for attribute_name, parameter_key in (
                    ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
                    ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX),
            ):
                if parameter_key not in self.parameters:
                    continue
                regex_source = self.parameters[parameter_key]
                if regex_source is None:
                    continue
                self.log([u"Regex for %s: '%s'", attribute_name, regex_source])
                # the user regex must match a whole word inside the attribute
                whole_word = r".*\b" + regex_source + r"\b.*"
                regex_by_attribute[attribute_name] = re.compile(whole_word)
            return regex_by_attribute

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # parse the whole text into a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # collect id and text of the elements matching the regexes
        attrs = compile_attribute_regexes()
        self.log([u"Finding elements matching attributes '%s'", attrs])
        found_ids = []
        id_to_text = {}
        for tag in soup.findAll(attrs=attrs):
            try:
                tag_id = gf.safe_unicode(tag["id"])
                tag_text = gf.safe_unicode(tag.text)
                id_to_text[tag_id] = tag_text
                found_ids.append(tag_id)
            except KeyError:
                # raised e.g. by tag["id"] when the element has no id
                self.log_warn(u"KeyError while parsing a node")

        # sort the ids as requested
        algorithm = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", algorithm])
        ids_in_order = IDSortingAlgorithm(algorithm).sort(found_ids)

        # create one single-line fragment for each sorted id
        self.log(u"Appending fragments")
        fragments = [(tag_id, [id_to_text[tag_id]]) for tag_id in ids_in_order]
        self._create_text_fragments(fragments)
Beispiel #13
0
    def format(self, syncmap):
        """
        Serialize the given sync map as a SMIL (version 3.0) XML tree.

        The output has a root ``smil`` element containing ``body`` and a
        top-level ``seq`` that references the text page. For a
        single-level sync map, one ``par`` (with ``text`` and ``audio``
        children) is emitted per fragment. Otherwise three nested levels
        of ``seq`` elements (paragraph, sentence, word) are emitted, with
        one ``par`` inside each word ``seq``.

        The page and audio references are read from ``self.parameters``
        and are both required.

        :param syncmap: the sync map to be formatted
        :returns: the value returned by ``self._tree_to_string`` for the
                  built tree (without XML declaration)
        :raises: SyncMapMissingParameterError: requested from
                 ``self.log_exc`` if a required parameter is missing
        """
        # check for required parameters
        for key in [
                gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF,
                gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF
        ]:
            if gf.safe_get(self.parameters, key, None) is None:
                self.log_exc(u"Parameter %s must be specified for format %s" % (key, self.variant), None, True, SyncMapMissingParameterError)

        # imported only after the parameter check, as in the original code
        from lxml import etree
        # we are sure we have them
        text_ref = self.parameters[gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF]
        audio_ref = self.parameters[gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF]

        # namespaces
        smil_ns = "http://www.w3.org/ns/SMIL"
        epub_ns = "http://www.idpf.org/2007/ops"
        ns_map = {None: smil_ns, "epub": epub_ns}

        def add_par(parent, fragment, par_id=None):
            """ Append a par (text + audio) for the fragment to parent """
            par_elem = etree.SubElement(parent, "{%s}par" % smil_ns)
            if par_id is not None:
                par_elem.attrib["id"] = par_id
            text_elem = etree.SubElement(par_elem, "{%s}text" % smil_ns)
            text_elem.attrib["src"] = "%s#%s" % (text_ref, fragment.text_fragment.identifier)
            audio_elem = etree.SubElement(par_elem, "{%s}audio" % smil_ns)
            audio_elem.attrib["src"] = audio_ref
            audio_elem.attrib["clipBegin"] = self.format_time_function(fragment.begin)
            audio_elem.attrib["clipEnd"] = self.format_time_function(fragment.end)
            return par_elem

        def add_typed_seq(parent, seq_type, identifier):
            """ Append a seq with the given epub:type and textref to parent """
            seq = etree.SubElement(parent, "{%s}seq" % smil_ns)
            seq.attrib["{%s}type" % epub_ns] = seq_type
            seq.attrib["{%s}textref" % epub_ns] = text_ref + "#" + identifier
            return seq

        # build tree
        smil_elem = etree.Element("{%s}smil" % smil_ns, nsmap=ns_map)
        smil_elem.attrib["version"] = "3.0"
        body_elem = etree.SubElement(smil_elem, "{%s}body" % smil_ns)
        seq_elem = etree.SubElement(body_elem, "{%s}seq" % smil_ns)
        seq_elem.attrib["id"] = u"seq000001"
        seq_elem.attrib["{%s}textref" % epub_ns] = text_ref

        if syncmap.is_single_level:
            # single level: one numbered par per fragment
            for i, fragment in enumerate(syncmap.fragments, 1):
                add_par(seq_elem, fragment, par_id="par%06d" % (i))
        else:
            # TODO support generic multiple levels
            # multiple levels: paragraph > sentence > word
            # NOTE: the nested seq elements and their par carry no id
            #       attribute (id generation was disabled in the
            #       original code)
            for par_child in syncmap.fragments_tree.children_not_empty:
                par_seq_elem = add_typed_seq(
                    seq_elem,
                    "paragraph",
                    par_child.value.text_fragment.identifier
                )
                for sen_child in par_child.children_not_empty:
                    sen_seq_elem = add_typed_seq(
                        par_seq_elem,
                        "sentence",
                        sen_child.value.text_fragment.identifier
                    )
                    for wor_child in sen_child.children_not_empty:
                        fragment = wor_child.value
                        wor_seq_elem = add_typed_seq(
                            sen_seq_elem,
                            "word",
                            fragment.text_fragment.identifier
                        )
                        add_par(wor_seq_elem, fragment)
        return self._tree_to_string(smil_elem, xml_declaration=False)