コード例 #1
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Validate the Tokenize options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'char_by_char' -- bool (default: False).
        - 'tokens' -- string path that must exist (default: None).
        - 'input_offsets' -- string path that must exist (default: None).
        - 'output_offsets' -- string, existence not checked (default: None).

        'input_offsets' and 'output_offsets' must be given together.

        Raises UnitexException on a wrongly typed value, on a missing file
        or when only one of the two offsets options is set.
        """
        per_char = options.get("char_by_char", False)
        if not isinstance(per_char, bool):
            raise UnitexException("[TOKENIZE] Wrong value for the 'char_by_char' option. Boolean required.")
        self["char_by_char"] = per_char

        tokens_path = options.get("tokens")
        if tokens_path is not None and not isinstance(tokens_path, str):
            raise UnitexException("[TOKENIZE] Wrong value for the 'tokens' option. String required.")
        if tokens_path is not None and exists(tokens_path) is False:
            raise UnitexException("[TOKENIZE] Tokens file '%s' doesn't exist." % tokens_path)
        self["tokens"] = tokens_path

        offsets_in = options.get("input_offsets")
        if offsets_in is not None and not isinstance(offsets_in, str):
            raise UnitexException("[TOKENIZE] Wrong value for the 'input_offsets' option. String required.")
        if offsets_in is not None and exists(offsets_in) is False:
            raise UnitexException("[TOKENIZE] Offsets file '%s' doesn't exist." % offsets_in)
        self["input_offsets"] = offsets_in

        offsets_out = options.get("output_offsets")
        if not (offsets_out is None or isinstance(offsets_out, str)):
            raise UnitexException("[TOKENIZE] Wrong value for the 'output_offsets' option. String required.")
        self["output_offsets"] = offsets_out

        # Exactly one of the two offsets options being set is an error.
        if (self["input_offsets"] is None) != (self["output_offsets"] is None):
            raise UnitexException("[TOKENIZE] You must provide both input and output offsets...")
コード例 #2
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Validate the Txt2Tfst options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'clean' -- bool (default: False).
        - 'normalization_grammar' -- string path that must exist
          (default: None).
        - 'tagset' -- string path that must exist (default: None).
        - 'korean' -- bool (default: False).

        Raises UnitexException when a value has the wrong type or when a
        referenced file doesn't exist.
        """
        clean = options.get("clean", False)
        if not isinstance(clean, bool):
            raise UnitexException("[TXT2TFST] Wrong value for the 'clean' option. Boolean required.")
        self["clean"] = clean

        normalization_grammar = options.get("normalization_grammar", None)
        if normalization_grammar is not None:
            if not isinstance(normalization_grammar, str):
                raise UnitexException("[TXT2TFST] Wrong value for the 'normalization_grammar' option. String required.")
            if exists(normalization_grammar) is False:
                # Name the resource that is really missing, so the error is
                # not mistaken for an offsets problem.
                raise UnitexException("[TXT2TFST] Normalization grammar '%s' doesn't exist." % normalization_grammar)
        self["normalization_grammar"] = normalization_grammar

        tagset = options.get("tagset", None)
        if tagset is not None:
            if not isinstance(tagset, str):
                raise UnitexException("[TXT2TFST] Wrong value for the 'tagset' option. String required.")
            if exists(tagset) is False:
                raise UnitexException("[TXT2TFST] Tagset file '%s' doesn't exist." % tagset)
        self["tagset"] = tagset

        korean = options.get("korean", False)
        if not isinstance(korean, bool):
            raise UnitexException("[TXT2TFST] Wrong value for the 'korean' option. Boolean required.")
        self["korean"] = korean
コード例 #3
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Validate the Dico options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'morpho' -- list whose entries must all exist (default: None).
        - 'korean' -- bool (default: False).
        - 'semitic' -- bool (default: False).
        - 'arabic_rules' -- string path that must exist (default: None).
        - 'raw' -- string (default: None).

        Raises UnitexException when a value has the wrong type or when a
        referenced file doesn't exist.
        """
        morpho_dics = options.get("morpho")
        if morpho_dics is not None:
            if not isinstance(morpho_dics, list):
                raise UnitexException("[DICO] Wrong value for the 'morpho' option. List of string required.")
            # Every listed morphological dictionary has to be present.
            for dic_path in morpho_dics:
                if exists(dic_path) is False:
                    raise UnitexException("[DICO] Morphological dictionary '%s' doesn't exist." % dic_path)
        self["morpho"] = morpho_dics

        is_korean = options.get("korean", False)
        if not isinstance(is_korean, bool):
            raise UnitexException("[DICO] Wrong value for the 'korean' option. Boolean required.")
        self["korean"] = is_korean

        is_semitic = options.get("semitic", False)
        if not isinstance(is_semitic, bool):
            raise UnitexException("[DICO] Wrong value for the 'semitic' option. Boolean required.")
        self["semitic"] = is_semitic

        rules_path = options.get("arabic_rules")
        if rules_path is not None and not isinstance(rules_path, str):
            raise UnitexException("[DICO] Wrong value for the 'arabic_rules' option. String required.")
        if rules_path is not None and exists(rules_path) is False:
            raise UnitexException("[DICO] Rules file '%s' doesn't exist." % rules_path)
        self["arabic_rules"] = rules_path

        raw_value = options.get("raw")
        if not (raw_value is None or isinstance(raw_value, str)):
            raise UnitexException("[DICO] Wrong value for the 'raw' option. String required.")
        self["raw"] = raw_value
コード例 #4
0
    def load(self, options):
        """Validate the Normalize options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'no_carriage_return' -- bool (default: False).
        - 'input_offsets' -- string path that must exist (default: None).
        - 'output_offsets' -- string, existence not checked (default: None).
        - 'no_separator_normalization' -- bool (default: False).
        - 'replacement_rules' -- string path that must exist
          (default: None).

        'input_offsets' and 'output_offsets' must be given together.

        Raises UnitexException on a wrongly typed value, on a missing file
        or when only one of the two offsets options is set.
        """
        keep_lines = options.get("no_carriage_return", False)
        if not isinstance(keep_lines, bool):
            raise UnitexException(
                "[NORMALIZE] Wrong value for the 'no_carriage_return' option. Boolean required."
            )
        self["no_carriage_return"] = keep_lines

        offsets_in = options.get("input_offsets")
        if offsets_in is not None and not isinstance(offsets_in, str):
            raise UnitexException(
                "[NORMALIZE] Wrong value for the 'input_offsets' option. String required."
            )
        if offsets_in is not None and exists(offsets_in) is False:
            raise UnitexException(
                "[NORMALIZE] Offsets file '%s' doesn't exist." % offsets_in)
        self["input_offsets"] = offsets_in

        offsets_out = options.get("output_offsets")
        if not (offsets_out is None or isinstance(offsets_out, str)):
            raise UnitexException(
                "[NORMALIZE] Wrong value for the 'output_offsets' option. String required."
            )
        self["output_offsets"] = offsets_out

        # Exactly one of the two offsets options being set is an error.
        if (self["input_offsets"] is None) != (self["output_offsets"] is None):
            raise UnitexException(
                "[NORMALIZE] You must provide both input and output offsets..."
            )

        keep_separators = options.get("no_separator_normalization", False)
        if not isinstance(keep_separators, bool):
            raise UnitexException(
                "[NORMALIZE] Wrong value for the 'no_separator_normalization' option. Boolean required."
            )
        self["no_separator_normalization"] = keep_separators

        rules_path = options.get("replacement_rules")
        if rules_path is not None and not isinstance(rules_path, str):
            raise UnitexException(
                "[NORMALIZE] Wrong value for the 'replacement_rules' option. String required."
            )
        if rules_path is not None and exists(rules_path) is False:
            raise UnitexException(
                "[NORMALIZE] Rules file '%s' doesn't exist." % rules_path)
        self["replacement_rules"] = rules_path
コード例 #5
0
def extract(text, output, index, **kwargs):
    """
    Extract from a text the sentences selected by a concordance index.

    By default the sentences containing at least one occurrence from
    the concordance are extracted. <text> is the complete path of the
    text file, including the .snt extension.

    *Arguments:*

    - **text [str]** -- the text file (.snt format).

    - **output [str]** -- the output text file.

    - **index [str]** -- the index file path (produced by the 'locate'
      function).

    *Keyword arguments:*

    - **non_matching_sentences [bool]** -- extract the sentences that
      don't contain matching units instead (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text file or the index file doesn't
      exist.
    """
    options = ExtractOptions()
    options.load(kwargs)

    if exists(text) is False:
        raise UnitexException("[EXTRACT] Text file '%s' doesn't exists" % text)
    if exists(index) is False:
        raise UnitexException("[EXTRACT] Index file '%s' doesn't exists" %
                              index)

    # '--yes' keeps the matching sentences, '--no' the non matching ones.
    if options["non_matching_sentences"] is False:
        selection = "--yes"
    else:
        selection = "--no"

    command = " ".join([
        "UnitexTool",
        "Extract",
        selection,
        "--output=%s" % output,
        "--index=%s" % index,
        text,
        "-qutf8-no-bom",
    ])

    _LOGGER.info("Extracting sentences")
    _LOGGER.debug("Command: %s", command)
    return _unitex.unitex_tool(command)
コード例 #6
0
ファイル: tools.py プロジェクト: patwat/python-unitex
def extract(text, output, index, **kwargs):
    """
    Extract the sentences of a text according to a concordance index.

    Unless 'non_matching_sentences' is set, the extracted sentences are
    those containing at least one occurrence from the concordance.
    <text> must be the complete path of the text file, .snt extension
    included.

    *Arguments:*

    - **text [str]** -- the text file (.snt format).

    - **output [str]** -- the output text file.

    - **index [str]** -- the index file path (produced by the 'locate'
      function).

    *Keyword arguments:*

    - **non_matching_sentences [bool]** -- extract the sentences that
      don't contain matching units instead (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text file or the index file doesn't
      exist.
    """
    options = ExtractOptions()
    options.load(kwargs)

    if exists(text) is False:
        raise UnitexException("[EXTRACT] Text file '%s' doesn't exists" % text)
    if exists(index) is False:
        raise UnitexException("[EXTRACT] Index file '%s' doesn't exists" % index)

    # '--yes' selects the matching sentences, '--no' the other ones.
    selection = "--yes" if options["non_matching_sentences"] is False else "--no"

    arguments = ["UnitexTool", "Extract", selection]
    arguments += ["--output=%s" % output, "--index=%s" % index]
    arguments += [text, "-qutf8-no-bom"]
    command = " ".join(arguments)

    _LOGGER.info("Extracting sentences")
    _LOGGER.debug("Command: %s", command)
    return _unitex.unitex_tool(command)
コード例 #7
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Validate the SortTxt options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'duplicates' -- bool (default: False).
        - 'reverse' -- bool (default: False).
        - 'sort_order' -- string path that must exist (default: None).
        - 'line_info' -- string, existence not checked (default: None).
        - 'thai' -- bool (default: False).
        - 'factorize_inflectional_codes' -- bool (default: False).

        Raises UnitexException when a value has the wrong type or when the
        'sort_order' file doesn't exist.
        """
        keep_duplicates = options.get("duplicates", False)
        if not isinstance(keep_duplicates, bool):
            raise UnitexException("[SORTTXT] Wrong value for the 'duplicates' option. Boolean required.")
        self["duplicates"] = keep_duplicates

        descending = options.get("reverse", False)
        if not isinstance(descending, bool):
            raise UnitexException("[SORTTXT] Wrong value for the 'reverse' option. Boolean required.")
        self["reverse"] = descending

        alphabet_path = options.get("sort_order")
        if alphabet_path is not None and not isinstance(alphabet_path, str):
            raise UnitexException("[SORTTXT] Wrong value for the 'sort_order' option. String required.")
        if alphabet_path is not None and exists(alphabet_path) is False:
            raise UnitexException("[SORTTXT] Alphabet file '%s' doesn't exist." % alphabet_path)
        self["sort_order"] = alphabet_path

        info_path = options.get("line_info")
        if not (info_path is None or isinstance(info_path, str)):
            raise UnitexException("[SORTTXT] Wrong value for the 'line_info' option. String required.")
        self["line_info"] = info_path

        is_thai = options.get("thai", False)
        if not isinstance(is_thai, bool):
            raise UnitexException("[SORTTXT] Wrong value for the 'thai' option. Boolean required.")
        self["thai"] = is_thai

        factorize = options.get("factorize_inflectional_codes", False)
        if not isinstance(factorize, bool):
            raise UnitexException("[SORTTXT] Wrong value for the 'factorize_inflectional_codes' option. Boolean required.")
        self["factorize_inflectional_codes"] = factorize
コード例 #8
0
def check_dic(dictionary, dtype, alphabet, **kwargs):
    """
    Check the format of a dictionary with the CheckDic tool.

    According to the original documentation, the check result
    informations are written to a file named CHECK_DIC.TXT, stored in
    the directory of the dictionary.

    *Arguments:*

    - **dictionary [str]** -- the dictionary file path.

    - **dtype [str]** -- the dictionary type:
      - UnitexConstants.DELAF (inflected);
      - UnitexConstants.DELAS (non inflected).
      Any other value adds no type flag to the command.

    - **alphabet [str]** -- the alphabet file path.

    *Keyword arguments:*

    - **strict [bool]** -- strict syntax checking against unprotected
      dot and comma (default: False).

    - **no_space_warning [bool]** -- tolerates spaces in grammatical,
      semantic and inflectional codes (default: True).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the dictionary file doesn't exist.
    """
    options = CheckDicOptions()
    options.load(kwargs)

    if exists(dictionary) is False:
        raise UnitexException(
            "[CHECKDIC] Dictionary file '%s' doesn't exists" % dictionary)

    # The type flag is optional: unknown types leave the list empty.
    if dtype == UnitexConstants.DELAF:
        type_flags = ["--delaf"]
    elif dtype == UnitexConstants.DELAS:
        type_flags = ["--delas"]
    else:
        type_flags = []

    command = ["UnitexTool", "CheckDic"] + type_flags

    if options["strict"] is True:
        command.append("--strict")
    if options["no_space_warning"] is True:
        command.append("--no_space_warning")

    command.extend(["--alphabet=%s" % alphabet, dictionary, "-qutf8-no-bom"])
    command = " ".join(command)

    _LOGGER.info("Checking dic '%s'" % dictionary)
    _LOGGER.debug("Command: %s", command)
    return _unitex.unitex_tool(command)
コード例 #9
0
ファイル: tools.py プロジェクト: patwat/python-unitex
def check_dic(dictionary, dtype, alphabet, **kwargs):
    """
    Run the CheckDic tool on a dictionary to check its format.

    According to the original documentation, a file named CHECK_DIC.TXT
    holding the check result informations is produced in the directory
    of the dictionary.

    *Arguments:*

    - **dictionary [str]** -- the dictionary file path.

    - **dtype [str]** -- the dictionary type:
      - UnitexConstants.DELAF (inflected);
      - UnitexConstants.DELAS (non inflected).
      Any other value adds no type flag to the command.

    - **alphabet [str]** -- the alphabet file path.

    *Keyword arguments:*

    - **strict [bool]** -- strict syntax checking against unprotected
      dot and comma (default: False).

    - **no_space_warning [bool]** -- tolerates spaces in grammatical,
      semantic and inflectional codes (default: True).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the dictionary file doesn't exist.
    """
    options = CheckDicOptions()
    options.load(kwargs)

    if exists(dictionary) is False:
        raise UnitexException("[CHECKDIC] Dictionary file '%s' doesn't exists" % dictionary)

    arguments = ["UnitexTool", "CheckDic"]

    # At most one type flag is emitted; unknown types add nothing.
    if dtype == UnitexConstants.DELAF:
        arguments += ["--delaf"]
    elif dtype == UnitexConstants.DELAS:
        arguments += ["--delas"]

    if options["strict"] is True:
        arguments += ["--strict"]
    if options["no_space_warning"] is True:
        arguments += ["--no_space_warning"]

    arguments += ["--alphabet=%s" % alphabet, dictionary, "-qutf8-no-bom"]
    command = " ".join(arguments)

    _LOGGER.info("Checking dic '%s'" % dictionary)
    _LOGGER.debug("Command: %s", command)
    return _unitex.unitex_tool(command)
コード例 #10
0
    def load(self, options):
        """Check the Tokenize options and copy them into this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'char_by_char' -- bool (default: False).
        - 'tokens' -- string path that must exist (default: None).
        - 'input_offsets' -- string path that must exist (default: None).
        - 'output_offsets' -- string, existence not checked (default: None).

        The two offsets options are accepted only together.

        Raises UnitexException on a wrongly typed value, on a missing file
        or when only one of the two offsets options is set.
        """
        flag = options.get("char_by_char", False)
        if not isinstance(flag, bool):
            raise UnitexException(
                "[TOKENIZE] Wrong value for the 'char_by_char' option. Boolean required."
            )
        self["char_by_char"] = flag

        tokens_file = options.get("tokens")
        if tokens_file is not None:
            if not isinstance(tokens_file, str):
                raise UnitexException(
                    "[TOKENIZE] Wrong value for the 'tokens' option. String required."
                )
            if exists(tokens_file) is False:
                raise UnitexException(
                    "[TOKENIZE] Tokens file '%s' doesn't exist." % tokens_file)
        self["tokens"] = tokens_file

        source_offsets = options.get("input_offsets")
        if source_offsets is not None:
            if not isinstance(source_offsets, str):
                raise UnitexException(
                    "[TOKENIZE] Wrong value for the 'input_offsets' option. String required."
                )
            if exists(source_offsets) is False:
                raise UnitexException(
                    "[TOKENIZE] Offsets file '%s' doesn't exist." %
                    source_offsets)
        self["input_offsets"] = source_offsets

        target_offsets = options.get("output_offsets")
        if target_offsets is not None and not isinstance(target_offsets, str):
            raise UnitexException(
                "[TOKENIZE] Wrong value for the 'output_offsets' option. String required."
            )
        self["output_offsets"] = target_offsets

        # Both offsets must be set, or both left unset.
        has_input = self["input_offsets"] is not None
        has_output = self["output_offsets"] is not None
        if has_input != has_output:
            raise UnitexException(
                "[TOKENIZE] You must provide both input and output offsets...")
コード例 #11
0
ファイル: ConceptParser.py プロジェクト: plependu/PiLabs
 def get_text(self, file_path):
     '''Return the text content of ``file_path`` as a list of lines.

     The file is read through a UnitexFile and the text is split with
     ``splitlines()``. If ``exists(file_path)`` is False an error line is
     written to stderr.
     '''
     if exists(file_path) is False:
         # NOTE(review): a missing file is only reported here; the open
         # below is still attempted. Confirm whether the caller expects
         # an exception or an empty result in that case.
         sys.stderr.write("[ERROR] File {} not found\n".format(file_path))
     unfile = UnitexFile()
     unfile.open(file_path, mode='r')
     try:
         unfile_txt = unfile.read()
     finally:
         # Release the file even when reading fails.
         unfile.close()
     return unfile_txt.splitlines()
コード例 #12
0
    def load(self, options):
        """Check the Dico options and copy them into this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'morpho' -- list whose entries must all exist (default: None).
        - 'korean' -- bool (default: False).
        - 'semitic' -- bool (default: False).
        - 'arabic_rules' -- string path that must exist (default: None).
        - 'raw' -- string (default: None).

        Raises UnitexException when a value has the wrong type or when a
        referenced file doesn't exist.
        """
        dictionaries = options.get("morpho")
        if dictionaries is not None and not isinstance(dictionaries, list):
            raise UnitexException(
                "[DICO] Wrong value for the 'morpho' option. List of string required."
            )
        # None means "no morphological dictionaries": nothing to check.
        for entry in (dictionaries if dictionaries is not None else ()):
            if exists(entry) is False:
                raise UnitexException(
                    "[DICO] Morphological dictionary '%s' doesn't exist." %
                    entry)
        self["morpho"] = dictionaries

        korean_mode = options.get("korean", False)
        if not isinstance(korean_mode, bool):
            raise UnitexException(
                "[DICO] Wrong value for the 'korean' option. Boolean required."
            )
        self["korean"] = korean_mode

        semitic_mode = options.get("semitic", False)
        if not isinstance(semitic_mode, bool):
            raise UnitexException(
                "[DICO] Wrong value for the 'semitic' option. Boolean required."
            )
        self["semitic"] = semitic_mode

        rules_file = options.get("arabic_rules")
        if rules_file is not None:
            if not isinstance(rules_file, str):
                raise UnitexException(
                    "[DICO] Wrong value for the 'arabic_rules' option. String required."
                )
            if exists(rules_file) is False:
                raise UnitexException("[DICO] Rules file '%s' doesn't exist." %
                                      rules_file)
        self["arabic_rules"] = rules_file

        raw_output = options.get("raw")
        if raw_output is not None and not isinstance(raw_output, str):
            raise UnitexException(
                "[DICO] Wrong value for the 'raw' option. String required.")
        self["raw"] = raw_output
コード例 #13
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Validate the resources options and store them in this mapping.

        Options read from ``options``:

        - 'language' -- mandatory.
        - 'alphabet' -- path that must exist when given.
        - 'alphabet-sorted' -- path that must exist when given.
        - 'sentence' -- existing grammar with the '.fst2' extension.
        - 'replace' -- existing grammar with the '.fst2' extension.
        - 'dictionaries' -- list of existing '.bin' or '.fst2' files; a
          '.bin' entry also needs its '.inf' companion file.

        Every option except 'language' may be omitted, in which case a
        warning is logged and None is stored.

        Raises UnitexException when 'language' is missing, when a file
        doesn't exist, when an extension is wrong or when 'dictionaries'
        is not a list.
        """
        language = options.get("language")
        if language is None:
            raise UnitexException("[RESOURCES] You must specify the 'language' element.")
        self["language"] = language

        alphabet = options.get("alphabet")
        if alphabet is not None:
            if not exists(alphabet):
                raise UnitexException("[RESOURCES] Alphabet file '%s' doesn't exist." % alphabet)
        else:
            _LOGGER.warning("[RESOURCES] No alphabet file provided.")
        self["alphabet"] = alphabet

        sorted_alphabet = options.get("alphabet-sorted")
        if sorted_alphabet is not None:
            if not exists(sorted_alphabet):
                raise UnitexException("[RESOURCES] Sorted alphabet file '%s' doesn't exist." % sorted_alphabet)
        else:
            _LOGGER.warning("[RESOURCES] No sorted alphabet file provided.")
        self["alphabet-sorted"] = sorted_alphabet

        sentence = options.get("sentence")
        if sentence is not None:
            # The extension is checked before the file itself.
            if os.path.splitext(sentence)[1] != ".fst2":
                raise UnitexException("[RESOURCES] Wrong extension for '%s'. Grammars must be compiled and have the '.fst2' extension." % sentence)
            if not exists(sentence):
                raise UnitexException("[RESOURCES] Sentence grammar file '%s' doesn't exist." % sentence)
        else:
            _LOGGER.warning("[RESOURCES] No sentence grammar provided.")
        self["sentence"] = sentence

        replace = options.get("replace")
        if replace is not None:
            if os.path.splitext(replace)[1] != ".fst2":
                raise UnitexException("[RESOURCES] Wrong extension for '%s'. Grammars must be compiled and have the '.fst2' extension." % replace)
            if not exists(replace):
                raise UnitexException("[RESOURCES] Replace grammar file '%s' doesn't exist." % replace)
        else:
            _LOGGER.warning("[RESOURCES] No replace grammar provided.")
        self["replace"] = replace

        dictionaries = options.get("dictionaries")
        if dictionaries is not None:
            if not isinstance(dictionaries, list):
                raise UnitexException("[RESOURCES] The 'dictionaries' element must be a list of .bin or .fst2 files.")
            for dic_path in dictionaries:
                stem, suffix = os.path.splitext(dic_path)
                if suffix not in (".bin", ".fst2"):
                    raise UnitexException("[RESOURCES] Wrong extension for '%s'. Dictionaries must be compiled and have the '.bin' or the '.fst2' extension." % dic_path)
                if not exists(dic_path):
                    raise UnitexException("[RESOURCES] Dictionary file '%s' doesn't exist." % dic_path)
                # A compiled '.bin' dictionary needs its '.inf' file too.
                if suffix == ".bin" and not exists("%s.inf" % stem):
                    raise UnitexException("[RESOURCES] Dictionary .inf file missing for '%s'." % dic_path)
        else:
            _LOGGER.warning("[RESOURCES] No dictionaries provided.")
        self["dictionaries"] = dictionaries
コード例 #14
0
    def load(self, options):
        """Validate the Txt2Tfst options and store them in this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'clean' -- bool (default: False).
        - 'normalization_grammar' -- string path that must exist
          (default: None).
        - 'tagset' -- string path that must exist (default: None).
        - 'korean' -- bool (default: False).

        Raises UnitexException when a value has the wrong type or when a
        referenced file doesn't exist.
        """
        clean = options.get("clean", False)
        if not isinstance(clean, bool):
            raise UnitexException(
                "[TXT2TFST] Wrong value for the 'clean' option. Boolean required."
            )
        self["clean"] = clean

        normalization_grammar = options.get("normalization_grammar", None)
        if normalization_grammar is not None:
            if not isinstance(normalization_grammar, str):
                raise UnitexException(
                    "[TXT2TFST] Wrong value for the 'normalization_grammar' option. String required."
                )
            if exists(normalization_grammar) is False:
                # Name the resource that is really missing, so the error is
                # not mistaken for an offsets problem.
                raise UnitexException(
                    "[TXT2TFST] Normalization grammar '%s' doesn't exist." %
                    normalization_grammar)
        self["normalization_grammar"] = normalization_grammar

        tagset = options.get("tagset", None)
        if tagset is not None:
            if not isinstance(tagset, str):
                raise UnitexException(
                    "[TXT2TFST] Wrong value for the 'tagset' option. String required."
                )
            if exists(tagset) is False:
                raise UnitexException(
                    "[TXT2TFST] Tagset file '%s' doesn't exist." % tagset)
        self["tagset"] = tagset

        korean = options.get("korean", False)
        if not isinstance(korean, bool):
            raise UnitexException(
                "[TXT2TFST] Wrong value for the 'korean' option. Boolean required."
            )
        self["korean"] = korean
コード例 #15
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """Check the Normalize options and copy them into this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'no_carriage_return' -- bool (default: False).
        - 'input_offsets' -- string path that must exist (default: None).
        - 'output_offsets' -- string, existence not checked (default: None).
        - 'no_separator_normalization' -- bool (default: False).
        - 'replacement_rules' -- string path that must exist
          (default: None).

        The two offsets options are accepted only together.

        Raises UnitexException on a wrongly typed value, on a missing file
        or when only one of the two offsets options is set.
        """
        no_cr = options.get("no_carriage_return", False)
        if not isinstance(no_cr, bool):
            raise UnitexException("[NORMALIZE] Wrong value for the 'no_carriage_return' option. Boolean required.")
        self["no_carriage_return"] = no_cr

        source_offsets = options.get("input_offsets")
        if source_offsets is not None:
            if not isinstance(source_offsets, str):
                raise UnitexException("[NORMALIZE] Wrong value for the 'input_offsets' option. String required.")
            if exists(source_offsets) is False:
                raise UnitexException("[NORMALIZE] Offsets file '%s' doesn't exist." % source_offsets)
        self["input_offsets"] = source_offsets

        target_offsets = options.get("output_offsets")
        if target_offsets is not None and not isinstance(target_offsets, str):
            raise UnitexException("[NORMALIZE] Wrong value for the 'output_offsets' option. String required.")
        self["output_offsets"] = target_offsets

        # Both offsets must be set, or both left unset.
        has_input = self["input_offsets"] is not None
        has_output = self["output_offsets"] is not None
        if has_input != has_output:
            raise UnitexException("[NORMALIZE] You must provide both input and output offsets...")

        no_sep = options.get("no_separator_normalization", False)
        if not isinstance(no_sep, bool):
            raise UnitexException("[NORMALIZE] Wrong value for the 'no_separator_normalization' option. Boolean required.")
        self["no_separator_normalization"] = no_sep

        rules_file = options.get("replacement_rules")
        if rules_file is not None:
            if not isinstance(rules_file, str):
                raise UnitexException("[NORMALIZE] Wrong value for the 'replacement_rules' option. String required.")
            if exists(rules_file) is False:
                raise UnitexException("[NORMALIZE] Rules file '%s' doesn't exist." % rules_file)
        self["replacement_rules"] = rules_file
コード例 #16
0
    def load(self, options):
        """Check the SortTxt options and copy them into this mapping.

        Options read from ``options`` (a missing key uses its default):

        - 'duplicates' -- bool (default: False).
        - 'reverse' -- bool (default: False).
        - 'sort_order' -- string path that must exist (default: None).
        - 'line_info' -- string, existence not checked (default: None).
        - 'thai' -- bool (default: False).
        - 'factorize_inflectional_codes' -- bool (default: False).

        Raises UnitexException when a value has the wrong type or when the
        'sort_order' file doesn't exist.
        """
        with_duplicates = options.get("duplicates", False)
        if not isinstance(with_duplicates, bool):
            raise UnitexException(
                "[SORTTXT] Wrong value for the 'duplicates' option. Boolean required."
            )
        self["duplicates"] = with_duplicates

        reversed_order = options.get("reverse", False)
        if not isinstance(reversed_order, bool):
            raise UnitexException(
                "[SORTTXT] Wrong value for the 'reverse' option. Boolean required."
            )
        self["reverse"] = reversed_order

        order_file = options.get("sort_order")
        if order_file is not None:
            if not isinstance(order_file, str):
                raise UnitexException(
                    "[SORTTXT] Wrong value for the 'sort_order' option. String required."
                )
            if exists(order_file) is False:
                raise UnitexException(
                    "[SORTTXT] Alphabet file '%s' doesn't exist." % order_file)
        self["sort_order"] = order_file

        info_file = options.get("line_info")
        if info_file is not None and not isinstance(info_file, str):
            raise UnitexException(
                "[SORTTXT] Wrong value for the 'line_info' option. String required."
            )
        self["line_info"] = info_file

        thai_mode = options.get("thai", False)
        if not isinstance(thai_mode, bool):
            raise UnitexException(
                "[SORTTXT] Wrong value for the 'thai' option. Boolean required."
            )
        self["thai"] = thai_mode

        merge_codes = options.get("factorize_inflectional_codes", False)
        if not isinstance(merge_codes, bool):
            raise UnitexException(
                "[SORTTXT] Wrong value for the 'factorize_inflectional_codes' option. Boolean required."
            )
        self["factorize_inflectional_codes"] = merge_codes
コード例 #17
0
def sort_txt(text, **kwargs):
    """
    This function carries out a lexicographical sorting of the lines of
    file <txt>. <txt> represents the complete path of the file to be
    sorted.

    The input text file is modified. By default, the sorting is
    performed in the order of Unicode characters, removing duplicate
    lines.

    *Arguments:*

    - **text [str]** -- the text file to sort.

    *Keyword arguments:*

    - **duplicates [bool]** -- keep duplicate lines (default: False).

    - **reverse [bool]** -- sort in descending order (default: False).

    - **sort_order [str]** -- sorts using the alphabet order defined in
      this file. If this parameter is missing, the sorting is done
      according to the order of Unicode characters.

    - **line_info [str]** -- backup the number of lines of the result
      file in this file.

    - **thai [bool]** -- option for sorting Thai text (default: False).

    - **factorize_inflectional_codes [bool]** -- makes two entries
      X,Y.Z:A and X,Y.Z:B become a single entry X,Y.Z:A:B
      (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text file doesn't exist.
    """
    options = SortTxtOptions()
    options.load(kwargs)

    if exists(text) is False:
        raise UnitexException("[SORTTXT] Text file '%s' doesn't exists" % text)

    command = ["UnitexTool", "SortTxt"]

    if options["duplicates"] is False:
        command.append("--no_duplicates")
    else:
        command.append("--duplicates")

    if options["reverse"] is True:
        command.append("--reverse")
    # Forward the optional file arguments only when they were provided;
    # otherwise a literal "None" would end up on the command line and the
    # user-supplied value would be dropped.
    if options["sort_order"] is not None:
        command.append("--sort_order=%s" % options["sort_order"])
    if options["line_info"] is not None:
        command.append("--line_info=%s" % options["line_info"])
    if options["thai"] is True:
        command.append("--thai")
    if options["factorize_inflectional_codes"] is True:
        command.append("--factorize_inflectional_codes")

    command.append(text)

    command.append("-qutf8-no-bom")
    command = " ".join(command)

    _LOGGER.info("Sorting file '%s'...", text)
    _LOGGER.debug("Command: %s", command)
    ret = _unitex.unitex_tool(command)

    return ret
コード例 #18
0
def concord(index, alphabet, **kwargs):
    """
    This function takes a concordance index file produced by the
    function 'locate' and produces a concordance. It is also possible to
    produce a modified text version taking into account the transducer
    outputs associated to the occurrences.

    The result of the application of this function is a file called
    concord.txt if the concordance was constructed in text mode, a file
    called concord.html if 'format' is UnitexConstants.FORMAT_HTML,
    UnitexConstants.FORMAT_GLOSSANET or UnitexConstants.FORMAT_SCRIPT,
    and a text file with the name defined by the user of the function if
    the function has constructed a modified version of the text.

    In html mode, the occurrence is coded as a hypertext link. The
    reference associated to this link is of the form <a href="X Y Z">.
    X and Y represent the beginning and ending positions of the
    occurrence in characters in the file text_name.snt. Z represents the
    number of the sentence in which the occurrence was found.

    *Arguments:*

    - **index [str]** -- the index file path (produced by the 'locate'
      function).

    - **alphabet [str]** -- alphabet file used for sorting.

    *Keyword arguments:*

    - *Generic options:*

      - **font [str]** -- the name of the font to use if the output is
        an HTML file.

      - **fontsize [int]** -- the font size to use if the output is an
        HTML file.

      - **only_ambiguous [bool]** -- Only displays identical occurrences
        with ambiguous outputs, in text order (default: False).

      - **only_matches [bool]** -- this option will force empty right
        and left contexts. Moreover, if used with
        UnitexConstants.FORMAT_TEXT, the function will not surround
        matches with tabulations (default: False).

      - **left [str]** -- number of characters on the left of the
        occurrences (default=0). In Thai mode, this means the number of
        non-diacritic characters.

      - **right [str]** -- number of characters (non-diacritic ones in
        Thai mode) on the right of the occurrences (default=0). If the
        occurrence is shorter than this value, the concordance line is
        completed up to right. If the occurrence is longer than the
        length defined by right, it is nevertheless saved as whole.

      **NOTE:** For both 'left' and 'right', you can add the 's'
      character to stop at the first {S} tag. For instance, if you set
      '40s' for the left value, the left context will end at 40
      characters at most, less if the {S} tag is found before.

    - *Sort options:*

      - **sort [str]** -- specifies the sort order. Possible values:

        - UnitexConstants.SORT_TEXT_ORDER: order in which the
          occurrences appear in the text (default);

        - UnitexConstants.SORT_LEFT_CENTER: left context for primary
          sort, then occurrence for secondary sort;

        - UnitexConstants.SORT_LEFT_RIGHT: left context, then right
          context;

        - UnitexConstants.SORT_CENTER_LEFT: occurrence, then left
          context;

        - UnitexConstants.SORT_CENTER_RIGHT: occurrence, then right
          context;

        - UnitexConstants.SORT_RIGHT_LEFT: right context, then left
          context;

        - UnitexConstants.SORT_RIGHT_CENTER: right context, then
          occurrence.

    - *Output options:*

      - **format [str]** -- specifies the output format. Possible
        values:

        - UnitexConstants.FORMAT_HTML: produces a concordance in HTML
          format encoded in UTF-8;

        - UnitexConstants.FORMAT_TEXT: produces a concordance in Unicode
          text format (default, as set by ConcordOptions);

        - UnitexConstants.FORMAT_GLOSSANET: produces a concordance for
          GlossaNet in HTML format where occurrences are links described
          by the 'script' argument (cf. Unitex manual p. 268). The HTML
          file is encoded in UTF-8;

        - UnitexConstants.FORMAT_SCRIPT: produces a HTML concordance
          file where occurrences are links described by the 'script'
          argument;

        - UnitexConstants.FORMAT_INDEX: produces an index of the
          concordance, made of the content of the occurrences (with the
          grammar outputs, if any), preceded by the positions of the
          occurrences in the text file given in characters;

        - UnitexConstants.FORMAT_UIMA: produces an index of the
          concordance relative to the original text file, before any
          Unitex operation. The 'offsets' argument must be provided;

        - UnitexConstants.FORMAT_PRLG: produces a concordance for PRLG
          corpora where each line is prefixed by information extracted
          with Unxmlize’s 'prlg' option. You must provide both the
          'offsets' and the 'unxmlize' argument;

        - UnitexConstants.FORMAT_XML: produces an xml index of the
          concordance;

        - UnitexConstants.FORMAT_XML_WITH_HEADERS: produces an xml index
          of the concordance with full xml header;

        - UnitexConstants.FORMAT_AXIS: quite the same as 'index', but
          the numbers represent the median character of each occurrence;

        - UnitexConstants.FORMAT_XALIGN: another index file, used by the
          text alignment module. Each line is made of 3 integers X Y Z
          followed by the content of the occurrence. X is the sentence
          number, starting from 1. Y and Z are the starting and ending
          positions of the occurrence in the sentence, given in
          characters;

        - UnitexConstants.FORMAT_MERGE: indicates to the function that
          it is supposed to produce a modified version of the text and
          save it in a file. The filename must be provided with the
          'output' argument.

      - **script [str]** -- string describing the links format for
        'glossanet' and 'script' output. For instance, if you use
        'http://www.google.com/search?q=', you will obtain a HTML
        concordance file where occurrences are hyperlinks to Google
        queries.

      - **offsets [str]** -- the file produced by Tokenize’s
        output_offsets option (needed by the 'uima' and the 'prlg'
        format).

      - **unxmlize [str]** -- file produced by Unxmlize’s 'prlg' option
        (needed by the 'prlg' format).

      - **output [str]** -- the output filename (needed by the 'merge'
        format).

    - *Other options:*

      - **directory [str]** -- indicates to the function that it must
        not work in the same directory as <index> but in 'directory'.

      - **thai [bool]** -- option to use for Thai concordances
        (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raise [UnitexException]:*

      if the index or the alphabet file doesn't exist (option errors
      are raised by ConcordOptions.load).
    """
    options = ConcordOptions()
    options.load(kwargs)

    if exists(index) is False:
        raise UnitexException("[CONCORD] Index file '%s' doesn't exists" % index)
    if exists(alphabet) is False:
        raise UnitexException("[CONCORD] Alphabet file '%s' doesn't exists" % alphabet)

    command = ["UnitexTool", "Concord"]

    if options["font"] is not None:
        command.append("--font=%s" % options["font"])
    if options["fontsize"] is not None:
        command.append("--fontsize=%s" % options["fontsize"])
    if options["only_ambiguous"] is True:
        command.append("--only_ambiguous")
    if options["only_matches"] is True:
        command.append("--only_matches")

    command.append("--left=%s" % options["left"])
    command.append("--right=%s" % options["right"])

    if options["sort"] == UnitexConstants.SORT_TEXT_ORDER:
        command.append("--TO")
    elif options["sort"] == UnitexConstants.SORT_LEFT_CENTER:
        command.append("--LC")
    elif options["sort"] == UnitexConstants.SORT_LEFT_RIGHT:
        command.append("--LR")
    elif options["sort"] == UnitexConstants.SORT_CENTER_LEFT:
        command.append("--CL")
    elif options["sort"] == UnitexConstants.SORT_CENTER_RIGHT:
        command.append("--CR")
    elif options["sort"] == UnitexConstants.SORT_RIGHT_LEFT:
        command.append("--RL")
    elif options["sort"] == UnitexConstants.SORT_RIGHT_CENTER:
        command.append("--RC")

    if options["format"] == UnitexConstants.FORMAT_HTML:
        command.append("--html")
    elif options["format"] == UnitexConstants.FORMAT_TEXT:
        command.append("--text")
    elif options["format"] == UnitexConstants.FORMAT_GLOSSANET:
        command.append("--glossanet=%s" % options["script"])
    elif options["format"] == UnitexConstants.FORMAT_SCRIPT:
        command.append("--script=%s" % options["script"])
    elif options["format"] == UnitexConstants.FORMAT_INDEX:
        command.append("--index")
    elif options["format"] == UnitexConstants.FORMAT_UIMA:
        command.append("--uima=%s" % options["offsets"])
    elif options["format"] == UnitexConstants.FORMAT_PRLG:
        # Both values must be packed in a tuple: the '%' operator binds
        # tighter than the comma, so without the parentheses the format
        # string would only receive the first value.
        command.append("--PRLG=%s,%s" % (options["unxmlize"], options["offsets"]))
    elif options["format"] == UnitexConstants.FORMAT_XML:
        command.append("--xml")
    elif options["format"] == UnitexConstants.FORMAT_XML_WITH_HEADERS:
        command.append("--xml-with-header")
    elif options["format"] == UnitexConstants.FORMAT_AXIS:
        command.append("--axis")
    elif options["format"] == UnitexConstants.FORMAT_XALIGN:
        command.append("--xalign")
    elif options["format"] == UnitexConstants.FORMAT_MERGE:
        command.append("--merge=%s" % options["output"])

    if options["directory"] is not None:
        command.append("--directory=%s" % options["directory"])

    command.append("--alphabet=%s" % alphabet)

    # The Thai switch is only emitted when the option is enabled.
    if options["thai"] is True:
        command.append("--thai")

    command.append(index)

    command.append("-qutf8-no-bom")
    command = " ".join(command)

    _LOGGER.info("Create concordance for '%s'", index)
    _LOGGER.debug("Command: %s", command)
    ret = _unitex.unitex_tool(command)

    return ret
コード例 #19
0
def grf2fst2(grammar, alphabet, **kwargs):
    """
    Compile a grammar into a .fst2 file (for more details see section
    6.2). 'grammar' is the complete path of the main graph of the
    grammar, including its .grf extension.

    The compiled file has the same name as the graph given to the
    function, but with the .fst2 extension, and is saved in the same
    directory as the graph.

    *Arguments:*

    - **grammar [str]** -- the grf to compile.

    - **alphabet [str]** -- the alphabet file used to tokenize the
      content of the grammar boxes into lexical units.

    *Keyword arguments:*

    - **loop_check [bool]** -- enables error (loop) checking
      (default: False).

    - **char_by_char [bool]** -- tokenization will be done character by
      character. If neither -c nor -a option is used, lexical units will
      be sequences of any Unicode letters (default: False).

    - **pkgdir [str]** -- specifies the repository directory to use (see
      section 5.2.2, page 99).

    - **no_empty_graph_warning [bool]** -- no warning will be emitted
      when a graph matches the empty word. This option is used by
      MultiFlex in order not to scare users with meaningless error
      messages when they design an inflection grammar that matches the
      empty word (default: False).

    - **tfst_check [bool]** -- checks whether the given graph can be
      considered as a valid sentence automaton or not (default: False).

    - **silent_grf_name [bool]** -- does not print the graph names
      (default: True).

    - **named_repositories [list(str)]** -- declaration of named
      repositories. Each item is an X=Y sequence where X is the name of
      the repository denoted by pathname Y; the items are joined with
      ';' on the command line.

    - **debug [bool]** -- compile graphs in debug mode (default: False).

    - **check_variables [bool]** -- check output validity to avoid
      malformed variable expressions (default: True).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.
    """
    options = Grf2Fst2Options()
    options.load(kwargs)

    # Both input files must be there before the tool is invoked.
    if exists(grammar) is False:
        raise UnitexException("[GRF2FST2] Grammar file '%s' doesn't exists" % grammar)
    if exists(alphabet) is False:
        raise UnitexException("[GRF2FST2] Alphabet file '%s' doesn't exists" % alphabet)

    # The loop checking switch is always given, in one form or the other.
    loop_switch = "--no_loop_check" if options["loop_check"] is False else "--loop_check"
    arguments = ["UnitexTool", "Grf2Fst2", loop_switch, "--alphabet=%s" % alphabet]

    if options["char_by_char"] is True:
        arguments.append("--char_by_char")

    pkgdir = options["pkgdir"]
    if pkgdir is not None:
        arguments.append("--pkgdir=%s" % pkgdir)

    # Plain boolean switches, in the order expected on the command line.
    for key, switch in (("no_empty_graph_warning", "--no_empty_graph_warning"),
                        ("tfst_check", "--tfst_check"),
                        ("silent_grf_name", "--silent_grf_name")):
        if options[key] is True:
            arguments.append(switch)

    repositories = options["named_repositories"]
    if repositories is not None:
        arguments.append("--named_repositories=%s" % ";".join(repositories))

    for key, switch in (("debug", "--debug"),
                        ("check_variables", "--check_variables")):
        if options[key] is True:
            arguments.append(switch)

    arguments += [grammar, "-qutf8-no-bom"]
    command = " ".join(arguments)

    _LOGGER.info("Compiling grammar '%s'..." % grammar)
    _LOGGER.debug("Command: %s", command)

    return _unitex.unitex_tool(command)
コード例 #20
0
    def load(self, options):
        """
        Validate the 'locate' options and store them as items of this
        object, using the default value of each option that is missing.

        *Arguments:*

        - **options [dict]** -- the options to validate. Only the keys
          read below are used; any other key is ignored.

        *Raise [UnitexException]:*

          if an option has a wrong type or value, or if a file or
          directory given as option doesn't exist.
        """
        start_on_space = options.get("start_on_space", False)
        if isinstance(start_on_space, bool) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'start_on_space' option. Boolean required."
            )
        self["start_on_space"] = start_on_space

        char_by_char = options.get("char_by_char", False)
        if isinstance(char_by_char, bool) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'char_by_char' option. Boolean required."
            )
        self["char_by_char"] = char_by_char

        morpho = options.get("morpho", None)
        if morpho is not None:
            if isinstance(morpho, list) is False:
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'morpho' option. List of string required."
                )
            for dictionary in morpho:
                # Check the type first: 'exists' must only receive paths.
                if isinstance(dictionary, str) is False:
                    raise UnitexException(
                        "[LOCATE] Wrong value for the 'morpho' option. List of string required."
                    )
                if exists(dictionary) is False:
                    raise UnitexException(
                        "[LOCATE] Morphological dictionary '%s' doesn't exist."
                        % dictionary)
        self["morpho"] = morpho

        korean = options.get("korean", False)
        if isinstance(korean, bool) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'korean' option. Boolean required."
            )
        self["korean"] = korean

        arabic_rules = options.get("arabic_rules", None)
        if arabic_rules is not None:
            if isinstance(arabic_rules, str) is False:
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'arabic_rules' option. String required."
                )
            if exists(arabic_rules) is False:
                raise UnitexException(
                    "[LOCATE] Rules file '%s' doesn't exist." % arabic_rules)
        self["arabic_rules"] = arabic_rules

        sntdir = options.get("sntdir", None)
        if sntdir is not None:
            if isinstance(sntdir, str) is False:
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'sntdir' option. String required."
                )
            if exists(sntdir) is False:
                raise UnitexException(
                    "[LOCATE] Directory '%s' doesn't exist." % sntdir)
        self["sntdir"] = sntdir

        negation_operator = options.get("negation_operator",
                                        UnitexConstants.NEGATION_OPERATOR)
        if negation_operator not in (UnitexConstants.NEGATION_OPERATOR,
                                     UnitexConstants.NEGATION_OPERATOR_OLD):
            raise UnitexException(
                "[LOCATE] Wrong value for the 'negation_operator' option. UnitexConstants.NEGATION_OPERATOR(_OLD) required."
            )
        self["negation_operator"] = negation_operator

        number_of_matches = options.get("number_of_matches", None)
        if number_of_matches is not None and isinstance(
                number_of_matches, int) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'number_of_matches' option. Integer required."
            )
        self["number_of_matches"] = number_of_matches

        stop_token_count = options.get("stop_token_count", None)
        if stop_token_count is not None:
            # Type, length and item checks all report the same error.
            if (isinstance(stop_token_count, list) is False
                    or len(stop_token_count) != 2
                    or any(isinstance(i, int) is False for i in stop_token_count)):
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'stop_token_count' option. List of 2 integers required."
                )
        self["stop_token_count"] = stop_token_count

        match_mode = options.get("match_mode",
                                 UnitexConstants.MATCH_MODE_LONGEST)
        if match_mode not in (UnitexConstants.MATCH_MODE_LONGEST,
                              UnitexConstants.MATCH_MODE_SHORTEST,
                              UnitexConstants.MATCH_MODE_ALL):
            raise UnitexException(
                "[LOCATE] Wrong value for the 'match_mode' option. UnitexConstants.MATCH_MODE_X required."
            )
        self["match_mode"] = match_mode

        output_mode = options.get("output_mode",
                                  UnitexConstants.OUTPUT_MODE_IGNORE)
        if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE,
                               UnitexConstants.OUTPUT_MODE_MERGE,
                               UnitexConstants.OUTPUT_MODE_REPLACE):
            raise UnitexException(
                "[LOCATE] Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required."
            )
        self["output_mode"] = output_mode

        protect_dic_chars = options.get("protect_dic_chars", True)
        if isinstance(protect_dic_chars, bool) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'protect_dic_chars' option. Boolean required."
            )
        self["protect_dic_chars"] = protect_dic_chars

        variable = options.get("variable", None)
        if variable is not None:
            if isinstance(variable, list) is False or len(variable) != 2:
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'variable' option. List of 2 strings required."
                )
            name, content = variable
            if isinstance(name, str) is False:
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'variable' option. List of 2 strings required."
                )
            # The content must be a string *and* only made of ASCII
            # characters: either failure is an error. The type is tested
            # first so that the characters are only read from a string.
            if isinstance(content, str) is False or any(
                    ord(c) >= 128 for c in content):
                raise UnitexException(
                    "[LOCATE] Wrong value for the 'variable' option. List of 2 strings required (the second must be *ascii*)."
                )
        self["variable"] = variable

        ambiguous_outputs = options.get("ambiguous_outputs", True)
        if isinstance(ambiguous_outputs, bool) is False:
            raise UnitexException(
                "[LOCATE] Wrong value for the 'ambiguous_outputs' option. Boolean required."
            )
        self["ambiguous_outputs"] = ambiguous_outputs

        variable_error = options.get("variable_error",
                                     UnitexConstants.ON_ERROR_IGNORE)
        if variable_error not in (UnitexConstants.ON_ERROR_IGNORE,
                                  UnitexConstants.ON_ERROR_EXIT,
                                  UnitexConstants.ON_ERROR_BACKTRACK):
            raise UnitexException(
                "[LOCATE] Wrong value for the 'variable_error' option. UnitexConstants.ON_ERROR_X required."
            )
        self["variable_error"] = variable_error
コード例 #21
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """
        Validate the 'concord' options and store them as items of this
        object, using the default value of each option that is missing.

        The 'script', 'offsets', 'unxmlize' and 'output' items are
        always set: they stay None unless the selected format needs
        them, in which case the matching option is mandatory.

        *Arguments:*

        - **options [dict]** -- the options to validate. Only the keys
          read below are used; any other key is ignored.

        *Raise [UnitexException]:*

          if an option has a wrong type or value, if an option needed
          by the selected format is missing or if 'directory' doesn't
          exist.
        """
        font = options.get("font", None)
        if not (font is None or isinstance(font, str)):
            raise UnitexException("[CONCORD] Wrong value for the 'font' option. String required.")
        self["font"] = font

        fontsize = options.get("fontsize", None)
        if not (fontsize is None or isinstance(fontsize, int)):
            raise UnitexException("[CONCORD] Wrong value for the 'fontsize' option. Integer required.")
        self["fontsize"] = fontsize

        only_ambiguous = options.get("only_ambiguous", False)
        if not isinstance(only_ambiguous, bool):
            raise UnitexException("[CONCORD] Wrong value for the 'only_ambiguous' option. Boolean required.")
        self["only_ambiguous"] = only_ambiguous

        only_matches = options.get("only_matches", False)
        if not isinstance(only_matches, bool):
            raise UnitexException("[CONCORD] Wrong value for the 'only_matches' option. Boolean required.")
        self["only_matches"] = only_matches

        # Context sizes are strings because they may carry an 's' suffix.
        left = options.get("left", "0")
        if not (left is None or isinstance(left, str)):
            raise UnitexException("[CONCORD] Wrong value for the 'left' option. String required.")
        self["left"] = left

        right = options.get("right", "0")
        if not (right is None or isinstance(right, str)):
            raise UnitexException("[CONCORD] Wrong value for the 'right' option. String required.")
        self["right"] = right

        sort_order = options.get("sort", UnitexConstants.SORT_TEXT_ORDER)
        known_sort_orders = (
            UnitexConstants.SORT_TEXT_ORDER,
            UnitexConstants.SORT_LEFT_CENTER,
            UnitexConstants.SORT_LEFT_RIGHT,
            UnitexConstants.SORT_CENTER_LEFT,
            UnitexConstants.SORT_CENTER_RIGHT,
            UnitexConstants.SORT_RIGHT_LEFT,
            UnitexConstants.SORT_RIGHT_CENTER,
        )
        if sort_order not in known_sort_orders:
            raise UnitexException("[CONCORD] Wrong value for the 'sort' option. UnitexConstants.SORT_XXX required.")
        self["sort"] = sort_order

        output_format = options.get("format", UnitexConstants.FORMAT_TEXT)
        known_formats = (
            UnitexConstants.FORMAT_HTML,
            UnitexConstants.FORMAT_TEXT,
            UnitexConstants.FORMAT_GLOSSANET,
            UnitexConstants.FORMAT_SCRIPT,
            UnitexConstants.FORMAT_INDEX,
            UnitexConstants.FORMAT_UIMA,
            UnitexConstants.FORMAT_PRLG,
            UnitexConstants.FORMAT_XML,
            UnitexConstants.FORMAT_XML_WITH_HEADERS,
            UnitexConstants.FORMAT_AXIS,
            UnitexConstants.FORMAT_XALIGN,
            UnitexConstants.FORMAT_MERGE,
        )
        if output_format not in known_formats:
            raise UnitexException("[CONCORD] Wrong value for the 'format' option. UnitexConstants.FORMAT_XXX required.")
        self["format"] = output_format

        # Format dependent items: reset first, then filled on demand.
        for dependent in ("script", "offsets", "unxmlize", "output"):
            self[dependent] = None

        selected = self["format"]
        link_formats = (UnitexConstants.FORMAT_GLOSSANET, UnitexConstants.FORMAT_SCRIPT)

        if selected in (UnitexConstants.FORMAT_HTML,) + link_formats:
            # HTML based outputs get a default font when none is given.
            if self["font"] is None:
                self["font"] = "Courier new"
            if self["fontsize"] is None:
                self["fontsize"] = 12

            if selected in link_formats:
                script = options.get("script", None)
                if script is None:
                    raise UnitexException("You must provide the 'script' option for UnitexConstants.FORMAT_(GLOSSANET|SCRIPT) formats...")
                self["script"] = script

        elif selected == UnitexConstants.FORMAT_UIMA:
            offsets = options.get("offsets", None)
            if offsets is None:
                raise UnitexException("You must provide the 'offsets' option for UnitexConstants.FORMAT_UIMA formats...")
            self["offsets"] = offsets

        elif selected == UnitexConstants.FORMAT_PRLG:
            offsets = options.get("offsets", None)
            if offsets is None:
                raise UnitexException("You must provide the 'offsets' option for UnitexConstants.FORMAT_PRLG formats...")
            self["offsets"] = offsets

            unxmlize = options.get("unxmlize", None)
            if unxmlize is None:
                raise UnitexException("You must provide the 'unxmlize' option for UnitexConstants.FORMAT_PRLG format...")
            self["unxmlize"] = unxmlize

        elif selected == UnitexConstants.FORMAT_MERGE:
            output = options.get("output", None)
            if output is None:
                raise UnitexException("You must provide the 'output' option for UnitexConstants.FORMAT_MERGE format...")
            self["output"] = output

        directory = options.get("directory", None)
        if directory is not None:
            if not isinstance(directory, str):
                raise UnitexException("[CONCORD] Wrong value for the 'directory' option. String required.")
            if exists(directory) is False:
                raise UnitexException("[CONCORD] The text 'directory' doesn't exist.")
        self["directory"] = directory

        thai = options.get("thai", False)
        if not isinstance(thai, bool):
            raise UnitexException("[CONCORD] Wrong value for the 'thai' option. Boolean required.")
        self["thai"] = thai
コード例 #22
0
def locate(grammar, text, alphabet, **kwargs):
    """
    This function applies a grammar to a text and constructs an index of
    the occurrences found.

    This function saves the references to the found occurrences in a
    file called concord.ind. The number of occurrences, the number of
    units belonging to those occurrences, as well as the percentage of
    recognized units within the text are saved in a file called
    concord.n. These two files are stored in the directory of the text.

    *Arguments:*

    - **grammar [str]** -- the fst2 to apply on the text.

    - **text [str]** -- the text file, with extension .snt.

    - **alphabet [str]** -- the alphabet file of the language of the
      text.

    *Keyword arguments:*

    - *Generic options*:

      - **start_on_space [bool]** -- this parameter indicates that the
        search will start at any position in the text, even before a
        space. This parameter should only be used to carry out
        morphological searches (default: False).

      - **char_by_char [bool]** -- works in character by character
        tokenization mode. This is useful for languages like Thai
        (default: False).

      - **morpho [list(str)]** -- this optional argument indicates which
        morphological mode dictionaries are to be used, if needed by
        some .fst2 dictionaries. The argument is a list of dictionary
        path (bin format).

      - **korean [bool]** -- specify the dictionary is in korean
        (default: False).

      - **arabic_rules [str]** -- specifies the Arabic typographic rule
        configuration file path.

      - **sntdir [str]** -- puts produced files in 'sntdir' instead of
        the text directory. Note that 'sntdir' must end with a file
        separator (\\ or /).

      - **negation_operator [str]** -- specifies the negation operator
        to be used in Locate patterns. The two legal values are
        UnitexConstants.NEGATION_OPERATOR (default) and
        UnitexConstants.NEGATION_OPERATOR_OLD, the latter providing
        backward compatibility with previous versions of Unitex.

    - *Search limit options:*

      - **number_of_matches [int]** -- stops after the first N matches
        (default: all matches).

    - *Maximum iterations per token options:*

      - **stop_token_count [list(int_1, int_2)]** -- emits a warning
        after 'int_1' iterations on a token and stops after 'int_2'
        iterations.

    - *Matching mode options:*

      - **match_mode [str]** -- Possible values are:
        - UnitexConstants.MATCH_MODE_SHORTEST
        - UnitexConstants.MATCH_MODE_LONGEST (default)
        - UnitexConstants.MATCH_MODE_ALL

    - *Output options:*

      - **output_mode [str]** -- Possible values are:
        - UnitexConstants.OUTPUT_MODE_IGNORE (default)
        - UnitexConstants.OUTPUT_MODE_MERGE
        - UnitexConstants.OUTPUT_MODE_REPLACE

      - **protect_dic_chars [bool]** -- when 'merge' or 'replace' mode
        is used, this option protects some input characters with a
        backslash. This is useful when Locate is called by 'dico' in
        order to avoid producing bad lines like: 3,14,.PI.NUM
        (default: True).

      - **variable [list(str_1, str_2)]** -- sets an output variable
        named str_1 with content str_2. Note that str_2 must be ASCII.

    - *Ambiguous output options:*

      - **ambiguous_outputs [bool]** -- allows the production of several
        matches with same input but different outputs. If False, in case
        of ambiguous outputs, one will be arbitrarily chosen and kept,
        depending on the internal state of the function (default: True).

      - **variable_error [str]** -- Possible values are:
        - UnitexConstants.ON_ERROR_EXIT
        - UnitexConstants.ON_ERROR_IGNORE (default)
        - UnitexConstants.ON_ERROR_BACKTRACK

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raise [UnitexException]:*

      if the grammar, the text or the alphabet file doesn't exist
      (option errors are raised by LocateOptions.load).
    """
    options = LocateOptions()
    options.load(kwargs)

    if exists(grammar) is False:
        raise UnitexException("[LOCATE] Grammar file '%s' doesn't exists" % grammar)
    if exists(text) is False:
        raise UnitexException("[LOCATE] Text file '%s' doesn't exists" % text)
    if exists(alphabet) is False:
        raise UnitexException("[LOCATE] Alphabet file '%s' doesn't exists" % alphabet)

    command = ["UnitexTool", "Locate"]

    command.append("--text=%s" % text)
    command.append("--alphabet=%s" % alphabet)

    if options["morpho"] is not None:
        command.append("--morpho=%s" % ",".join(options["morpho"]))

    if options["start_on_space"] is False:
        command.append("--dont_start_on_space")
    else:
        command.append("--start_on_space")

    if options["char_by_char"] is False:
        command.append("--word_by_word")
    else:
        command.append("--char_by_char")

    if options["sntdir"] is not None:
        command.append("--sntdir=%s" % options["sntdir"])
    if options["korean"] is True:
        command.append("--korean")
    if options["arabic_rules"] is not None:
        command.append("--arabic_rules=%s" % options["arabic_rules"])
    if options["negation_operator"] is not None:
        command.append("--negation_operator=%s" % options["negation_operator"])

    if options["number_of_matches"] is None:
        command.append("--all")
    else:
        command.append("--number_of_matches=%s" % options["number_of_matches"])

    # The two thresholds live in the option value itself: fetch the list
    # once, then index it (there is no "stop_token_count[0]" key).
    stop_token_count = options["stop_token_count"]
    if stop_token_count is not None:
        if stop_token_count[0] is None:
            command.append("--stop_token_count=%s" % stop_token_count[1])
        else:
            command.append("--stop_token_count=%s,%s" % (stop_token_count[0], stop_token_count[1]))

    if options["match_mode"] == UnitexConstants.MATCH_MODE_LONGEST:
        command.append("--longest_matches")
    elif options["match_mode"] == UnitexConstants.MATCH_MODE_SHORTEST:
        command.append("--shortest_matches")
    elif options["match_mode"] == UnitexConstants.MATCH_MODE_ALL:
        command.append("--all_matches")

    if options["output_mode"] == UnitexConstants.OUTPUT_MODE_IGNORE:
        command.append("--ignore")
    elif options["output_mode"] == UnitexConstants.OUTPUT_MODE_MERGE:
        command.append("--merge")
    elif options["output_mode"] == UnitexConstants.OUTPUT_MODE_REPLACE:
        command.append("--replace")

    if options["protect_dic_chars"] is True:
        command.append("--protect_dic_chars")

    if options["variable"] is not None:
        command.append("--variable=%s=%s" % (options["variable"][0], options["variable"][1]))

    if options["ambiguous_outputs"] is True:
        command.append("--ambiguous_outputs")
    else:
        command.append("--no_ambiguous_outputs")

    if options["variable_error"] == UnitexConstants.ON_ERROR_IGNORE:
        command.append("--ignore_variable_error")
    elif options["variable_error"] == UnitexConstants.ON_ERROR_EXIT:
        command.append("--exit_on_variable_error")
    elif options["variable_error"] == UnitexConstants.ON_ERROR_BACKTRACK:
        command.append("--backtrack_on_variable_error")

    command.append(grammar)

    command.append("-qutf8-no-bom")
    command = " ".join(command)

    _LOGGER.info("Locating pattern '%s'...", grammar)
    _LOGGER.debug("Command: %s", command)
    ret = _unitex.unitex_tool(command)

    return ret
コード例 #23
0
def compress(dictionary, **kwargs):
    """
    Compress a DELAF dictionary.

    Compressing a dictionary dico.dic produces two files:

    - dico.bin: a binary file holding the minimal automaton of the
      inflected forms of the dictionary;

    - dico.inf: a text file holding the compressed forms needed to
      rebuild the dictionary lines from the inflected forms stored in
      the automaton.

    *Arguments:*

    - **dictionary [str]** -- path of the dictionary file.

    *Keyword arguments:*

    - **output [str]** -- output file. By default, a file xxx.dic
      produces a file xxx.bin.

    - **flip [bool]** -- swap the inflected and canonical forms in the
      compressed dictionary. This builds the inverse dictionary needed
      by the program 'Reconstrucao' (default: False).

    - **semitic [bool]** -- use the semitic compression algorithm. With
      semitic languages like Arabic this significantly reduces the size
      of the output dictionary (default: False).

    - **version [str]** -- possible values are:

      - UnitexConstants.DICTIONARY_VERSION_1: produces an old style .bin
        file;
      - UnitexConstants.DICTIONARY_VERSION_2: produces a new style .bin
        file, with no file size limitation to 16 Mb and a smaller size
        (default).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the dictionary file does not exist.
    """
    options = CompressOptions()
    options.load(kwargs)

    if exists(dictionary) is False:
        raise UnitexException("[COMPRESS] Dictionary file '%s' doesn't exists" % dictionary)

    args = ["UnitexTool", "Compress"]

    if options["output"] is not None:
        args.append("--output=%s" % options["output"])

    # Plain on/off switches: emitted only when explicitly enabled.
    for key, switch in (("flip", "--flip"), ("semitic", "--semitic")):
        if options[key] is True:
            args.append(switch)

    # At most one version switch is emitted; an unrecognised value adds
    # nothing to the command line.
    for version, switch in ((UnitexConstants.DICTIONARY_VERSION_1, "--v1"),
                            (UnitexConstants.DICTIONARY_VERSION_2, "--v2")):
        if options["version"] == version:
            args.append(switch)
            break

    args.extend((dictionary, "-qutf8-no-bom"))
    cmdline = " ".join(args)

    _LOGGER.info("Compressing dic '%s'" % dictionary)
    _LOGGER.debug("Command: %s", cmdline)

    return _unitex.unitex_tool(cmdline)
コード例 #24
0
def tokenize(text, alphabet, **kwargs):
    """
    Tokenize a text into lexical units.

    The text must be given as the complete path of the text file,
    including the .snt extension.

    Each unit is coded as a whole. The list of units is saved in a text
    file called tokens.txt, and the sequence of codes representing the
    units (i.e. the coded text) is saved in a binary file named
    text.cod. Four more files are produced:

    - tok_by_freq.txt: text file containing the units sorted by
      frequency.
    - tok_by_alph.txt: text file containing the units sorted
      alphabetically.
    - stats.n: text file containing the number of sentence separators,
      the number of units, the number of simple words and the number of
      numbers.
    - enter.pos: binary file containing the list of newline positions in
      the text. The coded text contains spaces instead of newlines, and
      a newline counts as two characters while a space counts as one, so
      the newline positions are needed to synchronize the positions of
      occurrences found by the 'locate' function with the text file.
      The 'concord' function uses enter.pos for this purpose, so that an
      occurrence clicked in a concordance is correctly selected in the
      text.

    All produced files are saved in the text directory.

    *Arguments:*

    - **text [str]** -- the text file to tokenize (.snt format).

    - **alphabet [str]** -- the alphabet file.

    *Keyword arguments:*

    - *Generic options:*

      - **char_by_char [bool]** -- apply the function character by
        character, except for the sentence delimiter {S}, the stop
        marker {STOP} and lexical tags like {today,.ADV}, which are
        considered to be single units (default: False).

      - **tokens [str]** -- a tokens.txt file to load and modify,
        instead of creating a new one from scratch.

    - *Offsets options:*

      - **input_offsets [str]** -- base offset file to be used.

      - **output_offsets [str]** -- offset file to be produced.

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text or the alphabet file does not
      exist.
    """
    options = TokenizeOptions()
    options.load(kwargs)

    # Both input files must be present before the tool is invoked.
    for path, message in ((text, "[TOKENIZE] Text file '%s' doesn't exists"),
                          (alphabet, "[TOKENIZE] Alphabet file '%s' doesn't exists")):
        if exists(path) is False:
            raise UnitexException(message % path)

    args = ["UnitexTool", "Tokenize", "--alphabet=%s" % alphabet]

    # Exactly one tokenization mode switch is always passed.
    if options["char_by_char"] is True:
        mode = "--char_by_char"
    else:
        mode = "--word_by_word"
    args.append(mode)

    # Optional file arguments, added only when a value was provided.
    for key, template in (("tokens", "--tokens=%s"),
                          ("input_offsets", "--input_offsets=%s"),
                          ("output_offsets", "--output_offsets=%s")):
        if options[key] is not None:
            args.append(template % options[key])

    args.extend((text, "-qutf8-no-bom"))
    cmdline = " ".join(args)

    _LOGGER.info("Tokenizing file '%s'..." % text)
    _LOGGER.debug("Command: %s", cmdline)

    return _unitex.unitex_tool(cmdline)
コード例 #25
0
def fst2txt(grammar, text, alphabet, **kwargs):
    """
    Apply a transducer to a text in longest match mode.

    This is meant for the preprocessing stage, when the text has not
    been cut into lexical units yet.

    **NOTE:** This function modifies the input text file.

    *Arguments:*

    - **grammar [str]** -- the fst2 to apply on the text.

    - **text [str]** -- the (.snt) text file to be modified.

    - **alphabet [str]** -- the alphabet file of the language of the
      text.

    *Keyword arguments:*

    - **start_on_space [bool]** -- the search will start at any position
      in the text, even before a space. This parameter should only be
      used to carry out morphological searches (default: False).

    - **char_by_char [bool]** -- works in character by character
      tokenization mode. This is useful for languages like Thai
      (default: False).

    - **merge [bool]** -- merge (instead of replace) transducer outputs
      with text inputs (default: True).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the grammar, the text or the alphabet file
      does not exist.
    """
    options = Fst2TxtOptions()
    options.load(kwargs)

    # Every input file must be present before the tool is invoked.
    for path, message in ((grammar, "[FST2TXT] Grammar file '%s' doesn't exists"),
                          (text, "[FST2TXT] Text file '%s' doesn't exists"),
                          (alphabet, "[FST2TXT] Alphabet file '%s' doesn't exists")):
        if exists(path) is False:
            raise UnitexException(message % path)

    args = ["UnitexTool", "Fst2Txt", "--text=%s" % text, "--alphabet=%s" % alphabet]

    # Each of the three options always contributes one of two switches.
    args.append("--dont_start_on_space" if options["start_on_space"] is False else "--start_on_space")
    args.append("--word_by_word" if options["char_by_char"] is False else "--char_by_char")
    args.append("--merge" if options["merge"] is True else "--replace")

    args.extend((grammar, "-qutf8-no-bom"))
    cmdline = " ".join(args)

    _LOGGER.info("Applying grammar '%s'..." % grammar)
    _LOGGER.debug("Command: %s", cmdline)

    return _unitex.unitex_tool(cmdline)
コード例 #26
0
def normalize(text, **kwargs):
    """
    Normalize the separators of a text.

    The separators are space, tab, and newline. Every sequence of
    separators that contains at least one newline is replaced by a
    unique newline. All other sequences of separators are replaced by a
    single space.

    The syntax of lexical tags found in the text is also checked. All
    sequences in curly brackets should be either the sentence delimiter
    {S}, the stop marker {STOP}, or valid entries in the DELAF format
    ({aujourd’hui,.ADV}).

    **NOTE:** the function creates a modified version of the text that is
    saved in a file with extension .snt.

    **WARNING:** if you specify a normalization rule file, its rules
    will be applied prior to anything else. So, you have to be very
    careful if you manipulate separators in such rules.

    *Arguments:*

    - **text [str]** -- the text file to normalize.

    *Keyword arguments:*

    - **no_carriage_return [bool]** -- every separator sequence will be
      turned into a single space (default: False).

    - **input_offsets [str]** -- base offset file to be used.

    - **output_offsets [str]** -- offset file to be produced.

    - **replacement_rules [str]** -- the normalization rule file to be
      used. See section 14.13.6 for details about the format of this
      file. By default, the function only replaces { and } by [ and ].

    - **no_separator_normalization [bool]** -- only applies replacement
      rules specified with the 'replacement_rules' option
      (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text file does not exist.
    """
    options = NormalizeOptions()
    options.load(kwargs)

    if exists(text) is False:
        raise UnitexException("[NORMALIZE] Text file '%s' doesn't exists" % text)

    args = ["UnitexTool", "Normalize"]

    if options["no_carriage_return"] is True:
        args.append("--no_carriage_return")

    # Optional file arguments, added only when a value was provided.
    for key, template in (("input_offsets", "--input_offsets=%s"),
                          ("output_offsets", "--output_offsets=%s"),
                          ("replacement_rules", "--replacement_rules=%s")):
        if options[key] is not None:
            args.append(template % options[key])

    if options["no_separator_normalization"] is True:
        args.append("--no_separator_normalization")

    args.extend((text, "-qutf8-no-bom"))
    cmdline = " ".join(args)

    _LOGGER.info("Normalizing text '%s'..." % text)
    _LOGGER.debug("Command: %s", cmdline)

    return _unitex.unitex_tool(cmdline)
コード例 #27
0
    def load(self, options):
        """
        Validate the resources options and store them on this object.

        The following keys are read from *options*:

        - **language** -- mandatory; stored as given.
        - **alphabet** -- optional path; must exist when provided.
        - **alphabet-sorted** -- optional path; must exist when provided.
        - **sentence** -- optional path to a compiled grammar; must have
          the '.fst2' extension and exist when provided.
        - **replace** -- optional path to a compiled grammar; must have
          the '.fst2' extension and exist when provided.
        - **dictionaries** -- optional list of '.bin' or '.fst2' files;
          each must exist, and each '.bin' file needs a sibling '.inf'
          file.

        A warning is logged for every optional key that is missing, and
        the key is then stored as None.

        *Raises:*

          **UnitexException** if 'language' is missing or if any of the
          checks above fails.
        """
        language = options.get("language", None)
        if language is None:
            raise UnitexException(
                "[RESOURCES] You must specify the 'language' element.")
        self["language"] = language

        alphabet = options.get("alphabet", None)
        if alphabet is not None:
            if not exists(alphabet):
                raise UnitexException(
                    "[RESOURCES] Alphabet file '%s' doesn't exist." % alphabet)
        else:
            _LOGGER.warning("[RESOURCES] No alphabet file provided.")
        self["alphabet"] = alphabet

        alphabet_sorted = options.get("alphabet-sorted", None)
        if alphabet_sorted is not None:
            if not exists(alphabet_sorted):
                raise UnitexException(
                    "[RESOURCES] Sorted alphabet file '%s' doesn't exist." %
                    alphabet_sorted)
        else:
            _LOGGER.warning("[RESOURCES] No sorted alphabet file provided.")
        self["alphabet-sorted"] = alphabet_sorted

        # Same wording is used for both the sentence and replace grammars.
        bad_grammar_extension = (
            "[RESOURCES] Wrong extension for '%s'. Grammars must be compiled and have the '.fst2' extension."
        )

        sentence = options.get("sentence", None)
        if sentence is not None:
            if os.path.splitext(sentence)[1] != ".fst2":
                raise UnitexException(bad_grammar_extension % sentence)
            if not exists(sentence):
                raise UnitexException(
                    "[RESOURCES] Sentence grammar file '%s' doesn't exist." %
                    sentence)
        else:
            _LOGGER.warning("[RESOURCES] No sentence grammar provided.")
        self["sentence"] = sentence

        replace = options.get("replace", None)
        if replace is not None:
            if os.path.splitext(replace)[1] != ".fst2":
                raise UnitexException(bad_grammar_extension % replace)
            if not exists(replace):
                raise UnitexException(
                    "[RESOURCES] Replace grammar file '%s' doesn't exist." %
                    replace)
        else:
            _LOGGER.warning("[RESOURCES] No replace grammar provided.")
        self["replace"] = replace

        dictionaries = options.get("dictionaries", None)
        if dictionaries is not None:
            if not isinstance(dictionaries, list):
                raise UnitexException(
                    "[RESOURCES] The 'dictionaries' element must be a list of .bin or .fst2 files."
                )
            for path in dictionaries:
                stem, suffix = os.path.splitext(path)
                if suffix not in (".bin", ".fst2"):
                    raise UnitexException(
                        "[RESOURCES] Wrong extension for '%s'. Dictionaries must be compiled and have the '.bin' or the '.fst2' extension."
                        % path)
                if not exists(path):
                    raise UnitexException(
                        "[RESOURCES] Dictionary file '%s' doesn't exist." %
                        path)
                # A .bin dictionary is only usable together with its .inf file.
                if suffix == ".bin" and not exists("%s.inf" % stem):
                    raise UnitexException(
                        "[RESOURCES] Dictionary .inf file missing for '%s'." %
                        path)
        else:
            _LOGGER.warning("[RESOURCES] No dictionaries provided.")
        self["dictionaries"] = dictionaries
コード例 #28
0
def txt2tfst(text, alphabet, **kwargs):
    """
    Build the automaton of a text.

    If the text is separated into sentences, one automaton is built for
    each sentence. Otherwise the text is arbitrarily cut into sequences
    of 2000 tokens and one automaton is built for each sequence.

    The result is a file called text.tfst which is saved in the
    directory of the text. Another file named text.tind is also produced.

    *Arguments:*

    - **text [str]** -- the path to the text file in .snt format.

    - **alphabet [str]** -- the alphabet file.

    *Keyword arguments:*

    - **clean [bool]** -- indicates whether the rule of conservation of
      the best paths (see section 7.2.4) should be applied
      (default: False).

    - **normalization_grammar [str]** -- name of a normalization grammar
      that is to be applied to the text automaton.

    - **tagset [str]** -- Elag tagset file to use to normalize
      dictionary entries.

    - **korean [bool]** -- tells the function that it works on Korean
      (default: False).

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if the text or the alphabet file does not
      exist.
    """
    options = Txt2TFstOptions()
    options.load(kwargs)

    # Both input files must be present before the tool is invoked.
    for path, message in ((text, "[TXT2TFST] Text file '%s' doesn't exists"),
                          (alphabet, "[TXT2TFST] Alphabet file '%s' doesn't exists")):
        if exists(path) is False:
            raise UnitexException(message % path)

    args = ["UnitexTool", "Txt2Tfst", "--alphabet=%s" % alphabet]

    if options["clean"] is not False:
        args.append("--clean")

    # Optional file arguments, added only when a value was provided.
    for key, template in (("normalization_grammar", "--normalization_grammar=%s"),
                          ("tagset", "--tagset=%s")):
        if options[key] is not None:
            args.append(template % options[key])

    if options["korean"] is not False:
        args.append("--korean")

    args.extend((text, "-qutf8-no-bom"))
    cmdline = " ".join(args)

    _LOGGER.info("Building text automaton for '%s'..." % text)
    _LOGGER.debug("Command: %s", cmdline)

    return _unitex.unitex_tool(cmdline)
コード例 #29
0
def dico(dictionaries, text, alphabet, **kwargs):
    """
    This function applies dictionaries to a text. The text must have
    been cut up into lexical units by the 'tokenize' function.

    The function 'dico' produces the following files, and saves them in
    the directory of the text:

    - dlf: dictionary of simple words in the text;
    - dlc: dictionary of compound words in the text;
    - err: list of unknown words in the text;
    - tags_err: unrecognized simple words that are not matched by the
      tags.ind file;
    - tags.ind: sequences to be inserted in the text automaton (see
      section 3.8.3, page 69);
    - stat_dic.n: file containing the number of simple words, the number
      of compound words, and the number of unknown words in the text.

    **NOTE:** Files dlf, dlc, err and tags_err are not sorted. Use the
    function 'sort_txt' to sort them.

    *Arguments:*

    - **dictionaries [list(str)]** -- list of dictionary paths ('bin'
      or 'fst2' formats).

    - **text [str]** -- text (snt format) file path.

    - **alphabet [str]** -- alphabet file path.

    *Keyword arguments:*

    - **morpho [list(str)]** -- this optional argument indicates which
      morphological mode dictionaries are to be used, if needed by some
      .fst2 dictionaries. The argument is a list of dictionary paths
      (bin format).

    - **korean [bool]** -- specify the dictionary is in korean
      (default: False).

    - **semitic [bool]** -- specify the dictionary is in a semitic
      language (default: False).

    - **arabic_rules [str]** -- specifies the Arabic typographic rule
      configuration file path.

    - **raw [str]** -- alternative output file path containing both
      simple and compound words, without requiring a text directory.

    *Return [bool]:*

      **True** if it succeeds, **False** otherwise.

    *Raises:*

      **UnitexException** if one of the dictionaries, the text or the
      alphabet file does not exist.
    """
    options = DicoOptions()
    options.load(kwargs)

    for dictionary in dictionaries:
        if exists(dictionary) is False:
            raise UnitexException("[DICO] Dictionary file '%s' doesn't exists" % dictionary)
    if exists(text) is False:
        raise UnitexException("[DICO] Text file '%s' doesn't exists" % text)
    if exists(alphabet) is False:
        raise UnitexException("[DICO] Alphabet file '%s' doesn't exists" % alphabet)

    command = ["UnitexTool", "Dico"]

    command.append("--text=%s" % text)
    command.append("--alphabet=%s" % alphabet)

    if options["morpho"] is not None:
        # The tool expects the morphological dictionaries as one
        # comma-separated value.
        command.append("--morpho=%s" % ",".join(options["morpho"]))
    if options["korean"] is True:
        command.append("--korean")
    if options["semitic"] is True:
        command.append("--semitic")
    if options["arabic_rules"] is not None:
        command.append("--arabic_rules=%s" % options["arabic_rules"])
    if options["raw"] is not None:
        # The value must come from the validated options: there is no
        # local variable named 'raw' in this function.
        command.append("--raw=%s" % options["raw"])

    command += dictionaries

    command.append("-qutf8-no-bom")
    command = " ".join(command)

    _LOGGER.info("Applying dictionaries")
    _LOGGER.debug("Command: %s", command)
    ret = _unitex.unitex_tool(command)

    return ret
コード例 #30
0
    def load(self, options):
        """
        Validate the concordance options and store them on this object.

        The following keys are read from *options*:

        - **font [str]** -- optional (default: None).
        - **fontsize [int]** -- optional (default: None).
        - **only_ambiguous [bool]** -- (default: False).
        - **only_matches [bool]** -- (default: False).
        - **left [str]** -- (default: "0").
        - **right [str]** -- (default: "0").
        - **sort** -- one of the UnitexConstants.SORT_XXX values
          (default: UnitexConstants.SORT_TEXT_ORDER).
        - **format** -- one of the UnitexConstants.FORMAT_XXX values
          (default: UnitexConstants.FORMAT_TEXT).
        - **directory [str]** -- optional path that must exist when
          provided.
        - **thai [bool]** -- (default: False).

        Depending on the format, further keys become mandatory:

        - HTML, GLOSSANET and SCRIPT: 'font' and 'fontsize' fall back to
          "Courier new" and 12 when not provided; GLOSSANET and SCRIPT
          also require 'script'.
        - UIMA: requires 'offsets'.
        - PRLG: requires 'offsets' and 'unxmlize'.
        - MERGE: requires 'output'.

        The 'script', 'offsets', 'unxmlize' and 'output' entries are
        stored as None when the selected format does not use them.

        *Raises:*

          **UnitexException** if a value has the wrong type, is not one
          of the allowed constants, points to a missing directory, or if
          a key required by the selected format is missing.
        """
        font = options.get("font", None)
        if not (font is None or isinstance(font, str)):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'font' option. String required."
            )
        self["font"] = font

        fontsize = options.get("fontsize", None)
        if not (fontsize is None or isinstance(fontsize, int)):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'fontsize' option. Integer required."
            )
        self["fontsize"] = fontsize

        only_ambiguous = options.get("only_ambiguous", False)
        if not isinstance(only_ambiguous, bool):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'only_ambiguous' option. Boolean required."
            )
        self["only_ambiguous"] = only_ambiguous

        only_matches = options.get("only_matches", False)
        if not isinstance(only_matches, bool):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'only_matches' option. Boolean required."
            )
        self["only_matches"] = only_matches

        left = options.get("left", "0")
        if not (left is None or isinstance(left, str)):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'left' option. String required."
            )
        self["left"] = left

        right = options.get("right", "0")
        if not (right is None or isinstance(right, str)):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'right' option. String required."
            )
        self["right"] = right

        sort = options.get("sort", UnitexConstants.SORT_TEXT_ORDER)
        allowed_sorts = (
            UnitexConstants.SORT_TEXT_ORDER,
            UnitexConstants.SORT_LEFT_CENTER,
            UnitexConstants.SORT_LEFT_RIGHT,
            UnitexConstants.SORT_CENTER_LEFT,
            UnitexConstants.SORT_CENTER_RIGHT,
            UnitexConstants.SORT_RIGHT_LEFT,
            UnitexConstants.SORT_RIGHT_CENTER,
        )
        if sort not in allowed_sorts:
            raise UnitexException(
                "[CONCORD] Wrong value for the 'sort' option. UnitexConstants.SORT_XXX required."
            )
        self["sort"] = sort

        # Named 'fmt' locally so the builtin format() is not shadowed.
        fmt = options.get("format", UnitexConstants.FORMAT_TEXT)
        allowed_formats = (
            UnitexConstants.FORMAT_HTML,
            UnitexConstants.FORMAT_TEXT,
            UnitexConstants.FORMAT_GLOSSANET,
            UnitexConstants.FORMAT_SCRIPT,
            UnitexConstants.FORMAT_INDEX,
            UnitexConstants.FORMAT_UIMA,
            UnitexConstants.FORMAT_PRLG,
            UnitexConstants.FORMAT_XML,
            UnitexConstants.FORMAT_XML_WITH_HEADERS,
            UnitexConstants.FORMAT_AXIS,
            UnitexConstants.FORMAT_XALIGN,
            UnitexConstants.FORMAT_MERGE,
        )
        if fmt not in allowed_formats:
            raise UnitexException(
                "[CONCORD] Wrong value for the 'format' option. UnitexConstants.FORMAT_XXX required."
            )
        self["format"] = fmt

        # Format-specific entries start out unset; the branch matching
        # the selected format fills in the ones it needs.
        for key in ("script", "offsets", "unxmlize", "output"):
            self[key] = None

        selected = self["format"]

        if selected in (UnitexConstants.FORMAT_HTML,
                        UnitexConstants.FORMAT_GLOSSANET,
                        UnitexConstants.FORMAT_SCRIPT):
            if self["font"] is None:
                self["font"] = "Courier new"
            if self["fontsize"] is None:
                self["fontsize"] = 12

            if selected in (UnitexConstants.FORMAT_GLOSSANET,
                            UnitexConstants.FORMAT_SCRIPT):
                script = options.get("script", None)
                if script is None:
                    raise UnitexException(
                        "You must provide the 'script' option for UnitexConstants.FORMAT_(GLOSSANET|SCRIPT) formats..."
                    )
                self["script"] = script

        elif selected == UnitexConstants.FORMAT_UIMA:
            offsets = options.get("offsets", None)
            if offsets is None:
                raise UnitexException(
                    "You must provide the 'offsets' option for UnitexConstants.FORMAT_UIMA formats..."
                )
            self["offsets"] = offsets

        elif selected == UnitexConstants.FORMAT_PRLG:
            offsets = options.get("offsets", None)
            if offsets is None:
                raise UnitexException(
                    "You must provide the 'offsets' option for UnitexConstants.FORMAT_PRLG formats..."
                )
            self["offsets"] = offsets

            unxmlize = options.get("unxmlize", None)
            if unxmlize is None:
                raise UnitexException(
                    "You must provide the 'unxmlize' option for UnitexConstants.FORMAT_PRLG format..."
                )
            self["unxmlize"] = unxmlize

        elif selected == UnitexConstants.FORMAT_MERGE:
            output = options.get("output", None)
            if output is None:
                raise UnitexException(
                    "You must provide the 'output' option for UnitexConstants.FORMAT_MERGE format..."
                )
            self["output"] = output

        directory = options.get("directory", None)
        if directory is not None:
            if not isinstance(directory, str):
                raise UnitexException(
                    "[CONCORD] Wrong value for the 'directory' option. String required."
                )
            if exists(directory) is False:
                raise UnitexException(
                    "[CONCORD] The text 'directory' doesn't exist.")
        self["directory"] = directory

        thai = options.get("thai", False)
        if not isinstance(thai, bool):
            raise UnitexException(
                "[CONCORD] Wrong value for the 'thai' option. Boolean required."
            )
        self["thai"] = thai
コード例 #31
0
ファイル: config.py プロジェクト: patwat/python-unitex
    def load(self, options):
        """
        Validate the locate options and store them on this object.

        The following keys are read from *options*:

        - **start_on_space [bool]** -- (default: False).
        - **char_by_char [bool]** -- (default: False).
        - **morpho [list(str)]** -- optional list of morphological
          dictionaries; every file must exist.
        - **korean [bool]** -- (default: False).
        - **arabic_rules [str]** -- optional path; must exist when
          provided.
        - **sntdir [str]** -- optional path; must exist when provided.
        - **negation_operator** -- UnitexConstants.NEGATION_OPERATOR
          (default) or UnitexConstants.NEGATION_OPERATOR_OLD.
        - **number_of_matches [int]** -- optional (default: None).
        - **stop_token_count [list(int)]** -- optional list of exactly
          two integers.
        - **match_mode** -- one of the UnitexConstants.MATCH_MODE_X
          values (default: UnitexConstants.MATCH_MODE_LONGEST).
        - **output_mode** -- one of the UnitexConstants.OUTPUT_MODE_X
          values (default: UnitexConstants.OUTPUT_MODE_IGNORE).
        - **protect_dic_chars [bool]** -- (default: True).
        - **variable [list(str)]** -- optional list of exactly two
          strings; the second one must only contain ascii characters.
        - **ambiguous_outputs [bool]** -- (default: True).
        - **variable_error** -- one of the UnitexConstants.ON_ERROR_X
          values (default: UnitexConstants.ON_ERROR_IGNORE).

        *Raises:*

          **UnitexException** if a value has the wrong type or shape, is
          not one of the allowed constants, or points to a missing file
          or directory.
        """
        start_on_space = options.get("start_on_space", False)
        if isinstance(start_on_space, bool) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'start_on_space' option. Boolean required.")
        self["start_on_space"] = start_on_space

        char_by_char = options.get("char_by_char", False)
        if isinstance(char_by_char, bool) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'char_by_char' option. Boolean required.")
        self["char_by_char"] = char_by_char

        morpho = options.get("morpho", None)
        if morpho is not None:
            if isinstance(morpho, list) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'morpho' option. List of string required.")
            for dictionary in morpho:
                if exists(dictionary) is False:
                    raise UnitexException("[LOCATE] Morphological dictionary '%s' doesn't exist." % dictionary)
        self["morpho"] = morpho

        korean = options.get("korean", False)
        if isinstance(korean, bool) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'korean' option. Boolean required.")
        self["korean"] = korean

        arabic_rules = options.get("arabic_rules", None)
        if arabic_rules is not None:
            if isinstance(arabic_rules, str) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'arabic_rules' option. String required.")
            if exists(arabic_rules) is False:
                raise UnitexException("[LOCATE] Rules file '%s' doesn't exist." % arabic_rules)
        self["arabic_rules"] = arabic_rules

        sntdir = options.get("sntdir", None)
        if sntdir is not None:
            if isinstance(sntdir, str) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'sntdir' option. String required.")
            if exists(sntdir) is False:
                raise UnitexException("[LOCATE] Directory '%s' doesn't exist." % sntdir)
        self["sntdir"] = sntdir

        negation_operator = options.get("negation_operator", UnitexConstants.NEGATION_OPERATOR)
        if negation_operator not in (UnitexConstants.NEGATION_OPERATOR, UnitexConstants.NEGATION_OPERATOR_OLD):
            raise UnitexException("[LOCATE] Wrong value for the 'negation_operator' option. UnitexConstants.NEGATION_OPERATOR(_OLD) required.")
        self["negation_operator"] = negation_operator

        number_of_matches = options.get("number_of_matches", None)
        if number_of_matches is not None and isinstance(number_of_matches, int) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'number_of_matches' option. Integer required.")
        self["number_of_matches"] = number_of_matches

        stop_token_count = options.get("stop_token_count", None)
        if stop_token_count is not None:
            if isinstance(stop_token_count, list) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'stop_token_count' option. List of 2 integers required.")
            if len(stop_token_count) != 2:
                raise UnitexException("[LOCATE] Wrong value for the 'stop_token_count' option. List of 2 integers required.")
            for i in stop_token_count:
                if isinstance(i, int) is False:
                    raise UnitexException("[LOCATE] Wrong value for the 'stop_token_count' option. List of 2 integers required.")
        self["stop_token_count"] = stop_token_count

        match_mode = options.get("match_mode", UnitexConstants.MATCH_MODE_LONGEST)
        if match_mode not in (UnitexConstants.MATCH_MODE_LONGEST,
                              UnitexConstants.MATCH_MODE_SHORTEST,
                              UnitexConstants.MATCH_MODE_ALL):
            raise UnitexException("[LOCATE] Wrong value for the 'match_mode' option. UnitexConstants.MATCH_MODE_X required.")
        self["match_mode"] = match_mode

        output_mode = options.get("output_mode", UnitexConstants.OUTPUT_MODE_IGNORE)
        if output_mode not in (UnitexConstants.OUTPUT_MODE_IGNORE,
                               UnitexConstants.OUTPUT_MODE_MERGE,
                               UnitexConstants.OUTPUT_MODE_REPLACE):
            raise UnitexException("[LOCATE] Wrong value for the 'output_mode' option. UnitexConstants.OUTPUT_MODE_X required.")
        self["output_mode"] = output_mode

        protect_dic_chars = options.get("protect_dic_chars", True)
        if isinstance(protect_dic_chars, bool) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'protect_dic_chars' option. Boolean required.")
        self["protect_dic_chars"] = protect_dic_chars

        variable = options.get("variable", None)
        if variable is not None:
            if isinstance(variable, list) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'variable' option. List of 2 strings required.")
            if len(variable) != 2:
                raise UnitexException("[LOCATE] Wrong value for the 'variable' option. List of 2 strings required.")
            if isinstance(variable[0], str) is False:
                raise UnitexException("[LOCATE] Wrong value for the 'variable' option. List of 2 strings required.")
            # The second element must be a string AND contain only ascii
            # characters. The two tests are joined with 'or' so that the
            # type test short-circuits before ord() is applied: a
            # non-string value is rejected cleanly, and a non-ascii
            # string is rejected too.
            if not isinstance(variable[1], str) or not all(ord(c) < 128 for c in variable[1]):
                raise UnitexException("[LOCATE] Wrong value for the 'variable' option. List of 2 strings required (the second must be *ascii*).")
        self["variable"] = variable

        ambiguous_outputs = options.get("ambiguous_outputs", True)
        if isinstance(ambiguous_outputs, bool) is False:
            raise UnitexException("[LOCATE] Wrong value for the 'ambiguous_outputs' option. Boolean required.")
        self["ambiguous_outputs"] = ambiguous_outputs

        variable_error = options.get("variable_error", UnitexConstants.ON_ERROR_IGNORE)
        if variable_error not in (UnitexConstants.ON_ERROR_IGNORE,
                                  UnitexConstants.ON_ERROR_EXIT,
                                  UnitexConstants.ON_ERROR_BACKTRACK):
            # The accepted values are the ON_ERROR_X constants, so the
            # message names that family.
            raise UnitexException("[LOCATE] Wrong value for the 'variable_error' option. UnitexConstants.ON_ERROR_X required.")
        self["variable_error"] = variable_error