Ejemplo n.º 1
0
    def __init__(self, dialect, script, numeral="Latin", separator='▁'):
        """Load the tokenization resources for one dialect/script pair.

        No validation is performed here: ``dialect``, ``script`` and
        ``numeral`` are used directly as keys into the loaded JSON data.

        Args:
            dialect (str): key selecting the dialect section of the data files.
            script (str): key selecting the script section of the data files.
            numeral (str): key into the ``numerals`` table of
                ``preprocess_map.json``; the values of that table form the
                digit character class. Defaults to "Latin".
            separator (str): accepted for interface compatibility; it is not
                read by this initializer.

        Raises:
            KeyError: if ``dialect``, ``script`` or ``numeral`` has no entry
                in the loaded data (lookups are plain subscripting).
        """
        # JSON is UTF-8 by specification, so decode explicitly instead of
        # relying on the platform's default text encoding.
        with open(klpt.get_data("data/tokenize.json"), encoding="utf-8") as tokenize_file:
            self.tokenize_map = json.load(tokenize_file)

        with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as preprocess_file:
            self.preprocess_map = json.load(preprocess_file)

        # sentence tokenizer variables
        self.dialect, self.script = dialect, script
        sent_rules = self.tokenize_map["sent_tokenize"]
        variety_rules = sent_rules[self.dialect][self.script]

        # Regex character class (as a capturing group) over the alphabet.
        self.alphabets = "([%s])" % "".join(variety_rules["alphabet"])
        self.prefixes = variety_rules["prefixes"]
        self.suffixes = variety_rules["suffixes"]
        self.starters = variety_rules["starters"]
        self.websites = sent_rules["universal"]["websites"]
        self.acronyms = variety_rules["acronyms"]

        # Deduplicate the digit characters; their order inside a character
        # class is irrelevant.
        numerals = self.preprocess_map["normalizer"]["universal"]["numerals"][numeral]
        self.digits = "([%s])" % "".join(set(numerals.values()))

        # load lexicons
        with open(klpt.data_directory["tokenize"][self.dialect][self.script], "r", encoding="utf-8") as f_lexicon:
            self.lexicon = json.load(f_lexicon)["Lexicon"]

        # Entries whose key contains a hyphen are kept as a separate lexicon.
        self.mwe_lexicon = {lemma: form for lemma, form in self.lexicon.items() if "-" in lemma}

        with open(klpt.data_directory["morphemes"][self.dialect], "r", encoding="utf-8") as f_morphemes:
            self.morphemes = json.load(f_morphemes)["Morphemes"]["Concatenated"][self.script]
Ejemplo n.º 2
0
    def __init__(self, dialect, script):
        """Store the dialect/script pair and set up Hunspell where applicable.

        A Hunspell instance (``self.huns``) is created only for the
        Sorani/Arabic pair. The Kurmanji/Latin pair is accepted without
        creating one.

        Args:
            dialect (str): name of the dialect.
            script (str): name of the script.

        Raises:
            Exception: for any pair other than Sorani/Arabic or
                Kurmanji/Latin.
        """
        self.dialect, self.script = dialect, script

        self.hunspell_flags = {
            "po": "pos",
            "is": "description",
            "ts": "terminal_suffix",
            "ds": "formation",
        }

        variety = (self.dialect, self.script)
        if variety == ("Sorani", "Arabic"):
            dictionary_dir = klpt.get_data("data/")
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=dictionary_dir)
        elif variety != ("Kurmanji", "Latin"):
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")
Ejemplo n.º 3
0
    def __init__(self, dialect, script, numeral="Latin"):
        """
        Initialization of the Preprocess class

        Loads ``data/preprocess_map.json`` and stores the dialect, script and
        numeral as exposed by a ``Configuration`` built from the arguments.

        Arguments:
            dialect (str): the name of the dialect or its ISO 639-3  code
            script (str): the name of the script
            numeral (str): the type of the numeral

        """
        # JSON is UTF-8 by specification, so decode explicitly instead of
        # relying on the platform's default text encoding.
        with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as preprocess_file:
            self.preprocess_map = json.load(preprocess_file)

        # Configuration is defined elsewhere; the stored values are whatever
        # it exposes, not necessarily the raw arguments.
        configuration = Configuration({"dialect": dialect, "script": script, "numeral": numeral})
        self.dialect = configuration.dialect
        self.script = configuration.script
        self.numeral = configuration.numeral
Ejemplo n.º 4
0
    def __init__(self, config_dict):
        """Build a configuration from a dictionary of options.

        Each recognised key that is present in ``config_dict`` is passed to
        the matching ``validate_*`` method (defined elsewhere in the class;
        presumably it stores the validated value on the instance). For keys
        that are absent, a default attribute is set here instead.

        Args:
            config_dict (dict): options; the recognised keys are
                - "script": absent -> ``self.script = None``
                - "dialect": absent -> ``self.dialect = None``
                - "numeral": absent -> ``self.numeral = None``
                - "target_script": absent -> ``self.target_script = None``
                - "unknown": absent -> ``self.user_UNKNOWN = "�"``

        """

        # JSON is UTF-8 by specification, so decode explicitly instead of
        # relying on the platform's default text encoding.
        with open(klpt.get_data("data/default-options.json"), encoding="utf-8") as options_file:
            self.options = json.load(options_file)

        self.unknown = None

        # The validation order (script, dialect, numeral, target_script,
        # unknown) is kept as-is in case the validators depend on each other.
        if "script" in config_dict:
            self.validate_script(config_dict["script"])
        else:
            self.script = None

        if "dialect" in config_dict:
            self.validate_dialect(config_dict["dialect"])
        else:
            self.dialect = None

        if "numeral" in config_dict:
            self.validate_numeral(config_dict["numeral"])
        else:
            self.numeral = None

        if "target_script" in config_dict:
            self.validate_target_script(config_dict["target_script"])
        else:
            self.target_script = None

        if "unknown" in config_dict:
            self.validate_unknown(config_dict["unknown"])
        else:
            self.user_UNKNOWN = "�"
Ejemplo n.º 5
0
 def setUp(self):
     """Load the test-case and default-option JSON files onto the instance."""
     # JSON is UTF-8 by specification, so decode explicitly instead of
     # relying on the platform's default text encoding.
     with open(klpt.get_data("data/test_cases.json"), encoding="utf-8") as f:
         self.test_cases = json.load(f)

     with open(klpt.get_data("data/default-options.json"), encoding="utf-8") as f:
         self.options = json.load(f)
Ejemplo n.º 6
0
 def __init__(self, dialect="Kurmanji", script="Latin"):
     """Create the ATTFST object from the packaged ``data/kmr-Latn.att`` file.

     The ``dialect`` and ``script`` arguments are accepted but not read
     here; the data file path is fixed.
     """
     att_path = klpt.get_data("data/kmr-Latn.att")
     self.t = ATTFST(att_path)
Ejemplo n.º 7
0
    def __init__(self,
                 dialect,
                 script,
                 target_script,
                 unknown="�",
                 numeral="Latin"):
        """Initializing using a Configuration object

        Loads ``data/wergor.json`` and the ``normalizer`` section of
        ``data/preprocess_map.json``, passes the arguments through
        ``Configuration``, and caches the mapping tables for the target
        script on the instance.

        To do:
            - "لە ئیسپانیا ژنان لە دژی ‘patriarkavirus’ ڕێپێوانیان کرد": "le îspanya jinan le dijî ‘patriarkavirus’ řêpêwanyan kird"
            - "egerçî damezrandnî rêkxrawe kurdîyekan her rêpênedraw mabûnewe Inzîbat.": "ئەگەرچی دامەزراندنی ڕێکخراوە کوردییەکان هەر رێپێنەدراو مابوونەوە ئنزیبات.",

        Args:
            dialect (str): dialect option passed to ``Configuration``.
            script (str): source script option passed to ``Configuration``.
            target_script (str): target script option passed to
                ``Configuration``; also selects the numeral and punctuation
                tables used here.
            unknown (str, optional): unknown-token option passed to
                ``Configuration``. Defaults to "�".
            numeral (str, optional): numeral option passed to
                ``Configuration``. Defaults to "Latin". It is forwarded to
                the internal ``Preprocess`` only when the target script is
                "Arabic"; otherwise "Latin" is used.

        Raises:
            KeyError: if the target script has no entry in the numeral or
                punctuation tables.
            Any exception raised by ``Configuration`` for invalid options
            (not visible from this method).

        """
        self.UNKNOWN = "�"

        # JSON is UTF-8 by specification, so decode explicitly instead of
        # relying on the platform's default text encoding.
        with open(klpt.get_data("data/wergor.json"), encoding="utf-8") as f:
            self.wergor_configurations = json.load(f)

        with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as f:
            self.preprocess_map = json.load(f)["normalizer"]

        configuration = Configuration({
            "dialect": dialect,
            "script": script,
            "numeral": numeral,
            "target_script": target_script,
            "unknown": unknown
        })
        self.dialect = configuration.dialect
        self.script = configuration.script
        self.numeral = configuration.numeral
        self.mode = configuration.mode
        self.target_script = configuration.target_script
        self.user_UNKNOWN = configuration.user_UNKNOWN

        wergor = self.wergor_configurations
        self.characters_mapping = wergor["characters_mapping"]

        # Numeral table for the target script, plus the deduplicated list of
        # every character appearing as a key or a value in it.
        numerals = self.preprocess_map["universal"]["numerals"][self.target_script]
        self.digits_mapping = numerals
        self.digits_mapping_all = list({*numerals.keys(), *numerals.values()})

        # Same pair of structures for punctuation.
        punctuation = wergor["punctuation"][self.target_script]
        self.punctuation_mapping = punctuation
        self.punctuation_mapping_all = list(
            {*punctuation.keys(), *punctuation.values()})

        self.wy_mappings = wergor["wy_mappings"]

        self.hemze = wergor["hemze"]
        self.bizroke = wergor["bizroke"]
        self.uw_iy_forms = wergor["uw_iy_forms"]
        self.target_char = wergor["target_char"]
        self.arabic_vowels = wergor["arabic_vowels"]
        self.arabic_cons = wergor["arabic_cons"]
        self.latin_vowels = wergor["latin_vowels"]
        self.latin_cons = wergor["latin_cons"]

        # Keyed by mode name: values of the character mapping for
        # "arabic_to_latin", keys for "latin_to_arabic".
        self.characters_pack = {
            "arabic_to_latin": self.characters_mapping.values(),
            "latin_to_arabic": self.characters_mapping.keys()
        }

        # The configured numeral is only honoured when the target script is
        # Arabic; otherwise the preprocessor uses Latin numerals.
        prep_numeral = self.numeral if self.target_script == "Arabic" else "Latin"
        self.prep = Preprocess("Sorani", "Latin", numeral=prep_numeral)