def __init__(self, dialect, script, numeral="Latin", separator='▁'):
    """Load the tokenizer resources for one dialect/script pair.

    No validation of the arguments happens here: they are used directly as
    keys into the loaded JSON data.

    Args:
        dialect (str): key selecting the dialect section of the tokenizer
            data, the lexicon and the morpheme files.
        script (str): key selecting the script section within the dialect.
        numeral (str): key of the numeral table in the preprocessing map
            whose values form the digit character class.
        separator (str): accepted for interface compatibility; this
            initializer does not read it.

    Raises:
        KeyError: if the loaded data has no entry for the given dialect,
            script or numeral.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(klpt.get_data("data/tokenize.json"), encoding="utf-8") as tokenize_file:
        self.tokenize_map = json.load(tokenize_file)

    with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as preprocess_file:
        self.preprocess_map = json.load(preprocess_file)

    # sentence tokenizer variables
    self.dialect, self.script = dialect, script
    sentence_rules = self.tokenize_map["sent_tokenize"]
    script_rules = sentence_rules[self.dialect][self.script]

    self.alphabets = "([%s])" % "".join(script_rules["alphabet"])
    self.prefixes = script_rules["prefixes"]
    self.suffixes = script_rules["suffixes"]
    self.starters = script_rules["starters"]
    self.websites = sentence_rules["universal"]["websites"]
    self.acronyms = script_rules["acronyms"]

    # Unique numeral characters wrapped as a single-character group.
    numeral_table = self.preprocess_map["normalizer"]["universal"]["numerals"][numeral]
    self.digits = "([%s])" % "".join(set(numeral_table.values()))

    # load lexicons
    with open(klpt.data_directory["tokenize"][self.dialect][self.script], "r", encoding="utf-8") as f_lexicon:
        self.lexicon = json.load(f_lexicon)["Lexicon"]

    # Entries whose key contains a hyphen are kept as a separate lexicon.
    self.mwe_lexicon = {lemma: form for lemma, form in self.lexicon.items() if "-" in lemma}

    with open(klpt.data_directory["morphemes"][self.dialect], "r", encoding="utf-8") as f_morphemes:
        self.morphemes = json.load(f_morphemes)["Morphemes"]["Concatenated"][self.script]
def __init__(self, dialect, script):
    """Store the dialect/script pair and set up Hunspell where applicable.

    A Hunspell instance is created only for Sorani in the Arabic script.
    Kurmanji in the Latin script is accepted without creating one. Every
    other combination raises.

    Args:
        dialect (str): name of the dialect.
        script (str): name of the script.

    Raises:
        Exception: if the pair is neither Sorani/Arabic nor Kurmanji/Latin.
    """
    self.dialect, self.script = dialect, script
    self.hunspell_flags = {
        "po": "pos",
        "is": "description",
        "ts": "terminal_suffix",
        "ds": "formation",
    }

    is_sorani_arabic = self.dialect == "Sorani" and self.script == "Arabic"
    is_kurmanji_latin = self.dialect == "Kurmanji" and self.script == "Latin"

    if is_sorani_arabic:
        self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
    elif not is_kurmanji_latin:
        raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")
def __init__(self, dialect, script, numeral="Latin"):
    """Initialize the Preprocess class.

    The preprocessing map is loaded from JSON, then the arguments are passed
    through a Configuration object and the values it exposes are stored on
    the instance.

    Arguments:
        dialect (str): the name of the dialect or its ISO 639-3 code
        script (str): the name of the script
        numeral (str): the type of the numeral
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as preprocess_file:
        self.preprocess_map = json.load(preprocess_file)

    configuration = Configuration({"dialect": dialect, "script": script, "numeral": numeral})
    self.dialect = configuration.dialect
    self.script = configuration.script
    self.numeral = configuration.numeral
def __init__(self, config_dict):
    """Build the configuration from a dictionary of optional settings.

    Each recognised key that is present is handed to the matching
    ``validate_*`` method. For a key that is absent, the corresponding
    attribute is set to its fallback value directly.

    Args:
        config_dict (dict): may contain any of the keys "script",
            "dialect", "numeral", "target_script" and "unknown".
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(klpt.get_data("data/default-options.json"), encoding="utf-8") as options_file:
        self.options = json.load(options_file)

    self.unknown = None

    # The order of the checks is kept as-is: script first, then dialect,
    # numeral, target_script and unknown.
    if "script" in config_dict:
        self.validate_script(config_dict["script"])
    else:
        self.script = None

    if "dialect" in config_dict:
        self.validate_dialect(config_dict["dialect"])
    else:
        self.dialect = None

    if "numeral" in config_dict:
        self.validate_numeral(config_dict["numeral"])
    else:
        self.numeral = None

    if "target_script" in config_dict:
        self.validate_target_script(config_dict["target_script"])
    else:
        self.target_script = None

    if "unknown" in config_dict:
        self.validate_unknown(config_dict["unknown"])
    else:
        self.user_UNKNOWN = "�"
def setUp(self):
    """Load the test cases and the default options from their JSON files."""
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(klpt.get_data("data/test_cases.json"), encoding="utf-8") as f:
        self.test_cases = json.load(f)

    with open(klpt.get_data("data/default-options.json"), encoding="utf-8") as f:
        self.options = json.load(f)
def __init__(self, dialect="Kurmanji", script="Latin"):
    """Create the ATTFST instance from the ``kmr-Latn.att`` data file.

    Args:
        dialect (str): accepted but not read by this initializer.
        script (str): accepted but not read by this initializer.
    """
    att_path = klpt.get_data("data/kmr-Latn.att")
    self.t = ATTFST(att_path)
def __init__(self, dialect, script, target_script, unknown="�", numeral="Latin"):
    """Initialize the transliterator from its data files and a Configuration.

    The arguments are wrapped in a Configuration object, and the values it
    exposes (dialect, script, numeral, mode, target_script, user_UNKNOWN)
    are copied onto the instance. The mapping tables are then read from the
    loaded JSON data.

    To do:
        - "لە ئیسپانیا ژنان لە دژی ‘patriarkavirus’ ڕێپێوانیان کرد": "le îspanya jinan le dijî ‘patriarkavirus’ řêpêwanyan kird"
        - "egerçî damezrandnî rêkxrawe kurdîyekan her rêpênedraw mabûnewe Inzîbat.": "ئەگەرچی دامەزراندنی ڕێکخراوە کوردییەکان هەر رێپێنەدراو مابوونەوە ئنزیبات.",

    Args:
        dialect (str): dialect passed to Configuration.
        script (str): source script passed to Configuration.
        target_script (str): target script passed to Configuration; the
            resulting value selects the numeral and punctuation tables.
        unknown (str, optional): passed to Configuration as "unknown"; the
            resulting value is stored as ``user_UNKNOWN``. Defaults to "�".
        numeral (str, optional): Defaults to "Latin". It is forwarded to the
            internal Preprocess object only when the target script is
            "Arabic"; otherwise Preprocess is created with "Latin".

    Raises:
        ValueError: presumably raised by Configuration for unsupported
            values -- confirm against Configuration.
    """
    self.UNKNOWN = "�"

    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(klpt.get_data("data/wergor.json"), encoding="utf-8") as f:
        self.wergor_configurations = json.load(f)

    with open(klpt.get_data("data/preprocess_map.json"), encoding="utf-8") as f:
        self.preprocess_map = json.load(f)["normalizer"]

    configuration = Configuration({
        "dialect": dialect,
        "script": script,
        "numeral": numeral,
        "target_script": target_script,
        "unknown": unknown,
    })
    self.dialect = configuration.dialect
    self.script = configuration.script
    self.numeral = configuration.numeral
    self.mode = configuration.mode
    self.target_script = configuration.target_script
    self.user_UNKNOWN = configuration.user_UNKNOWN

    wergor = self.wergor_configurations
    self.characters_mapping = wergor["characters_mapping"]

    # Numeral table of the target script, plus the unique set of every
    # character appearing in it as either key or value.
    numerals = self.preprocess_map["universal"]["numerals"][self.target_script]
    self.digits_mapping = numerals
    self.digits_mapping_all = list(set(list(numerals.keys()) + list(numerals.values())))

    # Same pair of structures for punctuation.
    punctuation = wergor["punctuation"][self.target_script]
    self.punctuation_mapping = punctuation
    self.punctuation_mapping_all = list(set(list(punctuation.keys()) + list(punctuation.values())))

    self.wy_mappings = wergor["wy_mappings"]
    self.hemze = wergor["hemze"]
    self.bizroke = wergor["bizroke"]
    self.uw_iy_forms = wergor["uw_iy_forms"]
    self.target_char = wergor["target_char"]
    self.arabic_vowels = wergor["arabic_vowels"]
    self.arabic_cons = wergor["arabic_cons"]
    self.latin_vowels = wergor["latin_vowels"]
    self.latin_cons = wergor["latin_cons"]

    self.characters_pack = {
        "arabic_to_latin": self.characters_mapping.values(),
        "latin_to_arabic": self.characters_mapping.keys(),
    }

    # NOTE(review): the dialect and script given to Preprocess are fixed to
    # "Sorani"/"Latin" regardless of the arguments -- confirm this is intended.
    prep_numeral = self.numeral if self.target_script == "Arabic" else "Latin"
    self.prep = Preprocess("Sorani", "Latin", numeral=prep_numeral)