from typing import Text, Union


def turkify(text: Text) -> Union[Text, None]:
    if text:
        deasciifier = Deasciifier(text)
        return deasciifier.convert_to_turkish()
    else:
        return None
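# A minimal usage sketch of the Deasciifier API the snippets in this file rely
# on. Assumption: `Deasciifier` is the class from the `turkish-deasciifier`
# package (a Python port of Deniz Yuret's Emacs turkish-mode); adjust the
# import if your project ships it from somewhere else.
from turkish.deasciifier import Deasciifier

deasciifier = Deasciifier("Hadi bir masal uyduralim")
print(deasciifier.convert_to_turkish())  # expected: "Hadi bir masal uyduralım"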
import re


def to_tr(s):
    # Split on $...$ spans (e.g. inline math) so they can be kept verbatim.
    tokens = re.split(r"(\$.*?\$)", s)
    res = []
    for x in tokens:
        if not x:  # re.split can yield empty strings at the edges
            continue
        if x[0] == '$' and x[-1] == '$':
            res.append(x)
            continue
        dea = Deasciifier(x)
        x = dea.convert_to_turkish()
        res.append(x)
    return ''.join(res)
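# A quick check of the $...$ passthrough above. The exact deasciified words
# depend on the package's pattern tables, so only the math span is asserted:
out = to_tr("Eger $x^2$ pozitifse")
assert "$x^2$" in out  # the math span is appended verbatim, never deasciified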
def turkish_tokenize(self, string):
    """
    This helper method tokenises the text with respect to rules specific to
    the Turkish language.

    :param string: The text that is already tokenised by the common tokeniser.
    :type string: str
    :return: The tokenised output of the text.
    :rtype: list
    """
    string = UnicodeTr(string).lower()
    deasciifier = Deasciifier(string)
    string = deasciifier.convert_to_turkish()
    string = self.common_tokenize(string)
    return string.strip().split()
def __call__(self, doc):
    tokens = [
        token for token in tokenizer(doc, self.lang, True)
        if token.isalnum() and len(token) > 0 and not token.isspace()
    ]  # we can eliminate punctuation as well
    tokens = [token.lower() for token in tokens]
    if self.remove_numbers:
        number_pattern = r"[a-zA-Z]{,3}\d{6,}"
        tokens = [re.sub(number_pattern, "", token) for token in tokens]
    if self.eliminate_stopwords:
        stopwords = stopword_lists.get_stopwords(lang="tr")
        tokens = [token for token in tokens if token not in stopwords]
    if self.apply_stemming:
        tokens = [tr_stemmer.stem2(token) for token in tokens]
    if self.deasciify:
        tokens = [Deasciifier(token).convert_to_turkish() for token in tokens]
    tokens = [token.strip() for token in tokens]
    tokens = [token for token in tokens if len(token) > 0]  # or not token.isspace()
    return tokens
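# A standalone check of number_pattern above: at most three letters followed by
# six or more digits (ID-like tokens) are blanked out; ordinary words survive.
import re

assert re.sub(r"[a-zA-Z]{,3}\d{6,}", "", "tr1234567") == ""
assert re.sub(r"[a-zA-Z]{,3}\d{6,}", "", "merhaba") == "merhaba"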
def deasciify_words(words, lang):
    if lang in ["tr", "turkish"]:
        return [Deasciifier(token).convert_to_turkish() for token in words]
    else:
        return words
def post(self):
    submitted_content = SubmittedContent()
    if users.get_current_user():
        submitted_content.author = users.get_current_user()
    submitted_content.content = self.request.get('content')
    submitted_content.put()
    string = cgi.escape(self.request.get('content'))
    dea = Deasciifier(string)
    result = dea.convert_to_turkish()
    template_values = {'result': result}
    path = os.path.join(os.path.dirname(__file__), 'deasciify.html')
    self.response.out.write(template.render(path, template_values))
def stem(self, tokens: list, deasciify, asciify):
    # stemming
    # print("stems", tokens)
    result = []
    for word in tokens:
        if len(word) < 3:
            continue
        if "http" in word:
            continue
        # deasciify the string
        if deasciify:
            deasciifier = Deasciifier(word)
            word = deasciifier.convert_to_turkish()
        if asciify:
            asciify_string = str.maketrans("çğıöşü", "cgiosu")
            word = word.translate(asciify_string)
        word = word if len(word) < 7 else word[:6]
        result.append(word)
    return result


# words = defaultdict(int)
# series = history_clean['msg'][:50]
# names = history_clean['name'][:50]
# for idx, sentence in enumerate(series):
#     # print(sentence, idx)
#     # name = names[idx]
#     sentence = sentence.split(' ')
#     for word in sentence:
#         words[word] += 1
# sorted_words = sorted(words.items(), key=lambda kv: kv[1])
# # print(sorted_words[-50:])

# obj = detector.TurkishNLP()
# # obj.download()
# obj.create_word_set()

# def correct_words(text):
#     text = obj.list_words(text)
#     text = obj.auto_correct(text)
#     return text
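# A standalone check of the asciify step used in stem() above: maketrans maps
# each Turkish-specific letter to its ASCII counterpart, one to one.
asciify_string = str.maketrans("çğıöşü", "cgiosu")
assert "başlığı".translate(asciify_string) == "basligi"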
def __call__(self, doc):
    # tokens = super.__call__(self, doc)
    tokens = _TokenHandler.__call__(self, doc)
    '''
    if self.stemming:
        tokens = [tr_stemmer.stem2(token) for token in tokens]
    '''
    if self.deasciify:
        tokens = [Deasciifier(token).convert_to_turkish() for token in tokens]
    return tokens
import string


def deasciify_tr_text(text):
    words = text.split()
    punkts = string.punctuation
    npunct1 = 0
    npunct2 = 0
    correct_words = []
    for w in words:
        # Strip the punctuation before spelling correction and put it back
        # afterwards.
        lpunct = ""
        rpunct = ""
        correct_word = ""
        if is_punctuation(w):
            correct_word = w
        else:
            lw = w.lstrip(punkts)  # remove leading punctuation
            npunct1 = len(w) - len(lw)  # the difference tells how many chars to put back
            lpunct = w[:npunct1]
            rw = w.rstrip(punkts)
            npunct2 = len(w) - len(rw)
            if npunct2 > 0:  # otherwise the slicer would select the whole string
                rpunct = w[-npunct2:]
            no_punct_word = w.strip(punkts)
            suggested_word = Deasciifier(no_punct_word).convert_to_turkish()
            correct_word = lpunct + suggested_word + rpunct
        correct_words.append(correct_word)
        '''
        print(w, len(w), no_punct_word, len(no_punct_word))
        print("l:", lpunct, "-s:", suggested_word, "-r:", rpunct)
        print("####\n")
        '''
    correcttext = " ".join(correct_words)
    return correcttext
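# Sketch of the punctuation round-trip above (hypothetical input; assumes
# is_punctuation() returns False for tokens that contain letters, and the
# deasciified word itself depends on the package's model): for a token like
# '"dusunce",' only 'dusunce' reaches Deasciifier, then '"' and '",' come back.
out = deasciify_tr_text('"dusunce",')
assert out.startswith('"') and out.endswith('",')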
def test_convert_to_turkish(self):
    for i in range(len(self.ascii_strings)):
        dea = Deasciifier(self.ascii_strings[i])
        result = dea.convert_to_turkish()
        self.assertEqual(result, self.turkish_strings[i])
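# A hypothetical fixture for the test above (the real lists are defined
# elsewhere in the test class; this pair mirrors the package's README example):
def setUp(self):
    self.ascii_strings = ["Hadi bir masal uyduralim"]
    self.turkish_strings = ["Hadi bir masal uyduralım"]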
def deasciify_words(words, lang):
    if lang in ["tr", "turkish"]:
        return [Deasciifier(token).convert_to_turkish() for token in words]
    else:
        # not applicable for english and arabic
        return words
def correct_letters(text):
    return Deasciifier(text).convert_to_turkish()
def deasciify_word(word):
    return Deasciifier(word).convert_to_turkish()
def deascii(text):
    my_ascii_turkish_txt = text
    deasciifier = Deasciifier(my_ascii_turkish_txt.decode("utf-8"))
    my_deasciified_turkish_txt = deasciifier.convert_to_turkish()
    return my_deasciified_turkish_txt.encode("utf-8")
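# The decode()/encode() calls above imply Python 2 byte strings. A Python 3
# sketch of the same helper (hypothetical name), where text is already str:
def deascii_py3(text):
    return Deasciifier(text).convert_to_turkish()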
import re