def turkify(text: Text) -> Union[Text, None]:
    """Return the deasciified (proper Turkish) form of *text*.

    Falsy input (empty string or None) yields None.
    """
    if not text:
        return None
    return Deasciifier(text).convert_to_turkish()
def to_tr(s):
    """Deasciify *s* into Turkish, leaving $-delimited math spans untouched.

    The text is split on ``$...$`` segments (kept via the capturing group);
    math segments are passed through verbatim, everything else is run
    through the deasciifier.

    :param s: input text, possibly containing inline ``$...$`` math
    :return: the deasciified text with math spans preserved
    """
    tokens = re.split("(\\$.*?\\$)", s)
    res = []
    for x in tokens:
        # re.split with a capturing group emits empty strings at the
        # boundaries (e.g. when s starts or ends with a $...$ span);
        # the original x[0] indexing crashed on those — skip them.
        if not x:
            continue
        if x[0] == '$' and x[-1] == '$':
            res.append(x)
            continue
        dea = Deasciifier(x)
        res.append(dea.convert_to_turkish())
    return ''.join(res)
def to_tr(s):
    """Deasciify *s* into Turkish, leaving $-delimited math spans untouched.

    The text is split on ``$...$`` segments (kept via the capturing group);
    math segments are passed through verbatim, everything else is run
    through the deasciifier.

    :param s: input text, possibly containing inline ``$...$`` math
    :return: the deasciified text with math spans preserved
    """
    tokens = re.split("(\\$.*?\\$)", s)
    res = []
    for x in tokens:
        # re.split with a capturing group emits empty strings at the
        # boundaries (e.g. when s starts or ends with a $...$ span);
        # the original x[0] indexing crashed on those — skip them.
        if not x:
            continue
        if x[0] == '$' and x[-1] == '$':
            res.append(x)
            continue
        dea = Deasciifier(x)
        res.append(dea.convert_to_turkish())
    return ''.join(res)
def turkish_tokenize(self, string):
    """Tokenise *string* using Turkish-specific normalisation.

    The text is lower-cased with Turkish casing rules, deasciified,
    run through the common tokeniser, and finally split on whitespace.

    :param string: text already processed by the common tokeniser
    :type string: str
    :return: the list of tokens
    :rtype: list
    """
    lowered = UnicodeTr(string).lower()
    deasciified = Deasciifier(lowered).convert_to_turkish()
    normalised = self.common_tokenize(deasciified)
    return normalised.strip().split()
def post(self):
    """Handle a POST request: persist the submitted text (when a user is
    signed in) and render its deasciified form via the deasciify template.
    """
    record = SubmittedContent()
    # Only attach and store the submission for authenticated users.
    if users.get_current_user():
        record.author = users.get_current_user()
        record.content = self.request.get('content')
        record.put()
    escaped = cgi.escape(self.request.get('content'))
    converted = Deasciifier(escaped).convert_to_turkish()
    context = {'result': converted}
    template_path = os.path.join(os.path.dirname(__file__), 'deasciify.html')
    self.response.out.write(template.render(template_path, context))
def stem(self, tokens: list, deasciify, asciify):
    """Normalise a token list for downstream processing.

    Tokens shorter than 3 characters and tokens containing "http"
    (URLs) are dropped.  Each surviving token is optionally deasciified,
    optionally ASCII-folded (Turkish characters mapped to their ASCII
    look-alikes), and truncated to at most 6 characters as a crude stem.

    :param tokens: list of word tokens
    :param deasciify: when truthy, run each token through Deasciifier
    :param asciify: when truthy, replace "çğıöşü" with "cgiosu"
    :return: list of processed token stems
    """
    # Loop-invariant: build the translation table once, not per token.
    ascii_table = str.maketrans("çğıöşü", "cgiosu")
    result = []
    for word in tokens:
        if len(word) < 3:
            continue  # too short to carry meaning
        if "http" in word:
            continue  # skip URLs
        if deasciify:  # was `== True`; truthiness is the Python idiom
            word = Deasciifier(word).convert_to_turkish()
        if asciify:
            word = word.translate(ascii_table)
        # Crude stemming: keep at most the first 6 characters.
        result.append(word if len(word) < 7 else word[:6])
    return result
def test_convert_to_turkish(self):
    """Each ASCII fixture must deasciify to its paired Turkish fixture."""
    # Idiomatic pairing with zip instead of indexing via range(len(...)).
    for ascii_text, expected in zip(self.ascii_strings, self.turkish_strings):
        result = Deasciifier(ascii_text).convert_to_turkish()
        self.assertEqual(result, expected)
def test_convert_to_turkish(self):
    """Each ASCII fixture must deasciify to its paired Turkish fixture."""
    # Idiomatic pairing with zip instead of indexing via range(len(...)).
    for ascii_text, expected in zip(self.ascii_strings, self.turkish_strings):
        result = Deasciifier(ascii_text).convert_to_turkish()
        self.assertEqual(result, expected)
import re
def deascii(text):
    """Deasciify UTF-8 encoded byte input and return UTF-8 encoded bytes.

    The input is decoded to text, converted to proper Turkish by the
    deasciifier, and re-encoded before being returned.
    """
    decoded = text.decode("utf-8")
    converted = Deasciifier(decoded).convert_to_turkish()
    return converted.encode("utf-8")