from typing import Optional, Text

from turkish.deasciifier import Deasciifier


def turkify(text: Text) -> Optional[Text]:
    # Deasciify non-empty input; empty or None input returns None.
    if text:
        deasciifier = Deasciifier(text)
        return deasciifier.convert_to_turkish()
    return None
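A minimal usage sketch, assuming the turkish-deasciifier package that provides Deasciifier is installed (input string invented for illustration):

print(turkify("Turkce yazilmis bir cumle"))  # plausibly "Türkçe yazılmış bir cümle"
print(turkify(""))                           # None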
Example #2
import re

from turkish.deasciifier import Deasciifier

def to_tr(s):
    # Deasciify the text but pass inline $...$ TeX spans through untouched.
    tokens = re.split(r"(\$.*?\$)", s)
    res = []
    for x in tokens:
        if not x:
            # re.split can yield empty strings at the boundaries; skip them.
            continue
        if x.startswith("$") and x.endswith("$"):
            res.append(x)
            continue
        res.append(Deasciifier(x).convert_to_turkish())
    return "".join(res)
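For illustration, a hedged call (input invented; the math span should pass through unchanged):

print(to_tr("Matematikte $x^2$ onemlidir"))  # plausibly "Matematikte $x^2$ önemlidir"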
Example #4
    def turkish_tokenize(self, string):
        """
        Tokenise the text using rules specific to Turkish: Turkish-aware
        lower-casing, deasciification, then the common tokeniser.

        :param string: The raw text to be tokenised.
        :type string: str
        :return: The tokens extracted from the text.
        :rtype: list
        """
        # UnicodeTr applies Turkish case folding (e.g. I -> ı, İ -> i).
        string = UnicodeTr(string).lower()
        # Restore Turkish characters that were typed as plain ASCII.
        deasciifier = Deasciifier(string)
        string = deasciifier.convert_to_turkish()
        string = self.common_tokenize(string)
        return string.strip().split()
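A hedged usage sketch (the owning class, UnicodeTr, and common_tokenize are not shown in this snippet, so the class name below is hypothetical):

tokenizer = TurkishTokenizer()  # hypothetical owning class
tokenizer.turkish_tokenize("Gunaydin COK guzel")
# plausibly ['günaydın', 'çok', 'güzel'] -- exact tokens depend on common_tokenize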
Example #5
    def post(self):
        # Persist the raw submission, attributing it to the signed-in user, if any.
        submitted_content = SubmittedContent()
        if users.get_current_user():
            submitted_content.author = users.get_current_user()
        submitted_content.content = self.request.get('content')
        submitted_content.put()

        # HTML-escape the input, deasciify it, and render the result page.
        # (cgi.escape is Python 2 era; html.escape replaced it in Python 3.)
        string = cgi.escape(self.request.get('content'))
        dea = Deasciifier(string)
        result = dea.convert_to_turkish()

        template_values = {'result': result}
        path = os.path.join(os.path.dirname(__file__), 'deasciify.html')
        self.response.out.write(template.render(path, template_values))
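Context: this handler follows the classic Google App Engine webapp pattern; SubmittedContent appears to be a datastore model defined elsewhere, and template.render is App Engine's bundled Django-style templating, neither of which is shown in the snippet.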
Example #6
    def stem(self, tokens: list, deasciify: bool, asciify: bool) -> list:
        """Crude stemmer: filter short words and URLs, normalise, truncate."""
        result = []
        for word in tokens:
            # Skip very short words and anything URL-like.
            if len(word) < 3:
                continue
            if "http" in word:
                continue
            # Optionally restore Turkish characters from ASCII-typed input.
            if deasciify:
                deasciifier = Deasciifier(word)
                word = deasciifier.convert_to_turkish()
            # Optionally strip Turkish diacritics back down to plain ASCII.
            if asciify:
                asciify_table = str.maketrans("çğıöşü", "cgiosu")
                word = word.translate(asciify_table)
            # Fixed-length truncation as a cheap stand-in for real stemming.
            word = word if len(word) < 7 else word[:6]
            result.append(word)
        return result


# words = defaultdict(int)
# series = history_clean['msg'][:50]
# names = history_clean['name'][:50]
# for idx, sentence in enumerate(series):
#     # print(sentence, idx)
#     #name = names[idx]
#     sentence = sentence.split(' ')
#     for word in sentence:
#         words[word] += 1
# sorted_words = sorted(words.items(), key=lambda kv: kv[1])
#
# print(sorted_words[-50:])

# obj = detector.TurkishNLP()
# # obj.download()
# obj.create_word_set()
# def correct_words(text):
#   text = obj.list_words(text)
#   text = obj.auto_correct(text)
#   return text
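A minimal sketch of calling stem (the surrounding class is not shown, so the instance below is hypothetical):

pre = Preprocessor()  # hypothetical owning class
pre.stem(["selamlar", "http://x.co", "guzelmis"], deasciify=True, asciify=False)
# plausibly ['selaml', 'güzelm'] -- URL dropped, words deasciified then cut to 6 chars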
Example #7
    def test_convert_to_turkish(self):
        for i in range(len(self.ascii_strings)):
            dea = Deasciifier(self.ascii_strings[i])
            result = dea.convert_to_turkish()
            self.assertEqual(result, self.turkish_strings[i])
Example #9
import re
Example #10
from turkish.deasciifier import Deasciifier

def deascii(text):
    # Accepts UTF-8 encoded bytes; returns the deasciified text as UTF-8 bytes.
    deasciifier = Deasciifier(text.decode("utf-8"))
    deasciified = deasciifier.convert_to_turkish()
    return deasciified.encode("utf-8")
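A quick hedged round-trip (input bytes invented for illustration):

raw = "Gunaydin, nasilsin?".encode("utf-8")
print(deascii(raw).decode("utf-8"))  # plausibly "Günaydın, nasılsın?"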