def turkify(text: Text) -> Union[Text, None]:
    # Deasciify non-empty input; falsy input (None or "") yields None.
    if text:
        deasciifier = Deasciifier(text)
        return deasciifier.convert_to_turkish()
    return None
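A minimal usage sketch, assuming the turkish-deasciifier package provides Deasciifier and that typing supplies Text and Union (neither import appears in the snippet above):

from typing import Text, Union
from turkish.deasciifier import Deasciifier

print(turkify("Turkce yazilar"))  # expected: 'Türkçe yazılar'
print(turkify(""))                # None: falsy input short-circuits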
Example #2
def to_tr(s):
    # Split on $...$ spans so that inline math passes through verbatim.
    tokens = re.split(r"(\$.*?\$)", s)
    res = []
    for x in tokens:
        # re.split can yield empty strings at the boundaries, so use
        # startswith/endswith instead of indexing x[0] and x[-1].
        if x.startswith("$") and x.endswith("$"):
            res.append(x)
            continue
        dea = Deasciifier(x)
        res.append(dea.convert_to_turkish())
    return ''.join(res)
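A quick check of the $...$ guard, under the same package assumption as above: spans wrapped in dollar signs are appended untouched, so inline LaTeX math is not mangled by the deasciifier.

print(to_tr("Olasilik $p < 0.05$ ise"))
# expected: 'Olasılık $p < 0.05$ ise' (the math span is left verbatim)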
Example #4
    def turkish_tokenize(self, string):
        """
        This helper method tokenises the text according to rules specific to the Turkish language.

        :param string: The raw text to tokenise.
        :type string: str
        :return: The tokenised output of the text.
        :rtype: list
        """

        # Turkish-aware lowercasing (dotted/dotless I), then deasciify,
        # then hand off to the language-agnostic common tokeniser.
        string = UnicodeTr(string).lower()
        deasciifier = Deasciifier(string)
        string = deasciifier.convert_to_turkish()
        string = self.common_tokenize(string)
        return string.strip().split()
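The Turkish-aware lower() matters because of the dotted/dotless I distinction: in Turkish, "I" lowercases to "ı" and "İ" to "i", which Python's locale-independent str.lower() does not do. A small illustration (UnicodeTr is the host project's helper; its exact behaviour is assumed here):

print("ISPARTA".lower())        # 'isparta' (built-in; wrong for Turkish)
# UnicodeTr("ISPARTA").lower()  # expected: 'ısparta' (assumed Turkish-aware result)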
Example #5
    def __call__(self, doc):

        tokens = [
            token for token in tokenizer(doc, self.lang, True)
            if token.isalnum()  # also eliminates punctuation and whitespace
        ]
        tokens = [token.lower() for token in tokens]

        if self.remove_numbers:
            # Up to 3 letters followed by 6+ digits (e.g. IDs, phone numbers).
            number_pattern = r"[a-zA-Z]{,3}\d{6,}"
            tokens = [re.sub(number_pattern, "", token) for token in tokens]

        if self.eliminate_stopwords:
            stopwords = stopword_lists.get_stopwords(lang="tr")
            tokens = [token for token in tokens if token not in stopwords]

        if self.apply_stemming:
            tokens = [tr_stemmer.stem2(token) for token in tokens]

        if self.deasciify:
            tokens = [
                Deasciifier(token).convert_to_turkish() for token in tokens
            ]

        tokens = [token.strip() for token in tokens]
        tokens = [token for token in tokens if len(token) > 0]
        return tokens
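One detail worth double-checking is the number pattern: the range [a-zA-z] in the original also matches the six punctuation characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which is why it is written [a-zA-Z] above. A quick sanity check of the corrected pattern:

import re
pattern = r"[a-zA-Z]{,3}\d{6,}"          # at most 3 letters, then 6+ digits
print(re.sub(pattern, "", "tr1234567"))  # '' (token removed)
print(re.sub(pattern, "", "merhaba"))    # 'merhaba' (left untouched)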
Example #6
def deasciify_words(words, lang):

    if lang in ["tr", "turkish"]:
        return [Deasciifier(token).convert_to_turkish() for token in words]
    else:  # not applicable for English and Arabic
        return words
Example #7
    def post(self):
        # Google App Engine (webapp) handler: persist the submission,
        # then render the deasciified result.
        submitted_content = SubmittedContent()

        if users.get_current_user():
            submitted_content.author = users.get_current_user()

        submitted_content.content = self.request.get('content')
        submitted_content.put()

        # cgi.escape is a Python 2 era call; it was removed in Python 3.8
        # (html.escape is the modern equivalent).
        string = cgi.escape(self.request.get('content'))
        dea = Deasciifier(string)
        result = dea.convert_to_turkish()

        template_values = {'result': result}

        path = os.path.join(os.path.dirname(__file__), 'deasciify.html')
        self.response.out.write(template.render(path, template_values))
Example #8
    def stem(self, tokens: list, deasciify, asciify):
        result = []
        # Translation table for stripping Turkish diacritics.
        asciify_table = str.maketrans("çğıöşü", "cgiosu")
        for word in tokens:
            if len(word) < 3:   # drop very short tokens
                continue
            if "http" in word:  # drop URLs
                continue
            if deasciify:
                deasciifier = Deasciifier(word)
                word = deasciifier.convert_to_turkish()
            if asciify:
                word = word.translate(asciify_table)
            # Crude fixed-prefix stemming: keep at most the first 6 characters.
            word = word if len(word) < 7 else word[:6]
            result.append(word)
        return result
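A usage sketch of this method, with a hypothetical host class name: words shorter than 3 characters and URLs are dropped, and everything else is cut to a 6-character prefix.

# p = Preprocessor()   # hypothetical name for the class defining stem()
# p.stem(["kitaplarımızdan", "evlerde", "ev", "http://example.com"],
#        deasciify=False, asciify=True)
# expected: ['kitapl', 'evlerd']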


Example #9
    def __call__(self, doc):
        # Delegate the base tokenisation to the parent handler.
        tokens = _TokenHandler.__call__(self, doc)

        # Stemming is intentionally disabled in this variant:
        # if self.stemming:
        #     tokens = [tr_stemmer.stem2(token) for token in tokens]

        if self.deasciify:
            tokens = [
                Deasciifier(token).convert_to_turkish() for token in tokens
            ]

        return tokens
Example #10
def deasciify_tr_text(text):

    words = text.split()

    punkts = string.punctuation

    correct_words = []

    for w in words:

        if is_punctuation(w):
            correct_words.append(w)
            continue

        # Strip the surrounding punctuation, deasciify the bare word,
        # then re-attach the punctuation afterwards.
        lw = w.lstrip(punkts)
        npunct1 = len(w) - len(lw)  # number of leading punctuation chars
        lpunct = w[:npunct1]

        rw = w.rstrip(punkts)
        npunct2 = len(w) - len(rw)  # number of trailing punctuation chars
        # Guard the slice: w[-0:] would select the whole string.
        rpunct = w[-npunct2:] if npunct2 > 0 else ""

        no_punct_word = w.strip(punkts)
        suggested_word = Deasciifier(no_punct_word).convert_to_turkish()
        correct_words.append(lpunct + suggested_word + rpunct)

    return " ".join(correct_words)
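A hypothetical before/after, assuming is_punctuation and the deasciifier behave as described: the surrounding punctuation is peeled off, the bare word is corrected, and the punctuation is put back.

# print(deasciify_tr_text('"Gunaydin!" dedi.'))
# expected: '"Günaydın!" dedi.'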
Example #11
    def test_convert_to_turkish(self):
        for i in range(len(self.ascii_strings)):
            dea = Deasciifier(self.ascii_strings[i])
            result = dea.convert_to_turkish()
            self.assertEqual(result, self.turkish_strings[i])
Example #13
def correct_letters(text):
    return Deasciifier(text).convert_to_turkish()
Example #14
def deasciify_word(word):
    return Deasciifier(word).convert_to_turkish()
Example #16
def deascii(text):
    # Python 2 idiom: the argument is a UTF-8 encoded byte string, so it is
    # decoded before deasciification and re-encoded afterwards.
    deasciifier = Deasciifier(text.decode("utf-8"))
    my_deasciified_turkish_txt = deasciifier.convert_to_turkish()

    return my_deasciified_turkish_txt.encode("utf-8")
Example #17
import re