Example #1
    def test_wordnet(self):
        self.assertIsInstance(wordnet.langs(), list)
        self.assertIn("tha", wordnet.langs())

        self.assertEqual(
            wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"])
        self.assertIsNotNone(wordnet.synsets("นก"))
        self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))

        self.assertIsNotNone(wordnet.lemmas("นก"))
        self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
        self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

        self.assertEqual(wordnet.morphy("dogs"), "dog")

        bird = wordnet.synset("bird.n.01")
        mouse = wordnet.synset("mouse.n.01")
        self.assertEqual(wordnet.path_similarity(bird, mouse),
                         bird.path_similarity(mouse))
        self.assertEqual(wordnet.wup_similarity(bird, mouse),
                         bird.wup_similarity(mouse))
        self.assertEqual(wordnet.lch_similarity(bird, mouse),
                         bird.lch_similarity(mouse))

        cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
        self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
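The test above mixes two modules: `wordnet` is PyThaiNLP's wrapper around the Open Multilingual WordNet, while `wn` refers to NLTK's WordNet (used here only for the `wn.ADJ` / `wn.ADV` part-of-speech constants). A minimal sketch of the setup the snippet assumes, including the one-time corpus downloads if they are not already present:

import nltk
nltk.download("wordnet")    # base English WordNet data
nltk.download("omw-1.4")    # Open Multilingual WordNet data (includes Thai)

from nltk.corpus import wordnet as wn      # provides the wn.ADJ / wn.ADV constants
from pythainlp.corpus import wordnet       # PyThaiNLP wrapper used throughout these examples

print(wordnet.synsets("นก"))                           # synsets of the Thai word for "bird"
print(wordnet.synset("spy.n.01").lemma_names("tha"))   # Thai lemmas of an English synset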
Example #2
    def find_synonyms(self,
                      word: str,
                      pos: str = None,
                      postag_corpus: str = "lst20") -> List[str]:
        """
        Find synonyms from wordnet

        :param str word: word
        :param str pos: part-of-speech type
        :param str postag_corpus: postag corpus name
        :return: list of synonyms
        :rtype: List[str]
        """
        self.synonyms = []
        if pos is None:
            self.list_synsets = wordnet.synsets(word)
        else:
            self.p2w_pos = postype2wordnet(pos, postag_corpus)
            if self.p2w_pos != '':
                self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
            else:
                self.list_synsets = wordnet.synsets(word)

        # use the POS-filtered synsets computed above
        for self.synset in self.list_synsets:
            for self.syn in self.synset.lemma_names(lang='tha'):
                self.synonyms.append(self.syn)

        self.synonyms_without_duplicates = list(
            OrderedDict.fromkeys(self.synonyms))
        return self.synonyms_without_duplicates
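The method above keeps intermediate results on `self`, but the core lookup is stateless; a minimal standalone sketch of the same idea, assuming only `from pythainlp.corpus import wordnet` (the `postype2wordnet` POS mapping is omitted here):

from collections import OrderedDict
from typing import List

from pythainlp.corpus import wordnet


def find_thai_synonyms(word: str) -> List[str]:
    """Collect Thai lemma names from every synset of `word`, de-duplicated in order."""
    synonyms: List[str] = []
    for synset in wordnet.synsets(word):
        synonyms.extend(synset.lemma_names(lang="tha"))
    return list(OrderedDict.fromkeys(synonyms))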
Example #3
    def test_wordnet(self):
        self.assertIsNotNone(wordnet.langs())

        self.assertEqual(
            wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
        )
        self.assertIsNotNone(wordnet.synsets("นก"))
        self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))

        self.assertIsNotNone(wordnet.lemmas("นก"))
        self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
        self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

        self.assertEqual(wordnet.morphy("dogs"), "dog")

        bird = wordnet.synset("bird.n.01")
        mouse = wordnet.synset("mouse.n.01")
        self.assertEqual(
            wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)
        )
        self.assertEqual(
            wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)
        )

        cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
        self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
Example #4
def split_word(text):
    tokens = word_tokenize(text, engine='newmm')

    # # Remove Thai and English stop words
    # tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English word stems
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove spaces
    tokens = [i for i in tokens if not ' ' in i]

    # tokens_list = [split_word(txt) for txt in text_list]

    return tokens
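This `split_word` and the variants below follow the same pipeline: tokenize, drop stop words, stem English tokens, map each token to its first Thai WordNet lemma, then drop numbers and spaces. A hedged sketch of the module-level imports and names the snippets assume (the exact stop-word source differs between the variants):

from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import wordnet, thai_stopwords
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words   # English stop words (PyPI package "stop-words")

th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words("en"))
p_stemmer = PorterStemmer()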
Example #5
def split_word(text):
    # Tokenize using the dict in the corpus that I edited; it only segments the food-menu items I added to words.th.txt
    tokens = word_tokenize(text, engine='dict')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English word stems
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove spaces
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
Example #6
def split_word(text):
    th_stop = tuple(thai_stopwords())
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text, engine='newmm')
    
    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English stem words
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
    
    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)
    
    tokens = tokens_temp
    
    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]
    
    # Remove space
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
Example #7
def compute_wordnet_path_scores(pairs):
    """
        Compute WordNet path similarity for a list of input word pairs
        Note: Thai WordNet has 3 methods to compute a similarity value: wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity
            lch_similarity we can't use. path_similarity seems to have better results than wup_similarity

        If we don't find a path between the two works, we add "None" to the result list

        @returns: this list of simility scores, and the number of OOV-word-pairs
    """

    structed_oov_pairs = 0
    wn_scores = []

    for index, pair in enumerate(pairs):

        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:
            # just use the first synset of each term
            if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
                path = wordnet.path_similarity(w1[0], w2[0])

            # return the highest sim between all synset combinations
            elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
                path = -1
                for syn1 in w1:
                    for syn2 in w2:
                        tmppath = wordnet.path_similarity(syn1, syn2)
                        if tmppath and tmppath > path: path = tmppath
                if path == -1:
                    # if no path found, set back to None
                    path = None
            else:
                raise RuntimeError(
                    'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')

            wn_scores.append(path)
        else:
            wn_scores.append(None)
            structed_oov_pairs += 1

    return wn_scores, structed_oov_pairs
def compute_wordnet_path_scores(pairs):
    """
        Compute WordNet path similarity for a list of input word pairs
        Note: Thai WordNet has 3 methods to compute a similarity value: wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity
            lch_similarity we can't use. path_similarity seems to have better results than wup_similarity

        If we don't find a path between the two works, we add "None" to the result list

        @returns: this list of simility scores, and the number of OOV-word-pairs
    """
    print("DEBUG: starting compute_wordnet_path_scores")
    from pythainlp.corpus import wordnet

    structed_oov_pairs = 0  # wohlg: we count word pairs for which we have no path
    wn_scores = []

    for index, pair in enumerate(pairs):

        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:
            if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':  # just use the first synset of each term
                path = wordnet.path_similarity(w1[0], w2[0])
                # path = wordnet.lch_similarity(w1[0], w2[0]) ## we can't use it, requires the same part-of-speech for both words
                # path = wordnet.wup_similarity(w1[0], w2[0])
            elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':  # return the highest sim between all synset combinations
                path = -1
                for syn1 in w1:
                    for syn2 in w2:
                        tmppath = wordnet.path_similarity(syn1, syn2)
                        if tmppath and tmppath > path: path = tmppath
                if path == -1:
                    path = None  # if no path found, set back to None

            wn_scores.append(path)
        else:
            wn_scores.append(None)
            structed_oov_pairs += 1

    return wn_scores, structed_oov_pairs
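A hedged usage sketch for `compute_wordnet_path_scores`; `WORDNET_PATH_SIMILARITY_TYPE` is read as a module-level config flag inside the function, so it has to be defined (or imported from the project's config) before calling. The word pairs below are purely illustrative:

WORDNET_PATH_SIMILARITY_TYPE = "most_similar"   # or "first_synset"

pairs = [("นก", "แมว"), ("สายลับ", "สปาย")]       # hypothetical Thai word pairs
scores, oov_pairs = compute_wordnet_path_scores(pairs)
print(scores, oov_pairs)                        # one score (or None) per pair, plus the OOV count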
Example #9
def split_word(text):
    th_stop = tuple(stopwords.words('thai'))
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    tokens = [p_stemmer.stem(i) for i in tokens]

    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    tokens = [i for i in tokens if not i.isnumeric()]
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
Example #10
vowel = 'เแโใไ'
e_alphabet = 'abcdefghijklmnopqrstuvwxyz'
be_alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
num = '0123456789'

al = [alphabet,vowel,e_alphabet,be_alphabet,num]
for a in al:
    for index in a:
        t_word = []
        for word in dict[index]:
            t_word.append(word)

        for word in t_word:
            tmp = dict[index][word]
            synonyms = []
            for syn in wordnet.synsets(word):
                for s in syn.lemma_names('tha'):
                    synonyms.append(s)

            synonyms = list(set(synonyms))
            print(synonyms)

            for i in synonyms:
                try:
                    dict[i[0]][i]
                except KeyError:
                    try:
                        dict[i[0]][i] = tmp
                    except KeyError:
                        continue
start = time.time()
Example #11
def findDocuments():
    # initialize database
    start = time.time()
    dict = SqliteDict('E:\\CPE#Y4\\databaseTF\\lastest_db\\new-db.sqlite',
                      autocommit=True)
    dict = dict['doc']
    end = time.time()
    print("Time to initial db", end - start)

    # initialize data and test set
    q = open('test_set\\new_sample_questions_tokenize.json',
             mode='r',
             encoding="utf-8-sig")
    data = json.load(q)
    # validate = json.load(open("test_set\\new_sample_questions_answer.json", mode='r', encoding="utf-8-sig"))

    doc = 0
    data = data[doc:]
    print(data.__len__())
    string = ''
    question_words = stopwords.words('thai')
    question_words.append('กี่')
    question_words.append('ใด')

    test_output = []
    no_word = []
    for s in data:
        string += "question " + str(doc)
        print("question", doc, s)

        # re-segment tokens that contain spaces and apply rule-based suffix/prefix splitting
        suffix = ['คือ', 'กี่', 'ใด']
        r = []
        for i in s:
            if ' ' in i:
                for j in i.split():
                    s.append(j)
                r.append(i)
                continue
            for j in suffix:
                if i.endswith(j) or i.startswith(j):
                    s.append(rreplace(i, j, ' ', 1))
                    r.append(i)
                    break
        for i in r:
            s.remove(i)
        ########################################################################################

        s.sort()
        s = list(set(s))
        search = []
        cantfind = []

        # # find by sqlitedict

        for f in range(s.__len__()):
            if (s[f].isspace()) or (s[f] in question_words):
                continue
            if (s[f][0] == ' ') or (s[f][-1] == ' '):
                s[f] = s[f].strip()

            try:
                tmp = dict[s[f][0]][s[f]]
                search.append((s[f], tmp))

            except KeyError:  # # if not in the index, fall back to synonyms
                cantfind.append(s[f])
                synonyms = []
                for syn in wordnet.synsets(s[f]):
                    for i in syn.lemma_names('tha'):
                        synonyms.append(i)

                if s[f] in synonyms:
                    synonyms.remove(s[f])
                for i in synonyms:
                    try:
                        tmp = dict[i[0]][i]
                        search.append((i, tmp))
                        break
                    except KeyError:
                        cantfind.append(i)
        no_word.append(cantfind)
        ########################################################################################

        # remove least mean tf-idf
        word = []
        pool = []
        search.sort(key=lambda s: s[1][0][0], reverse=True)
        for i in range(0):
            if (search.__len__() > 2):
                search.pop()
            else:
                break

        search.sort(key=lambda s: len(s[1]))
        for i in range(search.__len__()):
            try:
                word.append(search[i][0])
                pool.append(search[i][1][1:])
            except IndexError:
                break

        ########################################################################################

        answer_index = []
        count = []

        # rank answer in answer pool
        c = {}
        weight = [5, 1]
        for i in range(pool.__len__()):
            for k, v in pool[i]:
                try:
                    if i < weight.__len__():
                        c[k] += v * weight[i]
                    else:
                        c[k] += v
                except KeyError:
                    if i < weight.__len__():
                        c[k] = v * weight[i]

        for key, value in c.items():
            answer_index.append(key)
            count.append(value)

        ########################################################################################
        answer_n = nlargest(count.__len__(), count)
        answer = []
        for i in answer_n:
            index = count.index(i)
            answer.append(answer_index[index])
            answer_index.pop(index)
            count.pop(index)

        print(answer.__len__(), answer[:6])
        test_output.append(answer)  ### return this .
        doc += 1

    return test_output, no_word
def compute_mahtab_scores(pairs):
    """
        Based on https://aclweb.org/anthology/S17-2040
        Section: 3.2   

        # TODO Alexey: maybe there is an implementation of this and we don't have to implement it ourselves!
        # See in https://aclweb.org/anthology/S17-2040 and in https://www.aclweb.org/anthology/S16-1091

    """

    from pythainlp.corpus import wordnet

    structed_oov_pairs = 0  # wohlg: we count word pairs for which we have no path
    mahtab_scores = []
    current_score = None

    for index, pair in enumerate(pairs):

        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:

            # remark Gerhard: we check for current_score==-1 just to check that we didn't set the score

            # *** Step 1: "If two words are exactly the same or are two different writing forms of one word or belong to the same synset, the distance will be zero (D(x,y)=0)." ***

            ## words are the same
            if pair[0] == pair[1]:
                mahtab_scores.append(1.0)  # D(x, y) = 0 -> exp(0) = 1
                continue

            ## "are two different writing forms of one word" -- Gerhard: don't know how to handle this -> skip?!

            ## "belong to the same synset"
            s1 = wordnet.synsets(pair[0])
            s2 = wordnet.synsets(pair[1])
            # TODO: check that there is an overlapping synset

            # *** Step 2: "If two words have more than four common senses in their corresponding synsets, the distance will be one (D(x, y) =1)" ***
            # TODO: Compute sets of senses of synsets of both words, and then see if set intersection has more than 4 elements

            # *** Step 3: "If there is a direct or two-level hypernym relation between the corresponding synsets of words, the distance will be two (D(x, y) =2)." ***

            # *** Step 4: "If two words share any common sense, the distance will be three (D(x, y) =3)" ***

            # *** Step 5: "If two words are derivationally related, the distance will be four (D(x, y) =4)." ***
            ## What does that mean??? ## maybe explained in https://www.aclweb.org/anthology/S16-1091

            ## Additional less strict rules
            # *** Step 6: "1. If there is any relation except hypernym between synsets of two words, the distance will be three (D(x, y) =3)." ***
            # *** Step 7: "2. If there is any two-links relation except hypernym between synsets of two words, the distance will be four (D(x, y) =4)." ***
            # *** Step 8: "3. If there is any three-links relation between synsets of two words, the distance will be five (D(x, y) =5)." ***
            # *** Step 9: "After all, if no relation is found between a pair of word to measure the distance between them, the distance will set to -1 a" ***
            current_score = -1

            # *** Step 10: "the distance will set to -1 and then we calculate similarity score using equation 1 introduced by(Rychalska et al., 2016):" ***
            # see Equation (1) in the paper, We set alpha to 0.25 and beta to 1 as these values seemed to yield the best results
            if current_score < 0:
                mahtab_scores.append(0)
            else:
                s = math.exp(-0.25 * current_score)
                mahtab_scores.append(s)
                # TODO .. test if formula works correctly

            # if Alexey is ambitious he can have a look at BabelNet as well, but I think it's not necessary

        else:
            mahtab_scores.append(None)
            structed_oov_pairs += 1

    return mahtab_scores, structed_oov_pairs
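# Hedged worked check of the Step-10 mapping used above, sim = exp(-alpha * D)
# with alpha = 0.25; the distances below are illustrative assumptions.
# D=0 -> 1.0, D=1 -> ~0.779, D=2 -> ~0.607, D=3 -> ~0.472, D=4 -> ~0.368, D=5 -> ~0.287
import math

for _distance in (0, 1, 2, 3, 4, 5):
    print(_distance, round(math.exp(-0.25 * _distance), 3))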
def findDocuments(start_idx=0, end_idx=0):
    # initialize database
    q = open('./data/final/final_tokenized_question.json', mode='r', encoding="utf-8-sig") # change path
    start = time.time()
    dict = SqliteDict('./data/sqlite_db/doc_add_missing.sqlite', autocommit=True) # change path
    dict = dict['doc']
    end = time.time()
    print("Time to initial db", end - start)
    # initialize data and test set
    # q = open('./ThaiQACorpus-EvaluationDataset-tokenize.json', mode='r', encoding="utf-8-sig") # change path
    
    # validate = json.load(open("./../new_sample_questions_answer.json", mode='r', encoding="utf-8-sig")) # change path

    doc = 0
    data = json.load(q)
    data = data[start_idx:end_idx]
    print(data.__len__())
    save = 0
    string = ''
    question_words = stopwords.words('thai')
    question_words.append('กี่')
    question_words.append('ใด')

    test_output = []
    for s in data:
        start = time.time()
        string += "question " + str(doc)
        print("question", doc, s)

        # re-segment tokens that contain spaces and apply rule-based suffix/prefix splitting
        suffix = ['คือ', 'กี่', 'ใด']
        r = []
        for i in s:
            if ' ' in i:
                for j in i.split():
                    s.append(j)
                r.append(i)
                continue
            for j in suffix:
                if i.endswith(j) or i.startswith(j):
                    s.append(rreplace(i, j, ' ', 1))
                    r.append(i)
                    break
        for i in r:
            s.remove(i)
        ########################################################################################

        s.sort()
        s = list(set(s))
        search = []
        cantfind = []

        # # find by sqlitedict

        for f in range(s.__len__()):
            if (s[f].isspace()) or (s[f] in question_words):
                continue
            if (s[f][0] == ' ') or (s[f][-1] == ' '):
                s[f] = s[f].strip()

            try:
                tmp = dict[s[f][0]][s[f]]
                search.append((s[f], tmp))

            except KeyError:  # # if not in the index, fall back to synonyms
                cantfind.append(s[f])
                synonyms = []
                for syn in wordnet.synsets(s[f]):
                    for i in syn.lemma_names('tha'):
                        synonyms.append(i)

                # if synonyms.__len__() == 0 :
                #     if s[f].endswith('คือ'):
                #         synonyms.append(rreplace(s[f], 'คือ', '', 1))
                #     elif s[f].endswith('กี่'):
                #         synonyms.append(rreplace(s[f], 'กี่', '', 1))
                #     elif s[f].endswith('ใด'):
                #         synonyms.append(rreplace(s[f], 'ใด', '', 1))
                #     synonyms = deepcut.tokenize(s[f])
                if s[f] in synonyms :
                    synonyms.remove(s[f])
                for i in synonyms:
                    try:
                        tmp = dict[i[0]][i]
                        search.append((i, tmp))
                        break
                    except KeyError:
                        cantfind.append(i)

        ########################################################################################

        # remove least mean tf-idf
        word = []
        pool = []
        search.sort(key=lambda s: s[1][0][0], reverse=True)
        for i in range(0):
            if (search.__len__() > 2):
                search.pop()
            else:
                break

        search.sort(key=lambda s: len(s[1]))
        for i in range(search.__len__()):
            try:
                word.append(search[i][0])
                pool.append(search[i][1][1:])
            except IndexError:
                break
        # weight shortest in case shortest + best tf-idf
        # for i in range(pool[0].__len__()):
        #     pool[0][i][1] *= 3

        ########################################################################################

        answer_index = []
        count = []

        # rank answer in answer pool
        c = {}
        weight = [5,1]
        for i in range(pool.__len__()):
            for k, v in pool[i]:
                try:
                    if i < weight.__len__():
                        c[k] += v*weight[i]
                    else:
                        c[k] += v
                except KeyError:
                    if i < weight.__len__():
                        c[k] = v*weight[i]


        for key, value in c.items():
            answer_index.append(key)
            count.append(value)

        ########################################################################################
        answer_n = nlargest(count.__len__(), count)
        answer = []
        for i in answer_n:
            index = count.index(i)
            answer.append(answer_index[index])
            answer_index.pop(index)
            count.pop(index)

        print(answer.__len__(), answer[:6])
        test_output.append(answer[:50]) ### return this .
        doc += 1

    return test_output
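The core fallback inside both `findDocuments` variants is: look a term up in the character-keyed sqlitedict index, and on a KeyError retry with its Thai WordNet synonyms. A minimal sketch of that pattern in isolation (the two-level `index[first_char][word]` layout mirrors the examples above and is otherwise an assumption):

from pythainlp.corpus import wordnet


def lookup_with_synonym_fallback(index, word):
    """Return (matched_word, postings) from a first-character-keyed index, trying Thai synonyms on a miss."""
    try:
        return word, index[word[0]][word]
    except KeyError:
        synonyms = []
        for syn in wordnet.synsets(word):
            synonyms.extend(syn.lemma_names("tha"))
        for candidate in synonyms:
            if candidate == word:
                continue
            try:
                return candidate, index[candidate[0]][candidate]
            except KeyError:
                continue
    return None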
Example #14
def Processing(E1):

    p_stemmer = PorterStemmer()

    ThaiWord = list(thaisw.words('thai'))
    #print(' Thaiwords : ', ThaiWord)
    EngWord = list(set(engsw.words('english')))
    #print(' ew : ',EngWord, ' : ', type(EngWord))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan', u' ',
        u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    All_Stop_Word = ThaiWord + EngWord + Morewords
    #print(' ALL : ',All_Stop_Word)

    EntryList = []
    for n in E1:
        # check=detect(n[0])   # th or en
        #print(' text : ', n[0], ' :: ',check)
        EntryList.append(n[0])

        #print(' EntryList : ', EntryList)

    Outcome = []
    for r in EntryList:
        Dummy = []
        tokens = []
        tokens = list(eng_tokens(r))
        lowered = [t.lower() for t in tokens]
        #print(' Dummy : ',lowered)
        lowered = " ".join(lowered)
        #Dummy=list(thai_tokens(lowered, engine='newmm'))
        words = set(thai_words())
        words.add(u'ไทยเบฟ')
        words.add(u'ผสานพลัง')
        words.add(u'โอกาส')
        words.add(u'ถังไม้โอ๊ค')
        custom_tokenizer = Tokenizer(words)
        Dummy = list(custom_tokenizer.word_tokenize(lowered))
        #print(' Dummy 2 : ',Dummy)
        Outcome.append(Dummy)

    #print(' Outcome : ',Outcome, ' : ', len(Outcome))

    NoStop = []
    for n in Outcome:
        Dummy = []
        Dummy = [word for word in n if word not in All_Stop_Word]
        NoStop.append(Dummy)

    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    Lemma = []
    for n in NoStop:
        Dummy = []
        Dummy = [p_stemmer.stem(word) for word in n]
        Lemma.append(Dummy)

    print(' Lemma : ', Lemma, ' len: ', len(Lemma))
    '''
    # Instantiate the WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    # Lemmatize all tokens into a new list: lemmatized
    Lemma=[]
    for n in NoStop:
        Dummy=[]
        Dummy = [wordnet_lemmatizer.lemmatize(t) for t in n]
        Lemma.append(Dummy)
    #print(' lemma : ', Lemma, '  ::  ', type(Lemma))
    '''

    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        for i in n:
            w_syn = wordnet.synsets(i)
            if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
                Dummy.append(w_syn[0].lemma_names('tha')[0])
            else:
                Dummy.append(i)
        Lemma_temp.append(Dummy)

    Lemma = Lemma_temp

    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        Dummy = [i for i in n if not i.isnumeric()]
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        Dummy = [i for i in n if not ' ' in i]
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    #print(' lemma : ', Lemma, '  ::  ', type(Lemma))
    return Lemma
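`Processing` expects `E1` to be an iterable of records whose first element is the text to clean; a hedged usage sketch (the sample records are made up):

E1 = [("ไทยเบฟ ผสานพลัง สร้างโอกาส ทดสอบ krub",),
      ("The test of the oak barrel",)]
Lemma = Processing(E1)
print(Lemma)   # one cleaned token list per input record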
Example #15
# In[ ]:


wn.synset("object.n.01").lemma_names(lang="jpn")


# In[ ]:


x = list(wn.all_synsets("n"))


# In[ ]:


x[0].lemma_names(lang="tha")


# In[ ]:


wn.synsets("親", lang="jpn")


# In[ ]:


wn.synset("gray.a.01").lemma_names(lang="eng")