Code example #1
File: word_embedding.py Project: Foysal87/sbnltk
class fasttext_embedding:
    __dl = downloader()
    __w2v = None

    def __init__(self):
        self.__dl.download('fasttext_w2v',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__w2v = fasttext.load_model(sbnltk_default.sbnltk_root_path +
                                         'model/fasttext_w2v.model')

    def get_vector(self, word):
        try:
            return self.__w2v[word]
        except Exception:
            raise ValueError('Sorry!! Word does not exist in vocab!!')

    def get_nearest_neighbors(self, word, n=5):
        return self.__w2v.get_nearest_neighbors(word, k=n)

    def cosine_distance(self, word1, word2):
        if word1 == word2:
            return 1.0
        try:
            vec1 = self.__w2v[word1]
        except Exception:
            raise ValueError('Sorry!! 1st word does not exist in vocab!!')
        try:
            vec2 = self.__w2v[word2]
        except Exception:
            raise ValueError('Sorry!! 2nd word does not exist in vocab!!')

        return (1.0 - spatial.distance.cosine(vec1, vec2))
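
A minimal usage sketch for the class above, assuming the class is in scope, the 'fasttext_w2v' model downloads successfully, and the Bangla words (placeholders here) exist in the vocabulary:

# Hedged usage sketch for fasttext_embedding defined above.
ft = fasttext_embedding()
vec = ft.get_vector('বাংলা')                        # raises ValueError if the word is missing
neighbors = ft.get_nearest_neighbors('বাংলা', n=5)  # fastText returns (score, word) pairs
similarity = ft.cosine_distance('বাংলা', 'ভাষা')    # 1.0 means identical vectors
print(neighbors, similarity)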
Code example #2
File: Postag.py Project: Foysal87/sbnltk
class static_postag:
    __dl = downloader()
    __dict = {}
    __stemmer = None
    __bp = preprocessor()
    __tokenizer = wordTokenizer()

    def __init__(self):
        self.__dl.download('postag_static',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__stemmer = stemmerOP()
        path = sbnltk_default.sbnltk_root_path + 'dataset/postag_static.txt'
        for word in open(path, 'r'):
            word = word.replace('\n', '')
            tokens = self.__tokenizer.basic_tokenizer(word)
            wd = tokens[0]
            val = tokens[-1]
            self.__dict[wd] = val

    def tag(self, sent):
        tokens = self.__tokenizer.basic_tokenizer(sent)
        ans = []
        for word in tokens:
            if self.__bp.is_number(word):
                ans.append((word, 'NUM'))
                continue
            if self.__dict.get(word):
                ans.append((word, self.__dict[word]))
                continue
            if self.__dict.get(self.__bp.word_normalize(word)):
                ans.append((word, self.__dict[self.__bp.word_normalize(word)]))
                continue
            stem_word = self.__stemmer.stemWord(word)
            if self.__dict.get(stem_word):
                ans.append((word, self.__dict[stem_word]))
                continue
            ans.append((word, 'unk'))
        return ans
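
A short usage sketch for static_postag, assuming the 'postag_static' dataset downloads successfully; the sentence is a placeholder:

# Hedged usage sketch for the static_postag class above.
tagger = static_postag()
# Returns (token, tag) pairs; numerals get 'NUM' and unknown tokens fall back to 'unk'.
print(tagger.tag('আমি বাংলায় গান গাই'))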
Code example #3
class Bangla_sentence_embedding_gd:
    __dl = downloader()
    __model = None

    def __init__(self):
        if not os.path.exists(sbnltk_default.sbnltk_root_path +
                              'model/Towhid-Sust-transformer'):
            self.__dl.download('sentence_embedding_transformer_gd',
                               sbnltk_default.sbnltk_root_path + 'model/')
            with zipfile.ZipFile(
                    sbnltk_default.sbnltk_root_path +
                    'model/sentence_embedding_transformer_gd.zip',
                    'r') as file:
                file.extractall(sbnltk_default.sbnltk_root_path + 'model/')
            os.remove(sbnltk_default.sbnltk_root_path +
                      'model/sentence_embedding_transformer_gd.zip')
        self.__model = SentenceTransformer(sbnltk_default.sbnltk_root_path +
                                           'model/Towhid-Sust-transformer')

    def encode_sentence_list(self, sentences):
        embeddings = {}
        sentence_embeddings = self.__model.encode(sentences)
        for sentence, embedding in zip(sentences, sentence_embeddings):
            embeddings[sentence] = embedding
        return embeddings

    def encode_single_sentence(self, sentence):
        return self.__model.encode(sentence)

    def similarity_of_two_sentence(self, sentence1, sentence2):
        embed = self.encode_sentence_list([sentence1, sentence2])
        return util.pytorch_cos_sim(embed[sentence1], embed[sentence2])

    def similarity_of_two_embedding(self, embedding1, embedding2):
        return util.pytorch_cos_sim(embedding1, embedding2)
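
A minimal usage sketch for the sentence-embedding class above, assuming the transformer model downloads and loads; the sentences are placeholders:

# Hedged usage sketch for Bangla_sentence_embedding_gd defined above.
embedder = Bangla_sentence_embedding_gd()
sentences = ['আমি ভাত খাই', 'আমি খাবার খাই']
embeddings = embedder.encode_sentence_list(sentences)   # dict mapping sentence -> vector
score = embedder.similarity_of_two_sentence(sentences[0], sentences[1])
print(score)   # cosine similarity as returned by util.pytorch_cos_sim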
Code example #4
File: Stemmer.py Project: Foysal87/sbnltk
class stemmerOP:
    __wordtokens = wordTokenizer()
    __word_vec = []
    __word_dict = {}
    __word_dict2 = {}
    __bp = preprocessor()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('rootword_list',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('ner_static',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', 'r'):
            word = word.replace('\n', '')
            segment = word.split(' ')
            word = segment[:-1]
            for i in word:
                self.__word_dict[i] = 1
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/rootword_list.txt', 'r'):
            word = word.replace('\n', '')
            self.__word_dict2[word] = 1

    def __search(self, word):
        if (self.__bp.word_normalize(word) in self.__word_dict
                or word in self.__word_dict
                or word in self.__word_dict2
                or self.__bp.word_normalize(word) in self.__word_dict2):
            return True
        return False

    def __bnCompare(self, item1, item2):
        # Sort comparator: longer suffixes come first.
        return (len(item1) < len(item2)) - (len(item1) > len(item2))

    def stemWord(self, word):
        try:
            if self.__word_dict2.get(word) is not None:
                return word
            suf_arr = []
            for wd in rule_words:
                if re.search('.*' + wd + '$', word):
                    suf_arr.append(wd)
            suf_arr = sorted(suf_arr, key=functools.cmp_to_key(self.__bnCompare))
            if len(suf_arr) > 0:
                for i in suf_arr:
                    if i in rule_dict:
                        ind = len(word) - len(i)
                        new_word = word[0:ind] + rule_dict[i]
                        if self.__search(new_word):
                            return new_word
                    ind = len(word) - len(i)
                    new_word = word[0:ind]
                    if len(new_word) == 0:
                        return word
                    if self.__search(new_word):
                        return new_word
            return word
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL}ERROR 101: Error in stemming!! {sbnltk_default.bcolors.ENDC}")
            return word

    def stemSent(self, sent):
        tokens = self.__wordtokens.basic_tokenizer(sent)
        temp_tokens = []
        for i in tokens:
            temp_tokens.append(self.stemWord(i))
        result = ' '.join(temp_tokens)
        return result
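
A minimal usage sketch for stemmerOP, assuming the 'rootword_list' and 'ner_static' datasets download; the words are placeholders:

# Hedged usage sketch for the stemmerOP class above.
stemmer = stemmerOP()
print(stemmer.stemWord('ছেলেরা'))             # returns the stem, or the word itself if no rule applies
print(stemmer.stemSent('ছেলেরা মাঠে খেলছে'))   # stems each token and rejoins with spaces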
Code example #5
File: word_embedding.py Project: Foysal87/sbnltk
class glove_embedding:
    __dl = downloader()
    __embeddings_dict = {}

    def __init__(self):
        self.__dl.download('glove_embedding',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('glove_id2word',
                           sbnltk_default.sbnltk_root_path + 'model/')
        path = sbnltk_default.sbnltk_root_path + 'model/glove_embedding.pkl'
        model = pickle.load(open(path, 'rb'))
        id2word = sbnltk_default.sbnltk_root_path + 'model/glove_id2word.txt'
        with open(id2word, 'r') as f:
            for l in f:
                values = l.split()
                ind = int(values[0])
                word = str(values[1])
                vec = model[ind]
                if len(vec) < 100:
                    continue
                self.__embeddings_dict[word] = vec

    def get_vector(self, word):
        if word in self.__embeddings_dict:
            return self.__embeddings_dict[word]
        return np.zeros(100)

    def cosine_distance(self, word1, word2):
        vec1 = np.zeros(100)
        vec2 = np.zeros(100)
        flg = 0
        if word1 in self.__embeddings_dict:
            vec1 = self.__embeddings_dict[word1]
            flg += 1
        if word2 in self.__embeddings_dict:
            vec2 = self.__embeddings_dict[word2]
            flg += 1
        # A zero vector would make the cosine undefined, so fall back to a
        # neutral 0.5 unless both words were found.
        if flg < 2:
            return 0.5
        d = 1.0 - spatial.distance.cosine(vec1, vec2)
        return d

    def get_nearest_neighbors(self, item, n):
        vec = []
        if item not in self.__embeddings_dict:
            vec.append(item)
            return vec
        result = sorted(
            self.__embeddings_dict.keys(),
            key=lambda word: spatial.distance.euclidean(
                self.__embeddings_dict[word], self.__embeddings_dict[item]))
        j = 0
        for i in result:
            if j >= n:
                break
            vec.append(i)
            j += 1

        return vec
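
A minimal usage sketch for glove_embedding, assuming both GloVe files download; out-of-vocabulary words fall back to a zero vector:

# Hedged usage sketch for the glove_embedding class above.
glove = glove_embedding()
vec = glove.get_vector('বাংলা')                  # 100-d vector, or zeros if the word is unknown
print(glove.cosine_distance('বাংলা', 'ভাষা'))    # falls back to 0.5 when a word is missing
print(glove.get_nearest_neighbors('বাংলা', 5))   # nearest words by Euclidean distance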
Code example #6
File: word_embedding.py Project: Foysal87/sbnltk
class gensim_word2vec_embedding:
    __dl = downloader()
    __embeddings_dict = {}

    def __init__(self):
        self.__dl.download('gensim_w2v',
                           sbnltk_default.sbnltk_root_path + 'model/')
        path = sbnltk_default.sbnltk_root_path + 'model/gensim_w2v.txt'
        with open(path, 'r') as f:
            for l in f:
                values = l.split()
                word = str(values[0])
                vec = np.asarray(values[1:], "float32")
                if len(vec) < 100:
                    continue
                self.__embeddings_dict[word] = vec

    def get_vector(self, word):
        if word in self.__embeddings_dict:
            return self.__embeddings_dict[word]
        return np.zeros(100)

    def cosine_distance(self, word1, word2):
        if word1 == word2:
            return 1.0
        vec1 = np.zeros(100)
        vec2 = np.zeros(100)
        flg = 0
        if word1 in self.__embeddings_dict:
            vec1 = self.__embeddings_dict[word1]
            flg += 1
        if word2 in self.__embeddings_dict:
            vec2 = self.__embeddings_dict[word2]
            flg += 1
        if flg <= 1:
            return 0.5
        d = 1.0 - spatial.distance.cosine(vec1, vec2)
        return d

    def get_nearest_neighbors(self, item, n):
        vec = []
        if item not in self.__embeddings_dict:
            vec.append(item)
            return vec
        result = sorted(
            self.__embeddings_dict.keys(),
            key=lambda word: spatial.distance.euclidean(
                self.__embeddings_dict[word], self.__embeddings_dict[item]))
        j = 0
        for i in result:
            if j >= n:
                break
            vec.append(i)
            j += 1

        return vec
Code example #7
File: NER.py Project: Foysal87/sbnltk
class static_NER:
    __ner_static_data = {}
    __bp = preprocessor()
    __stemmer = stemmerOP()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('ner_static',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        for word in open(
                sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt',
                "r"):
            word = word.replace('\n', '')
            segment = word.split(' ')
            tag = segment[-1]
            word = segment[:-1]
            word = ' '.join(word)

            self.__ner_static_data[word] = tag

    def tag(self, sentence):
        segment = sentence.split()
        stems = self.__stemmer.stemSent(sentence)
        stems = stems.split()
        i = 0
        sentence_tags = []
        while (i < len(segment)):
            j = len(segment)
            flg = 0
            while (j > i):
                now = ' '.join(segment[i:j])
                now2 = ' '.join(stems[i:j])
                if self.__ner_static_data.get(now) != None:
                    sentence_tags.append((now, self.__ner_static_data[now]))
                    i = j - 1
                    flg = 1
                    break
                if self.__ner_static_data.get(now2) != None:
                    sentence_tags.append((now, self.__ner_static_data[now2]))
                    i = j - 1
                    flg = 1
                j -= 1
            if flg == 0:
                sentence_tags.append((segment[i], 'O'))
            i += 1
        return sentence_tags
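
A minimal usage sketch for static_NER, assuming the 'ner_static' dataset downloads; the sentence is a placeholder, and multi-word entities are matched greedily from the static list:

# Hedged usage sketch for the static_NER class above.
ner = static_NER()
# Returns (phrase, tag) pairs; tokens not found in the static list are tagged 'O'.
print(ner.tag('শেখ মুজিবুর রহমান ঢাকায় জন্মগ্রহণ করেন'))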
Code example #8
File: NER.py Project: Foysal87/sbnltk
class sklearn_NER:
    __dl = downloader()
    __bp = preprocessor()
    __sk_model = None

    def __init__(self):
        self.__dl.download('sklearn_ner',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__sk_model = pickle.load(
            open(sbnltk_default.sbnltk_root_path + 'model/sklearn_ner.pkl',
                 'rb'))

    def word2features(self, sent, i):
        return {
            'word': sent[i],
            'is_first': i == 0,
            'is_last': i == len(sent) - 1,
            'is_capitalized': sent[i][0].upper() == sent[i][0],
            'is_all_caps': sent[i].upper() == sent[i],
            'is_all_lower': sent[i].lower() == sent[i],
            'prefix-1': sent[i][0],
            'prefix-2': sent[i][:2],
            'prefix-3': sent[i][:3],
            'suffix-1': sent[i][-1],
            'suffix-2': sent[i][-2:],
            'suffix-3': sent[i][-3:],
            'prev_word': '' if i == 0 else sent[i - 1],
            'next_word': '' if i == len(sent) - 1 else sent[i + 1],
            'is_numeric': sent[i].isdigit()
        }

    def tag(self, text):
        if len(text) == 0:
            return []
        words = text.split()
        sentence_features = [
            self.word2features(words, i) for i in range(len(words))
        ]
        return list(zip(words,
                        self.__sk_model.predict([sentence_features])[0]))
Code example #9
class bert_Multilingual_Uncased_Postag:
    __model = None
    __dl = downloader()
    __device = torch.cuda.is_available()
    __module_found = 1
    try:
        import simpletransformers.ner.ner_model as nermodel
        __module_found = 1
    except:
        __module_found = 0

    def __init__(self):
        if self.__module_found == 0:
            raise ValueError(
                'Please install simpletransformers!! install Command: pip3 install simpletransformers'
            )
        if not os.path.exists(sbnltk_default.sbnltk_root_path +
                              'model/bert_multi_uncased_postag'):
            self.__dl.download('bert_multi_uncased_postag',
                               sbnltk_default.sbnltk_root_path + 'model/')
            with zipfile.ZipFile(
                    sbnltk_default.sbnltk_root_path +
                    'model/bert_multi_uncased_postag.zip', 'r') as file:
                file.extractall(sbnltk_default.sbnltk_root_path + 'model/')
            os.remove(sbnltk_default.sbnltk_root_path +
                      'model/bert_multi_uncased_postag.zip')
        t_h = sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag/model_args.json'
        t_g = sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag/'
        self.__model = self.nermodel.NERModel('bert',
                                              t_g,
                                              use_cuda=self.__device,
                                              args=t_h)

    def tag(self, sentences):
        d, f = self.__model.predict(sentences)
        return d
Code example #10
File: Preprocessor.py Project: Foysal87/sbnltk
class preprocessor:

    __dl = downloader()
    __word_list = {}
    __stopwords = []

    def __init__(self):
        self.__dl.download('bangla_word_list',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('stopword_list',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        for line in open(
                sbnltk_default.sbnltk_root_path +
                'dataset/bangla_word_list.txt', 'r'):
            line = line.rstrip('\n')
            self.__word_list[line] = 1
        model_path = sbnltk_default.sbnltk_root_path + "dataset/stopword_list.txt"
        for i in open(model_path, "r"):
            i = i.rstrip("\n")
            self.__stopwords.append(i)

    def punctuation_remove(self, text):
        try:
            whitespace = re.compile(
                u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+",
                re.UNICODE)
            bangla_fullstop = u"\u0964"
            punctSeq = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
            punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,€¦!?½£¶™¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
            text = whitespace.sub(" ", text).strip()
            text = re.sub(punctSeq, " ", text)
            text = re.sub(bangla_fullstop, " ", text)
            text = re.sub(punc, " ", text)
            text = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', text)
            text = text.replace("\\", " ")
            return text
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 201: Error in Removing punctuation!! {sbnltk_default.bcolors.ENDC}"
            )
            return text

    def dust_removal(self, word):

        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn2en:
                    s += c
            if len(s) == 0:
                return word
            return s
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 202: Error in Removing dust!! {sbnltk_default.bcolors.ENDC}"
            )
            return word

    def dust_removal_sent(self, sentence):
        words = sentence.split()
        temp = []
        for i in words:
            temp.append(self.dust_removal(i))
        temp = ' '.join(temp)
        return temp

    def stopword_remove(self, text):
        try:
            querywords = text.split()
            resultwords = [
                word for word in querywords if word not in self.__stopwords
            ]
            result = ' '.join(resultwords)
            return result
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 203: Error in Removing stop word!! {sbnltk_default.bcolors.ENDC}"
            )
            return text

    def word_normalize(self, word):
        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn_norm:
                    g = StaticArray.bn_norm[g].encode().decode('utf-8')
                    s += g
                    continue
                s += c
            return s
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 204: Error in word normalization!! {sbnltk_default.bcolors.ENDC}"
            )
            return word

    def bangla_to_english_Conversion(self, word):
        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn2enPunc:
                    if len(s) > 0 and s[-1] == 'a':
                        s = s[:-1]
                    s += StaticArray.bn2enPunc[g]
                    continue
                if g in StaticArray.bn2en:
                    s += StaticArray.bn2en[g]
            return s
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 205: Error in Bangla to English Conversion!! {sbnltk_default.bcolors.ENDC}"
            )
            return word

    def __bnCompare(self, item1, item2):
        g1 = self.bangla_to_english_Conversion(item1)
        g2 = self.bangla_to_english_Conversion(item2)
        return (g1 > g2) - (g1 < g2)

    def isBanglaWord(self, word):
        if word in self.__word_list:
            return True
        return False

    def isBangla(self, word):
        for c in word:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g in StaticArray.bn2en:
                return True
        return False

    def bn_word_sort_en_sys(self, vec):
        try:
            temp_vec = []
            for i in vec:
                if self.isBangla(i):
                    i = self.dust_removal(i)
                    temp_vec.append(
                        self.punctuation_remove(i).replace(' ', ''))
            vec = list(set(temp_vec))
            vec = sorted(vec, key=functools.cmp_to_key(self.__bnCompare))
            return vec
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 206: Error in Sort bangla words according English alphabet!! {sbnltk_default.bcolors.ENDC}"
            )
            return vec

    def __bnCompare2(self, item1, item2):
        ln = min(len(item1), len(item2))
        for i in range(ln):
            if item1[i] == item2[i]:
                continue
            g1 = item1[i].encode("unicode_escape")
            g1 = g1.upper()
            g1 = g1[2:]
            g1 = g1.decode('utf-8')
            g1 = StaticArray.bn_serial[g1]
            g2 = item2[i].encode("unicode_escape")
            g2 = g2.upper()
            g2 = g2[2:]
            g2 = g2.decode('utf-8')
            g2 = StaticArray.bn_serial[g2]
            return (g1 > g2) - (g1 < g2)
        return (len(item1) > len(item2)) - (len(item1) < len(item2))

    def bn_word_sort_bn_sys(self, vec):
        try:
            temp_vec = []
            for i in vec:
                if self.isBangla(i):
                    i = self.dust_removal(i)
                    temp_vec.append(
                        self.punctuation_remove(i).replace(' ', ''))
            vec = list(set(temp_vec))
            vec = sorted(vec, key=functools.cmp_to_key(self.__bnCompare2))
            return vec
        except:
            print(
                f"{sbnltk_default.bcolors.FAIL} ERROR 207: Error in Sort Bangla words according Bangla alphabet!! {sbnltk_default.bcolors.ENDC}"
            )
            return vec

    def is_number(self, word):
        for c in word:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g in StaticArray.bn2enNum:
                return True
        return False

    def extra_space_remove(self, sent):
        while len(sent) > 0 and sent[0] == ' ':
            sent = sent[1:]
        temp = ''
        for i in sent:
            if len(temp) > 0 and temp[-1] == ' ' and i == ' ':
                continue
            temp += i
        return temp
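
A minimal usage sketch for the preprocessor class above, assuming the word and stopword lists download; the text is a placeholder:

# Hedged usage sketch for the preprocessor class above.
bp = preprocessor()
text = 'আমি, বাংলায় গান গাই!'
clean = bp.punctuation_remove(text)     # strips Bangla and ASCII punctuation
clean = bp.stopword_remove(clean)       # drops words found in the stopword list
clean = bp.extra_space_remove(clean)    # collapses repeated spaces
print(clean, bp.is_number('১২৩'), bp.isBanglaWord('গান'))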
Code example #11
File: SentimentAnalyzer.py Project: Foysal87/sbnltk
class sentimentAnalyzer:
    __dl = downloader()
    __sentiment_models = [('LR', 'Logistic Regression'),
                          ('LSVC', 'Linear SVC'),
                          ('MNB', 'Multinomial naive bayes'),
                          ('RF', 'Random Forest'),
                          ('BERT', 'Bert Sentiment Analysis')]
    __root_path = sbnltk_default.sbnltk_root_path

    def all_sentiment_models(self):
        st = 'All Sentiment analysis models name with code\n'
        for sent in self.__sentiment_models:
            st += sent[1] + ' ::: ' + sent[0] + '\n'
        return st

    def __LR(self, sentences):
        self.__dl.download('sentiment_LR',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector',
                           sbnltk_default.sbnltk_root_path + 'model/')
        logreg = pickle.load(
            open(sbnltk_default.sbnltk_root_path + 'model/sentiment_LR.pkl',
                 'rb'))
        vectorizer = pickle.load(
            open(
                sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl',
                'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(logreg.predict(unknown_words_df)[i])
            prop.append(logreg.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __LSVC(self, sentences):
        self.__dl.download('sentiment_LSVC',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector',
                           sbnltk_default.sbnltk_root_path + 'model/')
        svc = pickle.load(
            open(sbnltk_default.sbnltk_root_path + 'model/sentiment_LSVC.pkl',
                 'rb'))
        vectorizer = pickle.load(
            open(
                sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl',
                'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        for i in range(len(sentences)):
            pred.append(svc.predict(unknown_words_df)[i])
        return pred

    def __MNB(self, sentences):
        self.__dl.download('sentiment_MNB',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector',
                           sbnltk_default.sbnltk_root_path + 'model/')
        mnb = pickle.load(
            open(sbnltk_default.sbnltk_root_path + 'model/sentiment_MNB.pkl',
                 'rb'))
        vectorizer = pickle.load(
            open(
                sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl',
                'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(mnb.predict(unknown_words_df)[i])
            prop.append(mnb.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __RF(self, sentences):
        self.__dl.download('sentiment_RF',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector',
                           sbnltk_default.sbnltk_root_path + 'model/')
        rf = pickle.load(
            open(sbnltk_default.sbnltk_root_path + 'model/sentiment_RF.pkl',
                 'rb'))
        vectorizer = pickle.load(
            open(
                sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl',
                'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(rf.predict(unknown_words_df)[i])
            prop.append(rf.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __sentence_convert_data(self, data):
        tokenizer = BertTokenizer.from_pretrained(
            sbnltk_default.sbnltk_root_path +
            'model/sentiment_multilingual_vocab.txt')
        SEQ_LEN = 147
        tokens, masks, segments = [], [], []
        token = tokenizer.encode(data,
                                 max_length=SEQ_LEN,
                                 truncation=True,
                                 padding='max_length')
        num_zeros = token.count(0)
        mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
        segment = [0] * SEQ_LEN
        tokens.append(token)
        segments.append(segment)
        masks.append(mask)
        tokens = np.array(tokens)
        masks = np.array(masks)
        segments = np.array(segments)
        return [tokens, masks, segments]

    def __b_predict(self, bert, sentences):
        pred = []
        prop = []
        for sent in sentences:
            data_x = self.__sentence_convert_data(sent)
            predict = bert.predict(data_x)
            predict_value = np.ravel(predict)
            predict_answer = np.round(predict_value, 0).item()
            if predict_answer == 0:
                pred.append(0)
                prop.append((1.0 - predict_value[0]))
            else:
                pred.append(1)
                prop.append((predict_value[0]))
        return pred, prop

    def __create_sentiment_bert(self):
        SEQ_LEN = 147
        model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        token_inputs = tf.keras.layers.Input((SEQ_LEN, ),
                                             dtype=tf.int32,
                                             name='input_word_ids')
        mask_inputs = tf.keras.layers.Input((SEQ_LEN, ),
                                            dtype=tf.int32,
                                            name='input_masks')
        segment_inputs = tf.keras.layers.Input((SEQ_LEN, ),
                                               dtype=tf.int32,
                                               name='input_segment')
        bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
        bert_outputs = bert_outputs[1]
        sentiment_first = tf.keras.layers.Dense(
            1,
            activation='sigmoid',
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02))(bert_outputs)
        sentiment_model = tf.keras.Model(
            [token_inputs, mask_inputs, segment_inputs], sentiment_first)
        opt = tfa.optimizers.RectifiedAdam(lr=2.0e-5, weight_decay=0.0025)
        sentiment_model.compile(optimizer=opt,
                                loss=tf.keras.losses.BinaryCrossentropy(),
                                metrics=['acc'])
        return sentiment_model

    def __BERT(self, sentence):
        self.__dl.download('sentiment_BERT',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_multilingual_vocab',
                           sbnltk_default.sbnltk_root_path + 'model/')
        bert = self.__create_sentiment_bert()
        bert.load_weights(sbnltk_default.sbnltk_root_path +
                          'model/sentiment_BERT.h5')
        return self.__b_predict(bert, sentence)

    def predict(self, model_code, sentences):
        if len(sentences) == 0:
            raise ValueError(
                'Empty list of Sentences is detected in Sentiment analysis!!')
        if model_code == 'LR':
            pred, prop = self.__LR(sentences)
            return pred, prop
        elif model_code == 'LSVC':
            pred = self.__LSVC(sentences)
            return pred
        elif model_code == 'MNB':
            pred, prop = self.__MNB(sentences)
            return pred, prop
        elif model_code == 'RF':
            pred, prop = self.__RF(sentences)
            return pred, prop
        elif model_code == 'BERT':
            pred, prop = self.__BERT(sentences)
            return pred, prop
        else:
            raise ValueError('Model code Does not exist!!\n' +
                             self.all_sentiment_models())
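
A minimal usage sketch for sentimentAnalyzer.predict, assuming the selected model files download; note that 'LSVC' returns only labels, while the other model codes return (labels, probabilities):

# Hedged usage sketch for the sentimentAnalyzer class above.
sa = sentimentAnalyzer()
print(sa.all_sentiment_models())                        # lists the supported model codes
sentences = ['খুব ভালো লেগেছে', 'একদম ভালো লাগেনি']     # placeholder sentences
pred, prob = sa.predict('LR', sentences)                # labels plus positive-class probabilities
labels = sa.predict('LSVC', sentences)                  # LSVC returns labels only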
Code example #12
class bert_multilingual_cased_postag:
    __dl = downloader()
    __model = None
    __tokenizer = None
    __device = None
    __tag2idx = {
        'CC': 10,
        'CD': 8,
        'DT': 6,
        'IN': 5,
        'JJ': 0,
        'NN': 4,
        'NNP': 3,
        'NNS': 1,
        'PRE': 12,
        'PRF': 9,
        'PRP': 13,
        'RB': 7,
        'VB': 2,
        'WH': 11
    }
    __tags2vals = {}

    def __init__(self):
        self.__device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.__dl.download('bert_multi_cased_postag',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('bert_vocab_postag',
                           sbnltk_default.sbnltk_root_path + 'model/')
        self.__tokenizer = BertTokenizer.from_pretrained(
            sbnltk_default.sbnltk_root_path + 'model/bert_vocab_postag.txt')
        self.__model = torch.load(sbnltk_default.sbnltk_root_path +
                                  'model/bert_multi_cased_postag.pth',
                                  map_location=self.__device)
        for i in self.__tag2idx:
            self.__tags2vals[self.__tag2idx[i]] = i
        self.__model.eval()

    def tag(self, sentences):

        max_seq_len = 128  # tokens
        batch_s = 8
        all_sentence_tags = []
        for sentence in sentences:
            sentence = [sentence]
            words = sentence[0].split()
            false_labels = []
            for w in range(len(words)):
                false_labels.append('NN')
            labels = [false_labels]
            tokenized_texts = [
                self.__tokenizer.tokenize(sent) for sent in sentence
            ]
            X = pad_sequences([
                self.__tokenizer.convert_tokens_to_ids(txt)
                for txt in tokenized_texts
            ],
                              maxlen=max_seq_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
            Y = pad_sequences([[self.__tag2idx.get(l) for l in lab]
                               for lab in labels],
                              maxlen=max_seq_len,
                              value=self.__tag2idx["NN"],
                              padding="post",
                              dtype="long",
                              truncating="post")
            attention_masks = [[float(i > 0) for i in ii] for ii in X]
            X_train = torch.tensor(X)
            Y_train = torch.tensor(Y)
            Mask_train = torch.tensor(attention_masks)
            data_valid = TensorDataset(X_train, Mask_train, Y_train)
            data_valid_sampler = SequentialSampler(data_valid)
            DL_valid = DataLoader(data_valid,
                                  sampler=data_valid_sampler,
                                  batch_size=batch_s)
            predictions = []
            for batch in DL_valid:
                batch = tuple(t.to(self.__device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch
                with torch.no_grad():
                    logits = self.__model(b_input_ids,
                                          token_type_ids=None,
                                          attention_mask=b_input_mask)
                logits = logits.detach().cpu().numpy()
                predictions.extend(
                    [list(p) for p in np.argmax(logits, axis=2)])
            pred_tags = [[self.__tags2vals[p_i] for p_i in p]
                         for p in predictions]
            pred_tags = pred_tags[0][:(len(words))]
            temp_dict = []
            for i in range(len(words)):
                temp_dict.append((words[i], pred_tags[i]))
            all_sentence_tags.append(temp_dict)
        return all_sentence_tags