class static_postag:
    __dl = downloader()
    __dict = {}
    __stemmer = None
    __bp = preprocessor()
    __tokenizer = wordTokenizer()

    def __init__(self):
        self.__dl.download('postag_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__stemmer = stemmerOP()
        path = sbnltk_default.sbnltk_root_path + 'dataset/postag_static.txt'
        for word in open(path, 'r'):
            word = word.replace('\n', '')
            tokens = self.__tokenizer.basic_tokenizer(word)
            wd = tokens[0]
            val = tokens[-1]
            self.__dict[wd] = val

    def tag(self, sent):
        tokens = self.__tokenizer.basic_tokenizer(sent)
        ans = []
        for word in tokens:
            if self.__bp.is_number(word):
                ans.append((word, 'NUM'))
                continue
            if self.__dict.get(word):
                ans.append((word, self.__dict[word]))
                continue
            if self.__dict.get(self.__bp.word_normalize(word)):
                ans.append((word, self.__dict[self.__bp.word_normalize(word)]))
                continue
            stem_word = self.__stemmer.stemWord(word)
            if self.__dict.get(stem_word):
                ans.append((word, self.__dict[stem_word]))
                continue
            ans.append((word, 'unk'))
        return ans
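# Usage sketch (not part of the original sbnltk source). It assumes the
# 'postag_static' dataset downloads successfully on first use; the Bangla
# sentence below is illustrative only.
def _demo_static_postag():
    tagger = static_postag()
    # tag() returns a list of (word, tag) pairs: numbers are tagged 'NUM',
    # dictionary lookups fall back to the normalized and stemmed forms,
    # and anything still unknown is tagged 'unk'.
    return tagger.tag('আমি বাংলায় গান গাই')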
class stemmerOP:
    __wordtokens = wordTokenizer()
    __word_vec = []
    __word_dict = {}
    __word_dict2 = {}
    __bp = preprocessor()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('rootword_list', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('ner_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', "r"):
            word = word.replace('\n', '')
            segment = word.split(' ')
            word = segment[:-1]
            for i in word:
                self.__word_dict[i] = 1
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/rootword_list.txt', "r"):
            word = word.replace('\n', '')
            self.__word_dict2[word] = 1

    def __search(self, word):
        if (self.__bp.word_normalize(word) in self.__word_dict) or (word in self.__word_dict) \
                or (word in self.__word_dict2) or (self.__bp.word_normalize(word) in self.__word_dict2):
            return True
        return False

    def __bnCompare(self, item1, item2):
        return (len(item1) < len(item2)) - (len(item1) > len(item2))

    def stemWord(self, word):
        try:
            if self.__word_dict2.get(word) != None:
                return word
            suf_arr = []
            for wd in rule_words:
                if re.search('.*' + wd + '$', word):
                    suf_arr.append(wd)
            suf_arr = sorted(suf_arr, key=functools.cmp_to_key(self.__bnCompare))
            if len(suf_arr) > 0:
                for i in suf_arr:
                    if i in rule_dict:
                        ind = len(word) - len(i)
                        new_word = word[0:ind] + rule_dict[i]
                        if self.__search(new_word):
                            return new_word
                    ind = len(word) - len(i)
                    new_word = word[0:ind]
                    if len(new_word) == 0:
                        return word
                    if self.__search(new_word):
                        return new_word
            return word
        except:
            print(f"{sbnltk_default.bcolors.FAIL}ERROR 101: Error in stemming!! {sbnltk_default.bcolors.ENDC}")

    def stemSent(self, sent):
        tokens = self.__wordtokens.basic_tokenizer(sent)
        temp_tokens = []
        for i in tokens:
            temp_tokens.append(self.stemWord(i))
        result = ' '.join(temp_tokens)
        return result
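# Usage sketch (not part of the original sbnltk source). It assumes the
# 'rootword_list' and 'ner_static' datasets download successfully; the
# Bangla examples below are illustrative only.
def _demo_stemmer():
    stemmer = stemmerOP()
    # stemWord() tries rule-based suffixes from longest to shortest and returns
    # the first candidate root found in the root-word / static NER word lists.
    root = stemmer.stemWord('ছেলেটি')
    # stemSent() stems every token and re-joins the results with spaces.
    sentence = stemmer.stemSent('ছেলেরা মাঠে খেলছে')
    return root, sentence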
class static_NER:
    __ner_static_data = {}
    __bp = preprocessor()
    __stemmer = stemmerOP()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('ner_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', "r"):
            word = word.replace('\n', '')
            segment = word.split(' ')
            tag = segment[-1]
            word = segment[:-1]
            word = ' '.join(word)
            self.__ner_static_data[word] = tag

    def tag(self, sentence):
        segment = sentence.split()
        stems = self.__stemmer.stemSent(sentence)
        stems = stems.split()
        i = 0
        sentence_tags = []
        while (i < len(segment)):
            j = len(segment)
            flg = 0
            while (j > i):
                now = ' '.join(segment[i:j])
                now2 = ' '.join(stems[i:j])
                if self.__ner_static_data.get(now) != None:
                    sentence_tags.append((now, self.__ner_static_data[now]))
                    i = j - 1
                    flg = 1
                    break
                if self.__ner_static_data.get(now2) != None:
                    sentence_tags.append((now, self.__ner_static_data[now2]))
                    i = j - 1
                    flg = 1
                j -= 1
            if flg == 0:
                sentence_tags.append((segment[i], 'O'))
            i += 1
        return sentence_tags
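# Usage sketch (not part of the original sbnltk source). It assumes the
# 'ner_static' dataset downloads successfully; the example sentence is
# illustrative only.
def _demo_static_ner():
    ner = static_NER()
    # tag() greedily matches the longest multi-word span found in the static
    # dictionary (also trying the stemmed span); unmatched tokens are tagged 'O'.
    return ner.tag('রহিম ঢাকায় থাকে')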
class sklearn_NER:
    __dl = downloader()
    __bp = preprocessor()
    __sk_model = None

    def __init__(self):
        self.__dl.download('sklearn_ner', sbnltk_default.sbnltk_root_path + 'model/')
        self.__sk_model = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sklearn_ner.pkl', 'rb'))

    def word2features(self, sent, i):
        return {
            'word': sent[i],
            'is_first': i == 0,
            'is_last': i == len(sent) - 1,
            'is_capitalized': sent[i][0].upper() == sent[i][0],
            'is_all_caps': sent[i].upper() == sent[i],
            'is_all_lower': sent[i].lower() == sent[i],
            'prefix-1': sent[i][0],
            'prefix-2': sent[i][:2],
            'prefix-3': sent[i][:3],
            'suffix-1': sent[i][-1],
            'suffix-2': sent[i][-2:],
            'suffix-3': sent[i][-3:],
            'prev_word': '' if i == 0 else sent[i - 1],
            'next_word': '' if i == len(sent) - 1 else sent[i + 1],
            'is_numeric': sent[i].isdigit()
        }

    def tag(self, text):
        if len(text) == 0:
            return []
        words = text.split()
        sentence_features = [self.word2features(words, i) for i in range(len(words))]
        return list(zip(words, self.__sk_model.predict([sentence_features])[0]))
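# Usage sketch (not part of the original sbnltk source). It assumes the
# pretrained 'sklearn_ner' pickle downloads and unpickles correctly; the
# example sentence is illustrative only.
def _demo_sklearn_ner():
    ner = sklearn_NER()
    # Each token is turned into a hand-crafted feature dict (prefixes, suffixes,
    # neighbouring words, etc.) and passed to the pickled scikit-learn model,
    # which returns one predicted tag per token.
    return ner.tag('রহিম ঢাকায় থাকে')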
    def __init__(self):
        self.__stemmer = stemmerOP()
        self.__pre = preprocessor()
        self.__tokenizer = wordTokenizer()
        self.__sentT = sentenceTokenizer()
        self.__posT = sklearn_postag()
class sentenceTokenizer:
    __pre = preprocessor()

    def basic_tokenizer(self, text):
        text = text.replace('\n', ' ')
        tokens = []
        s = ""
        bangla_fullstop = '0964'
        for c in text:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g == bangla_fullstop:
                tokens.append(s)
                s = ""
                continue
            s += c
        if len(s) > 0:
            tokens.append(s)
        return tokens

    def customized_tokenizer(self, text, punc=True, norm=False, dust=False):
        tokens = []
        text = text.replace('\n', ' ')
        s = ""
        bangla_fullstop = '0964'
        for c in text:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g == bangla_fullstop:
                tokens.append(s)
                s = ""
                continue
            s += c
        if len(s) > 0:
            tokens.append(s)
        try:
            temp_tokens = []
            for i in tokens:
                if punc == True:
                    i = bp.punctuation_remove(i)
                if norm == True:
                    i = bp.word_normalize(i)
                if len(bp.dust_removal_sent(i)) != 0 and dust == True:
                    i = bp.dust_removal_sent(i)
                temp_tokens.append(i)
            return temp_tokens
        except:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 302: Error in Customized Sentence Tokenizer!! {sbnltk_default.bcolors.ENDC}")
            return tokens

    def sentence_vector_to_text(self, sentences, full_stop=True):
        if full_stop == True:
            text = sbnltk_default.bangla_full_stop.join(sentences)
            text += sbnltk_default.bangla_full_stop
        else:
            text = ' '.join(sentences)
        text = self.__pre.extra_space_remove(text)
        return text

    def sentence_cluster(self, text, max_length=100, punc=True, norm=False, dust=False):
        tokens = []
        text = text.replace('\n', ' ')
        s = ""
        bangla_fullstop = '0964'
        for c in text:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g == bangla_fullstop:
                tokens.append(s)
                s = ""
                continue
            s += c
        if len(s) > 0:
            tokens.append(s)
        try:
            '''
            tmp_tokens: temporary tokens for returning
            word_tokens: taking words from each string
            tmp_sent: temporary sentence whose length is at most max_length
            '''
            tmp_tokens = []
            for sent in tokens:
                if len(sent) > max_length:
                    l = len(sent)
                    word_tokens = sent.split()
                    tmp_sent = ''
                    for w in word_tokens:
                        if len(str(tmp_sent + w)) > max_length:
                            tmp_sent = tmp_sent[:-1]
                            tmp_tokens.append(tmp_sent)
                            tmp_sent = ''
                        tmp_sent = tmp_sent + w + ' '
                    tmp_tokens.append(tmp_sent)
                else:
                    tmp_tokens.append(sent)
            tmp_tokens2 = []
            for sent in tmp_tokens:
                if punc == True:
                    sent = bp.punctuation_remove(sent)
                if norm == True:
                    sent = bp.word_normalize(sent)
                if len(bp.dust_removal_sent(sent)) != 0 and dust == True:
                    sent = bp.dust_removal_sent(sent)
                tmp_tokens2.append(sent)
            return tmp_tokens2
        except:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 303: Error in Sentence clustering!! {sbnltk_default.bcolors.ENDC}")
            return tokens
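# Usage sketch (not part of the original sbnltk source); the Bangla text is
# illustrative only. basic_tokenizer splits on the Bangla full stop (danda,
# U+0964), and sentence_cluster additionally caps each piece at max_length
# characters.
def _demo_sentence_tokenizer():
    st = sentenceTokenizer()
    text = 'আমি ভাত খাই। সে স্কুলে যায়।'
    sents = st.basic_tokenizer(text)
    clusters = st.sentence_cluster(text, max_length=50)
    return sents, clusters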
WordTokenizer
    basic_tokenizer: only splits words on spaces
    customized_tokenizer: splits words; can remove punctuation (defaults to True),
        apply normalization (defaults to False), and remove dust (defaults to False)

SentenceTokenizer
    basic_tokenizer: only splits on the Bangla full stop
    customized_tokenizer: splits sentences; can remove punctuation (defaults to True),
        apply normalization (defaults to False), and remove dust (defaults to False)
    sentence_cluster: clusters sentences up to max_length; can remove punctuation (defaults to True),
        apply normalization (defaults to False), and remove dust (defaults to False)
'''
from sbnltk.Preprocessor import preprocessor
from sbnltk import sbnltk_default

bp = preprocessor()


class wordTokenizer:

    def basic_tokenizer(self, text):
        tokens = text.split()
        return tokens

    def customized_tokenizer(self, text, punc=True, norm=False, dust=False):
        if punc == True:
            text = bp.punctuation_remove(text)
        if norm == True:
            text = bp.word_normalize(text)
        tokens = text.split()
        try:
            if dust == True:
                temp_tokens = []
                for i in tokens: