def stem_using_stempel(self, stem_type="default", words=None):
    if stem_type == "polimorf":
        stemmer = StempelStemmer.polimorf()
    else:
        stemmer = StempelStemmer.default()
    if words is None:
        words = self.words
    stem_words = [stemmer.stem(w) for w in words]
    return stem_words
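# Hedged sketch of the two pystempel constructors the method above selects
# between; both load a pre-trained stemming table on instantiation. The
# sample word mirrors the test case further below.
from stempel import StempelStemmer

default_stemmer = StempelStemmer.default()    # original stemmer table
polimorf_stemmer = StempelStemmer.polimorf()  # table built from the PoliMorf dictionary
print(polimorf_stemmer.stem('jabłkami'))      # -> 'jabłko'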
import os


def get_java_stemmer(stemmer_table_fpath, jar_fpath):
    # The Lucene jar must be on the classpath before jnius starts the JVM.
    os.environ['CLASSPATH'] = jar_fpath
    from jnius import autoclass
    FileInputStream = autoclass('java.io.FileInputStream')
    StempelStemmer = autoclass(
        'org.apache.lucene.analysis.stempel.StempelStemmer')
    stemmerTrie = StempelStemmer.load(FileInputStream(stemmer_table_fpath))
    return StempelStemmer(stemmerTrie)
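# Hedged usage sketch for get_java_stemmer; the jar and table file names are
# assumptions. Lucene's stem() returns a Java StringBuilder (or null when no
# stem is found), so the result is unwrapped with toString() before printing.
stemmer = get_java_stemmer('stemmer_20000.tbl',
                           'lucene-analyzers-stempel-8.11.2.jar')
result = stemmer.stem('jabłkami')
print(result.toString() if result is not None else None)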
def __init__(self):
    self.DATASET_PATH = DATA_CONFIG["data_path"]
    self.corpus = None
    self.bayes = None
    self.svm_model = None
    self.stemmer = StempelStemmer.polimorf()
    self.processor = BasicProcessor()
    self.tfidf = TfidfVectorizer(max_features=5000)
    self.encoder = LabelEncoder()
    self.stop_words = self.read_file(
        DATA_CONFIG["stopwords_path"]).split('\n')
    # Maps class codes to the full (Polish) names of the student applications.
    self.codes_map = {
        "POWTRZ": "Wniosek o powtórzenie roku studiów/powtarzanie przedmiotu",
        "PRZEP": "Wniosek o przepisanie oceny",
        "WYKR": "Wniosek o wykreślenie z listy studentów",
        "IPS": "Wniosek o Indywidualny Program Studiów",
        "ECTS": "Wniosek o kontynuowanie studiów z deficytem punktów ECTS",
        "INZ": "Rejestracja pracy inżynierskiej",
        "DZIEKAN": "Podanie do dziekana",
        "PRAKT": "Wniosek o zgodę na odbycie praktyki studenckiej",
        "WARUN": "Wniosek o wpis warunkowy",
        "REAKT": "Wniosek o reaktywację studiów",
        "LIC": "Rejestracja pracy licencjackiej"
    }
    self.get_data()
    self.train_model()
def add_keyword_if_not_exists(self, word):
    # Unquoted keywords are stored in stemmed form; quoted ones verbatim.
    if word[0] != "\"":
        stemmer = StempelStemmer.polimorf()
        word = stemmer.stem(word)
    else:
        word = word[1:-1]
    if self.get_keyword(word) is None:
        return Keyword(word=word).save()
    else:
        return self.get_keyword(word)
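# Hedged standalone sketch of the normalization rule above (Keyword and
# get_keyword belong to the surrounding module and are not reproduced here);
# the fallback guards against stem() returning None for unknown words.
from stempel import StempelStemmer

_stemmer = StempelStemmer.polimorf()

def normalize_keyword(word):
    # Hypothetical helper mirroring the branch in add_keyword_if_not_exists.
    if word.startswith('"') and word.endswith('"'):
        return word[1:-1]
    return _stemmer.stem(word) or word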
def cleanall(df):
    # converting the given variable to a Series for further cleaning
    df = pd.Series(df)
    # making all letters lowercase to avoid issues with case sensitivity
    df = df.str.lower()

    # declaring a function to find and remove certain patterns using regex
    def remove_pattern(text, pattern):
        # re.findall() finds the patterns, e.g. @user, and puts them in a list
        r = re.findall(pattern, text)
        # re.sub() removes each match from the sentences in the dataset;
        # re.escape() keeps characters such as '.' from acting as metacharacters
        for i in r:
            text = re.sub(re.escape(i), "", text)
        return text

    # removing the '@user' pattern through np.vectorize
    df = np.vectorize(remove_pattern)(df, r"@[\w]*")
    # removing the 'https://t.co/' pattern through np.vectorize;
    # since the dataset contains tweets with links, links need to be removed as well
    df = np.vectorize(remove_pattern)(df, r"https://t.co/[\w]*")
    # removing the '#hashtag' pattern; hashtags are rarely used in this dataset,
    # so it makes more sense to just get rid of them (this must happen before
    # the punctuation is stripped, otherwise the '#' is already gone)
    df = np.vectorize(remove_pattern)(df, r"#[\w]*")
    df = pd.Series(df)
    # replacing symbols and punctuation with spaces, leaving all the Polish
    # special characters in place
    df = df.str.replace(r"[^\w]", " ", regex=True)
    # getting rid of all the numbers in the tweets
    df = df.str.replace(r"[0-9]", " ", regex=True)
    # tokenizing the tweets
    df = df.apply(lambda x: x.split())
    # reading a .txt list of stopwords into a Python list
    # (stopwords were taken from https://github.com/bieli/stopwords)
    with open('../lib/polishstopwords.txt', 'r') as stopwords:
        stop = stopwords.read().splitlines()
    # getting rid of the stop words
    for i in range(len(df)):
        df[i] = [j for j in df[i] if j not in stop]
    # stemming the tweets (stripping the suffixes) using the pystempel library
    ps = StempelStemmer.polimorf()
    df = df.apply(lambda x: [ps.stem(i) for i in x])
    # the Polish stemmer sometimes turns words into None, which prevents
    # stitching the tokens back into strings, so all None objects are removed
    for i in range(len(df)):
        df[i] = [j for j in df[i] if j]
    # stitching the tweets back together
    for i in range(len(df)):
        df[i] = ' '.join(df[i])
    return df
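# Minimal usage sketch for cleanall; the sample tweets are invented and the
# stopword file must exist at the path used inside the function.
import re
import numpy as np
import pandas as pd
from stempel import StempelStemmer

sample = ["@user Świetny mecz! https://t.co/Ab1 #sport", "Pada deszcz 123"]
print(cleanall(sample))  # lowercased, de-noised, stemmed tweets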
import sys
sys.path.insert(0, "../../VoiceAssistant")

# check operating system version
import platform
isLinux = 'Linux' == platform.system()

import tkinter as tk
from tkinter import font as tk_font
import command_manager
import threading
from UI.HomePage.home_page_layout import HomePage
from UI.Login.login_layout import LoginPage
from UI.Register.resgister_layout import RegisterPage
from stempel import StempelStemmer

stemmer = StempelStemmer.polimorf()


class Main(tk.Tk):

    def __init__(self, *args, **kwargs):
        tk.Tk.__init__(self, *args, **kwargs)
        self.title_font = tk_font.Font(family='Helvetica', size=18,
                                       weight="bold", slant="italic")
        container = tk.Frame(self)
        container.pack(side="top", fill="both", expand=True)
        container.grid_rowconfigure(0, weight=1)
        container.grid_columnconfigure(0, weight=1)
with open('resources/generated/news_data.json', 'r') as f:
    data = json.load(f)

all_words = []
all_articles = []
all_categories = []
for article in data['articles']:
    art = str(article['description']).lower()
    tokens = nltk.wordpunct_tokenize(art)
    all_words.extend(tokens)
    all_articles.append(art)
    all_categories.append(article['category'])

stemmer = StempelStemmer.default()
# The stemmer returns None for some tokens; set.discard() drops the None
# entry safely even when no token stemmed to None (list.remove(None) would
# raise a ValueError in that case).
all_words = set(stemmer.stem(word) for word in all_words)
all_words.discard(None)
all_words = sorted(all_words)

with open('resources/generated/input_layer_words.txt', 'w') as datafile:
    json.dump(all_words, datafile)

unique_categories = ['sports', 'health', 'business', 'entertainment',
                     'technology']

x = []
y = []
for article in all_articles:
def test_polimorf():
    stemmer = StempelStemmer.from_file('../data/polimorf/stemmer_polimorf.tbl.gz')
    assert stemmer.stem('jabłkami') == 'jabłko'
def __init__(self, text_df):
    self.text_df = text_df
    self.stemmer = StempelStemmer.default()
def get_python_stemmer(stemmer_table_fpath):
    from stempel import StempelStemmer
    return StempelStemmer.from_file(stemmer_table_fpath)
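# Hedged usage sketch: pystempel loads the same .tbl(.gz) stemmer tables as
# the Lucene implementation; the path below is taken from the test case above.
stemmer = get_python_stemmer('../data/polimorf/stemmer_polimorf.tbl.gz')
print(stemmer.stem('jabłkami'))  # expected: 'jabłko'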
def get_stemmed_word(self, word):
    # NOTE: StempelStemmer.polimorf() reloads its stemming table on every
    # call; caching the stemmer as an attribute would avoid the repeated cost.
    stemmer = StempelStemmer.polimorf()
    return stemmer.stem(word)
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese'),
    'polish': StempelStemmer.default()
}


def load_models():
    """
    Function which loads the English NLP model and the KeyBERT model.
    This needs to run once, since all models take a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))


def preprocess(lis, language):
    """
    Function which applies stemming to a
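# Hedged sketch of how the stemmers mapping above might be applied; stem_tokens
# is a hypothetical helper, not part of the original (truncated) module.
# SnowballStemmer.stem() always returns a string, while StempelStemmer.stem()
# can return None for unknown words, hence the filtering.
def stem_tokens(tokens, language):
    stemmer = stemmers[language]
    return [s for s in (stemmer.stem(t) for t in tokens) if s]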