def german_semantic(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem
    from textblob_de import TextBlobDE

    stop_words = set(stopwords.words("german"))
    stemmer = Cistem()
    liste = []
    wordlist = []
    # clean up the text: lowercase, strip punctuation, collapse whitespace
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    # delete stopwords
    for word in text.split():
        if word not in stop_words:
            liste.append(word)
    text = " ".join(liste)
    # stemming: segment() returns (stem, suffix); keep the stem
    for word in text.split():
        word = stemmer.segment(word)[0]
        wordlist.append(word)
    text = " ".join(wordlist)
    # sentiment
    blob = TextBlobDE(text)
    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity
    return sentiment_polarity, sentiment_subjectivity
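A minimal usage sketch for german_semantic, assuming the textblob-de package is installed and the NLTK German stopword list has been downloaded; the sample sentence is invented for illustration.

# Hypothetical call; requires `pip install textblob-de` and nltk.download('stopwords').
polarity, subjectivity = german_semantic(
    "Das Essen war wirklich hervorragend und der Service sehr freundlich!")
print(polarity, subjectivity)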
def stem(text):
    # accept either a raw string or an already tokenized list of words
    if isinstance(text, str):
        text = tokenize(text)
    stemmer = Cistem()
    for index, word in enumerate(text):
        text[index] = stemmer.stem(word)
    return ' '.join(text)
def stem_cistem(x):
    from nltk.stem.cistem import Cistem
    stemmer = Cistem()
    s_text = []
    for word in x:
        s_text.append(stemmer.stem(word))
    s_text = ''.join(s_text)
    return s_text
from nltk.stem import (PorterStemmer, LancasterStemmer, RegexpStemmer,
                       ISRIStemmer, SnowballStemmer, RSLPStemmer)
from nltk.stem.cistem import Cistem


def stem(string, stemmer="porter", **kwargs):
    if stemmer == "porter":
        impl = PorterStemmer()
    elif stemmer == "lancaster":
        impl = LancasterStemmer()
    elif stemmer == "regex":
        regexp = kwargs['regexp']
        min_length = kwargs.get('min', 0)
        impl = RegexpStemmer(regexp=regexp, min=min_length)
    elif stemmer == "isri":
        impl = ISRIStemmer()
    elif stemmer == "snowball":
        language = kwargs.get('language', 'english')
        impl = SnowballStemmer(language=language)
    elif stemmer == "rslp":
        impl = RSLPStemmer()
    elif stemmer == "cistem":
        case_insensitive = kwargs.get('case_insensitive', False)
        impl = Cistem(case_insensitive=case_insensitive)
    else:
        return string
    return impl.stem(string)
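A short, hypothetical usage sketch for the stem() dispatcher above, showing how the stemmer name and keyword arguments select a backend; the example words are arbitrary.

print(stem("running"))                                          # Porter (default)
print(stem("laufen", stemmer="cistem", case_insensitive=True))  # German CISTEM
print(stem("cars", stemmer="regex", regexp="s$", min=4))        # strip a trailing "s"
print(stem("corriendo", stemmer="snowball", language="spanish"))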
def determinar_seguimiento(titulo_principal, titulo_querella):
    stemmer = Cistem()
    regex = r'\b\w+\b'
    palabras_titulo_principal = [
        stemmer.stem("".join(re.findall(regex, palabra.lower())))
        for palabra in titulo_principal.split(" ")
        if palabra not in stopwords.words('english')
    ]
    palabras_titulo_querella = [
        stemmer.stem("".join(re.findall(regex, palabra.lower())))
        for palabra in titulo_querella.split(" ")
        if palabra not in stopwords.words('english')
    ]
    """NLP API QUERIES SHOULD BE ADDED HERE TO OBTAIN ENTITIES AND CONCEPTS"""
    porcentaje_coincidencia = 0
    for palabra_titulo_querella in palabras_titulo_querella:
        if palabra_titulo_querella in palabras_titulo_principal:
            porcentaje_coincidencia += 1
    porcentaje_coincidencia /= len(palabras_titulo_principal)
    return porcentaje_coincidencia >= 0.4
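A hypothetical call of determinar_seguimiento, assuming re, Cistem and the NLTK stopwords are imported at module level as in the other snippets; both titles are invented.

# Returns True when the number of follow-up-title stems that also occur in the
# main title reaches 40% of the main title's length.
es_seguimiento = determinar_seguimiento(
    "Gobierno anuncia nuevo plan de seguridad",
    "Querella contra el nuevo plan de seguridad del gobierno")
print(es_seguimiento)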
def build_corpus():
    env_path = Path('../') / '.env'
    load_dotenv(dotenv_path=env_path)
    connection_string = os.getenv("DATABASE_URL")
    mongo_client = MongoClient(connection_string)
    t4g_database = mongo_client.test
    jobs_collection = t4g_database.jobs
    jobs = jobs_collection.find()
    size = jobs_collection.count_documents({})
    stemmer = Cistem()
    corpus = []
    ids = []
    for i, job in enumerate(jobs):
        if i % 1000 == 0:
            print(f'{i}/{size}')
        indices = []
        print(job['_id'])
        title = job['title']
        _id = job['_id']
        ids.append(_id)
        text = job['detailed_activities'].strip()
        text = ' '.join(text.split())
        # insert a space before intra-word capital letters
        for index in range(len(text)):
            if (text[index].isupper() and index > 1
                    and text[index - 1] != " " and text[index - 2] != " "
                    and not text.endswith(text[index])
                    and text[index + 1] != " "):
                indices.append(index)
        for index in reversed(indices):
            text = text[:index] + " " + text[index:]
        text = re.sub('[^A-Za-zä-üÄ-Ü]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        words = []
        for word in tokenized_text:
            stemmed_word = stemmer.stem(word).strip()
            if (stemmed_word not in stopwords.words('german')
                    and word not in stopwords.words('german')
                    and len(stemmed_word) > 2
                    and stemmed_word not in ['it', '3d']
                    and stemmed_word not in title):
                words.append(stemmed_word)
        corpus.append(' '.join(words))
    return corpus, ids
## First step: Tokenize each text
from nltk.tokenize import RegexpTokenizer
## Load library for removing stopwords
from nltk.corpus import stopwords
## nltk.download('stopwords') --> has to be downloaded the first time

# Import libraries for stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
stemmer_ps = PorterStemmer()
from nltk.stem.cistem import Cistem
stemmer_cs = Cistem()

# Import lemmatization libraries
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# nltk.download('wordnet')

# Load stop words
stop_words = stopwords.words('english')
# print(stop_words[:5])

tokenizer = RegexpTokenizer(r'\w+')
texts_clean = []
texts_aux = []
aux = []
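The block above only initialises the tools; the following hypothetical fragment shows how the two stemmers and the lemmatizer differ on a few made-up tokens (it assumes nltk.download('wordnet') has already been run).

sample = ["running", "cars", "häuser", "gelaufen"]
print([stemmer_ps.stem(w) for w in sample])       # Porter stemming (English)
print([stemmer_cs.stem(w) for w in sample])       # CISTEM stemming (German)
print([lemmatizer.lemmatize(w) for w in sample])  # WordNet lemmatization (English)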
def __init__(self, case_insensitive: BooleanValue()):
    self.case_insensitive = case_insensitive
    NltkStemmer.__init__(self)
    _Cistem.__init__(self, case_insensitive=case_insensitive)
def stem(text):
    stemmer = Cistem()
    for index, word in enumerate(text):
        text[index] = stemmer.stem(word)
    return text
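A quick, hypothetical usage note for the stem() helper above, assuming Cistem is imported in the surrounding module: the function expects an already tokenized list and stems it in place.

tokens = ["die", "häuser", "wurden", "neu", "gebaut"]
print(stem(tokens))  # the same list object, now containing CISTEM stems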
import pandas as pd
import nltk
from nltk.stem.cistem import Cistem

# Initialise
tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
stemmer = Cistem()

# Read dataset
tweets = pd.read_csv('data/all_tweets.tsv', sep='\t', header=None)

# Provide rehashed wordlist that is used to filter tweets by topic
keywords = []
index_topic_tweets = []
inp = open("../data/topic_wordlist.txt", "r")
for line in inp.readlines():
    line = line.replace('\n', '')
    keywords.append(line)

for word in keywords:
    word = word.replace('ä', 'ae')
    word = word.replace('ö', 'oe')
    word = word.replace('ü', 'ue')
    word = word.replace('ß', 'ss')
    word = word.lower()
    keyword = stemmer.stem(word)

for index, row in tweets.iterrows():
    # Tokenization
    words = tokenizer.tokenize(row[2])
    # Remove short tokens
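The keyword loop above transliterates umlauts to ASCII digraphs before stemming; a small, hypothetical illustration of that step in isolation (the example word is made up):

word = "Fußgängerüberweg"
for old, new in [('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')]:
    word = word.replace(old, new)
print(stemmer.stem(word.lower()))  # transliterated, lowercased, then stemmed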
def _get_cis_stemmer(self, case_insensitive):
    return Cistem(case_insensitive=case_insensitive)
from nltk.stem.cistem import Cistem
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score
import re

np.random.seed(500)
stemmer = Cistem()
header_list = ["text_id", "text", "task_1", "task_2"]
regex = re.compile(r'[,\.!?|#@;:!]')
Corpus = pd.read_csv("train_german.tsv", encoding='latin-1', sep="\t", names=header_list)
Corpus['text'].dropna(inplace=True)
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
Corpus['text'] = [re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry)
                  for entry in Corpus['text']]
Corpus['text'] = [entry.replace(".", " ") for entry in Corpus['text']]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]
# Corpus['text'] = [sent_tokenize(entry, language='german') for entry in Corpus['text']]
for index, entry in enumerate(Corpus['text']):
    Final_words = []
    for word in entry:
        if word not in stopwords.words('german') and word.isalpha():
def preprocess(text: str,
               bool_to_lowercase=True,
               bool_remove_html_tags=True,
               bool_remove_links=True,
               bool_remove_special_symbols=True,
               bool_remove_punctuation=True,
               bool_seperate_numbers_from_text=True,
               bool_stemming=True,
               bool_word_tokenize=True,
               bool_sentence_tokenize=False) -> str:
    # todo: lemmatization
    # transform to lower case
    if bool_to_lowercase:
        text = text.lower()
    # remove symbols, html tags and links
    if bool_remove_html_tags:
        logging.debug('Removing html tags')
        text = remove_html_tags(text)
    if bool_remove_links:
        logging.debug('Removing links')
        text = remove_links(text)
    if bool_remove_special_symbols:
        logging.debug('Removing special symbols')
        text = remove_special_symbols(text)
    # separate numbers from text
    if bool_seperate_numbers_from_text:
        logging.debug('Separating numbers from text')
        text = seperate_numbers_from_text(text)
    if bool_remove_punctuation:
        logging.debug('Removing punctuation')
        text = remove_punctuation_from_text(text)
    # tokenize words, then stem and remove stopwords
    if bool_word_tokenize:
        logging.debug('Apply word tokenizing')
        text_list = word_tokenize(text)
        if bool_stemming:
            logging.debug('Apply stemming')
            text_list = [
                Cistem(case_insensitive=False).stem(token)
                for token in text_list
            ]
        # remove stopwords
        logging.debug('Removing stopwords')
        text = remove_stop_words(text_list)
    if bool_sentence_tokenize:
        logging.debug('Apply sentence tokenizing')
        text = sent_tokenize(text, language='german')
    return text
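The helpers called above (remove_html_tags, remove_links, remove_special_symbols, seperate_numbers_from_text, remove_punctuation_from_text, remove_stop_words) are defined elsewhere, so a full call cannot be reproduced here; the following self-contained, hypothetical fragment only illustrates the tokenize-then-stem branch (it assumes nltk.download('punkt') has been run).

from nltk.tokenize import word_tokenize
from nltk.stem.cistem import Cistem

tokens = word_tokenize("Die Preise wurden deutlich gesenkt", language='german')
print([Cistem(case_insensitive=False).stem(t) for t in tokens])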
if args.log_to_file:
    log_file_name = args.target.strip() + '.log'
else:
    log_file_name = None

logging.basicConfig(
    filename=log_file_name,
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("Using up to {} CPUs".format(args.threads))

if args.lemmatize:
    logging.info("Perform lemmatisation using stanfordnlp")
    nlp_de = stanfordnlp.Pipeline(lang="de", processors="tokenize,lemma", use_gpu=True)
if args.stem:
    logging.info("Perform stemming using nltk's CISTEM stemmer")
    stemmer = Cistem()

stop_words = stopwords.words('german')


# get corpus sentences
class CorpusSentences(object):
    def __init__(self, directory_name: str, chunk_size: int = 10000):
        self.directory_name = directory_name
        self.n = chunk_size

    def __iter__(self):
        for file_name in os.listdir(self.directory_name):
            logging.info("Use corpus file %s" % file_name)
            with codecs.open(os.path.join(self.directory_name, file_name),
                             encoding="utf-8") as f:
                while True:
def wordCount(data, dictOutput, catList):
    # load the stopwords
    stopwords = load_stopwords()
    # Create a new dictionary for the output
    outList = collections.OrderedDict()
    # Number of non-dictionary words
    nonDict = 0
    # Convert to lowercase
    data = data.lower().replace("\n", " ")
    # Tokenize and create a frequency distribution
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(data)
    fdist = nltk.FreqDist(tokens)
    wc = len(tokens)
    # Using the Cistem stemmer for wildcards, create a stemmed version of the data
    # Cistem: needed for German words/stemming
    cistem = Cistem()
    # if a word/token appears in the stopwords, ignore it;
    # otherwise store the stemmed word in the list
    stems = [cistem.stem(word) for word in tokens
             if word not in stopwords and len(word) > 0]
    fdist_stem = nltk.FreqDist(stems)
    # Access categories and populate the output dictionary with keys
    for cat in catList:
        outList[cat[0]] = 0
    # Dictionaries are more useful
    fdist_dict = dict(fdist)
    fdist_stem_dict = dict(fdist_stem)
    # Number of classified words
    classified = 0
    for key in dictOutput:
        if "*" in key and key[:-1] in fdist_stem_dict:
            classified = classified + fdist_stem_dict[key[:-1]]
            for cat in dictOutput[key]:
                if cat.isalpha():
                    outList[cat] = outList[cat] + fdist_stem_dict[key[:-1]]
        elif key in fdist_dict:
            classified = classified + fdist_dict[key]
            for cat in dictOutput[key]:
                try:
                    outList[cat] = outList[cat] + fdist_dict[key]
                except KeyError:
                    pass
    # Calculate the percentage of words classified
    if wc > 0:
        percClassified = (float(classified) / float(wc)) * 100
    else:
        percClassified = 0
    # Return the categories, the words used, the word count, the number of words
    # classified, and the percentage of words classified.
    return [outList, tokens, wc, classified, percClassified]
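A small, self-contained sketch of the wildcard convention used in wordCount above: dictionary keys ending in '*' are matched against CISTEM stems, plain keys against raw tokens (the toy tokens and category dictionary are invented).

import nltk
from nltk.stem.cistem import Cistem

tokens = ["laufen", "gelaufen", "haus", "häuser"]
fdist_dict = dict(nltk.FreqDist(tokens))
fdist_stem_dict = dict(nltk.FreqDist(Cistem().stem(t) for t in tokens))

for key in {"lauf*": ["motion"], "haus": ["building"]}:
    if "*" in key and key[:-1] in fdist_stem_dict:
        print(key, fdist_stem_dict[key[:-1]])   # wildcard: compare against stems
    elif key in fdist_dict:
        print(key, fdist_dict[key])             # exact: compare against raw tokens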
#!/usr/bin/env python
# coding: utf8
from nltk.tokenize import TweetTokenizer
from nltk.stem.cistem import Cistem
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

nltk.download('stopwords')

tknzr = TweetTokenizer()
stemmer = Cistem(True)

file_in = open("../data/postillon.txt", "r")
file_out = open("../build/preprocessed/postillon_stem.txt", "w")

for line in file_in:
    tokenized = tknzr.tokenize(line)
    # drop German stopwords, then stem the remaining tokens
    # (avoids mutating the list while iterating over it)
    tokens = [stemmer.stem(word) for word in tokenized
              if word not in stopwords.words('german')]
    token_text = " ".join(tokens)
    file_out.write(token_text + '\n')

file_in.close()
file_out.close()

data = open("../build/preprocessed/postillon_stem.txt", "r")
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))