def _extraer_palabras(titulo, stemmer, palabras_vacias):
    """Return the stemmed, lower-cased content words of ``titulo``.

    Tokens are split on single spaces, lower-cased and stripped of
    non-word characters. Stopwords and tokens that end up empty are dropped.
    """
    palabras = []
    for palabra in titulo.split(" "):
        palabra = palabra.lower()
        # Keep only the word characters of the token (drops punctuation).
        limpia = "".join(re.findall(r'\b\w+\b', palabra))
        # Empty tokens come from repeated spaces or punctuation-only words;
        # counting them would make unrelated titles "match" on ''.
        if not limpia:
            continue
        # Check both forms: the raw token catches entries such as "don't",
        # the cleaned one catches "the," / "The".
        if palabra in palabras_vacias or limpia in palabras_vacias:
            continue
        palabras.append(stemmer.stem(limpia))
    return palabras


def determinar_seguimiento(titulo_principal, titulo_querella):
    """Decide whether ``titulo_querella`` follows up on ``titulo_principal``.

    Both titles are reduced to stemmed content words. The score is the number
    of words of ``titulo_querella`` that also occur in ``titulo_principal``,
    divided by the number of content words in ``titulo_principal``.

    Args:
        titulo_principal: Title of the reference item.
        titulo_querella: Title of the candidate item.

    Returns:
        True when the score is at least 0.4, False otherwise. False is also
        returned when ``titulo_principal`` has no content words (this used to
        raise ZeroDivisionError).
    """
    # NOTE(review): Cistem is a German stemmer but the stopword list is
    # English -- confirm which language the titles are actually in.
    stemmer = Cistem()
    # Load the stopword list once; a set gives O(1) membership tests.
    palabras_vacias = frozenset(stopwords.words('english'))

    palabras_titulo_principal = _extraer_palabras(
        titulo_principal, stemmer, palabras_vacias)
    if not palabras_titulo_principal:
        return False
    palabras_titulo_querella = _extraer_palabras(
        titulo_querella, stemmer, palabras_vacias)

    # TODO: queries to NLP APIs for entities and concepts should be added here.

    vocabulario_principal = set(palabras_titulo_principal)
    coincidencias = sum(
        1 for palabra in palabras_titulo_querella
        if palabra in vocabulario_principal
    )
    porcentaje_coincidencia = coincidencias / len(palabras_titulo_principal)
    return porcentaje_coincidencia >= 0.4
def stem(text):
    """Stem every word of ``text`` with Cistem and return them space-joined.

    Args:
        text: Either a string, which is first split into words with
            ``tokenize``, or an iterable of already tokenized words.

    Returns:
        A single string with the stemmed words separated by one space.

    The input is never modified: earlier versions overwrote the caller's
    list in place as a side effect of building the result.
    """
    # isinstance also accepts str subclasses, which should be tokenized
    # rather than iterated character by character.
    words = tokenize(text) if isinstance(text, str) else text
    stemmer = Cistem()
    return ' '.join(stemmer.stem(word) for word in words)
def stem_cistem(x):
    """Stem each element of ``x`` with Cistem and concatenate the results.

    Args:
        x: Iterable whose elements are passed one by one to ``Cistem.stem``.

    Returns:
        One string made of all stems joined with the empty string.

    NOTE(review): the stems are joined without any separator, so word
    boundaries are lost in the result -- confirm this is intended.
    """
    from nltk.stem.cistem import Cistem

    cistem = Cistem()
    return ''.join(cistem.stem(token) for token in x)
def _split_fused_words(text):
    """Insert a space before capitals that appear glued to a preceding word.

    A capital at position ``i`` starts a new word when it is neither one of
    the first two characters nor the last one, and the two characters before
    it and the one after it are not spaces (e.g. ``"fooBar"`` becomes
    ``"foo Bar"``).
    """
    pieces = []
    start = 0
    # range(2, len - 1) keeps text[index - 2] and text[index + 1] in bounds.
    # The previous guard, ``not text.endswith(text[index])``, also skipped
    # every capital that merely equalled the text's last character.
    for index in range(2, len(text) - 1):
        if (text[index].isupper()
                and text[index - 1] != " "
                and text[index - 2] != " "
                and text[index + 1] != " "):
            pieces.append(text[start:index])
            start = index
    pieces.append(text[start:])
    return " ".join(pieces)


def _content_stems(text, title, stemmer, stop_words):
    """Return the stems of ``text`` that are kept for the corpus.

    A stem is kept when it is longer than two characters, neither the stem
    nor the original word is a stopword, and the stem does not occur in
    ``title``.
    """
    kept = []
    for word in word_tokenize(text):
        stemmed_word = stemmer.stem(word).strip()
        # The former explicit exclusion of 'it' and '3d' was unreachable:
        # both are two characters long and already fail the length test.
        if (len(stemmed_word) > 2
                and stemmed_word not in stop_words
                and word not in stop_words
                # Substring test against the raw (not lower-cased) title.
                and stemmed_word not in title):
            kept.append(stemmed_word)
    return kept


def build_corpus():
    """Build a stemmed text corpus from the ``jobs`` collection in MongoDB.

    The connection string is read from the ``DATABASE_URL`` variable after
    loading ``../.env``. For every document in ``test.jobs`` the
    ``detailed_activities`` field is whitespace-normalised, fused words are
    split at inner capitals, every character outside the allowed letter
    ranges is replaced by a space, and the text is lower-cased, tokenized
    and stemmed with Cistem.

    Progress (``i/size`` every 1000 documents) and each document id are
    printed to stdout.

    Returns:
        A tuple ``(corpus, ids)``: ``corpus`` is a list with one
        space-joined string of stems per job, and ``ids`` is the list of
        the matching ``_id`` values in the same order.
    """
    env_path = Path('../') / '.env'
    load_dotenv(dotenv_path=env_path)
    connection_string = os.getenv("DATABASE_URL")

    stemmer = Cistem()
    # Load the stopword list once instead of twice per token.
    german_stopwords = frozenset(stopwords.words('german'))
    non_letters = re.compile('[^A-Za-zä-üÄ-Ü]')

    corpus = []
    ids = []
    # The context manager closes the client even if processing fails.
    with MongoClient(connection_string) as mongo_client:
        jobs_collection = mongo_client.test.jobs
        # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
        size = jobs_collection.count_documents({})
        for i, job in enumerate(jobs_collection.find()):
            if i % 1000 == 0:
                print(f'{i}/{size}')
            print(job['_id'])
            ids.append(job['_id'])

            text = ' '.join(job['detailed_activities'].split())
            text = _split_fused_words(text)
            text = non_letters.sub(' ', text).lower()

            words = _content_stems(text, job['title'], stemmer,
                                   german_stopwords)
            corpus.append(' '.join(words))
    return corpus, ids
tokenizer = RegexpTokenizer(r'\w+') texts_clean = [] texts_aux = [] aux = [] for article in texts_labels_np: # Text to lower case text = article[0].lower() # Tokenize and Remove punctuation tokens = tokenizer.tokenize(text) # Remove stop words tokens = [word for word in tokens if word not in stop_words] # Stemming for token in tokens: aux.append(stemmer_cs.stem(token)) tokens = aux texts_aux.append(tokens) texts_aux.append(article[1]) texts_clean.append(texts_aux) texts_aux = [] aux = [] ## ## # Emedding the data ## ## # Transforming labels into numbers [business, entertainment, politics, sport, tech] -- [0,1,2,3,4] for text in texts_clean:
def stem(text):
    """Replace every word of ``text`` by its Cistem stem, in place.

    Args:
        text: Mutable, index-assignable sequence of words.

    Returns:
        The same ``text`` object that was passed in, now holding the stems.
    """
    stem_word = Cistem().stem
    for position, token in enumerate(text):
        text[position] = stem_word(token)
    return text
# Provide rehashed wordlist that is used to filter tweets by topic keywords = [] index_topic_tweets = [] inp = open("../data/topic_wordlist.txt", "r") for line in inp.readlines(): line = line.replace('\n', '') keywords.append(line) for word in keywords: word = word.replace('ä', 'ae') word = word.replace('ö', 'oe') word = word.replace('ü', 'ue') word = word.replace('ß', 'ss') word = word.lower() keyword = stemmer.stem(word) for index, row in tweets.iterrows(): # Tokenization words = tokenizer.tokenize(row[2]) # Remove short tokens for word in words: if len(word) > 2: word = word.lower() word = word.replace("#", "") word = word.replace('ä', 'ae') word = word.replace('ö', 'oe') word = word.replace('ü', 'ue') word = word.replace('ß', 'ss') word = stemmer.stem(word) if word not in nltk.corpus.stopwords.words('german'):
# Train and evaluate a linear SVM on TF-IDF features of the stemmed text in
# "train_german.tsv" (label column: task_1).

# Punctuation that is replaced by a space. A raw string avoids the invalid
# "\." escape warning of the previous non-raw literal (same pattern value).
regex = re.compile(r'[,\.!?|#@;:!]')
_url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
# Load the stopword list once; a set gives O(1) membership tests.
_german_stopwords = frozenset(stopwords.words('german'))


def _tokenize_entry(entry):
    """Lower-case ``entry``, strip URLs and punctuation, split on whitespace."""
    entry = _url_pattern.sub('', entry.lower())
    entry = regex.sub(' ', entry.replace(".", " "))
    return entry.split()


Corpus = pd.read_csv("train_german.tsv", encoding='latin-1', sep="\t",
                     names=header_list)
# Corpus['text'].dropna(inplace=True) only changed a temporary Series and
# left the NaN rows in the frame; drop the rows themselves and renumber so
# positions and index labels agree.
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)
Corpus['text'] = [_tokenize_entry(entry) for entry in Corpus['text']]
#Corpus['text']= [sent_tokenize(entry, language='german') for entry in Corpus['text']]

# One stringified list of stems per row, built in a single pass instead of
# row-by-row .loc writes.
Corpus['text_final'] = [
    str([stemmer.stem(word) for word in entry
         if word not in _german_stopwords and word.isalpha()])
    for entry in Corpus['text']
]
print(Corpus['text_final'])

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['task_1'], test_size=0.3)

# Fit the encoder once on all labels and only transform each split;
# re-fitting on the test labels could map classes to different integers.
Encoder = LabelEncoder()
Encoder.fit(Corpus['task_1'])
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# NOTE(review): the vectorizer is fitted on the full corpus, test rows
# included -- confirm this is acceptable for the evaluation.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
# f1_score expects (y_true, y_pred); with average='weighted' the class
# weights come from y_true, so the swapped order gave a different score.
print("SVM F1 Score Task1-> ",
      f1_score(Test_Y, predictions_SVM, average='weighted')*100)
def wordCount(data, dictOutput, catList):
    """Count how many words of ``data`` fall into each dictionary category.

    Args:
        data: Text to analyse. It is lower-cased and newlines are replaced
            by spaces before tokenizing on ``\\w+``.
        dictOutput: Mapping from dictionary entry to an iterable of category
            names. An entry containing ``*`` is matched, without its last
            character, against the Cistem stems of the non-stopword tokens;
            any other entry is matched against the raw tokens.
        catList: Iterable whose items provide the category name at index 0.

    Returns:
        A list ``[outList, tokens, wc, classified, percClassified]``: the
        ordered per-category counts, the token list, the token count, the
        number of classified words and the percentage of classified words
        (0 when there are no tokens).
    """
    # Load the stopwords.
    stop_list = load_stopwords()

    normalised = data.lower().replace("\n", " ")
    tokens = RegexpTokenizer(r'\w+').tokenize(normalised)
    wc = len(tokens)
    token_freq = nltk.FreqDist(tokens)

    # Cistem is needed for German stemming; stems back the wildcard entries.
    # Tokens found in the stopword list are ignored, all others are stemmed.
    cistem = Cistem()
    stem_freq = nltk.FreqDist(
        cistem.stem(token) for token in tokens
        if token not in stop_list and len(token) > 0
    )

    # Every category starts at zero, in catList order.
    outList = collections.OrderedDict((cat[0], 0) for cat in catList)

    classified = 0
    for key in dictOutput:
        if "*" in key and key[:-1] in stem_freq:
            hits = stem_freq[key[:-1]]
            classified += hits
            for cat in dictOutput[key]:
                if cat.isalpha():
                    outList[cat] += hits
        elif key in token_freq:
            hits = token_freq[key]
            classified += hits
            for cat in dictOutput[key]:
                # Categories missing from catList are silently skipped here.
                try:
                    outList[cat] += hits
                except KeyError:
                    pass

    # Percentage of words classified.
    percClassified = (float(classified) / float(wc)) * 100 if wc > 0 else 0

    return [outList, tokens, wc, classified, percClassified]
from wordcloud import WordCloud from pathlib import Path from sklearn.cluster import KMeans from sklearn.manifold import TSNE nltk.download('stopwords') tknzr= TweetTokenizer() stemmer = Cistem(True) file_in = open("../data/postillon.txt", "r") file_out = open("../build/preprocessed/postillon_stem.txt", "w") for line in file_in: tokenized = tknzr.tokenize(line) for word in tokenized: if word in stopwords.words('german'): tokenized.remove(word) word = stemmer.stem(word) token_text = " ".join(tokenized) file_out.write(token_text+'\n') file_in.close() file_out.close() data = open("../build/preprocessed/postillon_stem.txt", "r") vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3)) X = vectorizer.fit_transform(data).toarray() #print(vectorizer.get_feature_names()) #print(X) contents = Path("../build/preprocessed/postillon_stem.txt").read_text() wordcloud = WordCloud(background_color='white', width=1920, height=1080