def determinar_seguimiento(titulo_principal, titulo_querella):
    """Decide whether two titles share enough stems to count as a follow-up.

    Each title is split on single spaces, words found in NLTK's English
    stop-word list are discarded, and every remaining word is lower-cased,
    reduced to its word-character runs and stemmed with Cistem.

    Args:
        titulo_principal: Reference title.
        titulo_querella: Title that is compared against the reference.

    Returns:
        True when the number of stems of ``titulo_querella`` that also occur
        in ``titulo_principal``, divided by the number of stems of
        ``titulo_principal``, is at least 0.4. False otherwise, including
        when ``titulo_principal`` yields no stems at all.
    """
    stemmer = Cistem()
    regex = r'\b\w+\b'
    # Build the stop-word set once instead of reloading the list per word.
    # The stop-word test is applied to the raw (not lower-cased) word.
    stop_words = set(stopwords.words('english'))

    def stems_of(title):
        return [
            stemmer.stem("".join(re.findall(regex, word.lower())))
            for word in title.split(" ")
            if word not in stop_words
        ]

    palabras_titulo_principal = stems_of(titulo_principal)
    palabras_titulo_querella = stems_of(titulo_querella)
    # TODO: query NLP APIs here to obtain entities and concepts.

    # A reference title made only of stop words has nothing to match against;
    # bail out instead of dividing by zero.
    if not palabras_titulo_principal:
        return False

    coincidencias = sum(
        1 for stem in palabras_titulo_querella
        if stem in palabras_titulo_principal
    )
    return coincidencias / len(palabras_titulo_principal) >= 0.4
Exemple #2
0
def stem(text):
    """Stem every token of *text* with Cistem and return them space-joined.

    Args:
        text: Either a string, which is first split into tokens with
            ``tokenize``, or an iterable of token strings.

    Returns:
        One string containing the stemmed tokens separated by single spaces.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    if isinstance(text, str):
        text = tokenize(text)
    stemmer = Cistem()
    # Build the result from a generator so a list passed in by the caller is
    # not overwritten as a side effect.
    return ' '.join(stemmer.stem(word) for word in text)
Exemple #3
0
def stem_cistem(x):
    """Stem each item of *x* with NLTK's Cistem and concatenate the stems.

    Args:
        x: Iterable of strings to stem.

    Returns:
        The stems joined into one string with no separator between them.
    """
    from nltk.stem.cistem import Cistem

    cistem = Cistem()
    # NOTE(review): stems are glued together without a space - confirm that
    # this is intended rather than ' '.join.
    return ''.join(cistem.stem(token) for token in x)
Exemple #4
0
def build_corpus():
    """Build a stemmed text corpus from the ``jobs`` collection in MongoDB.

    The connection string is read from the ``DATABASE_URL`` environment
    variable after loading ``../.env``. For every job document the
    ``detailed_activities`` text is whitespace-normalised, a space is
    inserted before upper-case letters that sit in the middle of a word,
    characters outside the letter ranges of the regex are blanked, and the
    lower-cased tokens are stemmed with Cistem. A stem is dropped when it or
    its source token is a German stop word, when it has fewer than three
    characters, when it is 'it' or '3d', or when it occurs as a substring of
    the job title.

    Returns:
        A tuple ``(corpus, ids)``. ``corpus`` holds one space-joined string
        of stems per job and ``ids`` the matching ``_id`` values, both in
        cursor order.
    """
    env_path = Path('../') / '.env'
    load_dotenv(dotenv_path=env_path)
    connection_string = os.getenv("DATABASE_URL")
    mongo_client = MongoClient(connection_string)
    t4g_database = mongo_client.test
    jobs_collection = t4g_database.jobs
    # Cursor.count() was removed in PyMongo 4; count on the collection.
    size = jobs_collection.count_documents({})
    jobs = jobs_collection.find()
    stemmer = Cistem()
    # Load the stop words once; a set gives constant-time membership tests.
    german_stopwords = set(stopwords.words('german'))
    corpus = []
    ids = []
    for i, job in enumerate(jobs):
        if i % 1000 == 0:
            print(f'{i}/{size}')
        print(job['_id'])
        title = job['title']
        ids.append(job['_id'])
        # split() without arguments also trims leading/trailing whitespace.
        text = ' '.join(job['detailed_activities'].split())

        # Positions of capitals fused into a word ("fooBar"): not among the
        # first two characters, not the last character, and with no space
        # directly before (two positions) or after. Compared with != because
        # identity checks on strings are not reliable.
        last = len(text) - 1
        indices = [
            index
            for index in range(2, last)
            if text[index].isupper()
            and text[index - 1] != " "
            and text[index - 2] != " "
            and text[index + 1] != " "
        ]

        # Insert from the back so earlier positions stay valid.
        for index in reversed(indices):
            text = text[:index] + " " + text[index:]

        text = re.sub('[^A-Za-zä-üÄ-Ü]', ' ', text)
        text = text.lower()
        words = []
        for word in word_tokenize(text):
            stemmed_word = stemmer.stem(word).strip()
            if (
                len(stemmed_word) > 2
                and stemmed_word not in german_stopwords
                and word not in german_stopwords
                and stemmed_word not in ('it', '3d')
                # Substring test against the raw title string.
                and stemmed_word not in title
            ):
                words.append(stemmed_word)

        corpus.append(' '.join(words))
    return corpus, ids
Exemple #5
0
# Tokenizer that keeps runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
texts_clean = []
# Scratch names of the original script; left defined (and empty) in case
# later code refers to them.
texts_aux = []
aux = []

for article in texts_labels_np:
    # Lower-case, tokenize, drop stop words and stem in a single pass.
    tokens = [
        stemmer_cs.stem(word)
        for word in tokenizer.tokenize(article[0].lower())
        if word not in stop_words
    ]
    # One [stemmed_tokens, label] pair per article.
    texts_clean.append([tokens, article[1]])

##
##
# Embedding the data
##
##
# Transforming labels into numbers [business, entertainment, politics, sport, tech] -- [0,1,2,3,4]
for text in texts_clean:
Exemple #6
0
def stem(text):
    """Replace every item of *text* with its Cistem stem, in place.

    Args:
        text: Mutable, index-assignable sequence of words.

    Returns:
        The same ``text`` object, now holding the stems.
    """
    cistem = Cistem()
    for position, token in enumerate(text):
        text[position] = cistem.stem(token)
    return text
# Provide rehashed wordlist that is used to filter tweets by topic
index_topic_tweets = []
# Read one raw keyword per line; the context manager closes the file.
with open("../data/topic_wordlist.txt", "r") as inp:
    raw_keywords = [line.replace('\n', '') for line in inp]

# Transliteration of German umlauts and sharp s to ASCII.
umlaut_map = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})

# Store the normalised stems: previously each stem was computed and then
# discarded, leaving the raw keywords in the list. Lower-casing happens
# before transliteration so that capital umlauts are converted as well.
keywords = [
    stemmer.stem(word.lower().translate(umlaut_map))
    for word in raw_keywords
]

for index, row in tweets.iterrows():
    # Tokenization
    words = tokenizer.tokenize(row[2])
    # Remove short tokens
    for word in words:
        if len(word) > 2:
            word = word.lower()
            word = word.replace("#", "")
            word = word.replace('ä', 'ae')
            word = word.replace('ö', 'oe')
            word = word.replace('ü', 'ue')
            word = word.replace('ß', 'ss')
            word = stemmer.stem(word)
            if word not in nltk.corpus.stopwords.words('german'):
Exemple #8
0
# Punctuation that is replaced by a space. Raw string, so "\." is a regex
# escape rather than an invalid Python string escape.
regex = re.compile(r'[,\.!?|#@;:!]')
Corpus = pd.read_csv("train_german.tsv", encoding='latin-1', sep="\t", names=header_list)
# Drop rows without text on the frame itself (dropna on the selected column
# leaves the frame unchanged) and renumber the remaining rows.
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
# Strip URLs.
Corpus['text'] = [re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.replace(".", " ") for entry in Corpus['text']]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]
#Corpus['text']= [sent_tokenize(entry, language='german') for entry in Corpus['text']]


# Load the stop words once instead of once per word.
german_stopwords = set(stopwords.words('german'))
# Keep alphabetic non-stop-words, stem them, and store the list as its string
# representation so the vectorizer receives plain text.
Corpus['text_final'] = [
    str([
        stemmer.stem(word)
        for word in entry
        if word not in german_stopwords and word.isalpha()
    ])
    for entry in Corpus['text']
]
print(Corpus['text_final'])

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['task_1'], test_size=0.3)
# Fit the encoder once on all labels so train and test share one mapping;
# re-fitting on the test labels could assign different integers.
Encoder = LabelEncoder()
Encoder.fit(Corpus['task_1'])
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
# NOTE(review): the vocabulary is fitted on train and test text together;
# confirm that this is acceptable for the evaluation.
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
# f1_score expects (y_true, y_pred); the weighted average uses the support
# of the true labels, so the argument order matters.
print("SVM F1 Score Task1-> ", f1_score(Test_Y, predictions_SVM, average='weighted')*100)
Exemple #9
0
def wordCount(data, dictOutput, catList):
	"""Count how many tokens of a text fall into each dictionary category.

	Args:
		data: Text to analyse. It is lower-cased and newlines become spaces.
		dictOutput: Mapping from dictionary entry to an iterable of category
			names. An entry containing "*" is looked up, minus its last
			character, among the Cistem stems of the non-stop-word tokens;
			any other entry is looked up among the raw tokens.
		catList: Sequence whose items carry the category name at index 0.

	Returns:
		A list ``[outList, tokens, wc, classified, percClassified]``: the
		ordered per-category counts, the token list, the token count, the
		number of classified tokens and the percentage of tokens classified.
	"""
	# Load the stop words
	stopwords = load_stopwords()

	# Ordered output mapping: category name -> count
	outList = collections.OrderedDict()

	# Number of non-dictionary words
	nonDict = 0

	# Convert to lowercase
	data = data.lower().replace("\n", " ")

	# Tokenize and create a frequency distribution
	tokenizer = RegexpTokenizer(r'\w+')
	tokens = tokenizer.tokenize(data)

	fdist = nltk.FreqDist(tokens)
	wc = len(tokens)

	# Cistem (a stemmer for German) provides the stems that wildcard entries
	# are matched against
	cistem = Cistem()

	# Tokens found in the stop words are ignored; every other token is
	# stored in its stemmed form
	stems = [cistem.stem(word) for word in tokens if word not in stopwords and len(word) > 0]
	fdist_stem = nltk.FreqDist(stems)

	# Access categories and populate the output dictionary with keys
	for cat in catList:
		outList[cat[0]] = 0

	# Plain dictionaries for the lookups below
	fdist_dict = dict(fdist)
	fdist_stem_dict = dict(fdist_stem)

	# Number of classified words
	classified = 0

	for key in dictOutput:
		if "*" in key and key[:-1] in fdist_stem_dict:
			hits = fdist_stem_dict[key[:-1]]
			classified = classified + hits
			for cat in dictOutput[key]:
				# Skip categories that are not in catList, exactly as the
				# exact-match branch does, instead of raising KeyError
				if cat.isalpha() and cat in outList:
					outList[cat] = outList[cat] + hits
		elif key in fdist_dict:
			hits = fdist_dict[key]
			classified = classified + hits
			for cat in dictOutput[key]:
				# Explicit membership test replaces the swallowed KeyError
				if cat in outList:
					outList[cat] = outList[cat] + hits

	# Calculate the percentage of words classified
	if wc > 0:
		percClassified = (float(classified) / float(wc)) * 100
	else:
		percClassified = 0

	# Return the categories, the words used, the word count, the number of words classified,
	# and the percentage of words classified.
	return [outList, tokens, wc, classified, percClassified]
Exemple #10
0
from wordcloud import WordCloud
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

nltk.download('stopwords')
tknzr = TweetTokenizer()
stemmer = Cistem(True)
# Load the stop words once; a set gives constant-time membership tests.
german_stopwords = set(stopwords.words('german'))

# Write one line of space-joined stems per input line. A new list is built
# for each line: removing items from the token list while looping over it
# skipped tokens, and the stems were computed but never written out.
with open("../data/postillon.txt", "r") as file_in, \
        open("../build/preprocessed/postillon_stem.txt", "w") as file_out:
    for line in file_in:
        # NOTE(review): the stop-word test is case-sensitive - confirm that
        # capitalised stop words are meant to be kept.
        stems = [
            stemmer.stem(word)
            for word in tknzr.tokenize(line)
            if word not in german_stopwords
        ]
        file_out.write(" ".join(stems) + '\n')

# Each line of the preprocessed file is one document for the vectorizer.
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
with open("../build/preprocessed/postillon_stem.txt", "r") as data:
    X = vectorizer.fit_transform(data).toarray()
#print(vectorizer.get_feature_names())
#print(X)

contents = Path("../build/preprocessed/postillon_stem.txt").read_text()
wordcloud = WordCloud(background_color='white',
                      width=1920,
                      height=1080