Example No. 1
0
def get_similitude_projet_activite(projet, activite):
    """
    Return 1 if the mentee's motivation and the mentor's activity share
    at least one stemmed word, else 0.

    Parameters
    ----------
    projet : str or NaN
        Motivation of the mentee ("filleul"), free text.
    activite : str or NaN
        Activity of the mentor ("parrain"), free text.

    Missing values (NaN or None) in either field yield 0.
    """
    # NaN is the only value unequal to itself; this catches *any* float
    # NaN, not only the np.nan singleton that an `is` check requires.
    if projet is None or projet != projet:
        return 0
    if activite is None or activite != activite:
        return 0

    # Tokenize, drop stopwords, stem; sets make the overlap test O(1).
    projet_stem = {
        stem.stem(word) for word in word_tokenize(projet) if word not in stop
    }
    activite_stem = {
        stem.stem(word) for word in word_tokenize(activite) if word not in stop
    }

    # Any single shared stem counts as a match.
    return 1 if projet_stem & activite_stem else 0
Example No. 2
0
def get_similitude_projet_activite(projet, activite):
    """
    Return 1 if the mentee's motivation and the mentor's activity share
    at least one stemmed word, else 0.

    Parameters
    ----------
    projet : str or NaN
        Motivation of the mentee ("filleul"), free text.
    activite : str or NaN
        Activity of the mentor ("parrain"), free text.

    Missing values (NaN or None) in either field yield 0.
    """
    # NaN is the only value unequal to itself; this catches *any* float
    # NaN, not only the np.nan singleton that an `is` check requires.
    if projet is None or projet != projet:
        return 0
    if activite is None or activite != activite:
        return 0

    # Tokenize, drop stopwords, stem; sets make the overlap test O(1).
    projet_stem = {
        stem.stem(word) for word in word_tokenize(projet) if word not in stop
    }
    activite_stem = {
        stem.stem(word) for word in word_tokenize(activite) if word not in stop
    }

    # Any single shared stem counts as a match.
    return 1 if projet_stem & activite_stem else 0
Example No. 3
0
# Build the stemmer once at import time: constructing a SnowballStemmer
# on every call is pure overhead and the instance is stateless per call.
_SNOWBALL_STEMMER = SnowballStemmer("english", ignore_stopwords=True)


def lemmatize_stemming(text):
    """Lemmatize *text* as a verb, then return its English Snowball stem."""
    return _SNOWBALL_STEMMER.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
Example No. 4
0
def lemmatize_stemming(text):
    """Lemmatize *text* as a verb, then return its stem.

    Relies on a module-level `stemmer` instance defined elsewhere.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
Example No. 5
0
# English stopwords; set() removes duplicates before listing them.
stopword_list = list(set(nl.stopwords.words('english')))

# NOTE(review): this rebinds the module alias `ps` to a PorterStemmer
# instance; later code depends on `ps.stem(...)` being the instance
# method, so the alias must not be renamed here.
ps = ps.PorterStemmer()
def generate_tfidf(text_corpora):
    """Return the TF-IDF document-term matrix for *text_corpora*.

    Parameters
    ----------
    text_corpora : iterable of str
        Documents already lowercased/stemmed upstream, hence
        lowercase=False.

    Returns
    -------
    Sparse matrix of TF-IDF features, one row per document.
    """
    vectorizer = tf.TfidfVectorizer(lowercase=False)
    # fit_transform learns the vocabulary and vectorizes in one pass,
    # equivalent to fit(...) followed by transform(...).
    return vectorizer.fit_transform(text_corpora)
# NOTE(review): duplicates the stopword_list assignment above — harmless
# but redundant.
stopword_list = list(set(nl.stopwords.words('english')))
r_df = pd.read_csv("python4.csv",encoding = "ISO-8859-1")
print(r_df)


# Strip all digits from each tweet before tokenizing.
text_corpora = [s.translate(str.maketrans("","","0123456789")) for s in r_df.loc[:,"scraptweets"]]
words_data = [nt.word_tokenize(s.lower()) for s in text_corpora]
# Stem every token and drop English stopwords.
words_data = [[ ps.stem(word) for word in sent if word not in stopword_list ] for sent in words_data  ]
# Re-join tokens so the vectorizer receives one string per tweet.
sent_data  = [" ".join(sent) for sent in words_data]

# TF-IDF features over the cleaned tweets.
vector = generate_tfidf(sent_data)

# Cluster tweets into 5 groups with k-means.
kmeans_obj = km.KMeans(n_clusters = 5, max_iter=100)
clusters = kmeans_obj.fit(vector)

r_df["label"]=clusters.labels_  
print("cluster 1")

# NOTE(review): this bare expression has no effect (its result is
# discarded) — presumably a print() was intended, as on the next line.
r_df.loc[r_df["label"]==0]
print(r_df.loc[r_df["label"]==1])
r_df.to_csv("Clustered_tweet2.txt",index=False)

# NOTE(review): opened without a context manager and never closed in this
# view — the handle is presumably used past the end of this chunk; confirm
# downstream code closes it.
file = open('Clustered_tweet1.txt', encoding="utf8",)