import pandas as pd
from nltk.tokenize import word_tokenize

# Assumes `stem` (an NLTK stemmer) and `stop` (a stopword collection) are
# defined at module level, as in the surrounding code.
def get_similitude_projet_activite(projet, activite):
    """
    projet : the mentee's (filleul) stated motivation
    activite : the mentor's (parrain) activity
    -----------
    Stem every word in both descriptions. If at least one stemmed word of
    `activite` also appears in `projet`, return 1, else 0.
    If projet or activite is NaN, return 0.
    """
    # `x is np.nan` only catches the NaN singleton; pd.isna also handles
    # None and float('nan') values coming out of a DataFrame.
    if pd.isna(projet) or pd.isna(activite):
        return 0
    projet_stem = [stem.stem(word) for word in word_tokenize(projet) if word not in stop]
    activite_stem = [stem.stem(word) for word in word_tokenize(activite) if word not in stop]
    # One shared stem is enough to count as a match.
    if len(set(activite_stem).intersection(projet_stem)) >= 1:
        return 1
    return 0
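# Usage sketch for the function above. The original snippet never shows how
# `stem` and `stop` are built; a French Snowball stemmer and the NLTK French
# stopword list are plausible choices (an assumption, since the descriptions
# appear to be in French). The sample inputs are invented for illustration.
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stem = SnowballStemmer("french")       # assumed stemmer
stop = set(stopwords.words("french"))  # assumed stopword set

# Shared stem "logiciel" -> expected 1; no stem overlap -> expected 0.
print(get_similitude_projet_activite("créer une entreprise de logiciels",
                                     "développement de logiciels"))
print(get_similitude_projet_activite("ouvrir un restaurant", "finance"))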
from nltk.stem import SnowballStemmer, WordNetLemmatizer

def lemmatize_stemming(text):
    # Lemmatize as a verb first, then stem the resulting lemma.
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
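# Small demonstration of lemmatize_stemming on individual tokens (requires the
# NLTK 'wordnet' data to be downloaded). Expected behaviour: the verb
# lemmatizer maps e.g. "running" to "run" before the stemmer is applied.
import nltk
nltk.download('wordnet', quiet=True)

for token in ["running", "studies", "connected"]:
    print(token, "->", lemmatize_stemming(token))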
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

stopword_list = list(set(stopwords.words('english')))
ps = PorterStemmer()

def generate_tfidf(text_corpora):
    # Fit a TF-IDF vectorizer on the corpus and return the document-term matrix.
    # lowercase=False because the tweets are already lowercased during tokenization.
    vectorizer = TfidfVectorizer(lowercase=False)
    vectorizer.fit(text_corpora)
    vector = vectorizer.transform(text_corpora)
    return vector

r_df = pd.read_csv("python4.csv", encoding="ISO-8859-1")
print(r_df)

# Strip digits, lowercase and tokenize each tweet, then stem and drop stopwords.
text_corpora = [s.translate(str.maketrans("", "", "0123456789"))
                for s in r_df.loc[:, "scraptweets"]]
words_data = [word_tokenize(s.lower()) for s in text_corpora]
words_data = [[ps.stem(word) for word in sent if word not in stopword_list]
              for sent in words_data]
sent_data = [" ".join(sent) for sent in words_data]

# Cluster the TF-IDF vectors into 5 groups.
vector = generate_tfidf(sent_data)
kmeans_obj = KMeans(n_clusters=5, max_iter=100)
clusters = kmeans_obj.fit(vector)
r_df["label"] = clusters.labels_

print("cluster 1")
print(r_df.loc[r_df["label"] == 0])
print(r_df.loc[r_df["label"] == 1])

r_df.to_csv("Clustered_tweet2.txt", index=False)
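# Follow-up inspection sketch using the `r_df` produced above: print how many
# tweets landed in each cluster and one sample tweet per cluster. Nothing new
# is assumed beyond the variables the script already defines.
print(r_df["label"].value_counts().sort_index())
for k in range(5):
    sample = r_df.loc[r_df["label"] == k, "scraptweets"].head(1).to_list()
    print(f"cluster {k} example:", sample)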