import numpy as np
from PIL import Image
from wordcloud_fa import WordCloudFa


def create_word_cloud():
    """Create the word cloud with the wordcloud_fa module."""
    # mask.jpg is a black-and-white image; words are drawn only inside
    # the non-white region of the mask.
    mask_array = np.array(Image.open("mask.jpg"))
    wc = WordCloudFa(persian_normalize=True,
                     include_numbers=False,
                     mask=mask_array,
                     background_color="white",
                     collocations=False)
    with open('telegramtxt.txt', 'r') as file:
        text = file.read()

    frequencies = wc.process_text(text)

    # Drop words that appear only once, then keep only the words whose
    # frequency is above the average.
    frequencies = {k: v for k, v in frequencies.items() if v > 1}
    if not frequencies:
        return
    avr = sum(frequencies.values()) // len(frequencies)
    print(f'average word frequency: {avr}')
    frequencies = {k: v for k, v in frequencies.items() if v > avr}
    frequencies = {
        k: v
        for k, v in sorted(
            frequencies.items(), key=lambda item: item[1], reverse=True)
    }

    word_cloud = wc.generate_from_frequencies(frequencies)
    image = word_cloud.to_image()
    image.save('cloud.png')
import codecs
import random
import re

import arabic_reshaper
import numpy as np
import tweepy
from bidi.algorithm import get_display
from hazm import Lemmatizer, Normalizer, WordTokenizer
from PIL import Image
from wordcloud_fa import WordCloudFa

# STOPWORDS_PATH, MASK_PATH, FONT_PATH, BACKGROUND_COLOR, cmaps and
# word_cloud_address are module-level settings (see the sketch below).


def save_word_cloud(user_name: str, api):
    raw_tweets = []
    for tweet in tweepy.Cursor(api.user_timeline, id=user_name).items():
        raw_tweets.append(tweet.text)

    # Normalize words
    tokenizer = WordTokenizer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    stopwords = set(
        map(lambda w: w.strip(),
            codecs.open(STOPWORDS_PATH, encoding='utf8')))
    words = []
    for raw_tweet in raw_tweets:
        # Strip punctuation, non-Farsi characters, ZWNJs and extra whitespace.
        raw_tweet = re.sub(r"[,.;:?!،()]+", " ", raw_tweet)
        raw_tweet = re.sub(r'[^\u0600-\u06FF]+', " ", raw_tweet)
        raw_tweet = re.sub(r'[\u200c\s]*\s[\s\u200c]*', " ", raw_tweet)
        raw_tweet = re.sub(r'[\u200c]+', " ", raw_tweet)
        raw_tweet = re.sub(r'[\n]+', " ", raw_tweet)
        raw_tweet = re.sub(r'[\t]+', " ", raw_tweet)
        raw_tweet = normalizer.normalize(raw_tweet)
        raw_tweet = normalizer.character_refinement(raw_tweet)
        tweet_words = tokenizer.tokenize(raw_tweet)
        # hazm lemmas look like "past#present"; keep the past stem only.
        tweet_words = [
            lemmatizer.lemmatize(tweet_word).split('#', 1)[0]
            for tweet_word in tweet_words
        ]
        tweet_words = list(filter(lambda x: x not in stopwords, tweet_words))
        words.extend(tweet_words)
    if len(words) == 0:
        return

    # Build the word cloud
    mask = np.array(Image.open(MASK_PATH))
    clean_string = ' '.join([str(elem) for elem in words])
    clean_string = arabic_reshaper.reshape(clean_string)
    clean_string = get_display(clean_string)
    word_cloud = WordCloudFa(persian_normalize=False,
                             mask=mask,
                             colormap=random.sample(cmaps, 1)[0],
                             background_color=BACKGROUND_COLOR,
                             include_numbers=False,
                             font_path=FONT_PATH,
                             no_reshape=True,
                             max_words=1000,
                             min_font_size=2)
    wc = word_cloud.generate(clean_string)
    image = wc.to_image()
    image.save(word_cloud_address)
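# The module-level configuration assumed by save_word_cloud above. The names
# come from the code; these particular values are illustrative guesses, not
# the original settings.
STOPWORDS_PATH = 'stopwords.txt'       # one stop word per line, UTF-8
MASK_PATH = 'mask.png'                 # black-and-white mask image
FONT_PATH = 'Vazir.ttf'                # any Farsi-capable TTF font
BACKGROUND_COLOR = 'white'
word_cloud_address = 'word_cloud.png'  # output path for the generated image
# matplotlib colormap names; one is picked at random per cloud
cmaps = ['viridis', 'plasma', 'inferno', 'magma', 'cividis']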
def create_word_cloud(all_cluster_hashtags, path):
    '''
    :param all_cluster_hashtags: list of Counter() objects; each Counter maps
        a hashtag to the number of times it occurs in that cluster
    :param path: directory the per-cluster images are written to
    '''
    for i, hashtags in enumerate(all_cluster_hashtags):
        wordcloud = WordCloudFa()
        # Draw only the five most common hashtags of each cluster.
        wc = wordcloud.generate_from_frequencies(dict(hashtags.most_common(5)))
        image = wc.to_image()
        # image.show()
        image.save('{0}/cluster_{1}.png'.format(path, i))
def get_image(UserID, chat_id, message, bot, update):
    # query and removeWeirdChars come from elsewhere in the bot
    # (removeWeirdChars is sketched below).
    if len(UserID) < 2:
        # "Invalid ID!"
        message(chat_id=chat_id.chat_id, text="آی دی نامعتبر است!")
    else:
        # "Connecting to Instagram..."
        message(chat_id=chat_id.chat_id, text="در حال اتصال به اینستاگرام...")
        allword = query.start(UserID, chat_id, message, bot, update)
        allword_edited = removeWeirdChars(allword)
        my_wordcloud = WordCloudFa(font_path="Sahel.ttf",
                                   background_color="white",
                                   width=720,
                                   height=1280,
                                   margin=2).generate(allword_edited)
        image = my_wordcloud.to_image()
        saved_dir = 'analysis/' + str(UserID) + '.jpg'
        image.save(saved_dir)
        # "Sending the photo..."
        message(chat_id=chat_id.chat_id, text="درحال ارسال عکس...")
        return saved_dir
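# removeWeirdChars is not defined in this snippet. A plausible sketch, on the
# assumption that it strips emoji, control characters and other symbols that
# break rendering; the exact character set is a guess.
import re


def removeWeirdChars(text):
    # Keep Arabic/Farsi letters, ASCII word characters and whitespace;
    # replace everything else (emoji, control chars, decorative symbols).
    return re.sub(r"[^\u0600-\u06FF\w\s]", " ", text)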
def execute(self):
    # self.api is an authenticated tweepy.API instance; self.username,
    # self.numberOfTweets, self.backGround and self.d are set elsewhere.
    numberOfPages = 1
    numberOfTweetsPerPage = 200
    counter = 0
    cloud = ""
    txt = ""
    # The Twitter API returns at most 200 tweets per page.
    if self.numberOfTweets > 200:
        numberOfPages = ceil(self.numberOfTweets / 200)
    else:
        numberOfTweetsPerPage = self.numberOfTweets
    for i in range(numberOfPages):
        tweets = self.api.user_timeline(screen_name=self.username,
                                        count=numberOfTweetsPerPage,
                                        page=i)
        for each in tweets:
            # cloud holds the de-Latinized text of the last tweet; it is
            # only used for debug printing in other variants of this script
            cloud = each.text
            cloud = re.sub(r'[A-Za-z@_]*', '', cloud)
            counter += 1
            txt = txt + ' ' + each.text
    # Drop Latin characters and @-mentions so only the Farsi text remains.
    txt = re.sub(r'[A-Za-z@]*', '', txt)
    twitter_mask = np.array(
        Image.open(path.join(self.d, "templates/cloud/twitter-logo.jpg")))
    # Common colloquial Farsi stop words.
    stop = [
        'می', 'من', 'که', 'به', 'رو', 'از', 'ولی', 'با', 'یه', 'این',
        'نمی', 'هم', 'شد', 'ها', 'اما', 'تو', 'واقعا', 'در', 'نه', 'دارم',
        'باید', 'آره', 'برای', 'تا', 'چه', 'کنم', 'بود', 'همه', 'دیگه',
        'ای', 'اون', 'تی', 'حالا', 'بی', 'د', 'چرا', 'بابا', 'منم', 'کیه',
        'توی', 'نیست', 'چی', 'باشه', 'که', 'بودم', 'می کنم', 'که', 'اینه',
        'بهتر', 'داره', 'اینه', 'که', 'کردن', 'می', 'کن', 'بعد', 'دیگه'
    ]
    wc = WordCloudFa(
        # font_path='IranNastaliq.ttf',
        persian_normalize=True,
        max_words=1000,
        margin=0,
        width=3000,
        height=2500,
        min_font_size=1,
        max_font_size=1000,
        background_color=self.backGround,
        mask=twitter_mask,
        include_numbers=False,
        collocations=False)
    wc.add_stop_words(stop)
    wc.generate(txt)
    directory = 'static/images/' + self.username + '.png'
    directory = path.join(self.d, directory)
    image = wc.to_image()
    image.save(directory)
import codecs
import os

import numpy as np
from PIL import Image
from wordcloud_fa import WordCloudFa


def show_chat_word_cloud(directory):
    # delete_extra_characters comes from elsewhere in this project
    # (a sketch follows below).
    with codecs.open(os.path.join(directory, 'chats.txt'), 'r',
                     encoding='utf8') as file:
        print("Start putting words in picture")
        mask_array = np.array(Image.open("telegram.png"))
        wordcloud = WordCloudFa(persian_normalize=True,
                                mask=mask_array,
                                collocations=False)
        stop_words = []
        with codecs.open("stop_words.txt", 'r', encoding='utf8') as words:
            for word in words:
                # strip() instead of word[:-2]: slicing off the last two
                # characters breaks on files without "\r\n" line endings
                stop_words.append(word.strip())
        wordcloud.add_stop_words(stop_words)
        text = delete_extra_characters(file.read())
        wc = wordcloud.generate(text)
        image = wc.to_image()
        image.show()
        image.save(os.path.join(directory, 'wordcloud.png'))
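# delete_extra_characters is defined elsewhere in this project. A plausible
# sketch, assuming it strips links, mentions and non-Farsi symbols; the exact
# behavior is a guess.
import re


def delete_extra_characters(text):
    text = re.sub(r"https?://\S+", " ", text)        # links
    text = re.sub(r"@\S+", " ", text)                # mentions
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)  # non-Farsi symbols
    return text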
def main():
    # DB is a thin database wrapper (a sketch follows below).
    database = DB("ganjoor.s3db")
    database.connect()
    rows = database.select("""select p.cat_id, v.text
                              from poem as p
                              join verse as v on p.id = v.poem_id
                              where p.cat_id = 24""")
    with open("verses.txt", "w") as f:
        for r in rows:
            f.write(r[1])
            f.write('\n')

    wc = WordCloudFa(width=1200, height=800)
    with open('verses.txt', 'r') as file:
        text = file.read()
    word_cloud = wc.generate(text)
    image = word_cloud.to_image()
    image.show()
    image.save('hafez.png')
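# The DB helper used above is not shown. A minimal sketch assuming
# ganjoor.s3db is a SQLite database (the Ganjoor poetry database ships as
# one); the method names follow their use in main(), the bodies are guesses.
import sqlite3


class DB:
    def __init__(self, filename):
        self.filename = filename
        self.conn = None

    def connect(self):
        self.conn = sqlite3.connect(self.filename)

    def select(self, query):
        # run a SELECT statement and return all rows
        return self.conn.execute(query).fetchall()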
from wordcloud_fa import WordCloudFa

wordcloud = WordCloudFa(no_reshape=True,
                        persian_normalize=True,
                        include_numbers=False,
                        collocations=False,
                        width=800,
                        height=400)
text = ""
with open('persian-example.txt', 'r') as file:
    text = file.read()
wc = wordcloud.generate(text)
image = wc.to_image()
image.show()
image.save('persian-example.png')
# punctuations_list is defined earlier in the app (a plausible definition
# follows below).
def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)


words = remove_punctuations(words)
# Strip URLs and @-mentions before counting words.
words = re.sub(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    '', words, flags=re.MULTILINE)
words = re.sub(r"@(\w+)", ' ', words, flags=re.MULTILINE)
wordcloud = WordCloudFa(persian_normalize=True,
                        stopwords=list(STOPWORDS) + hazm.stopwords_list(),
                        include_numbers=False,
                        background_color='white',
                        width=700,
                        height=500)
frequencies = wordcloud.process_text(words)
wc = wordcloud.generate_from_frequencies(frequencies)
image = wc.to_image()
st.image(image)

# Dataframe
st.subheader('**Data**')
st.write(data)

# Random Tweet
col1, col2 = st.beta_columns(2)
with col1:
    st.markdown('')
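# A plausible definition of punctuations_list, covering ASCII punctuation
# plus the common Farsi marks; the original definition is not shown.
from string import punctuation

punctuations_list = punctuation + '،؛؟«»'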
from wordcloud_fa import WordCloudFa

wordcloud = WordCloudFa(persian_normalize=True,
                        include_numbers=False,
                        background_color="white")
text = ""
with open('mixed-example.txt', 'r') as file:
    text = file.read()
wc = wordcloud.generate(text)
image = wc.to_image()
image.show()
image.save('mixed-example.png')
from os import getcwd, path

import numpy as np
from hazm import Normalizer
from nltk.corpus import stopwords
from PIL import Image
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import STOPWORDS, ImageColorGenerator
from wordcloud_fa import WordCloudFa

# ADD_STOPWORDS and EN_STOPWORDS are word lists defined elsewhere in the
# module.


def make_wordcloud(font_path=None,
                   text_path='tweets/result.txt',
                   bw_img="input/southpark1.png",
                   img="input/southpark2.png",
                   add_stopwords=ADD_STOPWORDS,
                   bg_color='black',
                   include_numbers=True,
                   max_words=500,
                   random_color=False):
    assert isinstance(add_stopwords, list)

    # get data directory (using getcwd() is needed to support running the
    # example in a generated IPython notebook)
    d = path.dirname(__file__) if "__file__" in locals() else getcwd()

    # load text
    text = open(path.join(d, text_path), encoding='utf-8').read()

    # load the black-and-white mask and the color image (the color image has
    # been modified in GIMP to be brighter and more saturated)
    image = np.array(Image.open(path.join(d, bw_img)))
    mask_color = np.array(Image.open(path.join(d, img)))

    # create the mask: white is "masked out"
    twitter_mask = image.copy()
    twitter_mask[twitter_mask.sum(axis=2) == 0] = 255

    # some finesse: we enforce boundaries between colors so they get less
    # washed out. For that we do some edge detection in the image
    edges = np.mean([
        gaussian_gradient_magnitude(mask_color[:, :, i] / 255., 2)
        for i in range(3)
    ], axis=0)
    twitter_mask[edges > .01] = 255

    # add the extra stop words
    stop_words = stopwords.words('english')
    for word in add_stopwords:
        STOPWORDS.add(Normalizer().normalize(word))
    stop_words.extend(STOPWORDS)
    stop_words.extend(EN_STOPWORDS)
    stop_words = set(stop_words)

    # get rid of the stop words, then rebuild the text
    text_list = [word for word in text.split() if word not in stop_words]
    text = ' '.join([str(elem) for elem in text_list])
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace('\u200c', '')

    # generate a word cloud image
    wordcloud = WordCloudFa(font_path=font_path,
                            persian_normalize=True,
                            include_numbers=include_numbers,
                            max_words=max_words,
                            stopwords=stop_words,
                            margin=0,
                            width=3000,
                            height=3000,
                            min_font_size=1,
                            max_font_size=2300,
                            random_state=True,
                            background_color=bg_color,
                            mask=twitter_mask,
                            relative_scaling=0,
                            repeat=True).generate(text)

    if not random_color:
        # recolor the words from the color image
        image_colors = ImageColorGenerator(mask_color)
        wordcloud.recolor(color_func=image_colors)

    image = wordcloud.to_image()
    image.show()
    image.save('output/twitter_mask.png')
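# A minimal invocation sketch for the function above; the font path and the
# extra stop words here are illustrative assumptions.
if __name__ == '__main__':
    make_wordcloud(font_path='fonts/Vazir.ttf',
                   add_stopwords=['رو', 'که'],
                   max_words=500)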
    exit()  # end of an earlier error-handling branch (context elided)

text = ""
print("cleaning")
# clean_word, print_stats and load_stop_words (the first and last are
# sketched below) plus the MASK, BG_COLOR, FONT and RESULT_FILE_ADD
# constants are defined elsewhere in this script.
text = " ".join([clean_word(word) for word in raw_text.split()])

#################################
print_stats(text)

print("generating cloud")
mask_array = np_array(Image.open(MASK))  # from numpy import array as np_array
wc_instance = WordCloudFa(
    width=900,
    height=900,
    background_color=BG_COLOR,
    font_path=FONT,
    mask=mask_array,
    persian_normalize=True,
    include_numbers=False,
    stopwords=load_stop_words(),
)
word_cloud = wc_instance.generate(text)
result_image = word_cloud.to_image()
result_image.save(RESULT_FILE_ADD)
result_image.show()
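# clean_word and load_stop_words are defined elsewhere in the script. A
# rough sketch under the assumption that clean_word keeps only Farsi letters
# and load_stop_words reads one word per line:
import re


def clean_word(word):
    # keep Arabic/Farsi letters and the ZWNJ joiner, drop everything else
    return re.sub(r"[^\u0600-\u06FF\u200c]", "", word)


def load_stop_words():
    with open("stop_words.txt", encoding="utf-8") as f:
        return set(line.strip() for line in f if line.strip())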
# print("\n----\n".join(tweets_simple))
to_print = "\n\n".join(tweets_simple)
with open("out/cleaned.txt", "w") as f:
    f.write(to_print)

#######################################
mask_array = np.array(Image.open("masks/tw.png"))

with open('out/cleaned.txt', 'r') as file:
    text = file.read()

wc = WordCloudFa(
    width=900,
    height=900,
    background_color="white",
    font_path="fonts/font2.ttf",
    mask=mask_array,
    persian_normalize=True,
    include_numbers=False,
)
word_cloud = wc.generate(text)
image = word_cloud.to_image()
image.save(f"out/{idish}.png")
image.show()
import re

import hazm
import numpy as np
import parsivar
from PIL import Image
from wordcloud_fa import WordCloudFa as WordCloud

# default_stop_words_path, punctuation_patterns and weird_patterns are
# module-level definitions (a path and two compiled regexes) assumed to
# exist elsewhere.


class WordCloudGen:
    """Word Cloud Generator"""

    def __init__(self,
                 mask=None,
                 size=900,
                 stop_words_addr=default_stop_words_path,
                 mask_addr=None):
        self.hazm_normalizer = hazm.Normalizer()
        self.parsivar_normalizer = parsivar.Normalizer()
        self.stemmer = hazm.Stemmer()
        self.lemmatizer = hazm.Lemmatizer()
        self.stop_words = set(hazm.stopwords_list(stop_words_addr))
        mask = np.array(
            Image.open(mask_addr)) if mask_addr is not None else None
        self.generator = WordCloud(width=size,
                                   height=size,
                                   include_numbers=False,
                                   persian_normalize=False,
                                   collocations=True,
                                   mask=mask,
                                   background_color='white')

    def get_word_cloud(self, msgs):
        return self.generator.generate_from_text(
            self._preprocess(msgs)).to_image()

    def _preprocess(self, msgs):
        words = []
        for msg in msgs:
            # https://github.com/MasterScrat/Chatistics
            msg = re.sub(r"https?:\/\/\S*", "", msg)
            msg = re.sub(r"\@\S*", "", msg)
            msg = self._normalize(msg)
            msg = msg.replace("ؤ", "و")
            msg = msg.replace("أ", "ا")
            msg = self._remove_punctuations(msg)
            msg = self._remove_weird_chars(msg)
            msg = self._remove_postfixes(msg)
            for word in msg.split():
                if self._is_stop_word(word):
                    word = ""
                if word:
                    # word = self.stemmer.stem(word)
                    word = word.replace(u"\u200c", "")
                    words.append(word)
        return " ".join(words)

    def _normalize(self, text):
        text = self.hazm_normalizer.normalize(text)
        text = self.parsivar_normalizer.normalize(text)
        return text

    def _is_stop_word(self, word):
        # Extra colloquial stop words not covered by the hazm list.
        if word in {
                "بابا", "کار", "وقت", "دست", "خدا", "انقد", " چقد", "نیس",
                "جدی", "ینی", "چقد", "واسه", "دگ", "اینقد", "gt", "lt"
        }:
            return True
        if word.startswith("در"):
            modified_word = word[3:]
            if self._is_stop_verb(modified_word):
                return True
        if word.startswith("ب"):
            modified_word = word[2:]
            if self._is_stop_verb(modified_word):
                return True
        if word in self.stop_words:
            return True
        if self.stemmer.stem(word) in self.stop_words:
            return True
        if self._is_stop_verb(word):
            return True
        # insert a ZWNJ after the "می" prefix, e.g. میمیرد -> می‌میرد
        if self._is_stop_verb(word.replace("می", "می\u200c", 1)):
            return True
        if (word[0] == "ب" or word[0] == "ن"):  # برو، نره
            if word[1:] in self.stop_words:
                return True
            if word[-1] == "ه":
                if word[1:-1] + "ود" in self.stop_words:
                    return True
        if word[-1] == "ه":
            modified_word = word[:-1] + "د"  # داره
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
            if modified_word in self.stop_words:
                return True
            modified_word = word[:-1] + "ود"  # میره
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
            modified_word = word + "د"  # میده
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
            modified_word = word[:-1]  # رفته
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if word[-1] == "ن":
            modified_word = word + "د"  # e.g. میرن -> میرند
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if "میا" in word:
            modified_word = word.replace("میا", "می\u200cآی")
            if self._is_stop_verb(modified_word):
                return True
        if "گ" in word:
            modified_word = word.replace("گ", "گوی")
            modified_word = modified_word.replace("گویه", "گوید")
            modified_word = modified_word.replace("گوین", "گویند")
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if word[-1] == "ا":
            modified_word = word[:-1] + "ی"  # حتا -> حتی
            if modified_word in self.stop_words:
                return True
        if "ا" in word:
            modified_word = word[::-1].replace("ا", "اه\u200c", 1)[::-1]
            if modified_word in self.stop_words:
                return True
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
            modified_word = word[::-1].replace("ا", "یاه\u200c", 1)[::-1]
            if modified_word in self.stop_words:
                return True
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
        if word[-1] == "ن":
            modified_word = word[:-1]  # حتمن -> حتماً
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
            modified_word = word[:-1] + "ا"  # حتمن -> حتماً
            if modified_word in self.stop_words:
                return True
            modified_word = word[:-1] + "لا"  # اصن -> اصلاً
            if modified_word in self.stop_words:
                return True
        if word[-1] == "و":  # خودشو -> خودش را
            modified_word = word[:-1]
            if modified_word in self.stop_words:
                return True
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
        if "و" in word:
            modified_word = word.replace("و", "ا")  # همون -> همان
            modified_word = modified_word.replace("اا", "آ")  # اومده -> آمده
            if modified_word in self.stop_words:
                return True
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
            if self._is_stop_verb(modified_word):  # نمیدونم -> نمیدانم
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if word[-1] == "ا":  # خودشونو -> خودشان را
            modified_word = word[:-1]
            if modified_word in self.stop_words:
                return True
            if self.stemmer.stem(modified_word) in self.stop_words:
                return True
        if "خوا" in word:  # میخوام
            modified_word = word.replace("خوا", "خواه", 1)
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if "خا" in word:  # میخام
            modified_word = word.replace("خا", "خواه", 1)
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if "تو" in word:  # میتونم
            modified_word = word.replace("تو", "اوت", 1)
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if "ر" in word:  # میرم
            modified_word = word.replace("ر", "رو", 1)
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if "بود" in word:  # رفته بودم
            modified_word = re.sub("ه[\u200c ]بود.*", "", word)
            if self._is_stop_verb(modified_word):
                return True
            if self._is_stop_verb(modified_word.replace("می", "می\u200c", 1)):
                return True
        if word == "فک":
            modified_word = "فکر"
            if modified_word in self.stop_words:
                return True
        return False

    def _is_stop_verb(self, word):
        # hazm lemmas look like "past#present"; the verb counts as a stop
        # verb if either stem is in the stop-word list.
        lem = self.lemmatizer.lemmatize(word).split("#")
        if len(lem) == 2:
            if lem[0] in self.stop_words or lem[1] in self.stop_words:
                return True
        return False

    @staticmethod
    def _remove_punctuations(text):
        return punctuation_patterns.sub(" ", text)

    @staticmethod
    def _remove_weird_chars(text):
        return weird_patterns.sub(" ", text)

    @staticmethod
    def _remove_postfixes(text):
        text = text.replace("ٔ ", " ")
        text = text.replace(" ی ", " ")
        text = text.replace(" ها ", " ")
        text = text.replace("ها ", " ")
        text = text.replace(" های ", " ")
        text = text.replace("های ", " ")
        return text
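# A minimal usage sketch for WordCloudGen; the messages are illustrative and
# default_stop_words_path must point at an existing hazm-style stop-word file.
if __name__ == "__main__":
    gen = WordCloudGen(size=600)
    image = gen.get_word_cloud(["سلام دنیا", "امروز هوا خیلی خوب بود"])
    image.save("chat_cloud.png")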
from wordcloud_fa import WordCloudFa
import numpy as np
from PIL import Image

mask = np.array(Image.open("mask.png"))

# The `no_reshape` parameter may cause problems when displaying Farsi text.
# If the output of the example does not look right, remove that parameter.
wordcloud = WordCloudFa(persian_normalize=True,
                        include_numbers=False,
                        background_color="white",
                        mask=mask,
                        no_reshape=True)

# Adding extra stop words:
wordcloud.add_stop_words(['the', 'and', 'with', 'by', 'in', 'to', 'to the',
                          'of', 'it', 'is', 'th', 'its', 'for', '[ ]', '. [',
                          '] ['])

text = ""
with open('mixed-example.txt', 'r') as file:
    text = file.read()

wc = wordcloud.generate(text)
image = wc.to_image()
image.show()
image.save('masked-example.png')
from wordcloud_fa import WordCloudFa

wordcloud = WordCloudFa(persian_normalize=True)
text = ""
with open('persian-example.txt', 'r') as file:
    text = file.read()
wc = wordcloud.generate(text)
image = wc.to_image()
image.show()
image.save('persian-example.png')
# WordTokenizer (hazm), stopwords (nltk), string, matplotlib.pyplot as plt
# and the stop_words_main list come from earlier in the script.
text = text.translate(str.maketrans(' ', ' ', "\n"))  # drop newlines
word_list = WordTokenizer().tokenize(text)
stop_words = stopwords.words('english')
punctuations = list(string.punctuation)
words = [
    word.strip() for word in word_list
    if word not in stop_words and word not in stop_words_main
    and word not in punctuations
]
text = " ".join(words)

# alice = np.array(Image.open("mask.png"))
word_cloud = WordCloudFa(persian_normalize=True,
                         width=2000,
                         height=2000,
                         margin=20,
                         repeat=False,
                         max_words=500)
frequencies = word_cloud.process_text(text)
wc = word_cloud.generate_from_frequencies(frequencies)
# a single figure: the original created a second, default-size figure that
# silently replaced the 20x20 one before imshow
plt.figure(figsize=(20, 20), facecolor=None)
plt.imshow(wc)
plt.axis('off')
plt.savefig('WordsCloud.png', dpi=2000, transparent=True)
plt.show()
        # (enclosing input loop elided; the user chose "u")
        # get the tweets of a specific user by its username
        text = get_tweets_from_user(username)
        break
    else:
        print("you should enter f or u!")

# get_tweets, remove_bad_tweets, get_words, remove_bad_words and
# removeWeirdChars come from elsewhere in the script (two of the filters are
# sketched below).
text = get_tweets(text)
text = remove_bad_tweets(text)
text = "\n".join(text)
text = get_words(text)
print(len(text))
text = remove_bad_words(text)
print(len(text))
text1 = "\n".join(text)
text1 = removeWeirdChars(text1)
mask_array = np.array(Image.open(mask_path))
my_wc = WordCloudFa(width=1200,
                    height=1200,
                    background_color=background_color,
                    mask=mask_array,
                    persian_normalize=True,
                    repeat=False,
                    collocations=True)
my_wc.add_stop_words_from_file("../stop_words_kian.txt")
with open("edited_tweets.txt", "w") as f:
    f.write(text1)
my_wc.generate(text1)
image = my_wc.to_image()
image.show()
filename = datetime.now().strftime("%Y-%m-%d-%H-%M")
image.save('Images/{time}_photo.png'.format(time=filename))
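# A rough sketch of the two filters used above, purely as an assumption
# about their intent; the original implementations are not shown.
import re


def remove_bad_tweets(tweets):
    # drop retweets and empty tweets
    return [t for t in tweets if t and not t.startswith("RT")]


def remove_bad_words(words):
    # drop URLs, @-mentions and single-character tokens
    return [w for w in words
            if not re.match(r"https?://|@", w) and len(w) > 1]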
from wordcloud_fa import WordCloudFa

wordcloud = WordCloudFa(persian_normalize=True, include_numbers=False)
text = ""
with open('english-example.txt', 'r') as file:
    text = file.read()
wc = wordcloud.generate(text)
image = wc.to_image()
image.show()
image.save('english-example.png')
import numpy as np

# path, Image, STOPWORDS, EN_STOPWORDS and WordCloudFa are imported earlier.
d = path.dirname(__file__)
text = open(path.join(d, 'tweets/result.txt'), encoding='utf-8').read()

twitter_mask = np.array(Image.open(path.join(d, "input/southpark1.png")))

# Add the extra English stop words.
stopwords = set(STOPWORDS)
stopwords |= EN_STOPWORDS

# Generate a word cloud image
wordcloud = WordCloudFa(persian_normalize=True,
                        include_numbers=False,
                        max_words=200,
                        stopwords=stopwords,
                        margin=0,
                        width=3000,
                        height=3000,
                        min_font_size=10,
                        max_font_size=2300,
                        random_state=True,
                        background_color="black",
                        mask=twitter_mask).generate(text)

image = wordcloud.to_image()
image.show()
image.save('output/twitter_mask.png')
        # (tail of the tweet-collection loop; setup elided)
        counter += 1
        txt = txt + ' ' + each.text
        print(counter, cloud)

# Drop Latin characters and @-mentions so only the Farsi text remains.
txt = re.sub(r'[A-Za-z@]*', '', txt)
twitter_mask = np.array(Image.open(path.join(d, "twitter-logo.jpg")))
# Common colloquial Farsi stop words.
stop = [
    'می', 'من', 'که', 'به', 'رو', 'از', 'ولی', 'با', 'یه', 'این', 'نمی',
    'هم', 'شد', 'ها', 'اما', 'تو', 'واقعا', 'در', 'نه', 'دارم', 'باید',
    'آره', 'برای', 'تا', 'چه', 'کنم', 'بود', 'همه', 'دیگه', 'ای', 'اون',
    'تی', 'حالا', 'بی', 'د', 'چرا', 'بابا', 'منم', 'کیه', 'توی', 'نیست',
    'چی', 'باشه', 'که', 'بودم', 'می کنم', 'که', 'اینه', 'بهتر', 'داره',
    'اینه', 'که'
]
wc = WordCloudFa(persian_normalize=True,
                 max_words=30000,
                 margin=0,
                 width=3000,
                 height=2500,
                 min_font_size=1,
                 max_font_size=3000,
                 background_color="white",
                 mask=twitter_mask,
                 include_numbers=False,
                 stopwords=stop).generate(txt)
image = wc.to_image()
image.show()
image.save('twitter.png')