def cleanText(text):
    """Normalize Persian text for word-cloud rendering.

    Replaces newlines/carriage returns with spaces, maps Arabic-only letter
    forms to Persian via PersianWordCloud.remove_ar, runs the ``ps`` cleaner,
    then reshapes and reorders the result for right-to-left display.
    """
    # NOTE(review): the original chained `.replace("", " ")` and
    # `.replace("", "")` on EMPTY strings -- almost certainly zero-width
    # characters (e.g. U+200C ZWNJ) lost in transcoding.  Since
    # `str.replace("", " ")` inserts a space between every character, the
    # empty-pattern calls are dropped here; restore the intended invisible
    # characters if they can be recovered from history.
    text = text.replace("\n", " ").replace("\r", " ")
    text = PersianWordCloud.remove_ar(text)
    text = get_display(arabic_reshaper.reshape(ps.run(text)))
    return text
def word_cloud(self, model: LdaModel, stopwords_path, save_path):
    """Render one word-cloud image per LDA topic.

    :param model: trained gensim LdaModel whose topics are visualized
    :param stopwords_path: UTF-8 stop-word file, one word per line
    :param save_path: path prefix; ``_topic_<i>.png`` is appended per topic
    """
    with open(stopwords_path, 'r', encoding='utf8') as f:
        # BUGFIX: readlines() kept the trailing '\n' on every word, so the
        # reshaped stop words could never match real tokens; strip them.
        words = [line.strip() for line in f if line.strip()]
    stopwords = add_stop_words(words)
    print('stop words added')
    # local renamed from `word_cloud` to avoid shadowing this method's name
    wc = PersianWordCloud(only_persian=True,
                          max_words=10,
                          stopwords=stopwords,
                          width=800,
                          height=800,
                          background_color='black',
                          min_font_size=1,
                          max_font_size=300)
    topics = model.show_topics(formatted=False)
    for i, topic in enumerate(topics):
        topic_words = dict(topic[1])
        print(topic_words)
        # Reshape each word so RTL script renders correctly in the image.
        new = {get_display(arabic_reshaper.reshape(word)): weight
               for word, weight in topic_words.items()}
        print(new)
        wc.generate_from_frequencies(new)
        image = wc.to_image()
        image.show()
        s = save_path + '_topic_' + str(i) + '.png'
        print(s)
        image.save(s)
def generate_wallpapers_by_surah_number(surah_number):
    """Generate one wallpaper image per ayah of the given surah.

    :param surah_number: 1-based surah index into the Quran JSON data
    """
    current_surah_index = surah_number - 1

    # BUGFIX: the original downloaded the JSON on FileNotFoundError but never
    # re-opened it, leaving quran_ar_json / quran_en_json unbound and crashing
    # with NameError below.  Retry the load after downloading.
    try:
        with open('quran_data/quran-ar.json') as quran_ar_json_file:
            quran_ar_json = json.load(quran_ar_json_file)
    except FileNotFoundError:
        print("Local quran-ar.json file not found, downloading now...")
        print("Please wait...")
        import_quran_arabic_into_json()
        with open('quran_data/quran-ar.json') as quran_ar_json_file:
            quran_ar_json = json.load(quran_ar_json_file)

    try:
        with open('quran_data/quran-en.json') as quran_en_json_file:
            quran_en_json = json.load(quran_en_json_file)
    except FileNotFoundError:
        print('Local quran-en.json file not found, downloading now...')
        print('Please wait...')
        import_quran_english_into_json()
        with open('quran_data/quran-en.json') as quran_en_json_file:
            quran_en_json = json.load(quran_en_json_file)

    # Used as a string from here on (image labels / filenames); hoisted out
    # of the loop where the original re-converted it every iteration.
    surah_number = str(surah_number)
    for ayah_ar, ayah_en in zip(
            quran_ar_json["data"]["surahs"][current_surah_index]["ayahs"],
            quran_en_json["data"]["surahs"][current_surah_index]["ayahs"]):
        # Reshape + reorder the Arabic ayah for right-to-left rendering.
        ayah_arabic_text = get_display(
            arabic_reshaper.reshape(ayah_ar["text"]))
        ayah_number = str(ayah_en["numberInSurah"])
        ayah_english_text = (ayah_en["text"] + ' - Quran[' + surah_number +
                             ":" + ayah_number + ']')
        write_ayah_on_image(ayah_arabic_text, ayah_english_text,
                            surah_number, ayah_number)
def clean_text(text: str, context: str, stop_words) -> str:
    """Clean *text* for the given *context* ("twitter", "data" or "telegram").

    Runs the generic character/word cleaners, optionally reshapes the text
    for RTL display, applies the context-specific word cleaner, and removes
    stop words when manual stop-word cleaning is configured.
    """
    print("cleaning text")
    text = remove_unwanted_chars(text)
    if general_config["ARABIC_RESHAPER"]:
        text = get_display(arabic_reshaper.reshape(text))

    cleaned = [clean_word(token) for token in text.split(" ")]

    if context in ("twitter", "data"):
        cleaned = [clean_twitter_word(token) for token in cleaned]
    elif context == "telegram":
        cleaned = [clean_telegram_word(token) for token in cleaned]

    if general_config["STOP_WORD_CLEAN"] == "manual":
        cleaned = [token for token in cleaned
                   if not is_stop_word(token, stop_words)]

    return " ".join(cleaned)
def draw_cloud(cleantweets, image_path, monthly=False, show_image=False):
    """Build a twitter-logo-masked word cloud from cleaned tweets.

    Halves the word budget when *monthly* is True, writes the image to
    *image_path*, and optionally displays it on screen.
    """
    joined = " ".join(str(tweet) for tweet in cleantweets)
    joined = get_display(arabic_reshaper.reshape(joined))
    frequencies = Counter(word_tokenize(joined))
    print(frequencies.most_common(max_words))

    mask = np.array(Image.open("twitter-logo.png"))
    font = select_a_font()
    budget = max_words // 2 if monthly else max_words

    cloud = WordCloud(font_path=font,
                      max_words=budget,
                      margin=0,
                      width=5000,
                      height=5000,
                      min_font_size=4,
                      max_font_size=750,
                      background_color="white",
                      mask=mask)
    cloud.generate_from_frequencies(frequencies)
    image = cloud.to_image()
    cloud.to_file(image_path)
    if show_image:
        image.show()
    print(f"Generated image {image_path}")
def reshape_words(words: Iterable) -> List[str]:
    """Prepare words for display in the word cloud.

    All words are joined into one string, reshaped in a single pass (much
    faster than reshaping each word separately), and split back apart.

    ATTENTION: words must not contain "\n", because that character is used
    as the separator; a word containing "\n" would be treated as two words.

    :param words: an iterable of words
    :return: a list of display-ready words for the WordCloud
    """
    # BUGFIX: the original appended "\n" AFTER every word, so the final
    # split("\n") produced a spurious empty string at the end of the list.
    # Joining with the separator avoids the trailing element.
    combined_words: str = "\n".join(words)
    return get_display(arabic_reshaper.reshape(combined_words)).split("\n")
def draw_graph(edgeList, user_id, file_name):
    """Draw a directed graph with RTL-ready node labels and save it as
    static/media/graph/<user_id>/<file_name>.png.

    :param edgeList: iterable of (source_label, target_label) pairs
    :param user_id: id used as the per-user output subdirectory name
    :param file_name: output file name without extension
    """
    # Reshape + reorder both endpoints of every edge for RTL rendering.
    edges = []
    for edge in edgeList:
        src = get_display(arabic_reshaper.reshape(edge[0]))
        dst = get_display(arabic_reshaper.reshape(edge[1]))
        edges.append((src, dst))

    G = nx.OrderedMultiDiGraph()
    G.add_edges_from(edges)
    pos = nx.spring_layout(G, k=0.99, iterations=50)
    nx.draw_networkx_nodes(G, pos, node_color='#AED6F1', node_size=2500)
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(),
                           edge_color='#95A5A6', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=20,
                            font_family='Times New Roman')

    # IMPROVED: makedirs(exist_ok=True) replaces the racy isdir()/mkdir()
    # pairs and creates the intermediate 'graph' directory in one call.
    user_path = os.path.join(BASE_DIR, 'static/media', 'graph', str(user_id))
    os.makedirs(user_path, exist_ok=True)

    path = os.path.join(user_path, file_name)
    plt.tight_layout()
    plt.savefig(path + '.png', format="PNG")
    plt.show()
def get_emb_matrix(word_index, max_features, embedding_file):
    """Build an embedding matrix for the vocabulary in *word_index*.

    Rows for known words come from *embedding_file*; rows for unknown words
    keep random values drawn from the empirical mean/std of the known
    embeddings (each unknown word is printed, reshaped, for inspection).
    """
    embeddings_index = load_word_emb(word_index, embedding_file)
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Start from noise matching the embedding distribution, then overwrite
    # the rows we actually have vectors for.
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (max_features, embed_size))
    for token, idx in word_index.items():
        if idx >= max_features:
            continue
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[idx] = vector
        else:
            # Out-of-vocabulary word: log it in display (reshaped) form.
            token = arabic_reshaper.reshape(token)
            print(token)
    return embedding_matrix
def create_word_cloud(keywords, file_id):
    """Render *keywords* (RTL text) as a word cloud.

    Saves the image under static/media/wordCloud/<file_id>.png and returns
    the path relative to the media root.
    """
    # Reshape then reorder so Arabic script displays correctly.
    display_text = get_display(arabic_reshaper.reshape(keywords))

    cloud = WordCloud(font_path='arial',
                      background_color='white',
                      mode='RGB',
                      width=2000,
                      height=1000).generate(display_text)

    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout()
    file_name = str(file_id) + '.png'
    plt.savefig(os.path.join(BASE_DIR, 'static/media/wordCloud/' + file_name))
    plt.show()
    return os.path.join('wordCloud', file_name)
def generate(self, text):
    """Generate a wordcloud from *text*.

    *text* is expected to be natural text.  If you pass a sorted list of
    words, words will appear in the output twice; set ``collocations=False``
    to remove that duplication.

    Alias to generate_from_text, which calls process_text and
    generate_from_frequencies.

    Returns
    -------
    self
    """
    # Reshape and reorder so Persian/Arabic script renders correctly.
    reshaped = arabic_reshaper.reshape(text)
    return self.generate_from_text(get_display(reshaped))
def draw_cloud(cleantweets, image_path, show_image=False):
    """Draw a twitter-logo-masked word cloud of the most common n-grams.

    NOTE(review): this function reads the module-level ``ngram`` counter and
    never uses the ``cleantweets`` parameter -- confirm that is intentional.
    """
    # Top n-grams, keys reshaped/reordered for RTL display.
    top_words = {
        get_display(arabic_reshaper.reshape(gram)): count
        for gram, count in Counter(ngram).most_common(max_words)
    }

    mask = np.array(Image.open("twitter-logo.jpg"))
    cloud = WordCloud(font_path=select_a_font(),
                      max_words=max_words,
                      margin=0,
                      width=800,
                      height=800,
                      min_font_size=1,
                      max_font_size=500,
                      background_color="white",
                      mask=mask)
    cloud.generate_from_frequencies(top_words)
    image = cloud.to_image()
    cloud.to_file(image_path)
    if show_image:
        image.show()
    print(f"Generated image {image_path}")
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from arabic_reshaper import arabic_reshaper
from bidi.algorithm import get_display

# BUGFIX: the original reshaped the text three times and ran get_display on
# already-displayed text, which corrupts the glyph forms; reshape and reorder
# exactly once.  Also close the input file via a context manager (the
# original handle was never closed).
with open('txt-per.txt', encoding="utf-8") as file:
    reader = file.read()
text = get_display(arabic_reshaper.reshape(reader))
print(text)

stopwords = set(STOPWORDS)
stopwords.update(["RT", "text", "https", "co"])

cloud = WordCloud(font_path='/Users/alisalehi/Library/Fonts/XNazanin.TTF',
                  stopwords=stopwords).generate(text)
image = cloud.to_image()
image.show()
#plt.imshow(cloud)
#plt.show()
def normalize_hebrew(raw_text: str):
    """Reshape *raw_text* and reorder it for right-to-left display."""
    reshaped = arabic_reshaper.reshape(raw_text)
    return get_display(reshaped)
text = text.replace(eng, "") cleared_text = " ".join(text.split()) #Remove Extra White Spaces return cleared_text print("\n ****** \n Mate it's not stuck! just wait a little! \n ****** \n") cloud = "" for status in tweets: text = status.text text = preprocessing(text) if text: cloud = cloud + text from arabic_reshaper import arabic_reshaper from bidi.algorithm import get_display cloudfinal = arabic_reshaper.reshape(cloud) cloudfinal = get_display(arabic_reshaper.reshape(cloud)) print( "\n ****** \n Press enter to continue if it's still stuck :))) \n ****** \n" ) cloudwords = WordCloud(font_path="font.ttf", background_color="white").generate(cloudfinal) plt.imshow(cloudwords, interpolation="bilinear") plt.axis('off') plt.show() savename = input("gimme the file name to save! : ") WordCloud.to_file(cloudwords, savename + ".png")
def add_stop_words(words):
    """Reshape each word for RTL display and add it to the global STOPWORDS.

    :param words: iterable of stop words to register
    :return: the updated STOPWORDS set
    """
    for item in words:
        STOPWORDS.add(get_display(arabic_reshaper.reshape(item)))
    return STOPWORDS
'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی', 'ي': 'ی' } pattern = "|".join(map(re.escape, dic.keys())) return re.sub(pattern, lambda m: dic[m.group()], text) item1 = itemgetter(1) FONT_PATH = os.environ.get( "FONT_PATH", os.path.join(os.path.dirname(__file__), "fonts/Vazir-Light.ttf")) stop_words_reshape = get_display( arabic_reshaper.reshape( open((os.path.join(os.path.dirname(__file__), 'stopwords')), encoding='utf-8').read())) STOPWORDS = set([x.strip() for x in stop_words_reshape.split('\n')]) def add_stop_words(words): for word in words: words_reshape = get_display(arabic_reshaper.reshape(word)) STOPWORDS.add(words_reshape) return STOPWORDS
text = open(r'C:\TECHNICAL\MISC\Python\Firdoos.txt', 'r',
            encoding='utf8').read()

# Calling function to remove punctuations using unicode codepoints starting with 'P'
#text = remove_punct(text)

# Tokenizing text into words
#words = nltk.word_tokenize(text)  Method does not work now
words = nltk.tokenize.wordpunct_tokenize(text)

# Remove stop words from text
text = remove_stopwords(words)

# Reconstruct Arabic/Urdu sentences to be used in applications that don't
# support Arabic script, then reorder for right-to-left rendering.
# BUGFIX: the original called arabic_reshaper.reshape() twice -- once on its
# own and once more inside get_display() -- corrupting the already-reshaped
# glyph forms.  Reshape exactly once.
from arabic_reshaper import arabic_reshaper
# For right-to-left text rendering, need get_display from python-bidi
# (install with: pip install python-bidi)
from bidi.algorithm import get_display
text = get_display(arabic_reshaper.reshape(text))

# Generate a word cloud image
fontpath = r"C:\Users\Humera\AppData\Local\Microsoft\Windows\Fonts\urdu-najd-regular-1.ttf"
wordcloud = WordCloud(font_path=fontpath).generate(text)

# Display the generated image: the matplotlib way
import matplotlib.pyplot as plt
#matplotlib.rc('font', family='Tahoma')
plt.imshow(wordcloud.recolor(random_state=2017))
result = [word for word in text2 if word not in arabic_words] final_result = ' '.join(result) return final_result with open('LamaAlherbish_tweets.json') as f: print('hh') data_file = json.load(f) # print(data_file) for i in data_file: cleaned_text = clean_Text(str=i['text']) tweets_data.append(cleaned_text.split()) tweets_data2 = list(chain.from_iterable(tweets_data)) twenty_most_commom_words_with_frequency = Counter(tweets_data2).most_common(20) for word in twenty_most_commom_words_with_frequency: print(word[0]) twenty_most_commom_words.append(word[0]) # # convert list to string and generate unique_string = (" ").join(twenty_most_commom_words) reshaped_texts = arabic_reshaper.reshape(unique_string) reshaped_texts = get_display(reshaped_texts) wordcloud = WordCloud(font_path='Fonts/Supplemental/Damascus.ttc', width=700, height=300, background_color="white").generate(reshaped_texts) plt.axis('off') plt.imshow(wordcloud, interpolation='bilinear') plt.savefig('LamaAlherbish_wordCloud.png')
def remove_ar(text):
    """Map Arabic-only letter forms in *text* to their Persian equivalents."""
    dic = {
        'ك': 'ک',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى': 'ی',
        'ي': 'ی'
    }
    pattern = "|".join(map(re.escape, dic.keys()))
    return re.sub(pattern, lambda m: dic[m.group()], text)


item1 = itemgetter(1)

# Default font path, overridable via the FONT_PATH environment variable.
FONT_PATH = os.environ.get(
    "FONT_PATH",
    os.path.join(os.path.dirname(__file__), "fonts/Vazir-Light.ttf"))

# BUGFIX: the original left the codecs.open() handle open forever; a context
# manager guarantees it is closed.  The stopwords file is reshaped in one
# pass and split into the module-level STOPWORDS set.
with codecs.open(os.path.join(os.path.dirname(__file__), 'stopwords'),
                 encoding='utf-8') as _stopwords_file:
    stop_words_reshape = get_display(
        arabic_reshaper.reshape(_stopwords_file.read()))
STOPWORDS = set(x.strip() for x in stop_words_reshape.split('\n'))


def add_stop_words(words):
    """Reshape *words* for RTL display, add them to STOPWORDS, return it."""
    for word in words:
        STOPWORDS.add(get_display(arabic_reshaper.reshape(word)))
    return STOPWORDS