Example #1
def cleanText(text):
    # Normalize line breaks to spaces and strip invisible marks:
    # \u200c zero-width non-joiner (replaced with a space), \u200e LTR mark
    # and \u200f RTL mark (both dropped).
    text = (text.replace("\n", " ").replace("\u200c", " ")
                .replace("\r", " ").replace("\u200e", "").replace("\u200f", ""))
    text = PersianWordCloud.remove_ar(text)  # collapse Arabic variants to Persian forms
    text = get_display(arabic_reshaper.reshape(ps.run(text)))  # normalize, reshape, bidi-reorder
    return text
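A minimal usage sketch, assuming the snippet's globals (ps as a parsivar-style normalizer and the PersianWordCloud class) are configured as above; the sample string is hypothetical:

raw = "سلام\u200cعلیکم\nدنیا"   # hypothetical input with a ZWNJ and a newline
print(cleanText(raw))           # one line of reshaped, display-ordered text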
Example #2
    def word_cloud(self, model: LdaModel, stopwords_path, save_path):
        with open(stopwords_path, 'r', encoding='utf8') as f:
            words = [line.strip() for line in f]  # strip trailing newlines

        stopwords = add_stop_words(words)
        print('stop words added')
        word_cloud = PersianWordCloud(only_persian=True,
                                      max_words=10,
                                      stopwords=stopwords,
                                      width=800,
                                      height=800,
                                      background_color='black',
                                      min_font_size=1,
                                      max_font_size=300)
        topics = model.show_topics(formatted=False)

        for i, topic in enumerate(topics):
            topic_words = dict(topic[1])
            print(topic_words)
            new = {}
            for word in topic_words.keys():
                reshaped = get_display(arabic_reshaper.reshape(word))
                new[reshaped] = topic_words[word]
            print(new)
            word_cloud.generate_from_frequencies(new)
            image = word_cloud.to_image()
            image.show()
            s = save_path + '_topic_' + str(i) + '.png'
            print(s)
            image.save(s)
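A hedged call sketch; lda_model and the paths below are illustrative, not from the source, and self stands for whatever class owns this method:

# self.word_cloud(model=lda_model,
#                 stopwords_path='stopwords.txt',
#                 save_path='out/topics')  # writes out/topics_topic_<i>.png per topic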
Example #3
def generate_wallpapers_by_surah_number(surah_number):
    current_surah_index = surah_number - 1

    try:
        with open('quran_data/quran-ar.json') as quran_ar_json_file:
            quran_ar_json = json.load(quran_ar_json_file)
    except FileNotFoundError:
        print("Local quran-ar.json file not found, downloading now...")
        print("Please wait...")
        import_quran_arabic_into_json()
        # Load the freshly downloaded file; otherwise quran_ar_json stays unbound.
        with open('quran_data/quran-ar.json') as quran_ar_json_file:
            quran_ar_json = json.load(quran_ar_json_file)

    try:
        with open('quran_data/quran-en.json') as quran_en_json_file:
            quran_en_json = json.load(quran_en_json_file)
    except FileNotFoundError:
        print('Local quran-en.json file not found, downloading now...')
        print('Please wait...')
        import_quran_english_into_json()
        # Same fix as above for the English translation.
        with open('quran_data/quran-en.json') as quran_en_json_file:
            quran_en_json = json.load(quran_en_json_file)

    for ayah_ar, ayah_en in zip(
            quran_ar_json["data"]["surahs"][current_surah_index]["ayahs"],
            quran_en_json["data"]["surahs"][current_surah_index]["ayahs"]):
        # Reshape + bidi-reorder the Arabic ayah for rendering.
        ayah_arabic_text = get_display(arabic_reshaper.reshape(ayah_ar["text"]))
        ayah_number = str(ayah_en["numberInSurah"])
        ayah_english_text = ayah_en["text"] + ' - Quran[' + str(
            surah_number) + ':' + ayah_number + ']'
        write_ayah_on_image(ayah_arabic_text, ayah_english_text,
                            str(surah_number), ayah_number)
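Usage sketch, assuming the quran_data JSON files (or the import_* download helpers) are available:

generate_wallpapers_by_surah_number(1)  # one wallpaper per ayah of surah 1 (Al-Fatiha)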
Example #4
def clean_text(text: str, context: str, stop_words) -> str:
    print("cleaning text")

    text = remove_unwanted_chars(text)
    if general_config["ARABIC_RESHAPER"]:
        text = get_display(arabic_reshaper.reshape(text))

    word_list = text.split(" ")
    general_cleaned = [clean_word(word) for word in word_list]

    custom_cleaned = general_cleaned

    if context == "twitter" or context == "data":
        custom_cleaned = [clean_twitter_word(word) for word in general_cleaned]

    elif context == "telegram":
        custom_cleaned = [
            clean_telegram_word(word) for word in general_cleaned
        ]

    if general_config["STOP_WORD_CLEAN"] == "manual":
        custom_cleaned = [
            word for word in custom_cleaned
            if not is_stop_word(word, stop_words)
        ]

    return " ".join(custom_cleaned)
Example #5
def draw_cloud(cleantweets, image_path, monthly=False, show_image=False):
    text = " ".join(str(tweet) for tweet in cleantweets)
    # Reshape + bidi-reorder before tokenizing so the tokens are already display-ready.
    text = get_display(arabic_reshaper.reshape(text))
    tokens = word_tokenize(text)
    dic = Counter(tokens)
    print(dic.most_common(max_words))
    twitter_mask = np.array(Image.open("twitter-logo.png"))
    font_path = select_a_font()
    words = max_words
    if monthly:
        words = max_words // 2
    wordcloud = WordCloud(font_path=font_path,
                          max_words=words,
                          margin=0,
                          width=5000,
                          height=5000,
                          min_font_size=4,
                          max_font_size=750,
                          background_color="white",
                          mask=twitter_mask)
    wordcloud.generate_from_frequencies(dic)

    image = wordcloud.to_image()
    wordcloud.to_file(image_path)
    if show_image:
        image.show()
    print(f"Generated image {image_path}")
Example #6
def reshape_words(words: Iterable) -> List[str]:
    """
    Make words suitable for display in the word cloud.

    We first join all the words into one string, reshape that string once,
    and finally split it again, because this is faster than reshaping each
    word separately.
    ATTENTION: words must not contain "\n", because that character is used
    as the separator; a word containing "\n" is treated as two words.
    :param words: an iterable of words
    :return: a list of display-ready words for the WordCloud
    """
    combined_words: str = "".join(x + "\n" for x in words)
    return get_display(arabic_reshaper.reshape(combined_words)).split("\n")
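A short usage sketch of the join-once/reshape-once/split-once trick the docstring describes; the sample words are arbitrary Persian words:

words = ["سلام", "دنیا"]          # two hypothetical Persian words
display_ready = reshape_words(words)
print(display_ready)              # two display-ready strings, same order as the input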
Example #7
def draw_graph(edgeList, user_id, file_name):
    # Reshape + bidi-reorder both endpoints of each edge so node labels render correctly.
    lis = []
    for i in edgeList:
        artext = get_display(arabic_reshaper.reshape(i[0]))
        artext1 = get_display(arabic_reshaper.reshape(i[1]))
        lis.append((artext, artext1))
    G = nx.OrderedMultiDiGraph()
    G.add_edges_from(lis)
    pos = nx.spring_layout(G, k=0.99, iterations=50)
    nx.draw_networkx_nodes(G, pos, node_color='#AED6F1', node_size=2500)
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(), edge_color='#95A5A6', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=20, font_family='Times New Roman')
    graph_path = os.path.join(BASE_DIR, 'static/media', 'graph')
    user_path = os.path.join(graph_path, str(user_id))
    os.makedirs(user_path, exist_ok=True)  # create graph/<user_id>/ if missing
    path = os.path.join(user_path, file_name)
    plt.tight_layout()
    plt.savefig(path+'.png', format="PNG")
    plt.show()
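Usage sketch with hypothetical edges; each pair is drawn as a directed edge between reshaped labels:

# edges = [("علی", "رضا"), ("رضا", "سارا")]
# draw_graph(edges, user_id=42, file_name="mentions")  # saves .../graph/42/mentions.png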
Example #8
def get_emb_matrix(word_index, max_features, embedding_file):
    embeddings_index = load_word_emb(word_index, embedding_file)
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Initialize from the embeddings' empirical distribution so rows for
    # out-of-vocabulary words look like plausible vectors.
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (max_features, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # Log out-of-vocabulary words in reshaped (readable) form.
            print(arabic_reshaper.reshape(word))

    return embedding_matrix
Example #9
def create_word_cloud(keywords, file_id):
    data = arabic_reshaper.reshape(keywords)  # join Arabic letter forms
    data = get_display(data)  # bidi-reorder for right-to-left display
    wordCloud = WordCloud(font_path='arial',
                          background_color='white',
                          mode='RGB',
                          width=2000,
                          height=1000).generate(data)
    plt.imshow(wordCloud)
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(
        os.path.join(BASE_DIR,
                     'static/media/wordCloud/' + str(file_id) + '.png'))
    plt.show()
    img_path = os.path.join('wordCloud', str(file_id) + '.png')
    return img_path
Example #10
    def generate(self, text):
        """Generate wordcloud from text.

        The input "text" is expected to be a natural text. If you pass a sorted
        list of words, words will appear in your output twice. To remove this
        duplication, set ``collocations=False``.

        Alias to generate_from_text.

        Calls process_text and generate_from_frequencies.

        Returns
        -------
        self
        """
        # Reshape Persian/Arabic letters and bidi-reorder before the normal pipeline.
        text = get_display(arabic_reshaper.reshape(text))
        return self.generate_from_text(text)
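Hedged usage, assuming this generate override lives on a WordCloud subclass such as PersianWordCloud:

# wc = PersianWordCloud(font_path="Vazir.ttf", background_color="white")
# wc.generate("یک متن فارسی نمونه").to_file("cloud.png")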
Example #11
def draw_cloud(cleantweets, image_path, show_image=False):
    # NOTE: frequencies come from the module-level `ngram` counter;
    # the `cleantweets` argument is not used in this snippet.
    top_words = dict()
    for key, value in Counter(ngram).most_common(max_words):
        top_words[get_display(arabic_reshaper.reshape(key))] = value
    twitter_mask = np.array(Image.open("twitter-logo.jpg"))
    font_path = select_a_font()
    wordcloud = WordCloud(
        font_path=font_path,
        max_words=max_words,
        margin=0,
        width=800,
        height=800,
        min_font_size=1,
        max_font_size=500,
        background_color="white",
        mask=twitter_mask
    )
    wordcloud.generate_from_frequencies(top_words)

    image = wordcloud.to_image()
    wordcloud.to_file(image_path)
    if show_image:
        image.show()
    print(f"Generated image {image_path}")
Example #12
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from arabic_reshaper import arabic_reshaper
from bidi.algorithm import get_display

file = open('txt-per.txt', encoding="utf-8")
reader = file.read()
# Reshape and bidi-reorder exactly once; re-reshaping already-shaped text corrupts it.
text = get_display(arabic_reshaper.reshape(reader))

stopwords = set(STOPWORDS)
stopwords.add("RT")
stopwords.add("text")
stopwords.add("https")
stopwords.add("co")

cloud = WordCloud(font_path='/Users/alisalehi/Library/Fonts/XNazanin.TTF',
                  stopwords=stopwords).generate(text)

image = cloud.to_image()
image.show()

#plt.imshow(cloud)
#plt.show()
Example #13
def normalize_hebrew(raw_text: str):
    # Hebrew letters have no contextual joining forms, so reshape() is
    # effectively a no-op here; get_display() does the RTL reordering.
    return get_display(arabic_reshaper.reshape(raw_text))
Example #14
        text = text.replace(eng, "")

    cleared_text = " ".join(text.split())  # collapse extra whitespace

    return cleared_text


print("\n ****** \n Mate it's not stuck! just wait a little! \n ****** \n")
cloud = ""
for status in tweets:
    text = status.text
    text = preprocessing(text)
    if text:
        cloud = cloud + text

from arabic_reshaper import arabic_reshaper
from bidi.algorithm import get_display
# Reshape and bidi-reorder once; reshaping twice corrupts the text.
cloudfinal = get_display(arabic_reshaper.reshape(cloud))

print(
    "\n ****** \n Press enter to continue if it's still stuck :))) \n ****** \n"
)

cloudwords = WordCloud(font_path="font.ttf",
                       background_color="white").generate(cloudfinal)
plt.imshow(cloudwords, interpolation="bilinear")
plt.axis('off')
plt.show()
savename = input("gimme the file name to save! : ")
cloudwords.to_file(savename + ".png")
Example #15
def add_stop_words(words):
    for word in words:
        words_reshape = get_display(arabic_reshaper.reshape(word))
        STOPWORDS.add(words_reshape)
    return STOPWORDS  # return after the loop, not inside it, so every word is added
Example #16
    def remove_ar(text):
        dic = {
            'ك': 'ک',
            'دِ': 'د',
            'بِ': 'ب',
            'زِ': 'ز',
            'ذِ': 'ذ',
            'شِ': 'ش',
            'سِ': 'س',
            'ى': 'ی',
            'ي': 'ی'
        }
        pattern = "|".join(map(re.escape, dic.keys()))
        return re.sub(pattern, lambda m: dic[m.group()], text)


item1 = itemgetter(1)

FONT_PATH = os.environ.get(
    "FONT_PATH",
    os.path.join(os.path.dirname(__file__), "fonts/Vazir-Light.ttf"))
with open(os.path.join(os.path.dirname(__file__), 'stopwords'),
          encoding='utf-8') as f:
    stop_words_reshape = get_display(arabic_reshaper.reshape(f.read()))
STOPWORDS = set(x.strip() for x in stop_words_reshape.split('\n'))


def add_stop_words(words):
    for word in words:
        words_reshape = get_display(arabic_reshaper.reshape(word))
        STOPWORDS.add(words_reshape)
    return STOPWORDS  # return after the loop so every word is added
Example #17
text = open(r'C:\TECHNICAL\MISC\Python\Firdoos.txt','r',encoding='utf8').read()


# Calling function to remove punctuations using unicode codepoints starting with 'P'
#text = remove_punct(text)

# Tokenize the text into words
# words = nltk.word_tokenize(text)  # this method no longer works in this setup
words = nltk.tokenize.wordpunct_tokenize(text)

# Remove stop words from text
text = remove_stopwords(words)

# Reconstruct Arabic/Urdu letter forms for renderers that lack Arabic-script shaping.
from arabic_reshaper import arabic_reshaper
text = arabic_reshaper.reshape(text)

# For right-to-left rendering, apply get_display from python-bidi
# (install with: pip install python-bidi).
from bidi.algorithm import get_display
text = get_display(text)  # the text was already reshaped above

# Generate a word cloud image
fontpath = r"C:\Users\Humera\AppData\Local\Microsoft\Windows\Fonts\urdu-najd-regular-1.ttf"
wordcloud =  WordCloud(font_path=fontpath).generate(text)

# Display the generated image: the matplotlib way
import matplotlib.pyplot as plt

#matplotlib.rc('font', family='Tahoma')
plt.imshow(wordcloud.recolor(random_state=2017))
plt.show()
Example #18
    result = [word for word in text2 if word not in arabic_words]

    final_result = ' '.join(result)

    return final_result


with open('LamaAlherbish_tweets.json') as f:
    data_file = json.load(f)
    for i in data_file:
        cleaned_text = clean_Text(str=i['text'])
        tweets_data.append(cleaned_text.split())
    # Flatten the per-tweet word lists and keep the 20 most frequent words.
    tweets_data2 = list(chain.from_iterable(tweets_data))
    twenty_most_commom_words_with_frequency = Counter(tweets_data2).most_common(20)
    for word in twenty_most_commom_words_with_frequency:
        print(word[0])
        twenty_most_commom_words.append(word[0])


# Convert the word list to a single string, then reshape + bidi-reorder it.
unique_string = " ".join(twenty_most_commom_words)
reshaped_texts = get_display(arabic_reshaper.reshape(unique_string))
wordcloud = WordCloud(font_path='Fonts/Supplemental/Damascus.ttc', width=700, height=300, background_color="white").generate(reshaped_texts)
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.savefig('LamaAlherbish_wordCloud.png')
Example #19
    def remove_ar(text):
        # Map Arabic code points (e.g. 'ك', 'ي') to their Persian equivalents
        # so spelling variants collapse to a single form.
        dic = {
            'ك': 'ک',
            'دِ': 'د',
            'بِ': 'ب',
            'زِ': 'ز',
            'ذِ': 'ذ',
            'شِ': 'ش',
            'سِ': 'س',
            'ى': 'ی',
            'ي': 'ی'
        }
        pattern = "|".join(map(re.escape, dic.keys()))
        return re.sub(pattern, lambda m: dic[m.group()], text)


item1 = itemgetter(1)

FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__),
                                                     "fonts/Vazir-Light.ttf"))
with codecs.open(os.path.join(os.path.dirname(__file__), 'stopwords'),
                 encoding='utf-8') as f:
    stop_words_reshape = get_display(arabic_reshaper.reshape(f.read()))
STOPWORDS = set(x.strip() for x in stop_words_reshape.split('\n'))


def add_stop_words(words):
    for word in words:
        words_reshape = get_display(arabic_reshaper.reshape(word))
        STOPWORDS.add(words_reshape)
    return STOPWORDS
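Usage sketch: extend the module-level STOPWORDS before building a cloud (the words below are hypothetical):

# stopwords = add_stop_words(["واژه", "دیگر"])
# wc = PersianWordCloud(stopwords=stopwords, font_path=FONT_PATH)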