def generateWordcloud(cleanedTweets):
    # join tweets to a single string
    words = ' '.join(cleanedTweets['text'])

    # remove URLs, RTs, and twitter handles
    no_urls_no_tags = " ".join([
        word for word in words.split()
        if 'http' not in word and not word.startswith('@') and word != 'RT'
    ])
    stopwords = set(STOPWORDS)
    stopwords.add("SAP")

    wc = WordCloud(font_path='CabinSketch-Bold.ttf',
                   background_color="white",
                   max_words=30,
                   width=600,
                   height=300,
                   stopwords=stopwords).generate(no_urls_no_tags)

    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wc)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

    plt.imshow(wc)
    #plt.axis('off')
    plt.savefig('my_twitter_wordcloud_1.png', dpi=300)
    plt.show()
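A minimal driver for the snippet above, assuming the usual imports and a pandas DataFrame with a 'text' column (the DataFrame contents here are illustrative, not from the original):

# Hypothetical setup for generateWordcloud(); the snippet assumes these names exist
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

tweets = pd.DataFrame({'text': ["RT @user SAP keynote https://example.com", "great SAP demo today"]})
generateWordcloud(tweets)  # also requires CabinSketch-Bold.ttf next to the script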
Example #2
def word_cloud(lst):

    string = " ".join(lst)

    def grey_color_func(word,
                        font_size,
                        position,
                        orientation,
                        random_state=None,
                        **kwargs):
        return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)

    mask = np.array(Image.open("twitter_mask.png"))

    stopwords = set(STOPWORDS)
    stopwords.add("int")
    stopwords.add("ext")

    wc = WordCloud(max_words=75,
                   mask=mask,
                   stopwords=stopwords,
                   margin=10,
                   random_state=1).generate(string)

    plt.figure()
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3),
               interpolation="bilinear")
    plt.axis("off")
    return plt.show()
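The word_cloud() helper above relies on several module-level names and a mask image that are not shown; a sketch of what it appears to assume:

# Assumed imports for word_cloud() (not part of the original snippet)
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# word_cloud(["first tweet text", "second tweet text"])  # also needs twitter_mask.png on disk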
Example #3
def clean_text(texts):  # list of strings
    import re
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize  # word_tokenize is used below but was not imported
    ps = PorterStemmer()

    stopwords = set_stopwords()
    stopwords.add('ext')
    stopwords.add('int')
    res = []
    for text in texts:
        text = text.lower()

        text = re.sub("\'ve", " have ", text)
        text = re.sub("\'re", " are ", text)
        text = re.sub("n't", " not ", text)
        text = re.sub("\'ll", " will ", text)
        text = re.sub("cut to", " ", text)
        text = re.sub("scene shifts", " ", text)
        text = re.sub("scene", " ", text)

        text = re.sub("[^A-Za-z\n]", " ", text)
        text = re.sub("\n", " ", text)

        text = ' '.join([
            ps.stem(w) for w in word_tokenize(text)
            if (w not in stopwords) and (len(w) > 1)
        ])
        if text != '':
            res.append(text)

    return res
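clean_text() also depends on a project-specific set_stopwords() helper that is not shown here; a minimal stand-in, assuming it simply wraps NLTK's English stop word list:

# Hypothetical stand-in for the set_stopwords() helper used by clean_text()
from nltk.corpus import stopwords as nltk_stopwords

def set_stopwords():
    # assumption: return NLTK's English stop words as a set
    return set(nltk_stopwords.words('english'))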
    def process(self, inputs: ValueMap, outputs: ValueMap):

        stopwords = set()
        _languages = inputs.get_value_obj("languages")

        if _languages.is_set:
            all_stopwords = get_stopwords()
            languages: ListModel = _languages.data

            for language in languages.list_data:

                if language not in all_stopwords.fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                    )
                stopwords.update(get_stopwords().words(language))

        _stopword_lists = inputs.get_value_obj("stopword_lists")
        if _stopword_lists.is_set:
            stopword_lists: ListModel = _stopword_lists.data
            for stopword_list in stopword_lists.list_data:
                if isinstance(stopword_list, str):
                    stopwords.add(stopword_list)
                else:
                    stopwords.update(stopword_list)

        outputs.set_value("stopwords_list", sorted(stopwords))
    def _compute_frequencies(self, word_sent, freq, customStopWords=None):
        #         print word_sent
        #         freq = collections.defaultdict(int)
        # defaultdict is a standard dictionary
        if customStopWords is None:
            stopwords = set(self._stopwords)
        else:
            stopwords = set(customStopWords).union(self._stopwords)
        newStopWords = ['new', 'york', 'times', 'washington', 'post', '-']
        for newWord in newStopWords:
            stopwords.add(newWord)
        for sentence in word_sent:
            for word in sentence:
                if word not in stopwords:
                    freq[word] += 1
                    print(word + "\t" + str(freq) + "\n\n")


#         flattened = [val for sublist in collectAll for val in sublist]
#         flattenedAgain = [val for sublist in flattened for val in sublist]
#         print flattenedAgain

        m = float(max(freq.values()))
        for word in list(freq.keys()):  # copy the keys so entries can be deleted while iterating
            freq[word] = freq[word] / m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq
def get_term_lines(path):
    with open(path, 'r') as file:
        lines = list()
        line_start = 0
        for line in file:
            lines.append((line, line_start))
            line_start += len(line)

    term_lines = list()
    from nltk.corpus import stopwords
    #from nltk.tokenize import word_tokenize
    stopwords = set(stopwords.words('english'))
    stopwords.add("able")
    stopwords.add("other")
    stopwords.add("another")
    stopwords.add("whether")
    stopwords.add(",")
    for text, position in lines:
        term_line = list()
        text = text.lower()
        words = [word for word in text[0:-1].replace(",", " ,").split(" ")]
        for word in words:
            if word in stopwords:
                term_line.append(Term(word, -1))
            if word not in [term.text for term in term_line]:
                term_line.append(Term(word, text.find(word) + position))
        #print([term.text for term in term_line])
        term_lines.append(term_line)
    return term_lines
def get_stopwords():
    stop_words_sp = set(stopwords.words('spanish'))
    stop_words_en = set(stopwords.words('english'))
    # rebinding the name "stopwords" here would shadow the nltk corpus module
    # for the whole function and make the calls above raise UnboundLocalError,
    # so the combined set gets its own name
    all_stopwords = stop_words_sp | stop_words_en
    all_stopwords.add('para')
    spanish_stemmer = SnowballStemmer('spanish')
    return set(map(spanish_stemmer.stem, all_stopwords))
Example #8
def getStopWords(path):
    stopwords = set()
    with open(path, "r") as f:
        lines = f.readlines()
    for line in lines:
        stopwords.add(line.replace("\r\n", "").rstrip())
    return stopwords
Example #9
def load_stopwords(file):
    stopwords = set()
    with open(file, "r") as input_file:
        for line in input_file:
            if not line.strip():
                continue
            words = line.split()
            stopwords.add(words[0])
    return stopwords
Example #10
def tokenizeText(sample):
    doc = nlp(sample)
    stopwords = set(STOPLIST)
    stopwords.add("queryset")
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    a_lemmas = [
        lemma for lemma in lemmas if (lemma.isalpha() and lemma != '-PRON-')
        and lemma not in stopwords and lemma not in SYMBOLS
    ]
    return a_lemmas
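tokenizeText() expects a spaCy pipeline bound to nlp plus STOPLIST and SYMBOLS constants (the '-PRON-' lemma suggests spaCy 2.x); one plausible setup, stated as an assumption rather than the original configuration:

# Hypothetical setup for tokenizeText(); these bindings are not part of the original snippet
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as STOPLIST  # assumed source of STOPLIST

nlp = spacy.load("en_core_web_sm")  # any English pipeline will do
SYMBOLS = set(string.punctuation)   # assumed definition of SYMBOLS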
Example #11
def getStopWords():
    """
    返回stopwords_file_path给定的stop words
    :return:
    """
    stopwords = set()
    with open(stopwords_file_path) as file:
        for line in file:
            stopwords.add(line.strip())
    return stopwords
Example #12
def load_twitter_stopwords(file):
    stopwords = set()
    with open(file, "r") as input_file:
        for line in input_file:
            if not line.strip():
                continue
            words = line.split(",")
            for word in words:
                stopwords.add(word.strip())  # strip the whitespace/newline left on the split tokens
    return stopwords
def read_stopwords():
    stopwords = set()
    f = open('stopwords.txt')
    for line in f:
        word = line.strip()
        if not word:
            continue
        stopwords.add(word)
    f.close()
    return stopwords
    def obter_palavras(self):
        from nltk.corpus import stopwords

        palavras = word_tokenize(self.texto_artigo.lower())
        stopwords = set(list(punctuation))
        stopwords.add('“')
        stopwords.add('”')
        palavras = [
            palavra for palavra in palavras if palavra not in stopwords
        ]
        return palavras
Example #15
def create_wordcloud(text):
    from wordcloud import WordCloud, STOPWORDS
    stopwords = set(STOPWORDS)
    additional_stopwords = [
        'one', 'see', 'yes', 'really', 'yeah', 'maybe', 'say', 'know', 'think',
        'well', 'lot', 'make', 'will', 'also', 'don', 'going', 'go',
        'something', 'everything'
    ]
    for new_word in additional_stopwords:
        stopwords.add(new_word)
    wc = WordCloud(stopwords=stopwords).generate(text)  # use the imported WordCloud class directly
    return wc
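A possible call site for create_wordcloud(), assuming a plain-text file to visualize (the filename is illustrative):

# Hypothetical usage: render the returned WordCloud to disk
with open("transcript.txt", encoding="utf-8") as f:  # transcript.txt is an assumed input file
    wc = create_wordcloud(f.read())
wc.to_file("cloud.png")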
Example #16
    def read_stopwords_in_set(self):
        stopwords = set()
        try:
            with open(self.stop_words_file_path, "r") as file:
                # the stopwords file must contain a single stop word per line
                line = file.readline()
                while line:
                    stopwords.add(line.strip())
                    line = file.readline()
        except FileNotFoundError as fe:
            print(f"Stopwords file not found! {fe}")
        return stopwords
    def obter_palavras_sem_stopwords(self):

        from nltk.corpus import stopwords

        palavras = word_tokenize(self.obter_texto_artigos())
        stopwords = set(stopwords.words('portuguese') + list(punctuation))
        stopwords.add('“')
        stopwords.add('”')
        self.palavras_sem_stopwords = [
            palavra for palavra in palavras if palavra not in stopwords
        ]
        return self.palavras_sem_stopwords
Example #18
    def remove_stopwords(self, cleaned_tweets, searchTerm):
        stopwords = set(STOPWORDS)
        stopwords.add(searchTerm)

        for tweetId in cleaned_tweets:
            cleanTweet = ''
            tokens = cleaned_tweets[tweetId].split()
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()
                if tokens[i] not in stopwords:
                    cleanTweet += tokens[i] + " "
            cleaned_tweets[tweetId] = cleanTweet
        return cleaned_tweets
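An illustrative call to remove_stopwords(), assuming cleaned_tweets maps tweet ids to already-cleaned text (note that the search term should be lowercase, since tokens are lowercased before the comparison):

# Hypothetical usage of remove_stopwords(); the dict contents are made up
# cleaned = {"1": "SAP announces new cloud product", "2": "great keynote at the SAP event"}
# cleaned = analyzer.remove_stopwords(cleaned, "sap")  # 'analyzer' stands in for whatever object owns this method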
    def show_img(self, board, limit_num):
        reddit = praw.Reddit(client_id='EwAVjgascYrGIg',
                             client_secret='Z7HahaiGdEKl3e57vml1VkC0pVc',
                             user_agent='hunghunghung1231')

        subreddit = reddit.subreddit(board)  # which subreddit to search
        # subreddit.top() would grab the most up-voted topics of all time;
        # here the hottest posts are taken instead, limited to limit_num (how many posts the user wants to crawl)
        top_subreddit = subreddit.hot(limit=limit_num)
        # for submission in subreddit.top(limit=limit_num):
        #     print(submission.title, submission.id)
        topics_dict = {
            "title": [],
            "score": [],
            "id": [],
            "url": [],
            "comms_num": [],
            "created": [],
            "body": []
        }
        for submission in top_subreddit:
            topics_dict["title"].append(submission.title)
            topics_dict["score"].append(submission.score)
            topics_dict["id"].append(submission.id)
            topics_dict["url"].append(submission.url)
            topics_dict["comms_num"].append(submission.num_comments)
            topics_dict["created"].append(submission.created)
            topics_dict["body"].append(submission.selftext)
        topics_data = pd.DataFrame(topics_dict)

        def get_date(created):
            return dt.datetime.fromtimestamp(created)

        _timestamp = topics_data["created"].apply(get_date)
        topics_data = topics_data.assign(timestamp=_timestamp)
        comment_string = ''  # accumulate every post title into one string for the word cloud
        word_list = []  # note: unused in this method
        for word in topics_data['title']:
            comment_string += word

        stopwords = set(stopwords_l)
        stopwords.add('https')
        stopwords.add('gif')
        wc = WordCloud(height=500,
                       width=1000,
                       background_color='white',
                       stopwords=stopwords).generate(comment_string)
        img = wc.to_file('img.png')
        plt.imshow(img)
        plt.axis("off")
        plt.show()
def remove_stop(reviews):
    new_reviews = {}
    stopwords = set()
    reader = csv.reader(open('snowball_stopwords.txt', 'r'))
    for row in reader:
        stopwords.add(row[0])

    for review in reviews:
        new_reviews[review] = []
        for word in reviews[review]:  # iterate the review's own words, not the dict of reviews
            if word not in stopwords:
                new_reviews[review].append(word)
        print(new_reviews[review])
    return new_reviews
Example #22
def load_stopwords(path):
    """
    This function loads a stopword list from the *path* file and returns a 
    set of words. Lines begining by '#' are ignored.
    """

    # Set of stopwords
    stopwords = set([])

    # For each line in the file
    for line in codecs.open(path, 'r', 'utf-8'):
        if not re.search('^#', line) and len(line.strip()) > 0:
            stopwords.add(line.strip().lower())

    # Return the set of stopwords
    return stopwords
    def __init__(self, fname, verbose):
        #fixme: check format of supported file
        self.stories = []

        # collect stop words
        stopwords = set()
        fin = open(
            '/Users/msingh/cs221/project/mctDataSet/mctest-master/data/stopwords.txt',
            'r')
        for stopword in fin:
            s = stopword.lower().strip()
            stopwords.add(s)
        fin.close()

        # process stories
        fin = open(fname, 'r')
        if verbose > 0:
            print('Reading file %s: START' % fname)
        for story in fin:
            story = story.strip()
            s = Story(stopwords)
            data = re.split('\t', story)
            s.setName(data[0])
            if verbose > 4:
                print('Reading story %s' % data[0])
            properties = re.split(';', data[1])
            for p in properties:
                (name, v) = re.split(': ', p)
                if name == 'Author':
                    if verbose > 4:
                        print('Setting Author %s for story %s' % (v, data[0]))
                    s.setAuthor(v)
                    continue
                if name == 'Work Time(s)' or name == 'Work Time':
                    s.setTime(v)
                continue
            s.setStory(data[2])
            index = 3
            while True:
                s.setQuestion(data[index], data[index + 1:index + 5])
                index += 5
                if index + 5 > len(data):
                    break
            self.stories.append(s)
        fin.close()
        if verbose > 0:
            print('Reading file %s: DONE' % fname)
Example #24
    def show_wordcloud(self,
                       data,
                       title='High',
                       size=5,
                       name='wc',
                       file='X',
                       makefile=False,
                       show=True):

        stopwords = set(STOPWORDS)
        stopwords.add('shoe')

        if file != 'X':
            masking = np.array(Image.open(file))  # open the mask image passed in; 'X' is the "no mask" sentinel

            wordcloud = WordCloud(background_color='white',
                                  stopwords=stopwords,
                                  max_words=100,
                                  max_font_size=70,
                                  scale=1,
                                  random_state=2,
                                  mask=masking).generate(data)

        else:
            wordcloud = WordCloud(background_color='white',
                                  stopwords=stopwords,
                                  max_words=100,
                                  max_font_size=70,
                                  scale=1,
                                  random_state=2).generate(data)

        if show == True:
            fig = plt.figure(1, figsize=(5 + size, 5 + size))
            plt.axis('off')
            if title:
                fig.suptitle(title, fontsize=20)
                fig.subplots_adjust(top=2.3)

            plt.imshow(wordcloud)
            plt.show()

        if makefile == True:
            nn = '{}.jpg'.format(name)
            wordcloud.to_file(nn)
Example #25
def nube_de_palabras(text):
    stopwords = set(STOPWORDS)
    stopwords.add("queryset")
    stopwords.add("'")

    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    image = io.BytesIO()
    plt.savefig(image, format='png')
    image.seek(0)  # rewind the data
    string = base64.b64encode(image.read())

    image_64 = 'data:image/png;base64,' + urllib.parse.quote(string)
    return image_64
Example #26
def fnCreate_WordCloud(i, text):
    if len(text) > 5:
        stopwords = set(STOPWORDS)
        stopwords.add("said")

        d = path.dirname(__file__)
        parent_dir = path.abspath(d + "/../")
        d = path.join(parent_dir, r'nlp\static\images\img')

        # lower max_font_size
        wordcloud = WordCloud(background_color="white",
                              max_font_size=45,
                              stopwords=stopwords).generate(text)

        ## The pil way (if you don't have matplotlib)
        image = wordcloud.to_image()

        # store default colored image
        filename = str(i) + '.png'
        image.save(path.join(d, filename), "PNG")
Example #27
    def write_crawl_results(self, the_path, my_query, the_cnt_in):
        #let use fetch_urls to get URLs then pass to the my_scraper function
        import pandas as pd
        #import re
        from nltk.stem import PorterStemmer
        from nltk.corpus import stopwords
        stopwords = set(stopwords.words('english'))
        stopwords.add('e')
        my_stem = PorterStemmer()
        # guess loop needs to be here
        query_df = pd.DataFrame()
        for q in my_query:
            the_urls_list = self.fetch_urls(q, the_cnt_in)
            #print(the_urls_list)
            for word in the_urls_list:
                tmp_txt = self.my_scraper(word)
                if len(tmp_txt) != 0:
                    try:
                        body_stem = [my_stem.stem(word) for word in tmp_txt]
                        # DataFrame.append was removed in pandas 2.0; build the row and concat instead
                        row = pd.DataFrame([{
                            'body basic': tmp_txt,
                            'body stem': body_stem,
                            'label': q
                        }])
                        query_df = pd.concat([query_df, row], ignore_index=True)
                    except Exception:
                        pass

        print(query_df)
def create_wordcloud(yt_comments):
    # Prepare Data for wordcloud
    current_time = time.time()
    image_path = 'static/wordcloud_images/wordcloud' + str(
        current_time) + '.png'
    comment_words = ""
    for words in yt_comments:
        comment_words = comment_words + words + ' '

    # adding movie script specific stopwords
    stopwords = set(STOPWORDS)
    stopwords.add("movie")
    stopwords.add("film")
    stopwords.add("trailer")

    wordcloud = WordCloud(width=400,
                          height=400,
                          background_color='cyan',
                          stopwords=stopwords,
                          min_font_size=10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    # plt.tight_layout(pad=0)
    plt.savefig(image_path)
    return image_path
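A possible call, assuming the static/wordcloud_images/ directory already exists relative to the working directory:

# Hypothetical usage: the comments list is made up
# image_path = create_wordcloud(["great trailer", "cannot wait for this movie"])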
def generate_wordcloud(source, keyword, text):
    T = Tokenizer()
    stopwords = set(STOPWORDS)
    stopwords.add(keyword.split('@')[-1])
    for w in T.stopwords:
        stopwords.add(w)

    wc_path = ""
    try:
        # Generate a word cloud image
        wordcloud = WordCloud(background_color="white",
                              max_words=50,
                              width=400,
                              height=400,
                              stopwords=stopwords).generate(text)

        wc_path = os.path.join(wordcloud_dir, "%s_%s.jpg" % (source, keyword))
        wordcloud.to_file(wc_path)
    except Exception as ex:
        print("generate_wordcloud", str(ex))

    return wc_path
Example #30
from __future__ import division
import json
import nltk
from sklearn.feature_extraction import DictVectorizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords.add('DT')
stopwords.add('CD')
import sys
import os
nltk.data.path.append(os.getcwd() + "/../downloads/nltk_data/")
sys.path.append(os.getcwd() + "/Code/")
import string
import re
import subprocess as sp
import numpy as np
from informative_prior_logistic_regresssion_sw import InformativePriorLogisticRegressionWeight
from eval.entity_level_evaluation import load_gold
from eval.entity_level_evaluation import load_dictionary
import operator
import matplotlib.pyplot as plt
"""
Model to train 
"""


def train_informative_logreg(_X, _Q, w0, b0, C=5):
    """
    _X: size (Nmentions by Nfeat)
    _Q: vector length Nmentions. for each, P(z_i=1|x, y)
    model: for example a LogisticRegression object.
Example #31
print "step 2"
# words = []
# words_count = []
#for f in filtered_words:
#  for t in f :
#    temp = t.lower()
#    if temp != 'much' and temp != 'last' and temp != 'next' and temp != 'green' :
#      if pos_tag(word_tokenize(temp))[0][1] == 'JJ'  :
#            text += " "
#            text += temp

print "step 3"

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "names/MG1.png")))

stopwords = set(STOPWORDS)
stopwords.add("said")

wc = WordCloud(background_color="white",
               max_words=3000,
               mask=alice_mask,
               stopwords=stopwords)
# generate word cloud
wc.generate(show_text)

# store to file
wc.to_file(path.join(d, "Mcgee.png"))
Example #32
    top_three = np.argsort(W[idx])[::-1][:3]
    return top_three


if __name__ == '__main__':
    content = get_content()

    stopwords = set(stopwords.words('english'))
    for word in [
            'also', 'would', 'could', 'saw', 'report', 'bfro', 'like', 'said',
            'YEAR', 'SEASON', 'MONTH', 'STATE', 'COUNTY', 'LOCATION',
            'DETAILS', 'TOWN', 'NEAREST', 'ROAD', 'OBSERVED', 'NOTICED',
            'OTHER', 'WITNESSES', 'STORIES', 'TIME', 'CONDITIONS',
            'ENVIRONMENT'
    ]:
        stopwords.add(word)
    vectorizer = CountVectorizer(stop_words=stopwords)
    td_mat = vectorizer.fit_transform(content)
    V = td_mat.toarray()
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
    k = 5

    nmf = NMF(n_components=k)
    nmf.fit(V)
    W = nmf.transform(V)
    H = nmf.components_
    err = nmf.reconstruction_err_

    #top words per topic
    top_words = get_top_words(H, feature_names)
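get_top_words() is not shown in full here (only a fragment appears above); for NMF topic models it is commonly implemented along these lines, offered as an assumption rather than the original code (it reuses the numpy import the snippet already relies on):

# Hypothetical implementation of get_top_words(H, feature_names)
def get_top_words(H, feature_names, n_top=10):
    # for each NMF component, return its n_top highest-weighted vocabulary terms
    return [[feature_names[i] for i in np.argsort(topic)[::-1][:n_top]]
            for topic in H]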
Example #33
    wordcloud = WordCloud(background_color=None, mode='RGBA',
                          random_state=42).generate_from_frequencies(pairs)
    print(counts)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(filename, bbox_inches='tight')


def tokenize_text(stopwords, text):
    ''' split text, stem word, remove stopwords '''
    lemma = WordNetLemmatizer()

    tokenizer = RegexpTokenizer(r'\w+')
    return [lemma.lemmatize(w.lower()) for w in tokenizer.tokenize(text)
            if w.lower() not in stopwords]


if __name__ == '__main__':

    stopwords = set(stopwords.words('english'))
    stopwords.add('using')
    stopwords.add('dis')

    new_tokenizer = partial(tokenize_text, stopwords)
    fin, fout = 'paper-titles.txt', 'wordcloud.png'

    with open(fin) as f:
        titles = [new_tokenizer(line) for line in f]
        make_wordcloud(fout, titles)
Example #34
def load_stopwords(path):
    stopwords = set()
    with open(path, "r") as f:
        for line in f:
            stopwords.add(line.rstrip())
    return stopwords
Example #35
    para = re.sub('<footref.*?/>', '', para)
    para = re.sub('</?signpost>', '', para)
    para = re.sub('</?description>', '', para)
    para = re.sub('</?blockquote>', '', para)
    para = re.sub('</?bookref.*?>', '', para)
    return textwrap.wrap(para, width)

sections = OrderedDict()
custom = json.load(open('fotw_custom.json'))
parser = etree.XMLParser(resolve_entities=False)
tree = etree.parse('fotw.xml', parser=parser)
root = tree.getroot()

stopwords = set(stopwords.words('english'))
for w in ['turn', 'wish', 'want', 'turning', 'rather', 'would', 'along', 'upon', 'another']:
    stopwords.add(w)

for sect_elem in root.findall('.//section[@class="numbered"]')[1:]:
#for sect_elem in root.findall('.//section[@id="sect%s"]' % 181):
    sect_id = sect_elem.find('.//title').text
    sect_paras = []
    choices = []
    combat = {}
    enemies = []
    rnt_found = False
    ac_found = False
    stats_found = False
    undead_found = False
    sommerswerd_found = False
    immune_to_mindblast_found = False
    illustration_found = False