Code Example #1
def checkStopWordList(word, TOPIC):
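    # relies on a module-level `stopwords` set; the topic's own words are added
    # so they are treated as stop words too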
    common_words = [s.lower() for s in TOPIC.split()]
    stopwords.update(common_words)
    return word.lower() not in stopwords
Code Example #2
    def preprocess(self):

        preprocessed_docs_tmp = self.documents
        preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp]
        preprocessed_docs_tmp = [
            doc.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for doc in preprocessed_docs_tmp
        ]
        # copy so the instance's stop-word set is not mutated in place
        stopwords = set(self.stopwords)
        stopwords.update(self.new_stopwords)
        preprocessed_docs_tmp = [
            ' '.join(
                [w for w in doc.split() if len(w) > 0 and w not in stopwords])
            for doc in preprocessed_docs_tmp
        ]

        vectorizer = CountVectorizer(max_features=self.vocabulary_size,
                                     token_pattern=r'\b[a-zA-Z]{2,}\b')
        vectorizer.fit_transform(preprocessed_docs_tmp)
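        # note: on scikit-learn >= 1.2, get_feature_names() is replaced by get_feature_names_out()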
        vocabulary = set(vectorizer.get_feature_names())
        preprocessed_docs_tmp = [
            ' '.join([w for w in doc.split() if w in vocabulary])
            for doc in preprocessed_docs_tmp
        ]

        preprocessed_docs, unpreprocessed_docs = [], []
        for i, doc in enumerate(preprocessed_docs_tmp):
            if len(doc) > 0:
                preprocessed_docs.append(doc)
                unpreprocessed_docs.append(self.documents[i])

        return preprocessed_docs, unpreprocessed_docs, list(vocabulary)
Code Example #3
    def process(self, inputs: ValueMap, outputs: ValueMap):

        stopwords = set()
        _languages = inputs.get_value_obj("languages")

        if _languages.is_set:
            all_stopwords = get_stopwords()
            languages: ListModel = _languages.data

            for language in languages.list_data:

                if language not in all_stopwords.fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                    )
                stopwords.update(all_stopwords.words(language))

        _stopword_lists = inputs.get_value_obj("stopword_lists")
        if _stopword_lists.is_set:
            stopword_lists: ListModel = _stopword_lists.data
            for stopword_list in stopword_lists.list_data:
                if isinstance(stopword_list, str):
                    stopwords.add(stopword_list)
                else:
                    stopwords.update(stopword_list)

        outputs.set_value("stopwords_list", sorted(stopwords))
Code Example #4
def parse_corpus(corpus):
    
    import nltk
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    #from nltk.tag.stanford import StanfordNERTagger
    from nltk.tag import pos_tag
    import string
    import itertools
    
    """
    commonWords = ['the', 'of', 'and', 'a', 'to', 'in', 'is', 'you', 'that', 'it', 'he', 'was', 'for', 'on', 'are', 'as', 'with', 'his', 'they', 'I', 'at', 'be', 'this', 'have', 'from', 'or', 'one', 'had', 'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when'\
, 'your', 'can', 'said', 'there', 'use', 'an', 'each', 'which', 'she', 'do', 'how', 'their', 'if', 'will', 'up', 'other', 'about', 'out', 'many', 'then', 'them', 'these', 'so', 'some', 'her', 'would', 'make', 'like', 'him', 'into', 'time', 'has', 'look', 'two', 'mo\
re', 'write', 'go', 'see', 'number', 'no', 'way', 'could', 'people', 'my', 'than', 'first', 'water', 'been', 'call', 'who', 'oil', 'its', 'now', 'find', 'long', 'down', 'day', 'did', 'get', 'come', 'made', 'may', 'part']
    listOfCharToExclude = ['.', ',', ':', '"', '+', '!', '?', '/', "'", '*', '(', ')', '$', '@', '&', '*',']','[']
    """
    stopwords = set(stopwords.words('english'))
    stopwords.update(string.punctuation)
    stopwords.update([p[0] + p[1] for p in itertools.product(string.punctuation, string.punctuation)])
    #stopwords.update(commonWords)
    #stopwords.update(listOfCharToExclude)
    
    #st = StanfordNERTagger('/home/orange63/TextMining/Project2/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                       '/home/orange63/TextMining/Project2/stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    porter = PorterStemmer()
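    # tokenize, drop stop words and punctuation, keep only nouns (NN/NNP), then stem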
    
    corpus_token = wordpunct_tokenize(corpus)
    corpus_token = [word for word in corpus_token if word not in stopwords]
    #corpus_no_people = [p[0] for p in filter(lambda x: x[1] != 'PERSON', st.tag(corpus_token))]
    corpus_NN = [p[0] for p in filter(lambda x: (x[1] == 'NN') or (x[1] == 'NNP'), pos_tag(corpus_token))]
    corpus_stem = [porter.stem(word) for word in corpus_NN]
    
    return ' '.join(corpus_stem)
Code Example #5
def make_chunk(text):
    import nltk
    from nltk.corpus import stopwords
    from string import punctuation
    stopwords = set(stopwords.words('english') + list(punctuation))
    stopwords.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '”', '“', "Advertisement"
    ])

    print(text)
    f_tokens = []
    #text = remove_stopwords(text)
    #text = normalise(text)
    tokens = nltk.tokenize.word_tokenize(text)
    for w in tokens:
        if w not in stopwords:
            f_tokens.append(w)

    pos = nltk.pos_tag(f_tokens)
    # chunkGram is a chunk grammar string defined elsewhere in the project
    chunk_parser = nltk.RegexpParser(chunkGram)
    chunked = chunk_parser.parse(pos)
    #  print(chunked)
    array = []
    for subtree in chunked.subtrees(filter=lambda t: t.label() == "#Chunk"):
        array.append(" ".join([a for (a, b) in subtree.leaves()]))

    arr = nltk.FreqDist(array)
    key_chunk = arr.most_common(5)
    return key_chunk
Code Example #6
def remove_english_stopwords(text):
    # assumes module-level imports: from nltk.corpus import stopwords; from nltk import wordpunct_tokenize
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{',
         '}'])  # drop this update if you need to keep punctuation
    list_of_words = [
        i.lower() for i in wordpunct_tokenize(text)
        if i.lower() not in stop_words
    ]
    return list_of_words
Code Example #7
def prepCloud(Topic_text, Topic):
    Topic = str(Topic).lower()
    Topic = ' '.join(re.sub('([^0-9A-Za-z \t])', ' ', Topic).split())
    Topic = re.split(r"\s+", str(Topic))
    stopwords = set(STOPWORDS)
    # add the topic's own words to the stop words so they don't appear in the word cloud
    stopwords.update(Topic)
    text_new = " ".join(
        [txt for txt in Topic_text.split() if txt not in stopwords])
    return text_new
Code Example #8
def remove_stopwords(text):
    from nltk.corpus import stopwords
    from string import punctuation
    from nltk.tokenize import sent_tokenize, word_tokenize
    filt_text = []
    stopwords = set(stopwords.words('english') + list(punctuation))
    stopwords.update(
        ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    words = word_tokenize(text)
    for w in words:
        if w not in stopwords:
            filt_text.append(w)
    filter_str = ' '.join(filt_text)  # join with spaces so words stay separated
    return filter_str
Code Example #9
def wordcloud(brand1,df1,brand2,df2):
    stopwords = set(STOPWORDS)
    tags=[]
    tags.extend(brand1.split())
    tags.extend(brand2.split())
    
    tags.extend(subStrings("".join(brand1.split())))
    tags.extend(subStrings("".join(brand2.split())))

    stopwords.update(tags)
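    # 2x2 grid: brand1 positive and brand2 negative on top, brand1 negative and brand2 positive below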

    plt.figure(figsize=(10, 8), dpi=150)

    plt.subplot(2, 2, 1)
    pos = df1.loc[df1['sentiment_type'] == 'POSITIVE']
    poswords=" ".join(word for word in pos.tweet if brand1 not in word)
    wordcloud1 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud1, interpolation='bilinear')
    plt.title(str.title(brand1)+' positive')
    plt.axis("off")

    plt.subplot(2, 2, 2)
    pos = df2.loc[df2['sentiment_type'] == 'NEGATIVE']
    poswords=" ".join(word for word in pos.tweet if brand2 not in word)
    wordcloud2 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud2, interpolation='bilinear')
    plt.title(str.title(brand2)+' negative')
    plt.axis("off")

    plt.subplot(2, 2, 3)
    pos = df1.loc[df1['sentiment_type'] == 'NEGATIVE']
    poswords=" ".join(word for word in pos.tweet if brand1 not in word)
    wordcloud3 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud3, interpolation='bilinear')
    plt.title(str.title(brand1)+' negative')
    plt.axis("off")

    plt.subplot(2, 2, 4)
    pos = df2.loc[df2['sentiment_type'] == 'POSITIVE']
    poswords=" ".join(word for word in pos.tweet if brand2 not in word)
    wordcloud4 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud4, interpolation='bilinear')
    plt.title(str.title(brand2)+' positive')
    plt.axis("off")
    plt.savefig('cloud.png')
    mediaobj = anvil.media.from_file('cloud.png')
    return mediaobj
Code Example #10
    def tokenize_text(self, text):
        tokens = []
        # Adding to stopwords
        stopwords = set(STOPWORDS)
        spanish = self._get_spanish_stopwords()
        stopwords.update(spanish)
        stopwords.update(['http', 'f**k', 'rt'])

        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                # if word not in stopwords:
                if len(word) < 2:
                    continue
                # tokens.append(self._lemmatize_stemming(word.lower()))
                tokens.append(word.lower())
        return tokens
Code Example #11
    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

        import pyarrow as pa

        custom_stopwords = inputs.get_value_data("additional_stopwords")

        if inputs.get_value_obj("languages").is_set:
            _languages: ListModel = inputs.get_value_data("languages")
            languages = _languages.list_data
        else:
            languages = []

        stopwords = set()
        if languages:
            for language in languages:
                if language not in get_stopwords().fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(get_stopwords().fileids())}."
                    )
                stopwords.update(get_stopwords().words(language))

        if custom_stopwords:
            stopwords.update(custom_stopwords)

        orig_array = inputs.get_value_obj("tokens_array")  # type: ignore

        if not stopwords:
            outputs.set_value("tokens_array", orig_array)
            return

        # if hasattr(orig_array, "to_pylist"):
        #     token_lists = orig_array.to_pylist()

        tokens_array = orig_array.data.arrow_array

        # TODO: use vaex for this
        result = []
        for token_list in tokens_array:
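            # tokens are lower-cased for this comparison, so the stop words are expected to be lower case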

            cleaned_list = [
                x for x in token_list.as_py() if x.lower() not in stopwords
            ]
            result.append(cleaned_list)

        outputs.set_value("tokens_array", pa.chunked_array(pa.array(result)))
Code Example #12
class TwitterUser:

    auth = tweepy.AppAuthHandler(Config.CONSUMER_KEY, Config.CONSUMER_SECRET)

    #Construct the API instance
    api = tweepy.API(auth, wait_on_rate_limit=True)  # create an API object

    # create tokenizer that gets words (only alphabetical words)
    tokenizer = RegexpTokenizer(r'\w+')

    stopwords = set(stopwords.words('english'))
    stopwords.update({"https", "http"})

    def __init__(self, handle: str):
        self.handle = handle
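        # note: written against tweepy v3; tweepy v4 requires api.get_user(screen_name=handle)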
        self.user = self.api.get_user(handle)
        self.bio = self.user.description
        self.timeline = self.user.timeline(count=200)
        self.num_of_tweets = len(self.timeline)
        self.stopwords.add(handle)

    def get_top_words(self, limit: int = 10, word_len_min: int = 2) -> list:
        '''
        Return top common words from tweets
        '''
        all_words = list()

        for tweet in self.timeline:
            # print(tweet.text, "\n-----\n\n\n")
            words = self.tokenizer.tokenize(tweet.text)
            for word in words:
                if len(word) > word_len_min and word not in self.stopwords:
                    all_words.append(word.lower())

        word_distribution = nltk.FreqDist(all_words)
        top_words = word_distribution.most_common(limit)
        return top_words

    ''' We can cross reference the text'''

    def get_top_hastags(self, limit: int = 10, word_len_min: int = 2) -> list:
        '''
        Return top common hashtags from tweets
        '''
        all_words = list()

        for tweet in self.timeline:
            # print(tweet.text, "\n-----\n\n\n")
            words = re.findall(r"(\#[a-zA-Z]+\b)(?!;)", tweet.text)
            for word in words:
                if len(word) > word_len_min and word not in self.stopwords:
                    all_words.append(word.lower())

        word_distribution = nltk.FreqDist(all_words)
        hashtags = word_distribution.most_common(limit)
        return hashtags
Code Example #13
def main():
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('polish'))
    stopwords.update(['zgłoś', 'naruszenie', 'wczytuję', 'działam'])
    # add the numbers 0-99 as stop words (as strings, so they can match tokenized text)
    stopwords.update(str(n) for n in range(100))
    np.random.seed(0)
    fNames = readTags()
    numberLabels = {}
    for i in range(len(fNames)):
        numberLabels[fNames[i]] = i

    dataSet = open("dataSet.csv", 'w')
    model = Word2Vec.load('embeddings.bin')

    for name in fNames:
        print('creating data set for tag:', name)
        ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
        allArticlesForTag = glob.glob(ROOT_DIR + '/texts/' + name + '/*.txt')
        for article in allArticlesForTag:
            with open(article) as f:
                text = ''
                for line in f:
                    if len(line) > 200:
                        text += line
                postVecs = []
                if len(text) > 0:
                    if len(text) > 2000:
                        texts = re.split(r'\s{4,}', text)
                        for i in texts:
                            if len(i) > 50:
                                postVecs.append(
                                    postsToAveEmbeddings(i, model, stopwords))
                    else:
                        postVecs.append(
                            postsToAveEmbeddings(text, model, stopwords))

            for v in postVecs:
                for x in v:
                    dataSet.write(str(x) + ',')
                dataSet.write(str(numberLabels[name]) + '\n')

    dataSet.close()
Code Example #14
File: eda.py  Project: mrethana/news_bias_final
def word_cloud(source_name, subjectivity_floor):
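    # `all_words` and `final_df` are module-level DataFrames prepared earlier in eda.py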

    list_words = all_words.word[(all_words.subjectivity < subjectivity_floor)]
    df2 = final_df[(final_df.source_name == source_name)]
    all_text = []
    for blob in df2.text:
        all_text.append(blob)
    corpus = '-'.join(all_text)
    corpus = corpus.lower()
    stopwords = set(STOPWORDS)
    stopwords.update(list_words)

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white",
                          stopwords=stopwords).generate(corpus)
    # Display the generated image:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Code Example #15
def content_text(text):

    # Remove common stopwords, including "he's" and "she's"
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(("he's", "she's"))  # lower case, to match the lower-cased words below

    with_stp = Counter()
    without_stp = Counter()
    with open(text) as f:
        for line in f:
            spl = line.split()
            # count the words in the line that are stopwords
            with_stp.update(w.lower().rstrip(punctuation) for w in spl
                            if w.lower() in stopwords)
            # count the words in the line that are not stopwords
            without_stp.update(w.lower().rstrip(punctuation) for w in spl
                               if w.lower() not in stopwords)
    # return a list with top ten most common words from each
    return [x for x in with_stp.most_common(10)
            ], [y for y in without_stp.most_common(10)]
Code Example #16
File: bayes.py  Project: gabrieleger/cs102
class NLProcessor:
    stemmer = PorterStemmer()
    stopwords = set(stopwords.words('english'))
    stopwords.update([
        '.', ',', '.,', '-', '–', '"', "'", '?', '!', ':', ';', '(', ')', '[',
        ']', '{', '}'
    ])
    stopwords.update(['$', '%', '#', '/', '‘', '’', '“', '”', '·', '`'])

    @staticmethod
    def split_sentence(sentence: str):
        return wordpunct_tokenize(sentence)

    @staticmethod
    def norm_word(word: str):
        word = word.lower()
        if word in NLProcessor.stopwords:
            return None

        return NLProcessor.stemmer.stem(word)
Code Example #17
def create_word_cloud(text_vec):

    text = " ".join(review for review in text_vec)
    #print ("There are {} words in the combination of all review.".format(len(text)))

    # Create stopword list:
    stopwords = set(STOPWORDS)
    stopwords.update(["my", "trade"])

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white").generate(text)

    # Display the generated image:
    # the matplotlib way:
    # plt.figure(figsize = (8, 8), facecolor = None)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.tight_layout(pad = 0)
    # plt.show()
    return wordcloud, get_vader_sentiment(text), get_textblob_sentiment(text)
Code Example #18
def countMostCommonWords():
    from nltk.corpus import stopwords
    file = open('analysis.txt', 'a')
    stopwords = set(stopwords.words('polish'))
    stopwords.update(
        ['zgłoś', 'naruszenie', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    fNames = readTags()
    for fName in fNames:
        print("\nTag:", fName)
        content = ''
        ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
        allArticlesForTag = glob.glob(ROOT_DIR + '/texts/' + fName + '/*.txt')
        for article in allArticlesForTag:
            with open(article) as f:
                content += f.read()
        content = content.lower()
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        content = tokenizer.tokenize(content)
        mostCommon = mostCommonWords(content, stopwords)
        file.write(fName + ': ')
        for word in mostCommon:
            file.write(word + ' ')
        file.write('\n')
    file.close()
Code Example #19
File: util.py  Project: xinyu72/521_project
def wordcloud(df, group):
    matplotlib.use('Agg')
    text = " ".join(review for review in df.review)
    print("There are {} words in the combination of all review.".format(
        len(text)))

    # Create stopword list:
    stopwords = set(STOPWORDS)
    stopwords.update(["room", "hotel", "desk", "Chicago", "stay", "day"])

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          collocations=False,
                          max_words=100,
                          background_color="white").generate(text)

    # Display the generated image:
    # the matplotlib way:
    plt.figure(figsize=(10, 8))
    plt.title("Word Cloud for " + group + ' reviews', fontsize=18)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    img_title = "Word Cloud for " + group + ' reviews'
    plt.savefig(img_title + '.png')
Code Example #20
from sklearn.feature_extraction import stop_words
import random


def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


stopwords = set(STOPWORDS)
newStopWords = ['job_querry', 'description', 'contrat', 'city']
stopwords.update(newStopWords)
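# assumed context: WordCloud and STOPWORDS come from the wordcloud package,
# plt is matplotlib.pyplot, and df is a DataFrame with a 'title' column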

wordcloud = WordCloud(
    background_color='black',
    stopwords=stopwords,
    max_words=1500,
    max_font_size=200,
    width=1000,
    height=600,
    random_state=42,
).generate(" ".join(df['title'].astype(str)))

fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.title("WORD CLOUD title", fontsize=25)
Code Example #21
File: hw2.py  Project: YenTingWang/wordcount
text = open('Building_Global_Community.txt').read()

norm_text = text.lower()

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import string

stopwords.update(string.punctuation)
from nltk import wordpunct_tokenize

words = wordpunct_tokenize(norm_text)

filtered_words = [word for word in words if word.isalpha() and word not in stopwords]

from collections import Counter
counter = Counter(filtered_words)

for word, count in counter.most_common(20):
    print("%s: %d" % (word, count))
Code Example #22
def stopwords_cleaned(sentence):
    res = []
    for word in sentence:
        if word not in stopwords:
            res.append(word)
    return res
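
# note: the apply below expects a stop-word collection named `stopwords` to exist already;
# the wordcloud STOPWORDS set defined further down is only used for the word cloud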


df['tokenized_removed_stopwords'] = df['tokenized_sentences'].apply(
    stopwords_cleaned)
from wordcloud import WordCloud, STOPWORDS
text_words = ''
stopwords = set(STOPWORDS)
stopwords.update([
    "br", "href", "amazon", "food", "gp", "ve", "grocery", "store", "although",
    "suscribe", "though", "think", "thought", "maybe"
])

#wordcloud
text = " ".join(review for review in df.Text)

wordcloud = WordCloud(width=800,
                      height=800,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(text)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
Code Example #23
list_join_wd_permutations = set()

#make list of mwe tokenizers joined by space
mwe_tokenizer = MWETokenizer(separator=' ')

#dictionary to count the food words
counter = dict()

#total food count
total = 0

#smallword_list = ['of', 'the', 'and', 'out', 'na', 'vit', 'n']
stopwords = set(stopwords.words('english'))

#add more words to stopwords list
stopwords.update(['n', 'na', 'new', 'vit', 'style', 'low', 'sprd', 'it\'s', 'dried', 'fungi', 'wonder', 'one', 'tongue', 'flavor', 'flavors', 'w', 'always', 'made', 'vegan', 'white', 'good', 'little', 'go', 'eye', 'end', 'delight', 'cloud', 'blue', 'back', 'without', 'warm', 'stuff', 'skin', 'right', 'real', 'past', 'outside', 'next', 'morning', 'hi', 'heart', 'head', 'gold', 'general', 'fr', 'eat', 'drink', 'big', 'baby', 'way', 'use', 'ultra', 'super', 'sub', 'start', 'soft', 'si', 'shaped', 'power', 'plus', 'part', 'old', 'november', 'mixed', 'meal', 'less', 'late', 'kit', 'game', 'kit', 'friends', 'eight', 'dog', 'deep', 'de', 'combination', 'blends', 'bear', 'animal', 'add', 'ear', 'kid', 'boy', 'oh', 'top', 'tree', 'side', 'shapes', 'prior', 'neck', 'mix', 'french', 'food', 'balls', 'young', 'wld', 'wash', 'user', 'types', 'type', 'store', 'southern', 'smart', 'slices', 'pods', 'plate', 'party', 'ones', 'lunch', 'leg', 'label', 'jo', 'item', 'inch', 'iced', 'higher', 'half', 'giant', 'g', 'foods', 'filling', 'filled', 'family', 'eyes', 'es', 'energy', 'dove', 'dogs', 'cup', 'cubes', 'cooking', 'child', 'christmas', 'character', 'ch', 'box', 'bowl', 'boo', 'black', 'bite', 'bar', 'b', 'arizona', 'rl', 'r', 'butt', 'mr', 'mt', 'pm', 'post', 'ross', 'x', 'touch', 'well', 'life', 'long', 'great', 'covered', 'year', 'spread', 'mini', 'straight', 'feet', 'weed', 'sea', 'sweet', 'fluffy', 'healthy', 'treats', 'light', 'snacks', 'fat', 'pink', 'cool', 'sunshine', 'rainbow', 'rolled', 'louis', 'sun', 'ground', 'mixture', 'home', 'full', 'summer', 'stars', 'star', 'recipe', 'la', 'ocean', 'hawaiian', 'chick', 'bright', 'blood', 'bit', 'bamboo', 'yellow', 'wrapped', 'women', 'winter', 'whole', 'vitamin', 'thick', 'smoked', 'slip', 'slice', 'silver', 'silk', 'serving', 'seeded', 'savory', 'restaurant', 'red', 'quick', 'quarters', 'proof', 'pound', 'popeyes', 'pockets', 'pillow', 'oscar', 'original', 'non', 'necks', 'moist', 'mediterranean', 'japanese', 'inside', 'includes', 'heat', 'hand', 'green', 'fun', 'form', 'done', 'art', 'adventure','break','beach','base','balance', 'brisk', 'baking'])

#stopwords below that are words part of multi word foods, stops only the individual words, not the multi word
stopwords2 = ['mashed', 'mash', 'colada', 'hot', 'whipped', 'whip', 'cold', 'fresh', 'vanilla ice', 'purple', 'moose', 'fried', 'edible', 'roll', 'rolls', 'wedding', 'soy', 'peanut', 'pop', 'homemade', 'blueberry', 'almond', 'vegetable', 'tap', 'tuna', 'sugared', 'straw', 'stone', 'spring', 'shake', 'sauce', 'rose', 'roasted', 'ricotta', 'puffed', 'powdered', 'pina', 'olive', 'oil', 'oat', 'mineral', 'maple', 'liquid', 'joy', 'grilled', 'greens', 'golden', 'goddess', 'glazed', 'frosted', 'fiji', 'dry', 'drop', 'double', 'dinner', 'desser', 'dark', 'cut', 'crunchy', 'crunch', 'crisp', 'chinese', 'chews', 'cheesy', 'buttery', 'bright', 'brand', 'bliss', 'apple', 'alcoholic','wasabi', 'cotton', 'lucky', 'seeds', 'flakes', 'tropical','chicken','drops', 'buds', 'bud', 'bakery', 'bites', 'cliff', 'coconut cotton', 'breakfast']

for food in list_of_report:
    description = food["Description"].lower() #converts everything to lower case
    tokens = word_tokenize(description)
    words = [word for word in tokens if word.isalpha() and word not in stopwords]


    #iterates through the food descriptions
    for word in words:

        #if food description contains certain words generate permutations of the description and append to the list
Code Example #24
File: prac4.py  Project: javamania/getIEEE
jsonArray = json_data.get("value")  # 5,433 documents in total
jsonArray1 = list({v['title']: v
                   for v in jsonArray}.values())  # de-duplicate by title
#print(jsonArray)

print(len(jsonArray))
print(len(jsonArray1))

wlem = nltk.WordNetLemmatizer()
#lemmatized_words = []

# Register stop words
stopwords = set(STOPWORDS)
stopwords.update([
    'system', 'service', 'paper', 'software', 'business', 'process',
    'information'
])
# print(stopwords)  # print the stop-word set

year2002 = []
year2003 = []
year2004 = []
year2005 = []
year2006 = []
year2007 = []
year2008 = []
year2009 = []
year2010 = []
year2011 = []
year2012 = []
year2013 = []
Code Example #25
# wiesliam
# these should be the only imports you need
import tweepy
import nltk
nltk.download('averaged_perceptron_tagger')
import json
import sys
from nltk.corpus import stopwords
stopwords = set(stopwords.words("english"))
stopwords.update(["http", "https", "RT"])

# write your code here
# usage should be python3 part1.py <username> <num_tweets>
#print("hey")
username = sys.argv[1]
num_tweets = sys.argv[2]


def iterate(tagged_words, tag):
    #print(tagged_words, tag)
    relevant_terms = {}
    for term in tagged_words:
        #print(term[1])
        if term[1][:2] == tag:
            #print(term[1])
            if term[0] not in relevant_terms:
                relevant_terms[term[0]] = 1
            else:
                relevant_terms[term[0]] = relevant_terms[term[0]] + 1

    sorted_words = sorted(relevant_terms.items(),
Code Example #26
from os import path
from collections import Counter
import numpy as np
from PIL import Image
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'mydissertation.txt')).read()
wordcount = Counter(text.split())

# read the mask / color image taken from the image
grad_coloring = np.array(Image.open(path.join(d, "Graduation-cap-blue.jpg")))
stopwords = set(STOPWORDS)
stopwords.update(
    ('Figure', 'based', 'et', 'al', 'Therefore', 'used', 'using', 'show',
     'shown', 'and', 'I', 'A', 'And', 'So', 'arnt', 'This', 'When', 'It',
     'many', 'Many', 'so', 'cant', 'Yes', 'yes', 'No', 'no', 'These', 'these'))

filtered_words = [word for word in text.split() if word not in stopwords]
filterwordcount = Counter(filtered_words)
filterwordcount.most_common(1)

wc = WordCloud(background_color="white",
               max_words=1000,
               mask=grad_coloring,
               stopwords=stopwords,
               max_font_size=40,
               random_state=42)
# generate word cloud
wc.generate(text)
Code Example #27
import os
import csv
import pandas as pd
from pandas import Series, DataFrame
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import cluster, datasets, metrics
from sklearn.cluster import SpectralClustering

pathProg = 'C:\\Python27'
os.chdir(pathProg)

stopwords = set(stopwords.words('english'))

import string
cc = string.punctuation
# add every punctuation symbol plus a few tokenizer-specific tokens
stopwords.update(cc)
stopwords.update(["--", "'s", "'ve", "'re", "n't", "``", "''"])

bb = []
file = open(pathProg + '/building_global_community.txt', 'r')
f = file.read()

bb = f.lower()
bb = word_tokenize(bb)
Code Example #28
# PREPROCESSING DONE ////////////////////////////////
def rejoin_words(row):
    my_list = row['lemmatized_words']
    joined_words = (" ".join(my_list))
    return joined_words


df['processed'] = df.apply(rejoin_words, axis=1)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

stopwords = set(STOPWORDS)
stopwords.update([
    'we', 'will', 'aren', 'couldn', 'didn', 'doesn', 'don', 'hadn', 'dont',
    'doesnt', 'cant', 'couldnt', 'couldve', 'im', 'ive', 'isnt', 'theres',
    'wasnt', 'wouldnt', 'a', 'also', 'like', 'hasn', 'haven', 'isn', 'let',
    'll', 'mustn', 're', 'shan', 'shouldn', 've', 'wasn', 'weren', 'won',
    'wouldn', 'ha', 'wa', 'ldnont'
])

#VECTORIZING ////////////////////////////////
bow_vectorizer = CountVectorizer(max_df=0.90,
                                 min_df=2,
                                 max_features=1000,
                                 stop_words=stopwords)
bow = bow_vectorizer.fit_transform(df['processed'])
top_sum = bow.toarray().sum(axis=0)
top_sum_cv = [top_sum]
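# note: on scikit-learn >= 1.2 this is bow_vectorizer.get_feature_names_out()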
columns_cv = bow_vectorizer.get_feature_names()
x_traincvdf = pd.DataFrame(top_sum_cv, columns=columns_cv)
tfidf_vectorizer = TfidfVectorizer(max_df=0.90,
Code Example #29
#!/usr/bin/env python
from os import path
import os
import nltk

import matplotlib.pyplot as plt
from wordcloud import WordCloud #, STOPWORDS

# Download stop-word lists for several languages
# (here Spanish; several dictionaries can be combined)
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('spanish'))
stopwords.update(["Media","omitted","1","2","3","4","5","6","7","8","9","0","/","AM","PM","-"])
#stopwords.discard("qué") # remove a word from the stop words so that it is taken into account

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text.
text = open(path.join(d, 'compilado.txt'),encoding="utf-8").read()

# Generate the word cloud; additional parameters are described in the documentation
wc = WordCloud(background_color="white",max_words=500,width=4000,height=2000,repeat=False,
               stopwords=stopwords,contour_width=3,contour_color='steelblue')

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "output.png"))
Code Example #30
import re
import codecs
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from nltk import pos_tag
from nltk import FreqDist

#input from user

test_num = int(input("Input the number of Research articles to be classified:"))
testpaths = []
for i in range(test_num):
    test = str(input("Enter path:"))
    testpaths.append(test)
####################################Pre processing Test Data #####################################################
stopwords = set(stopwords.words('english'))
stopwords.update([',','.','?','!','}','(',')',']','[','=','|','*','0','.',':',';','@','^','%','$','+','_','-','9','8','7','6','1','2','3','4','5','*'])


num_classes=5

for i in range(test_num):
    with codecs.open(testpaths[i],'r',encoding='utf8') as file:
        text1 = file.read()
 
    text1 = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text1)
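    # note: nltk's pos_tag expects a list of tokens; passing a raw string tags individual characters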
    taggedlist = pos_tag(text1)
    for w in taggedlist:
        if w[1]=='PRP' or w[1] == 'DT' or w[1]=='CC':
            text1 = text1.replace(" "+w[0]+" ","")
    tokens = word_tokenize(text1)
    tokens = [x.encode('UTF8') for x in tokens]
Code Example #31
text = []
compoundscore = []
sid = SentimentIntensityAnalyzer()
track_name = list(dict.fromkeys(track_name))
for i in range(len(track_name)):
    song = genius.search_song(track_name[i], artist_name[i])
    songlyrics = song.lyrics.replace("\n", " ").replace("\\'", "\'")
    lyrics[track_name[i]] = songlyrics
    songlyrics = songlyrics.replace('(', '').replace(')', '')
    text.append(re.sub(r"\[.*?\]", "", songlyrics))
    scores = sid.polarity_scores(text[i])
    compoundscore.append(scores['compound'])
text = ' '.join(map(str, text))
print(text.encode("utf-8"))
stopwords = set(stopwords.words('english'))
stopwords.update(["br", "href", "la", "yeah", "yuh", "wan", "i'm"])

sentences = sent_tokenize(text)
words = word_tokenize(text)
words_no_punc = []
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())

ps = PorterStemmer()
clean_words = []
for w in words_no_punc:
    if w not in stopwords:
        clean_words.append(ps.stem(w))
fdist = FreqDist(clean_words)
print(fdist.most_common(10))