Code Example #1
def main(sheet, stopw):
    tweetlist = sheet_extractor(sheet)
    new_tweetlist = punctuation_splitter(tweetlist)
    tweet_words = just_tweet_words(new_tweetlist)
    clean_tweetlist = clean_tweets(new_tweetlist)
    tweet_stopw = stopwords(tweet_words, stopw)
    tweet_topw = topwords(tweet_stopw)
    tweet_hasht = hashtags(tweet_words)
    tweet_bigram = bigrams(clean_tweetlist)
    tweet_trigram = trigrams(clean_tweetlist)
    t_s = tweet_sent(new_tweetlist)
    # all lists as one so that it can be passed as one argument
    all_list = []
    all_list.append(tweet_topw)
    all_list.append(tweet_hasht)
    all_list.append(tweet_bigram)
    all_list.append(tweet_trigram)
    all_list.append(emojis)  # 'emojis' is assumed to be defined elsewhere in the source module

    # apply the feature extractor to the tweets
    featuresets = [(extract_tweet_features(tweet, all_list), sent)
                   for (tweet, sent) in t_s]
    # divide the data into a train_set (90% of the data) and a devtest_set (10% of the data) as the test_set is separate
    train_set = featuresets[:(int(len(featuresets) * 0.9))]
    devtest_set = featuresets[(int(len(featuresets) * 0.9)):]
    # train the classifier with the train_set
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("accuracy: " + str(nltk.classify.accuracy(classifier, devtest_set)))
    classifier.show_most_informative_features(20)
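The helper functions called above (sheet_extractor, punctuation_splitter, topwords, and so on) live elsewhere in the source project and are not shown. Purely as a hedged illustration, the two-argument stopwords helper used here could be as simple as the following sketch:

# Hypothetical sketch of the two-argument stopwords() helper used above --
# not the project's actual implementation.
def stopwords(words, stopw):
    """Return the words that do not appear in the stopword collection."""
    stopset = set(w.lower() for w in stopw)
    return [w for w in words if w.lower() not in stopset]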
Code Example #2
File: sentwi.py Project: Hoang23/Sentwi
def processAndVader():
    tweetProcessor_vader = TwitterProcessing.TwitterProcessing(
        TweetTokenizer(), stopwords())
    lSentiment_vader = []
    lSentiment_vader = vaderSentimentAnalysis(getTweets(), True,
                                              tweetProcessor_vader)
    return lSentiment_vader
Code Example #3
def main():
    import os
    folder = "egos/"
    output = "Documentos Finais/"
    for doc in os.listdir(folder):
        print("\n--> " + doc)

        file = folder + doc
        f = open(file, 'r')

        doc_save = output + doc
        outfile = open(doc_save, 'a+')


        lines = f.readlines()

        termos = {}

        for line in lines:
            tweet = stopwords(line)
            if len(tweet) > 0:
                cont = 0
                for termo in tweet:
                    if termo not in termos:
                        termos[termo] = contagem(lines, termo)
                    # Saving: keep terms that appear more than once
                    if termos[termo] > 1:
                        outfile.write(termo + " ")
                        cont += 1
                if cont > 0:
                    outfile.write("\n")

        f.close()
        outfile.close()
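The contagem helper ("count" in Portuguese) is also defined elsewhere in the project. A hypothetical stand-in consistent with how it is used above might be:

# Hypothetical stand-in for the unshown contagem() helper: count how many
# lines still contain the term after stopword filtering.
def contagem(lines, termo):
    return sum(1 for line in lines if termo in stopwords(line))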
Code Example #4
def cleaner(text):
    text = remove_punctuations(text)
    text = small(text)
    text = num_remove(text)
    text = stopwords(text)
    text = lemmatize(text)
    return text
Code Example #5
def cluster_texts(texts, filenames, clusters):
    #TF IDF
    #Transform texts to Tf-Idf coordinates
    stop_words = lemm(stopwords())
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stop_words,
                                 max_df=0.5,
                                 min_df=0.03,
                                 lowercase=True)

    vectorizer = vectorizer.fit(texts)
    tfidf_model = vectorizer.transform(texts)
    first_vector_tfidfvectorizer = tfidf_model[0]

    tdidf_df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(),
                            index=vectorizer.get_feature_names(),
                            columns=["tfidf"])
    tdidf_df = tdidf_df.sort_values(by=["tfidf"], ascending=False)
    print('#####################################')
    print('TF-IDF - Collective Frequencies:')
    print(tdidf_df.to_string())

    print('#####################################')
    print('TF-IDF - Frequencies per doc:')
    print(tfidf_model)

    #COUNT
    count_vectorizer = CountVectorizer(tokenizer=process_text,
                                       stop_words=stop_words,
                                       max_df=0.5,
                                       min_df=0.03,
                                       lowercase=True)

    count_vectorizer = count_vectorizer.fit(texts)
    count = count_vectorizer.transform(texts)
    first_vector_countvectorizer = count[0]

    count_df = pd.DataFrame(first_vector_countvectorizer.T.todense(),
                            index=count_vectorizer.get_feature_names())
    count_df = count_df.sort_values(by=0, ascending=False)
    print('#####################################')
    print('Term Count:')
    print(count_df.to_string())

    #Cluster texts using K-Means
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)

    print('#####################################')
    print('Clusters:')
    print(dict(clustering))
Code Example #6
def sc_transform(df_col):
    # clean text from the column
    df_col = df_col.apply(lambda x: stopwords(remove_punctuation(x)))
    # get frequencies and filter for uncommon words
    freq_d = pd.Series(' '.join(df_col).split()).value_counts()
    rare_words = freq_d[freq_d <= 4]
    freq_d = freq_d[freq_d > 4]
    # compute BoW vector and its norm to get one value
    df_col = df_col.apply(
        lambda s: [w for w in s.split() if w not in rare_words])
    dict_d = corpora.Dictionary(df_col)
    df_col = df_col.apply(lambda desc: LA.norm(dict_d.doc2bow(desc)))
    return df_col
Code Example #7
def preprocess(fdict):
    processed = {}
    for key in fdict.keys():
        doc_list = []
        for doc in fdict[key]:
            # run each document through the cleaning pipeline
            clean = cleanlines(doc)
            tokens = tokener(clean)
            lemmas = lemmatize(tokens)
            syns = sy(lemmas)
            stemmed = stem(syns)
            filtered = stopwords(stemmed)
            doc_list.append(filtered)
        processed.setdefault(key, doc_list)
    return processed
Code Example #8
def TF_IDF(df, colname, min_df, ngram_range_tuple):
    from sklearn.feature_extraction.text import TfidfVectorizer
    a = list(df[colname])
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 max_features=10000,
                                 tokenizer=tokenizer_tf_idf,
                                 stop_words=stopwords(),
                                 ngram_range=ngram_range_tuple)
    vz = vectorizer.fit_transform(a)
    tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    tfidf = pd.DataFrame.from_dict(tfidf, orient='index', columns=['tfidf'])

    return tfidf
Code Example #9
def final_tokenzied_list(df, colname, ngrams):
    stop_words = stopwords()
    paragraph_list = df[colname].tolist()
    df['tokens'] = df[colname].map(tokenizer_tf_idf)

    n_grams = 2
    while n_grams < ngrams + 1:
        col_name = str(n_grams) + '_' + 'grams'
        n_gram_df = text_to_ngrams(paragraph_list, stop_words, n_grams)
        df = pd.merge(df, n_gram_df, on='doct_no', how='left')
        df['tokens'] = np.where(df[col_name].isnull(), df['tokens'],
                                df['tokens'] + df[col_name])
        df.drop([col_name], axis=1, inplace=True)
        n_grams = n_grams + 1

    return df
Code Example #10
 def __call__(self, docs):
     terms_field = self.terms_field
     get_lang = self.get_lang
     stopwords = self._stopwords
     for doc in docs:
         lang = get_lang(doc)
         lang_stopwords = stopwords(lang)
         tfield = doc[terms_field]
         to_del = []
         for term in tfield:
             if term in lang_stopwords:
                 to_del.append(term)
         self._logger.debug("Remove %s stopwords" % (len(to_del)))
         for keys in to_del:
             del tfield[keys]
     return docs
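Here self._stopwords is a callable that maps a document's language to a stopword collection; its definition is not part of the snippet. A minimal hypothetical stand-in, assuming NLTK's full language names (e.g. 'english', 'french'), could look like this:

# Hypothetical stand-in for the unshown self._stopwords callable.
from nltk.corpus import stopwords as nltk_stopwords

_stopword_cache = {}

def stopwords_by_lang(lang):
    """Return (and cache) the stopword set for the given language."""
    if lang not in _stopword_cache:
        _stopword_cache[lang] = frozenset(nltk_stopwords.words(lang))
    return _stopword_cache[lang]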
Code Example #11
def preprocess(phrase):
    cleaned_phrase = re.sub(re.compile("<.*?>"), "", phrase) # remove HTML tags
    cleaned_phrase = re.sub("[^A-Za-z0-9]+", " ", cleaned_phrase) # keep only words


    cleaned_phrase = cleaned_phrase.lower()

    tokens = nltk.word_tokenize(cleaned_phrase)
    stop_words = stopwords("english") # stop words to remove

    filtered_phrase = [word for word in tokens if word not in stop_words] # remove stop words

    lemmatizer = WordNetLemmatizer()

    lemmed_phrase = [lemmatizer.lemmatize(word) for word in filtered_phrase]
    phrase = " ".join(lemmed_phrase)
    return phrase
Code Example #12
def preprocess_text(sentence):
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords('english')
    updated_text = ''
    text_data = sentence

    text_data = re.sub(r"[^\w\s]", '', text_data)  # Removing Punctuations
    text_tokens = nltk.word_tokenize(text_data)  # Tokenization
    updated_text_tokens = [
        token for token in text_tokens if token not in stop_words
    ]  # Stop words removal

    for t in updated_text_tokens:
        updated_text = updated_text + ' ' + str(
            lemmatizer.lemmatize(t)).lower()

    return updated_text
Code Example #13
def split_and_process_lines(string,
                            correct_spelling=False,
                            stemming=False,
                            remove_stopwords=False,
                            expand_contractions=False):
    if string is None:
        return None
    lines = []
    for line in re.split("[.?!]", string):
        line = remove_punctuation(line)
        line = remove_redundant_spaces(line)
        if expand_contractions:
            line = contractions(line)
        line_array = line.split(" ")
        if correct_spelling:
            line_array = autocorrect(line_array)
        if stemming:
            line_array = stem(line_array)
        if remove_stopwords:
            line_array = stopwords(line_array)
        if len(line_array) > 1:
            lines.append(" ".join(line_array).lower())
    return lines
Code Example #14
def ent_getword(text, rel):
    word_ent = {}
    word_list = set()
    for i in text:
        tokens = nltk.word_tokenize(i)  # tokenize each document, not the whole collection
        filtered = stopwords(tokens)
        for j in filtered:
            if j not in word_list:
                word_list.add(j)
    for word in word_list:
        X = []
        for text_i in text:
            if word in text_i:  # 1 if the word occurs in this document, else 0
                X.append(1)
            else:
                X.append(0)
        word_ent[word] = calc_ent_grap(X, rel)
    word_ent = sorted(word_ent.items(), key=lambda d: d[1])[:20]
    word_list = {}
    for word, ent in word_ent:
        for i in wordnet(word):  # WordNet expansion
            if i not in word_list:
                word_list[i] = ent
    return word_list
Code Example #15
    return

    #print(dictionary)


Train = "split.train"
Test = "split.test"

print("Answer to Question 1:")
features_matrix, labels_train = extract_features(Train)
print("")
print("Answer to Question 2:")
features = topwords(Train)
print("")
print("Answer to Question 3:")
features_matrix = stopwords(Train, 10)
print("Words after excluding stop words have been saved to nbStopWords.txt ")
print("")
print("Answer to Question 5:")
features_matrix = programtopwordsLogOdds(Train)

model = MultinomialNB()
#model.fit(features_matrix,labels_train)

# def extract_features(root_dir):
#     with open('C:/Users/cool dude/PycharmProjects/hello/hw9 (1)/split.test', 'r') as fin:
#         mylist = [line.rstrip('\n') for line in fin]
#        #features_matrix = np.zeros((len(mylist), 3000))
#         train_labels = np.zeros(len(mylist))
#         print(train_labels)
#
Code Example #16
File: tosql.py Project: mayashaked/evaluatethis
#
# Author:      Maya Shaked
#
# Created:     02/15/2018
#-------------------------------------------------------------------------------

import pandas as pd
import sqlite3
import aggregate_numerical_data as agg_num
from nltk.corpus import stopwords
import dyadic_partitioning as dy

EVALS_PART_1 = 'evals_json_version_5_part1'
EVALS_PART_2 = 'evals_json_version_5_part2'
SQL_DB_PATH = 'reevaluations.db'
STOPWORDS = stopwords.words("english") + ['class', 'classes', 'professor', \
'professors', 'course', 'courses', 'ta', 'tas']

def pre_process(sql_db_path, evals_part_1, evals_part_2):
    '''
    Takes the SQL database path as well as the two JSON files containing
    all the evaluations, adds the numerical scores and sentiment-analysis
    scores from aggregate_numerical_scores, adds the dyadic partitioning
    results, and cleans the dataframe.

      - sql_db_path is a string
      - evals_part_1 is a string
      - evals_part_2 is a string

    Returns a database object and a pandas dataframe
    '''
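The body of pre_process is cut off in this excerpt. Purely as a hedged sketch of the shape the docstring describes (the cleanup step below is hypothetical, and the aggregate_numerical_data / dyadic_partitioning calls are omitted because their real APIs are not shown), it might begin like this:

# Hypothetical sketch only -- not the project's actual implementation.
def pre_process_sketch(sql_db_path, evals_part_1, evals_part_2):
    db = sqlite3.connect(sql_db_path)                  # open the SQLite database
    evals = pd.concat([pd.read_json(evals_part_1),     # load both evaluation dumps
                       pd.read_json(evals_part_2)],
                      ignore_index=True)
    # ... numerical aggregation and dyadic partitioning would be applied here ...
    evals = evals.dropna().reset_index(drop=True)      # hypothetical cleanup
    return db, evals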
Code Example #17
File: history.py Project: Dedenne/Sosoproject
length = len(book.text1)
actorpot = [book.text1[length//2+indexes[i]] for i in range len(indexes)]
actorpot = [book.text1[length//2+indexes[i]] for i in range(len(indexes))]
actorpot
motspertinents(book.text2,30)
motspertinents(book.text2,100)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
testX = createtestX(book.text2)
predictions2 = DTC.predict(testX)
length = len(book.text2)
indexes = [i for i in range(len(predictions2))if predictions2[i] == 1]
actorpot = [book.text1[length//2+indexes[i]] for i in range(len(indexes))]
actorpot = [book.text2[length//2+indexes[i]] for i in range(len(indexes))]
actorpot
'Edward' in actorpot
stopwords()
stopwords(english)
stopwords(en)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text1)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text1)
actors(book.text2)
actors(book.text3)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text3)
actors(book.text4)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
scored[1]
scored[(',','and')]
scored.(',','and')
Code Example #18
            fo.write(l.name)
        line=reader.readline()
    fo.close()    
file = open("saibaba.csv")
reader = csv.reader(file)
sentencetokenize(reader)
fp1 = open('sentence.txt', 'r')
line=fp1.readline()
while line:
    
    line=fp1.readline()
fp2 = open('clean.txt', 'r')
shorthanddata(fp2)
fp3=open('expansion.txt','r')
wordtokenize(fp3)
fp4=open('words.txt','r')
stopwords(fp4)
fp5=open('wordswithoutstopwords.txt','r')
stemy(fp5)
fp6=open("stemming.txt",'r')
stemreplacer(fp6)
fp7=open("stemreplacer.txt","a+")
antonymfilter(fp7)
Code Example #19
    for i in keyword:
        word = stopword.remove(i)
        if word != '':
            hasil_keyword.append(word)

    hasil_label = []
    for i in abstrak:
        word = stopword.remove(i)
        if word != '':
            hasil_label.append(word)
    return (hasil_judul, hasil_abstrak, hasil_keyword, hasil_label)


hasil_stopwords = []
for data in range(len(hasil_tokenize)):
    hasil = stopwords(hasil_tokenize[data][0], hasil_tokenize[data][1],
                      hasil_tokenize[data][2], hasil_tokenize[data][3])
    hasil_stopwords.append(hasil)
print(hasil_stopwords)

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemming(judul, abstrak, keyword, label):
    hasil_judul = []
    for i in judul:
        word = stemmer.stem(i)
        if word != '':
            hasil_judul.append(word)

    hasil_abstrak = []
Code Example #20
3) 'maketrans' builds a translation table that maps characters to predefined replacement values.
4) string.punctuation means that we are targeting the punctuation marks present inside the sentence.
5) ('a', 'b', c) maps as ('characters to be replaced', 'characters that replace them', 'characters to be removed').
'''
cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))
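# --- Editor's illustration (not part of the original snippet) ---
# The third argument to str.maketrans lists characters that translate() deletes:
demo_table = str.maketrans('', '', string.punctuation)
# "hello, world!".translate(demo_table) -> 'hello world'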

# Tokenisation: Splitting the words in a sentence and storing these words in a list.
tokenised_words = word_tokenize(cleaned_text, "english")

# Removing the stop words which are redundant in a sentence.

# Creating an empty list for storing the new words acquired after removing the stop words.
new_words = []
# Traversing through the total words achieved after tokenization.
for word in tokenised_words:
    if word not in stopwords('english'):
        # Appending the word to the empty list.
        new_words.append(word)

# The list which will contain all the emotion attributes from the read.txt file and emotions file
emotion_list = []
# Opening new file
with open('emotions.txt', 'r') as file:
    for line in file:

        # Removing spaces, commas etc from the emotions file.
        clean_line = line.replace('\n', '').replace(',',
                                                    '').replace("'",
                                                                '').strip()
        # Assigning the word before colon to word and after colon to emotion.
        word, emotion = clean_line.split(':')
Code Example #21
	for str in text.split():
		 if str.startswith('#'):
			 return str.split()

			
>>> term_split("hey yoo me #dayofthedream")
['#dayofthedream']
>>> 
>>> 
>>> 
>>> from nltk.corpus import stopwords
>>> stopword=stopwords.words('english')
>>> def stopwords(text):
	return [i for i in text.split() if i in stopword]

>>> stopwords("this is a foo bar sentence and example sentence for us to learn this subject")
['this', 'is', 'a', 'and', 'for', 'to', 'this']
>>> 
>>> 
>>> def stopwords(text):
	if [i for i in text.split() if i in stopword]:
		return True
	else:
		return False

	
>>> stopwords("foo bar sentence example sentence")
False
>>> 
>>> 
>>> 
Code Example #22
def filter_stopwords(tokens):
    global stopwordp
    sw = stopwords()
    w1 = [x for x in tokens if not (x in sw)]
    return [y for y in w1 if not (None == stopwordp.match(y))]
Code Example #23
tweets_target = list(tweets['target'])
tweets_list = list(tweets['text'])

# tweets_list = tweets_list.apply(remove_punctuation)
tweets = [remove_punctuation(x) for x in tweets_list]

sw = stopwords.words('english')


def stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)


# tweets_list = tweets_list.apply(stopwords)
tweets = [stopwords(x) for x in tweets_list]

# create an object of stemming function
stemmer = SnowballStemmer("english")


def stemming(text):
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)


# tweets_list = tweets_list.apply(stemming)
tweets = [stemming(x) for x in tweets_list]
# tweets.head(10)
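Composed on a single tweet, the three steps above behave roughly as follows (remove_punctuation is not shown in this excerpt, so its output is assumed):

cleaned = remove_punctuation("Loving this!!!")   # assumed to return "Loving this"
print(stemming(stopwords(cleaned)))              # -> "love"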
Code Example #24
import sys, nltk
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

METHODS = ['simplified', 'best-avg', 'orig', 'leading']
stops = stopwords.words('english')


def getDocuments(paths):
    docs = []
    docs_tokens = []
    for path in paths:
        with open(path, 'r', encoding='utf8') as f:
            s = f.read()
        sents = sent_tokenize(s)
        tokens = [word_tokenize(sent) for sent in sents]
        docs.append(sents)
        docs_tokens.append(tokens)
    return docs, docs_tokens


def sumBasic(documents, docs_tokens, method='orig', length=100):
    # Get probs
    probs = defaultdict(int)
    sentprobs = defaultdict(int)
    result = ""

    if method == 'leading':
        sentences = [sen for doc in documents for sen in doc]
Code Example #25
I believe that India got its first vision of this in 1857, when we started the war of Independence. It is this freedom that we must protect and nurture and build on. If we are not free, no one will respect us.

My second vision for India’s DEVELOPMENT.

For fifty years we have been a developing nation. It is time we see ourselves as a developed nation. We are among top five nations of the world in terms of GDP. We have 10 per cent growth rate in most areas. Our poverty levels are falling. Our achievements are being globally recognised today. Yet we lack the self-confidence to see ourselves as a developed nation,
 self-reliant and self-assured. Isn’t this incorrect"""

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
Wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []

for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
Code Example #26
File: main.py Project: ohhappylife/favorability
    # Cursor Setting
    cursor = tweepy.Cursor(api.search,
                           q=keyword,
                           since=dsince,
                           until=duntil,
                           tweet_mode='extended',
                           count=number,
                           lang='en',
                           geocode=location,
                           include_entities=True)

    for i, tweet in enumerate(cursor.items(3000)):
        if i == 0:
            wfile.write("{},{},{},{},{}".format('no', 'time', 'favorite_count',
                                                'tweet_retweet_count',
                                                'tweet_text' + '\n'))
        try:
            wfile.write("{},{},{},{},{}".format(
                i, tweet.created_at, tweet.favorite_count, tweet.retweet_count,
                (emoji(
                    stopwords(
                        strip_all_entities(
                            strip_links(
                                removeRT(tweet.retweeted_status.full_text.
                                         lower().replace('\n', '')))))))) +
                        '\n')
        except:
            pass

    wfile.close()
Code Example #27
import nltk

# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
text = "Mary had a little lamb. Her fleece was white as snow"
sents = sent_tokenize(text)
print (sents)
words = [word_tokenize(sent) for sent in sents]
print (words)

#Stopwords Removal
from nltk.corpus import stopwords
from string import punctuation
customStopWords = set(stopwords.words('english') + list(punctuation))
wordsNotStopWords = [word for word in word_tokenize(text) if word not in customStopWords]
print(customStopWords)

#N_Grams
from nltk.collocations import *
bigramMeasures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsNotStopWords)
print(sorted(finder.ngram_fd.items()))

#Stemming
text2 = "Mary closed on closing night when she was in the mood to close"
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
stemmedNotStopWords = [word for word in word_tokenize(text2) if word not in customStopWords]
print(stemmedWords)
print(stemmedNotStopWords)
Code Example #28
def K_m_cluster(num_clusters, df, text_col_name, top_n_terms):

    import numpy as np
    import pandas as pd
    import bokeh.plotting as bp
    from bokeh.models import HoverTool, BoxSelectTool
    from bokeh.plotting import figure, show, output_notebook

    from sklearn.feature_extraction.text import TfidfVectorizer
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from sklearn.cluster import MiniBatchKMeans

    vectorizer = TfidfVectorizer(min_df=0.01,
                                 max_features=10000,
                                 tokenizer=tokenizer_tf_idf,
                                 stop_words=stopwords(),
                                 ngram_range=(1, 1))

    vz = vectorizer.fit_transform(list(df[text_col_name]))
    kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                                   init='k-means++',
                                   n_init=1,
                                   init_size=1000,
                                   batch_size=1000,
                                   verbose=False,
                                   max_iter=1000)
    kmeans = kmeans_model.fit(vz)
    kmeans_clusters = kmeans.predict(vz)
    kmeans_distances = kmeans.transform(vz)

    high_impact_cluster_terms = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    from sklearn.manifold import TSNE
    tsne_model = TSNE(n_components=2, verbose=1, random_state=1)
    tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
    kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
    kmeans_df['cluster'] = kmeans_clusters
    kmeans_df['text'] = df[text_col_name]

    cumpercent = 0

    Km_file = []
    for i in range(num_clusters):
        clust_num = i
        percent_in_doc = 100 * len(
            kmeans_df[kmeans_df['cluster'] == i]) / len(kmeans_df)
        hot_terms = ''
        for j in high_impact_cluster_terms[i, :top_n_terms]:
            hot_terms += terms[j] + ' | '
        Km_file.append(
            dict(clust_num=clust_num,
                 clust_terms=hot_terms,
                 percent_in_doc=round(percent_in_doc, 2)))
    Km_file = pd.DataFrame(Km_file)
    name_csv = 'Km_clusters' + '_' + str(num_clusters) + '.csv'
    Km_file.to_csv(name_csv)

    colormap = np.array([
        "#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e",
        "#6e6cd5", "#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31",
        "#d44427", "#7fcdd8", "#cb4053", "#5e9981", "#803a62", "#9b9e39",
        "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce",
        "#d07d3c", "#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79"
    ])

    kmeans_df['colors'] = colormap[kmeans_clusters]

    plot_kmeans = bp.figure(
        plot_width=700,
        plot_height=600,
        title="KMeans clustering of the news",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    plot_kmeans.scatter(x='x', y='y', color='colors', source=kmeans_df)

    hover = plot_kmeans.select(dict(type=HoverTool))
    hover.tooltips = {"text": "@text", "cluster": "@cluster"}
    show(plot_kmeans)

    return kmeans_df
Code Example #29
                continue

            subject = entries[0]
            count = int(entries[1].replace("\n", ""))

            dict[subject] = count

    return dict


if __name__ == "__main__":
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    ## get stop words and normalize
    initial_stopwords = stopwords()
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)

    print("Loading index")
    mention_dict = load_index("../data/surface_forms_new.txt")
    subject_predicates_dict = [
    ]  # load_subject_predicates("data/SimpleQuestions_v2/freebase-FB2M.txt")
    subject_triple_counts = [
    ]  #load_subject_triple_counts("data/subject_triple_counts.txt")

    dataset_names = ["test"]
    max_ngram_size = 10
    exclude_small_ngrams = True
Code Example #30
""" Module which tokenizes the given data, i.e. filters out
stopwords (prepositions, conjunctions, and so on), punctuation marks,
and gives us sentence and word arrays"""

import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


def new_tokenize(input_string):
    """ Returns a list containing a list of words and a list of sentences
    given by nltk tokenize function and cleans up the punctuation marks
    and common stop words like 'or' and 'and' """

    stop_words = set(stopwords.words('english') + list(string.punctuation))
    unfiltered_words = word_tokenize(input_string)
    words = [word for word in unfiltered_words if word not in stop_words]
    sentences = sent_tokenize(input_string)
    return [words, sentences]
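A quick usage sketch (assuming the NLTK 'punkt' and 'stopwords' data are installed):

words, sentences = new_tokenize("Cats sleep. Dogs bark loudly!")
# words     -> ['Cats', 'sleep', 'Dogs', 'bark', 'loudly']
# sentences -> ['Cats sleep.', 'Dogs bark loudly!']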