def find_politician_names(debate_soup_dict):
    for debate in debate_soup_dict.keys():
        raw = debate_soup_dict[debate]["soup"].get_text()
        # raw = raw.replace("\\\", "")
        raw = raw.replace("\\", "")
        raw = raw.replace(".", ". ")
        raw = raw.replace("?", "? ")
        raw = raw.replace("!", "! ")
        raw = raw.replace("  ", " ")
        raw = raw.replace("[applause]", "")
        raw = raw.replace("[crosstalk]", "")
        raw = raw.replace("[laughter]" "[Laughter]" "(LAUGHTER)", "")
        tokens = nltk.word_tokenize(raw)
        speech = nltk.Text(tokens)
        sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
        sents = sent_detector.tokenize(raw.strip())

        # find candidate names, most commonly repeated first words of sentences, not common words
        colon_names = []
        dumbWords = stopwords.words("english")

        for sent in sents:
            if ":" in sent:
                sent = sent.split(":")
                possible_name = sent[0]
                if len(possible_name) < 25:
                    colon_names.append(possible_name)

        fdist1 = FreqDist(colon_names)
        fdist2 = FreqDist(sents)
        mostFreq = fdist1.most_common(1)[0][1]
        if mostFreq > 20:
            debate_soup_dict[debate]["names"] = fdist1.most_common(10)
        else:
            debate_soup_dict[debate]["names"] = fdist2.most_common(10)
Example #2
def follow_description(api, friend_list, screen_name):
    the_list = []
    all_tags = []

    for friend in friend_list:
        username = friend[0]
        frequency = friend[1]

        print(username)

        try:
            user = api.get_user(screen_name=username)
            for list_obj in user.lists_memberships(screen_name=username, count=50):
                for w in list_obj.name.lower().split(" "):
                    # print(w)
                    all_tags.append(w)

        except TweepError as err:
            print(err.reason)
            break

    # print(all_tags)
    the_list_name = strip_words(all_tags)
    the_list_dist = FreqDist(the_list_name)

    # for w in the_list_dist:
    #     print ('***' + str(w))

    print(the_list_dist.most_common(20))
    return the_list_dist.most_common(20)
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    # Your code here
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2:
                    if pair[0] != ' ' and pair[1] != ' ':
                        pairs.append((pair[0].lower().replace('\'','\"').strip(' '), pair[1].lower().replace('\'','\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    if len(pairs_freq.most_common(10)[0][0][0].split(' ')) < 2:
        hosts.append(pairs_freq.most_common(10)[1][0][0])
        hosts.append(pairs_freq.most_common(10)[1][0][1])
    else:
        hosts.append(pairs_freq.most_common(10)[0][0][0])
        hosts.append(pairs_freq.most_common(10)[0][0][1])
    return hosts
Example #4
def get_top_followings(screen_name):
    # authorize twitter, initialize tweepy

    api = TwitterGrabber.initialise_api(0)

    print(api.get_status)

    # initialize a list to hold all the tweepy Tweets
    all_tweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # get the user object
    # user = api.get_user(screen_name=screen_name)
    # print(user.lists_subscriptions)

    # save most recent tweets
    all_tweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # print("getting tweets before %s" % oldest)

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        all_tweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(all_tweets)))

    tweet_text = []

    for tweet in all_tweets:
        tweet_text.append(tweet.text)

    content = []
    retweets = []

    for tweet in tweet_text:
        words = word_tokenize(tweet, 'english')
        content.extend(strip_words(words))

        if words[0] == 'RT':
            retweets.append(words[2])

    tweet_distribution = FreqDist(retweets)

    print(tweet_distribution.most_common(20))

    a = follow_description(api, tweet_distribution.most_common(20), screen_name)

    return a
Example #5
def get_monograms_freqdist(tokens):
    freq_dist = FreqDist(tokens)
    # print FreqDist.N(freq_dist)
    print 'Returned monograms'

    print freq_dist.most_common(10)
    temp_list = freq_dist.most_common(100)
    temp_dict = dict((item[0], item[1]) for item in temp_list)
    ordered_freq_dist = OrderedDict(sorted(temp_dict.items(), key=lambda x: x[1], reverse=True))

    return ordered_freq_dist
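
A short usage sketch for the helper above (it assumes the function and its imports, FreqDist and OrderedDict, are already in scope and that nltk's "punkt" data is available):

# Hedged usage sketch: build an ordered unigram frequency table from a tokenized sentence.
import nltk

tokens = nltk.word_tokenize("the cat sat on the mat while the other cat slept")
ordered_counts = get_monograms_freqdist(tokens)
# ordered_counts is an OrderedDict sorted by count, e.g. starting with ('the', 3)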
Example #6
def get_trigrams_freqdist(tokens):
    tri_grams = trigrams(tokens)
    print 'Returned trigrams'

    freq_dist_trigrams = FreqDist(tri_grams)
    print freq_dist_trigrams.most_common(10)

    freq_dist_trigrams_new = dict()
    for item in freq_dist_trigrams.items():
        temp_str = item[0]
        temp_key = temp_str[0] + ' ' + temp_str[1] + ' ' + temp_str[2]
        freq_dist_trigrams_new[temp_key] = item[1]
    freq_dist_trigrams_new = OrderedDict(sorted(freq_dist_trigrams_new.items(), key=lambda x: x[1], reverse=True))

    return freq_dist_trigrams_new
Example #7
def create_word_freq(db):
  db = getattr(db, "Posts")
  #client.command("CREATE CLASS concepted EXTENDS E")

  client.command("DELETE EDGE concepted")
  #client.command('create property frequency.freq string')

  #client.command("DELETE VERTEX frequency")
  data =  db.find().batch_size(50)
  concept = client.command("SELECT name FROM concept")
  c = [c.name for c in concept]
  for d in data:
    if not 'Body' in d:
        display= ''
    else:
        display= cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist=FreqDist(tokens)
        i  = fdist.most_common()
        for k in i:
          if k[0].lower() in c:
            try:
                client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(),d['_id'],k[1]))
            except Exception:
                continue
def get_list_dists_for(member_id):
    print(member_id, file=sys.stderr)

    # cursor.execute(get_listcount_for_member, [member_id])

    # mlistcount = cursor.fetchone()[0]
    cursor.execute(get_listinfo_for_member, [member_id])

    tstrout = ''
    rows = cursor.fetchall()
    for row in rows:
        c_line = str(row)
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))

        if len(c_line):
            parsed_text = f.parse(c_line, True)
            strout = ''
            if parsed_text is not None:
                for item in parsed_text.items():
                    # print(1, str(item[1]))
                    for word in item[1]:
                        strout = strout + ' ' + word
            if len(strout):
                tstrout = tstrout + ' ' + strout
    # print(tstrout)
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)

    return str(member_id) + " on " + str(len(rows)) + " lists: " + str(the_list_dist.most_common(20))
def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian',
                 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/'+t_token+'.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '['+','.join(cats_vocabulary_elements)+']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token '+t_token)
    os.remove('csv/' + t_token + '.csv')
Example #10
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the','a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"

    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Example #11
def cleaner(filename):
	textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename),'r')
	text = []
	all_dates = []
	complete_text = []
	words_list = []
	nodes = []
	for line in textfile:
		datetime, chat = line.split('-', 1)
		date, time = datetime.split(',', 1)
		loc = chat.find(':')

		#if len(chat.split(':'))==3:
		#	print chat
		user,text = chat[:loc],chat[loc+2:]
		text = text.replace("\n",'')
		words = text.split(' ')
		for i in words:
			words_list.append(i)
		complete_text.append(text)
		nodes.append(user)
		all_dates.append(date)

	#print set(nodes)
	#print set(all_dates)
	fdist = FreqDist(words_list)
	f1 = fdist.most_common(100)
	create_csv('wordcloud.csv',f1)
	textfile.close()
Example #12
 def find_names(self):
     """creates a frequency distribution of the
     most common names in the texts"""
     names_list = LIST_OF_NAMES
     name_tokens = [w for w in self.tokens if w in names_list]
     fd = FreqDist(name_tokens)
     return fd.most_common(50)
def setUpOwnSubjectStopWords():
    for topic in topics_table_noun_only_title:
        #only limiting it to a specified length

        #might want to look into the numeric part
        all_description = [ds for ds in topics_table_noun_only_description[topic] if len(ds) > 5]
        all_topics = [topics for topics in topics_table_noun_only_title[topic] if len(topics) > 5]


        fdist_description = FreqDist(all_description)
        fdist_topics = FreqDist(all_topics)

        ten_most_common_descr = fdist_description.most_common(10)
        ten_most_common_topic = fdist_topics.most_common(10)
        built_topic_stop_words[topic] = [word for word, freq in ten_most_common_descr]
        built_topic_stop_words[topic].extend([word for word, freq in ten_most_common_topic])

        #here we set up the top 5-10 words (we need to look into the data more to find
        #the hard margin of the good numerical value to stop, but for simplicity sake, we
        #pick 5 for now, let's see how our accuracy changes when change the most frequent words


    for topic in built_topic_stop_words:
        print built_topic_stop_words[topic]
        print "\n"
def getTopNFreqWords(textArr,N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords=[]
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
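
A quick usage sketch for getTopNFreqWords (assuming the function and FreqDist are in scope):

# Hedged usage sketch: the two most frequent tokens come back as a plain list of words.
words = ["spam", "ham", "spam", "eggs", "spam", "ham"]
print(getTopNFreqWords(words, 2))  # expected: ['spam', 'ham']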
Example #15
    def generate_ngrams_profile(self, text, profile_size, min_size=2, max_size=3):
        """
        It reads incoming text, generates all possible N-grams, with sizes ranging between min_size and max_size and counts the occurrences of all N-grams.

        Parameters
        ----------
        text : unicode

        profile_size : int

        min_size : int, optional (default=2)

        max_size : int, optional (default=3)

        Returns
        -------
        ngram_profile : FreqDist object

        """
        raw_ngrams = []
        text = self.sanitize_text(text)
        for n in range(min_size, max_size+1):
            for ngram in ngrams(text, n):
                raw_ngrams.append(''.join(unicode(i) for i in ngram))
        fdist = FreqDist(raw_ngrams)
        ngram_profile = fdist.most_common(n=profile_size)
        return ngram_profile
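
The character n-gram profiling idea used by the method above can be sketched standalone as follows (lowercased text stands in for the class's sanitize_text, which is defined elsewhere; this sketch targets Python 3, so unicode() is not needed):

# Hedged standalone sketch of character n-gram profiling with FreqDist.
from nltk import FreqDist
from nltk.util import ngrams

text = "hello world"
raw_ngrams = []
for n in range(2, 4):                    # min_size=2, max_size=3
    for gram in ngrams(text, n):         # character n-grams as tuples of characters
        raw_ngrams.append(''.join(gram))
profile = FreqDist(raw_ngrams).most_common(10)
print(profile)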
Example #16
def main():
    for file in glob.glob('./*/*/*.json'):
        data = loadFile(file)
        processMessages(data)

    fdist = FreqDist(all_tokens)
    # Output top 50 words
    for word, frequency in fdist.most_common(50):
        print(('%s;%d' % (word, frequency)).encode('utf-8'))
Example #17
    def __commonWords(self, pos,  number=100):
        """
        Find common words in the text.
        """
        from nltk import FreqDist

        vocab = FreqDist(pos)
        common = [word[0] for (word, _) in vocab.most_common(number)
                  if word[1] in ('NN', 'NNS', 'NNP', 'NNPS')]
        return common
Example #18
def freq():
    movies = Movie.query.all()
    all_string = ''
    for movie in movies:
        all_string += movie.stemmed
    all_list = all_string.split('/')
    fdist = FreqDist([w for w in all_list if len(w)>9])
    common_l = fdist.most_common(300)
    return render_template('freq.html',commons = common_l)
Example #19
    def populate_comments(self, face_post):
        comments_cleaned = []
        for comment in face_post.get_comments():
            tokens = nltk.word_tokenize(comment)
            base_cleaned = [w for w in tokens if w not in self.stopwords and len(w) > 1]
            comments_cleaned.extend(base_cleaned)
            self.cleaned.extend(base_cleaned)

        dist = FreqDist(comments_cleaned)
        distribution = dist.most_common(30)
        face_post.set_distribution(distribution)
Example #20
def main():
    # frequency distribution class
    fd_text1 = FreqDist(book.text1)

    print(str.format('Frequency distribution object: {}', fd_text1))

    print(str.format(
        '50 most common words: {}', fd_text1.most_common(50)
    ))

    fd_text1.plot(50, cumulative=True)
Example #21
def Estadisticas():

    print('total', len(movie_reviews.fileids()))
    print('categories', movie_reviews.categories())
    print('total positive', len(movie_reviews.fileids('pos')))
    print('total negative', len(movie_reviews.fileids('neg')))

    all_words = [word.lower() for word in movie_reviews.words()]
    all_words_frequency = FreqDist(all_words)
    print('10 most frequent words', all_words_frequency.most_common(10))
    print('number of times the word happy appears',
          all_words_frequency['happy'])
Example #22
    def __commonWords(self, pos, number=100):
        """
        Find common words in the text.
        """
        from nltk import FreqDist

        vocab = FreqDist(pos)

        common = [(word[0], count) for (word, count) in vocab.most_common(number)
                  if word[1] in ('NN', 'NNS', 'NNP', 'NNPS')]
        return common
Example #23
def top_word_frequency_graph(text, k, label):
    fdist1 = FreqDist([w for w in text])
    word_freq = fdist1.most_common()

    # get frequency statistics
    total = 0
    for w in word_freq:
        total += w[1]

    word_freq_per = {}
    for w in word_freq:
        word_freq_per[w[0]] = (100 * w[1] / total)
Example #24
def Datos():
    documents = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((movie_reviews.words(fileid), category))
    shuffle(documents)

    stopwords_english = stopwords.words('english')
    all_words = [word.lower() for word in movie_reviews.words()]
    all_words_clean = []
    for word in all_words:
        if word not in stopwords_english and word not in string.punctuation:
            all_words_clean.append(word)

    all_words_frequency = FreqDist(all_words_clean)
    print(all_words_frequency.most_common(10))
    most_common_words = all_words_frequency.most_common(2000)
    print(most_common_words[:10])
    word_features = [item[0] for item in most_common_words]

    return documents, word_features
 def get_most_common_words(cls, textual_data, n):
     """
     This function is used to extract the most common words from the submitted textual information.
     :param textual_data: Textual information
     :param n: The number of words to extract from the input.
     :return: The most common words from the input.
     :rtype: list
     """
     words = word_tokenize(textual_data.lower())
     frequency_distribution = FreqDist(words)
     most_common = frequency_distribution.most_common(n)
     return most_common
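
The same logic can be exercised standalone, outside the class (assuming nltk's "punkt" data is available):

# Hedged standalone sketch: tokenize, build a FreqDist, and take the n most common words.
from nltk import FreqDist
from nltk.tokenize import word_tokenize

text = "Frequency counts are easy. NLTK makes frequency counts easy to read."
words = word_tokenize(text.lower())
print(FreqDist(words).most_common(3))  # e.g. [('frequency', 2), ('counts', 2), ('easy', 2)]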
Example #26
	def __init__(self, normaliser, conllu, direction='ltr', cutoff=0.9, maximise_information=True):
		check_context = FreqDist(normaliser.print_full())
		size = int(len(check_context) * cutoff)
		check_context = [i[0] for i in check_context.most_common(size)]
		if direction == 'ltr':
			self.check_context = {i.split(':')[1]: i.split(':')[0] for i in check_context}
		else:
			self.check_context = {i.split(':')[0]: i.split(':')[1] for i in check_context}

		self.check_free = set(normaliser.print_stream())
		self.conllu = conllu
		self.maximise_information = maximise_information
def classify(message):

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    #print("Accuracy is:", classify.accuracy(classifier, test_data))

    #print(classifier.show_most_informative_features(10))

    custom_tweet = message

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    return (classifier.classify(dict([token, True] for token in custom_tokens)))
Example #28
def printTop(n, min_len):
    out_file = "output_min_" + str(n) + "_len_gt_" + str(min_len) + ".csv"
    fdist1 = FreqDist(tokens)
    most_common = fdist1.most_common(n)
    out_fh = open(out_file, "w")
    for tup in most_common:
        if (len(tup[0]) >= min_len):
            line = "\"" + tup[0] + "\"," + str(tup[1]) + "\n"
            #            print line
            out_fh.write(line)

    out_fh.close()
Example #29
def most_hashtag(df_tweets):
    import pandas as pd
    import numpy as np
    from mlxtend.frequent_patterns import apriori
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import association_rules

    data = df_tweets.hashtags.apply(lambda x: np.nan if len(x) <= 0 else x)
    all_hashtags = list(data.dropna())

    hashtags = []
    for i in all_hashtags:
        for j in i:
            hashtags.append(j)

    hash_str = ''
    for i in hashtags:
        hash_str += i + ' '

    hash_str = hash_str.lower()
    hashtags2 = hash_str.split()

    freq = FreqDist(hashtags2)
    hash_most_freq = pd.DataFrame(data=freq.most_common(10),
                                  columns=['Hashtag', 'Frequency'])
    list_freq = list(hash_most_freq.Hashtag)

    all_hashtags_lower = [[h.lower() for h in line] for line in all_hashtags]

    def select_hashtag(freq, all_hash):
        select = []

        for list_hash in all_hash:
            for f in freq:
                if (len(list_hash) >= 2 and (f in list_hash)):
                    select.append(list_hash)
                    break
                else:
                    pass
        return select

    select = select_hashtag(list_freq, all_hashtags_lower)

    te = TransactionEncoder()
    te_ary = te.fit(select).transform(select)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)

    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=1)

    return rules
Example #30
def form_word_embeddings_samples(X_train, X_test):

    #creating vocabulary for training
    X_train = [word_tokenize(x.lower()) for x in X_train]
    X_test = [word_tokenize(x.lower()) for x in X_test]

    x_distr = FreqDist(np.concatenate(X_train + X_test))
    x_vocab = x_distr.most_common(min(len(x_distr), 10000))

    x_idx2word = [word[0] for word in x_vocab]
    x_idx2word.insert(0, '<PADDING>')
    x_idx2word.append('<NA>')

    x_word2idx = {word: idx for idx, word in enumerate(x_idx2word)}

    x_train_seq = np.zeros(
        (len(X_train), 20), dtype=np.int32
    )  # padding implicitly present, as the index of the padding token is 0

    #using an embedding for samples training data
    for i, da in enumerate(X_train):
        for j, token in enumerate(da):
            # truncate long Titles
            if j >= 20:
                break

            # represent each token with the corresponding index
            if token in x_word2idx:
                x_train_seq[i][j] = x_word2idx[token]
            else:
                x_train_seq[i][j] = x_word2idx['<NA>']

    x_test_seq = np.zeros(
        (len(X_test), 20), dtype=np.int32
    )  # padding implicitly present, as the index of the padding token is 0

    # form embeddings for samples testing data
    for i, da in enumerate(X_test):
        for j, token in enumerate(da):
            # truncate long Titles
            if j >= 20:
                break

            # represent each token with the corresponding index
            if token in x_word2idx:
                x_test_seq[i][j] = x_word2idx[token]
            else:
                x_test_seq[i][j] = x_word2idx['<NA>']

    print(
        "---------------------------formed word-embeddings for samples-----------------------------------"
    )
    return x_train_seq, x_test_seq
    def top_words(self, num_words=10):
        '''
        Takes an int (num_words) and prints the (num_words) most common words in the corpus.
        '''
        cleaned_reviews = self.reviews

        one_big_string = " ".join(cleaned_reviews)
        splits = one_big_string.split()
        freq_splits = FreqDist(splits)
        print("\n")
        for i, term in enumerate(freq_splits.most_common(num_words)):
            print(f'{i+1}. {term}')
Example #32
 def nameTheTopic(self, message, tokenizer):
     message = self.concatenate_list_data(message, tokenizer)
     finder = BigramCollocationFinder.from_words(word_tokenize(message))
     f = finder.ngram_fd.items()
     flipped = list([(v, k) for k, v in f])
     flipped.sort(reverse=True)
     if float(flipped[0][0]) > 1:
         s = " "
         return s.join(flipped[0][1]).title()
     else:
         frequency = FreqDist(word_tokenize(message))
         return frequency.most_common(1)[0][0].title()
Example #33
def tf(text):
    # term frequency
    tf_score = {}
    freq_dist = FreqDist(text)
    kwords = freq_dist.most_common(None)
    total_words = len(kwords)

    for k in kwords:
        if k is not None:
            tf_score[k[0]] = k[1] / total_words

    return tf_score
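
A small usage sketch for tf() above; note that it normalizes each count by the number of distinct words returned by most_common(), not by the total token count:

# Hedged usage sketch (assumes tf and FreqDist are in scope).
tokens = ["to", "be", "or", "not", "to", "be"]
print(tf(tokens))  # {'to': 0.5, 'be': 0.5, 'or': 0.25, 'not': 0.25}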
Example #34
    def _makeProfile(self, auth, text):
        wdlist = self._preprocess(text)

        totalvocab = len(wdlist)

        fd = FreqDist(wdlist)
        commonwords = dict(fd.most_common(self.topN))
        df = pd.DataFrame.from_dict(commonwords, orient='index', columns=[auth])
        df = df/totalvocab #normalize
        df = df.transpose()

        return df
Example #35
	def featex(data):
		res = []
		
		for d in data:
			fdist = FreqDist(w.lower() for w in d[0])
			words = [w for w, _ in fdist.most_common(100)[5:]]

			feat_dict = {w: fdist[w] for w in words}

			res = res + [(feat_dict, d[1], d[2])]

		return res
Example #36
def learnColors(colorNamesDf, n=43):
    '''
    Learn the basic color names and their RGB values from the data frame.
    Return a list of basic_color_term objects (name + RGB tuple).
    '''
    words = []
    results = []

    red_n = defaultdict(int)
    red_sum = defaultdict(int)
    green_n = defaultdict(int)
    green_sum = defaultdict(int)
    blue_n = defaultdict(int)
    blue_sum = defaultdict(int)

    for index, row in colorNamesDf.iterrows():
        tokens = tok.tokenize(row['color_name_raw'])

        #Store data for average rgb values per token
        for t in tokens:
            red_n[t] += 1
            red_sum[t] += row['red']

            green_n[t] += 1
            green_sum[t] += row['green']

            blue_n[t] += 1
            blue_sum[t] += row['blue']

        words += tokens

    fd = FreqDist(words)

    basic_color_terms = [c for (c, f) in fd.most_common(n)]
    drop = ['of', 'mist', 'sea', 'sweet', 'spring', 'ice', 'sky', 'light', 'garden', 'stone', 'deep',
    'golden', 'dark', 'pale', 'soft', 'the', 'fresh', 'mountain', 'sage', 'desert']
    basic_color_terms = [color for color in basic_color_terms if color not in drop]

    #Take the average R, G, B values for each of the basic color terms
    for basic_color_term in basic_color_terms:
        r = red_sum[basic_color_term]/red_n[basic_color_term]
        g = green_sum[basic_color_term]/green_n[basic_color_term]
        b = blue_sum[basic_color_term]/blue_n[basic_color_term]
        newColor = Color(basic_color_term, r, g, b)

        #Add related words to the basic color object
        relatedWords = findRelatedWords(basic_color_term)

        newColor.setRelatedWords(relatedWords)

        results.append(newColor)

    return results
Example #37
    def print_top_words_from_text(self, text):
        print("\n<b>Top 10 words</b>")
        tokens = self.tokenize_without_punctuation(text)

        nltkText = nltk.Text(tokens)
        fdist = FreqDist(nltkText)
        print("<ul>")
        for word, frequency in fdist.most_common(10):
            print("  <li>")
            print("     %s : <b>%d</b>" % (word, frequency))
            print("  </li>")
        print("</ul>")
Example #38
def getMostCommonWords(allTweets):
    regex = re.compile(r'[^\w ]')
    string = ''
    for tweet in allTweets:
        string +=  (" " + tweet.text)
    string = string.lower()
    string = regex.sub('', string)
    tokens = pos_tag(word_tokenize(string))
    tokens = cleanUpPosTags(tokens)
    # print(tokens)
    fdist = FreqDist(tokens)
    return fdist.most_common(100)
Example #39
def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO Add persistence
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(100))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
    return classifier
Example #40
def assign_relations(dataset, name="cross_relations"):
    num_rows, num_cols = len(dataset), len(dataset[0])
    cross_relations = {}
    for u, v in product(xrange(num_cols), repeat=2):
        if u == v:
            continue

        cross_sents = []
        deduplicated_set = set()

        for i, row in enumerate(dataset):
            if isinstance(row[u], TEntry) and isinstance(row[v], TEntry):
                for reln in row.attrs["cross_relations"]:
                    if (row[u].text.lower() in set(obj.lower()
                                                   for obj in reln.subjects)
                            and row[v].text.lower() in set(
                                obj.lower() for obj in reln.objects)):
                        if (reln.doc_name, reln.sent_num) in deduplicated_set:
                            continue
                        deduplicated_set.add((reln.doc_name, reln.sent_num))
                        cross_sents.append(reln.sent)

        freq_dict = FreqDist([
            token["lemma"].lower() for tokens in cross_sents
            for token in tokens if token["word"].lower() not in STOPWORDS
            and token["pos"].startswith("VB")
        ])

        verb_set = {word for word, _ in freq_dict.most_common()[:5]}

        cross_reln = []
        deduplicated_set = set()
        for i, row in enumerate(dataset):
            if isinstance(row[u], TEntry) and isinstance(row[v], TEntry):
                for reln in row.attrs["cross_relations"]:
                    if (row[u].text.lower() in set(obj.lower()
                                                   for obj in reln.subjects)
                            and row[v].text.lower() in set(
                                obj.lower() for obj in reln.objects)):
                        if (reln.doc_name, reln.sent_num) in deduplicated_set:
                            continue
                        deduplicated_set.add((reln.doc_name, reln.sent_num))

                        accepted = False
                        for token in reln.sent[slice(*reln.relation_span)]:
                            if token["lemma"] in verb_set and token[
                                    "pos"].startswith("V"):
                                accepted = True
                        if accepted:
                            cross_reln.append((reln, reln.relation))

        cross_relations[u, v] = cross_reln
    dataset.attrs[name] = cross_relations
Example #41
def new_terms(texts):
    #df=pd.read_csv(csv_file)
    #texts=list(df.iloc[:,0])

    tokens = preprocess(texts)
    tokens_join = list(chain.from_iterable(tokens))
    #print('tokens')
    #print(tokens_join)
    fdist = FreqDist(str(w) for w in tokens_join)

    mostCommon = fdist.most_common(100)
    return mostCommon
Example #42
def get_token_list(df, col, freq=False):
    """Takes in a DataFrame and column that contains tokenized texts 
       and returns a list containing all the tokens (including duplicates) from that 
       column. If freq=True, the function will also print out the number of 
       unique tokens and the top 25 most common words as well as their counts based
       on nltk's FreqDist.
       
    Args:
        df (Pandas DataFrame): DataFrame from which to obtain tokenized text.
        col (str): Name of the column that contains the text to tokenize.
        freq (bool, default=False): Whether to print summary of token list.
    
    Returns:
        iterable: List of tokens as strings.
    
    Example:
        >>> df = pd.DataFrame({'numbers': [2, 4],
                   'text': [['an', 'example'],
                           ['another', 'example']]})

        >>> example_tokens = get_token_list(df, col='text', freq=True)
        >>> example_tokens
        
        ********** text Summary **********

        Number of unique words = 3
        token   count
        0   example 2
        1   an      1
        2   another 1
        
        ['an', 'example', 'another', 'example']
    
       """

    import pandas as pd
    from nltk import FreqDist

    ## Create list of all tokens
    tokens = []
    for text in df[col].to_list():
        tokens.extend(text)

    if freq:
        # Make a FreqDist from token list
        fd = FreqDist(tokens)

        # Display length of the FreqDist (# of unique tokens) and 25 most common words
        print('\n********** {} Summary **********\n'.format(col))
        print('Number of unique words = {}'.format(len(fd)))
        display(pd.DataFrame(fd.most_common(25), columns=['token', 'count']))

    return tokens
def word_freq(corpus, article):
    wordfreq = []
    wf = FreqDist(corpus)

    for word, freq in wf.most_common():
        # (word-frequency / body-tokens-length ) * 100
        rel = (freq / len(corpus)) * 100
        wwf = word, freq, rel

        wordfreq.append(wwf)

    return wordfreq
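
A brief usage sketch for word_freq above (the second argument is not used inside the function body):

# Hedged usage sketch: each entry is (word, count, percentage of the corpus).
corpus = ["a", "b", "a", "a"]
for word, count, rel in word_freq(corpus, article=None):
    print(word, count, rel)  # prints: a 3 75.0, then b 1 25.0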
Example #44
    def lr_metric(self, word, top5):
        num_neg = 0
        num_pos = 0

        if top5:
            freq = FreqDist([w for rew in self.clean_neg for w in rew])
            best_neg, _ = zip(*freq.most_common(5))
            freq = FreqDist([w for rew in self.clean_pos for w in rew])
            best_pos, _ = zip(*freq.most_common(5))
            print("Top 5 words used in negative reviews:")
            print(best_neg)
            print("Top 5 words used in positive reviews:")
            print(best_pos)

        num_neg = self.neg_counts.get(word, 0)
        num_pos = self.pos_counts.get(word, 0)

        if num_pos >= 10 and num_neg >= 10:
            return float(num_pos / num_neg)
        else:
            return -1
Example #45
 def create_vocabulary(self, column, num_words_per_text):
     vocabulary = []
     for lemmatized_text in self.data[column]:
         freq_dist = FreqDist(lemmatized_text)
         count = 0
         for word, times in freq_dist.most_common(50):
             if count == num_words_per_text:
                 break
             elif word not in vocabulary and times > 1:
                 vocabulary.append(word)
                 count += 1
     self.vocabulary = vocabulary
Example #46
def extract_ngrams(text,
                   low=1,
                   high=2,
                   lowercase=False,
                   filter_punctuation=True,
                   binary=False,
                   least_common=None,
                   most_common=None,
                   normalize=False,
                   sample=False):
    #text = ' '.join(review.paragraphs)
    tokens = None

    # Make lowercase
    if lowercase:
        tokens = word_tokenize(text.lower())
    else:
        tokens = word_tokenize(text)

    # Remove Punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = [t for t in tokens]

    # Do the N Gram Thing
    ngram_counts = {}
    assert not (
        sample and binary
    ), "Please don't make sample and binary True. One or the other or neither pls"
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                    'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            grams_to_consider = []
            for bleh in ngram_freqdist.most_common()[-1 * num_least_common:]:
                gram, count = bleh
                grams_to_consider.append(gram)
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]
    if normalize:
        total_counts = sum(count for ngram, count in ngram_counts.items())
        for gram, count in ngram_counts.items():
            ngram_counts[gram] = count / total_counts
    return ngram_counts
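
A usage sketch for extract_ngrams above (it assumes a PUNCTUATION collection, such as string.punctuation, is defined alongside the function, as the code requires; the keys of the returned dict are n-gram tuples):

# Hedged usage sketch: unigram and bigram counts, normalized over all extracted n-grams.
text = "The cat sat on the mat . The cat slept ."
counts = extract_ngrams(text, low=1, high=2, lowercase=True, normalize=True)
print(counts[('the',)])        # normalized count of the unigram "the"
print(counts[('the', 'cat')])  # normalized count of the bigram "the cat"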
Example #47
def RetornNucleo(frase):
    # Load the Portuguese spaCy model
    nlp = spacy.load("pt")

    # Run the sentence through the spaCy model
    doc = nlp(frase)

    # without the irrelevant (stop) words
    frase2 = principal.RemoverIrrelevantes(frase)
    doc2 = nlp(frase2)

    # Build the dependency dataset
    dep_palavra = []
    for token in doc:
        while token.head != token:
            x = token.text
            dep_palavra.append(x)
            token = token.head

    # Build the dependency dataset without the irrelevant words
    dep_PalvraIrr = []
    if len(frase2.split()) > 1:
        for token in doc2:
            while token.head != token:
                x = token.text
                dep_PalvraIrr.append(x)
                token = token.head
    else:
        dep_PalvraIrr.append(frase2)

    maiorfreq = FreqDist(item for item in dep_palavra)
    maiorfreq2 = FreqDist(item for item in dep_PalvraIrr)

    mf1 = maiorfreq.most_common(1)
    mf2 = maiorfreq2.most_common(1)

    if mf1 == mf2:
        return mf1
    else:
        return ValidarNucleos(frase, mf1[0][0], mf2[0][0])
Example #48
def plt_keyword_frequency(title='{} most frequent keywords',
                          x_label='Keyword',
                          y_label='Frequency',
                          limit=20,
                          save=False,
                          is_long_title=False,
                          file='plt_keyword_frequency.png',
                          fig_size=PLOT_DIMENSIONS):
    """" Display a plot that shows the most frequent keywords. The number of words is determined by the limit. """

    # set the style
    sns.set(style=SN_STYLE, font=FONT_NAME)

    plt.figure(figsize=fig_size)

    title = title.format(limit)

    # get data
    df = roll.roll_with_entities_df()

    # filter out rows without keywords
    df_keywords = df[df[common.KEYWORDS_COL].notnull()]

    # get keywords as a list
    all_keywords = df_keywords[common.KEYWORDS_COL].to_list()

    # remove the semicolon delimiter
    kw = []
    for k in all_keywords:
        for i in k.split(';'):
            kw.append(i)

    kw_freq = FreqDist(kw)

    # create a pandas as df
    kw_df = pd.DataFrame(kw_freq.most_common(limit),
                         columns=['Word', 'Frequency']).set_index('Word')

    sns.set()

    kw_df.plot(kind='bar', legend=None)
    plt.xticks(fontsize=10, fontname=FONT_NAME)
    plt.yticks(fontsize=10, fontname=FONT_NAME)

    # plot labels
    set_labels(x_label, y_label)

    # plot title
    title_text(title, is_long_title)

    # show or save the image to file
    save_or_show(save=save, plot_file_name=file)
Example #49
def load_words(source, vocab_size=10000, limit=None, max_length=None):
    """
    Loads sentences (or other natural language sequences) from a text file. Assumes a single sequence per line.

    :param source: Text file to read from
    :param vocab_size: Maximum number of words to retain. If there are more unique words than this, the most frequent
        "vocab_size" words are used, and the rest are replaced by the <UNK> symbol
    :param limit: If not None, only the first "limit" characters are read. Useful for debugging on large corpora.
    :param max_length: If not None, any sentence containing more words than this is removed.

    :return: (1) A list of lists of integers representing the encoded sentences, (2) a dict from strings to ints
        representing the mapping from words to indices, and (3) a list of strings representing the mapping from
        indices to words.
    """

    # Reading raw text from source and destination files
    f = open(source, 'r')
    x_data = f.read()
    f.close()

    print('raw data read')

    if limit is not None:
        x_data = x_data[:limit]

    # Splitting raw text into array of sequences
    x = [text_to_word_sequence(x) for x in x_data.split('\n') if len(x) > 0]

    if max_length is not None:
        x = [s for s in x if len(s) <= max_length]

    # Creating the vocabulary set with the most common words (leaving room for PAD, START, UNK)
    dist = FreqDist(np.hstack(x))
    x_vocab = dist.most_common(vocab_size - len(EXTRA_SYMBOLS))

    # Creating an array of words from the vocabulary set, we will use this array as index-to-word dictionary
    i2w = [word[0] for word in x_vocab]
    # Adding the word "ZERO" to the beginning of the array
    i2w = EXTRA_SYMBOLS + i2w

    # Creating the word-to-index dictionary from the array created above
    w2i = {word: ix for ix, word in enumerate(i2w)}

    # Converting each word to its index value
    for i, sentence in enumerate(x):
        for j, word in enumerate(sentence):
            if word in w2i:
                x[i][j] = w2i[word]
            else:
                x[i][j] = w2i['<UNK>']

    return x, w2i, i2w
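
A hedged usage sketch for load_words above (it assumes EXTRA_SYMBOLS contains '<UNK>' and that keras' text_to_word_sequence is imported alongside the function; corpus.txt is a hypothetical one-sentence-per-line file):

# Hedged usage sketch: encode a corpus and inspect the vocabulary it produced.
sentences, w2i, i2w = load_words('corpus.txt', vocab_size=5000, max_length=30)
print(len(i2w))      # vocabulary size, including the extra symbols
print(sentences[0])  # first sentence encoded as a list of word indices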
Example #50
def text_similarity():
    """
    Text similarity
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    text = text1 + text2 + text3 + text4 + text5
    words = nltk.word_tokenize(text)
    freq_dist = FreqDist(words)
    print(freq_dist['is'])

    # Take the n = 5 most common words
    n = 5
    # Build the "common words" list
    most_common_words = freq_dist.most_common(n)
    print(most_common_words)

    def lookup_pos(most_common_words):
        """
            Look up the positions of the common words
        """
        result = {}
        pos = 0
        for word in most_common_words:
            result[word[0]] = pos
            pos += 1
        return result

    # Record the positions
    std_pos_dict = lookup_pos(most_common_words)
    print(std_pos_dict)

    # New text
    new_text = 'That one is a good movie. This is so good!'

    # Initialize the frequency vector
    freq_vec = [0] * n

    # Tokenize
    new_words = nltk.word_tokenize(new_text)

    # Count frequencies over the "common words" list
    for new_word in new_words:
        if new_word in list(std_pos_dict.keys()):
            freq_vec[std_pos_dict[new_word]] += 1

    print(freq_vec)
Example #51
def analyzeAuthors():

    authors = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            authors.append(row[4])

    authorset = nltk.Text(authors)
    fd = FreqDist(authorset)
    prolific = fd.most_common(10)
    for k, v in prolific:
        print str(k) + "\t" + str(v)
Example #52
def removeStopwords(t, n, label="No label"):
    frequency = FreqDist(t)
    s1 = frequency.most_common(n)
    s1 = [x[0] for x in s1]
##
##    print "Top "+str(n)+" stopwords of "+label
##    printList(stopwords)
##    
##    new_text = [x for x in t if x not in stopwords]
##    return new_text
    s2 = stopwords.words("english")
    s = set(s1+s2)
    return [x for x in t if x not in s]
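
A short usage sketch for removeStopwords above (assuming the nltk stopwords corpus is available):

# Hedged usage sketch: drops the top-1 corpus word ("market") plus standard English stopwords.
tokens = ["market", "prices", "rose", "and", "market", "fell", "market"]
print(removeStopwords(tokens, 1, label="news"))  # expected: ['prices', 'rose', 'fell']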
    def ViewWords(self, event):
	self.processing.SetLabel('Processing....')
	global filePath 
	if filePath == "":
		self.OutLabel.SetLabel("Output\n\nPlease select file.") 
		return
	stopwords = nltk.corpus.stopwords.words('english')
	f = open(filePath, 'r').read()
	tokens	= nltk.word_tokenize(f)
	words	= [w.lower() for w in tokens]
	alphawords = [w for w in words if not alpha_filter(w)]
	stoppedwords = [w for w in alphawords if not w in stopwords]
	fdist	= FreqDist(stoppedwords)
	inp = self.editname.GetValue()
	if inp == '' or inp == 0 or inp == None:
		inp =10000 
	val = int(inp)
	outstr = "Output : \nWord\t\t --> \t Frequency\n"
	self.OutLabel.SetLabel(str(fdist.most_common(val)))
	for word in fdist.most_common(val):
		print	word 
		outstr = outstr +str(word[0]).ljust(40)+ str(word[1])+"\n"
	self.OutLabel.SetLabel(str(outstr))
	self.processing.SetLabel('Done.')
Example #54
def getDict(filename) :
	fileComment = open(filename , 'rU')
	# our data is in latin-1 format
	rawData = fileComment.read().decode('latin-1')
	tokens = word_tokenize(rawData)

	# remove the common and non-alpha tokens
	ignoredWords = stopwords.words('english')
	filterNoAlpha = [i.lower() for i in tokens if i.isalpha()]
	filtered_words = [i for i in filterNoAlpha if i not in ignoredWords]

	fdist1 = FreqDist(filtered_words)
	final = fdist1.most_common(200)

	fileComment.close()
	return dict(final)
Example #55
def processTexts(processedListOfTexts):
    #FreqDist
    print('\n--------------------------------------------------\nCompute most frequent indicative words')
    #requires tokenized nltk/string. Also can use ' '.join(processedListOfTexts)
    fullDist = FreqDist(str(word) for word in processedListOfTexts)
    #Note: most_common(50) returns (word, count) pairs sorted by frequency, highest first.
    top=[word[0] for word in fullDist.most_common(50) if word[0] not in stopWordSet]
    for word in top:
        print(word)
    relevantNouns=[]
    print('\n--------------------------------------------------\nRelevant Nouns:')
    for word in top:
        if word in nounList:
            relevantNouns.append(word)
            print(word)
Example #56
def common_tri(textt):
    word=word_tokenize(textt)
    fdist=FreqDist(trigrams(word))
    h=fdist.most_common(1)
    h=str(h)
    trans=maketrans(symbols,whitespace)
    x= h.translate(trans)
    x = x.strip()
    print(x)
    
## The function operates on trigrams; a simple adjustment can be made to use any other
## number of terms (n-grams).
Example #57
    def get_palavras_frequentes(self):
        """Documentar.
        """
        if self._palavras_frequentes is None:

            print "-- Verificando as palavras mais frequentes do corpus."

            # Test - return only the 2000 most frequent words in the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes
Example #58
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after bus service
    # check the freq of words before and after of word (number) which is non bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'before_svc':
                        if k > 0:  # bus svc doesn't appear at the first position of sentences
                            text = text + split_first[k - 1].lower() + ' '  # take the word before
                print i, k, split_first[k]

            if command == 'after_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'after_svc':
                        if k < len(split_second) - 1:
                            text = text + split_first[k + 1].lower() + ' '  # take the word after

            if command == 'before_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k > 0:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k - 1].lower() + ' '

            if command == 'after_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k + 1].lower() + ' '

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]

    print text
Example #59
    def featuresextr(self, set_name='featurespace.csv'):
        new_set = []
        for tweet_tokens, features, weight in self.new_tweetset:  # features are pending till performance methodology
            #  print '\n{}'.format(tweet_tokens)
            total_tokens = len(tweet_tokens)
            frequencies = FreqDist(tweet_tokens)
            words_tfidf = [(t_word, round(self.termfrequency(count_w, total_tokens) * self.inversdocfreq(t_word), 2))
                           for t_word, count_w in frequencies.most_common()]
            tfidf_vector = tuple([value for unigram, value in words_tfidf])

            feat_bigrams = self.posbigrams(tweet_tokens)
            ortony_occur = self.wordsoccurrences(tweet_tokens)
            profane_occur = self.wordsoccurrences(tweet_tokens, option='profane')
            preprocessed_twits = self.new_tweetset[:, 0]
            guardar_csv(preprocessed_twits, 'recursos/processed_twits_slang.csv')
            new_set.append((sum(tfidf_vector), ortony_occur, profane_occur, sum(feat_bigrams), weight))
        guardar_csv(new_set, 'recursos/{}'.format(set_name))
        self.features_space = np.array(new_set)
Example #60
def create_document_properties(cat, doc):
    words = [clean_string(x) for x in corpus.words(fileids=doc) if clean_string(x)!='']
    tot_words = len(words)
    sentences = len(corpus.sents(fileids=doc))
    word_sent = [len(x) for x in corpus.sents(doc)]
    avg_word = sum(word_sent)/len(word_sent)
    char_word = [len(x) for x in words]
    avg_char = sum(char_word)/len(char_word)
    words_ex_stop = [x for x in words if x not in stop]
    freq_dist = FreqDist(words_ex_stop)
    common = freq_dist.most_common(5)
    return (
        {
            "category": cat, "doc": doc,
            "tot_words": tot_words, "avg_char": avg_char,
            "sentences": sentences, "avg_word": avg_word,
            "most_common": common[0][0], "most_common_freq": common[0][1]
        }
    )