Example #1
    def store_freqdists(self):
        """
        Build NLTK frequency distributions based on feature counts and store them to Redis.
        """
        #TODO: this step and the above may possibly be combined

        word_fd = FreqDist()
        label_word_freqdist = ConditionalFreqDist()

        pos_words = self.r.zrange('positive_wordcounts', 0, -1, withscores=True, desc=True)
        neg_words = self.r.zrange('negative_wordcounts', 0, -1, withscores=True, desc=True)

        assert pos_words and neg_words, 'Requires wordcounts to be stored in redis.'

        #build a conditional freqdist with the feature counts per label
        for word, count in pos_words:
            word_fd.inc(word, count)
            label_word_freqdist['positive'].inc(word, count)

        for word,count in neg_words:
            word_fd.inc(word, count)
            label_word_freqdist['negative'].inc(word, count)

        self.pickle_store('word_fd', word_fd)
        self.pickle_store('label_fd', label_word_freqdist)
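
The method above relies on a pickle_store helper that is not shown; a minimal sketch (hypothetical implementation, assuming self.r is a redis-py client) could serialize each distribution under a plain key:

    import pickle

    def pickle_store(self, name, obj):
        # serialize the (Conditional)FreqDist and keep it in Redis under `name`
        self.r.set(name, pickle.dumps(obj))

    def pickle_load(self, name):
        # inverse helper: fetch and deserialize a stored distribution
        data = self.r.get(name)
        return pickle.loads(data) if data is not None else None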
Example #2
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}

    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd.inc(word)
            cond_fd[cat].inc(word)

        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.iteritems():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(
                cond_fd[cat][word],
                (freq, word_counts[cat]),
                total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.iteritems(),
                               key=lambda (w, s): s,
                               reverse=True)[:count]
    return set([w for w, s in informative_words])
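
A usage sketch: speech_corpus below stands in for any categorized corpus reader exposing 'dem' and 'rep' categories, and the helper name is illustrative:

best_words = most_informative_words(speech_corpus, categories=['dem', 'rep'], count=2500)

def document_features(words):
    # keep only the high-information words as binary features
    return dict((word, True) for word in words if word in best_words)
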
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:,i] = v

    return matrix
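
For reference, a call might look like this; FrequencyMetrics and preprocess come from the surrounding project, so the snippet is only illustrative:

docs = ["the cat sat on the mat", "the dog chased the cat"]
# rows are vocabulary terms, columns are documents
matrix = text_to_vector(docs, FrequencyMetrics.TF_IDF)
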
def create_word_scores(posWords,negWords,posTag,negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools 
    posWords = list(itertools.chain(*posWords)) # flatten the nested list of positive words into one list
    negWords = list(itertools.chain(*negWords)) # same for the negative words

    word_fd = FreqDist() # frequency of every word
    cond_word_fd = ConditionalFreqDist() # word frequencies within the positive and the negative texts
    for word in posWords:
        #help(FreqDist)
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd[posTag][word]+= 1#cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd[negTag][word]+= 1#cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd[posTag].N() # number of positive word tokens
    neg_word_count = cond_word_fd[negTag].N() # number of negative word tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count) # chi-square score of the word for the positive class (mutual information or other measures could be used instead)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count) # likewise for the negative class
        word_scores[word] = pos_score + neg_score # a word's informativeness is the sum of its positive and negative chi-square scores

    return word_scores # maps each word to its informativeness
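
The scores returned above are usually cut down to the top N words before building classifier features; a small companion helper (hypothetical name) in current Python would be:

def find_best_words(word_scores, number):
    # sort by informativeness and keep the `number` highest-scoring words
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)
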
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()
	
	for label, words in labelled_words:
		for word in words:
			word_fd.inc(word)
			label_word_fd[label].inc(word)
	
	n_xx = label_word_fd.N()
	high_info_words = set()
	
	for label in label_word_fd.conditions():
		n_xi = label_word_fd[label].N()
		word_scores = collections.defaultdict(int)
		
		for word, n_ii in label_word_fd[label].iteritems():
			n_ix = word_fd[word]
			score = score_fn(n_ii, (n_ix, n_xi), n_xx)
			word_scores[word] = score
		
		bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
		high_info_words |= set(bestwords)
	
	return high_info_words
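
A usage sketch, assuming NLTK's movie_reviews corpus as the labelled input (note that the function body uses the pre-3.0 FreqDist.inc() API, so it runs as-is only on older NLTK releases):

from nltk.corpus import movie_reviews

labelled = [('pos', movie_reviews.words(categories=['pos'])),
            ('neg', movie_reviews.words(categories=['neg']))]
best = high_information_words(labelled, min_score=5)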
Example #7
def summarize(self, input, num_sentences ):
                s=[]
                punt_list=['.',',','!','?']
                summ_sentences = []
                sentences=input
                #sentences = sent_tokenize(input)
                lowercase_sentences =[sentence.lower() 
                        for sentence in sentences]
                #print lowercase_sentences
                saito=' '.join(sentences)
                s=input
                ts=''.join([ o for o in s if not o in  punt_list ]).split()
                lowercase_words=[word.lower() for word in ts]
                words = [word for word in lowercase_words if word not in stopwords.words()]
                word_frequencies = FreqDist(words)
                
                most_frequent_words = [pair[0] for pair in 
                        word_frequencies.items()[:100]]

                # add sentences with the most frequent words
                if(len(s) < num_sentences):
                    num_sentences=len(s)
                for word in most_frequent_words:
                    for i in range(len(lowercase_sentences)):
                        if len(summ_sentences) < num_sentences:
                            if (lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]):
                                summ_sentences.append(lowercase_sentences[i])
                        else:
                            break
                    if len(summ_sentences) >= num_sentences:
                        break
                        
                # reorder the selected sentences
                summ_sentences.sort( lambda s1, s2: saito.find(s1) - saito.find(s2) )
                return summ_sentences
def get_top_words(directory, n, file):
	num_docs = 0.0
	flist = {}
	result = {}
	for f in os.listdir(directory):
		#stop = "/Users/oliverfengpet/Dropbox/TwitterAffect/stoplist.txt"
		
		num_docs+=1
		rawContents = load_file_tokens(directory+'/'+f)
		fdist = FreqDist( rawContents )
		normalF = max(fdist.values())
		
		for key in fdist.keys():
			fdist[key]=float(float(fdist[key])/normalF)
	
		flist[directory+'/'+f] = fdist
		
		
	for key in flist[file].keys():
		num_appear=0
		for key_file in flist.keys():
			if key in flist[key_file].keys():
				num_appear+=1
		
		result[key] = flist[file][key]*math.log(num_docs/(num_appear))
	
	sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1),reverse=True)
	
	top_x = sorted_x[:n]
	result = []
	
	for item in top_x:
		result.append(item[0])
	
	return result
Example #9
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        :return: the trained model
        :rtype: HiddenMarkovModelTagger
        :param labelled_sequences: the training data, a set of
            labelled sequences of observations
        :type labelled_sequences: list
        :param kwargs: may include an 'estimator' parameter, a function taking
            a FreqDist and a number of bins and returning a CProbDistI;
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
        known_states = set(self._states)

        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting.inc(state)
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in known_states:
                    self._states.append(state)
                    known_states.add(state)

                if symbol not in known_symbols:
                    self._symbols.append(symbol)
                    known_symbols.add(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
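
The 'estimator' keyword mentioned in the docstring is how smoothing is plugged in; a sketch using Lidstone smoothing (tagset, vocabulary and tagged_sentences are placeholders for your own data):

from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

trainer = HiddenMarkovModelTrainer(states=tagset, symbols=vocabulary)
model = trainer.train_supervised(
    tagged_sentences,
    estimator=lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins))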
Example #10
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first

        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """

        for ii in [x.lower() for x in words if x.lower() not in self._stop \
                       and len(x) >= self._min_length]:
            self._counts.inc(ii)

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        if len(self._counts) > self._cut_first + size:
            return self._counts.keys()[self._cut_first:(size + self._cut_first)]
        else:
            return self._counts.keys()[:size]
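
A brief usage sketch (the sentence is illustrative; note that scan() and vocab() rely on the pre-3.0 FreqDist API, i.e. inc() and frequency-sorted keys()):

builder = VocabBuilder(lang="english", min_length=3, cut_first=100)
builder.scan("the quick brown fox jumps over the lazy dog".split())
print(builder.vocab(size=5000))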
Example #11
def make_summary( text):
	sent = []
	stemmed = []
	tokens = word_tokenize(text)
	sent = sent_tokenize(text)
	for token in tokens:
		if token in stopwords.words('english'):
			tokens.remove(token)
	stemmer = PorterStemmer()

	for token in tokens:
	 	stemmed.append(stemmer.stem(token))
#freq(stemmed)
	for word in stemmed:
		word.lower()
	word_freq = FreqDist(stemmed)

	most_freq_words = [pair[0] for pair in word_freq.items()[:60]]

	working_sent = [sentence.lower() for sentence in sent]

	out_sent = []

	for word in most_freq_words:
		for i in range(0,len(working_sent)):
			if (word in working_sent[i] and sent[i] not in out_sent):
				out_sent.append(sent[i])
				break
			if len(out_sent) >= 5:
			 	break
		
		if len(out_sent) >= 5:
			break

	return reorder(out_sent,text)
Example #12
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")
def most_frequent_words(path,top):
    root_path = "./"+path;
    writers = os.listdir(root_path);
    word_set = set();
    for writer in writers:
        if writer.find(".") != -1:
            continue;
        inside_folder = root_path + "//" +writer;
        files = os.listdir(inside_folder);
        formated_text = "";
        for file in files:
            file_path = root_path + "//" +writer+"//"+ file;
            fw = open(file_path,"r",encoding="utf8");
            article = fw.read();
            #print(article);
            formated_text+=" ";
            formated_text += formatText(article);
            fw.close();

        words = get_bigrams(formated_text);
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and isEnglish(w) == False and w != "``");
        keys = fdist.most_common(top);
        for key in keys:
            #print(str(key[0]) + " , " + str(key[1]) + "\n");
            word_set.add(key[0]);
    print(word_set);
    fw = open("./Features/Bigrams.csv","w",encoding="utf8");
    for word in word_set:
        fw.write(word);
        fw.write("\n");
    fw.close();
Example #14
 def mostCommWords(self, tag, pos_tag_pattern):
     """
         This is a help method for mostCommNouns and mostCommVerbs.
         Argument:   tag --  a hashtag that we want to compute the most commonly hashtag with
                     pos_tag_pattern
                         --  the regular expression that used to match the POS tags
         return:     a list of the top 20 nouns associated with the input hashtag
         """
     words={}
     topTwenty=[]
     j = 0
     for line in self.lines:
         hasTag = False
         for t in self.tokenizer(line, hashtag_pattern):
             if t == tag:
                 hasTag = True
                 break
         if hasTag:
             counts = FreqDist()
             tokens = self.tokenizer(line, word_pattern)
             pos = nltk.pos_tag(tokens)
             for p in pos:
                 if re.match(pos_tag_pattern,p[1]):
                     counts.inc(p[0])
             for n in counts.keys():
                 if words.has_key(n):
                     words[n] = words[n]+counts[n]
                 else:
                     words[n] = counts[n]
     words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
     for i in range(0,20):
         topTwenty.append(words_sorted_by_counts[i][0])
     
     return topTwenty
    def train_MLT(self, tagged_train_data, untagged_training_data):
        """
        Builds a most likely tag tagger from the given tagged training data as WORDS
        :param train_data:
        :return: model
        """
        # find the set of words
        words = set()
        for sent in untagged_training_data:
            for word in sent:
                words.add(word)
        # Define mlt_dict of format {word1:{(word1,tag1):count1, (word1, tag2):count2 ........},..........}
        mlt_dict = dict()
        # Initialize keys and values to it
        for word in words:
            mlt_dict[word] = dict()
        # Compute the freq dist of tagged words
        tagged_words_fdist = FreqDist(tagged_train_data)

        for tagged_word, count in tagged_words_fdist.items():
            (mlt_dict[tagged_word[0]])[tagged_word] = count

        # Update the dict to contain the most likely tag for each word
        #for word, inside_dict in mlt_dict.items():
        #   max_val = max(inside_dict.values())
        #    inside_dict =
        print("Training is done!")
        return mlt_dict
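
The commented-out block hints at collapsing each word's tag counts down to a single most likely tag; one way to finish that step (a sketch, not the author's code) is:

def most_likely_tags(mlt_dict):
    # for every word, keep only the tag of its highest-count (word, tag) pair
    model = {}
    for word, tag_counts in mlt_dict.items():
        if tag_counts:
            (w, tag), _count = max(tag_counts.items(), key=lambda item: item[1])
            model[word] = tag
    return model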
Example #16
def fun10():
    """frequency distribution"""
    fdist1 = FreqDist(text1)
    # print fdist1
    vocabulary1 = fdist1.keys()
    # print vocabulary1[:50]
    fdist1.plot(50, cumulative=True)
        def GetHighInformationWordsChi(num_bestwords):
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
 
            for word in movie_reviews.words(categories=['pos']):
                word_fd[word.lower()] +=1
                label_word_fd['pos'][word.lower()] +=1
 
            for word in movie_reviews.words(categories=['neg']):
                word_fd[word.lower()] +=1
                label_word_fd['neg'][word.lower()] +=1
 
            pos_word_count = label_word_fd['pos'].N()
            neg_word_count = label_word_fd['neg'].N()
            total_word_count = pos_word_count + neg_word_count
 
            word_scores = {}
 
            for word, freq in word_fd.iteritems():
                pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score
 
            best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:num_bestwords]
            bestwords = set([w for w, s in best])
            return bestwords
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        nlist4 = []
        vlen = len(valid_word);
        for i in range(0,vlen-3):
            nlist4.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2] + " " +valid_word[i+3])

        fdist = FreqDist(w for w in nlist4)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0])+ "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example #19
def get_frequency(data_file,all_vocab):
    
    input_file = open(data_file, "r")
    input_file_contents = input_file.read()

    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)

    output_file = open("../Training/vocab_freq.txt", "w")
    

    for word, frequency in fdist.most_common(4000):        
        if word in all_vocab  and word!='+' and word!='-':
            output_file.write(word + " : " + str(frequency) + "\n")        
            
    output_file.close()
    return 1

#data = "data.txt"
#stop_words = "stopwords.txt"

#accuracy= multinomial_naive_bayes_unigram(data, data, stop_words)
#print(accuracy)
#print("Separating Done!!")
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1 :
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name+"data.doc";
        fw = open(data_path,"r",encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);

        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"));
        myBig = []
        for bi in big:
            myBig.append(bi[0]+" "+bi[1]);

        fdist = FreqDist(str(w) for w in myBig);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq+= str(key[0]).strip()+","+str(key[1]).strip()+"\n";

        make_sure_path_exists(to_save_folder+folder)
        writer = open(to_save_folder+folder+"/"+folder+"[bigram_Freq].csv","w+",encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"];
        tri_list = [];
        vlen = len(valid_word);
        for i in range(0,vlen-2):
            tri_list.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2]);

        fdist = FreqDist(w for w in tri_list);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq += str(key[0]).strip()+ "," + str(key[1]).strip() + "\n";

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Triram_Freq].csv", "w+", encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Example #22
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
	def classify(self, feats):
		counts = FreqDist()
		
		for classifier in self._classifiers:
			counts.inc(classifier.classify(feats))
		
		return counts.max()
Example #24
def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)

    most_freq = (word for word, count in fd.most_common(limit))

    return dict((word, cfd[word].max()) for word in most_freq)
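
The resulting word-to-tag dictionary can seed NLTK's UnigramTagger as a likely-tag model; a sketch assuming the treebank corpus data is installed:

import nltk
from nltk.corpus import treebank

model = word_tag_model(treebank.words(), treebank.tagged_words(), limit=200)
tagger = nltk.UnigramTagger(model=model, backoff=nltk.DefaultTagger('NN'))
print(tagger.tag(['the', 'cat', 'sat']))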
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    
    # Lemmatize it
    word_freq = {}
    
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        
        # If it exist in word_freq, add value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
            
        # Else, assign value
        else:
            word_freq[lemmatize_term] = val
    
    
    return word_freq
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #27
	def choose_tag(self, tokens, index, history):
		tags = FreqDist()
		
		for tagger in self._taggers:
			tags.inc(tagger.choose_tag(tokens, index, history))
		
		return tags.max()
  def scores(self, docId):
    """
    Return the score from the given document to every other
    document in the index. Documents not listed are assumed
    to have no similarity detected by shared terms.

    :param docId: ID of doc to compare other docs to.
    :returns: A list of tuples of (document ID, similarity score).
      Larger scores are better.
    """
    if not self._idf:
      self._computeIdfs()
    # Track the scores
    #
    docScores = FreqDist()
    for termid, freq in self.termFrequencies[docId].iteritems():
      # Find the frequency with which this term appears in other documents.
      #
      inverseDocumentFrequency = self._idf[termid]
      for otherDocId in self.termsToDocuments[termid]:
        if otherDocId == docId:
          # Skip this document
          continue
        # Find the term frequency of the term in the other document. 
        #
        otherFreq = self.termFrequencies[otherDocId][termid]
        # Score proportional to product of frequencies times the inverse of
        # the document frequency.
        #
        docScores.inc(otherDocId, freq * otherFreq * inverseDocumentFrequency)

    return docScores
def create_word_scores(posWords, negWords):
    file_scores = file("cn_sample_data/scores.txt", "w")
    # iterate and merge the word sequences into one
    
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    sorted(word_scores.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    for key in word_scores:
        file_scores.write(str(key)+" : " + str(word_scores[str(key)])+ "\n")
    file_scores.close()
    return word_scores 
Example #30
def train(labeled_featuresets, estimator=ELEProbDist):
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                feature_freqdist[label, fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples-count)
                feature_values[fname].add(None)

        label_probdist = estimator(label_freqdist)

        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
    #This step is needed to collapse runs of space characters into one
    text = ' '.join(text.split())
    """
    spans = TOKENIZER.span_tokenize(text)
    tokens = (text[begin : end] for (begin, end) in spans)
    """

    tokens = TOKENIZER.tokenize(text)
    for ngram in ngrams(tokens, order):
        #Increment the count for the bigram. Automatically handles any
        #bigram not seen before. The join expression turns 2 separate
        #single‑character strings into one 2‑character string
        if '  ' not in ''.join(ngram):
            frequencies[''.join(ngram)] += 1

    return


if __name__ == '__main__':
    #Initialize the mapping
    frequencies = FreqDist()
    #The order of the ngrams is the first command line argument
    ngram_order = int(sys.argv[2])
    #Pull the input data from the console
    count_ngrams(frequencies, ngram_order)
    outputfp = open(sys.argv[3], 'w')
    json.dump(dict(frequencies), outputfp)
    print('Stored frequencies of {} encountered N‑grams.'.format(
        len(frequencies)))
Example #32
#for x in sentences:
#    print(x)
#    print("----")

#divide text into words, and print
words = word_tokenize(text)
#print(len(words))
#for x in words:
#    print(x)
#    print("----")

#############################
#Find the frequence of words in text
from nltk.probability import FreqDist

fdist = FreqDist(words)
#print the 10 most common words
mostCommon10 = fdist.most_common(10)
#for x in mostCommon10:
#    print(x)
#plot a graph of word distribution
import matplotlib.pyplot as plot
#fdist.plot(10)

############
#remove punctuation marks
words_no_punc = []
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())
#print(words_no_punc)
 def leaf(labeled_featuresets):
     label = FreqDist(label
                      for (featureset, label) in labeled_featuresets).max()
     return DecisionTreeClassifier(label)
from nltk.tokenize import word_tokenize

Tokens = word_tokenize(dataset)

#print (Tokens)

#No. of tokens in the dataset

len(Tokens)

#Freq of occurence of distinct elements

from nltk.probability import FreqDist

fdist = FreqDist()

for word in Tokens:

    fdist[word.lower()] += 1

fdist

fdist.plot(20)

#-------------------------Stemming----------------------------------------

from nltk.stem import PorterStemmer

pst = PorterStemmer()
Example #35
paragraph = input("Enter the paragraph \n")
para = ""
for i in paragraph:
    if not i in '.,!?():':
        para += i
print(para+ "\n \n \n")
stop = set(stopwords.words('english'))
listop = [i for i in para.lower().split() if i not in stop]
print(listop)
print("\n")
ps = nltk.stem.PorterStemmer()
listopandstem = [ps.stem(i) for i in listop]
print(listopandstem)
print("\n \n")
fdist =  FreqDist(listopandstem)
#print(fdist)
vowel = [word for word in listop if word[0] in 'aeiou']
m = term_freq(vowel)
print(m)
print("\n")
x = input("Enter search word")
print(m[x], x)
li_term = m.keys()
li_freq = m.values()

df = pd.DataFrame({'li_freq':li_freq,
                   'li_term':li_term})
 
writer = ExcelWriter('abc.xlsx')
df.to_excel(writer,'Sheet1',index=False)
counter = Counter()
word = []
filtered = [word for word in text if (len(word) >= 5 and word not in stopWords and word not in stops and word != int)]
counts = Counter(filtered)

pbar = pyprind.ProgBar(len([filtered]),
                       title='Counting word occurrences...')
word_counts = sorted(counts, key=counts.get, reverse=True)
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

counts.update(text)
print (counts.update(text))
pbar.update()

fdist = FreqDist(filtered)
vocab = fdist.keys()
print (vocab[:30])

mpl_fig = plt.figure(figsize=(12, 8))
ax = mpl_fig.add_subplot(111)

plt.title("Words with Highest Frequencies in Immunotherapy-related Tweets")
ax.set_xlabel('Term')
ax.set_ylabel('Frequency')
plt.xlabel("Term")
plt.ylabel("Frequency")

mpl_fig.tight_layout()
fdist.plot(40, cumulative=True)
mpl_fig.savefig("linechart.png")
Example #37
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
import matplotlib.pyplot as plt

matplotlib.use('TkAgg')
fd = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1
ranks = []
freqs = []
for rank, word in enumerate(fd):
    ranks.append(rank + 1)
    freqs.append(fd[word])
plt.loglog(ranks, freqs)
plt.xlabel('frequency(f)', fontsize=14, fontweight='bold')
plt.ylabel('rank(r)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
Example #38
def get_frequency(wordsSelected):
    return FreqDist(wordsSelected)
Example #39
    # request the result of the url
    response = requests.get(topStoriesURL).json()

    # store the result in a json file and read the file contents
    main_functions.save_to_file(response, "JSON_Files/topStories.json")
    topStoriesOutput = main_functions.read_from_file("JSON_Files/topStories.json")

    # the following block of code cleans up the list variable so only desirable words are left
    toProcess = ""
    for i in topStoriesOutput["results"]:
        toProcess = toProcess + i["abstract"]

    words = word_tokenize(toProcess)

    fdist = FreqDist(words)

    words_no_punc = []
    for w in words:
        if w.isalpha():
            words_no_punc.append(w.lower())

    fdist2 = FreqDist(words_no_punc)

    clean_words = []
    for w in words_no_punc:
        if w not in stopwords:
            clean_words.append(w)

    fdist3 = FreqDist(clean_words)
    return [[word for word in headline if word not in stop_words]
            for headline in headlines]


def export(data, data_name):
    csv_filename = data_name + ".csv"
    with open(csv_filename, 'w') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['name', 'count'])
        for row in data:
            csv_out.writerow(row)


input_filename = "abcnews-date-text.csv"
output_filename = "list_data_conflicts"

df = pd.read_csv(input_filename)
headlines = list(df["headline_text"].as_matrix())

keywords = ["refugee", "cyber"]

tokenized_headlines = tokenize_headlines(headlines)

for keyword in keywords:
    filtered_headlines = remove_stopwords(
        filter_headlines(tokenized_headlines, [keyword]))
    tokens = list(itertools.chain.from_iterable(filtered_headlines))
    distribution = FreqDist(tokens)
    most_common = distribution.most_common(70)
    export(data=most_common, data_name=keyword)
Example #41
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

with open("C:/Users/David/Documents/GIT/ENRON/master/arnold-j_mails.txt",
          "r",
          encoding="utf-8") as file:
    text = file.read()

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
text = lemmatizer.lemmatize(text)
liste = tokenizer.tokenize(text)
fdist = FreqDist(liste)
top = fdist.most_common(5000)
print((top[5])[1])

with open("C:/Users/David/Documents/GIT/ENRON/master/nb_mots.txt",
          "w",
          encoding="utf-8") as file:
    file.write(str(top))
Example #42
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''
    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        '''

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = ['BOS', 'BOS']
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper(): C = True

                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))

                history.append((t, C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t].inc('EOS')

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

        #(debugging -- ignore or delete me)
        #print "lambdas"
        #print i, self._l1, i, self._l2, i, self._l3

    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        '''

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].samples():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    #print "Problem", c1, c2 ,c3
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        '''

        current_state = [(['BOS', 'BOS'], 1.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t, C) = tags[i + 2]
            res.append((sent[i], t))

        return res

    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the probability
                                associated with each tag combination
        :type current_states  : [([tag, ],prob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        '''

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, p) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper(): C = True

        # if word is known
        # compute the set of possible tags
        # and their associated probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_prob) in current_states:
                probs = []

                for t in self._wd[word].samples():
                    p_uni = self._uni.freq((t, C))
                    p_bi = self._bi[history[-1]].freq((t, C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t, C))
                    p_wd = float(self._wd[word][t]) / float(self._uni[(t, C)])
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = p * p_wd

                    probs.append(((t, C), p2))

                # compute the result of appending each tag to this history
                for (tag, prob) in probs:
                    new_states.append((history + [tag], curr_sent_prob * prob))

        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag
            # can not be returned from most classifiers:
            # specify that any unknown words are tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk', C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, prob) in current_states:
                history.append(tag)

            new_states = current_states

        # now have computed a set of possible new_states

        # sort states by prob
        # set is now ordered greatest to least probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
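
For reference, this tagger ships with NLTK as nltk.tag.tnt.TnT; typical training and tagging (a sketch, assuming the treebank corpus data is installed) looks like:

from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:3000]
tnt_tagger = tnt.TnT(N=1000)
tnt_tagger.train(train_sents)
print(tnt_tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))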
Frequency Analysis: Letter, Word, Bigrams, Plots


# Input
# NLTK example: frequency analysis
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist

# get raw text from "Sense and Sensibility" by Jane Austen
raw = gutenberg.raw("austen-sense.txt")
fd_letters = FreqDist(raw)

words = gutenberg.words("austen-sense.txt")
fd_words = FreqDist(words)
sas = nltk.Text(words)

# these 2 lines let us size the freq dist plot
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 5)) 

# frequency plot for letters from SAS
fd_letters.plot(100)

# Output
# Shows the letter-frequency plot
Example #44
from nltk.tokenize import sent_tokenize

sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())

from nltk.corpus import stopwords
from string import punctuation

stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [
    palavra for palavra in palavras if palavra not in stopwords
]

from nltk.probability import FreqDist

frequencia = FreqDist(palavras_sem_stopwords)

from collections import defaultdict

sentencas_importantes = defaultdict(int)

for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra in frequencia:
            sentencas_importantes[i] += frequencia[palavra]

from heapq import nlargest

idx_sentencas_importantes = nlargest(4, sentencas_importantes,
                                     sentencas_importantes.get)
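
The final step, turning the selected indices back into a summary, would follow along these lines (sketch; resumo is an illustrative name):

# reassemble the top-scoring sentences in their original order
resumo = ' '.join(sentencas[i] for i in sorted(idx_sentencas_importantes))
print(resumo)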
Example #45
#getting main text file
file = open('sampledata.txt', 'r')
filetext = file.read()
#Cleaning the text from the unwanted characters, other methods like regular expression are applicable but this one is easier
filetext = filetext.replace('<s>', '')
filetext = filetext.replace('</s>', '')
filetext = filetext.replace('-', '')
tokens = word_tokenize(filetext)
print(tokens)
#Getting the vocabulary
file = open('sampledata.vocab.txt', 'r')
filetext = file.read()
vocab = word_tokenize(filetext)
print(vocab)

fr = FreqDist(tokens)
print('X     |       P(X)')
print('___________________')
for s in fr.items():
    for d in vocab:
        if d == s[0]:
            print(d, '    |       ', (s[1] / len(tokens)).__round__(2))
UNK = 0
for d in vocab:
    r = [item for item in fr if item[0] != d]
    isemp = not all(r)
    if isemp == True:
        UNK += (r[1] / len(tokens)).__round__(2)
print('UNK   |       ', UNK)
print('== UNIGRAMS AFTER LAPLACE SMOOTHING ==')
lpt = LaplaceProbDist(fr)
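
LaplaceProbDist then exposes the smoothed probability of each sample; printing them (a sketch) mirrors the unsmoothed table above:

print('X     |       P_laplace(X)')
print('___________________')
for sample in lpt.samples():
    print(sample, '    |       ', round(lpt.prob(sample), 2))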
Example #46
def build_train_data():
    global train_word_id
    global train_data_single
    global train_info
    global train_tags
    global stop_words
    train_word_id = []
    train_data_single = {}
    train_info = {}
    train_tags = ['NULL']
    stop_words = []

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies within each class (tag)

    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#': continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))

    with open(WHITE_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#': continue
            white_words.append(line)
    print("WHITE WORD SIZE:%d\n" % (len(white_words)))

    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            if filename[-6:] != '_p.txt': continue
            tag_name = filename[:-6]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            train_info[tag_id] = []
            line_num = 0
            with open(DATA_DIR + '/' + filename, 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000: print('LINE:%d' % (line_num))
                    line = line.strip()
                    objs = []
                    for item in line.split():
                        if len(item) == 1 and item not in white_words:
                            continue
                        item_id = term_to_id(item)
                        if item_id not in objs:
                            word_fd[item_id] += 1
                            cond_word_fd[tag_id][item_id] += 1
                            objs.append(item_id)
                    train_info[tag_id].append(objs)

    print('Randomize>>>')
    cond_word_sum = {}
    for tag in train_tags[1:]:
        tag_id = train_tags.index(tag)
        shuffle(train_info[tag_id])
        cond_word_sum[tag_id] = cond_word_fd[tag_id].N()
        print("SUM:%s->%d" % (tag, cond_word_sum[tag_id]))
    total_w_count = word_fd.N()
    print("TOTAL:%d" % (total_w_count))

    global sorted_word_scores
    sorted_word_scores = {}
    word_scores = {}

    word_scores_sub = {}
    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        word_scores[word] = 0
        for tag in train_tags[1:]:
            tag_id = train_tags.index(tag)
            word_scores[word] += \
            BigramAssocMeasures.chi_sq(cond_word_fd[tag_id][word], (freq, cond_word_sum[tag_id]), total_w_count)
    sorted_word_scores = sorted(word_scores.items(),
                                key=lambda e: e[1],
                                reverse=True)

    del cond_word_sum
    del word_fd
    del cond_word_fd

    return
Example #47
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""
tokenized_word=word_tokenize(text)
# print(tokenized_word)
freqdist = FreqDist(tokenized_word)
stop_w = set(stopwords.words("english"))
filtered_sent = []
for w in tokenized_word:
    if w not in stop_w:
        filtered_sent.append(w)
# print(filtered_sent)


ps = PorterStemmer()
lem = nltk.WordNetLemmatizer()
stemmed_words = []
# for w in filtered_sent:
#     stemmed_words.append(ps.stem(w))
# lemmatized_words = []
# for w in filtered_sent:
#     lemmatized_words.append(lem.lemmatize(w,"v"))

nltk.pos_tag(tokenized_word)

data = pd.read_csv("train.tsv", sep='\t')
Example #48
    # Get rid of the stop words
    stopwords = stopwords.words("english")

    clean_words = []

    for w in words_no_punctuation:
        if w not in stopwords:
            clean_words.append(w)

    # Display the Frequency option
    st.subheader("II - Frequency Distribution")
    frequencyChk = st.checkbox("Click here to generate frequency distribution")

    # Display frequency distribution graph
    if frequencyChk:
        fdist = FreqDist(clean_words)

        most_common_words = fdist.most_common(10)
        top_words = []
        word_count = []

        for j in most_common_words:
            top_words.append(j[0])
            word_count.append(j[1])

        plt.figure(figsize=(10, 6))
        plt.plot(top_words, word_count, color='green', linewidth=2, marker='d')
        plt.xlabel('Words')
        plt.ylabel('Count')
        plt.grid()
        st.pyplot()
#Split by space
train_text = train_text.strip().split(" ")

#identify the tag and insert a space before it
#then split by space to get a 2-d list of [word, tag]
train_text2=[]
for row in train_text:
    row=re.sub(r'(\/[A-Z]*(\|?)[^\d|^/]+)$',r' \1',row)
    train_text2.append(row.strip().split(" "))

#dataframe for easy interpretation
cols = ["words","tags"]
df = pd.DataFrame(data=train_text2, columns=cols)

#Frequency of tags (tn)
fdist_tags = FreqDist(df["tags"])

#Frequency of each (word, tag) pair (word_n, t_n)
word_tag=df.groupby(["words","tags"]).size()

#Frequency of tag with its previous tag (tn-1, tn)
two_tags =[]
for x in range(len(df)-1):
    two_tags.append(str(df["tags"][x])+" "+str(df["tags"][x+1]))

fdist_two_tags = FreqDist(two_tags)

#testing
#Tokenize test data
test_tokens = test_text.strip().split(" ")
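# The three tables above (tag unigrams, (word, tag) pair counts, tag bigrams) are the
# sufficient statistics for a bigram HMM tagger. A minimal sketch (illustrative helpers,
# not part of the original script) turns them into maximum-likelihood transition and
# emission probabilities:

# flatten the pandas Series of (word, tag) counts into a plain dict for fast lookup
word_tag_counts = word_tag.to_dict()

def transition_prob(prev_tag, tag):
    """Estimate P(tag | prev_tag) from the tag-bigram counts."""
    count_prev = fdist_tags[prev_tag]
    return fdist_two_tags[prev_tag + " " + tag] / count_prev if count_prev else 0.0

def emission_prob(word, tag):
    """Estimate P(word | tag) from the (word, tag) pair counts."""
    count_tag = fdist_tags[tag]
    return word_tag_counts.get((word, tag), 0) / count_tag if count_tag else 0.0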
def thongKeTop(text):
    fdist2 = FreqDist(text)  # count the most frequent words
    dictOj = dict(fdist2)
    return dictOj
'''

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from nltk.probability import FreqDist

#text from Harper Lee, To Kill a Mockingbird
text = '''
Atticus said to Jem one day, “I’d rather you shot at tin cans in the backyard, but I know you’ll go after birds. 
Shoot all the blue jays you want, if you can hit ‘em, but remember it’s a sin to kill a mockingbird.” 
That was the only time I ever heard Atticus say it was a sin to do something, and I asked Miss Maudie about it. 
“Your father’s right,” she said. 
“Mockingbirds don’t do one thing except make music for us to enjoy. 
They don’t eat up people’s gardens, don’t nest in corn cribs, they don’t do one thing but sing their hearts out for us. 
That’s why it’s a sin to kill a mockingbird.
'''

text_tags = nltk.pos_tag(word_tokenize(text))
print(text_tags)
frequent = FreqDist(tag for (word, tag) in text_tags)

import collections
word_counts = collections.Counter(words[0] for words in text_tags
                                  if len(words[0]) > 1)   # keep tokens longer than one character (drops punctuation)

print('============================================================================================')
print(f'The five most frequent words are: {word_counts.most_common(5)}')    # five most frequent words
print('============================================================================================')
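# The FreqDist over tags ('frequent') is built above but never displayed; if you also
# want to see the tag distribution, FreqDist.most_common works the same way:
print(f'The five most frequent POS tags are: {frequent.most_common(5)}')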

Exemple #52
0
    def find_frequent_words(self, all_words):
        freqdist = FreqDist(word.lower() for word in all_words
                            if word.lower() in self._valid_words)
        return freqdist
tagged_corpus = load_corpus_reader(args.corpus,
                                   reader=args.reader,
                                   fileids=args.fileids)

if not tagged_corpus:
    raise ValueError('%s is an unknown corpus' % args.corpus)

if args.trace:
    print('loading %s' % args.corpus)

##############
## counting ##
##############

wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
    kwargs = {'simplify_tags': True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
        tag = simplify_wsj_tag(tag)
Exemple #54
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 08:21:05 2016

@author: megan
"""

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

fname = 'actbacFB.txt'
frequency = 50

with open(fname, 'r', encoding="utf-8") as f:
    data = f.read().replace('\n', '')

# get list of most frequent words
words = word_tokenize(data)
stop_words = set(stopwords.words())
lowercase_words = [word.lower() for word in words
                   if word.lower() not in stop_words and word.isalpha()]
word_frequencies = FreqDist(lowercase_words)
most_frequent_words = word_frequencies.most_common(frequency)

# print out the keywords more nicely
for pair in most_frequent_words:
    print(pair[0],":",pair[1])
    
Exemple #55
0
paper_txt = lemmatizer.lemmatize(paper_txt)

tokens = word_tokenize(paper_txt)

# remove stop-word tokens
stop_words = set(stopwords.words('english'))

# lowercase the tokens so stop-word matching is case-insensitive
tokens = [token.lower() for token in tokens]

clean_tokens = tokens[:]
for token in tokens:
    if token in stop_words:
        clean_tokens.remove(token)

fdist = FreqDist(word.lower() for word in clean_tokens)

print(fdist.most_common(20))
"""
for key, val in fdist.items():
    print(str(key) + ':' + str(val))
"""
# fdist.plot(20, cumulative=False)
"""
nlu = NaturalLanguageUnderstandingV1(
    username='******',
    password='******',
    version='2018-03-16'
)

response = nlu.analyze(
allWords = []
for wordList in tokens:
    allWords += wordList

# Remove All Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
allWordsFinal = []
for w in allWords:
    if w not in stop_words:
        allWordsFinal.append(w)

# Run frequency distribution of words and plot on a graph.
from nltk.probability import FreqDist
fdist = FreqDist(allWordsFinal)

import matplotlib.pyplot as plt
fdist.plot(80)
plt.show()
#last_75 = FreqDist(dict(fdist.most_common()[-480:]))
#last_75.plot()

# Create and generate a word cloud image of most frequent words.
from wordcloud import WordCloud
words = (" ").join(allWordsFinal)
wc = WordCloud(width=1600,
               height=800,
               background_color="white",
               max_words=200,
               contour_width=3).generate(words)
Exemple #57
0
def create_frequency_dist(words):
    fdist = FreqDist(word.lower() for word in words)
    return fdist
# note: str.translate returns a new string; as written this call discards its result
txt.translate({ord(c): None for c in string.whitespace})

txt = txt.replace("gays", "gay").replace("lesbians", "lesbian").replace("seattles", "seattle").replace("citys", "city")
print(txt)

stopwords = set(STOPWORDS)
commonwords = {"time", "one", "began", "among", "another", "see", "part", "many", "day", "day", "way", "times",
               "still", "news", "three", "came", "became", "made", "wanted", "seemed", "made", "now", "society",
               "ing", "time", "first", "new", "called", "said", "come", "two", "city", "group", "state", "year",
               "case", "member", "even", "later", "month", "years", "much", "week", "county", "name", "example"
               "well", "members", "us", "say", "s"}
stopwords.update(commonwords)

# tokenize and calculate the word frequencies
tokens = nltk.tokenize.word_tokenize(txt)
fDist = FreqDist(tokens)
print(fDist.most_common(20))

# remove the stop words and common words
filtered_fDist = nltk.FreqDist(dict((word, freq) for word, freq in fDist.items() if word not in stopwords))

print(filtered_fDist)
filtered_fDist.plot(20)


print("generating wordcloud...")
mask_array = npy.array(Image.open("img/cloud.jpg"))
wc = WordCloud(font_path='arial', background_color="white", max_words=50, prefer_horizontal=1, mask=mask_array, scale=3, stopwords=stopwords, collocations=False)
wc.generate_from_frequencies(filtered_fDist)
# wc.generate(txt)
wc.to_file(wcPath)
Exemple #59
0
            new2.append(word)

text = ""

for i in range(len(new2)):
    text += new2[i] + " "

text = text.lower()  # lowercase everything

# strip punctuation characters
for ch in [".", ",", "!", "-", "_", "?", "[", "]", "'", ";", "''", ":", "``"]:
    text = text.replace(ch, "")

text_tokens = word_tokenize(text)

text = nltk.Text(text_tokens)

fdist = FreqDist(text)

print(fdist.most_common(10))
Exemple #60
0
stopwords = stopwords.words('english')


def remove_stopwords_and_punctuation(words):
    return [w for w in words if w.isalpha() and w not in stopwords]


#Helper function. Given a list of reviews, return a list of all the words in those reviews
#To understand this look at the description of functools.reduce in https://docs.python.org/3/library/functools.html
def get_all_words(amazon_reviews):
    return reduce(lambda words, review: words + review.words(), amazon_reviews,
                  [])


#A frequency distribution over all words in positive book reviews
pos_freqdist = FreqDist(
    remove_stopwords_and_punctuation(get_all_words(pos_train)))
neg_freqdist = FreqDist(
    remove_stopwords_and_punctuation(get_all_words(neg_train)))


def most_frequent_words(freqdist, k):
    return [word for word, count in freqdist.most_common(k)]


def words_above_threshold(freqdist, k):
    return [word for word in freqdist if freqdist[word] > k]


top_pos = most_frequent_words(pos_freqdist, 100)
top_neg = most_frequent_words(neg_freqdist, 100)
above_pos = words_above_threshold(pos_freqdist, 100)
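# The word lists above are typically turned into boolean feature dictionaries for an
# NLTK classifier. A minimal sketch under that assumption (feature_words and
# document_features are illustrative names, not part of the original):
feature_words = set(top_pos) | set(top_neg)

def document_features(review_words):
    """Map a review's words onto contains(word) boolean features."""
    words = set(review_words)
    return {'contains(%s)' % w: (w in words) for w in feature_words}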