Example #1
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
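A minimal sketch (not from the original) of how the returned word_scores dict is usually consumed: sort by informativeness and keep the top-N entries as the feature vocabulary. The helper name find_best_words and the cutoff are illustrative.

def find_best_words(word_scores, number):
    # Sort words/bigrams by score (descending) and keep the top `number` of them.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, _ in best)

# e.g. best_features = find_best_words(scores, 1500)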
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords, n = 5000):
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word]+=1
        conditionwordfd['pos'][word]+=1
        
    for word in negWords:
        wordfd[word]+=1
        conditionwordfd['neg'][word]+=1
    
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','rb'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    Use bigram collocations to score word informativeness.
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract words from positive and negative sentences.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only bigrams in which both words are longer than 3 characters
    # and neither word is in the stopword list.
    bigrams_best = [bigram for bigram in bigrams_all
            if len(bigram[0]) > 3 and len(bigram[1]) > 3
            and bigram[0] not in stopwords and bigram[1] not in stopwords]


    return bigrams_best
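A usage sketch (the toy sentences are made up; word_tokenize and string are assumed to be imported as in the snippet above): sents_tagged is a list of (label, sentence) pairs, 1 for positive and -1 for negative.

from nltk.corpus import stopwords

sample = [(1, "The food was absolutely wonderful and the staff were friendly."),
          (-1, "The service was painfully slow and the room smelled terrible.")]
print(best_bigrams(sample, stopwords.words('english'), n=20))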
Example #8
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #11
def create_word_bigram_scores():
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
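get_scores is not shown in this snippet; judging from the other examples in this collection, it presumably computes the chi-square informativeness of each word and bigram, roughly along these lines (an assumed reconstruction, using the same FreqDist/ConditionalFreqDist imports as the surrounding examples):

def get_scores(pos, neg):
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores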
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    
    objWords = list(itertools.chain(*objdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)


    pos = posWords + posBigrams
    neg = negWords + negBigrams
    
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
       
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores
Example #13
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
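This is essentially the body of nltk.text.Text.collocations; a minimal usage sketch (assumes the gutenberg corpus and stopwords data have been downloaded):

import nltk
from nltk.corpus import gutenberg

nltk.Text(gutenberg.words('austen-emma.txt')).collocations(num=10)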
 def get_bigram(self, features_list):
     # Select the top-scoring bigrams (up to self.bigram_threshold) by chi-square
     score = BigramAssocMeasures.chi_sq
     all_bigrams = BigramCollocationFinder.from_words(features_list)
     best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
     selected_bigrams = [(bigram, True) for bigram in best_bigrams]
     return selected_bigrams
Example #15
    def get_frequencies(self, desc):

        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        words = word_tokenize(desc)

        print('------gram--------')
        words_to_count = [word for word in words if word not in stopset]
        words_to_count = [word for word in words_to_count if not len(word) < 3]
        c = Counter(words_to_count)
        single = c.most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        matches = [single,bigrm,trigrm]
        return matches
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
        bigramslist = []
        bigram_finder = BigramCollocationFinder.from_words(tweet)
        bigrams = bigram_finder.nbest(score_fn, n)
        for bigram in bigrams:
            bigramslist.append(' '.join(str(i) for i in bigram))
        print(bigramslist)
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))

    return d
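best_word_features is not defined in this snippet; a common definition (an assumption here) keeps only words that appear in a precomputed set of the most informative words:

def best_word_features(words):
    # best_words is assumed to be a precomputed set of top-scoring words.
    return dict((word, True) for word in words if word in best_words)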
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder, \
                                          score_fn=BigramAssocMeasures.chi_sq):
  if stop:
    words = [w for w in words if w not in stopset]
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigrams = bigram_finder.nbest(score_fn, limit)
  return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn = BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
Example #20
def get_bag_of_bigrams_words(
        word_list,
        score_fn=BigramAssocMeasures.chi_sq,
        n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
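get_bag_of_words is not defined in this snippet; a common definition (assumed here) marks every word and bigram tuple as a present boolean feature for an NLTK classifier:

def get_bag_of_words(words):
    # Map each token (or bigram tuple) to True so it can serve as a feature.
    return dict((word, True) for word in words)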
Example #21
 def get_collocations(self):
     ignored_words = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array,2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
     bigram_measures = BigramAssocMeasures()
     return finder.nbest(bigram_measures.likelihood_ratio,40)
Example #22
def bigram_words(words, score_fn = BigramAssocMeasures.chi_sq, n=200):

    bigram_finder = BigramCollocationFinder.from_words(words)
    
    bigrams = bigram_finder.nbest(score_fn, n)
    
    return bag_of_words(words + bigrams)
Example #23
 def converter(tokens):
     bigram_finder = BigramCollocationFinder.from_words(tokens)
     bigrams = bigram_finder.nbest(score_fn, n)
     return (
         {ngram: True for ngram in itertools.chain(tokens, bigrams)},
         label
     )
Example #24
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
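A usage sketch (toy texts and labels are made up; tokenize is assumed to behave like nltk.word_tokenize): the returned feature dicts plug directly into nltk.NaiveBayesClassifier.

import nltk

train = [(stopword_filtered_bigrams("great movie, really loved the plot"), 'pos'),
         (stopword_filtered_bigrams("boring film, far too long and predictable"), 'neg')]
classifier = nltk.NaiveBayesClassifier.train(train)
print(classifier.classify(stopword_filtered_bigrams("loved it")))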
Example #25
 def collaction_discovery(self):
     self.corpus = nltk.word_tokenize(self.corpus.lower())
     bigramm_finder = BigramCollocationFinder.from_words(self.corpus)
     filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
     bigramm_finder.apply_word_filter(filter_bigram)
     top_10_bigrams = bigramm_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
     return top_10_bigrams
 def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
         bigramslist = []
         bigram_finder = BigramCollocationFinder.from_words(tweet)
         bigrams = bigram_finder.nbest(score_fn, n)
         for bigram in bigrams:
             bigramslist.append(' '.join(str(i) for i in bigram))
         return bigramslist #This is list e.g. ['you dude', 'Hi How', 'How are', 'are you']
Example #27
def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size) 
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import f_measure, BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Example #29
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size = 20)
    top_general = bi_finder.nbest(bi.pmi,30)
    bi_finder.apply_ngram_filter(lambda w1,w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
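A usage sketch with made-up tokens: apply_ngram_filter drops every pair in which neither word equals the target term, so top_bi contains only collocations involving that term within the 20-token window.

tokens = ("users say ibuprofen eases joint pain quickly while ibuprofen "
          "rarely upsets the stomach according to these comments").split()
print(bigram(tokens, "ibuprofen"))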
 def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
     # TODO configuration value
     clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
     bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
     bigram_measures = BigramAssocMeasures()
     bigrams = bigram_finder.nbest(bigram_measures.pmi, n)
     return bigrams
Example #31
def create_features(X, user_data=None):
    res = []

    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()

        comment = parse_text(comment)

        comment = nltk.clean_html(comment)

        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'

            doc[i] = ps.stem(doc[i])

            doc[i] = wnl.lemmatize(doc[i])

            if doc[i] in bad_words:
                doc[i] = '_badword_'

            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)

        bigram = dict([(ngram, True)
                       for ngram in itertools.chain(doc, bigrams)])

        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)

        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
    #        feat['_user_last'] = user_info['LastWebAccess']

    # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        #        feat['_has_hate_word'] = has_hate_word
        #        feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        #        feat['_has_occult_word'] = has_occult_word
        #        feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)

        #        print feat
        res.append(feat)
    return res
Example #32
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # use chi-square to select the top n bigrams
    newBigrams = [u + v for (u, v) in bigrams]
    return newBigrams
Example #33
 def take_bigram(self, text, stop_words):
     finder = BigramCollocationFinder.from_words(text)
     return finder.nbest(BigramAssocMeasures.likelihood_ratio, 15)
Example #34
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Find the best n bigrams of a text by means of a given measure.'''
    words = tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(bigrams)
Example #36
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # build bigram collocations from the token list
    bigrams = bigram_finder.nbest(score_fn, n)  # use chi-square to select the top n (here 1000) bigrams
    return bag_of_words(bigrams)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
Example #38
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
text = 'Mary had a little lamb. Her fleece was white as snow. Lamb little '
sents = sent_tokenize(text)
#print(sents)
#words = word_tokenize(text)
words = [word_tokenize(t) for t in sents]
#print(words)
customstopwords = set(stopwords.words('english') + list(punctuation))
#print(customstopwords)
wordsstop = [
    word for word in word_tokenize(text) if word not in customstopwords
]
#print(wordsstop )
bm = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsstop)  # can do trigrams too.
#print(sorted(finder.ngram_fd.items()))

text2 = 'Mary closed closer in close'
st = LancasterStemmer()  # reduces to root form.
stemw = [st.stem(i) for i in word_tokenize(text2)]
#print(set(stemw))
#print(nltk.pos_tag(word_tokenize(text2))) # part of speech tagging

for ss in wordnet.synsets('bass'):
    pass  #print(ss,ss.definition())

sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"),
              'bass')
#print(sense1,sense1.definition())
Example #39
TEXT_DIR = "./_TEXT"
READMES = sorted(
    [f for f in listdir_nohidden(TEXT_DIR) if isfile(join(TEXT_DIR, f))])

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

bi_dict = dict()

for README in READMES:
    readme_file_name = TEXT_DIR + "/" + README
    with open(readme_file_name, "r") as readme_file:
        readme_contents = onlyLetters(readme_file.read())
        words = readme_contents.split(" ")
        removeStopwords(words)
        bi_finder = BigramCollocationFinder.from_words(words)
        bi_collocations = bi_finder.nbest(bigram_measures.likelihood_ratio, 10)

        for collocation in bi_collocations:
            if len(collocation[0]) + len(collocation[1]) > 1:
                incrementDict(" ".join(collocation), bi_dict)

if " " in bi_dict:
    bi_dict.pop(" ")

bi_dict_sorted = OrderedDict(
    sorted(bi_dict.items(), reverse=True, key=lambda kv: (kv[1], kv[0])))
bi_dict_json = json.dumps(take(1000, bi_dict_sorted))

with open("bigram_words.json", "w") as bigram_file:
    bigram_file.write(bi_dict_json)
Example #40
t_df = pd.DataFrame(t_array, columns = range(len(cleaned_tweets)), index = list_vocab)

sum_df = t_df.sum(axis = 1, skipna = True)
sum_df = pd.DataFrame(sum_df, columns = ['Frequency'])
sum_df = sum_df.sort_values(by = 'Frequency', ascending = False)
print(sum_df.head(50))
print(sum_df.sum())

#-----------------------------#

cvec = CountVectorizer(analyzer=lambda x:x.split(','))
c_feat = cvec.fit_transform(split_words_j)
# vocabs = [w for w in cvec.vocabulary_.keys()]

flattened_split_words = [y for x in split_words for y in x]
biagram_collocation = BigramCollocationFinder.from_words(flattened_split_words) 
th_stop = get_th_stop()
filter_stops = lambda w: len(w) < 3 or w in th_stop
biagram_collocation.apply_word_filter(filter_stops)

biagram = biagram_collocation.score_ngrams(BigramAssocMeasures.likelihood_ratio)

prefix_keys = collections.defaultdict(list)
for key, scores in biagram:
   prefix_keys[key[0]].append((key[1], scores))

for key in prefix_keys:
   prefix_keys[key].sort(key = lambda x: -x[1])

n_words = int(sys.argv[2])
def bigrams_words_features(words,
                           nbigrams,
                           measure=BigramAssocMeasures.chi_sq):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(measure, nbigrams)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Example #42
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)

    return bag_of_words(words + bigrams)  # use all words plus the (informative) bigrams together as features
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    newBigrams = [u + v for (u, v) in bigrams]
    return bag_of_words(newBigrams)
Example #44
        l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])
            
top_words_quadcounter(job_text)



special_chars = ['--','...','\n','•','®']
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
for char in special_chars:
    a = a.replace(char, ' ') #replace special char with a space
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
text = ' '.join(resultwords)
finder = BigramCollocationFinder.from_words(word_tokenize(text))
for k,v in finder.ngram_fd.items():
    print(k,v)


##deep copy. save a copy.




a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
a = a.replace('\n', ' ') #replace \n with a space
a = a.replace('•', ' ')
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    
def main(in_dir: Path,
         out_dir: Path,
         num_corpus_chunks: int,
         min_frequency: int,
         conserve_RAM: bool = False) -> None:
    Path.mkdir(out_dir, parents=True, exist_ok=True)
    preview = open(out_dir / f'vocab.txt', 'w')

    corpus: List[LabeledDoc] = []
    for part_index in tqdm(range(num_corpus_chunks), desc='Loading cache'):
        with open(in_dir / f'tokenized_{part_index}.pickle', 'rb') as in_file:
            corpus += pickle.load(in_file)

    # Lowercase, discard punctuations, replace numbers, deduplicate
    number = re.compile(r'\d')
    starts_with_letter = re.compile(r"^\w")
    select_punctuations = re.compile(r"[@#&:]|.com")
    norm_freq: Counter[str] = Counter()
    existed: Set[Tuple[str, ...]] = set()
    duplicates = 0
    for doc in tqdm(corpus, desc='Normalizing tokens'):
        for sent in doc.sentences:
            for token in sent.tokens:
                if not starts_with_letter.search(token):
                    continue
                if select_punctuations.search(token):
                    continue
                if number.search(token):
                    norm_token = '<NUM>'
                else:
                    norm_token = token.lower()
                sent.normalized_tokens.append(norm_token)
                norm_freq[norm_token] += 1
            if conserve_RAM:
                del sent.tokens
            # all_norm_tokens += sent.normalized_tokens
            hashable = tuple(sent.normalized_tokens)
            if hashable not in existed:
                existed.add(hashable)
            else:
                duplicates += 1

        doc.sentences = [  # Filter out duplicate sentences
            sent for sent in doc.sentences if tuple(sent.tokens) not in existed
        ]
    print(f'Number of duplicate sentences = {duplicates:,}')

    UNK_filtered_freq: Counter[str] = Counter()
    for key, val in norm_freq.items():
        if val >= min_frequency:
            UNK_filtered_freq[key] = val
        else:
            UNK_filtered_freq['<UNK>'] += val
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}')
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}',
          file=preview)

    all_norm_tokens: List[str] = [
        nt for doc in corpus for sent in doc.sentences
        for nt in sent.normalized_tokens
    ]

    special_tokens = {'<UNK>', '<NUM>', "n't", "n’t"}
    print('Finding bigrams...')
    bigram_finder = BigramCollocationFinder.from_words(all_norm_tokens)
    num_tokens = len(all_norm_tokens)
    bigram_finder.apply_freq_filter(min_frequency)
    stop_words = set(stopwords.words('english')).union(special_tokens)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)
    # bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().pmi)
    print(f'Number of filtered bigrams = {len(bigrams):,}')
    print(f'Number of filtered bigrams = {len(bigrams):,}', file=preview)
    with open(out_dir / 'bigrams.txt', 'w') as bigram_file:
        for bigram, relative_freq in bigrams:
            absolute_freq = relative_freq * num_tokens
            bigram_str = ' '.join(bigram)
            # bigram_file.write(f'{relative_freq:.4f}\t{bigram_str}\n')  # for PMI
            bigram_file.write(f'{absolute_freq:.0f}\t{bigram_str}\n')

    # print('Finding trigrams...')
    # trigram_finder = TrigramCollocationFinder.from_words(all_norm_tokens)
    # trigram_finder.apply_freq_filter(min_frequency)
    # trigram_finder.apply_word_filter(lambda word: word in stop_words)
    # # trigram_finder.apply_ngram_filter(
    # #     lambda w1, w2, w3: (w1 in stop_words) or (w3 in stop_words) or (w2 in special_tokens))
    # trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)
    # print(f'Number of filtered trigrams = {len(trigrams):,}')
    # print(f'Number of filtered trigrams = {len(trigrams):,}', file=preview)
    # with open(out_dir / 'trigrams.txt', 'w') as trigram_file:
    #     for trigram, relative_freq in trigrams:
    #         absolute_freq = relative_freq * num_tokens
    #         trigram_str = ' '.join(trigram)
    #         trigram_file.write(f'{absolute_freq:.0f}\t{trigram_str}\n')
    del all_norm_tokens

    # Multi-Word Expression tokenize to underscored
    underscorer = MWETokenizer([bi for bi, _ in bigrams
                                ])  # maybe add affordable care act
    # underscorer = MWETokenizer(
    #     [tri for tri, _ in trigrams] + [bi for bi, _ in bigrams])
    vocab: Counter[str] = Counter()
    for doc in tqdm(corpus, desc='Underscoring multi-phrase expressions'):
        for sent in doc.sentences:
            sent.underscored_tokens = underscorer.tokenize(
                sent.normalized_tokens)
            vocab.update(sent.underscored_tokens)
            if conserve_RAM:
                del sent.normalized_tokens
    print('Pickling...')
    with open(out_dir / 'MWE_underscored.pickle', 'wb') as out_file:
        pickle.dump(corpus, out_file)

    for key, val in vocab.most_common():
        if val >= min_frequency:
            print(f'{val:,}:\t{key}', file=preview)
    preview.close()
        print(len(sentiment_list))


trump20_sent = sentiment_ct(trump_speech, "Trump 2020 ")
biden20_sent = sentiment_ct(biden_speech, "Biden 2020 ")
pence20_sent = sentiment_ct(pence20_speech, "Trump 2020 ")
harris20_sent = sentiment_ct(harris20_speech, "Biden 2020 ")
trump16_sent = sentiment_ct(trump16_speech, "Trump 2016 ")
clinton16_sent = sentiment_ct(clinton16_speech, "Clinton 2016 ")



################################################################################################
# Bigrams
# 2020 POTUS
dnc_finder = BigramCollocationFinder.from_words(biden_tokens)
dnc_finder.nbest(BigramAssocMeasures.chi_sq, 30) # top 30 DNC bigrams
dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'b', "Top 30 bigrams in Biden's 2020 DNC Speech", "Frequency Score")
# plot network
visualize_bigram(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], .6, "Top 30 bigrams in Biden's 2020 DNC Speech") # democrat network


rnc_finder = BigramCollocationFinder.from_words(trump_tokens)
rnc_finder.nbest(BigramAssocMeasures.raw_freq, 30) # top 30 RNC bigrams
rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'r', "Top 30 bigrams in Trump's 2020 RNC Speech", "Frequency Score")
# plot network
visualize_bigram(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 0.6, "Top 30 bigrams in Trump's 2020 RNC Speech") # republican network
Example #47
def extract_bigram(words):
    finder = BigramCollocationFinder.from_words(words)
    return finder.nbest(bigram_measures.pmi, 5)
plt.figure(figsize = (50,25))
plt.imshow(bigram_wordcloud,interpolation = 'bilinear')
plt.axis("off")
plt.show()






# =============================================================================
#

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
finder=BigramCollocationFinder.from_words(email_wc_na)
 
a=finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(a)
# =============================================================================
# pos tagging
# =============================================================================
import nltk
nltk.download('averaged_perceptron_tagger')
token=nltk.word_tokenize(email_wc_a)
a=list(nltk.pos_tag(token))

 
from nltk import pos_tag
from nltk import RegexpParser
 
Example #49
@author: issfz
"""
import string
from nltk.corpus import reuters
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures  #bigram associations
from nltk.corpus import stopwords

grain_tok = [reuters.words(f) for f in reuters.fileids('grain')
             ]  #return values are already tokenised
trade_tok = [reuters.words(f) for f in reuters.fileids('trade')]

words = [w.lower() for f in grain_tok
         for w in f]  #lower case to prevent case sensitivity
bcf = BigramCollocationFinder.from_words(words)  # build the collocation finder from the word list
top100 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 100)  # top 100 candidates ranked by likelihood ratio
top = [(t1, t2) for (t1, t2) in top100
       if t1 not in string.punctuation and t2 not in string.punctuation]

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset  # filter out stopwords and very short words
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)
bcf.nbest(BigramAssocMeasures.chi_sq, 10)
bcf.nbest(BigramAssocMeasures.jaccard, 10)
bcf.nbest(BigramAssocMeasures.mi_like, 10)
Example #50
    stopwordsList.append('x')
    stopwordsList.append('z')
    stopwordsList.append('Pp')
    stopwordsList.append('Pq')

    return stopwordsList


stopwords = prepareStopWords()

# fdist = FreqDist(text)
# fdist_no_punc_no_stopwords = nltk.FreqDist(dict((word, freq) for word, freq in fdist.items() if word not in stopwords and word.isalpha()))

# bigrams
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# Word cloud for the most frequent bigrams
stopWords = stopwords
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]

text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
text_content = [WNL.lemmatize(t) for t in text_content]
finder = BigramCollocationFinder.from_words(text_content)
Example #51
data_token = pd.read_csv(processed_path + "processed+tokenized.csv")
data_token['message'] = data_token['message'].apply(eval)

#########################Entire Dictionary#####################################
flat_list = []
for sublist in data_token['message']:
    for item in sublist:
        flat_list.append(item)
######################finds top bigrams and trigrams###########################

bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

trigramfinder = TrigramCollocationFinder.from_words(flat_list)
bigramfinder = BigramCollocationFinder.from_words(flat_list)

bigram_freq = bigramfinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq),
                               columns=['bigram',
                                        'freq']).sort_values(by='freq',
                                                             ascending=False)

trigram_freq = trigramfinder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq),
                                columns=['trigram',
                                         'freq']).sort_values(by='freq',
                                                              ascending=False)

bigramFreqTable.to_csv(raw_path + "bigramFreqTable.csv")
trigramFreqTable.to_csv(raw_path + "trigramFreqTable.csv")
Example #52
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords

textWord = [w.lower() for w in webtext.words('pirates.txt')]
finder = BigramCollocationFinder.from_words(textWord)
#print(finder.nbest(BigramAssocMeasures.likelihood_ratio,10))
ignored_word = set(stopwords.words('english'))
print(ignored_word)
filterStpos = lambda w: len(w) < 3 or w in ignored_word

finder.apply_word_filter(filterStpos)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
        num = len(dic.positions(word)) + 1
    except:
        return 1
    return num


# Instantiate dictionary used to count syllables
dic = pyphen.Pyphen(lang='en')

# Instantiate corpus reader for word selection
ignoredWords = set(stopwords.words("english"))
filterStops = lambda w: len(w) < 3 or w in ignoredWords

# Load the brown corpus, get the collocations for each word and scores based on the likelihood of occurrence
bigramMeasures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams(bigramMeasures.likelihood_ratio)

# Create dictionary of lists to associate keys with all their bigram pairs (word, likelihood ratio)
dictList = collections.defaultdict(list)
for key, score in scored:
    dictList[key[0]].append((key[1], score))

# Get words from picture and assess for suitability
first = choice(tags)
tags.remove(first)
second = choice(tags)
tags.remove(second)
third = choice(tags)

# Create lists to hold words, syllables and max syllables for each line
Example #54
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
Example #55
def bigram_feats(text, score_fn=BigramAssocMeasures.pmi, n_best=200):
    bigram_finder = BigramCollocationFinder.from_words(text)
    n_grams = bigram_finder.nbest(score_fn, n_best)
    return dict([(n_gram, True) for n_gram in n_grams])
Example #56
 def findBigrams(self, tweet):
     words = [w for w in tweet]
     bigrams = BigramCollocationFinder.from_words(words)
     return bigrams.nbest(BigramAssocMeasures.likelihood_ratio, 20)
import nltk
from nltk.collocations import BigramCollocationFinder
from utils import tokenize_transcripts, get_files

# a list of tokens for each of the talks
transcript_tokens = tokenize_transcripts(stem=True)

# built in bigram metrics are in here
bigram_measures = nltk.collocations.BigramAssocMeasures()

# compute top bigrams and output results to console
for i, file in enumerate(get_files()):
    finder = BigramCollocationFinder.from_words(transcript_tokens[i])
    bigrams = finder.score_ngrams(bigram_measures.likelihood_ratio)

    print(file)
    for [tokens, value] in bigrams[0:50]:
        print('{},{}'.format(" ".join(tokens), value))
    print('---------\n')
Example #58
wordcloud = WordCloud(max_font_size=40).generate(word_cloud_fr)

import matplotlib.pyplot as plt

plt.imshow(wordcloud, interpolation='bilinear')

plt.axis("off")
plt.savefig('word_cloud_fr.png')
#plt.show()

#English bigrams

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

bcf = BigramCollocationFinder.from_words(words_en)
from nltk.corpus import stopwords

stopset = sw
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
bcf_list = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
bcf_joint_list = []
for words in bcf_list:
    bcf_joint_list.append(' '.join(words))

#save list in txt file
with open("bigrams_en.txt", "w") as output:
    output.write(str(bcf_joint_list))

#English trigrams
Example #59
#Continuing to work from NLP for hackers.
import nltk
from nltk import word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

#Load the text into a variable.
with open('messages_only.txt', 'r', encoding="utf-8") as myfile:
    text = myfile.read()

#tokenize the text
tokens = word_tokenize(text)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

#compute length-2 collocations
finder = BigramCollocationFinder.from_words(tokens)

finder.apply_freq_filter(5)

print(finder.nbest(bigram_measures.pmi, 20))

finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)

#only trigrams that appear 5+ times
finder.apply_freq_filter(5)

#print the 20 trigrams with the highest PMI
print(finder.nbest(trigram_measures.pmi, 20))
Example #60
def main():
#     parser = ArgumentParser()
#     parser.add_argument("--folder", type=str, dest="folder")
#     args = parser.parse_args()
#     hotelname = args.folder

    # Load the tokenized reviews (sentences)
#    infile = args.folder+"/"+hotelname+"_NB_trainingdata.senttokens_sel.pyvar"
    infile = "NB_data/NB_trainingdata.senttokens.pyvar"
    infile = open(infile, 'r')
    print infile
    word_tokens_byreviewid = pickle.load(infile)
    infile.close()

    # Load the training reviewids
#    infile = args.folder+"/"+hotelname+"_NB_trainingdata.labels.pyvar"
    infile = "NB_data/NB_trainingdata.labels.pyvar"
    infile = open(infile, 'r')
    keepreviewids = pickle.load(infile)
    infile.close()

    word_tokens_byreviewid_expanded = {}
    negtags = []; postags = []
    for reviewid in word_tokens_byreviewid:
        sents = word_tokens_byreviewid[reviewid]
        for sent_idx in range(0, len(sents)):
            tag = (reviewid, str(sent_idx))
            word_tokens_byreviewid_expanded[tag] = sents[sent_idx]
            if reviewid in keepreviewids['1']:
                negtags.append(tag)
            if reviewid in keepreviewids['5']:
                postags.append(tag)
    print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags))

#     # Stem the words in the sentences
#     # Separate each sentence into a unique entry
#     word_tokens_byreviewid_expanded = {}
#     negtags = []; postags = []
#     tag_expanded = []
#     for reviewid in word_tokens_byreviewid:
#         sents = word_tokens_byreviewid[reviewid]
# 	for sent_idx in range(0, len(sents)):
# 	    tag = (reviewid, str(sent_idx))
#             tag_expanded.append(tag)
# #             print tag
# # 	    print sents[sent_idx]
#             word_tokens_byreviewid_expanded[tag] = sents[sent_idx]
            
# 	    if reviewid in keepreviewids[0]:
#                 negtags.append(tag)
# #                 print "negtag : "
# #                 print tag
# #                 print negtags
# 	    if reviewid in keepreviewids[1]:
# 		postags.append(tag)
#     print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags))

#     print negtags, len(negtags), len(set(negtags))
#     print
# #    print postags
#    print word_tokens_byreviewid_expanded
#     print tag_expanded
#     print word_tokens_byreviewid_expanded[tag_expanded]

#     all_words = []
#     # Get all words to analyze frequency of unigrams and bigrams
#     for t in tag_expanded : 
#         print tag
#         tag = t 
#         word = word_tokens_byreviewid_expanded[tag]

#         token=nltk.word_tokenize(word)
# #        print len(token)
    
#         for i  in range (0, len(token)): 
#             all_words.append(token[i])
# #    all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]]
# #    all_words =word_tokens_byreviewid_expanded[negtags] 
#     # Get all the stop words
#     stopwords = get_bad_words()
# #    print all_words

    # Get all words to analyze frequency of unigrams and bigrams                                                                                                                                                                         
    all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]]
    # Get all the stop words                                                                                                                                                                                                             
    stopwords = get_bad_words()

#    dispersion_plot(all_words,postags)
    # Trigrams
    trigram_finder = TrigramCollocationFinder.from_words(all_words)
    trigram_finder.apply_ngram_filter(lambda w1, w2, w3: w1 in stopwords or w3 in stopwords)
    trigram_finder.apply_freq_filter(10)
    trigrams = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 2000)
    print "Number trigrams: %d" %len(trigrams)
#    print trigrams[:100]

    # Bigrams
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_finder.apply_freq_filter(20)
    bigram_finder.apply_word_filter(lambda stopword: stopword in stopwords)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 2000)
    print "Number bigrams: %d" %len(bigrams)
#    print bigrams[:100]

    # Unigrams
    word_freq_dist = DataFrame(dict(FreqDist(all_words)).items(), columns = ['word','count'])
    word_freq_dist = word_freq_dist[word_freq_dist['count'] > 20]
#    print word_freq_dist
    good_features = list(set(word_freq_dist['word']) - stopwords)
    print "Number unigrams: %d" %len(good_features)
    good_features.extend(bigrams)
    good_features.extend(trigrams)
#    print good_features

    # Output the features in the model
#    outfile =  args.folder+"/"+ args.folder+"_NB_sentiment.model.features.pyvar"
    outfile =  "NB_data/NB_sentiment.model.features.pyvar"
    outfile = open(outfile, 'w')
    pickle.dump(good_features, outfile)
    outfile.close()
    

    # Calculate the features
    negfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'neg') 
                   for fid in negtags]
    posfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'pos') 
                   for fid in postags]
#    print negfeatures

#     # Shuffle and balance the two classes
#     n_min = min([len(negfeatures), len(posfeatures)])
#     random.shuffle(negfeatures)
#     negfeatures = negfeatures[:n_min]
#     random.shuffle(posfeatures)
#     posfeatures = posfeatures[:n_min]

#     # Define training and testing data
#     numfolds = 10
#     foldsize = n_min/numfolds
#     negfolds = make_folds(negfeatures, foldsize)
#     posfolds = make_folds(posfeatures, foldsize)

    negfolds = cross_validation.StratifiedKFold(negfeatures, n_folds=10)
    print negfolds
    posfolds = cross_validation.StratifiedKFold(posfeatures, n_folds=10)
    print posfolds

    # 10 fold cross validation
    outfile = "NB_data/NB_sentiment.model.performance.tab"
    outfile = open(outfile, 'w')
    outfile.write("Fold\taccuracy\tpos_precision\tpos_recall\tneg_precision\tneg_recall\n")
    for fold in range(0, numfolds):
	outfile.write("%d\t" %fold)
	testdata = negfolds[fold] + posfolds[fold]
	traindata = []
	for i in range(0, numfolds):
	    if i != fold:
		traindata += negfolds[i]
		traindata += posfolds[i]
    	print 'train on %d instances, test on %d instances' % (len(traindata), len(testdata))

        result = eval_classifier(traindata, testdata)
        accuracy, posprecision, posrecall, negprecision, negrecall = result
        print  result
        outfile.write("%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n"%(accuracy, posprecision, posrecall, negprecision, negrecall))
    outfile.close()

    # Save the classifier trained using all data
    classifier = NaiveBayesClassifier.train(negfeatures + posfeatures)
#    outfile = args.folder+"/"+ args.folder+"_NB_sentiment.model.pyvar" 
    outfile = "NB_data/NB_sentiment.model.pyvar" 
    outfile = open(outfile, 'w')
    pickle.dump(classifier, outfile)
    outfile.close()