def _get_bigram_scores(self, posdata, negdata):
    """Score every unigram and top bigram by chi-square informativeness.

    posdata/negdata are iterables of tokenized documents (lists of words).
    Returns a dict mapping each word or bigram tuple to the sum of its
    positive and negative chi-square scores (higher = more informative).
    """
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))
    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    # Candidate features: unigrams plus the 5000 best bigrams per class.
    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: FreqDist.iteritems() is Python 2 / old NLTK only; items()
    # works on both Python 2 and 3.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Chi-square information scores for all unigrams plus best bigrams.

    Loads segmented/filtered positive and negative corpora from fixed
    paths and returns {word_or_bigram: combined chi-square score}.
    """
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    bigram_finderr = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finderr.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3; use indexed increment.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only; items() works everywhere.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, n = 5000):
    """Score unigrams plus the n best bigrams per class by summed
    chi-square informativeness across both classes."""
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    finder = BigramCollocationFinder.from_words(posWords)
    posbigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    finder = BigramCollocationFinder.from_words(negWords)
    negbigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    # Count each candidate feature globally and per class.
    for label, feats in (('pos', posWords), ('neg', negWords)):
        for feat in feats:
            wordfd[feat] += 1
            conditionwordfd[label][feat] += 1
    pos_total = conditionwordfd['pos'].N()
    neg_total = conditionwordfd['neg'].N()
    grand_total = pos_total + neg_total
    for feat, freq in wordfd.items():
        wordscores[feat] = (
            BigramAssocMeasures.chi_sq(conditionwordfd['pos'][feat],
                                       (freq, pos_total), grand_total)
            + BigramAssocMeasures.chi_sq(conditionwordfd['neg'][feat],
                                         (freq, neg_total), grand_total)
        )
    return wordscores
def create_word_bigram_scores():
    """Chi-square scores for unigrams plus the 5000 best bigrams per
    class, loaded from pickled pre-segmented review corpora."""
    # BUG FIX: pickles must be opened in binary mode ('rb'); also use
    # context managers so the file handles are closed.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: both finders were bound to the same name, so the "positive"
    # bigrams were actually mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''Measure word informativeness using unigrams plus bigram collocations.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only; items() works on Python 3 too
    # (the rest of this function already uses Python 3 idioms).
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word],
                                 (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word],
                                 (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords):
    """Chi-square scores for stringified unigrams + top-2000 bigrams."""
    # BUG FIX: a single finder name was rebound, so the "positive" bigrams
    # were actually mined from the negative word list.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Features are keyed by str(word) so bigram tuples become strings.
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    """Collect informative bigrams from tagged sentences.

    sents_tagged is an iterable of (tag, sentence) pairs; tag 1 marks a
    positive sentence and -1 a negative one.  Returns the union of the
    n best bigrams from each class, keeping only bigrams whose words are
    both longer than 3 characters and not in the `ex` exclusion list.
    """
    sents_pos = []
    sents_neg = []
    # Single pass: split sentences by their tag.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    def _lowered_words(sentences):
        # Tokenize, drop punctuation tokens, lowercase the rest.
        return [tok.lower() for s in sentences for tok in word_tokenize(s)
                if tok not in string.punctuation]

    words_pos = _lowered_words(sents_pos)
    words_neg = _lowered_words(sents_neg)
    # Best collocations per class, then merged without duplicates.
    best_pos = BigramCollocationFinder.from_words(words_pos).nbest(score_fn, n)
    best_neg = BigramCollocationFinder.from_words(words_neg).nbest(score_fn, n)
    bigrams_all = list(set(best_pos).union(set(best_neg)))
    return [bg for bg in bigrams_all
            if len(bg[0]) > 3 and len(bg[1]) > 3
            and bg[0] not in ex and bg[1] not in ex]
def create_word_bigram_scores():
    """Chi-square information scores from Excel-loaded sentiment corpora."""
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3; use indexed increment.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1
    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word],
                                               (freq, pos_word_count),
                                               totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word],
                                               (freq, neg_word_count),
                                               totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    """Chi-square information scores for the top-8000 bigrams only
    (no unigrams), from Excel-loaded sentiment corpora."""
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    pos = posBigrams
    neg = negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Chi-square scores for unigrams plus the 5000 best bigrams per class."""
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score module-level posWords/negWords plus their best 5000 bigrams
    via get_scores()."""
    # BUG FIX: a single finder name was rebound, so the "positive" bigrams
    # were actually mined from the negative word list.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    """Three-way (pos/neg/obj) chi-square information scores for unigrams
    plus the 5000 best bigrams of each corpus."""
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))
    # BUG FIX: one finder name was rebound three times, so every nbest()
    # call mined bigrams from the objective corpus.  Use one finder each.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    obj_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    obj = objWords + objBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    # BUG FIX: this loop iterated objWords, silently skipping objBigrams
    # (the pos/neg loops iterate words + bigrams).
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word],
                                               (freq, obj_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    # Recompute only if the cached result was built with other parameters.
    cache_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_valid:
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(
            lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(
            bigram_measures.likelihood_ratio, num)
    colloc_strings = [' '.join(pair) for pair in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
def get_bigram(self, features_list):
    """Return the best bigrams of features_list, each paired with True.

    The number kept is controlled by self.bigram_threshold; ranking uses
    the chi-square association measure.
    """
    finder = BigramCollocationFinder.from_words(features_list)
    top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, self.bigram_threshold)
    return [(pair, True) for pair in top_bigrams]
def get_frequencies(self, desc):
    # Print and return the top unigrams, bigrams and trigrams of `desc`.
    # NOTE: Python 2 syntax (print statements) — do not run under Python 3.
    stopset = set(stopwords.words('english'))
    # Drop tokens shorter than 3 characters or in the stopword set.
    filter_stops = lambda w: len(w) < 3 or w in stopset
    words = word_tokenize(desc)
    print '------gram--------'
    words_to_count = [word for word in words if word not in stopset]
    words_to_count = [word for word in words_to_count if not len(word) < 3]
    c = Counter(words_to_count)
    single = c.most_common(20)
    print single
    print '------bigram--------'
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print bigrm
    print '------trigram--------'
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  #only keep those that appear more than 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print trigrm
    # Returns [top-20 unigrams with counts, top-15 bigrams, top-10 trigrams].
    matches = [single,bigrm,trigrm]
    return matches
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200): bigramslist = [] bigram_finder = BigramCollocationFinder.from_words(tweet) bigrams = bigram_finder.nbest(score_fn, n) for bigram in bigrams: bigramslist.append(' '.join(str(i) for i in bigram)) print bigramslist
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Feature dict of the n best bigrams merged with the best unigrams."""
    finder = BigramCollocationFinder.from_words(words)
    feats = {pair: True for pair in finder.nbest(score_fn, n)}
    feats.update(best_word_features(words))
    return feats
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder,
                         score_fn=BigramAssocMeasures.chi_sq):
    """True-valued unigram + bigram features; optionally drops stopwords
    from the words before mining bigrams."""
    if stop:
        words = [tok for tok in words if tok not in stopset]
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(score_fn, limit)
    return {feat: True for feat in itertools.chain(words, top_bigrams)}
def tweet_features(tweet): tweet_words = word_tokenize(tweet) bigram_finder = BigramCollocationFinder.from_words(tweet_words) score_fn=BigramAssocMeasures.chi_sq bigrams = bigram_finder.nbest(score_fn, 200) print bigrams return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def get_bag_of_bigrams_words(
        word_list, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Bag-of-words features over the words plus their n best bigrams."""
    top_bigrams = BigramCollocationFinder.from_words(word_list).nbest(score_fn, n)
    return get_bag_of_words(word_list + top_bigrams)
def get_collocations(self):
    """Top-40 likelihood-ratio collocations of self.text_array, skipping
    short words and English stopwords."""
    ignored_words = stopwords.words('english')
    finder = BigramCollocationFinder.from_words(self.text_array, 2)
    finder.apply_freq_filter(3)  # require at least 3 occurrences
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    measures = BigramAssocMeasures()
    return finder.nbest(measures.likelihood_ratio, 40)
def bigram_words(words, score_fn = BigramAssocMeasures.chi_sq, n=200):
    """Bag of words combined with the n highest-scoring bigrams."""
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(words + top_bigrams)
def converter(tokens, label=None, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build a (features, label) pair from a token list.

    Features map every token and each of the n best bigrams to True.
    BUG FIX: `score_fn`, `n` and `label` were free (undefined) names in
    the original, which raised NameError when called at module level;
    they are now defaulted parameters, so existing converter(tokens)
    calls keep working.
    NOTE(review): if this was originally a closure inside another
    function, wire the enclosing values through these parameters.
    """
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(score_fn, n)
    return (
        {ngram: True for ngram in itertools.chain(tokens, bigrams)},
        label
    )
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Drop English stopwords, then return a True-valued feature dict of
    the remaining words plus their n best bigrams.'''
    stopset = set(stopwords.words('english'))
    kept = [tok for tok in tokenize(text) if tok not in stopset]
    finder = BigramCollocationFinder.from_words(kept)
    best = finder.nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(kept, best)}
def collaction_discovery(self):
    """Return the 10 bigrams with the highest likelihood ratio.

    Side effect: self.corpus is replaced by its lower-cased token list.
    Bigrams containing short words or stopwords are filtered out.
    """
    self.corpus = nltk.word_tokenize(self.corpus.lower())
    finder = BigramCollocationFinder.from_words(self.corpus)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w in self.stopwords_)
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return the top-n bigrams of the tokenized tweet as space-joined
    strings, e.g. ['you dude', 'Hi How', 'How are', 'are you']."""
    finder = BigramCollocationFinder.from_words(tweet)
    return [' '.join(str(part) for part in pair)
            for pair in finder.nbest(score_fn, n)]
def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    # NOTE: Python 2 code (print statements).  Results are memoised on
    # self._collocations and rebuilt only when num/window_size change.
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)  # drop bigrams that occur only once
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import f_measure, BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
    print "List {0} collocations".format(num)
    print tokenwrap(colloc_strings, separator=u'; ')
def ShowCollocations():
    # Tkinter callback: tokenize the contents of the results box, drop
    # stopwords, and list the top bigram collocations in the text widget.
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures
    # Verbose regex: acronyms, hyphenated/apostrophe words, numbers,
    # ellipses, and assorted punctuation.
    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0,END)
    rawtext=nltk.regexp_tokenize(data, pattern)
    prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
    text.delete(1.0, END)
    text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter (3)
    # NOTE(review): despite the banner text above, this takes the 10
    # highest-PMI bigrams rather than applying a PMI >= 10 threshold —
    # confirm which behaviour is intended.
    bigrams=bigramfinder.nbest(bigram.pmi, 10)
    for item in bigrams:
        first = item[0]
        second = item[1]
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")
def bigram(ytcomments, drug):
    """Return the top-30 PMI bigrams (window size 20) that mention `drug`.

    ytcomments is a flat token sequence; the ngram filter keeps only
    pairs where one side equals the drug term.
    """
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size=20)
    # FIX: the original also computed an unfiltered nbest() list that was
    # never used; that dead computation has been removed.
    bi_finder.apply_ngram_filter(lambda w1, w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):  # TODO configuration value
    """Top-n bigrams of the cleaned text, ranked by score_fn (default PMI).

    rawText is cleaned at level 3 via TokensCleaner before mining.
    """
    clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
    bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
    # BUG FIX: score_fn was accepted but ignored — PMI was hard-coded,
    # so callers could never change the ranking measure.  Behaviour is
    # unchanged for the default argument.
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
def create_features(X, user_data=None):
    # Build one feature dict per (date, comment, user) triple in X.
    # NOTE(review): Python 2 / NLTK 2 era code — nltk.clean_html() was
    # removed in NLTK 3, filter() here is relied on to return a list, and
    # the `/` divisions assume the ratios end up as floats via the -1.0
    # fallback.  Indentation was reconstructed from a flattened source;
    # verify block boundaries against the original repository.
    res = []
    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        # Checked against the raw comment, before lowercasing/HTML strip.
        has_swastika = swastika in comment
        comment = comment.lower()
        comment = parse_text(comment)
        comment = nltk.clean_html(comment)
        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]
        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)
        # Normalise each token (bad-word masking, stemming, lemmatising)
        # and raise the per-category dictionary flags.
        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            doc[i] = ps.stem(doc[i])
            doc[i] = wnl.lemmatize(doc[i])
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1
        # Unigram + top-5 chi-square bigram presence features.
        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)
        bigram = dict([(ngram, True) for ngram in itertools.chain(doc, bigrams)])
        feat.update(bigram)
        # Ratio of alphabetic tokens not found in the English vocabulary.
        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0
        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0
        if user_data is not None:
            user_info = user_data[user]
        # for/else: has_bad_word stays True only if the loop breaks.
        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"
        # Readability metrics computed on the cleaned comment text.
        readability = ReadabilityTool(comment)
        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)
        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
            # feat['_user_last'] = user_info['LastWebAccess']
        # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        # feat['_has_hate_word'] = has_hate_word
        # feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        # feat['_has_occult_word'] = has_occult_word
        # feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)
        # print feat
        res.append(feat)
    return res
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    """Return the top-n bigrams of `words`, each pair concatenated into a
    single string (ranked by chi-square by default)."""
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    newBigrams = [u + v for (u, v) in bigrams]
    # BUG FIX: the list was built but never returned, so the function
    # always yielded None.
    return newBigrams
def take_bigram(self, text, stop_words):
    """Return the top-15 likelihood-ratio bigrams of `text`.

    `stop_words` is accepted for interface compatibility but not used.
    """
    finder = BigramCollocationFinder.from_words(text)
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, 15)
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Tokenize `text` and return a True-valued feature dict of the
    tokens plus their n best bigrams under the given measure.'''
    toks = tokenize(text)
    best = BigramCollocationFinder.from_words(toks).nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(toks, best)}
def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Bag-of-words features built from the n best bigrams of `words`."""
    best = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(best)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    """Pair the tokens into bigram collocations, keep the top n by the
    chi-square measure, and return them as a bag-of-words feature dict."""
    finder = BigramCollocationFinder.from_words(words)
    top_pairs = finder.nbest(score_fn, n)
    return bag_of_words(top_pairs)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Best-bigram features merged with the best single-word features."""
    finder = BigramCollocationFinder.from_words(words)
    feats = {pair: True for pair in finder.nbest(score_fn, n)}
    feats.update(best_word_feats(words))
    return feats
#nltk.download('averaged_perceptron_tagger') #nltk.download('wordnet') text = 'Mary had a little lamb. Her fleece was white as snow. Lamb little ' sents = sent_tokenize(text) #print(sents) #words = word_tokenize(text) words = [word_tokenize(t) for t in sents] #print(words) customstopwords = set(stopwords.words('english') + list(punctuation)) #print(customstopwords) wordsstop = [ word for word in word_tokenize(text) if word not in customstopwords ] #print(wordsstop ) bm = BigramAssocMeasures() finder = BigramCollocationFinder.from_words(wordsstop) # can do trigrams too. #print(sorted(finder.ngram_fd.items())) text2 = 'Mary closed closer in close' st = LancasterStemmer() # reduces to root form. stemw = [st.stem(i) for i in word_tokenize(text2)] #print(set(stemw)) #print(nltk.pos_tag(word_tokenize(text2))) # part of speech tagging for ss in wordnet.synsets('bass'): pass #print(ss,ss.definition()) sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), 'bass') #print(sense1,sense1.definition())
# Aggregate the 10 best likelihood-ratio bigrams of every README under
# TEXT_DIR into a count dict and dump the 1000 most common to JSON.
# NOTE: Python 2 only — the tuple-parameter lambda below is a SyntaxError
# on Python 3.
TEXT_DIR = "./_TEXT"
READMES = sorted(
    [f for f in listdir_nohidden(TEXT_DIR) if isfile(join(TEXT_DIR, f))])
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bi_dict = dict()
for README in READMES:
    readme_file_name = TEXT_DIR + "/" + README
    with open(readme_file_name, "r") as readme_file:
        readme_contents = onlyLetters(readme_file.read())
        words = readme_contents.split(" ")
        removeStopwords(words)
        bi_finder = BigramCollocationFinder.from_words(words)
        # Ten best bigrams per README, scored by likelihood ratio.
        bi_collocations = bi_finder.nbest(bigram_measures.likelihood_ratio, 10)
        for collocation in bi_collocations:
            if len(collocation[0]) + len(collocation[1]) > 1:
                incrementDict(" ".join(collocation), bi_dict)
if " " in bi_dict:
    bi_dict.pop(" ")
# Sort by count (descending), then by key; Python 2 tuple-unpack lambda.
bi_dict_sorted = OrderedDict(
    sorted(bi_dict.items(), reverse=True, key=lambda (k, v): (v, k)))
bi_dict_json = json.dumps(take(1000, bi_dict_sorted))
with open("bigram_words.json", "w") as bigram_file:
    bigram_file.write(bi_dict_json)
# Term-frequency summary of the tweet/term matrix, then bigram collocation
# scoring of the flattened token stream (Thai stopwords filtered out).
t_df = pd.DataFrame(t_array, columns = range(len(cleaned_tweets)), index = list_vocab)
sum_df = t_df.sum(axis = 1, skipna = True)
sum_df = pd.DataFrame(sum_df, columns = ['Frequency'])
sum_df = sum_df.sort_values(by = 'Frequency', ascending = False)
print(sum_df.head(50))
print(sum_df.sum())
#-----------------------------#
# Comma-separated tokens -> sparse count matrix.
cvec = CountVectorizer(analyzer=lambda x:x.split(','))
c_feat = cvec.fit_transform(split_words_j)
# vocabs = [w for w in cvec.vocabulary_.keys()]
flattened_split_words = [y for x in split_words for y in x]
biagram_collocation = BigramCollocationFinder.from_words(flattened_split_words)
th_stop = get_th_stop()
filter_stops = lambda w: len(w) < 3 or w in th_stop
biagram_collocation.apply_word_filter(filter_stops)
biagram = biagram_collocation.score_ngrams(BigramAssocMeasures.likelihood_ratio)
# Group scored bigrams by their first word, best continuation first.
prefix_keys = collections.defaultdict(list)
for key, scores in biagram:
    prefix_keys[key[0]].append((key[1], scores))
for key in prefix_keys:
    prefix_keys[key].sort(key = lambda x: -x[1])
n_words = int(sys.argv[2])  # number of words requested on the command line
def bigrams_words_features(words, nbigrams, measure=BigramAssocMeasures.chi_sq):
    """True-valued features for every word plus the nbigrams best bigrams."""
    best = BigramCollocationFinder.from_words(words).nbest(measure, nbigrams)
    return {ngram: True for ngram in itertools.chain(words, best)}
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    """All words together with the n most informative bigram collocations,
    returned as bag-of-words features."""
    top_pairs = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(words + top_pairs)
def bigram(words,score_fn=BigramAssocMeasures.chi_sq,n=1000):
    """Bag-of-words built from the n best bigrams, each pair concatenated
    into a single token."""
    finder = BigramCollocationFinder.from_words(words)
    joined = [first + second for (first, second) in finder.nbest(score_fn, n)]
    return bag_of_words(joined)
# NOTE(review): the first three statements are the tail of a function whose
# `def` line lies outside this chunk; the indentation below is reconstructed
# and must be re-anchored to the enclosing function when merged.
    l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])
top_words_quadcounter(job_text)
# Clean the job text: strip punctuation, lowercase, replace special chars,
# drop stopwords, then print raw bigram frequencies of what remains.
special_chars = ['--','...','\n','•','®']
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
for char in special_chars:
    a = a.replace(char, ' ') #replace special char with a space
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
text = ' '.join(resultwords)
finder = BigramCollocationFinder.from_words(word_tokenize(text))
for k,v in finder.ngram_fd.items():
    print(k,v)
##deep copy. save a copy.
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
a = a.replace('\n', ' ') #replace \n with a space
a = a.replace('•', ' ')
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
def main(in_dir: Path, out_dir: Path, num_corpus_chunks: int, min_frequency: int, conserve_RAM: bool = False) -> None:
    """Normalize a pickled tokenized corpus, mine frequent bigrams, and
    re-tokenize multi-word expressions with underscores.

    Reads `tokenized_{i}.pickle` chunks from in_dir; writes `vocab.txt`,
    `bigrams.txt`, and `MWE_underscored.pickle` to out_dir. Tokens occurring
    fewer than min_frequency times are folded into '<UNK>'. With
    conserve_RAM, raw/normalized token lists are deleted as soon as they are
    no longer needed.
    """
    Path.mkdir(out_dir, parents=True, exist_ok=True)
    preview = open(out_dir / f'vocab.txt', 'w')
    corpus: List[LabeledDoc] = []
    for part_index in tqdm(range(num_corpus_chunks), desc='Loading cache'):
        with open(in_dir / f'tokenized_{part_index}.pickle', 'rb') as in_file:
            corpus += pickle.load(in_file)

    # Lowercase, discard punctuations, replace numbers, deduplicate
    number = re.compile(r'\d')
    starts_with_letter = re.compile(r"^\w")
    # NOTE(review): '.com' here is a regex where '.' matches any char, so this
    # also rejects e.g. 'xcom' — confirm whether r'\.com' was intended.
    select_punctuations = re.compile(r"[@#&:]|.com")
    norm_freq: Counter[str] = Counter()
    existed: Set[Tuple[str, ...]] = set()
    duplicates = 0
    for doc in tqdm(corpus, desc='Normalizing tokens'):
        for sent in doc.sentences:
            for token in sent.tokens:
                # Skip tokens that don't start with a word character or that
                # contain selected punctuation; map digit-bearing tokens to <NUM>.
                if not starts_with_letter.search(token):
                    continue
                if select_punctuations.search(token):
                    continue
                if number.search(token):
                    norm_token = '<NUM>'
                else:
                    norm_token = token.lower()
                sent.normalized_tokens.append(norm_token)
                norm_freq[norm_token] += 1
            if conserve_RAM:
                del sent.tokens
            # all_norm_tokens += sent.normalized_tokens
            # Count exact duplicate sentences by their normalized-token tuple.
            hashable = tuple(sent.normalized_tokens)
            if hashable not in existed:
                existed.add(hashable)
            else:
                duplicates += 1
        # NOTE(review): this filter keys on raw sent.tokens while `existed`
        # holds normalized tuples, and every sentence's normalized tuple is in
        # `existed` by now — so it likely removes only sentences whose raw
        # tokens equal their normalized form; with conserve_RAM, sent.tokens
        # has been deleted and this would raise. Confirm intended behavior.
        doc.sentences = [  # Filter out duplicate sentences
            sent for sent in doc.sentences
            if tuple(sent.tokens) not in existed
        ]
    print(f'Number of duplicate sentences = {duplicates:,}')

    # Fold rare unigrams into a single <UNK> bucket.
    UNK_filtered_freq: Counter[str] = Counter()
    for key, val in norm_freq.items():
        if val >= min_frequency:
            UNK_filtered_freq[key] = val
        else:
            UNK_filtered_freq['<UNK>'] += val
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}')
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}', file=preview)

    # One flat token stream over the (deduplicated) corpus for the finder.
    all_norm_tokens: List[str] = [
        nt for doc in corpus for sent in doc.sentences
        for nt in sent.normalized_tokens
    ]
    special_tokens = {'<UNK>', '<NUM>', "n't", "n’t"}
    print('Finding bigrams...')
    bigram_finder = BigramCollocationFinder.from_words(all_norm_tokens)
    num_tokens = len(all_norm_tokens)
    bigram_finder.apply_freq_filter(min_frequency)
    stop_words = set(stopwords.words('english')).union(special_tokens)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    # Ranked by raw relative frequency; PMI alternative kept for reference.
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)
    # bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().pmi)
    print(f'Number of filtered bigrams = {len(bigrams):,}')
    print(f'Number of filtered bigrams = {len(bigrams):,}', file=preview)
    with open(out_dir / 'bigrams.txt', 'w') as bigram_file:
        for bigram, relative_freq in bigrams:
            # raw_freq scores are relative; scale back to absolute counts.
            absolute_freq = relative_freq * num_tokens
            bigram_str = ' '.join(bigram)
            # bigram_file.write(f'{relative_freq:.4f}\t{bigram_str}\n')  # for PMI
            bigram_file.write(f'{absolute_freq:.0f}\t{bigram_str}\n')

    # print('Finding trigrams...')
    # trigram_finder = TrigramCollocationFinder.from_words(all_norm_tokens)
    # trigram_finder.apply_freq_filter(min_frequency)
    # trigram_finder.apply_word_filter(lambda word: word in stop_words)
    # # trigram_finder.apply_ngram_filter(
    # #     lambda w1, w2, w3: (w1 in stop_words) or (w3 in stop_words) or (w2 in special_tokens))
    # trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)
    # print(f'Number of filtered trigrams = {len(trigrams):,}')
    # print(f'Number of filtered trigrams = {len(trigrams):,}', file=preview)
    # with open(out_dir / 'trigrams.txt', 'w') as trigram_file:
    #     for trigram, relative_freq in trigrams:
    #         absolute_freq = relative_freq * num_tokens
    #         trigram_str = ' '.join(trigram)
    #         trigram_file.write(f'{absolute_freq:.0f}\t{trigram_str}\n')
    del all_norm_tokens

    # Multi-Word Expression tokenize to underscored
    underscorer = MWETokenizer([bi for bi, _ in bigrams
                                ])  # maybe add affordable care act
    # underscorer = MWETokenizer(
    #     [tri for tri, _ in trigrams] + [bi for bi, _ in bigrams])
    vocab: Counter[str] = Counter()
    for doc in tqdm(corpus, desc='Underscoring multi-phrase expressions'):
        for sent in doc.sentences:
            sent.underscored_tokens = underscorer.tokenize(
                sent.normalized_tokens)
            vocab.update(sent.underscored_tokens)
            if conserve_RAM:
                del sent.normalized_tokens
    print('Pickling...')
    with open(out_dir / 'MWE_underscored.pickle', 'wb') as out_file:
        pickle.dump(corpus, out_file)
    # Write the surviving vocabulary (count: token) to the preview file.
    for key, val in vocab.most_common():
        if val >= min_frequency:
            print(f'{val:,}:\t{key}', file=preview)
    preview.close()
print(len(sentiment_list))
# Sentiment counts per speech. NOTE(review): the Pence speech is labeled
# "Trump 2020 " and the Harris speech "Biden 2020 " — possibly deliberate
# grouping by ticket, possibly a copy-paste slip; confirm.
trump20_sent = sentiment_ct(trump_speech, "Trump 2020 ")
biden20_sent = sentiment_ct(biden_speech, "Biden 2020 ")
pence20_sent = sentiment_ct(pence20_speech, "Trump 2020 ")
harris20_sent = sentiment_ct(harris20_speech, "Biden 2020 ")
trump16_sent = sentiment_ct(trump16_speech, "Trump 2016 ")
clinton16_sent = sentiment_ct(clinton16_speech, "Clinton 2016 ")
################################################################################################
# Bigrams
# 2020 POTUS
dnc_finder = BigramCollocationFinder.from_words(biden_tokens)
# NOTE(review): chi_sq here vs raw_freq for the RNC finder below — the plots
# use raw_freq for both, so this asymmetry only affects this nbest call.
dnc_finder.nbest(BigramAssocMeasures.chi_sq, 30) # top 30 DNC bigrams
dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'b',
                "Top 30 bigrams in Biden's 2020 DNC Speech", "Frequency Score")
# plot network
visualize_bigram(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], .6,
                 "Top 30 bigrams in Biden's 2020 DNC Speech") # democrat network
rnc_finder = BigramCollocationFinder.from_words(trump_tokens)
rnc_finder.nbest(BigramAssocMeasures.raw_freq, 30) # top 30 RNC bigrams
rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'r',
                "Top 30 bigrams in Trump's 2020 RNC Speech", "Frequency Score")
# plot network
visualize_bigram(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 0.6,
                 "Top 30 bigrams in Trump's 2020 RNC Speech") # republican network
def extract_bigram(words):
    """Return the 5 bigrams in *words* with the highest PMI score."""
    return BigramCollocationFinder.from_words(words).nbest(bigram_measures.pmi, 5)
# Display the bigram word cloud.
plt.figure(figsize = (50,25))
plt.imshow(bigram_wordcloud,interpolation = 'bilinear')
plt.axis("off")
plt.show()
# =============================================================================
#
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# Top-10 likelihood-ratio bigrams in the (no-attachment) email tokens.
finder=BigramCollocationFinder.from_words(email_wc_na)
a=finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(a)
# BUG FIX: the two separator lines below were bare '=====...' text (a
# SyntaxError); they are now proper comments.
# =============================================================================
# pos tagging
# =============================================================================
import nltk
nltk.download('averaged_perceptron_tagger')
token=nltk.word_tokenize(email_wc_a)
a=list(nltk.pos_tag(token))
from nltk import pos_tag
from nltk import RegexpParser
@author: issfz
"""
import string
from nltk.corpus import reuters
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures #bigram associations
from nltk.corpus import stopwords
# Tokenized documents per category (reuters.words is already tokenized).
grain_tok = [reuters.words(f) for f in reuters.fileids('grain') ] #return values are already tokenised
trade_tok = [reuters.words(f) for f in reuters.fileids('trade')]
# Flatten the grain documents into one lowercase token stream.
words = [w.lower() for f in grain_tok for w in f] #lower case to prevent case sensitivity
bcf = BigramCollocationFinder.from_words( words) # will give words but not matrix
# Top 100 candidate collocations by likelihood ratio.
top100 = bcf.nbest( BigramAssocMeasures.likelihood_ratio, 100 ) #will give top n no of best candidates based on certain criteria(likelihood ration we can use to start with)
# Keep only pairs where neither member is a punctuation character.
top = [(t1, t2) for (t1, t2) in top100
       if t1 not in string.punctuation and t2 not in string.punctuation]
stopset = set(stopwords.words('english'))
# Filter out short tokens and stopwords before re-ranking.
filter_stops = lambda w: len( w ) < 3 or w in stopset # filter stop words , more filtering required, prepare filter pattern first , we use lambda function
bcf.apply_word_filter(filter_stops)
# Compare the top-10 collocations under several association measures.
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)
bcf.nbest(BigramAssocMeasures.chi_sq, 10)
bcf.nbest(BigramAssocMeasures.jaccard, 10)
bcf.nbest(BigramAssocMeasures.mi_like, 10)
# Tail of prepareStopWords() (its `def` line is in an earlier chunk):
# append a few extra single/double-letter noise tokens, then return the list.
stopwordsList.append('x')
stopwordsList.append('z')
stopwordsList.append('Pp')
stopwordsList.append('Pq')
return stopwordsList
stopwords = prepareStopWords()
# fdist = FreqDist(text)
# fdist_no_punc_no_stopwords = nltk.FreqDist(dict((word, freq) for word, freq in fdist.items() if word not in stopwords and word.isalpha()))
# Bigrams: rank by PMI, then again after requiring frequency >= 3.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)
# Word cloud for the most frequent bigrams
stopWords = stopwords
# Strip punctuation/whitespace characters from each token, drop stopwords
# and empties, lemmatize, then rebuild the finder on the cleaned stream.
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]
text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
text_content = [WNL.lemmatize(t) for t in text_content]
finder = BigramCollocationFinder.from_words(text_content)
# Load pre-tokenized messages and tabulate bigram/trigram frequencies.
data_token = pd.read_csv(processed_path + "processed+tokenized.csv")
# SECURITY NOTE: eval() on a CSV column executes arbitrary expressions from
# the file; ast.literal_eval would be the safe equivalent for list literals.
data_token['message'] = data_token['message'].apply(eval)
#########################Entire Dictionary#####################################
# Flatten the per-message token lists into one stream (comprehension
# replaces the previous manual append loop).
flat_list = [item for sublist in data_token['message'] for item in sublist]
######################finds top bigrams and trigrams###########################
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()
trigramfinder = TrigramCollocationFinder.from_words(flat_list)
bigramfinder = BigramCollocationFinder.from_words(flat_list)
# Raw n-gram counts, sorted descending, written out as CSV tables.
bigram_freq = bigramfinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq),
                               columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
trigram_freq = trigramfinder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq),
                                columns=['trigram', 'freq']).sort_values(by='freq', ascending=False)
bigramFreqTable.to_csv(raw_path + "bigramFreqTable.csv")
trigramFreqTable.to_csv(raw_path + "trigramFreqTable.csv")
# Top likelihood-ratio bigrams in the NLTK 'pirates.txt' webtext corpus,
# after removing short tokens and English stopwords.
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
textWord = [w.lower() for w in webtext.words('pirates.txt')]
finder = BigramCollocationFinder.from_words(textWord)
#print(finder.nbest(BigramAssocMeasures.likelihood_ratio,10))
ignored_word = set(stopwords.words('english'))
print(ignored_word)
# Drop tokens shorter than 3 chars or in the stopword set.
filterStpos = lambda w: len(w) < 3 or w in ignored_word
finder.apply_word_filter(filterStpos)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
    # Tail of a syllable-counting helper (its `def`/`try` lie in an earlier
    # chunk): hyphenation points + 1 = syllable count.
    # NOTE(review): the bare `except:` swallows everything and returns 1 —
    # consider narrowing once the full function is in view.
    num = len(dic.positions(word)) + 1
except:
    return 1
return num
# Instantiate dictionary used to count syllables
dic = pyphen.Pyphen(lang='en')
# Instantiate corpus reader for word selection
ignoredWords = set(stopwords.words("english"))
filterStops = lambda w: len(w) < 3 or w in ignoredWords
# Load the brown corpus, get the collocations for each word and scores based on the likelihood of occurrence
bigramMeasures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams(bigramMeasures.likelihood_ratio)
# Create dictionary of lists to associate keys with all their bigram pairs (word, likelihood ratio)
dictList = collections.defaultdict(list)
for key, score in scored:
    dictList[key[0]].append((key[1], score))
# Get words from picture and assess for suitability — draws three distinct
# random tags (removing the first two so they can't repeat).
first = choice(tags)
tags.remove(first)
second = choice(tags)
tags.remove(second)
third = choice(tags)
# Create lists to hold words, syllables and max syllables for each line
def word_features(words, score_fn=BAM.chi_sq, n=200):
    """Feature dict over all words plus their n best-scoring bigrams."""
    finder = BigramCollocationFinder.from_words(words)
    best_pairs = finder.nbest(score_fn, n)
    return {feature: True for feature in chain(words, best_pairs)}
def bigram_feats(text, score_fn=BigramAssocMeasures.pmi, n_best=200):
    """Build a boolean feature dict from the n_best highest-scoring bigrams.

    Args:
        text: sequence of tokens.
        score_fn: NLTK association measure used to rank bigrams (default PMI).
        n_best: how many top bigrams to keep.

    Returns:
        dict mapping each selected bigram tuple to True. Note: only bigrams
        are included — the individual tokens are not.
    """
    bigram_finder = BigramCollocationFinder.from_words(text)
    n_grams = bigram_finder.nbest(score_fn, n_best)
    # Dict comprehension instead of dict([...]) — no intermediate pair list.
    return {n_gram: True for n_gram in n_grams}
def findBigrams(self, tweet):
    """Return the 20 bigrams in *tweet* with the highest likelihood ratio.

    Args:
        tweet: iterable of tokens.

    Returns:
        list of up to 20 (word, word) tuples ranked by likelihood ratio.
    """
    # list(tweet) materializes the iterable directly; the previous
    # `[w for w in tweet]` was a redundant element-by-element copy.
    bigrams = BigramCollocationFinder.from_words(list(tweet))
    return bigrams.nbest(BigramAssocMeasures.likelihood_ratio, 20)
# Print the 50 strongest likelihood-ratio bigrams for each talk transcript.
import nltk
from nltk.collocations import BigramCollocationFinder
from utils import tokenize_transcripts, get_files
# a list of tokens for each of the talks
transcript_tokens = tokenize_transcripts(stem=True)
# built in bigram metrics are in here
bigram_measures = nltk.collocations.BigramAssocMeasures()
# compute top bigrams and output results to console
for i, file in enumerate(get_files()):
    finder = BigramCollocationFinder.from_words(transcript_tokens[i])
    # score_ngrams returns (bigram, score) pairs sorted by descending score.
    bigrams = finder.score_ngrams(bigram_measures.likelihood_ratio)
    print(file)
    for [tokens, value] in bigrams[0:50]:
        print('{},{}'.format(" ".join(tokens), value))
    print('---------\n')
# Render and save the French word cloud.
wordcloud = WordCloud(max_font_size=40).generate(word_cloud_fr)
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('word_cloud_fr.png')
#plt.show()
#English bigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
bcf = BigramCollocationFinder.from_words(words_en)
from nltk.corpus import stopwords
# `sw` is a stopword set prepared earlier in the file.
stopset = sw
# Drop tokens shorter than 3 chars or in the stopword set.
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
# Top 20 bigrams by likelihood ratio, joined into space-separated strings.
bcf_list = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
bcf_joint_list = []
for words in bcf_list:
    bcf_joint_list.append(' '.join(words))
#save list in txt file
with open("bigrams_en.txt", "w") as output:
    output.write(str(bcf_joint_list))
#English trigrams
#Continuing to work from NLP for hackers. import nltk from nltk import word_tokenize from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder #Load ulysees into a variable. with open('messages_only.txt', 'r', encoding="utf-8") as myfile: text = myfile.read() #tokenize the text tokens = word_tokenize(text) bigram_measures = BigramAssocMeasures() trigram_measures = TrigramAssocMeasures() #compute length-2 collocations finder = BigramCollocationFinder.from_words(tokens) finder.apply_freq_filter(5) print(finder.nbest(bigram_measures.pmi, 20)) finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) #only trigrams that appear 5+ times finder.apply_freq_filter(5) #return the 50 trigrams with the highest PMI print(finder.nbest(trigram_measures.pmi, 20))
def main(): # parser = ArgumentParser() # parser.add_argument("--folder", type=str, dest="folder") # args = parser.parse_args() # hotelname = args.folder # Load the tokenized reviews (sentences) # infile = args.folder+"/"+hotelname+"_NB_trainingdata.senttokens_sel.pyvar" infile = "NB_data/NB_trainingdata.senttokens.pyvar" infile = open(infile, 'r') print infile word_tokens_byreviewid = pickle.load(infile) infile.close() # Load the training reviewids # infile = args.folder+"/"+hotelname+"_NB_trainingdata.labels.pyvar" infile = "NB_data/NB_trainingdata.labels.pyvar" infile = open(infile, 'r') keepreviewids = pickle.load(infile) infile.close() word_tokens_byreviewid_expanded = {} negtags = []; postags = [] for reviewid in word_tokens_byreviewid: sents = word_tokens_byreviewid[reviewid] for sent_idx in range(0, len(sents)): tag = (reviewid, str(sent_idx)) word_tokens_byreviewid_expanded[tag] = sents[sent_idx] if reviewid in keepreviewids['1']: negtags.append(tag) if reviewid in keepreviewids['5']: postags.append(tag) print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags)) # # Stem the words in the sentences # # Separate each sentence into a unique entry # word_tokens_byreviewid_expanded = {} # negtags = []; postags = [] # tag_expanded = [] # for reviewid in word_tokens_byreviewid: # sents = word_tokens_byreviewid[reviewid] # for sent_idx in range(0, len(sents)): # tag = (reviewid, str(sent_idx)) # tag_expanded.append(tag) # # print tag # # print sents[sent_idx] # word_tokens_byreviewid_expanded[tag] = sents[sent_idx] # if reviewid in keepreviewids[0]: # negtags.append(tag) # # print "negtag : " # # print tag # # print negtags # if reviewid in keepreviewids[1]: # postags.append(tag) # print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags)) # print negtags, len(negtags), len(set(negtags)) # print # # print postags # print word_tokens_byreviewid_expanded # print tag_expanded # print word_tokens_byreviewid_expanded[tag_expanded] # all_words = [] # # 
Get all words to analyze frequency of unigrams and bigrams # for t in tag_expanded : # print tag # tag = t # word = word_tokens_byreviewid_expanded[tag] # token=nltk.word_tokenize(word) # # print len(token) # for i in range (0, len(token)): # all_words.append(token[i]) # # all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]] # # all_words =word_tokens_byreviewid_expanded[negtags] # # Get all the stop words # stopwords = get_bad_words() # # print all_words # Get all words to analyze frequency of unigrams and bigrams all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]] # Get all the stop words stopwords = get_bad_words() # dispersion_plot(all_words,postags) # Trigrams trigram_finder = TrigramCollocationFinder.from_words(all_words) trigram_finder.apply_ngram_filter(lambda w1, w2, w3: w1 in stopwords or w3 in stopwords) trigram_finder.apply_freq_filter(10) trigrams = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 2000) print "Number trigrams: %d" %len(trigrams) # print trigrams[:100] # Bigrams bigram_finder = BigramCollocationFinder.from_words(all_words) bigram_finder.apply_freq_filter(20) bigram_finder.apply_word_filter(lambda stopword: stopword in stopwords) bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 2000) print "Number bigrams: %d" %len(bigrams) # print bigrams[:100] # Unigrams word_freq_dist = DataFrame(dict(FreqDist(all_words)).items(), columns = ['word','count']) word_freq_dist = word_freq_dist[word_freq_dist['count'] > 20] # print word_freq_dist good_features = list(set(word_freq_dist['word']) - stopwords) print "Number unigrams: %d" %len(good_features) good_features.extend(bigrams) good_features.extend(trigrams) # print good_features # Output the features in the model # outfile = args.folder+"/"+ args.folder+"_NB_sentiment.model.features.pyvar" outfile = "NB_data/NB_sentiment.model.features.pyvar" outfile = open(outfile, 'w') 
pickle.dump(good_features, outfile) outfile.close() # Calculate the features negfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'neg') for fid in negtags] posfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'pos') for fid in postags] # print negfeatures # # Shuffle and balance the two classes # n_min = min([len(negfeatures), len(posfeatures)]) # random.shuffle(negfeatures) # negfeatures = negfeatures[:n_min] # random.shuffle(posfeatures) # posfeatures = posfeatures[:n_min] # # Define training and testing data # numfolds = 10 # foldsize = n_min/numfolds # negfolds = make_folds(negfeatures, foldsize) # posfolds = make_folds(posfeatures, foldsize) negfolds = cross_validation.StratifiedKFold(negfeatures, n_folds=10) print negfolds posfolds = cross_validation.StratifiedKFold(posfeatures, n_folds=10) print posfolds # 10 fold cross validation outfile = "NB_data/NB_sentiment.model.performance.tab" outfile = open(outfile, 'w') outfile.write("Fold\taccuracy\tpos_precision\tpos_recall\tneg_precision\tneg_recall\n") for fold in range(0, numfolds): outfile.write("%d\t" %fold) testdata = negfolds[fold] + posfolds[fold] traindata = [] for i in range(0, numfolds): if i != fold: traindata += negfolds[i] traindata += posfolds[i] print 'train on %d instances, test on %d instances' % (len(traindata), len(testdata)) result = eval_classifier(traindata, testdata) accuracy, posprecision, posrecall, negprecision, negrecall = result print result outfile.write("%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n"%(accuracy, posprecision, posrecall, negprecision, negrecall)) outfile.close() # Save the classifier trained using all data classifier = NaiveBayesClassifier.train(negfeatures + posfeatures) # outfile = args.folder+"/"+ args.folder+"_NB_sentiment.model.pyvar" outfile = "NB_data/NB_sentiment.model.pyvar" outfile = open(outfile, 'w') pickle.dump(classifier, outfile) outfile.close()