Code Example #1
File: Extractor.py Project: Palazor/sentiment
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
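The word_scores dictionary returned above is normally reduced to a fixed-size feature set before training a classifier. A minimal sketch of that selection step, using a hypothetical helper name (find_best_words) that is not part of the original project:

def find_best_words(word_scores, number):
    # Sort words and bigrams by their chi-square score and keep the top `number`.
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, s in best_vals)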
Code Example #2
    def get_unibigram_features(all_words, uni_feanum, bi_feanum):
        word_fd = nltk.FreqDist(all_words)
        bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

        if uni_feanum == 'max':
            uni_feanum = len(list(word_fd.keys()))
        elif uni_feanum > len(list(word_fd.keys())):
            uni_feanum = len(list(word_fd.keys()))

        if bi_feanum == 'max':
            bi_feanum = len(list(bigram_fd.keys()))
        elif bi_feanum > len(list(bigram_fd.keys())):
            bi_feanum = len(list(bigram_fd.keys()))

        finder = BigramCollocationFinder(word_fd, bigram_fd)
        bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

        print "the number of unigram features is", uni_feanum
        print "the number of bigram features is", bi_feanum

        featuples = word_fd.most_common(uni_feanum)

        selected_words = []

        for i in range(uni_feanum):
            selected_words.append(featuples[i][0])

        features = []
        for ngram in itertools.chain(selected_words, bigrams):
            features.append(ngram)

        return features
Code Example #3
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    Use bigram collocations to measure the informativeness of words
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #5
def create_word_bigram_scores(posWords, negWords, n = 5000):
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word]+=1
        conditionwordfd['pos'][word]+=1
        
    for word in negWords:
        wordfd[word]+=1
        conditionwordfd['neg'][word]+=1
    
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
Code Example #6
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','rb'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #7
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #8
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract words from positive and negative sentences.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only the bigrams in which both words are longer than 3 characters
    # and neither word is in the exclusion list `ex`.
    bigrams_best = [bigram for bigram in bigrams_all
            if len(bigram[0]) > 3 and len(bigram[1]) > 3
            and bigram[0] not in ex and bigram[1] not in ex ]


    return bigrams_best
Code Example #9
File: score.py Project: TianyiM/Final-Project
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #10
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finderr = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finderr.nbest(BigramAssocMeasures.chi_sq,350000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq,350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #11
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #12
File: feature_extrac.py Project: yyr93520/NLPproject
def create_word_bigram_scores():
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    
    objWords = list(itertools.chain(*objdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)

    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)


    pos = posWords + posBigrams
    neg = negWords + negBigrams
    
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
       
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores
Code Example #14
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
        bigramslist = []
        bigram_finder = BigramCollocationFinder.from_words(tweet)
        bigrams = bigram_finder.nbest(score_fn, n)
        for bigram in bigrams:
            bigramslist.append(' '.join(str(i) for i in bigram))
        print(bigramslist)
Code Example #15
File: text.py Project: prz3m/kind2anki
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Code Example #16
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))

    return d
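best_word_features is not shown in this snippet. A plausible minimal sketch, assuming a module-level best_words set built from a word-scores dictionary like the ones above (the names and details are assumptions, not the original project's code):

best_words = set()  # assumed to hold the top-scored words, e.g. selected from create_word_bigram_scores()

def best_word_features(words):
    # Mark only the informative words as features.
    return dict([(word, True) for word in words if word in best_words])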
Code Example #17
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder, \
                                          score_fn=BigramAssocMeasures.chi_sq):
  if stop:
    words = [w for w in words if w not in stopset]
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigrams = bigram_finder.nbest(score_fn, limit)
  return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #18
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn=BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
Code Example #19
File: text_utils.py Project: fruser/review-analyzer
def get_bag_of_bigrams_words(
        word_list,
        score_fn=BigramAssocMeasures.chi_sq,
        n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
Code Example #20
File: NLTK_tools.py Project: dreampocketit/bocard
def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size) 
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import f_measure, BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
Code Example #21
File: collocations.py Project: rjoganah/Dynamic_IR
 def collaction_discovery(self):
     self.corpus = nltk.word_tokenize(self.corpus.lower())
     bigramm_finder = BigramCollocationFinder.from_words(self.corpus)
     filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
     bigramm_finder.apply_word_filter(filter_bigram)
     top_10_bigrams = bigramm_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
     return top_10_bigrams
Code Example #22
File: classifier.py Project: benneic/sentimento
 def converter(tokens):
     bigram_finder = BigramCollocationFinder.from_words(tokens)
     bigrams = bigram_finder.nbest(score_fn, n)
     return (
         {ngram: True for ngram in itertools.chain(tokens, bigrams)},
         label
     )
Code Example #23
File: grapher.py Project: amac441/Metten
    def get_frequencies(self, desc):

        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        words = word_tokenize(desc)

        print('------gram--------')
        words_to_count = [word for word in words if word not in stopset]
        words_to_count = [word for word in words_to_count if not len(word) < 3]
        c = Counter(words_to_count)
        single = c.most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # only keep those that appear more than 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        matches = [single,bigrm,trigrm]
        return matches
Code Example #24
 def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
     # TODO configuration value
     clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
     bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
     bigram_measures = BigramAssocMeasures()
     bigrams = bigram_finder.nbest(bigram_measures.pmi, n)
     return bigrams
Code Example #25
 def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
         bigramslist = []
         bigram_finder = BigramCollocationFinder.from_words(tweet)
         bigrams = bigram_finder.nbest(score_fn, n)
         for bigram in bigrams:
             bigramslist.append(' '.join(str(i) for i in bigram))
         return bigramslist #This is list e.g. ['you dude', 'Hi How', 'How are', 'are you']
Code Example #26
 def get_collocations(self):
     ignored_words = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array,2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
     bigram_measures = BigramAssocMeasures()
     return finder.nbest(bigram_measures.likelihood_ratio,40)
Code Example #27
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Code Example #28
 def get_bigram(self, features_list):
     # The top bigrams (up to self.bigram_threshold) are selected
     score = BigramAssocMeasures.chi_sq
     all_bigrams = BigramCollocationFinder.from_words(features_list)
     best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
     selected_bigrams = [(bigram, True) for bigram in best_bigrams]
     return selected_bigrams
Code Example #29
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #30
File: ytpy.py Project: juliasun/Youtube-Tox
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size = 20)
    top_general = bi_finder.nbest(bi.pmi,30)
    bi_finder.apply_ngram_filter(lambda w1,w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
Code Example #31
File: test_nltk.py Project: ChyengJason/Wandoujia
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)  # all words plus the (most informative) bigrams together as features
Code Example #32
File: test_nltk.py Project: ChyengJason/Wandoujia
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-square measure to pick the top 1000 bigrams
    return bag_of_words(bigrams)
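bag_of_words is not defined in this snippet; in the usual NLTK feature-extraction pattern it simply maps every token (word or bigram tuple) to True. A minimal sketch under that assumption:

def bag_of_words(words):
    # Represent each word or bigram tuple as a boolean feature.
    return dict([(word, True) for word in words])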
Code Example #33
 def get_bigrams(self, words):
     bigram_finder = BigramCollocationFinder.from_words(words)
     self.biagrams = bigram_finder.nbest(self.bigram_score_funcion,
                                         self.top_ngram_count)
     return self.biagrams
Code Example #34

def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])


def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams


print(get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

# Now using gensim
print("Sentence: ", norm_alice[2])
key_words = keywords(norm_alice[2], ratio=1.0, scores=True, lemmatize=True)
print([(item, round(score, 3)) for item, score in key_words][:25])
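compute_ngrams is not defined in this snippet. A minimal sketch of a compatible helper (an assumption, not the original code) that builds n-gram tuples by zipping shifted copies of the token list:

def compute_ngrams(sequence, n):
    # Zip the token list against its shifted copies to produce n-gram tuples.
    return list(zip(*(sequence[index:] for index in range(n))))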
Code Example #35
def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder=BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(5)
    finder.nbest(bigram_measures.pmi, 5) 
    return finder.ngram_fd.items()
Code Example #36
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
Code Example #37
def get_bigrams(tokens, freq_filter=None):
    finder = BigramCollocationFinder.from_words(tokens)
    if freq_filter:
        finder.apply_freq_filter(freq_filter)
    return list(' '.join(b[0]) for b in finder.ngram_fd.items())
Code Example #38
def bigram(collat_data):
    df_co = pd.DataFrame.to_string(collat_data,
                                   columns=['lemmatization']).split(',')
    bcf = BigramCollocationFinder.from_words(df_co)
    top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    return top20
Code Example #39
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
Code Example #40
modelkmeans = KMeans(init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(modelkmeans.n_clusters):
    print("Cluster {}:".format(i)),
    for ind in order_centroids[i, :10]:
        print("{}".format(terms[ind]))

s = all_text_docs[name]
tokens = word_tokenize(s)
text = nltk.Text(tokens)
text.collocations()
text.concordance('social')
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(s))
finder.nbest(bigram_measures.pmi, 10)

######
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True)
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
Code Example #41
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
Code Example #42
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
  complaints = list(csv.reader(file))
  file.close()

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)


from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

#from nltk.collocations import TrigramCollocationFinder
#from nltk.metrics import TrigramAssocMeasures
#tcf = TrigramCollocationFinder.from_words(words)
#tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
Code Example #43
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # finds words that often occur together
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Code Example #44
def analyze_text(text, filename, stopwords, min_length, freq, total_ngrams,
                 min_measure, bigrams_only, trigrams_only):
    print(len(text), filename)
    words = [
        w.lower() for w in text if w not in string.punctuation
        if w.lower() not in stopwords and len(w) >= min_length
    ]

    bigrams = None
    b_prefix_keys = None
    trigrams = None
    t_prefix_keys = None

    # what follows could totally be generalized
    if not trigrams_only:
        # Bigrams
        print("Generating bigrams from", filename)
        b_finder = BigramCollocationFinder.from_words(words)
        b_finder.apply_freq_filter(freq)
        # if stopwords:
        #   b_finder.apply_word_filter(lambda w: w in stopwords)
        bigrams = b_finder.nbest(BigramAssocMeasures.pmi, total_ngrams)
        b_scored = b_finder.score_ngrams(BigramAssocMeasures.pmi)
        b_prefix_keys = collections.defaultdict(list)
        for key, scores in b_scored:
            if scores > min_measure:
                b_prefix_keys[key[0]].append((key[1], scores))

    # Trigrams
    if not bigrams_only:
        print("Generating trigrams from", filename)
        t_finder = TrigramCollocationFinder.from_words(words)
        t_finder.apply_freq_filter(freq)
        # if stopwords:
        #   t_finder.apply_word_filter(lambda w: w in stopwords)
        trigrams = t_finder.nbest(TrigramAssocMeasures.pmi, total_ngrams)
        t_scored = t_finder.score_ngrams(TrigramAssocMeasures.pmi)
        t_prefix_keys = collections.defaultdict(list)
        for key, scores in t_scored:
            if scores > min_measure:
                t_prefix_keys[key[0]].append((key[1], key[2], scores))

    if bigrams_only:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd
        }
    elif trigrams_only:
        ret = {
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    else:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd,
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    return ret
Code Example #45
def main():
    # stopwords to filter out for collocations
    stopwords_eng = set(stopwords.words("english"))
    stopwords_eng.add('et')
    stopwords_eng.add('al')


    # bigram identifier from nltk
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # tf-idf vectorizer from nltk
    tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

    file = open('CultureRelatedDiaognosticIssues.txt','r')
    a = []
    names = []
    for line in file:
        miniList = line.split("|")
        names.append(int(miniList[0].strip()))
        a.append(miniList[1].strip())
    file.close()

    allvocab_stemmed = []
    allvocab_tokenized = []

    for element in a:
        stemmed_result = tokenize_and_stem(element)
        allvocab_stemmed.extend(stemmed_result)

        tokenized_result = tokenize_only(element)
        allvocab_tokenized.extend(tokenized_result)

    # data frame that contains stems and tokenized words
    vocab_frame = pd.DataFrame({'words': allvocab_tokenized},
    index = allvocab_stemmed)

    # tf-idf matrix for the terms in the corpus
    tfidf_matrix = tfidf_vectorizer.fit_transform(a)
    terms = tfidf_vectorizer.get_feature_names()

    # number of clusters
    num_clusters = 10

    # fitting the k-means algorithm and saving it in a .pkl file
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km,  'cluster.pkl')
    km = joblib.load('cluster.pkl')
    clusters = km.labels_.tolist()

    # data frame that saves the chapter, the text, and the assigned cluster
    dsm = {'chapter': names, 'text': a, 'cluster': clusters}
    frame = pd.DataFrame(dsm, index = [clusters], columns = ['chapter', 'text', 'cluster'])

    #groupby cluster for aggregation purposes
    grouped = frame['chapter'].groupby(frame['cluster'])

    # getting rid of all punctuation for bigram measures - will use this later
    puncTokenizer = RegexpTokenizer(r'\w+')

    print("Top terms per cluster:")
    print()

    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]:
            print(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print()

        print("Cluster %d titles:" % i, end='')
        for title in frame.loc[i]['chapter'].values.tolist():
            print(str(title) + " , ", end='')
        print()

        # this for-loop finds the most common pairs of words in each diagnosis
        for text in frame.loc[i]['text'].values.tolist():
            data_tokens = puncTokenizer.tokenize(text)
            data_tokens = [x.lower() for x in data_tokens]

            tokens = [w for w in data_tokens if w not in stopwords_eng]

            finder = BigramCollocationFinder.from_words(tokens)
            print('Printing collocations in this chapter:')
            print(finder.nbest(bigram_measures.likelihood_ratio, 5))
            print()
    print()
    print()

    # distribution of clusters
    plt.hist(km.labels_, bins=num_clusters)
    plt.show()
Code Example #46
File: general.py Project: azizmb/TWSS
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
Code Example #47
def bigram_words(words, score_fn=BigramAssocMeasures.pmi, n=121):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Code Example #48
def create_features(X, user_data=None):
    res = []

    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()

        comment = parse_text(comment)

        comment = nltk.clean_html(comment)

        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'

            doc[i] = ps.stem(doc[i])

            doc[i] = wnl.lemmatize(doc[i])

            if doc[i] in bad_words:
                doc[i] = '_badword_'

            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)

        bigram = dict([(ngram, True)
                       for ngram in itertools.chain(doc, bigrams)])

        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)

        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
    #        feat['_user_last'] = user_info['LastWebAccess']

    # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        #        feat['_has_hate_word'] = has_hate_word
        #        feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        #        feat['_has_occult_word'] = has_occult_word
        #        feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)

        #        print feat
        res.append(feat)
    return res
Code Example #49
File: util.py Project: rohankshir/football
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    print(words, "\n")
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #50
File: testing.py Project: riamf/jupyternotes
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.stem import LancasterStemmer

f = open('data.txt', 'r')
lines = f.readlines()
f.close()

custom_stopwords = set(stopwords.words('english') + list(punctuation))

tokenized_lines = []
for line in lines:
    tokenized_words = [
        word for word in word_tokenize(line) if word not in custom_stopwords
    ]
    tokenized_lines.append(tokenized_words)

bigram_measures = BigramAssocMeasures()
ngrams = []
for line in tokenized_lines:
    ngrams.append(
        sorted(BigramCollocationFinder.from_words(line).ngram_fd.items()))

st = LancasterStemmer()
stemmed = []
for line in tokenized_lines:
    stemmed_words = [st.stem(word) for word in line]
    stemmed.append(stemmed_words)

for st in stemmed:
    print(st)
Code Example #51
def bag_of_bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=100):
    bigram_finder= BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_non_stopwords(words+bigrams)
Code Example #52
import nltk
nltk.download('punkt')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder

bi_dict = dict()
bg_measures = BigramAssocMeasures()
with open('text/text.txt', 'r') as file:
    text = file.read()
    table = str.maketrans(dict.fromkeys('0123456789'))
    textWithoutNumbers = text.translate(table)

    words = nltk.word_tokenize(textWithoutNumbers)

    bi_finder = BigramCollocationFinder.from_words(words, window_size=2)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bi_finder.apply_freq_filter(2)
    t = bi_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
        print(k, v)
    bi_finder.score_ngrams(bigram_measures.pmi)
    bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
    print(bi_collocs)

    tri_finder = TrigramCollocationFinder.from_words(words)
    tri_finder.apply_freq_filter(5)
    t = tri_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
Code Example #53
def ExtractCollocationFeatures(train_dataset,
                               test_dataset,
                               X_train_filename,
                               X_test_filename,
                               window_size,
                               n_features,
                               balance_dataset=False,
                               remove_center_interval=None):

    # This method extracts collocations of two words within the given
    # window of words as features from the given train and test datasets.
    # It returns the X and Y matrices and a list with the feature names.
    # It also stores those X matrices in txt files with names X_train_filename and
    # X_test_filename under the /feature_matrices folder.
    # There are four tuneable parameters:
    # - window_size: size of the window.
    # - n_features: number of features considered.
    # - balance_dataset: set to True to balance the training dataset.
    # - remove_center_interval: format: [-0.2, 0.2]. Removes samples with DW-Nominate inside
    #   the interval.

    print("Reading datasets...")
    path_train = "../datasets/train/"
    train_dataset_df = pd.read_csv(path_train + train_dataset,
                                   sep="|",
                                   encoding="latin_1",
                                   header=None)
    train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    #Remove rows with DW-nominates close to 0
    if type(remove_center_interval) != type(None):
        train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        train_dataset_df = train_dataset_df[train_dataset_df['ideology'] != 0]

    train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    path_test = "../datasets/test/"
    test_dataset_df = pd.read_csv(path_test + test_dataset,
                                  sep="|",
                                  encoding="latin_1",
                                  header=None)
    test_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    if balance_dataset == True:
        positive_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == 1.0])
        negative_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == -1.0])
        if positive_rows > negative_rows:
            n = positive_rows - negative_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)
        else:
            n = negative_rows - positive_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       -1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)

    train_speeches = train_dataset_df['speech'].values.tolist()
    Y_train = train_dataset_df['ideology'].values.tolist()

    if type(remove_center_interval) != type(None):
        test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        test_dataset_df = test_dataset_df[test_dataset_df['ideology'] != 0]

    test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    test_speeches = test_dataset_df['speech'].values.tolist()
    Y_test = test_dataset_df['ideology'].values.tolist()

    print("Extracting features from train dataset...")
    t_start = time.time()

    stop_words = stopwords.words('english')

    total_bigrams = {}
    bigrams_per_speech_train = []
    t0 = time.time()
    print(len(train_speeches))
    for i in range(0, len(train_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = train_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)

        for item in bcf.ngram_fd.items():
            if item[0] not in total_bigrams:
                total_bigrams.update({item[0]: item[1]})
            else:
                total_bigrams[item[0]] += item[1]

        bigrams_per_speech_train.append(bcf.ngram_fd.items())

    print("Total bigrams finded: ", len(total_bigrams))

    feature_names = []
    most_frequent_bigrams_sorted = sorted(total_bigrams.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:n_features]
    print("Number of features: ", len(most_frequent_bigrams_sorted))
    most_frequent_bigrams = dict(most_frequent_bigrams_sorted)

    for i in range(0, len(most_frequent_bigrams_sorted)):
        feature_names.append(most_frequent_bigrams_sorted[i][0])
    print(len(feature_names))

    order = list(range(0, len(feature_names)))
    collocation_order = dict(zip(feature_names, order))

    print("Computing X_train...")

    X_train_matrix = np.zeros(
        (len(bigrams_per_speech_train), len(feature_names)))

    for i in range(0, len(bigrams_per_speech_train)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_train[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_train_matrix[i][column] = bigrams_per_speech_i[bigram]
    print("Creating dataframe...")
    X_train_df = pd.DataFrame(X_train_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    pathX = "../feature_matrices/"
    print("Saving X_train into a txt file...")
    X_train_df.to_csv(pathX + X_train_filename,
                      header=feature_names,
                      index=None,
                      sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_train = csr_matrix(X_train_df)

    print("Extracting bigrams from test dataset...")

    bigrams_per_speech_test = []
    t0 = time.time()
    print(len(test_speeches))
    for i in range(0, len(test_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = test_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)
        bigrams_per_speech_test.append(bcf.ngram_fd.items())

    print("Computing X_test...")

    X_test_matrix = np.zeros(
        (len(bigrams_per_speech_test), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_test)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_test[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_test_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_test_df = pd.DataFrame(X_test_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    print("Saving X_test into a txt file...")
    X_test_df.to_csv(pathX + X_test_filename,
                     header=feature_names,
                     index=None,
                     sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_test = csr_matrix(X_test_df)

    t_end = time.time()
    total_time = t_end - t_start
    print("Total time: ")
    print(str(total_time) + " seconds")

    return X_train, Y_train, X_test, Y_test, feature_names
Code Example #54
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]

    return sorted_ngrams


corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
Code Example #55
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])
Code Example #56
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
Code Example #57
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler

import seaborn as sns

df = pd.read_csv('../preprocessed_dataset.csv')
df.head()

# Calculating number of repeated bigrams per song. Only considered bigrams of which repetition frequency is greater than 3
bigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_bigram = []
    text = df["Lyrics"][i].split()
    coll_bia = bigram_collocation.from_words(text)
    coll_bia.apply_freq_filter(3)
    bigram_freq = coll_bia.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq'
                                            ]).sort_values(by='freq',
                                                           ascending=False)
    bigram_score.append(len(bigramFreqTable.index.values))

# Calculating number of repeated trigrams per song. Only considered trigrams of which repetition frequency is greater than 3
trigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_trigram = []
    text = df["Lyrics"][i].split()
    coll_tri = trigram_collocation.from_words(text)
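bigram_collocation and trigram_collocation are not defined in this snippet; they appear to be aliases for the NLTK finder classes. A guess at the missing imports (an assumption, not the original code):

# Assumed aliases for the finder classes used above:
from nltk.collocations import BigramCollocationFinder as bigram_collocation
from nltk.collocations import TrigramCollocationFinder as trigram_collocation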
Code Example #58
plt.show()

fd = fdist_no_punc_no_stopwords

# the most common ones
fd.most_common(50)

# dispersion plots
text.dispersion_plot(["God", "mind", "knowledge"])
text.dispersion_plot(["power", "reason", "nature"])
# text.concordance("god")

# bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# what changes here is the filter

# word cloud (WC) for the most frequent bigrams
stopWords = stopwords
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]

text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
Code Example #59
File: collocations.py Project: alturutin/NLP_ru
corpus = []
while True:
    l = hpmor.readline()
    if l == '': break
    l = re.sub(r"[^а-яё \t-]", "", l.lower()).strip().split()
    if l: corpus.extend(l)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

stop = set(stopwords.words('russian'))
stop.update(['гарри', 'поттер', 'профессор'
             ])  # add the most frequent words from the text to the stop list
corpus_ = list(filter(lambda x: x not in stop, corpus))

finder = BigramCollocationFinder.from_words(corpus_)
finder3 = TrigramCollocationFinder.from_words(corpus_)

# frequency filters and stopwords
finder.apply_freq_filter(5)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)
finder3.apply_freq_filter(5)
finder3.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)

# bigrams and trigrams
raw_bigrams = finder.nbest(bigram_measures.raw_freq, 100)
pmi_bigrams = finder.nbest(bigram_measures.pmi, 100)
raw_trigrams = finder3.nbest(trigram_measures.raw_freq, 100)
pmi_trigrams = finder3.nbest(trigram_measures.pmi, 100)

Code Example #60
File: judge.py Project: HZ-njupt/Weibo
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-square measure to select the top n bigrams
    newBigrams = [u + v for (u, v) in bigrams]