def _get_bigram_scores(self, posdata, negdata):
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():  # iteritems() is Python 2 only
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
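# A minimal sketch (not from the source) of how scores like the ones above are
# typically consumed: keep the N highest-scoring unigrams/bigrams as the
# feature vocabulary. `find_best_features` is an illustrative name.
def find_best_features(word_scores, n=1000):
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:n]
    return set(w for w, score in best)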
def get_unibigram_features(all_words, uni_feanum, bi_feanum):
    word_fd = nltk.FreqDist(all_words)
    bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

    # Clamp the requested feature counts to what is actually available.
    if uni_feanum == 'max':
        uni_feanum = len(word_fd)
    elif uni_feanum > len(word_fd):
        uni_feanum = len(word_fd)
    if bi_feanum == 'max':
        bi_feanum = len(bigram_fd)
    elif bi_feanum > len(bigram_fd):
        bi_feanum = len(bigram_fd)

    finder = BigramCollocationFinder(word_fd, bigram_fd)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

    print("the number of unigram features is", uni_feanum)
    print("the number of bigram features is", bi_feanum)

    featuples = word_fd.most_common(uni_feanum)
    selected_words = [featuples[i][0] for i in range(uni_feanum)]

    features = list(itertools.chain(selected_words, bigrams))
    return features
def create_word_bigram_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score the informativeness of features using both words and bigrams.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d"
          % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():  # iteritems() is Python 2 only
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Use a separate finder per class; the original reused one variable, so
    # posBigrams were silently drawn from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1   # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, n=5000):
    # (posWords, negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)

    posWords = posWords + posbigrams
    negWords = negWords + negbigrams

    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word] += 1
        conditionwordfd['pos'][word] += 1
    for word in negWords:
        wordfd[word] += 1
        conditionwordfd['neg'][word] += 1

    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count

    for word, freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word],
                                               (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word],
                                               (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
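# Hypothetical usage of create_word_bigram_scores above, with toy tokenized
# reviews standing in for real data (illustration only).
pos_reviews = [['good', 'phone'], ['really', 'good', 'battery']]
neg_reviews = [['bad', 'screen'], ['battery', 'really', 'bad']]
scores = create_word_bigram_scores(pos_reviews, neg_reviews, n=10)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5])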
def create_word_bigram_scores():
    # Pickled files must be opened in binary mode ('rb'), not text mode.
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Keep one finder per class so posBigrams are not drawn from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1          # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords):
    # One finder per class; reusing the same variable would score the positive
    # bigrams against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract lowercased, punctuation-free words from each class.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s)
                 if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s)
                 if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations.
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations.
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only bigrams where both words are longer than three characters and
    # neither appears in the exclusion list `ex` (assumed defined elsewhere).
    bigrams_best = [bigram for bigram in bigrams_all
                    if len(bigram[0]) > 3 and len(bigram[1]) > 3
                    and bigram[0] not in ex and bigram[1] not in ex]
    return bigrams_best
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Separate finders so each class's bigrams come from its own corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1      # FreqDist.inc() was removed in NLTK 3
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word],
                                               (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word],
                                               (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1      # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Separate finders so each class's bigrams come from its own corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    # posWords / negWords / get_scores are assumed to be defined at module level.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))

    # One finder per class; the original reused a single variable, which drew
    # every class's bigrams from the objective corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:  # was objWords, which silently dropped the obj bigrams
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word],
                                               (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigramslist = []
    bigram_finder = BigramCollocationFinder.from_words(tweet)
    bigrams = bigram_finder.nbest(score_fn, n)
    for bigram in bigrams:
        bigramslist.append(' '.join(str(i) for i in bigram))
    print(bigramslist)
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
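# Hedged usage sketch for the Text.collocations() method above; requires the
# NLTK 'gutenberg' and 'stopwords' corpora to be downloaded.
import nltk
from nltk.corpus import gutenberg
emma = nltk.Text(gutenberg.words('austen-emma.txt'))
emma.collocations(num=10)  # prints strings like "Frank Churchill; Miss Woodhouse; ..."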
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))
    return d
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder,
                         score_fn=BigramAssocMeasures.chi_sq):
    if stop:
        words = [w for w in words if w not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, limit)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
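# Illustrative call to bigram_word_features above; the argument values are
# assumptions matching the parameter names, not values from the source.
from nltk.corpus import stopwords as sw
tokens = ['this', 'movie', 'was', 'really', 'great', 'really', 'great', 'fun']
feats = bigram_word_features(tokens, limit=5, stop=True,
                             stopset=set(sw.words('english')),
                             word_score_placeholder=None)
print(list(feats)[:5])  # mix of unigram keys and (w1, w2) bigram keys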
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn = BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    # The original chained an undefined name `words`; it should be tweet_words.
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
def get_bag_of_bigrams_words(word_list, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
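# get_bag_of_words is not shown in this snippet; a minimal sketch consistent
# with its use here is the conventional NLTK "token => True" mapping:
def get_bag_of_words(words):
    return dict((word, True) for word in words)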
def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    if not ('_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import f_measure, BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + u' ' + w2 for w1, w2 in self._collocations]
    print("List {0} collocations".format(num))
    print(tokenwrap(colloc_strings, separator=u'; '))
def collocation_discovery(self):
    self.corpus = nltk.word_tokenize(self.corpus.lower())
    bigram_finder = BigramCollocationFinder.from_words(self.corpus)
    filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
    bigram_finder.apply_word_filter(filter_bigram)
    top_10_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    return top_10_bigrams
def converter(tokens, label=None, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # score_fn, n, and label were free (undefined) names in the original;
    # they are lifted into parameters here so the function is self-contained.
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(score_fn, n)
    return (
        {ngram: True for ngram in itertools.chain(tokens, bigrams)},
        label,
    )
def get_frequencies(self, desc):
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    print('------gram--------')
    words_to_count = [word for word in words if word not in stopset]
    words_to_count = [word for word in words_to_count if not len(word) < 3]
    c = Counter(words_to_count)
    single = c.most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    matches = [single, bigrm, trigrm]
    return matches
def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
    # TODO configuration value
    clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
    bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
    # The original hard-coded PMI here, silently ignoring score_fn.
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigramslist = []
    bigram_finder = BigramCollocationFinder.from_words(tweet)
    bigrams = bigram_finder.nbest(score_fn, n)
    for bigram in bigrams:
        bigramslist.append(' '.join(str(i) for i in bigram))
    # Returns a list, e.g. ['you dude', 'Hi How', 'How are', 'are you']
    return bigramslist
def get_collocations(self):
    ignored_words = stopwords.words('english')
    finder = BigramCollocationFinder.from_words(self.text_array, 2)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.likelihood_ratio, 40)
def ShowCollocations():
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures

    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0, END)
    rawtext = nltk.regexp_tokenize(data, pattern)
    prepcolloc = [word.lower() for word in rawtext
                  if word not in stopwords and word.isalpha()]

    text.delete(1.0, END)
    # The finder keeps bigrams occurring at least 3 times and reports the top 10 by PMI.
    text.insert(END, "Collocations (occurring at least 3 times, top 10 by PMI)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter(3)
    bigrams = bigramfinder.nbest(bigram.pmi, 10)
    for first, second in bigrams:
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")
def get_bigram(self, features_list):
    # The best self.bigram_threshold bigrams are selected by chi-square score.
    score = BigramAssocMeasures.chi_sq
    all_bigrams = BigramCollocationFinder.from_words(features_list)
    best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
    selected_bigrams = [(bigram, True) for bigram in best_bigrams]
    return selected_bigrams
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams.'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
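# stopword_filtered_bigrams above relies on an unshown tokenize() helper;
# assuming it is nltk.word_tokenize, a quick call looks like this:
from nltk import word_tokenize as tokenize
feats = stopword_filtered_bigrams("the movie was not bad at all and the cast was great")
print(len(feats), list(feats)[:5])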
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size=20)
    top_general = bi_finder.nbest(bi.pmi, 30)
    # Keep only bigrams that mention the drug of interest.
    bi_finder.apply_ngram_filter(lambda w1, w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    # All words plus the most informative bigram collocations serve as features.
    return bag_of_words(words + bigrams)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
    bigrams = bigram_finder.nbest(score_fn, n)  # select the top n bigrams by chi-square score
    return bag_of_words(bigrams)
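# Several of these snippets call a bag_of_words helper that is not shown; the
# conventional implementation they rely on is simply:
def bag_of_words(words):
    return dict((word, True) for word in words)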
def get_bigrams(self, words):
    bigram_finder = BigramCollocationFinder.from_words(words)
    self.bigrams = bigram_finder.nbest(self.bigram_score_funcion, self.top_ngram_count)
    return self.bigrams
def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])

def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)  # compute_ngrams assumed defined elsewhere
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(), key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams

print(get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

# Now using gensim
print("Sentence: ", norm_alice[2])
key_words = keywords(norm_alice[2], ratio=1.0, scores=True, lemmatize=True)
print([(item, round(score, 3)) for item, score in key_words][:25])
import nltk
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder

def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(5)  # drop bigrams seen fewer than 5 times
    # Note: nbest() here computes the top-5 PMI bigrams but discards them;
    # the function actually returns the filtered frequency counts.
    finder.nbest(bigram_measures.pmi, 5)
    return finder.ngram_fd.items()
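# Example invocation of bi() above (assumes the nltk 'punkt' data). The input
# must repeat bigrams at least five times to survive the frequency filter.
sample_text = "to be or not to be " * 6
print(list(bi(sample_text)))  # e.g. [(('to', 'be'), 12), (('be', 'or'), 6), ...]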
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
def get_bigrams(tokens, freq_filter=None):
    finder = BigramCollocationFinder.from_words(tokens)
    if freq_filter:
        finder.apply_freq_filter(freq_filter)
    # ngram_fd.items() yields ((w1, w2), freq) pairs; join each word pair.
    return list(' '.join(b[0]) for b in finder.ngram_fd.items())
def bigram(collat_data):
    df_co = pd.DataFrame.to_string(collat_data, columns=['lemmatization']).split(',')
    bcf = BigramCollocationFinder.from_words(df_co)
    top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    return top20
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
modelkmeans = KMeans(init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(modelkmeans.n_clusters):
    print("Cluster {}:".format(i))
    for ind in order_centroids[i, :10]:
        print("{}".format(terms[ind]))

s = all_text_docs[name]
tokens = word_tokenize(s)
text = nltk.Text(tokens)
text.collocations()
text.concordance('social')

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(s))
finder.nbest(bigram_measures.pmi, 10)

######
NUM_TOPICS = 10
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# Build a Latent Dirichlet Allocation model
# (scikit-learn >= 0.21 renamed n_topics to n_components)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online')
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
    complaints = list(csv.reader(file))
# The with-block closes the file automatically; no explicit close() is needed.

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)

from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

# from nltk.collocations import TrigramCollocationFinder
# from nltk.metrics import TrigramAssocMeasures
# tcf = TrigramCollocationFinder.from_words(words)
# tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Finds words that often occur together.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
def analyze_text(text, filename, stopwords, min_length, freq, total_ngrams,
                 min_measure, bigrams_only, trigrams_only):
    print(len(text), filename)
    words = [
        w.lower() for w in text
        if w not in string.punctuation
        if w.lower() not in stopwords and len(w) >= min_length
    ]

    bigrams = None
    b_prefix_keys = None
    trigrams = None
    t_prefix_keys = None

    # What follows could totally be generalized.
    if not trigrams_only:
        # Bigrams
        print("Generating bigrams from", filename)
        b_finder = BigramCollocationFinder.from_words(words)
        b_finder.apply_freq_filter(freq)
        # if stopwords:
        #     b_finder.apply_word_filter(lambda w: w in stopwords)
        bigrams = b_finder.nbest(BigramAssocMeasures.pmi, total_ngrams)
        b_scored = b_finder.score_ngrams(BigramAssocMeasures.pmi)
        b_prefix_keys = collections.defaultdict(list)
        for key, scores in b_scored:
            if scores > min_measure:
                b_prefix_keys[key[0]].append((key[1], scores))

    # Trigrams
    if not bigrams_only:
        print("Generating trigrams from", filename)
        t_finder = TrigramCollocationFinder.from_words(words)
        t_finder.apply_freq_filter(freq)
        # if stopwords:
        #     t_finder.apply_word_filter(lambda w: w in stopwords)
        trigrams = t_finder.nbest(TrigramAssocMeasures.pmi, total_ngrams)
        t_scored = t_finder.score_ngrams(TrigramAssocMeasures.pmi)
        t_prefix_keys = collections.defaultdict(list)
        for key, scores in t_scored:
            if scores > min_measure:
                t_prefix_keys[key[0]].append((key[1], key[2], scores))

    if bigrams_only:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd
        }
    elif trigrams_only:
        ret = {
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    else:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd,
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    return ret
def main():
    # Stopwords to filter out for collocations.
    stopwords_eng = set(stopwords.words("english"))
    stopwords_eng.add(b'et')
    stopwords_eng.add(b'al')

    # Bigram association measures from nltk.
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # tf-idf vectorizer (from scikit-learn, not nltk).
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True,
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 3))

    file = open('CultureRelatedDiaognosticIssues.txt', 'r')
    a = []
    names = []
    for line in file:
        miniList = line.split("|")
        names.append(int(miniList[0].strip()))
        a.append(miniList[1].strip())
    file.close()

    allvocab_stemmed = []
    allvocab_tokenized = []
    for element in a:
        stemmed_result = tokenize_and_stem(element)
        allvocab_stemmed.extend(stemmed_result)
        tokenized_result = tokenize_only(element)
        allvocab_tokenized.extend(tokenized_result)

    # Data frame that contains stems and tokenized words.
    vocab_frame = pd.DataFrame({'words': allvocab_tokenized}, index=allvocab_stemmed)

    # tf-idf matrix for the terms in the corpus.
    tfidf_matrix = tfidf_vectorizer.fit_transform(a)
    terms = tfidf_vectorizer.get_feature_names()

    # Number of clusters.
    num_clusters = 10

    # Fit the k-means algorithm and save it in a .pkl file.
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km, 'cluster.pkl')
    km = joblib.load('cluster.pkl')
    clusters = km.labels_.tolist()

    # Data frame that saves the chapter, the text, and the assigned cluster.
    dsm = {'chapter': names, 'text': a, 'cluster': clusters}
    frame = pd.DataFrame(dsm, index=[clusters], columns=['chapter', 'text', 'cluster'])

    # Group by cluster for aggregation purposes.
    grouped = frame['chapter'].groupby(frame['cluster'])

    # Strip all punctuation for the bigram measures used below.
    puncTokenizer = RegexpTokenizer(r'\w+')

    print("Top terms per cluster:")
    print()
    # Sort cluster centers by proximity to centroid.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')
        for ind in order_centroids[i, :6]:
            # DataFrame.ix is deprecated; modern pandas needs .loc/.iloc here.
            print(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0]
                  .encode('utf-8', 'ignore'), end=',')
        print()
        print("Cluster %d titles:" % i, end='')
        for title in frame.ix[i]['chapter'].values.tolist():
            print(str(title) + " , ", end='')
        print()
        # This loop finds the most common pairs of words in each diagnosis.
        for text in frame.ix[i]['text'].values.tolist():
            data_tokens = puncTokenizer.tokenize(text)
            data_tokens = [x.lower() for x in data_tokens]
            tokens = [w for w in data_tokens if w not in stopwords_eng]
            finder = BigramCollocationFinder.from_words(tokens)
            print('Printing collocations in this chapter:')
            print(finder.nbest(bigram_measures.likelihood_ratio, 5))
            print()
        print()
        print()

    # Distribution of clusters.
    plt.hist(km.labels_, bins=num_clusters)
    plt.show()
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
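# Quick check of word_features above; assumes the snippet's imports were
# `from nltk.metrics import BigramAssocMeasures as BAM` and
# `from itertools import chain`.
print(list(word_features("it was a good good movie it was fun".split(), n=3))[:6])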
def bigram_words(words, score_fn=BigramAssocMeasures.pmi, n=121):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
def create_features(X, user_data=None):
    res = []
    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()
        comment = parse_text(comment)
        comment = nltk.clean_html(comment)  # removed in NLTK 3; use e.g. BeautifulSoup instead
        sents = sent_tokenize(comment)

        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens (list() so we can assign by index below).
        doc = list(filter(repl_filter, doc))

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            doc[i] = ps.stem(doc[i])
            doc[i] = wnl.lemmatize(doc[i])
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)
        bigram = dict([(ngram, True) for ngram in itertools.chain(doc, bigrams)])
        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(text_vocab) != 0 else -1.0
        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)
        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2

        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info['Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info['School'] is not None else 0
            feat['_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info['Movies'] is not None else 0
            feat['_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info['Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info['Hometown'] is not None else 0
            # feat['_user_last'] = user_info['LastWebAccess']

        # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        # feat['_has_hate_word'] = has_hate_word
        # feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        # feat['_has_occult_word'] = has_occult_word
        # feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat.update(read_feat)
        # print(feat)
        res.append(feat)
    return res
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    print(words, "\n")
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
from nltk.corpus import stopwords       # needed for stopwords.words() below
from string import punctuation          # needed for the punctuation list below
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.stem import LancasterStemmer

f = open('data.txt', 'r')
lines = f.readlines()
f.close()

custom_stopwords = set(stopwords.words('english') + list(punctuation))

tokenized_lines = []
for line in lines:
    tokenized_words = [word for word in word_tokenize(line)
                       if word not in custom_stopwords]
    tokenized_lines.append(tokenized_words)

bigram_measures = BigramAssocMeasures()
ngrams = []
for line in tokenized_lines:
    ngrams.append(sorted(BigramCollocationFinder.from_words(line).ngram_fd.items()))

st = LancasterStemmer()
stemmed = []
for line in tokenized_lines:
    stemmed_words = [st.stem(word) for word in line]
    stemmed.append(stemmed_words)

for stemmed_line in stemmed:  # renamed from `st` to avoid shadowing the stemmer
    print(stemmed_line)
def bag_of_bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=100):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_non_stopwords(words + bigrams)
import nltk
nltk.download('punkt')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder

bi_dict = dict()
bg_measures = BigramAssocMeasures()

with open('text/text.txt', 'r') as file:
    text = file.read()

# Strip digits before tokenizing.
table = str.maketrans(dict.fromkeys('0123456789'))
textWithoutNumbers = text.translate(table)
words = nltk.word_tokenize(textWithoutNumbers)

bi_finder = BigramCollocationFinder.from_words(words, window_size=2)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bi_finder.apply_freq_filter(2)

t = bi_finder.ngram_fd.items()
ngram = list(t)
ngram.sort(key=lambda item: item[-1], reverse=False)
for (k, v) in ngram:
    print(k, v)

bi_finder.score_ngrams(bigram_measures.pmi)
bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
print(bi_collocs)

tri_finder = TrigramCollocationFinder.from_words(words)
tri_finder.apply_freq_filter(5)  # the original filtered bi_finder again here, likely a slip
t = tri_finder.ngram_fd.items()
ngram = list(t)
ngram.sort(key=lambda item: item[-1], reverse=False)
for (k, v) in ngram:
    print(k, v)  # the source broke off here; printing mirrors the bigram loop above
def ExtractCollocationFeatures(train_dataset, test_dataset, X_train_filename,
                               X_test_filename, window_size, n_features,
                               balance_dataset=False, remove_center_interval=None):
    # This method extracts collocations of two words within the given window
    # as features from the given train and test datasets. It returns the X, Y
    # matrices and a list with the feature names, and stores the X matrices in
    # txt files named X_train_filename / X_test_filename under /feature_matrices.
    # Tuneable parameters:
    # - window_size: size of the collocation window.
    # - n_features: number of features considered.
    # - balance_dataset: set to True to balance the training dataset.
    # - remove_center_interval: e.g. [-0.2, 0.2]; removes samples whose
    #   DW-Nominate score falls inside the interval.
    print("Reading datasets...")
    path_train = "../datasets/train/"
    train_dataset_df = pd.read_csv(path_train + train_dataset, sep="|",
                                   encoding="latin_1", header=None)
    train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    # Remove rows with DW-Nominate scores close to 0.
    if remove_center_interval is not None:
        train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0]
                            and float(x) < remove_center_interval[1]) else x)
        train_dataset_df = train_dataset_df[train_dataset_df['ideology'] != 0]

    # Binary ideology label from the first DW-Nominate dimension (must run
    # unconditionally: it is used below even when no interval is removed).
    train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    path_test = "../datasets/test/"
    test_dataset_df = pd.read_csv(path_test + test_dataset, sep="|",
                                  encoding="latin_1", header=None)
    test_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    if balance_dataset:
        positive_rows = len(train_dataset_df[train_dataset_df['ideology'] == 1.0])
        negative_rows = len(train_dataset_df[train_dataset_df['ideology'] == -1.0])
        if positive_rows > negative_rows:
            n = positive_rows - negative_rows
            indices = train_dataset_df[train_dataset_df['ideology'] == 1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)
        else:
            n = negative_rows - positive_rows
            indices = train_dataset_df[train_dataset_df['ideology'] == -1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)

    train_speeches = train_dataset_df['speech'].values.tolist()
    Y_train = train_dataset_df['ideology'].values.tolist()

    if remove_center_interval is not None:
        test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0]
                            and float(x) < remove_center_interval[1]) else x)
        test_dataset_df = test_dataset_df[test_dataset_df['ideology'] != 0]

    test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    test_speeches = test_dataset_df['speech'].values.tolist()
    Y_test = test_dataset_df['ideology'].values.tolist()

    print("Extracting features from train dataset...")
    t_start = time.time()
    stop_words = set(stopwords.words('english'))
    total_bigrams = {}
    bigrams_per_speech_train = []
    t0 = time.time()
    print(len(train_speeches))
    for i in range(0, len(train_speeches)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        speech = train_speeches[i].lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        filtered_words = [w for w in words if w not in stop_words]
        bcf = BigramCollocationFinder.from_words(filtered_words, window_size=window_size)
        for item in bcf.ngram_fd.items():
            if item[0] not in total_bigrams:
                total_bigrams.update({item[0]: item[1]})
            else:
                total_bigrams[item[0]] += item[1]
        bigrams_per_speech_train.append(bcf.ngram_fd.items())

    print("Total bigrams found: ", len(total_bigrams))

    feature_names = []
    most_frequent_bigrams_sorted = sorted(total_bigrams.items(),
                                          key=lambda x: x[1], reverse=True)[:n_features]
    print("Number of features: ", len(most_frequent_bigrams_sorted))
    most_frequent_bigrams = dict(most_frequent_bigrams_sorted)
    for i in range(0, len(most_frequent_bigrams_sorted)):
        feature_names.append(most_frequent_bigrams_sorted[i][0])
    print(len(feature_names))

    order = list(range(0, len(feature_names)))
    collocation_order = dict(zip(feature_names, order))

    print("Computing X_train...")
    X_train_matrix = np.zeros((len(bigrams_per_speech_train), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_train)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_train[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_train_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_train_df = pd.DataFrame(X_train_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    pathX = "../feature_matrices/"
    print("Saving X_train into a txt file...")
    X_train_df.to_csv(pathX + X_train_filename, header=feature_names, index=None, sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_train = csr_matrix(X_train_df)

    print("Extracting bigrams from test dataset...")
    bigrams_per_speech_test = []
    t0 = time.time()
    print(len(test_speeches))
    for i in range(0, len(test_speeches)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        speech = test_speeches[i].lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        filtered_words = [w for w in words if w not in stop_words]
        bcf = BigramCollocationFinder.from_words(filtered_words, window_size=window_size)
        bigrams_per_speech_test.append(bcf.ngram_fd.items())

    print("Computing X_test...")
    X_test_matrix = np.zeros((len(bigrams_per_speech_test), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_test)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_test[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_test_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_test_df = pd.DataFrame(X_test_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()
    print("Saving X_test into a txt file...")
    X_test_df.to_csv(pathX + X_test_filename, header=feature_names, index=None, sep=',')
    print("Transforming X_test into a csr_matrix...")
    X_test = csr_matrix(X_test_df)

    t_end = time.time()
    total_time = t_end - t_start
    print("Total time: ")
    print(str(total_time) + " seconds")

    return X_train, Y_train, X_test, Y_test, feature_names
    # Tail of a get_top_ngrams() helper: rank the n-gram frequency distribution.
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(), key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams

corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents([item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents([item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print(get_top_ngrams(corpus, ngram_val=2, limit=10))
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Renamed from `set` to avoid shadowing the built-in set().
stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset

tokens = [t.lower() for t in webtext.words('grail.txt')]
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(stops_filter)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

# bigram_collocation / trigram_collocation are assumed to be aliases such as:
# from nltk.collocations import BigramCollocationFinder as bigram_collocation
# from nltk.collocations import TrigramCollocationFinder as trigram_collocation

df = pd.read_csv('../preprocessed_dataset.csv')
df.head()

# Count repeated bigrams per song; only bigrams repeated at least 3 times are kept.
bigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_bigram = []
    text = df["Lyrics"][i].split()
    coll_bia = bigram_collocation.from_words(text)
    coll_bia.apply_freq_filter(3)
    bigram_freq = coll_bia.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
    bigram_score.append(len(bigramFreqTable.index.values))

# Count repeated trigrams per song; only trigrams repeated at least 3 times are kept.
trigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_trigram = []
    text = df["Lyrics"][i].split()
    coll_tri = trigram_collocation.from_words(text)
    # (The source breaks off here; the loop presumably mirrors the bigram one:)
    # coll_tri.apply_freq_filter(3)
    # trigram_score.append(len(coll_tri.ngram_fd))
plt.show()

fd = fdist_no_punc_no_stopwords

# The most common words
fd.most_common(50)

# Dispersion plots
text.dispersion_plot(["God", "mind", "knowledge"])
text.dispersion_plot(["power", "reason", "nature"])
# text.concordance("god")

# Bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)  # same ranking, now with the frequency filter applied

# Word cloud for the most frequent bigrams
stopWords = stopwords
text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
                for word in text]
text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
# hpmor is assumed to be an open file handle, e.g. hpmor = open('hpmor.txt', encoding='utf-8')
corpus = []
while True:
    l = hpmor.readline()
    if l == '':
        break
    l = re.sub(r"[^а-яё \t-]", "", l.lower()).strip().split()
    if l:
        corpus.extend(l)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

stop = set(stopwords.words('russian'))
stop.update(['гарри', 'поттер', 'профессор'])  # add the most frequent words of the text to the stop list

corpus_ = list(filter(lambda x: x not in stop, corpus))
finder = BigramCollocationFinder.from_words(corpus_)
finder3 = TrigramCollocationFinder.from_words(corpus_)

# Frequency filters and stopword filters
finder.apply_freq_filter(5)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)
finder3.apply_freq_filter(5)
finder3.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)

# Bigrams and trigrams
raw_bigrams = finder.nbest(bigram_measures.raw_freq, 100)
pmi_bigrams = finder.nbest(bigram_measures.pmi, 100)
raw_trigrams = finder3.nbest(trigram_measures.raw_freq, 100)
pmi_trigrams = finder3.nbest(trigram_measures.pmi, 100)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # select the top n bigrams by chi-square score
    newBigrams = [u + v for (u, v) in bigrams]
    return newBigrams  # the original built newBigrams but never returned it
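# Toy call for bigram() above; with the return fix it yields the selected word
# pairs concatenated into single tokens (a common convention for Chinese text,
# where tokens join without spaces).
print(bigram("this is a test this is only a test".split(), n=5))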