Code Example #1
File: Extractor.py Project: Palazor/sentiment
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
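The word_scores dictionary returned above is normally reduced to a fixed-size feature set before training a classifier. A minimal sketch of that selection step, using a hypothetical helper name (find_best_words) that is not part of the original project:

def find_best_words(word_scores, number):
    # Sort words and bigrams by their chi-square score and keep the top `number`.
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, s in best_vals)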
Code Example #2
    def get_unibigram_features(all_words, uni_feanum, bi_feanum):
        word_fd = nltk.FreqDist(all_words)
        bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

        if uni_feanum == 'max':
            uni_feanum = len(list(word_fd.keys()))
        elif uni_feanum > len(list(word_fd.keys())):
            uni_feanum = len(list(word_fd.keys()))

        if bi_feanum == 'max':
            bi_feanum = len(list(bigram_fd.keys()))
        elif bi_feanum > len(list(bigram_fd.keys())):
            bi_feanum = len(list(bigram_fd.keys()))

        finder = BigramCollocationFinder(word_fd, bigram_fd)
        bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

        print "the number of unigram features is", uni_feanum
        print "the number of bigram features is", bi_feanum

        featuples = word_fd.most_common(uni_feanum)

        selected_words = []

        for i in range(uni_feanum):
            selected_words.append(featuples[i][0])

        features = []
        for ngram in itertools.chain(selected_words, bigrams):
            features.append(ngram)

        return features
Code Example #3
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    Use bigram collocations to measure the informativeness of words
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #5
def create_word_bigram_scores(posWords, negWords, n = 5000):
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word]+=1
        conditionwordfd['pos'][word]+=1
        
    for word in negWords:
        wordfd[word]+=1
        conditionwordfd['neg'][word]+=1
    
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
Code Example #6
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','rb'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #7
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #8
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract words from positive and negative sentences.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only the bigrams in which both words are longer than 3 characters
    # and neither word is in the exclusion list `ex`.
    bigrams_best = [bigram for bigram in bigrams_all
            if len(bigram[0]) > 3 and len(bigram[1]) > 3
            and bigram[0] not in ex and bigram[1] not in ex ]


    return bigrams_best
Code Example #9
File: score.py Project: TianyiM/Final-Project
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #10
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finderr = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finderr.nbest(BigramAssocMeasures.chi_sq,350000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq,350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #11
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #12
File: feature_extrac.py Project: yyr93520/NLPproject
def create_word_bigram_scores():
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    
    objWords = list(itertools.chain(*objdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)

    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)


    pos = posWords + posBigrams
    neg = negWords + negBigrams
    
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
       
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores
Code Example #14
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
        bigramslist = []
        bigram_finder = BigramCollocationFinder.from_words(tweet)
        bigrams = bigram_finder.nbest(score_fn, n)
        for bigram in bigrams:
            bigramslist.append(' '.join(str(i) for i in bigram))
        print(bigramslist)
Code Example #15
File: text.py Project: prz3m/kind2anki
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Code Example #16
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))

    return d
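best_word_features is not shown in this snippet. A plausible minimal sketch, assuming a module-level best_words set built from a word-scores dictionary like the ones above (the names and details are assumptions, not the original project's code):

best_words = set()  # assumed to hold the top-scored words, e.g. selected from create_word_bigram_scores()

def best_word_features(words):
    # Mark only the informative words as features.
    return dict([(word, True) for word in words if word in best_words])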
Code Example #17
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder, \
                                          score_fn=BigramAssocMeasures.chi_sq):
  if stop:
    words = [w for w in words if w not in stopset]
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigrams = bigram_finder.nbest(score_fn, limit)
  return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #18
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn=BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
Code Example #19
File: text_utils.py Project: fruser/review-analyzer
def get_bag_of_bigrams_words(
        word_list,
        score_fn=BigramAssocMeasures.chi_sq,
        n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
Code Example #20
File: NLTK_tools.py Project: dreampocketit/bocard
def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size) 
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import f_measure, BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
Code Example #21
File: collocations.py Project: rjoganah/Dynamic_IR
 def collaction_discovery(self):
     self.corpus = nltk.word_tokenize(self.corpus.lower())
     bigramm_finder = BigramCollocationFinder.from_words(self.corpus)
     filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
     bigramm_finder.apply_word_filter(filter_bigram)
     top_10_bigrams = bigramm_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
     return top_10_bigrams
Code Example #22
File: classifier.py Project: benneic/sentimento
 def converter(tokens):
     bigram_finder = BigramCollocationFinder.from_words(tokens)
     bigrams = bigram_finder.nbest(score_fn, n)
     return (
         {ngram: True for ngram in itertools.chain(tokens, bigrams)},
         label
     )
Code Example #23
File: grapher.py Project: amac441/Metten
    def get_frequencies(self, desc):

        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        words = word_tokenize(desc)

        print('------gram--------')
        words_to_count = [word for word in words if word not in stopset]
        words_to_count = [word for word in words_to_count if not len(word) < 3]
        c = Counter(words_to_count)
        single = c.most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # only keep those that appear more than 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        matches = [single,bigrm,trigrm]
        return matches
Code Example #24
 def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
     # TODO configuration value
     clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
     bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
     bigram_measures = BigramAssocMeasures()
     bigrams = bigram_finder.nbest(bigram_measures.pmi, n)
     return bigrams
Code Example #25
 def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
         bigramslist = []
         bigram_finder = BigramCollocationFinder.from_words(tweet)
         bigrams = bigram_finder.nbest(score_fn, n)
         for bigram in bigrams:
             bigramslist.append(' '.join(str(i) for i in bigram))
         return bigramslist #This is list e.g. ['you dude', 'Hi How', 'How are', 'are you']
Code Example #26
 def get_collocations(self):
     ignored_words = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array,2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
     bigram_measures = BigramAssocMeasures()
     return finder.nbest(bigram_measures.likelihood_ratio,40)
Code Example #27
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Code Example #28
 def get_bigram(self, features_list):
     # The top bigrams (up to self.bigram_threshold) are selected
     score = BigramAssocMeasures.chi_sq
     all_bigrams = BigramCollocationFinder.from_words(features_list)
     best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
     selected_bigrams = [(bigram, True) for bigram in best_bigrams]
     return selected_bigrams
Code Example #29
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #30
File: ytpy.py Project: juliasun/Youtube-Tox
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size = 20)
    top_general = bi_finder.nbest(bi.pmi,30)
    bi_finder.apply_ngram_filter(lambda w1,w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
Code Example #31
File: test_nltk.py Project: ChyengJason/Wandoujia
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)  # all words plus the (most informative) bigrams together as features
Code Example #32
File: test_nltk.py Project: ChyengJason/Wandoujia
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-square measure to pick the top 1000 bigrams
    return bag_of_words(bigrams)
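bag_of_words is not defined in this snippet; in the usual NLTK feature-extraction pattern it simply maps every token (word or bigram tuple) to True. A minimal sketch under that assumption:

def bag_of_words(words):
    # Represent each word or bigram tuple as a boolean feature.
    return dict([(word, True) for word in words])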
Code Example #33
 def get_bigrams(self, words):
     bigram_finder = BigramCollocationFinder.from_words(words)
     self.biagrams = bigram_finder.nbest(self.bigram_score_funcion,
                                         self.top_ngram_count)
     return self.biagrams
Code Example #34

def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])


def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams


print(get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

# Now using gensim
print("Sentence: ", norm_alice[2])
key_words = keywords(norm_alice[2], ratio=1.0, scores=True, lemmatize=True)
print([(item, round(score, 3)) for item, score in key_words][:25])
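compute_ngrams is not defined in this snippet. A minimal sketch of a compatible helper (an assumption, not the original code) that builds n-gram tuples by zipping shifted copies of the token list:

def compute_ngrams(sequence, n):
    # Zip the token list against its shifted copies to produce n-gram tuples.
    return list(zip(*(sequence[index:] for index in range(n))))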
Code Example #35
def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder=BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(5)
    finder.nbest(bigram_measures.pmi, 5) 
    return finder.ngram_fd.items()
Code Example #36
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
Code Example #37
def get_bigrams(tokens, freq_filter=None):
    finder = BigramCollocationFinder.from_words(tokens)
    if freq_filter:
        finder.apply_freq_filter(freq_filter)
    return list(' '.join(b[0]) for b in finder.ngram_fd.items())
Code Example #38
def bigram(collat_data):
    df_co = pd.DataFrame.to_string(collat_data,
                                   columns=['lemmatization']).split(',')
    bcf = BigramCollocationFinder.from_words(df_co)
    top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    return top20
Code Example #39
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
Code Example #40
modelkmeans = KMeans(init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(modelkmeans.n_clusters):
    print("Cluster {}:".format(i)),
    for ind in order_centroids[i, :10]:
        print("{}".format(terms[ind]))

s = all_text_docs[name]
tokens = word_tokenize(s)
text = nltk.Text(tokens)
text.collocations()
text.concordance('social')
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(s))
finder.nbest(bigram_measures.pmi, 10)

######
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True)
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
Code Example #41
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
Code Example #42
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
  complaints = list(csv.reader(file))
  file.close()

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)


from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

#from nltk.collocations import TrigramCollocationFinder
#from nltk.metrics import TrigramAssocMeasures
#tcf = TrigramCollocationFinder.from_words(words)
#tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
Code Example #43
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # finds words that often occur together
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Code Example #44
def analyze_text(text, filename, stopwords, min_length, freq, total_ngrams,
                 min_measure, bigrams_only, trigrams_only):
    print(len(text), filename)
    words = [
        w.lower() for w in text if w not in string.punctuation
        if w.lower() not in stopwords and len(w) >= min_length
    ]

    bigrams = None
    b_prefix_keys = None
    trigrams = None
    t_prefix_keys = None

    # what follows could totally be generalized
    if not trigrams_only:
        # Bigrams
        print("Generating bigrams from", filename)
        b_finder = BigramCollocationFinder.from_words(words)
        b_finder.apply_freq_filter(freq)
        # if stopwords:
        #   b_finder.apply_word_filter(lambda w: w in stopwords)
        bigrams = b_finder.nbest(BigramAssocMeasures.pmi, total_ngrams)
        b_scored = b_finder.score_ngrams(BigramAssocMeasures.pmi)
        b_prefix_keys = collections.defaultdict(list)
        for key, scores in b_scored:
            if scores > min_measure:
                b_prefix_keys[key[0]].append((key[1], scores))

    # Trigrams
    if not bigrams_only:
        print("Generating trigrams from", filename)
        t_finder = TrigramCollocationFinder.from_words(words)
        t_finder.apply_freq_filter(freq)
        # if stopwords:
        #   t_finder.apply_word_filter(lambda w: w in stopwords)
        trigrams = t_finder.nbest(TrigramAssocMeasures.pmi, total_ngrams)
        t_scored = t_finder.score_ngrams(TrigramAssocMeasures.pmi)
        t_prefix_keys = collections.defaultdict(list)
        for key, scores in t_scored:
            if scores > min_measure:
                t_prefix_keys[key[0]].append((key[1], key[2], scores))

    if bigrams_only:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd
        }
    elif trigrams_only:
        ret = {
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    else:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd,
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    return ret
Code Example #45
def main():
    # stopwords to filter out for collocations
    stopwords_eng = set(stopwords.words("english"))
    stopwords_eng.add('et')
    stopwords_eng.add('al')


    # bigram identifier from nltk
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # tf-idf vectorizer from nltk
    tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

    file = open('CultureRelatedDiaognosticIssues.txt','r')
    a = []
    names = []
    for line in file:
        miniList = line.split("|")
        names.append(int(miniList[0].strip()))
        a.append(miniList[1].strip())
    file.close()

    allvocab_stemmed = []
    allvocab_tokenized = []

    for element in a:
        stemmed_result = tokenize_and_stem(element)
        allvocab_stemmed.extend(stemmed_result)

        tokenized_result = tokenize_only(element)
        allvocab_tokenized.extend(tokenized_result)

    # data frame that contains stems and tokenized words
    vocab_frame = pd.DataFrame({'words': allvocab_tokenized},
    index = allvocab_stemmed)

    # tf-idf matrix for the terms in the corpus
    tfidf_matrix = tfidf_vectorizer.fit_transform(a)
    terms = tfidf_vectorizer.get_feature_names()

    # number of clusters
    num_clusters = 10

    # fitting the k-means algorithm and saving it in a .pkl file
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km,  'cluster.pkl')
    km = joblib.load('cluster.pkl')
    clusters = km.labels_.tolist()

    # data frame that saves the chapter, the text, and the assigned cluster
    dsm = {'chapter': names, 'text': a, 'cluster': clusters}
    frame = pd.DataFrame(dsm, index = [clusters], columns = ['chapter', 'text', 'cluster'])

    #groupby cluster for aggregation purposes
    grouped = frame['chapter'].groupby(frame['cluster'])

    # getting rid of all punctuation for bigram measures - will use this later
    puncTokenizer = RegexpTokenizer(r'\w+')

    print("Top terms per cluster:")
    print()

    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]:
            print(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print()

        print("Cluster %d titles:" % i, end='')
        for title in frame.loc[i]['chapter'].values.tolist():
            print(str(title) + " , ", end='')
        print()

        # this for-loop finds the most common pairs of words in each diagnosis
        for text in frame.loc[i]['text'].values.tolist():
            data_tokens = puncTokenizer.tokenize(text)
            data_tokens = [x.lower() for x in data_tokens]

            tokens = [w for w in data_tokens if w not in stopwords_eng]

            finder = BigramCollocationFinder.from_words(tokens)
            print('Printing collocations in this chapter:')
            print(finder.nbest(bigram_measures.likelihood_ratio, 5))
            print()
    print()
    print()

    # distribution of clusters
    plt.hist(km.labels_, bins=num_clusters)
    plt.show()
Code Example #46
File: general.py Project: azizmb/TWSS
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
Code Example #47
def bigram_words(words, score_fn=BigramAssocMeasures.pmi, n=121):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Code Example #48
def create_features(X, user_data=None):
    res = []

    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()

        comment = parse_text(comment)

        comment = nltk.clean_html(comment)

        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'

            doc[i] = ps.stem(doc[i])

            doc[i] = wnl.lemmatize(doc[i])

            if doc[i] in bad_words:
                doc[i] = '_badword_'

            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)

        bigram = dict([(ngram, True)
                       for ngram in itertools.chain(doc, bigrams)])

        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)

        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
    #        feat['_user_last'] = user_info['LastWebAccess']

    # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        #        feat['_has_hate_word'] = has_hate_word
        #        feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        #        feat['_has_occult_word'] = has_occult_word
        #        feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)

        #        print feat
        res.append(feat)
    return res
Code Example #49
File: util.py Project: rohankshir/football
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    print(words, "\n")
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Code Example #50
File: testing.py Project: riamf/jupyternotes
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.stem import LancasterStemmer

f = open('data.txt', 'r')
lines = f.readlines()
f.close()

custom_stopwords = set(stopwords.words('english') + list(punctuation))

tokenized_lines = []
for line in lines:
    tokenized_words = [
        word for word in word_tokenize(line) if word not in custom_stopwords
    ]
    tokenized_lines.append(tokenized_words)

bigram_measures = BigramAssocMeasures()
ngrams = []
for line in tokenized_lines:
    ngrams.append(
        sorted(BigramCollocationFinder.from_words(line).ngram_fd.items()))

st = LancasterStemmer()
stemmed = []
for line in tokenized_lines:
    stemmed_words = [st.stem(word) for word in line]
    stemmed.append(stemmed_words)

for st in stemmed:
    print(st)
Code Example #51
def bag_of_bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=100):
    bigram_finder= BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_non_stopwords(words+bigrams)
Code Example #52
import nltk
nltk.download('punkt')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder

bi_dict = dict()
bg_measures = BigramAssocMeasures()
with open('text/text.txt', 'r') as file:
    text = file.read()
    table = str.maketrans(dict.fromkeys('0123456789'))
    textWithoutNumbers = text.translate(table)

    words = nltk.word_tokenize(textWithoutNumbers)

    bi_finder = BigramCollocationFinder.from_words(words, window_size=2)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bi_finder.apply_freq_filter(2)
    t = bi_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
        print(k, v)
    bi_finder.score_ngrams(bigram_measures.pmi)
    bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
    print(bi_collocs)

    tri_finder = TrigramCollocationFinder.from_words(words)
    tri_finder.apply_freq_filter(5)
    t = tri_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
Code Example #53
def ExtractCollocationFeatures(train_dataset,
                               test_dataset,
                               X_train_filename,
                               X_test_filename,
                               window_size,
                               n_features,
                               balance_dataset=False,
                               remove_center_interval=None):

    # This method extracts collocations of two words within the given
    # window of words as features from the given train and test datasets.
    # It returns the X and Y matrices and a list with the feature names.
    # It also stores those X matrices in txt files with names X_train_filename and
    # X_test_filename under the /feature_matrices folder.
    # There are four tuneable parameters:
    # - window_size: size of the window.
    # - n_features: number of features considered.
    # - balance_dataset: set to True to balance the training dataset.
    # - remove_center_interval: format: [-0.2, 0.2]. Removes samples with DW-Nominate inside
    #   the interval.

    print("Reading datasets...")
    path_train = "../datasets/train/"
    train_dataset_df = pd.read_csv(path_train + train_dataset,
                                   sep="|",
                                   encoding="latin_1",
                                   header=None)
    train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    #Remove rows with DW-nominates close to 0
    if type(remove_center_interval) != type(None):
        train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        train_dataset_df = train_dataset_df[train_dataset_df['ideology'] != 0]

    train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    path_test = "../datasets/test/"
    test_dataset_df = pd.read_csv(path_test + test_dataset,
                                  sep="|",
                                  encoding="latin_1",
                                  header=None)
    test_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    if balance_dataset == True:
        positive_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == 1.0])
        negative_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == -1.0])
        if positive_rows > negative_rows:
            n = positive_rows - negative_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)
        else:
            n = negative_rows - positive_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       -1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)

    train_speeches = train_dataset_df['speech'].values.tolist()
    Y_train = train_dataset_df['ideology'].values.tolist()

    if type(remove_center_interval) != type(None):
        test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        test_dataset_df = test_dataset_df[test_dataset_df['ideology'] != 0]

    test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    test_speeches = test_dataset_df['speech'].values.tolist()
    Y_test = test_dataset_df['ideology'].values.tolist()

    print("Extracting features from train dataset...")
    t_start = time.time()

    stop_words = stopwords.words('english')

    total_bigrams = {}
    bigrams_per_speech_train = []
    t0 = time.time()
    print(len(train_speeches))
    for i in range(0, len(train_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = train_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)

        for item in bcf.ngram_fd.items():
            if item[0] not in total_bigrams:
                total_bigrams.update({item[0]: item[1]})
            else:
                total_bigrams[item[0]] += item[1]

        bigrams_per_speech_train.append(bcf.ngram_fd.items())

    print("Total bigrams finded: ", len(total_bigrams))

    feature_names = []
    most_frequent_bigrams_sorted = sorted(total_bigrams.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:n_features]
    print("Number of features: ", len(most_frequent_bigrams_sorted))
    most_frequent_bigrams = dict(most_frequent_bigrams_sorted)

    for i in range(0, len(most_frequent_bigrams_sorted)):
        feature_names.append(most_frequent_bigrams_sorted[i][0])
    print(len(feature_names))

    order = list(range(0, len(feature_names)))
    collocation_order = dict(zip(feature_names, order))

    print("Computing X_train...")

    X_train_matrix = np.zeros(
        (len(bigrams_per_speech_train), len(feature_names)))

    for i in range(0, len(bigrams_per_speech_train)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_train[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_train_matrix[i][column] = bigrams_per_speech_i[bigram]
    print("Creating dataframe...")
    X_train_df = pd.DataFrame(X_train_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    pathX = "../feature_matrices/"
    print("Saving X_train into a txt file...")
    X_train_df.to_csv(pathX + X_train_filename,
                      header=feature_names,
                      index=None,
                      sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_train = csr_matrix(X_train_df)

    print("Extracting bigrams from test dataset...")

    bigrams_per_speech_test = []
    t0 = time.time()
    print(len(test_speeches))
    for i in range(0, len(test_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = test_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)
        bigrams_per_speech_test.append(bcf.ngram_fd.items())

    print("Computing X_test...")

    X_test_matrix = np.zeros(
        (len(bigrams_per_speech_test), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_test)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_test[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_test_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_test_df = pd.DataFrame(X_test_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    print("Saving X_test into a txt file...")
    X_test_df.to_csv(pathX + X_test_filename,
                     header=feature_names,
                     index=None,
                     sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_test = csr_matrix(X_test_df)

    t_end = time.time()
    total_time = t_end - t_start
    print("Total time: ")
    print(str(total_time) + " seconds")

    return X_train, Y_train, X_test, Y_test, feature_names
Code Example #54
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]

    return sorted_ngrams


corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
Code Example #55
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])
Code Example #56
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
Code Example #57
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler

import seaborn as sns

df = pd.read_csv('../preprocessed_dataset.csv')
df.head()

# Calculating number of repeated bigrams per song. Only considered bigrams of which repetition frequency is greater than 3
bigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_bigram = []
    text = df["Lyrics"][i].split()
    coll_bia = bigram_collocation.from_words(text)
    coll_bia.apply_freq_filter(3)
    bigram_freq = coll_bia.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq'
                                            ]).sort_values(by='freq',
                                                           ascending=False)
    bigram_score.append(len(bigramFreqTable.index.values))

# Calculating number of repeated trigrams per song. Only considered trigrams of which repetition frequency is greater than 3
trigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_trigram = []
    text = df["Lyrics"][i].split()
    coll_tri = trigram_collocation.from_words(text)
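bigram_collocation and trigram_collocation are not defined in this snippet; they appear to be aliases for the NLTK finder classes. A guess at the missing imports (an assumption, not the original code):

# Assumed aliases for the finder classes used above:
from nltk.collocations import BigramCollocationFinder as bigram_collocation
from nltk.collocations import TrigramCollocationFinder as trigram_collocation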
Code Example #58
plt.show()

fd = fdist_no_punc_no_stopwords

# the most common ones
fd.most_common(50)

# dispersion plots
text.dispersion_plot(["God", "mind", "knowledge"])
text.dispersion_plot(["power", "reason", "nature"])
# text.concordance("god")

# bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# what changes here is the filter

# word cloud (WC) for the most frequent bigrams
stopWords = stopwords
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]

text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
Code Example #59
File: collocations.py Project: alturutin/NLP_ru
corpus = []
while True:
    l = hpmor.readline()
    if l == '': break
    l = re.sub(r"[^а-яё \t-]", "", l.lower()).strip().split()
    if l: corpus.extend(l)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

stop = set(stopwords.words('russian'))
stop.update(['гарри', 'поттер', 'профессор'
             ])  # add the most frequent words from the text to the stop list
corpus_ = list(filter(lambda x: x not in stop, corpus))

finder = BigramCollocationFinder.from_words(corpus_)
finder3 = TrigramCollocationFinder.from_words(corpus_)

# frequency filters and stopwords
finder.apply_freq_filter(5)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)
finder3.apply_freq_filter(5)
finder3.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)

# bigrams and trigrams
raw_bigrams = finder.nbest(bigram_measures.raw_freq, 100)
pmi_bigrams = finder.nbest(bigram_measures.pmi, 100)
raw_trigrams = finder3.nbest(trigram_measures.raw_freq, 100)
pmi_trigrams = finder3.nbest(trigram_measures.pmi, 100)

Code Example #60
File: judge.py Project: HZ-njupt/Weibo
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-square measure to select the top n bigrams
    newBigrams = [u + v for (u, v) in bigrams]