Example #1
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
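A minimal usage sketch for the score dictionary returned above (find_best_words and the cutoff are assumed names, not part of the original code): sort the scores and keep the top N words/bigrams as the feature vocabulary.

def find_best_words(word_scores, number):
    # Assumed helper: keep the `number` highest-scoring words/bigrams as features.
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best_vals)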
    def get_unibigram_features(all_words, uni_feanum, bi_feanum):
        word_fd = nltk.FreqDist(all_words)
        bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

        if uni_feanum == 'max':
            uni_feanum = len(list(word_fd.keys()))
        elif uni_feanum > len(list(word_fd.keys())):
            uni_feanum = len(list(word_fd.keys()))

        if bi_feanum == 'max':
            bi_feanum = len(list(bigram_fd.keys()))
        elif bi_feanum > len(list(bigram_fd.keys())):
            bi_feanum = len(list(bigram_fd.keys()))

        finder = BigramCollocationFinder(word_fd, bigram_fd)
        bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

        print "the number of unigram features is", uni_feanum
        print "the number of bigram features is", bi_feanum

        featuples = word_fd.most_common(uni_feanum)

        selected_words = []

        for i in range(uni_feanum):
            selected_words.append(featuples[i][0])

        features = []
        for ngram in itertools.chain(selected_words, bigrams):
            features.append(ngram)

        return features
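As a hedged usage sketch (document_features and doc_words are hypothetical names, not from the original), the returned feature list can be turned into a binary feature dict for a single document, assuming nltk is imported as in the snippet above:

def document_features(doc_words, features):
    # Assumed helper: binary indicators over the selected unigram and bigram features.
    doc_unigrams = set(doc_words)
    doc_bigrams = set(nltk.bigrams(doc_words))
    return {str(feat): (feat in doc_unigrams or feat in doc_bigrams) for feat in features}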
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    Score word informativeness using both single words and bigram collocations.
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords, n = 5000):
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word]+=1
        conditionwordfd['pos'][word]+=1
        
    for word in negWords:
        wordfd[word]+=1
        conditionwordfd['neg'][word]+=1
    
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','rb'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract words from positive and negative sentences.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only bigrams where both words are longer than 3 characters
    # and neither word is in the exclusion list `ex` (assumed to be defined at module level).
    bigrams_best = [bigram for bigram in bigrams_all
            if len(bigram[0]) > 3 and len(bigram[1]) > 3
            and bigram[0] not in ex and bigram[1] not in ex]


    return bigrams_best
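A possible follow-up step (sent_features is a hypothetical helper, assuming nltk and word_tokenize are imported as above): mark which of the selected bigrams occur in a tokenized sentence.

def sent_features(sent, bigrams_best):
    # Assumed helper: binary bigram-presence features for one sentence.
    tokens = [w.lower() for w in word_tokenize(sent)]
    sent_bigrams = set(nltk.bigrams(tokens))
    return {bigram: (bigram in sent_bigrams) for bigram in bigrams_best}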
Example #9
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #12
def create_word_bigram_scores():
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
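The get_scores helper is not included in this snippet; a plausible sketch (an assumption, reconstructed from the chi-squared scoring loop used by the neighbouring examples):

def get_scores(pos, neg):
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores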
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    
    objWords = list(itertools.chain(*objdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)

    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)


    pos = posWords + posBigrams
    neg = negWords + negBigrams
    
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:  # iterate over words + bigrams, matching the pos/neg loops above
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
       
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
        bigramslist = []
        bigram_finder = BigramCollocationFinder.from_words(tweet)
        bigrams = bigram_finder.nbest(score_fn, n)
        for bigram in bigrams:
            bigramslist.append(' '.join(str(i) for i in bigram))
        print(bigramslist)
Example #15
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))

    return d
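best_word_features is not shown in this listing; in scripts like these it usually keeps only words found in a precomputed best-words set (a sketch under that assumption; best_words is a hypothetical module-level set built by a word-scoring function such as the ones above):

def best_word_features(words):
    # Assumed helper: keep only words that are in the precomputed best_words set.
    return dict((word, True) for word in words if word in best_words)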
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder, \
                                          score_fn=BigramAssocMeasures.chi_sq):
  if stop:
    words = [w for w in words if w not in stopset]
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigrams = bigram_finder.nbest(score_fn, limit)
  return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn=BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
Example #19
def get_bag_of_bigrams_words(
        word_list,
        score_fn=BigramAssocMeasures.chi_sq,
        n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
Example #20
def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size) 
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import f_measure, BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
Example #21
 def collocation_discovery(self):
     self.corpus = nltk.word_tokenize(self.corpus.lower())
     bigram_finder = BigramCollocationFinder.from_words(self.corpus)
     filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
     bigram_finder.apply_word_filter(filter_bigram)
     top_10_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
     return top_10_bigrams
Example #22
 def converter(tokens):
     bigram_finder = BigramCollocationFinder.from_words(tokens)
     bigrams = bigram_finder.nbest(score_fn, n)
     return (
         {ngram: True for ngram in itertools.chain(tokens, bigrams)},
         label
     )
Example #23
    def get_frequencies(self, desc):

        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        words = word_tokenize(desc)

        print('------unigram--------')
        words_to_count = [word for word in words if word not in stopset]
        words_to_count = [word for word in words_to_count if not len(word) < 3]
        c = Counter(words_to_count)
        single = c.most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        matches = [single,bigrm,trigrm]
        return matches
 def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
     # TODO configuration value
     clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
     bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
     bigram_measures = BigramAssocMeasures()
     bigrams = bigram_finder.nbest(bigram_measures.pmi, n)
     return bigrams
 def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
         bigramslist = []
         bigram_finder = BigramCollocationFinder.from_words(tweet)
         bigrams = bigram_finder.nbest(score_fn, n)
         for bigram in bigrams:
             bigramslist.append(' '.join(str(i) for i in bigram))
         return bigramslist #This is list e.g. ['you dude', 'Hi How', 'How are', 'are you']
 def get_collocations(self):
     ignored_words = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array,2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
     bigram_measures = BigramAssocMeasures()
     return finder.nbest(bigram_measures.likelihood_ratio,40)
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
 def get_bigram(self, features_list):
     # The top self.bigram_threshold bigrams are selected
     score = BigramAssocMeasures.chi_sq
     all_bigrams = BigramCollocationFinder.from_words(features_list)
     best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
     selected_bigrams = [(bigram, True) for bigram in best_bigrams]
     return selected_bigrams
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Example #30
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size = 20)
    top_general = bi_finder.nbest(bi.pmi,30)
    bi_finder.apply_ngram_filter(lambda w1,w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
Example #31
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)  # use all words plus the (most informative) bigram collocations together as features
Example #32
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the token sequence into candidate bigram collocations
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-squared measure to select the top n bigrams
    return bag_of_words(bigrams)
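Several snippets above call a bag_of_words helper that is not included in this listing; a minimal sketch of the usual definition (an assumption, not the original code):

def bag_of_words(words):
    # Map each word or bigram to True, the feature format NLTK classifiers expect.
    return dict((word, True) for word in words)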
Example #33
 def get_bigrams(self, words):
     bigram_finder = BigramCollocationFinder.from_words(words)
     self.biagrams = bigram_finder.nbest(self.bigram_score_funcion,
                                         self.top_ngram_count)
     return self.biagrams
Example #34

def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])


def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams


print(get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

# Now using gensim
print("Sentence: ", norm_alice[2])
key_words = keywords(norm_alice[2], ratio=1.0, scores=True, lemmatize=True)
print([(item, round(score, 3)) for item, score in key_words][:25])
Example #35
def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder=BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(5)
    finder.nbest(bigram_measures.pmi, 5) 
    return finder.ngram_fd.items()
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
Example #37
def get_bigrams(tokens, freq_filter=None):
    finder = BigramCollocationFinder.from_words(tokens)
    if freq_filter:
        finder.apply_freq_filter(freq_filter)
    return list(' '.join(b[0]) for b in finder.ngram_fd.items())
def bigram(collat_data):
    df_co = pd.DataFrame.to_string(collat_data,
                                   columns=['lemmatization']).split(',')
    bcf = BigramCollocationFinder.from_words(df_co)
    top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    return top20
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
Example #40
modelkmeans = KMeans(init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(modelkmeans.n_clusters):
    print("Cluster {}:".format(i)),
    for ind in order_centroids[i, :10]:
        print("{}".format(terms[ind]))

s = all_text_docs[name]
tokens = word_tokenize(s)
text = nltk.Text(tokens)
text.collocations()
text.concordance('social')
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(s))
finder.nbest(bigram_measures.pmi, 10)

######
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True)
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
  complaints = list(csv.reader(file))
  file.close()

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)


from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

#from nltk.collocations import TrigramCollocationFinder
#from nltk.metrics import TrigramAssocMeasures
#tcf = TrigramCollocationFinder.from_words(words)
#tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
Example #43
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # finds words that often occur togther
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Example #44
def analyze_text(text, filename, stopwords, min_length, freq, total_ngrams,
                 min_measure, bigrams_only, trigrams_only):
    print(len(text), filename)
    words = [
        w.lower() for w in text if w not in string.punctuation
        if w.lower() not in stopwords and len(w) >= min_length
    ]

    bigrams = None
    b_prefix_keys = None
    trigrams = None
    t_prefix_keys = None

    # what follows could totally be generalized
    if not trigrams_only:
        # Bigrams
        print("Generating bigrams from", filename)
        b_finder = BigramCollocationFinder.from_words(words)
        b_finder.apply_freq_filter(freq)
        # if stopwords:
        #   b_finder.apply_word_filter(lambda w: w in stopwords)
        bigrams = b_finder.nbest(BigramAssocMeasures.pmi, total_ngrams)
        b_scored = b_finder.score_ngrams(BigramAssocMeasures.pmi)
        b_prefix_keys = collections.defaultdict(list)
        for key, scores in b_scored:
            if scores > min_measure:
                b_prefix_keys[key[0]].append((key[1], scores))

    # Trigrams
    if not bigrams_only:
        print("Generating trigrams from", filename)
        t_finder = TrigramCollocationFinder.from_words(words)
        t_finder.apply_freq_filter(freq)
        # if stopwords:
        #   t_finder.apply_word_filter(lambda w: w in stopwords)
        trigrams = t_finder.nbest(TrigramAssocMeasures.pmi, total_ngrams)
        t_scored = t_finder.score_ngrams(TrigramAssocMeasures.pmi)
        t_prefix_keys = collections.defaultdict(list)
        for key, scores in t_scored:
            if scores > min_measure:
                t_prefix_keys[key[0]].append((key[1], key[2], scores))

    if bigrams_only:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd
        }
    elif trigrams_only:
        ret = {
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    else:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd,
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    return ret
def main():
    # stopwords to filter out for collocations
    stopwords_eng = set(stopwords.words("english"))
    stopwords_eng.add('et')
    stopwords_eng.add('al')


    # bigram identifier from nltk
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # tf-idf vectorizer from nltk
    tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

    file = open('CultureRelatedDiaognosticIssues.txt','r')
    a = []
    names = []
    for line in file:
        miniList = line.split("|")
        names.append(int(miniList[0].strip()))
        a.append(miniList[1].strip())
    file.close()

    allvocab_stemmed = []
    allvocab_tokenized = []

    for element in a:
        stemmed_result = tokenize_and_stem(element)
        allvocab_stemmed.extend(stemmed_result)

        tokenized_result = tokenize_only(element)
        allvocab_tokenized.extend(tokenized_result)

    # data frame that contains stems and tokenized words
    vocab_frame = pd.DataFrame({'words': allvocab_tokenized},
    index = allvocab_stemmed)

    # tf-idf matrix for the terms in the corpus
    tfidf_matrix = tfidf_vectorizer.fit_transform(a)
    terms = tfidf_vectorizer.get_feature_names()

    # number of clusters
    num_clusters = 10

    # fitting the k-means algorithm and saving it in a .pkl file
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km,  'cluster.pkl')
    km = joblib.load('cluster.pkl')
    clusters = km.labels_.tolist()

    # data frame that saves the chapter, the text, and the assigned cluster
    dsm = {'chapter': names, 'text': a, 'cluster': clusters}
    frame = pd.DataFrame(dsm, index = [clusters], columns = ['chapter', 'text', 'cluster'])

    #groupby cluster for aggregation purposes
    grouped = frame['chapter'].groupby(frame['cluster'])

    # getting rid of all punctuation for bigram measures - will use this later
    puncTokenizer = RegexpTokenizer(r'\w+')

    print("Top terms per cluster:")
    print()

    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]:
            print(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print()

        print("Cluster %d titles:" % i, end='')
        for title in frame.loc[i]['chapter'].values.tolist():
            print(str(title) + " , ", end='')
        print()

        # this for-loop finds the most common pairs of words in each diagnosis
        for text in frame.loc[i]['text'].values.tolist():
            data_tokens = puncTokenizer.tokenize(text)
            data_tokens = [x.lower() for x in data_tokens]

            tokens = [w for w in data_tokens if w not in stopwords_eng]

            finder = BigramCollocationFinder.from_words(tokens)
            print('Printing collocations in this chapter:')
            print(finder.nbest(bigram_measures.likelihood_ratio, 5))
            print()
    print()
    print()

    # distribution of clusters
    plt.hist(km.labels_, bins=num_clusters)
    plt.show()
Example #46
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
Example #47
def bigram_words(words, score_fn=BigramAssocMeasures.pmi, n=121):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
Example #48
def create_features(X, user_data=None):
    res = []

    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()

        comment = parse_text(comment)

        comment = nltk.clean_html(comment)  # NLTK 2 only: clean_html was removed in NLTK 3, where HTML has to be stripped another way (e.g. BeautifulSoup)

        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens.
        doc = list(filter(repl_filter, doc))  # list() so items can be reassigned in the loop below

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'

            doc[i] = ps.stem(doc[i])

            doc[i] = wnl.lemmatize(doc[i])

            if doc[i] in bad_words:
                doc[i] = '_badword_'

            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)

        bigram = dict([(ngram, True)
                       for ngram in itertools.chain(doc, bigrams)])

        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        unusual2 = unusual.difference({"_badword_"})  # compare against the token, not its individual characters
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)

        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
    #        feat['_user_last'] = user_info['LastWebAccess']

    # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        #        feat['_has_hate_word'] = has_hate_word
        #        feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        #        feat['_has_occult_word'] = has_occult_word
        #        feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)

        #        print feat
        res.append(feat)
    return res
Example #49
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    print(words, "\n")
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Example #50
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.stem import LancasterStemmer

f = open('data.txt', 'r')
lines = f.readlines()
f.close()

custom_stopwords = set(stopwords.words('english') + list(punctuation))

tokenized_lines = []
for line in lines:
    tokenized_words = [
        word for word in word_tokenize(line) if word not in custom_stopwords
    ]
    tokenized_lines.append(tokenized_words)

bigram_measures = BigramAssocMeasures()
ngrams = []
for line in tokenized_lines:
    ngrams.append(
        sorted(BigramCollocationFinder.from_words(line).ngram_fd.items()))

st = LancasterStemmer()
stemmed = []
for line in tokenized_lines:
    stemmed_words = [st.stem(word) for word in line]
    stemmed.append(stemmed_words)

for st in stemmed:
    print(st)
Example #51
def bag_of_bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=100):
    bigram_finder= BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_non_stopwords(words+bigrams)
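bag_of_non_stopwords is likewise not defined in this listing; a minimal sketch of the usual helper (an assumption), using NLTK's English stopword list:

from nltk.corpus import stopwords

def bag_of_non_stopwords(words, stopfile='english'):
    # Assumed helper: bag-of-words features with stopwords dropped; bigram tuples pass through unchanged.
    badwords = set(stopwords.words(stopfile))
    return dict((word, True) for word in words if word not in badwords)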
import nltk
nltk.download('punkt')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder

bi_dict = dict()
bg_measures = BigramAssocMeasures()
with open('text/text.txt', 'r') as file:
    text = file.read()
    table = str.maketrans(dict.fromkeys('0123456789'))
    textWithoutNumbers = text.translate(table)

    words = nltk.word_tokenize(textWithoutNumbers)

    bi_finder = BigramCollocationFinder.from_words(words, window_size=2)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bi_finder.apply_freq_filter(2)
    t = bi_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
        print(k, v)
    bi_finder.score_ngrams(bigram_measures.pmi)
    bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
    print(bi_collocs)

    tri_finder = TrigramCollocationFinder.from_words(words)
    tri_finder.apply_freq_filter(5)
    t = tri_finder.ngram_fd.items()
    ngram = list(t)
    ngram.sort(key=lambda item: item[-1], reverse=False)
    for (k, v) in ngram:
Example #53
def ExtractCollocationFeatures(train_dataset,
                               test_dataset,
                               X_train_filename,
                               X_test_filename,
                               window_size,
                               n_features,
                               balance_dataset=False,
                               remove_center_interval=None):

    # This method extracts collocations of two words within the given
    # window of words as features from the given train and test datasets.
    # It returns the X and Y matrices for train and test plus a list of the feature names.
    # It also stores the X matrices in txt files named X_train_filename and
    # X_test_filename under the /feature_matrices folder.
    # There are four tunable parameters:
    # - window_size: size of the window.
    # - n_features: number of features considered.
    # - balance_dataset: set to True to balance the training dataset.
    # - remove_center_interval: format: [-0.2, 0.2]; removes samples whose DW-Nominate falls
    #   inside the interval.

    print("Reading datasets...")
    path_train = "../datasets/train/"
    train_dataset_df = pd.read_csv(path_train + train_dataset,
                                   sep="|",
                                   encoding="latin_1",
                                   header=None)
    train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    #Remove rows with DW-nominates close to 0
    if type(remove_center_interval) != type(None):
        train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        train_dataset_df = train_dataset_df[train_dataset_df['ideology'] != 0]

    train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    path_test = "../datasets/test/"
    test_dataset_df = pd.read_csv(path_test + test_dataset,
                                  sep="|",
                                  encoding="latin_1",
                                  header=None)
    test_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    if balance_dataset == True:
        positive_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == 1.0])
        negative_rows = len(
            train_dataset_df[train_dataset_df['ideology'] == -1.0])
        if positive_rows > negative_rows:
            n = positive_rows - negative_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)
        else:
            n = negative_rows - positive_rows

            indices = train_dataset_df[train_dataset_df['ideology'] ==
                                       -1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)

    train_speeches = train_dataset_df['speech'].values.tolist()
    Y_train = train_dataset_df['ideology'].values.tolist()

    if type(remove_center_interval) != type(None):
        test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0] and float(x) <
                            remove_center_interval[1]) else x)
        test_dataset_df = test_dataset_df[test_dataset_df['ideology'] != 0]

    test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    test_speeches = test_dataset_df['speech'].values.tolist()
    Y_test = test_dataset_df['ideology'].values.tolist()

    print("Extracting features from train dataset...")
    t_start = time.time()

    stop_words = stopwords.words('english')

    total_bigrams = {}
    bigrams_per_speech_train = []
    t0 = time.time()
    print(len(train_speeches))
    for i in range(0, len(train_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = train_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)

        for item in bcf.ngram_fd.items():
            if item[0] not in total_bigrams:
                total_bigrams.update({item[0]: item[1]})
            else:
                total_bigrams[item[0]] += item[1]

        bigrams_per_speech_train.append(bcf.ngram_fd.items())

    print("Total bigrams finded: ", len(total_bigrams))

    feature_names = []
    most_frequent_bigrams_sorted = sorted(total_bigrams.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:n_features]
    print("Number of features: ", len(most_frequent_bigrams_sorted))
    most_frequent_bigrams = dict(most_frequent_bigrams_sorted)

    for i in range(0, len(most_frequent_bigrams_sorted)):
        feature_names.append(most_frequent_bigrams_sorted[i][0])
    print(len(feature_names))

    order = list(range(0, len(feature_names)))
    collocation_order = dict(zip(feature_names, order))

    print("Computing X_train...")

    X_train_matrix = np.zeros(
        (len(bigrams_per_speech_train), len(feature_names)))

    for i in range(0, len(bigrams_per_speech_train)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_train[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_train_matrix[i][column] = bigrams_per_speech_i[bigram]
    print("Creating dataframe...")
    X_train_df = pd.DataFrame(X_train_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    pathX = "../feature_matrices/"
    print("Saving X_train into a txt file...")
    X_train_df.to_csv(pathX + X_train_filename,
                      header=feature_names,
                      index=None,
                      sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_train = csr_matrix(X_train_df)

    print("Extracting bigrams from test dataset...")

    bigrams_per_speech_test = []
    t0 = time.time()
    print(len(test_speeches))
    for i in range(0, len(test_speeches)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()

        speech = test_speeches[i]
        speech = speech.lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w in stop_words]

        bcf = BigramCollocationFinder.from_words(filtered_words,
                                                 window_size=window_size)
        bigrams_per_speech_test.append(bcf.ngram_fd.items())

    print("Computing X_test...")

    X_test_matrix = np.zeros(
        (len(bigrams_per_speech_test), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_test)):
        if (i % 1000 == 0):
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_test[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_test_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_test_df = pd.DataFrame(X_test_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    print("Saving X_test into a txt file...")
    X_test_df.to_csv(pathX + X_test_filename,
                     header=feature_names,
                     index=None,
                     sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_test = csr_matrix(X_test_df)

    t_end = time.time()
    total_time = t_end - t_start
    print("Total time: ")
    print(str(total_time) + " seconds")

    return X_train, Y_train, X_test, Y_test, feature_names
Example #54
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1),
                              reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]

    return sorted_ngrams


corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])
Example #56
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset
tokens = [t.lower() for t in webtext.words('grail.txt')]
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(stops_filter)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
Example #57
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler

import seaborn as sns

df = pd.read_csv('../preprocessed_dataset.csv')
df.head()

# Calculate the number of repeated bigrams per song; only bigrams whose repetition frequency is at least 3 are considered
bigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_bigram = []
    text = df["Lyrics"][i].split()
    coll_bia = bigram_collocation.from_words(text)
    coll_bia.apply_freq_filter(3)
    bigram_freq = coll_bia.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq'
                                            ]).sort_values(by='freq',
                                                           ascending=False)
    bigram_score.append(len(bigramFreqTable.index.values))

# Calculate the number of repeated trigrams per song; only trigrams whose repetition frequency is at least 3 are considered
trigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_trigram = []
    text = df["Lyrics"][i].split()
    coll_tri = trigram_collocation.from_words(text)
Example #58
plt.show()

fd = fdist_no_punc_no_stopwords

# the most common ones
fd.most_common(50)

# dispersion plots
text.dispersion_plot(["God", "mind", "knowledge"])
text.dispersion_plot(["power", "reason", "nature"])
# text.concordance("god")

# bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# what changes here is the frequency filter

# word cloud for the most frequent bigrams
stopWords = stopwords
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]

text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
Example #59
corpus = []
while True:
    l = hpmor.readline()
    if l == '': break
    l = re.sub(r"[^а-яё \t-]", "", l.lower()).strip().split()
    if l: corpus.extend(l)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

stop = set(stopwords.words('russian'))
stop.update(['гарри', 'поттер', 'профессор'
             ])  # add the most frequent words from the text to the stop list
corpus_ = list(filter(lambda x: x not in stop, corpus))

finder = BigramCollocationFinder.from_words(corpus_)
finder3 = TrigramCollocationFinder.from_words(corpus_)

# frequency filters and stopword filters
finder.apply_freq_filter(5)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)
finder3.apply_freq_filter(5)
finder3.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)

# bigrams and trigrams
raw_bigrams = finder.nbest(bigram_measures.raw_freq, 100)
pmi_bigrams = finder.nbest(bigram_measures.pmi, 100)
raw_trigrams = finder3.nbest(trigram_measures.raw_freq, 100)
pmi_trigrams = finder3.nbest(trigram_measures.pmi, 100)

Example #60
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # use the chi-squared measure to select the top n bigrams
    newBigrams = [u + v for (u, v) in bigrams]