Example #1
0
	def __init__(self,tokens):
		"""Extract top noun phrases (bigrams/trigrams ranked by PMI).

		Stores the result in self.phrases as a list of word tuples.
		"""
		# phrasal stopwords: drop these even if they score well
		stopwords = set([('5','stars')])
		num_words = len(tokens)

		# top bigrams by pointwise mutual information, rare pairs filtered out
		finder = BigramCollocationFinder.from_words(tokens)
		bigram_measures = nltk.collocations.BigramAssocMeasures()
		finder.apply_freq_filter(int(.0002*num_words)) # some parameter tuning?
		bigrams = finder.nbest(bigram_measures.pmi,15)

		# top trigrams, same procedure with a looser frequency filter
		tfinder = TrigramCollocationFinder.from_words(tokens)
		trigram_measures = nltk.collocations.TrigramAssocMeasures()
		tfinder.apply_freq_filter(int(.0001*num_words))
		trigrams = tfinder.nbest(trigram_measures.pmi,10)

		# merge bigrams and trigrams: when two top bigrams chain into a top
		# trigram, keep the trigram and drop its constituent bigrams.
		# BUG FIX: copy the list -- `phrases = bigrams` aliased it, so the
		# removals below mutated `bigrams` while it was being iterated.
		phrases = list(bigrams)
		combined = []
		for bigram in bigrams:
			# BUG FIX: subtract {bigram} (the tuple itself); set(bigram) was a
			# set of its *words*, so nothing was removed and the bigram could
			# be compared against itself.
			for other_bigram in set(bigrams) - {bigram}:
				if bigram[1] == other_bigram[0]:
					combined.append((bigram[0],bigram[1],other_bigram[1]))
		combined = set(combined)  # hoisted: was rebuilt on every iteration
		for trigram in trigrams:
			if trigram in combined:
				phrases.append(trigram)
				# BUG FIX: guard removals -- a bigram shared by two trigrams
				# (or already removed) would otherwise raise ValueError.
				for pair in ((trigram[0],trigram[1]),(trigram[1],trigram[2])):
					if pair in phrases:
						phrases.remove(pair)

		# keep phrases whose final word is predominantly a noun and which
		# are not phrasal stopwords
		p = POS()
		self.phrases = [phrase for phrase in phrases
			if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords]
Example #2
0
    def __init__(self, tokens):
        """Extract top noun phrases (bigrams/trigrams ranked by PMI).

        Stores the result in self.phrases as a list of word tuples.
        """
        # phrasal stopwords: drop these even if they score well
        stopwords = set([('5', 'stars')])
        num_words = len(tokens)

        # top bigrams by pointwise mutual information, rare pairs filtered out
        finder = BigramCollocationFinder.from_words(tokens)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder.apply_freq_filter(int(.0002 *
                                     num_words))  # some parameter tuning?
        bigrams = finder.nbest(bigram_measures.pmi, 15)

        # top trigrams, same procedure with a looser frequency filter
        tfinder = TrigramCollocationFinder.from_words(tokens)
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        tfinder.apply_freq_filter(int(.0001 * num_words))
        trigrams = tfinder.nbest(trigram_measures.pmi, 10)

        # merge bigrams and trigrams: when two top bigrams chain into a top
        # trigram, keep the trigram and drop its constituent bigrams.
        # BUG FIX: copy the list -- `phrases = bigrams` aliased it, so the
        # removals below mutated `bigrams` while it was being iterated.
        phrases = list(bigrams)
        combined = []
        for bigram in bigrams:
            # BUG FIX: subtract {bigram} (the tuple itself); set(bigram) was
            # a set of its *words*, so nothing was removed and the bigram
            # could be compared against itself.
            for other_bigram in set(bigrams) - {bigram}:
                if bigram[1] == other_bigram[0]:
                    combined.append((bigram[0], bigram[1], other_bigram[1]))
        combined = set(combined)  # hoisted: was rebuilt on every iteration
        for trigram in trigrams:
            if trigram in combined:
                phrases.append(trigram)
                # BUG FIX: guard removals -- a bigram shared by two trigrams
                # (or already removed) would otherwise raise ValueError.
                for pair in ((trigram[0], trigram[1]),
                             (trigram[1], trigram[2])):
                    if pair in phrases:
                        phrases.remove(pair)

        # keep phrases whose final word is predominantly a noun and which
        # are not phrasal stopwords
        p = POS()
        self.phrases = [
            phrase for phrase in phrases
            if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords
        ]
Example #3
0
	def __init__(self,tokens,category):
		"""Rank frequent unigram nouns in review tokens by a tf-idf-like
		score against a 'generic' chat corpus.

		Results: self.scores (pandas Series, ascending) and self.unigrams
		(words, best-scoring first).
		"""
		# these generic words will be ignored
		stopwords = ['product','price','reviews','unit','model',
		'purchase','amount','item']
		words = category.split()
		# add the category and its plural (crude) to stopwords
		for el in words:
			stopwords.append(el)
			stopwords.append(el+'s')

		# will have to change this!
		self.tokens = tokens
		self.num_words = len(self.tokens)

		# calculate freq dist from tokens
		self.unigram_fd = nltk.FreqDist(self.tokens)
		self.unique_words = len(self.unigram_fd)

		# get frequent unigram nouns: top 2% of the vocabulary, minus
		# stopwords, keeping predominantly-noun words only
		pos = POS()
		common_unigrams = self.unigram_fd.most_common(int(.02*self.unique_words))
		self.unigrams = [pair for pair in common_unigrams
			if pair[0] not in stopwords and pos.percent_noun(pair[0]) > 0.5]

		# NOTE: a min-count threshold (pair[1] > .001*num_words) was tried
		# and gave slightly worse results; deliberately omitted.

		# create a pandas DataFrame indexed by word, review corpus.
		# BUG FIX: zip() is a lazy iterator in Python 3 and cannot be
		# indexed -- materialize it. Also guard the empty-unigrams case,
		# where zip(*[]) yields nothing at all.
		cols = list(zip(*self.unigrams)) if self.unigrams else [(), ()]
		df_reviews = pd.DataFrame(list(cols[1]),index=list(cols[0]),
			columns=['count_reviews'])

		# a list of words from 'generic' corpus
		generic_words = self.chat_words()

		# create a pandas DataFrame indexed by word/ngram, generic corpus
		# NOTE(review): generic_fd is built from the *original-case* words
		# while self.generic_words is lowercased -- confirm intentional.
		self.generic_words = [w.lower() for w in generic_words]
		generic_fd = (nltk.FreqDist(generic_words)
			+ nltk.FreqDist(nltk.bigrams(generic_words))
			+ nltk.FreqDist(nltk.trigrams(generic_words)))
		# BUG FIX: same Python 3 zip-indexing issue as above.
		gcols = list(zip(*generic_fd.items())) if generic_fd else [(), ()]
		df_generic = pd.DataFrame(list(gcols[1]),
			index=list(gcols[0]),columns=['count_generic'])

		# merge the two on words; words absent from the generic corpus
		# get count 0
		df = df_reviews.join(df_generic)
		self.df = df.fillna(0)

		# compute generic freq.
		term_freq = self.term_freq_log()
		inverse_generic_freq = self.inverse_generic_freq()

		self.scores = term_freq * inverse_generic_freq
		# BUG FIX: Series.sort() was removed from pandas; sort_values()
		# returns the sorted copy instead of sorting in place.
		self.scores = self.scores.sort_values()
		self.unigrams = list(reversed(self.scores.index))
Example #4
0
    def frequent_nouns_counts(self, tokens):
        """Return frequently mentioned nouns WITH COUNTS"""

        freq_dist = nltk.FreqDist(tokens)
        tagger = POS()
        top_n = int(self.top_pct * len(freq_dist))

        # keep (word, count) pairs that are non-stopword, mostly-noun words
        nouns = []
        for word, count in freq_dist.most_common(top_n):
            if word not in self.stopwords() and tagger.percent_noun(word) > 0.5:
                nouns.append((word, count))

        return nouns
Example #5
0
    def frequent_nouns_counts(self, tokens):
        """Return frequently mentioned nouns WITH COUNTS"""

        freq_dist = nltk.FreqDist(tokens)
        keep = int(self.top_pct * len(freq_dist))
        pos_tagger = POS()

        def _is_noun_pair(pair):
            # non-stopword word that is tagged as a noun most of the time
            return (pair[0] not in self.stopwords()
                    and pos_tagger.percent_noun(pair[0]) > 0.5)

        return [pair for pair in freq_dist.most_common(keep)
                if _is_noun_pair(pair)]
Example #6
0
    def phrases(self, tokens):
        """Collocation phrases from tokens: merged bigrams/trigrams that end
        in a noun and are not phrasal stopwords."""

        merged = self.merged_phrases(self.top_bigrams(tokens),
                                     self.top_trigrams(tokens))

        # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords.
        noun_checker = POS()
        phrase_stopwords = {('5', 'stars')}
        return [
            phrase for phrase in merged
            if noun_checker.percent_noun(phrase[-1]) > 0.5
            and phrase not in phrase_stopwords
        ]
Example #7
0
    def phrases(self, tokens):
        """Collocation phrases from tokens: merged bigrams/trigrams that end
        in a noun and are not phrasal stopwords."""

        top_bi = self.top_bigrams(tokens)
        top_tri = self.top_trigrams(tokens)
        candidates = self.merged_phrases(top_bi, top_tri)

        # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords.
        tagger = POS()
        banned = {('5', 'stars')}
        kept = []
        for candidate in candidates:
            if (tagger.percent_noun(candidate[-1]) > 0.5
                    and candidate not in banned):
                kept.append(candidate)

        return kept
Example #8
0
    def __init__(self, tokens, category):
        """Rank frequent unigram nouns in review tokens by a tf-idf-like
        score against a 'generic' chat corpus.

        Results: self.scores (pandas Series, ascending) and self.unigrams
        (words, best-scoring first).
        """
        # these generic words will be ignored
        stopwords = [
            'product', 'price', 'reviews', 'unit', 'model', 'purchase',
            'amount', 'item'
        ]
        words = category.split()
        # add the category and its plural (crude) to stopwords
        for el in words:
            stopwords.append(el)
            stopwords.append(el + 's')

        # will have to change this!
        self.tokens = tokens
        self.num_words = len(self.tokens)

        # calculate freq dist from tokens
        self.unigram_fd = nltk.FreqDist(self.tokens)
        self.unique_words = len(self.unigram_fd)

        # get frequent unigram nouns: top 2% of the vocabulary, minus
        # stopwords, keeping predominantly-noun words only
        pos = POS()
        common_unigrams = self.unigram_fd.most_common(
            int(.02 * self.unique_words))
        self.unigrams = [
            pair for pair in common_unigrams
            if pair[0] not in stopwords and pos.percent_noun(pair[0]) > 0.5
        ]

        # NOTE: a min-count threshold (pair[1] > .001*num_words) was tried
        # and gave slightly worse results; deliberately omitted.

        # create a pandas DataFrame indexed by word, review corpus.
        # BUG FIX: zip() is a lazy iterator in Python 3 and cannot be
        # indexed -- materialize it. Also guard the empty-unigrams case,
        # where zip(*[]) yields nothing at all.
        cols = list(zip(*self.unigrams)) if self.unigrams else [(), ()]
        df_reviews = pd.DataFrame(list(cols[1]),
                                  index=list(cols[0]),
                                  columns=['count_reviews'])

        # a list of words from 'generic' corpus
        generic_words = self.chat_words()

        # create a pandas DataFrame indexed by word/ngram, generic corpus
        # NOTE(review): generic_fd is built from the *original-case* words
        # while self.generic_words is lowercased -- confirm intentional.
        self.generic_words = [w.lower() for w in generic_words]
        generic_fd = (nltk.FreqDist(generic_words) +
                      nltk.FreqDist(nltk.bigrams(generic_words)) +
                      nltk.FreqDist(nltk.trigrams(generic_words)))
        # BUG FIX: same Python 3 zip-indexing issue as above.
        gcols = list(zip(*generic_fd.items())) if generic_fd else [(), ()]
        df_generic = pd.DataFrame(list(gcols[1]),
                                  index=list(gcols[0]),
                                  columns=['count_generic'])

        # merge the two on words; words absent from the generic corpus get
        # count 0
        df = df_reviews.join(df_generic)
        self.df = df.fillna(0)

        # compute generic freq.
        term_freq = self.term_freq_log()
        inverse_generic_freq = self.inverse_generic_freq()

        self.scores = term_freq * inverse_generic_freq
        # BUG FIX: Series.sort() was removed from pandas; sort_values()
        # returns the sorted copy instead of sorting in place.
        self.scores = self.scores.sort_values()
        self.unigrams = list(reversed(self.scores.index))