Example #1
	def __init__(self,tokens):

		stopwords = set([('5','stars')])
		num_words = len(tokens)

		finder = BigramCollocationFinder.from_words(tokens)
		bigram_measures = nltk.collocations.BigramAssocMeasures()
		finder.apply_freq_filter(int(.0002*num_words)) # some parameter tuning?
		bigrams = finder.nbest(bigram_measures.pmi,15)

		tfinder = TrigramCollocationFinder.from_words(tokens)
		trigram_measures = nltk.collocations.TrigramAssocMeasures()
		tfinder.apply_freq_filter(int(.0001*num_words))
		trigrams = tfinder.nbest(trigram_measures.pmi,10)

		# merge bigrams and trigrams
		phrases = list(bigrams) # copy; the trigram merging below mutates this list
		combined = []
		for bigram in bigrams:
			other_bigrams = list(set(bigrams) - {bigram})
			for other_bigram in other_bigrams:
				if bigram[1] == other_bigram[0]:
					combined.append((bigram[0],bigram[1],other_bigram[1]))
		for trigram in trigrams:
			if trigram in set(combined):
				phrases.append(trigram)
				phrases.remove((trigram[0],trigram[1]))
				phrases.remove((trigram[1],trigram[2]))

		p = POS()
		self.phrases = [phrase for phrase in phrases 
			if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords]
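
Example #1 (and most of the later examples) relies on a POS helper whose percent_noun method is never shown. A minimal sketch of such a helper, assuming a WordNet-based definition (the share of a word's WordNet senses that are nouns), could look like the following; the original class may well work differently:

from nltk.corpus import wordnet as wn

class POS(object):
    def percent_noun(self, word):
        # Fraction of the word's WordNet senses tagged as nouns.
        # Assumed definition; the original POS helper is not shown in these examples.
        synsets = wn.synsets(word)
        if not synsets:
            return 0.0
        return sum(1 for s in synsets if s.pos() == 'n') / float(len(synsets))
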
Example #2
    def __init__(self, tokens):

        stopwords = set([('5', 'stars')])
        num_words = len(tokens)

        finder = BigramCollocationFinder.from_words(tokens)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder.apply_freq_filter(int(.0002 *
                                     num_words))  # some parameter tuning?
        bigrams = finder.nbest(bigram_measures.pmi, 15)

        tfinder = TrigramCollocationFinder.from_words(tokens)
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        tfinder.apply_freq_filter(int(.0001 * num_words))
        trigrams = tfinder.nbest(trigram_measures.pmi, 10)

        # merge bigrams and trigrams
        phrases = list(bigrams)  # copy; the trigram merging below mutates this list
        combined = []
        for bigram in bigrams:
            other_bigrams = list(set(bigrams) - {bigram})
            for other_bigram in other_bigrams:
                if bigram[1] == other_bigram[0]:
                    combined.append((bigram[0], bigram[1], other_bigram[1]))
        for trigram in trigrams:
            if trigram in set(combined):
                phrases.append(trigram)
                phrases.remove((trigram[0], trigram[1]))
                phrases.remove((trigram[1], trigram[2]))

        p = POS()
        self.phrases = [
            phrase for phrase in phrases
            if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords
        ]
Example #3
	def __init__(self,tokens,category):

		# these generic words will be ignored
		stopwords = ['product','price','reviews','unit','model',
		'purchase','amount','item']
		words = category.split()
		# add the category and its plural (crude) to stopwords
		for el in words:
			stopwords.append(el)
			stopwords.append(el+'s')

		# will have to change this!
		self.tokens = tokens
		self.num_words = len(self.tokens)

		# calculate freq dist from tokens
		self.unigram_fd = nltk.FreqDist(self.tokens)
		self.unique_words = len(self.unigram_fd)

		# get frequent unigram nouns
		pos = POS()
		common_unigrams = self.unigram_fd.most_common(int(.02*self.unique_words))
		self.unigrams = [pair for pair in common_unigrams \
			if pair[0] not in stopwords and pos.percent_noun(pair[0]) > 0.5]

		# use threshold? get slightly better w/o the threshold.
		# threshold = .001
		# self.unigrams = [pair for pair in common_unigrams \
		# 	if pair[1] > int(threshold*self.num_words) and pair[0] not in stopwords 
		# 	and pos.percent_noun(pair[0]) > 0.5]

		# create a pandas DataFrame indexed by word, review corpus
		zipped = list(zip(*self.unigrams)) # list() so it can be indexed below
		df_reviews = pd.DataFrame(list(zipped[1]),index=list(zipped[0]),
			columns=['count_reviews'])

		# a list of words from 'generic' corpus
		generic_words = self.chat_words()

		# create a pandas DataFrame indexed by word, generic
		self.generic_words = [w.lower() for w in generic_words]
		generic_fd = nltk.FreqDist(generic_words)+\
			nltk.FreqDist(nltk.bigrams(generic_words)) + \
			nltk.FreqDist(nltk.trigrams(generic_words))
		zipped_generic = list(zip(*generic_fd.items()))
		df_generic = pd.DataFrame(list(zipped_generic[1]),\
			index=list(zipped_generic[0]),columns=['count_generic'])

		# merge the two on words
		df = df_reviews.join(df_generic)
		self.df = df.fillna(0)

		# compute generic freq.
		term_freq = self.term_freq_log()
		inverse_generic_freq = self.inverse_generic_freq()

		self.scores = term_freq * inverse_generic_freq
		self.scores = self.scores.sort_values() # ascending; reversed below for highest first
		self.unigrams = list(reversed(self.scores.index))
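
Example #3 (and its duplicate, Example #10) scores each candidate word as term_freq_log() * inverse_generic_freq(), but neither helper is shown. A hedged sketch of what they might compute, assuming a TF-IDF-style weighting over the count_reviews and count_generic columns built above; the functions are written at module level for brevity but would be methods on the same class, and the formulas are assumptions, not the original code:

import numpy as np

def term_freq_log(self):
    # Log-damped frequency of each candidate word in the review corpus.
    return np.log(1 + self.df['count_reviews'])

def inverse_generic_freq(self):
    # Down-weight words that are also common in the generic (chat) corpus.
    # Assumed formula; the original is not shown.
    return np.log(len(self.df) / (1.0 + self.df['count_generic']))
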
Example #4
    def frequent_nouns_counts(self, tokens):
        """Return frequently mentioned nouns WITH COUNTS"""

        unigram_fd = nltk.FreqDist(tokens)
        pos = POS()
        common_unigrams = unigram_fd.most_common(int(self.top_pct * len(unigram_fd)))

        nouns = [
            pair for pair in common_unigrams if pair[0] not in self.stopwords() and pos.percent_noun(pair[0]) > 0.5
        ]

        return nouns
Example #5
    def frequent_nouns_counts(self, tokens):
        """Return frequently mentioned nouns WITH COUNTS"""

        unigram_fd = nltk.FreqDist(tokens)
        pos = POS()
        common_unigrams = unigram_fd.most_common(
            int(self.top_pct * len(unigram_fd)))

        nouns = [
            pair for pair in common_unigrams if
            pair[0] not in self.stopwords() and pos.percent_noun(pair[0]) > 0.5
        ]

        return nouns
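
Examples #4 and #5 assume a self.top_pct value and a self.stopwords() method on the enclosing class. A minimal, assumed wrapper that would make the method runnable end to end might look like this; the class name is illustrative, and the 2% cut-off and stopword list are borrowed from Example #3 rather than taken from the original owner of this method:

import nltk

class NounExtractor(object):
    # Hypothetical wrapper class; only top_pct and stopwords() are sketched here,
    # with frequent_nouns_counts from Example #5 assumed to be attached as well.
    def __init__(self, top_pct=0.02):
        self.top_pct = top_pct  # keep the top 2% of distinct tokens

    def stopwords(self):
        return {'product', 'price', 'reviews', 'unit', 'model',
                'purchase', 'amount', 'item'}

# Usage (assuming `tokens` is a flat list of lowercased word tokens):
#   nouns = NounExtractor().frequent_nouns_counts(tokens)
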
Example #6
    def phrases(self, tokens):

        bigrams = self.top_bigrams(tokens)
        trigrams = self.top_trigrams(tokens)
        phrases = self.merged_phrases(bigrams, trigrams)

        # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords.
        p = POS()
        stopwords = {('5', 'stars')}
        phrases = [phrase for phrase in phrases
                   if p.percent_noun(phrase[-1]) > 0.5
                   and phrase not in stopwords]

        return phrases
Example #7
    def phrases(self, tokens):

        bigrams = self.top_bigrams(tokens)
        trigrams = self.top_trigrams(tokens)
        phrases = self.merged_phrases(bigrams, trigrams)

        # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords.
        p = POS()
        stopwords = {('5', 'stars')}
        phrases = [
            phrase for phrase in phrases
            if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords
        ]

        return phrases
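
Examples #6 and #7 refactor the collocation logic of Examples #1 and #2 into top_bigrams, top_trigrams, and merged_phrases, none of which are shown. A sketch of those helpers, reconstructed from the earlier examples (the frequency cut-offs and n-best counts come from Example #2; the method split itself is an assumption), written at module level for brevity:

import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

def top_bigrams(self, tokens):
    # PMI-ranked bigrams, ignoring very rare pairs (as in Example #2).
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(int(.0002 * len(tokens)))
    return finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 15)

def top_trigrams(self, tokens):
    finder = TrigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(int(.0001 * len(tokens)))
    return finder.nbest(nltk.collocations.TrigramAssocMeasures().pmi, 10)

def merged_phrases(self, bigrams, trigrams):
    # Replace two overlapping bigrams with the trigram they form, as in Example #2.
    phrases = list(bigrams)
    combined = set()
    for a in bigrams:
        for b in bigrams:
            if a != b and a[1] == b[0]:
                combined.add((a[0], a[1], b[1]))
    for trigram in trigrams:
        if trigram in combined:
            phrases.append(trigram)
            phrases.remove((trigram[0], trigram[1]))
            phrases.remove((trigram[1], trigram[2]))
    return phrases
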
Example #8
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: Log-likelihood corpus name ('Brown','AmE06','BE06')
        @param llNL: Log-likelihood n-gram length
        @param percen: Percentile cut-off passed to Select (default 80)
        @param NE: Use named-entity extraction (default True)
        @param Col: Use collocations (default True)
        @param Gram: Use n-grams (default True)
        @param Chu: Use chunking (default True)
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'
Example #9
from SSBParser import SSBParser
from POS import POS
from unknown_word_handler import unknown_word_handler



parser = SSBParser()
train_clean_file = 'BROWN-clean.pos.txt'
test_clean_file = 'SnapshotBROWN-clean.pos.txt'
parser.gen_clean_file('BROWN.pos.all',train_clean_file)
parser.gen_clean_file('SnapshotBROWN.pos.all.txt',test_clean_file)

print('1(i) : Baseline statistical tagger implemented for entire Brown corpus.')
print('***********************************************************************')

print('1(ii) : Calculating performance for snapshot')
pos = POS(train_clean_file,test_clean_file)
print('***********************************************************************')


print('1(iii) : Calculating performance for news collected from the web; see news.txt for the input file')
uwh = unknown_word_handler('news.txt','news-clean.txt',pos)
test_clean_file = 'news-clean.txt'
pos = POS(train_clean_file,test_clean_file)
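
Example #9 drives a POS class that takes a training file and a test file of word/TAG tokens and reports tagging performance. A minimal sketch of such a baseline statistical tagger, assuming NLTK's unigram tagger with a noun backoff; both the implementation and the file format are assumptions, since the original class is not shown:

import nltk

class POS(object):
    def __init__(self, train_file, test_file):
        train_sents = self._read(train_file)
        test_sents = self._read(test_file)
        # Most-frequent-tag baseline, backing off to 'NN' for unknown words.
        self.tagger = nltk.UnigramTagger(train_sents,
                                         backoff=nltk.DefaultTagger('NN'))
        # .accuracy() on recent NLTK; older releases call this .evaluate().
        print('accuracy: %.4f' % self.tagger.accuracy(test_sents))

    def _read(self, path):
        # Assumes one sentence per line of word/TAG tokens.
        with open(path) as f:
            return [[tuple(tok.rsplit('/', 1)) for tok in line.split()]
                    for line in f if line.strip()]
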
Example #10
    def __init__(self, tokens, category):

        # these generic words will be ignored
        stopwords = [
            'product', 'price', 'reviews', 'unit', 'model', 'purchase',
            'amount', 'item'
        ]
        words = category.split()
        # add the category and its plural (crude) to stopwords
        for el in words:
            stopwords.append(el)
            stopwords.append(el + 's')

        # will have to change this!
        self.tokens = tokens
        self.num_words = len(self.tokens)

        # calculate freq dist from tokens
        self.unigram_fd = nltk.FreqDist(self.tokens)
        self.unique_words = len(self.unigram_fd)

        # get frequent unigram nouns
        pos = POS()
        common_unigrams = self.unigram_fd.most_common(
            int(.02 * self.unique_words))
        self.unigrams = [pair for pair in common_unigrams \
         if pair[0] not in stopwords and pos.percent_noun(pair[0]) > 0.5]

        # use threshold? get slightly better w/o the threshold.
        # threshold = .001
        # self.unigrams = [pair for pair in common_unigrams \
        # 	if pair[1] > int(threshold*self.num_words) and pair[0] not in stopwords
        # 	and pos.percent_noun(pair[0]) > 0.5]

        # create a pandas DataFrame indexed by word, review corpus
        zipped = list(zip(*self.unigrams))  # list() so it can be indexed below
        df_reviews = pd.DataFrame(list(zipped[1]),
                                  index=list(zipped[0]),
                                  columns=['count_reviews'])

        # a list of words from 'generic' corpus
        generic_words = self.chat_words()

        # create a pandas DataFrame indexed by word, generic
        self.generic_words = [w.lower() for w in generic_words]
        generic_fd = nltk.FreqDist(generic_words)+\
         nltk.FreqDist(nltk.bigrams(generic_words)) + \
         nltk.FreqDist(nltk.trigrams(generic_words))
        zipped_generic = list(zip(*generic_fd.items()))
        df_generic = pd.DataFrame(list(zipped_generic[1]),\
         index=list(zipped_generic[0]),columns=['count_generic'])

        # merge the two on words
        df = df_reviews.join(df_generic)
        self.df = df.fillna(0)

        # compute generic freq.
        term_freq = self.term_freq_log()
        inverse_generic_freq = self.inverse_generic_freq()

        self.scores = term_freq * inverse_generic_freq
        self.scores = self.scores.sort_values()  # ascending; reversed below for highest first
        self.unigrams = list(reversed(self.scores.index))
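
Examples #3 and #10 also call self.chat_words() to obtain the 'generic' background corpus. A plausible sketch, assuming NLTK's NPS Chat corpus is what is meant by chat words (the original may read a different source):

import nltk

def chat_words(self):
    # Word list from NLTK's NPS Chat corpus, used as the generic background corpus.
    return list(nltk.corpus.nps_chat.words())
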
Example #11
class runable(object):
    '''
    Class for selecting and extracting keywords from online content.
    '''
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: Log-likelihood corpus name ('Brown','AmE06','BE06')
        @param llNL: Log-likelihood n-gram length
        @param percen: Percentile cut-off passed to Select (default 80)
        @param NE: Use named-entity extraction (default True)
        @param Col: Use collocations (default True)
        @param Gram: Use n-grams (default True)
        @param Chu: Use chunking (default True)
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'

    def Select(self, url, depth):
        '''
        Determine the best keywords for a web page.

        @param url: the base URL to start sampling from
        @param depth: the depth of the website to be sampled

        @return: the list of selected keywords, ordered with the highest-rated words at the lower bound of the array.
        '''
        #Get data from web page
        text = self.GD.getWebPage(url, depth)

        #Tokenize sentences and words
        tok = self.To.Tok(text)

        #POS tag the text
        pos = self.POS.POSTag(tok, 'tok')

        #Log-likelihood
        log = self.LL.calcualte(tok)

        #Collocations
        if self.Col == True:
            col = self.CC.col(pos, tok)
        else:
            col = []

        #NE Extraction
        if self.NEs == True:
            ne = self.Cu.Chunks(pos,
                                nodes=['PERSON', 'ORGANIZATION', 'LOCATION'])
        else:
            ne = []

        #Extract NP
        if self.Chu == True:
            chu = [self.Cu.parse(p) for p in pos]
        else:
            chu = []

        #Create n-grams
        if self.Gram == True:
            ga = self.Ng.Grams(pos, n=6)
        else:
            ga = []

        return self.S.keywords(ne, ga, col, chu, log)
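
A hypothetical driver for the runable class above; the URL and crawl depth are placeholders, not values from the original project:

if __name__ == '__main__':
    extractor = runable(llwl='Brown', percen=80)
    keywords = extractor.Select('http://example.com', depth=1)
    print(keywords)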