Example #1
1
def process(f, return_tokens=True, return_freqdist=True):
    """
    Process the deals data.
    Treats each line as one deal/sentence; a FreqDist is incremented
    during tokenization.
    Uses PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals often mention domain names, which are intentionally not split up.

    :param f: input file with one deal per line
    :rtype: (FreqDist, list of str, list of list of str)
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in set(stopwords.words('english')) and word not in set(string.punctuation):
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
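Example #1 targets NLTK 2.x: FreqDist.inc() was removed when FreqDist became a collections.Counter subclass in NLTK 3, and PunktWordTokenizer is gone from recent releases. Below is a minimal sketch of the same counting loop against the newer API; wordpunct_tokenize is only a stand-in tokenizer, and the stopword/punctuation set is built once instead of once per token.

import string
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def count_tokens(lines):
    # Build the filter set once, not per token as in the example above.
    stop = set(stopwords.words('english')) | set(string.punctuation)
    fd = FreqDist()
    for line in lines:
        for word in wordpunct_tokenize(line.lower()):
            if word not in stop:
                fd[word] += 1   # NLTK 3 replacement for fd.inc(word)
    return fd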
Example #2
0
class Index:
    """
    The Index class stores an index for a document.
    """
    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            self.index(self._document)
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
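A hedged usage sketch for Example #2, just to show that tf(term) is the term's count divided by the document's token count. WhitespaceIndex and its terms() are invented here, since terms() is not shown above, and the run needs NLTK 2.x because index() calls FreqDist.inc().

class WhitespaceIndex(Index):
    def terms(self):
        # Assumed tokenization; the original terms() is not shown above.
        return self._document.split()

idx = WhitespaceIndex()
idx.index("to be or not to be")
print(idx.freq("to"))   # 2
print(idx.tf("be"))     # 2 / 6 tokens = 0.333...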
Example #3
0
    def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
              pos_vocab, synset_vocab, stemmer):
        d = Document()
        assert language == self.lang

        if self._id:
            d.id = self._id
        else:
            d.id = num

        d.language = language
        d.title = self.title.strip()
        num_sentences = max(self._sentences) + 1

        tf_token = FreqDist()
        for ii in self.tokens():
            tf_token.inc(ii)

        for ii in xrange(num_sentences):
            s = d.sentences.add()
            for jj in self._sentences[ii]:
                w = s.words.add()
                w.token = token_vocab[jj.word]
                w.lemma = lemma_vocab[jj.lemma]
                w.pos = pos_vocab[jj.pos]
                w.relation = pos_vocab[jj.rel]
                w.parent = jj.parent
                w.offset = jj.offset
                w.tfidf = token_df.compute_tfidf(jj.word,
                                                 tf_token.freq(jj.word))
        return d
Example #4
0
def dotranslate(sent, parser, tdop):
	# todo: tokenize sentence by maximizing unigram probabilities
	# in training corpus, to detect multiword units
	sent = sent.split()

	# parse sentence with bitpar, gives an n-best list
	try:
		parsetrees1 = list(parser.nbest_parse(sent))
	except Exception as e:
		parsetrees1 = []
		print "parsing failed", e
		return (), {}

	# undo binarization and auxiliary POS tags introduced to accommodate bitpar:
	parsetrees = FreqDist()
	for tree in parsetrees1:
		tree.un_chomsky_normal_form()
		parsetrees.inc(removeforcepos(tree).freeze(), count=tree.prob())

	# for each parsetree, get a list of translations
	resultfd = {}
	for m, tree in enumerate(parsetrees):
		print "parse tree", tree
		for nn, (result, prob) in enumerate(
			tdop.get_mlt_deriv_multi(tree, smoothing=True, verbose=False)):
			if not result: continue
			key = (undecorate_with_ids(result).freeze(),
				sum(1 if "@" in a.node else 0 for a in result.subtrees()))
			resultfd[key] = resultfd.get(key, 0.0) + prob
	return parsetrees, resultfd
Example #5
0
def sent_length_fdist_single(address, exclude=excludePuncts(), corpus=inaugural):
	fd = FreqDist()
	
	for sent in corpus.sents(address):
		nopunct_sent = [word for word in sent if not word in exclude]
		fd.inc(len(nopunct_sent))
	
	return fd
Example #6
0
    # return the frequency distribution as the result
    return adist

# define a function to make a FreqDist from a list of tokens that has no tokens
#   that contain non-alphabetical characters or words in the stopword list

def alphaStopFreqDist(words, stoplist):
    # make a new frequency distribution called asdist
Example #7
0
def word_fdist_single(address, exclude=excludes(), corpus=inaugural):
	fd = FreqDist()
	
	for word in corpus.words(address):
		if not word.lower() in exclude:
			fd.inc(word.lower())
	
	return fd
Example #8
0
def word_fdist_single(address, exclude=excludes(), corpus=inaugural):
    fd = FreqDist()

    for word in corpus.words(address):
        if not word.lower() in exclude:
            fd.inc(word.lower())

    return fd
Example #9
0
def content_FreqDist_generator(articles_list):
    # get the FreqDist of all articles
    all_fdist = FreqDist()
    for article in articles_list:
        for item in article.content_freqDist().iteritems():
            key = item[0]
            value = item[1]
            all_fdist.inc(key, value)
    return all_fdist
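Since NLTK 3's FreqDist is a collections.Counter, merging per-article distributions no longer needs the explicit inc(key, value) loop of Example #9 (iteritems() is also Python 2 only). A hedged equivalent, assuming article.content_freqDist() returns a FreqDist as in the example:

from nltk import FreqDist

def content_freqdist_v3(articles_list):
    all_fdist = FreqDist()
    for article in articles_list:
        # Counter.update() adds counts rather than replacing them.
        all_fdist.update(article.content_freqDist())
    return all_fdist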
Example #10
0
def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
	total_fd = FreqDist()
	
	for address in address_list:
		fd = word_fdist_single(address, exclude, corpus)
		for word in fd.keys():
			total_fd.inc(word, fd[word])
	
	return total_fd
Example #11
0
def sent_length_fdist(address_list, exclude=excludePuncts(), corpus=inaugural):
    total_fd = FreqDist()

    for address in address_list:
        fd = sent_length_fdist_single(address, exclude, corpus)
        for len in fd.keys():
            total_fd.inc(len, fd[len])

    return total_fd
Example #12
0
def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
    total_fd = FreqDist()

    for address in address_list:
        fd = word_fdist_single(address, exclude, corpus)
        for word in fd.keys():
            total_fd.inc(word, fd[word])

    return total_fd
Example #13
0
def sent_length_fdist(address_list, exclude=excludePuncts(), corpus=inaugural):
	total_fd = FreqDist()
	
	for address in address_list:
		fd = sent_length_fdist_single(address, exclude, corpus)
		for len in fd.keys():
			total_fd.inc(len, fd[len])
	
	return total_fd
Example #14
    def __extract_level_words(self, levels_db, level, values):
        words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
        most_freq_words = {}
        for value in values:
            fdist = FreqDist()
            for word_dist in levels_db[level][value]:
                fdist.inc(word_dist[0], count = word_dist[1])

            most_freq_words[value] = fdist.items()[:words_number_per_value]
        return most_freq_words
Example #15
0
def sent_length_fdist_single(address,
                             exclude=excludePuncts(),
                             corpus=inaugural):
    fd = FreqDist()

    for sent in corpus.sents(address):
        nopunct_sent = [word for word in sent if not word in exclude]
        fd.inc(len(nopunct_sent))

    return fd
Example #16
0
    def kneser_ney(self, context, word):
        """
        Return the log probability of a word given a context, using
        Kneser-Ney backoff.
        """

        bgram = (context, word)
        unigram_freq = FreqDist()

        theta = self._kn_concentration
        vocabulary = 1 / len(self._vocab_freq.keys())
        discount_delta = self._kn_discount
        unigram_T = len(self._context_freq.keys())
        bigram_T = self._context_freq[context]

        for i in self._gram_freq:
            unigram_freq.inc(i[1])

        # Unigram Restaurant
        # C_0,x
        count_unirest_wordTable = unigram_freq[word]
        # C_0,.
        count_unirest_allTable = unigram_freq.N()

        # u_Bigram Restaurant
        # C_u,x
        count_birest_wordTable = self._gram_freq[bgram]

        # C_u,.
        count_birest_allTable = self._context_freq[context]

        existingTable_numer = count_birest_wordTable - discount_delta
        existingTable_denom = theta + count_birest_allTable
        existingTable = existingTable_numer / existingTable_denom

        if existingTable < 0:
            existingTable = 0

        newTable_numer = theta + (bigram_T * discount_delta)
        newTable_denom = theta + count_birest_allTable
        newTable = newTable_numer / newTable_denom

        back_a_numer = count_unirest_wordTable - discount_delta
        back_a_denom = count_unirest_allTable + theta
        back_a = back_a_numer / back_a_denom
        if back_a < 0:
            back_a = 0

        back_b_numer = theta + (unigram_T * discount_delta)
        back_b_denom = count_unirest_allTable + theta
        back_b = back_b_numer / back_b_denom
        back_b = back_b * vocabulary

        result = existingTable + (newTable * (back_a + back_b))
        return lg(result)
Example #17
    def kneser_ney(self, context, word):
        """
        Return the log probability of a word given a context, using
        Kneser-Ney backoff.
        """

        bgram = (context, word)
        unigram_freq = FreqDist()

        theta = self._kn_concentration
        vocabulary = 1 / len(self._vocab_freq.keys())
        discount_delta = self._kn_discount
        unigram_T = len(self._context_freq.keys())
        bigram_T = self._context_freq[context]

        for i in self._gram_freq:
            unigram_freq.inc(i[1])

        # Unigram Restaurant
        # C_0,x
        count_unirest_wordTable = unigram_freq[word]
        # C_0,.
        count_unirest_allTable = unigram_freq.N()

        # u_Bigram Restaurant
        # C_u,x
        count_birest_wordTable = self._gram_freq[bgram]

        # C_u,.
        count_birest_allTable = self._context_freq[context]

        existingTable_numer = count_birest_wordTable - discount_delta
        existingTable_denom = theta + count_birest_allTable
        existingTable = existingTable_numer / existingTable_denom

        if existingTable < 0:
            existingTable = 0

        newTable_numer = theta + (bigram_T * discount_delta)
        newTable_denom = theta + count_birest_allTable
        newTable = newTable_numer / newTable_denom

        back_a_numer = count_unirest_wordTable - discount_delta
        back_a_denom = count_unirest_allTable + theta
        back_a = back_a_numer / back_a_denom
        if back_a < 0:
            back_a = 0

        back_b_numer = theta + (unigram_T * discount_delta)
        back_b_denom = count_unirest_allTable + theta
        back_b = back_b_numer / back_b_denom
        back_b = back_b * vocabulary

        result = existingTable + (newTable * (back_a + back_b))
        return lg(result)
Example #18
0
    def __getTimelineFeatures(self, timeline):
        logger.info(u"Get timeline features")
        tweets = []
        self.__changePhase(PHASE["GET_TIMELINE_URLS"])
        for t in timeline:
            try:
                tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
            except:
                logger.exception(u"Error: \"" + unicode(t) + u"\"")
                raise ValueError(t)
            logger.debug(u"Tweet:" + unicode(tweet))
            tweets.append(tweet)

        urls = []
        ti = 0
        for tweet in tweets:
            for url in tweet.urls():
                self.__breakIfStopped()
                self.__urlResolver.addUrlToQueue(url)
                urls.append(url)
            logger.info(u"Tweet:" + unicode(tweet))
            ti += 1
            self.__proc = 100 * float(ti) / float(len(tweets))

        # Categories
        self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
        url2labels = {}
        ui = 0
        for url in urls:
            self.__breakIfStopped()
            if not url.isError():
                logger.debug(u"Classify " + unicode(url.getUrl()))
                url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
            ui += 1
            self.__proc = 100 * float(ui) / float(len(urls))

        labelsFreq = FreqDist()
        for labels in url2labels.values():
            for label in labels:
                labelsFreq.inc(label)
        self.__catFreq = labelsFreq.items()
        logger.info(u"Categories: "  + unicode(labelsFreq.items()))
        labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items() if item[0] not in ['short', 'medium', 'long']]
        # normalization
        labelsFreqValues = {label: float(freq) / float(max([f for l,f in labelsFreqValues])) for label, freq in labelsFreqValues}
        logger.info(u"Category factors: "  + unicode(labelsFreqValues))

        # Languages
        langFreq = FreqDist()
        for u in urls:
            langFreq.inc(u.lang())
        self.__langFreq = langFreq.items()
        logger.info(u"Languages: " + unicode(langFreq.items()))

        return labelsFreqValues
Example #19
0
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator == None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts == None:
                    starting.inc(state)
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, False, N)
        B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))
                               
        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
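Example #19 is essentially NLTK 2's HMM supervised-training code. Under NLTK 3, where FreqDist and ConditionalFreqDist cells are updated by item assignment instead of inc(), the frequency-collection step looks roughly like the sketch below. This is a hedged rewrite of just the counting loop, assuming tokens are (word, tag) pairs; it is not the library's current implementation.

from nltk import FreqDist, ConditionalFreqDist

def count_hmm_events(labelled_sequences):
    """Collect start-state, transition, and emission counts."""
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        last = None
        for word, tag in sequence:
            if last is None:
                starting[tag] += 1           # which states start a sentence
            else:
                transitions[last][tag] += 1  # state-to-state transitions
            outputs[tag][word] += 1          # symbols emitted in each state
            last = tag
    return starting, transitions, outputs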
Example #20
0
    def handle(self, *args, **options):
        fdist = FreqDist()
        print "Analyzing raw data"
        limit = 10
        if args:
            raw_datas = RawData.objects.filter(pk__in=args)
        else:
            raw_datas = RawData.objects.all()[:limit]
        tagged_data = []
        for raw_data in raw_datas:
            words = nltk.word_tokenize(raw_data.data)
            tagged_data.extend(nltk.pos_tag(words))
            for word in words:
                word = word.strip()
                if word:
                    fdist.inc(word)

        print "Analyzed %s items" % len(raw_datas)
        print

        print "Top word: %s" % fdist.max()
        print

        print "Top 10 words"
        for word in fdist.keys()[:10]:
            times = fdist[word]
            print " -- %s occurred %s times" % (word, times)
        print

        print "Bottom 10 words"
        for word in fdist.keys()[-10:]:
            times = fdist[word]
            print " -- %s occurred %s times" % (word, times)
        print

        print "Words occurring between 50-100 times"
        words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
        print ", ".join(words)

        cfdist = ConditionalFreqDist()
        for (word, tag) in tagged_data:
            cfdist[tag].inc(word)

        print "Most popular noun: %s" % cfdist["NN"].max()
        print

        print "Top 50 nouns"
        for word in cfdist["NN"].keys()[:50]:
            times = cfdist["NN"][word]
            print " -- %s occurred %s times" % (word, times)
        print
Example #21
0
def recompute_cluster_dists(text, cluster_descr):
    c_freqs = FreqDist()
    for c in text.clusters(cluster_descr):
        c_freqs.inc(c)
    c_dist = MLEProbDist(c_freqs)

    c_bi_freqs = FreqDist()
    for bi_c in bigrams(text.clusters(cluster_descr)):
        c_bi_freqs.inc(bi_c)
    c_bi_dist = MLEProbDist(c_bi_freqs)

    return c_dist, c_bi_dist
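Because an NLTK 3 FreqDist can be built directly from an iterable of samples, the two inc() loops in Example #21 can collapse into constructor calls. A hedged sketch, assuming text.clusters(cluster_descr) yields hashable cluster ids as above:

from nltk.probability import FreqDist, MLEProbDist
from nltk.util import bigrams

def recompute_cluster_dists_v3(text, cluster_descr):
    clusters = list(text.clusters(cluster_descr))
    c_dist = MLEProbDist(FreqDist(clusters))              # unigram cluster distribution
    c_bi_dist = MLEProbDist(FreqDist(bigrams(clusters)))  # bigram cluster distribution
    return c_dist, c_bi_dist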
Example #22
0
    #   and if it is not on the stop word list
    #     add it to the frequency distribution
    for word in words:
        if not pattern.match(word):
            if not word in stoplist:
                asdist.inc(word)
    # return the frequency distribution as the result
    return asdist
# Bigram frequency distribution function.
# This version also makes sure that each word in the bigram occurs in a word
#   frequency distribution without non-alphabetical characters and stopwords
#       This will also work with an empty stopword list if you don't want stopwords.
Example #23
0
	def mostprobableparse(self, sent, sample=None):
		"""warning: this problem is NP-complete. using an unsorted
		chart parser avoids unnecessary sorting (since we need all
		derivations anyway).
		
		@param sent: a sequence of terminals
		@param sample: None or int; if int then sample that many parses"""
		p = FreqDist()
		for a in self.parser.nbest_parse(sent, sample):
			p.inc(removeids(a).freeze(), a.prob())
		if p.max():
			return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
		else: raise ValueError("no parse")
Example #24
0
    def classify(self, sentence, tokenizer_lang, ngram_length=3):
        features = []
        for ii in self.tokenizers[tokenizer_lang].tokenize(sentence):
            d = {}
            for jj in ingrams(ii, ngram_length):
                d[jj] = d.get(jj, 0) + 1
            features.append(d)
        data = SparseDataSet(features)

        f = FreqDist()
        for ii in [self._labels[self._classifier.classify(data, x)[0]] for x in xrange(len(features))]:
            f.inc(ii)
        return f
Example #25
def mapper(key,value):
    sentence = value.split()
    for (index, tagtuple) in enumerate(sentence):
        token, tag = get_token_tag(tagtuple)
        if we_like(token, tag):
            fd = FreqDist()
            token = token.lower()
            window = sentence[index+1:index+5]
            for windowtuple in window:
                wtoken, wtag = get_token_tag(windowtuple)
                if we_like(wtoken, wtag):
                    wtoken = wtoken.lower()
                    fd.inc(wtoken)
            yield token, tuple(fd.items())
Example #26
0
def mapper(key, value):
    sentence = value.split()
    for (index, tagtuple) in enumerate(sentence):
        token, tag = get_token_tag(tagtuple)
        if we_like(token, tag):
            fd = FreqDist()
            token = token.lower()
            window = sentence[index + 1:index + 5]
            for windowtuple in window:
                wtoken, wtag = get_token_tag(windowtuple)
                if we_like(wtoken, wtag):
                    wtoken = wtoken.lower()
                    fd.inc(wtoken)
            yield token, tuple(fd.items())
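The mapper in Examples #25/#26 emits, for each kept token, the counts of the tokens that follow it within a four-word window. A hedged toy driver: get_token_tag and we_like are stand-ins invented here ("word/TAG" input, keep everything), since the real helpers are not shown, and the run needs NLTK 2.x because the mapper calls fd.inc().

def get_token_tag(tagtuple):
    token, _, tag = tagtuple.rpartition('/')   # assumed "word/TAG" format
    return token, tag

def we_like(token, tag):
    return bool(token)                         # keep every token in this toy

for token, counts in mapper(None, "the/DT cat/NN sat/VBD on/IN the/DT mat/NN"):
    print token, counts   # e.g. the -> (('cat', 1), ('sat', 1), ('on', 1), ('the', 1))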
Example #27
0
 def parse(self,doc,mode='list'):
     stream = self.makeTokenStream(doc)
     if mode == 'list':
         tokens = []
         while stream.incrementToken(): tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
     elif mode == 'set':
         tokens = set()
         while stream.incrementToken(): tokens.add(stream.getAttribute(CharTermAttribute.class_).toString())
     elif mode == 'FreqDist':
         tokens = FD()
         while stream.incrementToken(): tokens.inc(stream.getAttribute(CharTermAttribute.class_).toString())
     else:
         raise TypeError("Invalid mode type.")
     stream.close()
     return tokens
Example #28
    def classify(self, sentence, tokenizer_lang, ngram_length=3):
        features = []
        for ii in self.tokenizers[tokenizer_lang].tokenize(sentence):
            d = {}
            for jj in ingrams(ii, ngram_length):
                d[jj] = d.get(jj, 0) + 1
            features.append(d)
        data = SparseDataSet(features)

        f = FreqDist()
        for ii in [
                self._labels[self._classifier.classify(data, x)[0]]
                for x in xrange(len(features))
        ]:
            f.inc(ii)
        return f
Example #29
    def __call__(self, key, value):
        sent = value.split()
        for idx, tagged in enumerate(sent):
            token, tag = self.split_tagged(tagged)

            if self.valid(token, tag):
                dist = FreqDist()
                window = sent[idx + 1:idx + 5]

                for wtagged in window:
                    wtoken, wtag = self.split_tagged(wtagged)

                    if self.valid(wtoken, wtag):
                        dist.inc(wtoken)

                yield token, tuple(dist.items())
Example #30
    def __call__(self, key, value):
        sent = value.split()
        for idx, tagged in enumerate(sent):
            token, tag = self.split_tagged(tagged)

            if self.valid(token, tag):
                dist   = FreqDist()
                window = sent[idx+1:idx+5]

                for wtagged in window:
                    wtoken, wtag = self.split_tagged(wtagged)

                    if self.valid(wtoken, wtag):
                        dist.inc(wtoken)

                yield token, tuple(dist.items())
Example #31
0
def main():
    """
    Return the X most common stems from the dataset.
    X = VOC_NUMBER constant.
    """
    fdist = FreqDist()
    for line in fileinput.input():
        try:
            for stem in get_stems(line):
                if stem not in ENGLISH_STOPWORDS:
                    fdist.inc(stem)
        except UnicodeDecodeError:
            pass

    keys = fdist.keys()[:VOC_NUMBER]
    for s in keys:
        print s
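fdist.keys()[:VOC_NUMBER] in Example #31 relies on NLTK 2's behaviour of keys() returning samples sorted by decreasing frequency; in NLTK 3 keys() is an ordinary dict view, so the selection goes through most_common() instead. A small self-contained illustration:

from nltk import FreqDist

fd = FreqDist("abracadabra")   # counts characters: a=5, b=2, r=2, c=1, d=1
# NLTK 2: fd.keys()[:3] gave the three most frequent samples.
# NLTK 3: use most_common() instead.
print(fd.most_common(3))       # [('a', 5), ('b', 2), ('r', 2)] (tie order may vary)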
Example #32
0
class Text(object):
    def __init__(self, source, gen_func=lambda x: x):
        self.dictionary = Dictionary([gen_func(source)])
        self.gen_func = gen_func
        self.source = source

        self.word_freqs = FreqDist()
        for word in self.words():
            self.word_freqs.inc(word)

        self.word_dist = MLEProbDist(self.word_freqs)

    def words(self):
        return (self.dictionary.token2id[token] for token in self.gen_func(self.source))

    def clusters(self, cluster_descr):
        return (cluster_descr.index[word] for word in self.words())
Example #33
    def gen_word_freqs(self, train_sents):
        """
        Generates word frequencies from the training sentences for the feature
        classifier.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences.

        @rtype: C{FreqDist}
        @return: a L{frequency distribution<nltk.FreqDist()>},
        counting how often each word occurs in the training sentences.
        """
        word_freqdist = FreqDist()
        for tagged_sent in train_sents:
            for (word, _tag) in tagged_sent:
                word_freqdist.inc(word)
        return word_freqdist
Example #34
0
class Model():
    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        self.doc_freq = FreqDist()
        for count, (label, text) in enumerate(data, start=1):
            for word in set(
                    utils.tokenize(text, include_ngrams, limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print 'Min/max vocabulary frequency:', self.min_vocab_freq, self.max_vocab_freq

        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))

    def _is_valid_feature(self, feature):
        doc_freq = self.doc_freq[feature]
        return doc_freq > self.min_vocab_freq and doc_freq < self.max_vocab_freq
Example #35
0
class Model():

    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        self.doc_freq = FreqDist()
        for count, (label, text) in enumerate(data, start=1):
            for word in set(utils.tokenize(text, include_ngrams, limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print 'Min/max vocabulary frequency:', self.min_vocab_freq, self.max_vocab_freq

        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))

    def _is_valid_feature(self, feature):
        doc_freq = self.doc_freq[feature]
        return doc_freq > self.min_vocab_freq and doc_freq < self.max_vocab_freq
Example #36
0
class Text(object):
    def __init__(self, source, gen_func=lambda x: x):
        self.dictionary = Dictionary([gen_func(source)])
        self.gen_func = gen_func
        self.source = source

        self.word_freqs = FreqDist()
        for word in self.words():
            self.word_freqs.inc(word)

        self.word_dist = MLEProbDist(self.word_freqs)

    def words(self):
        return (self.dictionary.token2id[token]
                for token in self.gen_func(self.source))

    def clusters(self, cluster_descr):
        return (cluster_descr.index[word] for word in self.words())
Example #37
0
def build_vocabulary(save_state_file='state.pkl.gz'):
    counter = FreqDist()
    total_line_count = 0
    for url_suffix in urls_1grams:
        print url_suffix
        current_line_num = 0
        for line in buffered_download(base_url_1grams % url_suffix):
            current_line_num += 1
            total_line_count += 1
            try:
                tokens, year, total_count, _ = parse_line(line)
                counter.inc(tokens[0], total_count)
            except:
                print "error parsing line"
                print line
        if save_state_file:
            print 'saving state'
            save_state(save_state_file, counter, current_line_num, url_suffix)
    return counter
Example #38
0
def parse(doc,mode='list'):
    '''
    Morphologically analyze the string doc with Solr's JapaneseAnalyzer and return the result in the form specified by mode.
    mode : 'list' or 'set' or 'FreqDist'
    '''
    stream = _makeTokenStream(doc)
    if mode == 'list':
        tokens = []
        while stream.incrementToken(): tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'set':
        tokens = set()
        while stream.incrementToken(): tokens.add(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'FreqDist':
        tokens = FD()
        while stream.incrementToken(): tokens.inc(stream.getAttribute(CharTermAttribute.class_).toString())
    else:
        raise TypeError("Invalid mode type.")
    stream.close()
    return tokens
Example #39
0
def run_EM(no_of_iter, ten_de):
    #Run EM for specified number of iterations
    print("Running EM")
    ## pseudocode from http://www.statmt.org/mtm2/data/day2-1x2.pdf
    ##    do until convergence
    ##    set count(e|f) to 0 for all e,f
    ##    set total(f) to 0 for all f
    ##    for all sentence pairs (e_s,f_s)
    ##    for all words e in e_s
    ##    total_s(e) = 0
    ##    for all words f in f_s
    ##    total_s(e) += t(e|f)
    ##    for all words e in e_s
    ##    for all words f in f_s
    ##    count(e|f) += t(e|f) / total_s(e)
    ##    total(f) += t(e|f) / total_s(e)
    ##    for all f
    ##    for all e
    ##    t(e|f) = count(e|f) / total(f)
    #print(ten_de)
    N = len(de_inp)
    for i in range(no_of_iter):
        print("Doing iteration "+str(i))
        totalde = FDist()
        counten_de = CondFDist()
        for sent in range(N):
            total_s = FDist()
            for en_word in en_inp[sent].split():
                for de_word in de_inp[sent].split():
                    total_s.inc(en_word, ten_de[de_word][en_word])
##                    print(en_word)
##                    print(de_word)
##                    print(total_s[en_word])
            for en_word in en_inp[sent].split():
                for de_word in de_inp[sent].split():
                    counten_de[de_word].inc(en_word, ten_de[de_word][en_word]/total_s[en_word])
                    totalde.inc(de_word, ten_de[de_word][en_word]/total_s[en_word])
        for de_word in ten_de.conditions():
            for en_word in ten_de[de_word].keys():
                ten_de[de_word][en_word] = counten_de[de_word][en_word]/totalde[de_word]
    return ten_de
Example #40
# Import the gutenberg corpus
from nltk.corpus import gutenberg

# Which corpora are in this collection?
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

# Import the FreqDist class
from nltk import FreqDist

# Instantiate a frequency distribution
fd = FreqDist()
# Count the tokens in the text
for word in gutenberg.words('austen-persuasion.txt'):
    fd.inc(word)

print(fd.N())  # total number of samples
# 98171
print(fd.B())  # number of bins or unique samples
# 6132
# Get the top 10 words sorted by frequency
for word in fd.keys()[:10]:
    print(word, fd[word])

# ================ runtime timing ================
run_time = time.time() - start_time
if run_time < 60:  # seconds with two decimal places
    print("Elapsed: {:.2f} s".format(run_time))
elif run_time < 3600:  # whole minutes and seconds
    print("Elapsed: {:.0f} min {:.0f} s".format(run_time // 60, run_time % 60))
Example #41
 def __call__(self, key, values):
     dist = FreqDist()
     for fd in values:
         for k, v in fd:
             dist.inc(k, v)
     yield key, tuple(dist.items())
Example #42
0
#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the text
# Then we will calculate the entropy of the bigram distribution

# create a list to store bigrams in
english_model_bigrams = []

index = 0
english_bigram_fdist = FreqDist()

# we'll be iterating through the string but stop one item short of normal
# this allows us to create bigram windows
while index < (len(english_model_content) - 1):
    english_bigram_fdist.inc(english_model_content[index:index + 2])
    index += 1

english_bigram_entropy = 0.0

# now loop and get the entropy for english bigrams
for bigram in english_bigram_fdist.samples():
    english_bigram_entropy += english_bigram_fdist.freq(bigram) * math.log(
        english_bigram_fdist.freq(bigram), 2)

english_bigram_entropy = -english_bigram_entropy

print "The English Bigram Entropy is: " + str(english_bigram_entropy)

#
# Third
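The loop in Example #42 computes the Shannon entropy H = -sum_x p(x) * log2 p(x) of the bigram distribution, with FreqDist.freq() supplying the relative frequency p(x) (samples() comes from the NLTK 2 API; iterating the FreqDist itself works in both 2.x and 3.x). A self-contained version of the same calculation:

import math
from nltk import FreqDist

def fdist_entropy(fd):
    # H = -sum_x p(x) * log2(p(x)), where p(x) = fd.freq(x)
    return -sum(fd.freq(x) * math.log(fd.freq(x), 2) for x in fd)

print(fdist_entropy(FreqDist("aabb")))   # 1.0 bit for a uniform two-symbol distribution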
Example #43
0
if __name__ == "__main__":
    import nltk
    from nltk import FreqDist
    from nltk.corpus import gutenberg, brown, treebank
    import re

    # Find all examples of "thou *th"
    thou_regexp = re.compile(r"[Tt]hou\s[\w]*t\s")
    thou_count = FreqDist()
    for ii in thou_regexp.findall(gutenberg.raw('bible-kjv.txt')):
        thou_count.inc(ii)
    print("\n".join("%s:%i" % (x, thou_count[x])
                    for x in thou_count.keys()[:10]))

    # Find everything that looks like a street
    street_regexp = re.compile(r"[A-Z]\w*\s[S]treet")
    for fileid in gutenberg.fileids():
        print(fileid, street_regexp.findall(gutenberg.raw(fileid)))

    print("-----------------------------------------")

    # Find repeated words
    repeat_regexp = re.compile(r'\b(\w+)\s(\1\b)+')
    for fileid in gutenberg.fileids():
        matches = list(repeat_regexp.finditer(gutenberg.raw(fileid)))
        print(fileid, [x.group(0) for x in matches])

    print("-----------------------------------------")

    # Find repeated words separated by some other word
    repeat_regexp = re.compile(r"\b(\w+)\s\w+\s(\1\b)+")
Example #44
0
class Cosine():
    def __init__(self, stem=True, lemm=True):
        self.raw_inputs = []
        self.inputs = []
        self.vectors = []
        self.words = []
        self.fd = FreqDist()
        self.cos_values = []
        self.stemmer = nltk.porter.PorterStemmer()
        self.lemmatizer = nltk.wordnet.WordNetLemmatizer()
        self.lemm = lemm
        self.stem = stem
        return

    def set_input(self, txt):
        self.raw_inputs.append(txt)
        temp = []
        new_text = txt
        if self.stem:
            for word in nltk.word_tokenize(txt):
                if word.lower() in stopwords.words():
                    continue
                temp.append(self.stemmer.stem(word))
            new_text = ' '.join(temp)
        if self.lemm:
            for word in nltk.word_tokenize(new_text):
                if word.lower() in stopwords.words():
                    continue
                temp.append(self.lemmatizer.lemmatize(word))
            new_text = ' '.join(temp)
        self.inputs.append(new_text)
        return

    def setup_tftable(self):
        for txt in self.inputs:
            sents = nltk.sent_tokenize(txt)
            for sent in sents:  # for each sentence in the given text
                words = nltk.word_tokenize(sent)
                for word in words:
                    self.fd.inc(word)

        self.tftable = [[k, 0] for k in self.fd.keys()]
        return self.tftable

    def vectorize(self):
        tft = self.setup_tftable()
        vecs = []
        for txt in self.inputs:
            vecs.append(self.vectorize_one(txt))
        self.vectors = []
        for v in vecs:
            self.vectors.append(tuple(i[1] for i in v))
        return self.vectors

    def vectorize_one(self, txt):
        #we will take bag of words with word count
        myvector = copy.deepcopy(self.tftable)
        sents = nltk.sent_tokenize(txt)
        for sent in sents:  # for each sentence in the given text
            words = nltk.word_tokenize(sent)
            for word in words:
                for item in myvector:
                    if item[0] == word:
                        item[1] += 1
        return myvector

    #initialize a matrix that would contain cosine similarity value for each vector in the LVS against every other vector
    def init_cos_matrix(self, dim):
        values = []
        for m in range(dim):
            row = []
            for n in range(dim):
                row.append(None)
            values.append(row)
        return values

    def cosine(
        self,
        vecs=None
    ):  #returns the cosine similarity of the input vectors taken from self.vectors
        self.cos_values = self.init_cos_matrix(len(self.vectors))
        if vecs == None:
            vecs = self.vectors
        for u in range(len(vecs)):
            #self.cos_values.append([])
            for v in range(u, len(vecs)):
                angle = nltk.cluster.cosine_distance(vecs[u], vecs[v])
                value = math.cos(angle)
                self.cos_values[v][u] = self.cos_values[u][v] = (
                    angle,
                    value,
                )
        return self.cos_values

    def compute_similarity(self,
                           messages,
                           stem=True,
                           lemm=True,
                           threshold=0.75):
        #given a list of messages computes the similarity and returns the matrix
        #messages is of the form: [message1, message2, ..., messagen]
        self.stem = stem
        self.lemm = lemm
        for m in messages:
            self.set_input(m)
        self.vectorize()
        values = self.cosine()
        return values
Example #45
0
def train(db_name,
          samples=200000,
          classifier_type='naivebayes',
          extractor_type='words',
          best_features=10000,
          processes=8,
          purge=False):
    """
    Train with samples from an SQLite database and store the resulting classifier in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use stored in ~/.synt

    Keyword arguments:
    samples (int) -- Amount of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Amount of highly informative features to store.
    processes (int) -- The amount of processes to be used for counting features in parallel.
    purge (bool) -- If true will flush the redis database.
    """
    m = RedisManager(purge=purge)

    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if classifier_type in m.r.keys():
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier:  #classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    #retrieve training samples from database
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # retrieve the actual number of samples processed for each label
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get(
        'positive_processed')
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    labeled_feature_freqs = m.pickle_load('labeled_feature_freqs')
    labels = labeled_feature_freqs.keys()

    #feature extraction
    feat_ex = extractor()
    extracted_set = set([
        feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True)
        for label in labels
    ][0])

    # increment the number of times a given feature occurred for each label and fill in the missing occurrences with False
    for label in labels:
        samples = label_freqdist[label]
        for fname in extracted_set:
            trues = labeled_feature_freqs[label].get(fname, 0)
            falses = samples - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    #create the P(label) distribution
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    #create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label, fname] = probdist

    #TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)
Example #46
0
class CorpusReader:
    """
    A collection of documents
    """
    def __init__(self, base, doc_limit=-1, bigram_limit=-1):
        self._file_base = base
        self._files = defaultdict(set)
        self._total_docs = 0

        self._bigram_finder = {}
        self._bigram_limit = bigram_limit

        self._author_freq = FreqDist()
        self._word_df = defaultdict(DfCalculator)
        self._word_freq = defaultdict(FreqDist)
        self._lemma_freq = defaultdict(FreqDist)
        self._bigram_freq = defaultdict(FreqDist)
        self._tag_freq = defaultdict(FreqDist)
        self._synset_freq = FreqDist()

        self._author_lookup = {}
        self._word_lookup = defaultdict(dict)
        self._lemma_lookup = defaultdict(dict)
        self._stop_words = defaultdict(set)
        self._pos_tag_lookup = defaultdict(dict)
        self._bigram_lookup = defaultdict(dict)
        self._synset_lookup = {}

        self._doc_limit = doc_limit
        self._stemmer = Snowball()

    def lang_iter(self, lang):
        print "DOC LIMIT %i" % self._doc_limit
        file_list = list(self._files[lang])

        doc_num = 0

        file_list.sort()
        random.seed(0)
        random.shuffle(file_list)

        if len(file_list) > 100:
            for ff in file_list:
                for dd in self.doc_factory(lang, ff):
                    if self._doc_limit > 0 and doc_num >= self._doc_limit:
                        return
                    doc_num += 1
                    yield dd
        else:
            file_list = list(self.doc_factory(lang, x) for x in file_list)
            for dd in poll_iterator(file_list):
                if self._doc_limit > 0 and doc_num >= self._doc_limit:
                    return
                doc_num += 1
                yield dd

    def __iter__(self):
        """
        Return documents.
        """

        # We have two different types of behavior depending on the number of
        # files.  If we have lots of files, then just go through them

        for ll in self._files:
            for ii in self.lang_iter(ll):
                yield ii

    def sample(self, num_docs=-1, rand_seed=0):
        """
        Iterate over a subset of the documents.  Given the same random
        seed, the results should be consistent.
        """
        raise NotImplementedError

    def build_vocab(self):
        """
        Create counts for all of the tokens.  Does care about lemmatization and
        will create separate vocab for that.  Also ignores tags.
        """
        self._author_freq = FreqDist()

        print "Building vocab:"
        doc = 0
        for ii in self:
            doc += 1
            if doc % 100 == 0:
                print("Doc %i / %i (total estimated)" % \
                          (doc, self._total_docs))
                self._total_docs = max(self._total_docs, doc)
            for jj in ii.authors():
                self._author_freq.inc(jj)
            for jj in ii.lemmas(self._stemmer):
                try:
                    jj.encode("utf-8", "replace")
                    self._lemma_freq[ii.lang].inc(jj)
                except ValueError:
                    None
            for jj in ii.tokens():
                try:
                    jj.encode("utf-8", "replace")
                    self._word_freq[ii.lang].inc(jj)
                    self._word_df[ii.lang].word_seen(doc, jj)
                except ValueError:
                    None
            for jj in ii.synsets():
                self._synset_freq.inc(jj)
            for jj in ii.pos_tags():
                self._tag_freq[ii.lang].inc(jj)
            for jj in ii.relations():
                self._tag_freq[ii.lang].inc(jj)

        self._total_docs = doc
        self.init_stop()

        if self._bigram_limit > 0:
            for ii in self._word_freq:
                bf = BigramFinder(language=LANGUAGE_ID[ii])
                self._bigram_finder[ii] = bf
                bf.set_counts(self._word_freq[ii])
                print("Finding bigrams in language %i" % ii)

            doc = 0
            for ii in self:
                lang = ii.language()
                doc += 1
                if doc % 100 == 0:
                    print("Doc %i / %i" % (doc, self._total_docs))
                self._bigram_finder[lang].add_ngram_counts(ii.tokens())

            print("Scoring bigrams")
            bigrams = {}

            for lang in self._word_freq:
                bf = self._bigram_finder[lang]
                bf.find_ngrams([])

                bigrams[lang] = bf.real_ngrams(self._bigram_limit)
                print("First 10 bigrams")
                for ii in bigrams[lang].keys()[:10]:
                    print("%s_%s" % ii)

            print("Creating new counts after subtracting bigrams")
            doc = 0
            for ii in self:
                doc += 1
                lang = ii.language()
                bf = self._bigram_finder[lang]
                if doc % 100 == 0:
                    print("Doc %i / %i" % (doc, self._total_docs))

#                for jj in ii.tokens():
#                    self._bigram_freq[lang].inc(jj)

                for jj in ii.sentences():
                    for kk in iterable_to_bigram(jj, bigrams[lang],
                                                 bf.normalize_word):
                        self._bigram_freq[lang].inc(kk)

    def init_stop(self):
        """
        Requires vocabulary to be built first to know which languages appear.
        """
        s = StopWords()
        self._stop_words = defaultdict(set)
        for ll in self._word_freq:
            language_name = LANGUAGE_ID[ll]
            print "Loading stopwords for", ll, " from", language_name
            try:
                self._stop_words[ll] = s[language_name]
            except IOError:
                print "Could not load stop words for", language_name
            print "Loaded", len(self._stop_words[ll]), "words."

            # Make sure lemmatized versions are also in
            temp_stop = list(self._stop_words[ll])
            for ii in temp_stop:
                self._stop_words[ll].add(self._stemmer(ll, ii))

    def doc_factory(self, lang, filename):
        raise NotImplementedError

    def fill_proto_vocab(self, frequency_count, vocab_generator, lookup, name):
        for ll in frequency_count:
            voc = vocab_generator()
            voc.language = ll
            word_id = 0
            for tt in frequency_count[ll]:
                word = voc.terms.add()
                word.id = word_id
                word.original = tt
                word.ascii = tt.encode("ascii", "replace")
                word.frequency = frequency_count[ll][tt]
                word.stop_word = tt in self._stop_words[ll]
                if tt in lookup[ll]:
                    assert lookup[ll][tt] == word_id
                else:
                    lookup[ll][tt] = word_id

                if word_id < 50 or (word_id < 1000 and "_" in word.ascii):
                    print("%s\t%i\t%s\t%s\t%i" %
                          (name, word_id, word.ascii, str(
                              word.stop_word), word.frequency))

                word_id += 1

    def fill_proto_language_independent_vocab(self, frequency_count,
                                              vocab_generator, lookup, name):
        word_id = 0
        for tt in frequency_count:
            if not tt:
                continue
            word = vocab_generator()
            word.id = word_id
            word.original = tt
            word.ascii = tt.encode("ascii", "replace")
            word.frequency = frequency_count[tt]

            lookup[tt] = word_id

            word_id += 1

    def new_section(self):
        """
        Create a new corpus section and return it
        """
        c = Corpus()

        self.fill_proto_vocab(self._word_freq, c.tokens.add, self._word_lookup,
                              "TOKEN")
        self.fill_proto_vocab(self._bigram_freq, c.bigrams.add,
                              self._bigram_lookup, "BIGRAM")
        self.fill_proto_vocab(self._lemma_freq, c.lemmas.add,
                              self._lemma_lookup, "LEMMA")
        self.fill_proto_vocab(self._tag_freq, c.pos.add, self._pos_tag_lookup,
                              "TAG")
        self.fill_proto_language_independent_vocab(self._author_freq,
                                                   c.authors.terms.add,
                                                   self._author_lookup,
                                                   "AUTHOR")
        self.fill_proto_language_independent_vocab(self._synset_freq,
                                                   c.synsets.terms.add,
                                                   self._synset_lookup,
                                                   "SYNSET")

        return c

    def add_language(self, pattern, language=ENGLISH):
        search = self._file_base + pattern
        print "SEARCH:", search
        for ii in glob(search):
            self._files[language].add(ii)
            self._total_docs += 1

    def write_proto(self, path, name, docs_in_sec=10000):
        self.build_vocab()
        section = self.new_section()
        doc_id = 0

        bigram_list = {}
        if self._bigram_limit > 0:
            for lang in self._bigram_finder:
                bf = self._bigram_finder[lang]
                bigram_list[lang] = bf.real_ngrams(self._bigram_limit)

        for lang in self._files:
            doc_num = 0
            section_num = 0

            filename = "%s/%s_%s_%i" % (path, \
                       name, LANGUAGE_ID[lang], section_num)
            print path

            for doc in self.lang_iter(lang):
                if doc_num >= docs_in_sec:
                    print "Done with section ", \
                        section_num, " we've written ", doc_id
                    # Write the file
                    write_proto(filename + ".index", section)

                    section = self.new_section()
                    section_num += 1
                    doc_num = 0
                    filename = "%s/%s_%s_%i" % (path, name, LANGUAGE_ID[lang],
                                                section_num)
                if not os.path.exists(filename):
                    os.mkdir(filename)

                assert lang in self._word_lookup, "%i not in vocab, %s" % \
                    (lang, str(self._word_lookup.keys()))
                if doc_id % 100 == 0:
                    print "Writing out ", lang, filename, doc_id, "/", \
                        len(self._files[lang])

                if self._bigram_limit > 0:
                    bf = self._bigram_finder[lang]
                    doc_proto = doc.proto(doc_id, lang, self._author_lookup,
                                          self._word_lookup[lang],
                                          self._word_df[lang],
                                          self._lemma_lookup[lang],
                                          self._pos_tag_lookup[lang],
                                          self._synset_lookup, self._stemmer,
                                          self._bigram_lookup[lang],
                                          bigram_list[lang], bf.normalize_word)
                else:
                    doc_proto = doc.proto(doc_id, lang, self._author_lookup,
                                          self._word_lookup[lang],
                                          self._word_df[lang],
                                          self._lemma_lookup[lang],
                                          self._pos_tag_lookup[lang],
                                          self._synset_lookup, self._stemmer)

                write_proto("%s/%i" % (filename, doc_id), doc_proto)

                section.doc_filenames.append("%s_%s_%i/%i" % \
                                                 (name, LANGUAGE_ID[lang],
                                                  section_num, doc_id))
                doc_id += 1
                doc_num += 1

            # We don't want to mix languages, so we close out each section when
            # done with a language

            if doc_num > 0:
                write_proto(filename + ".index", section)
                section = self.new_section()
                doc_num = 0
                section_num += 1
                filename = "%s/%s_%s_%i" % (path, name, LANGUAGE_ID[lang],
                                            section_num)
                if not os.path.exists(filename):
                    os.mkdir(filename)
            print doc_id, " files written"
Example #47
0
class BigramLanguageModel:
    def __init__(self,
                 unk_cutoff,
                 jm_lambda=0.6,
                 dirichlet_alpha=0.1,
                 katz_cutoff=5,
                 kn_discount=0.1,
                 kn_concentration=1.0,
                 tokenize_function=TreebankWordTokenizer().tokenize,
                 normalize_function=lower):
        self._unk_cutoff = unk_cutoff
        self._jm_lambda = jm_lambda
        self._dirichlet_alpha = dirichlet_alpha
        self._katz_cutoff = katz_cutoff
        self._kn_concentration = kn_concentration
        self._kn_discount = kn_discount
        self._vocab_final = False

        self._tokenizer = tokenize_function
        self._normalizer = normalize_function

        # Add your code here!
        self._vocab_freq = FreqDist()
        self._gram_freq = FreqDist()
        self._context_freq = FreqDist()

        self._vocab_freq[kSTART] += kUNK_CUTOFF + 1
        self._vocab_freq[kEND] += kUNK_CUTOFF + 1

    def train_seen(self, word, count=1):
        """
        Tells the language model that a word has been seen @count times.  This
        will be used to build the final vocabulary.
        """
        assert not self._vocab_final, \
            "Trying to add new words to finalized vocab"

        self._vocab_freq.inc(word, count)

        return self._vocab_freq[word]

    def tokenize(self, sent):
        """
        Returns a generator over tokens in the sentence.  

        No modify
        """
        for ii in self._tokenizer(sent):
            yield ii

    def vocab_lookup(self, word):
        """
        Given a word, provides a vocabulary representation.  Words under the
        cutoff threshold should have the same value.  All words with counts
        greater than or equal to the cutoff should be unique and consistent.
        """

        assert self._vocab_final, \
            "Vocab must be finalized before looking up words"

        freqCount = self._vocab_freq[word]

        if freqCount > self._unk_cutoff:
            return word
        else:
            return "<UNK>"

    def finalize(self):
        """
        Fixes the vocabulary as static, preventing additional vocabulary from
        being added.

        No modify
        """
        self._vocab_final = True

    def tokenize_and_censor(self, sentence):
        """
        Given a sentence, yields a sentence suitable for training or
        testing.  Prefix the sentence with <s>, replace words not in
        the vocabulary with <UNK>, and end the sentence with </s>.

        No modify
        """
        yield self.vocab_lookup(kSTART)
        for ii in self._tokenizer(sentence):
            yield self.vocab_lookup(self._normalizer(ii))
        yield self.vocab_lookup(kEND)

    def normalize(self, word):
        """
        Normalize a word

        No modify
        """
        return self._normalizer(word)

    def mle(self, context, word):
        """
        Return the log MLE estimate of a word given a context.  If the
        MLE would be negative infinity, use kNEG_INF
        """
        prob = 0.0
        bgram = (context, word)

        numer = self._gram_freq[bgram]
        denom = self._context_freq[context]

        if denom == 0:
            return kNEG_INF

        if self._gram_freq[bgram] != 0:
            prob = numer / denom

        if prob == 0.0:
            return kNEG_INF
        else:
            return lg(prob)

    def laplace(self, context, word):
        """
        Return the log Laplace (add-one smoothed) estimate of a word given a context.
        """
        bgram = (context, word)

        numer = self._gram_freq[bgram] + 1
        denom = len(self._vocab_freq.keys()) + self._context_freq[context]

        prob = numer / denom

        return lg(prob)

    def good_turing(self, context, word):
        """
        Return the Good Turing probability of a word given a context
        """
        return 0.0

    def jelinek_mercer(self, context, word):
        """
        Return the Jelinek-Mercer log probability estimate of a word
        given a context; interpolates context probability with the
        overall corpus probability.
        """

        bigram = (context, word)
        bigram_prob = 0

        unigram_prob = (1 - self._jm_lambda) * (1 / len(self._vocab_freq))

        for i in self._gram_freq:
            if i == bigram:
                bigram_count = 1
                bigram_prob = (self._jm_lambda + unigram_prob) * bigram_count

        result = unigram_prob + bigram_prob
        return lg(result)

    def kneser_ney(self, context, word):
        """
        Return the log probability of a word given a context, using
        Kneser-Ney backoff.
        """

        bgram = (context, word)
        unigram_freq = FreqDist()

        theta = self._kn_concentration
        vocabulary = 1 / len(self._vocab_freq.keys())
        discount_delta = self._kn_discount
        unigram_T = len(self._context_freq.keys())
        bigram_T = self._context_freq[context]

        for i in self._gram_freq:
            unigram_freq.inc(i[1])

        # Unigram Restaurant
        # C_0,x
        count_unirest_wordTable = unigram_freq[word]
        # C_0,.
        count_unirest_allTable = unigram_freq.N()

        # u_Bigram Restaurant
        # C_u,x
        count_birest_wordTable = self._gram_freq[bgram]

        # C_u,.
        count_birest_allTable = self._context_freq[context]

        existingTable_numer = count_birest_wordTable - discount_delta
        existingTable_denom = theta + count_birest_allTable
        existingTable = existingTable_numer / existingTable_denom

        if existingTable < 0:
            existingTable = 0

        newTable_numer = theta + (bigram_T * discount_delta)
        newTable_denom = theta + count_birest_allTable
        newTable = newTable_numer / newTable_denom

        back_a_numer = count_unirest_wordTable - discount_delta
        back_a_denom = count_unirest_allTable + theta
        back_a = back_a_numer / back_a_denom
        if back_a < 0:
            back_a = 0

        back_b_numer = theta + (unigram_T * discount_delta)
        back_b_denom = count_unirest_allTable + theta
        back_b = back_b_numer / back_b_denom
        back_b = back_b * vocabulary

        result = existingTable + (newTable * (back_a + back_b))
        return lg(result)

    def dirichlet(self, context, word):
        """
        Additive smoothing, assuming independent Dirichlets with fixed
        hyperparameter.
        """

        prob = 0.0
        bgram = (context, word)

        numer = self._gram_freq[bgram] + self._dirichlet_alpha
        denom = self._context_freq[context] + (self._dirichlet_alpha *
                                               len(self._vocab_freq.keys()))

        prob = numer / denom

        return lg(prob)

    def add_train(self, sentence):
        """
        Add the counts associated with a sentence.
        """

        # You'll need to complete this function, but here's a line of code that
        # will hopefully get you started.

        # Add new vocab counts
        nopunc_tokenize = RegexpTokenizer(r'\w+')
        nopunc_list = nopunc_tokenize.tokenize(sentence)
        for i in nopunc_list:
            self._vocab_freq[i] += 1

        # Count occurrences of bigrams
        for context, word in bigrams(self.tokenize_and_censor(sentence)):
            x = (context, word)
            self._gram_freq.inc(x)
            self._context_freq.inc(context)

    def perplexity(self, sentence, method):
        """
        Compute the perplexity of a sentence given a estimation method

        No modify
        """
        return 2.0 ** (-1.0 * mean([method(context, word) for context, word in \
                                    bigrams(self.tokenize_and_censor(sentence))]))

    def sample(self, samples=25):
        """
        Sample words from the language model.
        
        @arg samples The number of samples to return.
        """
        yield ""
        return
Example #48
0
def pmi(a, b):
    return log(pairs[a, b]) - log(pairs.N()) - log(unigrams[a]) - log(
        unigrams[b]) + 2 * log(unigrams.N())


h = FrameHierarchy.load()
# training data contains a bad frame
valid_names = {f.name for f in h._frames.values()}

with codecs.open("../../../training/data/naacl2012/cv.train.sentences.json",
                 encoding="utf8") as train_file:
    train = [json.loads(line) for line in train_file]

unsorted_frames = ([(f['target']['spans'][0]['start'], f['target']['name'])
                    for f in s['frames']] for s in train)
frames = [[name for start, name in sorted(s) if name in valid_names]
          for s in unsorted_frames]
del unsorted_frames
unigrams = FreqDist(chain(*frames))
pairs = FreqDist(
    chain(*[[tuple(sorted(b)) for b in combinations(f, 2)] for f in frames]))
pmis = FreqDist({(a, b): pmi(a, b)
                 for a, b in pairs.keys()
                 if unigrams[a] >= THRESHOLD and unigrams[b] >= THRESHOLD})

unigrams_with_ancestors = FreqDist(unigrams)
for u in unigrams:
    for a in h.ancestors(h._frames[u]):
        unigrams_with_ancestors.inc(a.name)
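The pmi() in Example #48 is the pointwise mutual information log(P(a,b) / (P(a) P(b))) written out in count form: log c(a,b) - log N_pairs - log c(a) - log c(b) + 2 log N_unigrams. A tiny self-contained check with toy counts invented here:

from math import log
from nltk import FreqDist

unigrams = FreqDist({'a': 4, 'b': 2})   # 6 unigram tokens
pairs = FreqDist({('a', 'b'): 2})       # 2 co-occurrence pairs

def pmi(a, b):
    return (log(pairs[a, b]) - log(pairs.N())
            - log(unigrams[a]) - log(unigrams[b]) + 2 * log(unigrams.N()))

print(pmi('a', 'b'))   # log(1.0 / ((4/6) * (2/6))) ~= 1.504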
Example #49
0
def reducer(key, values):
    finalfd = FreqDist()
    for fd in values:
        for k, v in fd:
            finalfd.inc(k, v)
    yield key, tuple(finalfd.items())
Example #50
-1
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
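suffix_fdist.keys()[:100] in Example #50 again assumes NLTK 2's frequency-ordered keys(). Under NLTK 3 the suffix counting and the selection of the 100 most common suffixes can be written as below (the brown corpus is assumed to be downloaded, as in the example):

from nltk import FreqDist
from nltk.corpus import brown

suffix_fdist = FreqDist(suffix
                        for word in brown.words()
                        for suffix in (word[-1:].lower(),
                                       word[-2:].lower(),
                                       word[-3:].lower()))
common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]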