Example #1
    def __init__(self, fileid):
        try:
            # Reads the UDHR file
            corpus = udhr.raw(fileid)
        except Exception:
            print("UDHR language file " + fileid + " does not exist",
                  file=sys.stderr)
            sys.exit(1)

        # Generate training dataset, lowercase and newlines converted to space
        self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
        # Generate dev dataset
        self.dev = corpus[1000:1100]

        # Convert training words to single characters
        tokens = list(self.train)
        self.unigram = tokens
        self.bigram = list(nltk.bigrams(tokens))
        self.trigram = list(nltk.trigrams(tokens))
        # Generate unigram frequency distribution
        self.unigramFreq = FreqDist(self.unigram)
        # Generate bigram frequency distribution
        self.bigramFreq = ConditionalFreqDist(self.bigram)
        # Generate trigram frequency distribution
        self.trigramFreq = ConditionalFreqDist(
            list(((w0, w1), w2) for w0, w1, w2 in self.trigram))
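The trigram distribution above is conditioned on pairs of preceding characters. A minimal standalone sketch of the same pattern, using a hypothetical toy string instead of the UDHR corpus:

import nltk
from nltk import ConditionalFreqDist

# Character-level trigram model over a toy string
chars = list("abracadabra")
trigram_cfd = ConditionalFreqDist(
    ((c0, c1), c2) for c0, c1, c2 in nltk.trigrams(chars))
# Most frequent character following the pair ('a', 'b') -> 'r'
print(trigram_cfd[('a', 'b')].max())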
Example #2
    def build_top_words(self):
        pos_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'pos']
        neg_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'neg']

        pos_words = [token for (review, c) in pos_reviews for token in review]
        neg_words = [token for (review, c) in neg_reviews for token in review]

        fd_all = FreqDist(pos_words + neg_words)
        pos_class_words = [('pos', word) for word in pos_words]
        neg_class_words = [('neg', word) for word in neg_words]
        cfd_pos = ConditionalFreqDist(pos_class_words)
        cfd_neg = ConditionalFreqDist(neg_class_words)

        pos_word_count = len(pos_words)
        neg_word_count = len(neg_words)
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}

        for (word, freq) in fd_all.items():
            pos_score = BigramAssocMeasures.chi_sq(cfd_pos['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cfd_neg['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        best = sorted(word_scores.items(), reverse=True,
                      key=lambda x: x[1])[:1000]
        self.top_words = set([w for w, s in best])
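The chi-squared scores above come from NLTK's association measures. A minimal sketch of a single call with made-up counts (a word occurring 30 times in 'pos' documents, 50 times overall, with 4000 'pos' tokens out of 10000 total):

from nltk.metrics import BigramAssocMeasures

# Hypothetical counts: n_ii=30, marginals (50, 4000), total 10000
score = BigramAssocMeasures.chi_sq(30, (50, 4000), 10000)
print(score)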
Example #3
def train_model():
    """Create ngram model from Project Gutenberg texts"""
    text = ''
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            text += file_.read().replace('\n', '')

    sents = sent_tokenize(text.lower())
    tokens = []
    # prepend a START and append an END token to each sentence
    for sent in sents:
        sent = 'START ' + sent + ' END'
        tokens += word_tokenize(sent)

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # bigram frequency distribution
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)

    # bigram probability distribution
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # conditional frequency distribution
    cfdist = ConditionalFreqDist(
        (ngram[:N_MINUS1], ngram) for ngram in ngrams_)

    # conditional probability
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
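The returned ConditionalProbDist objects map an (N-1)-token context to a Laplace-smoothed distribution over full n-grams. A minimal self-contained sketch of the same construction on a toy sentence (toy tokens are assumptions, not the document's CORPORA):

from nltk import ConditionalFreqDist, ConditionalProbDist, LaplaceProbDist
from nltk.util import ngrams

tokens = "START the cat sat on the mat END".lower().split()
# Condition each trigram on its first two tokens, then smooth
cfd = ConditionalFreqDist((gram[:2], gram) for gram in ngrams(tokens, 3))
cpd = ConditionalProbDist(cfd, LaplaceProbDist)
print(cpd[('the', 'cat')].prob(('the', 'cat', 'sat')))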
Example #4
 def __init__(self, file):
     corpus = udhr.raw(file)
     self.training_set = corpus[0:1000]
     token = list(self.training_set)
     self.unigram = token
     self.bigram = list(nltk.bigrams(token))
     self.trigram = list(nltk.trigrams(token))
     self.unigram_frequency = FreqDist(self.unigram)
     self.bigram_frequency = ConditionalFreqDist(self.bigram)
     self.trigram_frequency = ConditionalFreqDist(
         list(((x, y), z) for x, y, z in self.trigram))
Example #5
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                outputs[state][symbol] += 1
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))
                               
        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
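This method appears to come from NLTK's HMM tagger implementation. A minimal sketch of driving the same supervised training through the public HiddenMarkovModelTrainer API (assumes the treebank sample corpus has been downloaded):

from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTrainer

# Train on a small slice of tagged sentences, then tag a new sentence
trainer = HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(treebank.tagged_sents()[:100])
print(hmm_tagger.tag("the market was quiet".split()))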
Example #6
    def __init__(self, corpora):

        corpus = udhr.raw(corpora)

        self.TrainingSet = corpus[0:1000]
        token = list(self.TrainingSet)

        self.Uni = token
        self.Bi = list(nltk.bigrams(token))
        self.Tri = list(nltk.trigrams(token))

        self.UniFreq = FreqDist(self.Uni)
        self.BiFreq = ConditionalFreqDist(self.Bi)
        self.TriFreq = ConditionalFreqDist(
            list(((w1, w2), w3) for w1, w2, w3 in self.Tri))
Example #7
    def buildTransitionMatrix(self, tagged_corpus: list, train_size):
        train = tagged_corpus[:int(train_size * len(tagged_corpus))]
        random.shuffle(train)
        #construction of the transition matrix
        transition = ConditionalFreqDist()
        for (tag1, tag2) in train:

            if tag1 not in transition:
                transition[tag1] = FreqDist()
            if tag2 not in transition[tag1]:
                transition[tag1][tag2] = 0.0

            transition[tag1][tag2] += 1

        for tag in transition:
            somme = sum(transition[tag].values())
            for successor in transition[tag]:
                transition[tag][successor] = round(
                    transition[tag][successor] / somme, 6)

        self.TRANSITION_MATRIX = transition
        return transition
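Note that ConditionalFreqDist can also report relative frequencies directly, which gives the same row normalisation without the manual division. A small sketch with hypothetical tag bigrams:

from nltk import ConditionalFreqDist

pairs = [('DT', 'NN'), ('DT', 'NN'), ('DT', 'JJ'), ('NN', 'VB')]
cfd = ConditionalFreqDist(pairs)
# Relative frequency of 'NN' following 'DT'
print(round(cfd['DT'].freq('NN'), 6))  # 0.666667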
Example #8
 def __init__(self, sentences):
     # FIXME should use smoothing here. I tried SimpleGoodTuringProbDist but
     # it returns zero probability for event with freq=1. Possibly due to
     # too small test corpus
     self.cfd = ConditionalFreqDist(
         (ngram[:-1], ngram[-1]) for sentence in sentences
         for ngram in ngrams(sentence, 3, pad_left=True))
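With pad_left=True, the first trigrams are padded with None, so the distribution also covers sentence-initial contexts. A quick sketch on one hypothetical sentence:

from nltk import ConditionalFreqDist
from nltk.util import ngrams

sentence = ['the', 'cat', 'sat']
cfd = ConditionalFreqDist(
    (gram[:-1], gram[-1]) for gram in ngrams(sentence, 3, pad_left=True))
# Most likely word after the padded context (None, 'the')
print(cfd[(None, 'the')].max())  # 'cat'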
Example #9
 def get_bigrams(self, text):
     list_bigrams = bigrams(text)
     cfd = ConditionalFreqDist(list_bigrams)
     result = []
     for i in cfd:
         result.append(cfd[i])
     return result
Example #10
    def constructTransitionMatrix(self, sourceFilesList: list):
        # construction of the transition matrix
        fileFinal = ""
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding="windows-1256")
            for line in file:
                line = line.upper()
                if (len(line) > 1):
                    if not line.startswith("<S>"):
                        fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                    else:
                        fileFinal += line[:-1] + '\n'
            file.close()

        tokens = [el for el in re.split(r"\s+", fileFinal) if el != '']
        self.initialProbabilities = FreqDist([
            tokens[i] for i in range(1, len(tokens)) if tokens[i - 1] == '<S>'
        ])

        self.tags = list(set(tokens))
        self.bigramDist = FreqDist(list(bigrams(tokens)))
        Trigrams = list(trigrams(tokens))
        cfd = ConditionalFreqDist(((el[2], (el[0], el[1])) for el in Trigrams))

        for word in cfd.conditions():
            for bigram in cfd[word]:
                cfd[word][bigram] = round(
                    float("{0:.6f}".format(cfd[word].freq(bigram))), 6)

        self.TRANSITION_MATRIX = cfd
        return cfd
Example #11
def train_model_get_cosine_matrix(statements):
    statements = [statement.split() for statement in statements]

    frequencies = FreqDist(w for word in statements for w in word)

    conditionalFrequencies = ConditionalFreqDist(
                                (key,word)
                                for key in sorted(frequencies.keys())
                                for statement in statements
                                for word in statement 
                                if key in statement)
        
    pmi = [[npmi_scorer(frequencies[worda], 
                  frequencies[wordb], 
                  conditionalFrequencies[worda][wordb], 
                  len(frequencies.keys()),
                  2,
                  sum(frequencies[key] for key in frequencies.keys()))
        for wordb in sorted(frequencies.keys())]
        for worda in sorted(frequencies.keys())]
        
        
    pmi = np.array(pmi)
    pmi[np.isinf(pmi)] = -1
    pmi[np.where(pmi < 0)] = 0
        
    pmi = pd.DataFrame(pmi)
    pmi.columns = sorted(frequencies.keys())
    pmi.index = sorted(frequencies.keys())

    return pmi
Example #12
 def test_tabulate(self):
     empty = ConditionalFreqDist()
     self.assertEqual(empty.conditions(), [])
     with pytest.raises(ValueError):
         empty.tabulate(
             conditions="BUG")  # nonexistent keys shouldn't be added
     self.assertEqual(empty.conditions(), [])
Example #13
 def train(self):
     """
     This trains a simple baseline which just uses majority class voting for every word in vocabulary
     disregarding of its context
     """
     self.word_pos_cfd = ConditionalFreqDist(
         tp for seq_list in self.corpus.train
         for tp in seq_list.get_tag_word_tuples())
Example #14
 def test_plot(self):
     empty = ConditionalFreqDist()
     self.assertEqual(empty.conditions(), [])
     try:
         empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
     except:
         pass
     self.assertEqual(empty.conditions(), [])
Example #15
def find_language(string):
    text = string.split(" ")
    text = [word for word in text if word.isalpha()]
    l = len(text)
    avail_langs = [file for file in udhr.fileids() if 'Latin1' in file]
    cfd = ConditionalFreqDist(
        [(lang, word) for lang in avail_langs
         for word in [word for word in text if word in udhr.words(lang)]])
    ls = sorted([(lang, cfd[lang]) for lang in avail_langs],
                key=lambda tple: tple[1].N())
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(
        ls[-1][0].replace('-Latin1', ''), 100 * ls[-1][1].N() / l))
Example #16
    def __init__(self, corpus):
        """Initializer of the BigramWordCandidateProvider.

        Args:
            corpus: An iterable of word strings.
        """
        _bigrams = bigrams(corpus)
        self._cfd = ConditionalFreqDist(_bigrams)
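The provider's other methods are not shown here, but a conditional frequency distribution built from bigrams naturally supports next-word candidate lookup. A hypothetical sketch of that idea:

from nltk import ConditionalFreqDist, bigrams

cfd = ConditionalFreqDist(bigrams("the cat sat on the mat".split()))
# Words most often seen after "the"
print([w for w, _ in cfd["the"].most_common(2)])  # e.g. ['cat', 'mat']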
Example #17
    def train(self):
        """ Construct the conditional frequencies and probabilities """
        #extract tags from sentences

        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability that a certain tag is a certain word
        # e.g. probability that a VB is 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
Example #18
    def conditional_freq(self):
        result = []
        cfd = ConditionalFreqDist(self.bigram_list)

        for key, values in cfd.items():
            for word, freq in values.items():
                result.append((key, word, freq))

        return result
Example #19
 def suf_tag_freq(self):
     cfd = ConditionalFreqDist()
     for w in set(self.wt_freq.keys()) - set(self.c_words):
         for t in self.wt_freq[w].keys():
             for suf_len in range(1, max(self.max_suf_len, len(w))):
                 suf = w[-suf_len:]
                 cfd[suf][t] += self.wt_freq[w][t]
             cfd[''][t] += 1
     return cfd
Example #20
    def tabulateWordsInAllGeners(self, theWords):
        """
		find the distribution of a word within all Brown corpus genres
		@params theWord: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((genre, word)
                                  for genre in brown.categories()
                                  for word in brown.words(categories=genre))
        cdf.tabulate(samples=theWords, conditions=brown.categories())
Example #21
 def __init__(self, n, training_data):
     """Create an n order model using training_data."""
     # Set n and train
     self._n = n
     train_ngrams = _make_ngram_tuples(training_data, self._n)
     self._cfd = ConditionalFreqDist(
         (context, event) for (context, event) in train_ngrams)
     self._estimators = dict((context, self._cfd[context])
                             for context in self._cfd.conditions())
Example #22
def language_model(collection):
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist
    words = tokenize_collection(collection)
    freq_model = ConditionalFreqDist(bigrams(words))
    prob_model = ConditionalProbDist(freq_model, MLEProbDist)
    return prob_model
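A minimal self-contained version of the same MLE bigram model on a toy token list (the sentence is an assumption for illustration):

from nltk import ConditionalFreqDist, ConditionalProbDist, MLEProbDist, bigrams

words = "the cat sat on the mat".split()
prob_model = ConditionalProbDist(ConditionalFreqDist(bigrams(words)), MLEProbDist)
# Probability of 'cat' given the previous word 'the'
print(prob_model['the'].prob('cat'))  # 0.5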
Example #23
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            fewer than C{cutoff} times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()
        
        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
Example #24
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
    result = baseline_tagger.evaluate(test_set)
    return result
Example #25
 def __get_conditional_freq_dist(self):
     t = trange(
         len(self.__ngram),
         desc=
         f'Creating Conditional frequency distributions for {len(self.__ngram[0])}-gram'
     )
     condition_pairs = []
     for i in t:
         words = self.__ngram[i]
         condition_pairs.append((tuple(words[:-1]), words[-1]))
     return ConditionalFreqDist(condition_pairs)
Example #26
def display():
    import pylab
    words_by_freq = FreqDist(brown.words(categories='news')).most_common(2**15)
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
Example #27
def conditional_dist():
    cfdist = ConditionalFreqDist()
    fileids = corpus.gutenberg.fileids()
    for fileid in fileids:
        filteredText = freq_dist_filter(corpus.gutenberg.words(fileid))
        for word in filteredText:
            cfdist[fileid][word] += 1
    return cfdist
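The same distribution can also be built in one pass from (condition, sample) pairs; a sketch using the snippet's own freq_dist_filter helper (which is not defined above):

cfdist = ConditionalFreqDist(
    (fileid, word)
    for fileid in corpus.gutenberg.fileids()
    for word in freq_dist_filter(corpus.gutenberg.words(fileid)))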
Example #28
 def __init__(self):
     """Initializes the del_probs and ins_probs variables to empty MLE probability distributions,
     and the sub_probs to an empty conditional probability distribution."""
     self.del_probs = MLEProbDist(
         FreqDist()
     )  # a MLE probability distribution representing how likely each character is to be deleted
     self.ins_probs = MLEProbDist(
         FreqDist()
     )  # a MLE probability distribution representing how likely each character is to be inserted
     self.sub_probs = ConditionalProbDist(
         ConditionalFreqDist(), MLEProbDist
     )  # a Conditional Probability Distribution representing how likely a given character is to be replaced by another character
Example #29
    def tabulateWordsInPeriods(self, theWords):
        """
		find the distribution of words within the years, based in Inaugural corpus
		@params theWords: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((textid[:4], target)
                                  for textid in inaugural.fileids()
                                  for word in inaugural.words(textid)
                                  for target in theWords
                                  if word.lower().startswith(target)
                                  or word.lower().endswith(target))
        cdf.tabulate()
Example #30
    def generateText(self, text, word, num=15):
        """
		Generate semi-random text based on what's the likelihood of two words to appear together
		depending on the frequency distribution of a text bigrams
		@params text: the target text
		@params word: the seed word
		@params num: the length of the generated text, set to 15 as a default
		"""
        bigrams = nltk.bigrams(text)
        cfdist = ConditionalFreqDist(bigrams)
        for i in range(num):
            print(word, end=' ')
            word = cfdist[word].max()
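Hypothetical usage, following the classic NLTK book pattern (assumes the genesis corpus is available and that `gen` is an instance of the surrounding class):

import nltk
# Generate 15 words starting from the seed word 'living'
gen.generateText(nltk.corpus.genesis.words('english-kjv.txt'), 'living')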