Code example #1
    def refine(self,
               labeled_featuresets,
               entropy_cutoff,
               depth_cutoff,
               support_cutoff,
               binary=False,
               feature_values=None,
               verbose=False):
        if len(labeled_featuresets) <= support_cutoff: return
        if self._fname is None: return
        if depth_cutoff <= 0: return
        for fval in self._decisions:
            fval_featuresets = [(featureset, label)
                                for (featureset, label) in labeled_featuresets
                                if featureset.get(self._fname) == fval]

            label_freqs = FreqDist(label
                                   for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset,
                                              label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)
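
Note on the snippet above: the entropy test is the stopping criterion; a branch is only re-trained while the label distribution under it is still impure. A minimal, self-contained sketch of that check (the labels here are hypothetical, not from the original project):

from nltk.probability import FreqDist, MLEProbDist, entropy

labels = ['pos', 'pos', 'pos', 'neg']      # hypothetical labels falling under one feature value
label_freqs = FreqDist(labels)
print(entropy(MLEProbDist(label_freqs)))   # ~0.811 bits; refine() only recurses if this exceeds entropy_cutoff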
Code example #2
 def __init__(self, freqdist, bins=None):
     MLEProbDist.__init__(self, freqdist, bins)
     self._probarray = np.zeros((len(freqdist), ))
     self._probmap = {}
     for i, item in enumerate(freqdist.keys()):
         self._probarray[i] = freqdist.freq(item)
         self._probmap[i] = item
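
Side note (not part of the snippet above): MLEProbDist.prob() is just the relative frequency, so the cached array holds the same values FreqDist.freq() would return. A quick check with made-up counts:

from nltk.probability import FreqDist, MLEProbDist

freqdist = FreqDist({'x': 2, 'y': 6})   # hypothetical counts
assert MLEProbDist(freqdist).prob('x') == freqdist.freq('x') == 0.25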
Code example #3
File: generator.py  Project: tocubed/imitare
    def generate_alternative(self, n):
        """
        Generate n words using a more complicated algorithm
        """
        generated_tags = []
        generated_lemmas = []
        generated_words = []

        # Incrementally generate (tag, lemma) pairs
        for i in range(n):
            tag_choice = None # Start with nothing

            # Loop through n-grams of grammar
            size = 2 * self._n
            while size > 2:
                tag_choices = self._tags_ngram.backoff_search(
                    generated_tags, backoff_limit=2, predicate=lambda tag: True, start_n=size)

                # Determine valid lemmas in context with these tag choices
                tag_to_lemma = {}
                if tag_choices is not None:
                    for tag, _ in tag_choices.items():
                        # For each tag, find valid lemmas in context with that tag
                        lemma = self._lemmas_ngram.choose_word(
                            generated_lemmas, backoff_limit=2, predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                        if lemma is not None:
                            tag_to_lemma[tag] = lemma

                    if len(tag_to_lemma) > 1:
                        # We have found valid (tag, lemma) pairs
                        tag_probdist = MLEProbDist(FreqDist(
                            {tag: freq for tag, freq in tag_choices.items() if tag in tag_to_lemma}))
                        tag_choice = tag_probdist.generate() # Randomly select the tag
                        lemma_choice = tag_to_lemma[tag_choice] # Set the lemma
                        break
                size -= 1 # Lower to smaller n-gram for more tag choices

            if tag_choice is None:
                # We still didn't find a valid (tag, lemma) pair, fallback
                tag_choice = MLEProbDist(tag_choices).generate()
                lemma_choice = MLEProbDist(
                    self._tag_lemmas[tag_choice]).generate()

            generated_tags.append(tag_choice)
            generated_lemmas.append(lemma_choice)

        # Generate all words based on (tag, lemma) pairs
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words, backoff_limit=2, predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))
Code example #4
File: epidemic.py  Project: haohu1/seattle_flu
def add_individual(number_individuals, res_address, diagnosis):
    total_individuals = []
    new_address = res_address.sample(number_individuals).to_dict('records')
    for idx in xrange(number_individuals):
        diagnosis_freq_dist = FreqDist(diagnosis)
        diagnosis_prob_dist = MLEProbDist(diagnosis_freq_dist)
        diagnosis_random = diagnosis_prob_dist.generate()
        full_address = new_address[idx]['ADDR_FULL'] + '|' + new_address[idx]['CTYNAME'] + '|' + new_address[idx]['ZIP5']
        gender, age = get_gender_age(new_address[idx])
        new_individual = {'Date_Inf': current_date, 'Gender': gender, 'Age': age, 'Census_Tract': new_address[idx]['GEOID'], 'Address':full_address, 'LON':new_address[idx]['LON'], 'LAT':new_address[idx]['LAT'], 'Diagnosis': diagnosis_random}
        total_individuals.append(new_individual)
    return pd.DataFrame.from_records(total_individuals)
Code example #5
File: epidemic.py  Project: haohu1/seattle_flu
def get_gender_age(full_address):
    GEOID = full_address['GEOID']
    try:
        age_gender_dist = KC_age_gender.loc[[GEOID]].loc[:,'M0-4':'F85-120']
        age_gender_freq_dist = FreqDist(age_gender_dist)
        age_gender_prob_dist_age_gender = MLEProbDist(age_gender_freq_dist)
        age_gender_random = age_gender_prob_dist_age_gender.generate()
        gender = age_gender_random[0]
        age = age_gender_random[1:]
        return gender, age
    except:
        return np.nan, np.nan
Code example #6
def gen_sent(ngram):

    global lis

    # "n" contains the ngram number
    n = lis[1]
    #number of required sentences is stored in sent_num
    sent_num = lis[2]
    i = 0
    for i in range(sent_num):
        j = True

        # we are using this window to go through the sentence with n-1 previous
        # words stored in the window
        window = []
        sent = ""
        for size in range(n - 1):
            window.append('<start>')
        while j == True:
            tup_win = tuple(window)
            if tup_win not in ngram.keys():
                sys.exit("We don't have a start line")

            # FreqDist and MLEProbDist turn the raw counts into probabilities
            # (each item's frequency divided by the sum of all frequencies)
            freq_dist = FreqDist(ngram[tup_win])

            # prob_dist.generate() samples a random token
            # according to that probability distribution
            prob_dist = MLEProbDist(freq_dist)
            next_w = prob_dist.generate()

            #the following condition is used to detect the end of line
            if (next_w == "." or next_w == "?" or next_w == "!"):
                j = False
                sent += next_w
                continue

            # Make sure contraction/apostrophe tokens get no space before them,
            # and likewise at the beginning of the line
            elif (next_w == "m" or next_w == "s" or next_w == "re"
                  or next_w == "," or next_w == "’" or next_w == "ve"
                  or next_w == "t" or tup_win[-1] == '<start>'):
                sent += next_w
            else:
                sent += " %s" % next_w

            #moving the window forward by popping and appending
            window.pop(0)
            window.append(next_w)

        print("\nSentence %s:\n%s" % (i + 1, sent))
Code example #7
File: ngram.py  Project: gpaulbr/ERelp
def _estimator(fdist, bins):
    """
    Default estimator function using an MLEProbDist.
    """
    # can't be an instance method of NgramModel as they 
    # can't be pickled either.
    return MLEProbDist(fdist)
Code example #8
def TransitionsGenerate(
    AddCorpus, train_p, tagger, estimator
):  #recalculate the transition matrix using train plain text + additional corpus

    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    data = train_p
    data.extend(AddCorpus)
    print(type(data))
    for s in data:
        s = list(s)

    N = len(tagger._states)

    transitions = ConditionalFreqDist()
    for sentence in data:
        lasts = None
        sentence = list(sentence.strip('\n'))
        for character in sentence:
            state = character
            if not lasts is None:
                transitions[lasts][state] += 1
            lasts = state
    A = ConditionalProbDist(transitions, estimator, N)
    return A
Code example #9
File: hmm.py  Project: VinodhSubramanian1193/NLP
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        :return: the trained model
        :rtype: HiddenMarkovModelTagger
        :param labelled_sequences: the training data, a set of
            labelled sequences of observations
        :type labelled_sequences: list
        :param kwargs: may include an 'estimator' parameter, a function taking
            a FreqDist and a number of bins and returning a CProbDistI;
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
        known_states = set(self._states)

        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting.inc(state)
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in known_states:
                    self._states.append(state)
                    known_states.add(state)

                if symbol not in known_symbols:
                    self._symbols.append(symbol)
                    known_symbols.add(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
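
The estimator passed to ConditionalProbDist above is a factory: it is called once per condition with that condition's FreqDist and the number of bins, which the MLE estimator simply ignores. A minimal sketch with hypothetical tag-transition counts:

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

transitions = ConditionalFreqDist()
transitions['DT']['NN'] += 2                          # hypothetical counts
transitions['DT']['JJ'] += 1
estimator = lambda fdist, bins: MLEProbDist(fdist)
A = ConditionalProbDist(transitions, estimator, 2)    # the bins argument is unused by the MLE estimator
print(A['DT'].prob('NN'))                             # 2/3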
Code example #10
File: ngram.py  Project: yonas-g/HornMorpho
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and returns
              a C{ConditionalProbDist}
        """

        self._n = n

        if estimator == None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('', ) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, len(cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
Code example #11
def trainModelLM(laplace, symbols, train_output, train_transition):
    extra_set = []
    for i in symbols:
        for j in symbols:
            extra_set.append((i, j))

    transition = suppleText(train_transition)
    initial = []
    output = []
    for i in range(len(train_output)):
        initial.append(train_output[i][0][1])
        for j in range(len(train_output[i])):
            output.append(train_output[i][j])

    if laplace:
        transition += extra_set
        initial += symbols
        output += extra_set
    transition_cfd = ConditionalFreqDist(transition)
    transition_cqd = ConditionalProbDist(transition_cfd, MLEProbDist)
    inital_cfd = FreqDist(initial)
    initial_cqd = MLEProbDist(inital_cfd)
    output_cfd = ConditionalFreqDist(output)
    output_cqd = ConditionalProbDist(output_cfd, MLEProbDist)
    model = hmm.HiddenMarkovModelTagger(symbols=symbols,
                                        states=symbols,
                                        transitions=transition_cqd,
                                        outputs=output_cqd,
                                        priors=initial_cqd)
    return model
Code example #12
def plot_word_dist_as_cloud(word_dist, file_name=None, plot=False):
    prob_dist = MLEProbDist(word_dist)
    viz_dict = {}
    for word_tuple in word_dist:
        string = ' '.join(word_tuple)
        viz_dict[string] = prob_dist.prob(word_tuple)

    wordcloud = WordCloud(max_words=100).generate_from_frequencies(viz_dict)
    if file_name != None:
        wordcloud.to_file("img/" + file_name +".png")

    if plot:
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
Code example #13
def train_supervised2(trainer,
                      labelled_sequences,
                      plain_sequences,
                      estimator=None):
    _TAG = 1
    _TEXT = 0
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(trainer._symbols)
    known_states = set(trainer._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    # =================code added to supplement transition matrix====================
    for sequence in plain_sequences:
        lasts = None
        for token in sequence:
            if lasts is None:
                pass
            else:
                transitions[lasts][token] += 1
            lasts = token

            if token not in known_states:
                trainer._states.append(token)
                known_states.add(token)
    # ================================end============================================
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                trainer._states.append(state)
                known_states.add(state)

            if symbol not in known_symbols:
                trainer._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(trainer._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(trainer._symbols))

    return HiddenMarkovModelTagger(trainer._symbols, trainer._states, A, B, pi)
Code example #14
 def _generate_one_predicated(self, context, backoff_limit, predicate):
     context = tuple(context)[1 - self._n:]
     choices = self.backoff_search(context, backoff_limit,
                                   predicate)  # Possible tokens
     if choices is not None:
         return MLEProbDist(choices).generate()
     else:
         return None
Code example #15
 def _generate_one(self, context, backoff_limit):
     context = tuple(context)[1 - self._n:]
     while not context in self and len(context) >= backoff_limit:
         context = context[1:]
     if context in self:
         return MLEProbDist(
             self[context]).generate()  # Select from possible tokens
     else:
         return None
Code example #16
File: ngram.py  Project: asingh62/NLP-Ngrams
def sentence_generator(gramFreq,numofsentences):
    i = 0
    for  i in range (numofsentences):
        sentenceGen = True
        sentencelist = ()
        generateSentence = ""
        for size in range (int(ngrams)-1):
            sentencelist += ('<start>',)   
        while sentenceGen == True:
            token_dict = {}
            for index, val in ngrams_frequency.items():
                index2 = index[:-1]
                if index2 == sentencelist:
                    token_dict.update({index[-1]: val})

            # generating frequency using the function
            frequencyDistribution = FreqDist(token_dict)

            # generating probability using the function
            probabilityDistribution = MLEProbDist(frequencyDistribution)

            # predicting the next word
            next_word = probabilityDistribution.generate()
            
            # words having ".,?,!"
            if (next_word =="." or next_word == "?" or next_word == "!"):
                sentenceGen = False
                generateSentence += next_word
                continue
            
            # words having , '
            elif (next_word == "," or next_word == "’"):
                generateSentence += next_word
                
            else:
                generateSentence += " %s"%next_word

            if len(sentencelist) != 0 :   
                my_list = list(sentencelist)
                my_list.pop(0)
                my_list.append(next_word)
                sentencelist = tuple(my_list)
        # Display sentences
        print ("\nSentence %s: %s"%(i+1,generateSentence))
Code example #17
def gen_sentence(ngram):
    global arg
    i = 0
    # n in ngrams
    n = arg[1]
    # number of sentences to generate
    m = arg[2]
    for i in range(m):
        j = True
        table = []
        sentence = ""
        for size in range(n - 1):
            table.append('<START>')
        while j == True:
            tuple_table = tuple(table)
            if tuple_table not in ngram.keys():
                # when start is not available
                sys.exit("No start line!")
            # generating frequency
            frequency = FreqDist(ngram[tuple_table])
            # generating probability
            probability = MLEProbDist(frequency)
            # predicting the next word
            pred_word = probability.generate()

            # words having ".,?,!"
            if (pred_word == "." or pred_word == "?" or pred_word == "!"):
                j = False
                sentence += pred_word
                continue
            # words having , ' or START tag
            elif (pred_word == "," or pred_word == "’"
                  or tuple_table[-1] == '<START>'):
                sentence += pred_word
            else:
                sentence += " %s" % pred_word
            table.pop(0)
            table.append(pred_word)
        # Display sentences
        print("\nSentence %s:\n%s" % (i + 1, sentence))
Code example #18
    def __init__(self, corpus, n, estimator=None):
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)
        bi = []
        self._l = []
        for tree in corpus[:n]:
            ts = tree.leaves()
            sent = ['START'] + ts
            bi += nltk.bigrams(sent)
            self._l.append(len(sent))

        cfd = ConditionalFreqDist(bi)
        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
Code example #19
    def generate(self, n):
        """
        Generate n words using copied grammar, generated lemmas, and words based on lemmas
        """
        start = random.randint(n, len(self._tags) - n)
        generated_tags = self._tags[
            start:start + n]  # Copy a random section of POS tags for grammar

        # Generate sequence of lemmas based off of grammar
        generated_lemmas = []
        for tag in generated_tags:
            # Search for and choose a lemma with correct tag
            choice = self._lemmas_ngram.choose_word(
                generated_lemmas,
                backoff_limit=2,
                predicate=lambda lemma: lemma in self._tag_lemmas[tag])
            if choice is None:
                # Could not find a good lemma for current POS tag, choose from list
                choice = MLEProbDist(self._tag_lemmas[tag]).generate()
            generated_lemmas.append(choice)

        # Generate sequence of words based off of lemmas and grammar
        generated_words = []
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[
                    (tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))
Code example #20
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()
    
    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()    

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())
        
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p,q)
        print "KL_Divergence for %s = %f" %(c , div)
Code example #21
File: decipher.py  Project: emulhall/COMP550
def train_transitions(labelled_sequences,
                      additional_transitions,
                      estimator=None):
    # default to the MLE estimate
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = []
    known_states = []

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                known_states.append(state)

            if symbol not in known_symbols:
                known_symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(known_states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(
        ConditionalFreqDist.__add__(transitions, additional_transitions),
        estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))
    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)
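
The ConditionalFreqDist.__add__ call above merges two sets of transition counts condition by condition before the probabilities are estimated. A hedged sketch of that merge with toy counts:

from nltk.probability import ConditionalFreqDist

a = ConditionalFreqDist([('q1', 'x'), ('q1', 'x')])   # toy counts: 'x' seen twice after 'q1'
b = ConditionalFreqDist([('q1', 'x'), ('q1', 'y')])
merged = ConditionalFreqDist.__add__(a, b)            # the same call used above; equivalent to a + b
print(merged['q1']['x'], merged['q1']['y'])           # 3 1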
Code example #22
File: hmm_with_mini_relmin.py  Project: finsqm/MInf
    def doesnt_work(self, y):
        """
		Code adapted from NLTK implementation of supervised training in HMMs
		"""

        estimator = lambda fdist, bins: MLEProbDist(fdist)

        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in y:
            lasts = None
            for state in sequence:
                if lasts is not None:
                    transitions[lasts][state] += 1
                lasts = state

        N = self.number_of_states + 2
        model = ConditionalProbDist(transitions, estimator, N)

        return model
Code example #23
def main():
  DEBUG =1
  depRelFile=open(sys.argv[1],'r')	#file with dep rel tuples
  ReadDictFromFile(sys.argv[2],lemma_dict) #lemma file
  modelFile = open(sys.argv[3],'w')
  if (len(sys.argv)==5):
    DEBUG = int(sys.argv[4])

  print "---Done loading lemma file---"
 
  print "---Computing CDF....---"
  incompletePairs = ComputeFreqDist(depRelFile)
  print "---Done computing CDF---"
  print "incomplete pairs: ",incompletePairs
  
  if(DEBUG):
  
    print "Info about F(arg)"
    print "unique samples: ", argFD.B()
    print "total seen samples: ", argFD.N()
    print "top arg:", argFD.max()
    print "count for support: ", argFD['support']
    print "Info about CFD(arg|rel,vb)"
    print "unique conditions seen: ", len(argVbRelCFD.conditions())
    print "total seen samples", argVbRelCFD.N()
    top_CFD1 = sorted(argVbRelCFD[('dobj','enjoy')].items(),key=operator.itemgetter(1), reverse=True)[:10]
    print "all dobj,enjoy: ", argVbRelCFD[('dobj','enjoy')].N()
    print "top dobj for enjoy:\n",top_CFD1
    print "Info about CFD(arg|vb)"
    print "unique conditions seen: ", len(argVbCFD.conditions())
    print "total seen samples", argVbCFD.N()
    top_CFD2 = sorted(argVbCFD['enjoy'].items(),key=operator.itemgetter(1), reverse=True)[:10]
    print "all enjoy: ", argVbCFD['enjoy'].N()
    print "top arg for enjoy:\n",top_CFD2


  print "---Computing MLE PDFs....---"
  argVbRelPDF = ConditionalProbDist(argVbRelCFD,MLEProbDist)
  argVbPDF = ConditionalProbDist(argVbCFD,MLEProbDist)
  argPDF = MLEProbDist(argFD)


    # I'm not sure whether Types is equivalent to argVbRelCFD.conditions() or to the unique condition+arg pairs
    # !!!!! lambda should be per history: for P(a|v), T = count of unique (v, a) pairs starting with v
    # for each condition v -> sum of CFD[v].B() -> how many unique arguments have been seen after this condition
   

  print "---Computing Witten-Bell smoothed PDFs....---"

  #for unseen pairs we multiply the backoff_weight with the probability of the backoff model
  #e.g. if  c(rel,vb,arg)=0 and c(vb,arg)>0 then P(arg|rel,vb)=argRelVbPDFWB_backoff_weights[(rel,vb)] * argVbPDFWB[vb].prob(arg)
  argPDFWB, backoff_uniform = ComputeWBArg(argPDF)
  argVbPDFWB, argVbPDFWB_backoff_weights,  countArgVB = ComputeWBVbArg(argVbPDF,argPDFWB)
  argRelVbPDFWB,argRelVbPDFWB_backoff_weights, countRelVbArg = ComputeWBRelVbArg(argVbRelPDF,argVbPDFWB)


  if(DEBUG):
    print "P(support|dobs,enjoy)"
    print argVbRelPDF[('dobj','enjoy')].prob('support')
    print argRelVbPDFWB[('dobj','enjoy')]['support']
    print "No args following (dobj,enjoy)", argVbRelCFD[('dobj','enjoy')].B()
    print "P(support|enjoy)"
    print argVbPDF['enjoy'].prob('support')
    print argVbPDFWB['enjoy']['support']
    print "P(support)"
    print argPDF.prob('support')
    print argPDFWB['support']

  WriteToArpaFormat(modelFile, len(argPDFWB),countArgVB,countRelVbArg,argPDFWB,argVbPDFWB,argRelVbPDFWB, backoff_uniform, argVbPDFWB_backoff_weights, argRelVbPDFWB_backoff_weights)

  if(DEBUG):
    #print sorted(argVbPDFWB['enjoy'],key=operator.itemgetter(1), reverse=True)[:5] #[('enjoy','support')]
    
    for condition in argVbPDFWB.keys()[:10]:
      sum1 = 0
      sum2 = 0
      for prob in argVbPDFWB[condition].values():
        sum1+=prob
      for arg in argVbCFD[condition].items():
        sum2+=argVbPDF[condition].prob(arg[0])
      print "total prob: ", sum1, sum2

    print "P_WB(support|dobj, enjoy)"
    print argRelVbPDFWB[('dobj','enjoy')]['support']

    for condition in argRelVbPDFWB.keys()[:10]:
      sum = 0
      for prob in argRelVbPDFWB[condition].values():
        sum+=prob
      print "total prob: ", sum
Code example #24
tokenized_text = len(nltk.sent_tokenize(inputFile))
print(sent_tokenize(inputFile))
print("tokenized text: ", tokenized_text, "\n")

tokenized_text = nltk.word_tokenize(inputFile)
tokenized_text = [word.lower() for word in tokenized_text if word.isalpha()]
print("Lower cased text: ", tokenized_text)
print("Word Count: ", len(tokenized_text), "\n")

freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigram: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ",
      freq_dist_uni.most_common()[-3:], "\n")

prob_distArray = []
prob_dist_uni = MLEProbDist(freq_dist_uni)
for s in prob_dist_uni.samples():
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

prob_distArray = []  # reset so the ELE probabilities below line up with most_common(10)
elep = ELEProbDist(freq_dist_uni)
for s in elep.samples():
    prob_distArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i], "\n")
    i += 1
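
For context: MLEProbDist uses raw relative frequencies, while ELEProbDist applies expected-likelihood (add-0.5) smoothing, which is why the two print loops above can give different numbers. A tiny illustration with made-up counts:

from nltk.probability import FreqDist, MLEProbDist, ELEProbDist

fd = FreqDist({'a': 3, 'b': 1})    # hypothetical unigram counts
print(MLEProbDist(fd).prob('a'))   # 3 / 4 = 0.75
print(ELEProbDist(fd).prob('a'))   # (3 + 0.5) / (4 + 0.5 * 2) = 0.7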
Code example #25
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace',
                        '-laplace',
                        action='store_true',
                        default=False,
                        help='Laplace Smoothing')
    parser.add_argument('--langmod',
                        '-lm',
                        action='store_true',
                        default=False,
                        help='Improved decoder')

    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod
    number_of_supp_lines = 100  #the more lines the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:  #this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)

    #decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    #because there's a bug in train_unsupervised (although I found out how to fix it!), I will have to do this manually....
    #code copied from the nltk train_supervised method
    #here, we are updating the transition data to include our supplemental data
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()  #declaring
        transitions = ConditionalFreqDist()  # declaring; this is why we needed all the transitions in the same place
        for sequence in supp_data:
            lasts = None
            for state in sequence:
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                lasts = state

        if laplace:
            estimator = LaplaceProbDist
        else:
            estimator = lambda fdist, bins: MLEProbDist(
                fdist)  #getting this straight from the source code

        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        #conditionalPD is actually already defined by our previously trained model as outputs.
        #we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print "".join([y[1] for y in decoder.tag([x[0] for x in sent])])
Code example #26
#

from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)

for token in corpus['SUBTOKENS']:
	freq_dist.inc(token['TEXT'])

prob_dist = MLEProbDist(freq_dist)

# P(x) = freq(x)
prob_dist.prob('the')
freq_dist.freq('the')

#
# Estimating the probability distribution for roll2
#

import random
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
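
The original snippet is truncated here. A hedged guess at what the roll2 estimate might look like, reusing the same legacy FreqDist/MLEProbDist API as above and assuming roll2 is the sum of two simulated dice:

freq_dist = FreqDist()
for _ in range(1000):
    roll2 = random.randint(1, 6) + random.randint(1, 6)   # hypothetical experiment
    freq_dist.inc(roll2)

prob_dist = MLEProbDist(freq_dist)
prob_dist.prob(7)    # should come out near 6/36 ~= 0.167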
Code example #27
            def estimator(fdist, bins): return MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
Code example #28
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)
Code example #29
File: decipher.py  Project: Catosine/COMP550_NLP
    def train_supervised(self, labelled_sequences, extra_data=False, estimator=None):
        # This is copied from HiddenMarkovModelTrainer

        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
        known_states = set(self._states)

        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[1]
                symbol = token[0]
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                outputs[state][symbol] += 1
                lasts = state

                # update the state and symbol lists
                if state not in known_states:
                    self._states.append(state)
                    known_states.add(state)

                if symbol not in known_symbols:
                    self._symbols.append(symbol)
                    known_symbols.add(symbol)

        if extra_data:
            print('-'*20)
            print("Using extra data to calculate transition probability")
            sent = ""
            for word in tqdm(treebank.words()):
                if word == '.':
                    sent = sent[:-1] + word
                    lasts = None
                    for c in sent:
                        if c in list(string.ascii_lowercase)+[' ', ',', '.']:
                            if lasts is not None:
                                transitions[lasts][c] += 1
                        lasts = c
                    sent = ""
                elif word == ',':
                    sent = sent[:-1] + word + ' '
                else:
                    sent += word + ' '

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return hmm.HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Code example #30
 # transition prob
 for row in X_train:
     lasts = None
     for ch in list(row):
         if(lasts is not None):
             transitional[lasts][ch] += 1
         lasts = ch
  
 # emission prob
 for row in sequences:
     for pair in row:
         emissional[pair[1]][pair[0]] += 1
 
 if(improved_laplace): 
     print("################## Laplace ####################### \n")
     estimator= nltk.probability.LaplaceProbDist 
 else:
     estimator = lambda fdist, bins: MLEProbDist(fdist)
     
 N = len(symbols)
 PI = estimator(Pi, N)
 A = ConditionalProbDist(transitional, estimator, N)
 B = ConditionalProbDist(emissional, estimator, N)
  
 tagger = HiddenMarkovModelTagger(states, symbols, A, B, PI)
 print("\n ################## C{} Decryption Results #######################".format(int(i)) )
 for row in test_cipher:
     print(tagger.best_path(row))
 
 print("\n ################## C{} Accuracy Results #######################". format(int(i)) )
 print(tagger.test(tester))
Code example #31
File: task2_1.py  Project: AdiKrasin/nlp_task3

def compute_kl_divergence(mle_dist1, mle_dist2):
    ans = 0
    for p in mle_dist1.freqdist():
        for q in mle_dist2.freqdist():
            if p.rhs() == q.rhs():
                ans += p.prob() * math.log(p.prob() / q.prob())
    return ans


for lhs in lhs_of_prods:
    prods = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob()) for prod in productions_corpus if
        str(prod.lhs()) == lhs
    ]
    prods_for_toy_pcfg2 = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob()) for prod in productions_toy_pcfg2 if
        str(prod.lhs()) == lhs
    ]
    if len(prods):
        MLE_prob_dist = MLEProbDist(FreqDist(prods))
    if len(prods_for_toy_pcfg2):
        MLE_prob_dist_for_toy_pcfg2 = MLEProbDist(FreqDist(prods_for_toy_pcfg2))
    if not(len(prods) and len(prods_for_toy_pcfg2)):
        print('skipping {} because this nt does not appear in both cases'.format(lhs))
    else:
        print('this is the KL-Divergence {} for this lhs {}'.format(compute_kl_divergence(MLE_prob_dist,
                                                                                          MLE_prob_dist_for_toy_pcfg2),
                                                                    lhs))
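
A more direct way to get the same kind of KL number, sketched here with two hypothetical MLE distributions over a shared sample space rather than the production objects used above:

from math import log
from nltk.probability import FreqDist, MLEProbDist

p = MLEProbDist(FreqDist({'a': 3, 'b': 1}))    # hypothetical distributions
q = MLEProbDist(FreqDist({'a': 1, 'b': 1}))
kl = sum(p.prob(s) * log(p.prob(s) / q.prob(s), 2) for s in p.samples() if q.prob(s) > 0)
print(kl)                                      # ~0.189 bits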