Example 1
    def _get_feature_vectors(
        cls,
        doc: Document,
        gamma: float,
        tf: Optional[Mapping[Word, float]] = None,
    ) -> List[np.ndarray]:
        word_fdist = FreqDist(doc.words)
        word_pdist = LidstoneProbDist(word_fdist, gamma)

        vecs = []
        for para in doc:
            for i, sent in enumerate(para):
                vec = []
                # Sentence position in paragraph
                if i == 0:
                    vec.append(1.)
                elif i == len(para) - 1:
                    vec.append(2. if len(para) == 2 else 3.)
                else:
                    vec.append(2.)
                # Number of terms
                vec.append(math.log(len(sent) + 1))
                # Probability of terms in document
                vec.append(sum(math.log(word_pdist.prob(w)) for w in sent))
                # Probability of terms in a baseline document
                if tf is not None:
                    vec.append(sum(math.log(tf[w]) for w in sent if w in tf))
                vecs.append(np.array(vec))
        return vecs
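Example 1 depends on a Document type, NumPy and typing helpers defined elsewhere; the smoothing step itself needs only NLTK. A minimal self-contained sketch of that step, with a made-up token list and an arbitrary gamma, could look like this:

import math

from nltk.probability import FreqDist, LidstoneProbDist

# Hypothetical toy document: a flat list of tokens.
words = "the cat sat on the mat near the door".split()

word_fdist = FreqDist(words)
# Lidstone smoothing adds gamma to every count, so words that never occur
# in the document still receive a small non-zero probability.
word_pdist = LidstoneProbDist(word_fdist, 0.1)

sentence = ["the", "cat", "slept"]  # "slept" is unseen
print(sum(math.log(word_pdist.prob(w)) for w in sentence))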
Example 2
 def __init__(self, fd, *args, **kwargs):
     # Lidstone-smoothed distribution (gamma=0.01) that pre-computes and
     # caches the probability and log-probability of every observed sample.
     LidstoneProbDist.__init__(self, fd, 0.01, args[-1])
     samples = fd.samples()
     self._probs = {}
     self._logprobs = {}
     for sample in samples:
         self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
         self._probs[sample] = LidstoneProbDist.prob(self, sample)
Example 3
 def prob(self, sample):
     # Lazily compute and cache the probability of samples not seen at init time.
     if sample not in self._probs:
         self._probs[sample] = LidstoneProbDist.prob(self, sample)
     return self._probs[sample]
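Examples 2 and 3 look like two methods of the same caching subclass. Assuming the class does nothing beyond pre-computing and caching the Lidstone estimates, a runnable sketch could be the following (the class name is invented, and the frequency distribution is iterated directly because newer NLTK FreqDist objects no longer expose samples()):

from nltk.probability import FreqDist, LidstoneProbDist

class CachedLidstoneProbDist(LidstoneProbDist):  # hypothetical name
    """Lidstone distribution (gamma=0.01) with cached prob()/logprob()."""

    def __init__(self, fd, bins=None):
        LidstoneProbDist.__init__(self, fd, 0.01, bins)
        self._probs = {}
        self._logprobs = {}
        for sample in fd:  # a FreqDist iterates over its samples
            self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
            self._probs[sample] = LidstoneProbDist.prob(self, sample)

    def prob(self, sample):
        # Fall back to the parent (and cache the result) for unseen samples.
        if sample not in self._probs:
            self._probs[sample] = LidstoneProbDist.prob(self, sample)
        return self._probs[sample]

fd = FreqDist("a rose is a rose".split())
pd = CachedLidstoneProbDist(fd)
print(pd.prob("rose"), pd.prob("unseen"))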
Example 4
for i in range(n):
    for j in range(n):
        A[i][j] = tags_bigrams_prob_train[Q[i]].prob(Q[j])
print(sum(A[1, :]))

# ### Vector $\pi$

# In[10]:

tags_first_train = [sent[0][1] for sent in training]
tags_first_freq_train = FreqDist(tags_first_train)
tags_first_prob_train = LidstoneProbDist(tags_first_freq_train, .1, n)

print(list(tags_first_freq_train.items())[:10])

for i in range(n):
    Pi[i] = tags_first_prob_train.prob(Q[i])
print(sum(Pi))

# ### Matrix $B$

# In[11]:

observations_freq_train = ConditionalFreqDist(zip(words_train, tags_train))
observations_prob_train = lidstone_cond_freq(observations_freq_train, m)

for i in tqdm(range(n)):
    for j in range(m):
        B[i][j] = observations_prob_train[Q[i]].prob(V[j])
print(sum(B[1, :]))
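# Note: the helper lidstone_cond_freq used above is defined elsewhere in the
# notebook. Assuming it simply attaches a Lidstone estimator to each tag's
# frequency distribution, a plausible sketch (the gamma value is a guess,
# mirroring the .1 used for the initial-tag vector) would be:

from nltk.probability import ConditionalProbDist

def lidstone_cond_freq(cond_freq, bins, gamma=.1):
    # One LidstoneProbDist per condition (per tag), all sharing the same gamma
    # and the same number of bins (the vocabulary size m), so unseen
    # (tag, word) pairs still receive a small non-zero probability.
    return ConditionalProbDist(cond_freq, LidstoneProbDist, gamma, bins)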

# ## Question 4