import math
from typing import List, Mapping, Optional

import numpy as np
from nltk.probability import FreqDist, LidstoneProbDist

# Document and Word are the project's own types, defined elsewhere.

@classmethod
def _get_feature_vectors(
    cls,
    doc: Document,
    gamma: float,
    tf: Optional[Mapping[Word, float]] = None,
) -> List[np.ndarray]:
    word_fdist = FreqDist(doc.words)
    word_pdist = LidstoneProbDist(word_fdist, gamma)
    vecs = []
    for para in doc:
        for i, sent in enumerate(para):
            vec = []
            # Sentence position in paragraph (1 = first, 2 = middle, 3 = last)
            if i == 0:
                vec.append(1.)
            elif i == len(para) - 1:
                vec.append(2. if len(para) == 2 else 3.)
            else:
                vec.append(2.)
            # Number of terms
            vec.append(math.log(len(sent) + 1))
            # Probability of terms in document
            vec.append(sum(math.log(word_pdist.prob(w)) for w in sent))
            # Probability of terms in a baseline document
            if tf is not None:
                vec.append(sum(math.log(tf[w]) for w in sent if w in tf))
            vecs.append(np.array(vec))
    return vecs
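# A minimal usage sketch for the method above. Everything below is an
# assumption made for illustration: the real project defines its own
# Document/Word types and the class (here called "Summarizer") that owns
# _get_feature_vectors. The stub only mimics the interface the method
# actually uses: iteration over paragraphs of sentences, plus a .words list.

class StubDocument:
    def __init__(self, paragraphs):
        self.paragraphs = paragraphs  # paragraphs -> sentences -> words
        self.words = [w for p in paragraphs for s in p for w in s]

    def __iter__(self):
        return iter(self.paragraphs)

doc = StubDocument([[["the", "cat", "sat"], ["it", "slept"], ["deeply"]]])
vecs = Summarizer._get_feature_vectors(doc, gamma=0.1)
for v in vecs:
    # One vector per sentence: [position code, log sentence length,
    # total log-probability of its words under the document distribution].
    print(v)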
def __init__(self, fd, *args, **kwargs):
    # Fix gamma at 0.01; the last positional argument is the bin count.
    LidstoneProbDist.__init__(self, fd, 0.01, args[-1])
    # Pre-compute the probability and log-probability of every sample seen
    # in the frequency distribution, so later lookups are dictionary hits.
    self._probs = {}
    self._logprobs = {}
    for sample in fd:
        self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
        self._probs[sample] = LidstoneProbDist.prob(self, sample)
def prob(self, sample):
    # Unseen samples fall back to the Lidstone estimate and are cached too.
    if sample not in self._probs:
        self._probs[sample] = LidstoneProbDist.prob(self, sample)
    return self._probs[sample]
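# Putting the two methods above together: they belong to a caching subclass
# of NLTK's LidstoneProbDist. The class name used below
# ("CachedLidstoneProbDist") and the toy data are assumptions for
# illustration only; gamma is fixed at 0.01 by the __init__ above, and the
# last positional argument is taken as the bin count.

from nltk.probability import FreqDist

fd = FreqDist("abracadabra")         # character frequencies of a toy string
pd = CachedLidstoneProbDist(fd, 26)  # bins = 26, one per lowercase letter
print(pd.prob("a"))                  # served from the pre-computed cache
print(pd.prob("z"))                  # unseen: computed via Lidstone once, then cached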
for j in range(n):
    A[i][j] = tags_bigrams_prob_train[Q[i]].prob(Q[j])
print(sum(A[1, :]))  # sanity check: each row of A is a distribution over next tags, so it should sum to ~1

# ### Vector $\pi$

# In[10]:

tags_first_train = [sent[0][1] for sent in training]
tags_first_freq_train = FreqDist(tags_first_train)
tags_first_prob_train = LidstoneProbDist(tags_first_freq_train, 0.1, n)
print(list(tags_first_freq_train.items())[:10])

for i in range(n):
    Pi[i] = tags_first_prob_train.prob(Q[i])
print(sum(Pi))  # sanity check: the initial-state distribution should sum to ~1

# ### Matrix $B$

# In[11]:

# Condition on the tag so that B[i][j] = P(word V[j] | tag Q[i]); the pairs
# must therefore be (tag, word), not (word, tag), to match the lookups below
# and the bin count m (one bin per vocabulary word).
observations_freq_train = ConditionalFreqDist(zip(tags_train, words_train))
observations_prob_train = lidstone_cond_freq(observations_freq_train, m)

for i in tqdm(range(n)):
    for j in range(m):
        B[i][j] = observations_prob_train[Q[i]].prob(V[j])
print(sum(B[1, :]))  # sanity check: each row of B should sum to ~1

# ## Question 4