from collections import Counter
from itertools import chain
import math

import numpy as np


def compute_TF(all_tokens_dict):
    """
    Compute term frequency per sentence
    @returns {str:sentence : {str:word : float:P(word|sentence)}}
    """
    return {sentence: counts_to_probs(Counter(strip_junk_tokens(words)))
            for (sentence, words) in all_tokens_dict.items()}
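# counts_to_probs and strip_junk_tokens are used throughout this section but not
# defined in it. The sketches below are assumptions about their behavior, not
# the original implementations.

def counts_to_probs(counts):
    """Normalize a Counter of raw counts into P(word|sentence) probabilities."""
    total = sum(counts.values())
    return {word: count / total for (word, count) in counts.items()}


def strip_junk_tokens(tokens):
    """Drop tokens with no alphanumeric content (punctuation, empty strings).
    The real filter may well be stricter (stopwords, casing, etc.)."""
    return [t for t in tokens if any(c.isalnum() for c in t)]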
def build(all_tokens_dict):
    # Binary bag-of-words variant of the model builder; pairs with the
    # featureize() below that takes a (all_tokens_dict, corenlp_words_index) tuple.
    # Strip all the junk:
    corpus = strip_junk_tokens(chain.from_iterable(all_tokens_dict.values()))
    # Get all unique words, sorted in ascending order from A..Z:
    unique_words = sorted(set(corpus))
    # Assign an index to each word
    corenlp_words_index = {k: i for (i, k) in enumerate(unique_words)}
    return (all_tokens_dict, corenlp_words_index)
def featureize(F, observation_ids):
    # TF-IDF variant; pairs with the build() below that returns
    # (all_tokens_dict, word_indices, TFIDF, UNK). Note UNK is unpacked but
    # unused here: a token absent from word_indices would raise a KeyError.
    (all_tokens_dict, word_indices, TFIDF, UNK) = F
    n = len(word_indices)
    m = len(observation_ids)
    # Observations
    X = np.zeros((m, n), dtype=float)  # np.float is removed in modern NumPy
    for (i, ob_id) in enumerate(observation_ids):
        for token in strip_junk_tokens(all_tokens_dict[ob_id]):
            j = word_indices[token]
            X[i, j] = TFIDF[ob_id][token]
    return X
def featureize(F, observation_ids):
    # Binary bag-of-words variant; pairs with the build() above that returns
    # (all_tokens_dict, corenlp_words_index).
    (all_tokens_dict, corenlp_words_index) = F
    n = len(corenlp_words_index)
    m = len(observation_ids)
    # Observations
    X = np.zeros((m, n), dtype=float)  # np.float is removed in modern NumPy
    for (i, ob_id) in enumerate(observation_ids):
        for token in strip_junk_tokens(all_tokens_dict[ob_id]):
            # Binary indicator:
            X[i, corenlp_words_index[token]] = 1
    return X
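# A minimal usage sketch of the binary pipeline, assuming all_tokens_dict maps
# observation ids to token lists (its exact shape is not shown in this section)
# and that the binary build()/featureize() pair lives in its own module (the two
# same-named variants presumably do not share a namespace):

all_tokens_dict = {
    "s1": ["the", "cat", "sat", "."],
    "s2": ["the", "dog", "barked", "!"],
}
F = build(all_tokens_dict)       # binary build(): (tokens, word index)
X = featureize(F, ["s1", "s2"])  # one row of 0/1 indicators per observation id
print(X.shape)                   # (2, number of unique non-junk words)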
def compute_DF(all_tokens_dict):
    """
    Compute document frequency per word
    @returns {str:word : int:document-count}
    """
    df_counts = Counter()  # Number of documents each word appears in
    # Tabulate the number of documents each word appears in;
    # set() ensures a word is counted at most once per document
    for words in all_tokens_dict.values():
        for word in set(strip_junk_tokens(words)):
            df_counts[word] += 1
    return df_counts
def build(train_ids, all_tokens_dict):
    # TF-IDF variant of the model builder.
    # Strip all the junk:
    corpus = strip_junk_tokens(chain.from_iterable(all_tokens_dict.values()))
    # Get all unique words, sorted in ascending order from A..Z:
    unique_words = sorted(set(corpus))
    # TF-IDF weights per sentence
    TFIDF = compute_TFIDF(all_tokens_dict)
    # IDF value for unknown tokens: log(N), presumably treating an unseen
    # token as if it appeared in exactly one document
    UNK = math.log(len(all_tokens_dict))
    # Assign an index to each word
    word_indices = {k: i for (i, k) in enumerate(unique_words)}
    return (all_tokens_dict, word_indices, TFIDF, UNK)
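# compute_TFIDF is called above but not shown in this section. A minimal
# sketch, assuming the standard weighting TF-IDF(w, s) = P(w|s) * log(N / DF(w))
# built from compute_TF and compute_DF above (consistent with UNK = log(N),
# i.e. the weight when DF(w) = 1):

def compute_TFIDF(all_tokens_dict):
    """@returns {str:sentence : {str:word : float:TF-IDF weight}}"""
    TF = compute_TF(all_tokens_dict)
    DF = compute_DF(all_tokens_dict)
    N = len(all_tokens_dict)  # total number of documents/sentences
    return {sentence: {word: tf * math.log(N / DF[word])
                       for (word, tf) in tf_probs.items()}
            for (sentence, tf_probs) in TF.items()}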