def __init__(self, num_features, min_df=10):
    """Create empty count tables and the feature selector.

    Args:
        num_features: Forwarded as the first argument to
            ``IGFeatureSelection``.
        min_df: Forwarded as the second argument to
            ``IGFeatureSelection`` (defaults to 10).
    """
    def _zero_float_row():
        # Inner mapping whose missing keys read as 0.0.
        return defaultdict(float)

    def _zero_int_row():
        # Inner mapping whose missing keys read as 0.
        return defaultdict(int)

    # Probability tables start out empty.
    self.priors = {}
    self.condprobs = defaultdict(_zero_float_row)

    # Counters; every missing entry reads as zero.
    self.class_label_counts = defaultdict(int)
    self.token_tf_counts = defaultdict(_zero_int_row)
    # Info-Gain requires knowing document frequency counts
    self.token_df_counts = defaultdict(_zero_int_row)
    self.total_tf = 0

    self.feature_selection = IGFeatureSelection(num_features, min_df)
class NBTrain:
    """Collect per-class token counts and derive priors and smoothed
    conditional probabilities (a Naive-Bayes-style trainer, going by the
    class name).

    Typical use: call ``addDocument`` once per training document, then
    ``train`` to obtain ``(priors, condprobs)``.
    """

    def __init__(self, num_features, min_df=10):
        """Create empty count tables and the feature selector.

        Args:
            num_features: Forwarded as the first argument to
                ``IGFeatureSelection``.
            min_df: Forwarded as the second argument to
                ``IGFeatureSelection`` (defaults to 10).
        """
        self.priors = {}
        self.condprobs = defaultdict(lambda: defaultdict(float))
        self.class_label_counts = defaultdict(int)
        self.token_tf_counts = defaultdict(lambda: defaultdict(int))
        # Info-Gain requires knowing document frequency counts
        self.token_df_counts = defaultdict(lambda: defaultdict(int))
        self.total_tf = 0
        self.feature_selection = IGFeatureSelection(num_features, min_df)

    def addDocument(self, tokens, class_label):
        """Fold one training document into the running counts.

        Args:
            tokens: Sequence of hashable tokens making up the document;
                it is iterated twice and ``len`` is taken.
            class_label: Hashable label of the document's class.
        """
        self.total_tf += len(tokens)
        self.class_label_counts[class_label] += 1

        # Term frequencies: every occurrence counts.
        for token in tokens:
            self.token_tf_counts[token][class_label] += 1

        # Document frequencies: each distinct token counts once per document.
        for token in set(tokens):
            self.token_df_counts[token][class_label] += 1

    def train(self):
        """Compute class priors and conditional probabilities.

        Returns:
            Tuple ``(priors, condprobs)``: ``priors`` maps each class label
            to its share of the added documents; ``condprobs`` maps
            ``token -> class_label -> probability`` for every token
            returned by the feature selector.
        """
        # Class priors: fraction of added documents carrying each label.
        doc_count = sum(self.class_label_counts.values())
        for class_label, count in self.class_label_counts.items():
            self.priors[class_label] = count / doc_count

        # Build dense count tables: one entry per (token, class), with 0 for
        # tokens never seen in a class. Indexing the defaultdicts on self
        # never raises KeyError; it also materialises the missing zero
        # entries on self, exactly as the previous implementation did.
        class_labels = list(self.class_label_counts)
        token_tf_counts = defaultdict(lambda: defaultdict(int))
        token_df_counts = defaultdict(lambda: defaultdict(int))
        for token in self.token_tf_counts:
            for class_label in class_labels:
                token_tf_counts[token][class_label] = (
                    self.token_tf_counts[token][class_label]
                )
                token_df_counts[token][class_label] = (
                    self.token_df_counts[token][class_label]
                )

        # The selector's result is only used through .keys() here.
        selected_tokens = self.feature_selection.selectFeatures(
            token_df_counts, self.class_label_counts
        )

        # Add-one smoothed conditional probabilities for the selected tokens.
        # NOTE(review): the denominator uses the token total over ALL classes
        # (self.total_tf), not the per-class token total used in the textbook
        # multinomial estimate. Kept as-is to avoid changing model output;
        # confirm whether this is intended.
        vocabulary_size = len(self.token_tf_counts)
        denominator = self.total_tf + vocabulary_size
        for token in selected_tokens.keys():
            for class_label in class_labels:
                self.condprobs[token][class_label] = (
                    token_tf_counts[token][class_label] + 1
                ) / denominator

        return self.priors, self.condprobs
def __init__(self, num_features, min_df=10):
    """Create empty count tables and the feature selector.

    Args:
        num_features: Forwarded as the first argument to
            ``IGFeatureSelection``.
        min_df: Forwarded as the second argument to
            ``IGFeatureSelection`` (defaults to 10).
    """
    def _zero_float_row():
        # Inner mapping whose missing keys read as 0.0.
        return defaultdict(float)

    def _zero_int_row():
        # Inner mapping whose missing keys read as 0.
        return defaultdict(int)

    # Probability tables start out empty.
    self.priors = {}
    self.condprobs = defaultdict(_zero_float_row)

    # Counters; every missing entry reads as zero.
    self.class_label_counts = defaultdict(int)
    self.token_df_counts = defaultdict(_zero_int_row)

    self.feature_selection = IGFeatureSelection(num_features, min_df)