def __init__(self, freq_data, selected_doc=None, context_doc_groups=None, total_cnt=None):
    """Aggregate phrase counts for the selected documents and each context group.

    Args:
        freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt``.
        selected_doc: Document selection forwarded to ``agg_phrase_cnt``.
        context_doc_groups: Mapping of group key -> sized collection of
            documents. ``None`` (the default) is treated as "no context
            groups" instead of failing on ``.items()``.
        total_cnt: Counts stored as-is for any group that holds exactly one
            document. Must provide ``.values()`` whenever such a group is
            present, because its values are summed.
    """
    self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
    self.phrase_cnt_context = {}
    self.sum_cnt = sum(self.phrase_cnt.values())
    self.sum_cnt_context = {}

    # Fall back to an empty mapping so the documented default of None works.
    for group, docs in (context_doc_groups or {}).items():
        if len(docs) == 1:
            # Single-document groups reuse the caller-supplied counts rather
            # than re-aggregating.
            group_cnt = total_cnt
        else:
            group_cnt = agg_phrase_cnt(freq_data, docs)
        self.phrase_cnt_context[group] = group_cnt
        # NOTE(review): the source was collapsed onto one line; this sum is
        # assumed to run for every group, not only in the else branch.
        self.sum_cnt_context[group] = sum(group_cnt.values())
def compute_freq_portion(self):
    """Return each phrase's share of the total count.

    Counts are obtained from ``agg_phrase_cnt(self.freq_data,
    self.selected_doc)``. The mapping returned by that call is normalized in
    place (every value divided by the sum of all values) and then returned.
    """
    portions = agg_phrase_cnt(self.freq_data, self.selected_doc)
    # Convert once so the division is a float division for integer counts.
    denominator = float(sum(portions.values()))
    for key in portions:
        portions[key] /= denominator
    return portions
def __init__(self, freq_data, selected_doc=None, context_doc_groups=None, total_cnt=None, global_scores=None):
    """Aggregate phrase counts and document frequencies for the selection and its context groups.

    Args:
        freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt`` and
            ``agg_phrase_df``.
        selected_doc: Document selection forwarded to both aggregators. Its
            ``len()`` is stored as ``self_dc``, so it must be sized.
            NOTE(review): the ``None`` default therefore still raises
            ``TypeError`` here; the intended count for "no selection" is not
            visible from this block.
        context_doc_groups: Mapping of group key -> indexable, sized
            collection of documents. ``None`` (the default) is treated as
            "no context groups" instead of failing on ``.items()``.
        total_cnt: Counts stored as-is for a group whose only entry is the
            string ``'-1'``. Must provide ``.values()`` when such a group is
            present.
        global_scores: Stored unchanged on the instance.
    """
    self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
    self.phrase_df = agg_phrase_df(freq_data, selected_doc)
    self.phrase_cnt_context = {}
    self.phrase_df_context = {}
    # max() raises ValueError on an empty sequence; use 0 when nothing was aggregated.
    self.max_df = max(self.phrase_df.values()) if self.phrase_df else 0
    self.max_df_context = {}
    self.dc_context = {}
    self.self_dc = len(selected_doc)
    self.sum_cnt = sum(self.phrase_cnt.values())
    self.sum_cnt_context = {}
    self.global_scores = global_scores

    # Fall back to an empty mapping so the documented default of None works.
    for group, docs in (context_doc_groups or {}).items():
        # NOTE(review): the source was collapsed onto one line, so the extent
        # of the else branch is reconstructed. Only the count assignment is
        # assumed to be conditional; confirm against the original layout.
        if len(docs) == 1 and docs[0] == '-1':
            self.phrase_cnt_context[group] = total_cnt
        else:
            self.phrase_cnt_context[group] = agg_phrase_cnt(freq_data, docs)

        group_df = agg_phrase_df(freq_data, docs)
        self.phrase_df_context[group] = group_df
        # Same empty-input guard as for max_df above.
        self.max_df_context[group] = max(group_df.values()) if group_df else 0
        self.dc_context[group] = len(docs)
        self.sum_cnt_context[group] = sum(self.phrase_cnt_context[group].values())
def __init__(self, freq_data, parsed_file, selected_doc=None):
    """Pre-aggregate phrase counts and keep the remaining inputs on the instance.

    Args:
        freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt``.
        parsed_file: Stored as ``self.parsed_file``.
        selected_doc: Document selection forwarded to ``agg_phrase_cnt`` and
            stored as ``self.selected_doc``.
    """
    self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
    self.parsed_file, self.selected_doc = parsed_file, selected_doc