def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores):
    """Aggregate phrase statistics for the selected docs and every context group.

    For the selected documents and for each entry of ``context_doc_groups``
    this stores the result of ``agg_phrase_cnt`` and ``agg_phrase_df``, the
    largest value found in the ``agg_phrase_df`` result (0 when it is empty),
    the number of documents, and the sum of the ``agg_phrase_cnt`` values.

    Args:
        freq_data: Opaque frequency data; it is only forwarded to
            ``agg_phrase_cnt`` and ``agg_phrase_df``.
        selected_docs: Sized collection of documents forming the query.
        context_doc_groups: Mapping of group key -> sized collection of
            documents; statistics are stored per group key.
        global_scores: Stored unchanged as ``self.global_scores``.
    """
    # The parenthesised form emits the same text under the Python 2 print
    # statement and the Python 3 print function.
    print('start query')

    # Statistics for the selected (query) documents.
    self.selected_docs = selected_docs
    self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
    self.phrase_df = agg_phrase_df(freq_data, selected_docs)
    # max() raises on an empty sequence, so fall back to 0 explicitly.
    self.max_df = max(self.phrase_df.values()) if self.phrase_df else 0
    self.self_dc = len(selected_docs)
    self.sum_cnt = sum(self.phrase_cnt.values())
    self.global_scores = global_scores

    # The same statistics, keyed by context group.
    self.phrase_cnt_context = {}
    self.phrase_df_context = {}
    self.max_df_context = {}
    self.dc_context = {}
    self.sum_cnt_context = {}
    for group, docs in context_doc_groups.items():
        group_cnt = agg_phrase_cnt(freq_data, docs)
        group_df = agg_phrase_df(freq_data, docs)
        self.phrase_cnt_context[group] = group_cnt
        self.phrase_df_context[group] = group_df
        self.max_df_context[group] = max(group_df.values()) if group_df else 0
        self.dc_context[group] = len(docs)
        self.sum_cnt_context[group] = sum(group_cnt.values())

    # added for exploration
    # NOTE(review): assumed to run once after the loop (neither assignment
    # depends on ``group``) -- confirm against the original layout.
    self.context_groups = {}
    self.ranked_list = []
def update_selected_docs(self, freq_data, selected_docs, phrases=None):
    """Replace the selected documents and recompute their phrase statistics.

    Only the selected-document attributes are rewritten (``selected_docs``,
    ``phrase_cnt``, ``phrase_df``, ``max_df``, ``self_dc``, ``sum_cnt``) and
    ``ranked_list`` is reset to an empty list; the per-context-group
    attributes are not touched here.

    Args:
        freq_data: Opaque frequency data; it is only forwarded to
            ``agg_phrase_cnt`` and ``agg_phrase_df``.
        selected_docs: Sized collection of documents forming the new query.
        phrases: Optional. The refresh happens only when this is empty or
            omitted. The default is ``None`` rather than a shared mutable
            list; both are falsy, so existing callers behave the same.
    """
    # NOTE(review): every assignment is assumed to belong to this branch, so
    # a non-empty ``phrases`` leaves the object unchanged -- confirm whether
    # phrase-restricted handling is expected elsewhere.
    if not phrases:
        self.selected_docs = selected_docs
        self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
        self.phrase_df = agg_phrase_df(freq_data, selected_docs)
        # max() raises on an empty sequence, so fall back to 0 explicitly.
        self.max_df = max(self.phrase_df.values()) if self.phrase_df else 0
        self.self_dc = len(selected_docs)
        self.sum_cnt = sum(self.phrase_cnt.values())
        self.ranked_list = []