def __init__(self, freq_data, selected_doc=None, context_doc_groups=None, total_cnt=None):
		"""Aggregate phrase counts for the selected docs and for each context group.

		Args:
			freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt``.
			selected_doc: Document selection, passed to ``agg_phrase_cnt``.
			context_doc_groups: Mapping of group name -> sequence of docs.
				``None`` (the default) is treated as "no context groups".
			total_cnt: Precomputed mapping (its values are summed) that is
				reused as-is for any group containing exactly one doc.
		"""
		self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
		self.phrase_cnt_context = {}
		self.sum_cnt = sum(self.phrase_cnt.values())
		self.sum_cnt_context = {}
		# The signature defaults context_doc_groups to None; calling .items()
		# on None raises AttributeError, so fall back to an empty mapping.
		if context_doc_groups is None:
			context_doc_groups = {}
		for group, docs in context_doc_groups.items():
			if len(docs) == 1:
				# NOTE(review): every single-doc group reuses total_cnt instead
				# of being aggregated -- presumably the lone entry is a
				# placeholder for "all documents"; confirm against callers.
				self.phrase_cnt_context[group] = total_cnt
			else:
				self.phrase_cnt_context[group] = agg_phrase_cnt(freq_data, docs)
			self.sum_cnt_context[group] = sum(self.phrase_cnt_context[group].values())
  def compute_freq_portion(self):
    """Return each phrase's share of the total count for the selected docs.

    Counts come from ``agg_phrase_cnt(self.freq_data, self.selected_doc)``;
    the mapping it returns is normalised in place and handed back.
    """
    portions = agg_phrase_cnt(self.freq_data, self.selected_doc)

    # float() keeps the division fractional even when the counts are ints.
    denominator = float(sum(portions.values()))
    for phrase, count in portions.items():
      portions[phrase] = count / denominator

    return portions
	def __init__(self, freq_data, selected_doc=None, context_doc_groups=None, total_cnt=None, global_scores=None):
		"""Aggregate phrase counts and document frequencies for the selection and its context groups.

		Args:
			freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt``
				and ``agg_phrase_df``.
			selected_doc: Document selection; its length is stored as
				``self_dc``.
			context_doc_groups: Mapping of group name -> sequence of docs.
				``None`` (the default) is treated as "no context groups".
			total_cnt: Precomputed mapping (its values are summed) reused for
				a group whose docs are exactly ``['-1']``.
			global_scores: Stored on the instance unchanged.
		"""
		self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
		self.phrase_df = agg_phrase_df(freq_data, selected_doc)
		self.phrase_cnt_context = {}
		self.phrase_df_context = {}
		self.max_df = max(self.phrase_df.values())
		self.max_df_context = {}
		self.dc_context = {}
		# NOTE(review): selected_doc defaults to None, for which len() raises
		# TypeError; the intended document count in that case is not visible
		# here -- confirm with callers before changing.
		self.self_dc = len(selected_doc)
		self.sum_cnt = sum(self.phrase_cnt.values())
		self.sum_cnt_context = {}
		self.global_scores = global_scores
		# The signature defaults context_doc_groups to None; calling .items()
		# on None raises AttributeError, so fall back to an empty mapping.
		if context_doc_groups is None:
			context_doc_groups = {}
		for group, docs in context_doc_groups.items():
			if len(docs) == 1 and docs[0] == '-1':
				# Sentinel group: reuse the precomputed counts. No df, max-df
				# or doc-count entry is recorded for this group.
				self.phrase_cnt_context[group] = total_cnt
			else:
				self.phrase_cnt_context[group] = agg_phrase_cnt(freq_data, docs)
				self.phrase_df_context[group] = agg_phrase_df(freq_data, docs)
				self.max_df_context[group] = max(self.phrase_df_context[group].values())
				self.dc_context[group] = len(docs)
			self.sum_cnt_context[group] = sum(self.phrase_cnt_context[group].values())
Example #4
0
 def __init__(self, freq_data, parsed_file, selected_doc=None):
   """Record the aggregated phrase counts together with the given inputs.

   Args:
     freq_data: Frequency data, passed unchanged to ``agg_phrase_cnt``.
     parsed_file: Stored on the instance unchanged.
     selected_doc: Document selection passed to ``agg_phrase_cnt`` and
       stored on the instance unchanged.
   """
   # Counts are aggregated once, at construction time.
   self.phrase_cnt = agg_phrase_cnt(freq_data, selected_doc)
   self.parsed_file = parsed_file
   self.selected_doc = selected_doc