コード例 #1
0
ファイル: olaporp.py プロジェクト: fangbo-tao/TextExp
	def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores):
		"""Aggregate phrase statistics for the selection and for every context group.

		Args:
			freq_data: Passed unchanged to ``agg_phrase_cnt`` and
				``agg_phrase_df`` (structure not visible here).
			selected_docs: Sized collection handed to the aggregators; its
				length is stored as ``self.self_dc``.
			context_doc_groups: Mapping of group key -> collection of docs;
				the same statistics are computed once per group.
			global_scores: Stored as-is on ``self.global_scores``.
		"""
		# The parenthesised single-argument form prints the same text under the
		# Python 2 print statement and is also valid as a Python 3 call.
		print('start query')
		self.selected_docs = selected_docs

		# Statistics over the selected documents.
		self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
		self.phrase_df = agg_phrase_df(freq_data, selected_docs)
		# max() raises ValueError on an empty sequence, hence the 0 fallback.
		self.max_df = max(self.phrase_df.values()) if self.phrase_df else 0
		self.self_dc = len(selected_docs)
		self.sum_cnt = sum(self.phrase_cnt.values())
		self.global_scores = global_scores

		# The same statistics, keyed by context group.
		self.phrase_cnt_context = {}
		self.phrase_df_context = {}
		self.max_df_context = {}
		self.dc_context = {}
		self.sum_cnt_context = {}
		for group, docs in context_doc_groups.items():
			group_cnt = agg_phrase_cnt(freq_data, docs)
			group_df = agg_phrase_df(freq_data, docs)
			self.phrase_cnt_context[group] = group_cnt
			self.phrase_df_context[group] = group_df
			self.max_df_context[group] = max(group_df.values()) if group_df else 0
			self.dc_context[group] = len(docs)
			self.sum_cnt_context[group] = sum(group_cnt.values())

		# added for exploration
		self.context_groups = {}
		self.ranked_list = []
コード例 #2
0
ファイル: strategy1.py プロジェクト: fangbo-tao/TextExp
def extract_features(parsed_file, unified_file, cube_file, cube_context_file, freq_data_file, stop_word_file, freq_pattern_file, base_dir, total_docs, filtered_cell_str):
  """Extract phrase features for one cube cell and save them under ``base_dir``.

  Only the cube whose key equals ``filtered_cell_str`` is processed; every
  other cube is skipped. Progress information is printed to stdout.

  Args:
    parsed_file: Forwarded to ``FeatureExtractor``.
    unified_file: Loaded with ``load_unified_list``; the result is indexed by
      normalized phrase and that value is appended to each feature list.
    cube_file: Loaded with ``load_cube``; keys are cell strings.
    cube_context_file: Loaded with ``load_context``; indexed by the same keys.
    freq_data_file: Loaded with ``load_freq``.
    stop_word_file: Forwarded to ``FeatureExtractor``.
    freq_pattern_file: Loaded with ``load_freq_patterns``.
    base_dir: Directory prefix of the output file.
    total_docs: Number of documents; ids ``0 .. total_docs - 1`` are used to
      aggregate the overall phrase counts.
    filtered_cell_str: Key of the cell to process (an earlier comment in this
      code gave 'Topic|Sports;Location|Illinois;' as an example).

  Returns:
    None. The collected features are handed to ``save_features``.

  Raises:
    KeyError: If a normalized phrase is missing from the unified list, or the
      selected cell has no entry in the loaded contexts (assuming those
      containers behave like dicts).
  """
  # Parenthesised single-argument prints behave the same under the Python 2
  # print statement and are also valid Python 3 calls.
  print(filtered_cell_str)
  freq_patterns = load_freq_patterns(freq_pattern_file)
  freq_data = load_freq(freq_data_file)
  cubes = load_cube(cube_file)
  contexts = load_context(cube_context_file)
  unified_list = load_unified_list(unified_file)
  print(contexts.keys())

  # Phrase counts aggregated over every document id.
  total_cnt = agg_phrase_cnt(freq_data, list(range(total_docs)))
  print(sum(total_cnt.values()))

  # Extract the features of the phrases in the requested cube.
  phrase_feature_all = {}
  for att in cubes:
    if att != filtered_cell_str:
      continue
    print("start processing " + att)
    feature_extractor = FeatureExtractor(parsed_file, cubes[att], contexts[att], freq_data, stop_word_file, freq_patterns, total_cnt, unified_list)
    phrase_features = feature_extractor.extract_features()

    # The cell-name prefix is the same for every phrase, so build it once.
    cell_prefix = att.replace('|', '_').replace(';', '_').replace(' ', '_').lower()
    for phrase in phrase_features:
      norm_phrase = normalize(phrase)
      features = phrase_features[phrase]
      features.append(unified_list[norm_phrase])
      phrase_feature_all["{0}{1}".format(cell_prefix, norm_phrase)] = features

  # NOTE(review): this formats to "<base_dir>/cells.fea.fea" (doubled
  # extension). Left unchanged because readers of the file may rely on the
  # current name - confirm before simplifying.
  file_name = "{0}/{1}.fea".format(base_dir, "cells.fea")
  save_features(file_name, phrase_feature_all)
コード例 #3
0
ファイル: olaporp.py プロジェクト: fangbo-tao/TextExp
	def update_selected_docs(self, freq_data, selected_docs, phrases=None):
		"""Recompute the selection statistics for a new set of selected docs.

		Args:
			freq_data: Passed unchanged to ``agg_phrase_cnt`` and
				``agg_phrase_df``.
			selected_docs: Sized collection that replaces ``self.selected_docs``.
			phrases: Optional collection of phrases. The refresh below runs
				only when it is empty or omitted.
		"""
		# A fresh list per call replaces the former shared mutable default
		# (``phrases=[]``), which would leak state between calls if mutated.
		if phrases is None:
			phrases = []
		if not phrases:
			self.selected_docs = selected_docs
			self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
			self.phrase_df = agg_phrase_df(freq_data, selected_docs)
			# max() raises ValueError on an empty sequence, hence the 0 fallback.
			self.max_df = max(self.phrase_df.values()) if self.phrase_df else 0
			self.self_dc = len(selected_docs)
			self.sum_cnt = sum(self.phrase_cnt.values())
			# Any previously computed ranking no longer matches the selection.
			self.ranked_list = []