def freq_file():
    """Count keyword hits per input line and write the counts to a file.

    The keyword list is loaded with ``read_list`` from
    ``realdata/freq_count/keyword_row`` and inserted into a ``Tries``
    instance.  Every line of ``realdata/freq_count/ivr_non_risk.csv`` is
    passed to ``tries.search_line``; each token it returns has its counter
    incremented by one.  The resulting ``{keyword: count}`` mapping is handed
    to ``write_dict`` with the input path plus the ``_result`` suffix as the
    output name.

    Keywords that are never returned keep a count of 0 in the output.
    """
    in_file_name = "realdata/freq_count/ivr_non_risk.csv"
    out_file_name = in_file_name + "_result"
    pattern_name = "realdata/freq_count/keyword_row"

    pattern_list = read_list(pattern_name)

    tries = Tries()
    tries.put_list(pattern_list)

    # Start every keyword at zero so that unmatched keywords still appear
    # in the written result.
    pattern_dict = dict.fromkeys(pattern_list, 0)

    # Use a context manager so the input file is closed even if the search
    # or the counting raises part-way through.
    with open(in_file_name) as in_file:
        for line in in_file:
            # NOTE(review): presumably search_line yields the keywords found
            # in the line; a token outside pattern_list would raise KeyError
            # here -- confirm against the Tries implementation.
            for token in tries.search_line(line):
                pattern_dict[token] += 1

    write_dict(pattern_dict, out_file_name)
def cal_chi(self_dict, other_dict, self_num, other_num, out_name=None):
    """Compute a 2x2 chi-square score for every key of ``self_dict``.

    For each key the following contingency table is built:

        a = self_dict[key]             (frequency in "self")
        b = other_dict.get(key, 0)     (frequency in "other", 0 if missing)
        c = self_num - a               (absent in "self")
        d = other_num - b              (absent in "other")

    and the score is::

        (a*d - b*c)**2 * (self_num + other_num)
        ---------------------------------------
          (a + b) * (c + d) * (a + c) * (b + d)

    Args:
        self_dict: mapping of key -> frequency for the "self" group.
        other_dict: mapping of key -> frequency for the "other" group.
        self_num: total used to derive the "absent" count for "self".
        other_num: total used to derive the "absent" count for "other".
        out_name: when not None, the result is also passed to
            ``write_dict(chi_dict, out_name)``.

    Returns:
        dict mapping key -> float score.  Keys whose denominator is zero are
        omitted.  Keys present only in ``other_dict`` are never scored.
    """
    total = self_num + other_num
    chi_dict = {}

    for key, a in self_dict.items():
        b = other_dict.get(key, 0)
        c = self_num - a
        d = other_num - b

        denominator = (a + b) * (c + d) * (a + c) * (b + d)
        if denominator == 0:
            # The statistic is undefined when any marginal total is zero.
            continue

        numerator = (a * d - b * c) ** 2 * total
        # The 1.0 factor forces float division on integer inputs.
        chi_dict[key] = 1.0 * numerator / denominator

    if out_name is not None:
        write_dict(chi_dict, out_name)
    return chi_dict