def freq_file(in_file_name="realdata/freq_count/ivr_non_risk.csv",
              pattern_name="realdata/freq_count/keyword_row",
              out_file_name=None):
    """Count pattern hits over the lines of a file and write the totals.

    The pattern list is loaded with ``read_list`` and fed to a ``Tries``
    matcher. Every line of the input file is passed to
    ``Tries.search_line`` and each token it returns bumps that token's
    counter. Patterns that never match keep a count of 0. The resulting
    ``{pattern: count}`` dict is handed to ``write_dict``.

    Args:
        in_file_name: Path of the file to scan, one line at a time.
        pattern_name: Path passed to ``read_list`` to obtain the patterns.
        out_file_name: Destination passed to ``write_dict``. Defaults to
            ``in_file_name + "_result"``.
    """
    if out_file_name is None:
        out_file_name = in_file_name + "_result"

    pattern_list = read_list(pattern_name)

    tries = Tries()
    tries.put_list(pattern_list)

    # Start every known pattern at zero so unmatched ones are still reported.
    pattern_dict = dict.fromkeys(pattern_list, 0)

    # Use a context manager so the input file is closed even if matching fails.
    with open(in_file_name) as in_file:
        for line in in_file:
            for token in tries.search_line(line):
                pattern_dict[token] += 1

    write_dict(pattern_dict, out_file_name)
class CountFreq:
    """Count keyword hits per time index and store them in SQLite.

    Constructing an instance does all the work: the keywords are loaded into
    a ``Tries`` matcher, every line of the document file is processed with
    ``read_line``, and the per-word counts are inserted into the
    ``word_freq`` table by ``write_dict_to_db``.

    Attributes set by ``__init__``:
        word_dict: ``{word: {index: count}}`` filled by ``put_to_dict``.
        freq_dict: ``{index: count}`` of documents counted for each index.
        tries: the ``Tries`` matcher holding the keywords.
        base_time: value passed to ``from_timestamp_to_index``; its first
            10 characters are stored as the date column in the database.
    """

    def __init__(self, key_in_name, doc_in_name, base_time):
        """Load keywords, scan the document file and persist the counts.

        Args:
            key_in_name: Passed to ``get_word_set`` to obtain the keywords.
            doc_in_name: Path of the document file; each line is one document.
            base_time: Reference time used to turn timestamps into indexes.
        """
        self.word_dict = {}
        self.freq_dict = {}
        self.tries = Tries()
        self.base_time = base_time

        key_list = list(get_word_set(key_in_name))
        self.tries.put_list(key_list)

        # Context manager guarantees the document file is closed.
        with open(doc_in_name) as doc_file:
            for line in doc_file:
                self.read_line(line)

        self.write_dict_to_db()

    def read_line(self, line):
        """Process one document line and update ``word_dict``/``freq_dict``.

        The line is split on ``|`` into chat records; each record is split on
        ``,`` with the first field used as the timestamp and the second as
        the chat content. Records whose timestamp raises ``ValueError`` in
        ``from_timestamp_to_index`` are reported on stdout and skipped.

        Args:
            line: One raw line of the document file.
        """
        # to prevent the duplicate count of a certain term
        key_set = set()
        previous_index = -1

        for chat_record in line.strip().split("|"):
            tokens = chat_record.strip().split(",")
            chat_time = tokens[0].strip()

            try:
                index = from_timestamp_to_index(chat_time, self.base_time)
            except ValueError as error:
                # str(error) replaces the deprecated ``error.message``; the
                # single-argument print form runs on Python 2 and 3 alike.
                print("%s %s" % (line, error))
                continue

            # A record without a content field has nothing to search.
            if len(tokens) < 2:
                continue

            chat_content = tokens[1].strip()
            result = self.tries.search_line(chat_content)
            if not result:
                continue

            contain_new_word = False
            for word in result:
                if word in key_set:
                    continue
                self.put_to_dict(word, index)
                key_set.add(word)
                contain_new_word = True

            # do not add a document for the same index
            # NOTE(review): previous_index only advances when the document is
            # counted for an index - confirm this matches the intended logic.
            if contain_new_word and previous_index != index:
                self.freq_dict[index] = self.freq_dict.get(index, 0) + 1
                previous_index = index

    def put_to_dict(self, word, index):
        """Increment the count of ``word`` at ``index`` in ``word_dict``.

        Args:
            word: Keyword being counted.
            index: Time index the occurrence belongs to.
        """
        # Plain dicts are kept on purpose: their str() form is what
        # write_dict_to_db stores.
        per_index = self.word_dict.setdefault(word, {})
        per_index[index] = per_index.get(index, 0) + 1

    def write_dict_to_db(self):
        """Insert one ``word_freq`` row per word: (word, date, str(counts)).

        The date is the first 10 characters of ``base_time``. All rows are
        committed together, and the connection is closed even if an insert
        or the commit raises.
        """
        conn = sqlite3.connect('realdata/application/time_series/time.db')
        try:
            conn.text_factory = str
            date = self.base_time[:10]
            for key in self.word_dict:
                conn.execute('insert into word_freq values(?,?,?)',
                             (key, date, str(self.word_dict[key])))
            conn.commit()
        finally:
            conn.close()