def get_sentence_accuracy_top_one(): '''样本文件中词的准确度''' from cut_sentence import Cut_Sentence matched_words_count = 0 total_words_count = 312525.0 cs = Cut_Sentence() correct_sentence_set = set() top_one_sentence_list = [] correct_sentence_list = [] checkout_filename = os.path.join(PATH, '0709modify', 'aaaaaaaaaaaa_weight.txt') with codecs.open(checkout_filename, encoding='utf-8') as f: count = 0 for line in f.readlines(): if line.startswith('*'): # correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:])) # correct_sentence_list = cs.cut_with_weight(line.strip()[1:]) correct_sentence_set = cs.cut_with_weight(line.strip()[1:]) count = 0 else: count += 1 if count == 1: sentence = line.split('\t')[0] # top_one_sentence_list.extend(cs.cut_with_weight(sentence)) top_one_sentence_list = cs.cut_with_weight(sentence) for words in top_one_sentence_list: if words in correct_sentence_set: matched_words_count += 1 # print len(top_one_sentence_list), len(correct_sentence_list) print matched_words_count / total_words_count
def cuted_varify_sample():
    """Cut every line of the varify_sample file with the weight-based
    segmenter and write the space-joined words, one sentence per line.

    NOTE(review): this function is defined twice in this file; at import
    time the later definition shadows this one -- confirm which is wanted
    and delete the other.
    """
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    # The original carried a dead _load_input_role helper and
    # word_input_role_dic whose only call site was commented out; removed.
    varify_sample_filename = os.path.join(PATH, '0709modify',
                                          'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram',
                                   'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8') as f, \
            codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f:  # stream instead of readlines()
            wf.write(' '.join(ws.cut_with_weight(line.strip())) + '\n')
def get_words_accuracy_top_one(): '''样本文件中词的准确度AW''' from cut_sentence import Cut_Sentence matched_words_count = 0 # total_words_count = 312525.0 cs = Cut_Sentence() correct_sentence_set = set() total_top_one_sentence_list = [] top_one_sentence_list = [] correct_sentence_list = [] checkout_filename = os.path.join(PATH, '0709modify', 'cut_path_lenght_limit_50.txt') with codecs.open(checkout_filename, encoding='utf-8') as f: count = 0 for line in f.readlines(): if line.startswith('*'): correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:])) correct_sentence_set = cs.cut_with_weight(line.strip()[1:]) count = 0 else: count += 1 if count == 1: sentence = line.split('\t')[0] total_top_one_sentence_list.extend(cs.cut_with_weight(sentence)) # total_top_one_sentence_list.extend(sentence.split()) top_one_sentence_list = cs.cut_with_weight(sentence) for words in top_one_sentence_list: if words in correct_sentence_set: matched_words_count += 1 print len(total_top_one_sentence_list), len(correct_sentence_list) print str(matched_words_count/float(len(total_top_one_sentence_list))*100)+'%'
def gen_word_freq_from_linguistic_data():
    """Count word frequencies over the corpus (word list + sentences) and
    write them to word_freq_from_95K.txt as 'word<TAB>count' lines.

    NOTE(review): depends on module-level ``src_filename``, ``Cut_Sentence``
    and ``PATH`` -- none are defined in this block; confirm they exist
    where this runs.
    """
    cs = Cut_Sentence()
    whole_word_freq_dic = {}
    with codecs.open(src_filename, encoding='utf-8') as f:
        # Iterate the file directly; the manual while/readline loop and the
        # len == 1 special case were redundant -- iterating the cut result
        # handles tuples of any length, including 1.
        for line in f:
            for word in cs.cut(line):
                # dict.get replaces the parallel membership set the
                # original kept in lockstep with the dict.
                whole_word_freq_dic[word] = whole_word_freq_dic.get(word, 0) + 1
    temp_filename = os.path.join(PATH, '0709modify', 'word_freq_from_95K.txt')
    with codecs.open(temp_filename, mode='wb', encoding='utf-8') as wf:
        wf.writelines('\t'.join((word, str(freq))) + '\n'
                      for word, freq in whole_word_freq_dic.items())
def cuted_varify_sample():
    """Cut every line of the varify_sample file with the weight-based
    segmenter and write the space-joined words, one sentence per line.

    NOTE(review): this is a verbatim duplicate of an earlier definition in
    this file and shadows it at import time -- confirm which copy is wanted
    and delete the other.
    """
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    # The original carried a dead _load_input_role helper and
    # word_input_role_dic whose only call site was commented out; removed.
    varify_sample_filename = os.path.join(PATH, '0709modify',
                                          'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram',
                                   'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8') as f, \
            codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f:  # stream instead of readlines()
            wf.write(' '.join(ws.cut_with_weight(line.strip())) + '\n')
def cut_lines_into_words(): '''将行(句子)切割成词,其间以空格隔开''' from cut_sentence import Cut_Sentence cs = Cut_Sentence() for file_count in range(26, 29): print file_count src_filename = os.path.join(PATH, '0709modify', 'cuted_linguistic_stample', '%s.txt'%file_count) try: assert os.path.exists(src_filename) except AssertionError: print '%s does not exist !!'%src_filename with codecs.open(src_filename, encoding='utf-8') as f: cuted_lines_list = [' '.join(cs.cut_with_weight(line))+'\n' for line in f.readlines()] codecs.open(src_filename, mode='wb', encoding='utf-8').writelines(cuted_lines_list)