def main():
    """Train the dictionary on a user-chosen range of passages.

    Reads the first and last passage numbers from standard input. For each
    number ``n`` in that inclusive range it loads the dictionary with
    ``d.getdict()``, passes ``corpus/<n>-std.txt`` to
    ``d.train_one_passage`` and then hands the same objects to
    ``d.output_to_dict``.
    """
    first = int(input("start:"))
    last = int(input("stop:"))

    passage_no = first
    while passage_no <= last:
        # The dictionary is re-read on every pass, exactly once per passage.
        entries, known_urls = d.getdict()
        passage_path = 'corpus/' + repr(passage_no) + '-std.txt'
        d.train_one_passage(entries, known_urls, text_str=passage_path)
        d.output_to_dict(entries, known_urls)
        passage_no += 1
def Query():
    """Report whether the text from ``queryLabel.clipboard_get()`` is a known word.

    Collects the ``"Word"`` field of every entry returned by
    ``dictionary.getdict()`` and publishes the outcome through
    ``QueryResult.set``.
    """
    entries, _urls = dictionary.getdict()
    known_words = [entry["Word"] for entry in entries]

    candidate = queryLabel.clipboard_get()
    if candidate in known_words:
        verdict = ' is IN dictionary'
    else:
        verdict = ' is NOT in dictionary'
    QueryResult.set(candidate + verdict)
def Query():
    """Report whether the text from ``queryLabel.clipboard_get()`` is a known word.

    Every ``'|'`` character is dropped from the text before the lookup
    (presumably these are segmentation separators -- confirm with callers).
    The cleaned text is compared against the ``"Word"`` field of each entry
    returned by ``dictionary.getdict()`` and the outcome is published
    through ``QueryResult.set``.
    """
    dic, url_list = dictionary.getdict()
    word_list = [ele["Word"] for ele in dic]

    # One linear pass instead of repeatedly scanning and removing from a list.
    queryword = queryLabel.clipboard_get().replace('|', '')

    if queryword in word_list:
        QueryResult.set(queryword + ' is IN dictionary')
    else:
        QueryResult.set(queryword + ' is NOT in dictionary')
def Add_to_dictionary():
    """Prompt for a range of passage numbers and train the dictionary on each.

    For every number ``i`` from Start to Stop (inclusive) the dictionary is
    loaded with ``d.getdict()``, ``corpus/<i>-std.txt`` is passed to
    ``d.train_one_passage`` and the same objects are then handed to
    ``d.output_to_dict``.

    NOTE(review): the original indentation of this block was lost, so the
    nesting below is a reconstruction -- please confirm. As laid out here,
    the local ``output_to_dict`` helper is defined but never called (the
    loop calls ``d.output_to_dict``), and it reads ``date_str`` and ``urls``,
    neither of which is defined inside this block.
    """
    print('现在开始执行Add_to_dictionary()')

    def output_to_dict(dic, url_list):
        """Record the newest ``.dic`` file name and dump ``dic`` into it."""

        def freshdict(dic, file_str):
            '''This output dic into a .dic file'''

            def format_fix(fix_dict):
                # Shared by the Prefix and Suffix tables: renders every pair
                # as "key:count," (the trailing comma is kept).
                return ''.join(
                    ele + ':' + str(count) + ','
                    for ele, count in fix_dict.items())

            # The context manager closes the file even if a write fails.
            with open(file_str, 'w', encoding='UTF-8') as f:
                for entry in dic:
                    # Record layout:
                    # <word>|Word|<num>|Num|<k>:<n>,...|Pre|<k>:<n>,...|Suf|
                    f.write("{0}|Word|{1}|Num|{2}|Pre|{3}|Suf|\n".format(
                        entry['Word'], entry['Num'],
                        format_fix(entry['Pre']), format_fix(entry['Suf'])))

        def freshurl(url_list, file_str):
            """This output new_url_list into a .log file"""
            with open(file_str, 'w', encoding='UTF-8') as f:
                # Entries not yet present in ``urls`` are written first,
                # followed by everything in ``urls``.
                for ele in url_list:
                    if ele not in urls:
                        f.write(ele)
                f.writelines(urls)

        # Refresh version information: latest.log receives the name of the
        # dictionary file that is written right after it.
        with open("dict/latest.log", "w") as f:
            file_str = date_str + '.dic'
            f.write(file_str + '\n')
        freshdict(dic, "dict/" + file_str)

    start = int(
        input("""Now we add words in ariticles into a fresh dict! Start:"""))
    stop = int(input("Stop:"))
    for i in range(start, stop + 1):
        dic, url_list = d.getdict()
        d.train_one_passage(dic,
                            url_list,
                            text_str='corpus/' + repr(i) + '-std.txt')
        d.output_to_dict(dic, url_list)
in_dic=False for ele in dic: if phrase == ele['Word']: in_dic=True std_score+=length_coefficent[len(phrase)] prefix_match=ele['Pre'].get(phrase_list[index-1],1) suffix_match=ele['Suf'].get(phrase_list[index+1],1) average_match=(prefix_match+suffix_match)/2 #Max?Min?Ave? # print(prefix_match,suffix_match,average_num) ratio=float(average_match)/float(average_num) std_score+=math.log10(ratio*10)*10 # print(phrase,math.log10(ratio*10)*10) ## if (ratio>=1):std_score+=math.log10(ratio*10)*10 ## if (ratio<1):std_score+=math.sqrt(ratio)*10-10 #The scoring method# break if (in_dic==False):std_score-=5 # this is a parameter return std_score if __name__=='__main__': #for test dic,u=d.getdict() text='''工作组|织|上|的|贸易|错|开|发票|等''' t_l=text.split('|') print(t_l,score_after_segment(t_l,dic))
def use_final(s1, dic, url, index):
    """Segment ``s1`` with ``final.segment`` and save the result.

    The output goes to ``corpus/<index>-final.txt`` (UTF-8): ``url`` is
    written first, immediately followed by the segmented text.
    """
    result_str = final.segment(s1, dic)
    # The context manager closes the file even if a write fails.
    with open('corpus/' + repr(index) + '-final.txt', 'w',
              encoding='UTF-8') as f:
        f.write(url)
        f.write(result_str)


import sys
import os

sys.path.append(os.path.abspath('.'))

# The guard must compare against '__main__' (two leading underscores); with
# three underscores the condition is never true and the script does nothing.
if __name__ == '__main__':
    start = int(input("start:"))
    stop = int(input("stop:"))
    ##start=10
    ##stop=10
    for i in range(start, stop + 1):
        with open('corpus/' + repr(i) + '.txt', 'r', encoding='UTF-8') as f:
            url = f.readline()  # The first line is the URL; split it off.
            s1 = f.read()
        dic, url_list = dictionary.getdict()
        # Other segmenters that can be swapped in here:
        ## use_fmm(s1,dic,url,i)
        ## use_stat(s1,dic,url,i)
        ## use_stat_opt(s1,dic,url,i)
        ## use_dp_opt(s1,dic,url,i)
        use_final(s1, dic, url, i)