def clean_skip_words_all():
    filepath_skip_words_all_new = getTheFile(
        "local/skip_words/skip_words_all_new.human.txt")
    filepath_skip_words_all_auto = getTheFile(
        "local/skip_words/test_question_all.auto.txt")
    skip_words_all_new = libfile.file2list(filepath_skip_words_all_new)

    # Drop any word that contains another word in the list: the shorter
    # word already covers the longer one (a standalone sketch follows
    # this function).
    to_remove = set()
    for i in range(0, len(skip_words_all_new)):
        for j in range(i + 1, len(skip_words_all_new)):
            if skip_words_all_new[i] in skip_words_all_new[j]:
                to_remove.add(skip_words_all_new[j])
            elif skip_words_all_new[j] in skip_words_all_new[i]:
                to_remove.add(skip_words_all_new[i])
    print "to remove ", len(to_remove)
    libfile.lines2file(
        sorted(list(to_remove)),
        getTheFile("local/skip_words/skip_words_all_to_remove.txt"))

    skip_words_all_new = set(skip_words_all_new)
    skip_words_all_new.difference_update(to_remove)
    print "skip_words_all_new after removing to_remove", len(
        skip_words_all_new)

    skip_words_all_auto = set(libfile.file2list(filepath_skip_words_all_auto))
    print "skip_words_all_new ", len(skip_words_all_new)
    print "skip_words_all_auto ", len(skip_words_all_auto)

    skip_words_all_new = removeLen1Word(skip_words_all_new)
    skip_words_all_auto = removeLen1Word(skip_words_all_auto)
    print "skip_words_all_new after remove len 1", len(skip_words_all_new)
    print "skip_words_all_auto after remove len 1", len(skip_words_all_auto)

    skip_words_all_core = skip_words_all_new.intersection(skip_words_all_auto)
    skip_words_all_new.difference_update(skip_words_all_core)
    print "skip_words_all_core ", len(skip_words_all_core)

    # Keep only the words that the core list does not already detect.
    api = ZhidaoNlp(debug=True)
    skip_words_all_diff = set()
    for word in skip_words_all_new:
        detected_skip_words = api.detect_skip_words(word, skip_words_all_core)
        if len(detected_skip_words) == 0:
            skip_words_all_diff.add(word)
    print "skip_words_all_diff ", len(skip_words_all_diff)

    libfile.lines2file(sorted(list(skip_words_all_core)),
                       getTheFile("local/skip_words/skip_words_all_core.txt"))
    libfile.lines2file(sorted(list(skip_words_all_diff)),
                       getTheFile("local/skip_words/skip_words_all_diff.txt"))
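# A minimal, self-contained sketch of the substring-containment dedup used
# above (illustrative only; this helper name is hypothetical and not part of
# this module). Given overlapping phrases, it keeps the shortest covering form:
def _dedup_substrings_sketch(words):
    words = list(words)
    to_remove = set()
    for i in range(0, len(words)):
        for j in range(i + 1, len(words)):
            if words[i] in words[j]:
                to_remove.add(words[j])
            elif words[j] in words[i]:
                to_remove.add(words[i])
    return [w for w in words if w not in to_remove]

# e.g. _dedup_substrings_sketch([u"色情", u"色情网站", u"赌博"])
#      -> [u"色情", u"赌博"]   (the longer phrase is covered by the shorter)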
def test_is_question_baike():
    assert (is_question_baike(None) == False)
    import libfile
    filenames = [
        getTheFile("baike_questions_pos.txt"),
        getTheFile("baike_questions_neg.txt")
    ]
    counter = collections.Counter()
    # Note: these three regexes are defined here but never used in this
    # function (see the sketch after it). The trailing "?" must be escaped
    # to be a valid pattern.
    regex_white1 = ur"你知道|我[国们]"
    regex_black = ur"你|我"
    regex_white2 = ur"什么|最|哪|谁|百科|吗|是|有|多|怎么?样?|啥|如何|距离|历史|介绍|信息|\?"
    for filename in filenames:
        print "=====", filename
        lines = libfile.file2list(filename)
        for line in lines:
            if is_question_baike(line):
                actual = "_pos"
            else:
                actual = "_neg"
            # The expected label ("_pos"/"_neg") is encoded in the filename.
            if actual not in filename:
                counter["F"] += 1
                print line
            else:
                counter["T"] += 1
    print counter, "error rate", 1.0 * counter["F"] / (
        counter["F"] + counter["T"])
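# Since the white/black regexes above are unused, here is a hedged sketch of
# how such a gate might combine them (illustrative only; this is NOT the
# actual is_question_baike() logic, and the helper name is hypothetical):
def _regex_gate_sketch(line):
    import re
    if re.search(ur"你知道|我[国们]", line):
        return True   # strong whitelist phrases win outright
    if re.search(ur"你|我", line):
        return False  # first/second-person chat is blacklisted
    # otherwise require a question-like keyword
    return re.search(ur"什么|最|哪|谁|百科|吗|\?", line) is not None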
def eval_fn():
    api = ZhidaoNlp(debug=False)
    filenames = [
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
        # ( getLocalFile("chat4xianliao/chat/input/xianer_all_question.txt"), 0 ),
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        (getTheFile("test/test_ask_baike_all.human.txt"), 0),
        (getTheFile("test/test_ask_chat_all.human.txt"), 0),
    ]
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])

    target_names = [u"正常", u"敏感词"]
    all_detected_skip_words = collections.Counter()
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)

    # false_positive / false_negative / true_negative are module-level
    # collections populated during classification (e.g. by fn_classify_0619).
    libfile.lines2file(
        false_positive,
        getTheFile("local/skip_words/chat8xianer12w_test_false_positive.txt"))
    libfile.lines2file(
        false_negative,
        getTheFile("local/skip_words/chat8xianer12w_test_false_negative.txt"))
    libfile.lines2file(
        libdata.items2sample(true_negative, min(1500, len(true_negative))),
        getTheFile("local/skip_words/chat8xianer12w_test_true_negative.txt"))
    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ all done"
def eval_filter(query_filters=[1, 3, 2], flag_debug=False):
    api = ZhidaoNlp()
    api.debug = flag_debug
    for query_filter in query_filters:
        api.query_filter = query_filter
        all_words = collections.Counter()
        if flag_debug:
            # share the counter with the api so the classifier can fill it
            api.all_words = all_words
        filenames = [
            (getLocalFile("baike/baike_questions_pos.human.txt"), 1),
            (getLocalFile("baike/baike_questions_neg.human.txt"), 0),
            (getLocalFile("baike/baike_questions_chat.human.txt"), 0),
            (getTheFile("test/test_ask_baike_all.human.txt"), 1),
            (getTheFile("test/test_ask_chat_all.human.txt"), 0),
        ]
        tests = []
        all_query = set()
        for filename, expect in filenames:
            print "=====", filename
            entry = {"data": libfile.file2list(filename), "expect": expect}
            # de-duplicate queries across the test files
            temp = set(entry["data"])
            temp.difference_update(all_query)
            entry["data"] = list(temp)
            all_query.update(entry["data"])
            tests.append(entry)
            #gcounter["from_{}".format(os.path.basename(filename))] = len(entry["data"])

        target_names = [u"不是", u"是百科"]
        libdata.eval_fn(tests, target_names, fn_query_filter, api)
        print json.dumps(gcounter, indent=4, sort_keys=True)
        if flag_debug:
            for word, cnt in all_words.most_common(20):
                print word, cnt
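# Usage (as dispatched by main() below):
#   python hzlib/task_api_zhidao.py eval_filter    -> eval_filter()
#   python hzlib/task_api_zhidao.py debug_filter   -> eval_filter([2], True)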
# [cx_freeze]
# When an error occurs, a dialog shows the stack trace, which has two
# problems:
# - it is long and technical, so it startles the user
# - it leaks the Python paths of the build environment
# To hide this, swallow the exception and present it in a dialog instead.
try:
    args = parse_arguments()
    if args.input is None:
        abort('-i option required.')
    if not libfile.is_file(args.input):
        abort('The datafile "{}" is invalid'.format(args.input))
    lines = libfile.file2list(args.input)

    # [cx_freeze]
    # os.path.abspath(os.path.dirname(__file__)) would be convenient, but
    # __file__ is undefined in a frozen executable, so we avoid it.
    SELF_FULLPATH = os.path.abspath(sys.argv[0])
    SELF_DIR = os.path.dirname(SELF_FULLPATH)

    wx, wy = args.windowx, args.windowy
    px, py = center_pos(wx, wy)
    isrh = libisearch.ISearcher()
    isrh.set_caption(CAPTION) \
        .set_window_rect(px, py, wx, wy) \
        .set_on_text(on_text) \
        .set_on_enter(on_enter) \
        .set_search_func(search_func) \
def learn_skip_words_0619():
    api = ZhidaoNlp(debug=True)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load all raw",
    skip_words_raw = collections.Counter()
    filenames = glob.glob(getTheFile("local/skip_words/skip_words_*.raw.txt"))
    for filename in filenames:
        for phrase in libfile.file2list(filename):
            gcounter["from_{}".format(os.path.basename(filename))] += 1
            skip_words_raw[phrase] += 1
    gcounter["skip_words_raw_loaded"] = len(skip_words_raw)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ generate clean",
    skip_words_clean = collections.Counter()
    for phrase in skip_words_raw:
        temp = api.cut_text(phrase)
        for word in temp:
            skip_words_clean[word] += skip_words_raw[phrase]
    gcounter["skip_words_clean"] = len(skip_words_clean)

    print json.dumps(
        gcounter, ensure_ascii=False), "\n\n------ estimate raw outside clean"
    skip_words_raw_diff = set(skip_words_raw)
    skip_words_raw_diff.difference_update(skip_words_clean)
    for phrase in libdata.items2sample(skip_words_raw_diff):
        print phrase, skip_words_raw[phrase]
    gcounter["skip_words_raw_diff"] = len(skip_words_raw_diff)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load not clean "
    not_skip_words_clean = set()
    filenames = glob.glob(getTheFile("model/skip_words_no.human.txt"))
    for filename in filenames:
        for line in libfile.file2list(filename):
            if line not in not_skip_words_clean:
                gcounter["from_{}".format(os.path.basename(filename))] += 1
                not_skip_words_clean.add(line)
    gcounter["not_skip_words_clean_loaded"] = len(not_skip_words_clean)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ filter clean with not "
    skip_words_all = set(skip_words_clean)
    skip_words_all.difference_update(not_skip_words_clean)
    gcounter["skip_words_all"] = len(skip_words_all)
    filename = getTheFile("local/skip_words/test_question_all.auto.txt")
    libfile.lines2file(sorted(list(skip_words_all)), filename)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ eval performance"
    filenames = [
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        # ( getTheFile("local/baike/baike_questions_pos.human.txt"), 0),
        # ( getTheFile("local/baike/baike_questions_neg.human.txt"), 0),
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
    ]
    all_detected_skip_words = collections.Counter()
    counter = collections.Counter()
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])
    target_names = [u"正常", u"敏感词"]
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    setattr(api, "skip_words_all", skip_words_all)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)
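# The "generate clean" step above tokenizes raw phrases and re-weights the
# tokens. A minimal standalone sketch of that step, assuming cut_text() is a
# thin wrapper over jieba (as the api.pseg usage in test_jieba suggests):
#
#   import jieba
#   skip_words_clean = collections.Counter()
#   for phrase in skip_words_raw:
#       for word in jieba.cut(phrase):
#           skip_words_clean[word] += skip_words_raw[phrase]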
def main():
    #print sys.argv
    if len(sys.argv) < 2:
        show_help()
        return

    option = sys.argv[1]
    if "eval_filter" == option:
        eval_filter()

    elif "debug_filter" == option:
        eval_filter([2], True)

    elif "test_is_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_is_baike_realtime
        api = ZhidaoNlp()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.is_question_baike(question, query_filter=query_filter)
            print question, ret, query_filter
        else:
            question = u"那月亮为什么会跟着我走"
            ret = api.is_question_baike(question)
            print question, ret
            assert (not ret)
            question = u"天空为什么是蓝色的"
            ret = api.is_question_baike(question)
            print question, ret
            assert (ret)

    elif "test_chat_realtime" == option:
        # python hzlib/task_api_zhidao.py test_chat_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_chat_cache" == option:
        # python hzlib/task_api_zhidao.py test_chat_cache
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": False,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_baike_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_baike_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            question = u"严重贫血怎么办"
            question = u"天空是什么颜色的?"
            ret = api.search_baike_best(question, keep_result=True)
            print question
            libdata.print_json(ret)

    elif option.startswith("test_baike_cache"):
        # python hzlib/task_api_zhidao.py test_baike_cache
        print "========"
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": True,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if option == "test_baike_cache_one":
            #question = u"你喜欢蓝色么?"
            question = u"天空是什么颜色的?"
            question = u"掏粪男孩就是指TFboys吗?"
question = u"爱因斯坦是谁" if len(sys.argv) > 2: question = sys.argv[2] ret = api.search_baike_best(question) libdata.print_json(ret) print question else: filename_question = sys.argv[2] questions = libfile.file2list(filename_question) if questions: filename = u"{}.temp".format(filename_question) libfile.lines2file(sorted(list(questions)), filename) print len(questions) results = [] for question in questions: query_filter = 2 if len(sys.argv) > 3: query_filter = int(sys.argv[3]) debug_item = {} ret = api.search_baike_best(question, query_filter=query_filter, debug_item=debug_item) print question, query_filter #libdata.print_json(ret) if not ret: debug_item["best"] = u"异常" debug_item["query"] = question results.append(debug_item) elif not ret.get("items_all", []): debug_item["query"] = question debug_item["best"] = u"无结果" results.append(debug_item) else: for item in ret.get("items_all", []): item["query"] = question results.append(item) if item["id"] == ret.get("best_qapair", {}).get("id"): item["best"] = u"最优" else: item["best"] = u"" filename = getLocalFile("temp/test_baike_cache.{}.xls".format( os.path.basename(filename_question))) fields = [ u"标注", "best", "debug_note", "query", "answers", "match_score", "cnt_like", "cnt_answer", "question", "id", "answers_raw", "question_content" ] libfile.writeExcel(results, fields, filename) elif "test_jieba" == option: api = ZhidaoNlp() question = sys.argv[2] if not isinstance(question, unicode): question = question.decode("utf-8") temp = api.cut_text(question) print json.dumps(list(temp), ensure_ascii=False) temp = api.pseg.cut(question) for word, pos in temp: print word, pos