def export_skip_words():
    # x_all = human-curated "all" list minus the human "no" (whitelist) entries
    lines = libfile.file2set(getTheFile("model/skip_words_all.human.txt"))
    lines.difference_update(
        libfile.file2set(getTheFile("model/skip_words_all_no.human.txt")))
    libfile.lines2file(sorted(lines),
                       getTheFile("model/skip_words_x_all.auto.txt"))

    # x_nlp = x_all plus the human "ext" extension list
    lines.update(
        libfile.file2set(getTheFile("model/skip_words_all_ext.human.txt")))
    libfile.lines2file(sorted(lines),
                       getTheFile("model/skip_words_x_nlp.auto.txt"))
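
# The libfile helpers used throughout this module are defined elsewhere in
# hzlib. A minimal sketch of what this code assumes they do -- one UTF-8
# item per line, stripped, empty lines dropped. Illustrative only, not the
# real implementations:
def _sketch_file2set(filename):
    # read a text file into a set of stripped, non-empty lines
    import codecs
    with codecs.open(filename, encoding="utf-8") as f:
        return set(line.strip() for line in f if line.strip())


def _sketch_lines2file(lines, filename):
    # write an iterable of unicode lines to a UTF-8 text file
    import codecs
    with codecs.open(filename, "w", encoding="utf-8") as f:
        for line in lines:
            f.write(line + u"\n")
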
def eval_fn():
    api = ZhidaoNlp(debug=False)
    filenames = [
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
        # (getLocalFile("chat4xianliao/chat/input/xianer_all_question.txt"), 0),
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        (getTheFile("test/test_ask_baike_all.human.txt"), 0),
        (getTheFile("test/test_ask_chat_all.human.txt"), 0),
    ]
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(entry["data"])
    target_names = [u"正常", u"敏感词"]  # "normal", "skip word"
    all_detected_skip_words = collections.Counter()
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)

    # false_positive/false_negative/true_negative are module-level
    # collections populated during evaluation.
    libfile.lines2file(
        false_positive,
        getTheFile("local/skip_words/chat8xianer12w_test_false_positive.txt"))
    libfile.lines2file(
        false_negative,
        getTheFile("local/skip_words/chat8xianer12w_test_false_negative.txt"))
    libfile.lines2file(
        libdata.items2sample(true_negative, min(1500, len(true_negative))),
        getTheFile("local/skip_words/chat8xianer12w_test_true_negative.txt"))
    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ all done"
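
# libdata.eval_fn is defined elsewhere in hzlib; the call above assumes it
# runs the classifier over every test item and reports per-class metrics.
# A minimal sketch under that assumption (hypothetical; the calling
# convention fn_classify(api, item) -> truthy label is inferred from usage):
def _sketch_eval_fn(tests, target_names, fn_classify, api):
    from sklearn.metrics import classification_report
    y_true, y_pred = [], []
    for entry in tests:
        for item in entry["data"]:
            y_true.append(entry["expect"])
            y_pred.append(1 if fn_classify(api, item) else 0)
    print classification_report(y_true, y_pred, target_names=target_names)
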
def clean_skip_words_all():
    filepath_skip_words_all_new = getTheFile(
        "local/skip_words/skip_words_all_new.human.txt")
    filepath_skip_words_all_auto = getTheFile(
        "local/skip_words/test_question_all.auto.txt")
    skip_words_all_new = libfile.file2list(filepath_skip_words_all_new)

    # Whenever one phrase contains another, drop the longer of the pair
    # (the shorter phrase already covers it). O(n^2) pairwise scan.
    to_remove = set()
    for i in range(0, len(skip_words_all_new)):
        for j in range(i + 1, len(skip_words_all_new)):
            if skip_words_all_new[i] in skip_words_all_new[j]:
                to_remove.add(skip_words_all_new[j])
            elif skip_words_all_new[j] in skip_words_all_new[i]:
                to_remove.add(skip_words_all_new[i])
    print "to remove ", len(to_remove)
    libfile.lines2file(
        sorted(to_remove),
        getTheFile("local/skip_words/skip_words_all_to_remove.txt"))

    skip_words_all_new = set(skip_words_all_new)
    skip_words_all_new.difference_update(to_remove)
    print "skip_words_all_new after removing to_remove", len(skip_words_all_new)

    skip_words_all_auto = set(libfile.file2list(filepath_skip_words_all_auto))
    print "skip_words_all_new ", len(skip_words_all_new)
    print "skip_words_all_auto ", len(skip_words_all_auto)

    # Single-character words are too noisy to keep.
    skip_words_all_new = removeLen1Word(skip_words_all_new)
    skip_words_all_auto = removeLen1Word(skip_words_all_auto)
    print "skip_words_all_new after remove len 1", len(skip_words_all_new)
    print "skip_words_all_auto after remove len 1", len(skip_words_all_auto)

    # core = words both the human and auto lists agree on;
    # diff = remaining human words the core does not already detect.
    skip_words_all_core = skip_words_all_new.intersection(skip_words_all_auto)
    skip_words_all_new.difference_update(skip_words_all_core)
    print "skip_words_all_core ", len(skip_words_all_core)

    api = ZhidaoNlp(debug=True)
    skip_words_all_diff = set()
    for word in skip_words_all_new:
        detected_skip_words = api.detect_skip_words(word, skip_words_all_core)
        if len(detected_skip_words) == 0:
            skip_words_all_diff.add(word)
    print "skip_words_all_diff ", len(skip_words_all_diff)

    libfile.lines2file(sorted(skip_words_all_core),
                       getTheFile("local/skip_words/skip_words_all_core.txt"))
    libfile.lines2file(sorted(skip_words_all_diff),
                       getTheFile("local/skip_words/skip_words_all_diff.txt"))
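
# removeLen1Word is not defined in this snippet; from its usage above it
# filters out single-character entries. A sketch consistent with that usage
# (hypothetical):
def _sketch_removeLen1Word(words):
    # keep only words longer than one character
    return set(w for w in words if len(w) > 1)
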
def learn_skip_words_0619():
    api = ZhidaoNlp(debug=True)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load all raw"
    skip_words_raw = collections.Counter()
    filenames = glob.glob(getTheFile("local/skip_words/skip_words_*.raw.txt"))
    for filename in filenames:
        for phrase in libfile.file2list(filename):
            gcounter["from_{}".format(os.path.basename(filename))] += 1
            skip_words_raw[phrase] += 1
    gcounter["skip_words_raw_loaded"] = len(skip_words_raw)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ generate clean"
    # Segment each raw phrase and credit every token with the phrase's count.
    skip_words_clean = collections.Counter()
    for phrase in skip_words_raw:
        for word in api.cut_text(phrase):
            skip_words_clean[word] += skip_words_raw[phrase]
    gcounter["skip_words_clean"] = len(skip_words_clean)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ estimate raw outside clean"
    skip_words_raw_diff = set(skip_words_raw)
    skip_words_raw_diff.difference_update(skip_words_clean)
    for phrase in libdata.items2sample(skip_words_raw_diff):
        print phrase, skip_words_raw[phrase]
    gcounter["skip_words_raw_diff"] = len(skip_words_raw_diff)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load not clean"
    not_skip_words_clean = set()
    filenames = glob.glob(getTheFile("model/skip_words_no.human.txt"))
    for filename in filenames:
        for line in libfile.file2list(filename):
            if line not in not_skip_words_clean:
                gcounter["from_{}".format(os.path.basename(filename))] += 1
                not_skip_words_clean.add(line)
    gcounter["not_skip_words_clean_loaded"] = len(not_skip_words_clean)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ filter clean with not"
    skip_words_all = set(skip_words_clean)
    skip_words_all.difference_update(not_skip_words_clean)
    gcounter["skip_words_all"] = len(skip_words_all)
    filename = getTheFile("local/skip_words/test_question_all.auto.txt")
    libfile.lines2file(sorted(skip_words_all), filename)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ eval performance"
    filenames = [
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        # (getTheFile("local/baike/baike_questions_pos.human.txt"), 0),
        # (getTheFile("local/baike/baike_questions_neg.human.txt"), 0),
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
    ]
    all_detected_skip_words = collections.Counter()
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(entry["data"])
    target_names = [u"正常", u"敏感词"]  # "normal", "skip word"
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    setattr(api, "skip_words_all", skip_words_all)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)
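
# fn_classify_0619 is defined elsewhere; the setattr calls above suggest it
# reads api.skip_words_all and tallies hits into api.all_detected_skip_words.
# A sketch consistent with that usage (hypothetical):
def _sketch_fn_classify_0619(api, question):
    detected = api.detect_skip_words(question, api.skip_words_all)
    for word in detected:
        api.all_detected_skip_words[word] += 1
    # label 1 = skip word detected, label 0 = normal
    return 1 if detected else 0
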
def main():
    # print sys.argv
    if len(sys.argv) < 2:
        show_help()
        return
    option = sys.argv[1]
    if "eval_filter" == option:
        eval_filter()
    elif "debug_filter" == option:
        eval_filter([2], True)
    elif "test_is_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_is_baike_realtime
        api = ZhidaoNlp()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.is_question_baike(question, query_filter=query_filter)
            print question, ret, query_filter
        else:
            question = u"那月亮为什么会跟着我走"  # "Why does the moon follow me?"
            ret = api.is_question_baike(question)
            print question, ret
            assert not ret
            question = u"天空为什么是蓝色的"  # "Why is the sky blue?"
            ret = api.is_question_baike(question)
            print question, ret
            assert ret
    elif "test_chat_realtime" == option:
        # python hzlib/task_api_zhidao.py test_chat_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            question = u"你喜欢蓝色么?"  # "Do you like blue?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)
    elif "test_chat_cache" == option:
        # python hzlib/task_api_zhidao.py test_chat_cache
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": False,
            "cache_server": "http://192.168.1.179:8000",
        }
        api = ZhidaoFetch(config)
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            question = u"你喜欢蓝色么?"  # "Do you like blue?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)
    elif "test_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_baike_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_baike_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)
        else:
            # question = u"严重贫血怎么办"  # "What to do about severe anemia?"
            question = u"天空是什么颜色的?"  # "What color is the sky?"
            ret = api.search_baike_best(question, keep_result=True)
            print question
            libdata.print_json(ret)
    elif option.startswith("test_baike_cache"):
        # python hzlib/task_api_zhidao.py test_baike_cache
        print "========"
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": True,
            "cache_server": "http://192.168.1.179:8000",
        }
        api = ZhidaoFetch(config)
        if option == "test_baike_cache_one":
            # question = u"你喜欢蓝色么?"
            # question = u"天空是什么颜色的?"
            # question = u"掏粪男孩就是指TFboys吗?"  # "Does 掏粪男孩 refer to TFboys?"
            question = u"爱因斯坦是谁"  # "Who is Einstein?"
            if len(sys.argv) > 2:
                question = sys.argv[2]
            ret = api.search_baike_best(question)
            libdata.print_json(ret)
            print question
        else:
            # Batch mode: read questions from a file, query each one, and
            # dump all candidate answers to an Excel sheet for annotation.
            filename_question = sys.argv[2]
            questions = libfile.file2list(filename_question)
            if questions:
                filename = u"{}.temp".format(filename_question)
                libfile.lines2file(sorted(questions), filename)
                print len(questions)

            results = []
            for question in questions:
                query_filter = 2
                if len(sys.argv) > 3:
                    query_filter = int(sys.argv[3])
                debug_item = {}
                ret = api.search_baike_best(
                    question, query_filter=query_filter, debug_item=debug_item)
                print question, query_filter
                # libdata.print_json(ret)
                if not ret:
                    debug_item["best"] = u"异常"  # "error"
                    debug_item["query"] = question
                    results.append(debug_item)
                elif not ret.get("items_all", []):
                    debug_item["query"] = question
                    debug_item["best"] = u"无结果"  # "no result"
                    results.append(debug_item)
                else:
                    for item in ret.get("items_all", []):
                        item["query"] = question
                        results.append(item)
                        if item["id"] == ret.get("best_qapair", {}).get("id"):
                            item["best"] = u"最优"  # "best"
                        else:
                            item["best"] = u""

            filename = getLocalFile("temp/test_baike_cache.{}.xls".format(
                os.path.basename(filename_question)))
            fields = [
                u"标注",  # manual annotation column
                "best", "debug_note", "query", "answers", "match_score",
                "cnt_like", "cnt_answer", "question", "id", "answers_raw",
                "question_content"
            ]
            libfile.writeExcel(results, fields, filename)
    elif "test_jieba" == option:
        api = ZhidaoNlp()
        question = sys.argv[2]
        if not isinstance(question, unicode):
            question = question.decode("utf-8")
        temp = api.cut_text(question)
        print json.dumps(list(temp), ensure_ascii=False)
        temp = api.pseg.cut(question)
        for word, pos in temp:
            print word, pos
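
# Entry-point guard: assumed convention for running this module directly,
# e.g. python hzlib/task_api_zhidao.py test_jieba 天空为什么是蓝色的
if __name__ == "__main__":
    main()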