Example #1
def export_skip_words():
    # Start from the full human-curated skip-word list.
    lines = libfile.file2set(getTheFile("model/skip_words_all.human.txt"))
    lines.difference_update(
        libfile.file2set(getTheFile("model/skip_words_all_no.human.txt")))
    libfile.lines2file(sorted(list(lines)),
                       getTheFile("model/skip_words_x_all.auto.txt"))

    lines.update(
        libfile.file2set(getTheFile("model/skip_words_all_ext.human.txt")))
    libfile.lines2file(sorted(list(lines)),
                       getTheFile("model/skip_words_x_nlp.auto.txt"))
Example #2
def eval_fn():
    api = ZhidaoNlp(debug=False)
    filenames = [
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
        # ( getLocalFile("chat4xianliao/chat/input/xianer_all_question.txt"), 0 ),
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        (getTheFile("test/test_ask_baike_all.human.txt"), 0),
        (getTheFile("test/test_ask_chat_all.human.txt"), 0),
    ]
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])

    target_names = [u"正常", u"敏感词"]  # "normal", "sensitive word"
    all_detected_skip_words = collections.Counter()
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)

    # false_positive, false_negative and true_negative are assumed to be
    # module-level collections populated while libdata.eval_fn runs above.
    libfile.lines2file(
        false_positive,
        getTheFile("local/skip_words/chat8xianer12w_test_false_positive.txt"))
    libfile.lines2file(
        false_negative,
        getTheFile("local/skip_words/chat8xianer12w_test_false_negative.txt"))
    libfile.lines2file(
        libdata.items2sample(true_negative, min(1500, len(true_negative))),
        getTheFile("local/skip_words/chat8xianer12w_test_true_negative.txt"))
    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ all done"
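Note that false_positive, false_negative and true_negative are not defined inside eval_fn; they are assumed to be module-level collections filled in while libdata.eval_fn scores each test entry (gcounter is likewise an assumed module-level collections.Counter used for progress accounting). A hypothetical sketch of that bookkeeping:

# Assumed module-level buckets (hypothetical; the real code may differ).
false_positive = set()   # expected normal, classified sensitive
false_negative = set()   # expected sensitive, classified normal
true_negative = set()    # expected normal, classified normal

def record_result(question, expect, actual):
    # Hypothetical helper showing how the buckets would be populated.
    if expect == 0 and actual == 1:
        false_positive.add(question)
    elif expect == 1 and actual == 0:
        false_negative.add(question)
    elif expect == 0 and actual == 0:
        true_negative.add(question)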
Example #3
def clean_skip_words_all():
    filepath_skip_words_all_new = getTheFile(
        "local/skip_words/skip_words_all_new.human.txt")
    filepath_skip_words_all_auto = getTheFile(
        "local/skip_words/test_question_all.auto.txt")

    skip_words_all_new = libfile.file2list(filepath_skip_words_all_new)

    # Drop the longer phrase of any pair where one phrase contains the
    # other, keeping the shorter (more general) one.
    to_remove = set()
    for i in range(len(skip_words_all_new)):
        for j in range(i + 1, len(skip_words_all_new)):
            if skip_words_all_new[i] in skip_words_all_new[j]:
                to_remove.add(skip_words_all_new[j])
            elif skip_words_all_new[j] in skip_words_all_new[i]:
                to_remove.add(skip_words_all_new[i])
    print "to remove ", len(to_remove)
    libfile.lines2file(
        sorted(list(to_remove)),
        getTheFile("local/skip_words/skip_words_all_to_remove.txt"))

    skip_words_all_new = set(skip_words_all_new)
    skip_words_all_new.difference_update(to_remove)
    print "skip_words_all_new after removing to_remove", len(
        skip_words_all_new)

    skip_words_all_auto = libfile.file2list(filepath_skip_words_all_auto)
    skip_words_all_auto = set(skip_words_all_auto)

    print "skip_words_all_new ", len(skip_words_all_new)
    print "skip_words_all_auto ", len(skip_words_all_auto)

    skip_words_all_new = removeLen1Word(skip_words_all_new)
    skip_words_all_auto = removeLen1Word(skip_words_all_auto)

    print "skip_words_all_new after remove len 1", len(skip_words_all_new)
    print "skip_words_all_auto after remove len 1", len(skip_words_all_auto)

    skip_words_all_core = skip_words_all_new.intersection(skip_words_all_auto)
    skip_words_all_new.difference_update(skip_words_all_core)

    print "skip_words_all_core ", len(skip_words_all_core)

    api = ZhidaoNlp(debug=True)
    skip_words_all_diff = set()
    for word in skip_words_all_new:
        detected_skip_words = api.detect_skip_words(word, skip_words_all_core)
        if len(detected_skip_words) == 0:
            skip_words_all_diff.add(word)
    print "skip_words_all_diff ", len(skip_words_all_diff)

    libfile.lines2file(sorted(list(skip_words_all_core)),
                       getTheFile("local/skip_words/skip_words_all_core.txt"))
    libfile.lines2file(sorted(list(skip_words_all_diff)),
                       getTheFile("local/skip_words/skip_words_all_diff.txt"))
Example #4
def learn_skip_words_0619():
    api = ZhidaoNlp(debug=True)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load all raw",
    skip_words_raw = collections.Counter()
    filenames = glob.glob(getTheFile("local/skip_words/skip_words_*.raw.txt"))
    for filename in filenames:
        for phrase in libfile.file2list(filename):
            gcounter["from_{}".format(os.path.basename(filename))] += 1
            skip_words_raw[phrase] += 1
    gcounter["skip_words_raw_loaded"] = len(skip_words_raw)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ generate clean",
    skip_words_clean = collections.Counter()
    for phrase in skip_words_raw:
        temp = api.cut_text(phrase)
        for word in temp:
            skip_words_clean[word] += skip_words_raw[phrase]
    gcounter["skip_words_clean"] = len(skip_words_clean)

    print json.dumps(
        gcounter, ensure_ascii=False), "\n\n------ estimate raw outside clean"
    skip_words_raw_diff = set(skip_words_raw)
    skip_words_raw_diff.difference_update(skip_words_clean)
    for phrase in libdata.items2sample(skip_words_raw_diff):
        print phrase, skip_words_raw[phrase]
    gcounter["skip_words_raw_diff"] = len(skip_words_raw_diff)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ load not clean "
    not_skip_words_clean = set()
    filenames = glob.glob(getTheFile("model/skip_words_no.human.txt"))
    for filename in filenames:
        for line in libfile.file2list(filename):
            if line not in not_skip_words_clean:
                gcounter["from_{}".format(os.path.basename(filename))] += 1
                not_skip_words_clean.add(line)
    gcounter["not_skip_words_clean_loaded"] = len(not_skip_words_clean)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ filter clean with not "
    skip_words_all = set(skip_words_clean)
    skip_words_all.difference_update(not_skip_words_clean)
    gcounter["skip_words_all"] = len(skip_words_all)
    filename = getTheFile("local/skip_words/test_question_all.auto.txt")
    libfile.lines2file(sorted(list(skip_words_all)), filename)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ eval performance"
    filenames = [
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        #        ( getTheFile("local/baike/baike_questions_pos.human.txt"), 0),
        #        [ getTheFile("local/baike/baike_questions_neg.human.txt"), 0 ],
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
    ]
    all_detected_skip_words = collections.Counter()
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])

    target_names = [u"正常", u"敏感词"]  # "normal", "sensitive word"
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    setattr(api, "skip_words_all", skip_words_all)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)
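fn_classify_0619, the classifier handed to libdata.eval_fn in Examples #2 and #4, is not shown. Given that the examples attach skip_words_all and all_detected_skip_words to the api object beforehand, a hedged sketch of its assumed shape (the exact signature libdata.eval_fn expects is not shown here):

def fn_classify_0619(api, question):
    # Hypothetical sketch: label a question as sensitive (1) if any skip
    # word is detected in it, otherwise normal (0), and tally the matches.
    detected = api.detect_skip_words(question, getattr(api, "skip_words_all", None))
    for word in detected:
        api.all_detected_skip_words[word] += 1
    return 1 if detected else 0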
Example #5
def main():
    #print sys.argv

    if len(sys.argv) < 2:
        show_help()
        return

    option = sys.argv[1]

    if "eval_filter" == option:
        eval_filter()

    elif "debug_filter" == option:
        eval_filter([2], True)

    elif "test_is_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_is_baike_realtime
        api = ZhidaoNlp()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.is_question_baike(question, query_filter=query_filter)
            print question, ret, query_filter
        else:
            question = u"那月亮为什么会跟着我走"
            ret = api.is_question_baike(question)
            print question, ret
            assert (not ret)
            question = u"天空为什么是蓝色的"
            ret = api.is_question_baike(question)
            print question, ret
            assert (ret)

    elif "test_chat_realtime" == option:
        # python hzlib/task_api_zhidao.py test_chat_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_chat_cache" == option:
        # python hzlib/task_api_zhidao.py test_chat_cache

        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": False,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_baike_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_baike_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            #question = u"严重贫血怎么办"
            question = u"天空是什么颜色的?"
            ret = api.search_baike_best(question, keep_result=True)
            print question
            libdata.print_json(ret)

    elif option.startswith("test_baike_cache"):
        # python hzlib/task_api_zhidao.py test_baike_cache
        print "========"
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": True,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if option == "test_baike_cache_one":
            #question = u"你喜欢蓝色么?"
            #question = u"天空是什么颜色的?"
            #question = u"掏粪男孩就是指TFboys吗?"
            question = u"爱因斯坦是谁"
            if len(sys.argv) > 2:
                question = sys.argv[2]
            ret = api.search_baike_best(question)
            libdata.print_json(ret)
            print question
        else:
            filename_question = sys.argv[2]
            questions = libfile.file2list(filename_question)

            if questions:
                filename = u"{}.temp".format(filename_question)
                libfile.lines2file(sorted(list(questions)), filename)

            print len(questions)
            results = []
            for question in questions:
                query_filter = 2
                if len(sys.argv) > 3:
                    query_filter = int(sys.argv[3])
                debug_item = {}
                ret = api.search_baike_best(question,
                                            query_filter=query_filter,
                                            debug_item=debug_item)
                print question, query_filter
                #libdata.print_json(ret)
                if not ret:
                    debug_item["best"] = u"异常"  # "error"
                    debug_item["query"] = question
                    results.append(debug_item)

                elif not ret.get("items_all", []):
                    debug_item["query"] = question
                    debug_item["best"] = u"无结果"  # "no result"
                    results.append(debug_item)

                else:
                    for item in ret.get("items_all", []):
                        item["query"] = question
                        if item["id"] == ret.get("best_qapair", {}).get("id"):
                            item["best"] = u"最优"  # "best"
                        else:
                            item["best"] = u""
                        results.append(item)

            filename = getLocalFile("temp/test_baike_cache.{}.xls".format(
                os.path.basename(filename_question)))
            fields = [
                u"标注",  # "annotation" (manual label column)
                "best", "debug_note", "query", "answers", "match_score",
                "cnt_like", "cnt_answer", "question", "id", "answers_raw",
                "question_content"
            ]
            libfile.writeExcel(results, fields, filename)

    elif "test_jieba" == option:
        api = ZhidaoNlp()
        question = sys.argv[2]
        if not isinstance(question, unicode):
            question = question.decode("utf-8")

        temp = api.cut_text(question)
        print json.dumps(list(temp), ensure_ascii=False)

        temp = api.pseg.cut(question)
        for word, pos in temp:
            print word, pos
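For reference, main() is driven from the command line; based on the inline comments and the argv handling above, invocations look like this (the question text and optional query_filter are positional):

python hzlib/task_api_zhidao.py test_is_baike_realtime "天空为什么是蓝色的" 2
python hzlib/task_api_zhidao.py test_chat_realtime "你喜欢蓝色么?" 2
python hzlib/task_api_zhidao.py test_baike_cache_one "爱因斯坦是谁"
python hzlib/task_api_zhidao.py test_jieba "天空为什么是蓝色的"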