Example #1
def clean_skip_words_all():
    filepath_skip_words_all_new = getTheFile(
        "local/skip_words/skip_words_all_new.human.txt")
    filepath_skip_words_all_auto = getTheFile(
        "local/skip_words/test_question_all.auto.txt")

    skip_words_all_new = libfile.file2list(filepath_skip_words_all_new)

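    # Pairwise substring pass: when one skip phrase contains another, the
    # longer (redundant) phrase is queued for removal.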
    to_remove = set()
    for i in range(len(skip_words_all_new)):
        for j in range(i + 1, len(skip_words_all_new)):
            if skip_words_all_new[i] in skip_words_all_new[j]:
                to_remove.add(skip_words_all_new[j])
            elif skip_words_all_new[j] in skip_words_all_new[i]:
                to_remove.add(skip_words_all_new[i])
    print "to remove ", len(to_remove)
    libfile.lines2file(
        sorted(list(to_remove)),
        getTheFile("local/skip_words/skip_words_all_to_remove.txt"))

    skip_words_all_new = set(skip_words_all_new)
    skip_words_all_new.difference_update(to_remove)
    print "skip_words_all_new after removing to_remove", len(
        skip_words_all_new)

    skip_words_all_auto = libfile.file2list(filepath_skip_words_all_auto)
    skip_words_all_auto = set(skip_words_all_auto)

    print "skip_words_all_new ", len(skip_words_all_new)
    print "skip_words_all_auto ", len(skip_words_all_auto)

    skip_words_all_new = removeLen1Word(skip_words_all_new)
    skip_words_all_auto = removeLen1Word(skip_words_all_auto)

    print "skip_words_all_new after remove len 1", len(skip_words_all_new)
    print "skip_words_all_auto after remove len 1", len(skip_words_all_auto)

    skip_words_all_core = skip_words_all_new.intersection(skip_words_all_auto)
    skip_words_all_new.difference_update(skip_words_all_core)

    print "skip_words_all_core ", len(skip_words_all_core)

    api = ZhidaoNlp(debug=True)
    skip_words_all_diff = set()
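    # Keep only the new words that detect_skip_words() does not already flag
    # against the core set; these are written out as the "diff" list below.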
    for word in skip_words_all_new:
        detected_skip_words = api.detect_skip_words(word, skip_words_all_core)
        if len(detected_skip_words) == 0:
            skip_words_all_diff.add(word)
    print "skip_words_all_diff ", len(skip_words_all_diff)

    libfile.lines2file(sorted(list(skip_words_all_core)),
                       getTheFile("local/skip_words/skip_words_all_core.txt"))
    libfile.lines2file(sorted(list(skip_words_all_diff)),
                       getTheFile("local/skip_words/skip_words_all_diff.txt"))
Example #2
def test_is_question_baike():

    assert (is_question_baike(None) == False)

    import libfile

    filenames = [
        getTheFile("baike_questions_pos.txt"),
        getTheFile("baike_questions_neg.txt")
    ]
    counter = collections.Counter()
    regex_white1 = ur"你知道|我[国们]"
    regex_black = ur"你|我"
    # The trailing "?" must be escaped; a bare "?" after "|" is not a valid
    # pattern and would make re.compile() fail with "nothing to repeat".
    regex_white2 = ur"什么|最|哪|谁|百科|吗|是|有|多|怎么?样?|啥|如何|距离|历史|介绍|信息|\?"
    for filename in filenames:
        print "=====", filename
        lines = libfile.file2list(filename)

        for line in lines:
            if is_question_baike(line):
                actual = "_pos"
            else:
                actual = "_neg"

            if actual not in filename:
                counter["F"] += 1
                print line
            else:
                counter["T"] += 1
    total = counter["T"] + counter["F"]
    print counter, "error rate", 1.0 * counter["F"] / total
Example #3
def eval_fn():
    api = ZhidaoNlp(debug=False)
    filenames = [
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
        # ( getLocalFile("chat4xianliao/chat/input/xianer_all_question.txt"), 0 ),
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        (getTheFile("test/test_ask_baike_all.human.txt"), 0),
        (getTheFile("test/test_ask_chat_all.human.txt"), 0),
    ]
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])

    target_names = [u"正常", u"敏感词"]
    all_detected_skip_words = collections.Counter()
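    # Expose the counter on the api instance so fn_classify_0619 (invoked by
    # libdata.eval_fn below) can presumably tally which skip words fired.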
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)

    # false_positive / false_negative / true_negative appear to be
    # module-level collections populated while eval_fn runs.
    libfile.lines2file(
        false_positive,
        getTheFile("local/skip_words/chat8xianer12w_test_false_positive.txt"))
    libfile.lines2file(
        false_negative,
        getTheFile("local/skip_words/chat8xianer12w_test_false_negative.txt"))
    libfile.lines2file(
        libdata.items2sample(true_negative, min(1500, len(true_negative))),
        getTheFile("local/skip_words/chat8xianer12w_test_true_negative.txt"))
    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ all done"
Example #4
def eval_filter(query_filters=(1, 3, 2), flag_debug=False):
    api = ZhidaoNlp()
    api.debug = flag_debug
    for query_filter in query_filters:
        api.query_filter = query_filter

        if flag_debug:
            api.all_words = collections.Counter()

        filenames = [
            (getLocalFile("baike/baike_questions_pos.human.txt"), 1),
            (getLocalFile("baike/baike_questions_neg.human.txt"), 0),
            (getLocalFile("baike/baike_questions_chat.human.txt"), 0),
            (getTheFile("test/test_ask_baike_all.human.txt"), 1),
            (getTheFile("test/test_ask_chat_all.human.txt"), 0),
        ]

        tests = []
        all_query = set()
        for filename, expect in filenames:
            print "=====", filename
            entry = {"data": libfile.file2list(filename), "expect": expect}
            temp = set(entry["data"])
            temp.difference_update(all_query)
            entry["data"] = list(temp)
            all_query.update(entry["data"])
            tests.append(entry)
            #gcounter["from_{}".format(os.path.basename(filename))] = len(entry["data"])

        target_names = [u"不是", u"是百科"]
        libdata.eval_fn(tests, target_names, fn_query_filter, api)
        print json.dumps(gcounter, indent=4, sort_keys=True)

        if flag_debug:
            # api.all_words is presumably populated by fn_query_filter
            # during the eval pass above.
            for word, cnt in api.all_words.most_common(20):
                print word, cnt
Example #5

# [cx_freeze]
# When an error occurs, the stack trace is shown in a dialog,
# which has two problems:
# - it is long and technical, so it startles the user
# - it exposes the Python paths of the build environment
# To hide these, absorb the exception and present it in a dialog instead.
try:
    args = parse_arguments()

    if args.input is None:
        abort('-i option required.')
    if not libfile.is_file(args.input):
        abort('The datafile "{}" is invalid'.format(args.input))
    lines = libfile.file2list(args.input)

    # [cx_freeze]
    # os.path.abspath(os.path.dirname(__file__)) would be handy, but
    # __file__ is undefined inside the frozen executable, so we avoid it.
    SELF_FULLPATH = os.path.abspath(sys.argv[0])
    SELF_DIR = os.path.dirname(SELF_FULLPATH)

    wx, wy = args.windowx, args.windowy
    px, py = center_pos(wx, wy)
    isrh = libisearch.ISearcher()
    isrh.set_caption(CAPTION) \
        .set_window_rect(px, py, wx, wy) \
        .set_on_text(on_text) \
        .set_on_enter(on_enter) \
        .set_search_func(search_func)
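
# The original snippet is truncated above; what follows is a minimal sketch
# (an assumption, not from the source) of the handler the [cx_freeze] comment
# describes: absorb the exception and show a short dialog via abort() instead
# of the raw stack trace.
except Exception:
    import traceback
    abort('Unexpected error:\n{}'.format(traceback.format_exc()))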
Example #6
def learn_skip_words_0619():
    api = ZhidaoNlp(debug=True)

    print json.dumps(gcounter, ensure_ascii=False), "\n\n------ load all raw",
    skip_words_raw = collections.Counter()
    filenames = glob.glob(getTheFile("local/skip_words/skip_words_*.raw.txt"))
    for filename in filenames:
        for phrase in libfile.file2list(filename):
            gcounter["from_{}".format(os.path.basename(filename))] += 1
            skip_words_raw[phrase] += 1
    gcounter["skip_words_raw_loaded"] = len(skip_words_raw)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ generate clean",
    skip_words_clean = collections.Counter()
    for phrase in skip_words_raw:
        temp = api.cut_text(phrase)
        for word in temp:
            skip_words_clean[word] += skip_words_raw[phrase]
    gcounter["skip_words_clean"] = len(skip_words_clean)

    print json.dumps(
        gcounter, ensure_ascii=False), "\n\n------ estimate raw outside clean"
    skip_words_raw_diff = set(skip_words_raw)
    skip_words_raw_diff.difference_update(skip_words_clean)
    for phrase in libdata.items2sample(skip_words_raw_diff):
        print phrase, skip_words_raw[phrase]
    gcounter["skip_words_raw_diff"] = len(skip_words_raw_diff)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ load not clean "
    not_skip_words_clean = set()
    filenames = glob.glob(getTheFile("model/skip_words_no.human.txt"))
    for filename in filenames:
        for line in libfile.file2list(filename):
            if line not in not_skip_words_clean:
                gcounter["from_{}".format(os.path.basename(filename))] += 1
                not_skip_words_clean.add(line)
    gcounter["not_skip_words_clean_loaded"] = len(not_skip_words_clean)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ filter clean with not "
    skip_words_all = set(skip_words_clean)
    skip_words_all.difference_update(not_skip_words_clean)
    gcounter["skip_words_all"] = len(skip_words_all)
    filename = getTheFile("local/skip_words/test_question_all.auto.txt")
    libfile.lines2file(sorted(list(skip_words_all)), filename)

    print json.dumps(gcounter,
                     ensure_ascii=False), "\n\n------ eval performance"
    filenames = [
        (getTheFile("test/test_question_skip_no.human.txt"), 0),
        #        ( getTheFile("local/baike/baike_questions_pos.human.txt"), 0),
        #        [ getTheFile("local/baike/baike_questions_neg.human.txt"), 0 ],
        (getTheFile("test/test_question_skip_yes.human.txt"), 1),
    ]
    all_detected_skip_words = collections.Counter()
    counter = collections.Counter()
    tests = []
    for filename, expect in filenames:
        entry = {"data": libfile.file2list(filename), "expect": expect}
        tests.append(entry)
        gcounter["from_{}".format(os.path.basename(filename))] = len(
            entry["data"])

    target_names = [u"正常", u"敏感词"]
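    # Attach shared state to the api so fn_classify_0619 (called inside
    # libdata.eval_fn) can presumably read skip_words_all and tally hits.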
    setattr(api, "all_detected_skip_words", all_detected_skip_words)
    setattr(api, "skip_words_all", skip_words_all)
    libdata.eval_fn(tests, target_names, fn_classify_0619, api)
Example #7
def main():
    #print sys.argv

    if len(sys.argv) < 2:
        show_help()
        return

    option = sys.argv[1]

    if "eval_filter" == option:
        eval_filter()

    elif "debug_filter" == option:
        eval_filter([2], True)

    elif "test_is_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_is_baike_realtime
        api = ZhidaoNlp()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.is_question_baike(question, query_filter=query_filter)
            print question, ret, query_filter
        else:
            question = u"那月亮为什么会跟着我走"
            ret = api.is_question_baike(question)
            print question, ret
            assert (not ret)
            question = u"天空为什么是蓝色的"
            ret = api.is_question_baike(question)
            print question, ret
            assert (ret)

    elif "test_chat_realtime" == option:
        # python hzlib/task_api_zhidao.py test_chat_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_chat_cache" == option:
        # python hzlib/task_api_zhidao.py test_chat_cache

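        # Crawler/cache settings; the cache_server address looks like an
        # internal test instance.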
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": False,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            ret = api.search_chat_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            question = u"你喜欢蓝色么?"
            ret = api.search_chat_best(question)
            print question
            libdata.print_json(ret)

    elif "test_baike_realtime" == option:
        # python hzlib/task_api_zhidao.py test_baike_realtime
        api = ZhidaoFetch()
        if len(sys.argv) > 2:
            question = sys.argv[2]
            query_filter = 2
            if len(sys.argv) > 3:
                query_filter = int(sys.argv[3])
            print question, query_filter
            ret = api.search_baike_best(question, query_filter=query_filter)
            print question, query_filter
            libdata.print_json(ret)

        else:
            question = u"严重贫血怎么办"
            question = u"天空是什么颜色的?"
            ret = api.search_baike_best(question, keep_result=True)
            print question
            libdata.print_json(ret)

    elif option.startswith("test_baike_cache"):
        # python hzlib/task_api_zhidao.py test_baike_cache
        print "========"
        config = {
            "batch_id": "test-test-20160620",
            "length": 1,
            "crawl_http_method": "get",
            "crawl_gap": 1,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "debug": True,
            "cache_server": "http://192.168.1.179:8000"
        }
        api = ZhidaoFetch(config)
        if option == "test_baike_cache_one":
            #question = u"你喜欢蓝色么?"
            question = u"天空是什么颜色的?"
            question = u"掏粪男孩就是指TFboys吗?"
            question = u"爱因斯坦是谁"
            if len(sys.argv) > 2:
                question = sys.argv[2]
            ret = api.search_baike_best(question)
            libdata.print_json(ret)
            print question
        else:
            filename_question = sys.argv[2]
            questions = libfile.file2list(filename_question)

            if questions:
                filename = u"{}.temp".format(filename_question)
                libfile.lines2file(sorted(list(questions)), filename)

            print len(questions)
            results = []
            for question in questions:
                query_filter = 2
                if len(sys.argv) > 3:
                    query_filter = int(sys.argv[3])
                debug_item = {}
                ret = api.search_baike_best(question,
                                            query_filter=query_filter,
                                            debug_item=debug_item)
                print question, query_filter
                #libdata.print_json(ret)
                if not ret:
                    debug_item["best"] = u"异常"
                    debug_item["query"] = question
                    results.append(debug_item)

                elif not ret.get("items_all", []):
                    debug_item["query"] = question
                    debug_item["best"] = u"无结果"
                    results.append(debug_item)

                else:
                    for item in ret.get("items_all", []):
                        item["query"] = question
                        results.append(item)
                        if item["id"] == ret.get("best_qapair", {}).get("id"):
                            item["best"] = u"最优"
                        else:
                            item["best"] = u""

            filename = getLocalFile("temp/test_baike_cache.{}.xls".format(
                os.path.basename(filename_question)))
            fields = [
                u"标注", "best", "debug_note", "query", "answers", "match_score",
                "cnt_like", "cnt_answer", "question", "id", "answers_raw",
                "question_content"
            ]
            libfile.writeExcel(results, fields, filename)

    elif "test_jieba" == option:
        api = ZhidaoNlp()
        question = sys.argv[2]
        if not isinstance(question, unicode):
            question = question.decode("utf-8")

        temp = api.cut_text(question)
        print json.dumps(list(temp), ensure_ascii=False)

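        # pseg.cut() yields (word, POS tag) pairs, in the style of
        # jieba.posseg.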
        temp = api.pseg.cut(question)
        for word, pos in temp:
            print word, pos