Example #1
    def run_test_search_realtime(self, filename, limit):
        results = []
        counter = collections.Counter()

        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
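
Every example in this listing funnels its rows into a call shaped like libfile.writeExcel(items, keys, filename): items is a list of dicts, keys fixes the column order, and filename names the .xls target. libfile itself is not part of this listing; a minimal stand-in with the same calling convention, assuming xlwt and dict-shaped rows, might look like this:

import xlwt  # assumed dependency; the real libfile may use something else

def writeExcel(items, keys, filename):
    # Minimal sketch of the writeExcel interface used throughout these
    # examples; not the actual libfile implementation.
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("sheet1")
    for col, key in enumerate(keys):
        sheet.write(0, col, key)  # header row
    for row, item in enumerate(items, 1):
        for col, key in enumerate(keys):
            sheet.write(row, col, item.get(key, u""))  # missing keys -> blank cell
    book.save(filename)
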
Example #2
    def fudan_gen_excel(self, result):
        items = []
        keys = ['word', 'entity', 'attribute', 'value']
        filename = 'fudan_eav.xlsx'

        for word, entity, avps in result:
            for a, v in avps:
                items.append({'word': word.decode('utf-8'),
                              'entity': entity,
                              'attribute': a,
                              'value': v})

        writeExcel(items, keys, filename)
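
The word.decode('utf-8') call implies Python 2 byte strings on the way in, while the other fields are already unicode. A hypothetical input matching the shape this method iterates (values invented for illustration):

# Hypothetical input for fudan_gen_excel: each row is (word, entity, avps),
# where word is a UTF-8 byte string and avps is a list of
# (attribute, value) pairs. Values here are invented.
sample_result = [
    ("apple", u"fruit", [(u"color", u"red"), (u"taste", u"sweet")]),
]
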
Example #3
    def init_from_json(self):
        map_items = {}
        dirname = getLocalFile("raw/chat0708/*")
        for filename in glob.glob(dirname):
            src = os.path.basename(filename).replace(".txt", "")
            for line in libfile.file2list(filename):
                gcounter["total_" + src] += 1
                item = json.loads(line)
                item["source"] = src

                item["answers"] = clean_answer(item["answers"])
                item["question"] = clean_question(item["question"])
                if len(item["answers"]) < 2:
                    gcounter["items_skip_empty_answer"] += 1
                    continue

                label = ""
                skip_words = self.api_nlp.detect_skip_words(item["answers"])
                if skip_words:
                    label = u"敏感词:{}".format(u",".join(skip_words))
                    print label, item["answers"]
                    gcounter["minganci_answer"] += 1
                if not label:
                    # label purely numeric answers (dashes allowed)
                    if re.search(r"^[0-9\-]+$", item["answers"]):
                        label = "number"
                item["label"] = label

                q = item["question"]
                if q not in map_items:
                    map_items[q] = item
                    gcounter["from_" + src] += 1
                else:
                    gcounter["overwrite_" + src] += 1
                    # print the old answers before overwriting them
                    print "overwrite", q, src, map_items[q]["answers"], item["answers"]
                    map_items[q] = item

        gcounter["init_from_json"] = len(map_items)

        filename = getLocalFile("temp/qa0708chat.xls")
        items = sorted(map_items.values(), key=lambda x: x["question"])
        libfile.writeExcel(items, ["label", "question", "answers", "source"],
                           filename)
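
gcounter is used as a shared tally across these helpers but is never defined in the listing; given how it is indexed, it is presumably a module-level collections.Counter:

# Presumed (not shown in this listing): a module-level Counter. A Counter
# returns 0 for missing keys, which is what makes increments like
# gcounter["total_" + src] += 1 safe on first touch.
import collections
gcounter = collections.Counter()
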
Example #4
    def run_get_best_search_realtime(self, filename):
        results = []
        counter = collections.Counter()

        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(), counter[cnt_label], line
                counter[cnt_label] += 1

                ret_one = search_zhidao_best(line,
                                             query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]

                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]

                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
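
The fields read off ret_one imply a result shape like the one below. This is inferred from the example itself, not from search_zhidao_best's source, and the values are placeholders:

# Inferred shape of search_zhidao_best's return value (placeholder values):
sample_ret_one = {
    "query": u"...",
    "best_qapair": {
        "match_score": 0.97,
        "question": u"...",
        "answers": u"...",
        "answers_raw": u"...",
        "source": "zhidao",
        "result_index": 0,
    },
}
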
Example #5
def run_chat_realtime(query_filter, query_parser, limit):
    data = read_kidsfaq2w(limit)
    print "Length of sample data ",len(data)


    to_write_question = []
    to_write_answer = []

    for query in data:
        top3_item = agt.search_chat_top_n(query, 3, query_filter=query_filter, query_parser=query_parser)
        print libdata.print_json(top3_item)

        if top3_item:
            for key in ["qapair0", "qapair1", "qapair2"]:
                if key in top3_item:
                    one_item_question = {
                        "query" : query,
                        "q" : top3_item[key]["question"]
                    }

                    one_item_answer = {
                        "query" : query,
                        "a" : top3_item[key]["answers"]
                    }

                    to_write_question.append(one_item_question)
                    to_write_answer.append(one_item_answer)

                    print libdata.print_json(one_item_question)
                    print libdata.print_json(one_item_answer)
                else:
                    break

            print "===================================\n"
    libfile.writeExcel(to_write_question, ["query", "q"], getLocalFile(KIDS_2W_SAMPLE_RESULT_QUESTION))
    libfile.writeExcel(to_write_answer, ["query", "a"], getLocalFile(KIDS_2W_SAMPLE_RESULT_ANSWER))
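
The key probes "qapair0".."qapair2" suggest search_chat_top_n returns its top hits under numbered keys, with later keys absent when fewer results come back, which is why the loop breaks on the first missing one. A hypothetical two-hit return, inferred from the probes above with placeholder values:

# Hypothetical return of agt.search_chat_top_n(query, 3, ...) when only two
# hits are found; inferred from the key probes above, values are placeholders.
sample_top3_item = {
    "qapair0": {"question": u"...", "answers": u"..."},
    "qapair1": {"question": u"...", "answers": u"..."},
}
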
Example #6
def fetch_detail(worker_id=None, worker_num=None, limit=None, config_index="prod", filename_input=None, fetch_option="top_n", fetch_limit=100):
    flag_batch = (worker_id is not None and worker_num is not None and worker_num > 1)
    flag_prod = (config_index == "prod")
    flag_slack = (flag_prod and worker_id == 0)

    # filename_input is dereferenced on the next line, so bail out first
    if not filename_input:
        print "FATAL"
        return

    job_name = os.path.basename(filename_input).replace(".txt", "")
    output_dir = "output0623"
    if flag_batch:
        filename_output_xls = getLocalFile("{}/{}.{}_worker.xls".format(output_dir, job_name, worker_id))
        filename_output_xls2 = getLocalFile("{}/{}.{}_worker_query.xls".format(output_dir, job_name, worker_id))
        filename_output_json = getLocalFile("{}/{}.{}_worker.json.txt".format(output_dir, job_name, worker_id))
    else:
        filename_output_xls = getLocalFile("{}/{}.batch_{}.all.xls".format(output_dir, job_name, config_index))
        filename_output_xls2 = getLocalFile("{}/{}.batch_{}.all_query.xls".format(output_dir, job_name, config_index))
        filename_output_json = getLocalFile("{}/{}.batch_{}.all.json.txt".format(output_dir, job_name, config_index))

    CONFIG = {
        "local": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 3,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",
            "debug": True,
            # "cache_server": "http://52.196.166.54:8000",  # internal IP
            "cache_server": "http://192.168.1.179:8000",
        },
        "prod": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 5,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",
            "debug": False,
        },
    }
    print filename_input
    list_query = libfile.file2list(filename_input)
    print "Length of kidsfaq2w", len(list_query)

    config = CONFIG[config_index]
    #config = {}
    api = ZhidaoFetch(config)

    ts_start = time.time()
    ts_lap_start = time.time()
    counter = collections.Counter()
    if limit:
        step = len(list_query) / limit
        list_query = [list_query[i * step] for i in range(limit)]
    print len(list_query)

    if flag_slack:
        slack_msg(u"AWS {}/{}. run {} batch_id: {}, urls: {} debug: {}".format(
            worker_id,
            worker_num,
            config["note"],
            config["batch_id"],
            len(list_query),
            config.get("debug", False)))

    results = []
    # utf-8 so the non-ASCII json lines written below encode cleanly
    with codecs.open(filename_output_json, 'w', encoding='utf-8') as fjson:
        for query in list_query:
            if counter["visited"] % 1000 == 0:
                print datetime.datetime.now().isoformat(), counter
            counter["visited"] += 1
            if flag_batch:
                if (counter["visited"] % worker_num) != worker_id:
                    counter["skipped_peer"] += 1
                    continue

            counter["processed"] += 1
            if counter["processed"] % 1000 == 0:
                if flag_slack:
                    slack_msg("AWS {}/{}. working {}. lap {} seconds. {}".format(
                        worker_id,
                        worker_num,
                        config["batch_id"],
                        int(time.time() - ts_lap_start),
                        json.dumps(counter)))
                    ts_lap_start = time.time()

            if "search_all" == fetch_option:
                ret = api.search_all(query, limit=fetch_limit)
            else:
                ret = fn_fetch(query)

            if ret:
                ret["query"] = query
                fjson.write(u"{}\n".format(json.dumps(ret, ensure_ascii=False)))

            if ret and ret.get("items"):
                counter["has_result"] += 1
                counter["total_qa"] += len(ret["items"])
                if config.get("debug"):
                    print len(ret["items"]), json.dumps(ret, ensure_ascii=False)
                for item in ret["items"]:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    item["query"] = query
                    for p in ["source"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
            else:
                counter["missing_data"] += 1



    #libfile.writeExcel(results, ["id", "source", "result_index", "cnt_like", "cnt_answer", "query", "question_id", "question", "answers"], filename_output_xls)
    #libfile.writeExcel(results, ["id", "is_good", "match_score", "result_index", "cnt_like", "cnt_answer", "query", "question", "answers"], filename_output_xls, page_size=5000)
    #print filename_output_xls
    #libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls)
    libfile.writeExcel(results, ["label", "question", "answers"], filename_output_xls)
    libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls2)

    duration_sec = int(time.time() - ts_start)
    print "all done, seconds", duration_sec, duration_sec / counter["visited"], counter

    if flag_slack:
        slack_msg("AWS {}/{}. done {}. total {} seconds".format(
            worker_id,
            worker_num,
            config["batch_id"],
            duration_sec))
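
The flag_batch branch shards the query list across processes by residue class: worker k of n keeps only items whose running index is congruent to k mod n, so the workers cover the input disjointly without coordinating. The same idea as a stripped-down sketch (names here are illustrative, not from the listing):

# Stripped-down sketch of the modulo sharding used in fetch_detail:
# worker worker_id of worker_num processes item i only when
# i % worker_num == worker_id.
def shard(items, worker_id, worker_num):
    for i, item in enumerate(items):
        if i % worker_num != worker_id:
            continue
        yield item

# e.g. worker 1 of 3 sees items 1, 4, 7, ...
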
Example #7
    def init_zhidao_qa(self):
        #clean rewrite

        dataset_index_list = [
            "qa0708query",
            "qa0708question",
        ]
        for dataset_index in dataset_index_list:
            dirname = getLocalFile("raw/{}/*".format(dataset_index))
            map_items = {}
            for filename in glob.glob(dirname):

                gcounter["files"] += 1
                ret = libfile.readExcel(
                    ["category", "question", "answers", "type"],
                    filename,
                    start_row=1)
                if ret:
                    for items in ret.values():
                        for item in items:
                            gcounter["items"] += 1

                            qa = u"{}{}".format(item["question"],
                                                item["answers"])
                            item["id"] = es_api.gen_es_id(qa)
                            if item["id"] in map_items:
                                gcounter["items_skip_dup"] += 1
                                continue

                            if item["type"] not in [1, "1"]:
                                gcounter["items_skip_drop"] += 1
                                continue

                            item["answers"] = clean_answer(u"{}".format(
                                item["answers"]))
                            item["question"] = clean_question(u"{}".format(
                                item["question"]))
                            if len(item["answers"]) < 2:
                                gcounter["items_skip_empty_answer"] += 1
                                continue

                            skip_words = self.api_nlp.detect_skip_words(
                                item["answers"],
                                check_list=[
                                    "skip_words_all", "skip_words_zhidao"
                                ])
                            if skip_words:
                                print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", item["answers"]
                                gcounter["items_skip_minganci"] += 1
                                continue

                            item_new = {"source": dataset_index}
                            for p in ["question", "answers", "id"]:
                                item_new[p] = item[p]
                            map_items[item_new["question"]] = item_new

            gcounter["init_from_{}".format(dataset_index)] = len(map_items)
            print len(map_items)

            filename = getLocalFile("temp/{}.xls".format(dataset_index))
            items = sorted(map_items.values(), key=lambda x: x["question"])
            libfile.writeExcel(items,
                               ["label", "question", "answers", "source"],
                               filename)
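
Deduplication here keys on es_api.gen_es_id(question + answers), i.e. a deterministic id derived from the QA text, so identical pairs collide in map_items and get skipped. gen_es_id is not shown in this listing; a plausible stand-in is a hex digest of the UTF-8 bytes:

# Plausible stand-in for es_api.gen_es_id (the real implementation is not
# shown here): a deterministic content hash, so equal texts get equal ids.
import hashlib

def gen_es_id(text):
    if isinstance(text, unicode):
        text = text.encode("utf-8")
    return hashlib.md5(text).hexdigest()
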
Example #8
    def init_xianer7w_rewrite(self):
        dataset_index = "xianer7w_rewrite"
        gcounter[dataset_index] = 1

        ids = set()

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #collect answer mapping
        map_answers = {}
        for item in ret.values()[0]:
            # reset per row; otherwise a missing cell would reuse the previous
            # row's value (or, on the first row, leave the name unbound)
            a_old = item["old_answers"].strip() if item.get("old_answers") else ""
            a = item["answers"].strip() if item.get("answers") else ""
            if a and a_old:
                map_answers[a_old] = a

        print len(map_answers)

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #use mapping
        items = []
        for item in ret.values()[0]:
            gcounter["items"] += 1
            q = item["question"]
            if item["old_answers"]:
                a = map_answers.get(item["old_answers"])
            else:
                a = ""

            if not a:
                #print "SKIP no mapping", q, item["old_answers"]
                gcounter["items_no_mapping"] += 1
                continue

            qa = q + a
            item["id"] = es_api.gen_es_id(q)
            if item["id"] in ids:
                gcounter["items_skip_dup"] += 1
                continue

            skip_words = self.api_nlp.detect_skip_words(
                qa, check_list=["skip_words_all"])
            if skip_words:
                print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", a
                gcounter["items_skip_minganci"] += 1
                continue

            ids.add(item["id"])
            item_new = {
                "question": q,
                "answers": a,
                "id": item["id"],
            }
            items.append(item_new)

        gcounter["qa0708rewrite"] = len(ids)

        filename = getLocalFile("temp/qa0708rewrite.xls")
        libfile.writeExcel(items, ["label", "question", "answers"], filename)
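
Both readExcel calls are consumed through ret.values()[0], which suggests libfile.readExcel returns a dict keyed by sheet name whose values are lists of row dicts (in Python 2, dict.values() is a plain list, so [0] picks the first sheet). A hypothetical return shape, inferred from this usage rather than from libfile's source:

# Hypothetical readExcel return shape (placeholder values):
sample_ret = {
    u"Sheet1": [
        {"question": u"...", "old_answers": u"...", "answers": u"..."},
    ],
}
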
Example #9
    def _merge_chat(self, filenames, option):
        filename_todo = getLocalFile("input/{}_todo.txt".format(option))
        print "filename_todo", filename_todo
        q_todo = set()
        if os.path.exists(filename_todo):
            q_todo = libfile.file2set(filename_todo)
            gcounter["q_todo"] = len(q_todo)
            print "filename_todo", filename_todo, len(q_todo)

        filename_skip = getLocalFile("input/{}_skip.txt".format(option))
        print "filename_skip", filename_skip
        q_skip = set()
        if os.path.exists(filename_skip):
            q_skip = libfile.file2set(filename_skip)
            gcounter["q_skip"] = len(q_skip)
            print "filename_skip", filename_skip, len(q_skip)

        data = {}
        q_all = set()
        for filename in filenames:
            #print filename
            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers"],
                                    filename,
                                    start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        q_all.add(item["question"])

                        if q_skip and item["question"] in q_skip:
                            gcounter["items_skip"] += 1
                            continue

                        item["id"] = es_api.gen_es_id(item["question"] +
                                                      item["answers"])
                        if item["id"] in data:
                            continue

                        for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                            if dataset_index in filename:
                                gcounter["from_" + dataset_index] += 1

                        label = self.filter_qa_by_label(
                            item["category"], item["question"],
                            item["answers"])
                        if label:
                            item["label"] = label
                            #print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                            #gcounter["_esdata_label_{}".format(label)]+=1
                        #elif not self.api_nlp.is_question_baike(item["question"]):
                        #    item["label"] = u"百科"
                        else:
                            item["label"] = u""
                        xlabel = re.sub(":.*$", "", item["label"])
                        gcounter["data_with_label_{}".format(xlabel)] += 1

                        data[item["id"]] = item
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]

        gcounter["data"] = len(data)
        results = sorted(data.values(), key=lambda x: x["question"])
        print len(data), len(results)
        filename_output = getLocalFile("output/edit_{}.xls".format(option))
        libfile.writeExcel(results, ["label", "question", "answers"],
                           filename_output)

        filename_output = getLocalFile(
            "edit0623/sample1000_edit_{}.xls".format(option))
        libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                           ["label", "question", "answers"], filename_output)

        if q_todo:
            q_todo.difference_update(q_all)
            filename_output = getLocalFile(
                "edit0623/question_miss_{}.xls".format(option))
            libfile.lines2file(sorted(list(q_todo)), filename_output)
            gcounter["q_all"] = len(q_all)
            gcounter["q_miss"] = len(q_todo)

        page_size = 2000
        max_page = len(results) / page_size + 1
        for i in range(max_page):
            filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(
                option, i))
            #print filename_output
            idx_start = i * page_size
            idx_end = min(len(results), (i + 1) * page_size)
            libfile.writeExcel(results[idx_start:idx_end],
                               ["label", "question", "answers"],
                               filename_output)
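
The closing loop pages results into 2,000-row files by slicing. The same idea as a small generic helper; note that max_page = len(results) / page_size + 1 emits one trailing empty page whenever len(results) is an exact multiple of page_size, which the range-with-step form below avoids:

# Generic version of the paging loop above (illustrative, not from the
# listing): yield fixed-size slices, with no empty trailing page.
def paginate(items, page_size):
    for start in range(0, len(items), page_size):
        yield items[start:start + page_size]
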