def run_test_search_realtime(self, filename, limit):
    results = []
    counter = collections.Counter()
    with codecs.open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            if not line:
                continue
            ret = self.run_query(line, limit)
            counter["query"] += 1
            for item in ret:
                #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                results.append(item)
                for p in ["source", "result_index"]:
                    counter["{}_{}".format(p, item[p])] += 1
                for p in ["question", "answers"]:
                    if p in item:
                        if not isinstance(item[p], unicode):
                            item[p] = item[p].decode("gb18030")
    filename_output = getLocalFile(
        os.path.basename(filename.replace("human.txt", "xls")))
    libfile.writeExcel(results, [
        "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
        "question_id", "question", "answers"
    ], filename_output)
    #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
    print counter
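# The isinstance(..., unicode) / decode("gb18030") guard above recurs in
# several functions below. A minimal sketch of the same idea as a helper
# (hypothetical name `ensure_unicode`, not part of the original module):
#
#   def ensure_unicode(value, encoding="gb18030"):
#       """Return `value` as unicode, decoding byte strings if needed."""
#       if isinstance(value, unicode):
#           return value
#       return value.decode(encoding)
#
#   # usage: item[p] = ensure_unicode(item[p])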
def fudan_gen_excel(self, result):
    items = []
    keys = ['word', 'entity', 'attribute', 'value']
    filename = 'fudan_eav.xlsx'
    for word, entity, avps in result:
        for a, v in avps:
            items.append({
                'word': word.decode('utf-8'),
                'entity': entity,
                'attribute': a,
                'value': v
            })
    writeExcel(items, keys, filename)
def init_from_json(self):
    map_items = {}
    dirname = getLocalFile("raw/chat0708/*")
    for filename in glob.glob(dirname):
        src = os.path.basename(filename).replace(".txt", "")
        for line in libfile.file2list(filename):
            gcounter["total_" + src] += 1
            item = json.loads(line)
            item["source"] = src
            item["answers"] = clean_answer(item["answers"])
            item["question"] = clean_question(item["question"])
            if len(item["answers"]) < 2:
                gcounter["items_skip_empty_answer"] += 1
                continue

            label = ""
            if not label:
                label = self.api_nlp.detect_skip_words(item["answers"])
                if label:
                    label = u"敏感词:{}".format(u",".join(label))
                    print label, item["answers"]
                    gcounter["minganci_answer"] += 1
                else:
                    label = ""
            if not label:
                # answers consisting only of digits/dashes are labeled "number"
                if re.search(u"^[0-9\\-]+$", item["answers"]):
                    label = "number"
            item["label"] = label

            q = item["question"]
            if q not in map_items:
                map_items[q] = item
                gcounter["from_" + src] += 1
            else:
                # log the old answer before it is replaced
                print "overwrite", q, src, map_items[q]["answers"], item["answers"]
                map_items[q] = item
                gcounter["overwrite_" + src] += 1
    gcounter["init_from_json"] = len(map_items)
    filename = getLocalFile("temp/qa0708chat.xls")
    items = sorted(map_items.values(), key=lambda x: x["question"])
    libfile.writeExcel(items, ["label", "question", "answers", "source"],
                       filename)
def run_get_best_search_realtime(self, filename):
    results = []
    counter = collections.Counter()
    lines = libfile.file2list(filename)
    for query_parser in [0]:
        for line in sorted(lines):
            cnt_label = "query_{}".format(query_parser)
            if counter[cnt_label] % 10 == 0:
                print datetime.datetime.now().isoformat(), counter[cnt_label], line
            counter[cnt_label] += 1
            ret_one = search_zhidao_best(line,
                                         query_filter=0,
                                         query_parser=query_parser)
            if ret_one:
                item = ret_one["best_qapair"]
                print "=====>", line
                print "------", item["match_score"], item["question"]
                print item["answers"], "*******", item["answers_raw"][len(item["answers"]):]
                for p in ["query"]:
                    item[p] = ret_one[p]
                #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                results.append(item)
                for p in ["source", "result_index"]:
                    counter["{}_{}".format(p, item[p])] += 1
                for p in ["question", "answers"]:
                    if p in item:
                        if not isinstance(item[p], unicode):
                            item[p] = item[p].decode("gb18030")
    filename_output = getLocalFile(
        os.path.basename(filename.replace("human.txt", "xls")))
    libfile.writeExcel(results, [
        "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
        "question_id", "question", "answers"
    ], filename_output)
    #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
    print counter
def run_chat_realtime(query_filter, query_parser, limit):
    data = read_kidsfaq2w(limit)
    print "Length of sample data ", len(data)
    to_write_question = []
    to_write_answer = []
    for query in data:
        top3_item = agt.search_chat_top_n(query, 3,
                                          query_filter=query_filter,
                                          query_parser=query_parser)
        print libdata.print_json(top3_item)
        if top3_item:
            for key in ["qapair0", "qapair1", "qapair2"]:
                if key in top3_item:
                    one_item_question = {
                        "query": query,
                        "q": top3_item[key]["question"]
                    }
                    one_item_answer = {
                        "query": query,
                        "a": top3_item[key]["answers"]
                    }
                    to_write_question.append(one_item_question)
                    to_write_answer.append(one_item_answer)
                    print libdata.print_json(one_item_question)
                    print libdata.print_json(one_item_answer)
                else:
                    break
        print "===================================\n"
    libfile.writeExcel(to_write_question, ["query", "q"],
                       getLocalFile(KIDS_2W_SAMPLE_RESULT_QUESTION))
    libfile.writeExcel(to_write_answer, ["query", "a"],
                       getLocalFile(KIDS_2W_SAMPLE_RESULT_ANSWER))
def fetch_detail(worker_id=None, worker_num=None, limit=None,
                 config_index="prod", filename_input=None,
                 fetch_option="top_n", fetch_limit=100):
    flag_batch = (worker_id is not None and worker_num is not None
                  and worker_num > 1)
    flag_prod = (config_index == "prod")
    flag_slack = (flag_prod and worker_id == 0)
    job_name = os.path.basename(filename_input).replace(".txt", "")
    output_dir = "output0623"
    if flag_batch:
        filename_output_xls = getLocalFile("{}/{}.{}_worker.xls".format(output_dir, job_name, worker_id))
        filename_output_xls2 = getLocalFile("{}/{}.{}_worker_query.xls".format(output_dir, job_name, worker_id))
        filename_output_json = getLocalFile("{}/{}.{}_worker.json.txt".format(output_dir, job_name, worker_id))
    else:
        filename_output_xls = getLocalFile("{}/{}.batch_{}.all.xls".format(output_dir, job_name, config_index))
        filename_output_xls2 = getLocalFile("{}/{}.batch_{}.all_query.xls".format(output_dir, job_name, config_index))
        filename_output_json = getLocalFile("{}/{}.batch_{}.all.json.txt".format(output_dir, job_name, config_index))

    CONFIG = {
        "local": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 3,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",
            "debug": True,
            # "cache_server": "http://52.196.166.54:8000"  # internal IP
            "cache_server": "http://192.168.1.179:8000"
        },
        "prod": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 5,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",
            "debug": False,
        }
    }

    print filename_input
    if not filename_input:
        print "FATAL: missing filename_input"
        return
    else:
        list_query = libfile.file2list(filename_input)
        print "Length of kidsfaq2w ", len(list_query)

    config = CONFIG[config_index]
    #config = {}
    api = ZhidaoFetch(config)
    ts_start = time.time()
    ts_lap_start = time.time()
    counter = collections.Counter()
    if limit:
        # sample at most `limit` queries evenly across the whole list
        step = max(1, len(list_query) / limit)
        list_query = list_query[::step][:limit]
        print len(list_query)

    if flag_slack:
        slack_msg(u"AWS {}/{}. run {} batch_id: {}, urls: {} debug: {}".format(
            worker_id, worker_num, config["note"], config["batch_id"],
            len(list_query), config.get("debug", False)))

    results = []
    with codecs.open(filename_output_json, 'w') as fjson:
        for query in list_query:
            if counter["visited"] % 1000 == 0:
                print datetime.datetime.now().isoformat(), counter
            counter["visited"] += 1
            if flag_batch:
                # round-robin sharding: this worker keeps every worker_num-th query
                if (counter["visited"] % worker_num) != worker_id:
                    counter["skipped_peer"] += 1
                    continue
            counter["processed"] += 1
            if counter["processed"] % 1000 == 0:
                if flag_slack:
                    slack_msg("AWS {}/{}. working {}. lap {} seconds. \n{}".format(
                        worker_id, worker_num, config["batch_id"],
                        int(time.time() - ts_lap_start), json.dumps(counter)))
                ts_lap_start = time.time()

            if "search_all" == fetch_option:
                ret = api.search_all(query, limit=fetch_limit)
            else:
                ret = fn_fetch(query)

            if ret:
                ret["query"] = query
                fjson.write(u"{}\n".format(json.dumps(ret, ensure_ascii=False)))
            if ret and ret.get("items"):
                counter["has_result"] += 1
                counter["total_qa"] += len(ret["items"])
                if config.get("debug"):
                    print len(ret["items"]), json.dumps(ret, ensure_ascii=False)
                for item in ret["items"]:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    item["query"] = query
                    for p in ["source"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
            else:
                counter["missing_data"] += 1

    #libfile.writeExcel(results, ["id", "source", "result_index", "cnt_like", "cnt_answer", "query", "question_id", "question", "answers"], filename_output_xls)
    #libfile.writeExcel(results, ["id", "is_good", "match_score", "result_index", "cnt_like", "cnt_answer", "query", "question", "answers"], filename_output_xls, page_size=5000)
    #print filename_output_xls
    #libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls)
    libfile.writeExcel(results, ["label", "question", "answers"], filename_output_xls)
    libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls2)

    duration_sec = int(time.time() - ts_start)
    print "all done, seconds", duration_sec, duration_sec / counter["visited"], counter
    if flag_slack:
        slack_msg("AWS {}/{}. done {}. total {} seconds".format(
            worker_id, worker_num, config["batch_id"], duration_sec))
def init_zhidao_qa(self):
    #clean rewrite
    dataset_index_list = [
        "qa0708query",
        "qa0708question",
    ]
    for dataset_index in dataset_index_list:
        dirname = getLocalFile("raw/{}/*".format(dataset_index))
        map_items = {}
        # ids dedupes on the question+answer hash; map_items is keyed by question
        ids = set()
        for filename in glob.glob(dirname):
            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers", "type"],
                                    filename, start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1
                        qa = u"{}{}".format(item["question"], item["answers"])
                        item["id"] = es_api.gen_es_id(qa)
                        if item["id"] in ids:
                            gcounter["items_skip_dup"] += 1
                            continue
                        if not item["type"] in [1, "1"]:
                            gcounter["items_skip_drop"] += 1
                            continue
                        item["answers"] = clean_answer(u"{}".format(item["answers"]))
                        item["question"] = clean_question(u"{}".format(item["question"]))
                        if len(item["answers"]) < 2:
                            gcounter["items_skip_empty_answer"] += 1
                            continue
                        skip_words = self.api_nlp.detect_skip_words(
                            item["answers"],
                            check_list=["skip_words_all", "skip_words_zhidao"])
                        if skip_words:
                            print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", item["answers"]
                            gcounter["items_skip_minganci"] += 1
                            continue
                        ids.add(item["id"])
                        item_new = {"source": dataset_index}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        map_items[item_new["question"]] = item_new
        gcounter["init_from_{}".format(dataset_index)] = len(map_items)
        print len(map_items)
        filename = getLocalFile("temp/{}.xls".format(dataset_index))
        items = sorted(map_items.values(), key=lambda x: x["question"])
        libfile.writeExcel(items, ["label", "question", "answers", "source"],
                           filename)
def init_xianer7w_rewrite(self):
    dataset_index = "xianer7w_rewrite"
    gcounter[dataset_index] = 1
    ids = set()

    filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx")
    print filename
    gcounter["files"] += 1
    ret = libfile.readExcel(["question", "old_answers", "answers"],
                            filename, start_row=0)

    # collect answer mapping: old answer text -> rewritten answer text
    map_answers = {}
    for item in ret.values()[0]:
        if item.get("old_answers"):
            a_old = item["old_answers"].strip()
            if item.get("answers"):
                a = item["answers"].strip()
                if a and a_old:
                    map_answers[a_old] = a
    print len(map_answers)

    filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx")
    print filename
    gcounter["files"] += 1
    ret = libfile.readExcel(["question", "old_answers", "answers"],
                            filename, start_row=0)

    # apply the mapping
    items = []
    for item in ret.values()[0]:
        gcounter["items"] += 1
        q = item["question"]
        if item["old_answers"]:
            a = map_answers.get(item["old_answers"])
        else:
            a = ""
        if not a:
            #print "SKIP no mapping", q, item["old_answers"]
            gcounter["items_no_mapping"] += 1
            continue
        qa = q + a
        item["id"] = es_api.gen_es_id(q)
        if item["id"] in ids:
            gcounter["items_skip_dup"] += 1
            continue
        skip_words = self.api_nlp.detect_skip_words(qa,
                                                    check_list=["skip_words_all"])
        if skip_words:
            print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", a
            gcounter["items_skip_minganci"] += 1
            continue
        ids.add(item["id"])
        item_new = {
            "question": q,
            "answers": a,
            "id": item["id"],
        }
        items.append(item_new)

    gcounter["qa0708rewrite"] = len(ids)
    filename = getLocalFile("temp/qa0708rewrite.xls")
    libfile.writeExcel(items, ["label", "question", "answers"], filename)
def _merge_chat(self, filenames, option):
    filename_todo = getLocalFile("input/{}_todo.txt".format(option))
    print "filename_todo", filename_todo
    q_todo = set()
    if os.path.exists(filename_todo):
        q_todo = libfile.file2set(filename_todo)
        gcounter["q_todo"] = len(q_todo)
        print "filename_todo", filename_todo, len(q_todo)

    filename_skip = getLocalFile("input/{}_skip.txt".format(option))
    print "filename_skip", filename_skip
    q_skip = set()
    if os.path.exists(filename_skip):
        q_skip = libfile.file2set(filename_skip)
        gcounter["q_skip"] = len(q_skip)
        print "filename_skip", filename_skip, len(q_skip)

    data = {}
    q_all = set()
    for filename in filenames:
        #print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(["category", "question", "answers"],
                                filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    q_all.add(item["question"])
                    if q_skip and item["question"] in q_skip:
                        gcounter["items_skip"] += 1
                        continue
                    item["id"] = es_api.gen_es_id(item["question"] + item["answers"])
                    if item["id"] in data:
                        continue
                    for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                        if dataset_index in filename:
                            gcounter["from_" + dataset_index] += 1
                    label = self.filter_qa_by_label(item["category"],
                                                    item["question"],
                                                    item["answers"])
                    if label:
                        item["label"] = label
                        #print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                        #gcounter["_esdata_label_{}".format(label)]+=1
                    #elif not self.api_nlp.is_question_baike(item["question"]):
                    #    item["label"] = u"百科"
                    else:
                        item["label"] = u""
                    xlabel = re.sub(":.*$", "", item["label"])
                    gcounter["data_with_label_{}".format(xlabel)] += 1
                    data[item["id"]] = item
                    item_new = {}
                    for p in ["question", "answers", "id"]:
                        item_new[p] = item[p]

    gcounter["data"] = len(data)
    results = sorted(data.values(), key=lambda x: x["question"])
    print len(data), len(results)

    filename_output = getLocalFile("output/edit_{}.xls".format(option))
    libfile.writeExcel(results, ["label", "question", "answers"],
                       filename_output)

    filename_output = getLocalFile("edit0623/sample1000_edit_{}.xls".format(option))
    libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                       ["label", "question", "answers"], filename_output)

    if q_todo:
        q_todo.difference_update(q_all)
        filename_output = getLocalFile("edit0623/question_miss_{}.xls".format(option))
        libfile.lines2file(sorted(list(q_todo)), filename_output)
        gcounter["q_all"] = len(q_all)
        gcounter["q_miss"] = len(q_todo)

    # write results in pages of 2000 rows per xls file
    page_size = 2000
    max_page = len(results) / page_size + 1
    for i in range(max_page):
        filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(option, i))
        #print filename_output
        idx_start = i * page_size
        idx_end = min(len(results), (i + 1) * page_size)
        libfile.writeExcel(results[idx_start:idx_end],
                           ["label", "question", "answers"], filename_output)
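# The paging loop above writes `results` in fixed-size chunks. A minimal
# sketch of the same pattern as a reusable generator (hypothetical helper
# `paginate`, not part of the original module):
#
#   def paginate(rows, page_size=2000):
#       """Yield (page_index, slice) pairs covering `rows` in order."""
#       for i in range(0, len(rows), page_size):
#           yield i // page_size, rows[i:i + page_size]
#
#   # usage:
#   #   for page_no, chunk in paginate(results):
#   #       libfile.writeExcel(chunk, ["label", "question", "answers"],
#   #                          getLocalFile("edit0623/edit_{}_{}.xls".format(option, page_no)))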