def PM2_by_IRSyS_without_query_feedback(topics=EBOLA_TOPICS, w=None, suggestor=None, if_use_clean_text=False, boost_params=1e11, tot_itr_times=2, every_itr_doc_cnt=5, use_subquery_cnt=5, result_div_lmd=0.5, lm_lmd=2000.0): logging.info("init IR sys...") solrs = get_all_ebola_solrs() print "solr cnt:", len(solrs) irsys = IRSys(solrs, ws=w) logging.info("loading... LMD...") lm = LMDirichlet(lmd=lm_lmd) if if_use_clean_text: lm.load(LMDirichlet_clean_Json) else: lm.load(LMDirichlet_Json) jig = JigClient(tot_itr_times=tot_itr_times) for tid, topic in topics: print "tot_itr_times:", tot_itr_times print "every_itr_doc_cnt:", every_itr_doc_cnt print "use_subquery_cnt:", use_subquery_cnt print "lm_lmd:", lm_lmd print "result_div_lmd:", result_div_lmd logging.info("prepare data for %s" % topic) already_select_key_set = set() D = [] #TODO 求vs... 还是按照subquery的能检索出来的东西来算 subquerys_vs = [] subquerys = suggestor.get_subquery_by_topic_id(tid) logging.info("init PM2 for %s" % topic) docs_list = irsys.retrieve_docs([topic], with_query_field=True)[0:1000] docs_list = preproces_docs_list(docs_list) R_left = get_R_left(docs_list, already_select_key_set) pm2 = PM2(subquerys, subquerys_vs, R_left, lmd=result_div_lmd) for i in range(tot_itr_times): if i == 0: file_cnt = 0 for _ in docs_list: if _[0] not in already_select_key_set: already_select_key_set.add(_) D.append(docs_list[1][2][KEY]) R_left.remove(_[1][2]) file_cnt += 1 if file_cnt >= every_itr_doc_cnt: break # elif i == 1: else: #TODO:根据迭代的轮次做修改 subquerys = suggestor.get_subquery_by_topic_id(tid) ranked_docs = pm2.select_doc()
def OLD_xQuAD__without_query_feedback_select_one_by_one_cos_sim_wc( topics=EBOLA_TOPICS, w=None, suggestor=None, if_use_clean_text=False, boost_params=1, if_stem=True, candidate_doc_cnt=700): tot_itr_times = 2 every_itr_doc_cnt = 5 use_subquery_cnt = 5 lm_lmd = 1.0 xquad_lmd = 0.6 logging.info("loading idf dict") idf_dict = json.load(codecs.open(STEM_IDF_DICT_EBOLA, 'r', 'utf-8')) print "tot word BEFORE to str cnt:", len(idf_dict.items()) err_cnt = 0 for k in idf_dict.keys(): v = idf_dict[k] idf_dict.pop(k) try: k = str(k) idf_dict[k] = v except: err_cnt += 1 # print "UNICODE TO STR ERR:", k print "UNICODE TO STR ERR CNT:", err_cnt print "tot word after to str cnt:", len(idf_dict.items()) # from src.utils.data_utils import basic_preprocess logging.info("loading... LMD...") lm = LMDirichlet(lmd=lm_lmd) if if_use_clean_text: print "load:", LMDirichlet_without_stem_lower lm.load(LMDirichlet_clean_Json) else: print "load:", LMDirichlet_without_stem_lower # lm.load(LMDirichlet_Json) lm.load(LMDirichlet_without_stem_lower) logging.info("initing xQuAD...") xquad = xQuAD(lm, lmd=xquad_lmd, alpha=1.0) logging.info("get all solrs...") solrs = get_all_ebola_solrs() print "solr cnt:", len(solrs) # w = [1] * len(solrs) # w = [3, 1, 1, 1, 1] #提高1.5% irsys = IRSys(solrs, ws=w) # jig = JigClient(tot_itr_times=tot_itr_times) jig = JigClient_OLD(tot_itr_times=2, base_jig_dir=EBOLA_POLAR_JIG_DIR) for tid, topic in topics: print "tot_itr_times:", tot_itr_times print "every_itr_doc_cnt:", every_itr_doc_cnt print "use_subquery_cnt:", use_subquery_cnt print "lm_lmd:", lm_lmd print "xquad_lmd:", xquad_lmd print "if_stem:", if_stem print "candidate doc cnt:", candidate_doc_cnt # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的 already_select_key_set = set() D = [] logging.info("search for topic %s %s" % (tid, topic)) logging.info("preprocess data...") # query_word_list = basic_preprocess(topic, if_lower=True, if_stem=if_stem) query_word_list = basic_preprocess_for_query(topic, if_lower=True, if_stem=if_stem) print "===> !!!! query_word_list:", query_word_list for _ in query_word_list: if not lm.C.has_key(_): print "!!!!==> LM not has key:", _ docs_list = irsys.retrieve_docs( [topic], with_query_field=True)[0:candidate_doc_cnt] docs_list = preproces_docs_list(docs_list, if_stem=if_stem) logging.info("cal dcs...") dc_dict = cal_dc_dicts(docs_list) check_cnt = 0 print "??????????++++++!!!!!!!!!>>>>>>>>>CHECK DC DICT :" for k in dc_dict.keys(): print "dc k,v:", k, dc_dict[k] check_cnt += 1 if check_cnt >= 1: break subquerys = suggestor.get_subquery_by_topic_id( tid, if_related=False)[0:use_subquery_cnt] # subquerys = clean_subquerys_to_query_lists(subquerys, lm, if_stem=if_stem) subquerys = clean_subquerys_to_query_lists_and_filter_query( subquerys, lm, if_stem=if_stem, query_words=query_word_list) print "===> subqueries:", subquerys file_ptr = 0 for i in range(tot_itr_times): print "itr:", i, " tid:", tid this_itr_select_docs = [] if i == 0 or len(subquerys) == 0: if len(subquerys) == 0: print "======@@@@@@@@@@@@> subquery cnt is zero, tid, topic:", tid, topic print docs_list[0] while len(this_itr_select_docs) < 5 and file_ptr < len( docs_list): if docs_list[file_ptr][0] in already_select_key_set: continue this_itr_select_docs.append(docs_list[file_ptr][1][2]) already_select_key_set.add( docs_list[file_ptr][1][2]['key']) # D.append( docs_list[file_ptr][1][2] ) file_ptr += 1 jig_format_docs = irsys.items2jigdocs( docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt + every_itr_doc_cnt] jig.run_itr(jig_format_docs, topic_id=tid) # elif i == 1: else: #use xQuAD to select best docs R_left = get_R_left(docs_list, already_select_key_set) ranked_docs = [] for ixquad_selected in range(every_itr_doc_cnt): ranked_docs = xquad.select_doc_u_cos( query_word_list, R_left, D, subquerys, dc_dicts=dc_dict, ret_rel_div_score=True) d = ranked_docs[ 0] #这个d的格式是[doc{}, xquad score, rel_score, div_score格式] # if i == 0: if d[0][KEY] in already_select_key_set: print "############!!!!!!!!!ERROR >>>>>>>>>>>> SELECT DUP:", d[ KEY] print "-----CHECK SCORE SELECTED, [ xquad score, rel_score, div_score格式]->>>:", d[ 1:] #TODO:这里需要检查一下要不要加D D.append(d[0]) D[-1][SCORE] = d[1] already_select_key_set.add(d[0][KEY]) R_left.remove(d[0]) # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt] # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt] this_itr_select_docs = [] for i, _ in enumerate(ranked_docs): if _[0][KEY] in already_select_key_set: continue this_itr_select_docs.append(_) if len(this_itr_select_docs) >= 5: if i >= 5: print "^^^^^^^^^ [ERROR] ThErE must be DUP......!!, i:", i break jig_format_docs = [] for d in this_itr_select_docs: #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的... jig_format_docs.append((0, d[0][KEY], d[1] * boost_params)) iresult = jig.run_itr(jig_format_docs, topic_id=tid) if iresult is not None: print "itr result , i:", i if type(iresult) == list: for _ in iresult: print _ else: print iresult print "======== CHECK DUP:", len( already_select_key_set), tot_itr_times * 5 jig.judge()
def xQuAD_by_IRSys_ebola_without_query_feedback(topics=EBOLA_TOPICS, w=None, suggestor=None, if_use_clean_text=False, boost_params=1e11): logging.info("loading... LMD...") lm = LMDirichlet() if if_use_clean_text: lm.load(LMDirichlet_clean_Json) else: lm.load(LMDirichlet_Json) logging.info("initing xQuAD...") xquad = xQuAD(lm, lmd=0.5, alpha=0.5) logging.info("get all solrs...") solrs = get_all_ebola_solrs() print "solr cnt:", len(solrs) # w = [1] * len(solrs) # w = [3, 1, 1, 1, 1] #提高1.5% irsys = IRSys(solrs, ws=w) tot_itr_times = 5 every_itr_doc_cnt = 5 jig = JigClient(tot_itr_times=tot_itr_times) # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的 already_select_key_set = set() D = [] for tid, topic in topics: logging.info("search for topic %s %s" % (tid, topic)) logging.info("preprocess data...") query_word_list = basic_preprocess(topic) print "query_word_list:", query_word_list docs_list = irsys.retrieve_docs([topic], with_query_field=True) docs_list = preproces_docs_list(docs_list)[0:1000] logging.info("cal dcs...") dcs_dict = cal_dc_dicts(docs_list) key_set = set() #强制再搞一次去重 logging.info("======> STRICT REMOVE DUP") print "before remove dup by key:", len(docs_list) new_docs_list = [] for d in docs_list: key = d[0].strip() if key not in key_set: new_docs_list.append(d) print "after remove dup by key:", len(new_docs_list) logging.info("======> REMOVE DUP END") docs_list = new_docs_list file_ptr = 0 for i in range(tot_itr_times): print "itr:", i, " tid:", tid this_itr_select_docs = [] if i == 0: print docs_list[0] while len(this_itr_select_docs) < 5 and file_ptr < len( docs_list): if docs_list[file_ptr][0] in already_select_key_set: continue this_itr_select_docs.append(docs_list[file_ptr][1][2]) already_select_key_set.add( docs_list[file_ptr][1][2]['key']) file_ptr += 1 jig_format_docs = irsys.items2jigdocs( docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt + every_itr_doc_cnt] jig.run_itr(jig_format_docs, topic_id=tid) # elif i == 1: else: #use xQuAD to select best docs docs_left = docs_list[file_ptr:] R_left = get_R_left(docs_left, already_select_key_set) subquerys = suggestor.get_subquery_by_topic_id( tid, if_related=False)[0:5] subquerys = clean_subquerys_to_query_lists(subquerys) print "===> subqueries:", subquerys ranked_docs = xquad.select_doc_u(query_word_list, R_left, D, subquerys) for d in ranked_docs[0:5]: D.append(d[0]) D[-1][SCORE] = d[1] this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt] jig_format_docs = [] for d in this_itr_select_docs: #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的... jig_format_docs.append((0, d[0][KEY], d[1] * boost_params)) jig.run_itr(jig_format_docs, topic_id=tid) jig.judge()