def nyt_seg_LMD(topic_id, query): tot_itr_times = 1 solr = SolrClient(SOLR_SEG_nyt_LMD_URL) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None"
def rejudege(all_subs, iter_counts, max_iter_count):
    """Replay stored submission lists through the jig and judge each run.

    For every ``iter_count`` in ``iter_counts`` a fresh ``JigClient`` is
    created. Each topic whose submission list holds more than
    ``iter_count * 5`` entries is replayed in batches of five, and the jig is
    judged once per ``iter_count`` if at least one batch was submitted.

    Args:
        all_subs: mapping of topic id -> ordered list of jig documents.
        iter_counts: iterable of iteration counts to replay.
        max_iter_count: largest iteration count the submissions were built
            for; larger values are only reported, not rejected.
    """
    for iter_count in iter_counts:
        jig = JigClient(tot_itr_times=iter_count, topic_id=None,
                        base_jig_dir=EBOLA_NYT_JIG_DIR)
        if iter_count > max_iter_count:
            # The original passed a single argument for two placeholders,
            # which made the logging call itself fail to format.
            logging.error("iter_count: %s is larger than max_iter_count: %s",
                          iter_count, max_iter_count)

        should_judge = False
        for topic_id, sub_list in all_subs.items():
            # Topics without strictly more than iter_count * 5 docs are
            # skipped entirely.
            if len(sub_list) <= iter_count * 5:
                continue
            for _ in range(iter_count):
                should_judge = True
                jig_result = jig.run_itr(sub_list[:5], topic_id=topic_id)
                print_jig_result(jig_result)
                sub_list = sub_list[5:]

        if not should_judge:
            continue
        try:
            jig.judge()
        except Exception:
            # Keep the run going, but record the traceback instead of
            # silently swallowing it (and let KeyboardInterrupt through).
            logging.exception("jig.judge() failed for iter_count=%s",
                              iter_count)
            # Single-argument parenthesised print: same output on Python 2.
            print("shit judge error")
def nyt_irsys_blending(): tot_itr_times = 1 topic_id = 'dd17-1' ws = [3, 1, 1, 1, 1] jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) solrs = get_all_nyt_solrs() irsys = IRSys(solrs, ws=ws) for topic_id, topic_name in NYT_TOPICS: print "topic_id, topic name:", topic_id, topic_name query = [topic_name] query_range = [ # '', # 'content_full_text', 'content', ] for i in range(tot_itr_times): docs = irsys.retrieve_docs(query, query_field='content', with_query_field=False) print "docs 0~3:", docs[0:3] jig_format_docs = irsys.items2jigdocs(docs) iresult = jig.run_itr(jig_format_docs[i * 5:i * 5 + 5], topic_id=topic_id) print "iresult, i:", i if iresult is not None: for _ in iresult: print _ else: logging.error("[ERROR] iresult None ") jig.judge()
def test_6(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 1 topic_id = "DD16-1" jig = JigClient(topic_id, tot_itr_times=tot_itr_times) # topic_query = ["US Military Crisis Response"] # for topic in EBOLA_TOPICS: topic_query = [preprocess_query(topic[1])] topic_id = topic[0] print "topic query:", topic_query print "topic id:", topic_id # interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) #interact_with_jig_to_change_vec(jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times) interact_with_jig_to_change_vec_use_jig_ret( jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times, topic_id=topic_id, query_range=[ "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ]) jig.judge()
def use_full(): topic_query = ["US Military Crisis Response"] # topic_query = ["Crisis Response"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] # topic_query = ["US Military Crisis Response outbreak"] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] # topic_query = ["US Military Crisis Response fight combat"] topic_query = [','.join(topic_query[0].split())] topic_id = "DD16-1" # topic_id, topic_query = ('DD16-26', ' African Culture') #('DD16-24', 'Olu-Ibukun Koye Spread EVD to Port Harcourt') #('DD16-3', 'healthcare impacts of ebola') # topic_query = [','.join(topic_query[0].split())] tot_itr_times = 1 print "topic query:", topic_query solr = SolrClient(FULL_SOLR_URL) jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc_full(topic_query, solr, 1000, query_range=['title', 'content']) interact_with_jig(jig, docs, tot_itr_times) jig.judge()
def test_12(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") topic_query = ["US Military Crisis Response"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 5 jig = JigClient(topic_id, tot_itr_times=tot_itr_times) for tid, topic_query in EBOLA_TOPICS: #interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) interact_with_jig_to_change_vec(jig, solr, [topic_query], idf_dic, topic_id=tid, itr_cnt=tot_itr_times) jig.judge()
def use_psg(): topic_query = ["US Military Crisis Response"] # topic_query = ["US Military Crisis Response fight combat commits against seeks"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] # topic_query = ["US Military Crisis Response outbreak"] # topic_query = [','.join(topic_query[0].split() )] # topic_query = [ ','.join(topic_query[0].split() + [ "US Military", "Military Crisis", "Crisis Response", ] ) ] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] topic_id = "DD16-1" tot_itr_times = 1 print "topic query:", topic_query query_range = [ "content_title", "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ] query_range = ["content_p", "content_h3", "content_h4", "content_h5"] solr = SolrClient(BASE_SOLR_URL) jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc(topic_query, solr, 1000, query_range=query_range) interact_with_jig(jig, docs, tot_itr_times) jig.judge()
def test_full_irsys(w=None, topics=EBOLA_TOPICS): logging.info("get all solrs...") solrs = get_all_ebola_solrs() print "solr cnt:", len(solrs) # w = [1] * len(solrs) # solrs += [SolrClient(solr_url=SOLR_EBOLA_LMD2500)] w = [ 3, 1, 1, 1, 1, ] #提高1.5% irsys = IRSys(solrs, ws=w) tot_itr_times = 1 every_itr_doc_cnt = 5 jig = JigClient(tot_itr_times=tot_itr_times) for tid, topic in topics: logging.info("search for topic %s %s" % (tid, topic)) # docs_list = irsys.retrieve_docs([topic]) # docs_list = irsys.retrieve_docs(topic.split()) print " =====>>> CHECK:", docs_list[0] key_set = set() # 强制再搞一次去重 logging.info("======> STRICT REMOVE DUP") print "before remove dup by key:", len(docs_list) new_docs_list = [] for d in docs_list: key = d[0].strip() if key not in key_set: new_docs_list.append(d) print "after remove dup by key:", len(new_docs_list) docs_list = new_docs_list logging.info("======> REMOVE DUP END") for i in range(tot_itr_times): jig_format_docs = irsys.items2jigdocs( docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt + every_itr_doc_cnt] print "itr:", i, " tid:", tid irslt = jig.run_itr(jig_format_docs, topic_id=tid) print "itr i:", i, " rslt:" if irslt is not None: for _ in irslt: print _ else: print None jig.judge()
def base_nyt_seg_data(): # NYT_TOPICS = [ # ('dd17-2', 'Who Outed Valerie Plame?'), # ] topic_id = 'dd17-1' # topic_name = "Return of Klimt paintings to Maria Altmann"#.split(' ') tot_itr_times = 1 solr_url = SOLR_SEG_nyt_LMD768_URL solr = SolrClient(solr_url) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir for topic_id, topic_name in NYT_TOPICS: # print topic_name print "topic id, topic name:", topic_id, topic_name query = [topic_name] query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) print "BEFORE REMOVE DUP:", len(docs) docs = remove_dup(docs) print "AFTER REMOVE DUP:", len(docs) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None" jig.judge()
def test_10():
    """Run ``interact_with_jig_to_change_vec`` for five iterations per topic.

    Loads the IDF dictionary from ``idf_dic_path``; for each topic in
    ``EBOLA_TOPICS`` the preprocessed topic text is used as the query over
    the paragraph and heading fields. ``jig.judge()`` runs once at the end.
    """
    logging.info("read idf dic... ")
    # Close the file deterministically instead of leaking the handle.
    with codecs.open(idf_dic_path, 'r') as idf_file:
        idf_dic = json.load(idf_file)
    logging.info("load idf dic end...")

    solr = SolrClient(BASE_SOLR_URL)
    tot_itr_times = 5
    jig = JigClient("DD16-1", tot_itr_times=tot_itr_times)

    for topic in EBOLA_TOPICS:
        topic_id = topic[0]
        topic_query = [preprocess_query(topic[1])]
        # No query expansion happens in this variant; the message is kept so
        # the log output stays unchanged.
        logging.info("qe end...")
        interact_with_jig_to_change_vec(
            jig,
            solr,
            topic_query,
            idf_dic,
            itr_cnt=tot_itr_times,
            topic_id=topic_id,
            query_range=[
                "content_p", "content_h1", "content_h2", "content_h3",
                "content_h4", "content_h5"
            ])

    jig.judge()
def use_psg_all_topic(): tot_itr_times = 1 jig = JigClient(topic_id="DD16-1", tot_itr_times=tot_itr_times) solr = SolrClient(BASE_SOLR_URL) for topic_id, topic_query in EBOLA_TOPICS: print "topic query:", topic_query query_range = [ "content_title", "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ] # query_range = ["content_h5", ] query_range = ["content_p", "content_h2"] print "!!!!!+++++++++++> query_range:", query_range docs = retrieval_top_k_doc([topic_query], solr, 1000, query_range=query_range) interact_with_jig_by_topic(jig, docs, tot_itr_times, tid=topic_id) jig.judge()
def test_13(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") topic_query = ["US Military Crisis Response"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 5 jig = JigClient(topic_id, tot_itr_times=tot_itr_times) for tid, topic_query in EBOLA_TOPICS: pass
def rejudege(process_count, iter_counts, max_iter_count, out_dir, dtype="ebola"): import time frames = [] for i in range(process_count): ps = "{}-{}".format(process_count, i) try: frames.extend( json.load(codecs.open(out_dir.format(ps), "r", "utf-8"))) except: pass print "queue len:", len(frames) json.dump(frames, codecs.open(out_dir.format(int(time.time())), "w", "utf-8")) rank = {frame["topic_id"]: frame["sub_list"] for frame in frames} json.dump(rank, codecs.open("frames/{}_rank.json".format(dtype), "w", "utf-8")) for iter_count in iter_counts: jig = JigClient(tot_itr_times=iter_count, topic_id=None, base_jig_dir=EBOLA_NYT_JIG_DIR) if iter_count > max_iter_count: logging.error("iter_count: %s is larger than max_iter_count: %s", max_iter_count) for frame in frames: topic_id = frame["topic_id"] sub_list = frame["sub_list"] ddocs = frame["ddocs"] for i in range(iter_count): jig.run_itr([ddocs[str(did)] for did in sub_list[:5]], topic_id=topic_id) sub_list = sub_list[5:] jig.judge()
st_ptr = 0 for i in range(interact_times): rslt = jig.run_itr(docs[st_ptr:st_ptr+5]) st_ptr += 5 print "itr:", i #, " rslt:", rslt for _ in rslt: print _ if __name__ == '__main__': topic_query = ["US Military Crisis Response"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic fight seeks spread" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] topic_query = ["US Military Crisis Response outbreak"] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) jig = JigClient(topic_id) docs = retrieval_top_k_doc(topic_query, solr, 1000) interact_with_jig(jig, docs, 5) jig.judge() __END__ = True
def _thread_main(process_id, process_count, in_dir, out_dir, dtype, se_name, iter_count, return_count, likehood, test=False): if "nytimes" in dtype: solr = SolrClient(SOLR_SEG_nyt_LMD_URL) else: solr = SolrClient(FULL_SOLR_URL) key2id = du.load_ebola_map("key2id") se_results = du.load_se_results(se_name=se_name, dtype=dtype) # id2key = du.load_ebola_map("id2key") # jig = JigClient(tot_itr_times=iter_count, topic_id=None) jig = JigClient(tot_itr_times=iter_count, topic_id=None, base_jig_dir=EBOLA_NYT_JIG_DIR) if "ebola" in dtype: topics = eb_topics if "nytimes" in dtype: topics = ny_topics if test: topics = [eb_topics[2], eb_topics[3], eb_topics[5], eb_topics[24]] logging.root.setLevel(logging.WARNING) frame_list = [] topics = topics[process_id::process_count] print topics for tid, topic in topics: logging.info("[#] id: %s, topic: %s\n", tid, topic) if "ebola" in dtype: solr_docs = retrieval_top_k_doc_full([topic], solr, 600, query_range=['content'], key2id=key2id) else: solr_docs = solr.query_fields(keywords=[topic], fl="title,key,date", rows=return_count) solr_docs = [(d["key"], d["key"], d["score"]) for d in solr_docs] ddocs = {d[0]: d for d in solr_docs} doc_ids = ddocs.keys() if "ebola" in dtype: jdocs = { did: exu.extract_ebola(in_file=in_dir.format(did)) for did in doc_ids[:return_count] } else: jdocs = {} for did in doc_ids[:return_count]: try: jdocs[did] = json.load( codecs.open(in_dir.format(did), "r", "utf-8")) except: pass sim_docs = get_se_sim(jdocs, se_results[tid], likehood, dtype=dtype) sim_ratings = [(did, (sim_docs[did]), ddocs[did][2]) for did in sim_docs] limit_score = likehood # (likehood + 0.1) * solr_docs[return_count][2] sim_ratings = sorted(sim_ratings, key=itemgetter(1, 2), reverse=True) logging.info("[#] sim_ratings: %s", sim_ratings) frame = { "solr_docs": solr_docs, "return_count": return_count, "sim_docs": sim_docs, "ddocs": ddocs, # "jdocs": jdocs "sim_ratings": sim_ratings, "iter_count": iter_count, "topic_id": tid, "topic": topic } 
sim_ids = [item[0] for item in sim_ratings if item[1] > limit_score] sub_list, sub_set = [], set() solr_p = 0 for i in range(iter_count): sub_ids = set(doc_ids[:solr_p]) doc_ids = doc_ids[solr_p:] while len(sub_ids) < 5: if len(sim_ids) > 0: sub_ids.add(sim_ids[0]) sim_ids = sim_ids[1:] elif len(doc_ids) > 0: sub_ids.add(doc_ids[0]) doc_ids = doc_ids[1:] else: break sub_ids = set([did for did in sub_ids if did not in sub_set]) sub_set.update(sub_ids) sub_list.extend(list(sub_ids)) # result = jig.run_itr( # [ddocs[did] for did in sub_ids], topic_id=tid) # print_jig_result(result) frame["sub_list"] = sub_list frame_list.append(frame) # jig.judge() ps = "{}-{}".format(process_count, process_id) json.dump(frame_list, codecs.open(out_dir.format(ps), "w", "utf-8")) if test: jig.judge() rejudege(process_count, iter_counts=[1, 2, 3, 5], max_iter_count=iter_count, out_dir=out_dir)
def test_lm_weight_field(w=None, topics=EBOLA_TOPICS): logging.info("get all solrs...") solrs = get_all_ebola_solrs() solrs = [SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A)] print "solr cnt:", len(solrs) w = [1] * len(solrs) # w = [3, 1, 1, 1, 1] #提高1.5% # irsys = IRSys(solrs, ws=w) solr = SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A) tot_itr_times = 1 every_itr_doc_cnt = 5 jig = JigClient(tot_itr_times=tot_itr_times) for tid, topic in topics: logging.info("search for topic %s %s" % (tid, topic)) # docs_list = solr.query_fields_by_weight( keywords=[topic], query_fields=['title', 'content', 'a'], ws=[0.3, 0.7, 0.1], fl='key') # docs_list = irsys.retrieve_docs(topic.split()) print " =====>>> CHECK:", docs_list[0] key_set = set() # 强制再搞一次去重 logging.info("======> STRICT REMOVE DUP") print "before remove dup by key:", len(docs_list) new_docs_list = [] for d in docs_list: key = d['key'].strip() if key not in key_set: new_docs_list.append(d) print "after remove dup by key:", len(new_docs_list) docs_list = new_docs_list logging.info("======> REMOVE DUP END") for i in range(tot_itr_times): st = i * every_itr_doc_cnt en = i * every_itr_doc_cnt + every_itr_doc_cnt jig_format_docs = [] for j_ in range(st, en): jig_format_docs.append( (0, docs_list[j_]['key'], docs_list[j_]['score'])) # jig_format_docs = irsys.items2jigdocs(docs_list)[i*every_itr_doc_cnt:i*every_itr_doc_cnt + every_itr_doc_cnt] print "itr:", i, " tid:", tid irslt = jig.run_itr(jig_format_docs, topic_id=tid) print "itr i:", i, " rslt:" if irslt is not None: for _ in irslt: print _ else: print None jig.judge()
rslt = jig.run_itr(docs[st_ptr:st_ptr+5]) st_ptr += 5 print ("itr:", i) #, " rslt:", rslt for _ in rslt: print (_) if __name__ == '__main__': topic_query = ["US,Military,Crisis,Response"] topic_query = [','.join(topic_query[0].split() )] topic_id = "DD16-1" tot_itr_times = 1 #solr = SolrClient("http://172.22.0.11:8983/solr/ebola_extract/select?") solr = SolrClient("http://10.61.2.168:8989/solr/ebola_paragraph/select?") jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc(topic_query, solr, 1000) interact_with_jig(jig, docs, tot_itr_times) jig.judge() dict = jig.get_result_dict() #fl = codecs.open("dict.txt", "w", "utf-8") #fl.write(json.dumps(dict)) print(dict) __END__ = True
def test_8_tf_idf(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 1 topic_id = "DD16-1" jig = JigClient(topic_id, tot_itr_times=tot_itr_times) # topic_query = ["US Military Crisis Response"] # # vecutils = VecUtils() pseudo_cnt = 5 pseudo_word_cnt = 0 for topic in EBOLA_TOPICS: topic_query = [preprocess_query(topic[1])] init_query_words = topic_query[0].split() topic_id = topic[0] print "pseudo_cnt, pseudo_word_cnt:", pseudo_cnt, pseudo_word_cnt print "topic query:", topic_query print "topic id:", topic_id logging.info("qe...") # qe = QE_w2v(topic_query[0], vecutils) Query_Range = ["content_p"] pre_docs = retrieval_top_k_doc_with_content(topic_query, solr, k=pseudo_cnt, query_range=Query_Range) pseudo_top_doc = [] for d in pre_docs: pseudo_top_doc.append(d[3]) pseudo_top_doc = ' '.join(pseudo_top_doc) pseudo_top_doc = pseudo_top_doc.split() pseudo_top_words = expand_by_tfidf_candidate_words( idf_dic, init_words=init_query_words, cwords=pseudo_top_doc, ret_cnt=pseudo_word_cnt) pseudoquery = word2query_by_sim(pseudo_top_words) pseudoquery = topic_query[0] + ' ' + pseudoquery print "init query before pseudoquery:", topic_query[0] print "form pseudoquery:", pseudoquery logging.info("qe end...") # interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) # interact_with_jig_to_change_vec(jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times) interact_with_jig_to_change_vec_use_jig_ret_use_pseudo_query( jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times, topic_id=topic_id, query_range=Query_Range, use_pseudo=True, pseudo_query=pseudoquery) jig.judge()