def count(self, query, index, document_type):
    """Issue a ``_count`` request for *query* and return the matching-document count.

    The query object is finalised via ``build_query()`` before its JSON
    serialisation is sent to ``/<index>/<document_type>/_count/``.
    """
    query.build_query()
    path = '/%s/%s/_count/' % (index, document_type)
    # The transport returns a (response, body) pair; only the body is needed.
    _, body = self.__request(path, 'GET', query.get_object_json())
    return json.loads(body)['count']
def scrape_contribs(party, start_year, end_year=None, contribs_dir=None, get_address=True, federal=True, riding=True, q_reports=False, summary=False):
    """Scrape party/riding contribution records for a range of years.

    party         -- party identifier passed through to build_query()
    start_year    -- first year to scrape (inclusive)
    end_year      -- last year to scrape (inclusive); defaults to start_year
    contribs_dir  -- directory for per-year CSV output, or None to skip CSVs
    get_address   -- forward contributor addresses to search_contribs()
    federal       -- include federal-party contributions
    riding        -- include local riding-association contributions
    q_reports     -- forwarded to build_query()/search_contribs()
    summary       -- write into a '/summaries' subdirectory and forward the flag

    Returns the combined list of contribution records.
    """
    session = requests.Session()
    contribs = []
    # BUG FIX: the documented default end_year=None previously crashed on
    # range(start_year, None + 1); treat it as "just the start year".
    if end_year is None:
        end_year = start_year
    # BUG FIX: the directory handling ran even when contribs_dir was None
    # (the default), raising TypeError on += and os.path.exists(None),
    # although the loop below already guards on contribs_dir is not None.
    if contribs_dir is not None:
        if summary:
            contribs_dir += '/summaries'
        if not os.path.exists(contribs_dir):
            os.makedirs(contribs_dir)
    for year in range(start_year, end_year + 1):
        csvpath = (os.path.join(contribs_dir, '{}.{}.csv'.format(party, year))
                   if contribs_dir is not None else None)
        # run each search if they are explicitly enabled, or both if neither are
        if federal or not riding:
            # print(...) with a single argument behaves identically in Py2/Py3
            print('Getting federal party contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, True, year, q_reports)
            contribs.extend(search_contribs(session, queryid, True, year,
                                            get_address, csvpath, q_reports, summary))
        if riding or not federal:
            print('Getting local riding association contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, False, year, q_reports)
            contribs.extend(search_contribs(session, queryid, False, year,
                                            get_address, csvpath, q_reports, summary))
    return contribs
def scrape_contribs(party, start_year, end_year=None, contribs_dir=None, get_address=True, federal=True, riding=True):
    """Scrape party/riding contribution records for a range of years.

    party         -- party identifier passed through to build_query()
    start_year    -- first year to scrape (inclusive)
    end_year      -- last year to scrape (inclusive); defaults to start_year
    contribs_dir  -- directory for per-year CSV output, or None to skip CSVs
    get_address   -- forward contributor addresses to search_contribs()
    federal       -- include federal-party contributions
    riding        -- include local riding-association contributions

    Returns the combined list of contribution records.
    """
    session = requests.Session()
    contribs = []
    # BUG FIX: the documented default end_year=None previously crashed on
    # range(start_year, None + 1); treat it as "just the start year".
    if end_year is None:
        end_year = start_year
    for year in range(start_year, end_year + 1):
        csvpath = (os.path.join(contribs_dir, '{}.{}.csv'.format(party, year))
                   if contribs_dir is not None else None)
        # run each search if they are explicitly enabled, or both if neither are
        if federal or not riding:
            # print(...) with a single argument behaves identically in Py2/Py3
            print('Getting federal party contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, True, year)
            contribs.extend(search_contribs(session, queryid, True, year,
                                            get_address, csvpath))
        if riding or not federal:
            print('Getting local riding association contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, False, year)
            contribs.extend(search_contribs(session, queryid, False, year,
                                            get_address, csvpath))
    return contribs
# Output / evaluation paths for the retrieval pipeline.
res_doc_path = './res'
trec_eval_path = './trec_eval/trec_eval'
res_path = './eval/res.txt'
qrels_path = './eval/qrels.txt'
eval_path = './eval/eval.txt'
# Number of related (expansion) terms returned per query.
k1 = 5
# Number of documents returned per query.
k2 = 15
if __name__ == '__main__':
    start = time.time()
    print('开始执行')
    # Build the queries (with query expansion) from the query file.
    # NOTE(review): query_path, w2v_path, vocab_path etc. are presumably
    # defined earlier in this file — not visible in this chunk.
    print('根据文件' + query_path + '构建查询并作查询扩展')
    query_list = build_query(query_path, w2v_path, vocab_path, k1)
    print('构建查询完毕')
    # Build the BM25 ranking model.
    print("构建BM25模型")
    bm = BM25()
    print('构建BM25模型完毕')
    # Load the inverted index into the model.
    print('从' + invert_table_path + '处导入倒排表')
    bm.build(invert_table_path)
    print('导入完毕')
    # Run the queries and store the result documents.
    print("开始查询")
    res = start_query(bm, query_list, k2)
    print('存储查询结果到目录' + res_doc_path)
    get_doc_cont(res, res_doc_path, doc_path)
    # Compute P@10 (evaluation continues beyond this chunk).
def query(self, query, index, document_type):
    """Execute *query* against the ``_search`` endpoint and wrap the reply.

    The query object is finalised via ``build_query()`` first; the raw
    response body is returned wrapped in a DataCollection.
    """
    query.build_query()
    endpoint = '/%s/%s/_search/' % (index, document_type)
    # Only the body of the (response, content) pair is consumed.
    _, payload = self.__request(endpoint, 'GET', query.get_query())
    return DataCollection(payload)
def count(self, query, index, document_type):
    """Return the number of documents matching *query* in index/document_type."""
    # Finalise the query object before serialising it for the request body.
    query.build_query()
    target = '/%s/%s/_count/' % (index, document_type)
    response = self.__request(target, 'GET', query.get_object_json())
    # response is a (resp, content) pair; parse the JSON body.
    parsed = json.loads(response[1])
    return parsed['count']
def query(self, query, index, document_type):
    """Run a ``_search`` request for *query* and return a DataCollection."""
    query.build_query()
    search_path = '/%s/%s/_search/' % (index, document_type)
    # The transport yields (response, content); DataCollection wraps content.
    reply = self.__request(search_path, 'GET', query.get_query())
    return DataCollection(reply[1])