def TREC_preprocess(collection, index, hits):
    """Rewrite hits in place from internal index docids to TREC docids.

    For task-1 collections, ``hit['docid']`` becomes the integer stored in the
    doc's ``url`` field and the original internal docid is preserved under
    ``hit['_']``.  For task-2 collections, the ``url`` field is a comma-joined
    tuple and ``hit['_']`` receives the formula ID while ``hit['docid']``
    becomes the post ID.  Raises NotImplementedError for unknown collections.
    """
    task1_collections = ('test', 'arqmath-2020-task1', 'arqmath-2021-task1', 'arqmath-2021-task1-refined')
    task2_collections = ('arqmath-2020-task2', 'arqmath-2021-task2', 'arqmath-2021-task2-refined')
    if collection in task1_collections:
        for hit in hits:
            looked_up = pya0.index_lookup_doc(index, hit['docid'])
            hit['_'] = hit['docid']
            hit['docid'] = int(looked_up['url'])
    elif collection in task2_collections:
        for hit in hits:
            looked_up = pya0.index_lookup_doc(index, hit['docid'])
            # url encodes: formulaID, postID, threadID, type, visualID
            formula_id, post_id, _thread_id, _type, _visual_id = looked_up['url'].split(',')
            hit['_'] = formula_id
            hit['docid'] = int(post_id)
    else:
        raise NotImplementedError
def TREC_reverse(collection, index, hits):
    """Rewrite hits in place from TREC docids back to internal index docids.

    The TREC id is preserved under ``hit['trec_docid']`` and ``hit['docid']``
    is replaced with the internal id found via the index ``extern_id`` field.
    For task-2 collections the TREC id lives in ``hit['_']`` (formula ID) and
    ``hit['_']`` is re-assigned the stringified post ID.  Raises
    NotImplementedError for unknown collections.
    """
    task1_collections = ('test', 'arqmath-2020-task1', 'arqmath-2021-task1', 'arqmath-2021-task1-refined')
    task2_collections = ('arqmath-2020-task2', 'arqmath-2021-task2', 'arqmath-2021-task2-refined')
    if collection in task1_collections:
        for hit in hits:
            trec_docid = hit['docid']
            hit['trec_docid'] = trec_docid
            looked_up = pya0.index_lookup_doc(index, trec_docid)
            # map back to the internal doc ID
            hit['docid'] = int(looked_up['extern_id'])
    elif collection in task2_collections:
        for hit in hits:
            trec_docid = int(hit['_'])
            hit['trec_docid'] = trec_docid
            # docid currently holds the post ID; stash it under '_'
            hit['_'] = str(hit['docid'])
            looked_up = pya0.index_lookup_doc(index, trec_docid)
            # map back to the internal doc ID
            hit['docid'] = int(looked_up['extern_id'])
    else:
        raise NotImplementedError
def _featslookup__arqmath_2020_task1(topic_query, index, docid):
    """Compute ranking features for one (topic, document) pair.

    Args:
        topic_query: tuple of (qid, query, qtags) where qid looks like 'A.42'.
        index: a pya0 index handle.
        docid: internal document ID to score against the query.

    Returns:
        [qnum, upvotes, n_tag_match, score] feature list.
    """
    qid, query, qtags = topic_query
    # topic number: 'A.42' -> 42
    qnum = int(qid.split('.')[1])
    # document record (tags, title, ...)
    doc = pya0.index_lookup_doc(index, docid)
    # retrieval score of this specific document for the query
    result_JSON = pya0.search(index, query, verbose=False, topk=1, log=None, docid=docid)
    results = json.loads(result_JSON)
    if results['ret_code'] == 0 and len(results['hits']) > 0:
        doc_s = results['hits'][0]
    else:
        doc_s = {'score': 0}
    # BUGFIX: the fallback dict has no 'docid' key, so doc_s['docid'] raised
    # KeyError whenever the search returned no hit; .get() yields score 0 instead.
    score = doc_s['score'] if doc_s.get('docid') == docid else 0
    # tag overlap between query tags and document tags
    dtags = doc['tags']
    qtags = tokenize_text(qtags, no_punctuation=True, rm_stopwords=False)
    dtags = tokenize_text(dtags, no_punctuation=True, rm_stopwords=False)
    n_tag_match = len(set(dtags) & set(qtags))
    # upvotes are encoded after a colon in the title field
    upvotes = int(doc['title'].split(':')[1])
    return [qnum, upvotes, n_tag_match, score]
def visualize_hits(index, run_name, qid, query, hits, qrels=None, scores=None):
    """Render HTML previews for a list of hits.

    Looks up each hit's raw content from the index (stored into
    ``hit['content']``), then writes a judged-only preview when qrels are
    available, and always writes the full (unjudged) preview.

    Args:
        index: a pya0 index handle.
        run_name: run identifier used for the output file naming.
        qid: topic/query ID.
        query: the query (passed through to the HTML renderer).
        hits: list of hit dicts with at least a 'docid' key; mutated in place.
        qrels: optional relevance judgements; enables the judged-only preview.
        scores: optional per-hit scores passed through to the renderer.
    """
    # attach document content to each hit for rendering
    for hit in hits:
        docid = hit['docid']
        doc = pya0.index_lookup_doc(index, docid)
        hit['content'] = doc['content']
    # judged-only preview requires qrels
    if qrels:
        output_html_topic_run(run_name, qid, query, hits, qrels=qrels, judged_only=True, scores=scores)
    # full preview is always produced (was wrapped in a dead `if True:`)
    output_html_topic_run(run_name, qid, query, hits, qrels=qrels, judged_only=False, scores=scores)
from .eval import TREC_output collection_driver.TREC_preprocess(collection, index, hits) TREC_output(hits, 'TEST.0', append=False, output_file=trec_output) # output HTML file if args.visualize_run: from .visualize import visualize abort_on_network_index(index) visualize(index, args.visualize_run, adhoc_query=origin_query, collection=collection) elif args.docid: abort_on_network_index(index) doc = pya0.index_lookup_doc(index, args.docid) print(json.dumps(doc, indent=4)) elif args.collection: abort_on_invalid_collection(args.collection) if args.trec_output is None: print('Error: Must specify a TREC output file to run topics') exit(1) run_topics( index, args.collection, output=trec_output, topk=topk, verbose=verbose,
writer, "[imath]x^2 + y^2 = z^2[/imath] is called pythagreon therom", extern_id="401") pya0.writer_flush(writer) pya0.writer_add_doc(writer, content="prove inequality by induction: " + "[imath] (a + b)^n \geq a^n + b^n [/imath]", url="https://math.stackexchange.com/questions/2528544", extern_id="402") pya0.writer_flush(writer) if pya0.writer_maintain(writer, force=True): print('index merged') pya0.writer_close(writer) else: print('Testing read-only mode, seg_dict loading and index caching') HOME = os.getenv("HOME") ix = pya0.index_open(index_path, option="r", segment_dict=f"{HOME}/cppjieba/dict") pya0.index_memcache(ix, term_cache=3, math_cache=5) # in MB print(pya0.index_lookup_doc(ix, 2)) # test doc raw content lookup print(pya0.index_lookup_doc(ix, 402)) # test inverted ID lookup pya0.index_print_summary(ix) pya0.index_close(ix)
def rm3_expand_query(index, query, hits, feedbackTerms=20, feedbackDocs=10):
    """Expand a query with RM3 pseudo-relevance feedback.

    Builds a relevance model from the top ``feedbackDocs`` hits, interpolates
    it with the original query distribution, and returns the top expansion
    terms in the structured query format (``tex`` entries for [imath] spans,
    ``term`` entries otherwise), each weighted by its normalized boost.

    Args:
        index: a pya0 index handle.
        query: the raw query string.
        hits: ranked hit dicts with 'docid' and 'score' keys.
        feedbackTerms: number of expansion terms to keep.
        feedbackDocs: number of top hits to build the relevance model from.

    Returns:
        A list of {'type', 'str', 'field', 'boost'} query-keyword dicts.
    """
    # build a term vector per feedback document
    q_lst = tokenize_query(query)
    vocab = set()
    d_vectors = []
    d_scores = []
    for hit in hits[:feedbackDocs]:
        doc = pya0.index_lookup_doc(index, hit['docid'])
        d_lst = tokenize_content(doc['content'], whitelist=q_lst)
        d_vec = list2vec(d_lst)
        d_vectors.append(d_vec)
        d_scores.append(hit['score'])
        vocab.update(d_vec.keys())
    # hoisted: each document's norm is loop-invariant over the vocabulary
    # (previously recomputed for every word, O(|vocab| * |docs|) sums)
    d_norms = [sum(d_vec.values()) for d_vec in d_vectors]
    # relevance model (RM): P(w|R) \prox P(w,q1...qn) = sum_D P(D) * P(w|D) * QueryLikelihood
    relevance_model = dict.fromkeys(vocab, 0)
    for word in vocab:
        word_weight = 0
        for d_vec, norm, score in zip(d_vectors, d_norms, d_scores):
            if norm == 0:
                continue  # robustness: skip empty doc vectors (avoids 0/0)
            freq = d_vec.get(word, 0)
            word_weight += (freq / norm) * score
        relevance_model[word] = word_weight
    # P(w|R) \prox P(w,q1...qn) / sum_w P(w,q1...qn)
    normalize_vec(relevance_model)
    # query RM normalization (L1)
    q_vec = list2vec(q_lst)
    normalize_vec(q_vec)
    # interpolate document RM with query RM
    relevance_model = interpolate(q_vec, relevance_model)
    # sort and select the top feedback terms as new query keywords
    # NOTE(review): len(query) is the *character* count of the raw query
    # string; the token count len(q_lst) may have been intended — confirm.
    top_terms = sorted(relevance_model.items(), key=lambda item: item[1], reverse=True)
    new_query = dict(top_terms[:max(feedbackTerms, len(query))])
    normalize_vec(new_query)
    # convert to required query format
    expanded = []
    for q, boost in new_query.items():
        if q.find('[imath]') >= 0:
            # raw string fixes the invalid-escape DeprecationWarning
            splits = re.split(r'\[imath\]|\[/imath\]', q)
            expanded.append({'type': 'tex', 'str': splits[1], 'field': 'content', 'boost': boost})
        else:
            expanded.append({'type': 'term', 'str': q, 'field': 'content', 'boost': boost})
    return expanded