Example #1
def TREC_preprocess(collection, index, hits):
    # Rewrite hits in place: keep the internal doc ID in hit['_'] and put the
    # TREC/ARQMath ID (the post ID for Task 1, or the formula ID plus post ID
    # for Task 2) into hit['docid'].
    if collection in ['test', 'arqmath-2020-task1', 'arqmath-2021-task1', 'arqmath-2021-task1-refined']:
        for hit in hits:
            doc = pya0.index_lookup_doc(index, hit['docid'])
            hit['_'] = hit['docid']
            hit['docid'] = int(doc['url'])

    elif collection in ['arqmath-2020-task2', 'arqmath-2021-task2', 'arqmath-2021-task2-refined']:
        for hit in hits:
            doc = pya0.index_lookup_doc(index, hit['docid'])
            formulaID, postID, threadID, type_, visualID = doc['url'].split(',')
            hit['_'] = formulaID
            hit['docid'] = int(postID)
    else:
        raise NotImplementedError
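
A hedged usage sketch (not from the source): run a search, then convert the hits so they carry ARQMath post IDs before writing a TREC run file. It assumes index is an opened pya0 index, query is a pya0 keyword list, and that pya0.search can also be called without the docid argument used in Example #3.

import json

results = json.loads(pya0.search(index, query, verbose=False, topk=50, log=None))
hits = results['hits'] if results['ret_code'] == 0 else []
TREC_preprocess('arqmath-2021-task1', index, hits)
# each hit now has its ARQMath post ID in hit['docid'] and the internal ID in hit['_']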
Example #2
def TREC_reverse(collection, index, hits):
    # Inverse of TREC_preprocess: map the TREC/ARQMath IDs carried by the hits
    # back to internal doc IDs so the documents can be looked up in the index.
    if collection in ['test', 'arqmath-2020-task1', 'arqmath-2021-task1', 'arqmath-2021-task1-refined']:
        for hit in hits:
            trec_docid = hit['docid']
            hit['trec_docid'] = trec_docid
            doc = pya0.index_lookup_doc(index, trec_docid)
            hit['docid'] = int(doc['extern_id']) # get internal doc ID
    elif collection in ['arqmath-2020-task2', 'arqmath-2021-task2', 'arqmath-2021-task2-refined']:
        for hit in hits:
            trec_docid = int(hit['_'])
            hit['trec_docid'] = trec_docid
            hit['_'] = str(hit['docid']) # docid is actually post ID here
            doc = pya0.index_lookup_doc(index, trec_docid)
            hit['docid'] = int(doc['extern_id']) # get internal doc ID
    else:
        raise NotImplementedError
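
A hedged round-trip sketch (not from the source): hits that already carry TREC/ARQMath IDs, for example read back from a run file, are mapped to internal IDs so the documents can be looked up again. The concrete post ID below is a made-up placeholder.

hits = [{'docid': 3062861, 'score': 12.3}]   # placeholder ARQMath post ID
TREC_reverse('arqmath-2021-task1', index, hits)
doc = pya0.index_lookup_doc(index, hits[0]['docid'])  # internal ID after the reverse mapping
print(hits[0]['trec_docid'], doc['url'])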
Example #3
def _featslookup__arqmath_2020_task1(topic_query, index, docid):
    qid, query, qtags = topic_query
    # qnum
    qnum = int(qid.split('.')[1])
    # doc
    doc = pya0.index_lookup_doc(index, docid)
    # doc score
    result_JSON = pya0.search(index, query, verbose=False, topk=1, log=None, docid=docid)
    results = json.loads(result_JSON)
    doc_s = results['hits'][0] if results['ret_code'] == 0 and len(results['hits']) > 0 else {'score': 0}
    score = doc_s['score'] if doc_s.get('docid') == docid else 0  # avoid a KeyError when there is no hit
    # tags
    dtags = doc['tags']
    qtags = tokenize_text(qtags, no_punctuation=True, rm_stopwords=False)
    dtags = tokenize_text(dtags, no_punctuation=True, rm_stopwords=False)
    n_tag_match = len(set(dtags) & set(qtags))
    # upvotes
    upvotes = int(doc['title'].split(':')[1])
    return [qnum, upvotes, n_tag_match, score]
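
A hedged sketch (not from the source) of building one feature row for a (topic, document) pair, for instance when assembling a learning-to-rank data set. The topic ID 'A.31' and the tag string are illustrative placeholders.

topic_query = ('A.31', query, 'calculus,limits')
features = _featslookup__arqmath_2020_task1(topic_query, index, docid)
# features == [qnum, upvotes, n_tag_match, score]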
Example #4
def visualize_hits(index, run_name, qid, query, hits, qrels=None, scores=None):
    # lookup document content
    for hit in hits:
        docid = hit['docid']
        doc = pya0.index_lookup_doc(index, docid)
        hit['content'] = doc['content']
    # output HTML preview
    if qrels:
        output_html_topic_run(run_name,
                              qid,
                              query,
                              hits,
                              qrels=qrels,
                              judged_only=True,
                              scores=scores)
    # always write the full preview with every hit as well
    output_html_topic_run(run_name,
                          qid,
                          query,
                          hits,
                          qrels=qrels,
                          judged_only=False,
                          scores=scores)
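
A hedged usage sketch (not from the source): preview one topic of a run as HTML. When qrels are passed, a judged-only preview is written in addition to the full preview; the structure of qrels and scores is whatever output_html_topic_run expects, and the run name and topic ID below are placeholders.

visualize_hits(index, 'my-run', 'A.1', query, hits, qrels=qrels, scores=scores)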
Example #5
            from .eval import TREC_output
            collection_driver.TREC_preprocess(collection, index, hits)
            TREC_output(hits, 'TEST.0', append=False, output_file=trec_output)

        # output HTML file
        if args.visualize_run:
            from .visualize import visualize
            abort_on_network_index(index)
            visualize(index,
                      args.visualize_run,
                      adhoc_query=origin_query,
                      collection=collection)

    elif args.docid:
        abort_on_network_index(index)
        doc = pya0.index_lookup_doc(index, args.docid)
        print(json.dumps(doc, indent=4))

    elif args.collection:
        abort_on_invalid_collection(args.collection)

        if args.trec_output is None:
            print('Error: Must specify a TREC output file to run topics')
            exit(1)

        run_topics(
            index,
            args.collection,
            output=trec_output,
            topk=topk,
            verbose=verbose,
Example #6
        writer,
        "[imath]x^2 + y^2 = z^2[/imath] is called pythagreon therom",
        extern_id="401")
    pya0.writer_flush(writer)

    pya0.writer_add_doc(writer,
                        content="prove inequality by induction: " +
                        "[imath] (a + b)^n \geq a^n + b^n [/imath]",
                        url="https://math.stackexchange.com/questions/2528544",
                        extern_id="402")
    pya0.writer_flush(writer)

    if pya0.writer_maintain(writer, force=True):
        print('index merged')

    pya0.writer_close(writer)

else:
    print('Testing read-only mode, seg_dict loading and index caching')
    HOME = os.getenv("HOME")
    ix = pya0.index_open(index_path,
                         option="r",
                         segment_dict=f"{HOME}/cppjieba/dict")

    pya0.index_memcache(ix, term_cache=3, math_cache=5)  # in MB
    print(pya0.index_lookup_doc(ix, 2))  # test doc raw content lookup
    print(pya0.index_lookup_doc(ix, 402))  # test inverted ID lookup
    pya0.index_print_summary(ix)

pya0.index_close(ix)
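
A minimal read-only sketch using only the calls shown above; the index path and cache sizes are placeholders, and the segment dictionary path mirrors the one in the example.

import os
import pya0

ix = pya0.index_open('./tmp-index', option="r",
                     segment_dict=os.path.expanduser('~/cppjieba/dict'))
pya0.index_memcache(ix, term_cache=1, math_cache=1)  # cache sizes in MB
print(pya0.index_lookup_doc(ix, 1))                  # look up a document by ID
pya0.index_print_summary(ix)
pya0.index_close(ix)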
Example #7
def rm3_expand_query(index, query, hits, feedbackTerms=20, feedbackDocs=10):
    # RM3 pseudo-relevance feedback: estimate a relevance model from the top
    # feedbackDocs hits and interpolate it with the original query terms.
    # create hit vectors
    q_lst = tokenize_query(query)
    vocab = set()
    d_vectors = []
    d_scores = []
    for hit in hits[:feedbackDocs]:
        docID = hit['docid']
        doc = pya0.index_lookup_doc(index, docID)
        d_lst = tokenize_content(doc['content'], whitelist=q_lst)
        d_vec = list2vec(d_lst)
        d_vectors.append(d_vec)
        d_scores.append(hit['score'])
        vocab.update(d_vec.keys())

    # generate relevance_model (RM)
    relevance_model = dict.fromkeys(vocab, 0)  # P(w|R) \approx P(w|q1...qn) / Z
    for word in vocab:
        word_weight = 0  # P(w,q1...qn) = sum_D P(D) * P(w|D) * QueryLikelihood
        for i, d_vec in enumerate(d_vectors):
            freq = d_vec.get(word, 0)
            score = d_scores[i]
            norm = sum(d_vec.values())
            word_weight += (freq / norm) * score
        relevance_model[word] = word_weight

    # P(w|R) \approx P(w,q1...qn) / sum_w P(w,q1...qn)
    normalize_vec(relevance_model)

    # query RM normalization (L1)
    q_vec = list2vec(q_lst)
    normalize_vec(q_vec)

    # interpolate document RM with query RM
    relevance_model = interpolate(q_vec, relevance_model)

    # sort and select the top feedback terms as new query keywords
    new_query = sorted(relevance_model.items(),
                       key=lambda item: item[1],
                       reverse=True)
    new_query = new_query[:max(feedbackTerms, len(query))]
    new_query = dict(new_query)
    normalize_vec(new_query)

    # convert to required query format
    query = []
    for q in new_query:
        if q.find('[imath]') >= 0:
            splits = re.split(r'\[imath\]|\[/imath\]', q)
            query.append({
                'type': 'tex',
                'str': splits[1],
                'field': 'content',
                'boost': new_query[q]
            })
        else:
            query.append({
                'type': 'term',
                'str': q,
                'field': 'content',
                'boost': new_query[q]
            })
    return query
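
A hedged end-to-end sketch (not from the source): run an initial search, expand the query with RM3 from the top hits, and search again with the expanded keyword list. It assumes query is already a pya0 keyword list and that pya0.search accepts the expanded list in the same format as the 'type'/'str'/'field'/'boost' entries built above.

import json

results = json.loads(pya0.search(index, query, verbose=False, topk=50, log=None))
if results['ret_code'] == 0 and results['hits']:
    expanded = rm3_expand_query(index, query, results['hits'])
    results = json.loads(pya0.search(index, expanded, verbose=False, topk=50, log=None))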