Example #1
def extract_feat_on_qid(qid):
    print "qid: ", qid
    q_meta = data_utils.load_quaser_qmeta_by_id(qid)
    #q_lctx=data_utils.load_quaser_lctx_by_id(qid)

    q_question = q_meta['question']
    q_question_tokens = set(word_tokenize(q_question))
    q_question_tokens = [w for w in q_question_tokens if not w in stop_words]

    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    cand2question_dict = {}
    #cand2context_dict={}

    for cand in candidate_list:
        #print cand
        #cid2bin_dict={}

        cand_vec = word_embed_mat[vocab.word2id[cand]]
        cand_question_vec = calc_bin_vec(cand_vec, q_question_tokens)
        cand2question_dict[cand] = cand_question_vec
        '''
        for ctx_id,ctx_str in q_lctx.iteritems():
            ctx_str=ctx_str['question']
            ctx_str_tokens = set(word_tokenize(ctx_str))
            ctx_str_tokens = [w for w in ctx_str_tokens if not w in stop_words]

            cur_bin=calc_bin_vec(cand_vec,ctx_str_tokens)
            cid2bin_dict[ctx_id]=cur_bin

        cand2context_dict[cand]=cid2bin_dict
        '''

    gen_utils.write_dict_to_pkl(cand2question_dict, feat_fn)

    return
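calc_bin_vec, word_embed_mat and vocab are defined outside this snippet. A minimal sketch of what calc_bin_vec plausibly computes, assuming it bins cosine similarities between the candidate embedding and the embeddings of the question tokens (num_bins and the in-vocabulary check are assumptions, not part of the original code):

import numpy as np

def calc_bin_vec(cand_vec, tokens, num_bins=10):
    # hypothetical sketch: histogram of cosine similarities between the
    # candidate vector and every in-vocabulary token vector, binned over [-1, 1]
    bin_vec = np.zeros(num_bins)
    for w in tokens:
        if w not in vocab.word2id:
            continue
        tok_vec = word_embed_mat[vocab.word2id[w]]
        denom = np.linalg.norm(cand_vec) * np.linalg.norm(tok_vec)
        if denom == 0:
            continue
        sim = np.dot(cand_vec, tok_vec) / denom
        # map sim in [-1, 1] to a bin index in [0, num_bins - 1]
        idx = min(int((sim + 1.0) / 2.0 * num_bins), num_bins - 1)
        bin_vec[idx] += 1
    return bin_vec
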
Example #2
def extract_feat_on_qid(qid):
    print qid
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")

    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    all_q_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        tokens = word_tokenize(q_context_text)
        all_q_tokens += tokens

    token_dict, doclen = get_token_dict(all_q_tokens)

    word_indri_score = {}

    for cand_word in candidate_list:
        p_mle_tc = ctf_dict[cand_word] / float(C)
        if not cand_word in token_dict:
            tf = 0.5
        else:
            tf = token_dict[cand_word]
        p_score = (1 - lam) * float(tf + mu * p_mle_tc) / float(
            doclen + mu) + lam * p_mle_tc
        word_indri_score[cand_word] = p_score

    gen_utils.write_dict_to_pkl(word_indri_score, dst_feat_fn)
    return
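get_token_dict is not shown here; a minimal sketch, assuming it simply returns a term-frequency dictionary together with the length of the concatenated contexts that act as the document being scored:

def get_token_dict(tokens):
    # assumed helper: term frequency for every token, plus the document length
    token_dict = {}
    for t in tokens:
        token_dict[t] = token_dict.get(t, 0) + 1
    return token_dict, len(tokens)

The p_score line itself is Indri-style two-stage smoothing: a Dirichlet-smoothed document model (prior mu) interpolated with the collection model p_mle_tc = ctf/C using weight lam; ctf_dict and C are built in Example #14.
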
Example #3
def extract_feat_on_qid(qid):
    print qid
    q_context_dict=data_utils.load_quaser_lctx_by_id(qid)

    #   load the query sentences, so that we can exclude it accordingly
    q_query=data_utils.load_quaser_qmeta_by_id(qid)
    q_question=q_query['question']
    q_tokens=set(word_tokenize(q_question))

    dst_feat_fn=os.path.join(feat_root_path,str(qid)+".pkl")

    all_context_tokens=[]

    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        #   tokenize the context text with nltk
        tokens = word_tokenize(q_context_text)

        #   remove the stop words
        tokens = [w for w in tokens if w not in stop_words]

        #   remove tokens that already appear in the original question
        tokens = [w for w in tokens if w not in q_tokens]

        all_context_tokens += tokens

    all_context_tokens=set(all_context_tokens)
    token_avg_dist_dict={}
    for cand_token in candidate_list:
        #   compare the single candidate token against all of the context tokens
        dist=calc_avg_dist(set([cand_token]),all_context_tokens)
        token_avg_dist_dict[cand_token]=dist
    gen_utils.write_dict_to_pkl(token_avg_dist_dict,dst_feat_fn)

    return
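calc_avg_dist is defined elsewhere; a minimal sketch under the assumption that it averages an embedding distance over all token pairs drawn from the two sets (word_embed_mat, vocab and the Euclidean distance are assumptions here):

import numpy as np

def calc_avg_dist(tokens_a, tokens_b):
    # assumed helper: average pairwise embedding distance between two token sets
    dists = []
    for a in tokens_a:
        if a not in vocab.word2id:
            continue
        vec_a = word_embed_mat[vocab.word2id[a]]
        for b in tokens_b:
            if b not in vocab.word2id:
                continue
            vec_b = word_embed_mat[vocab.word2id[b]]
            dists.append(np.linalg.norm(vec_a - vec_b))
    return np.mean(dists) if dists else 0.0
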
Example #4
def extract_feat_on_qid(qid):
    print qid
    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")

    all_context_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        #   tokenize the context text with nltk
        tokens = word_tokenize(q_context_text)

        #   remove the stop words
        tokens = [w for w in tokens if w not in stop_words]
        #   keep only words that appear in the candidate list
        tokens = [w for w in tokens if w in candidate_list]

        all_context_tokens += tokens

    if len(all_context_tokens) != 0:
        #   obtain the one with maximum frequency
        most_common_token = most_common_in_list(all_context_tokens)
    else:
        most_common_token = None
    dst_meta = {
        'cand_token_list': all_context_tokens,
        'most_common': most_common_token
    }

    gen_utils.write_dict_to_pkl(dst_meta, dst_feat_fn)

    return
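most_common_in_list is assumed to return the most frequent token in the list; an equivalent helper using collections.Counter:

from collections import Counter

def most_common_in_list(tokens):
    # token that occurs most often in the list (ties broken arbitrarily)
    return Counter(tokens).most_common(1)[0][0]
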
Example #5
def tag_pos(qid):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)

    pos_list = []
    for q in qpos:
        tok, pos = q
        pos_list.append(pos)

    pos_fn=os.path.join(feat_root_path,str(qid)+".pkl")
    gen_utils.write_dict_to_pkl(pos_list,pos_fn)

    return
Example #6
def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path,
                     str(qid) + "_tokens.pkl"))  #check actual address
    conVectorsSt = []
    conVectorsDo = []

    targVectorsSt = []
    targVectorsDo = []

    for i in range(len(qtokens)):
        token = qtokens[i]

        qvecSt = standard_vectors.lookup(token)
        qvecDo = domain_vectors.lookup(token)

        if token in candidate_list:
            targVectorsSt.append(qvecSt)
            targVectorsDo.append(qvecDo)
        else:
            conVectorsSt.append(qvecSt)
            conVectorsDo.append(qvecDo)

    count = 0
    stanDistTotal = 0
    domDistTotal = 0
    for i in range(len(targVectorsSt)):
        count += 1
        targSt = targVectorsSt[i]
        targDo = targVectorsDo[i]

        for j in range(len(conVectorsSt)):
            conSt = conVectorsSt[j]
            conDo = conVectorsDo[j]

            stanDistTotal += vecDistance(targSt, conSt)
            domDistTotal += vecDistance(targDo, conDo)

    stanDistTotal = float(stanDistTotal) / count
    domDistTotal = float(domDistTotal) / count

    if stanDistTotal > domDistTotal:
        context_feat = domDistTotal / stanDistTotal
    else:
        context_feat = -1 * (stanDistTotal / domDistTotal)

    con_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(context_feat, con_feat_fn)

    return
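vecDistance, standard_vectors and domain_vectors are defined outside this snippet; a minimal sketch of vecDistance, assuming plain Euclidean distance between two embedding vectors (cosine distance would fit the surrounding logic equally well):

import numpy as np

def vecDistance(vec_a, vec_b):
    # assumed helper: Euclidean distance between two embedding vectors
    return np.linalg.norm(np.asarray(vec_a) - np.asarray(vec_b))
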
Example #7
def extract_feat_on_qid(qid):
    print qid
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")

    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    all_q_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        tokens = word_tokenize(q_context_text)
        all_q_tokens += tokens

    token_dict, doclen = get_token_dict(all_q_tokens)

    word_bm25_score = {}
    for cand_word in candidate_list:

        #   default to zero counts when the candidate never occurs
        df = df_dict.get(cand_word, 0)
        tf = token_dict.get(cand_word, 0)

        rsj_weight = math.log((N - df + 0.5) / float(df + 0.5))
        tf_weight = tf / float(tf + k1 * ((1 - b) + b *
                                          (doclen / float(avg_doclen))))
        user_weight = (k3 + 1) * 1 / float((k3 + 1))

        all_score = rsj_weight * tf_weight * user_weight
        word_bm25_score[cand_word] = all_score
        if all_score != 0:
            print cand_word, qid
        pass

    gen_utils.write_dict_to_pkl(word_bm25_score, dst_feat_fn)
    return
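The score above is the usual Okapi BM25 product of an RSJ (idf-style) weight, a length-normalized tf weight, and a user weight that reduces to 1 when the query term frequency is 1. A standalone sketch of the same computation (N, df and avg_doclen come from the BM25_meta.pkl built in Example #13; the k1, b, k3 and qtf defaults below are common BM25 settings and assumptions, not values taken from this code):

import math

def bm25_term_score(tf, df, doclen, N, avg_doclen, k1=1.2, b=0.75, k3=0, qtf=1):
    # sketch of the per-term BM25 score used in the example above
    rsj_weight = math.log((N - df + 0.5) / float(df + 0.5))
    tf_weight = tf / float(tf + k1 * ((1 - b) + b * (doclen / float(avg_doclen))))
    user_weight = (k3 + 1) * qtf / float(k3 + qtf)
    return rsj_weight * tf_weight * user_weight
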
Example #8
def pred_a_cfg(cfg):


    feat_type=cfg[0]
    trn_split=cfg[2]
    eval_list=cfg[-1]

    feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
    pred_root_path = os.path.join(data_cfg.pred_root_path,model_type,feat_type)
    if not os.path.exists(pred_root_path):
        os.makedirs(pred_root_path)

    model_root_path = os.path.join(data_cfg.model_root_path,feat_type,trn_split,model_type)
    model_fn=os.path.join(model_root_path,"model.pkl")
    model=gen_utils.read_dict_from_pkl(model_fn)


    qid_list=gen_utils.read_dict_from_pkl(eval_list)
    for qid in qid_list:
        print "Loading feature: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature not exist:", qid
            continue

        feat=np.load(feat_fn)['feat']
        feat=np.expand_dims(feat, axis=0)
        pred_label = cand_list[model.predict(feat)[0]]
        all_pred_score = model.decision_function(feat)[0].tolist()

        q_pred = {'all_pred_probs': all_pred_score, 'pred_term': pred_label}
        pred_fn=os.path.join(pred_root_path,str(qid)+".pkl")
        gen_utils.write_dict_to_pkl(q_pred,pred_fn)


        pass

    return
Example #9
def handle_a_split(split):

    global gid


    lctx_fn=os.path.join(lctx_root_path,split+"_contexts.json.gz")
    sctx_fn=os.path.join(sctx_root_path,split+"_contexts.json.gz")
    q_fn=os.path.join(q_root_path,split+"_questions.json.gz")
    assert os.path.isfile(lctx_fn)
    assert os.path.isfile(sctx_fn)
    assert os.path.isfile(q_fn)
    print "Handling: ",lctx_fn

    f_lctx=gzip.open(lctx_fn)
    f_sctx=gzip.open(sctx_fn)
    f_q=gzip.open(q_fn)

    for lctx,sctx,q in zip(f_lctx,f_sctx,f_q):
        #print "Handling q:",q
        lctx=json.loads(lctx)
        sctx=json.loads(sctx)
        q=json.loads(q)

        lctx_dict=parse_ctx(lctx['contexts'])
        sctx_dict=parse_ctx(sctx['contexts'])

        q_meta=q
        gt=str(q_meta['answer'])

        #   use distinct names for the per-question outputs so the input archive paths are not clobbered
        dst_lctx_fn=os.path.join(data_cfg.long_ctx_root_path,str(gid)+".pkl")
        dst_sctx_fn=os.path.join(data_cfg.short_ctx_root_path,str(gid)+".pkl")
        q_meta_fn=os.path.join(data_cfg.q_root_path,str(gid)+".pkl")
        gt_fn=os.path.join(data_cfg.gt_root_path,str(gid)+".pkl")

        gid+=1
        gen_utils.write_dict_to_pkl(lctx_dict,dst_lctx_fn)
        gen_utils.write_dict_to_pkl(sctx_dict,dst_sctx_fn)
        gen_utils.write_dict_to_pkl(q_meta,q_meta_fn)
        gen_utils.write_dict_to_pkl(gt,gt_fn)
        pass


    f_lctx.close()
    f_sctx.close()
    f_q.close()

    return gid
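parse_ctx is not shown; a minimal sketch, assuming each raw context entry is a (retrieval_score, text) pair as in the Quasar context files, and that the per-context dict exposes the text under the 'question' key used by the other examples (the 'score' key and the sequential context ids are hypothetical):

def parse_ctx(raw_contexts):
    # assumed helper: map each retrieved context to {ctx_id: {'score': ..., 'question': text}}
    ctx_dict = {}
    for ctx_id, ctx in enumerate(raw_contexts):
        score, text = ctx
        ctx_dict[ctx_id] = {'score': score, 'question': text}
    return ctx_dict
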
Example #10
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature not exist:", qid
            continue

        gt_token = data_utils.load_quaser_gt_by_id(qid)
        gt_label = cand_list.index(gt_token)
        feat = np.load(feat_fn)['feat']

        ret_feats.append(feat)
        ret_labels.append(gt_label)

    return ret_feats, ret_labels


if __name__ == "__main__":

    for cfg in train_cfgs:
        qid_list = gen_utils.read_dict_from_pkl(cfg[1])
        model_root_path = os.path.join(data_cfg.model_root_path, cfg[0],
                                       cfg[2], model_type)
        print model_root_path
        all_train_feats, all_train_labels = load_feat_and_labels(qid_list, cfg)
        print "training model, cfg: ", cfg
        model = MultinomialNB()
        model = model.fit(all_train_feats, all_train_labels)
        if not os.path.exists(model_root_path):
            os.makedirs(model_root_path)
        model_fn = os.path.join(model_root_path, "model.pkl")
        gen_utils.write_dict_to_pkl(model, model_fn)
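gen_utils.write_dict_to_pkl / read_dict_from_pkl are used throughout to persist lists, dicts and even the fitted sklearn model, so they are presumably thin wrappers around pickle rather than dict-specific helpers; a minimal sketch under that assumption:

import pickle

def write_dict_to_pkl(obj, fn):
    # pickle any Python object to the given path
    with open(fn, 'wb') as f:
        pickle.dump(obj, f)

def read_dict_from_pkl(fn):
    # load a previously pickled object
    with open(fn, 'rb') as f:
        return pickle.load(f)
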
Example #11
candidate_list=gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

if __name__=="__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)
    for qid in qa_list:

        if qid%1000==0:
            print "classify on qid: ",qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")

        wd_dict = gen_utils.read_dict_from_pkl(feat_fn)

        cur_probs = []
        max_prob = -10000
        max_label = None
        for cand_term in candidate_list:
            if cand_term in wd_dict:
                cur_prob=-wd_dict[cand_term]
            else:
                cur_prob=0
            if cur_prob>max_prob:
                max_label=cand_term
                max_prob=cur_prob
            cur_probs.append(cur_prob)

        q_pred = {'all_pred_probs': cur_probs, 'pred_term': max_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)

    print "done."
Example #12
        gen_utils.write_dict_to_pkl(q_meta,q_meta_fn)
        gen_utils.write_dict_to_pkl(gt,gt_fn)
        pass


    f_lctx.close()
    f_sctx.close()
    f_q.close()

    return gid

'''
if __name__=="__main__":
    for split in split_list:
        cur_gid=handle_a_split(split)
        print split,cur_gid

    print "done."
'''

if __name__=="__main__":
    trn_list=[_ for _ in xrange(0,31049)]
    test_list=[_ for _ in xrange(31049,34223)]
    val_list=[_ for _ in xrange(34223,37362)]
    all_list=trn_list+test_list+val_list

    gen_utils.write_dict_to_pkl(trn_list,data_cfg.trn_list_fn)
    gen_utils.write_dict_to_pkl(val_list,data_cfg.val_list_fn)
    gen_utils.write_dict_to_pkl(test_list,data_cfg.tst_list_fn)
    gen_utils.write_dict_to_pkl(all_list,data_cfg.all_list_fn)
    print "done."
Example #13
        for cid, context in q_context_dict.iteritems():
            cur_question = context['question']
            cur_question_tokens = word_tokenize(cur_question)
            all_context_tokens += cur_question_tokens
            doc_len = doc_len + len(cur_question_tokens)
        all_doc_len.append(doc_len)

        if len(all_doc_len) > 2000:
            break

    import numpy as np

    return np.average(all_doc_len)


if __name__ == "__main__":
    dst_fn = os.path.join(data_cfg.dataset_root_path, "BM25_meta.pkl")
    bm25_meta = gen_utils.read_dict_from_pkl(dst_fn)
    avg_doc_len = calc_avg_doc_len()

    bm25_meta['avg_doc_len'] = avg_doc_len
    '''
    N=calc_bm25_N()
    cand_df_dict=calc_candidate_df()
    bm25_meta={'N':N,'df_dict':cand_df_dict}
    gen_utils.write_dict_to_pkl(bm25_meta,dst_fn)
    '''

    gen_utils.write_dict_to_pkl(bm25_meta, dst_fn)
    print "done."
Example #14
        q_context_dict = data_utils.load_quaser_lctx_by_id(qid)

        for cid, context in q_context_dict.iteritems():
            cur_question = context['question']
            cur_question_tokens = word_tokenize(cur_question)
            all_context_tokens += cur_question_tokens

        token_cnt_dict={}
        for token in all_context_tokens:
            if not token in token_cnt_dict:
                token_cnt_dict[token]=0

            token_cnt_dict[token]+=1
        c_len+=len(all_context_tokens)
        for cand in candidate_list:
            if cand in token_cnt_dict:
                ret_dict[cand]+=token_cnt_dict[cand]
                print cand,ret_dict[cand]

        print qid

    return ret_dict,c_len

if __name__=="__main__":
    indri_meta={'lambda':lam,'mu':mu}
    dst_fn = os.path.join(data_cfg.dataset_root_path, "Indri_meta.pkl")
    ctf_dict,c_len=get_ctf_dict()
    indri_meta['ctf']=ctf_dict
    indri_meta['C']=c_len
    gen_utils.write_dict_to_pkl(indri_meta,dst_fn)
    print "done."