def filter_id_set_on_conf(conf, cand_id_set, corr):
    ret_id_set = set()
    feat_name = conf[0]
    clas_name = conf[1]

    pred_root_path = os.path.join(data_cfg.pred_root_path, clas_name,
                                  feat_name)
    gt_root_path = data_cfg.gt_root_path

    for qid in cand_id_set:

        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)
        pred_term = pred_meta['pred_term']

        if corr and gt == pred_term:
            ret_id_set.add(qid)
            continue

        if (not corr) and gt != pred_term:
            ret_id_set.add(qid)
            continue

    return ret_id_set
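A possible driver for filter_id_set_on_conf, mirroring the corr_conf_list / erro_conf_list pattern in the viz script further down this page (Example #24): start from the full qid set and intersect, keeping only ids that every "correct" conf predicts right and every "error" conf predicts wrong. The loop below is a hypothetical sketch, not taken verbatim from the repo.

viz_id_set = set(qid_list)
for conf in corr_conf_list:
    viz_id_set = filter_id_set_on_conf(conf, viz_id_set, True)
for conf in erro_conf_list:
    viz_id_set = filter_id_set_on_conf(conf, viz_id_set, False)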
Example #2
def evaluate_a_conf(conf):
    feat_type = conf[0]
    cls_type = conf[1]
    qid_list = gen_utils.read_dict_from_pkl(conf[2])
    pred_root_path = os.path.join(data_cfg.pred_root_path, cls_type, feat_type)
    gt_root_path = data_cfg.gt_root_path

    total_ins = 0

    acc_dict = {}
    for k in topk_list:
        acc_dict[k] = 0

    for qid in qid_list:
        total_ins += 1
        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)

        for k in topk_list:
            ret = pred_k_correct(pred_meta['all_pred_probs'], gt, k)
            if ret:
                acc_dict[k] += 1

    for k in topk_list:
        acc_dict[k] = float(acc_dict[k]) / float(total_ins)

    return acc_dict
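pred_k_correct is called above but never defined in these snippets. A minimal sketch, assuming all_pred_probs is a score list aligned with cand_list (which is how pred_a_cfg in Example #8 writes it) and that np and cand_list are available at module level; the repo's real helper may differ:

def pred_k_correct(all_pred_probs, gt, k):
    #   true if the ground-truth term is among the k highest-scoring candidates
    topk_idx = np.argsort(all_pred_probs)[::-1][:k]
    return cand_list.index(gt) in topk_idx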
Example #3
def vec_bm25_func(feat_root_path, qid):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    ret_feat = np.asarray([0.0] * len(cand_list))

    try:
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        for key, val in org_feat.items():
            ret_feat[cand_list.index(key)] = val
    except Exception:
        #   feature may not exist for this qid
        pass

    return ret_feat
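A hypothetical consumer of these per-qid vectors, stacking them into a feature matrix (the actual training code in Example #26 loads precomputed .npz features instead, so this is only an illustration):

feats = [vec_bm25_func(feat_root_path, qid) for qid in qid_list]
X = np.vstack(feats)  #   one row of len(cand_list) BM25 scores per question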
Example #4
def tag_pos(qid):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)

    pos_list = []
    for _, pos in qpos:
        pos_list.append(pos)

    pos_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(pos_list, pos_fn)

    return
Example #5
def evaluate_a_conf(conf):
    feat_type = conf[0]
    cls_type = conf[1]
    qid_list = gen_utils.read_dict_from_pkl(conf[2])
    pred_root_path = os.path.join(data_cfg.pred_root_path, cls_type, feat_type)
    gt_root_path = data_cfg.gt_root_path

    acc_count = 0
    total_ins = 0

    for qid in qid_list:
        total_ins += 1
        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)
        pred_term = pred_meta['pred_term']
        if gt == pred_term:
            acc_count += 1

    return acc_count / float(total_ins)
Example #6
def vec_embed_sim(feat_root_path, qid, bin_size=10):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    ret_feat = np.asarray([0.0] * bin_size * len(cand_list))

    try:
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        for i in xrange(0, len(cand_list)):
            token = cand_list[i]
            ret_feat[i * bin_size:(i + 1) * bin_size] = org_feat[token]

    except Exception:
        #   feature may not exist for this qid
        pass

    return ret_feat
Example #7
def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path,
                     str(qid) + "_tokens.pkl"))  #   check actual address
    conVectorsSt = []
    conVectorsDo = []

    targVectorsSt = []
    targVectorsDo = []

    for token in qtokens:
        qvecSt = standard_vectors.lookup(token)
        qvecDo = domain_vectors.lookup(token)

        if token in candidate_list:
            targVectorsSt.append(qvecSt)
            targVectorsDo.append(qvecDo)
        else:
            conVectorsSt.append(qvecSt)
            conVectorsDo.append(qvecDo)

    count = 0
    stanDistTotal = 0
    domDistTotal = 0
    for i in range(len(targVectorsSt)):
        count += 1
        targSt = targVectorsSt[i]
        targDo = targVectorsDo[i]

        for j in range(len(conVectorsSt)):
            conSt = conVectorsSt[j]
            conDo = conVectorsDo[j]

            stanDistTotal += vecDistance(targSt, conSt)
            domDistTotal += vecDistance(targDo, conDo)

    #   average the summed pairwise distances over the number of target tokens
    stanDistTotal = float(stanDistTotal) / count
    domDistTotal = float(domDistTotal) / count

    if stanDistTotal > domDistTotal:
        context_feat = domDistTotal / stanDistTotal
    else:
        context_feat = -1 * (stanDistTotal / domDistTotal)

    con_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(context_feat, con_feat_fn)

    return
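vecDistance is referenced above but not defined anywhere in these snippets. A minimal sketch, assuming it computes a cosine distance over embedding vectors (an assumption, not confirmed by the repo; scipy's spatial module is already used this way in Example #18):

from scipy import spatial

def vecDistance(vec_a, vec_b):
    #   cosine distance between two embedding vectors (assumed semantics)
    return spatial.distance.cosine(vec_a, vec_b)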
Example #8
def pred_a_cfg(cfg):
    feat_type = cfg[0]
    trn_split = cfg[2]
    eval_list = cfg[-1]

    feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
    pred_root_path = os.path.join(data_cfg.pred_root_path, model_type,
                                  feat_type)
    if not os.path.exists(pred_root_path):
        os.makedirs(pred_root_path)

    model_root_path = os.path.join(data_cfg.model_root_path, feat_type,
                                   trn_split, model_type)
    model_fn = os.path.join(model_root_path, "model.pkl")
    model = gen_utils.read_dict_from_pkl(model_fn)

    qid_list = gen_utils.read_dict_from_pkl(eval_list)
    for qid in qid_list:
        print "Loading feature: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature not exist:", qid
            continue

        feat=np.load(feat_fn)['feat']
        feat=np.expand_dims(feat, axis=0)
        pred_label = cand_list[model.predict(feat)[0]]
        all_pred_score = model.decision_function(feat)[0].tolist()

        q_pred = {'all_pred_probs': all_pred_score, 'pred_term': pred_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)

    return
Example #9
def vec_mfe_func(feat_root_path, qid):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")

    ret_feat = np.asarray([0.0] * len(cand_list))

    try:
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        ret_feat[cand_list.index(org_feat['most_common'])] = 1

        for other_token in org_feat['cand_token_list']:
            ret_feat[cand_list.index(other_token)] = 0.5
    except Exception:
        #   feature may not exist for this qid
        pass

    return ret_feat
Example #10
def get_cand_gt_dist():
    qid_list = gen_utils.read_dict_from_pkl(qid_list_fn)

    #   calculate the candidate ground-truth distribution
    count_dict = {}
    for cand in candidate_list:
        count_dict[cand] = 0

    for qid in qid_list:
        gt = data_utils.load_quaser_gt_by_id(qid)
        count_dict[gt] += 1

    total = float(sum(count_dict.values()))
    for key, val in count_dict.iteritems():
        print key, val / total
    return
Example #11
def load_quaser_qmeta_by_id(id):
    q_fn = os.path.join(q_root_path, str(id) + ".pkl")
    q = gen_utils.read_dict_from_pkl(q_fn)
    return q
Example #12
def load_quaser_sctx_by_id(id):
    sctx_fn = os.path.join(sctx_root_path, str(id) + ".pkl")
    sctx = gen_utils.read_dict_from_pkl(sctx_fn)
    return sctx
Example #13
def load_quaser_gt_by_id(id):
    gt_fn = os.path.join(gt_root_path, str(id) + ".pkl")
    gt = gen_utils.read_dict_from_pkl(gt_fn)
    return gt
Example #14
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
from modules.VocabEntry import VocabEntry

lst_fn = data_cfg.all_list_fn
qa_list = gen_utils.read_dict_from_pkl(lst_fn)

gt_root_path = data_cfg.gt_root_path
lctx_root_path = data_cfg.long_ctx_root_path
sctx_root_path = data_cfg.short_ctx_root_path
q_root_path = data_cfg.q_root_path

cand_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

def get_q_text(qid):
    ret_lines=[]
    gt_text,lctx_dict,sctx_dict,q_meta=data_utils.load_quaser_all_by_id(qid)
    ret_lines.append(gt_text)
    for _,meta in lctx_dict.iteritems():
        ret_lines.append(meta['question'])

    for _, meta in sctx_dict.iteritems():
        ret_lines.append(meta['question'])

    ret_lines.append(q_meta['question'])
    ret_lines.append(q_meta['answer'])

    return ret_lines
Example #15
'''
    11/11/2018: Calculate the BM25 meta information (N and df_t).
'''

import os
import sys
sys.path.append("../")
sys.path.append("../../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
from nltk import word_tokenize

#   process for all types of questions
lst_fn = data_cfg.all_list_fn
q_list = gen_utils.read_dict_from_pkl(lst_fn)
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)


def calc_bm25_N():
    doc_id_list = gen_utils.read_dict_from_pkl(lst_fn)
    N = len(doc_id_list)
    return N


def calc_candidate_df():
    cand_df_dict = {}
    for cand_word in candidate_list:
        if not cand_word in cand_df_dict:
            cand_df_dict[cand_word] = 0
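The snippet is truncated before the counting loop. A hedged sketch of how calc_candidate_df might finish, treating each question's long contexts as one document for df purposes: load_quaser_lctx_by_id exists in this repo (see Example #20), but the 'context' field name and the per-qid document granularity are assumptions.

def calc_candidate_df():
    cand_df_dict = {}
    for cand_word in candidate_list:
        cand_df_dict[cand_word] = 0

    #   count, for each candidate term, the documents containing it at least once
    for qid in q_list:
        ctx_dict = data_utils.load_quaser_lctx_by_id(qid)
        doc_tokens = set()
        for _, meta in ctx_dict.iteritems():
            doc_tokens.update(word_tokenize(meta['context']))  #   assumed field
        for cand_word in candidate_list:
            if cand_word in doc_tokens:
                cand_df_dict[cand_word] += 1

    return cand_df_dict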
Example #16
        #   obtain the one with maximum frequency
        most_common_token = most_common_in_list(all_context_tokens)
    else:
        most_common_token = None
    dst_meta = {
        'cand_token_list': all_context_tokens,
        'most_common': most_common_token
    }

    gen_utils.write_dict_to_pkl(dst_meta, dst_feat_fn)

    return


if __name__ == "__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)

    #   use multi-thread for fast processing
    thread_pool = []

    #   extract feature for a specific question/answer context
    for qid in qa_list:
        th = threading.Thread(target=extract_feat_on_qid, args=(qid, ))
        th.start()
        thread_pool.append(th)

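        #   throttle: busy-wait until the number of live threads drops below MAX_TH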
        while len(threading.enumerate()) >= MAX_TH:
            pass

    for th in thread_pool:
        th.join()
Example #17
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
import threading
import nltk

lst_fn = data_cfg.all_list_fn

candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

standard_vectors = gen_utils.read_dict_from_pkl(
    data_cfg.standard_vectors)  #address
domain_vectors = gen_utils.read_dict_from_pkl(
    data_cfg.domain_vectors)  #address


def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path,
                     str(qid) + "_tokens.pkl"))  #   check actual address
    conVectorsSt = []
    conVectorsDo = []

    targVectorsSt = []
    targVectorsDo = []

    for token in qtokens:
Example #18
#   the name of the feature
feat_type = "w_embed_sim"
MAX_TH = 16

stop_words = set(stopwords.words('english'))

#   create the feature root path
feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
if not os.path.exists(feat_root_path):
    os.makedirs(feat_root_path)

candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

word_embed_mat = np.load(
    "/home/jiangl1/data/11791_data/quaser/raw/id2wordvec.npy")
vocab = gen_utils.read_dict_from_pkl(
    "/home/jiangl1/data/11791_data/quaser/vocab.pkl")

print "Initialize finished..."


def calc_bin_vec(cand_vec, token_list):
    bin = np.zeros(10)
    for token in token_list:
        try:
            token_vec = word_embed_mat[vocab.word2id[token]]
        except Exception:
            #   token not in the vocabulary
            continue
        sim = 1 - spatial.distance.cosine(cand_vec, token_vec)
        sim_dim = int((sim + 1) / 0.2 - 1)  #   (from -1 to 1, 0.2 per interval)
        bin[sim_dim] += 1

    return bin
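As a worked check of the binning arithmetic: a similarity of 0.31 gives (0.31 + 1) / 0.2 - 1 = 5.55, which truncates to bin 5, and sim = 1 lands exactly in bin 9. The edge case sim = -1 evaluates to index -1, which numpy silently wraps to the last bin; a guard such as max(sim_dim, 0) may be intended here.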
Example #19
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import math

indri_meta = gen_utils.read_dict_from_pkl("data/Indri_meta.pkl")

stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")

lam = indri_meta['lambda']
mu = indri_meta['mu']
C = indri_meta['C']
ctf_dict = indri_meta['ctf']


def get_token_dict(tokens):
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
            ret_dict[token] = 0
        ret_dict[token] += 1
        total_token += 1

    return ret_dict, total_token


def multi_process_helper(args):
Example #20
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils

trn_lst = gen_utils.read_dict_from_pkl(data_cfg.trn_list_fn)
print "Initialize finished..."
ins_lst = trn_lst[0:2]

if __name__ == "__main__":

    all_ins = []
    for ins_id in ins_lst:
        print ins_id
        question = data_utils.load_quaser_qmeta_by_id(ins_id)
        context = data_utils.load_quaser_lctx_by_id(ins_id)
        q_and_context = {'question': question, 'context': context}
        all_ins.append(q_and_context)

    gen_utils.write_dict_to_json(
        all_ins,
        "/home/jiangl1/data/11791_data/jiang_codes/BOOM/examples/QS/test_data.json")
    print "done."
Example #21
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import math

bm25_meta = gen_utils.read_dict_from_pkl("data/BM25_meta.pkl")
N = bm25_meta['N']
avg_doclen = bm25_meta['avg_doc_len']
df_dict = bm25_meta['df_dict']
stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")


def get_token_dict(tokens):
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
            ret_dict[token] = 0
        ret_dict[token] += 1
        total_token += 1

    return ret_dict, total_token


def multi_process_helper(args):
    q_and_context_list = args[0]
    k1 = args[1]
    b = args[2]
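The helper is truncated here. For reference, a minimal sketch of the Okapi BM25 score for one term against one document, using the N, avg_doclen, and df_dict values loaded above and the tf counts from get_token_dict; this is the textbook formula, not necessarily the exact body that was cut off:

def bm25_term_score(term, token_dict, doc_len, k1, b):
    #   Okapi BM25 contribution of a single term
    tf = token_dict.get(term, 0)
    df = df_dict.get(term, 0)
    if tf == 0 or df == 0:
        return 0.0
    idf = math.log((N - df + 0.5) / (df + 0.5))
    norm = tf + k1 * (1 - b + b * float(doc_len) / avg_doclen)
    return idf * tf * (k1 + 1) / norm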
Example #22
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import numpy as np

stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")
model = gen_utils.read_dict_from_pkl(
    "models/MF-e.MF-i.bm25_scores.indri_scores/train/linear_svm/model.pkl")


def multi_process_helper(args):
    q_and_context_list = args[0]
    ret_list = []

    print "In prediction: ", len(q_and_context_list), type(q_and_context_list)
    id = 0

    for q_and_context in q_and_context_list:
        id += 1
        print "In prediction, ind ", id, "/", len(q_and_context_list)

        feat = np.asarray(q_and_context['final_feat'])
        feat = np.expand_dims(feat, axis=0)
        pred_label = candidate_list[model.predict(feat)[0]]

        final_meta = {}
        final_meta['pred'] = pred_label
        final_meta['q_meta'] = q_and_context['question']
Example #23
#   process for all types of questions
lst_fn = data_cfg.all_list_fn

#   the name of the feature
feat_type = "indri_scores"

MAX_TH = 32

#   create the feature root path
feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
if not os.path.exists(feat_root_path):
    os.makedirs(feat_root_path)

candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)
indri_meta = gen_utils.read_dict_from_pkl(
    os.path.join(data_cfg.dataset_root_path, "Indri_meta.pkl"))

lam = indri_meta['lambda']
mu = indri_meta['mu']
C = indri_meta['C']
ctf_dict = indri_meta['ctf']

print lam, mu, C
print "init finiished..."


def get_token_dict(tokens):
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
            ret_dict[token] = 0
        ret_dict[token] += 1
        total_token += 1

    return ret_dict, total_token
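Example #23's scoring code is cut off above. A minimal sketch of the two-stage smoothed term probability that Indri uses, built from the lam, mu, C, and ctf_dict values loaded in this module; it assumes C is the total token count of the collection and ctf_dict maps terms to collection frequencies, with the query score being the sum of log probabilities over query terms:

def indri_term_prob(term, token_dict, doc_len):
    #   two-stage smoothing: Dirichlet inside, Jelinek-Mercer outside
    p_coll = float(ctf_dict.get(term, 0)) / C
    p_dir = (token_dict.get(term, 0) + mu * p_coll) / (doc_len + mu)
    return lam * p_dir + (1 - lam) * p_coll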
Example #24
'''
   11/22/2018: Viz and compare the qualitative errors on the given data.
'''

import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils

qid_list = gen_utils.read_dict_from_pkl(data_cfg.tst_list_fn)
'''
#   all correct
#   a list of correction cfgs (intersection will be applied internally)
corr_conf_list=[("MF-e.MF-i.bm25_scores.indri_scores","linear_svm"),("MF-e.MF-i","linear_svm")]

#   a list of error cfgs (intersection will be applied internally)
erro_conf_list=[]
'''

MAX_VIZ = 10
'''
#   one correct
#   a list of correction cfgs (intersection will be applied internally)
corr_conf_list=[("MF-e.MF-i.bm25_scores.indri_scores","linear_svm")]

#   a list of error cfgs (intersection will be applied internally)
erro_conf_list=[("MF-e.MF-i","linear_svm")]
'''
Example #25
def calc_bm25_N():
    doc_id_list = gen_utils.read_dict_from_pkl(lst_fn)
    N = len(doc_id_list)
    return N
Example #26
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature not exist:", qid
            continue

        gt_token = data_utils.load_quaser_gt_by_id(qid)
        gt_label = cand_list.index(gt_token)
        feat = np.load(feat_fn)['feat']

        ret_feats.append(feat)
        ret_labels.append(gt_label)

    return ret_feats, ret_labels


if __name__ == "__main__":

    for cfg in train_cfgs:
        qid_list = gen_utils.read_dict_from_pkl(cfg[1])
        model_root_path = os.path.join(data_cfg.model_root_path, cfg[0],
                                       cfg[2], model_type)
        print model_root_path
        all_train_feats, all_train_labels = load_feat_and_labels(qid_list, cfg)
        print "training model, cfg: ", cfg
        model = MultinomialNB()
        model = model.fit(all_train_feats, all_train_labels)
        if not os.path.exists(model_root_path):
            os.makedirs(model_root_path)
        model_fn = os.path.join(model_root_path, "model.pkl")
        gen_utils.write_dict_to_pkl(model, model_fn)
Example #27
#   process for all types of questions
lst_fn = data_cfg.all_list_fn

feat_type = "bm25_scores"
classifier_type = "heuristic"

feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
pred_root_path = os.path.join(data_cfg.pred_root_path, classifier_type,
                              feat_type)
if not os.path.exists(pred_root_path):
    os.makedirs(pred_root_path)

candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

if __name__ == "__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)
    for qid in qa_list:
        print "classify on qid: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
        bm25_dict = gen_utils.read_dict_from_pkl(feat_fn)

        cur_prob = []
        max_prob = -1
        max_label = None

        for cand_term in candidate_list:
            cur_prob.append(bm25_dict[cand_term])
            if bm25_dict[cand_term] > max_prob:
                max_prob = bm25_dict[cand_term]
                max_label = cand_term
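The snippet cuts off after the argmax loop. A hedged sketch of the likely continuation, mirroring the q_pred dict that pred_a_cfg writes in Example #8 (an assumption, not the verbatim tail of this file):

        #   package the scores and the argmax term, then write the prediction
        q_pred = {'all_pred_probs': cur_prob, 'pred_term': max_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)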