Example #1
# NOTE: imports assumed by this snippet (not shown in the original listing);
# EvaluateModel and cosineFast are project-local helpers defined elsewhere
# in this repository.
import pickle

import numpy as np
import tensorflow as tf
from keras.models import load_model


def main(args):
    exp_path = args["exp_path"]
    isTraining = args["isTraining"]
    model_name = args["model_name"]

    if isTraining:
        data_path = "data/Train"
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    else:
        data_path = "data/Test"
        rel_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

    # Read data
    with open(data_path + "/qry_IDs.pkl", "rb") as f:
        qry_IDs = pickle.load(f)
    with open(data_path + "/doc_IDs.pkl", "rb") as f:
        doc_IDs = pickle.load(f)

    qry_tf = np.load(data_path + "/x_qry_tf_mdl.npy")
    doc = np.load(data_path + "/doc_mdl.npy")

    #mean = np.load(exp_path + "/mean.npy")
    #stdv = np.load(exp_path + "/stdv.npy")
    #valid_idx = np.nonzero(stdv)

    # Load model
    model = load_model(exp_path + "/" + model_name)
    # Evaluation
    evaluate_model = EvaluateModel(rel_path, isTraining)

    with tf.device('/device:GPU:0'):
        # Inference (the normalization below was disabled)
        #qry_tf[:, valid_idx] = (qry_tf[:, valid_idx] - mean[valid_idx]) / stdv[valid_idx]
        rel_mdl = model.predict(qry_tf)
        #rel_mdl = rel_mdl * stdv[-1] + mean[-1]

    qry_docs_ranking = cosineFast(rel_mdl, qry_IDs, doc, doc_IDs)
    mAP = evaluate_model.mAP(qry_docs_ranking)
    print(mAP)
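
# --- Usage sketch (not from the original repo) ------------------------------
# A minimal invocation of main() above; "exp/baseline" and "model.h5" are
# placeholder names, assuming the pickled IDs and .npy matrices already exist
# under data/Test.
if __name__ == "__main__":
    args = {
        "exp_path": "exp/baseline",
        "isTraining": False,
        "model_name": "model.h5",
    }
    main(args)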
Example #2
    def __init__(self,
                 qry_path=None,
                 rel_path=None,
                 isTraining=True,
                 doc_path=None):
        # default paths for the training step
        if qry_path is None:
            qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if doc_path is None:
            doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
        if rel_path is None:
            rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
        self.vocab_size = 51253
        # relevance set
        self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
        self.evaluate_model = EvaluateModel(rel_path, isTraining)

        # read documents
        doc = ProcDoc.readFile(doc_path)
        self.doc = ProcDoc.docPreproc(doc)
        self.doc_len = Statistical.compLenAcc(self.doc)

        # read queries
        qry = ProcDoc.readFile(qry_path)
        self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
        self.qry_len = Statistical.compLenAcc(self.qry_tf)
        [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc,
                                                 self.qry_len, self.doc_len)

        # dict to numpy
        self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
        self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
        self.doc, self.doc_IDs = self.__dict2np(self.doc)

        # L2-normalize document vectors (for the cosine scoring later)
        self.doc = Statistical.l2Normalize(self.doc)
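
    # --- Sketch (assumed, not from the original repo) ------------------------
    # __dict2np is called above but not shown in this listing. A plausible
    # reading, assuming numpy is imported as np at module level: flatten a
    # {name: {term_id: weight}} dict into a dense (n, vocab_size) matrix and
    # return it with the row-aligned ID list. The real helper may differ.
    def __dict2np(self, model_dict, target_IDs=None):
        # reuse target_IDs when given so rows stay aligned across models
        IDs = target_IDs if target_IDs is not None else list(model_dict.keys())
        mat = np.zeros((len(IDs), self.vocab_size))
        for row, name in enumerate(IDs):
            for term_id, weight in model_dict[name].items():
                mat[row, int(term_id)] = weight
        return mat, IDs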
Example #3
# NOTE: assumes the usual Keras backend import for this metric
from keras import backend as K


def precision(y_true, y_pred):
    """Precision metric.
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
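
# --- Usage sketch (not from the original repo) ------------------------------
# How the batch-wise metric above can be wired into a Keras model; the tiny
# classifier is a placeholder, only the metrics=[...] hookup matters.
from keras.models import Sequential
from keras.layers import Dense

toy_model = Sequential([Dense(1, activation="sigmoid",
                              input_shape=(NUM_OF_FEATS,))])
toy_model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=[precision])  # reported per batch during training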


input_data_process = InputDataProcess(NUM_OF_FEATS, MAX_QRY_LENGTH,
                                      MAX_DOC_LENGTH)  #, test_path)
evaluate_model = EvaluateModel(
    "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain", True)
# Parameters
params = {
    'input_data_process': input_data_process,
    'dim_x': MAX_QRY_LENGTH,
    'dim_y': MAX_DOC_LENGTH,
    'dim_x1': NUM_OF_FEATS,
    'batch_size': batch_size,
    'shuffle': False
}

[partition, labels,
 partition_answer] = input_data_process.genTrainValidSet(percent)

# Generators
training_generator = DataGenerator(**params).generate(labels,
Example #4
# NOTE: imports assumed by this snippet; query_unigram, query_wordcount,
# collection, background_model, query_lambda, remove_list and word_emb_path
# are defined in code elided from this listing.
import os
import pickle as Pickle

import numpy as np

query_model = query_unigram
Pickle.dump(query_model, open("model/query_model.pkl", "wb"), True)

# remove template file
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)

# Embedded Query Expansion
m_list = np.linspace(4, 4, num=1)
m = 1
interpolated_alpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model(word_emb_path)

embd = EmbeddedBased(query_wordcount, collection, word2vec)
evaluate_model = EvaluateModel(relevance_path)
EQE1 = []
EQE2 = []
print "Embedded..."
# Embedding-based system (hyperparameter)
tmp_eqe1 = embd.embedded_query_expansion_ci(0.4, 4)
tmp_eqe2 = embd.embedded_query_expansion_qi(0.4, 4)
tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
EQE1.append([ProcDoc.dict2np(tmp_eqe1), tmp_eqe1])
EQE2.append([ProcDoc.dict2np(tmp_eqe2), tmp_eqe2])

Pickle.dump(EQE1, open("model/eqe1_10.pkl", "wb"), True)
Pickle.dump(EQE2, open("model/eqe2_10.pkl", "wb"), True)
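
# --- Sketch (assumed, not from the original repo) ----------------------------
# ProcDoc.modeling is not shown in this listing; the calls above read like
# standard linear interpolation with the collection/background model, i.e.
#     p'(w|q) = query_lambda * p(w|q) + (1 - query_lambda) * p(w|BG)
# assuming both models are {word_id: prob} dicts. The real helper may differ.
def modeling_sketch(query_model, background_model, query_lambda):
    smoothed = {}
    for q_id, word_probs in query_model.items():
        smoothed[q_id] = {w: query_lambda * p +
                             (1 - query_lambda) * background_model[w]
                          for w, p in word_probs.items()}
    return smoothed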
'''
EQE1 = Pickle.load(open("model/eqe1_10.pkl", "rb"))
Example #5
# NOTE: this snippet assumes the same imports as Example #4 above
query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram
Pickle.dump(query_model, open("model/query_model.pkl", "wb"), True)
'''
# remove template file
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)
'''
# Embedded Query Expansion
m_list = np.linspace(1, 80, num=1)
m = 1
interpolated_alpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model()
evaluate_model = EvaluateModel(
    "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt")
EQE1 = []
EQE2 = []
for m in m_list:
    [tmp_eqe1,
     tmp_eqe2] = Embedded_based.EmbeddedQuery(query_wordcount, collection,
                                              word2vec, 1, int(m))
    tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
    tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
    EQE1.append(ProcDoc.dict2np(tmp_eqe1))
    EQE2.append(ProcDoc.dict2np(tmp_eqe2))

Pickle.dump(EQE1, open("model/eqe1_10.pkl", "wb"), True)
Pickle.dump(EQE2, open("model/eqe2_10.pkl", "wb"), True)
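
# --- Sketch (assumed, not from the original repo) ----------------------------
# One way the swept expansion models above could be compared: evaluate_mAP is
# an assumed helper standing in for the ranking + EvaluateModel.mAP pipeline
# shown elsewhere in this listing.
best_m, best_mAP = None, -1.0
for m_val, eqe1_np in zip(m_list, EQE1):
    cur_mAP = evaluate_mAP(eqe1_np)  # assumed helper, not in this listing
    if cur_mAP > best_mAP:
        best_m, best_mAP = int(m_val), cur_mAP
print("best m:", best_m, "mAP:", best_mAP)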
'''
EQE1 = Pickle.load(open("model/eqe1_10.pkl", "rb"))
query_unigram = ProcDoc.unigram(query_wordcount)
query_model = ProcDoc.modeling(query_unigram, background_model, query_lambda)
'''
# drop queries that are not in the HMM training set; iterate over a copied
# key list so the dict is not mutated while being traversed
for q in list(query_model.keys()):
    if q not in HMMTraingSetDict:
        query_model.pop(q, None)

print(len(query_model))
'''
# query process
print("query ...")

evaluate_model = EvaluateModel(rel_path)
query_docs_point_fb = {}
query_model_fb = {}
mAP_list = []
for step in range(1, 15):
    query_docs_dict = {}
    AP = 0
    mAP = 0

    for q_key, q_word_prob in query_model.items():
        print(step - 1, end='\r')
        docs_point = {}
        for doc_key, doc_words_prob in doc_unigram.items():
            point = 0
            # calculate each query value for the document
            for query_word, query_prob in q_word_prob.items():
Example #7
# NOTE: imports assumed by this snippet; EvaluateModel is a project module.
from collections import defaultdict

import numpy as np

TrainingSet_path = "../Corpus/ResultsTrainSet/ResultsTrainSet.txt"
TrainingSetDict = defaultdict(list)
with open(TrainingSet_path, 'r') as f:
    # lines with more than two tokens carry the query title; shorter lines
    # carry one retrieved document name for that query
    title = ""
    for line in f.readlines():
        result = line.split()

        if len(result) == 0:
            continue
        if len(result) > 2:
            title = result[2]
            continue
        TrainingSetDict[title].append(result[0])

evaluate_model = EvaluateModel(
    "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt", False)
assess = evaluate_model.getAss()
P_R_table = np.zeros(11)
for q_key, results in TrainingSetDict.items():
    p_max = 0
    num_correct = 0.
    start_recall = 0
    recall_acc = 0.
    precision = 0.  # defined up front so the comparison below is safe
    for pos, doc_name in enumerate(results):
        t_pos = pos + 1
        if doc_name in assess[q_key]:
            num_correct += 1.
            precision = num_correct / t_pos
            recall_acc = num_correct / len(assess[q_key])
        if precision > p_max:
            p_max = precision
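
# --- Sketch (illustrative, not from the original repo) -----------------------
# The snippet above is cut off mid-loop; this self-contained helper shows the
# 11-point interpolated precision-recall table it appears to be building.
# Function and variable names here are illustrative only.
def eleven_point_interpolated_pr(ranked_docs, relevant_docs):
    pr = []  # (recall, precision) after each retrieved document
    hits = 0
    for pos, doc_name in enumerate(ranked_docs, start=1):
        if doc_name in relevant_docs:
            hits += 1
        pr.append((hits / float(len(relevant_docs)), hits / float(pos)))
    table = np.zeros(11)
    for i, level in enumerate(np.linspace(0, 1, 11)):
        # interpolated precision = max precision at any recall >= this level
        candidates = [p for r, p in pr if r >= level]
        table[i] = max(candidates) if candidates else 0.0
    return table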