def preprocess_all_questions(questions, idf, w2v, stopword):
    """Tokenise question titles and attach per-question features.

    Every question whose title yields more than two tokens from
    ``remove_stopwords`` is mutated in place: it gains ``title_words``,
    ``matrix`` (result of ``init_doc_matrix``) and ``idf_vector`` (result
    of ``init_doc_idf_vector``).  Questions with two or fewer tokens are
    left untouched and omitted from the result.

    Args:
        questions: iterable of objects exposing a ``title`` attribute.
        idf: forwarded unchanged to ``init_doc_idf_vector``.
        w2v: forwarded unchanged to ``init_doc_matrix``.
        stopword: forwarded unchanged to ``remove_stopwords``.

    Returns:
        list of the retained (mutated) question objects, in input order.
    """
    kept = []
    for item in questions:
        tokens = remove_stopwords(item.title, stopword)
        # The length filter is applied before the trailing '?' is removed,
        # so a three-token title ending in '?' is kept with two words.
        if len(tokens) > 2:
            if tokens[-1] == '?':
                tokens = tokens[:-1]
            item.title_words = tokens
            item.matrix = init_doc_matrix(item.title_words, w2v)
            item.idf_vector = init_doc_idf_vector(item.title_words, idf)
            kept.append(item)
    return kept
# Example #2
# 0
def get_result(query_x):
    query = query_x
    dq_res = list()
    print("query : %s...%s" % (query, time.strftime('%Y-%m-%d %H:%M:%S')))
    query_word = preprocessing_for_query(query)
    print query_word
    query_matrix = init_doc_matrix(query_word, w2v_model)
    query_idf = init_doc_idf_vector(query_word , idf_vocab)
    top_dq = get_dq(query_word, topnum, questions, query_idf, query_matrix)
    cur_res_dict = []
    for i in range(len(top_dq)):
        q = top_dq[i][0]
        sim = top_dq[i][1]
        # print "#%s\nId : %s\nTitle : %s\nSimilarity : %s\n" % (i, q.id, q.title, sim)
        cur_res_dict.append((q.id, round(sim, 2)))
    dq_res.append([query, cur_res_dict])

    #dqres_fpath = os.path.join(res_dir, 'rq_res.csv')
    #header = ["query", "rq_id_list"]
    #write_list_to_csv(dq_res, dqres_fpath, header)

    print 'sentence selection...', time.strftime('%Y-%m-%d %H:%M:%S')
    ss_res = list()
    ss_qid = list()
    for query, top_dq_id_and_sim in dq_res:
        top_ss = get_ss(query_word, topnum, top_dq_id_and_sim, stopword)
        ss_res.append((query, top_ss[0]))
        ss_qid.append((query, top_ss[1]))
    #print ss_qid
    #print ss_res
    
    print 'get summary...', time.strftime('%Y-%m-%d %H:%M:%S')

    rank_list = []
    summ = []
    for query, ss in ss_res:
        query = ' '.join(preprocessing_for_query(query))
        summ, rank_list= get_summary(query, ss, 5, w2v_model, idf_vocab, stopword)
        # summary = '\n'.join([x.capitalize() for x in summ])
        # res.append([query, summary])

    info = list()
    for i in range(0,len(rank_list)):
        qid = dict()
        qid["string"] = summ[i]
        qid["id"] = ss_qid[0][1][rank_list[i]]
        info.append(qid)
    
    return jsonify({"info": info})
    dq_res = list()
    stopword = read_EN_stopwords()

    #process questions
    questions = preprocess_all_questions(repo, idf_vocab, w2v_model, stopword)

    # repo_idtitle = {}
    # stopwords = read_EN_stopwords()
    # for q in repo:
    #     title_w = remove_stopwords(q.title, stopwords)
    #     repo_idtitle[q.id] = title_w

    for query in query_list:
        print("query : %s...%s" % (query, time.strftime('%Y-%m-%d %H:%M:%S')))
        query_word = preprocessing_for_query(query)
        query_matrix = init_doc_matrix(query_word, w2v_model)
        query_idf = init_doc_idf_vector(query_word , idf_vocab)
        top_dq = get_dq(query_word, topnum, questions, query_idf, query_matrix)
        cur_res_dict = []
        for i in range(len(top_dq)):
            q = top_dq[i][0]
            sim = top_dq[i][1]
            # print "#%s\nId : %s\nTitle : %s\nSimilarity : %s\n" % (i, q.id, q.title, sim)
            cur_res_dict.append((q.id, round(sim, 2)))
        dq_res.append([query, cur_res_dict])

    dqres_fpath = os.path.join(res_dir, 'rq_res.csv')
    header = ["query", "rq_id_list"]
    write_list_to_csv(dq_res, dqres_fpath, header)

    print 'sentence selection...', time.strftime('%Y-%m-%d %H:%M:%S')