voc[w] = voc[w] + 1.0

        # body_wlist = word_tokenize(q.body.strip())
        # for w in body_wlist:
        #     if w not in cur_word_set:
        #         cur_word_set.add(w)
        #         if w not in voc.keys():
        #             voc[w] = 1.0
        #         else:
        #             voc[w] = voc[w] + 1.0

        count += 1
        if count % 10000 == 0:
            print 'processing %s unit...' % count, get_current_time()
    for key in voc.keys():
        idf = math.log(total_num / (voc[key] + 1.0))
        voc[key] = idf
    sorted_voc = sorted(voc.items(), key=operator.itemgetter(1))
    return sorted_voc





if __name__ == '__main__':
    # Build the IDF vocabulary and persist it as a two-column (word, idf) CSV.
    fpath = 'idf_vocab.csv'
    header = ['word', 'idf']
    vocab = build_IDF_vocabulary()
    write_list_to_csv(vocab, fpath, header)
    # print() call form is valid on both Python 2 and 3; the file already
    # mixes both styles, and the statement form breaks under Python 3.
    print('Done.')
    #         processed_sentence.append(sentence1)

    # summary = '\n'.join([x.capitalize() for x in selected_sentence])
    return selected_sentence, rank_list


def load_ss_result(ss_fpath):
    """Load sentence-selection results from a CSV file.

    Each CSV row is expected to carry a query string in its first column and
    a Python literal (e.g. a list of sentences) serialized as a string in its
    second column.

    :param ss_fpath: path to the CSV file produced by the SS step.
    :return: list of (query, parsed_result) tuples, one per CSV row.
    """
    import pandas as pd
    import ast

    ss_res = list()
    df = pd.read_csv(ss_fpath)
    for idx, row in df.iterrows():
        # .iloc avoids the deprecated positional fallback of Series[int] on a
        # label-indexed row.  ast.literal_eval replaces eval(): it parses only
        # Python literals, so arbitrary code embedded in the CSV cannot run.
        ss_res.append((row.iloc[0], ast.literal_eval(row.iloc[1])))
    return ss_res


if __name__ == '__main__':
    # Summarize each query's selected sentences and write the results to CSV.
    ss_fpath = os.path.join(res_dir, 'ss_res.csv')

    topk = 5
    res = list()
    for query, ss in load_ss_result(ss_fpath):
        query = ' '.join(preprocessing_for_query(query))
        # Named 'summary' rather than 'sum' so the builtin sum() is not shadowed.
        summary = get_summary(query, ss, topk)
        res.append([query, summary])
        print("summary\n%s" % summary)

    res_fpath = os.path.join(res_dir, 'summary_res.csv')
    header = ["query", "summary"]
    write_list_to_csv(res, res_fpath, header)
    try:
        cur.execute(sql)
        results = cur.fetchall()
        for row in results:
            postId = row[2]
            related_postId = row[3]
            if postId in java_id_set and related_postId in java_id_set:
                if postId not in id_dict:
                    id_dict[postId] = True
                if related_postId not in id_dict:
                    id_dict[related_postId] = True
            cnt += 1
            if cnt % 10000 == 0:
                print('Processing %s...' % cnt, get_current_time())
    except Exception as e:
        print e
    cur.close()
    con.close()
    print("# relevant qid = %s" % len(id_dict), get_current_time())
    return sorted(list(id_dict.keys()))


if __name__ == '__main__':
    # Load the universe of Java question ids, then collect every post id that
    # participates in a PostLinks relation between two Java questions.
    qid_csv_path = 'java_qid_list.csv'
    java_ids = load_java_qid_set(qid_csv_path)
    related_ids = extract_java_relevant_ids_from_postlink(java_ids)
    # Persist the related post-id list, one id per row under an 'Id' header.
    write_list_to_csv(related_ids, 'related_qid_list.txt', ['Id'])