コード例 #1
0
def save_ss_result_our_approach(query, top_ss, summary, path):
    """Write a sentence-selection result (query + generated summary) to *path*.

    query   -- the query string, written on the first line
    top_ss  -- ranked sentence tuples; currently unused, kept so the
               signature stays compatible with existing callers
    summary -- the generated summary text, written after a 'Summary :' header
    path    -- output file path handed to write_file
    """
    # The commented-out per-sentence dump that used to live here was dead
    # code and has been removed.
    str_tmp = query + '\n'
    str_tmp += 'Summary : \n' + summary
    write_file(path, str_tmp)
コード例 #2
0
def save_dq_result_our_approach(query, entities, top_dq, path,
                                high_relevant_id, rank):
    """Write a duplicate-question retrieval result to *path*.

    Output layout:
      line 1: the query
      line 2: the recognized entities, space-separated
      then per retrieved question: id, title, similarity (one per line)
      next: the ground-truth highly relevant ids, space-separated
      last: the first 100 rank values, space-separated

    NOTE(review): assumes `rank` has at least 100 entries -- confirm with
    callers before changing the hard-coded bound.
    """
    str_tmp = query + '\n'
    # Entities on one space-separated line (join avoids quadratic +=).
    str_tmp += ' '.join(str(entity) for entity in entities)
    str_tmp = str_tmp.strip() + '\n'
    # One id/title/similarity triple per retrieved question.
    for q_id, title, final_similarity in top_dq:
        str_tmp += '%s\n%s\n%s\n' % (q_id, title, final_similarity)
    # Ground-truth relevant ids on one line.
    str_tmp += ' '.join(str(q_id) for q_id in high_relevant_id)
    str_tmp = str_tmp.strip() + '\n'
    # Rank values of the first 100 positions.
    str_tmp += ' '.join(str(rank[i]) for i in range(100))
    write_file(path, str_tmp.strip())
コード例 #3
0
def save_dq_result_baseline(queryNum, query, top_dq, path):
    """Write a baseline duplicate-question result as both PDF and TXT.

    queryNum -- ordinal number of the query (used in the PDF heading)
    query    -- the query string
    top_dq   -- iterable of (id, title, final_similarity) triples
    path     -- output path WITHOUT extension; '.pdf' and '.txt' are appended
    """
    pdf_str_tmp = 'Q_No.' + str(queryNum) + '\n'
    pdf_str_tmp += ('Query : ' + query + '\n\n')
    txt_str_tmp = (query + '\n')
    # enumerate replaces the manual `Num` counter of the original version.
    for num, (q_id, title, final_similarity) in enumerate(top_dq, start=1):
        # Human-readable PDF section for this hit.
        pdf_str_tmp += ('No.' + str(num) + '\n')
        pdf_str_tmp += ('Title : ' + str(title) + '\n')
        pdf_str_tmp += ('Link : http://stackoverflow.com/questions/' +
                        str(q_id) + '\n')
        pdf_str_tmp += '\n'
        # Machine-readable TXT record for this hit.
        txt_str_tmp += (str(q_id) + '\n')
        txt_str_tmp += (str(title) + '\n')
        txt_str_tmp += (str(final_similarity) + '\n')
    write_pdf_file(path + '.pdf', pdf_str_tmp.strip().split('\n'))
    write_file(path + '.txt', txt_str_tmp.strip())
コード例 #4
0
    # Collect the distinct tag names found in the query results into a set.
    # NOTE(review): fragment -- the function header, `con`, `cur`, and `sql`
    # are defined above this excerpt and are not visible here.
    dic = set()
    try:
        cur.execute(sql)
        results = cur.fetchall()
        cnt = 0
        for row in results:

            # tag : '<java><xml><csv><data-conversion>'
            # Turn the angle-bracketed tag string into space-separated tokens;
            # column 12 presumably holds the tag list -- confirm schema.
            tag_list_tmp = row[12].replace('<', ' ').replace('>', ' ').replace(
                '  ', ' ').strip()
            for tag_tmp in tag_list_tmp.split(' '):
                if tag_tmp not in dic:
                    dic.add(tag_tmp)
            cnt += 1
            if cnt % 1000 == 0:
                # Progress indicator every 1000 rows (Python 2 print statement).
                print 'processing ' + str(cnt) + ' instance'
    except Exception as e:
        # Broad catch: errors are printed and the partial result is returned.
        print e
    con.close()
    return dic


if __name__ == '__main__':
    path_of_dic = 'entity_dic.txt'
    dic = extract_tag_info_from_java_table()
    dic_str = ''
    for tag_tmp in dic:
        dic_str += (tag_tmp + '\n')
    write_file(path_of_dic, dic_str)
    print 'Done.'
コード例 #5
0
'''
Attention: please do NOT rerun this code, or all of the generated data will become invalid!
'''

if __name__ == '__main__':
    path_of_post_id_list = 'post_id_list.txt'
    id_list = read_sentence_by_line(path_of_post_id_list)
    path_of_post_pair = 'post_pair_list.txt'
    pair_list = read_post_pair(path_of_post_pair)
    testnum = 100
    size = len(id_list)
    random_list = get_random_list(0, size, testnum)
    dic = {}
    for random_num in random_list:
        id = id_list[random_num]
        high_relevant_id_list = get_high_relevant_questions(id, pair_list)
        dic[id] = high_relevant_id_list
    write_str = ''
    for id in dic.keys():
        # query id
        # query
        # high relevant id list
        write_str += str(id) + '\n'
        write_str += (str(read_specific_question_from_repo(id).title) + '\n')
        for relevant_id in dic[id]:
            write_str += (str(relevant_id) + ' ')
        write_str = write_str.strip() + '\n'
    path_of_query = 'query.txt'
    write_file(path_of_query, write_str)
    print 'Done.'
コード例 #6
0
from utils.file_util import write_file
from pathConfig import get_base_path

# Absolute path of the idf vocabulary file (one "word   idf" entry per line);
# read by read_voc() and rewritten in place by the __main__ block below.
path_of_voc = get_base_path() + '/_2_sentence_selection/Entropy/idf_voc.txt'


def read_voc():
    """Read the idf vocabulary file into a dict mapping word -> idf (float).

    File format: one entry per line, word and idf value separated by three
    spaces (see path_of_voc).
    """
    voc = {}
    # `with` guarantees the handle is closed; the original leaked it and
    # also shadowed the builtin `file`.
    with open(path_of_voc) as voc_file:
        for line in voc_file:
            word_idf = line.split('   ')
            word = word_idf[0]
            idf = float(word_idf[1].strip())
            voc[word] = idf
    return voc


if __name__ == '__main__':
    reponum = 50000
    voc_str = ''
    voc = read_voc()
    for key in voc.keys():
        voc_str += (key + '   ' + str(voc[key]) + '\n')
    write_file(path_of_voc, voc_str.strip())
    print 'Done.'
コード例 #7
0
ファイル: save_sina_to_txt.py プロジェクト: lxw273486636/news
            # Article text only (translated from: 只有文章).
            article.append(temp)

            temp.append(temp_filter)
            # Article plus its comments (translated from: 文章加评论).
            article_comment.append(temp)
            # Row for the Excel index sheet (translated from: excel索引).
            excel.append(excel_temp)
    # NOTE(review): fragment -- the enclosing loop and the construction of
    # temp / temp_filter / excel_temp are above this excerpt.
    print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article_comment, article, excel


if __name__ == '__main__':
    article_comment, article, excel = get_record()
    # Destination directory for plain article text files.
    article_path = "/Users/red/Desktop/temp/news/data/sj_data/sina_data/article_txt"
    doc.path_exists(article_path)
    # Destination directory for article-plus-comment text files.
    article_comment_path = "/Users/red/Desktop/temp/news/data/sj_data/sina_data/article_comment_txt"
    doc.path_exists(article_comment_path)

    # Walk the three parallel lists in lockstep instead of indexing by i;
    # they all come from get_record() and have equal length.
    for exc, art, art_com in zip(excel, article, article_comment):
        file_util.write_file(os.path.join(article_path, exc[0] + '.txt'), art[0])
        file_util.write_file(os.path.join(article_comment_path, exc[0] + '.txt'),
                             str(art_com[0]) + '\n\n' + str(art_com[1]))
    print('[{}]--file write finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    # Excel index columns (file number, url, time, topic).
    title = ['文件编号', 'url', '时间', '话题']
    xlwt_util.save_xlwt(4, 'sheet1', title, excel, '/Users/red/Desktop/temp/news/data/sj_data/sina_data/index.xls')
    print('[{}]--excel write finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
# -*- coding: UTF-8 -*-

from utils.file_util import read_sentence_by_line
from utils.Random_util import get_random_list
from dataset_util import get_high_relevant_questions, read_post_pair
from utils.db_util import read_specific_question_from_repo
from utils.file_util import write_file

if __name__ == '__main__':
    path_of_file = 'query.txt'
    file = open(path_of_file)
    linenum = 1
    write_str = ''
    for line in file:
        # query id
        # query
        # high relevant id list
        line = line.strip()
        if linenum % 3 != 0:
            write_str += line + '\n'
        linenum += 1
    path_of_query = 'open_query.txt'
    write_file(path_of_query, write_str.strip())
    print 'Done.'
コード例 #9
0
ファイル: save_sohu_to_txt.py プロジェクト: lxw273486636/news
            ]
            # Strip HTML tags from the article body, keeping only the text.
            temp = [filters.stripTagSimple(item.get('article_content'))]
            # Article text only (translated from: 只有文章).
            article.append(temp)

            # Row for the Excel index sheet (translated from: excel索引).
            excel.append(excel_temp)
    # NOTE(review): fragment -- the enclosing loop and the construction of
    # excel_temp (closed by the `]` above) are above this excerpt.
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article, excel


if __name__ == '__main__':
    article, excel = get_record()
    # Destination directory for plain article text files.
    article_path = "/Users/red/Desktop/temp/news/data/sj_data/sohu_data/article_txt"
    doc.path_exists(article_path)

    # Walk the two parallel lists in lockstep instead of indexing by i;
    # both come from get_record() and have equal length.
    for exc, art in zip(excel, article):
        file_util.write_file(os.path.join(article_path, exc[0] + '.txt'),
                             art[0])
    print('[{}]--file write finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    # Excel index columns (file number, url, time, category).
    title = ['文件编号', 'url', '时间', '分类']
    xlwt_util.save_xlwt(
        4, 'sheet1', title, excel,
        '/Users/red/Desktop/temp/news/data/sj_data/sohu_data/index.xls')
    print('[{}]--excel write finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))