def save_ss_result_our_approach(query, top_ss, summary, path):
    """Write a sentence-selection result (the query and its summary) to *path*.

    Parameters
    ----------
    query : str
        Query text; written as the first line of the output file.
    top_ss : list
        Ranked sentence tuples. Not written any more (the per-sentence dump
        was commented out); the parameter is kept so callers are unaffected.
    summary : str
        Generated summary, written after a 'Summary : ' header.
    path : str
        Destination file path, passed straight to write_file.
    """
    # Dead commented-out code that dumped each [sent, ..., Score, q_id]
    # tuple has been removed; only query + summary are persisted.
    str_tmp = query + '\n'
    str_tmp += ('Summary : \n' + summary)
    write_file(path, str_tmp)
def save_dq_result_our_approach(query, entities, top_dq, path, high_relevant_id, rank):
    """Persist a duplicate-question retrieval result to *path*.

    Output layout (single write_file call):
      line 1            : the query text
      line 2            : the extracted entities, space separated
      3 lines per entry : id / title / final_similarity for each top_dq item
      next line         : the ground-truth high-relevant ids, space separated
      last line         : the first 100 values of *rank*, space separated
    """
    str_tmp = query + '\n'
    for entity in entities:
        str_tmp += (str(entity) + ' ')
    str_tmp = str_tmp.strip() + '\n'
    # `qid` instead of `id` to avoid shadowing the builtin.
    for qid, title, final_similarity in top_dq:
        str_tmp += (str(qid) + '\n')
        str_tmp += (str(title) + '\n')
        str_tmp += (str(final_similarity) + '\n')
    for qid in high_relevant_id:
        str_tmp += (str(qid) + ' ')
    str_tmp = str_tmp.strip() + '\n'
    # Was `for i in range(0, 100, 1): rank[i]`, which raises IndexError when
    # rank holds fewer than 100 entries; slicing keeps identical output for
    # len(rank) >= 100 and degrades gracefully otherwise.
    for value in rank[:100]:
        str_tmp += (str(value) + ' ')
    write_file(path, str_tmp.strip())
def save_dq_result_baseline(queryNum, query, top_dq, path):
    """Write a baseline duplicate-question result as both a PDF and a txt file.

    Parameters
    ----------
    queryNum : int
        Sequence number of the query, shown in the PDF header.
    query : str
        The query text.
    top_dq : list
        Iterable of [id, title, final_similarity] triples, best first.
    path : str
        Output path WITHOUT extension; '.pdf' and '.txt' are appended.
    """
    pdf_str_tmp = 'Q_No.' + str(queryNum) + '\n'
    pdf_str_tmp += ('Query : ' + query + '\n\n')
    txt_str_tmp = (query + '\n')
    # enumerate replaces the manual `Num = 1; Num += 1` counter; `qid`
    # avoids shadowing the `id` builtin.
    for rank, (qid, title, final_similarity) in enumerate(top_dq, start=1):
        # PDF: human-readable entry with a Stack Overflow link.
        pdf_str_tmp += ('No.' + str(rank) + '\n')
        pdf_str_tmp += ('Title : ' + str(title) + '\n')
        pdf_str_tmp += ('Link : http://stackoverflow.com/questions/' + str(qid) + '\n')
        pdf_str_tmp += '\n'
        # txt: machine-readable id/title/similarity triple.
        txt_str_tmp += (str(qid) + '\n')
        txt_str_tmp += (str(title) + '\n')
        txt_str_tmp += (str(final_similarity) + '\n')
    write_pdf_file(path + '.pdf', pdf_str_tmp.strip().split('\n'))
    write_file(path + '.txt', txt_str_tmp.strip())
    # Tail of extract_tag_info_from_java_table (its `def` line is outside
    # this view): collect the set of distinct tags found in column 12 of
    # every row returned by `sql` on cursor `cur`.
    dic = set()
    try:
        cur.execute(sql)
        results = cur.fetchall()
        cnt = 0
        for row in results:
            # tag : '<java><xml><csv><data-conversion>'
            # Angle brackets become spaces, so tags split on whitespace.
            # NOTE(review): replace(' ', ' ') is a no-op as written — it
            # presumably meant to collapse double spaces; confirm against
            # the original source.
            tag_list_tmp = row[12].replace('<', ' ').replace('>', ' ').replace(' ', ' ').strip()
            for tag_tmp in tag_list_tmp.split(' '):
                if tag_tmp not in dic:
                    dic.add(tag_tmp)
            # Progress report every 1000 processed rows.
            cnt += 1
            if cnt % 1000 == 0:
                print 'processing ' + str(cnt) + ' instance'
    except Exception as e:
        # Best-effort: log the DB error and return whatever was collected.
        print e
    con.close()
    return dic


if __name__ == '__main__':
    # Dump every distinct tag, one per line, to entity_dic.txt.
    path_of_dic = 'entity_dic.txt'
    dic = extract_tag_info_from_java_table()
    dic_str = ''
    for tag_tmp in dic:
        dic_str += (tag_tmp + '\n')
    write_file(path_of_dic, dic_str)
    print 'Done.'
'''
Attention : PLZ DON'T RERUN THIS CODE! OR ALL THE DATA ARE INVALID!
'''
if __name__ == '__main__':
    # Sample `testnum` random questions from the post-id list and record,
    # for each sampled question, the ids of its highly-relevant duplicates.
    # WARNING: re-running resamples the queries and invalidates prior data.
    path_of_post_id_list = 'post_id_list.txt'
    id_list = read_sentence_by_line(path_of_post_id_list)
    path_of_post_pair = 'post_pair_list.txt'
    pair_list = read_post_pair(path_of_post_pair)
    testnum = 100
    size = len(id_list)
    random_list = get_random_list(0, size, testnum)
    dic = {}
    for random_num in random_list:
        # `post_id` instead of `id` to avoid shadowing the builtin.
        post_id = id_list[random_num]
        dic[post_id] = get_high_relevant_questions(post_id, pair_list)
    write_str = ''
    # Record layout per query: id line, title line, space-separated
    # high-relevant-id line (a dict iterates its keys directly; no .keys()).
    for post_id in dic:
        write_str += str(post_id) + '\n'
        write_str += (str(read_specific_question_from_repo(post_id).title) + '\n')
        for relevant_id in dic[post_id]:
            write_str += (str(relevant_id) + ' ')
        write_str = write_str.strip() + '\n'
    path_of_query = 'query.txt'
    write_file(path_of_query, write_str)
    print('Done.')
from utils.file_util import write_file
from pathConfig import get_base_path

# Vocabulary file holding one "word idf" pair per line.
path_of_voc = get_base_path() + '/_2_sentence_selection/Entropy/idf_voc.txt'


def read_voc():
    """Load the idf vocabulary file into a {word: idf-float} dict.

    Each line of the file is '<word> <idf>'; extra fields after the idf
    are ignored, matching the original index-based parsing.
    """
    voc = {}
    # `with` closes the handle; the original left the file open.
    with open(path_of_voc) as voc_file:
        for line in voc_file:
            word_idf = line.split(' ')
            voc[word_idf[0]] = float(word_idf[1].strip())
    return voc


if __name__ == '__main__':
    # Round-trip: rewrite the vocabulary file in normalized "word idf" form.
    # (Unused `reponum` constant removed.)
    voc = read_voc()
    voc_str = ''.join(key + ' ' + str(idf) + '\n' for key, idf in voc.items())
    write_file(path_of_voc, voc_str.strip())
    print('Done.')
# 只有文章 article.append(temp) temp.append(temp_filter) # 文章加评论 article_comment.append(temp) # excel索引 excel.append(excel_temp) print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) return article_comment, article, excel if __name__ == '__main__': article_comment, article, excel = get_record() # article存储路径 article_path = "/Users/red/Desktop/temp/news/data/sj_data/sina_data/article_txt" doc.path_exists(article_path) # article_comment存储路径 article_comment_path = "/Users/red/Desktop/temp/news/data/sj_data/sina_data/article_comment_txt" doc.path_exists(article_comment_path) for i in range(len(excel)): file_util.write_file(os.path.join(article_path, excel[i][0] + '.txt'), article[i][0]) file_util.write_file(os.path.join(article_comment_path, excel[i][0] + '.txt'), str(article_comment[i][0]) + '\n\n' + str(article_comment[i][1])) print('[{}]--file write finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) title = ['文件编号', 'url', '时间', '话题'] xlwt_util.save_xlwt(4, 'sheet1', title, excel, '/Users/red/Desktop/temp/news/data/sj_data/sina_data/index.xls') print('[{}]--excel write finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
# -*- coding: UTF-8 -*-
from utils.file_util import read_sentence_by_line
from utils.Random_util import get_random_list
from dataset_util import get_high_relevant_questions, read_post_pair
from utils.db_util import read_specific_question_from_repo
from utils.file_util import write_file

if __name__ == '__main__':
    # query.txt stores records as 3-line groups:
    #   line 1: query id, line 2: query text, line 3: high-relevant id list.
    # Keep only the first two lines of each group (drop the answer line)
    # to produce the "open" query file.
    path_of_file = 'query.txt'
    write_str = ''
    # `with` closes the handle (the original never closed the file);
    # enumerate replaces the manual `linenum` counter.
    with open(path_of_file) as query_file:
        for linenum, line in enumerate(query_file, start=1):
            if linenum % 3 != 0:
                write_str += line.strip() + '\n'
    path_of_query = 'open_query.txt'
    write_file(path_of_query, write_str.strip())
    print('Done.')
    # Tail of get_record (its `def` line — and the literal this `]`
    # closes — are outside this view).
    ]
    # Strip markup from the raw article body; stripTagSimple semantics are
    # defined elsewhere (project helper).
    temp = [filters.stripTagSimple(item.get('article_content'))]
    # Article body only.
    article.append(temp)
    # Row for the Excel index sheet.
    excel.append(excel_temp)
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article, excel


if __name__ == '__main__':
    article, excel = get_record()
    # Directory for plain article text files.
    article_path = "/Users/red/Desktop/temp/news/data/sj_data/sohu_data/article_txt"
    doc.path_exists(article_path)
    # One .txt per record, named by the record's excel id (excel[i][0]).
    for i in range(len(excel)):
        file_util.write_file(os.path.join(article_path, excel[i][0] + '.txt'),
                             article[i][0])
    print('[{}]--file write finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    # Sheet headers (file id, url, time, category) — kept in Chinese: they
    # are runtime data written into the spreadsheet.
    title = ['文件编号', 'url', '时间', '分类']
    xlwt_util.save_xlwt(
        4, 'sheet1', title, excel,
        '/Users/red/Desktop/temp/news/data/sj_data/sohu_data/index.xls')
    print('[{}]--excel write finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))