import os

import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from django.shortcuts import redirect, render

# ut, ps, ti, js, and ed are this project's helper modules (utilities, XML/text
# parsing, tf-idf, JSON handling, and edit-distance correction). Their actual
# import paths are project-specific; the lines below are assumptions about the
# layout, left commented out.
# from . import utils as ut
# from . import parser as ps
# from . import tf_idf_helper as ti
# from . import json_helper as js
# from . import edit_distance as ed


def getCompare(request):
    file_name = request.POST['file_name']
    statusParse = request.POST['statusParse']
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(BASE_DIR, "files", file_name)
    keyword = request.POST['keyword'].split(',')  # fixed typo: was .spilt(',')
    upload_file_type = 'xml'
    if not ut.check_file_exist(path):
        return render(request, "ft_retriver/status.html", ut.deal_failed())

    artical_setA = []
    title, content = ps.parse_xml_abstract_title(path)
    numbers = len(title)
    type = 'xml'
    for i in range(numbers):
        artical = {}
        artical['status_title'], artical['located_title'] = ps.located_keyword(
            keyword, title[i])
        artical['status_content'], artical['located_content'] = ps.located_keyword(
            keyword, content[i])
        # Only compute the full statistics once the keyword actually hits.
        if artical['status_title'] or artical['status_content']:
            artical['character_content'] = ps.count_character(content[i])
            artical['wordset'], artical['word_content'] = ps.count_words(
                content[i])
            artical['sentence_content'] = ps.count_sentence2(content[i])
            artical['keyword_title_hit'] = len(artical['located_title'])
            artical['keyword_content_hit'] = len(artical['located_content'])
            artical['title'] = ut.mark_string(title[i], keyword)
            artical['content'] = ut.mark_string(content[i], keyword)
            artical['id'] = 'id_' + str(i)
            artical_setA.append(artical)
    hitA = len(artical_setA)
    return render(request, "ft_retriver/result3.html", locals())
def get_artical_set_xml(path, keyword, count):
    """Return up to `count` articles from the XML at `path` that hit `keyword`."""
    artical_set = []
    title, content = ps.parse_xml_abstract_title(path)
    numbers = len(title)
    index = 0
    for i in range(numbers):
        artical = {}
        artical['status_title'], artical['located_title'] = ps.located_keyword(
            keyword, title[i])
        artical['status_content'], artical['located_content'] = ps.located_keyword(
            keyword, content[i])
        # Only compute the full statistics once the keyword actually hits.
        if artical['status_title'] or artical['status_content']:
            # Stop once `count` hits have been collected (was `index > count`,
            # an off-by-one that returned count + 1 articles).
            if index >= count:
                break
            artical['character_content'] = ps.count_character(content[i])
            artical['wordset'], artical['word_content'] = ps.count_words(
                content[i])
            artical['wordset'] = artical['wordset'].most_common(10)
            artical['sentence_content'] = ps.count_sentence2(content[i])
            artical['keyword_title_hit'] = len(artical['located_title'])
            artical['keyword_content_hit'] = len(artical['located_content'])
            artical['title'] = ut.mark_string(title[i], keyword)
            artical['o_title'] = title[i]
            artical['content'] = ut.mark_string(content[i], keyword)
            artical['id'] = str(index)
            artical_set.append(artical)
            index += 1
    return artical_set
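# A minimal usage sketch for get_artical_set_xml; the file name and keyword
# below are illustrative only, not fixtures shipped with the project:
#
#   hits = get_artical_set_xml(os.path.join("files", "pubmed_fever_100.xml"),
#                              "fever", 5)
#   for a in hits:
#       print(a['id'], a['o_title'], a['keyword_content_hit'])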
def tf_idf(request):
    if request.method != "POST":
        status = 'none'
        return render(request, "ft_retriver/tf_idf.html", locals())

    status = 'filein'
    # Use .get() so a missing field falls through to the 'notfound' sentinel
    # instead of raising MultiValueDictKeyError on direct indexing.
    keyword = request.POST.get('keyword', 'notfound')
    version = request.POST.get('tf_idf_version', 'notfound')
    myFile = request.FILES.get("myfile", None)
    if not myFile or keyword == 'notfound' or version == 'notfound':
        return redirect('../tf_idf/')
    elif ut.check_xml(myFile.name):
        # Save the uploaded file under ./files before parsing it.
        with open(os.path.join("./files", myFile.name), 'wb+') as destination:
            for chunk in myFile.chunks():
                destination.write(chunk)
        file_name = myFile.name
        titles, contents = ti.parse_xml_abstract_title('./files/' + myFile.name)
        artical_set = []

        print("---- Merge start ----")
        contents = ti.merge_contents_titles(titles, contents)
        print("---- Merge finished ----")
        print("---- Removing stop words and numbers from every article: start ----")
        contents_clean_wordset = ti.filter_to_clean_wordset(contents)
        print("---- Removing stop words and numbers from every article: finished ----")
        print("---- Counting words in every article: start ----")
        contents_words_after_count = ti.count_words_in_one_document(
            contents_clean_wordset)
        print("---- Counting words in every article: finished ----")

        idf = ti.count_idf(keyword, contents_clean_wordset)

        # The three versions share the same tf * idf scoring and differ only
        # in the term-frequency weighting.
        if version == "v1":
            tf_list = ti.count_tf(keyword, contents_words_after_count)
        elif version == "v2":
            raw_counts = ti.tf_raw_count(keyword, contents_words_after_count)
            tf_list = [ti.tf_logarithm(raw) for raw in raw_counts]
        elif version == "v3":
            tf_list = ti.tf_double_k_normalization(
                keyword, contents_words_after_count, 0.5)

        tf_idf_list = [(tf_list[index] * idf, index)
                       for index in range(len(tf_list))]
        # Highest tf-idf score first.
        sorted_tf_idf_list = sorted(tf_idf_list, key=lambda tup: tup[0],
                                    reverse=True)

        # Pairwise cosine similarity over raw term-count vectors, used below
        # to suggest related articles.
        print("---- Cosine similarity: initializing ----")
        corpus = ti.merge_contents_titles(titles, contents)
        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        A_sparse = sparse.csr_matrix(X.toarray())
        similarities = cosine_similarity(A_sparse)
        print("---- Cosine similarity: finished ----")

        for score, i in sorted_tf_idf_list:
            artical = {}
            artical['status_title'], artical['located_title'] = ps.located_keyword(
                keyword, titles[i])
            artical['status_content'], artical['located_content'] = ps.located_keyword(
                keyword, contents[i])
            # Only compute the full statistics once the keyword actually hits.
            if artical['status_title'] or artical['status_content']:
                artical['character_content'] = ps.count_character(contents[i])
                artical['wordset'], artical['word_content'] = ps.count_words(
                    contents[i])
                artical['sentence_content'] = ps.count_sentence2(contents[i])
                artical['tf_idf'] = score
                artical['tf_idf_about'] = round(score, 3)
                artical['keyword_title_hit'] = len(artical['located_title'])
                artical['keyword_content_hit'] = len(artical['located_content'])
                artical['title'] = ut.mark_string(titles[i], keyword)
                artical['content'] = ut.mark_string(contents[i], keyword)
                artical['id'] = 'id_' + str(i)
                artical['wordset_p'] = ps.wordset_by_poter(contents[i])
                artical['common'] = wheather_important_words(artical['wordset'])
                artical['wordset_len'] = len(artical['wordset'])
                artical['wordset_p_len'] = len(artical['wordset_p'])
                # Skip index 0 (the article itself) and keep the five most
                # similar articles.
                relative_index = list(np.argsort(-similarities[i]))[1:6]
                artical['relative_artical'] = []
                artical['ra_simlarity'] = []
                for k in relative_index:
                    ra_string = ("[Similarity:" +
                                 str(round(similarities[i][k], 3)) + "] " +
                                 titles[k])
                    artical['relative_artical'].append(ra_string)
                artical_set.append(artical)
        return render(request, "ft_retriver/tf_idf.html", locals())
    else:
        status = 'missing something'
        return render(request, "ft_retriver/tf_idf.html", locals())
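# A self-contained sketch of the three tf weightings the view selects between,
# using the usual textbook definitions. The ti helpers may differ in detail;
# this is an assumption about their semantics, not the module's actual code.
#
#   v1: tf = count(t, d) / len(d)                       (relative frequency)
#   v2: tf = log(1 + count(t, d))                       (log-scaled raw count)
#   v3: tf = k + (1 - k) * count(t, d) / max_count(d)   (double normalization, k = 0.5)
#   idf = log(N / df(t)); the score is tf * idf.
import math
from collections import Counter


def tf_idf_sketch(term, docs, version="v1", k=0.5):
    """Score `term` against each doc in `docs` (non-empty token lists)."""
    counts = [Counter(doc) for doc in docs]
    df = sum(1 for c in counts if c[term] > 0)
    idf = math.log(len(docs) / df) if df else 0.0
    scores = []
    for c, doc in zip(counts, docs):
        if version == "v1":
            tf = c[term] / len(doc)
        elif version == "v2":
            tf = math.log(1 + c[term])
        else:  # v3: double normalization with smoothing constant k
            tf = k + (1 - k) * c[term] / max(c.values())
        scores.append(tf * idf)
    return scores

# e.g. tf_idf_sketch("fever", [["fever", "cough"], ["cold"]], version="v2")
# -> [0.480..., 0.0]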
def json_deal(request):
    file_name = request.POST['file_name']
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(BASE_DIR, "files", file_name)
    keyword = request.POST['keyword']
    type = 'json'
    if not ut.check_file_exist(path):
        return render(request, "ft_retriver/status.html", ut.deal_failed())

    json_obj = js.load_json_from_file(path)
    content, user = js.get_text_from_json_object(json_obj)
    artical_set = []
    numbers = len(content)
    count = 1
    for i in range(numbers):
        artical = {}
        artical['status_content'], artical['located_content'] = ps.located_keyword(
            keyword, content[i])
        # Only compute the full statistics once the keyword actually hits.
        if artical['status_content']:
            artical['character_content'] = ps.count_character(content[i])
            artical['wordset'], artical['word_content'] = ps.count_words(
                content[i])
            artical['sentence_content'] = ps.count_sentence2(content[i])
            artical['keyword_content_hit'] = len(artical['located_content'])
            artical['content'] = ut.mark_string(content[i], keyword)
            artical['tag'] = ut.sperate_tag(content[i])
            artical['http'] = ut.sperate_http(content[i])
            artical['id'] = 'id' + str(i)
            if len(user) != 0:
                artical['title'] = ("This is post no. " + str(count) +
                                    " containing the keyword, from author_id: " +
                                    str(user[i]))
            else:
                artical['title'] = ("This is post no. " + str(count) +
                                    " containing the keyword")
            count += 1
            artical['common'] = wheather_important_words(artical['wordset'])
            artical['words'], artical['value'] = ps.zipf_picture_data(
                artical['wordset'])
            artical['wordset_p'] = ps.wordset_by_poter(content[i])
            artical['words_p'], artical['value_p'] = ps.zipf_picture_data(
                artical['wordset_p'])
            artical['ca_id_json'] = 'ca_id_json_' + str(i)
            artical['ca_id_json_p'] = 'ca_id_json_p_' + str(i)
            artical['wordset_len'] = len(artical['wordset'])
            artical['wordset_p_len'] = len(artical['wordset_p'])
            artical['len_id_json'] = 'len_id_json' + str(i)

            # Pair the plain and Porter-stemmed zipf lists row by row, padding
            # the shorter plain columns with blanks. (The original guard
            # compared the article index `i` instead of the row index `j`,
            # which broke the padding.)
            display_list = []
            length = len(artical['words_p'])
            for j in range(length):
                display = {}
                if j < len(artical['words']):
                    display['key'] = artical['words'][j]
                    display['value'] = artical['value'][j]
                else:
                    display['key'] = ''
                    display['value'] = ''
                display['key_p'] = artical['words_p'][j]
                display['value_p'] = artical['value_p'][j]
                display['row'] = j + 1
                display_list.append(display)
            artical['display_list'] = display_list
            artical_set.append(artical)

    hit = len(artical_set)
    if hit != 0:
        return render(request, "ft_retriver/result3.html", locals())
    # No hits: suggest a corrected keyword and redirect back to the form.
    newkeyword = ed.fix_wrong_input(keyword)
    url = '../json/'
    return render(request, "ft_retriver/redirect.html", locals())
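# The row pairing above (and in xml_deal below) is the manual form of
# itertools.zip_longest. A minimal sketch of the same table built that way;
# the function name and parameters are illustrative, not project API:
from itertools import zip_longest


def build_display_list(words, values, words_p, values_p):
    """Pair plain and Porter-stemmed zipf columns, padding the shorter side."""
    rows = zip_longest(zip(words, values), zip(words_p, values_p),
                       fillvalue=('', ''))
    return [{'key': w, 'value': v, 'key_p': wp, 'value_p': vp, 'row': n}
            for n, ((w, v), (wp, vp)) in enumerate(rows, start=1)]

# e.g. build_display_list(['a', 'b'], [3, 2], ['a'], [3])
# -> [{'key': 'a', 'value': 3, 'key_p': 'a', 'value_p': 3, 'row': 1},
#     {'key': 'b', 'value': 2, 'key_p': '', 'value_p': '', 'row': 2}]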
def xml_deal(request):
    file_name = request.POST['file_name']
    statusParse = request.POST['statusParse']
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(BASE_DIR, "files", file_name)
    upload_file_type = 'xml'
    checker = request.POST['keyword'].find(',')  # fixed typo: was `cheacker`
    if checker != -1:
        # Two comma-separated keywords: build one result set per keyword and
        # show them side by side.
        keyword = request.POST['keyword']
        keywords = request.POST['keyword'].split(',')
        articalset_b = get_artical_set_xml(path, keywords[0], 5)
        articalset_a = get_artical_set_xml(path, keywords[1], 5)
        sr = same_rate(articalset_b, articalset_a)
        return render(request, "ft_retriver/result_sep.html", locals())
    else:
        # Single keyword: same flow as the original handler.
        keyword = request.POST['keyword']
        if not ut.check_file_exist(path):
            return render(request, "ft_retriver/status.html", ut.deal_failed())

        artical_set = []
        title, content = ps.parse_xml_abstract_title(path)
        ut.make_a_dictionary(content)
        numbers = len(title)
        type = 'xml'
        for i in range(numbers):
            artical = {}
            artical['status_title'], artical['located_title'] = ps.located_keyword(
                keyword, title[i])
            artical['status_content'], artical['located_content'] = ps.located_keyword(
                keyword, content[i])
            # Only compute the full statistics once the keyword actually hits.
            if artical['status_title'] or artical['status_content']:
                artical['character_content'] = ps.count_character(content[i])
                artical['wordset'], artical['word_content'] = ps.count_words(
                    content[i])
                artical['sentence_content'] = ps.count_sentence2(content[i])
                artical['keyword_title_hit'] = len(artical['located_title'])
                artical['keyword_content_hit'] = len(artical['located_content'])
                artical['title'] = ut.mark_string(title[i], keyword)
                artical['content'] = ut.mark_string(content[i], keyword)
                artical['id'] = 'id_' + str(i)
                artical['wordset_p'] = ps.wordset_by_poter(content[i])
                artical['common'] = wheather_important_words(artical['wordset'])
                artical['words'], artical['value'] = ps.zipf_picture_data(
                    artical['wordset'])
                artical['words_p'], artical['value_p'] = ps.zipf_picture_data(
                    artical['wordset_p'])
                artical['ca_id_xml'] = 'ca_id_xml_' + str(i)
                artical['ca_id_xml_p'] = 'ca_id_xml_p_' + str(i)
                artical['wordset_len'] = len(artical['wordset'])
                artical['wordset_p_len'] = len(artical['wordset_p'])
                artical['len_id_xml'] = 'len_id_xml' + str(i)

                # Pair the plain and Porter-stemmed zipf lists row by row,
                # padding the stemmed columns with blanks. (The original guard
                # compared the article index `i` instead of the row index `j`.)
                display_list = []
                length = len(artical['words'])
                for j in range(length):
                    display = {}
                    display['key'] = artical['words'][j]
                    display['value'] = artical['value'][j]
                    if j < len(artical['words_p']):
                        display['key_p'] = artical['words_p'][j]
                        display['value_p'] = artical['value_p'][j]
                    else:
                        display['key_p'] = ''
                        display['value_p'] = ''
                    display['row'] = j + 1
                    display_list.append(display)
                artical['display_list'] = display_list
                artical_set.append(artical)

        hit = len(artical_set)
        if hit != 0:
            return render(request, "ft_retriver/result3.html", locals())
        # No hits: suggest a corrected keyword and redirect back to the form.
        newkeyword = ed.fix_wrong_input(keyword)
        url = '../xml/'
        return render(request, "ft_retriver/redirect.html", locals())
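# `same_rate` is defined elsewhere in this project. A minimal sketch of one
# plausible overlap measure between the two result sets, comparing original
# titles; this is an assumption about its semantics, not the actual
# implementation:
def same_rate_sketch(set_b, set_a):
    """Fraction of articles in set_b whose title also appears in set_a."""
    if not set_b:
        return 0.0
    titles_a = {artical['o_title'] for artical in set_a}
    shared = sum(1 for artical in set_b if artical['o_title'] in titles_a)
    return shared / len(set_b)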
# Standalone smoke test: parse a sample PubMed XML dump and compute the basic
# per-article statistics for the keyword 'brca'.
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Join path components portably (the original hard-coded a Windows backslash).
path = os.path.join(BASE_DIR, "files", "pubmed_fever_100.xml")
print(BASE_DIR)
print(path)

keyword = 'brca'
artical_set = []
title, content = ps.parse_xml_abstract_title(path)
numbers = len(title)
for i in range(numbers):
    artical = {}
    artical['title'] = title[i]
    artical['content'] = content[i]
    artical['character_content'] = ps.count_character(content[i])
    artical['word_content'] = ps.count_words(content[i])
    artical['sentence_content'] = ps.count_sentence(content[i])
    artical['status_title'], artical['located_title'] = ps.located_keyword(
        keyword, title[i])
    artical['status_content'], artical['located_content'] = ps.located_keyword(
        keyword, content[i])
    artical_set.append(artical)
print(len(artical_set))