def ExtractFeatureWords(filename):
    """Run the candidate-scoring pipeline on ``filename`` and return its words.

    The file is read with ``Reader.readfile`` and split with
    ``Segmentation.SplitCluster``.  Candidates are built from the
    de-duplicated split, scored, sorted, cut by score, and finally passed
    through ``Candidate.ExtractedWordDeleteRepetition``.

    :param filename: path handed unchanged to ``Reader.readfile``
    :return: whatever ``Candidate.ExtractedWordDeleteRepetition`` returns for
        the candidates that survive the score cut
    """
    raw = Reader.readfile(filename)
    clusters = Segmentation.SplitCluster(raw)
    candidate = Candidate.BuildClass(Candidate.DeleteRepetition(clusters))

    # The three partial scores are computed in place on ``candidate``; the
    # support and position scores also need the original split.
    # NOTE(review): 2 and 15 look like length bounds -- confirm in CalLenScore.
    Candidate.CalLenScore(candidate, 2, 15)
    Candidate.CalSupScore(candidate, clusters)
    Candidate.CalPosScore(candidate, clusters)

    # Every candidate object then computes its own overall score.
    for group in candidate:
        for item in group:
            item.CalScore()

    ranked = Candidate.CandidateListSort(Candidate.GenCandidateList(candidate))

    # Earlier revisions experimented with Candidate.CutByRank(ranked, 0.5),
    # Candidate.CutByRankAndScore(ranked, 0.05, 2.9) and CutByScore at 2.0;
    # only the score cut at 2.9 is active.
    kept = Candidate.CutByScore(ranked, 2.9)
    return Candidate.ExtractedWordDeleteRepetition(kept)
def ExtractedWordDeleteRepetition(extracted_word):
    """Collect the distinct keywords found across all clusters.

    Each entry's first element (``entry[0]``) is taken as the keyword.  A
    keyword is kept the first time it is seen, so the result preserves
    first-occurrence order across clusters.

    :param extracted_word: per-cluster sequences of entries whose element 0
        is the keyword (not yet de-duplicated)
    :return: list of keywords with duplicates removed
    """
    extracted_result = []
    # A set gives O(1) membership tests instead of rescanning the result
    # list for every entry.  Assumes keywords are hashable (e.g. str).
    seen = set()
    for cluster in extracted_word:
        for entry in cluster:
            keyword = entry[0]
            if keyword not in seen:
                seen.add(keyword)
                extracted_result.append(keyword)
    return extracted_result


if __name__ == '__main__':
    # Ad-hoc driver: score the candidates from a fixed input file and print
    # the sorted candidate list, one blank line before each group.
    result = Reader.readfile('result38.bin')
    result_split = Segmentation.SplitCluster(result)
    delete_result = DeleteRepetition(result_split)
    candidate = BuildClass(delete_result)
    CalLenScore(candidate, 2, 15)
    CalSupScore(candidate, result_split)
    CalPosScore(candidate, result_split)
    for group in candidate:
        for item in group:
            item.CalScore()
    candidate_list = GenCandidateList(candidate)
    sorted_candidate_list = CandidateListSort(candidate_list)
    for group in sorted_candidate_list:
        print("")
        for entry in group:
            print(entry)