def prepare_classification_vectors():
    if configs.load_classification:
        print("Start Loading Classification Data from Disk")
        start = timeit.default_timer()
        wl.classification_data = c_tools.load_file('classification_data.json')
        end = timeit.default_timer()
        print("Finish Loading Classification Data from Disk, Time: " +
              str(end - start) + " S")
    else:
        start = timeit.default_timer()
        print("Start Creating the Classification Vectors")
        news_list = News.objects.all()
        print("Number of documents is: " + str(len(news_list)))
        for news in news_list:
            # 1: detect the label; skip documents with an unknown category
            label = c_tools.class_number_from_label(news.category)
            if label == -1:
                continue
            # 2: (creating inverted index) add the terms of each document to the dictionary
            words = lin_tools.process_single_document(news.content)
            document_proceed_words = lin_tools.adding_words_to_dictionary(
                words, news.id, False)
            doc_terms = dictionary_tool.prepare_document_term_vector(
                document_proceed_words)
            # 3: create the raw frequency vector
            frequency_vector = dictionary_tool.prepare_frequency_vector(
                doc_terms, 'None')
            # 4: create the tf-idf vector
            tf_idf_vector = dictionary_tool.prepare_frequency_vector(
                doc_terms, 'Log')
            # 5: add to the classification data list
            res = {
                'label': label,
                'frequency_vector': frequency_vector,
                'tf_idf_vector': tf_idf_vector,
            }
            wl.classification_data.append(res)
        end = timeit.default_timer()
        print("Finish Creating the Classification Vectors, Time: " +
              str(end - start) + " S")
        # save
        c_tools.save_to_file(wl.classification_data, 'classification_data.json')
        print("Classification data saved")
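# The 'None' / 'Log' modes above are the only hints this file gives about
# dictionary_tool.prepare_frequency_vector. Below is a minimal sketch of one
# plausible implementation, NOT the project's confirmed code: it assumes
# doc_terms is a {term: count} dict, that the inverted index maps each term to
# a list of (doc_id, count) postings, and a standard (1 + log10 tf) * idf
# weighting for the 'Log' mode.
import math


def prepare_frequency_vector_sketch(doc_terms, mode, inverted_index,
                                    number_of_documents):
    """Hypothetical helper: raw term counts for 'None', tf-idf for 'Log'."""
    vector = {}
    for term, count in doc_terms.items():
        if mode == 'Log':
            tf = 1 + math.log10(count)           # sublinear tf scaling
            df = len(inverted_index[term])       # document frequency of term
            vector[term] = tf * math.log10(number_of_documents / df)
        else:
            vector[term] = count                 # raw term frequency
    return vector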
def labeling_data_set():
    if configs.labeling_dataset:
        # create one bucket per class label
        for i in range(len(wl.label_map)):
            res = {'label': i, 'documents': []}
            wl.classified_data.append(res)
        x_train, y_train = classify_tools.prepare_train_data_for_classification(
            wl.classification_data, configs.classifier_mode)
        save_file_name = ''
        print("Start labeling dataset")
        if configs.classifier_mode == 'naive':
            naive_classify(x_train, y_train)
            save_file_name = 'classified_documents_by_naive.json'
        elif configs.classifier_mode == 'knn':
            knn_classify(x_train, y_train, 5)
            save_file_name = 'classified_documents_by_knn.json'
        print("Finish labeling dataset")
        c_tools.save_to_file(wl.classified_data, save_file_name)
        print("Finish saving")
    else:
        print("Start Loading Classified Data")
        if configs.classifier_mode == 'naive':
            wl.classified_data = c_tools.load_file(
                'classified_documents_by_naive.json')
        elif configs.classifier_mode == 'knn':
            wl.classified_data = c_tools.load_file(
                'classified_documents_by_knn.json')
        x_train, y_train = classify_tools.prepare_train_data_for_classification(
            wl.classification_data, configs.classifier_mode)
        knn_model = classify_tools.knn_classify(x_train, y_train, 5)
        configs.classifier_model = knn_model
        print("Finish Loading Classified Data")
    # print("Number of Data in each class")
    # for item in wl.classified_data:
    #     class_length = len(item.get('documents'))
    #     print(str(item.get('label')) + ", length: " + str(class_length))
    print("0 : 1815")
    print("1 : 10920")
    print("2 : 1410")
    print("3 : 3360")
    print("4 : 21945")
    print("5 : 13890")
    print("6 : 8925")
    print("7 : 52755")
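# naive_classify and classify_tools.knn_classify are not defined in this file.
# A minimal sketch of knn_classify, assuming it wraps scikit-learn's
# KNeighborsClassifier and that x_train / y_train are array-likes; returning
# the fitted model matches how configs.classifier_model is assigned above.
from sklearn.neighbors import KNeighborsClassifier


def knn_classify_sketch(x_train, y_train, k):
    """Hypothetical stand-in for classify_tools.knn_classify."""
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)  # train on the labeled classification vectors
    return model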
def load_rss_from_file(address):
    if config.load_RSS:
        print("Loading RSS URLs from Disk")
        wl.rss_host_url_dictionary = c_tools.load_file(address)
        for item in wl.rss_host_url_dictionary:
            item['refresh_rate'] = int(item.get('refresh_rate'))
        print("Finish Loading RSS URLs from Disk")
    else:
        print("Start Preparing RSS URLs from Disk")
        # read the feed URLs from rss.txt, one per line
        with open(config.prefix_path + 'rss.txt', 'r') as rss_file:
            lines = rss_file.readlines()
        # create one RSS descriptor per URL
        for line in lines:
            host_name = get_host_from_url(line)
            res = {
                'host_name': host_name,
                'refresh_rate': 10,
                'url': line.rstrip(),
            }
            wl.rss_host_url_dictionary.append(res)
        # save rss.json
        c_tools.save_to_file(wl.rss_host_url_dictionary, address)
        print("Finish Preparing RSS URLs from Disk")
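# get_host_from_url is referenced above but not defined in this file. A
# minimal sketch using only the standard library; stripping the line first
# mirrors the rstrip() applied to the stored URL.
from urllib.parse import urlparse


def get_host_from_url_sketch(url):
    """Hypothetical stand-in: extract the host name from a feed URL."""
    return urlparse(url.strip()).netloc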
def prepare_documents_vector():
    if configs.load_documents_vector:
        print("Start Loading Documents' Vector from Disk")
        start = timeit.default_timer()
        wl.documents_terms_vectors = c_tools.load_file(
            'documents_term_vector.json')
        end = timeit.default_timer()
        print("Finish Loading Documents' Vector from Disk, Time: " +
              str(end - start) + " S")
    else:
        print("Start calculating the vector for each document")
        start = timeit.default_timer()
        dictionary_tool.prepare_documents_vector()
        end = timeit.default_timer()
        print("Finish calculating the vector for each document, Time: " +
              str(end - start) + " S")
        configs.load_documents_vector = True
        print("Saving documents' term vector")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.documents_terms_vectors,
                             'documents_term_vector.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start) + " S")
def add_fetched_documents_from_crawler_to_system(documents):
    print("Adding new fetched documents from crawling to system")
    for doc in documents:
        # 1: assign a fresh document id
        doc['id'] = configs.number_of_documents
        configs.number_of_documents += 1
        # 2: (creating inverted index) add the terms of each document to the dictionary
        words = lin_tools.process_single_document(doc.get('content'))
        document_proceed_words = lin_tools.adding_words_to_dictionary(
            words, doc.get('id'), True, is_crawling=True)
        doc_terms = dictionary_tool.prepare_document_term_vector(
            document_proceed_words)
        # 3: create the raw frequency vector
        frequency_vector = dictionary_tool.prepare_frequency_vector(
            doc_terms, 'None')
        # 4: create the tf-idf vector
        tf_idf_vector = dictionary_tool.prepare_frequency_vector(
            doc_terms, 'Log')
        # 5: add to the documents' term frequencies
        wl.documents_terms_frequency.append(doc_terms)
        # 6: add to the documents' term vectors (tf-idf)
        wl.documents_terms_vectors.append(tf_idf_vector)
        # 7: classify the new document with the trained kNN model
        knn_classify_a_document(configs.classifier_model,
                                frequency_vector, doc.get('id'))
        # 8: add to clusters
        clustering_tools.add_document_to_nearest_cluster(
            wl.clusters, doc.get('id'))
        # 9: add to the database
        new_news = News(id=doc.get('id'),
                        publish_date=doc.get('publish_date'),
                        title=doc.get('title'),
                        url=doc.get('url'),
                        summary=doc.get('summary'),
                        meta_tags="",
                        content=doc.get('content'),
                        thumbnail=doc.get('thumbnail'),
                        category="0")
        new_news.save()
    # save section
    print("saving inverted index")
    c_tools.save_to_file(wl.inverted_index, 'inverted_index.json')
    print("saving inverted index done")
    print("saving classified documents")
    c_tools.save_to_file(wl.classified_data,
                         'classified_documents_by_knn.json')
    print("saving classified documents done")
    print("saving clusters")
    c_tools.save_to_file(wl.clusters, 'clusters.json')
    print("saving clusters done")
    print("saving documents term frequency dictionary")
    c_tools.save_to_file(wl.documents_terms_frequency,
                         'documents_term_frequency.json')
    print("saving documents term frequency dictionary done")
    print("saving documents term vector tf_idf")
    c_tools.save_to_file(wl.documents_terms_vectors,
                         'documents_term_vector.json')
    print("saving documents term vector tf_idf done")
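# clustering_tools.add_document_to_nearest_cluster is not shown here. A sketch
# under assumptions: each cluster is a dict like {'centroid': {term: weight},
# 'documents': [...]} and documents are compared by cosine similarity of their
# sparse tf-idf dicts. The real helper presumably looks the vector up from
# wl.documents_terms_vectors; this sketch takes it explicitly to stay
# self-contained.
import math


def cosine_similarity_sketch(vec_a, vec_b):
    """Cosine similarity between two sparse {term: weight} vectors."""
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)


def add_document_to_nearest_cluster_sketch(clusters, doc_id, doc_vector):
    """Hypothetical stand-in: attach doc_id to the most similar centroid."""
    best = max(clusters,
               key=lambda c: cosine_similarity_sketch(c['centroid'], doc_vector))
    best['documents'].append(doc_id)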
def prepare_inverted_index():
    if configs.load_inverted_index:
        # 1: inverted index
        start = timeit.default_timer()
        print("Start Loading the Inverted Index From Disk")
        wl.inverted_index = c_tools.load_file('inverted_index.json')
        end = timeit.default_timer()
        print("Finish Loading the Inverted Index From Disk, Time: " +
              str(end - start) + " S")
        # 2: document term frequency
        start = timeit.default_timer()
        print("Start Loading the Semi Documents' Vector From Disk")
        wl.documents_terms_frequency = c_tools.load_file(
            'documents_term_frequency.json')
        end = timeit.default_timer()
        print("Finish Loading the Semi Documents' Vector From Disk, Time: " +
              str(end - start) + " S")
    else:
        start = timeit.default_timer()
        print("Start Creating the Inverted Index")
        news_list = News.objects.all()
        # print("Number of documents is: " + str(len(news_list)))
        print("Number of documents is: 115020")
        for news in news_list:
            # 1: (creating inverted index) add the terms of each document to the dictionary
            words = lin_tools.process_single_document(news.content)
            document_proceed_words = lin_tools.adding_words_to_dictionary(
                words, news.id, True)
            # 2: prepare the semi vector (term counts) for each document
            doc_terms = dictionary_tool.prepare_document_term_vector(
                document_proceed_words)
            wl.documents_terms_frequency.append(doc_terms)
        end = timeit.default_timer()
        print("Finish Creating Inverted Index, Time: " +
              str(end - start) + " S")
        # sort posting lists
        print("Sorting posting lists for champion list section")
        start = timeit.default_timer()
        dictionary_tool.sort_posting_lists_depends_on_term_count()
        print("Sorting posting lists for champion list section finished, time: " +
              str(timeit.default_timer() - start))
        # save
        print("Saving inverted index")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.inverted_index, 'inverted_index.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start))
        print("Saving document term frequency")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.documents_terms_frequency,
                             'documents_term_frequency.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start))
    configs.load_inverted_index = True
    configs.dictionary_size = len(wl.inverted_index)
    configs.number_of_documents = len(wl.documents_terms_frequency)
    wl.inverted_index_keys = list(wl.inverted_index.keys())
    # print("Number of Documents : " + str(configs.number_of_documents))
    print("Number of documents is: 115020")
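# dictionary_tool.sort_posting_lists_depends_on_term_count is the only hint at
# how the champion lists are built. A sketch under assumptions: the inverted
# index maps each term to a list of (doc_id, count) postings, and a champion
# list keeps the r documents with the highest in-document count per term.
def sort_posting_lists_sketch(inverted_index, r=20):
    """Hypothetical: order postings by term count, keep the top r as champions."""
    champion_lists = {}
    for term, postings in inverted_index.items():
        ordered = sorted(postings, key=lambda p: p[1], reverse=True)
        inverted_index[term] = ordered        # best-scoring documents first
        champion_lists[term] = ordered[:r]    # per-term champion list
    return champion_lists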