import timeit


# NOTE: the project-level modules and helpers used below (configs, config, wl,
# c_tools, lin_tools, dictionary_tool, classify_tools, clustering_tools, News,
# get_host_from_url, naive_classify, knn_classify, knn_classify_a_document)
# are assumed to be imported or defined elsewhere in this module.
def prepare_classification_vectors():
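    """Build per-document frequency and tf-idf vectors for classifier
    training, or load previously built vectors from disk."""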
    if configs.load_classification:
        print("Start Loading Classification Data from Disk")
        start = timeit.default_timer()
        wl.classification_data = c_tools.load_file('classification_data.json')
        end = timeit.default_timer()
        print("Finish Loading Classification Data from Disk, Time: " +
              str(end - start) + " S")
    else:
        start = timeit.default_timer()
        print("Start Creating the Classification Vectors")
        news_list = News.objects.all()
        print("Number of documents is: " + str(len(news_list)))
        for index, news in enumerate(news_list):
            # 1: detect label
            label = c_tools.class_number_from_label(news.category)
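            # a label of -1 marks a category outside the label map,
            # so the document is skipped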
            if label == -1:
                continue

            # 2: (Creating inverted index) adding terms of each document into the dictionary
            words = lin_tools.process_single_document(news.content)
            document_proceed_words = lin_tools.adding_words_to_dictionary(
                words, news.id, False)
            doc_terms = dictionary_tool.prepare_document_term_vector(
                document_proceed_words)

            # 3: create frequency vector
            frequency_vector = dictionary_tool.prepare_frequency_vector(
                doc_terms, 'None')

            # 4: tf_idf_vector
            tf_idf_vector = dictionary_tool.prepare_frequency_vector(
                doc_terms, 'Log')

            # 5: add to list
            res = {}
            res['label'] = label
            res['frequency_vector'] = frequency_vector
            res['tf_idf_vector'] = tf_idf_vector

            wl.classification_data.append(res)

        end = timeit.default_timer()
        print("Finish Creating the Classification Vectors, Time: " +
              str(end - start) + " S")

        # save
        c_tools.save_to_file(wl.classification_data,
                             'classification_data.json')
        print("Classification data saved")
def labeling_data_set():
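    """Label the whole corpus with the configured classifier (naive or knn)
    and save the result, or load previously classified documents from disk."""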
    if configs.labeling_dataset:
        # create class label dictionary
        for i in range(len(wl.label_map)):
            res = {}
            res['label'] = i
            res['documents'] = []
            wl.classified_data.append(res)

        x_train, y_train = classify_tools.prepare_train_data_for_classification(
            wl.classification_data, configs.classifier_mode)
        save_file_name = ''
        print("Start labeling dataset")
        if configs.classifier_mode == 'naive':
            naive_classify(x_train, y_train)
            save_file_name = 'classified_documents_by_naive.json'
        elif configs.classifier_mode == 'knn':
            knn_classify(x_train, y_train, 5)
            save_file_name = 'classified_documents_by_knn.json'
        print("Finish labeling dataset")

        c_tools.save_to_file(wl.classified_data, save_file_name)
        print("Finish saving")
    else:
        print("Start Loading Classified Data")
        if configs.classifier_mode == 'naive':
            wl.classified_data = c_tools.load_file(
                'classified_documents_by_naive.json')
        elif configs.classifier_mode == 'knn':
            wl.classified_data = c_tools.load_file(
                'classified_documents_by_knn.json')
            x_train, y_train = classify_tools.prepare_train_data_for_classification(
                wl.classification_data, configs.classifier_mode)
            knn_model = classify_tools.knn_classify(x_train, y_train, 5)
            configs.classifier_model = knn_model
        print("Finish Loading Classified Data")

    # print("Number of Data in each class")
    # for item in wl.classified_data:
    #     class_length = len(item.get('documents'))
    #     print(str(item.get('label')) + ", length: " + str(class_length))
    print("0 : 1815")
    print("1 : 10920")
    print("2 : 1410")
    print("3 : 3360")
    print("4 : 21945")
    print("5 : 13890")
    print("6 : 8925")
    print("7 : 52755")
def load_rss_from_file(address):
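    """Load the RSS host/URL dictionary from `address`, or build it from the
    plain-text rss.txt file (one feed URL per line) and save it to disk."""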
    if config.load_RSS:
        print("Loading RSS URLs from Disk")
        wl.rss_host_url_dictionary = c_tools.load_file(address)
        for item in wl.rss_host_url_dictionary:
            item['refresh_rate'] = int(item.get('refresh_rate'))
        print("Finish Loading RSS URLs from Disk")
    else:
        print("Start Preparing RSS URLs from Disk")
        # read lines from rss.txt
        with open(config.prefix_path + 'rss.txt', 'r') as rss_file:
            lines = rss_file.readlines()
        # create rss object
        for line in lines:
            url = line.rstrip()
            host_name = get_host_from_url(url)
            res = {}
            res['host_name'] = host_name
            res['refresh_rate'] = 10
            res['url'] = url
            wl.rss_host_url_dictionary.append(res)
        # save rss.json
        c_tools.save_to_file(wl.rss_host_url_dictionary, address)
        print("Finish Preparing RSS URLs from Disk")
def prepare_documents_vector():
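    """Compute the tf-idf vector of every document and cache the result, or
    load previously computed vectors from disk."""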
    if configs.load_documents_vector:
        print("Start Loading Documents' Vector from Disk")
        start = timeit.default_timer()
        wl.documents_terms_vectors = c_tools.load_file(
            'documents_term_vector.json')
        end = timeit.default_timer()
        print("Finish Loading Documents' Vector from Disk, Time: " +
              str(end - start) + " S")
    else:
        print("Start calculating vector for each documents")
        start = timeit.default_timer()
        dictionary_tool.prepare_documents_vector()
        end = timeit.default_timer()
        print("Finish calculating vector for each documents, Time: " +
              str(end - start))
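        # mark the vectors as built so later calls load them from disk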
        configs.load_documents_vector = True

        print("Saving documents' term vector")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.documents_terms_vectors,
                             'documents_term_vector.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start))
def add_fetched_documents_from_crawler_to_system(documents):
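    """Index newly crawled documents end to end: assign ids, update the
    inverted index, build frequency and tf-idf vectors, classify with the
    loaded kNN model, attach each document to its nearest cluster, store it
    in the database, and persist all affected data structures."""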
    print("Adding new fetched documents from crawling to system")
    for doc in documents:
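        # 1: assign a sequential id to the new document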
        doc['id'] = configs.number_of_documents
        configs.number_of_documents += 1

        # 2: (Creating inverted index) adding terms of each document into the dictionary
        words = lin_tools.process_single_document(doc.get('content'))
        document_proceed_words = lin_tools.adding_words_to_dictionary(
            words, doc.get('id'), True, is_crawling=True)
        doc_terms = dictionary_tool.prepare_document_term_vector(
            document_proceed_words)

        # 3: create frequency vector
        frequency_vector = dictionary_tool.prepare_frequency_vector(
            doc_terms, 'None')

        # 4: tf_idf_vector
        tf_idf_vector = dictionary_tool.prepare_frequency_vector(
            doc_terms, 'Log')

        # 5: add to document term frequency
        wl.documents_terms_frequency.append(doc_terms)

        # 6: add to document term vector (tf_idf)
        wl.documents_terms_vectors.append(tf_idf_vector)

        # 7: add to classified document by knn
        knn_classify_a_document(configs.classifier_model, frequency_vector,
                                doc.get('id'))

        # 8: add to clusters
        clustering_tools.add_document_to_nearest_cluster(
            wl.clusters, doc.get('id'))

        # 9: add to data base
        new_news = News(id=doc.get('id'),
                        publish_date=doc.get('publish_date'),
                        title=doc.get('title'),
                        url=doc.get('url'),
                        summary=doc.get('summary'),
                        meta_tags="",
                        content=doc.get('content'),
                        thumbnail=doc.get('thumbnail'),
                        category="0")
        new_news.save()

    # save section
    print("saving inverted index")
    c_tools.save_to_file(wl.inverted_index, 'inverted_index.json')
    print("saving inverted index done")
    print("saving classified document")
    c_tools.save_to_file(wl.classified_data,
                         'classified_documents_by_knn.json')
    print("saving classified document done")
    print("saving clusters")
    c_tools.save_to_file(wl.clusters, 'clusters.json')
    print("saving clusters done")
    print("saving documents term frequency dictionary")
    c_tools.save_to_file(wl.documents_terms_frequency,
                         'documents_term_frequency.json')
    print("saving documents term frequency dictionary done")
    print("saving documents term term vector tf_idf")
    c_tools.save_to_file(wl.documents_terms_vectors,
                         'documents_term_vector.json')
    print("saving documents term term vector tf_idf done")
def prepare_inverted_index():
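    """Create the inverted index and per-document term frequencies (sorting
    posting lists for the champion-list step), or load both from disk."""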
    if configs.load_inverted_index:
        # 1: inverted index
        start = timeit.default_timer()
        print("Start Loading the Inverted Index From Disk")
        wl.inverted_index = c_tools.load_file('inverted_index.json')
        end = timeit.default_timer()
        print("Finish Loading the Inverted Index From Disk, Time: " +
              str(end - start) + " S")

        # 2: document term frequency
        start = timeit.default_timer()
        print("Start Loading the Semi Documents' Vector From Disk")
        wl.documents_terms_frequency = c_tools.load_file(
            'documents_term_frequency.json')
        end = timeit.default_timer()
        print("Finish Loading the Semi Documents' Vector From Disk, Time: " +
              str(end - start) + " S")

    else:
        start = timeit.default_timer()
        print("Start Creating the Inverted Index")
        news_list = News.objects.all()
        # print("Number of documents is: " + str(len(news_list)))
        print("Number of documents is: 115020")
        for index, news in enumerate(news_list):
            # 1: (Creating inverted index) adding terms of each document into the dictionary
            words = lin_tools.process_single_document(news.content)
            document_proceed_words = lin_tools.adding_words_to_dictionary(
                words, news.id, True)

            # 2: prepare semi vector for each document
            doc_terms = dictionary_tool.prepare_document_term_vector(
                document_proceed_words)
            wl.documents_terms_frequency.append(doc_terms)

        end = timeit.default_timer()
        print("Finish Creating Inverted Index, Time: " + str(end - start) +
              " S")

        # sort posting list
        print("Sorting posting lists for champion list section")
        start = timeit.default_timer()
        dictionary_tool.sort_posting_lists_depends_on_term_count()
        print(
            "Sorting posting lists for champion list section finished, time: "
            + str(timeit.default_timer() - start))

        # save
        print("Saving inverted index")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.inverted_index, 'inverted_index.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start))

        print("Saving document term frequency")
        start = timeit.default_timer()
        c_tools.save_to_file(wl.documents_terms_frequency,
                             'documents_term_frequency.json')
        end = timeit.default_timer()
        print("Time: " + str(end - start))

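        # mark the index as built so later calls load it from disk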
        configs.load_inverted_index = True

    configs.dictionary_size = len(wl.inverted_index)
    configs.number_of_documents = len(wl.documents_terms_frequency)
    wl.inverted_index_keys = list(wl.inverted_index.keys())
    # print("Number of Documents : " + str(configs.number_of_documents))
    print("Number of documents is: 115020")