Example #1
0
def main():
    """Fetch random Wikipedia pages, feed them to a TfIdf model, and save it.

    A ``TfIdf`` instance is built from the corpus and stopword files, then
    ``NUM_PAGES`` worker threads each run ``clean_html_thread(url, content)``.
    Every entry that ends up in ``content`` is added to the model and passed
    to ``print_keywords``; finally the model is written back to the same
    corpus and stopword files.
    """
    # SETTINGS
    NUM_PAGES = 10000
    # Upper bound on simultaneously running fetch threads.  Starting all
    # NUM_PAGES threads at once can hit OS thread limits and floods the
    # remote server with requests.
    MAX_CONCURRENT_THREADS = 50
    corpus_filename = "corpus10k.txt"
    stopwords_filename = "stopwords10k.txt"

    myTfIdf = TfIdf(corpus_filename, stopwords_filename)

    # Shared with the workers; presumably clean_html_thread appends the
    # cleaned page text to this list -- verify against its definition.
    content = []

    url = "http://en.wikipedia.org/wiki/Special:Random"

    for batch_start in range(0, NUM_PAGES, MAX_CONCURRENT_THREADS):
        batch_size = min(MAX_CONCURRENT_THREADS, NUM_PAGES - batch_start)
        batch = [
            threading.Thread(target=clean_html_thread, args=(url, content))
            for _ in range(batch_size)
        ]
        for t in batch:
            t.start()
        # join() without a timeout returns only once the thread has finished,
        # so no separate liveness check is needed afterwards.
        for t in batch:
            t.join()

    for document in content:
        myTfIdf.add_input_document(document)
        print_keywords(document)

    myTfIdf.save_corpus_to_file(corpus_filename, stopwords_filename)