Ejemplo n.º 1
0
def feat(folder):
    """Build term-document features from the ``*.txt`` files in ``folder``.

    Every file matching ``<folder>/*.txt`` is read with ``read_files`` and
    run through ``preprocess_documents``; the result is handed to ``tc``,
    which yields a term dictionary and a vocabulary (presumably raw term
    counts per document -- confirm against ``tc``).

    Args:
        folder: Directory that is searched for ``*.txt`` files.

    Returns:
        A ``(td, idf, vocab)`` tuple where ``td`` is the dense array produced
        by ``to_sparse_matrix(td_dict, vocab).toarray()``, ``idf`` is
        ``to_vector(idf_from_tc(td_dict), vocab)`` and ``vocab`` is the
        vocabulary returned by ``tc``.

    Raises:
        AssertionError: If no documents were produced for ``folder``
            (only when assertions are enabled).
    """
    docs = preprocess_documents(read_files(os.path.join(folder, "*.txt")))
    assert len(docs) > 0, "no documents found in %r" % (folder,)

    # Stemming is currently switched off:
    #   docs = PorterStemmer().stem_documents(docs)
    td_dict, vocab = tc(docs)
    td = to_sparse_matrix(td_dict, vocab).toarray()
    idf = to_vector(idf_from_tc(td_dict), vocab)

    # A single pre-formatted argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("term-document matrix size %s" % (td.shape,))
    return td, idf, vocab
Ejemplo n.º 2
0
def feat(folder):
    """Read the ``*.txt`` documents in ``folder`` and derive their features.

    The files matched by ``<folder>/*.txt`` are loaded via ``read_files``,
    preprocessed by ``preprocess_documents`` and passed to ``tc``, whose
    output (a term dictionary plus vocabulary) feeds the matrix and IDF
    construction below.

    Args:
        folder: Directory containing the ``*.txt`` input files.

    Returns:
        The tuple ``(td, idf, vocab)``: ``td`` is the dense array returned by
        ``to_sparse_matrix(td_dict, vocab).toarray()``, ``idf`` is the value
        of ``to_vector(idf_from_tc(td_dict), vocab)`` and ``vocab`` is the
        vocabulary that ``tc`` returned.

    Raises:
        AssertionError: If preprocessing yields zero documents (only when
            assertions are enabled).
    """
    pattern = os.path.join(folder, "*.txt")
    docs = preprocess_documents(read_files(pattern))
    assert len(docs) > 0, "no documents matched %r" % (pattern,)

    # Stemming is currently disabled:
    #   docs = PorterStemmer().stem_documents(docs)
    td_dict, vocab = tc(docs)
    td = to_sparse_matrix(td_dict, vocab).toarray()
    idf = to_vector(idf_from_tc(td_dict), vocab)

    # Formatting into one string keeps the output identical whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("term-document matrix size %s" % (td.shape,))
    return td, idf, vocab
Ejemplo n.º 3
0
def feat(folder):
    """Build the term-document matrix from ``<folder>/cleaned.json``.

    ``read_json`` is expected to return an indexable pair: element 0 holds
    the raw documents and element 1 the matching file names. The documents
    are preprocessed, counted with ``tc`` and converted to a dense array.

    Side effects:
        * Prints progress/diagnostic information to stdout.
        * Sets the module globals ``number_of_words`` and ``number_of_docs``.
        * Recomputes the module global ``num_topics`` from ``topic_divider``
          (left untouched when ``topic_divider`` is 0) and clamps it to a
          minimum of 2.

    Args:
        folder: Directory that contains ``cleaned.json``.

    Returns:
        ``(td, vocab, empty_docs_list, json_files_list)`` -- the dense array
        from ``to_sparse_matrix(...).toarray()``, the vocabulary from ``tc``,
        the indices of documents equal to ``''`` and the file-name list read
        from the JSON file.
    """
    global num_topics, number_of_words, number_of_docs

    loaded = read_json(folder + "/cleaned.json")
    json_files_list = loaded[1]
    docs = list(preprocess_documents(loaded[0]))
    assert len(docs) > 0
    print("len(docs) =", len(docs))

    # TODO: re-enable the reduce_docs / docs_to_delete / delete_docs pass once
    # it has been adapted to the JSON-based input. Stemming is disabled too.
    td_dict, vocab = tc(docs)

    print("'''''''''''''''''''''''''''''''")

    # Record the positions of documents that are empty after preprocessing.
    empty_docs_list = []
    for doc_id, doc in enumerate(docs):
        if doc == '':
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:', doc_id, ' and file name is:', json_files_list[doc_id])
            empty_docs_list.append(doc_id)

    print('len(td_dict) =', len(td_dict))
    print('len(vocab) =', len(vocab))
    number_of_words = len(vocab)
    number_of_docs = len(td_dict)
    print('type(docs):', type(docs))
    print('type(vocab):', type(vocab))

    td = to_sparse_matrix(td_dict, vocab).toarray()
    print('type(td):', type(td))
    print("term-document matrix size", td.shape)
    row_count, col_count = td.shape[0], td.shape[1]
    print(row_count, 'terms by', col_count, 'docs')
    print("size of term-document matrix in bytes according to sys.getsizeof =", sys.getsizeof(td))

    # A zero divider means "keep the current num_topics".
    if topic_divider != 0:
        num_topics = int(col_count / topic_divider)
    if num_topics < 2:
        num_topics = 2

    return td, vocab, empty_docs_list, json_files_list