# coding: GBK import configuration import ReadData import math from operator import itemgetter fileToWords = ReadData.ReadAllCatalogs(configuration.training_data_directory) wordFrequency = {} wordDocFrequency = {} wordidf = {} doc_word_frequency = {} # the number of documents docCount = 0 #the default number of features is 2000 featureNum = configuration.feature_number #get word list and sort them by their idf value, return the [(word, idf value), ...] def wordStatistic(): global wordFrequency global wordDocFrequency global wordidf global docCount global doc_word_frequency for catalog in fileToWords: catalog = fileToWords[catalog] docCount += len(catalog)
# NOTE(review): the line above holds the beginning of this module collapsed
# onto one physical line. Because it starts with "#", Python reads all of it
# as a single comment. Read as the statements it spells out, it:
#   - imports configuration, ReadData, math and operator.itemgetter;
#   - loads fileToWords with
#     ReadData.ReadAllCatalogs(configuration.training_data_directory);
#   - creates the empty module-level dicts wordFrequency, wordDocFrequency,
#     wordidf and doc_word_frequency;
#   - sets docCount to 0 and reads featureNum from
#     configuration.feature_number;
#   - starts wordStatistic(), which declares those five names global and, for
#     every key in fileToWords, adds len(fileToWords[key]) to docCount.
# The remainder of wordStatistic() is not visible here. Its header comment
# says it returns [(word, idf value), ...] sorted by idf -- TODO confirm
# against the full definition.
# coding: GBK import configuration import Training import ReadData import math print 'began to get training_doc_vector' training_doc_vector = Training.getDocVector() print 'finished getting training_doc_vector' print 'began to get test_files_to_words' test_files_to_words = ReadData.ReadAllCatalogs( configuration.test_data_directory, False) print 'finished getting test_files_to_words'
# NOTE(review): the line above is the module preamble (imports and start-up
# statements) exactly as found, still collapsed onto one comment line. Only
# getDocVector, the one complete definition on that line, is restored to
# runnable form below.


def _count_lookup(wordlist):
    """Return a function mapping a term to its number of occurrences.

    Lists and tuples are tallied once so each lookup is O(1). Any other
    container (for example a string, where ``count`` matches substrings), or
    a sequence holding unhashable items, keeps using its own ``count`` method
    so the results are the same as calling ``wordlist.count(term)`` directly.
    """
    # Local import: the file's own import section is not usable as found.
    from collections import Counter

    if not isinstance(wordlist, (list, tuple)):
        return wordlist.count
    try:
        tally = Counter(wordlist)
    except TypeError:
        # Unhashable items cannot be tallied; count them the slow way.
        return wordlist.count

    def lookup(term):
        try:
            return tally[term]  # a Counter yields 0 for missing terms
        except TypeError:
            # Unhashable term: defer to the sequence's equality-based count.
            return wordlist.count(term)

    return lookup


def getDocVector(content, featureVector):
    """Build a weighted term-count vector for every document in *content*.

    Args:
        content: mapping of catalog key -> mapping of document key -> word
            container. The container only needs a ``count(term)`` method;
            the name ``wordlist`` suggests a list of words -- TODO confirm
            against ReadData.ReadAllCatalogs.
        featureVector: iterable of indexable features where element 0 is the
            term and element 1 is the weight multiplied into its count.

    Returns:
        dict mapping each document key to a list with one entry per feature,
        in ``featureVector`` order: ``occurrences(term) * weight``. Document
        keys are not namespaced by catalog, so a key that appears in several
        catalogs keeps only the vector written last.
    """
    fileVector = {}
    for docs in content.values():
        for doc, wordlist in docs.items():
            # Tally the document once instead of rescanning it per feature.
            occurrences = _count_lookup(wordlist)
            fileVector[doc] = [
                occurrences(feature[0]) * feature[1]
                for feature in featureVector
            ]
    return fileVector


# NOTE(review): the source continues with the header kept below; the body of
# that definition is not visible here, so it is left exactly as found.
# def similarity(vectora, vectorb):