Example #1
# coding: GBK
import configuration
import ReadData
import math
from operator import itemgetter

fileToWords = ReadData.ReadAllCatalogs(configuration.training_data_directory)

wordFrequency = {}
wordDocFrequency = {}
wordidf = {}
doc_word_frequency = {}

# the number of documents
docCount = 0

#the default number of features is 2000
featureNum = configuration.feature_number


# build the word list, sort it by idf value, and return it as [(word, idf value), ...]
def wordStatistic():
    global wordFrequency
    global wordDocFrequency
    global wordidf
    global docCount
    global doc_word_frequency

    # fileToWords maps catalog name -> {document: word list}; count every document
    for catalog_name in fileToWords:
        catalog = fileToWords[catalog_name]
        docCount += len(catalog)
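
Example #1's listing ends inside wordStatistic. Its header comment says the function sorts words by idf and returns [(word, idf value), ...]; the sketch below shows that bookkeeping in a minimal, self-contained form, assuming the usual idf = log(docCount / document frequency) and keeping the featureNum highest-idf words. The function name and its locals are hypothetical, not part of the original module.

import math
from operator import itemgetter

def word_statistic_sketch(file_to_words, feature_num):
    # file_to_words: {catalog name: {document: [word, ...]}}, the shape the loop above iterates over
    word_doc_frequency = {}  # word -> number of documents containing it
    doc_count = 0
    for catalog in file_to_words.values():
        doc_count += len(catalog)
        for wordlist in catalog.values():
            for word in set(wordlist):
                word_doc_frequency[word] = word_doc_frequency.get(word, 0) + 1
    # idf(word) = log(total documents / documents containing the word)
    word_idf = {word: math.log(float(doc_count) / df)
                for word, df in word_doc_frequency.items()}
    # sort by idf (descending) and keep the feature_num strongest features as [(word, idf), ...]
    return sorted(word_idf.items(), key=itemgetter(1), reverse=True)[:feature_num]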
Example #2
# coding: GBK

import configuration
import Training
import ReadData
import math

print('began to get training_doc_vector')
training_doc_vector = Training.getDocVector()
print('finished getting training_doc_vector')

print('began to get test_files_to_words')
test_files_to_words = ReadData.ReadAllCatalogs(
    configuration.test_data_directory, False)
print('finished getting test_files_to_words')


def getDocVector(content, featureVector):
    # content: {catalog name: {document: [word, ...]}}
    # featureVector: list of (word, idf value) pairs; feature[0] is the word, feature[1] its weight
    fileVector = {}
    for catalog_name in content:
        catalog = content[catalog_name]
        for doc in catalog:
            wordlist = catalog[doc]
            vector = []
            for feature in featureVector:
                # term frequency of the feature word in this document times its idf weight
                vector.append(wordlist.count(feature[0]) * feature[1])
            fileVector[doc] = vector
    return fileVector
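
For illustration, a hypothetical call to getDocVector with made-up literals; the real content comes from ReadData.ReadAllCatalogs and the real feature vector from Training, so only the shapes shown here are meaningful.

sample_content = {'sports': {'doc1.txt': ['ball', 'team', 'ball'],
                             'doc2.txt': ['score', 'team']}}
sample_features = [('ball', 1.2), ('team', 0.7), ('score', 1.5)]
# doc1.txt -> [2 * 1.2, 1 * 0.7, 0 * 1.5] = [2.4, 0.7, 0.0]
print(getDocVector(sample_content, sample_features))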


def similarity(vectora, vectorb):