コード例 #1
0
def testEffects(in_filename, out_filename):
    """Measure naive Bayes results across a range of split fractions.

    For every fraction the input CSV is re-read and passed to
    ``split.split`` together with the file names "trainingSet.csv" and
    "testSet.csv"; only the second element of the returned pair is kept
    and used as the training data. The test data is then loaded from
    "testSet.csv" (presumably written by ``split.split`` -- confirm
    against that module). A model is trained with ``nb.trainModel`` and
    evaluated with ``nb.test`` on both data sets.

    ``out_filename`` is accepted but not used by this function.

    Returns a tuple ``(fractions, training results, test results)`` where
    the two result lists are in the same order as the fractions.
    """
    fractions = [0.01, 0.1, 0.2, 0.5, 0.6, 0.75, 0.9, 1]
    train_scores = []
    test_scores = []

    for fraction in fractions:
        # Re-read the source data so each fraction starts from a fresh frame.
        frame = pd.read_csv(in_filename)
        split_result = split.split(
            frame, "trainingSet.csv", "testSet.csv", fraction, False
        )
        (_, train_frame) = split_result
        test_frame = pd.read_csv("testSet.csv")

        (probs, headers) = nb.trainModel(1.0, train_frame)
        print("Fraction size: " + str(fraction))

        # Evaluate on the training data first, then on the test data.
        on_train = nb.test(train_frame, probs, headers, "Training Data Set")
        train_scores.append(on_train)
        on_test = nb.test(test_frame, probs, headers, "Test Data Set")
        test_scores.append(on_test)

    return (fractions, train_scores, test_scores)
コード例 #2
0
def testEffects(in_filename, out_filename):
    """Measure naive Bayes results across a range of bin counts.

    For every bin count the input CSV is re-read, passed through
    ``discretize.discretize`` (with ``out_filename`` and the bin count),
    and the result is split with ``split.split`` using a fixed 0.2
    fraction. A model is trained on the training part with
    ``nb.trainModel`` and evaluated with ``nb.test`` on both parts.

    Returns a tuple ``(bin counts, training results, test results)`` where
    the two result lists are in the same order as the bin counts.
    """
    bin_counts = [2, 5, 10, 50, 100, 200]
    train_scores = []
    test_scores = []

    for bin_count in bin_counts:
        # Re-read the source data so each bin count starts from a fresh frame.
        raw_frame = pd.read_csv(in_filename)
        binned = discretize.discretize(raw_frame, out_filename, bin_count, False)
        (train_frame, test_frame) = split.split(
            binned, "trainingSet.csv", "testSet.csv", 0.2, False
        )

        (probs, headers) = nb.trainModel(1, train_frame)
        print("Bin size: " + str(bin_count))

        # Evaluate on the training data first, then on the test data.
        on_train = nb.test(train_frame, probs, headers, "Training Data Set")
        train_scores.append(on_train)
        on_test = nb.test(test_frame, probs, headers, "Test Data Set")
        test_scores.append(on_test)

    return (bin_counts, train_scores, test_scores)
コード例 #3
0
# Classify the test dataset: load it, build the document/word count matrix,
# train the naive Bayes model and report its test error.
test_path = os.path.expanduser('./spam_classification/SPARSE.TEST')

# First pass: the first space-separated field of every line is the label.
with open(test_path, newline='') as test:
    label_test_buf = [
        int(fields[0]) for fields in csv.reader(test, delimiter=' ')
    ]
label_test = np.asarray(label_test_buf, dtype=int)
nd_test = len(label_test)

# Second pass: fields 2 up to (but excluding) the last one are parsed as
# "word_id:count" pairs; word ids are shifted by one to index the columns.
count_d_w_test = np.zeros([nd_test, nw], dtype=int)
with open(test_path, newline='') as test:
    for d_id, fields in enumerate(csv.reader(test, delimiter=' ')):
        for pair in csv.reader(fields[2:-1], delimiter=':'):
            word_id = int(pair[0])
            occurrences = int(pair[1])
            count_d_w_test[d_id][word_id - 1] = occurrences

# Train on the training frame and evaluate on the freshly built test frame.
df_test = pd.DataFrame(count_d_w_test)
nb_model = nb.train(df_train)
nb_predictions = nb.test(nb_model, df_test)
y = pd.Series(label_test)
nb_error = nb.compute_error(y, nb_predictions)
print('NB Test error: {}'.format(nb_error))

# The last column of the model's dataframe is excluded before ranking words.
model_frame = nb_model.to_dataframe()
words = nb.k_most_indicative_words(5, model_frame.iloc[:, :-1])
print('The {} most spam-worthy words are: {}'.format(len(words), words))
コード例 #4
0
ファイル: nb_main.py プロジェクト: giraykskn/METUAssignments
    dataLines = dataFile.readlines()
    labelsLines = labelsFile.readlines()
    for line in dataLines:
        data.append(line.rstrip('\n').split(' '))
    for line in labelsLines:
        labels.append(line.rstrip('\n'))
    return data, labels


# Load both data sets, fit the model parameters and score the test data.
train_data, train_labels = readFile('train')
test_data, test_labels = readFile('test')

vocab = vocabulary(train_data)
pi = estimate_pi(train_labels)
theta = estimate_theta(train_data, train_labels, vocab)
t = test(theta, pi, vocab, test_data)

# For each entry of ``t`` keep element [1] of the item whose element [0] is
# strictly largest; the prediction stays None when no item beats the floor.
predicted = []
for candidates in t:
    best_label = None
    best_score = -sys.float_info.max
    for candidate in candidates:
        if candidate[0] > best_score:
            best_score = candidate[0]
            best_label = candidate[1]
    predicted.append(best_label)

# Count the positions where the prediction equals the expected label.
correct = 0
for position, expected in enumerate(test_labels):
    if expected == predicted[position]:
        correct += 1
コード例 #5
0
pi_gt = {'class2': 0.5, 'class1': 0.5}

# Largest absolute difference still accepted as a match.
TOLERANCE = 10**-5

# Vocabulary must be equal to the expected vocabulary.
vocab = vocabulary(train_data)
print('Vocabulary result: {}'.format(vocab_gt == vocab))

# Class priors: every expected class must be within tolerance.
pi = estimate_pi(train_labels)
pi_success = True
for label, expected_prior in pi_gt.items():
    if abs(expected_prior - pi[label]) > TOLERANCE:
        pi_success = False
print('Pi result: {}'.format(pi_success))

# Word parameters: for each expected class, every word present in the
# estimate is compared against the expected value for that word.
theta = estimate_theta(train_data, train_labels, vocab)
theta_success = True
for label in theta_gt:
    estimated_words = theta[label]
    for token in estimated_words:
        if abs(estimated_words[token] - theta_gt[label][token]) > TOLERANCE:
            theta_success = False
print('Theta result: {}'.format(theta_success))

# Scores: entries are paired up positionally; within a pair, items are
# keyed by element [1] and their element [0] values are compared.
scores = test(theta, pi, vocab, test_data)
scores_success = True
for expected_entry, actual_entry in zip(scores_gt, scores):
    expected_by_label = {item[1]: item[0] for item in expected_entry}
    actual_by_label = {item[1]: item[0] for item in actual_entry}
    for label, expected_value in expected_by_label.items():
        if abs(expected_value - actual_by_label[label]) > TOLERANCE:
            scores_success = False
print('Scores result: {}'.format(scores_success))