Ejemplo n.º 1
# For each training-set size in `numList`, open the train/validation CSVs,
# call pp.getWordCount on the training file, then pp.getTfIdf (and
# pp.getTfNN when `doNN` is 1) on its results.
#
# NOTE(review): this fragment was missing its `try:` / `else:` lines and the
# closing parentheses of both pp.getWordCount calls, so it could not parse.
# They are restored here at the positions the surviving indentation implies.
done = 0
# numList = [0]
# NOTE(review): `numList` is not assigned in the visible code (its only
# assignment is the commented-out line above) -- confirm it is defined
# before this loop runs.
# NOTE(review): `resFile` is neither written nor closed in the visible code;
# presumably that happens further down -- confirm.
resFile = open("/home/x/ccrList.txt", "w")
for numberTraining in numList:
    # print numberTraining
    # `done` is reset on every iteration, so the "skip" branch below cannot
    # be taken with the code as shown.
    done = 0
    # NOTE(review): these handles are not closed in the visible part of the
    # loop body; make sure later code closes them (or switch to `with`).
    dataFile = open("ml_new_train.csv", "r")
    testFile = open("ml_new_valid.csv", "r")
    try:
        if done == 1:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Skipping the word count...")
        else:
            classWordCount, classWordTotal, classTotal, textsWithWord, ftd, fnn, datClass = pp.getWordCount(
                dataFile, excludeSimpleWords, numberTraining
            )
            done = 1
    except NameError:
        # Fallback kept from the original: on NameError the same call is
        # simply repeated.
        classWordCount, classWordTotal, classTotal, textsWithWord, ftd, fnn, datClass = pp.getWordCount(
            dataFile, excludeSimpleWords, numberTraining
        )
        done = 1

    # print ftd[0][0]
    # print 'Computing TF-IDF!'
    tf, idf, tfidf = pp.getTfIdf(classTotal, textsWithWord, ftd, typeFlag)
    if doNN == 1:
        # getTfNN( N, nt, fnn, flagType )
        tfNN = pp.getTfNN(fnn)
        # print 'Predicting!'
import numpy as np

# --- Flags ---
# When 1, exclude articles (stopwords in nltk).
excludeSimpleWords = 1
# When 1, compare predicted and actual classes.
isValidate = 1

# --- Parameters ---
# If > 0, training stops after this number of examples.
numberTraining = 0
# Number of classes in the data (hard-coded here, not detected).
numClasses = 4

trainFile = "ml_new_train.csv"
testFile = "ml_new_valid.csv"

# --- Declarations ---
# P(c): probability of a class, one float slot per class.
pc = [0.0] * numClasses
# P(w|c): probability of a word given a class; each class gets its own dict,
# hence the comprehension rather than list multiplication.
pwc = [dict() for i in range(numClasses)]

# Train on `trainFile`, then run the prediction on `testFile` and print the
# CCR value returned by pp.doPrediction.
# Both files are opened through `with` so their handles are closed even if a
# `pp` call raises; the original passed an unclosed open(testFile) inline.
with open(trainFile, 'r') as dataFile:

    # Count words
    classWordCount, classWordTotal, classTotal = pp.getWordCount(
        dataFile, excludeSimpleWords, numberTraining
    )

    # Get probabilities
    Pc, Pwc = pp.getProb(classWordCount, classWordTotal, classTotal)
    # p( c )

    # Get prediction on validation.
    with open(testFile, 'r') as validFile:
        predClass, CCR = pp.doPrediction(
            validFile, Pc, Pwc, classWordTotal, isValidate
        )
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(CCR)