import csvfuncs as cf
import bagOfWords as bow
from sklearn.neighbors import KNeighborsClassifier

# Fit k-nearest-neighbour classifiers on the training bag-of-words features
# and predict both label sets (Y1 = city codes, Y2 = country codes) for the
# validation bag-of-words.
td = cf.readTrainingData()

# The training features are read from 'training_BOW.csv'; the commented-out
# line is the raw-city input that would be needed to build them from scratch.
#trainXRaw = td.cities
trainXBow = bow.getBOWFromFile('training_BOW.csv')
trainY1 = td.cityCodes
trainY2 = td.countryCodes
#wordList = bow.getWordListFromCSV('training_wordList.csv')

# The validation features are likewise read from 'validation_BOW.csv'; the
# commented-out lines build them from the raw validation data and word list.
#validXRaw = cf.readXData('validation.csv')
#validXBow = bow.getBOW(validXRaw,wordList,'validation')
validXBow = bow.getBOWFromFile('validation_BOW.csv')

# NOTE: print is written with a single parenthesised argument so the output
# is identical under the Python 2 print statement and the Python 3 function.
print("training a KNN classifier with the training data for Y1")
clfY1 = KNeighborsClassifier(n_neighbors=3)
# Fit against the city-code labels loaded above (previously referenced the
# undefined name Y1, which raised NameError).
clfY1.fit(trainXBow, trainY1)
print("predicting Y1 using the KNN classifier")
Y1_hat = clfY1.predict(validXBow)

print("training a KNN classifier with the training data for Y2")
clfY2 = KNeighborsClassifier(n_neighbors=3)
# Fit against the country-code labels loaded above (previously referenced the
# undefined name Y2, which raised NameError).
clfY2.fit(trainXBow, trainY2)
print("predicting Y2 using the KNN classifier")
Y2_hat = clfY2.predict(validXBow)

print("prediction complete")

f = open('validation_result_KNN.csv','w')
for i in range(len(Y1_hat)):
import csvfuncs as cf
import bagOfWords as bow
from sklearn.ensemble import RandomForestClassifier

# Load the training records; the label columns are taken from this object.
td = cf.readTrainingData()

# First-time setup only: build the training bag-of-words from the raw cities.
#trainXRaw = td.cities
#trainXBow = bow.getBOWTrain(trainXRaw,'training')

# Normal path: read the bag-of-words already saved to a file, because
# generating it is slow.
trainXBow = bow.getBOWFromFile('training_BOW.csv')
trainY1 = td.cityCodes
trainY2 = td.countryCodes

# The training-data word list is what the validation/test bag-of-words is
# created from; only needed when (re)building those.
#wordList = bow.getWordListFromCSV('training_wordList.csv')

# First-time setup only: build the validation bag-of-words.
#validXRaw = cf.readXData('validation.csv')
#validXBow = bow.getBOW(validXRaw,wordList,'validation')

# Normal path: read the validation bag-of-words saved in a file.
validXBow = bow.getBOWFromFile('validation_BOW.csv')

# Train the random forest classifier that predicts the city codes.
# (Single-argument parenthesised print emits the same text as the statement.)
print("training a RandomForestClassifier with the training data for Y1")
clfY1 = RandomForestClassifier(n_estimators=10)
clfY1.fit(trainXBow, trainY1)
def test():
	td = cf.readTrainingData()
	rwl=getRawList(td.cities)