예제 #1
0
                        content = '%s,%s\n' % (eachLine, clfResults[resultCounter])
                        fw.write(content)
                        resultCounter += 1

if __name__ == "__main__":
	reload(sys)
	sys.setdefaultencoding('utf8')

	initDirectories()
	handleArgv()

	if Config.PRE_PROCESS_ON == 1:
		xmlConverter = XmlConverter()
		xmlConverter.convertDoc(0, Config.DATA_SIZE)
		xmlConverter.convertQuery(0, Config.QUERY_SIZE)
                xmlConverter.convertTestData(0, Config.TEST_DATA_SIZE)

	else:
		docReader = DocReader()
		docModeler = DocModeler()
                trainDataReader = TrainDataReader()

                featureModeler = FeatureBasedModeler()
                Y, trainDataIdxs = trainDataReader.getTrainAnswers()
                print "Get Train Answers Done"

                tfidfMat, Y = featureModeler.extractFeaturesMatrix(trainDataIdxs, Y)
                print "Calc data features done"
                '''
                print tfidfMat.shape
                transformer = random_projection.SparseRandomProjection(n_components=700000)