def cross_validation(number_of_features):
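	"""Cross-validate the cosine-similarity classifier on the topic hierarchy.

	Trains on the article ids listed in ../data/train.dat, classifies the ids in
	../data/test.dat against their level-2 parent topics, and writes the
	confusion matrix, per-topic precision/recall/F-measure and overall accuracy
	to confusion_matrix<number_of_features>.csv.
	"""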
	import sys
	import codecs
	from Model import Topic, Article
	from Topics import Topics
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
	Session=sessionmaker(bind=engine)
	session=Session()

	topics=Topics(session, CosineSimilarityClassifier(number_of_features))
	topics.print_out()

	print >>sys.stderr,'Adding articles'
	i=0
	for line in open('../data/train.dat'):
		article_id=int(line)
		topics.add_article(article_id)
		i+=1
#		if i==200:
#			break

	topics.print_out()

	topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat")
	topics.train_classifier()
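
	# Evaluation: classify every article listed in ../data/test.dat and tally a
	# confusion matrix of target (true) topic vs. assigned (predicted) topic,
	# where the target is the article topic's parent at level 2 of the tree.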

	correct=0.0
	number_of_articles=0.0

	confusion_matrix={}

	for line in open('../data/test.dat'):
		article=session.query(Article).filter(Article.id==int(line)).one()
		assigned_topic=topics.assign_topic_to_article(article.text)
		print number_of_articles,article.title
		try:
			target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2)

			if target_topic.id not in confusion_matrix:
				confusion_matrix[target_topic.id]={}
			confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1

			if target_topic.id==assigned_topic.id:
				correct+=1
				print ' + ',target_topic.name,' / ',assigned_topic.name
			else:
				print ' - ',target_topic.name,' / ',assigned_topic.name

			number_of_articles+=1
			print 'Accuracy:',(correct/number_of_articles)*100,'% '
		except Exception:
			# Skip articles whose target topic cannot be resolved in the hierarchy.
			print sys.exc_info()
#		if number_of_articles>50:
#			break

	print 'Accuracy:',(correct/number_of_articles)*100,'% '
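
	# Per-topic precision and recall from the confusion matrix
	# (rows = target topic, columns = assigned topic):
	#   precision[t] = matrix[t][t] / column sum for t  (articles assigned topic t)
	#   recall[t]    = matrix[t][t] / row sum for t     (articles whose target is t)
	# Topics with an empty denominator are given a score of 1.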

	precision={}
	for t in confusion_matrix:
		print topics.get_topics()[t].name
		true_pos=confusion_matrix[t].get(t,0)
		tp_fp=sum([confusion_matrix[c].get(t,0) for c in confusion_matrix])
		if tp_fp==0:
			precision[t]=1
		else:
			precision[t]=float(true_pos)/tp_fp

	recall={}
	for t in confusion_matrix:
		true_pos=confusion_matrix[t].get(t,0)
		tp_fn=sum(confusion_matrix[t].values())
		if tp_fn==0:
			recall[t]=1
		else:
			recall[t]=float(true_pos)/tp_fn
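
	# CSV report: one row per target topic (counts plus recall in the last
	# column), a precision row underneath, then a per-topic F-measure block with
	# the macro-averaged F-measure and the overall accuracy.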

	conf_matrix_out=codecs.open('confusion_matrix'+str(number_of_features)+'.csv',encoding='utf-8',mode='w')

	conf_matrix_out.write(',')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
	conf_matrix_out.write('Recall\n')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
		for b in confusion_matrix:
			conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0))
		conf_matrix_out.write('%s\n'%recall[a])
	conf_matrix_out.write('Precision,')
	for b in confusion_matrix:
		conf_matrix_out.write('%s,'%precision[b])
	conf_matrix_out.write('\n\n')

	conf_matrix_out.write('F-measure\n')
	av_fm=0.0
	for b in confusion_matrix:
		if precision[b]==0 or recall[b]==0:
			fm=0
		else:
			fm=2*precision[b]*recall[b]/(precision[b]+recall[b])
		conf_matrix_out.write('%s,%s\n'%(topics.get_topics()[b].name,fm))
		av_fm+=fm
	av_fm/=len(confusion_matrix)
	conf_matrix_out.write('\n')

	conf_matrix_out.write('Average F-measure,%s\n'%av_fm)
	conf_matrix_out.write('Accuracy,%s\n'%(correct/number_of_articles))
	conf_matrix_out.close()
def cross_validation_knn():
	"""Cross-validate the k-nearest-neighbour classifier on the topic hierarchy."""
	import psyco	# optional CPython 2 JIT
	psyco.full()

	import sys
	import codecs
	from Model import Topic, Article
	from Topics import Topics
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
	Session=sessionmaker(bind=engine)
	session=Session()

	topics=Topics(session, KNearestNeighborClassifier())
	topics.print_out()

	print >>sys.stderr,'Adding articles'
	i=0
	for line in open('../data/train.dat'):
		article_id=int(line)
		topics.add_article(article_id)
		i+=1
		if i==10000:
			break

	topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat")
	topics.train_classifier()
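
	# Evaluation: classify every article listed in ../data/test.dat and tally a
	# confusion matrix of target (true) topic vs. assigned (predicted) topic.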

	correct=0.0
	number_of_articles=0.0

	confusion_matrix={}

	for line in open('../data/test.dat'):
		article=session.query(Article).filter(Article.id==int(line)).one()
		assigned_topic=topics.assign_topic_to_article(article.text)
		print number_of_articles,article.title
		try:
			target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2)

			if target_topic.id not in confusion_matrix:
				confusion_matrix[target_topic.id]={}
			confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1

			if target_topic.id==assigned_topic.id:
				correct+=1
				print ' + ',target_topic.name,' / ',assigned_topic.name
			else:
				print ' - ',target_topic.name,' / ',assigned_topic.name

			number_of_articles+=1
			print 'Accuracy:',(correct/number_of_articles)*100,'% '
		except Exception:
			# Skip articles whose target topic cannot be resolved in the hierarchy.
			print sys.exc_info()
#		if number_of_articles>1000:
#			break

	print 'Accuracy:',(correct/number_of_articles)*100,'% '
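
	# Dump the raw confusion matrix to confusion_matrix.csv; unlike the
	# feature-count variant above, this one writes no precision/recall summary.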

	conf_matrix_out=codecs.open('confusion_matrix.csv',encoding='utf-8',mode='w')

	conf_matrix_out.write(',')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
	conf_matrix_out.write('\n')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
		for b in confusion_matrix:
			conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0))
		conf_matrix_out.write('\n')

	conf_matrix_out.close()
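

# A minimal entry point, sketched as an assumption: the original module defines
# no __main__ block, and the feature counts below are placeholders rather than
# values taken from the source.
if __name__ == '__main__':
	import sys
	if len(sys.argv) > 1 and sys.argv[1] == 'knn':
		cross_validation_knn()
	else:
		# Sweep a few feature-count settings for the cosine-similarity variant;
		# each run writes its own confusion_matrix<N>.csv.
		for n in (100, 500, 1000):
			cross_validation(n)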