def plot_2nd_level_distribution():
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker
	
	from Topics import Topics
	from KNearestNeighborClassifier import KNearestNeighborClassifier

	engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
	Article.metadata.create_all(engine)
	Session=sessionmaker(bind=engine)
	session=Session()

	topics=Topics(session, None)
	i=0
	for article in session.query(Article.id):
		topics.add_article(article.id)
		if i%500==0:
			print i
		i+=1
#		if i>1000:
#			break

	topic_nodes=[t for a in topics.root_topic.subtopics for t in a.subtopics]
	counts=[len(t.get_all_articles(session)) for t in topic_nodes]
	w=3
	x=[]
	y=[]
	start=0
	for c in counts:
		y.append(float(c)/sum(counts)*100)
#		y.append(float(c))
		x.append(start)
		start+=w+1
	plt.bar(x,y,width=w)
	plt.grid()
	plt.ylabel('Procenat od ukupnog broja [%]')
	plt.xticks([t+w/2 for t in x],[t.name for t in topic_nodes],rotation=90)
	for tx,ty,cnt in zip(x,y,counts):
		plt.text(tx+w/2,ty,str(cnt))
	plt.title(u'Distibucija članaka po temama (ukupno %d članaka)'%sum(counts),ha='center')
	plt.show()
def getDocsFromTopic(keyword):
    """Return the ids of all documents mapped to the topic containing
    *keyword*.

    Scans the topic list for *keyword* (the last matching topic wins,
    matching the original linear scan), then collects every key of the
    module-level ``docsTopicMapping`` whose topic set contains that
    topic's index.
    """
    # Locate the index of the topic that contains the keyword.
    topicNum = -1
    for idx, topic in enumerate(Topics().populateTopics()):
        if keyword in topic:
            topicNum = idx

    # Gather every document whose topic set includes that index.
    return [doc_id for doc_id, topic_nums in docsTopicMapping.items()
            if topicNum in topic_nums]
def cross_validation(number_of_features):
	import sys
	import codecs
	from Model import Topic, Article
	from Topics import Topics
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
	Session=sessionmaker(bind=engine)
	session=Session()

	topics=Topics(session, CosineSimilarityClassifier(number_of_features))
	topics.print_out()

	print >>sys.stderr,'Adding articles'
	i=0
	for line in open('../data/train.dat'):
		id=int(line)
		topics.add_article(id)
		i+=1
#		if i==200:
#			break

	topics.print_out()

	topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat")
	topics.train_classifier()

	correct=0.0
	number_of_articles=0.0

	confusion_matrix={}

	for line in open('../data/test.dat'):
		article=session.query(Article).filter(Article.id==int(line)).one()
		assigned_topic=topics.assign_topic_to_article(article.text)
		print number_of_articles,article.title
		try:
			target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2)

			if target_topic.id not in confusion_matrix:
				confusion_matrix[target_topic.id]={}
			confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1

			if target_topic.id==assigned_topic.id:
				correct+=1
				print ' + ',target_topic.name,' / ',assigned_topic.name
			else:
				print ' - ',target_topic.name,' / ',assigned_topic.name

			number_of_articles+=1
			print 'Accuracy:',(correct/number_of_articles)*100,'% '
		except:
			print sys.exc_info()
			pass
#		if number_of_articles>50:
#			break

	print 'Accuracy:',(correct/number_of_articles)*100,'% '

	precision={}
	for t in confusion_matrix:
		print topics.get_topics()[t].name
		true_pos=confusion_matrix[t].get(t,0)
		tp_fp=sum([confusion_matrix[c].get(t,0) for c in confusion_matrix])
		if tp_fp==0:
			precision[t]=1
		else:
			precision[t]=float(true_pos)/tp_fp

	recall={}
	for t in confusion_matrix:
		true_pos=confusion_matrix[t].get(t,0)
		tp_fn=sum([confusion_matrix[t].get(c,0) for c in confusion_matrix])
		if tp_fn==0:
			recall[t]=1
		else:
			recall[t]=float(true_pos)/tp_fn

	conf_matrix_out=codecs.open('confusion_matrix'+str(number_of_features)+'.csv',encoding='utf-8',mode='w')

	conf_matrix_out.write(',')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
	conf_matrix_out.write('Recall\n')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
		for b in confusion_matrix:
			conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0))
		conf_matrix_out.write('%s\n'%recall[a])
	conf_matrix_out.write('Precision,')
	for b in confusion_matrix:
		conf_matrix_out.write('%s,'%precision[b])
	conf_matrix_out.write('\n\n')

	conf_matrix_out.write('F-measure\n')
	av_fm=0
	for b in confusion_matrix:
		if precision[b]==0 or recall[b]==0:
			fm=0
		else:
			fm=2*precision[b]*recall[b]/(precision[b]+recall[b])
		conf_matrix_out.write('%s,%s\n'%(topics.get_topics()[b].name,fm))
		av_fm+=fm
	av_fm/=len(confusion_matrix)
	conf_matrix_out.write('\n')

	conf_matrix_out.write('Average F-measure,%s\n'%av_fm)
	conf_matrix_out.write('Accuracy,%s\n'%(correct/number_of_articles))
def cross_validation():
	"""Evaluate a KNearestNeighborClassifier over the topic tree.

	Trains on up to 10000 article ids from ../data/train.dat, classifies
	the articles listed in ../data/test.dat at topic-tree level 2, prints
	a running accuracy and writes the raw confusion matrix to
	confusion_matrix.csv.

	NOTE(review): this parameterless definition shadows the earlier
	``cross_validation(number_of_features)`` in this module - only this
	one is callable after import.
	"""
	# psyco is a Python 2-only JIT accelerator; this module is Python 2 code.
	import psyco
	psyco.full()

	import sys
	import codecs
	from Model import Topic, Article
	from Topics import Topics
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
	Session=sessionmaker(bind=engine)
	session=Session()

	topics=Topics(session, KNearestNeighborClassifier())
	topics.print_out()

	print >>sys.stderr,'Adding articles'
	i=0
	for line in open('../data/train.dat'):
		id=int(line)
		topics.add_article(id)
		i+=1
		# Cap training at 10000 articles.
		if i==10000:
			break

	topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat")
	topics.train_classifier()

	correct=0.0
	number_of_articles=0.0

	# confusion_matrix[target_id][assigned_id] -> count
	confusion_matrix={}

	for line in open('../data/test.dat'):
		article=session.query(Article).filter(Article.id==int(line)).one()
		assigned_topic=topics.assign_topic_to_article(article.text)
		print number_of_articles,article.title
		try:
			# Evaluate at level 2 of the topic tree.
			target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2)

			if target_topic.id not in confusion_matrix:
				confusion_matrix[target_topic.id]={}
			confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1

			if target_topic.id==assigned_topic.id:
				correct+=1
				print ' + ',target_topic.name,' / ',assigned_topic.name
			else:
				print ' - ',target_topic.name,' / ',assigned_topic.name

			number_of_articles+=1
			print 'Accuracy:',(correct/number_of_articles)*100,'% '
		# NOTE(review): bare except swallows every error (including
		# KeyboardInterrupt); it prints the exception info and moves on.
		except:
			print sys.exc_info()
			pass

	# NOTE(review): raises ZeroDivisionError if no article was evaluated.
	print 'Accuracy:',(correct/number_of_articles)*100,'% '

	# CSV layout: header row of target-topic names, then one row per target
	# with per-assigned-topic counts. Columns iterate the same target ids as
	# the rows.
	conf_matrix_out=codecs.open('confusion_matrix.csv',encoding='utf-8',mode='w')

	conf_matrix_out.write(',')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
	conf_matrix_out.write('\n')
	for a in confusion_matrix:
		conf_matrix_out.write('%s,'%topics.get_topics()[a].name)
		for b in confusion_matrix:
			conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0))
		conf_matrix_out.write('\n')
def results(page):
    """Render a paginated search-results page for the COVID-19 index.

    On POST, reads symptom / race / topic / question filters from the form
    and caches them in module-level globals so that subsequent GET requests
    (pagination clicks) reuse the same query. Builds an elasticsearch-dsl
    query from whichever filter is set (later filters override earlier
    ones, as in the original), executes it over the requested page slice
    and renders results.html.

    :param page: 1-based page number (int or a string convertible to int).
    """
    global temp_sypm
    global temp_race
    global temp_topic
    global t_setQuestion
    global g_results

    if type(page) is not int:
        page = int(page.encode('utf-8'))

    if request.method == 'POST':
        symp = request.form['symptom']
        race_q = request.form['race']
        topic = request.form['topic']
        question = request.form['set']

        if len(symp) == 0 or (symp == 'None'):
            temp_sypm = ""
            symp = ""
        else:
            temp_sypm = symp

        if len(race_q) == 0 or (race_q == 'None'):
            temp_race = ""
            race_q = ""
        else:
            temp_race = race_q

        # fix: was `or`, which is always true and cached ''/'None' questions
        if len(question) != 0 and question != 'None':
            t_setQuestion = question

        if len(topic) == 0 or (topic == 'None'):
            temp_topic = ""
            topic = ""
        else:
            temp_topic = topic
    else:
        # GET (pagination): reuse the filters cached by the last POST.
        symp = temp_sypm
        race_q = temp_race
        question = t_setQuestion
        topic = temp_topic

    # Echoed back to the template so the form keeps its current values.
    docs = {}
    docs['symp'] = symp
    docs['race'] = race_q
    docs['topic'] = topic
    docs['question'] = question

    search = Search(index='covid_19_index')

    s = None

    if len(symp) > 0:
        full_query = "risk factors " + symp
        s = search.query('multi_match',
                         query=full_query,
                         type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])
    if len(race_q) > 0:
        full_query = "risk " + race_q
        s = search.query('multi_match',
                         query=full_query,
                         type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])
    if len(topic) > 0:
        s = search.query('ids', values=getDocsFromTopic(topic))
    # fix: was `len(question) > 0 & (question != 'None')`, which by operator
    # precedence reduced to `len(question) > 0` and ignored the 'None' check
    if len(question) > 0 and question != 'None':
        s = search.query('multi_match',
                         query=question,
                         type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])

    # 10 results per page.
    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10

    topicsObj = Topics()

    if s is None:
        return render_template('results.html',
                               results={},
                               res_num=0,
                               page_num=0,
                               total=0,
                               queries=docs,
                               topics=topicsObj.startingTopics())

    # Execute the search over the requested slice only.
    response = s[start:end].execute()
    result_list = {}

    for hit in response.hits:
        result = {}
        # fix: score was written to result_list (overwritten each loop and
        # leaking a bogus 'score' key into the results dict) instead of the
        # per-hit result
        result['score'] = hit.meta.score
        result['title'] = hit.title
        result['abstract'] = hit.abstract
        result['text'] = hit.body_text
        result_list[hit.meta.id] = result

    g_results = result_list
    num_results = response.hits.total['value']

    if num_results > 0:
        # fix: integer ceiling division (plain `/` yields a float page count
        # on Python 3)
        total_pages = num_results // 10
        if num_results % 10 > 0:
            total_pages = total_pages + 1
        return render_template(
            'results.html',
            results=result_list,
            res_num=num_results,
            page_num=page,
            total=total_pages,
            queries=docs,
            recommendedTopics=topicsObj.recommendedTopics(topic),
            topics=topicsObj.startingTopics())

    message = []
    message.append('Cannot formulate results')
    return render_template(
        'results.html',
        results=message,
        res_num=num_results,
        page_num=page,
        queries=docs,
        recommendedTopics=topicsObj.recommendedTopics(topic),
        topics=topicsObj.startingTopics())
def search():
    """Render the landing page populated with the starting topic list."""
    return render_template('home_page.html', topics=Topics().startingTopics())