def plot_2nd_level_distribution(): from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from Topics import Topics from KNearestNeighborClassifier import KNearestNeighborClassifier engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8') Article.metadata.create_all(engine) Session=sessionmaker(bind=engine) session=Session() topics=Topics(session, None) i=0 for article in session.query(Article.id): topics.add_article(article.id) if i%500==0: print i i+=1 # if i>1000: # break topic_nodes=[t for a in topics.root_topic.subtopics for t in a.subtopics] counts=[len(t.get_all_articles(session)) for t in topic_nodes] w=3 x=[] y=[] start=0 for c in counts: y.append(float(c)/sum(counts)*100) # y.append(float(c)) x.append(start) start+=w+1 plt.bar(x,y,width=w) plt.grid() plt.ylabel('Procenat od ukupnog broja [%]') plt.xticks([t+w/2 for t in x],[t.name for t in topic_nodes],rotation=90) for tx,ty,cnt in zip(x,y,counts): plt.text(tx+w/2,ty,str(cnt)) plt.title(u'Distibucija članaka po temama (ukupno %d članaka)'%sum(counts),ha='center') plt.show()
def getDocsFromTopic(keyword):
    """Return ids of all papers tagged with the topic containing *keyword*.

    Scans the topic clusters for one containing the keyword (if several
    match, the last match wins, as before), then collects every paper id
    in the global docsTopicMapping tagged with that topic number.
    Returns an empty list when the keyword matches no topic.
    """
    paperIds = []
    # Find the topic cluster that the keyword belongs to.
    topics = Topics().populateTopics()
    topicNum = -1
    for i, topic in enumerate(topics):
        if keyword in topic:
            topicNum = i
    if topicNum == -1:
        # BUGFIX: previously the -1 sentinel fell through into the
        # membership test below; an unknown keyword now explicitly
        # yields no documents.
        return paperIds
    # Collect documents that carry that topic number.
    for key in docsTopicMapping:
        if topicNum in docsTopicMapping[key]:
            paperIds.append(key)
    return paperIds
def cross_validation(number_of_features): import sys import codecs from Model import Topic, Article from Topics import Topics from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8') Session=sessionmaker(bind=engine) session=Session() topics=Topics(session, CosineSimilarityClassifier(number_of_features)) topics.print_out() print >>sys.stderr,'Adding articles' i=0 for line in open('../data/train.dat'): id=int(line) topics.add_article(id) i+=1 # if i==200: # break topics.print_out() topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat") topics.train_classifier() correct=0.0 number_of_articles=0.0 confusion_matrix={} for line in open('../data/test.dat'): article=session.query(Article).filter(Article.id==int(line)).one() assigned_topic=topics.assign_topic_to_article(article.text) print number_of_articles,article.title try: target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2) if target_topic.id not in confusion_matrix: confusion_matrix[target_topic.id]={} confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1 if target_topic.id==assigned_topic.id: correct+=1 print ' + ',target_topic.name,' / ',assigned_topic.name else: print ' - ',target_topic.name,' / ',assigned_topic.name number_of_articles+=1 print 'Accuracy:',(correct/number_of_articles)*100,'% ' except: print sys.exc_info() pass # if number_of_articles>50: # break print 'Accuracy:',(correct/number_of_articles)*100,'% ' precision={} for t in confusion_matrix: print topics.get_topics()[t].name true_pos=confusion_matrix[t].get(t,0) tp_fp=sum([confusion_matrix[c].get(t,0) for c in confusion_matrix]) if tp_fp==0: precision[t]=1 else: precision[t]=float(true_pos)/tp_fp recall={} for t in confusion_matrix: true_pos=confusion_matrix[t].get(t,0) tp_fn=sum([confusion_matrix[t].get(c,0) for c in 
confusion_matrix]) if tp_fn==0: recall[t]=1 else: recall[t]=float(true_pos)/tp_fn conf_matrix_out=codecs.open('confusion_matrix'+str(number_of_features)+'.csv',encoding='utf-8',mode='w') conf_matrix_out.write(',') for a in confusion_matrix: conf_matrix_out.write('%s,'%topics.get_topics()[a].name) conf_matrix_out.write('Recall\n') for a in confusion_matrix: conf_matrix_out.write('%s,'%topics.get_topics()[a].name) for b in confusion_matrix: conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0)) conf_matrix_out.write('%s\n'%recall[a]) conf_matrix_out.write('Precision,') for b in confusion_matrix: conf_matrix_out.write('%s,'%precision[b]) conf_matrix_out.write('\n\n') conf_matrix_out.write('F-measure\n') av_fm=0 for b in confusion_matrix: if precision[b]==0 or recall[b]==0: fm=0 else: fm=2*precision[b]*recall[b]/(precision[b]+recall[b]) conf_matrix_out.write('%s,%s\n'%(topics.get_topics()[b].name,fm)) av_fm+=fm av_fm/=len(confusion_matrix) conf_matrix_out.write('\n') conf_matrix_out.write('Average F-measure,%s\n'%av_fm) conf_matrix_out.write('Accuracy,%s\n'%(correct/number_of_articles))
def cross_validation(): import psyco psyco.full() import sys import codecs from Model import Topic, Article from Topics import Topics from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker engine=create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8') Session=sessionmaker(bind=engine) session=Session() topics=Topics(session, KNearestNeighborClassifier()) topics.print_out() print >>sys.stderr,'Adding articles' i=0 for line in open('../data/train.dat'): id=int(line) topics.add_article(id) i+=1 if i==10000: break topics.build_index( "../data/lang/lemmatization.dat", "../data/lang/stopwords.dat") topics.train_classifier() correct=0.0 number_of_articles=0.0 confusion_matrix={} for line in open('../data/test.dat'): article=session.query(Article).filter(Article.id==int(line)).one() assigned_topic=topics.assign_topic_to_article(article.text) print number_of_articles,article.title try: target_topic=topics.get_topics()[article.topic.id].get_parent_at_level(2) if target_topic.id not in confusion_matrix: confusion_matrix[target_topic.id]={} confusion_matrix[target_topic.id][assigned_topic.id]=confusion_matrix[target_topic.id].get(assigned_topic.id,0)+1 if target_topic.id==assigned_topic.id: correct+=1 print ' + ',target_topic.name,' / ',assigned_topic.name else: print ' - ',target_topic.name,' / ',assigned_topic.name number_of_articles+=1 print 'Accuracy:',(correct/number_of_articles)*100,'% ' except: print sys.exc_info() pass # if number_of_articles>1000: # break print 'Accuracy:',(correct/number_of_articles)*100,'% ' conf_matrix_out=codecs.open('confusion_matrix.csv',encoding='utf-8',mode='w') conf_matrix_out.write(',') for a in confusion_matrix: conf_matrix_out.write('%s,'%topics.get_topics()[a].name) conf_matrix_out.write('\n') for a in confusion_matrix: conf_matrix_out.write('%s,'%topics.get_topics()[a].name) for b in confusion_matrix: conf_matrix_out.write('%s,'%confusion_matrix[a].get(b,0)) conf_matrix_out.write('\n')
def results(page):
    """Flask view: run the active search filters and render result page *page*.

    A POST carries fresh form filters (symptom, race, topic, question set)
    which are cached in module-level globals; a GET (pagination click)
    reuses the cached filters.  Results come 10 per page from the
    'covid_19_index' Elasticsearch index.
    """
    global temp_sypm
    global temp_race
    global temp_topic
    global t_setQuestion
    global g_results

    if type(page) is not int:
        page = int(page)  # the route may hand the page number over as a string

    if request.method == 'POST':
        symp = request.form['symptom']
        race_q = request.form['race']
        topic = request.form['topic']
        question = request.form['set']
        if len(symp) == 0 or symp == 'None':
            temp_sypm = ""
            symp = ""
        else:
            temp_sypm = symp
        if len(race_q) == 0 or race_q == 'None':
            temp_race = ""
            race_q = ""
        else:
            temp_race = race_q
        # BUGFIX: was 'or', which cached empty/'None' placeholder questions
        # too; only a real question should overwrite the cached one.
        if len(question) != 0 and question != 'None':
            t_setQuestion = question
        if len(topic) == 0 or topic == 'None':
            temp_topic = ""
            topic = ""
        else:
            temp_topic = topic
    else:
        # Pagination request: restore the last submitted filters.
        symp = temp_sypm
        race_q = temp_race
        question = t_setQuestion
        topic = temp_topic

    # Echo the active filters back to the template.
    docs = {}
    docs['symp'] = symp
    docs['race'] = race_q
    docs['topic'] = topic
    docs['question'] = question

    search = Search(index='covid_19_index')
    s = None
    if len(symp) > 0:
        full_query = "risk factors " + symp
        s = search.query('multi_match', query=full_query, type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])
    if len(race_q) > 0:
        full_query = "risk " + race_q
        s = search.query('multi_match', query=full_query, type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])
    if len(topic) > 0:
        s = search.query('ids', values=getDocsFromTopic(topic))
    # BUGFIX: was 'len(question) > 0 & (question != "None")' -- bitwise '&'
    # binds tighter than '>', so the 'None' guard was never applied.
    if len(question) > 0 and question != 'None':
        s = search.query('multi_match', query=question, type='cross_fields',
                         fields=['title', 'abstract', 'body_text'])

    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10
    topicsObj = Topics()
    if s is None:
        # No active filter: render an empty result page.
        return render_template('results.html', results={}, res_num=0,
                               page_num=0, total=0, queries=docs,
                               topics=topicsObj.startingTopics())

    # Execute the search and collect the results in the requested range.
    response = s[start:end].execute()
    result_list = {}
    for hit in response.hits:
        result = {}
        # BUGFIX: the score was written to result_list['score'], clobbering
        # one shared key per page instead of annotating each result.
        result['score'] = hit.meta.score
        result['title'] = hit.title
        result['abstract'] = hit.abstract
        result['text'] = hit.body_text
        result_list[hit.meta.id] = result
    g_results = result_list

    num_results = response.hits.total['value']
    if num_results > 0:
        # Page count rounded up; '//' keeps it an int ('/' yields a float
        # under Python 3).
        total_pages = num_results // 10
        if num_results % 10 > 0:
            total_pages = total_pages + 1
        return render_template(
            'results.html', results=result_list, res_num=num_results,
            page_num=page, total=total_pages, queries=docs,
            recommendedTopics=topicsObj.recommendedTopics(topic),
            topics=topicsObj.startingTopics())
    else:
        message = []
        message.append('Cannot formulate results')
        return render_template(
            'results.html', results=message, res_num=num_results,
            page_num=page, queries=docs,
            recommendedTopics=topicsObj.recommendedTopics(topic),
            topics=topicsObj.startingTopics())
def search():
    """Render the landing page with the list of starting topics."""
    starting = Topics().startingTopics()
    return render_template('home_page.html', topics=starting)