def make_all_frequency(batch_size=10000):
    """Compute and store term frequencies for every unanalyzed blog document.

    Selects blog_data rows whose analyzed_flag is 0 in batches of
    ``batch_size``, runs the n-gram frequency analysis on each document,
    inserts the per-term counts into ngram_relation, and flips the
    document's analyzed_flag to 1.  Commits after each document so that
    progress survives an interruption.

    :param batch_size: number of pending documents fetched per SELECT
        (default 10000, the original hard-coded batch size).
    """
    vector = vector_search()
    conn = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER,
                           passwd=DB_PW, db=DB_NAME, charset="utf8")
    try:
        cursor = conn.cursor()
        # Number of documents not yet included in the ngram_relation table.
        cursor.execute("select count(*) from blog_data where analyzed_flag=0")
        amount = cursor.fetchone()[0]

        # Process all pending documents batch by batch.  Because each
        # processed row gets analyzed_flag=1 below, re-running the same
        # "limit 0, N" select always returns the next unprocessed batch.
        while amount != 0:
            print(amount)
            current = min(amount, batch_size)
            amount -= current
            cursor.execute(
                "select no, title, body, tags, url from blog_data "
                "where analyzed_flag=0 limit 0, %d" % current)
            document_result = cursor.fetchall()

            # Each document in the current batch.
            for no, title, body, tags, url in document_result:
                print("%s %s" % (no, title))
                frequency_result = vector.make_frequency(title, body, tags)
                # Each term in the document:
                # info[0]=frequency, info[1]=weight,
                # info[2]=Ns (the number of combined terms).
                # The weight (info[1]) is not persisted here.
                data_list = [(no, term, info[0], info[2])
                             for term, info in frequency_result]
                cursor.executemany(
                    "insert ignore into ngram_relation"
                    "(blogid, term, frequency, Ns) "
                    "values(%s, %s, %s, %s)",
                    data_list)
                # After storing the term frequencies, mark the document as
                # analyzed so it is never selected again.
                cursor.execute(
                    "update blog_data set analyzed_flag=1 where no=%s",
                    (no,))
                conn.commit()
        cursor.close()
    finally:
        # Always release the connection, even if a query fails mid-run.
        conn.close()
def search():
    """Flask view: analyze the search query and render the results page.

    Reads the query text from the ``q`` GET parameter, computes the
    frequency of each query term, and sorts the (term, info) pairs by
    frequency, highest first.  When the query produces no terms at all,
    renders search.html with an error message.
    """
    error = None
    # Query text from the GET request (empty string when missing).
    search_query = request.args.get("q", '')

    # Compute the frequency of each term of the query, then order the
    # (term, info) pairs by frequency (info[0]), descending.
    vector = vector_search()
    analyzed_query = vector.make_frequency(title=search_query)
    analyzed_query = sorted(analyzed_query, key=lambda x: x[1][0],
                            reverse=True)
    for i, (keyword, info) in enumerate(analyzed_query):
        print("%s %s %s" % (i, keyword, info))

    # Maximum frequency among the query terms.  Only the [0] index can
    # raise here (empty result list), so catch IndexError specifically
    # instead of the original blanket Exception.
    try:
        max_f = analyzed_query[0][1][0]
    except IndexError as e:
        print(e)
        error = u"검색 결과가 존재하지 않습니다."
        return render_template("search.html", error=error)
    # NOTE(review): the visible code ends here; max_f presumably feeds
    # TF/IDF weighting further down — confirm against the full file.