def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '') # redis self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion( ) self.dict_question_topic = self.getQuestionTopic() self.dict_topic = self.getTopicDict()
def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.dict_topic = self.getTopicDict() # topic self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5} self.dict_question_base_info = self.getQuestionBaseInfo() # base self.dict_question_quality = self.getQuestionQuality() self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic( ) # topic
def __init__(self): self.db_fetcher = DataBaseFetcher() self.tag2comlist = self.load_info( 'select cid,tag from crm.company_tag') self.tag_child = self.load_info( 'select tag,parent_tag from crm.tag_tag_relation') self.tag_parent = self.load_info( 'select parent_tag,tag from crm.tag_tag_relation') self.total_num, self.manual_tag_count = self.tag_cid()
def __init__(self, stage, subject): self.fetcher = DataBaseFetcher() self.topicDict, self.topicImportDict = self.getTopic(stage, subject) self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic( ) # static # student exercise self.recordList = self.cacheRecord('question_result.txt') self.topicStudentDict = self.getTopicStudent()
def __init__(self, mysql_fetcher=None, handler_redis=None): self.mysql_fetcher = mysql_fetcher if mysql_fetcher is None: self.mysql_fetcher = DataBaseFetcher() self.handler_redis = handler_redis if handler_redis is None: self.handler_redis = redis.Redis( host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com', password='******', port=6379, db=5) self.load_entity_dict()
def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'): # do something inits self.db_fetcher = DataBaseFetcher() self.exercise_id = exercise_id self.dict_content_answer = {} # error content self.curr_time = datetime.datetime.now() self.update_time = update_time # update time self.recommend = RecommendQuestion() self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion( ) self.dict_parent_quesion = self.getChildQuestion() self.dict_question_diff = self.getRightDifficulty() self.dict_point, self.dict_topic_point = self.getPoints() self.dict_students = self.getStudentId() self.dict_student_score, self.dict_student_records = {}, { x: [] for x in self.dict_students } self.exam_list_records, self.practice_list_records = self.getExerciseRecord( ) #records # dict self.dict_question_types = { 1: '选择题', 2: '填空题', 4: '简答题', 6: '综合题', 17: '计算题', 44: '证明题' } self.dict_question_difficulty = { 1: '易', 2: '中', 3: '难', 4: '极难' } # 1:易 2:中 3:难 4:极难 # feedback self.dict_students_resource = self.getStudentResource() # resource
#!/usr/bin/python # -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding("utf-8") import json, urllib, urllib2, json, chardet, re from http_request import * from bs4 import BeautifulSoup sys.path.append('../../') from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() # database jquery_host = "http://jquery.cuishifeng.cn" jquery_url = "http://jquery.cuishifeng.cn/index.html" def parse_jquery_attribute(attribute_url): attribute_html = get_request(attribute_url, jquery_host, {}) soup = BeautifulSoup(attribute_html, 'lxml') content = soup.find_all('div', id='content') # retVal retVal = content.h2.span.text if content.h2 is not None else '' # summarize summarize = soup.find('div', class_='desc')
#!/usr/bin/env python # -*- coding: utf8 -*- import os,json,datetime,re,time import sys reload(sys) sys.setdefaultencoding('utf-8') sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() def get_domain_list(url_file = 'urls.txt'): domain_list = [] with open(url_file) as url_f: for line in url_f: url = line.strip() if 'https' in url: start_pos = 8 else: start_pos = 7 str_url = url[start_pos:] end_pos = str_url.find('/') if end_pos == -1: end_pos = len(str_url) domain = str_url[:end_pos] domain_list.append(domain) return domain_list def create_html_table(datas, title=None):
#!/usr/bin/env python #coding=utf8 from __future__ import division import sys,os reload(sys) sys.setdefaultencoding('utf-8') sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) import json,math,MySQLdb,datetime,time import similary_question as sq from recommend import RecommendQuestion from common.db_fetcher import DataBaseFetcher recommend = RecommendQuestion() # object db_fetcher = DataBaseFetcher() question_list = [] # set delete_sql = "DELETE FROM error_question_log WHERE question_id IN ( select c.question_id from (SELECT a.question_id FROM error_question_log a JOIN neworiental_v3.entity_question b ON a.question_id = b.id WHERE b.subject_id != 4) c)" db_fetcher.commit_sql_cmd(delete_sql, 'mysql_logdata') # update rows = db_fetcher.get_sql_result("select question_id from error_question_log", 'mysql_logdata') for row in rows: qid = row[0] question_list.append(qid) def getQuestionWords(question_list): # question word dict_question_info = {} lSet = int(len(question_list) / 1000) + 2 start, end = 0, 0 for i in range(1, lSet):
class CzMathGraph(object): """docstring for czMathGraph""" def __init__(self, stage, subject): self.fetcher = DataBaseFetcher() self.topicDict, self.topicImportDict = self.getTopic(stage, subject) self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic( ) # static # student exercise self.recordList = self.cacheRecord('question_result.txt') self.topicStudentDict = self.getTopicStudent() # sample def cacheRecord(self, fName, maxWin=3000000): recordList = [] pos = 0 with open(fName) as handleFile: for line in handleFile: arrContent = line.strip().split('\t') studyId, questionId, res = int(arrContent[1]), int( arrContent[2]), arrContent[5] isFlag = False if questionId in self.quesTopicDict: topicSet = self.quesTopicDict[questionId] for topic in topicSet: if topic in self.topicDict: isFlag = True break if isFlag == False: continue # normalize data try: res = int(res) except: try: result = 0 decode_json = json.loads(res) for resDict in decode_json: if resDict['result'] == 1: result += 1 else: result -= 1 res = 1 if result > 0 else 2 except: pass if res != 1: continue # 过滤负样本 recordList.append((studyId, questionId, res)) pos += 1 if pos >= maxWin: break return recordList def getTopicStudent(self): topicStudentDict = {} for item in self.recordList: studyId, questionId, res = item if questionId not in self.quesTopicDict: continue for topic in self.quesTopicDict[questionId]: if topic not in topicStudentDict: topicStudentDict[topic] = {} if studyId not in topicStudentDict[topic]: topicStudentDict[topic][studyId] = 0 topicStudentDict[topic][studyId] += 1 return topicStudentDict # 统计条件概率 def staticTopicProb(self, topic, topicList): if topic not in self.topicStudentDict: return 0.0 fenmuSet = set() # 条件student集合 isFirst = True for item in topicList: if item not in self.topicStudentDict: continue itemSet = set([x for x in self.topicStudentDict[item]]) if isFirst: fenmuSet = itemSet else: fenmuSet = fenmuSet & itemSet fenmu, fenzi = 0.0, 0.0 for student in fenmuSet: for item in topicList: fenmu += self.topicStudentDict[item][student] if student in self.topicStudentDict[topic]: fenzi += self.topicStudentDict[topic][student] p = fenzi / fenmu if fenmu > 0 and fenzi < fenmu else 0.0 return p # 获取词典 def getTopic(self, stage, subjectId): topicDict = {} topicImportDict = {} sql = "select id, name, is_important from entity_topic where subject_id= %s and stage_id = %s" % ( subjectId, stage) db_rows = self.fetcher.get_sql_result(sql, "mysql_v3") for row in db_rows: topicId, name, important = row topicDict[topicId] = name topicImportDict[topicId] = important return topicDict, topicImportDict # 获取问题主题 def getQuestionTopic(self): quesTopicDict = {} topicQuestionDict = {} db_rows = self.fetcher.get_sql_result( "select topic_id, question_id from link_question_topic ", "mysql_v3") for row in db_rows: topicId, questionId = row if topicId not in topicQuestionDict: topicQuestionDict[topicId] = [] topicQuestionDict[topicId].append(questionId) if questionId not in quesTopicDict: quesTopicDict[questionId] = [] quesTopicDict[questionId].append(topicId) quesTopicDict = { key: set(value) for key, value in quesTopicDict.items() } # list to set topicQuestionDict = { key: set(value) for key, value in topicQuestionDict.items() } return quesTopicDict, topicQuestionDict # 统计问题正确率 def statisQuestion(self, subject_id): questionDict = {} db_rows = self.fetcher.get_sql_result( "select question_id, sum(answer_num) answernum, sum(right_num) rightnum from entity_trail_question where subject_id = %s GROUP BY question_id " % subject_id, "mysql_v3") for row in db_rows: questionId, answer, right = row questionDict[questionId] = answer, right return questionDict def isLoopGraph(self, pairTopic, iniRelList, childDict): startTopic, endTopic = pairTopic if startTopic == endTopic: return True # loop stackList = [startTopic] missSet, popSet = set(), set() # init set and pop set while len(stackList) > 0: if endTopic in missSet: return True # the graph is unicom topic = stackList[-1] stackList.pop() # pop if (topic not in childDict) or topic in missSet: continue popSet.add(topic) # add pop set childSet = childDict[topic] for child in childSet: stackList.append(child) missSet.add(child) return False # 构造有像图知识网络 def getTopicDAG(self, threshold=30, inDegreeThr=3, outDegreeThr=4): iniRelList = [] relDict = {} topicList = [topic for topic in self.topicDict] for i in xrange(len(topicList)): for j in xrange(i + 1, len(topicList)): if topicList[i] in self.topicQuestionDict and topicList[ j] in self.topicQuestionDict: iSet, jSet = self.topicQuestionDict[ topicList[i]], self.topicQuestionDict[topicList[j]] key = "%s-%s" % (topicList[i], topicList[j]) mLen = len(iSet & jSet) if mLen > threshold: relDict[key] = mLen sortRel = sorted(relDict.items(), key=lambda x: x[1], reverse=True) inDegreeDict = {} # 入度记录字典 outDegreeDict = {} # 出度记录字典 childDict = {} # child for item in sortRel: inDegreeThr, outDegreeThr = 3, 4 arrTopic = [int(x) for x in item[0].split("-")] if len(arrTopic) != 2: continue start, end = arrTopic[0], arrTopic[1] # 判断转移方向 p0 = self.staticTopicProb(start, [end]) p1 = self.staticTopicProb(end, [start]) pairTopic = (start, end) if p1 < p0 else (end, start) start, end = pairTopic # if self.topicImportDict[start] < 1: outDegreeThr = 1 if self.topicImportDict[end] < 1: inDegreeThr = 1 inDegree = inDegreeDict[end] if end in inDegreeDict else 0 outDegree = outDegreeDict[start] if start in outDegreeDict else 0 isLoop = self.isLoopGraph(pairTopic, iniRelList, childDict) bFlag = (isLoop == False) and (inDegree < inDegreeThr) and ( outDegree < outDegreeThr) if bFlag: iniRelList.append(pairTopic) if end not in childDict: childDict[end] = set() childDict[end].add(start) # 出度 and 入度 outDegreeDict[start] = 1 + outDegree inDegreeDict[end] = 1 + inDegree return iniRelList # 构建系数矩阵数组 def getRowsNormalize(self, topicList): rowsData = [] topicLen = len(topicList) topicIndex = {topicList[i]: i for i in xrange(topicLen)} for item in self.recordList: studyId, questionId, res = item arr = [0] * topicLen if questionId in self.quesTopicDict: topicSet = self.quesTopicDict[questionId] for topic in topicSet: if topic in topicIndex: arr[topicIndex[topic]] = 1 sparr = pd.SparseArray(arr) rowsData.append(arr) return rowsData def makeModel(self): # graph structure initRelation = self.getTopicDAG() # 概率图构建 relationList = [] stuModel = BayesianModel() # DAG for edge in initRelation: try: start, end = edge stuModel.add_edge(str(start), str(end)) relationList.append(edge) except: continue # save file with open('model.txt', 'w') as write_f: for item in relationList: write_f.write( '%s,%s\n' % (self.topicDict[item[0]], self.topicDict[item[1]])) # learning from data topicList = [topic for topic in self.topicDict] rowsData = self.getRowsNormalize(topicList) print 'the rows len is:%s' % len(rowsData)
class RecommendQuestion(object): def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.dict_topic = self.getTopicDict() # topic self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5} self.dict_question_base_info = self.getQuestionBaseInfo() # base self.dict_question_quality = self.getQuestionQuality() self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic( ) # topic def getThisQuestionTopic(self): return self.dict_question_topic def getThisQuestionBaseInfo(self): return self.dict_question_base_info def getQuestionBaseInfo(self, save_file='question_base.txt'): dict_question_base_info = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue qid, difficulty, question_type, upload_id = long(arr[0]), int( arr[1]), int(arr[2]), str(arr[3]).strip() try: upload_id = long(upload_id) except: upload_id = 0 dict_question_base_info[ qid] = difficulty, question_type, upload_id max_num = qid sql = "select id, difficulty, question_type, upload_id from neworiental_v3.entity_question where subject_id = 4 and id > %s and state != \'DISABLED\' order by id asc" % max_num rows = self.db_fetcher.get_sql_result(sql, "mysql_logdata") with open(save_file, 'a') as write_f: for row in rows: qid, difficulty, str_question_type, upload_id = row question_type = 1 if str_question_type == '选择题' or str_question_type == '单选题': question_type = 1 elif str_question_type == '填空题': question_type = 2 else: question_type = 3 dict_question_base_info[ qid] = difficulty, question_type, upload_id write_f.write('%s\t%s\t%s\t%s\n' % (qid, difficulty, question_type, upload_id)) return dict_question_base_info def getQuestionQuality(self, save_file='quality.txt'): dict_question_quality = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue qid, question_id, extra_score = arr[0], long(arr[1]), int( arr[2]) dict_question_quality[question_id] = extra_score max_num = qid sql = "select id, question_id, extra_score from neworiental_v3.entity_question_quality where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: qid, question_id, extra_score = row dict_question_quality[question_id] = extra_score write_f.write('%s\t%s\t%s\n' % (qid, question_id, extra_score)) return dict_question_quality def getQuestionTopic(self, save_file='topic.txt'): dict_question_topic, dict_topic_question = {}, {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue tid, question_id, topic_id = arr[0], arr[1], arr[2] question_id = long(question_id) topic_id = long(topic_id) if question_id not in dict_question_topic: dict_question_topic[question_id] = set() if topic_id not in dict_topic_question: dict_topic_question[topic_id] = {} extra_score = 3 if question_id in self.dict_question_quality: extra_score = self.dict_question_quality[question_id] difficulty, question_type = 1, 1 if question_id in self.dict_question_base_info: difficulty, question_type, upload_id = self.dict_question_base_info[ question_id] dict_question_topic[question_id].add(topic_id) # add key = '%s-%s' % (difficulty, question_type) if key not in dict_topic_question[topic_id]: dict_topic_question[topic_id][key] = [] dict_topic_question[topic_id][key].append( (question_id, extra_score)) max_num = tid sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: tid, question_id, topic_id = row if question_id not in dict_question_topic: dict_question_topic[question_id] = set() if topic_id not in dict_topic_question: dict_topic_question[topic_id] = {} extra_score = 3 if question_id in self.dict_question_quality: extra_score = self.dict_question_quality[question_id] difficulty, question_type = 1, 1 if question_id in self.dict_question_base_info: difficulty, str_question_type, upload_id = self.dict_question_base_info[ question_id] if str_question_type == '选择题' or str_question_type == '单选题': question_type = 1 elif str_question_type == '填空题': question_type = 2 else: question_type = 3 key = '%s-%s' % (difficulty, question_type) dict_question_topic[question_id].add(topic_id) # add if key not in dict_topic_question[topic_id]: dict_topic_question[topic_id][key] = [] dict_topic_question[topic_id][key].append( (question_id, extra_score)) write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id)) for topic in dict_topic_question: # sort for key in dict_topic_question[topic]: dict_topic_question[topic][key] = sorted( dict_topic_question[topic][key], key=lambda x: x[-1], reverse=True) # return dict_question_topic, dict_topic_question def getTopicDict(self): dict_topic = {} sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: topic_id, name = row topic_id = str(topic_id) dict_topic[topic_id] = name return dict_topic def getQuestionRate(self): dict_question_rate = {} sql = "select question_id, rate from question_accuracy" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, rate = row rate = float(rate) dict_question_rate[question_id] = rate return dict_question_rate def getHeaders(self, teacher_id='eefbac5a4e58432ca317642a2077d362'): return { 'Host': 'jiaoshi.okjiaoyu.cn', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'is_new_okay': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36', 'Referer': 'http://jiaoshi.okjiaoyu.cn/quizcenter_vm/quizcenter', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cookie': '_ga=GA1.2.1647528971.1502935646; _gid=GA1.2.1532869714.1506481969; org_id=113; teacher_id=%s; Hm_lvt_2014de1ca4ec84db492ebee33b1dc46c=1506044442,1506324027,1506407711,1506479104; Hm_lpvt_2014de1ca4ec84db492ebee33b1dc46c=1506492873' % teacher_id } def getUrlContent(self, url): req = urllib2.Request(url, headers=self.getHeaders()) hh = urllib2.HTTPHandler() opener = urllib2.build_opener(hh) reply = opener.open(req, timeout=30) if reply.info().get('Content-Encoding') == 'gzip': buf = StringIO(reply.read()) f = gzip.GzipFile(fileobj=buf) res = f.read() f.close() buf.close() reply.close() else: res = reply.read() return res def normalLeven(self, str1, str2): len_str1 = len(str1) + 1 len_str2 = len(str2) + 1 #create matrix matrix = [0 for n in range(len_str1 * len_str2)] #init x axis for i in range(len_str1): matrix[i] = i #init y axis for j in range(0, len(matrix), len_str1): if j % len_str1 == 0: matrix[j] = j // len_str1 for i in range(1, len_str1): for j in range(1, len_str2): if str1[i - 1] == str2[j - 1]: cost = 0 else: cost = 1 matrix[j * len_str1 + i] = min( matrix[(j - 1) * len_str1 + i] + 1, matrix[j * len_str1 + (i - 1)] + 1, matrix[(j - 1) * len_str1 + (i - 1)] + cost) return matrix[-1] def translate(self, str): line = str.strip().decode('utf-8', 'ignore') p2 = re.compile(ur'[^\u4e00-\u9fa5]') zh = " ".join(p2.split(line)).strip() zh = ",".join(zh.split()) outStr = zh return outStr def calScoreQustion(self, question_id, list_rank): dict_score = {} pos = 0 dict_qid_type = {} set_source = self.dict_question_topic[ question_id] if question_id in self.dict_question_topic else set() for item in list_rank: qid, qtype, topic_list, extra_score = item set_target = self.dict_question_topic[ qid] if qid in self.dict_question_topic else set() score = (1 - numpy.tanh( 0.03 * pos)) * 60 + self.dict_quality[extra_score] if len(set_target & set_source) > 0: score += 15 dict_score[qid] = score dict_qid_type[qid] = qtype pos += 1 sort_res = sorted(dict_score.items(), key=lambda x: x[1], reverse=True) list_res = [] for item in sort_res: qid = item[0] qtype = dict_qid_type[qid] list_res.append((qid, qtype)) return list_res def randomTopicHeader(self, question_id, list_res, pre_set, rank_size, throld_size): # 头部数据处理 if question_id in self.dict_question_topic: topic_set = self.dict_question_topic[question_id] tLen = len(topic_set) if tLen > 0: index = random.randint(0, tLen - 1) topic = list(topic_set)[index] # first if question_id in self.dict_question_base_info: difficulty, qtype, upload_id = self.dict_question_base_info[ question_id] key = '%s-%s' % (difficulty, qtype) if key in self.dict_topic_question[topic]: list_res_question = [] arr_content_question = self.dict_topic_question[topic][ key] for content_item in arr_content_question: res_question, extra_score = content_item if res_question not in pre_set and res_question != question_id: list_res_question.append(res_question) if len(list_res_question) > rank_size: break rnt = throld_size - len(list_res) while rnt > 0 and len(list_res_question) > 0: pos = random.randint(0, len(list_res_question) - 1) list_res.append(list_res_question[pos]) rnt += -1 return list_res def getEsResult(self, question_id, text, keywords, difficulty, bqtype, pre_set, throld_size, throld_page=10, throld_cost=5, rank_size=20): """ questionType: 1 选择题 2 填空题 3 判断题 4 简答题 6 综合题 difficulty : 1:易 2:中 3:难 4:极难 """ list_rank = [] headers = self.getHeaders() # get headers str_pre = '' if bqtype == 1: bqtype = 101 for page in range(1, throld_page): try: url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?keyword=%s&subject_id=4&queryType=4&difficulty=%s&type=%s&page=%s" % ( keywords, difficulty, bqtype, page) # url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?_=1506480814420&teacher_id=ce36ea7e9c864af39d186b8e8b672864&type=&difficulty=%s&page=%s&keyword=%s&subject_id=4" % (difficulty, page, keywords) res = self.getUrlContent(url) res = res.replace('\r', '').replace('\n', '').replace( 'null', '').replace(':,', ':\"\",').strip() dict_result = json.loads(res.encode('utf8')) total_page = dict_result['data']['total_page'] if total_page < throld_page: throld_page = total_page # range update list_result = dict_result['data']['list'] for item in list_result: qid = item['id'] question_body = self.translate(item['question_body']) analysis = str(item['analysis']).strip().replace(' ', '') topic_list = item['topic_list'] if len(topic_list) >= 1: type_name = item['type_name'] if type_name == '选择题': option_list = item['option_list'] if len(option_list) != 4: continue cost = self.normalLeven(str_pre, question_body) if qid in self.dict_question_quality: extra_score = self.dict_question_quality[qid] else: extra_score = 1 if item['type_alias'] == '单选题' or item[ 'type_alias'] == '选择题': qtype = 1 elif item['type_alias'] == '填空题': qtype = 2 else: qtype = 3 bCharge = (cost > throld_cost) and (len(analysis) > 50) if bCharge: list_rank.append( (qid, qtype, item['topic_list'], extra_score)) str_pre = question_body except: continue list_index = self.calScoreQustion(question_id, list_rank) # rank res_size = throld_size - 1 if len( list_index) > throld_size - 1 else len(list_index) list_res = [x[0] for x in list_index[:res_size]] # end output list_res = self.randomTopicHeader( question_id, list_res, pre_set, rank_size, throld_size) # header question random return list_res
#!/usr/bin/env python # -*- coding: UTF-8 -*- import os, json, datetime, re, time import sys reload(sys) sys.setdefaultencoding('utf-8') sys.path.append( os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() if __name__ == "__main__": # update news_report company now_time = datetime.datetime.now() yes_time = now_time + datetime.timedelta(days=-1) yes_date = yes_time.strftime('%Y-%m-%d') db_data = db_fetcher.get_sql_result( 'select id, article_json, publish from kr_articles where publish = \'%s\'' % yes_date, 'mysql_insight') for pos in range(len(db_data)): kid = str(db_data[pos][0]) try: article_json = str(db_data[pos][1]) article_obj = json.loads(article_json, strict=False) url = 'http://36kr.com/p/%s.html' % kid title = article_obj['title'] source = "36Kr" publish_date = '%s 00:00:00' % str(db_data[pos][2]) domain = '36kr.com'
import urllib, urllib2 from urlparse import urlparse import traceback from cStringIO import StringIO import gzip import pdb import json import ConfigParser import cookielib from pyquery import PyQuery import re, codecs sys.path.append('../../') from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() en_threshold = 50 class Unbuffered: def __init__(self, stream): self.stream = stream def write(self, data): self.stream.write(data) self.stream.flush() def __getattr__(self, attr): return getattr(self.stream, attr)
Created on Mon Dec 15 10:38:02 2015 @author: liming """ import time, datetime import os, sys, re import json import urllib from urllib2 import Request, urlopen, URLError, HTTPError #导入urllib2模块 sys.path.append( os.path.realpath(os.path.join(os.path.dirname(__file__), '../seo'))) from seo_extractor import SeoExtractor from common.db_fetcher import DataBaseFetcher mysql_fetcher = DataBaseFetcher() threshold = 15.85 # 阈值 class HotsFind(object): ch_title_dict = {} cache_weight_dict = {} def __init__(self, keyword_file): self.seo_extractor = SeoExtractor('../seo/com_df_idf') self.keyword_set, self.verb_set = self.read_keywords(keyword_file) self.entity_dict = self.load_entity_dict() def clear_history_init(self, keyword_file):
class Report(object): """docstring for Report""" def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'): # do something inits self.db_fetcher = DataBaseFetcher() self.exercise_id = exercise_id self.dict_content_answer = {} # error content self.curr_time = datetime.datetime.now() self.update_time = update_time # update time self.recommend = RecommendQuestion() self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion( ) self.dict_parent_quesion = self.getChildQuestion() self.dict_question_diff = self.getRightDifficulty() self.dict_point, self.dict_topic_point = self.getPoints() self.dict_students = self.getStudentId() self.dict_student_score, self.dict_student_records = {}, { x: [] for x in self.dict_students } self.exam_list_records, self.practice_list_records = self.getExerciseRecord( ) #records # dict self.dict_question_types = { 1: '选择题', 2: '填空题', 4: '简答题', 6: '综合题', 17: '计算题', 44: '证明题' } self.dict_question_difficulty = { 1: '易', 2: '中', 3: '难', 4: '极难' } # 1:易 2:中 3:难 4:极难 # feedback self.dict_students_resource = self.getStudentResource() # resource def getExerciseRecord(self): exam_list_records = [] practice_list_records = [] sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \ neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % ( self.update_time) dict_is_choice = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question, student_id, ret, answer, submit_time, question_type = row student_id = long(student_id) if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len( answer) == 0: continue if question in self.dict_realtion_quesion: exam_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_content_answer['%s_%s' % (question, student_id)] = answer elif question in self.dict_parent_quesion: dict_content = self.dict_parent_quesion[question] parent_question_id = dict_content['parent_id'] rank = dict_content['rank'] link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ parent_question_id] if question_num >= 17 and question_num <= 21: if rank == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 2: self.dict_content_answer['%s_%s' % (parent_question_id, student_id)] = answer elif question_num >= 22 and question_num <= 23: key = '%s_%s' % (parent_question_id, student_id) if rank == 1: if answer == 'A': dict_is_choice[key] = 1 else: dict_is_choice[key] = 0 elif rank == 2: if key in dict_is_choice: if dict_is_choice[key] == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 3: self.dict_content_answer[key] = answer else: practice_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) return exam_list_records, practice_list_records def getStudentId(self, file_name='students.txt'): list_students = [] with open(file_name) as handle_f: for line in handle_f: system_id = long(line.strip()) list_students.append(system_id) set_studends = set(list_students) # students set str_content = ','.join([str(x) for x in set_studends]) sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content dict_students = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, name = row if student_id in set_studends: dict_students[student_id] = name return dict_students def getRightDifficulty(self): dict_question_diff = {} sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_diff def getChildQuestion(self): dict_parent_list = {} dict_parent_question = {} sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \ where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid = row[0] parent_id = row[1] if parent_id not in dict_parent_list: dict_parent_list[parent_id] = [] dict_parent_list[parent_id].append(qid) for parent_id in dict_parent_list: sort_question = sorted(dict_parent_list[parent_id], reverse=False) count = 1 for item in sort_question: dict_parent_question[item] = { 'parent_id': parent_id, 'rank': count } count += 1 return dict_parent_question def getRelationQuestion(self): dict_realtion_quesion = {} dict_point_question_num = {} sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = \ b.id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') dict_parent_quesion = {} link_parent_list = [] for row in rows: link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = row dict_realtion_quesion[ link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type arr_point = link_point_id.strip().split(',') for point in arr_point: if point not in dict_point_question_num: dict_point_question_num[point] = [] dict_point_question_num[point].append([ link_question_id, question_num, difficulty, question_exercise_id, question_type ]) return dict_realtion_quesion, dict_point_question_num def getScore(self, question_num, answer): ascore, score = 0, 0 if question_num >= 1 and question_num <= 16: score = 5 if (answer == 'A' or answer == 'B') else 0 ascore = 5 elif question_num >= 17 and question_num <= 21: if answer == 'A': score = 4 elif answer == 'B': score = 7 elif answer == 'C': score = 9 elif answer == 'D': score = 12 ascore = 12 elif question_num >= 22 and question_num <= 23: if answer == 'A': score = 4 elif answer == 'B': score = 7 elif answer == 'C': score = 9 elif answer == 'D': score = 10 ascore = 10 return ascore, score def statQustionReport(self): dict_question_count, dict_question_right = {}, {} dict_point_count, dict_point_right = {}, {} dict_error_students, dict_right_students = {}, {} for item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = link_point_id.strip().split(',') ascore, score = self.getScore(question_num, answer) is_right = (answer == 'A') or (answer == 'B') # 1 point for point in arr_points: if point in self.dict_point: name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] if name not in dict_point_count: dict_point_count[name] = 0 dict_point_right[name] = 0 dict_point_right[name] += 1 if ( question_num <= 16 and is_right) or ( question_num >= 17 and answer == 'D') else 0 # dict_point_count[name] += 1 if link_question_id not in dict_question_count: dict_question_right[link_question_id] = 0 dict_question_count[link_question_id] = 0 dict_question_right[ link_question_id] += 1 if ret == 1 else 0 # dict_question_count[link_question_id] += 1 # questions if link_question_id not in dict_right_students: dict_right_students[link_question_id] = [] dict_error_students[link_question_id] = [] if (question_num <= 16 and is_right) or (question_num >= 17 and answer == 'D'): dict_right_students[link_question_id].append(student_id) else: dict_error_students[link_question_id].append(student_id) if student_id not in self.dict_student_score: self.dict_student_score[student_id] = 0 self.dict_student_score[student_id] += score dict_point_acc = {} for point_name in dict_point_count: acc = dict_point_right[point_name] / dict_point_count[point_name] dict_point_acc[point_name] = acc #print '%s \t%s' % (point_name, acc) # students dict_students_point_question = self.calStudentPoints(dict_point_acc) return dict_students_point_question def getErrorText(self, answer): if answer == 'C': return '审题时,粗心看错了' elif answer == 'D': return '审题时,题意读不懂' elif answer == 'E': return '析题时,思路模糊不清' elif answer == 'F': return '析题时,思路方向错误' elif answer == 'G': return '解题时,公式记错了' elif answer == 'H': return '解题时,计算出错了' elif answer == 'I': return '未作答(或未完成),时间来不及了' elif answer == 'A' or answer == 'B': return '答题正确' def getQuestionWords(self): dict_question_words = {} str_question = ','.join([ str(self.dict_realtion_quesion[x][0]) for x in self.dict_realtion_quesion ]) str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \ t.type_id=c.question_type_id where c.id in (%s)" % str_question s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata') for row in s_rows: question, subject_id, type_id, struct_id, json_data = row if json_data != '' and json_data != 'None' and json_data != None: keywords, res = sq.predict(question, subject_id, type_id, struct_id, json_data) dict_question_words[question] = keywords, res return dict_question_words def calStudentPoints(self, dict_point_acc): dict_coef = {x: 1.0 for x in self.dict_point} # bate = 0.25 dict_student_points = {} dict_students_point_question = {} # student -> error question for student_id in self.dict_students: if student_id not in dict_student_points: dict_student_points[student_id] = {} records = sorted(self.dict_student_records[student_id], key=lambda x: x[-1], reverse=True) dict_error = { 'C': 0.0, 'D': 0.0, 'E': 0.0, 'F': 0.0, 'G': 0.0, 'H': 0.0, 'I': 0.0 } mass_score = 0 dict_question_point = {} for item in records: question, student_id, ret, answer, submit_time, question_type = item if answer == 'answer': continue if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] if question_exercise_id != self.exercise_id: continue if question_type == '单选题' or question_type == '选择题': qtype = 1 elif question_type == '填空题': qtype = 2 else: qtype = 3 difficulty = self.dict_question_diff[link_question_id] arr_points = link_point_id.strip().split(',') a_score, score = self.getScore(question_num, answer) content = '%s_%s' % (question, student_id) if content in self.dict_content_answer: answer = self.dict_content_answer[content] if answer != 'A' and answer != 'B' and answer != 'answer': dict_error[answer] += a_score - score if answer == 'C' or answer == 'G' or answer == 'H': mass_score += a_score - score for point in arr_points: if point not in dict_question_point: dict_question_point[point] = [] dict_question_point[point].append( (link_question_id, qtype, submit_time)) name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] acc = dict_point_acc[name] if point not in dict_student_points[student_id]: dict_student_points[student_id][point] = 0.0 dict_student_points[student_id][ point] += dict_coef[point] * ( a_score - score ) * acc / math.log(1 + bate * (level + difficulty)) sort_error = sorted(dict_error.items(), key=lambda x: x[1], reverse=True) if len(dict_student_points[student_id]) > 0: sort_point = sorted(dict_student_points[student_id].items(), key=lambda x: x[1], reverse=True) student_name = self.dict_students[student_id] point_name1, ptype1, question_type1, level1, parent_id1, link_id1 = self.dict_point[ point] point_name2, ptype2, question_type2, level2, parent_id2, link_id2 = self.dict_point[ sort_point[1][0]] point_name3, ptype3, question_type3, level3, parent_id2, link_id3 = self.dict_point[ sort_point[2][0]] is_power = 1 if mass_score > 5 else 0 error1, error1score = sort_error[0][0], sort_error[0][1] error2, error2score = sort_error[1][0], sort_error[1][1] error1_name, error2_name = self.getErrorText( error1), self.getErrorText(error2) dict_students_point_question[ student_id] = sort_point, dict_question_point return dict_students_point_question def getPoint2Question(self): dict_point_org_question = {} for answer_id in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ answer_id] arr_point = link_point_id.strip().split(',') for item in arr_point: if item not in dict_point_org_question: dict_point_org_question[item] = [] dict_point_org_question[item].append(link_question_id) return dict_point_org_question def pointsRecQuestion(self, dict_students_point_question, dict_diff, throld=4): # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty # dict_point_org_question = self.getPoint2Question() # org question point dict_question_text = self.getQuestionWords() dict_question_target = {} dict_student_recommend_question = {} print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( '学生id', '学生名字', '考点', '考点名称', '原题id', '推荐习题id', '关键词', '难度') for student_id in dict_students_point_question: arr_point, dict_question_point_target = dict_students_point_question[ student_id] student_name = self.dict_students[student_id] haveTime = 12 recommend_questions = [] pre_set = set() for item_target in arr_point: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break item_point = item_target[0] arr_question = sorted(dict_question_point_target[item_point], key=lambda x: x[-1], reverse=True) key = '%s-%s' % (student_id, item_point) rateRight, difficulty = dict_diff[ key] if key in dict_diff else (0.0, 1) pre_set = pre_set | set([x[0] for x in arr_question]) # for base_question, qtype, submit_time in arr_question: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break keywords, text = dict_question_text[base_question] text = text.replace('\n', '').strip('\r').strip() # filter base_question_key = '%s-%s' % (base_question, difficulty) if base_question_key in dict_question_target: target_questions = dict_question_target[ base_question_key] else: target_questions = self.recommend.getEsResult( base_question, text, keywords, difficulty, qtype, pre_set, throld) dict_question_target[ base_question_key] = target_questions for rec_question in target_questions: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break if rec_question in pre_set: continue haveTime += -1 recommend_questions.append( (base_question, keywords, rec_question, item_point, difficulty)) pre_set.add(rec_question) for rec_item in recommend_questions: source_question, keywords, rec_question, item_point, difficulty = rec_item point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ item_point] print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( student_id, student_name, item_point, point_name, source_question, rec_question, keywords, difficulty) def importDefault(self, is_new, dict_question_topic): defult_list = [(10805815, 0), (10806190, 2), (10805815, 4), (10806184, 6), (10806182, 8), (10806169, 10), (10806104, 12), (10806057, 14), (10805815, 16), (10806068, 18)] update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" db_rows = self.db_fetcher.get_sql_result( "select student_id from entity_student_white_list", "mysql_white_list") is_first = 1 for row in db_rows: student_id = row[0] for item in defult_list: question_id, subject_id = item type_id = 2186 if question_id in dict_question_topic: set_topic = dict_question_topic[question_id] if len(set_topic) > 0: type_id = set_topic.pop() if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, subject_id, 0) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, subject_id, 0, 1, 3, type_id) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, subject_id, 0) new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, subject_id, 0, 1, 3, type_id) if is_new == 1: self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list') def import2DataBase(self, flag, is_new=1, fname='student_rec.txt'): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic if is_new == 1: self.db_fetcher.commit_sql_cmd( "delete from sync_student_recommend_question", 'mysql_v3_white_list') # update else: self.db_fetcher.commit_sql_cmd( "delete from entity_recommend_question_bytopic", 'mysql_white_list') # update # self.importDefault(is_new, dict_question_topic) # import if flag == 1: # 1,3 5 recomend update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" insert_sql = "insert into entity_question_recommend(student_id,student_name,point,point_name,question_id,recommend_id,keywords,difficulty) values" new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" insert_score, is_first = 0, 1 with open(fname) as rec_f: for line in rec_f: if insert_score > 1: arr = line.strip().split('\t') student_id, question_id = arr[0], long(arr[5]) type_id = 2186 if question_id in dict_question_topic: set_topic = dict_question_topic[question_id] if len(set_topic) > 0: type_id = set_topic.pop() if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, 0, insert_score) insert_sql += "(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % ( arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, 0, insert_score, 1, 3, type_id) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, 0, insert_score) insert_sql += ",(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % ( arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) if is_new == 1: new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, 0, insert_score, 1, 3, type_id) insert_score += 1 # self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata') if is_new == 1: # update mysql self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') # new else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list') def getPoints(self): dict_point = {} dict_topic_point = {} sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points' # do something rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row point_id = str(point_id) dict_point[ point_id] = name, ptype, question_type, level, parent_id, link_id if ptype == 3: # arr_topic = [long(x) for x in topics_id.strip().split(',')] for topic in arr_topic: if topic not in dict_topic_point: dict_topic_point[topic] = [] dict_topic_point[topic].append(point_id) return dict_point, dict_topic_point def getStudentResource(self): dict_students_resource = {} str_content = ','.join([str(x) for x in self.dict_students]) sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \ a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % ( str_content) rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, resource_id, deadline = row if resource_id > 0: if student_id not in dict_students_resource: dict_students_resource[student_id] = set() dict_students_resource[student_id].add(resource_id) return dict_students_resource def getQuestionBaseInfo(self, question_set): dict_question_difficulty = {} str_content = ','.join([str(x) for x in question_set]) sql = "select b.id, b.difficulty from neworiental_v3.entity_question where b.id in (%s)" % str_content rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_difficulty def updateStuAdapt2Difficult(self, range_throld=10, difficulty_throld=10): dict_exam_question = {} dict_exam_difficulty = {} dict_question_topic = self.recommend.getThisQuestionTopic() # for exam_item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_point = link_point_id.strip().split(',') is_right = 1 if answer == 'A' or answer == 'B' else 0 for point in arr_point: key = '%s-%s' % (student_id, point) if key not in dict_exam_question: dict_exam_question[key] = [] dict_exam_difficulty[key] = {} record = (question, is_right, difficulty, submit_time, question_type) dict_exam_question[key].append(record) if difficulty not in dict_exam_difficulty[key]: dict_exam_difficulty[key][difficulty] = [] dict_exam_difficulty[key][difficulty].append(record) question_set = set([x[0] for x in self.practice_list_records]) dict_question_difficulty = self.getQuestionBaseInfo(question_set) for practice_item in self.practice_list_records: question, student_id, ret, answer, submit_time, question_type = practice_item is_right = 1 if ret == 1 else 0 if question in dict_question_difficulty: difficulty = self.dict_question_difficulty[question] if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: point = self.dict_topic_point[topic] key = '%s-%s' % (student_id, point) if key not in dict_exam_question: dict_exam_question[key] = [] dict_exam_difficulty[key] = {} if difficulty not in dict_exam_difficulty[key]: dict_exam_difficulty[key][difficulty] = [] record = (question, is_right, difficulty, submit_time, question_type) dict_exam_difficulty[key][difficulty].append( record) dict_exam_question[key].append(record) dict_res = {} self.db_fetcher.commit_sql_cmd('delete from entity_student_feature', 'mysql_logdata') for key in dict_exam_question: arr_content = key.split('-') student_id, point = arr_content[0], arr_content[1] question_records = sorted(dict_exam_question[key], key=lambda x: x[-1], reverse=True) right_cnt = 0 throld = min(len(question_records), range_throld) #if throld < 10: continue for record_item in question_records[:throld]: question, is_right, difficulty, submit_time, question_type = record_item right_cnt += is_right rateRight = right_cnt / range_throld adaDiff, pDiff = 0, -1 for difficulty in dict_exam_difficulty[key]: drecords = sorted(dict_exam_difficulty[key][difficulty], key=lambda x: x[-1], reverse=True) difficulty_throld = min(len(drecords), difficulty_throld) #if difficulty_throld < 4: continue dright_cnt = 0 for diff_item in drecords[:difficulty_throld]: question, is_right, difficulty, submit_time, question_type = diff_item dright_cnt += is_right drateRight = dright_cnt / difficulty_throld if drateRight > pDiff: adaDiff = difficulty pDiff = drateRight point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] insert_sql = "insert into entity_student_feature(student_id, point_id, point_name, adapt_difficulty, master) values(%s, %s, \'%s\', %s, %s)" \ % (student_id, point, point_name, adaDiff, rateRight) insert_id = self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata') dict_res[key] = rateRight, difficulty return dict_res def getStudentScore(self): dict_student_score = {} for exam_item in self.exam_list_records: exam_question, student_id, ret, answer, submit_time, question_type = exam_item if exam_question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ exam_question] a_score, i_score = self.getScore(question_num, answer) exam_key = '%s-%s' % (student_id, question_exercise_id) if exam_key not in dict_student_score: dict_student_score[exam_key] = 0 dict_student_score[exam_key] += i_score return dict_student_score def getFirstPoint(self): # point dict_relation_point13 = {} sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points \ b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point3, name3, point1, name1 = row dict_relation_point13[point3] = name3, point1, name1 return dict_relation_point13 def getExerciseName(self): dict_exercise_name = {} sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \ on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for exercise_row in exercise_rows: exercise_id, resource_name = exercise_row dict_exercise_name[exercise_id] = resource_name return dict_exercise_name def getExamData(self, dict_student_score, dict_relation_point13, dict_exercise_name): for exam_item in self.exam_list_records: # exam question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = [int(x) for x in link_point_id.strip().split(',')] exam_key = '%s-%s' % (student_id, question_exercise_id) str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True for point3 in arr_points: name3, point1, name1 = dict_relation_point13[point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str(name1) is_first = False else: str_point3 += ',%d' % point3 str_name3 += ',%s' % name3 str_point1 += ',%d' % point1 str_name1 += ',%s' % name1 resource_name = dict_exercise_name[question_exercise_id] exercise_score = dict_student_score[ exam_key] if exam_key in dict_student_score else 0 a_score, i_score = self.getScore(question_num, answer) error_text = self.getErrorText(answer) print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, link_question_id, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, resource_name, exercise_score, question_exercise_id, a_score, i_score, difficulty, ret, question_num, error_text, 1) def getPracticeData(self, dict_relation_point13, dict_question_topic, dict_question_base_info): for practice_item in self.practice_list_records: # practice question, student_id, ret, answer, submit_time, question_type = practice_item str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True if question in dict_question_topic: if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: arr_point3 = [ int(x) for x in self.dict_topic_point[topic] ] for point3 in arr_point3: name3, point1, name1 = dict_relation_point13[ point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str( name1) is_first = False else: str_point3 += ',%s' % point3 str_name3 += ',%s' % name3 str_point1 += ',%s' % point1 str_name1 += ',%s' % name1 a_score, i_score = 0, 0 # 单选题 选择题 填空题 if question_type == '单选题' or question_type == '选择题' or question_type == '填空题': a_score = 5 else: a_score = 12 if ret == 1: i_score = a_score elif ret == 5: i_score = 0.5 * a_score difficulty, qtype = 1, 1 if question in dict_question_base_info: difficulty, qtype = dict_question_base_info[question] errorText = '正确' if ret == 1 else '错误' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, question, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score, i_score, difficulty, ret, 0, errorText, 0) def getData(self): dict_student_score = self.getStudentScore() dict_relation_point13 = self.getFirstPoint() # point dict_exercise_name = self.getExerciseName() dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \ ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型') self.getExamData(dict_student_score, dict_relation_point13, dict_exercise_name) # exam self.getPracticeData(dict_relation_point13, dict_question_topic, dict_question_base_info) # practice def recommendMonday(self, is_new=0, throld=500): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic d1 = datetime.datetime.now() d3 = d1 + datetime.timedelta(days=-14) str_monday = d3.strftime("%Y-%m-%d 00:00:00") self.db_fetcher.commit_sql_cmd( "delete from entity_recommend_question_bytopic", 'mysql_white_list') # update self.importDefault(0, dict_question_topic) # import is_first, insert_score = 1, 0 new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id, topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" dict_student_cnt = {} records = self.exam_list_records #+ self.practice_list_records for item in records: question, student_id, ret, answer, submit_time, question_type = item if submit_time > str_monday: if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] answer_key = '%s_%s' % (question, student_id) if answer_key in self.dict_content_answer: answer = self.dict_content_answer[answer_key] if answer == 'A' or answer == 'B': ret = 1 else: ret = 2 if ret != 1: if student_id not in dict_student_cnt: dict_student_cnt[student_id] = 0 if dict_student_cnt[student_id] < throld: # print '%s\t%s' % (student_id, link_question_id) if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, link_question_id, 0, insert_score) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, link_question_id, 0, insert_score) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) dict_student_cnt[student_id] += 1 insert_score += 1 if is_new == 1: self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')
class TextAnalysis(object): media_dict = None entity_dict = None seo_extractor = SeoExtractor('../seo/com_df_idf') def __init__(self, mysql_fetcher=None, handler_redis=None): self.mysql_fetcher = mysql_fetcher if mysql_fetcher is None: self.mysql_fetcher = DataBaseFetcher() self.handler_redis = handler_redis if handler_redis is None: self.handler_redis = redis.Redis( host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com', password='******', port=6379, db=5) self.load_entity_dict() def set_media_id(self, media_id): self.media_dict = self.get_media_dict(media_id) # 通过media_id 与向量字典关联 def set_media_dict(self, media_id, forward_index_dict): if media_id in forward_index_dict: self.media_dict = forward_index_dict[media_id] else: self.media_dict = self.get_media_dict(media_id) def load_entity_dict(self): self.entity_dict = set() # handle company company_file = '../../company_recognize/com_name2id.txt' with open(company_file) as com_f: for line in com_f: tokens = line.strip().split('\t') if len(tokens) != 2: continue self.entity_dict.add(tokens[0]) # handle industry db_industry = self.mysql_fetcher.get_sql_result( 'select distinct name from dict_industry', 'mysql_crm') for row_industry in db_industry: self.entity_dict.add(row_industry[0]) # handle organization db_organization = self.mysql_fetcher.get_sql_result( 'select distinct name_abbr from organization', 'mysql_crm') for row_organization in db_organization: self.entity_dict.add(row_organization[0]) # 获取文章关键词-权重值列表 def get_article_weight(self, media_id): response_list = self.seo_extractor.extract_article(media_id, title_weight=2, title_coeff=1.4) if response_list is None or len(response_list) == 0: return {} keyword_dict = {} # result dict for keyword, weight in response_list: if keyword in self.entity_dict: weight *= 1.4 keyword_dict[unicode(keyword, 'utf8')] = weight keyword_dict = { key: value for key, value in keyword_dict.items() if value > term_weight_threshold } if len(keyword_dict) < 5: keyword_dict = { key: value for key, value in keyword_dict.items() if value > 0.68 * term_weight_threshold } return keyword_dict # 删除历史记录的文本向量 def delete_redis_text(self, update_time_range): db_data = self.mysql_fetcher.get_sql_result( "select id from kr_articles where publish < \'%s\'" % update_time_range, 'mysql_insight') for i in range(len(db_data)): media_id = db_data[i][0] str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) self.handler_redis.delete(str_key) # 计算两个向量的余弦相似度 def cos_similarity(self, v1, v2): if len(v1) != len(v2): # 向量维数必须相等 return 0.0 fen_divisor = 0.0 # 除数 dividend_v1 = 0.0 # 被除数 dividend_v2 = 0.0 # 被除数 for pos in range(len(v1)): fen_divisor += v1[pos] * v2[pos] dividend_v1 += v1[pos]**2 dividend_v2 += v2[pos]**2 fen_dividend = math.sqrt(dividend_v1) * math.sqrt(dividend_v2) return (fen_divisor / fen_dividend if fen_dividend > 0.0 else 0.0) # 初始化文本向量 def init_vector(self, vector1, dict_weight): if len(vector1) < 1: return [] v1 = [] # 文本向量vector for pos in range(len(vector1)): if vector1[pos] in dict_weight: v1.append(dict_weight[vector1[pos]]) else: v1.append(0.0) return v1 # 重建media文本信息的正排,并更新redis def media_text_to_forward_index(self, media_id): # 获取需要的字符串 dict_term_weight = self.get_article_weight(media_id) # 存入redis 文本关键词数据 key = media_id str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) str_value = json.dumps(dict_term_weight) self.handler_redis.set(str_key, str_value) return dict_term_weight # 重建update_time_range时间之后的media文本信息的倒排,并更新redis def media_text_to_revert_index(self, update_time_range): #self.delete_redis_text(update_time_range) # 更新redis db_data = self.mysql_fetcher.get_sql_result( 'select id from kr_articles where publish >=\'%s\'' % update_time_range, 'mysql_insight') if len(db_data) < 1: return None revert_index_dict = {} # 倒排索引字典 n_count = 1 for i in range(len(db_data)): media_id = db_data[i][0] # self.cache_media_relate(media_id,update_time_range) # 缓存倒排索引 media_dict = self.get_media_dict(media_id) for term, weight in media_dict.items(): if term not in revert_index_dict: revert_index_dict[term] = {} revert_index_dict[term][media_id] = weight str_key = kRevertIndexRedisKey str_value = str(revert_index_dict) self.handler_redis.set(str_key, str_value) return revert_index_dict # 获取正排 def get_media_dict(self, media_id): media_dict = None str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) str_value = self.handler_redis.get(str_key) # 判断redis 数据库中是否包含media_id 信息 if str_value is None: media_dict = self.media_text_to_forward_index(media_id) else: media_dict = json.loads(str_value) return media_dict # 用倒排计算文本相似度 def calc_simi_using_revert_index(self, revert_index_dict, meta_dict): if revert_index_dict is None: return {} dict_molecule = {} # 分子对象 dict_denominator1 = 0.0 # 分母对象1 result_dict = {} # 结果集字典 key = media_id media_list = [] # 计算所有的d 与该文档的相似度的分子分母 for term, weight in self.media_dict.items(): dict_denominator1 += weight**2 if term not in revert_index_dict: continue doc_weight_list = revert_index_dict[term] for media_id in doc_weight_list: # 获取相关列表 media_list.append(media_id) if media_id in dict_molecule: dict_molecule[ media_id] += weight * doc_weight_list[media_id] else: dict_molecule[ media_id] = weight * doc_weight_list[media_id] for media_id in media_list: if media_id not in meta_dict: continue x, y, dict_denominator2 = meta_dict[media_id] # 分母对象2 fen_dividend = math.sqrt(dict_denominator1) * math.sqrt( dict_denominator2) result_dict[media_id] = dict_molecule[ media_id] / fen_dividend if fen_dividend > 0.0 else 0.0 return result_dict # 构造相似向量 def dict_merge2list(self, d_media, d_compare): vec_words = [] b_similarity = False for media_key in d_media: if media_key in d_compare: vec_words.append(media_key) b_similarity = True return vec_words, b_similarity # 用正排计算文本相似度 def calculate_text_weight(self, compare_id, forward_index_dict): # 判断该向量是否存在 if self.media_dict is None: return 0.0 if compare_id in forward_index_dict: compare_dict = forward_index_dict[compare_id] else: compare_dict = self.get_media_dict(compare_id) # 判断比较向量是否存在 if compare_dict is None: return 0.0 vec_words, b_similarity = self.dict_merge2list(self.media_dict, compare_dict) if b_similarity == False: return 0.0 v1 = self.init_vector(vec_words, self.media_dict) v2 = self.init_vector(vec_words, compare_dict) # 计算余弦相似度值 d_result = self.cos_similarity(v1, v2) return d_result
class Report(object): """docstring for Report""" def __init__(self, update_time='2017-11-01 00:00:00'): # do something inits self.db_fetcher = DataBaseFetcher() self.dict_content_answer = {} # error content self.update_time = update_time # update time self.recommend = RecommendQuestion() self.dict_realtion_quesion, self.dict_question_num, self.dict_question_score = self.getRelationQuestion( ) self.dict_parent_quesion = self.getChildQuestion() self.dict_question_diff = self.getRightDifficulty() self.dict_point, self.dict_topic_point = self.getPoints() self.dict_students = self.getStudentId() self.dict_student_score, self.dict_student_records = {}, { x: [] for x in self.dict_students } self.exam_list_records, self.practice_list_records = self.getExerciseRecord( ) #records # dict self.dict_question_types = { 1: '选择题', 2: '填空题', 4: '简答题', 6: '综合题', 17: '计算题', 44: '证明题' } self.dict_question_difficulty = { 1: '易', 2: '中', 3: '难', 4: '极难' } # 1:易 2:中 3:难 4:极难 # feedback self.dict_students_resource = self.getStudentResource() # resource def getExerciseRecord(self): exam_list_records = [] practice_list_records = [] sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \ neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % ( self.update_time) dict_is_choice = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question, student_id, ret, answer, submit_time, question_type = row student_id = long(student_id) if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len( answer) == 0: continue if question in self.dict_realtion_quesion: exam_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_content_answer['%s_%s' % (question, student_id)] = answer elif question in self.dict_parent_quesion: dict_content = self.dict_parent_quesion[question] parent_question_id = dict_content['parent_id'] rank = dict_content['rank'] link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ parent_question_id] if question_num >= 17 and question_num <= 21: if rank == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 2: self.dict_content_answer['%s_%s' % (parent_question_id, student_id)] = answer elif question_num >= 22 and question_num <= 23: key = '%s_%s' % (parent_question_id, student_id) if rank == 1: if answer == 'A': dict_is_choice[key] = 1 else: dict_is_choice[key] = 0 elif rank == 2: if key in dict_is_choice: if dict_is_choice[key] == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 3: self.dict_content_answer[key] = answer else: practice_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) return exam_list_records, practice_list_records def getStudentId(self, file_name='students.txt'): list_students = [] with open(file_name) as handle_f: for line in handle_f: system_id = long(line.strip()) list_students.append(system_id) set_studends = set(list_students) # students set str_content = ','.join([str(x) for x in set_studends]) sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content dict_students = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, name = row if student_id in set_studends: dict_students[student_id] = name return dict_students def getRightDifficulty(self): dict_question_diff = {} sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_diff def getChildQuestion(self): dict_parent_list = {} dict_parent_question = {} sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \ where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid = row[0] parent_id = row[1] if parent_id not in dict_parent_list: dict_parent_list[parent_id] = [] dict_parent_list[parent_id].append(qid) for parent_id in dict_parent_list: sort_question = sorted(dict_parent_list[parent_id], reverse=False) count = 1 for item in sort_question: dict_parent_question[item] = { 'parent_id': parent_id, 'rank': count } count += 1 return dict_parent_question def getRelationQuestion(self): dict_realtion_quesion = {} dict_point_question_num = {} dict_question_score = {} sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type, a.score \ from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = b.id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') dict_parent_quesion = {} link_parent_list = [] for row in rows: link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type, score = row dict_realtion_quesion[ link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type dict_question_score[link_question_id] = score arr_point = link_point_id.strip().split(',') for point in arr_point: if point not in dict_point_question_num: dict_point_question_num[point] = [] dict_point_question_num[point].append([ link_question_id, question_num, difficulty, question_exercise_id, question_type ]) return dict_realtion_quesion, dict_point_question_num, dict_question_score def getScore(self, question_num, answer, ascore): ascore, score = 0, 0 if question_num >= 1 and question_num <= 16: score = 5 if (answer == 'A' or answer == 'B') else 0 ascore = 5 elif question_num >= 17 and question_num <= 23: if answer == 'A': if ascore == 10: score = 4 elif ascore == 12: score = 4 elif ascore == 13: score = 5 elif ascore == 14: score = 5 elif ascore == 15: score = 6 elif answer == 'B': if ascore == 10: score = 7 elif ascore == 12: score = 8 elif ascore == 13: score = 9 elif ascore == 14: score = 10 elif ascore == 15: score = 11 elif answer == 'C': if ascore == 10: score = 9 elif ascore == 12: score = 11 elif ascore == 13: score = 12 elif ascore == 14: score = 13 elif ascore == 15: score = 15 elif answer == 'D': score = ascore return score def getRecentlyExercise(throld=5): set_recent = set() sql = "select distinct question_exercise_id from link_question_answer order by question_exercise_id desc limit %s" % throld rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: eid = row eid = long(eid) set_recent.add(eid) return set_recent def getAllScore(self, link_question_id, question_num): ascore = 0 if link_question_id in self.dict_question_score: ascore = self.dict_question_score[link_question_id] else: if question_num >= 1 and question_num <= 16: ascore = 5 elif question_num >= 17 and question_num <= 21: ascore = 12 elif question_num >= 22 and question_num <= 23: ascore = 10 return ascore def statQustionReport(self, range_throld=4, range_target=20): dict_students_point_question = {} dict_student_score = {} for item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = link_point_id.strip().split(',') if student_id not in dict_student_score: dict_student_score[student_id] = 0 ascore = self.getAllScore(link_question_id, question_num) score = self.getScore(question_num, answer, ascore) dict_student_score[student_id] += score is_right = (answer == 'A') or (answer == 'B') if student_id not in dict_students_point_question: dict_students_point_question[student_id] = {} # student id point_name = '-'.join([str(x) for x in arr_points]) # name if point_name not in dict_students_point_question[student_id]: dict_students_point_question[student_id][point_name] = [] dict_students_point_question[student_id][point_name].append( (link_question_id, is_right, question_type, difficulty, submit_time)) # question dict_res_student_point_question = {} for student_id in dict_students_point_question: target_rate = 0.5 if student_id in dict_student_score: target_rate = (dict_student_score[student_id] * 0.2 * 1.2) / 150 dict_point_student_question = dict_students_point_question[ student_id] dict_res_student_point_question[student_id] = {} for point_name in dict_point_student_question: arr_question = dict_point_student_question[point_name] target_questions = [] sort_question = sorted(arr_question, key=lambda x: x[-1], reverse=True) right_rate, cnt_rate = 0, range_target if len(sort_question) < cnt_rate: cnt_rate = len(sort_question) for item in sort_question[:cnt_rate]: link_question_id, is_right, qtype, difficulty, submit_time = item if is_right == 0: target_questions.append( (link_question_id, qtype, difficulty, submit_time)) else: right_rate += 1 if right_rate / cnt_rate < target_rate and len( target_questions) > 0: dict_res_student_point_question[student_id][ point_name] = target_questions return dict_res_student_point_question def getErrorText(self, answer): if answer == 'C': return '审题时,粗心看错了' elif answer == 'D': return '审题时,题意读不懂' elif answer == 'E': return '析题时,思路模糊不清' elif answer == 'F': return '析题时,思路方向错误' elif answer == 'G': return '解题时,公式记错了' elif answer == 'H': return '解题时,计算出错了' elif answer == 'I': return '未作答(或未完成),时间来不及了' elif answer == 'A' or answer == 'B': return '答题正确' def getQuestionWords(self): dict_question_words = {} str_question = ','.join([ str(self.dict_realtion_quesion[x][0]) for x in self.dict_realtion_quesion ]) str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \ t.type_id=c.question_type_id where c.id in (%s)" % str_question s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata') for row in s_rows: question, subject_id, type_id, struct_id, json_data = row if json_data != '' and json_data != 'None' and json_data != None: keywords, res = sq.predict(question, subject_id, type_id, struct_id, json_data) dict_question_words[question] = keywords, res return dict_question_words def getPoint2Question(self): dict_point_org_question = {} for answer_id in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ answer_id] arr_point = link_point_id.strip().split(',') for item in arr_point: if item not in dict_point_org_question: dict_point_org_question[item] = [] dict_point_org_question[item].append(link_question_id) return dict_point_org_question def pointsRecQuestion(self, dict_students_point_question, throld=4, error_range=3): # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty # dict_point_org_question = self.getPoint2Question() # org question point dict_question_text = self.getQuestionWords() dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic # self.recommendMonday(dict_question_base_info) # error questionsss dict_question_target = {} dict_student_recommend_question = {} cid = 1 student_target = open('student_target.txt', 'w') for student_id in dict_students_point_question: sort_student_item = sorted( dict_students_point_question[student_id].items(), key=lambda x: len(x[-1]), reverse=False) student_name = self.dict_students[student_id] haveTime = 60 recommend_questions = [] pre_set = set() for item_target in sort_student_item: if haveTime <= 0: break item_point, arr_question = item_target item_one = item_point.strip().split('-')[0] key = '%s-%s' % (student_id, item_one) pre_set = pre_set | set([x[0] for x in arr_question]) # if len(arr_question) > error_range: arr_question = arr_question[:error_range] for base_question, qtype, difficulty, submit_time in arr_question: if haveTime <= 0: break if qtype == '选择题': qtype = 1 elif qtype == '填空题': qtype = 2 else: qtype = 3 keywords, text = dict_question_text[base_question] text = text.replace('\n', '').strip('\r').strip() # filter base_question_key = '%s-%s' % (base_question, difficulty) if base_question_key in dict_question_target: target_questions = dict_question_target[ base_question_key] else: target_questions = self.recommend.getEsResult( base_question, text, keywords, difficulty, qtype, pre_set, throld) dict_question_target[ base_question_key] = target_questions for rec_question in target_questions: if haveTime <= 0: break if rec_question in pre_set: continue haveTime += -1 recommend_questions.append( (base_question, keywords, rec_question, item_one, difficulty)) pre_set.add(rec_question) insert_score = 0 for rec_item in recommend_questions: source_question, keywords, rec_question, item_point, difficulty = rec_item point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ item_point] type_id = 2186 if rec_question in dict_question_base_info: difficulty, question_type, upload_id = dict_question_base_info[ rec_question] else: upload_id = 0 if rec_question in dict_question_topic: set_topic = dict_question_topic[rec_question] if len(set_topic) > 0: type_id = set_topic.pop() str_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') print "%s\t%s\t1\t%s\t%s\t针对练习\t重点\t%s\t%s\t%s\t%s\t%s" % ( cid, student_id, rec_question, 4, insert_score, str_time, 1, 3, type_id) student_target.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (cid, student_id, student_name, item_point, point_name, source_question, rec_question, keywords, difficulty)) insert_score += 1 cid += 1 def getPoints(self): dict_point = {} dict_topic_point = {} sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points' # do something rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row point_id = str(point_id) dict_point[ point_id] = name, ptype, question_type, level, parent_id, link_id if ptype == 3: # arr_topic = [long(x) for x in topics_id.strip().split(',')] for topic in arr_topic: if topic not in dict_topic_point: dict_topic_point[topic] = [] dict_topic_point[topic].append(point_id) return dict_point, dict_topic_point def getStudentResource(self): dict_students_resource = {} str_content = ','.join([str(x) for x in self.dict_students]) sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \ a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % ( str_content) rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, resource_id, deadline = row if resource_id > 0: if student_id not in dict_students_resource: dict_students_resource[student_id] = set() dict_students_resource[student_id].add(resource_id) return dict_students_resource def getQuestionBaseInfo(self, question_set): dict_question_difficulty = {} str_content = ','.join([str(x) for x in question_set]) sql = "select b.id, b.difficulty from neworiental_v3.entity_question where b.id in (%s)" % str_content rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_difficulty def getStudentScore(self): dict_student_score = {} for exam_item in self.exam_list_records: exam_question, student_id, ret, answer, submit_time, question_type = exam_item if exam_question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ exam_question] ascore = self.getAllScore(link_question_id, question_num) i_score = self.getScore(question_num, answer, ascore) exam_key = '%s-%s' % (student_id, question_exercise_id) if exam_key not in dict_student_score: dict_student_score[exam_key] = 0 dict_student_score[exam_key] += i_score return dict_student_score def getFirstPoint(self): # point dict_relation_point13 = {} sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points \ b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point3, name3, point1, name1 = row dict_relation_point13[point3] = name3, point1, name1 return dict_relation_point13 def getExerciseName(self): dict_exercise_name = {} sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \ on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for exercise_row in exercise_rows: exercise_id, resource_name = exercise_row dict_exercise_name[exercise_id] = resource_name return dict_exercise_name def getExamData(self, dict_student_score, dict_relation_point13, dict_exercise_name): for exam_item in self.exam_list_records: # exam question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = [int(x) for x in link_point_id.strip().split(',')] exam_key = '%s-%s' % (student_id, question_exercise_id) str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True for point3 in arr_points: name3, point1, name1 = dict_relation_point13[point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str(name1) is_first = False else: str_point3 += ',%d' % point3 str_name3 += ',%s' % name3 str_point1 += ',%d' % point1 str_name1 += ',%s' % name1 resource_name = dict_exercise_name[question_exercise_id] exercise_score = dict_student_score[ exam_key] if exam_key in dict_student_score else 0 a_score, i_score = self.getScore(question_num, answer) error_text = self.getErrorText(answer) print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, link_question_id, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, resource_name, exercise_score, question_exercise_id, a_score, i_score, difficulty, ret, question_num, error_text, 1) def getPracticeData(self, dict_relation_point13, dict_question_topic, dict_question_base_info): for practice_item in self.practice_list_records: # practice question, student_id, ret, answer, submit_time, question_type = practice_item str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True if question in dict_question_topic: if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: arr_point3 = [ int(x) for x in self.dict_topic_point[topic] ] for point3 in arr_point3: name3, point1, name1 = dict_relation_point13[ point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str( name1) is_first = False else: str_point3 += ',%s' % point3 str_name3 += ',%s' % name3 str_point1 += ',%s' % point1 str_name1 += ',%s' % name1 a_score, i_score = 0, 0 # 单选题 选择题 填空题 if question_type == '单选题' or question_type == '选择题' or question_type == '填空题': a_score = 5 else: a_score = 12 if ret == 1: i_score = a_score elif ret == 5: i_score = 0.5 * a_score difficulty, qtype = 1, 1 if question in dict_question_base_info: difficulty, qtype, upload_id = dict_question_base_info[ question] errorText = '正确' if ret == 1 else '错误' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, question, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score, i_score, difficulty, ret, 0, errorText, 0) def getData(self): dict_student_score = self.getStudentScore() dict_relation_point13 = self.getFirstPoint() # point dict_exercise_name = self.getExerciseName() dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \ ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型') self.getExamData(dict_student_score, dict_relation_point13, dict_exercise_name) # exam self.getPracticeData(dict_relation_point13, dict_question_topic, dict_question_base_info) # practice def recommendMonday(self, dict_question_base_info, is_new=1, throld=500): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic d1 = datetime.datetime.now() d3 = d1 + datetime.timedelta(days=-7) str_monday = d3.strftime("%Y-%m-%d 00:00:00") is_first, insert_score = 1, 0 dict_student_cnt = {} records = self.exam_list_records #+ self.practice_list_records for item in records: question, student_id, ret, answer, submit_time, question_type = item if submit_time > str_monday: if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] answer_key = '%s_%s' % (question, student_id) if answer_key in self.dict_content_answer: answer = self.dict_content_answer[answer_key] if answer == 'A' or answer == 'B': ret = 1 else: ret = 2 if ret != 1: if student_id not in dict_student_cnt: dict_student_cnt[student_id] = 0 if dict_student_cnt[student_id] < throld: # print '%s\t%s' % (student_id, link_question_id) if link_question_id in dict_question_base_info: difficulty, question_type, upload_id = dict_question_base_info[ link_question_id] else: upload_id = 0 print "%s\t1\t%s\t%s\t\'最近新错\'\t\'重难点\'\t%s\t%s\t%s\t%s" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) dict_student_cnt[student_id] += 1 insert_score += 1
class SimiTags(object): def __init__(self): self.db_fetcher = DataBaseFetcher() self.tag2comlist = self.load_info( 'select cid,tag from crm.company_tag') self.tag_child = self.load_info( 'select tag,parent_tag from crm.tag_tag_relation') self.tag_parent = self.load_info( 'select parent_tag,tag from crm.tag_tag_relation') self.total_num, self.manual_tag_count = self.tag_cid() def load_info(self, cmd): temp_dict = {} db_row = self.db_fetcher.get_sql_result(cmd, 'mysql_readall') for row in db_row: leaf, root = row if root not in temp_dict: temp_dict[root] = [] temp_dict[root].append(leaf) return temp_dict def tag_cid(self): db_row = self.db_fetcher.get_sql_result( 'select count(distinct cid) from crm.company_tag', 'mysql_readall') total_num = db_row[0][0] manual_tag_count = {} db_row = self.db_fetcher.get_sql_result( 'select cid,tag from crm.company_tag where type != 5', 'mysql_readall') for row in db_row: crm_id, tag = row if tag not in manual_tag_count: manual_tag_count[tag] = 0 manual_tag_count[tag] += 1 return total_num, manual_tag_count def get_correlation(self): similarity_tag = {} tag_corre_dict = {} f = 0 for tag1 in self.tag2comlist: tag_corre_dict = {} if tag1.lower() != '人工智能': continue if tag1 not in self.manual_tag_count: continue manual_num_x = self.manual_tag_count[tag1] if manual_num_x < 30: continue l1 = self.tag2comlist[tag1] sum_x = len(l1) if sum_x < 100: continue for tag2 in self.tag2comlist: if tag1 == tag2: continue if tag2 not in self.tag2comlist or tag2 not in self.manual_tag_count: continue manual_num = self.manual_tag_count[tag2] if manual_num < 2: continue l2 = self.tag2comlist[tag2] sum_y = len(l2) if sum_y > sum_x * 1.5: continue xy = len(set(l1) & set(l2)) if xy < 5: continue pf = xy * 1.0 / sum_x if pf > 0.6: continue pc = xy * 1.0 / sum_y pnc = (sum_x - xy + 1) * 1.0 / (self.total_num - sum_y) if pc < 0.01: continue if pnc != 0: value = pc / pnc if value < 1: continue correlation = math.pow(xy, 0.2) * math.pow(value, 0.8) tag_corre_dict[tag2] = correlation tag_items = tag_corre_dict.items() tag_heap = heapq.nlargest(len(tag_items), tag_items, key=lambda x: x[-1]) tag_heap = tag_heap[:40] if len(tag_heap) == 0: continue re = map(lambda x: x[0], tag_heap) if len(re) == 0: continue similarity_tag[tag1] = re print tag1, '\t', "%s" % json.dumps( re, ensure_ascii=False).encode('utf8') return similarity_tag def rm_sub(self, tag_heap): abandon_l = [] for i in range(0, len(tag_heap)): tag1, weight = tag_heap[i] if tag1 not in self.tag2comlist: continue l1 = self.tag2comlist[tag1] sum_x = len(l1) tag1_p, tag1_c = [], [] if tag1 in self.tag_child: tag1_c = self.tag_child[tag1] if tag1 in self.tag_parent: tag1_p = self.tag_parent[tag1] for j in range(i + 1, len(tag_heap) - 1): tag2, weight = tag_heap[j] if tag2 in tag1_p or tag2 in tag1_c: abandon_l.append(j) continue if tag1.find(tag2) != -1 or tag2.find(tag1) != -1: abandon_l.append(j) continue l2 = self.tag2comlist[tag2] sum_y = len(l2) xy = len(set(l1) & set(l2)) if (xy * 1.0 / sum_y) > 0.8: abandon_l.append(j) if (xy * 1.0 / sum_x) > 0.6: abandon_l.append(j) re_l = [] for i in range(0, len(tag_heap) - 1): if i in abandon_l: continue else: re_l.append(tag_heap[i][0]) return re_l
#!/usr/bin/env python # -*- coding: utf8 -*- import os, json, datetime, re, time import sys reload(sys) sys.setdefaultencoding('utf-8') sys.path.append( os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) from company_recognize.report_to_company import ComLib from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() cur_abs_path = os.path.dirname(os.path.abspath(__file__)) path_segment = '../../company_recognize/segment/data' file_com_name2id = '../../company_recognize/com_name2id.txt' file_filter_words = '../../company_recognize/com_filter.txt' comlib = ComLib(path_segment, file_com_name2id, None, file_filter_words) def get_event_type(title): etype = 0 rongzi_list = [ '融资', '投资', '美元', '人民币', '种子轮', '天使轮', 'PreA', 'A轮', 'B轮', 'C轮', 'D轮', 'E轮', 'IPO', '上市', '私有化', '收购', '并购', '合并', '倒闭' ] new_list = [ '发布', '推出', '研发', '启动', '新品', '新产品', '版本', '业务', '进军', '入股', '战略', '上线' ] biandong_list = [
class QualityQuestion(object): """docstring for QualityQuestion""" def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '') # redis self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion( ) self.dict_question_topic = self.getQuestionTopic() self.dict_topic = self.getTopicDict() def getQuestionTopic(self, save_file='topic.txt'): dict_question_topic = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue tid, question_id, topic_id = arr[0], long(arr[1]), int(arr[2]) if question_id not in dict_question_topic: dict_question_topic[question_id] = set() dict_question_topic[question_id].add(topic_id) max_num = tid sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: tid, question_id, topic_id = row if question_id not in dict_question_topic: dict_question_topic[question_id] = set() dict_question_topic[question_id].add(topic_id) write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id)) return dict_question_topic def getTopicDict(self): dict_topic = {} sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: topic_id, name = row topic_id = int(topic_id) dict_topic[topic_id] = name return dict_topic def makeSet2Dict(self, set_simple): dict_simple = {} for item in set_simple: arr = item.split('\t') qid, question_type, json_data = arr[0], arr[1], arr[2] qid = long(qid) dict_simple[qid] = question_type, json_data return dict_simple def getSimpleQustion(self): dict_simple_science = {} dict_simple_art = {} is_redis = self.redis_cache.exists( '62951166961') and self.redis_cache.exists('62951084550') if is_redis: # science set_simple_science = self.redis_cache.smembers('62951166961') dict_simple_science = self.makeSet2Dict(set_simple_science) # art set_simple_art = self.redis_cache.smembers('62951084550') dict_simple_art = self.makeSet2Dict(set_simple_art) else: sql = "SELECT id, question_type, json_data, upload_id from neworiental_v3.entity_question where upload_id in (62951166961, 62951084550) and json_data !=\'\' " rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid, question_type, json_data, upload_id = row upload_id = str(upload_id) json_data = json_data.replace('\t', '') if upload_id == '62951166961': # students majored in science dict_simple_science[qid] = question_type, json_data elif upload_id == '62951084550': dict_simple_art[qid] = question_type, json_data str_content = '%s\t%s\t%s' % (qid, question_type, json_data) self.redis_cache.sadd(upload_id, str_content) return dict_simple_science, dict_simple_art def getTopicSimpleQuestion(self): dict_topic_question = {} for qid in self.dict_simple_art: if qid in self.dict_question_topic: set_topic = self.dict_question_topic[qid] for topic in set_topic: if topic not in dict_topic_question: dict_topic_question[topic] = set() dict_topic_question[topic].add(qid) for topic in dict_topic_question: set_topic = dict_topic_question[topic] str_content = '\t'.join([str(x) for x in set_topic]) print '%s\t%s\t%s' % (topic, self.dict_topic[topic], str_content)