Example #1
0
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '')  # redis
        self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion(
        )

        self.dict_question_topic = self.getQuestionTopic()
        self.dict_topic = self.getTopicDict()
Example #2
0
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.dict_topic = self.getTopicDict()  # topic
        self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5}

        self.dict_question_base_info = self.getQuestionBaseInfo()  # base
        self.dict_question_quality = self.getQuestionQuality()
        self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic(
        )  # topic
Example #3
0
 def __init__(self):
     self.db_fetcher = DataBaseFetcher()
     self.tag2comlist = self.load_info(
         'select cid,tag from crm.company_tag')
     self.tag_child = self.load_info(
         'select tag,parent_tag from crm.tag_tag_relation')
     self.tag_parent = self.load_info(
         'select parent_tag,tag from crm.tag_tag_relation')
     self.total_num, self.manual_tag_count = self.tag_cid()
Example #4
0
    def __init__(self, stage, subject):
        self.fetcher = DataBaseFetcher()
        self.topicDict, self.topicImportDict = self.getTopic(stage, subject)
        self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic(
        )  # static

        # student exercise
        self.recordList = self.cacheRecord('question_result.txt')
        self.topicStudentDict = self.getTopicStudent()
Example #5
0
 def __init__(self, mysql_fetcher=None, handler_redis=None):
     self.mysql_fetcher = mysql_fetcher
     if mysql_fetcher is None:
         self.mysql_fetcher = DataBaseFetcher()
     self.handler_redis = handler_redis
     if handler_redis is None:
         self.handler_redis = redis.Redis(
             host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com',
             password='******',
             port=6379,
             db=5)
     self.load_entity_dict()
Example #6
0
    def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'):
        # do something inits
        self.db_fetcher = DataBaseFetcher()
        self.exercise_id = exercise_id
        self.dict_content_answer = {}  # error content
        self.curr_time = datetime.datetime.now()

        self.update_time = update_time  # update time
        self.recommend = RecommendQuestion()

        self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion(
        )
        self.dict_parent_quesion = self.getChildQuestion()
        self.dict_question_diff = self.getRightDifficulty()
        self.dict_point, self.dict_topic_point = self.getPoints()

        self.dict_students = self.getStudentId()
        self.dict_student_score, self.dict_student_records = {}, {
            x: []
            for x in self.dict_students
        }

        self.exam_list_records, self.practice_list_records = self.getExerciseRecord(
        )  #records

        # dict
        self.dict_question_types = {
            1: '选择题',
            2: '填空题',
            4: '简答题',
            6: '综合题',
            17: '计算题',
            44: '证明题'
        }
        self.dict_question_difficulty = {
            1: '易',
            2: '中',
            3: '难',
            4: '极难'
        }  # 1:易 2:中 3:难 4:极难

        # feedback
        self.dict_students_resource = self.getStudentResource()  # resource
Example #7
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

import json, urllib, urllib2, json, chardet, re
from http_request import *

from bs4 import BeautifulSoup

sys.path.append('../../')
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()  # database

jquery_host = "http://jquery.cuishifeng.cn"
jquery_url = "http://jquery.cuishifeng.cn/index.html"


def parse_jquery_attribute(attribute_url):
    attribute_html = get_request(attribute_url, jquery_host, {})
    soup = BeautifulSoup(attribute_html, 'lxml')
    content = soup.find_all('div', id='content')

    # retVal
    retVal = content.h2.span.text if content.h2 is not None else ''

    # summarize
    summarize = soup.find('div', class_='desc')
Example #8
0
#!/usr/bin/env python
# -*- coding: utf8 -*-

import os,json,datetime,re,time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()

def get_domain_list(url_file = 'urls.txt'):
    domain_list = []
    with open(url_file) as url_f:
        for line in url_f:
            url = line.strip()
            if 'https' in url:
                start_pos = 8
            else:
                start_pos = 7

            str_url = url[start_pos:]
            end_pos = str_url.find('/')
            if end_pos  == -1: end_pos = len(str_url)
            domain = str_url[:end_pos]
            domain_list.append(domain)

    return domain_list

def create_html_table(datas, title=None):
Example #9
0
#!/usr/bin/env python
#coding=utf8
from __future__ import division

import sys,os
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
import json,math,MySQLdb,datetime,time
import similary_question as sq

from recommend import RecommendQuestion
from common.db_fetcher import DataBaseFetcher

recommend = RecommendQuestion() # object
db_fetcher = DataBaseFetcher()

question_list = [] # set
delete_sql = "DELETE FROM error_question_log WHERE question_id IN ( select c.question_id from (SELECT a.question_id FROM error_question_log a JOIN neworiental_v3.entity_question b ON a.question_id = b.id WHERE b.subject_id != 4) c)"
db_fetcher.commit_sql_cmd(delete_sql, 'mysql_logdata') # update

rows = db_fetcher.get_sql_result("select question_id from error_question_log", 'mysql_logdata')
for row in rows:
    qid = row[0]
    question_list.append(qid)

def getQuestionWords(question_list): # question word
    dict_question_info = {}
    lSet = int(len(question_list) / 1000) + 2
    start, end = 0, 0
    for i in range(1, lSet):
Example #10
0
class CzMathGraph(object):
    """docstring for czMathGraph"""
    def __init__(self, stage, subject):
        self.fetcher = DataBaseFetcher()
        self.topicDict, self.topicImportDict = self.getTopic(stage, subject)
        self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic(
        )  # static

        # student exercise
        self.recordList = self.cacheRecord('question_result.txt')
        self.topicStudentDict = self.getTopicStudent()

    # sample
    def cacheRecord(self, fName, maxWin=3000000):
        recordList = []
        pos = 0
        with open(fName) as handleFile:
            for line in handleFile:
                arrContent = line.strip().split('\t')
                studyId, questionId, res = int(arrContent[1]), int(
                    arrContent[2]), arrContent[5]
                isFlag = False
                if questionId in self.quesTopicDict:
                    topicSet = self.quesTopicDict[questionId]
                    for topic in topicSet:
                        if topic in self.topicDict:
                            isFlag = True
                            break

                if isFlag == False: continue
                # normalize data
                try:
                    res = int(res)
                except:
                    try:
                        result = 0
                        decode_json = json.loads(res)
                        for resDict in decode_json:
                            if resDict['result'] == 1:
                                result += 1
                            else:
                                result -= 1

                        res = 1 if result > 0 else 2
                    except:
                        pass

                if res != 1: continue  # 过滤负样本
                recordList.append((studyId, questionId, res))
                pos += 1
                if pos >= maxWin:
                    break

        return recordList

    def getTopicStudent(self):
        topicStudentDict = {}
        for item in self.recordList:
            studyId, questionId, res = item

            if questionId not in self.quesTopicDict: continue
            for topic in self.quesTopicDict[questionId]:
                if topic not in topicStudentDict:
                    topicStudentDict[topic] = {}

                if studyId not in topicStudentDict[topic]:
                    topicStudentDict[topic][studyId] = 0

                topicStudentDict[topic][studyId] += 1

        return topicStudentDict

    # 统计条件概率
    def staticTopicProb(self, topic, topicList):
        if topic not in self.topicStudentDict:
            return 0.0

        fenmuSet = set()  # 条件student集合
        isFirst = True
        for item in topicList:
            if item not in self.topicStudentDict: continue
            itemSet = set([x for x in self.topicStudentDict[item]])
            if isFirst:
                fenmuSet = itemSet
            else:
                fenmuSet = fenmuSet & itemSet

        fenmu, fenzi = 0.0, 0.0
        for student in fenmuSet:
            for item in topicList:
                fenmu += self.topicStudentDict[item][student]

            if student in self.topicStudentDict[topic]:
                fenzi += self.topicStudentDict[topic][student]

        p = fenzi / fenmu if fenmu > 0 and fenzi < fenmu else 0.0
        return p

    #  获取词典
    def getTopic(self, stage, subjectId):
        topicDict = {}
        topicImportDict = {}
        sql = "select id, name, is_important from entity_topic where subject_id= %s and stage_id = %s" % (
            subjectId, stage)
        db_rows = self.fetcher.get_sql_result(sql, "mysql_v3")
        for row in db_rows:
            topicId, name, important = row
            topicDict[topicId] = name
            topicImportDict[topicId] = important

        return topicDict, topicImportDict

    # 获取问题主题
    def getQuestionTopic(self):
        quesTopicDict = {}
        topicQuestionDict = {}
        db_rows = self.fetcher.get_sql_result(
            "select topic_id, question_id from link_question_topic ",
            "mysql_v3")
        for row in db_rows:
            topicId, questionId = row
            if topicId not in topicQuestionDict:
                topicQuestionDict[topicId] = []

            topicQuestionDict[topicId].append(questionId)

            if questionId not in quesTopicDict:
                quesTopicDict[questionId] = []

            quesTopicDict[questionId].append(topicId)

        quesTopicDict = {
            key: set(value)
            for key, value in quesTopicDict.items()
        }  # list to set
        topicQuestionDict = {
            key: set(value)
            for key, value in topicQuestionDict.items()
        }

        return quesTopicDict, topicQuestionDict

    # 统计问题正确率
    def statisQuestion(self, subject_id):
        questionDict = {}
        db_rows = self.fetcher.get_sql_result(
            "select question_id, sum(answer_num) answernum, sum(right_num) rightnum from entity_trail_question where subject_id = %s GROUP BY question_id "
            % subject_id, "mysql_v3")
        for row in db_rows:
            questionId, answer, right = row
            questionDict[questionId] = answer, right

        return questionDict

    def isLoopGraph(self, pairTopic, iniRelList, childDict):
        startTopic, endTopic = pairTopic
        if startTopic == endTopic: return True  # loop
        stackList = [startTopic]
        missSet, popSet = set(), set()  # init set and pop set
        while len(stackList) > 0:
            if endTopic in missSet:
                return True  # the graph is unicom

            topic = stackList[-1]
            stackList.pop()  # pop
            if (topic not in childDict) or topic in missSet: continue
            popSet.add(topic)  # add pop set
            childSet = childDict[topic]
            for child in childSet:
                stackList.append(child)
                missSet.add(child)

        return False

    # 构造有像图知识网络
    def getTopicDAG(self, threshold=30, inDegreeThr=3, outDegreeThr=4):
        iniRelList = []
        relDict = {}
        topicList = [topic for topic in self.topicDict]
        for i in xrange(len(topicList)):
            for j in xrange(i + 1, len(topicList)):
                if topicList[i] in self.topicQuestionDict and topicList[
                        j] in self.topicQuestionDict:
                    iSet, jSet = self.topicQuestionDict[
                        topicList[i]], self.topicQuestionDict[topicList[j]]
                    key = "%s-%s" % (topicList[i], topicList[j])
                    mLen = len(iSet & jSet)
                    if mLen > threshold:
                        relDict[key] = mLen

        sortRel = sorted(relDict.items(), key=lambda x: x[1], reverse=True)
        inDegreeDict = {}  # 入度记录字典
        outDegreeDict = {}  #  出度记录字典
        childDict = {}  # child
        for item in sortRel:
            inDegreeThr, outDegreeThr = 3, 4
            arrTopic = [int(x) for x in item[0].split("-")]
            if len(arrTopic) != 2: continue

            start, end = arrTopic[0], arrTopic[1]

            # 判断转移方向
            p0 = self.staticTopicProb(start, [end])
            p1 = self.staticTopicProb(end, [start])
            pairTopic = (start, end) if p1 < p0 else (end, start)
            start, end = pairTopic  #

            if self.topicImportDict[start] < 1:
                outDegreeThr = 1

            if self.topicImportDict[end] < 1:
                inDegreeThr = 1

            inDegree = inDegreeDict[end] if end in inDegreeDict else 0
            outDegree = outDegreeDict[start] if start in outDegreeDict else 0

            isLoop = self.isLoopGraph(pairTopic, iniRelList, childDict)
            bFlag = (isLoop == False) and (inDegree < inDegreeThr) and (
                outDegree < outDegreeThr)
            if bFlag:
                iniRelList.append(pairTopic)
                if end not in childDict:
                    childDict[end] = set()

                childDict[end].add(start)
                # 出度 and 入度
                outDegreeDict[start] = 1 + outDegree
                inDegreeDict[end] = 1 + inDegree

        return iniRelList

    # 构建系数矩阵数组
    def getRowsNormalize(self, topicList):
        rowsData = []
        topicLen = len(topicList)
        topicIndex = {topicList[i]: i for i in xrange(topicLen)}
        for item in self.recordList:
            studyId, questionId, res = item
            arr = [0] * topicLen
            if questionId in self.quesTopicDict:
                topicSet = self.quesTopicDict[questionId]
                for topic in topicSet:
                    if topic in topicIndex:
                        arr[topicIndex[topic]] = 1

            sparr = pd.SparseArray(arr)
            rowsData.append(arr)

        return rowsData

    def makeModel(self):
        # graph structure
        initRelation = self.getTopicDAG()  # 概率图构建
        relationList = []
        stuModel = BayesianModel()  # DAG
        for edge in initRelation:
            try:
                start, end = edge
                stuModel.add_edge(str(start), str(end))
                relationList.append(edge)
            except:
                continue

        # save file
        with open('model.txt', 'w') as write_f:
            for item in relationList:
                write_f.write(
                    '%s,%s\n' %
                    (self.topicDict[item[0]], self.topicDict[item[1]]))

        # learning from data
        topicList = [topic for topic in self.topicDict]
        rowsData = self.getRowsNormalize(topicList)
        print 'the rows len is:%s' % len(rowsData)
Example #11
0
class RecommendQuestion(object):
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.dict_topic = self.getTopicDict()  # topic
        self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5}

        self.dict_question_base_info = self.getQuestionBaseInfo()  # base
        self.dict_question_quality = self.getQuestionQuality()
        self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic(
        )  # topic

    def getThisQuestionTopic(self):
        return self.dict_question_topic

    def getThisQuestionBaseInfo(self):
        return self.dict_question_base_info

    def getQuestionBaseInfo(self, save_file='question_base.txt'):
        dict_question_base_info = {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue

                qid, difficulty, question_type, upload_id = long(arr[0]), int(
                    arr[1]), int(arr[2]), str(arr[3]).strip()
                try:
                    upload_id = long(upload_id)
                except:
                    upload_id = 0

                dict_question_base_info[
                    qid] = difficulty, question_type, upload_id
                max_num = qid

        sql = "select id, difficulty, question_type, upload_id from neworiental_v3.entity_question where subject_id = 4 and id > %s and state != \'DISABLED\' order by id asc" % max_num
        rows = self.db_fetcher.get_sql_result(sql, "mysql_logdata")
        with open(save_file, 'a') as write_f:
            for row in rows:
                qid, difficulty, str_question_type, upload_id = row
                question_type = 1
                if str_question_type == '选择题' or str_question_type == '单选题':
                    question_type = 1
                elif str_question_type == '填空题':
                    question_type = 2
                else:
                    question_type = 3

                dict_question_base_info[
                    qid] = difficulty, question_type, upload_id
                write_f.write('%s\t%s\t%s\t%s\n' %
                              (qid, difficulty, question_type, upload_id))

        return dict_question_base_info

    def getQuestionQuality(self, save_file='quality.txt'):
        dict_question_quality = {}

        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                qid, question_id, extra_score = arr[0], long(arr[1]), int(
                    arr[2])
                dict_question_quality[question_id] = extra_score
                max_num = qid

        sql = "select id, question_id, extra_score from neworiental_v3.entity_question_quality where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                qid, question_id, extra_score = row

                dict_question_quality[question_id] = extra_score
                write_f.write('%s\t%s\t%s\n' % (qid, question_id, extra_score))

        return dict_question_quality

    def getQuestionTopic(self, save_file='topic.txt'):
        dict_question_topic, dict_topic_question = {}, {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                tid, question_id, topic_id = arr[0], arr[1], arr[2]
                question_id = long(question_id)
                topic_id = long(topic_id)

                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()
                if topic_id not in dict_topic_question:
                    dict_topic_question[topic_id] = {}

                extra_score = 3
                if question_id in self.dict_question_quality:
                    extra_score = self.dict_question_quality[question_id]

                difficulty, question_type = 1, 1
                if question_id in self.dict_question_base_info:
                    difficulty, question_type, upload_id = self.dict_question_base_info[
                        question_id]

                dict_question_topic[question_id].add(topic_id)  # add
                key = '%s-%s' % (difficulty, question_type)
                if key not in dict_topic_question[topic_id]:
                    dict_topic_question[topic_id][key] = []

                dict_topic_question[topic_id][key].append(
                    (question_id, extra_score))
                max_num = tid

        sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                tid, question_id, topic_id = row
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                if topic_id not in dict_topic_question:
                    dict_topic_question[topic_id] = {}

                extra_score = 3
                if question_id in self.dict_question_quality:
                    extra_score = self.dict_question_quality[question_id]

                difficulty, question_type = 1, 1
                if question_id in self.dict_question_base_info:
                    difficulty, str_question_type, upload_id = self.dict_question_base_info[
                        question_id]
                    if str_question_type == '选择题' or str_question_type == '单选题':
                        question_type = 1
                    elif str_question_type == '填空题':
                        question_type = 2
                    else:
                        question_type = 3

                key = '%s-%s' % (difficulty, question_type)
                dict_question_topic[question_id].add(topic_id)  # add
                if key not in dict_topic_question[topic_id]:
                    dict_topic_question[topic_id][key] = []

                dict_topic_question[topic_id][key].append(
                    (question_id, extra_score))
                write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id))

        for topic in dict_topic_question:  # sort
            for key in dict_topic_question[topic]:
                dict_topic_question[topic][key] = sorted(
                    dict_topic_question[topic][key],
                    key=lambda x: x[-1],
                    reverse=True)  #

        return dict_question_topic, dict_topic_question

    def getTopicDict(self):
        dict_topic = {}
        sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            topic_id, name = row
            topic_id = str(topic_id)
            dict_topic[topic_id] = name

        return dict_topic

    def getQuestionRate(self):
        dict_question_rate = {}
        sql = "select question_id, rate from question_accuracy"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, rate = row
            rate = float(rate)
            dict_question_rate[question_id] = rate

        return dict_question_rate

    def getHeaders(self, teacher_id='eefbac5a4e58432ca317642a2077d362'):
        return {
            'Host':
            'jiaoshi.okjiaoyu.cn',
            'Connection':
            'keep-alive',
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'X-Requested-With':
            'XMLHttpRequest',
            'is_new_okay':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            'Referer':
            'http://jiaoshi.okjiaoyu.cn/quizcenter_vm/quizcenter',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.8,en;q=0.6',
            'Cookie':
            '_ga=GA1.2.1647528971.1502935646; _gid=GA1.2.1532869714.1506481969; org_id=113; teacher_id=%s; Hm_lvt_2014de1ca4ec84db492ebee33b1dc46c=1506044442,1506324027,1506407711,1506479104; Hm_lpvt_2014de1ca4ec84db492ebee33b1dc46c=1506492873'
            % teacher_id
        }

    def getUrlContent(self, url):
        req = urllib2.Request(url, headers=self.getHeaders())
        hh = urllib2.HTTPHandler()
        opener = urllib2.build_opener(hh)
        reply = opener.open(req, timeout=30)
        if reply.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(reply.read())
            f = gzip.GzipFile(fileobj=buf)
            res = f.read()
            f.close()
            buf.close()
            reply.close()
        else:
            res = reply.read()

        return res

    def normalLeven(self, str1, str2):
        len_str1 = len(str1) + 1
        len_str2 = len(str2) + 1
        #create matrix
        matrix = [0 for n in range(len_str1 * len_str2)]
        #init x axis
        for i in range(len_str1):
            matrix[i] = i

        #init y axis
        for j in range(0, len(matrix), len_str1):
            if j % len_str1 == 0:
                matrix[j] = j // len_str1

        for i in range(1, len_str1):
            for j in range(1, len_str2):
                if str1[i - 1] == str2[j - 1]:
                    cost = 0
                else:
                    cost = 1

                matrix[j * len_str1 + i] = min(
                    matrix[(j - 1) * len_str1 + i] + 1,
                    matrix[j * len_str1 + (i - 1)] + 1,
                    matrix[(j - 1) * len_str1 + (i - 1)] + cost)

        return matrix[-1]

    def translate(self, str):
        line = str.strip().decode('utf-8', 'ignore')
        p2 = re.compile(ur'[^\u4e00-\u9fa5]')
        zh = " ".join(p2.split(line)).strip()
        zh = ",".join(zh.split())
        outStr = zh

        return outStr

    def calScoreQustion(self, question_id, list_rank):
        dict_score = {}
        pos = 0
        dict_qid_type = {}
        set_source = self.dict_question_topic[
            question_id] if question_id in self.dict_question_topic else set()
        for item in list_rank:
            qid, qtype, topic_list, extra_score = item
            set_target = self.dict_question_topic[
                qid] if qid in self.dict_question_topic else set()
            score = (1 - numpy.tanh(
                0.03 * pos)) * 60 + self.dict_quality[extra_score]
            if len(set_target & set_source) > 0:
                score += 15

            dict_score[qid] = score
            dict_qid_type[qid] = qtype
            pos += 1

        sort_res = sorted(dict_score.items(), key=lambda x: x[1], reverse=True)
        list_res = []
        for item in sort_res:
            qid = item[0]
            qtype = dict_qid_type[qid]
            list_res.append((qid, qtype))

        return list_res

    def randomTopicHeader(self, question_id, list_res, pre_set, rank_size,
                          throld_size):
        # 头部数据处理
        if question_id in self.dict_question_topic:
            topic_set = self.dict_question_topic[question_id]
            tLen = len(topic_set)
            if tLen > 0:
                index = random.randint(0, tLen - 1)
                topic = list(topic_set)[index]  # first
                if question_id in self.dict_question_base_info:
                    difficulty, qtype, upload_id = self.dict_question_base_info[
                        question_id]

                    key = '%s-%s' % (difficulty, qtype)
                    if key in self.dict_topic_question[topic]:
                        list_res_question = []

                        arr_content_question = self.dict_topic_question[topic][
                            key]
                        for content_item in arr_content_question:
                            res_question, extra_score = content_item

                            if res_question not in pre_set and res_question != question_id:
                                list_res_question.append(res_question)

                            if len(list_res_question) > rank_size: break

                        rnt = throld_size - len(list_res)
                        while rnt > 0 and len(list_res_question) > 0:
                            pos = random.randint(0, len(list_res_question) - 1)
                            list_res.append(list_res_question[pos])
                            rnt += -1

        return list_res

    def getEsResult(self,
                    question_id,
                    text,
                    keywords,
                    difficulty,
                    bqtype,
                    pre_set,
                    throld_size,
                    throld_page=10,
                    throld_cost=5,
                    rank_size=20):
        """
        questionType:  1   选择题 2   填空题 3   判断题 4   简答题 6  综合题
        difficulty : 1:易 2:中 3:难 4:极难
        """
        list_rank = []
        headers = self.getHeaders()  # get headers
        str_pre = ''
        if bqtype == 1: bqtype = 101
        for page in range(1, throld_page):
            try:
                url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?keyword=%s&subject_id=4&queryType=4&difficulty=%s&type=%s&page=%s" % (
                    keywords, difficulty, bqtype, page)
                # url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?_=1506480814420&teacher_id=ce36ea7e9c864af39d186b8e8b672864&type=&difficulty=%s&page=%s&keyword=%s&subject_id=4" % (difficulty, page, keywords)
                res = self.getUrlContent(url)
                res = res.replace('\r', '').replace('\n', '').replace(
                    'null', '').replace(':,', ':\"\",').strip()
                dict_result = json.loads(res.encode('utf8'))
                total_page = dict_result['data']['total_page']
                if total_page < throld_page:
                    throld_page = total_page  # range update
                list_result = dict_result['data']['list']
                for item in list_result:
                    qid = item['id']
                    question_body = self.translate(item['question_body'])
                    analysis = str(item['analysis']).strip().replace(' ', '')
                    topic_list = item['topic_list']
                    if len(topic_list) >= 1:
                        type_name = item['type_name']
                        if type_name == '选择题':
                            option_list = item['option_list']
                            if len(option_list) != 4: continue

                        cost = self.normalLeven(str_pre, question_body)
                        if qid in self.dict_question_quality:
                            extra_score = self.dict_question_quality[qid]
                        else:
                            extra_score = 1

                        if item['type_alias'] == '单选题' or item[
                                'type_alias'] == '选择题':
                            qtype = 1
                        elif item['type_alias'] == '填空题':
                            qtype = 2
                        else:
                            qtype = 3

                        bCharge = (cost > throld_cost) and (len(analysis) > 50)
                        if bCharge:
                            list_rank.append(
                                (qid, qtype, item['topic_list'], extra_score))
                            str_pre = question_body
            except:
                continue

        list_index = self.calScoreQustion(question_id, list_rank)  # rank
        res_size = throld_size - 1 if len(
            list_index) > throld_size - 1 else len(list_index)
        list_res = [x[0] for x in list_index[:res_size]]  # end output

        list_res = self.randomTopicHeader(
            question_id, list_res, pre_set, rank_size,
            throld_size)  #  header question random
        return list_res
Example #12
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os, json, datetime, re, time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()

if __name__ == "__main__":
    # update news_report company
    now_time = datetime.datetime.now()
    yes_time = now_time + datetime.timedelta(days=-1)
    yes_date = yes_time.strftime('%Y-%m-%d')
    db_data = db_fetcher.get_sql_result(
        'select id, article_json, publish from kr_articles where publish = \'%s\''
        % yes_date, 'mysql_insight')
    for pos in range(len(db_data)):
        kid = str(db_data[pos][0])
        try:
            article_json = str(db_data[pos][1])
            article_obj = json.loads(article_json, strict=False)

            url = 'http://36kr.com/p/%s.html' % kid
            title = article_obj['title']
            source = "36Kr"
            publish_date = '%s 00:00:00' % str(db_data[pos][2])
            domain = '36kr.com'
Example #13
0
import urllib, urllib2
from urlparse import urlparse
import traceback
from cStringIO import StringIO
import gzip
import pdb
import json
import ConfigParser
import cookielib
from pyquery import PyQuery
import re, codecs

sys.path.append('../../')
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()

en_threshold = 50


class Unbuffered:
    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        return getattr(self.stream, attr)
Example #14
0
Created on Mon Dec 15 10:38:02 2015
@author: liming
"""

import time, datetime
import os, sys, re
import json
import urllib
from urllib2 import Request, urlopen, URLError, HTTPError  #导入urllib2模块

sys.path.append(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../seo')))
from seo_extractor import SeoExtractor
from common.db_fetcher import DataBaseFetcher

mysql_fetcher = DataBaseFetcher()

threshold = 15.85  # 阈值


class HotsFind(object):

    ch_title_dict = {}
    cache_weight_dict = {}

    def __init__(self, keyword_file):
        self.seo_extractor = SeoExtractor('../seo/com_df_idf')
        self.keyword_set, self.verb_set = self.read_keywords(keyword_file)
        self.entity_dict = self.load_entity_dict()

    def clear_history_init(self, keyword_file):
Example #15
0
class Report(object):
    """docstring for Report"""
    def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'):
        # do something inits
        self.db_fetcher = DataBaseFetcher()
        self.exercise_id = exercise_id
        self.dict_content_answer = {}  # error content
        self.curr_time = datetime.datetime.now()

        self.update_time = update_time  # update time
        self.recommend = RecommendQuestion()

        self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion(
        )
        self.dict_parent_quesion = self.getChildQuestion()
        self.dict_question_diff = self.getRightDifficulty()
        self.dict_point, self.dict_topic_point = self.getPoints()

        self.dict_students = self.getStudentId()
        self.dict_student_score, self.dict_student_records = {}, {
            x: []
            for x in self.dict_students
        }

        self.exam_list_records, self.practice_list_records = self.getExerciseRecord(
        )  #records

        # dict
        self.dict_question_types = {
            1: '选择题',
            2: '填空题',
            4: '简答题',
            6: '综合题',
            17: '计算题',
            44: '证明题'
        }
        self.dict_question_difficulty = {
            1: '易',
            2: '中',
            3: '难',
            4: '极难'
        }  # 1:易 2:中 3:难 4:极难

        # feedback
        self.dict_students_resource = self.getStudentResource()  # resource

    def getExerciseRecord(self):
        exam_list_records = []
        practice_list_records = []
        sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \
                neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % (
            self.update_time)

        dict_is_choice = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question, student_id, ret, answer, submit_time, question_type = row
            student_id = long(student_id)

            if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len(
                    answer) == 0:
                continue
            if question in self.dict_realtion_quesion:
                exam_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

                self.dict_student_records[student_id].append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])
                self.dict_content_answer['%s_%s' %
                                         (question, student_id)] = answer
            elif question in self.dict_parent_quesion:
                dict_content = self.dict_parent_quesion[question]
                parent_question_id = dict_content['parent_id']

                rank = dict_content['rank']
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    parent_question_id]
                if question_num >= 17 and question_num <= 21:
                    if rank == 1:
                        exam_list_records.append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                        self.dict_student_records[student_id].append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                    elif rank == 2:
                        self.dict_content_answer['%s_%s' %
                                                 (parent_question_id,
                                                  student_id)] = answer
                elif question_num >= 22 and question_num <= 23:
                    key = '%s_%s' % (parent_question_id, student_id)
                    if rank == 1:
                        if answer == 'A':
                            dict_is_choice[key] = 1
                        else:
                            dict_is_choice[key] = 0
                    elif rank == 2:
                        if key in dict_is_choice:
                            if dict_is_choice[key] == 1:
                                exam_list_records.append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                                self.dict_student_records[student_id].append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                    elif rank == 3:
                        self.dict_content_answer[key] = answer
            else:
                practice_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

        return exam_list_records, practice_list_records

    def getStudentId(self, file_name='students.txt'):
        list_students = []
        with open(file_name) as handle_f:
            for line in handle_f:
                system_id = long(line.strip())
                list_students.append(system_id)

        set_studends = set(list_students)  # students set
        str_content = ','.join([str(x) for x in set_studends])
        sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content
        dict_students = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, name = row
            if student_id in set_studends:
                dict_students[student_id] = name

        return dict_students

    def getRightDifficulty(self):
        dict_question_diff = {}
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on  b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_diff

    def getChildQuestion(self):
        dict_parent_list = {}
        dict_parent_question = {}
        sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \
                where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            qid = row[0]
            parent_id = row[1]
            if parent_id not in dict_parent_list:
                dict_parent_list[parent_id] = []

            dict_parent_list[parent_id].append(qid)

        for parent_id in dict_parent_list:
            sort_question = sorted(dict_parent_list[parent_id], reverse=False)
            count = 1
            for item in sort_question:
                dict_parent_question[item] = {
                    'parent_id': parent_id,
                    'rank': count
                }
                count += 1

        return dict_parent_question

    def getRelationQuestion(self):
        dict_realtion_quesion = {}
        dict_point_question_num = {}
        sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = \
            b.id  where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        dict_parent_quesion = {}
        link_parent_list = []
        for row in rows:
            link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = row
            dict_realtion_quesion[
                link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type
            arr_point = link_point_id.strip().split(',')
            for point in arr_point:
                if point not in dict_point_question_num:
                    dict_point_question_num[point] = []

                dict_point_question_num[point].append([
                    link_question_id, question_num, difficulty,
                    question_exercise_id, question_type
                ])

        return dict_realtion_quesion, dict_point_question_num

    def getScore(self, question_num, answer):
        ascore, score = 0, 0
        if question_num >= 1 and question_num <= 16:
            score = 5 if (answer == 'A' or answer == 'B') else 0
            ascore = 5
        elif question_num >= 17 and question_num <= 21:
            if answer == 'A':
                score = 4
            elif answer == 'B':
                score = 7
            elif answer == 'C':
                score = 9
            elif answer == 'D':
                score = 12
            ascore = 12
        elif question_num >= 22 and question_num <= 23:
            if answer == 'A':
                score = 4
            elif answer == 'B':
                score = 7
            elif answer == 'C':
                score = 9
            elif answer == 'D':
                score = 10
            ascore = 10

        return ascore, score

    def statQustionReport(self):
        dict_question_count, dict_question_right = {}, {}
        dict_point_count, dict_point_right = {}, {}
        dict_error_students, dict_right_students = {}, {}

        for item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = item

            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = link_point_id.strip().split(',')
                ascore, score = self.getScore(question_num, answer)
                is_right = (answer == 'A') or (answer == 'B')

                # 1 point
                for point in arr_points:
                    if point in self.dict_point:
                        name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                            point]
                        if name not in dict_point_count:
                            dict_point_count[name] = 0
                            dict_point_right[name] = 0

                        dict_point_right[name] += 1 if (
                            question_num <= 16 and is_right) or (
                                question_num >= 17 and answer == 'D') else 0  #
                        dict_point_count[name] += 1

                if link_question_id not in dict_question_count:
                    dict_question_right[link_question_id] = 0
                    dict_question_count[link_question_id] = 0

                dict_question_right[
                    link_question_id] += 1 if ret == 1 else 0  #
                dict_question_count[link_question_id] += 1

                # questions
                if link_question_id not in dict_right_students:
                    dict_right_students[link_question_id] = []
                    dict_error_students[link_question_id] = []

                if (question_num <= 16 and is_right) or (question_num >= 17
                                                         and answer == 'D'):
                    dict_right_students[link_question_id].append(student_id)
                else:
                    dict_error_students[link_question_id].append(student_id)

                if student_id not in self.dict_student_score:
                    self.dict_student_score[student_id] = 0

                self.dict_student_score[student_id] += score

        dict_point_acc = {}
        for point_name in dict_point_count:
            acc = dict_point_right[point_name] / dict_point_count[point_name]
            dict_point_acc[point_name] = acc
            #print '%s \t%s' % (point_name, acc)

        # students
        dict_students_point_question = self.calStudentPoints(dict_point_acc)
        return dict_students_point_question

    def getErrorText(self, answer):
        if answer == 'C':
            return '审题时,粗心看错了'
        elif answer == 'D':
            return '审题时,题意读不懂'
        elif answer == 'E':
            return '析题时,思路模糊不清'
        elif answer == 'F':
            return '析题时,思路方向错误'
        elif answer == 'G':
            return '解题时,公式记错了'
        elif answer == 'H':
            return '解题时,计算出错了'
        elif answer == 'I':
            return '未作答(或未完成),时间来不及了'
        elif answer == 'A' or answer == 'B':
            return '答题正确'

    def getQuestionWords(self):
        dict_question_words = {}
        str_question = ','.join([
            str(self.dict_realtion_quesion[x][0])
            for x in self.dict_realtion_quesion
        ])
        str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \
                    t.type_id=c.question_type_id where c.id in (%s)" % str_question

        s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata')
        for row in s_rows:
            question, subject_id, type_id, struct_id, json_data = row
            if json_data != '' and json_data != 'None' and json_data != None:
                keywords, res = sq.predict(question, subject_id, type_id,
                                           struct_id, json_data)
                dict_question_words[question] = keywords, res

        return dict_question_words

    def calStudentPoints(self, dict_point_acc):
        dict_coef = {x: 1.0 for x in self.dict_point}  #
        bate = 0.25
        dict_student_points = {}
        dict_students_point_question = {}  # student -> error question
        for student_id in self.dict_students:
            if student_id not in dict_student_points:
                dict_student_points[student_id] = {}

            records = sorted(self.dict_student_records[student_id],
                             key=lambda x: x[-1],
                             reverse=True)
            dict_error = {
                'C': 0.0,
                'D': 0.0,
                'E': 0.0,
                'F': 0.0,
                'G': 0.0,
                'H': 0.0,
                'I': 0.0
            }
            mass_score = 0
            dict_question_point = {}
            for item in records:
                question, student_id, ret, answer, submit_time, question_type = item
                if answer == 'answer': continue
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]
                    if question_exercise_id != self.exercise_id: continue
                    if question_type == '单选题' or question_type == '选择题':
                        qtype = 1
                    elif question_type == '填空题':
                        qtype = 2
                    else:
                        qtype = 3

                    difficulty = self.dict_question_diff[link_question_id]
                    arr_points = link_point_id.strip().split(',')
                    a_score, score = self.getScore(question_num, answer)
                    content = '%s_%s' % (question, student_id)
                    if content in self.dict_content_answer:
                        answer = self.dict_content_answer[content]
                        if answer != 'A' and answer != 'B' and answer != 'answer':
                            dict_error[answer] += a_score - score
                            if answer == 'C' or answer == 'G' or answer == 'H':
                                mass_score += a_score - score

                        for point in arr_points:
                            if point not in dict_question_point:
                                dict_question_point[point] = []

                            dict_question_point[point].append(
                                (link_question_id, qtype, submit_time))
                            name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                                point]
                            acc = dict_point_acc[name]
                            if point not in dict_student_points[student_id]:
                                dict_student_points[student_id][point] = 0.0

                            dict_student_points[student_id][
                                point] += dict_coef[point] * (
                                    a_score - score
                                ) * acc / math.log(1 + bate *
                                                   (level + difficulty))

            sort_error = sorted(dict_error.items(),
                                key=lambda x: x[1],
                                reverse=True)
            if len(dict_student_points[student_id]) > 0:
                sort_point = sorted(dict_student_points[student_id].items(),
                                    key=lambda x: x[1],
                                    reverse=True)
                student_name = self.dict_students[student_id]
                point_name1, ptype1, question_type1, level1, parent_id1, link_id1 = self.dict_point[
                    point]
                point_name2, ptype2, question_type2, level2, parent_id2, link_id2 = self.dict_point[
                    sort_point[1][0]]
                point_name3, ptype3, question_type3, level3, parent_id2, link_id3 = self.dict_point[
                    sort_point[2][0]]

                is_power = 1 if mass_score > 5 else 0
                error1, error1score = sort_error[0][0], sort_error[0][1]
                error2, error2score = sort_error[1][0], sort_error[1][1]
                error1_name, error2_name = self.getErrorText(
                    error1), self.getErrorText(error2)
                dict_students_point_question[
                    student_id] = sort_point, dict_question_point

        return dict_students_point_question

    def getPoint2Question(self):
        dict_point_org_question = {}
        for answer_id in self.dict_realtion_quesion:
            link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                answer_id]
            arr_point = link_point_id.strip().split(',')

            for item in arr_point:
                if item not in dict_point_org_question:
                    dict_point_org_question[item] = []

                dict_point_org_question[item].append(link_question_id)

        return dict_point_org_question

    def pointsRecQuestion(self,
                          dict_students_point_question,
                          dict_diff,
                          throld=4):
        # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty
        # dict_point_org_question = self.getPoint2Question() # org question point
        dict_question_text = self.getQuestionWords()
        dict_question_target = {}
        dict_student_recommend_question = {}
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
            '学生id', '学生名字', '考点', '考点名称', '原题id', '推荐习题id', '关键词', '难度')
        for student_id in dict_students_point_question:

            arr_point, dict_question_point_target = dict_students_point_question[
                student_id]
            student_name = self.dict_students[student_id]
            haveTime = 12
            recommend_questions = []
            pre_set = set()
            for item_target in arr_point:
                if haveTime <= 0 and len(recommend_questions) % 3 == 0: break
                item_point = item_target[0]
                arr_question = sorted(dict_question_point_target[item_point],
                                      key=lambda x: x[-1],
                                      reverse=True)

                key = '%s-%s' % (student_id, item_point)
                rateRight, difficulty = dict_diff[
                    key] if key in dict_diff else (0.0, 1)

                pre_set = pre_set | set([x[0] for x in arr_question])  #
                for base_question, qtype, submit_time in arr_question:
                    if haveTime <= 0 and len(recommend_questions) % 3 == 0:
                        break
                    keywords, text = dict_question_text[base_question]
                    text = text.replace('\n', '').strip('\r').strip()  # filter

                    base_question_key = '%s-%s' % (base_question, difficulty)
                    if base_question_key in dict_question_target:
                        target_questions = dict_question_target[
                            base_question_key]
                    else:
                        target_questions = self.recommend.getEsResult(
                            base_question, text, keywords, difficulty, qtype,
                            pre_set, throld)
                        dict_question_target[
                            base_question_key] = target_questions

                    for rec_question in target_questions:
                        if haveTime <= 0 and len(recommend_questions) % 3 == 0:
                            break
                        if rec_question in pre_set: continue

                        haveTime += -1
                        recommend_questions.append(
                            (base_question, keywords, rec_question, item_point,
                             difficulty))
                        pre_set.add(rec_question)

            for rec_item in recommend_questions:
                source_question, keywords, rec_question, item_point, difficulty = rec_item
                point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                    item_point]
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    student_id, student_name, item_point, point_name,
                    source_question, rec_question, keywords, difficulty)

    def importDefault(self, is_new, dict_question_topic):
        defult_list = [(10805815, 0), (10806190, 2), (10805815, 4),
                       (10806184, 6), (10806182, 8), (10806169, 10),
                       (10806104, 12), (10806057, 14), (10805815, 16),
                       (10806068, 18)]
        update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
        new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
        db_rows = self.db_fetcher.get_sql_result(
            "select student_id from entity_student_white_list",
            "mysql_white_list")
        is_first = 1
        for row in db_rows:
            student_id = row[0]
            for item in defult_list:
                question_id, subject_id = item

                type_id = 2186
                if question_id in dict_question_topic:
                    set_topic = dict_question_topic[question_id]
                    if len(set_topic) > 0:
                        type_id = set_topic.pop()

                if is_first == 1:
                    update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                        student_id, question_id, subject_id, 0)
                    if is_new == 1:
                        new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                            student_id, question_id, subject_id, 0, 1, 3,
                            type_id)
                    is_first = 0
                else:
                    update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                        student_id, question_id, subject_id, 0)
                    new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                        student_id, question_id, subject_id, 0, 1, 3, type_id)

        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list')
        else:
            self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')

    def import2DataBase(self, flag, is_new=1, fname='student_rec.txt'):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(
                "delete from sync_student_recommend_question",
                'mysql_v3_white_list')  # update
        else:
            self.db_fetcher.commit_sql_cmd(
                "delete from entity_recommend_question_bytopic",
                'mysql_white_list')  # update

        # self.importDefault(is_new, dict_question_topic) # import
        if flag == 1:  # 1,3 5 recomend
            update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
            insert_sql = "insert into entity_question_recommend(student_id,student_name,point,point_name,question_id,recommend_id,keywords,difficulty) values"

            new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
            insert_score, is_first = 0, 1
            with open(fname) as rec_f:
                for line in rec_f:
                    if insert_score > 1:
                        arr = line.strip().split('\t')
                        student_id, question_id = arr[0], long(arr[5])
                        type_id = 2186
                        if question_id in dict_question_topic:
                            set_topic = dict_question_topic[question_id]
                            if len(set_topic) > 0:
                                type_id = set_topic.pop()

                        if is_first == 1:
                            update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, question_id, 0, insert_score)
                            insert_sql += "(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % (
                                arr[0], arr[1], arr[2], arr[3], arr[4], arr[5],
                                arr[6], arr[7])
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, question_id, 0, insert_score,
                                    1, 3, type_id)
                            is_first = 0
                        else:
                            update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, question_id, 0, insert_score)
                            insert_sql += ",(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % (
                                arr[0], arr[1], arr[2], arr[3], arr[4], arr[5],
                                arr[6], arr[7])
                            if is_new == 1:
                                new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, question_id, 0, insert_score,
                                    1, 3, type_id)

                    insert_score += 1

            # self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata')
            if is_new == 1:  # update mysql
                self.db_fetcher.commit_sql_cmd(new_sql,
                                               'mysql_v3_white_list')  # new
            else:
                self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')

    def getPoints(self):
        dict_point = {}
        dict_topic_point = {}
        sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points'  # do something
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row
            point_id = str(point_id)
            dict_point[
                point_id] = name, ptype, question_type, level, parent_id, link_id
            if ptype == 3:  #
                arr_topic = [long(x) for x in topics_id.strip().split(',')]
                for topic in arr_topic:
                    if topic not in dict_topic_point:
                        dict_topic_point[topic] = []

                    dict_topic_point[topic].append(point_id)

        return dict_point, dict_topic_point

    def getStudentResource(self):
        dict_students_resource = {}
        str_content = ','.join([str(x) for x in self.dict_students])
        sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \
            a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % (
            str_content)

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, resource_id, deadline = row
            if resource_id > 0:
                if student_id not in dict_students_resource:
                    dict_students_resource[student_id] = set()

                dict_students_resource[student_id].add(resource_id)

        return dict_students_resource

    def getQuestionBaseInfo(self, question_set):
        dict_question_difficulty = {}
        str_content = ','.join([str(x) for x in question_set])
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question  where b.id in (%s)" % str_content
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_difficulty

    def updateStuAdapt2Difficult(self, range_throld=10, difficulty_throld=10):
        dict_exam_question = {}
        dict_exam_difficulty = {}
        dict_question_topic = self.recommend.getThisQuestionTopic()  #

        for exam_item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_point = link_point_id.strip().split(',')
                is_right = 1 if answer == 'A' or answer == 'B' else 0

                for point in arr_point:
                    key = '%s-%s' % (student_id, point)
                    if key not in dict_exam_question:
                        dict_exam_question[key] = []
                        dict_exam_difficulty[key] = {}

                    record = (question, is_right, difficulty, submit_time,
                              question_type)
                    dict_exam_question[key].append(record)
                    if difficulty not in dict_exam_difficulty[key]:
                        dict_exam_difficulty[key][difficulty] = []
                    dict_exam_difficulty[key][difficulty].append(record)

        question_set = set([x[0] for x in self.practice_list_records])
        dict_question_difficulty = self.getQuestionBaseInfo(question_set)

        for practice_item in self.practice_list_records:
            question, student_id, ret, answer, submit_time, question_type = practice_item
            is_right = 1 if ret == 1 else 0
            if question in dict_question_difficulty:
                difficulty = self.dict_question_difficulty[question]
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            point = self.dict_topic_point[topic]
                            key = '%s-%s' % (student_id, point)
                            if key not in dict_exam_question:
                                dict_exam_question[key] = []
                                dict_exam_difficulty[key] = {}

                            if difficulty not in dict_exam_difficulty[key]:
                                dict_exam_difficulty[key][difficulty] = []
                            record = (question, is_right, difficulty,
                                      submit_time, question_type)
                            dict_exam_difficulty[key][difficulty].append(
                                record)
                            dict_exam_question[key].append(record)

        dict_res = {}
        self.db_fetcher.commit_sql_cmd('delete from entity_student_feature',
                                       'mysql_logdata')
        for key in dict_exam_question:
            arr_content = key.split('-')
            student_id, point = arr_content[0], arr_content[1]
            question_records = sorted(dict_exam_question[key],
                                      key=lambda x: x[-1],
                                      reverse=True)
            right_cnt = 0
            throld = min(len(question_records), range_throld)
            #if throld < 10: continue
            for record_item in question_records[:throld]:
                question, is_right, difficulty, submit_time, question_type = record_item
                right_cnt += is_right

            rateRight = right_cnt / range_throld
            adaDiff, pDiff = 0, -1
            for difficulty in dict_exam_difficulty[key]:
                drecords = sorted(dict_exam_difficulty[key][difficulty],
                                  key=lambda x: x[-1],
                                  reverse=True)
                difficulty_throld = min(len(drecords), difficulty_throld)
                #if difficulty_throld < 4: continue
                dright_cnt = 0
                for diff_item in drecords[:difficulty_throld]:
                    question, is_right, difficulty, submit_time, question_type = diff_item
                    dright_cnt += is_right

                drateRight = dright_cnt / difficulty_throld
                if drateRight > pDiff:
                    adaDiff = difficulty
                    pDiff = drateRight

            point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                point]
            insert_sql = "insert into entity_student_feature(student_id, point_id, point_name, adapt_difficulty, master) values(%s, %s, \'%s\', %s, %s)" \
                    % (student_id, point, point_name, adaDiff, rateRight)

            insert_id = self.db_fetcher.commit_sql_cmd(insert_sql,
                                                       'mysql_logdata')
            dict_res[key] = rateRight, difficulty

        return dict_res

    def getStudentScore(self):
        dict_student_score = {}
        for exam_item in self.exam_list_records:
            exam_question, student_id, ret, answer, submit_time, question_type = exam_item

            if exam_question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    exam_question]

                a_score, i_score = self.getScore(question_num, answer)
                exam_key = '%s-%s' % (student_id, question_exercise_id)

                if exam_key not in dict_student_score:
                    dict_student_score[exam_key] = 0
                dict_student_score[exam_key] += i_score

        return dict_student_score

    def getFirstPoint(self):  # point
        dict_relation_point13 = {}
        sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points  \
                b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3"

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point3, name3, point1, name1 = row
            dict_relation_point13[point3] = name3, point1, name1

        return dict_relation_point13

    def getExerciseName(self):
        dict_exercise_name = {}
        sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \
                on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time

        exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for exercise_row in exercise_rows:
            exercise_id, resource_name = exercise_row
            dict_exercise_name[exercise_id] = resource_name

        return dict_exercise_name

    def getExamData(self, dict_student_score, dict_relation_point13,
                    dict_exercise_name):
        for exam_item in self.exam_list_records:  # exam
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = [int(x) for x in link_point_id.strip().split(',')]
                exam_key = '%s-%s' % (student_id, question_exercise_id)
                str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
                is_first = True
                for point3 in arr_points:
                    name3, point1, name1 = dict_relation_point13[point3]
                    if is_first == True:
                        str_point3, str_name3, str_point1, str_name1 = str(
                            point3), str(name3), str(point1), str(name1)
                        is_first = False
                    else:
                        str_point3 += ',%d' % point3
                        str_name3 += ',%s' % name3
                        str_point1 += ',%d' % point1
                        str_name1 += ',%s' % name1

                resource_name = dict_exercise_name[question_exercise_id]
                exercise_score = dict_student_score[
                    exam_key] if exam_key in dict_student_score else 0
                a_score, i_score = self.getScore(question_num, answer)

                error_text = self.getErrorText(answer)
                print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    student_id, link_question_id, question_type, str_point1,
                    str_name1, str_point3, str_name3, submit_time,
                    resource_name, exercise_score, question_exercise_id,
                    a_score, i_score, difficulty, ret, question_num,
                    error_text, 1)

    def getPracticeData(self, dict_relation_point13, dict_question_topic,
                        dict_question_base_info):
        for practice_item in self.practice_list_records:  # practice
            question, student_id, ret, answer, submit_time, question_type = practice_item
            str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
            is_first = True
            if question in dict_question_topic:
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            arr_point3 = [
                                int(x) for x in self.dict_topic_point[topic]
                            ]
                            for point3 in arr_point3:
                                name3, point1, name1 = dict_relation_point13[
                                    point3]
                                if is_first == True:
                                    str_point3, str_name3, str_point1, str_name1 = str(
                                        point3), str(name3), str(point1), str(
                                            name1)
                                    is_first = False
                                else:
                                    str_point3 += ',%s' % point3
                                    str_name3 += ',%s' % name3
                                    str_point1 += ',%s' % point1
                                    str_name1 += ',%s' % name1

            a_score, i_score = 0, 0
            # 单选题 选择题 填空题
            if question_type == '单选题' or question_type == '选择题' or question_type == '填空题':
                a_score = 5
            else:
                a_score = 12

            if ret == 1:
                i_score = a_score
            elif ret == 5:
                i_score = 0.5 * a_score

            difficulty, qtype = 1, 1
            if question in dict_question_base_info:
                difficulty, qtype = dict_question_base_info[question]

            errorText = '正确' if ret == 1 else '错误'
            print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                student_id, question, question_type, str_point1, str_name1,
                str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score,
                i_score, difficulty, ret, 0, errorText, 0)

    def getData(self):
        dict_student_score = self.getStudentScore()
        dict_relation_point13 = self.getFirstPoint()  # point
        dict_exercise_name = self.getExerciseName()

        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
            ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型')

        self.getExamData(dict_student_score, dict_relation_point13,
                         dict_exercise_name)  # exam
        self.getPracticeData(dict_relation_point13, dict_question_topic,
                             dict_question_base_info)  # practice

    def recommendMonday(self, is_new=0, throld=500):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        d1 = datetime.datetime.now()
        d3 = d1 + datetime.timedelta(days=-14)
        str_monday = d3.strftime("%Y-%m-%d 00:00:00")

        self.db_fetcher.commit_sql_cmd(
            "delete from entity_recommend_question_bytopic",
            'mysql_white_list')  # update
        self.importDefault(0, dict_question_topic)  # import
        is_first, insert_score = 1, 0
        new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
        update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id, topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
        dict_student_cnt = {}
        records = self.exam_list_records  #+ self.practice_list_records
        for item in records:
            question, student_id, ret, answer, submit_time, question_type = item
            if submit_time > str_monday:
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]

                    answer_key = '%s_%s' % (question, student_id)
                    if answer_key in self.dict_content_answer:
                        answer = self.dict_content_answer[answer_key]
                    if answer == 'A' or answer == 'B':
                        ret = 1
                    else:
                        ret = 2

                if ret != 1:
                    if student_id not in dict_student_cnt:
                        dict_student_cnt[student_id] = 0
                    if dict_student_cnt[student_id] < throld:
                        # print '%s\t%s' % (student_id, link_question_id)
                        if is_first == 1:
                            update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, link_question_id, 0, insert_score)
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, link_question_id, 0,
                                    insert_score, 1, 3, 2186)
                            is_first = 0
                        else:
                            update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, link_question_id, 0, insert_score)
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, link_question_id, 0,
                                    insert_score, 1, 3, 2186)

                        dict_student_cnt[student_id] += 1
                        insert_score += 1

        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list')
        else:
            self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')
Example #16
0
class TextAnalysis(object):
    media_dict = None
    entity_dict = None
    seo_extractor = SeoExtractor('../seo/com_df_idf')

    def __init__(self, mysql_fetcher=None, handler_redis=None):
        self.mysql_fetcher = mysql_fetcher
        if mysql_fetcher is None:
            self.mysql_fetcher = DataBaseFetcher()
        self.handler_redis = handler_redis
        if handler_redis is None:
            self.handler_redis = redis.Redis(
                host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com',
                password='******',
                port=6379,
                db=5)
        self.load_entity_dict()

    def set_media_id(self, media_id):
        self.media_dict = self.get_media_dict(media_id)

    # 通过media_id 与向量字典关联
    def set_media_dict(self, media_id, forward_index_dict):
        if media_id in forward_index_dict:
            self.media_dict = forward_index_dict[media_id]
        else:
            self.media_dict = self.get_media_dict(media_id)

    def load_entity_dict(self):
        self.entity_dict = set()
        # handle company
        company_file = '../../company_recognize/com_name2id.txt'
        with open(company_file) as com_f:
            for line in com_f:
                tokens = line.strip().split('\t')
                if len(tokens) != 2:
                    continue
                self.entity_dict.add(tokens[0])

        # handle industry
        db_industry = self.mysql_fetcher.get_sql_result(
            'select distinct name from dict_industry', 'mysql_crm')
        for row_industry in db_industry:
            self.entity_dict.add(row_industry[0])

        # handle organization
        db_organization = self.mysql_fetcher.get_sql_result(
            'select distinct name_abbr from organization', 'mysql_crm')
        for row_organization in db_organization:
            self.entity_dict.add(row_organization[0])

    # 获取文章关键词-权重值列表
    def get_article_weight(self, media_id):
        response_list = self.seo_extractor.extract_article(media_id,
                                                           title_weight=2,
                                                           title_coeff=1.4)
        if response_list is None or len(response_list) == 0:
            return {}

        keyword_dict = {}  # result dict
        for keyword, weight in response_list:
            if keyword in self.entity_dict:
                weight *= 1.4
            keyword_dict[unicode(keyword, 'utf8')] = weight

        keyword_dict = {
            key: value
            for key, value in keyword_dict.items()
            if value > term_weight_threshold
        }
        if len(keyword_dict) < 5:
            keyword_dict = {
                key: value
                for key, value in keyword_dict.items()
                if value > 0.68 * term_weight_threshold
            }

        return keyword_dict

    # 删除历史记录的文本向量
    def delete_redis_text(self, update_time_range):
        db_data = self.mysql_fetcher.get_sql_result(
            "select id from kr_articles where publish < \'%s\'" %
            update_time_range, 'mysql_insight')
        for i in range(len(db_data)):
            media_id = db_data[i][0]
            str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
            self.handler_redis.delete(str_key)

    # 计算两个向量的余弦相似度
    def cos_similarity(self, v1, v2):
        if len(v1) != len(v2):  # 向量维数必须相等
            return 0.0

        fen_divisor = 0.0  # 除数
        dividend_v1 = 0.0  # 被除数
        dividend_v2 = 0.0  # 被除数
        for pos in range(len(v1)):
            fen_divisor += v1[pos] * v2[pos]
            dividend_v1 += v1[pos]**2
            dividend_v2 += v2[pos]**2

        fen_dividend = math.sqrt(dividend_v1) * math.sqrt(dividend_v2)
        return (fen_divisor / fen_dividend if fen_dividend > 0.0 else 0.0)

    # 初始化文本向量
    def init_vector(self, vector1, dict_weight):
        if len(vector1) < 1:
            return []

        v1 = []  # 文本向量vector
        for pos in range(len(vector1)):
            if vector1[pos] in dict_weight:
                v1.append(dict_weight[vector1[pos]])
            else:
                v1.append(0.0)

        return v1

    # 重建media文本信息的正排,并更新redis
    def media_text_to_forward_index(self, media_id):
        # 获取需要的字符串
        dict_term_weight = self.get_article_weight(media_id)

        # 存入redis 文本关键词数据 key = media_id
        str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
        str_value = json.dumps(dict_term_weight)
        self.handler_redis.set(str_key, str_value)

        return dict_term_weight

    # 重建update_time_range时间之后的media文本信息的倒排,并更新redis
    def media_text_to_revert_index(self, update_time_range):
        #self.delete_redis_text(update_time_range)

        # 更新redis
        db_data = self.mysql_fetcher.get_sql_result(
            'select id from kr_articles where publish >=\'%s\'' %
            update_time_range, 'mysql_insight')
        if len(db_data) < 1:
            return None

        revert_index_dict = {}  # 倒排索引字典
        n_count = 1
        for i in range(len(db_data)):
            media_id = db_data[i][0]
            # self.cache_media_relate(media_id,update_time_range)

            # 缓存倒排索引
            media_dict = self.get_media_dict(media_id)
            for term, weight in media_dict.items():
                if term not in revert_index_dict:
                    revert_index_dict[term] = {}
                revert_index_dict[term][media_id] = weight

        str_key = kRevertIndexRedisKey
        str_value = str(revert_index_dict)
        self.handler_redis.set(str_key, str_value)

        return revert_index_dict

    # 获取正排
    def get_media_dict(self, media_id):
        media_dict = None
        str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
        str_value = self.handler_redis.get(str_key)
        # 判断redis 数据库中是否包含media_id 信息
        if str_value is None:
            media_dict = self.media_text_to_forward_index(media_id)
        else:
            media_dict = json.loads(str_value)
        return media_dict

    # 用倒排计算文本相似度
    def calc_simi_using_revert_index(self, revert_index_dict, meta_dict):
        if revert_index_dict is None:
            return {}

        dict_molecule = {}  # 分子对象
        dict_denominator1 = 0.0  # 分母对象1

        result_dict = {}  # 结果集字典 key = media_id
        media_list = []
        # 计算所有的d 与该文档的相似度的分子分母
        for term, weight in self.media_dict.items():
            dict_denominator1 += weight**2
            if term not in revert_index_dict:
                continue
            doc_weight_list = revert_index_dict[term]
            for media_id in doc_weight_list:
                # 获取相关列表
                media_list.append(media_id)

                if media_id in dict_molecule:
                    dict_molecule[
                        media_id] += weight * doc_weight_list[media_id]
                else:
                    dict_molecule[
                        media_id] = weight * doc_weight_list[media_id]

        for media_id in media_list:
            if media_id not in meta_dict:
                continue

            x, y, dict_denominator2 = meta_dict[media_id]  # 分母对象2
            fen_dividend = math.sqrt(dict_denominator1) * math.sqrt(
                dict_denominator2)
            result_dict[media_id] = dict_molecule[
                media_id] / fen_dividend if fen_dividend > 0.0 else 0.0

        return result_dict

    # 构造相似向量
    def dict_merge2list(self, d_media, d_compare):
        vec_words = []
        b_similarity = False

        for media_key in d_media:
            if media_key in d_compare:
                vec_words.append(media_key)
                b_similarity = True

        return vec_words, b_similarity

    # 用正排计算文本相似度
    def calculate_text_weight(self, compare_id, forward_index_dict):
        # 判断该向量是否存在
        if self.media_dict is None:
            return 0.0

        if compare_id in forward_index_dict:
            compare_dict = forward_index_dict[compare_id]
        else:
            compare_dict = self.get_media_dict(compare_id)

        # 判断比较向量是否存在
        if compare_dict is None:
            return 0.0

        vec_words, b_similarity = self.dict_merge2list(self.media_dict,
                                                       compare_dict)
        if b_similarity == False:
            return 0.0

        v1 = self.init_vector(vec_words, self.media_dict)
        v2 = self.init_vector(vec_words, compare_dict)

        # 计算余弦相似度值
        d_result = self.cos_similarity(v1, v2)
        return d_result
Example #17
0
class Report(object):
    """docstring for Report"""
    def __init__(self, update_time='2017-11-01 00:00:00'):
        # do something inits
        self.db_fetcher = DataBaseFetcher()
        self.dict_content_answer = {}  # error content

        self.update_time = update_time  # update time
        self.recommend = RecommendQuestion()

        self.dict_realtion_quesion, self.dict_question_num, self.dict_question_score = self.getRelationQuestion(
        )
        self.dict_parent_quesion = self.getChildQuestion()
        self.dict_question_diff = self.getRightDifficulty()
        self.dict_point, self.dict_topic_point = self.getPoints()

        self.dict_students = self.getStudentId()
        self.dict_student_score, self.dict_student_records = {}, {
            x: []
            for x in self.dict_students
        }

        self.exam_list_records, self.practice_list_records = self.getExerciseRecord(
        )  #records

        # dict
        self.dict_question_types = {
            1: '选择题',
            2: '填空题',
            4: '简答题',
            6: '综合题',
            17: '计算题',
            44: '证明题'
        }
        self.dict_question_difficulty = {
            1: '易',
            2: '中',
            3: '难',
            4: '极难'
        }  # 1:易 2:中 3:难 4:极难

        # feedback
        self.dict_students_resource = self.getStudentResource()  # resource

    def getExerciseRecord(self):
        exam_list_records = []
        practice_list_records = []
        sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \
                neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % (
            self.update_time)

        dict_is_choice = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question, student_id, ret, answer, submit_time, question_type = row
            student_id = long(student_id)

            if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len(
                    answer) == 0:
                continue
            if question in self.dict_realtion_quesion:
                exam_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

                self.dict_student_records[student_id].append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])
                self.dict_content_answer['%s_%s' %
                                         (question, student_id)] = answer
            elif question in self.dict_parent_quesion:
                dict_content = self.dict_parent_quesion[question]
                parent_question_id = dict_content['parent_id']

                rank = dict_content['rank']
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    parent_question_id]
                if question_num >= 17 and question_num <= 21:
                    if rank == 1:
                        exam_list_records.append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                        self.dict_student_records[student_id].append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                    elif rank == 2:
                        self.dict_content_answer['%s_%s' %
                                                 (parent_question_id,
                                                  student_id)] = answer
                elif question_num >= 22 and question_num <= 23:
                    key = '%s_%s' % (parent_question_id, student_id)
                    if rank == 1:
                        if answer == 'A':
                            dict_is_choice[key] = 1
                        else:
                            dict_is_choice[key] = 0
                    elif rank == 2:
                        if key in dict_is_choice:
                            if dict_is_choice[key] == 1:
                                exam_list_records.append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                                self.dict_student_records[student_id].append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                    elif rank == 3:
                        self.dict_content_answer[key] = answer
            else:
                practice_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

        return exam_list_records, practice_list_records

    def getStudentId(self, file_name='students.txt'):
        list_students = []
        with open(file_name) as handle_f:
            for line in handle_f:
                system_id = long(line.strip())
                list_students.append(system_id)

        set_studends = set(list_students)  # students set
        str_content = ','.join([str(x) for x in set_studends])
        sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content
        dict_students = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, name = row
            if student_id in set_studends:
                dict_students[student_id] = name

        return dict_students

    def getRightDifficulty(self):
        dict_question_diff = {}
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on  b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_diff

    def getChildQuestion(self):
        dict_parent_list = {}
        dict_parent_question = {}
        sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \
                where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            qid = row[0]
            parent_id = row[1]
            if parent_id not in dict_parent_list:
                dict_parent_list[parent_id] = []

            dict_parent_list[parent_id].append(qid)

        for parent_id in dict_parent_list:
            sort_question = sorted(dict_parent_list[parent_id], reverse=False)
            count = 1
            for item in sort_question:
                dict_parent_question[item] = {
                    'parent_id': parent_id,
                    'rank': count
                }
                count += 1

        return dict_parent_question

    def getRelationQuestion(self):
        dict_realtion_quesion = {}
        dict_point_question_num = {}
        dict_question_score = {}
        sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type, a.score \
        from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id =  b.id  where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        dict_parent_quesion = {}
        link_parent_list = []
        for row in rows:
            link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type, score = row
            dict_realtion_quesion[
                link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type
            dict_question_score[link_question_id] = score
            arr_point = link_point_id.strip().split(',')
            for point in arr_point:
                if point not in dict_point_question_num:
                    dict_point_question_num[point] = []

                dict_point_question_num[point].append([
                    link_question_id, question_num, difficulty,
                    question_exercise_id, question_type
                ])

        return dict_realtion_quesion, dict_point_question_num, dict_question_score

    def getScore(self, question_num, answer, ascore):
        ascore, score = 0, 0
        if question_num >= 1 and question_num <= 16:
            score = 5 if (answer == 'A' or answer == 'B') else 0
            ascore = 5
        elif question_num >= 17 and question_num <= 23:
            if answer == 'A':
                if ascore == 10:
                    score = 4
                elif ascore == 12:
                    score = 4
                elif ascore == 13:
                    score = 5
                elif ascore == 14:
                    score = 5
                elif ascore == 15:
                    score = 6
            elif answer == 'B':
                if ascore == 10:
                    score = 7
                elif ascore == 12:
                    score = 8
                elif ascore == 13:
                    score = 9
                elif ascore == 14:
                    score = 10
                elif ascore == 15:
                    score = 11
            elif answer == 'C':
                if ascore == 10:
                    score = 9
                elif ascore == 12:
                    score = 11
                elif ascore == 13:
                    score = 12
                elif ascore == 14:
                    score = 13
                elif ascore == 15:
                    score = 15
            elif answer == 'D':
                score = ascore

        return score

    def getRecentlyExercise(throld=5):
        set_recent = set()
        sql = "select distinct question_exercise_id from link_question_answer order by question_exercise_id desc limit %s" % throld
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            eid = row
            eid = long(eid)
            set_recent.add(eid)

        return set_recent

    def getAllScore(self, link_question_id, question_num):
        ascore = 0
        if link_question_id in self.dict_question_score:
            ascore = self.dict_question_score[link_question_id]
        else:
            if question_num >= 1 and question_num <= 16:
                ascore = 5
            elif question_num >= 17 and question_num <= 21:
                ascore = 12
            elif question_num >= 22 and question_num <= 23:
                ascore = 10

        return ascore

    def statQustionReport(self, range_throld=4, range_target=20):
        dict_students_point_question = {}
        dict_student_score = {}
        for item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = item

            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = link_point_id.strip().split(',')
                if student_id not in dict_student_score:
                    dict_student_score[student_id] = 0
                ascore = self.getAllScore(link_question_id, question_num)
                score = self.getScore(question_num, answer, ascore)

                dict_student_score[student_id] += score

                is_right = (answer == 'A') or (answer == 'B')
                if student_id not in dict_students_point_question:
                    dict_students_point_question[student_id] = {}  # student id

                point_name = '-'.join([str(x) for x in arr_points])  # name
                if point_name not in dict_students_point_question[student_id]:
                    dict_students_point_question[student_id][point_name] = []

                dict_students_point_question[student_id][point_name].append(
                    (link_question_id, is_right, question_type, difficulty,
                     submit_time))  # question

        dict_res_student_point_question = {}
        for student_id in dict_students_point_question:
            target_rate = 0.5
            if student_id in dict_student_score:
                target_rate = (dict_student_score[student_id] * 0.2 *
                               1.2) / 150

            dict_point_student_question = dict_students_point_question[
                student_id]
            dict_res_student_point_question[student_id] = {}
            for point_name in dict_point_student_question:
                arr_question = dict_point_student_question[point_name]
                target_questions = []
                sort_question = sorted(arr_question,
                                       key=lambda x: x[-1],
                                       reverse=True)
                right_rate, cnt_rate = 0, range_target
                if len(sort_question) < cnt_rate: cnt_rate = len(sort_question)

                for item in sort_question[:cnt_rate]:
                    link_question_id, is_right, qtype, difficulty, submit_time = item
                    if is_right == 0:
                        target_questions.append(
                            (link_question_id, qtype, difficulty, submit_time))
                    else:
                        right_rate += 1

                if right_rate / cnt_rate < target_rate and len(
                        target_questions) > 0:
                    dict_res_student_point_question[student_id][
                        point_name] = target_questions

        return dict_res_student_point_question

    def getErrorText(self, answer):
        if answer == 'C':
            return '审题时,粗心看错了'
        elif answer == 'D':
            return '审题时,题意读不懂'
        elif answer == 'E':
            return '析题时,思路模糊不清'
        elif answer == 'F':
            return '析题时,思路方向错误'
        elif answer == 'G':
            return '解题时,公式记错了'
        elif answer == 'H':
            return '解题时,计算出错了'
        elif answer == 'I':
            return '未作答(或未完成),时间来不及了'
        elif answer == 'A' or answer == 'B':
            return '答题正确'

    def getQuestionWords(self):
        dict_question_words = {}
        str_question = ','.join([
            str(self.dict_realtion_quesion[x][0])
            for x in self.dict_realtion_quesion
        ])
        str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \
                    t.type_id=c.question_type_id where c.id in (%s)" % str_question

        s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata')
        for row in s_rows:
            question, subject_id, type_id, struct_id, json_data = row
            if json_data != '' and json_data != 'None' and json_data != None:
                keywords, res = sq.predict(question, subject_id, type_id,
                                           struct_id, json_data)
                dict_question_words[question] = keywords, res

        return dict_question_words

    def getPoint2Question(self):
        dict_point_org_question = {}
        for answer_id in self.dict_realtion_quesion:
            link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                answer_id]
            arr_point = link_point_id.strip().split(',')

            for item in arr_point:
                if item not in dict_point_org_question:
                    dict_point_org_question[item] = []

                dict_point_org_question[item].append(link_question_id)

        return dict_point_org_question

    def pointsRecQuestion(self,
                          dict_students_point_question,
                          throld=4,
                          error_range=3):
        # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty
        # dict_point_org_question = self.getPoint2Question() # org question point
        dict_question_text = self.getQuestionWords()
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        # self.recommendMonday(dict_question_base_info) # error questionsss
        dict_question_target = {}
        dict_student_recommend_question = {}
        cid = 1
        student_target = open('student_target.txt', 'w')
        for student_id in dict_students_point_question:
            sort_student_item = sorted(
                dict_students_point_question[student_id].items(),
                key=lambda x: len(x[-1]),
                reverse=False)
            student_name = self.dict_students[student_id]
            haveTime = 60
            recommend_questions = []
            pre_set = set()
            for item_target in sort_student_item:
                if haveTime <= 0: break
                item_point, arr_question = item_target
                item_one = item_point.strip().split('-')[0]

                key = '%s-%s' % (student_id, item_one)
                pre_set = pre_set | set([x[0] for x in arr_question])  #
                if len(arr_question) > error_range:
                    arr_question = arr_question[:error_range]

                for base_question, qtype, difficulty, submit_time in arr_question:
                    if haveTime <= 0: break
                    if qtype == '选择题':
                        qtype = 1
                    elif qtype == '填空题':
                        qtype = 2
                    else:
                        qtype = 3

                    keywords, text = dict_question_text[base_question]
                    text = text.replace('\n', '').strip('\r').strip()  # filter

                    base_question_key = '%s-%s' % (base_question, difficulty)
                    if base_question_key in dict_question_target:
                        target_questions = dict_question_target[
                            base_question_key]
                    else:
                        target_questions = self.recommend.getEsResult(
                            base_question, text, keywords, difficulty, qtype,
                            pre_set, throld)
                        dict_question_target[
                            base_question_key] = target_questions

                    for rec_question in target_questions:
                        if haveTime <= 0: break
                        if rec_question in pre_set: continue

                        haveTime += -1
                        recommend_questions.append(
                            (base_question, keywords, rec_question, item_one,
                             difficulty))
                        pre_set.add(rec_question)

            insert_score = 0
            for rec_item in recommend_questions:
                source_question, keywords, rec_question, item_point, difficulty = rec_item
                point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                    item_point]
                type_id = 2186

                if rec_question in dict_question_base_info:
                    difficulty, question_type, upload_id = dict_question_base_info[
                        rec_question]
                else:
                    upload_id = 0

                if rec_question in dict_question_topic:
                    set_topic = dict_question_topic[rec_question]
                    if len(set_topic) > 0: type_id = set_topic.pop()

                str_time = datetime.datetime.strftime(datetime.datetime.now(),
                                                      '%Y-%m-%d %H:%M:%S')
                print "%s\t%s\t1\t%s\t%s\t针对练习\t重点\t%s\t%s\t%s\t%s\t%s" % (
                    cid, student_id, rec_question, 4, insert_score, str_time,
                    1, 3, type_id)
                student_target.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                    (cid, student_id, student_name, item_point, point_name,
                     source_question, rec_question, keywords, difficulty))
                insert_score += 1
                cid += 1

    def getPoints(self):
        dict_point = {}
        dict_topic_point = {}
        sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points'  # do something
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row
            point_id = str(point_id)
            dict_point[
                point_id] = name, ptype, question_type, level, parent_id, link_id
            if ptype == 3:  #
                arr_topic = [long(x) for x in topics_id.strip().split(',')]
                for topic in arr_topic:
                    if topic not in dict_topic_point:
                        dict_topic_point[topic] = []

                    dict_topic_point[topic].append(point_id)

        return dict_point, dict_topic_point

    def getStudentResource(self):
        dict_students_resource = {}
        str_content = ','.join([str(x) for x in self.dict_students])
        sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \
            a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % (
            str_content)

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, resource_id, deadline = row
            if resource_id > 0:
                if student_id not in dict_students_resource:
                    dict_students_resource[student_id] = set()

                dict_students_resource[student_id].add(resource_id)

        return dict_students_resource

    def getQuestionBaseInfo(self, question_set):
        dict_question_difficulty = {}
        str_content = ','.join([str(x) for x in question_set])
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question  where b.id in (%s)" % str_content
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_difficulty

    def getStudentScore(self):
        dict_student_score = {}
        for exam_item in self.exam_list_records:
            exam_question, student_id, ret, answer, submit_time, question_type = exam_item

            if exam_question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    exam_question]
                ascore = self.getAllScore(link_question_id, question_num)
                i_score = self.getScore(question_num, answer, ascore)
                exam_key = '%s-%s' % (student_id, question_exercise_id)

                if exam_key not in dict_student_score:
                    dict_student_score[exam_key] = 0
                dict_student_score[exam_key] += i_score

        return dict_student_score

    def getFirstPoint(self):  # point
        dict_relation_point13 = {}
        sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points  \
                b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3"

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point3, name3, point1, name1 = row
            dict_relation_point13[point3] = name3, point1, name1

        return dict_relation_point13

    def getExerciseName(self):
        dict_exercise_name = {}
        sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \
                on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time

        exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for exercise_row in exercise_rows:
            exercise_id, resource_name = exercise_row
            dict_exercise_name[exercise_id] = resource_name

        return dict_exercise_name

    def getExamData(self, dict_student_score, dict_relation_point13,
                    dict_exercise_name):
        for exam_item in self.exam_list_records:  # exam
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = [int(x) for x in link_point_id.strip().split(',')]
                exam_key = '%s-%s' % (student_id, question_exercise_id)
                str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
                is_first = True
                for point3 in arr_points:
                    name3, point1, name1 = dict_relation_point13[point3]
                    if is_first == True:
                        str_point3, str_name3, str_point1, str_name1 = str(
                            point3), str(name3), str(point1), str(name1)
                        is_first = False
                    else:
                        str_point3 += ',%d' % point3
                        str_name3 += ',%s' % name3
                        str_point1 += ',%d' % point1
                        str_name1 += ',%s' % name1

                resource_name = dict_exercise_name[question_exercise_id]
                exercise_score = dict_student_score[
                    exam_key] if exam_key in dict_student_score else 0
                a_score, i_score = self.getScore(question_num, answer)

                error_text = self.getErrorText(answer)
                print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    student_id, link_question_id, question_type, str_point1,
                    str_name1, str_point3, str_name3, submit_time,
                    resource_name, exercise_score, question_exercise_id,
                    a_score, i_score, difficulty, ret, question_num,
                    error_text, 1)

    def getPracticeData(self, dict_relation_point13, dict_question_topic,
                        dict_question_base_info):
        for practice_item in self.practice_list_records:  # practice
            question, student_id, ret, answer, submit_time, question_type = practice_item
            str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
            is_first = True
            if question in dict_question_topic:
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            arr_point3 = [
                                int(x) for x in self.dict_topic_point[topic]
                            ]
                            for point3 in arr_point3:
                                name3, point1, name1 = dict_relation_point13[
                                    point3]
                                if is_first == True:
                                    str_point3, str_name3, str_point1, str_name1 = str(
                                        point3), str(name3), str(point1), str(
                                            name1)
                                    is_first = False
                                else:
                                    str_point3 += ',%s' % point3
                                    str_name3 += ',%s' % name3
                                    str_point1 += ',%s' % point1
                                    str_name1 += ',%s' % name1

            a_score, i_score = 0, 0
            # 单选题 选择题 填空题
            if question_type == '单选题' or question_type == '选择题' or question_type == '填空题':
                a_score = 5
            else:
                a_score = 12

            if ret == 1:
                i_score = a_score
            elif ret == 5:
                i_score = 0.5 * a_score

            difficulty, qtype = 1, 1
            if question in dict_question_base_info:
                difficulty, qtype, upload_id = dict_question_base_info[
                    question]

            errorText = '正确' if ret == 1 else '错误'
            print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                student_id, question, question_type, str_point1, str_name1,
                str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score,
                i_score, difficulty, ret, 0, errorText, 0)

    def getData(self):
        dict_student_score = self.getStudentScore()
        dict_relation_point13 = self.getFirstPoint()  # point
        dict_exercise_name = self.getExerciseName()

        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
            ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型')

        self.getExamData(dict_student_score, dict_relation_point13,
                         dict_exercise_name)  # exam
        self.getPracticeData(dict_relation_point13, dict_question_topic,
                             dict_question_base_info)  # practice

    def recommendMonday(self, dict_question_base_info, is_new=1, throld=500):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        d1 = datetime.datetime.now()
        d3 = d1 + datetime.timedelta(days=-7)
        str_monday = d3.strftime("%Y-%m-%d 00:00:00")

        is_first, insert_score = 1, 0
        dict_student_cnt = {}
        records = self.exam_list_records  #+ self.practice_list_records
        for item in records:
            question, student_id, ret, answer, submit_time, question_type = item
            if submit_time > str_monday:
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]

                    answer_key = '%s_%s' % (question, student_id)
                    if answer_key in self.dict_content_answer:
                        answer = self.dict_content_answer[answer_key]
                    if answer == 'A' or answer == 'B':
                        ret = 1
                    else:
                        ret = 2

                if ret != 1:
                    if student_id not in dict_student_cnt:
                        dict_student_cnt[student_id] = 0
                    if dict_student_cnt[student_id] < throld:
                        # print '%s\t%s' % (student_id, link_question_id)
                        if link_question_id in dict_question_base_info:
                            difficulty, question_type, upload_id = dict_question_base_info[
                                link_question_id]
                        else:
                            upload_id = 0

                        print "%s\t1\t%s\t%s\t\'最近新错\'\t\'重难点\'\t%s\t%s\t%s\t%s" % (
                            student_id, link_question_id, 0, insert_score, 1,
                            3, 2186)
                        dict_student_cnt[student_id] += 1
                        insert_score += 1
Example #18
0
class SimiTags(object):
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()
        self.tag2comlist = self.load_info(
            'select cid,tag from crm.company_tag')
        self.tag_child = self.load_info(
            'select tag,parent_tag from crm.tag_tag_relation')
        self.tag_parent = self.load_info(
            'select parent_tag,tag from crm.tag_tag_relation')
        self.total_num, self.manual_tag_count = self.tag_cid()

    def load_info(self, cmd):
        temp_dict = {}
        db_row = self.db_fetcher.get_sql_result(cmd, 'mysql_readall')
        for row in db_row:
            leaf, root = row
            if root not in temp_dict: temp_dict[root] = []
            temp_dict[root].append(leaf)
        return temp_dict

    def tag_cid(self):
        db_row = self.db_fetcher.get_sql_result(
            'select count(distinct cid) from crm.company_tag', 'mysql_readall')
        total_num = db_row[0][0]
        manual_tag_count = {}
        db_row = self.db_fetcher.get_sql_result(
            'select cid,tag from crm.company_tag where type != 5',
            'mysql_readall')
        for row in db_row:
            crm_id, tag = row
            if tag not in manual_tag_count: manual_tag_count[tag] = 0
            manual_tag_count[tag] += 1
        return total_num, manual_tag_count

    def get_correlation(self):
        similarity_tag = {}
        tag_corre_dict = {}
        f = 0
        for tag1 in self.tag2comlist:
            tag_corre_dict = {}
            if tag1.lower() != '人工智能': continue
            if tag1 not in self.manual_tag_count: continue
            manual_num_x = self.manual_tag_count[tag1]
            if manual_num_x < 30: continue
            l1 = self.tag2comlist[tag1]
            sum_x = len(l1)
            if sum_x < 100: continue
            for tag2 in self.tag2comlist:
                if tag1 == tag2: continue
                if tag2 not in self.tag2comlist or tag2 not in self.manual_tag_count:
                    continue
                manual_num = self.manual_tag_count[tag2]
                if manual_num < 2: continue
                l2 = self.tag2comlist[tag2]
                sum_y = len(l2)
                if sum_y > sum_x * 1.5: continue
                xy = len(set(l1) & set(l2))
                if xy < 5: continue
                pf = xy * 1.0 / sum_x
                if pf > 0.6: continue
                pc = xy * 1.0 / sum_y
                pnc = (sum_x - xy + 1) * 1.0 / (self.total_num - sum_y)
                if pc < 0.01: continue
                if pnc != 0:
                    value = pc / pnc
                if value < 1: continue
                correlation = math.pow(xy, 0.2) * math.pow(value, 0.8)
                tag_corre_dict[tag2] = correlation

            tag_items = tag_corre_dict.items()
            tag_heap = heapq.nlargest(len(tag_items),
                                      tag_items,
                                      key=lambda x: x[-1])
            tag_heap = tag_heap[:40]
            if len(tag_heap) == 0: continue
            re = map(lambda x: x[0], tag_heap)
            if len(re) == 0: continue
            similarity_tag[tag1] = re
            print tag1, '\t', "%s" % json.dumps(
                re, ensure_ascii=False).encode('utf8')
        return similarity_tag

    def rm_sub(self, tag_heap):
        abandon_l = []
        for i in range(0, len(tag_heap)):
            tag1, weight = tag_heap[i]
            if tag1 not in self.tag2comlist: continue
            l1 = self.tag2comlist[tag1]
            sum_x = len(l1)
            tag1_p, tag1_c = [], []
            if tag1 in self.tag_child: tag1_c = self.tag_child[tag1]
            if tag1 in self.tag_parent: tag1_p = self.tag_parent[tag1]
            for j in range(i + 1, len(tag_heap) - 1):
                tag2, weight = tag_heap[j]
                if tag2 in tag1_p or tag2 in tag1_c:
                    abandon_l.append(j)
                    continue
                if tag1.find(tag2) != -1 or tag2.find(tag1) != -1:
                    abandon_l.append(j)
                    continue
                l2 = self.tag2comlist[tag2]
                sum_y = len(l2)
                xy = len(set(l1) & set(l2))
                if (xy * 1.0 / sum_y) > 0.8: abandon_l.append(j)
                if (xy * 1.0 / sum_x) > 0.6: abandon_l.append(j)
        re_l = []
        for i in range(0, len(tag_heap) - 1):
            if i in abandon_l: continue
            else: re_l.append(tag_heap[i][0])
        return re_l
Example #19
0
#!/usr/bin/env python
# -*- coding: utf8 -*-

import os, json, datetime, re, time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
from company_recognize.report_to_company import ComLib
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()
cur_abs_path = os.path.dirname(os.path.abspath(__file__))
path_segment = '../../company_recognize/segment/data'
file_com_name2id = '../../company_recognize/com_name2id.txt'
file_filter_words = '../../company_recognize/com_filter.txt'

comlib = ComLib(path_segment, file_com_name2id, None, file_filter_words)


def get_event_type(title):
    etype = 0
    rongzi_list = [
        '融资', '投资', '美元', '人民币', '种子轮', '天使轮', 'PreA', 'A轮', 'B轮', 'C轮', 'D轮',
        'E轮', 'IPO', '上市', '私有化', '收购', '并购', '合并', '倒闭'
    ]
    new_list = [
        '发布', '推出', '研发', '启动', '新品', '新产品', '版本', '业务', '进军', '入股', '战略', '上线'
    ]
    biandong_list = [
Example #20
0
class QualityQuestion(object):
    """docstring for QualityQuestion"""
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '')  # redis
        self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion(
        )

        self.dict_question_topic = self.getQuestionTopic()
        self.dict_topic = self.getTopicDict()

    def getQuestionTopic(self, save_file='topic.txt'):
        dict_question_topic = {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                tid, question_id, topic_id = arr[0], long(arr[1]), int(arr[2])
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                dict_question_topic[question_id].add(topic_id)
                max_num = tid

        sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                tid, question_id, topic_id = row
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                dict_question_topic[question_id].add(topic_id)
                write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id))

        return dict_question_topic

    def getTopicDict(self):
        dict_topic = {}
        sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            topic_id, name = row
            topic_id = int(topic_id)
            dict_topic[topic_id] = name

        return dict_topic

    def makeSet2Dict(self, set_simple):
        dict_simple = {}
        for item in set_simple:
            arr = item.split('\t')
            qid, question_type, json_data = arr[0], arr[1], arr[2]
            qid = long(qid)
            dict_simple[qid] = question_type, json_data

        return dict_simple

    def getSimpleQustion(self):
        dict_simple_science = {}
        dict_simple_art = {}
        is_redis = self.redis_cache.exists(
            '62951166961') and self.redis_cache.exists('62951084550')
        if is_redis:
            # science
            set_simple_science = self.redis_cache.smembers('62951166961')
            dict_simple_science = self.makeSet2Dict(set_simple_science)
            # art
            set_simple_art = self.redis_cache.smembers('62951084550')
            dict_simple_art = self.makeSet2Dict(set_simple_art)
        else:
            sql = "SELECT id, question_type, json_data, upload_id from neworiental_v3.entity_question where upload_id in (62951166961, 62951084550) and json_data !=\'\' "
            rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
            for row in rows:
                qid, question_type, json_data, upload_id = row
                upload_id = str(upload_id)
                json_data = json_data.replace('\t', '')
                if upload_id == '62951166961':  # students majored in science
                    dict_simple_science[qid] = question_type, json_data
                elif upload_id == '62951084550':
                    dict_simple_art[qid] = question_type, json_data

                str_content = '%s\t%s\t%s' % (qid, question_type, json_data)
                self.redis_cache.sadd(upload_id, str_content)

        return dict_simple_science, dict_simple_art

    def getTopicSimpleQuestion(self):
        dict_topic_question = {}
        for qid in self.dict_simple_art:
            if qid in self.dict_question_topic:
                set_topic = self.dict_question_topic[qid]
                for topic in set_topic:
                    if topic not in dict_topic_question:
                        dict_topic_question[topic] = set()

                    dict_topic_question[topic].add(qid)

        for topic in dict_topic_question:
            set_topic = dict_topic_question[topic]
            str_content = '\t'.join([str(x) for x in set_topic])
            print '%s\t%s\t%s' % (topic, self.dict_topic[topic], str_content)