def _getSCInfoFromHtmls(self, para):
    '''
    Build the per-institution work list for one subject category and run it.

    :param para: a two-item sequence ``(SC_code, value)``. ``value[0]`` is the
        subject-category name; ``value[1:]`` is iterated as pages, each page
        being iterated as institution entries whose items 0 and 1 are read as
        the institution code and name.
    :return: the list returned by ``asyncRunFunc`` for
        ``self._asyncGetInstitutionInfo`` over the work list, with one extra
        leading element (a sentinel): ``getArgs(SC_code, SC_name)``, i.e. the
        subject-category code and name.
    '''
    subject_code, subject_payload = para
    subject_name = subject_payload[0]

    # Paths are joined with a literal backslash (Windows-style separator).
    subject_dir = self.htmlsRootPath + '\\' + (subject_code + '-' + subject_name)
    self.storerIns.makeDir(subject_dir)

    # Sentinel that is placed in front of the results before returning.
    sentinel = getArgs(subject_code, subject_name)

    tasks = []
    for page in subject_payload[1:]:
        for entry in page:
            # An empty-string entry ends the current page; later entries on
            # that page are not looked at.
            if entry == '':
                break
            inst_code = entry[0]
            inst_name = entry[1]
            inst_url = self._getInstitutionURL(subject_code, inst_name)
            inst_dir = subject_dir + '\\' + inst_code + '-' + inst_name
            tasks.append(
                getArgs(inst_code, inst_name, inst_url, inst_dir, subject_dir))

    results = asyncRunFunc(
        self._asyncGetInstitutionInfo,
        tasks,
        poolNum=self.poolNum,
        asyn=self.asynFlag,
    )
    results.insert(0, sentinel)
    return results
def getResearchInfo(self, insCodeAndName, insData, insUrl):
    '''
    Turn each raw record of one institution into a rearranged output row.

    :param insCodeAndName: sequence whose first item is placed (together with
        ``insUrl``) at the head of every row.
    :param insData: iterable of records; each record must be a list because it
        is concatenated onto a list.
    :param insUrl: institution URL; stored in each row head and forwarded to
        ``self.getScope``.
    :return: list with one rearranged row per record.
    '''
    rows = []
    for record in insData:
        # Row head is getArgs(<code-and-name>, <url>), followed by the record.
        row = [getArgs(insCodeAndName[0], insUrl)] + record

        # Move the element at index 1 to index 4.
        row.insert(4, row.pop(1))

        # Replace the element at index 8 by a (label, link) pair built from it.
        # The label literal is Chinese for "click here to view".
        link_id = row.pop(8)
        row.insert(8, getArgs('点此查看', domain + '/zsml/kskm.jsp?id=' + link_id))

        # The last element is removed from the row and handed to getScope.
        trailing = row.pop()
        scope = self.getScope(trailing, row, record, insUrl)

        # Pop four values off the end of scope, inserting each at index 9.
        # For rows of at least nine elements they end up at 9..12 in the same
        # relative order they had in scope.
        for _ in range(4):
            row.insert(9, scope.pop())

        rows.append(row)
    return rows
def _modifyData(self, department, major, researchArea):
    '''
    Rewrite three "(prefix)rest" labels as "prefix-rest" strings.

    Each argument is passed to ``findAllWithRe`` with a pattern that captures
    the text inside a leading pair of parentheses and the text that follows.
    The two captured groups of the first match are joined with ``-``.

    :param department: label to rewrite.
    :param major: label to rewrite.
    :param researchArea: label to rewrite.
    :return: list of the three rewritten labels, in argument order.
    :raises SystemExit: with code -1 (after printing the offending value) when
        indexing the match result raises ``IndexError``, e.g. no match found.
    '''
    # Raw string: un-raw, "\(" and "\)" are invalid str escapes and trigger a
    # DeprecationWarning/SyntaxWarning on modern Python. The pattern text
    # itself is unchanged.
    pattern = r'\((.+)\)(.+)'

    rewritten = []
    for label in getArgs(department, major, researchArea):
        try:
            groups = findAllWithRe(label, pattern)[0]
            rewritten.append(groups[0] + '-' + groups[1])
        except IndexError:
            print(label)
            # Same exception the site-provided exit(-1) raised, but without
            # depending on the interactive-only `exit` helper being installed.
            raise SystemExit(-1)
    return rewritten
def getSubjectInfo(self, subjectName, datum):
    '''
    Group the rows of one subject by institution.

    :param subjectName: subject name stored in every output value.
    :param datum: iterable of rows; each row must unpack into exactly 15
        fields (see the unpacking below), otherwise ``ValueError`` is raised.
    :return: dict mapping ``insName[0]`` (first item of the row's first field)
        to a list of ``getArgs(department, subjectName, major, researchArea,
        examType, learngType, enrolledNumer)`` values, in input order.
    '''
    grouped = {}
    for row in datum:
        # Only some of the 15 fields are used below; the full unpacking is
        # kept because it documents and enforces the row layout.
        (insName, department, major, researchArea, examType, learngType,
         teacher, enrolledNumer, scopeUrl, course1, course2, course3,
         course4, crossMajor, remark) = row

        # Example source page noted by the original author:
        # 'http://yz.chsi.com.cn/zsml/querySchAction.do?dwmc=%E5%8C%97%E4%BA%AC%E5%A4%A7%E5%AD%A6&yjxkdm=0101'
        # Unused draft constants from the original author, kept for reference:
        # examTypeLi = ['统考', '单考']  # "unified exam", "separate exam"
        # examTypes = ['(不含推免)']  # "(excluding exam-exempt admissions)"
        # enrolledNumerTypes = ['一级学科:', '专业:', '研究方向:', '院系所:']
        #   # "first-level discipline:", "major:", "research area:", "department:"

        department, major, researchArea = self._modifyData(
            department, major, researchArea)
        insName = insName[0]

        value = getArgs(department, subjectName, major, researchArea,
                        examType, learngType, enrolledNumer)

        # Append in place instead of rebuilding the list with "+" each time.
        grouped.setdefault(insName, []).append(value)
    return grouped
def _modifyData(self, department, major, researchArea):
    '''
    Rewrite three "(prefix)rest" labels as "prefix-rest" strings.

    Each argument is passed to ``findAllWithRe`` with a pattern that captures
    the text inside a leading pair of parentheses and the text that follows.
    The two captured groups of the first match are joined with ``-``.

    :param department: label to rewrite.
    :param major: label to rewrite.
    :param researchArea: label to rewrite.
    :return: list of the three rewritten labels, in argument order.
    :raises SystemExit: with code -1 (after printing the offending value) when
        indexing the match result raises ``IndexError``, e.g. no match found.
    '''
    # Raw string: un-raw, "\(" and "\)" are invalid str escapes and trigger a
    # DeprecationWarning/SyntaxWarning on modern Python. The pattern text
    # itself is unchanged.
    pattern = r'\((.+)\)(.+)'

    rewritten = []
    for label in getArgs(department, major, researchArea):
        try:
            groups = findAllWithRe(label, pattern)[0]
            rewritten.append(groups[0] + '-' + groups[1])
        except IndexError:
            print(label)
            # Same exception the site-provided exit(-1) raised, but without
            # depending on the interactive-only `exit` helper being installed.
            raise SystemExit(-1)
    return rewritten

# Disabled draft helper left by the original author (never executed; note the
# "= -1" comparison is not valid Python as written). The string literal is the
# Chinese label for "first-level discipline:".
# def _getEnrolledNumber(self, data):
#     print(data)
#     (insName, department, subjectName, major, researchArea, examType, learngType, enrolledNumer) = data
#     if(not enrolledNumer.find('一级学科:') = -1)
#         exit(0)
def _getCount():
    '''
    Load the saved counters from the pickle at ``pklPath``, or start fresh.

    :return: ``getArgs(count, errCount, errNum, errMax, count2,
        smallestFileSize)``. When ``storerIns.getPickleFileData`` returns a
        value equal to ``False`` the counters are all 0 and the smallest file
        size is 999999999; otherwise each value is read from the loaded
        mapping under the key of the same name.
    '''
    field_names = ('count', 'errCount', 'errNum', 'errMax', 'count2',
                   'smallestFileSize')

    saved = storerIns.getPickleFileData(pklPath)
    # Equality (not identity) comparison is kept on purpose so that exactly
    # the same loader results are treated as "nothing saved".
    if saved == False:  # noqa: E712
        values = [0, 0, 0, 0, 0, 999999999]
    else:
        values = [saved[name] for name in field_names]
    return getArgs(*values)
def getInstitutionInfo(self, subjectData, xlsxDirPath):
    '''
    Collect the sheet rows of every institution of one subject.

    :param subjectData: iterable of institution entries. Item 0 of an entry is
        a head whose items 0, 1 and 2 are read as code, name and URL; the
        remaining items are forwarded to ``self.getResearchInfo``.
    :param xlsxDirPath: directory path; ``'//rawInfo.xlsx'`` is appended to
        form the workbook path.
    :return: ``getArgs(xlsxFilePath, sheetName, sheetHead, sheetDatum)`` where
        ``sheetDatum`` is the concatenation, in input order, of the rows
        returned by ``self.getResearchInfo`` for each institution.
    '''
    xlsxFilePath = xlsxDirPath + '//rawInfo.xlsx'
    sheetName = 'rawInfo'
    # Column titles (Chinese): institution, department, major, research area,
    # exam type, study mode, supervisor, planned enrolment, exam scope,
    # politics, foreign language, subject course 1, subject course 2,
    # cross-major, remarks.
    sheetHead = [
        '机构名', '院系所', '专业', '研究方向', '考试方式', '学习方式',
        '指导教师', '拟招生人数', '考试范围', '政治', '外语', '业务课一',
        '业务课二', '跨专业', '备注'
    ]

    sheetDatum = []
    for insInfo in subjectData:
        insHead = insInfo[0]
        insCodeAndName = [insHead[0] + '-' + insHead[1]]
        insUrl = insHead[2]
        insData = insInfo[1:]
        # Extend in place: rebuilding the list with "+" on every iteration
        # copies all previously collected rows each time (quadratic overall).
        sheetDatum.extend(
            self.getResearchInfo(insCodeAndName, insData, insUrl))

    return getArgs(xlsxFilePath, sheetName, sheetHead, sheetDatum)
def getRawSubjectInfo(self, subjectsInfo):
    '''
    Build the workbook description of every subject.

    :param subjectsInfo: iterable of subject entries. Item 0 of an entry is a
        pair-like whose items 0 and 1 are read as subject code and name; the
        remaining items are forwarded to ``getInstitutionInfo``.
    :return: list whose first element is ``self.xlsxRootDirPath`` and whose
        following elements are, per subject, a list starting with
        ``getArgs(xlsxDirPath, xlsxFilePath, sheetName, sheetHead)`` followed
        by that subject's sheet rows.
    '''
    instance = modifyRawSubjeectsInfo.getInstance()
    final = [self.xlsxRootDirPath]

    for subjectInfo in subjectsInfo:
        subjectCodeAndName = subjectInfo[0]
        subjectData = subjectInfo[1:]
        subjectCode = subjectCodeAndName[0]
        subjectName = subjectCodeAndName[1]
        xlsxDirPath = (self.xlsxRootDirPath + '//'
                       + subjectCode + '-' + subjectName)

        # getInstitutionInfo's result must unpack into exactly four items.
        (xlsxFilePath, sheetName, sheetHead, sheetDatum) = (
            instance.getInstitutionInfo(subjectData, xlsxDirPath))

        # The previous version also converted the result to a tuple and then
        # immediately discarded it; that dead store has been removed.
        header = getArgs(xlsxDirPath, xlsxFilePath, sheetName, sheetHead)
        final.append([header] + sheetDatum)

    return final
def _getSCInfoFromPklFile(self, SC_code, value):
    '''
    Fetch one subject category's data through the storer's pickle helper.

    :param SC_code: subject-category code; used in the pickle file name.
    :param value: sequence whose first item (the name) is used in the pickle
        file name; the whole sequence is forwarded with ``SC_code``.
    :return: whatever ``self.storerIns.getPickleFileDataFromOtherData``
        returns for the pickle path, ``self._getSCInfoFromHtmls`` and
        ``getArgs(SC_code, value)``.
        NOTE(review): presumably the helper reads the pickle when it exists
        and otherwise calls the given function with that argument - confirm
        in the storer implementation.
    '''
    file_stem = SC_code + '-' + value[0]
    # Literal backslash separator (Windows-style path).
    pickle_path = self.pklsRootPath + '\\' + file_stem + '.pkl'
    loader_arg = getArgs(SC_code, value)
    storer = self.storerIns
    return storer.getPickleFileDataFromOtherData(
        pickle_path, self._getSCInfoFromHtmls, loader_arg)