import os
import time


def getData(movieUrl):
    # Time a single fetch and report which worker process handled it.
    start = time.time()
    http = Http()
    code, msg, res = http.open(movieUrl)
    # time.sleep(1)
    print 'the process pid is: %s, cost time is: %.2f' % (os.getpid(), time.time() - start)

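# Usage sketch (not in the original source): getData prints its worker pid,
# which suggests it is meant to be mapped over a multiprocessing.Pool.
# Assumes the project's Http wrapper is importable; the URLs are placeholders.
from multiprocessing import Pool

if __name__ == '__main__':
    urls = ['http://example.com/movie/1', 'http://example.com/movie/2']
    pool = Pool(processes=2)
    pool.map(getData, urls)  # each call logs its pid and elapsed time
    pool.close()
    pool.join()
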
def getSecond(self, rdict):
    http = Http()
    for key, value in rdict.items():
        print key + " " + value
        url = self.baseUrl + value
        code, msg, res = http.open(url)
        if code == 200:
            self.parserData(key, res)

def start(self):
    print '++++++++++++ provincial leadership info ++++++++++++'
    http = Http()
    print '++++++++++++ cannon taking aim at the target'
    code, msg, res = http.open(self.baseUrl)
    if code == 200:
        print '++++++++++++ cannon fired'
        soup = bs(res, 'html.parser')
        provinceList = soup.find_all('h2', class_='red2')
        print '++++++++++++ trajectory locked, firing continuously'
        for province in provinceList:
            provinceName = province.text
            for line in province.find_next_sibling(
                    'ul', class_='clearfix ld new_ld2').find_all('dd', class_=False):
                area = provinceName
                duty = line.span.text
                name = line.a.string
                url = line.a['href']
                print area + ' ' + duty + ' ' + name + ' ' + url
                officer = bean()
                officer.setProvince(area)
                officer.setName(name)
                officer.setDuty(duty)
                if name != '空缺':  # '空缺' marks a vacant post
                    govRes = county.resumeInfo(url, self.baseRoot)
                    if len(govRes) != 0:
                        officer.setPosition(govRes[0])
                        officer.setSex(govRes[1])
                        officer.setBirth(govRes[2])
                        officer.setNativePlace(govRes[3])
                        officer.setEducation(govRes[4])
                        officer.setResume(govRes[5])
                officerList = [
                    officer.province, officer.name, officer.duty,
                    officer.position, officer.sex, officer.birth,
                    officer.nativePlace, officer.education, officer.resume
                ]
                # self.flag = self.writeToexcel(self.flag, self.excelFileName, officerList)
                # print name + ' written to row ' + str(self.flag - 1) + ' of ' + self.excelFileName
                common.write2Word(self.wordFileName, officerList)
                print name + " has been written"
                print '-------------------------'
    else:
        print 'network error'
        exit(1)

def _getProxyIp2Pool(self):
    # Maintain a proxy pool backed by Redis, using Xici's transparent proxies.
    hp = Http.Http()
    _headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Host": "www.xicidaili.com",
        "Referer": "http://www.xicidaili.com/nt/1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
    }
    url = "http://www.xicidaili.com/nt/"
    code, msg, page = hp.open(url, **_headers)
    if code != 200:
        print "cannot fetch the proxy list page"
        return False
    resList = self._parseDate(page)
    for r in resList:
        self.redisConn.sadd(self.proxyName, r)
    return True

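# Consumer sketch (an assumption, not code from the repo): once the pool is
# filled, a caller can draw a random proxy from the same Redis set. The
# host/port and the set name are placeholders.
import redis

def getRandomProxy(proxyName):
    conn = redis.StrictRedis(host='localhost', port=6379, db=0)
    # srandmember picks a member without removing it; use srem to evict
    # proxies that later turn out to be dead.
    return conn.srandmember(proxyName)
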
from bs4 import BeautifulSoup as bs


def resumeInfo(url, baseUrl):
    resList = list()
    url = baseUrl + url
    http = Http()
    code, msg, res = http.open(url)
    if code != 200:
        return []
    soup = bs(res, "html.parser")
    # Current position: the red headline span on the resume page.
    position = soup.find('span', class_='red2').string
    if position is None:
        position = ''
    resList.append(position)
    # The basic-info block is a sequence of '<b>label</b>value<br>' lines.
    infoBase = soup.find('dd').find('p')
    infoBaseParser = str(infoBase).split('\n')[:-1]
    for line in infoBaseParser:
        value = line.split('</b>')[1].split('<br>')[0]
        resList.append(value)
    # Full resume text.
    info = soup.find('div', class_='p2j_text')
    resList.append(str(info.text))
    return resList

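# Usage sketch (hypothetical path and base URL): the callers above index the
# result as govRes[0]..govRes[5], which implies the basic-info block yields
# four lines, giving six fields in this fixed order.
fields = resumeInfo('/officer/12345.html', 'http://www.example.com')
if fields:
    position, sex, birth, nativePlace, education, resume = fields
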
def testHttp():
    http = hp.Http()
    code, resMsg, page = http.open(
        'http://www.zhihu.com',
        Cookie='d_c0="AEAAHF7tUgqPTser6aQFFNucOrpy_pVS_nM=|1470143843"; _zap=e0aa3eeb-4a2f-469a-b0c0-213999c8fad8; q_c1=9b67681f980240caad14cf09153f8cf4|1472825844000|1470143841000; l_cap_id="MjAyZjk4OWRkNWViNDg3NDk0Y2EzZDY3Y2RjNzMwN2Q=|1473305517|5e44a389c8a8d1c492f8e28b6c33558684f9709b"; cap_id="NWVjNTlmODg0ZjA0NDNjNmFjZjczYzVkOGE1NDVlOWM=|1473305517|483c89cfd8b6a38c9dfa3e64ca36874c5b1fc7f6"; login="******"; __utmt=1; __utma=51854390.1478424689.1473601532.1474426191.1474436825.11; __utmb=51854390.3.9.1474436838115; __utmc=51854390; __utmz=51854390.1474436825.11.11.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/index.php; __utmv=51854390.100-1|2=registration_date=20131021=1^3=entry_date=20131021=1; a_t="2.0AAAAmHsfAAAXAAAA5KsJWAAAAJh7HwAAAEAAHF7tUgoXAAAAYQJVTcJo-FcAPd0Msxz2C-lOeeikr99GzbSowiV5bg9W3imFV1LlbeAWXvSVBCxarA=="; z_c0=Mi4wQUFBQW1Ic2ZBQUFBUUFBY1h1MVNDaGNBQUFCaEFsVk53bWo0VndBOTNReXpIUFlMNlU1NTZLU3YzMGJOdEtqQ0pR|1474436836|6bf62efd8b544ab5415eda5bd7a08201827a15ac')
    print code

class spider(object):
    def __init__(self, string):
        self.requestUrl = 'http://baike.baidu.com/search/word?word='
        self.http = Http()
        self.secondUrl = 'http://baike.baidu.com'
        self.searchWord(string)

    def searchWord(self, string):
        # The input is ':'-separated; the last segment is the search keyword,
        # the leading segments are disambiguation words.
        splitList = string.split(':')
        self.keyword = splitList[-1]
        self.limitWord = splitList[:-1]

    def start(self):
        code, msg, res = self.http.open(self.requestUrl + self.keyword)
        if code == 200:
            try:
                result = self.analyze(res, self.limitWord)
                return result
            except Exception:
                return ERROR.PARSEERROR, 'parse error'
        else:
            return ERROR.NETWORK, 'network failure'

    def analyze(self, res, limitWord):
        self.soup = bs(res, 'html.parser')
        existFlag = self.soup.find('div', class_='no-result')
        if existFlag is not None:
            tool.echo(self.keyword + ' no related entry found for this person')
            return ERROR.DATANONE, 'error'
        polysemant = self.soup.find('div', class_='polysemant-list')
        if polysemant is None:
            # Unambiguous entries still need to be verified against limitWord.
            tool.echo(self.keyword + ' unambiguous')
            code, res = ERROR.LEVEL1, self.parserData(res)
            for i in limitWord:
                if i in res.simpleInfo:
                    return code, res
            return ERROR.DATANONE, 'error'
        else:
            # Score each candidate meaning by how many limit words it matches.
            best = [0, '']
            for line in polysemant.findAll('li'):
                content = line.text
                weight = 0
                for oneLimit in limitWord:
                    if oneLimit in content:
                        weight += 1
                if best[0] < weight:
                    best[0] = weight
                    best[1] = line
            if best[0] == 0:
                tool.echo(self.keyword + ' no related entry found for this person')
                return ERROR.DATANONE, 'error'
            info = best[1]
            if info.find('span', class_='selected') is not None:
                tool.echo(self.keyword + ' already selected: ' + info.text)
                return ERROR.LEVEL2, self.parserData(res)
            tool.echo(self.keyword + ' preferred choice: ' + info.text)
            return ERROR.LEVEL3, self.parserData(
                self.polysemantAchive(info.a['href']))

    def polysemantAchive(self, url):
        code, msg, res = self.http.open(self.secondUrl + url)
        if code == 200:
            return res
        else:
            # raise Exception("network error")
            return 'network error'

    def parserData(self, res):
        keyword = self.keyword
        complexInfoList = []
        complexParamDict = {}
        simpleInfo = ''
        soup = bs(res, 'html.parser')
        # Grab the keyword's brief intro (first paragraph) and detailed intro.
        information = soup.findAll('div', class_='para')
        if information:
            simpleInfo += information[0].text
            for info in information[1:]:
                complexInfoList.append(info.text)
        # Secondary attributes from the basic-info box.
        complexParam = soup.find('div', class_='basic-info cmn-clearfix')
        if complexParam is not None:
            for line in complexParam.findAll('dl', class_='basicInfo-block'):
                for one in line.findAll('dt', class_='name'):
                    complexParamDict[str(one.string).strip()] = str(
                        one.find_next_sibling('dd', class_='value').string).strip()
        return self.instanceBean(
            [keyword, simpleInfo, complexInfoList, complexParamDict])

    def instanceBean(self, fields):
        oneBean = bean()
        oneBean.keyword = fields[0]
        oneBean.simpleInfo = fields[1]
        oneBean.complexInfoList = fields[2]
        oneBean.complexParamDict = fields[3]
        return oneBean

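# Usage sketch (hypothetical input): the constructor takes a ':'-separated
# string whose last segment is the keyword and whose leading segments are
# disambiguation words checked against candidate entries. ERROR.LEVEL1/2/3
# are the success codes returned by analyze above.
s = spider(u'山东:省长:张三')  # hypothetical limit words and name
code, result = s.start()
if code in (ERROR.LEVEL1, ERROR.LEVEL2, ERROR.LEVEL3):
    print result.simpleInfo          # short intro paragraph
    print result.complexParamDict    # basic-info key/value pairs
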
def start(self):
    http = Http()
    print '++++++++++++ cannon taking aim at the target'
    code, msg, res = http.open(self.baseUrl)
    if code == 200:
        print '++++++++++++ cannon fired'
        soup = bs(res, 'html.parser')
        provinceList = soup.find_all('h2', class_='red2')
        print '++++++++++++ trajectory locked, firing continuously'
        for province in provinceList:
            provinceName = province.text
            for line in province.find_next_sibling(
                    'div', class_='ld2 new_ld').find_all('li', class_=False):
                areaName = line.span.text
                base = line.find_all('a')
                secretaryBoss = base[1].string
                secretaryBossUrl = base[1]['href']
                govBoss = base[2].string
                govBossUrl = base[2]['href']
                partyBean = bean()
                partyBean.setProvince(provinceName)
                partyBean.setCity(areaName)
                partyBean.setName(secretaryBoss)
                if secretaryBoss != '空缺':  # '空缺' marks a vacant post
                    partyRes = county.resumeInfo(secretaryBossUrl, self.baseRoot)
                    if len(partyRes) != 0:
                        partyBean.setPosition(partyRes[0])
                        partyBean.setSex(partyRes[1])
                        partyBean.setBirth(partyRes[2])
                        partyBean.setNativePlace(partyRes[3])
                        partyBean.setEducation(partyRes[4])
                        partyBean.setResume(partyRes[5])
                govBean = bean()
                govBean.setProvince(provinceName)
                govBean.setCity(areaName)
                govBean.setName(govBoss)
                if govBoss != '空缺':
                    govRes = county.resumeInfo(govBossUrl, self.baseRoot)
                    if len(govRes) != 0:
                        govBean.setPosition(govRes[0])
                        govBean.setSex(govRes[1])
                        govBean.setBirth(govRes[2])
                        govBean.setNativePlace(govRes[3])
                        govBean.setEducation(govRes[4])
                        govBean.setResume(govRes[5])
                print ('province: ' + partyBean.province + ' city/district: ' + partyBean.city
                       + ' name: ' + partyBean.name + ' position: ' + partyBean.position
                       + ' sex: ' + partyBean.sex + ' birth: ' + partyBean.birth
                       + ' native place: ' + partyBean.nativePlace
                       + ' education: ' + partyBean.education)
                print ('province: ' + govBean.province + ' city/district: ' + govBean.city
                       + ' name: ' + govBean.name + ' position: ' + govBean.position
                       + ' sex: ' + govBean.sex + ' birth: ' + govBean.birth
                       + ' native place: ' + govBean.nativePlace
                       + ' education: ' + govBean.education)
                # '党委书记' = party committee secretary, '政府一把手' = head of
                # government; kept in Chinese since they go into the output file.
                partyList = [
                    partyBean.province, partyBean.city, partyBean.name, '党委书记',
                    partyBean.position, partyBean.sex, partyBean.birth,
                    partyBean.nativePlace, partyBean.education, partyBean.resume
                ]
                govList = [
                    govBean.province, govBean.city, govBean.name, '政府一把手',
                    govBean.position, govBean.sex, govBean.birth,
                    govBean.nativePlace, govBean.education, govBean.resume
                ]
                # self.flag = viceProvince.write2excel(self.flag, self.excelFileName, partyList)
                # self.flag = viceProvince.write2excel(self.flag, self.excelFileName, govList)
                # print '++++++++++ written to rows ' + str(self.flag - 2) + ' and ' + str(self.flag - 1) + ' of the xls file'
                common.write2Word(self.wordFileName, partyList)
                common.write2Word(self.wordFileName, govList)
                print "writing..."
    else:
        print 'network error'
        exit(1)

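# Refactoring sketch (not in the original code): the party and government
# branches above are identical except for the name/url pair, so both could
# share one helper; fillBean is a hypothetical name.
def fillBean(self, name, url, provinceName, areaName):
    b = bean()
    b.setProvince(provinceName)
    b.setCity(areaName)
    b.setName(name)
    if name != '空缺':  # skip resume lookup for vacant posts
        res = county.resumeInfo(url, self.baseRoot)
        if len(res) != 0:
            b.setPosition(res[0])
            b.setSex(res[1])
            b.setBirth(res[2])
            b.setNativePlace(res[3])
            b.setEducation(res[4])
            b.setResume(res[5])
    return b
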
def start(self):
    http = Http()
    code, msg, res = http.open(self.startUrl)
    print "crawl status: " + str(code)
    print "crawl message: " + msg
    self.parser(res)