Example #1
import os
import time

def getData(movieUrl):
    start = time.time()
    http = Http()
    [code, msg, res] = http.open(movieUrl)
    # time.sleep(1)
    print 'the process pid is: %s, cost time is: %.2f' % (os.getpid(),
                                                           time.time() - start)
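Every snippet on this page unpacks a (code, msg, res) triple from the project's Http wrapper, whose source is not shown here. A minimal sketch of a stand-in with the same calling convention, assuming a urllib2 backend and an arbitrary 10-second timeout:

import urllib2

class Http(object):
    def open(self, url, **headers):
        # Assumed contract: return (status code, message, response body).
        request = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(request, timeout=10)
            return response.getcode(), 'OK', response.read()
        except urllib2.HTTPError as e:
            return e.code, e.msg, ''
        except urllib2.URLError as e:
            return -1, str(e.reason), ''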
Example #2
    def getSecond(self, rdict):
        http = Http()
        for key, value in rdict.items():
            print key + " " + value
            url = self.baseUrl + value
            code, msg, res = http.open(url)
            if code == 200:
                self.parserData(key, res)
Example #3
    def start(self):
        print '++++++++++++ provincial leader information +++++++++++++'
        http = Http()
        print '++++++++++++ the cannon is aiming at the enemy plane'
        code, msg, res = http.open(self.baseUrl)
        if code == 200:
            print '++++++++++++ cannon fired'
            soup = bs(res, 'html.parser')
            provinceList = soup.find_all('h2', class_='red2')
            print '++++++++++++ cannon locked the trajectory, firing continuously'
            for province in provinceList:
                provinceName = province.text
                for line in province.find_next_sibling(
                        'ul',
                        class_='clearfix ld new_ld2').find_all('dd',
                                                               class_=False):
                    area = provinceName
                    duty = line.span.text
                    name = line.a.string
                    url = line.a['href']
                    print area + ' ' + duty + ' ' + name + ' ' + url

                    officer = bean()
                    officer.setProvince(area)
                    officer.setName(name)
                    officer.setDuty(duty)

                    if name != '空缺':  # '空缺' means the post is vacant
                        govRes = county.resumeInfo(url, self.baseRoot)
                        if len(govRes) != 0:
                            officer.setPosition(govRes[0])
                            officer.setSex(govRes[1])
                            officer.setBirth(govRes[2])
                            officer.setNativePlace(govRes[3])
                            officer.setEducation(govRes[4])
                            officer.setResume(govRes[5])

                    officerList = [
                        officer.province, officer.name, officer.duty,
                        officer.position, officer.sex, officer.birth,
                        officer.nativePlace, officer.education, officer.resume
                    ]
                    # self.flag = self.writeToexcel(self.flag,self.excelFileName,officerList)
                    # print name + '  has been written to row ' + str(self.flag-1) + ' of ' + self.excelFileName
                    common.write2Word(self.wordFileName, officerList)
                    print name + "  has been written"
                print '-------------------------'

        else:
            print 'network error'
            exit(1)
Example #4
    def _getProxyIp2Pool(self):
        # Maintain a proxy pool, using Xici's transparent proxies
        hp = Http.Http()
        _headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Host": "www.xicidaili.com",
            "Referer": "http://www.xicidaili.com/nt/1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
        }
        url = "http://www.xicidaili.com/nt/"

        code, msg, page = hp.open(url, **_headers)
        if code != 200:
            print "can not get the ip"
            return False

        resList = self._parseDate(page)
        for r in resList:
            self.redisConn.sadd(self.proxyName, r)
        return True
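A rough sketch of how the Redis set filled by _getProxyIp2Pool() might be consumed later; redisConn and proxyName mirror the attribute names above, while the connection parameters and key name are assumptions:

import redis

redisConn = redis.StrictRedis(host='localhost', port=6379)
proxyName = 'proxyPool'  # assumed key name

# Pick a random proxy out of the pool populated by _getProxyIp2Pool().
proxy = redisConn.srandmember(proxyName)
if proxy is not None:
    print 'using proxy: ' + proxy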
Example #5
    def resumeInfo(url, baseUrl):
        resList = list()
        url = baseUrl + url
        http = Http()
        code, msg, res = http.open(url)
        if code != 200:
            return []
        soup = bs(res, "html.parser")
        position = soup.find('span', class_='red2').string
        if position is None:
            position = ''
        resList.append(position)
        infoBase = soup.find('dd').find('p')
        infoBaseParser = str(infoBase).split('\n')[:-1]
        for line in infoBaseParser:
            value = line.split('</b>')[1].split('<br>')[0]
            if value is None:
                value = ''
            resList.append(value)

        info = soup.find('div', class_='p2j_text')
        resList.append(str(info.text))
        return resList
Example #6
def testHttp():
    http = hp.Http()
    code, resMsg, page = http.open('http://www.zhihu.com',
                    Cookie = 'd_c0="AEAAHF7tUgqPTser6aQFFNucOrpy_pVS_nM=|1470143843"; _zap=e0aa3eeb-4a2f-469a-b0c0-213999c8fad8; q_c1=9b67681f980240caad14cf09153f8cf4|1472825844000|1470143841000; l_cap_id="MjAyZjk4OWRkNWViNDg3NDk0Y2EzZDY3Y2RjNzMwN2Q=|1473305517|5e44a389c8a8d1c492f8e28b6c33558684f9709b"; cap_id="NWVjNTlmODg0ZjA0NDNjNmFjZjczYzVkOGE1NDVlOWM=|1473305517|483c89cfd8b6a38c9dfa3e64ca36874c5b1fc7f6"; login="******"; __utmt=1; __utma=51854390.1478424689.1473601532.1474426191.1474436825.11; __utmb=51854390.3.9.1474436838115; __utmc=51854390; __utmz=51854390.1474436825.11.11.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/index.php; __utmv=51854390.100-1|2=registration_date=20131021=1^3=entry_date=20131021=1; a_t="2.0AAAAmHsfAAAXAAAA5KsJWAAAAJh7HwAAAEAAHF7tUgoXAAAAYQJVTcJo-FcAPd0Msxz2C-lOeeikr99GzbSowiV5bg9W3imFV1LlbeAWXvSVBCxarA=="; z_c0=Mi4wQUFBQW1Ic2ZBQUFBUUFBY1h1MVNDaGNBQUFCaEFsVk53bWo0VndBOTNReXpIUFlMNlU1NTZLU3YzMGJOdEtqQ0pR|1474436836|6bf62efd8b544ab5415eda5bd7a08201827a15ac')
    print code
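Note that header names containing a hyphen (e.g. User-Agent) cannot be written as bare keyword arguments the way Cookie is above, which is presumably why Example #4 unpacks a dict with **. A hypothetical equivalent call:

headers = {'User-Agent': 'Mozilla/5.0', 'Cookie': 'd_c0="..."'}  # placeholder values
code, resMsg, page = http.open('http://www.zhihu.com', **headers)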
Example #7
    def __init__(self, string):
        self.requestUrl = 'http://baike.baidu.com/search/word?word='
        self.http = Http()
        self.secondUrl = 'http://baike.baidu.com'
        self.searchWord(string)
Example #8
class spider(object):
    def __init__(self, string):
        self.requestUrl = 'http://baike.baidu.com/search/word?word='
        self.http = Http()
        self.secondUrl = 'http://baike.baidu.com'
        self.searchWord(string)

    def searchWord(self, string):
        splitList = string.split(':')
        self.keyword = splitList[-1]
        self.limitWord = splitList[:-1]

    def start(self):
        code, msg, res = self.http.open(self.requestUrl + self.keyword)
        if code == 200:
            try:
                result = self.analyze(res, self.limitWord)
                return result
            except Exception:
                return ERROR.PARSEERROR, 'parse error'
        else:
            return ERROR.NETWORK, 'network failure'

    def analyze(self, res, limitWord):
        self.soup = bs(res, 'html.parser')
        existFlag = self.soup.find('div', class_='no-result')
        if existFlag is not None:
            tool.echo(self.keyword + '  no related information found for this person')
            return ERROR.DATANONE, 'error'
        polysemant = self.soup.find('div', class_='polysemant-list')
        if polysemant is None:
            # Unambiguous entry; still needs to be verified
            tool.echo(self.keyword + ' is unambiguous')
            code, res = ERROR.LEVEL1, self.parserData(res)
            for i in limitWord:
                if i in res.simpleInfo:
                    return code, res
            return ERROR.DATANONE, 'error'
        else:
            best = [0, '']  # [highest weight, best-matching entry]
            for line in polysemant.findAll('li'):
                content = line.text
                weight = 0
                for oneLimit in limitWord:
                    if oneLimit in content:
                        weight += 1
                if best[0] < weight:
                    best[0] = weight
                    best[1] = line
            if best[0] == 0:
                tool.echo(self.keyword + '  no related information found for this person')
                return ERROR.DATANONE, 'error'

            info = best[1]
            if info.find('span', class_='selected') is not None:
                tool.echo(self.keyword + '  selected: ' + info.text)
                return ERROR.LEVEL2, self.parserData(res)

            tool.echo(self.keyword + '  preferring ' + info.text)
            return ERROR.LEVEL3, self.parserData(
                self.polysemantAchive(info.a['href']))

    def polysemantAchive(self, url):
        code, msg, res = self.http.open(self.secondUrl + url)
        if code == 200:
            return res
        else:
            # raise Exception("network error")
            return 'network error'

    def parserData(self, res):
        keyword = self.keyword
        complexInfoList = []
        complexParamDict = {}
        simpleInfo = ''
        soup = bs(res, 'html.parser')
        # Get the brief and the detailed introductions for the keyword
        information = soup.findAll('div', class_='para')
        if information:  # findAll returns a (possibly empty) list, never None
            simpleInfo += information[0].text
            for info in information[1:]:
                complexInfoList.append(info.text)

        # Some secondary key/value attributes
        complexParam = soup.find('div', class_='basic-info cmn-clearfix')
        if complexParam is not None:
            for line in complexParam.findAll('dl', class_='basicInfo-block'):
                for one in line.findAll('dt', class_='name'):
                    complexParamDict[str(one.string).strip()] = str(
                        one.find_next_sibling('dd',
                                              class_='value').string).strip()

        return self.instanceBean(
            [keyword, simpleInfo, complexInfoList, complexParamDict])

    def instanceBean(self, values):
        oneBean = bean()
        oneBean.keyword = values[0]
        oneBean.simpleInfo = values[1]
        oneBean.complexInfoList = values[2]
        oneBean.complexParamDict = values[3]
        return oneBean
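An assumed usage sketch for the spider class above; the input string packs the limit words and the keyword separated by ':' (placeholder values below), and the ERROR constants come from the project:

s = spider('Hebei:Baoding:some name')  # hypothetical "limitWord:...:keyword" input
code, result = s.start()
if code in (ERROR.LEVEL1, ERROR.LEVEL2, ERROR.LEVEL3):
    print result.simpleInfo  # a bean instance on success
else:
    print result             # an error message otherwise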
Example #9
    def start(self):
        http = Http()
        print '++++++++++++ the cannon is aiming at the enemy plane'
        code, msg, res = http.open(self.baseUrl)
        if code == 200:
            print '++++++++++++ cannon fired'
            soup = bs(res, 'html.parser')
            provinceList = soup.find_all('h2', class_='red2')
            print '++++++++++++ cannon locked the trajectory, firing continuously'
            for province in provinceList:
                provinceName = province.text
                for line in province.find_next_sibling(
                        'div', class_='ld2 new_ld').find_all('li',
                                                             class_=False):
                    areaName = line.span.text
                    base = line.find_all('a')
                    secretaryBoss = base[1].string
                    secretaryBossUrl = base[1]['href']
                    govBoss = base[2].string
                    govBossUrl = base[2]['href']

                    partyBean = bean()
                    partyBean.setProvince(provinceName)
                    partyBean.setCity(areaName)
                    partyBean.setName(secretaryBoss)
                    if secretaryBoss != '空缺':  # '空缺' means the post is vacant
                        partyRes = county.resumeInfo(secretaryBossUrl,
                                                     self.baseRoot)
                        if len(partyRes) != 0:
                            partyBean.setPosition(partyRes[0])
                            partyBean.setSex(partyRes[1])
                            partyBean.setBirth(partyRes[2])
                            partyBean.setNativePlace(partyRes[3])
                            partyBean.setEducation(partyRes[4])
                            partyBean.setResume(partyRes[5])

                    govBean = bean()
                    govBean.setProvince(provinceName)
                    govBean.setCity(areaName)
                    govBean.setName(govBoss)
                    if govBoss != '空缺':
                        govRes = county.resumeInfo(govBossUrl, self.baseRoot)
                        if len(govRes) != 0:
                            govBean.setPosition(govRes[0])
                            govBean.setSex(govRes[1])
                            govBean.setBirth(govRes[2])
                            govBean.setNativePlace(govRes[3])
                            govBean.setEducation(govRes[4])
                            govBean.setResume(govRes[5])
                    print 'province: ' + partyBean.province + '  city/district: ' + partyBean.city + '  name: ' + partyBean.name + '  position: ' + partyBean.position + '  sex: ' + partyBean.sex + '  birth: ' + partyBean.birth + '  native place: ' + partyBean.nativePlace + '  education: ' + partyBean.education
                    print 'province: ' + govBean.province + '  city/district: ' + govBean.city + '  name: ' + govBean.name + '  position: ' + govBean.position + '  sex: ' + govBean.sex + '  birth: ' + govBean.birth + '  native place: ' + govBean.nativePlace + '  education: ' + govBean.education

                    partyList = [
                        partyBean.province, partyBean.city, partyBean.name,
                        '党委书记', partyBean.position, partyBean.sex,
                        partyBean.birth, partyBean.nativePlace,
                        partyBean.education, partyBean.resume
                    ]
                    govList = [
                        govBean.province, govBean.city, govBean.name, '政府一把手',
                        govBean.position, govBean.sex, govBean.birth,
                        govBean.nativePlace, govBean.education, govBean.resume
                    ]
                    # self.flag = viceProvince.write2excel(self.flag, self.excelFileName, partyList)
                    # self.flag = viceProvince.write2excel(self.flag, self.excelFileName, govList)
                    # print '++++++++++ written to rows ' + str(self.flag-2) + ' and ' + str(self.flag-1) + ' of the xls file'
                    common.write2Word(self.wordFileName, partyList)
                    common.write2Word(self.wordFileName, govList)
                    print "正在写入"
        else:
            print 'network error'
            exit(1)
Example #10
    def start(self):
        http = Http()
        code, msg, res = http.open(self.startUrl)
        print "crawl status: " + str(code)
        print "crawl message: " + msg
        self.parser(res)