Example #1
    def not_person_info(self, response):
        now_person = json.loads(response.text)
        person_info = {
            'companyName': response.meta['company_name'],
            'licenseNum': response.meta['number'],
            'area': '江西省',
            'sex': ''
        }

        for i in now_person['rows']:
            # Person name
            person_info['name'] = i['name']
            # Contact phone
            person_info['tel'] = i['mobileNum']
            # ID card number
            person_info['idCard'] = i['idNumber']
            # Professional title / major
            if len(i['jobTitleCertInfo']) != 0:
                # Title level
                try:
                    person_info['grade'] = i['titleLevel']['name']
                except KeyError as e:
                    person_info['grade'] = ''
                # Title major
                person_info['major'] = i['jobTitleCertInfo'][0][
                    'specificTitleMajor']
                # Certificate number
                person_info['num'] = i['jobTitleCertInfo'][0][
                    'certificateNumber']
                # Issue date
                try:
                    # Validity period
                    c = time.localtime(
                        int(i['jobTitleCertInfo'][0]['issuedDt'] / 1000))
                    use_time = time.strftime("%Y-%m-%d", c)
                    use_time = str(use_time)
                    person_info['validTime'] = use_time
                except KeyError as e:
                    person_info['validTime'] = ''
                person_info['regNum'] = ''
                person_info['tokenKey'] = self.token

                print('非人员信息%s' % person_info)
                yield scrapy.FormRequest(
                    url=
                    'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                    formdata=person_info,
                    callback=self.person_zz,
                    meta={'company_name': response.meta['company_name']},
                    dont_filter=True)
            else:
                # Position
                if i['positionCertInfos']:
                    print('我是%s--非注册人员--公司是%s---' %
                          (i['name'], response.meta['company_name']))
                    try:
                        person_info['grade'] = i['positionCertInfos'][0][
                            'positionType']['name']
                    except IndexError as e:
                        person_info['grade'] = ''

                    # Certificate number
                    try:
                        person_info['num'] = i['positionCertInfos'][0][
                            'certificateNumber']
                    except IndexError as e:
                        person_info['num'] = ''
                    # Validity period
                    try:
                        # Validity period
                        c = time.localtime(
                            int(i['positionCertInfos'][0]['expiryDt'] / 1000))
                        use_time = time.strftime("%Y-%m-%d", c)
                        use_time = str(use_time)
                        person_info['validTime'] = use_time
                    except KeyError as e:
                        person_info['validTime'] = ''
                    person_info['regNum'] = ''
                    person_info['major'] = ''
                    person_info['tokenKey'] = self.token
                    print('非人员信息%s' % person_info)
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_info,
                        callback=self.person_zz,
                        meta={'company_name': response.meta['company_name']},
                        dont_filter=True)
                else:
                    person_info['grade'] = ''
                    person_info['major'] = ''
                    person_info['validTime'] = ''
                    person_info['num'] = ''
                    person_info['regNum'] = ''
                    person_info['tokenKey'] = self.token
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_info,
                        callback=self.person_zz,
                        meta={'company_name': response.meta['company_name']},
                        dont_filter=True)
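The timestamps handled above (issuedDt, expiryDt) are millisecond epoch values, which is why the code divides them by 1000 before calling time.localtime. A quick standalone illustration of that conversion; the sample value is made up:

import time

# Hypothetical millisecond epoch timestamp, as the API fields above appear to use.
issued_dt_ms = 1546300800000

# Divide by 1000 to get seconds, then format as YYYY-MM-DD.
valid_time = time.strftime('%Y-%m-%d', time.localtime(issued_dt_ms / 1000))
print(valid_time)  # 2019-01-01 in UTC; the local timezone may shift the date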
Example #2
 def start_requests(self):
     yield scrapy.FormRequest(self.url, formdata=self.formdata)
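When formdata is supplied and no method is given, scrapy.FormRequest switches the request method to POST and url-encodes the pairs into the request body. A minimal self-contained sketch of a spider built around this pattern; the URL and form fields are hypothetical placeholders, not values from the example above:

import scrapy


class FormPostSpider(scrapy.Spider):
    name = 'form_post_example'

    # Hypothetical endpoint and form fields, for illustration only.
    url = 'https://example.com/search'
    formdata = {'keyword': 'scrapy', 'page': '1'}  # formdata values should be strings

    def start_requests(self):
        # formdata without an explicit method => POST with an
        # application/x-www-form-urlencoded body.
        yield scrapy.FormRequest(self.url, formdata=self.formdata,
                                 callback=self.parse)

    def parse(self, response):
        self.logger.info('got %d bytes from %s', len(response.body), response.url)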
Example #3
    def sv11_tableau_cvi(self, response):
        self.log('sv11_tableau_cvi')

        if os.environ.get('PRODOUANE_DEBUG'):
            with open("debug/sv11_07_tableau.html", 'wb') as f:
                f.write(response.body)

        info = []
        nb_docs = 0
        if (response.css('.dr-table-headercell')):
            nb_docs = int(response.css('.dr-table-headercell').re(r'\((\d+)\)')[0])

        if (nb_docs):
            for tr in response.css('#formDeclaration tbody tr'):
                if (tr.css('td')[3].css('a::attr(id)')):
                    idhtml = tr.css('td')[3].css('a::attr(id)')[0].extract()
                    cvi = tr.css('td::text')[1].extract()
                    date = 'XXXX'
                    if (len(tr.css('td::text')[2].extract().split('/')) > 1):
                        date = tr.css('td::text')[2].extract().split('/')[2]
                    # download only if a CVI is specified
                    self.log("new line : {date: %s, cvi: %s, idhtml: %s}"% (date, cvi, idhtml))
                    info.append({'date': date, 'cvi': cvi, 'idhtml': idhtml})
                    if (not len(response.meta['cvi'])):
                        print("new cvi found : sv11 "+cvi)

        args = self.get_input_args(response, '#formFiltre')

        id = response.meta['id']

        if (not len(response.meta['cvi'])) :
            self.log('id %s : %d (%d)' % ('NO MORE CVI', id, len(info)) )
            if (len(info) == 30) and (nb_docs > (30 * (response.meta['page'] + 1))) :
                response.meta['page'] = response.meta['page'] + 1
                myargs = {
                        'javax.faces.ViewState' : args['javax.faces.ViewState'],
                        'formDeclaration:_link_hidden_':'',
                        'formDeclaration:listeDeclaration:scrollerId': '%d' % (response.meta['page'] + 1),
                        'formDeclaration_SUBMIT':"1",
                        'autoScroll':"0,0",
                        }
                response.meta['id'] = 0
                yield scrapy.FormRequest(url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf', formdata=myargs, callback=self.sv11_tableau_cvi, meta=response.meta)
            else:
                response.meta['page'] = 0
                response.meta['id'] = 0
                response.meta['commune'] = response.meta['commune'] + 1
                if (response.meta['nb_communes'] <= response.meta['commune']):
                    response.meta['departement'] = response.meta['departement'] + 1
                    response.meta['commune'] = 0
                if (response.meta['nb_departements'] > response.meta['departement']):
                    yield scrapy.FormRequest(url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf?commune=%d&dep=%d' % (response.meta['commune'], response.meta['departement']), callback=self.sv11_accueil, meta=response.meta)
        elif (len(info) > id):
            self.log('id %s : %d (%d)' % (info[id]['cvi'], id, len(info)) )
            i = info[id]
            myargs = {
                    'javax.faces.ViewState' : args['javax.faces.ViewState'],
                    'formDeclaration:_link_hidden_':'',
                    'formDeclaration:_idcl': i['idhtml'],
                    'formDeclaration_SUBMIT':"1",
                    'autoScroll':"0,0",
                    }

            response.meta['id'] = id
            response.meta['info'] = info

            yield scrapy.FormRequest(url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf', formdata=myargs, callback=self.sv11_html_sv11, meta=response.meta)
        else:
            self.log('no document found for %s' % response.meta['cvi'])
Example #4
 def start_requests(self):
     self.formdata['province'] = str(next(self.provinces))
     yield scrapy.FormRequest(url=self.query_url, formdata=self.formdata, callback=self.parse)
Example #5
 def start_requests(self):
     for url in self.start_urls:
         yield scrapy.FormRequest(url=url,
                                  cookies=self.cookies,
                                  callback=self.parse)
Example #6
 def start_requests(self):
     # start_requests cannot be used at the same time as the Rule rules
     return [scrapy.FormRequest('https://www.douban.com/accounts/login',
                                formdata={'form_email': 'email', 'form_password': '******'},
                                callback=self.loged_in)]
Example #7
    def parse_data(self, response):
        try:

            trs = response.xpath('//tr[contains(@bgcolor, "#ffffff")]')
            if len(trs) > 0:
                #open_in_browser(response)
                self.logger.info("fd = %s" % str(response.meta['fd']))
                for tr in trs:
                    trx = Selector(text=tr.extract())
                    row = trx.xpath('//td')

                    if len(row) == 0:
                        return

                    td2 = Selector(text=row[2].extract().replace(
                        '\r\n', '').replace('\t', ''))
                    itemType = td2.xpath('//b//text()').extract_first()

                    if itemType == '아파트':
                        auctionDate = re.findall('\d{4}\.\d{2}\.\d{2}',
                                                 td2.extract())
                        td3 = Selector(text=row[3].extract().replace(
                            '\r\n', '').replace('\t', ''))
                        addr = td3.extract()[td3.extract().index('<br>') +
                                             4:td3.extract().index('<!--')]
                        auctionLoc = re.findall('>(\S*계)', td3.extract())

                        id = re.findall("pop_detail\('(.+)',",
                                        td3.extract())[0]

                        self.logger.info("{},{},{}".format(id, addr, itemType))

                        fd2 = {"idcode": '{}'.format(id)}
                        yield scrapy.FormRequest(self.detail_url,
                                                 callback=self.parse_detail,
                                                 formdata=fd2)

                pages = re.findall("javascript:submit_chk\('(\d)'\);",
                                   response.text)
                if len(pages) > 0:
                    self.logger.info("paging = %s" % str(pages))
                    #open_in_browser(response)

                    fd = response.meta['fd']
                    nowPage = int(str(fd['nowPge']))
                    linkPage = int(str(pages[-1]))

                    if linkPage > nowPage:
                        fd['nowPge'] = str(nowPage + 1)
                        #self.logger.info("paging = %s -> %s" % (str(nowPage), fd['nowPge']))
                        r = scrapy.FormRequest(self.base_url,
                                               callback=self.parse_data,
                                               formdata=fd)
                        r.meta['fd'] = fd

                        yield r

            else:
                self.logger.info("blank")

        except Exception:
            open_in_browser(response)
Example #8
 def start_requests(self):
     return [
         scrapy.FormRequest("http://www.gewara.com/cinema/searchOpi.xhtml",
                            cookies={'citycode': '320100'},
                            callback=self.parse_movie)
     ]
Example #9
 def start_requests(self):
     url = 'https://gateway.chotot.com/v1/public/web-proxy-api/loadRegions'
     yield scrapy.FormRequest(url=url,
                              method='GET',
                              headers=self.headers_city,
                              callback=self.parse_district)
Example #10
    def parse(self, response):
        def default_item(xpath_value):
            try:
                return response.xpath(xpath_value).extract_first().strip()
            except AttributeError:
                return ''

        try:
            EVENTTARGET = default_item(
                '//a[@class="consultationSummaryBtn"]/@href').split("'")[1]
        except (TypeError, IndexError):
            EVENTTARGET = ''

        VIEWSTATEFIELDCOUNT = default_item(
            '//input[@id="__VIEWSTATEFIELDCOUNT"]/@value')
        VIEWSTATE = default_item('//input[@id="__VIEWSTATE"]/@value')
        VIEWSTATE1 = default_item('//input[@id="__VIEWSTATE1"]/@value')
        VIEWSTATEGENERATOR = default_item(
            '//input[@id="__VIEWSTATEGENERATOR"]/@value')
        SCROLLPOSITIONX = '0'
        SCROLLPOSITIONY = '0'
        EVENTVALIDATION = default_item(
            '//input[@id="__EVENTVALIDATION"]/@value')
        ctl22_ctl00_ddBoards = 'http://www.medicalboard.gov.au/'
        ctl22_ddBoards = ctl22_ctl00_ddBoards
        ItemId = default_item(
            '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnConsultationItemId"]/@value'
        )
        DetailedContent = default_item(
            '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnDetailedContent"]/@value'
        )
        Submissions = default_item(
            '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnSubmissions"]/@value'
        )

        yield scrapy.FormRequest(
            url=self.start_urls[0],
            formdata={
                '__EVENTTARGET':
                EVENTTARGET,
                '__VIEWSTATEFIELDCOUNT':
                VIEWSTATEFIELDCOUNT,
                '__VIEWSTATE':
                VIEWSTATE,
                '__VIEWSTATE1':
                VIEWSTATE1,
                '__VIEWSTATEGENERATOR':
                VIEWSTATEGENERATOR,
                '__SCROLLPOSITIONX':
                SCROLLPOSITIONX,
                '__SCROLLPOSITIONY':
                SCROLLPOSITIONY,
                '__EVENTVALIDATION':
                EVENTVALIDATION,
                'ctl22$ctl00$ddBoards':
                ctl22_ctl00_ddBoards,
                'ctl22$ddBoards':
                ctl22_ddBoards,
                'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnConsultationItemId':
                ItemId,
                'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnDetailedContent':
                DetailedContent,
                'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnSubmissions':
                Submissions,
                '__EVENTARGUMENT':
                '',
                '__LASTFOCUS':
                '',
                '__VIEWSTATEENCRYPTED':
                '',
                'ctl22$ctl00$ucSearch$txtSearch':
                '',
                'ctl22$ucSearch$txtSearch':
                '',
            },
            headers={
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate',
                'Accept-Language':
                'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
                'Cache-Control':
                'max-age=0',
                'Connection':
                'keep-alive',
                'Content-Type':
                'application/x-www-form-urlencoded',
                'Host':
                'www.medicalboard.gov.au',
                'Origin':
                'http://www.medicalboard.gov.au',
                'Referer':
                'http://www.medicalboard.gov.au/News/Past-Consultations.aspx',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
            },
            callback=self.parse_comments)
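Hand-collecting every hidden WebForms field, as the snippet above does, works but is verbose and brittle. Scrapy's FormRequest.from_response() can instead read the form from the downloaded page and pre-fill hidden inputs such as __VIEWSTATE and __EVENTVALIDATION, so only the fields that change need to be listed. A hedged sketch of that alternative; the __EVENTTARGET value is a placeholder, not taken from the real site:

import scrapy


class PostbackSpider(scrapy.Spider):
    name = 'postback_example'
    start_urls = ['http://www.medicalboard.gov.au/News/Past-Consultations.aspx']

    def parse(self, response):
        # from_response() copies the form's hidden inputs (__VIEWSTATE,
        # __EVENTVALIDATION, ...) automatically; only overrides are passed here.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': 'placeholder$control$id'},  # hypothetical value
            callback=self.parse_comments,
        )

    def parse_comments(self, response):
        self.logger.info('postback returned %d bytes', len(response.body))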
Example #11
 def start_requests(self):
     yield scrapy.FormRequest(self.get_url,
                              method='GET',
                              formdata=self.data,
                              callback=self.parse_list)
     pass
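With method='GET', FormRequest url-encodes formdata into the URL's query string rather than the body, which makes it a convenient way to build parameterised GET requests. A small sketch under assumed attribute names (get_url and data mirror the snippet above but are placeholders):

import scrapy


class GetQuerySpider(scrapy.Spider):
    name = 'get_query_example'

    # Hypothetical endpoint and query parameters.
    get_url = 'https://example.com/api/list'
    data = {'city': 'hanoi', 'limit': '20'}

    def start_requests(self):
        # For GET, formdata is url-encoded and attached as the query string,
        # e.g. https://example.com/api/list?city=hanoi&limit=20
        yield scrapy.FormRequest(self.get_url,
                                 method='GET',
                                 formdata=self.data,
                                 callback=self.parse_list)

    def parse_list(self, response):
        self.logger.info('final URL: %s', response.url)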
Example #12
 def start_requests(self):
     yield scrapy.FormRequest(self.url,
                              formdata=self.formdata,
                              headers=self.headers,
                              callback=self.parse)
Example #13
    def start_requests(self):
        """
        Default spider entry point (start method).
        :return:
        """
        # Arguments passed in when the crawl is launched:
        # start_time: start date
        # end_time: end date
        # start_page: start page (takes precedence over start_time)
        # end_page: end page (takes precedence over end_time)
        # stop_item: stop this crawl after [stop_item] consecutive duplicate items
        # spider_name: the spider_name to use; defaults to self.name if not given
        # command example:
        # nohup python3 -m scrapy crawl ccgp_guizhou_spider -a start_time="2019:01:01" -a end_time="2020:02:25" > /dev/null&
        # py -3 -m scrapy crawl base_spider -a start_time="now" -a end_time="now"
        # py -3 -m scrapy crawl base_spider -a start_time="now" -a end_time="now" -a start_page="700" -a end_page="1000" -a stop_item="10000"
        assert self.start_time is not None
        assert self.end_time is not None

        self.crawl_mode = CrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else CrawlMode.HISTORY

        if self.crawl_mode == CrawlMode.HISTORY:
            if (len(self.start_time) != 10 or len(self.end_time) != 10
                    or self.start_time[4] != ':' or self.end_time[4] != ':'):
                logging.error('Bad date format start_time:[{}] end_time:[{}]. Example: 2019:01:01'.format(
                    self.start_time, self.end_time))
                return
        else:
            # Use today's date
            _dt = datetime.fromtimestamp(time.time())
            self.start_time = _dt.strftime("%Y:%m:%d")
            self.end_time = self.start_time

        # Initialize self.crawl_helper
        self.init_crawl_helper()

        # Main configuration
        _source_info = {
            # Key for this page; must be unique
            'page_1': {
                # Usually goes into the 'source' field; sometimes also used for 'tos'
                'name': '贵州省政府采购网',

                # Base URL of the list page
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # Callback that handles the list page
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # Function that builds the next-page URL; must return a URL
                'get_next_page_url': self.get_normal_next_page_url,

                # Maximum number of pages on the site for this listing (optional; only an efficiency hint)
                'stop_page_num': 1000,

                # Stop this crawl after [stop_dup_item_num] consecutive duplicate items
                # Tip: use a large value for the first runs so the full history gets crawled
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # XPath that selects the item list on the list page
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # XPath that extracts each item's detail-page URL
                'xpath_of_detail_url': './a/@href',

                # Class that parses each item and returns a CommonRawItem; must be implemented
                'item_parse_class': BaseItemCommonParser,

                # Extra information used to fill CommonRawItem fields
                # See item_parser.get_common_raw_item() inside parse_list_page_common()
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '采购需求公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153332561072666',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_2': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '采购公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153418052184995',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_3': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '更正公告',
                'notice_type_code': '0204',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153454200156791',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_4': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '废标公告',
                'notice_type_code': '0204',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153488085289816',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_6': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '中标(成交)公告',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153531755759540',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_5': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '单一来源公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153567415242344',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_7': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '单一来源(成交)公告',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153595823404526',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },

            # City/county tender notices
            'page_8': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '单一来源(成交)公告',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153796890012888',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_9': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '采购需求公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153796890012888',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_10': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '采购公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153796890012888',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_11': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '采购公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153797950913584',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_12': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '更正公告',
                'notice_type_code': '0204',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153817836808214',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_13': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '废标公告',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153845808113747',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_14': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '中标(成交)公告',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153905922931045',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_15': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '单一来源公示',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153924595764135',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_16': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '单一来源(成交)公示',
                'notice_type_code': '0202',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1153937977184763',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
            'page_17': {
                # 通常会被填充在'source'字段里,有时也可以放在'tos'
                'name': '贵州省政府采购网',

                # list页面的base地址
                'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html',

                # list页面的call_back处理函数
                'callback': self.parse_list_page_common,

                'method': "post",

                'requests_type': "html",

                # 得到下一页url的函数,返回值一定是一个url
                'get_next_page_url': self.get_normal_next_page_url,

                # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填)
                'stop_page_num': 1000,

                # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取
                # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录
                'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60,

                # list页面中,获得条目列表的xpath
                'xpath_of_list': '//div[@class="xnrx"]/ul/li',

                # 获得每一个条目链接地址的xpath
                'xpath_of_detail_url': './a/@href',

                # 对每一个条目进行解析,返回CommonRawItem的类,需要实现
                'item_parse_class': BaseItemCommonParser,

                # 其它信息,可以辅助生成CommonRawItem的字段
                # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码
                'tos': '政府采购',
                'tos_code': '02',
                'source': '贵州省政府采购网',
                'notice_type': '资格预审公告',
                'notice_type_code': '0201',
                'site_name': '贵州省政府采购网',
                'area_code': '52',
                'content_code': '1',
                'industryName': '',
                'category.id': '1156071132710859',
                'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0,
            },
        }

        logging.info('start crawling...')

        # Iterate over every category
        for _k, _v in _source_info.items():

            # Record the basic crawl info
            self.crawl_helper.init_crawl_info(_k, _v)

            # Assume each category has at most 100000 pages
            for _page_num in range(100000):
                # Iterate over the different list pages of this notice type
                if self.crawl_helper.get_stop_flag(_k):
                    break

                # Build the next-page URL with the configured helper
                _request_url = _v['get_next_page_url'](page_index=_page_num, base_url=_v['base_url'])
                # _request = ""
                # Build the request
                if _v["method"] == "post":
                    _payload = {
                        'siteId': "1",
                        'category.id': _v["category.id"],
                        'areaName': "",
                        'tenderRocurementPm': "",
                        'keywords': "",
                        'articlePageNo': str(_page_num + 1),
                        'articlePageSize': "15"
                    }
                    _request = scrapy.FormRequest(url=_request_url, formdata=_payload, callback=_v['callback'])
                else:
                    _request = scrapy.Request(_request_url, callback=_v['callback'])

                # If JS rendering is needed, use the request below instead
                # _request = SplashRequest(_request_url, callback=_v['callback'], args={'wait': 2})

                # Attach the required meta parameters
                _request.meta['param'] = _v
                _request.meta['crawl_key'] = _k
                _request.meta['page_index'] = _page_num + 1

                yield _request

            # Finished crawling this category
            self.crawl_helper.stop_crawl_info(_k)

        logging.info('stop crawling...')
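The loop above delegates URL construction to _v['get_next_page_url'], called as get_next_page_url(page_index=..., base_url=...). A minimal sketch of what such a helper could look like, assuming plain query-string pagination; the articlePageNo parameter name is borrowed from the POST payload above, and the real helper may differ:

    def get_normal_next_page_url(self, page_index, base_url):
        # Hypothetical implementation: page_index is 0-based in the calling loop,
        # while the site expects a 1-based page number.
        return '%s?articlePageNo=%d' % (base_url, page_index + 1)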
Example #14
    def parse(self, response):

        re = response
        # print(response.text)

        # Parse the data
        # Order id
        order_id = re.xpath(
            '//*[@id="order-form"]/div[1]/div[2]/div[1]/div[1]/p/text()'
        ).extract()[0]
        # Tracking number
        tracking_number = re.xpath(
            '//*[@id="order-form"]/div[5]/div[2]/div[2]/div[1]/input/@value'
        ).extract()[0]
        # Order status
        order_st = re.xpath(
            '//*[@id="order-form"]/div[1]/div[2]/div[1]/div[2]/p/text()'
        ).extract()[0]
        # Mabang shipping time
        expresstime = re.xpath(
            '//*[@id="order-form"]/div[1]/div[2]/div[8]/div[2]/input/@value'
        ).extract()[0]
        # Mabang order id
        mb_orderid = re.xpath(
            '//*[@id="order-form"]/div[1]/div[1]/input[1]/@value').extract()[0]
        # print(mb_orderid)
        # print(order_st)

        if order_st == '已发货':
            # Data to pass along via meta
            mb_meta = {
                'order_id': order_id,
                'tracking_number': tracking_number,
                'order_st': order_st,
                'expresstime': expresstime,
                'mb_orderid': mb_orderid
            }
            # Fetch the SKUs of the merged order
            url = 'https://aamz.mabangerp.com/index.php?mod=order.findrelevantinfo'

            headers = {
                # "Accept": "application/json, text/javascript, */*; q=0.01",
                # "Accept-Encoding": "gzip, deflate, br",
                # "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
                # "Cache-Control": "no-cache",
                # "Connection": "keep-alive",
                # "Content-Type": "application/json; charset=UTF-8",

                # "Host": "aamz.mabangerp.com",
                # "Content-Length": "",
                # "X-Requested-With": "XMLHttpRequest",
                # "Referer": "https://aamz.mabangerp.com/index.php?mod=order.detail&platformOrderId=0O43LJNW&orderStatus=2&orderTable=2&tableBase=2&cMKey=MABANG_ERP_PRO_MEMBERINFO_LOGIN_191565&lang=cn",

                # Note: make sure there are no stray spaces in the User-Agent
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0",
            }

            # POST request data
            data1 = {'orderId': mb_orderid, 'type': '1', 'tableBase': '2'}
            # Cookies data
            cookies = "gr_user_id=493499a8-83fc-4e47-87c3-08b1ded6df3c; MULTI_LANGUAGE_TYPE=%2BYjZ6oacL7xJ%2FKOcmBg9Z7cTOqi7UgOUgujRs4KQ4Ms%3D; lang=cn; stock_show_product_data_cookie=ico-minus-circle; stock_data_js_cookie_is_change_weight=1; mabang_lite_rowsPerPage=500; stock_data_js_cookie_is_change_name=1; order_data_js_cookie_orderErrorbysVal=paidTime; order_data_js_cookie_orderErrorbydacname=orderByspaidTime; order_data_js_cookie_orderErrorbydacnameval=down; order_data_js_cookie_isSyn=2; employ_rows_per_page_data_cookie=50; order_data_js_cookie_isImmediately=1; signed=222014_00f6735cc675f0abb6f483d9913f72bf; PHPSESSID=gjgkl12ntct9knahgq66qtlks1; event_rember12_222014=0; CRAWL_KANDENG_KEY=K6uqW0ZkQEouz0n1adoI%2FWqfFs2PbJ8%2BCpQKvtnzAvWpTX174VXBmq5L9cDOSOj%2Bm2IcDf7pRauH34yzR4OEyw%3D%3D; loginLiteCookie=a%3A2%3A%7Bs%3A8%3A%22username%22%3Bs%3A6%3A%22222014%22%3Bs%3A9%3A%22passsword%22%3Bs%3A32%3A%22f1c7edfb07a416030a0f976bac902add%22%3B%7D"
            cookies = {
                i.split("=")[0]: i.split("=")[1]
                for i in cookies.split("; ")
            }

            # yield scrapy.Request(url=url, cookies=self.cookies, headers=headers, meta=mb_meta, callback=self.parse2)
            yield scrapy.FormRequest(url=url,
                                     cookies=cookies,
                                     formdata=data1,
                                     headers=headers,
                                     meta=mb_meta,
                                     callback=self.detail_parse)
        else:
            pass
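One fragile spot in the snippet above is the cookie parsing: splitting each pair on every '=' would truncate any cookie value that itself contains an '='. Limiting the split to the first '=' avoids that. A small standalone sketch; the helper name is made up for illustration:

def cookie_header_to_dict(raw_cookies):
    # Turn a raw 'name=value; name=value' cookie string into a dict.
    # Split only on the first '=' so values containing '=' stay intact.
    return {
        name: value
        for name, value in (pair.split('=', 1) for pair in raw_cookies.split('; '))
    }

# Example: cookie_header_to_dict('lang=cn; token=abc=123') == {'lang': 'cn', 'token': 'abc=123'}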
Example #15
 def start_requests(self):
     for i in range(1, self.page + 1):
         data = {
             "pageNum": "{}".format(i)
         }
         yield scrapy.FormRequest(self.start_urls[0].format(i), headers=self.headers, formdata=data)
Example #16
import os
Example #17
 def start_requests(self):
     request_url = 'https://maoyan.com/films'
     return [scrapy.FormRequest(request_url, callback=self.parse_movie)]
Example #18
    def parse_item_1(self, response):

        response_url = response.url
        print('1.response_url:', response_url)

        response_status = response.status
        print('1状态码为:', response_status)

        if response_status == 403:
            captcha = response.xpath("//img[@class='yzm-pic']/@src").extract()
            print('captcha', captcha)
            if len(captcha) > 0:
                # A captcha is present at this point
                # The captcha has to be entered manually
                print("正在保存验证码图片")

                captchapicfile = "/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/captcha1.png"
                # urlopen = urllib.URLopener()
                # Download the image bytes
                ssl._create_default_https_context = ssl._create_unverified_context

                with request.urlopen(captcha[0]) as fp:
                    data = fp.read()
                    # Truncate the file and write in binary mode
                    f = open(captchapicfile, 'wb')
                    f.write(data)
                    f.close()

            # captcha = input('第%s次遇到验证码,请处理验证码:\n' % (self.state_count))
            # print('已经输入验证码,继续抓取:')
            #
            power_key = ''.join(
                response.xpath(
                    '//div[@class="regform-box"]/form[@name="reform"]/input[1]/@value'
                ).extract())
            captcha = input('*********captcha请输入验证码:\n')
            con_value = ''.join(
                response.xpath(
                    '//div[@class="regform-box"]/form[@name="reform"]/input[2]/@value'
                ).extract())
            formdata = {
                'vgcode': captcha,
                'power_key': power_key,
                # 'servertype': '10',
                # 'requestmode': 'async',
                'continue': con_value
            }
            print('formdata', formdata)
            self.state_count = self.state_count + 1
            # Form fields taken from the page source, not from the network tab (that one does not work)
            # yield  FormRequest.from_response(response,
            #                            url='http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
            #                            meta={"cookiejar": response.meta["cookiejar"]},
            #                            headers=self.headers,
            #                            formdata=formdata,
            #                            callback=self.parse_page,
            #                            )
            yield scrapy.FormRequest(
                url=
                'http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
                headers=self.headers,
                formdata=formdata,
                # callback=self.parse_page
                callback=self.parse_item_1)

            # yield scrapy.FormRequest(
            #     url=url,
            #     formdata={"email": "xxx", "password": "******"},
            #     callback=self.parse_page
            # )
            # with open(r'/Users/ozintel/Tsl_exercise/znfw_crawer/lihun/data/law.html', 'wb') as f:
            #     f.write(response.body)
            # # very_code = input('Captcha encountered for the %sth time, please handle it:\n' % (self.state_count))
            # print('Captcha entered, resuming the crawl:')
        else:
            # with open(r'/Users/ozintel/Tsl_exercise/znfw_crawer/tutorial2/data/law.html','wb') as f:
            #     body=response.body
            #     print(body)
            #     f.write(body)
            item = DmozItem()  # defined in items.py; the populated item is handed to the pipeline
            links = response.xpath(
                '//ul[@class="list-main"]/li/div/a/@href').extract()
            print('Number of entries on this page:', len(links))
            for link in links:
                # print(link)
                yield response.follow(url=link, callback=self.parse_item_2)
Beispiel #19
0
 def start_requests(self):
     start_urls = 'http://cdfy.chinacourt.gov.cn/article/search/content_time_publish_begin/2002-01-01/content_time_publish_end/2030-03-03/article_category_id//content_author//keyword/%E4%B8%8D%E5%BF%98%E5%88%9D%E5%BF%83%E3%80%81%E7%89%A2%E8%AE%B0%E4%BD%BF%E5%91%BD/button/%E6%8F%90%E4%BA%A4/page/1.shtml'
     yield scrapy.FormRequest(start_urls, callback=self.parse, headers=self.Headers, cookies=self.cookies)
Beispiel #20
0
    def parse(self, response):
        response_status = response.status
        print('0. status code:', response_status)
        response_url = response.url
        print('0.response_url:', response_url)

        if response_status == 403:
            captcha = response.xpath("//img[@class='yzm-pic']/@src").extract()
            print('captcha', captcha)
            if len(captcha) > 0:
                # A captcha is present; it has to be solved manually.
                print("Saving the captcha image")

                captchapicfile = "/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/captcha.png"
                # Download the image stream (certificate verification disabled).
                ssl._create_default_https_context = ssl._create_unverified_context

                with request.urlopen(captcha[0]) as fp:
                    data = fp.read()
                    # Overwrite the file in binary mode.
                    with open(captchapicfile, 'wb') as f:
                        f.write(data)

            print('Writing response.body to disk', response.body)
            with open(
                    r'/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/A_1.html',
                    'wb') as f:
                f.write(response.body)

            # Pull the hidden form fields from the page, then ask the operator
            # to type in the captcha.
            power_key = ''.join(
                response.xpath(
                    '//div[@class="regform-box"]/form[@name="reform"]/input[1]/@value'
                ).extract())
            captcha = input('*********Please enter the captcha:\n')
            con_value = ''.join(
                response.xpath(
                    '//div[@class="regform-box"]/form[@name="reform"]/input[2]/@value'
                ).extract())
            formdata = {
                'vgcode': captcha,
                'power_key': power_key,
                # 'servertype': '10',
                # 'requestmode': 'async',
                'continue': con_value
            }
            print('formdata', formdata)
            self.state_count = self.state_count + 1
            # The form fields are taken from the page source, not from the browser's Network tab (those values do not work).
            # yield  FormRequest.from_response(response,
            #                            url='http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
            #                            meta={"cookiejar": response.meta["cookiejar"]},
            #                            headers=self.headers,
            #                            formdata=formdata,
            #                            callback=self.parse_page,
            #                            )
            yield scrapy.FormRequest(
                url=
                'http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
                headers=self.headers,
                formdata=formdata,
                # callback=self.parse_page
                callback=self.parse)

            # time.sleep(5)
        else:

            # self.item_count = response.xpath('//div[@class="paging paging-a"]//span/text()').re('\d+')[0]
            # self.all = int(self.item_count) + self.all
            # print('********************', self.item_count, self.all)

            # uri_list=response.xpath('//div[@class="list-block h100p o-h"]//dd//a/@href').extract()

            # This part of the response is always present.
            uri_list = response.xpath(
                '(//div[@class="paging paging-a"]/a/@href)[position()<last()]'
            ).extract()
            count_link = len(uri_list)
            print('Number of page links on this page:', count_link)

            # if count_link==0 and response_status!=403:
            #     print('hahaha')
            #     raise CloseSpider()
            # if count_link!=0 or response_status==403:
            url_first = re.sub(r'http://www.lawtime.cn', '', response_url)
            uri_list.append(url_first)
            for uri in uri_list:
                uri_1 = 'http://www.lawtime.cn/' + uri
                # print(uri_1)
                yield response.follow(url=uri_1, callback=self.parse_item_1)
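        # Advance to the next (year, month) archive page; months below 10 are
        # zero-padded to match the lawtime.cn URL scheme, and the crawl stops
        # after November 2017.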

        self.month_1 = self.month_1 + 1
        if self.month_1 == 13:
            self.year_1 = self.year_1 + 1
            self.month_1 = 1
            m = '0' + str(self.month_1)
            ym = (self.year_1, m)
            # print(self.year_1, m)
        else:
            if self.month_1 <= 9:
                m = '0' + str(self.month_1)
                ym = (self.year_1, m)
                # print(self.year_1, m)
            else:
                ym = (self.year_1, self.month_1)
                # print(year_1, month_1)

        if self.year_1 == 2017 and self.month_1 > 11:
            raise CloseSpider()
            # print('ym',ym)
        uri_2 = "http://www.lawtime.cn/ask/browse_s91_d%s%s.html" % ym
        # http: // www.lawtime.cn / ask / browse_s4_p13.html
        # uri_2="http://www.lawtime.cn/ask/browse_s4_p%s.html" %(self.page)
        # "http://www.lawtime.cn/ask/browse_s4_p1.html"

        # item = DmozItem()  # defined in items.py; the item is handed to the pipeline **********
        # item['type'] = '%s' % page   # ***********
        # yield item

        # yield scrapy.Request(url=uri_2,meta={"cookiejar":1}, callback=self.parse)
        yield scrapy.Request(url=uri_2, callback=self.parse)
Beispiel #21
0
 def start_requests(self):
     return [
         scrapy.FormRequest("http://txdai.com/", headers=self.user_agent)
     ]
Beispiel #22
0
 def start_requests(self):
     yield scrapy.FormRequest(
         url=self.endpoint, method='GET', formdata=self.query, callback=self.parse
     )
Beispiel #23
0
    def start_requests(self):
        ### TODO: figure out what to do about dates
        DATES = '2017_12_09_2017_12_10'
        URL = "https://www.tripadvisor.com/Hotels"

        headers = {
            'Accept':
            'text/javascript, text/html, application/xml, text/xml, */*',
            'Accept-Encoding': 'gzip,deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
            'Host': 'www.tripadvisor.com',
            'Pragma': 'no-cache',
            'Referer': '',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36.',
            'X-Requested-With': 'XMLHttpRequest'
        }

        form_data = {
            'adults': '2',
            'dateBumped': 'NONE',
            'displayedSortOrder': 'popularity',
            'geo': '',
            'hs': '',
            'isFirstPageLoad': 'false',
            'rad': '0',
            'refineForm': 'true',
            'requestingServlet': 'Hotels',
            'rooms': '1',
            'scid': 'null_coupon',
            'searchAll': 'false',
            'seen': '150',
            'sequence': '7',
            'o': "0",
            'staydates': DATES
        }

        cookies = {"SetCurrency": "USD"}

        # read from the necessary intermediate URLs
        with open("intermediate/urls.csv") as f:
            reader = csv.reader(f)
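            # Each row holds a relative city URL and its TripAdvisor geo id;
            # the Referer header and the 'geo' form field are set per request.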
            for line in reader:
                url = urljoin('http://www.tripadvisor.com', line[0])
                geo = line[1]
                headers['Referer'] = url
                form_data['geo'] = geo

                yield scrapy.FormRequest(url=URL,
                                         method='POST',
                                         formdata=form_data,
                                         cookies=cookies,
                                         headers=headers,
                                         callback=self.parse,
                                         meta={
                                             'seen': '0',
                                             'url': url
                                         })
Beispiel #24
0
 def start_requests(self):
     yield scrapy.FormRequest(self.start_urls[0],
                              formdata=self.form_data,
                              callback=self.homepage_parse)
Beispiel #25
0
    def parse(self, response):
        print("## Beginning to parse ", self.crtPage, " page")

        crtTr = 0
        item = BacItem()
        self.errors = 0

        # Each record in the results table spans 31 consecutive <td> cells;
        # the cell's position within the record determines the item field.
        FIELDS = (
            'nr', 'nume', 'posIerarhieJudet', 'posIerarhieTara',
            'unitInvatamant', 'judet', 'promotieAnterioara', 'formaEducatie',
            'specializare', 'examenOralRomana', 'notaScrisaRomana',
            'notaContestatieRomana', 'notaFinalaRomana', 'limbaMaterna',
            'limbaModerna', 'notaLimbaModerna', 'disciplinaObligatorie',
            'disciplinaAlegere', 'competenteDigitale', 'medie',
            'rezultatFinal', 'competenteMaterna', 'notaScrisaMaterna',
            'notaContestatieMaterna', 'notaFinalaMaterna',
            'notaDisciplinaObligatorie',
            'notaContestatieDisciplinaObligatorie',
            'notaFinalaDisciplinaObligatorie', 'notaDisciplinaAlegere',
            'notaContestatieDisciplinaAlegere', 'notaFinalaDisciplinaAlegere')

        if self.started:
            for tr in response.xpath(
                    '(//table[@class="mainTable"]/tr/td[@class="tdBac"])'):
                item[FIELDS[crtTr % 31]] = BeautifulSoup(
                    tr.extract().encode("utf-8")).get_text()

                # When a new record begins (and it is not the first), emit the
                # item accumulated so far.
                if crtTr % 31 == 0 and crtTr != 0:
                    yield item

                crtTr += 1

        if self.started:
            print("## Parsing page ", self.crtPage, " ended")

        #Go to the next page
        if not self.started:
            self.crtPage = self.startPage
            self.started = True
        else:
            self.crtPage += 1

        if (self.crtPage <= self.endPage):
            while True:
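                # The results page is an ASP.NET WebForm: the hidden
                # __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION
                # fields must be captured and posted back together with the
                # page selector to load the next page of results.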
                print("## Trying to jump to next page: ", self.crtPage)

                try:
                    view_state = response.css(
                        'input#__VIEWSTATE::attr(value)').extract_first()
                    view_generator = response.css(
                        'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
                    event_validation = response.css(
                        'input#__EVENTVALIDATION::attr(value)').extract_first()
                    if (view_state is not None and view_generator is not None
                            and event_validation is not None):
                        self._viewState = view_state
                        self._viewGenerator = view_generator
                        self._eventValidation = event_validation

                    yield scrapy.FormRequest(
                        'http://bacalaureat.edu.ro/Pages/TaraRezultAlfa.aspx',
                        formdata={
                            'ctl00$ContentPlaceHolderBody$DropDownList2':
                            str(self.crtPage),
                            '__VIEWSTATE':
                            self._viewState,
                            '__VIEWSTATEGENERATOR':
                            self._viewGenerator,
                            '__EVENTVALIDATION':
                            self._eventValidation
                        },
                        callback=self.parse,
                        dont_filter=True)
                    break
                except Exception as e:
                    print("Error when loading page ", self.crtPage, e)
                    self.errors += 1
                    if self.errors >= 2:
                        exit(0)

                    time.sleep(25)
Beispiel #26
0
    def parse(self, response):
        url_done = []
        urls = [
            'https://www.amazon.com/Haier-HC17SF15RB-Refrigerator-Freezer-Qualified/dp/B00N142GLI?_encoding=UTF8&psc=1'
        ]

        # url = 'http://www.upcbarcodes.com/wp-admin/admin-ajax.php'
        for seller_url in urls:
            if seller_url not in url_done:
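                # Change the delivery address (ZIP 60629) through Amazon's
                # address-change AJAX endpoint; the response is handled in
                # self.getupc.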

                requsturl = scrapy.FormRequest(
                    'https://www.amazon.com/gp/delivery/ajax/address-change.html',
                    headers={
                        'Origin': 'https://www.amazon.com',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.9',
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                        'Content-Type':
                        'application/x-www-form-urlencoded;charset=UTF-8',
                        'Accept': 'text/html,*/*',
                        'Referer': 'https://www.amazon.com/',
                        'X-Requested-With': 'XMLHttpRequest',
                        'Connection': 'keep-alive'
                    },
                    formdata={
                        'zipCode': '60629',
                        'locationType': 'LOCATION_INPUT',
                        'deviceType': 'web',
                        'pageType': 'Detail'
                    },
                    callback=self.getupc,
                    method="POST")
                #print requsturl.body
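                # Attach pre-captured Amazon session cookies; Scrapy's
                # CookiesMiddleware reads request.cookies when the request is
                # downloaded.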

                requsturl.cookies = {
                    'aws-target-static-id':
                    '1452239187641-159842',
                    'aws-business-metrics-last-visit':
                    '1460349972900',
                    '__utmv':
                    '194891197.%22QDSe8l%404pyTIl%3FpKm5C24aEFXeBtLGw3BhDIGikRUeXlFGLshyp4Dtw4gLRG%3F9cU%22',
                    'aws-userInfo':
                    '%7B%22arn%22%3A%22arn%3Aaws%3Aiam%3A%3A111320495319%3Aroot%22%2C%22alias%22%3A%22%22%2C%22username%22%3A%22rahul%2520bhaskar%22%2C%22keybase%22%3A%22%22%2C%22issuer%22%3A%22https%3A%2F%2Fwww.amazon.com%2Fap%2Fsignin%22%7D',
                    's_pers':
                    '%20s_vnum%3D1880352564702%2526vn%253D3%7C1880352564702%3B%20s_invisit%3Dtrue%7C1470309348410%3B%20s_nr%3D1470307548416-Repeat%7C1478083548416%3B',
                    '__utma':
                    '194891197.372951375.1452236845.1470290622.1470307550.17',
                    'aws-ubid-main':
                    '182-7331780-4611541',
                    '_mkto_trk':
                    'id:112-TZM-766&token:_mch-aws.amazon.com-1484112318467-15791',
                    'x-amz-captcha-1':
                    '1494506850389641',
                    'x-amz-captcha-2':
                    '7TNj5/ZBUiQE8Q7M1TGIGw==',
                    'aws-target-visitor-id':
                    '1452239187643-451048.20_19',
                    'aws-target-data':
                    '%7B%22support%22%3A%221%22%7D',
                    's_fid':
                    '70673D38D5DFE123-1B689FC000FE2EFF',
                    's_vn':
                    '1515648318007%26vn%3D7',
                    'regStatus':
                    'registered',
                    'x-wl-uid':
                    '1YbHUe7z4Q16UzYOKnza7nF0Z8c60AUse7MqEp+CAv+wdJamSRB88EpQjCOb5Xsg9wS/EFz0+hhSbAl3qbMeh7dWiD1jtJRDs/6R5VxAFk6LzV16+6hZ0Cz+uIpt9TzsXS7IGe2aDx3Q=',
                    'sst-main':
                    'Sst1|PQGX_RwjQAxLFI_BwdTV0Q4UCL8-RIlysfyKrjYoFGe3oqm9lnuttlbX-lGX4weSExupeA7cYB3Zb0CSGU91LcK9xa8Av4IeMWfbcMKAV4AXqvCSM7S-SXJJpEWQhn0AsaJNc4wwxVVQzrZRhD4jVmdocyJewDAfSRGF1SSTgg_cvNGYGZx8-WqW1z-bekrkDEc-ZrMz9f9Ii077rpcz7Q0tBrE5xr2htKXdWZUzmT4ZSBqkJ9NlatkaEU7sYxBuyl0LadTT6wmYRPPfHnJzSQYdUQ',
                    'ubid-main':
                    '156-9680828-0484351',
                    'ca':
                    'ALAAAAAAEAAAAAQGAAEIAUQ=',
                    's_vnum':
                    '1926421318258%26vn%3D2',
                    's_nr':
                    '1514281824850-New',
                    's_dslv':
                    '1514281824851',
                    'session-id':
                    '144-3935774-8062208',
                    'session-token':
                    '"n2d7o5bJUB480T+okCcD+Qgte5eb6+XVoWrh4WzA8cPLcyI8v4G8hDqqoR2uWyzLBg4ETAaFwIQ6lGxkm9Hx8EmSQMVq4In0q2pXM0KD/1jNBUtqnPJf5WRZb/xRJGL2mIv58UxYLpLX0e1wf6XYjtrHfPcAOONchcbeZIpAXZOil1fCyrFDBgE3AmUSvlFNadxFHlRhG6IUrSJ/W7TAEw=="',
                    'session-id-time':
                    '2082787201l',
                    'csm-hit':
                    '%7B%22tb%22%3A%227099AAE54J4R4DQXDH89%2Bs-7099AAE54J4R4DQXDH89%7C1515471521353%22%2C%22adb%22%3A%22adblk_no%22%7D'
                }
                time.sleep(1)
                yield requsturl
Beispiel #27
0
    def parse_company_list(self, response):
        for company_item in response.xpath('//div[@class="jie_nei"]/ul/li/a'):
            info = json.loads(
                company_item.xpath('@onclick').re_first(
                    r'company0(.*?);').replace('(',
                                               '[').replace(')', ']').replace(
                                                   '\'', '\"'))
            company_code, info_no, zj = info
            column_id = response.meta['columnid']
            company = ExposureCompanyItem.get_company(column_id, company_code,
                                                      info_no)

            # Request both the current and the historical lists of cooperating
            # intermediaries (ZJ) and cooperating third parties (Second); all
            # four endpoints take the same form payload.
            for list_url in (
                    'http://icid.iachina.cn/ICID/front/viewAllZJ.do',
                    'http://icid.iachina.cn/ICID/front/viewAllZJHis.do',
                    'http://icid.iachina.cn/ICID/front/viewAllSecond.do',
                    'http://icid.iachina.cn/ICID/front/viewAllSecondHis.do'):
                yield scrapy.FormRequest(
                    url=list_url,
                    method='POST',
                    formdata={
                        'columnid': column_id,
                        'internetInformationNo': info_no,
                        'informationno': info_no,
                        'zj': zj
                    },
                    meta={'company': company},
                    callback=self.parse_cooperation_list,
                    dont_filter=True)
Beispiel #28
0
 def start_requests(self):
     url = 'http://www.sxt.cn/index/login/login.html'
     formdata = {"user": "******", "password": "******"}
     yield scrapy.FormRequest(url, formdata=formdata, callback=self.parse)
Beispiel #29
0
 def parse(self, response):
     lastpage = int(response.xpath("//select[@id='Page']/option[last()]/@value")[0].extract())
     for page in range(1, lastpage + 1):
         yield scrapy.FormRequest(response.url, callback=self.parse_songs,  formdata={'Page': str(page)})
Beispiel #30
0
    def person_info(self, response):
        now_person = json.loads(response.text)
        for n in now_person['rows']:
            person_info = {
                'companyName': response.meta['company_name'],
                'licenseNum': response.meta['number'],
                'area': '江西省',
                'sex': '',
                'idCard': '',
                'major': '',
                'phone': '',
                'tokenKey': self.token,
                'name': n['name']
            }
            # Person name
            # print(n['name'])
            print('Person %s ---- company %s' % (n['name'], response.meta['company_name']))

            # Certificate number
            try:
                person_info['num'] = n['registrationInfo'][0][
                    'regCertificateNumber']
            except KeyError as e:
                person_info['num'] = ''

            # Registration category
            person_info['grade'] = n['registrationInfo'][0]['registerType'][
                'name']
            # Registered specialty
            try:
                person_info['major'] = n['registrationInfo'][0][
                    'qualificationRegMajors'][0]['name']
            except KeyError as e:
                person_info['major'] = ''

            # Practice seal number
            print(n['registrationInfo'][0]['qualificationCertNumber'])
            person_info['regNum'] = n['registrationInfo'][0][
                'qualificationCertNumber']

            # Issuing authority --- to be added

            # Certificate validity period
            try:
                print(n['registrationInfo'][0]['registrationDt'])
                c = time.localtime(
                    int(n['registrationInfo'][0]['registrationDt'] / 1000))
                use_time = time.strftime("%Y-%m-%d", c)
                use_time = str(use_time)
                person_info['validTime'] = use_time
            except KeyError as e:
                person_info['validTime'] = ''

            # print(person_info)
            print('Registered person info %s' % person_info)
            yield scrapy.FormRequest(
                url=
                'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                formdata=person_info,
                callback=self.person_zz,
                meta={'company_name': response.meta['company_name']},
                dont_filter=True)