コード例 #1
0
ファイル: buchong2.py プロジェクト: wagaman/dollop
    def start_requests(self):
        """Yield one form POST to ``self.start_url`` per shortlisted platform.

        Each primary key in ``self.shortlist`` is resolved through
        ``DaohangItem.get_object_by_pk``.  Objects whose ``plat_id`` is falsy
        are skipped; the others are stored in ``self.mapping`` under their
        ``plat_id`` and produce a ``scrapy.FormRequest`` whose form data
        (also passed as ``meta``) holds the fixed ``type`` / ``target1`` /
        ``target2`` entries plus ``wdzjPlatId``.
        """
        # NOTE(review): a single dict is reused and updated for every request;
        # this presumably relies on the request not keeping a live reference
        # to it -- confirm against the Scrapy version in use.
        body = {'type': '1', 'target1': self.target1, 'target2': self.target2}

        for pk in self.shortlist:
            platform = DaohangItem.get_object_by_pk(pk)
            plat_id = platform.plat_id
            if not plat_id:
                continue

            self.mapping[plat_id] = platform
            body['wdzjPlatId'] = str(plat_id)
            yield scrapy.FormRequest(
                self.start_url, formdata=body, meta=body, dont_filter=True)
コード例 #2
0
    def parse(self, response):
        """Build a ``TedianItem`` from a Wangjia platform feature page.

        The value stored in ``self.mapping`` for the pin extracted from
        ``response.url`` is logged and used to load the ``DaohangItem`` kept
        on ``self.object``.  Tag fields are filled only when the ``rTags``
        block is present; rating and impression fields only when the comment
        box is present.

        Raises:
            IndexError: if the comment box is present but holds fewer than
                four score spans or fewer than four non-score spans.
        """

        def first_text(node):
            # get_content() applied to the node's direct text children.
            return get_content(node.xpath('text()').extract())

        pin = self.get_pin_from_url(response.url)
        symbol = (self.mapping.get(pin), response.url)
        self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = TedianItem()
        item['name'] = self.object.name

        tags = response.xpath('//div[@class="rTags"]')
        if tags:
            status_texts = tags.xpath('./span[@class="tag3"]/text()').extract()
            item['status'] = get_content(status_texts)
            company_texts = tags.xpath(
                './span[@class="tag tag2"]/text()').extract()
            item['company_tag'] = get_content(company_texts)

            plain_texts = tags.xpath(
                './span[@class = "tag"]').xpath('text()').extract()
            item['illustration'] = '/'.join(
                [get_trunk(text) for text in plain_texts])

        comment_box = response.xpath(
            '//div[contains(@class,"box commentBox")]')
        if comment_box:
            ratings = comment_box.xpath('./dl[@class="comment"]')
            item['recommendation'] = get_content(
                ratings.xpath('./dt/span/text()').extract())

            # Numeric scores, addressed by position.
            numbers = ratings.xpath('./dd/span[@class="num"]')
            number_fields = ('withdraw_num', 'guard_num', 'service_num',
                             'experience_num')
            for position, field in enumerate(number_fields):
                item[field] = first_text(numbers[position])

            # Spans that are not score numbers, addressed by position.
            labels = ratings.xpath('.//span[not(@class="num")]')
            label_fields = ('withdraw_day', 'guard_day', 'service_status',
                            'experience_status')
            for position, field in enumerate(label_fields):
                item[field] = first_text(labels[position])

            impressions = comment_box.xpath(
                './dl[@class="impression"]/dd//span')
            impression_texts = impressions.xpath('text()').extract()
            item['impression'] = '\001'.join(
                [get_trunk(text) for text in impression_texts])

        return item
コード例 #3
0
    def start_requests(self):
        """Yield one form POST to ``self.start_url`` per shortlisted platform.

        Each primary key in ``self.shortlist`` is resolved through
        ``DaohangItem.get_object_by_pk``.  Objects with a falsy ``plat_id``
        are skipped; the others are stored in ``self.mapping`` under their
        ``plat_id`` and produce a ``scrapy.FormRequest`` whose form data and
        ``meta`` are both ``{'wdzjPlatId': str(plat_id)}``.
        """
        for pk in self.shortlist:
            platform = DaohangItem.get_object_by_pk(pk)
            plat_id = platform.plat_id
            if not plat_id:
                continue

            self.mapping[plat_id] = platform
            form = {'wdzjPlatId': str(plat_id)}
            yield scrapy.FormRequest(
                self.start_url, formdata=form, meta=form, dont_filter=True)
コード例 #4
0
ファイル: tedian.py プロジェクト: lwh1992/blotus
    def parse(self, response):
        """Build a ``TedianItem`` from a Wangjia platform feature page.

        The value stored in ``self.mapping`` for the pin extracted from
        ``response.url`` is logged and used to load the ``DaohangItem`` kept
        on ``self.object``.  Tag fields are filled only when the ``rTags``
        block is present; rating and impression fields only when the comment
        box is present.

        Raises:
            IndexError: if the comment box is present but holds fewer than
                four score spans or fewer than four non-score spans.
        """
        source_url = response.url
        symbol = (self.mapping.get(self.get_pin_from_url(source_url)), response.url)
        self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = TedianItem()
        item['name'] = self.object.name

        tag_box = response.xpath('//div[@class="rTags"]')
        if tag_box:
            item['status'] = get_content(tag_box.xpath('./span[@class="tag3"]/text()').extract())
            item['company_tag'] = get_content(tag_box.xpath('./span[@class="tag tag2"]/text()').extract())

            tag_texts = tag_box.xpath('./span[@class = "tag"]').xpath('text()').extract()
            item['illustration'] = '/'.join([get_trunk(text) for text in tag_texts])

        comment_box = response.xpath('//div[contains(@class,"box commentBox")]')
        if comment_box:
            ratings = comment_box.xpath('./dl[@class="comment"]')
            item['recommendation'] = get_content(ratings.xpath('./dt/span/text()').extract())

            # Numeric scores, addressed by position.
            numbers = ratings.xpath('./dd/span[@class="num"]')
            for pos, field in enumerate(['withdraw_num', 'guard_num', 'service_num', 'experience_num']):
                item[field] = get_content(numbers[pos].xpath('text()').extract())

            # Spans that are not score numbers, addressed by position.
            labels = ratings.xpath('.//span[not(@class="num")]')
            for pos, field in enumerate(['withdraw_day', 'guard_day', 'service_status', 'experience_status']):
                item[field] = get_content(labels[pos].xpath('text()').extract())

            impression_texts = comment_box.xpath('./dl[@class="impression"]/dd//span').xpath('text()').extract()
            item['impression'] = '\001'.join([get_trunk(text) for text in impression_texts])

        return item
コード例 #5
0
    def parse(self, response):
        """Collect ``DaohangItem`` entries from a navigation page or JSON feed.

        The response shape is chosen from the end of ``response.url``:

        * ends with ``html`` -- platforms are read from the ``platList``
          block, grouped under province headings;
        * ends with ``json`` -- the body is a JSON list of platform records;
        * anything else -- the body is a JSON list of groups, each with a
          ``city`` and a ``platList``; groups with a falsy ``city`` are
          skipped.

        Returns:
            A list of ``DaohangItem`` objects (possibly empty).
        """
        collected = []

        if response.url.endswith('html'):
            # Regular platforms only.  The "issuePlatList" block (problematic
            # platforms) has the same structure but is deliberately not
            # parsed for now; its links are invalid, so 'link' would have to
            # be left unset if it is ever enabled.
            regions = response.xpath(
                '//div[@id="platList"]/div[starts-with(@class, "rnav")]')
            for region in regions:
                heading = region.xpath(
                    'div[@class="til"]/div/p[not(@class="til_num")]/text()'
                ).extract()
                province_id = ProvinceItem.get_id_by_name(get_content(heading))

                for entry in region.xpath('ul[@class="til_cn"]/li'):
                    daohang = DaohangItem()

                    # The pin is the last non-empty path segment of @purl.
                    segments = get_content(
                        entry.xpath('a/@purl').extract()).split('/')
                    while not segments[-1]:
                        segments.pop()
                    daohang['pin'] = segments.pop()

                    daohang['name'] = get_content(
                        entry.xpath('a/text()').extract())
                    daohang['link'] = get_content(
                        entry.xpath('a/@href').extract())
                    daohang['province_id'] = province_id
                    collected.append(daohang)

            return collected

        payload = json.loads(response.body_as_unicode())

        if response.url.endswith('json'):
            flat_fields = (('pin', 'platPin'),
                           ('allPin', 'allPlatPin'),
                           ('name', 'platName'),
                           ('link', 'platUrl'))
            for record in payload:
                daohang = DaohangItem()
                for field, key in flat_fields:
                    daohang[field] = record.get(key, None)
                collected.append(daohang)
            return collected

        for group in payload:
            if not group.get('city'):
                continue

            province_id = ProvinceItem.get_id_by_name(group.get('city'))
            for platform in group.get('platList'):
                daohang = DaohangItem()
                daohang['pin'] = platform.get('platLetter', None)
                daohang['name'] = platform.get('platName', None)
                daohang['link'] = platform.get('platUrl', None)
                daohang['province_id'] = province_id
                daohang['launch_time'] = platform.get('onlineDateStr', None)
                daohang['icon_url'] = platform.get('platIconUrl', None)
                collected.append(daohang)

        return collected
コード例 #6
0
 def start_requests(self):
     """Yield one request per shortlisted platform.

     Each primary key in ``self.shortlist`` is resolved through
     ``DaohangItem.get_object_by_pk``; the object's ``id`` is recorded in
     ``self.mapping`` under its ``pin``, and the pin is substituted into
     ``self.start_formated_url`` as ``plat_pin`` to build the URL.
     """
     for pk in self.shortlist:
         platform = DaohangItem.get_object_by_pk(pk)
         pin = platform.pin
         self.mapping[pin] = platform.id
         target = self.start_formated_url.format(plat_pin=pin)
         yield self.make_requests_from_url(target)
コード例 #7
0
ファイル: dangan.py プロジェクト: lwh1992/blotus
    def parse(self, response):
        """Build a ``DanganItem`` from a Wangjia platform archive page.

        The platform id is looked up in ``self.mapping`` by the pin extracted
        from ``response.url`` and the matching ``DaohangItem`` is kept on
        ``self.object``.  Every page section (detail, about, company info,
        domain record, people, fees, contact, record list) is optional: a
        section that is missing from the page leaves its fields unset.
        ``log_empty_fields`` is called on the item before it is returned.

        Returns:
            The populated ``DanganItem``.

        Raises:
            IndexError: if a section is present but has fewer entries than
                the positions read from it.
        """
        #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = DanganItem()

        def fill(nodes, fields, start=0, query='text()', **options):
            # Store get_content() of nodes[start], nodes[start + 1], ... under
            # the given field names, in order.
            for offset, field in enumerate(fields):
                item[field] = get_content(nodes[start + offset].xpath(query).extract(), **options)

        item['name'] = self.object.name
        item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = get_content(detail[3].xpath('text()').extract())
            item['launch_time'] = get_content(detail[4].xpath('text()').extract())

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
        if info:
            fill(info, ('company_name', 'artificial_person', 'company_type', 'shareholder_stucture',
                        'registered_capital', 'contributed_capital', 'registered_address',
                        'opening_date', 'approved_date', 'registration_authority',
                        'business_licence', 'institutional_framework', 'tax_registration_num'))

        # Check the row count before indexing so that a page without the
        # domain-record table skips this section instead of raising IndexError.
        domain_rows = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        domain_cells = domain_rows[1].xpath('td') if len(domain_rows) > 1 else []
        if domain_cells:
            fill(domain_cells, ('domain_name', 'domain_date', 'domain_company_type',
                                'domain_company_name', 'icp'))

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([get_trunk(c) for c in person.xpath('p//text()').extract()])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # The first "costBd" box holds the fees and the second the contact
        # details; either may be absent, so check the count before indexing.
        cost_boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = cost_boxes[0].xpath('p') if len(cost_boxes) > 0 else []
        if cost:
            fill(cost, ('management_fee', 'prepaid_fee', 'cash_withdrawal_fee', 'vip_fee',
                        'transfer_fee', 'mode_of_payment'))

        contact = cost_boxes[1].xpath('p') if len(cost_boxes) > 1 else []
        if contact:
            fill(contact, ('contact_address', 'phone_400', 'phone', 'fax', 'email'))

        record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            # Entries 3..8 map to fixed fields; get_content() is asked to skip
            # the first text node of each entry.
            fill(record, ('is_automatic_bid', 'is_equitable_assignment', 'trust_fund',
                          'tender_security', 'security_mode', 'guarantee_institution'),
                 start=3, query='.//text()', skipFirst=True)
            # The tenth entry is not always present; the field is set to False
            # when it is missing.
            if len(record) >= 10:
                item['business_type'] = get_content(record[9].xpath('.//text()').extract(), skipFirst=True)
            else:
                item['business_type'] = False

        log_empty_fields(item, self.logger)
        return item
コード例 #8
0
ファイル: dangan.py プロジェクト: lwh1992/blotus
 def start_requests(self):
     """Yield one request per shortlisted platform.

     Each primary key in ``self.shortlist`` is resolved through
     ``DaohangItem.get_object_by_pk``; the object's ``id`` is recorded in
     ``self.mapping`` under its ``pin``, and the pin is substituted into
     ``self.start_formated_url`` as ``pin`` to build the URL.
     """
     for pk in self.shortlist:
         platform = DaohangItem.get_object_by_pk(pk)
         pin = platform.pin
         self.mapping[pin] = platform.id
         target = self.start_formated_url.format(pin=pin)
         yield self.make_requests_from_url(target)
コード例 #9
0
ファイル: dangan.py プロジェクト: michael1011101/blotus
    def parse(self, response):
        """Build a ``DanganItem`` from a Wangjia platform archive page.

        The platform id is looked up in ``self.mapping`` by the pin extracted
        from ``response.url`` and the matching ``DaohangItem`` is kept on
        ``self.object``.  Every page section (detail, about, company info,
        domain record, people, fees, contact, record list) is optional: a
        section that is missing from the page leaves its fields unset.
        ``log_empty_fields`` is called on the item before it is returned.

        Returns:
            The populated ``DanganItem``.

        Raises:
            IndexError: if a section is present but has fewer entries than
                the positions read from it.
        """
        #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
                  response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = DanganItem()

        def fill(nodes, fields, start=0, query='text()', **options):
            # Store get_content() of nodes[start], nodes[start + 1], ... under
            # the given field names, in order.
            for offset, field in enumerate(fields):
                item[field] = get_content(
                    nodes[start + offset].xpath(query).extract(), **options)

        item['name'] = self.object.name
        item['logo_url'] = get_content(
            response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = get_content(detail[3].xpath('text()').extract())
            item['launch_time'] = get_content(
                detail[4].xpath('text()').extract())

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join(
                [get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath(
            '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
        )
        if info:
            fill(info, (
                'company_name',
                'artificial_person',
                'company_type',
                'shareholder_stucture',
                'registered_capital',
                'contributed_capital',
                'registered_address',
                'opening_date',
                'approved_date',
                'registration_authority',
                'business_licence',
                'institutional_framework',
                'tax_registration_num',
            ))

        # Check the row count before indexing so that a page without the
        # domain-record table skips this section instead of raising
        # IndexError.
        domain_rows = response.xpath(
            '//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        if len(domain_rows) > 1:
            domain_cells = domain_rows[1].xpath('td')
        else:
            domain_cells = []
        if domain_cells:
            fill(domain_cells, (
                'domain_name',
                'domain_date',
                'domain_company_type',
                'domain_company_name',
                'icp',
            ))

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(
                    person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([
                    get_trunk(c)
                    for c in person.xpath('p//text()').extract()
                ])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # The first "costBd" box holds the fees and the second the contact
        # details; either may be absent, so check the count before indexing.
        cost_boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = cost_boxes[0].xpath('p') if len(cost_boxes) > 0 else []
        if cost:
            fill(cost, (
                'management_fee',
                'prepaid_fee',
                'cash_withdrawal_fee',
                'vip_fee',
                'transfer_fee',
                'mode_of_payment',
            ))

        contact = cost_boxes[1].xpath('p') if len(cost_boxes) > 1 else []
        if contact:
            fill(contact, (
                'contact_address',
                'phone_400',
                'phone',
                'fax',
                'email',
            ))

        record = response.xpath(
            '//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            # Entries 3..8 map to fixed fields; get_content() is asked to
            # skip the first text node of each entry.
            fill(record, (
                'is_automatic_bid',
                'is_equitable_assignment',
                'trust_fund',
                'tender_security',
                'security_mode',
                'guarantee_institution',
            ), start=3, query='.//text()', skipFirst=True)
            # The tenth entry is not always present; the field is set to
            # False when it is missing.
            if len(record) >= 10:
                item['business_type'] = get_content(
                    record[9].xpath('.//text()').extract(), skipFirst=True)
            else:
                item['business_type'] = False

        log_empty_fields(item, self.logger)
        return item