Beispiel #1
0
    def start_requests(self):
        """Yield one form request per shortlisted platform that has a ``plat_id``.

        Each primary key in ``self.shortlist`` is resolved through
        ``DaohangItem.get_object_by_pk``.  Entries whose ``plat_id`` is falsy
        are skipped; for the others the object is remembered in
        ``self.mapping`` under its ``plat_id`` and a ``scrapy.FormRequest`` to
        ``self.start_url`` is yielded.
        """
        # A single dict is reused for every request; only 'wdzjPlatId' is
        # overwritten from one iteration to the next.
        form = {'type': '1', 'target1': self.target1, 'target2': self.target2}
        for pk in self.shortlist:
            platform = DaohangItem.get_object_by_pk(pk)
            wdzj_id = platform.plat_id
            if not wdzj_id:
                continue

            self.mapping[wdzj_id] = platform
            form['wdzjPlatId'] = str(wdzj_id)
            # The same dict is sent as the form payload and as request meta.
            yield scrapy.FormRequest(self.start_url,
                                     formdata=form,
                                     meta=form,
                                     dont_filter=True)
Beispiel #2
0
    def parse(self, response):
        """Extract a ``TedianItem`` from ``response``.

        The key for the page is looked up in ``self.mapping`` via
        ``self.get_pin_from_url(response.url)``; the matching ``DaohangItem``
        is stored on ``self.object`` and supplies the item's ``name``.  The
        tag block (``div.rTags``) and the comment block (``commentBox``) are
        each read only when present on the page.

        Returns the populated item.
        """
        pk = self.mapping.get(self.get_pin_from_url(response.url))
        self.logger.info(
            'Parsing ID.%d Wangjia Feature From <%s>.' % (pk, response.url))
        self.object = DaohangItem.get_object_by_pk(pk)

        item = TedianItem()
        item['name'] = self.object.name

        tags = response.xpath('//div[@class="rTags"]')
        if tags:
            tag_queries = (
                ('status', './span[@class="tag3"]/text()'),
                ('company_tag', './span[@class="tag tag2"]/text()'),
            )
            for field, query in tag_queries:
                item[field] = get_content(tags.xpath(query).extract())

            plain_tags = tags.xpath('./span[@class = "tag"]')
            tag_texts = plain_tags.xpath('text()').extract()
            item['illustration'] = '/'.join(
                get_trunk(text) for text in tag_texts)

        comment_box = response.xpath(
            '//div[contains(@class,"box commentBox")]')
        if comment_box:
            ratings = comment_box.xpath('./dl[@class="comment"]')
            item['recommendation'] = get_content(
                ratings.xpath('./dt/span/text()').extract())

            # Numeric scores: the first four "num" spans, by position.
            numbers = ratings.xpath('./dd/span[@class="num"]')
            number_fields = ('withdraw_num', 'guard_num', 'service_num',
                             'experience_num')
            for pos, field in enumerate(number_fields):
                item[field] = get_content(
                    numbers[pos].xpath('text()').extract())

            # Remaining spans (everything not classed "num"), by position.
            labels = ratings.xpath('.//span[not(@class="num")]')
            label_fields = ('withdraw_day', 'guard_day', 'service_status',
                            'experience_status')
            for pos, field in enumerate(label_fields):
                item[field] = get_content(
                    labels[pos].xpath('text()').extract())

            impressions = comment_box.xpath(
                './dl[@class="impression"]/dd//span')
            impression_texts = impressions.xpath('text()').extract()
            item['impression'] = '\001'.join(
                get_trunk(text) for text in impression_texts)

        return item
Beispiel #3
0
    def start_requests(self):
        """Yield a form request for every shortlisted platform with a ``plat_id``.

        Objects are fetched with ``DaohangItem.get_object_by_pk``; those with
        a falsy ``plat_id`` produce no request.  For the rest, the object is
        recorded in ``self.mapping`` keyed by ``plat_id`` and a
        ``scrapy.FormRequest`` to ``self.start_url`` is yielded.
        """
        for pk in self.shortlist:
            platform = DaohangItem.get_object_by_pk(pk)
            wdzj_id = platform.plat_id
            if not wdzj_id:
                continue

            self.mapping[wdzj_id] = platform
            # One fresh dict per request, used both as form data and as meta.
            payload = {'wdzjPlatId': str(wdzj_id)}
            yield scrapy.FormRequest(
                self.start_url, formdata=payload, meta=payload, dont_filter=True)
Beispiel #4
0
    def parse(self, response):
        """Turn a fetched page into a ``TedianItem``.

        Resolves the page's key through ``self.mapping`` and
        ``self.get_pin_from_url``, stores the corresponding ``DaohangItem`` on
        ``self.object`` and copies its ``name`` into the item.  Tag fields are
        filled when ``div.rTags`` is found, score/impression fields when the
        comment box is found.

        Returns the item.
        """
        def own_text(node):
            # Cleaned direct text content of a single selector.
            return get_content(node.xpath('text()').extract())

        page_url = response.url
        pk = self.mapping.get(self.get_pin_from_url(page_url))
        self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % (pk, page_url))
        self.object = DaohangItem.get_object_by_pk(pk)

        item = TedianItem()
        item['name'] = self.object.name

        tag_block = response.xpath('//div[@class="rTags"]')
        if tag_block:
            status_text = tag_block.xpath('./span[@class="tag3"]/text()').extract()
            item['status'] = get_content(status_text)
            company_text = tag_block.xpath('./span[@class="tag tag2"]/text()').extract()
            item['company_tag'] = get_content(company_text)

            trunks = []
            for raw in tag_block.xpath('./span[@class = "tag"]').xpath('text()').extract():
                trunks.append(get_trunk(raw))
            item['illustration'] = '/'.join(trunks)

        box = response.xpath('//div[contains(@class,"box commentBox")]')
        if not box:
            return item

        scores = box.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(scores.xpath('./dt/span/text()').extract())

        # Each group is read positionally: four entries per selector list.
        num_spans = scores.xpath('./dd/span[@class="num"]')
        for idx, key in enumerate(['withdraw_num', 'guard_num', 'service_num', 'experience_num']):
            item[key] = own_text(num_spans[idx])

        other_spans = scores.xpath('.//span[not(@class="num")]')
        for idx, key in enumerate(['withdraw_day', 'guard_day', 'service_status', 'experience_status']):
            item[key] = own_text(other_spans[idx])

        impression_spans = box.xpath('./dl[@class="impression"]/dd//span')
        pieces = []
        for raw in impression_spans.xpath('text()').extract():
            pieces.append(get_trunk(raw))
        item['impression'] = '\001'.join(pieces)

        return item
Beispiel #5
0
 def start_requests(self):
     """Yield one request per shortlisted object, built from its ``pin``.

     For every primary key in ``self.shortlist`` the object is loaded with
     ``DaohangItem.get_object_by_pk``, ``self.mapping`` is updated to map the
     object's ``pin`` to its ``id``, and the URL obtained by filling
     ``plat_pin`` into ``self.start_formated_url`` is requested.
     """
     for pk in self.shortlist:
         platform = DaohangItem.get_object_by_pk(pk)
         pin = platform.pin
         self.mapping[pin] = platform.id
         target = self.start_formated_url.format(plat_pin=pin)
         yield self.make_requests_from_url(target)
Beispiel #6
0
    def parse(self, response):
        """Extract a ``DanganItem`` from ``response``.

        The page's key is resolved through ``self.mapping`` and
        ``self.get_pin_from_url``; the matching ``DaohangItem`` is stored on
        ``self.object`` and provides the item's ``name``.  Every page section
        is optional: a section that is not found is skipped and its fields are
        left unset.  ``log_empty_fields`` is called on the item before it is
        returned.

        Inside a section that *is* present, fields are still read by fixed
        position, so a section with fewer entries than expected raises
        ``IndexError``.
        """
        # NOTE: (zacky, 2015.APR.27th) pipeline functions related will be processed, so we keep the object state here.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        def own_text(node):
            # Cleaned direct text content of a single selector.
            return get_content(node.xpath('text()').extract())

        item = DanganItem()
        item['name'] = self.object.name
        item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = own_text(detail[3])
            item['launch_time'] = own_text(detail[4])

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
        if info:
            company_fields = (
                'company_name', 'artificial_person', 'company_type', 'shareholder_stucture',
                'registered_capital', 'contributed_capital', 'registered_address', 'opening_date',
                'approved_date', 'registration_authority', 'business_licence',
                'institutional_framework', 'tax_registration_num',
            )
            for pos, field in enumerate(company_fields):
                item[field] = own_text(info[pos])

        # The domain data is read from the second table row.  Check the row
        # count first so that a page without the table is skipped instead of
        # raising IndexError before the guard below can run.
        rows = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        record = rows[1].xpath('td') if len(rows) > 1 else []
        if record:
            domain_fields = ('domain_name', 'domain_date', 'domain_company_type', 'domain_company_name', 'icp')
            for pos, field in enumerate(domain_fields):
                item[field] = own_text(record[pos])

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([get_trunk(c) for c in person.xpath('p//text()').extract()])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # Two "costBd" boxes are expected: the first is read as fees, the
        # second as contact details.  Either may be missing.
        cost_boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = cost_boxes[0].xpath('p') if len(cost_boxes) > 0 else []
        if cost:
            cost_fields = ('management_fee', 'prepaid_fee', 'cash_withdrawal_fee', 'vip_fee', 'transfer_fee',
                           'mode_of_payment')
            for pos, field in enumerate(cost_fields):
                item[field] = own_text(cost[pos])

        contact = cost_boxes[1].xpath('p') if len(cost_boxes) > 1 else []
        if contact:
            contact_fields = ('contact_address', 'phone_400', 'phone', 'fax', 'email')
            for pos, field in enumerate(contact_fields):
                item[field] = own_text(contact[pos])

        record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            # These fields start at the fourth list entry (index 3).
            record_fields = ('is_automatic_bid', 'is_equitable_assignment', 'trust_fund', 'tender_security',
                             'security_mode', 'guarantee_institution')
            for pos, field in enumerate(record_fields, 3):
                item[field] = get_content(record[pos].xpath('.//text()').extract(), skipFirst=True)
            # Kept as-is for compatibility: evaluates to False (not None) when
            # the list has fewer than 10 entries.
            item['business_type'] = len(record) >= 10 and get_content(record[9].xpath('.//text()').extract(), skipFirst=True)

        log_empty_fields(item, self.logger)
        return item
Beispiel #7
0
 def start_requests(self):
     """Yield one request per shortlisted object, built from its ``pin``.

     Each primary key in ``self.shortlist`` is loaded through
     ``DaohangItem.get_object_by_pk``.  The object's ``pin`` is mapped to its
     ``id`` in ``self.mapping`` and is also substituted for ``pin`` in
     ``self.start_formated_url`` to form the URL that is requested.
     """
     for pk in self.shortlist:
         entry = DaohangItem.get_object_by_pk(pk)
         pin = entry.pin
         self.mapping[pin] = entry.id
         page_url = self.start_formated_url.format(pin=pin)
         yield self.make_requests_from_url(page_url)
Beispiel #8
0
    def parse(self, response):
        """Extract a ``DanganItem`` from ``response``.

        The page's key is resolved through ``self.mapping`` and
        ``self.get_pin_from_url``; the matching ``DaohangItem`` is stored on
        ``self.object`` and provides the item's ``name``.  Every page section
        is optional: a section that is not found is skipped and its fields
        are left unset.  ``log_empty_fields`` is called on the item before it
        is returned.

        Inside a section that *is* present, fields are still read by fixed
        position, so a section with fewer entries than expected raises
        ``IndexError``.
        """
        # NOTE: (zacky, 2015.APR.27th) pipeline functions related will be
        # processed, so we keep the object state here.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
                  response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        def own_text(node):
            # Cleaned direct text content of a single selector.
            return get_content(node.xpath('text()').extract())

        item = DanganItem()
        item['name'] = self.object.name
        item['logo_url'] = get_content(
            response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = own_text(detail[3])
            item['launch_time'] = own_text(detail[4])

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join(
                [get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath(
            '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
        )
        if info:
            company_fields = (
                'company_name',
                'artificial_person',
                'company_type',
                'shareholder_stucture',
                'registered_capital',
                'contributed_capital',
                'registered_address',
                'opening_date',
                'approved_date',
                'registration_authority',
                'business_licence',
                'institutional_framework',
                'tax_registration_num',
            )
            for pos, field in enumerate(company_fields):
                item[field] = own_text(info[pos])

        # The domain data is read from the second table row.  Check the row
        # count first so that a page without the table is skipped instead of
        # raising IndexError before the guard below can run.
        rows = response.xpath(
            '//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        record = rows[1].xpath('td') if len(rows) > 1 else []
        if record:
            domain_fields = (
                'domain_name',
                'domain_date',
                'domain_company_type',
                'domain_company_name',
                'icp',
            )
            for pos, field in enumerate(domain_fields):
                item[field] = own_text(record[pos])

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(
                    person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([
                    get_trunk(c)
                    for c in person.xpath('p//text()').extract()
                ])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # Two "costBd" boxes are expected: the first is read as fees, the
        # second as contact details.  Either may be missing.
        cost_boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = cost_boxes[0].xpath('p') if len(cost_boxes) > 0 else []
        if cost:
            cost_fields = (
                'management_fee',
                'prepaid_fee',
                'cash_withdrawal_fee',
                'vip_fee',
                'transfer_fee',
                'mode_of_payment',
            )
            for pos, field in enumerate(cost_fields):
                item[field] = own_text(cost[pos])

        contact = cost_boxes[1].xpath('p') if len(cost_boxes) > 1 else []
        if contact:
            contact_fields = (
                'contact_address',
                'phone_400',
                'phone',
                'fax',
                'email',
            )
            for pos, field in enumerate(contact_fields):
                item[field] = own_text(contact[pos])

        record = response.xpath(
            '//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            # These fields start at the fourth list entry (index 3).
            record_fields = (
                'is_automatic_bid',
                'is_equitable_assignment',
                'trust_fund',
                'tender_security',
                'security_mode',
                'guarantee_institution',
            )
            for pos, field in enumerate(record_fields, 3):
                item[field] = get_content(
                    record[pos].xpath('.//text()').extract(), skipFirst=True)
            # Kept as-is for compatibility: evaluates to False (not None)
            # when the list has fewer than 10 entries.
            item['business_type'] = len(record) >= 10 and get_content(
                record[9].xpath('.//text()').extract(), skipFirst=True)

        log_empty_fields(item, self.logger)
        return item