Example #1
0
    def parse(self, response):
        """Build a PingjiItem from the rating page held in `response`.

        Reads the platform details, the overall exponent and the nine
        per-category scores through XPath queries.

        Returns None when the score paragraphs (the "expBox" section) are
        not found; otherwise returns the item after handing it to
        log_empty_fields().
        """
        stamp = self.timestamp
        self.logger.info('Parsing %s Wangjia Rating From Archive <%s>.' % (stamp, response.url))

        def text_of(node):
            # Direct text children of `node`, reduced by the get_content helper.
            return get_content(node.xpath('text()').extract())

        def score_of(node):
            # Text of the <span class="num"> child, reduced by get_content.
            return get_content(node.xpath('span[@class="num"]/text()').extract())

        item = PingjiItem()
        item['timestamp'] = stamp

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            # (item field, paragraph position) in the detail box.
            for field, pos in (('name', 0), ('launch_time', 4), ('location', 3)):
                item[field] = text_of(detail[pos])

        record = response.xpath('//div[@class="recordHead"]/div[@class="con"]/p')
        if record:
            item['exponent'] = score_of(record)

        exp = response.xpath('//div[contains(@class, "expBox")]/div[@class="bd"]/div[@class="detail"]/p')
        if not exp:
            return None

        # (item field, paragraph position); the last three are not in page order.
        score_layout = (
            ('deal', 0),
            ('popularity', 1),
            ('profit', 2),
            ('revenue', 3),
            ('lever', 4),
            ('brand', 5),
            ('dispersity', 7),
            ('mobility', 8),
            ('transparency', 6),
        )
        for field, pos in score_layout:
            item[field] = score_of(exp[pos])

        log_empty_fields(item, self.logger)
        return item
Example #2
0
    def parse(self, response):
        """Turn a daily-data JSON response into a list of MeiriItem objects.

        The date range used in log messages is read from the URL when
        self.method is truthy, otherwise from the request body.

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when `data` is empty;
        otherwise a list with one item per entry of `data`.
        """
        source = response.url if self.method else response.request.body
        symbol = (self.plat_id, get_url_param(source, 'from_date'),
                  get_url_param(source, 'to_date'), response.url)
        self.logger.info('Parsing No.%s Plat [%s, %s] Daily Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            internal_content = content.get('data', {})
            usable = int(content.get('result_code', -1)) == 1 and bool(internal_content)
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            usable = False

        if not usable:
            self.logger.warning('Fail To Receive No.%s Plat [%s, %s] Daily Data From <%s>.' % symbol)
            return None

        # Keys copied verbatim from each entry onto the item.
        copied_keys = ('daily_turnover', 'daily_trade_cnt', 'daily_invest_cnt',
                       'thityday_income', 'service_time')

        item_list = []
        for entry in internal_content:
            item = MeiriItem()
            item['plat_id'] = self.plat_id
            item['date'] = entry.get('current_date')
            for key in copied_keys:
                item[key] = entry.get(key)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #3
0
    def parse(self, response):
        """Turn a monthly-data JSON response into a single MeiyueItem.

        The month used for logging and for item['date'] is read from the
        URL when self.method is truthy, otherwise from the request body.

        `data` in the payload may be a list (its first element is used) or
        a mapping (used directly).

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when the selected record is
        empty; otherwise the populated item.
        """
        if self.method:
            symbol = (self.plat_id, get_url_param(response.url, 'month'), response.url)
        else:
            symbol = (self.plat_id, get_url_param(response.request.body, 'month'), response.url)
        self.logger.info('Parsing No.%s Plat %s Monthly Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            data = content.get('data', {})
            # Indexing a mapping with [0] raises KeyError, and keeping the
            # whole list breaks the .get() calls below, so pick by type.
            internal_content = data[0] if isinstance(data, list) else data
            if int(content.get('result_code', -1)) != 1 or not internal_content:
                raise ValueError
        except Exception:
            self.logger.warning('Fail To Receive No.%s Plat %s Monthly Data From <%s>' % symbol)
            return None

        item = MeiyueItem()
        item['plat_id'] = self.plat_id
        item['date'] = symbol[1]
        item['loan_amount_per_capita'] = internal_content.get('loan_amount_per_capita')
        item['avg_loan_per_trade'] = internal_content.get('avg_loan_per_trade')
        item['invest_amount_per_capita'] = internal_content.get('invest_amount_per_capita')
        item['avg_invest_per_trade'] = internal_content.get('avg_invest_per_trade')
        item['max_borrower_ratio'] = internal_content.get('max_borrower_ratio')
        item['topten_borrowers_ratio'] = internal_content.get('topten_borrowers_ratio')
        item['overdue_project_amount'] = internal_content.get('overdue_project_amount')
        item['avg_interest_rate'] = internal_content.get('avg_interest_rate')
        item['avg_borrow_period'] = internal_content.get('avg_borrow_period')

        log_empty_fields(item, self.logger)
        return item
Example #4
0
    def parse(self, response):
        """Turn an overdue-info JSON response into a list of YuqiItem objects.

        Returns None (after a warning) when decoding or validating the body
        raises or when result_code is not 1; otherwise a list with one item
        per entry of `data` (empty when `data` is absent).
        """
        url = response.url
        symbol = (get_url_param(url, 'page_index'), get_url_host(url), url)
        self.logger.info('Parsing No.%s Page %s Overdue Info From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            succeeded = int(content.get('result_code', 0)) == 1
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            succeeded = False

        if not succeeded:
            self.logger.warning('Response Error In No.%s Page %s Overdue Info From <%s>.' % symbol)
            return None

        # Keys copied verbatim from each entry onto the item.
        copied_keys = (
            'user_id', 'username', 'idcard',
            'overdue_count', 'overdue_total', 'overdue_principal',
            'payment_total', 'payment_count', 'payment_period',
            'repay_amount', 'wait_amount',
        )

        item_list = []
        for entry in content.get('data', []):
            item = YuqiItem()
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            for key in copied_keys:
                item[key] = entry.get(key)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #5
0
    def parse(self, response):
        """Turn an invest-info JSON response into a list of ToubiaoItem objects.

        Returns None (after an info-level log line) when decoding or
        validating the body raises or when result_code is not 1; otherwise
        a list with one item per entry of `data` (empty when `data` is
        absent).
        """
        url = response.url
        symbol = (get_url_param(url, 'page_index'), get_url_host(url),
                  get_url_param(url, 'id'), url)
        self.logger.info('Parsing No.%s Page %s Invest Info About %s BidId From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            succeeded = int(content.get('result_code', 0)) == 1
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            succeeded = False

        if not succeeded:
            self.logger.info('Response Error In No.%s Page %s Invest Info About %s BidId From <%s>.' % symbol)
            return None

        # Keys copied verbatim from each entry onto the item.
        copied_keys = ('user_id', 'username', 'amount', 'valid_amount',
                       'add_date', 'status', 'type', 'url')

        item_list = []
        for entry in content.get('data', []):
            item = ToubiaoItem()
            item['invest_id'] = entry.get('invest_id')
            # The payload's `id` is stored as the item's bid_id.
            item['bid_id'] = entry.get('id')
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            for key in copied_keys:
                item[key] = entry.get(key)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #6
0
    def parse(self, response):
        """Turn a basic-data JSON response into a single JibenItem.

        The date used for logging and for item['date'] comes from the URL
        when self.method is truthy; otherwise from the request body, which
        is decoded as JSON when self.is_json is truthy and read with
        get_url_param() when it is not.

        `data` in the payload may be a list (first element used) or any
        other value (used directly).

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when the selected record is
        empty; otherwise the populated item.
        """
        plat_id = self.plat_id
        if self.method:
            date = get_url_param(response.url, 'date')
        elif self.is_json:
            date = json.loads(response.request.body)['date']
        else:
            date = get_url_param(response.request.body, 'date')
        symbol = (plat_id, date, response.url)
        self.logger.info('Parsing No.%s Plat %s Basic Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            data = content.get('data', {})
            internal_content = data[0] if isinstance(data, list) else data
            usable = int(content.get('result_code', -1)) == 1 and bool(internal_content)
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            usable = False

        if not usable:
            self.logger.warning('Fail To Receive No.%s Plat %s Basic Data From <%s>.' % symbol)
            return None

        # Keys copied verbatim from the record onto the item.
        copied_keys = (
            'turnover_amount',
            'unconventional_turnover_amount',
            'trade_amount',
            'borrower_amount',
            'investor_amount',
            'different_borrower_amount',
            'different_investor_amount',
            'loan_balance',
            'avg_full_time',
            'product_overdue_rate',
            'overdue_loan_amount',
            'compensatory_amount',
            'loan_overdue_rate',
        )

        item = JibenItem()
        item['plat_id'] = self.plat_id
        item['date'] = date
        for key in copied_keys:
            item[key] = internal_content.get(key)

        log_empty_fields(item, self.logger)
        return item
Example #7
0
    def parse(self, response):
        """Turn a monthly-data JSON response into a single MeiyueItem.

        The month used for logging and for item['date'] comes from the URL
        when self.method is truthy; otherwise from the request body, which
        is decoded as JSON when self.is_json is truthy and read with
        get_url_param() when it is not.

        `data` in the payload may be a list (first element used) or any
        other value (used directly).

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when the selected record is
        empty; otherwise the populated item.
        """
        plat_id = self.plat_id
        if self.method:
            month = get_url_param(response.url, 'month')
        elif self.is_json:
            month = json.loads(response.request.body)['month']
        else:
            month = get_url_param(response.request.body, 'month')
        symbol = (plat_id, month, response.url)
        self.logger.info('Parsing No.%s Plat %s Monthly Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            data = content.get('data', {})
            internal_content = data[0] if isinstance(data, list) else data
            usable = int(content.get('result_code', -1)) == 1 and bool(internal_content)
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            usable = False

        if not usable:
            self.logger.warning('Fail To Receive No.%s Plat %s Monthly Data From <%s>' % symbol)
            return None

        # Keys copied verbatim from the record onto the item.
        copied_keys = (
            'loan_amount_per_capita',
            'avg_loan_per_trade',
            'invest_amount_per_capita',
            'avg_invest_per_trade',
            'max_borrower_ratio',
            'topten_borrowers_ratio',
            'overdue_project_amount',
            'avg_interest_rate',
            'avg_borrow_period',
        )

        item = MeiyueItem()
        item['plat_id'] = self.plat_id
        item['date'] = month
        for key in copied_keys:
            item[key] = internal_content.get(key)

        log_empty_fields(item, self.logger)
        return item
Example #8
0
    def parse(self, response):
        """Turn a loan-info JSON response into a list of JiekuanItem objects.

        Each item's `status` is taken from the request URL rather than from
        the payload.

        Returns None (after a warning) when decoding or validating the body
        raises or when result_code is not 1; otherwise a list with one item
        per entry of `data` (empty when `data` is absent).
        """
        url = response.url
        symbol = (get_url_param(url, 'page_index'), get_url_host(url), url)
        self.logger.info('Parsing No.%s Page %s Loan Info From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            succeeded = int(content.get('result_code', 0)) == 1
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            succeeded = False

        if not succeeded:
            self.logger.warning('Response Error In No.%s Page %s Loan Info From <%s>.' % symbol)
            return None

        # Keys copied verbatim from each entry onto the item.
        copied_keys = (
            'title', 'amount', 'process', 'interest_rate',
            'borrow_period', 'borrow_unit', 'reward', 'type', 'repay_type',
            'username', 'user_id', 'user_avatar_url',
            'province', 'city', 'borrow_detail', 'url',
            'success_time', 'publish_time', 'invest_count',
        )

        item_list = []
        for entry in content.get('data', []):
            item = JiekuanItem()
            # The payload's `id` is stored as the item's bid_id.
            item['bid_id'] = entry.get('id')
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            item['status'] = get_url_param(response.url, 'status')
            for key in copied_keys:
                item[key] = entry.get(key)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #9
0
    def parse(self, response):
        """Turn a daily-data JSON response into a list of MeiriItem objects.

        The date range used in log messages comes from the URL when
        self.method is truthy; otherwise from the request body, which is
        decoded as JSON when self.is_json is truthy and read with
        get_url_param() when it is not.

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when `data` is empty;
        otherwise a list with one item per entry of `data`.
        """
        plat_id = self.plat_id
        if self.method:
            from_date = get_url_param(response.url, 'from_date')
            to_date = get_url_param(response.url, 'to_date')
        elif self.is_json:
            payload = json.loads(response.request.body)
            from_date = payload['from_date']
            to_date = payload['to_date']
        else:
            from_date = get_url_param(response.request.body, 'from_date')
            to_date = get_url_param(response.request.body, 'to_date')
        symbol = (plat_id, from_date, to_date, response.url)
        self.logger.info('Parsing No.%s Plat [%s, %s] Daily Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            internal_content = content.get('data', {})
            usable = int(content.get('result_code', -1)) == 1 and bool(internal_content)
        except Exception:
            # Any failure while decoding/validating counts as a bad response.
            usable = False

        if not usable:
            self.logger.warning('Fail To Receive No.%s Plat [%s, %s] Daily Data From <%s>.' % symbol)
            return None

        # Keys copied verbatim from each entry onto the item.
        copied_keys = ('daily_turnover', 'daily_trade_cnt', 'daily_invest_cnt',
                       'thityday_income', 'service_time')

        item_list = []
        for entry in internal_content:
            item = MeiriItem()
            item['plat_id'] = self.plat_id
            item['date'] = entry.get('current_date')
            for key in copied_keys:
                item[key] = entry.get(key)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #10
0
    def parse(self, response):
        """Convert a loan-info JSON page into JiekuanItem objects.

        `status` on every item is read from the request URL, not from the
        payload.

        Returns None (after a warning) if the body cannot be decoded and
        validated or result_code differs from 1; otherwise the list of
        items built from `data` (an empty list when `data` is missing).
        """
        page_url = response.url
        symbol = (get_url_param(page_url, 'page_index'), get_url_host(page_url), page_url)
        self.logger.info('Parsing No.%s Page %s Loan Info From <%s>.' % symbol)

        content = None
        try:
            content = json.loads(response.body_as_unicode())
            rejected = int(content.get('result_code', 0)) != 1
        except Exception:
            # Decoding or validation blew up: treat like a rejected response.
            rejected = True

        if rejected:
            self.logger.warning('Response Error In No.%s Page %s Loan Info From <%s>.' % symbol)
            return None

        # Payload keys that map one-to-one onto item fields, in assignment order.
        passthrough = [
            'title', 'amount', 'process', 'interest_rate',
            'borrow_period', 'borrow_unit', 'reward', 'type', 'repay_type',
            'username', 'user_id', 'user_avatar_url',
            'province', 'city', 'borrow_detail', 'url',
            'success_time', 'publish_time', 'invest_count',
        ]

        item_list = []
        for loan in content.get('data', []):
            item = JiekuanItem()
            item['bid_id'] = loan.get('id')
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            item['status'] = get_url_param(response.url, 'status')
            for name in passthrough:
                item[name] = loan.get(name)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #11
0
    def parse(self, response):
        """Convert an overdue-info JSON page into YuqiItem objects.

        Returns None (after a warning) if the body cannot be decoded and
        validated or result_code differs from 1; otherwise the list of
        items built from `data` (an empty list when `data` is missing).
        """
        page_url = response.url
        symbol = (get_url_param(page_url, 'page_index'), get_url_host(page_url), page_url)
        self.logger.info('Parsing No.%s Page %s Overdue Info From <%s>.' % symbol)

        content = None
        try:
            content = json.loads(response.body_as_unicode())
            rejected = int(content.get('result_code', 0)) != 1
        except Exception:
            # Decoding or validation blew up: treat like a rejected response.
            rejected = True

        if rejected:
            self.logger.warning('Response Error In No.%s Page %s Overdue Info From <%s>.' % symbol)
            return None

        # Payload keys that map one-to-one onto item fields, in assignment order.
        passthrough = [
            'user_id', 'username', 'idcard',
            'overdue_count', 'overdue_total', 'overdue_principal',
            'payment_total', 'payment_count', 'payment_period',
            'repay_amount', 'wait_amount',
        ]

        item_list = []
        for debtor in content.get('data', []):
            item = YuqiItem()
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            for name in passthrough:
                item[name] = debtor.get(name)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #12
0
    def parse(self, response):
        """Convert an invest-info JSON page into ToubiaoItem objects.

        Returns None (after an info-level log line) if the body cannot be
        decoded and validated or result_code differs from 1; otherwise the
        list of items built from `data` (an empty list when `data` is
        missing).
        """
        page_url = response.url
        symbol = (get_url_param(page_url, 'page_index'), get_url_host(page_url),
                  get_url_param(page_url, 'id'), page_url)
        self.logger.info('Parsing No.%s Page %s Invest Info About %s BidId From <%s>.' % symbol)

        content = None
        try:
            content = json.loads(response.body_as_unicode())
            rejected = int(content.get('result_code', 0)) != 1
        except Exception:
            # Decoding or validation blew up: treat like a rejected response.
            rejected = True

        if rejected:
            self.logger.info('Response Error In No.%s Page %s Invest Info About %s BidId From <%s>.' % symbol)
            return None

        # Payload keys that map one-to-one onto item fields, in assignment order.
        passthrough = ['user_id', 'username', 'amount', 'valid_amount',
                       'add_date', 'status', 'type', 'url']

        item_list = []
        for bid in content.get('data', []):
            item = ToubiaoItem()
            item['invest_id'] = bid.get('invest_id')
            item['bid_id'] = bid.get('id')
            item['plat_id'] = self.plat_id
            item['plat_name'] = self.plat_name
            for name in passthrough:
                item[name] = bid.get(name)

            log_empty_fields(item, self.logger)
            item_list.append(item)

        return item_list
Example #13
0
    def parse(self, response):
        """Turn a basic-data JSON response into a single JibenItem.

        The date used for logging and for item['date'] is read from the
        URL when self.method is truthy, otherwise from the request body.

        `data` in the payload may be a list (its first element is used) or
        a mapping (used directly).

        Returns None (after a warning) when decoding or validating the body
        raises, when result_code is not 1, or when the selected record is
        empty; otherwise the populated item.
        """
        if self.method:
            symbol = (self.plat_id, get_url_param(response.url, 'date'), response.url)
        else:
            symbol = (self.plat_id, get_url_param(response.request.body, 'date'), response.url)
        self.logger.info('Parsing No.%s Plat %s Basic Data From <%s>.' % symbol)

        try:
            content = json.loads(response.body_as_unicode())
            self.logger.info(content)
            data = content.get('data', {})
            # Unconditional data[0] raises KeyError for a mapping payload,
            # so only index when the payload really is a list.
            internal_content = data[0] if isinstance(data, list) else data
            if int(content.get('result_code', -1)) != 1 or not internal_content:
                raise ValueError
        except Exception:
            self.logger.warning('Fail To Receive No.%s Plat %s Basic Data From <%s>.' % symbol)
            return None

        item = JibenItem()
        item['plat_id'] = self.plat_id
        item['date'] = symbol[1]
        item['turnover_amount'] = internal_content.get('turnover_amount')
        item['unconventional_turnover_amount'] = internal_content.get('unconventional_turnover_amount')
        item['trade_amount'] = internal_content.get('trade_amount')
        item['borrower_amount'] = internal_content.get('borrower_amount')
        item['investor_amount'] = internal_content.get('investor_amount')
        item['different_borrower_amount'] = internal_content.get('different_borrower_amount')
        item['different_investor_amount'] = internal_content.get('different_investor_amount')
        item['loan_balance'] = internal_content.get('loan_balance')
        item['avg_full_time'] = internal_content.get('avg_full_time')
        item['product_overdue_rate'] = internal_content.get('product_overdue_rate')
        item['overdue_loan_amount'] = internal_content.get('overdue_loan_amount')
        item['compensatory_amount'] = internal_content.get('compensatory_amount')
        item['loan_overdue_rate'] = internal_content.get('loan_overdue_rate')

        log_empty_fields(item, self.logger)
        return item
Example #14
0
    def parse(self, response):
        """Build a DanganItem (platform archive) from the page in `response`.

        The platform record is looked up through self.mapping and stored on
        self.object; then each page section (details, introduction, company
        info, domain record, staff, fees, contact, record list) is copied
        into the item when its nodes are present.  A section whose container
        (table row or "costBd" box) is missing is skipped.

        Returns the DanganItem after handing it to log_empty_fields().
        """
        #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        def text_of(node):
            # Direct text children of `node`, reduced by the get_content helper.
            return get_content(node.xpath('text()').extract())

        def tail_of(node):
            # All descendant text of `node`, passed to get_content with skipFirst=True.
            return get_content(node.xpath('.//text()').extract(), skipFirst=True)

        item = DanganItem()
        item['name'] = self.object.name
        item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = text_of(detail[3])
            item['launch_time'] = text_of(detail[4])

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
        if info:
            # Item fields in the order their paragraphs appear (positions 0..12).
            info_fields = (
                'company_name', 'artificial_person', 'company_type',
                'shareholder_stucture', 'registered_capital', 'contributed_capital',
                'registered_address', 'opening_date', 'approved_date',
                'registration_authority', 'business_licence',
                'institutional_framework', 'tax_registration_num',
            )
            for pos, field in enumerate(info_fields):
                item[field] = text_of(info[pos])

        # The second table row holds the values; check that it exists before
        # indexing so a missing table skips the section instead of raising.
        rows = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        record = rows[1].xpath('td') if len(rows) > 1 else []
        if record:
            domain_fields = ('domain_name', 'domain_date', 'domain_company_type',
                             'domain_company_name', 'icp')
            for pos, field in enumerate(domain_fields):
                item[field] = text_of(record[pos])

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([get_trunk(c) for c in person.xpath('p//text()').extract()])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # The first "costBd" box is read as fees, the second as contact data;
        # either may be absent, so test the count before indexing.
        boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = boxes[0].xpath('p') if len(boxes) > 0 else []
        if cost:
            cost_fields = ('management_fee', 'prepaid_fee', 'cash_withdrawal_fee',
                           'vip_fee', 'transfer_fee', 'mode_of_payment')
            for pos, field in enumerate(cost_fields):
                item[field] = text_of(cost[pos])

        contact = boxes[1].xpath('p') if len(boxes) > 1 else []
        if contact:
            contact_fields = ('contact_address', 'phone_400', 'phone', 'fax', 'email')
            for pos, field in enumerate(contact_fields):
                item[field] = text_of(contact[pos])

        record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            # These fields start at list position 3.
            list_fields = ('is_automatic_bid', 'is_equitable_assignment', 'trust_fund',
                           'tender_security', 'security_mode', 'guarantee_institution')
            for pos, field in enumerate(list_fields, 3):
                item[field] = tail_of(record[pos])
            # Evaluates to False (not text) when the list has fewer than ten entries.
            item['business_type'] = len(record) >= 10 and tail_of(record[9])

        log_empty_fields(item, self.logger)
        return item
Example #15
0
    def parse(self, response):
        """Build a DanganItem (platform archive) from the page in `response`.

        Looks the platform record up through self.mapping, keeps it on
        self.object, and copies every page section that is present
        (details, introduction, company info, domain record, staff, fees,
        contact, record list) into the item.  Sections whose container
        (table row or "costBd" box) is missing are skipped.

        Returns the DanganItem after handing it to log_empty_fields().
        """
        #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
                  response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = DanganItem()

        def fill_text(nodes, fields):
            # Copy the direct text of nodes[0], nodes[1], ... into `fields`, in order.
            for pos, field in enumerate(fields):
                item[field] = get_content(nodes[pos].xpath('text()').extract())

        item['name'] = self.object.name
        item['logo_url'] = get_content(
            response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = get_content(detail[3].xpath('text()').extract())
            item['launch_time'] = get_content(
                detail[4].xpath('text()').extract())

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join(
                [get_trunk(c) for c in about.xpath('.//text()').extract()])

        info = response.xpath(
            '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
        )
        if info:
            fill_text(info, (
                'company_name',
                'artificial_person',
                'company_type',
                'shareholder_stucture',
                'registered_capital',
                'contributed_capital',
                'registered_address',
                'opening_date',
                'approved_date',
                'registration_authority',
                'business_licence',
                'institutional_framework',
                'tax_registration_num',
            ))

        # Values live in the second table row; make sure it exists before
        # indexing so a missing table skips the section instead of raising.
        rows = response.xpath(
            '//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        record = rows[1].xpath('td') if len(rows) > 1 else []
        if record:
            fill_text(record, (
                'domain_name',
                'domain_date',
                'domain_company_type',
                'domain_company_name',
                'icp',
            ))

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(
                    person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([
                    get_trunk(c)
                    for c in person.xpath('p//text()').extract()
                ])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # First "costBd" box is read as fees, second as contact data; either
        # may be absent, so test the count before indexing.
        boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = boxes[0].xpath('p') if len(boxes) > 0 else []
        if cost:
            fill_text(cost, (
                'management_fee',
                'prepaid_fee',
                'cash_withdrawal_fee',
                'vip_fee',
                'transfer_fee',
                'mode_of_payment',
            ))

        contact = boxes[1].xpath('p') if len(boxes) > 1 else []
        if contact:
            fill_text(contact, (
                'contact_address',
                'phone_400',
                'phone',
                'fax',
                'email',
            ))

        record = response.xpath(
            '//div[contains(@class, "recordListBox")]/ul/li')
        if record:
            item['is_automatic_bid'] = get_content(
                record[3].xpath('.//text()').extract(), skipFirst=True)
            item['is_equitable_assignment'] = get_content(
                record[4].xpath('.//text()').extract(), skipFirst=True)
            item['trust_fund'] = get_content(
                record[5].xpath('.//text()').extract(), skipFirst=True)
            item['tender_security'] = get_content(
                record[6].xpath('.//text()').extract(), skipFirst=True)
            item['security_mode'] = get_content(
                record[7].xpath('.//text()').extract(), skipFirst=True)
            item['guarantee_institution'] = get_content(
                record[8].xpath('.//text()').extract(), skipFirst=True)
            # Evaluates to False (not text) when the list has fewer than ten entries.
            item['business_type'] = len(record) >= 10 and get_content(
                record[9].xpath('.//text()').extract(), skipFirst=True)

        log_empty_fields(item, self.logger)
        return item