def start_requests(self):
    """Yield one form POST to ``self.start_url`` per shortlisted platform.

    Every primary key in ``self.shortlist`` is resolved through
    ``DaohangItem.get_object_by_pk``. Platforms whose ``plat_id`` is falsy
    are skipped. For the others the platform object is recorded in
    ``self.mapping`` under its ``plat_id`` and a ``scrapy.FormRequest`` is
    yielded with ``dont_filter=True``.

    The same dict is passed as both ``formdata`` and ``meta``; it carries
    the fixed ``type``/``target1``/``target2`` fields plus ``wdzjPlatId``.
    """
    # One dict is reused and updated in place for every request.
    form = {'type': '1', 'target1': self.target1, 'target2': self.target2}
    for pk in self.shortlist:
        platform = DaohangItem.get_object_by_pk(pk)
        plat_id = platform.plat_id
        if not plat_id:
            continue
        # NOTE(review): the stored value is the platform object itself, not
        # its id -- confirm this is what the matching parse() expects.
        self.mapping[plat_id] = platform
        form['wdzjPlatId'] = str(plat_id)
        yield scrapy.FormRequest(self.start_url,
                                 formdata=form,
                                 meta=form,
                                 dont_filter=True)
def parse(self, response):
    """Build a ``TedianItem`` from a platform feature page.

    The pin extracted from ``response.url`` is looked up in
    ``self.mapping``; the result is logged and passed to
    ``DaohangItem.get_object_by_pk``, whose return value is kept on
    ``self.object`` and supplies the item's ``name``.

    Two optional page sections are read:

    * ``div.rTags``: status, company tag and the '/'-joined tag list.
    * ``div`` whose class contains ``box commentBox``: recommendation,
      four numeric scores, four score captions and the impression tags
      joined with ``'\\001'``.

    Score and caption spans are paired with their field names
    positionally. If the page has fewer than four of either, only the
    fields that have a span are filled instead of raising ``IndexError``.

    Returns:
        The populated ``TedianItem``.
    """
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
              response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])

    item = TedianItem()
    item['name'] = self.object.name

    rtag = response.xpath('//div[@class="rTags"]')
    if rtag:
        item['status'] = get_content(
            rtag.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(
            rtag.xpath('./span[@class="tag tag2"]/text()').extract())
        tag_info = rtag.xpath('./span[@class = "tag"]')
        item['illustration'] = '/'.join(
            [get_trunk(info) for info in tag_info.xpath('text()').extract()])

    comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
    if comment_info:
        comment_scores = comment_info.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(
            comment_scores.xpath('./dt/span/text()').extract())

        # zip() stops at the shorter sequence, so a page that exposes fewer
        # spans than expected fills what it can rather than crashing.
        score = comment_scores.xpath('./dd/span[@class="num"]')
        num_fields = ('withdraw_num', 'guard_num',
                      'service_num', 'experience_num')
        for field, node in zip(num_fields, score):
            item[field] = get_content(node.xpath('text()').extract())

        score_info = comment_scores.xpath('.//span[not(@class="num")]')
        caption_fields = ('withdraw_day', 'guard_day',
                          'service_status', 'experience_status')
        for field, node in zip(caption_fields, score_info):
            item[field] = get_content(node.xpath('text()').extract())

        impress_info = comment_info.xpath(
            './dl[@class="impression"]/dd//span')
        item['impression'] = '\001'.join(
            [get_trunk(impress)
             for impress in impress_info.xpath('text()').extract()])

    return item
def start_requests(self):
    """Yield one form POST to ``self.start_url`` per shortlisted platform.

    Each primary key in ``self.shortlist`` is resolved with
    ``DaohangItem.get_object_by_pk``. A platform with a falsy ``plat_id``
    produces no request. Otherwise the platform object is stored in
    ``self.mapping`` keyed by ``plat_id`` and a ``scrapy.FormRequest``
    carrying ``wdzjPlatId`` is yielded; the same dict is used for
    ``formdata`` and ``meta``, and duplicate filtering is disabled.
    """
    for pk in self.shortlist:
        platform = DaohangItem.get_object_by_pk(pk)
        plat_id = platform.plat_id
        if not plat_id:
            continue
        # NOTE(review): the stored value is the platform object itself, not
        # its id -- confirm this is what the matching parse() expects.
        self.mapping[plat_id] = platform
        payload = {'wdzjPlatId': str(plat_id)}
        yield scrapy.FormRequest(self.start_url,
                                 formdata=payload,
                                 meta=payload,
                                 dont_filter=True)
def parse(self, response):
    """Build a ``TedianItem`` from a platform feature page.

    ``self.mapping`` is queried with the pin taken from ``response.url``;
    the value found is logged together with the URL and handed to
    ``DaohangItem.get_object_by_pk``. The resulting object is stored on
    ``self.object`` and provides the item's ``name``.

    The tag strip (``div.rTags``) and the comment box (``div`` whose class
    contains ``box commentBox``) are both optional; fields belonging to a
    missing section are simply left unset.

    Inside the comment box the numeric score spans and the caption spans
    are matched to field names by position. Missing trailing spans leave
    their fields unset rather than raising ``IndexError``.

    Returns:
        The populated ``TedianItem``.
    """
    def text_of(node):
        # First text child of ``node``, as normalised by get_content().
        return get_content(node.xpath('text()').extract())

    symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
              response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])

    item = TedianItem()
    item['name'] = self.object.name

    rtag = response.xpath('//div[@class="rTags"]')
    if rtag:
        item['status'] = get_content(
            rtag.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(
            rtag.xpath('./span[@class="tag tag2"]/text()').extract())
        tag_texts = rtag.xpath('./span[@class = "tag"]').xpath('text()')
        item['illustration'] = '/'.join(
            [get_trunk(text) for text in tag_texts.extract()])

    comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
    if comment_info:
        comment_scores = comment_info.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(
            comment_scores.xpath('./dt/span/text()').extract())

        # (span selector, field names in page order). Pairing with zip()
        # tolerates pages that render fewer spans than there are fields.
        positional = (
            (comment_scores.xpath('./dd/span[@class="num"]'),
             ('withdraw_num', 'guard_num',
              'service_num', 'experience_num')),
            (comment_scores.xpath('.//span[not(@class="num")]'),
             ('withdraw_day', 'guard_day',
              'service_status', 'experience_status')),
        )
        for nodes, fields in positional:
            for node, field in zip(nodes, fields):
                item[field] = text_of(node)

        impress_info = comment_info.xpath(
            './dl[@class="impression"]/dd//span')
        item['impression'] = '\001'.join(
            [get_trunk(impress)
             for impress in impress_info.xpath('text()').extract()])

    return item
def parse(self, response):
    """Collect ``DaohangItem`` entries from a listing response.

    The response URL selects one of three input shapes:

    * URL ends with ``html``: an HTML page. Each ``rnav`` block under
      ``#platList`` names a province and lists its platforms.
    * URL ends with ``json``: a JSON list of flat platform records
      (``platPin``, ``allPlatPin``, ``platName``, ``platUrl``).
    * Anything else: a JSON list of per-city groups, each with a ``city``
      name and a ``platList`` of platform records. Groups without a
      ``city`` are skipped.

    In the HTML case the pin is the last non-empty path segment of the
    link's ``purl`` attribute. An empty or slash-only ``purl`` yields a pin
    of ``None`` instead of raising ``IndexError``. In the grouped JSON case
    a missing or null ``platList`` is treated as an empty list.

    Returns:
        A list of ``DaohangItem`` objects (possibly empty).
    """
    item_list = []

    if response.url.endswith('html'):
        # For Regular Platform.
        content = response.xpath(
            '//div[@id="platList"]/div[starts-with(@class, "rnav")]')
        for sel_ct in content:
            province_name = get_content(
                sel_ct.xpath(
                    'div[@class="til"]/div/p[not(@class="til_num")]/text()'
                ).extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                daohang = DaohangItem()
                # Drop trailing slashes, then keep the last path segment.
                # ``or ''`` covers a get_content() result that is not a
                # usable string.
                purl = get_content(sel_pt.xpath('a/@purl').extract()) or ''
                pin = purl.rstrip('/').rsplit('/', 1)[-1]
                daohang['pin'] = pin or None
                daohang['name'] = get_content(
                    sel_pt.xpath('a/text()').extract())
                daohang['link'] = get_content(
                    sel_pt.xpath('a/@href').extract())
                daohang['province_id'] = province_id
                item_list.append(daohang)

        # For Problematic Platform.
        # Disabled Here Temporarily.
        #content = response.xpath('//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
        #for sel_ct in content:
        #    province_name = get_content(sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()').extract())
        #    province_id = ProvinceItem.get_id_by_name(province_name)
        #    plat_list = sel_ct.xpath('ul[@class="til_cn"]/li')
        #    for sel_pt in plat_list:
        #        daohang = DaohangItem()
        #        purl = get_content(sel_pt.xpath('a/@purl').extract()).split('/')
        #        while not purl[-1]: purl.pop()
        #        daohang['pin'] = purl.pop()
        #        daohang['name'] = get_content(sel_pt.xpath('a/text()').extract())
        #        # Invalid Link For Problematic Platform.
        #        #daohang['link'] = get_content(sel_pt.xpath('a/@href').extract())
        #        daohang['province_id'] = province_id
        #        item_list.append(daohang)
    else:
        content = json.loads(response.body_as_unicode())
        if response.url.endswith('json'):
            for ct in content:
                daohang = DaohangItem()
                daohang['pin'] = ct.get('platPin', None)
                daohang['allPin'] = ct.get('allPlatPin', None)
                daohang['name'] = ct.get('platName', None)
                daohang['link'] = ct.get('platUrl', None)
                item_list.append(daohang)
        else:
            for ct in content:
                if not ct.get('city'):
                    continue
                province_id = ProvinceItem.get_id_by_name(ct.get('city'))
                # A group may omit 'platList' or carry null; iterate nothing
                # in that case instead of failing on None.
                for pt in ct.get('platList') or []:
                    daohang = DaohangItem()
                    daohang['pin'] = pt.get('platLetter', None)
                    daohang['name'] = pt.get('platName', None)
                    daohang['link'] = pt.get('platUrl', None)
                    daohang['province_id'] = province_id
                    daohang['launch_time'] = pt.get('onlineDateStr', None)
                    daohang['icon_url'] = pt.get('platIconUrl', None)
                    item_list.append(daohang)

    return item_list
def start_requests(self):
    """Yield one request per shortlisted platform, addressed by its pin.

    For every primary key in ``self.shortlist`` the platform is loaded with
    ``DaohangItem.get_object_by_pk``, its id is recorded in
    ``self.mapping`` under its pin, and a request for
    ``self.start_formated_url`` (formatted with ``plat_pin``) is produced
    through ``self.make_requests_from_url``.
    """
    for pk in self.shortlist:
        platform = DaohangItem.get_object_by_pk(pk)
        pin = platform.pin
        # parse() can later translate the pin in the URL back to the id.
        self.mapping[pin] = platform.id
        target = self.start_formated_url.format(plat_pin=pin)
        yield self.make_requests_from_url(target)
def parse(self, response):
    """Build a ``DanganItem`` (platform archive record) from a detail page.

    The pin extracted from ``response.url`` is looked up in
    ``self.mapping``; the result is logged and passed to
    ``DaohangItem.get_object_by_pk``. The loaded object is kept on
    ``self.object`` and supplies the item's ``name``.

    The page is read section by section (detail box, about text, company
    information, domain record table, staff list, fees, contact, record
    list). Every section is optional. Within a section, values are taken
    from fixed positions; a position the page does not provide leaves that
    field unset instead of raising ``IndexError``.

    ``log_empty_fields`` is called on the finished item before returning.

    Returns:
        The populated ``DanganItem``.
    """
    #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])

    item = DanganItem()

    def fill(nodes, fields, path='text()', **kwargs):
        # Store nodes[index] content in item[field] for every
        # (index, field) pair whose index exists in ``nodes``.
        for index, field in fields:
            if index < len(nodes):
                item[field] = get_content(
                    nodes[index].xpath(path).extract(), **kwargs)

    def children(container_xpath, position, child_path):
        # Children of the ``position``-th match of ``container_xpath``,
        # or an empty list when there are not that many matches.
        containers = response.xpath(container_xpath)
        if position < len(containers):
            return containers[position].xpath(child_path)
        return []

    item['name'] = self.object.name
    item['logo_url'] = get_content(
        response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    fill(detail, [(1, 'link')], path='a/@href')
    fill(detail, [(3, 'location'), (4, 'launch_time')])

    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join(
            [get_trunk(c) for c in about.xpath('.//text()').extract()])

    info = response.xpath(
        '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
    fill(info, [
        (0, 'company_name'),
        (1, 'artificial_person'),
        (2, 'company_type'),
        (3, 'shareholder_stucture'),
        (4, 'registered_capital'),
        (5, 'contributed_capital'),
        (6, 'registered_address'),
        (7, 'opening_date'),
        (8, 'approved_date'),
        (9, 'registration_authority'),
        (10, 'business_licence'),
        (11, 'institutional_framework'),
        (12, 'tax_registration_num'),
    ])

    # Cells of the second table row of the domain record table.
    record = children(
        '//div[contains(@class, "webRecordBd")]/table/tbody/tr', 1, 'td')
    fill(record, [
        (0, 'domain_name'),
        (1, 'domain_date'),
        (2, 'domain_company_type'),
        (3, 'domain_company_name'),
        (4, 'icp'),
    ])

    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for person in people:
            avatar_url.extend(
                person.xpath('div[@class="avatar"]/img/@src').extract())
            content.extend(
                [get_trunk(c) for c in person.xpath('p//text()').extract()])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)

    # The first "costBd" box holds fees, the second holds contact details.
    cost = children('//div[contains(@class, "costBd")]', 0, 'p')
    fill(cost, [
        (0, 'management_fee'),
        (1, 'prepaid_fee'),
        (2, 'cash_withdrawal_fee'),
        (3, 'vip_fee'),
        (4, 'transfer_fee'),
        (5, 'mode_of_payment'),
    ])

    contact = children('//div[contains(@class, "costBd")]', 1, 'p')
    fill(contact, [
        (0, 'contact_address'),
        (1, 'phone_400'),
        (2, 'phone'),
        (3, 'fax'),
        (4, 'email'),
    ])

    record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        fill(record, [
            (3, 'is_automatic_bid'),
            (4, 'is_equitable_assignment'),
            (5, 'trust_fund'),
            (6, 'tender_security'),
            (7, 'security_mode'),
            (8, 'guarantee_institution'),
        ], path='.//text()', skipFirst=True)
        # The tenth entry is optional; the field is stored as False when it
        # is absent, exactly as before.
        if len(record) >= 10:
            item['business_type'] = get_content(
                record[9].xpath('.//text()').extract(), skipFirst=True)
        else:
            item['business_type'] = False

    log_empty_fields(item, self.logger)
    return item
def start_requests(self):
    """Yield one request per shortlisted platform, addressed by its pin.

    Each primary key in ``self.shortlist`` is resolved through
    ``DaohangItem.get_object_by_pk``. The platform id is stored in
    ``self.mapping`` under the platform pin, and the URL obtained by
    formatting ``self.start_formated_url`` with ``pin`` is turned into a
    request by ``self.make_requests_from_url``.
    """
    for pk in self.shortlist:
        platform = DaohangItem.get_object_by_pk(pk)
        pin = platform.pin
        # parse() can later translate the pin in the URL back to the id.
        self.mapping[pin] = platform.id
        target = self.start_formated_url.format(pin=pin)
        yield self.make_requests_from_url(target)
def parse(self, response):
    """Build a ``DanganItem`` (platform archive record) from a detail page.

    ``self.mapping`` is queried with the pin taken from ``response.url``;
    the value found is logged with the URL and given to
    ``DaohangItem.get_object_by_pk``. The loaded object is stored on
    ``self.object`` and provides the item's ``name``.

    Sections read, all optional: detail box, about text, company
    information, domain record table (second row), staff list, fee box,
    contact box and record list. Fields are matched to page nodes by
    position; where the page has fewer nodes than expected the remaining
    fields are left unset instead of raising ``IndexError``.

    ``log_empty_fields`` is called on the finished item before returning.

    Returns:
        The populated ``DanganItem``.
    """
    #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
              response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])

    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(
        response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

    def assign(nodes, names, path='text()', **kwargs):
        # Pair nodes with field names in order; zip() stops at the shorter
        # one, so absent trailing nodes leave their fields unset.
        for node, name in zip(nodes, names):
            item[name] = get_content(node.xpath(path).extract(), **kwargs)

    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    assign(detail[1:2], ('link',), path='a/@href')
    assign(detail[3:5], ('location', 'launch_time'))

    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join(
            [get_trunk(c) for c in about.xpath('.//text()').extract()])

    info = response.xpath(
        '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
    )
    assign(info, (
        'company_name',
        'artificial_person',
        'company_type',
        'shareholder_stucture',
        'registered_capital',
        'contributed_capital',
        'registered_address',
        'opening_date',
        'approved_date',
        'registration_authority',
        'business_licence',
        'institutional_framework',
        'tax_registration_num',
    ))

    # Only the second row of the domain record table is used; check that
    # it exists before indexing.
    rows = response.xpath(
        '//div[contains(@class, "webRecordBd")]/table/tbody/tr')
    if len(rows) > 1:
        assign(rows[1].xpath('td'), (
            'domain_name',
            'domain_date',
            'domain_company_type',
            'domain_company_name',
            'icp',
        ))

    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for person in people:
            avatar_url.extend(
                person.xpath('div[@class="avatar"]/img/@src').extract())
            content.extend(
                [get_trunk(c) for c in person.xpath('p//text()').extract()])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)

    # The first "costBd" box holds fees, the second holds contact details.
    cost_boxes = response.xpath('//div[contains(@class, "costBd")]')
    if len(cost_boxes) > 0:
        assign(cost_boxes[0].xpath('p'), (
            'management_fee',
            'prepaid_fee',
            'cash_withdrawal_fee',
            'vip_fee',
            'transfer_fee',
            'mode_of_payment',
        ))
    if len(cost_boxes) > 1:
        assign(cost_boxes[1].xpath('p'), (
            'contact_address',
            'phone_400',
            'phone',
            'fax',
            'email',
        ))

    record = response.xpath(
        '//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        assign(record[3:9], (
            'is_automatic_bid',
            'is_equitable_assignment',
            'trust_fund',
            'tender_security',
            'security_mode',
            'guarantee_institution',
        ), path='.//text()', skipFirst=True)
        # The tenth entry is optional; the field is stored as False when it
        # is absent, exactly as before.
        if len(record) >= 10:
            item['business_type'] = get_content(
                record[9].xpath('.//text()').extract(), skipFirst=True)
        else:
            item['business_type'] = False

    log_empty_fields(item, self.logger)
    return item