def parse_item(self, response):
    """Submit every not-yet-stored entry of a JSON penalty listing.

    The response body is decoded as JSON and each element of its
    ``resultData`` list is turned into a dict of labelled fields. The
    detail-page URL built from the entry's ``id`` is used as the lookup key:
    the entry is passed to ``sendData`` under the name ``'hainan'`` only when
    ``SpiderData`` holds no row with that URL.

    Args:
        response: Response whose ``body`` is a JSON document containing a
            ``resultData`` list of records.
    """
    # (output label, key in the JSON record), in the order the fields are read.
    field_map = (
        (u'企业(商户)名称', 'companyname'),
        (u'注册地址', 'companysite'),
        (u'法定代表人姓名', 'companyman'),
        (u'法定代表人身份证号', 'companymanid'),
        (u'负责人姓名', 'responsible_man'),
        (u'负责人身份证号', 'resp_man_id'),
        (u'直接责任人', 'direct_person'),
        (u'社会信用代码', 'idcode'),
        (u'案件分类', 'toclassify'),
        (u'案件名称', 'losecase'),
        (u'行政处罚决定文书号', 'punish_writ_num'),
        (u'主要违法事实', 'losedetail'),
        (u'处罚依据和内容', 'punishway'),
        (u'处罚机关', 'punishunit'),
        (u'处罚时间', 'punishtime'),
    )

    payload = json.loads(response.body)
    for record in payload['resultData']:
        detail_url = 'http://aj.hifda.gov.cn/web/showContent.jsp?id=' + record['id']
        data = dict(
            (label, record[source_key]) for label, source_key in field_map
        )
        # Only forward entries whose URL has not been recorded before.
        if SpiderData.objects.filter(url=detail_url).count() == 0:
            sendData('hainan', data, detail_url)
def parse_detail(self, response):
    """Turn the two-column detail table into a dict and submit it.

    For every ``tr`` of the table under ``#main``, the text of the first
    cell becomes a key and the text of the second cell its value. A later
    row with the same key overwrites an earlier one. The resulting dict is
    handed to ``sendData`` under the name ``'shanghai'`` together with the
    page URL.

    Args:
        response: Response for the detail page; must support ``xpath`` and
            expose ``url``.
    """
    rows = response.xpath('//*[@id="main"]/div/div[2]/table//tr')
    # Tuple elements are evaluated left to right: key cell first, then value.
    record = dict(
        (
            row.xpath('td[1]/text()').extract_first(),
            row.xpath('td[2]/text()').extract_first(),
        )
        for row in rows
    )
    sendData('shanghai', record, response.url)
def process_spider_exception(self, response, exception, spider):
    """Report an exception raised while processing a response.

    Called when a spider or a ``process_spider_input()`` method (from other
    spider middleware) raises an exception. The hook should return either
    ``None`` or an iterable of Response, dict or Item objects; this
    implementation only reports the failure and returns ``None``.

    Args:
        response: The response being processed when the exception occurred.
        exception: The exception that was raised.
        spider: The spider involved; its ``name`` labels the report.
    """
    spider_name = spider.name
    report = {
        'error': unicode(exception),
        'url': response.url,
    }
    # NOTE(review): the trailing True is passed positionally as in the
    # original; presumably it marks the record as an error -- confirm
    # against sendData's signature.
    sendData(spider_name, report, response.url, True)
def parse_detail(self, response):
    """Collect header/value pairs from the ``#edit`` table and submit them.

    Every ``tr`` below the element with id ``edit`` is visited except the
    first one. The row's ``th`` text, with ``':'`` and spaces removed,
    becomes the key, and the row's ``td`` text (or ``''`` when absent)
    becomes the value. Rows without any ``th`` text are ignored so that
    ``None`` never ends up as a key. The resulting dict is handed to
    ``sendData`` under the name ``'gansu'`` together with the page URL.

    Args:
        response: Response for the detail page; must support ``xpath`` and
            expose ``url``.
    """
    data = {}
    rows = response.xpath('//*[@id="edit"]//tr')
    for index, tr in enumerate(rows):
        if index == 0:
            # The first row is never recorded (presumably a title/header
            # row -- confirm against the page markup).
            continue

        key = tr.xpath('th/text()').extract_first()
        if not key:
            # No header text: there is nothing meaningful to key the value by.
            continue

        val = tr.xpath('td/text()').extract_first()
        data[key.replace(':', '').replace(' ', '')] = val or ''

    sendData('gansu', data, response.url)
def parse_item(self, response):
    """Submit a sampling-inspection record unless an equal one is stored.

    The first two cells of every ``//table/tr`` row are read as a key/value
    pair; rows where both are empty are ignored. A record is identified by
    three of its fields (sampled unit name, production date/batch number,
    inspected item). It is passed to ``sendData`` under the name ``'sfda'``
    only when ``SpiderData`` has no ``'sfda'`` row whose data contains the
    same three values.

    The page is skipped (nothing is sent) when any identifying field is
    missing or when the duplicate lookup fails; a lookup failure is logged
    with its traceback instead of being swallowed silently.

    Args:
        response: Response for the record page; must support ``xpath`` and
            expose ``url``.
    """
    import logging

    data = {}
    for tr in response.xpath('//table/tr'):
        key = tr.xpath('td[1]/text()').extract_first()
        val = tr.xpath('td[2]/text()').extract_first()
        if key or val:
            data[key] = val

    identity_fields = (u'被抽样单位名称', u'生产日期/批号', u'抽检项目')
    try:
        identity = {field: data[field] for field in identity_fields}
    except KeyError:
        # Without all identifying fields the record cannot be de-duplicated,
        # so it is not submitted.
        return

    try:
        already = SpiderData.objects.filter(
            scrapyname='sfda',
            data__contains=identity,
        ).count()
    except Exception:
        # Best effort: a failed lookup must not abort the crawl, but it
        # should be visible. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception(
            'sfda: duplicate check failed for %s; record skipped',
            response.url,
        )
        return

    if not already:
        sendData('sfda', data, response.url)
def parse_item(self, response):
    """Submit the fields of a case page unless its URL is already stored.

    Every ``tr`` of every ``table.rtab2`` contributes one entry: the ``th``
    text up to the first ``':'`` is the key and the ``td`` text is the
    value (``None`` when the cell has no text). Rows without ``th`` text are
    skipped. The dict is passed to ``sendData`` under the name ``'case'``
    only when ``SpiderData`` holds no row with this page's URL.

    Args:
        response: Response for the case page; must support ``xpath`` and
            expose ``url``.
    """
    data = {}
    for table in response.xpath('//table[@class="rtab2"]'):
        for tr in table.xpath('tr'):
            header = tr.xpath('th/text()').extract_first()
            if header is None:
                # extract_first() yields None when the row has no header
                # text; calling .split on it would raise AttributeError.
                continue
            key = header.split(u':')[0]
            data[key] = tr.xpath('td/text()').extract_first()

    # Only forward pages whose URL has not been recorded before.
    if SpiderData.objects.filter(url=response.url).count() == 0:
        sendData('case', data, response.url)