def parse_forum(self, response):
    """Parse forum post content."""
    item = Item()
    itemloader = ItemLoader(item=item, selector=response)
    for field in self.forum_xpath:
        item.fields[field] = Field()
        if 'main_body' in field:
            # Extract the body, then strip scripts, styles, tags and entities.
            content = re.compile(r'<html><body><.*?>(.*?)</body></html>', re.S | re.M)
            content = content.findall(response.text)
            content = re.sub(r'<script>.*?</script>', '', ''.join(content))
            content = re.sub(r'[\r\n]', '', content)
            content = re.sub(r'<div .*?>.*?</div>', '', content)
            # Flags must be passed via the keyword argument; the fourth
            # positional argument of re.sub is `count`.
            content = re.sub(r'<style .*?>.*?</style>', '', content, flags=re.S | re.M)
            content = re.sub(r'&.*?;', '', content)
            content = re.sub(r'<.*?>', '', content, flags=re.M | re.I)
            content = re.sub(' ', '', content)
            itemloader.add_value(field, content)
        elif 'content_url' in field:
            itemloader.add_value(field, response.url)
        else:
            itemloader.add_xpath(field, self.forum_xpath[field])
    item = self.format_item(itemloader.load_item())
    yield item

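# A hedged alternative to the regex pile in parse_forum(): w3lib (bundled
# with Scrapy) already provides HTML-cleanup helpers. This is only a sketch
# of the same idea, not the project's actual cleanup logic; the helper name
# `clean_main_body` is illustrative.
import re

from w3lib.html import remove_tags, remove_tags_with_content, replace_entities


def clean_main_body(html_text):
    # Drop <script>/<style> blocks together with their contents, then
    # strip the remaining tags and decode/remove HTML entities.
    text = remove_tags_with_content(html_text, which_ones=('script', 'style'))
    text = remove_tags(text)
    text = replace_entities(text)
    # Collapse whitespace, mirroring the regex version above.
    return re.sub(r'\s+', '', text)
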
def parse_shop(self, response):
    """Parse and extract model data.

    :param response:
    """
    item = Item()
    item_loader = ItemLoader(item=item, selector=response)
    for field in self.model_xpath:
        item.fields[field] = Field()
        if 'model_url' in field:
            item_loader.add_value(field, response.url)
        else:
            item_loader.add_xpath(field, self.model_xpath[field])
    item = self.format_item(item_loader.load_item())
    yield item
    # Build the user-review URLs from the brand/model fragments.
    user_url = response.xpath(self.user_url).extract()
    for uid in user_url:
        brand = uid.split('-')[0]
        model = uid.split('-')[1]
        yield Request(
            self.user_review_url.format(brand=brand, model=model),
            self.parse_comment)

def PlayerRow(**fields):
    # Build an Item that declares every known field, then fill in the
    # values that were passed.
    item = Item()
    for field in all_fields:
        item.fields[field] = Field()
    for field, value in fields.items():
        item[field] = value
    return item

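# Hypothetical usage of the PlayerRow factory above (field names are
# illustrative only; `all_fields` must already list them):
#
#   row = PlayerRow(name='J. Smith', team='Arsenal', goals=3)
#   row['name']  # -> 'J. Smith'
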
def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['author_url'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h1/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath(
        'pubtime',
        "//p[@class='clearfix']//span/text() | //div[@class='titleLine-gY7DniPB']//span/text()",
        MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@class='text-3zQ3cZD4']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath(
        'author',
        "//p[@class='clearfix']/a[2]/text() | //div[@class='titleLine-gY7DniPB']/p/a[2]/text()",
        MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author_url', "//div[@class='titleLine-gY7DniPB']/p/a[2]/@href",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"大风号")
    yield l.load_item()

def __init__(self):
    """Initialise the item's values to empty strings."""
    Item.__init__(self)
    self['coal'] = ''
    self['non_coal'] = ''

def parse(self, response):
    item = Item()
    fields = json.loads(self.conf.get("fields"))
    l = ItemLoader(item, response)
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    l.add_value("url", response.url)
    l.add_value("spider_jobid", self.spider_jobid)
    if fields.get("fields", "") == "":
        logging.error(u"Field configuration is empty; nothing to parse!")
        return l.load_item()
    # Dynamically create a Field plus an xpath/value rule for each
    # configured key (approach 1).
    for k in self.keys:
        if fields.get("fields").get(k) is not None:
            item.fields[k] = Field()
            if fields.get("fields").get(k).keys()[0] == "xpath":
                l.add_xpath(
                    k,
                    u"{}".format(fields.get("fields").get(k).get("xpath")),
                    MapCompose(unicode.strip))
            elif fields.get("fields").get(k).keys()[0] == "value":
                if fields.get("fields").get(k).get("value") == u"{TODAY}":
                    # "{TODAY}" is a placeholder for the current timestamp.
                    l.add_value(k, u"{}".format(datetime.now()))
                else:
                    l.add_value(
                        k,
                        u"{}".format(fields.get("fields").get(k).get("value")))
    return l.load_item()

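# A hedged sketch of what the JSON field config consumed by parse() above
# might look like (key names are illustrative; the real config comes from
# self.conf.get("fields")):
#
#   {
#     "fields": {
#       "title":   {"xpath": "//h1/text()"},
#       "pubtime": {"value": "{TODAY}"},
#       "site":    {"value": "example-site"}
#     }
#   }
#
# Each entry carries exactly one rule: either an "xpath" expression that is
# loaded with add_xpath(), or a literal "value"; the special value "{TODAY}"
# is replaced with the current timestamp.
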
def __setitem__(self, name, value):
    processed_tag = code_to_name(name)
    if processed_tag in self._values and isinstance(
            self._values[processed_tag], BaseField):
        self._values[processed_tag].set_value(value)
    else:
        Item.__setitem__(self, name, value)

def save_item(item: Item, spider):
    if isinstance(item, MessageItem):

        def message_is_unique(message_model: Message, limit=20) -> bool:
            # Compare only against the `limit` most recent messages.
            subquery = session.query(Message) \
                .order_by(Message.created_at.desc(), Message.id.desc()) \
                .limit(limit) \
                .subquery()
            alias = aliased(Message, subquery)
            return not session.query(
                session.query(alias).filter(
                    alias.text == message_model.text,
                    alias.image == message_model.image).exists()).scalar()

        def message_fit_the_length(message_model: Message) -> bool:
            # 1024 characters when an image is attached, 4096 otherwise.
            if message_model.image:
                return len(remove_tags(message_model.text)) <= 1024
            return len(remove_tags(message_model.text)) <= 4096

        message = Message(text=item.get('text'),
                          image=item.get('image'),
                          url=item.get('url'))
        if message_is_unique(message) and message_fit_the_length(message):
            with session.begin():
                session.add(message)

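# A hedged sketch of how save_item() above could be hooked into a Scrapy
# item pipeline (class and module names are illustrative, not taken from
# the project):
#
#   class SaveMessagePipeline:
#       def process_item(self, item, spider):
#           save_item(item, spider)
#           return item
#
# and in settings.py:
#
#   ITEM_PIPELINES = {"myproject.pipelines.SaveMessagePipeline": 300}
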
def parse_user(self, response):
    """Parse data from the returned response.

    :param response: response returned by the Scrapy framework
    """
    result = {
        'user_name': response.meta['user_name'],
        'title': response.meta['title'],
        'date': response.meta['date'],
        'main_body': response.meta['main_body'],
        'content_url': response.meta['content_url'],
        'brand': response.meta['brand']
    }
    for content in response.xpath(self.user_list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=content)
        for field in self.user_xpath:
            item.fields[field] = Field()
            if 'user_url' in field:
                item_loader.add_value(field, response.url)
            else:
                item_loader.add_xpath(field, self.user_xpath[field])
        result.update(item_loader.load_item())
        item = self.format_item(result)
        yield item
    # User comments
    user_comment = response.xpath('.//ul/li[@class="Comments"]/a/@href').extract()
    if user_comment:
        yield Request(self.url + user_comment[0], self.parse_comment)

def __init__(self):
    Item.__init__(self)
    # Globally unique ID for the crawled record
    self['crawlerid'] = ''
    # Page URL
    self['url'] = ''
    # Raw HTML source
    self['html_code'] = ''
    # Page encoding
    self['encoding'] = ''
    # Title
    self['title'] = ''
    # Authors
    self['authors'] = []
    # Body text
    self['content'] = ''
    # News publication time
    self['time'] = ''
    # Source
    self['source'] = ''
    # Editor
    self['editor'] = ''
    # Channel category
    self['ctype'] = ''
    # Channel subcategory
    self['subtype'] = ''
    # Keywords
    self['keywords'] = []
    # Abstract
    self['abstract'] = ''
    self['copyright'] = ''
    self['originality'] = ''
    self['type'] = 'text'

def parse(self, response):
    """Parse data from the returned response.

    :param response: response returned by the Scrapy framework
    """
    keyword = response.meta['keyword']
    for complaint in response.xpath(self.list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=complaint)
        for field in self.content_xpath:
            item.fields[field] = Field()
            if 'content_url' in field:
                item_loader.add_value(field, response.url)
            elif 'brand' in field:
                item_loader.add_value(field, keyword)
            else:
                item_loader.add_xpath(field, self.content_xpath[field])
        # User profile links: follow each one, carrying the loaded item in meta.
        user_id = complaint.xpath('.//span[@class="Author"]/a/@href').extract()
        for uid in user_id:
            yield Request(self.url + uid, self.parse_user,
                          meta=dict(item_loader.load_item()))
    # Next page
    next_page = response.xpath('//div[@id="PagerBefore"]/a[last()]/@href').extract()
    if next_page:
        yield Request(self.url + next_page[0],
                      meta={'keyword': keyword}, callback=self.parse)

def parse(self, response):
    """Parse the page and extract comments.

    :param response: response content
    """
    search = response.meta['keyword']
    # Comment listing page
    for complaint in response.xpath(self.list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=complaint)
        for field in self.content_xpath:
            item.fields[field] = Field()
            if 'content_url' in field:
                item_loader.add_value(field, response.url)
            elif 'brand' in field:
                item_loader.add_value(field, search)
            else:
                item_loader.add_xpath(field, self.content_xpath[field])
        # Build the user_url link and follow it with the loaded item in meta.
        uid = complaint.xpath('.//tr/td[@class="small"]/a/@href').extract()
        yield Request(self.urls + uid[0], self.parse_user,
                      meta=dict(item_loader.load_item()), dont_filter=True)
    # Next page of content
    next_page = response.xpath(
        '//div[@class="pagelinks"]/a[last()]/@href').extract()
    for page in next_page:
        yield Request(self.urls + page, self.parse, meta={'keyword': search})

def second_parse(self, response):
    href_list = response.xpath('//ul[@class="wp100"]/li//div/p[@class="fs22"]/a/@href')
    print(len(href_list), response.url)
    for href in href_list:
        item = Item()
        next_url = ('http://www.sinyi.com.cn/'
                    + href.extract().split('/', 1)[1].split('&cookieuid=')[0])
        item.fields['HouseUrl'] = Field()
        item['HouseUrl'] = next_url
        yield scrapy.Request(next_url, callback=self.third_parse, meta={'item': item})

def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    url = url.replace("http://rym.quwenge.com/baidu_tiaozhuan.php?url=", "")
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h1/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('pubtime', "//span[@class='read']/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@id='content']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author', "//div[@class='name']/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"百家号")
    yield l.load_item()

def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h2/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('pubtime', "//p[@class='time']/span[1]/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@class='content']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author', "//p[@class='time']/span[last()]/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"网易号")
    yield l.load_item()

def normalize(item: Item) -> School:
    return School(name=item.get('name'),
                  id='SH-{}'.format(item.get('Dienststellennummer')),
                  address=item.get('Strasse'),
                  zip=item.get("Postleitzahl"),
                  city=item.get("Ort"),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schularten'),
                  fax=item.get('Fax'),
                  phone=item.get('Telefon'),
                  director=item.get('Schulleitung'))

def handle_1(self, response):
    item1 = response.meta.get("item1")
    req = response.meta.get("req")
    # Reassemble the detail URL from the `url += '...'` fragments found in
    # the page source.
    fragments = re.findall(r"url \+= '(.*?)'", response.text, re.S)
    detail_url = ''
    for j in fragments:
        detail_url += j
    item2 = Item()
    item2.fields["NewUrl"] = Field()
    item2["NewUrl"] = detail_url
    item2.update(item1)
    yield scrapy.Request(url=detail_url, callback=self.parse,
                         meta={"req": req, "item2": item2, "last_page": True})

def normalize(item: Item) -> School:
    return School(name=item.get('name'),
                  id='RP-{}'.format(item.get('id')),
                  address=item.get('Adresse'),
                  city=re.split(r'\d{5}', item.get('Ort').strip())[1].strip(),
                  zip=re.findall(r'\d{5}', item.get('Ort'))[0],
                  website=item.get('Internet'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schulform'),
                  fax=item.get('Fax'),
                  phone=item.get('Telefon'))

def start_requests(self):
    for kw in self.kw_list:
        start_page = 1
        for page in range(start_page, 11):
            item = Item()
            item.fields["SearchWord"] = Field()
            item.fields["Page"] = Field()
            item["SearchWord"] = kw
            item["Page"] = page
            start_url = ('https://weixin.sogou.com/weixin?query={}&type=2&page={}&ie=utf8'
                         .format(parse.quote(kw), str(page)))
            yield scrapy.Request(url=start_url, callback=self.parse,
                                 meta={"start_url": start_url, "item": item})

def normalize(item: Item) -> School:
    tel = item.get('telefon')
    return School(name=item.get('name'),
                  phone=tel,
                  fax=item.get('telefax'),
                  website=item.get('homepage'),
                  email=item.get('e-mail'),
                  address=item.get('straße'),
                  city=item.get('ort'),
                  zip=item.get('plz'),
                  school_type=item.get('schultyp'),
                  director=item.get('schulleitung'),
                  id='SL-{}'.format(tel.replace(" ", "-")))

def on_duplicate_sql(*args, item: scrapy.Item):
    if args and isinstance(item, scrapy.Item):
        dup_keys = list()
        for index, key in enumerate(args):
            if index == 0:
                update_str = ' ON DUPLICATE KEY UPDATE {}="{}"'.format(key, item.get(key))
            else:
                update_str = '{}="{}"'.format(key, item.get(key))
            dup_keys.append(update_str)
        return ', '.join(dup_keys)
    else:
        return ''

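# Hedged usage sketch for on_duplicate_sql() above (field and table names
# are illustrative). Appended to an INSERT statement it yields a MySQL
# "ON DUPLICATE KEY UPDATE" clause built from the item's current values:
#
#   clause = on_duplicate_sql('title', 'content', item=item)
#   # -> ' ON DUPLICATE KEY UPDATE title="...", content="..."'
#   sql = 'INSERT INTO articles (url, title, content) VALUES (%s, %s, %s)' + clause
#
# Note that the clause interpolates values as plain strings, so they bypass
# the database driver's parameter escaping.
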
def normalize(self, item: Item) -> School:
    return School(name=item.get('name'),
                  id='RP-{}'.format(item.get('id')),
                  address=item.get('Adresse'),
                  city=item.get('Ort'),
                  website=item.get('Internet'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schulform'),
                  fax=item.get('Fax'),
                  phone=item.get('Telefon'))

def normalize(item: Item) -> School:
    return School(name=item.get('name'),
                  phone=item.get('telefon'),
                  fax=item.get('fax'),
                  website=item.get('homepage'),
                  address=item.get('straße'),
                  city=item.get('ort'),
                  zip=item.get('plz'),
                  school_type=item.get('schultyp'),
                  id='HE-{}'.format(item.get('id')))

def start_requests(self):
    for url in self.start_urls:
        # Create a fresh Item per request so the entries passed through meta
        # do not overwrite one another.
        item = Item()
        url_slit = url.split(":")
        site_name = url_slit[0]
        url = ":".join(url_slit[1:-1])
        item.fields["site_name"] = Field()
        item['site_name'] = site_name
        item.fields["source_url"] = Field()
        item['source_url'] = url_slit[-1]
        yield scrapy.Request(url=url, meta={"item": item},
                             dont_filter=True, callback=self.parse_url)

def normalize(item: Item) -> School:
    # "Ort" looks like "12345 Town Name"; split it into zip and city.
    city_parts = item.get('Ort').split()
    zip_code, city = city_parts[0], ' '.join(city_parts[1:])
    return School(name=item.get('Schulname'),
                  id='TH-{}'.format(item.get('Schulnummer')),
                  address=item.get('Straße'),
                  zip=zip_code,
                  city=city,
                  website=item.get('Internet'),
                  email=ThueringenSpider._deobfuscate_email(item.get('E-Mail')),
                  school_type=item.get('Schulart'),
                  provider=item.get('Schulträger'),
                  fax=item.get('Telefax'),
                  phone=item.get('Telefon'))

def parse(self, response):
    req = response.meta.get("req")
    item = response.meta.get("item")
    url_list_handle = Selector(text=response.text)
    url_list = url_list_handle.xpath("//div[@class='txt-box']/h3/a/@href").extract()
    for index, url in enumerate(url_list):
        item1 = Item()
        item1.fields["Located"] = Field()
        item1["Located"] = index
        url = self.base_url + url
        url = self.get_real_url_handle(url)
        item1.update(item)
        yield scrapy.Request(url=url, callback=self.handle_1,
                             meta={"req": req, "item1": item1})

def __init__(self, *args, **kwargs):
    Item.__init__(self, *args, **kwargs)
    self['platform'] = kwargs.get("keyword")
    self['keyword'] = kwargs.get("keyword")
    self['crawl_time'] = int(time.time())
    self['url'] = kwargs.get("url")
    self['real_url'] = kwargs.get("real_url")
    self['title'] = kwargs.get("title")
    self['source_url'] = kwargs.get("title")
    self['spider'] = kwargs.get("spider")
    self['skip_url'] = kwargs.get("skip_url")
    self['snapshot_url'] = kwargs.get("snapshot_url")
    self['show_url'] = kwargs.get("show_url")
    self['is_ad'] = kwargs.get("is_ad")
    self['content'] = kwargs.get("content")

def normalize(item: Item) -> School:
    city_parts = item.get('adresse_ort').split()
    zip_code, city = city_parts[0], city_parts[1:]
    return School(name=item.get('schulname'),
                  id='HH-{}'.format(item.get('schul_id')),
                  address=item.get('adresse_strasse_hausnr'),
                  address2='',
                  zip=zip_code,
                  city=' '.join(city),
                  website=item.get('schul_homepage'),
                  email=item.get('schul_email'),
                  school_type=item.get('schulform'),
                  fax=item.get('fax'),
                  phone=item.get('schul_telefonnr'),
                  director=item.get('name_schulleiter'))

def normalize(item: Item) -> School:
    zip_code, *city_parts = item.get('city').split()
    return School(name=item.get('name'),
                  phone=item.get('phone'),
                  fax=item.get('fax'),
                  website=item.get('web'),
                  address=item.get('street'),
                  city=' '.join(city_parts),
                  zip=zip_code,
                  school_type=item.get('school_type'),
                  legal_status=item.get('type'),
                  id='BY-{}'.format(item.get('number')))

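# Hedged example of the zip/city unpacking used in normalize() above
# (the input value is purely illustrative):
#
#   zip_code, *city_parts = '12345 Muster Stadt'.split()
#   zip_code              # -> '12345'
#   ' '.join(city_parts)  # -> 'Muster Stadt'
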
def normalize(item: Item) -> School:
    return School(name=item.get('Name'),
                  id='SA-{}'.format(item.get('ID')),
                  address=re.split(r'\d{5}', item.get('Adresse').strip())[0].strip(),
                  zip=re.findall(r'\d{5}', item.get('Adresse').strip())[0],
                  city=re.split(r'\d{5}', item.get('Adresse').strip())[1].strip(),
                  # address=item.get('Adresse'),
                  website=item.get('Homepage'),
                  email=item.get('E-Mail'),
                  fax=item.get('Telefax'),
                  phone=item.get('Telefon'),
                  )

def test_add_stats_item_scraped_count_by_item_type(spider):
    for _ in range(15):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item={"_type": "regular_dict"},
            response="",
            spider=spider,
        )
    for _ in range(20):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=Item(),
            response="",
            spider=spider,
        )
    for _ in range(25):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=TestItem(),
            response="",
            spider=spider,
        )
    stats = spider.crawler.stats.get_stats()
    assert stats.get("spidermon_item_scraped_count") == 60
    assert stats.get("spidermon_item_scraped_count/dict") == 15
    assert stats.get("spidermon_item_scraped_count/Item") == 20
    assert stats.get("spidermon_item_scraped_count/TestItem") == 25

def __getitem__(self, name):
    """Retrieve a field's data.

    Parameters
    ----------
    name : str
        Name or code of the field to be retrieved.

    Returns
    -------
    Whatever the field contained.

    Notes
    -----
    The data returned by this method may be of any type, depending on
    what the field contained.

    Examples
    --------
    >>> record = Record(title="I'm a title")
    >>> record['245 a']
    "I'm a title"
    >>> record['title']
    "I'm a title"
    >>> record['issn']
    None
    """
    processed_tag = code_to_name(name)
    if processed_tag in self._values and isinstance(
            self._values[processed_tag], BaseField):
        return self._values[processed_tag].get_value()
    else:
        try:
            return Item.__getitem__(self, name)
        except KeyError:
            return None

def PlayerRow(**data):
    # Declare a Field for each provided key and copy the values in.
    item = Item()
    for field, value in data.items():
        item.fields[field] = Field()
        item[field] = value
    return item