class Photo(Item):
    """A photo attached to a house listing."""

    # Foreign key to the House item this photo belongs to.
    house_id = Field()
    # Location of the image itself.
    url = Field()
class Official(Item):
    """Scrapy Item describing one municipal official.

    Fields
    ------
    muniName : name of the municipality
    muniType : type of municipality (borough, township, city)
    office : name of the office held
    district : ward, district, or other internal division (callers
        default this to AT-LARGE)
    name, email, phone, address : the official's contact details
    url : response.url the information was scraped from
    termStart : start of the term in office, ISO-format date string
    termEnd : end of the term in office, ISO-format date string
    vacant : position status, boolean
    """

    muniName = Field()
    muniType = Field()
    office = Field()
    district = Field()
    name = Field()
    email = Field()
    phone = Field()
    address = Field()
    url = Field()
    termStart = Field()
    termEnd = Field()
    vacant = Field()

    def setdefault(self, key, value):
        """Store *value* under *key* only when the key is not yet set."""
        if key in self:
            return
        self[key] = value
class CustomItem(Item):
    """Minimal item carrying a single ``name`` field."""

    name = Field()

    def __str__(self):
        # Render as "name: <value>" for readable log output.
        return "name: %s" % self['name']
class ZhihuAnswerItem(Item):
    """One Zhihu answer.

    Scalar fields concatenate their extracted fragments into a single
    string; ``summary``/``content`` additionally have all whitespace
    removed (join, split on whitespace, re-join).
    """

    url = Field(output_processor=Join(separator=''))
    answer_id = Field(output_processor=Join(separator=''))
    user_url = Field(output_processor=Join(separator=''))
    question_id = Field(output_processor=Join(separator=''))
    question_url = Field(output_processor=Join(separator=''))
    agree_num = Field(output_processor=Join(separator=''))
    summary = Field(output_processor=Compose(''.join, str.split, ''.join))
    content = Field(output_processor=Compose(''.join, str.split, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    comment_num = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
class DatasetItem(Item):
    """A scraped dataset entry: where it lives, what it is called,
    and how often it is updated."""

    url = Field()
    name = Field()
    frequency = Field()
class EpisodeItem(Item):
    """A single video episode plus the crawl bookkeeping around it."""

    # Identity of the episode and its show on the source site.
    show_id = Field()
    video_id = Field()
    owner_show_id = Field()
    # Descriptive metadata.
    title = Field()
    tag = Field()
    category = Field()
    played = Field()
    upload_time = Field()
    # Crawl provenance.
    spider_id = Field()
    site_id = Field()
    url = Field()
    thumb_url = Field()
    description = Field()
    stash = Field()
    duration = Field()
    priority = Field()
    format_id = Field()
    audit = Field()
    # Internal classification identifiers.
    kw_id = Field()
    pg_id = Field()
    cat_id = Field()
    subject_id = Field()
class T66yItem(Item):
    """A forum thread: its title and link."""

    title = Field()
    url = Field()
class Article(scrapy.Item):
    """An RSS-style article entry.

    NOTE(review): a second ``Article`` Item appears later in this
    source; if both definitions end up in the same module the later one
    shadows this one — confirm they originate from separate files.
    """

    title = Field()
    link = Field()
    desc = Field()
    pubDate = Field()
    source = Field()
class YelpReview(Item):
    """One Yelp review together with its restaurant and reviewer.

    All fields are strings unless noted otherwise; single-valued fields
    use ``TakeFirst`` so only the first extracted value is kept.
    """

    # --- Crawl bookkeeping -------------------------------------------
    crawl_date = Field(output_processor=TakeFirst())   # date of the crawl
    page_url = Field(output_processor=TakeFirst())     # crawled page URL

    # --- Restaurant ---------------------------------------------------
    yelp_biz_id = Field(output_processor=TakeFirst())  # Yelp biz ID
    restaurant_name = Field(output_processor=TakeFirst())
    restaurant_address = Field(output_processor=TakeFirst())
    restaurant_city = Field(output_processor=TakeFirst())
    restaurant_state = Field(output_processor=TakeFirst())
    restaurant_postal_code = Field(output_processor=TakeFirst())
    restaurant_phone = Field(output_processor=TakeFirst())
    restaurant_website = Field(output_processor=TakeFirst())
    restaurant_reviews_count = Field(output_processor=TakeFirst())
    restaurant_rating = Field(output_processor=TakeFirst())
    # May be a single string or a list of categories, so no TakeFirst.
    restaurant_category = Field()

    # --- Review -------------------------------------------------------
    review_id = Field(output_processor=TakeFirst())
    # A list of paragraphs, so no TakeFirst.
    review_content = Field()
    review_content_date = Field(output_processor=TakeFirst())
    reviewer_restaurant_rating = Field(output_processor=TakeFirst())

    # --- Reviewer -----------------------------------------------------
    reviewer_name = Field(output_processor=TakeFirst())
    reviewer_url = Field(output_processor=TakeFirst())
    reviewer_location = Field(output_processor=TakeFirst())
    reviewer_friends_count = Field(output_processor=TakeFirst())
    reviewer_reviews_count = Field(output_processor=TakeFirst())
class Project(Item):
    """A project reference: display name plus link."""

    name = Field()
    url = Field()
class HackernewsItem(Item):
    """A Hacker News story: headline and target link."""

    title = Field()
    url = Field()
class House(Item):
    """An Airbnb-style house listing."""

    name = Field()
    query_text = Field()     # search query that surfaced this listing
    # Capacity and layout.
    guests = Field()
    bedroom = Field()
    bed = Field()
    bath = Field()
    # Listing details.
    amenities = Field()
    rules = Field()
    reviews_count = Field()
    city = Field()
    tp = Field()
    airbnb_id = Field()
    guest_label = Field()
    # Geolocation.
    lat = Field()
    lng = Field()
class Review(Item):
    """A guest review left on a house listing."""

    house_id = Field()     # listing the review belongs to
    username = Field()     # reviewer display name
    review = Field()       # review body text
    rating = Field()
    date_review = Field()
class User(Item):
    """A user associated with a house listing."""

    house_id = Field()   # listing this user is linked to
    username = Field()
    pic_url = Field()    # avatar image URL
class PersonMore(Item):
    """Detail record for one published court-defaulter entry.

    When constructed without a source item, every field is pre-filled:
    ``cid`` with 0 and every other field with the empty string, so
    downstream code can read any field without a KeyError.
    """

    cid = Field()
    name = Field()
    caseCode = Field()
    age = Field()
    sex = Field()
    cardNum = Field()
    courtName = Field()
    areaName = Field()
    partyTypeName = Field()
    gistId = Field()
    regDate = Field()
    gistUnit = Field()
    duty = Field()
    performance = Field()
    disruptTypeName = Field()
    publishDate = Field()
    detailLink = Field()

    def __init__(self, item=None):
        """Copy *item* when given; otherwise fill every field with a default.

        Parameters
        ----------
        item : mapping, optional
            Existing data to initialise from. ``None`` (the default)
            produces a fully defaulted record.
        """
        # Explicit two-argument super() call: sibling code in this
        # project is Python-2 era, and this form works on both 2 and 3.
        if item is None:
            super(PersonMore, self).__init__()
            self['cid'] = 0
            # All remaining declared fields default to the empty string.
            for field in ('name', 'caseCode', 'age', 'sex', 'cardNum',
                          'courtName', 'areaName', 'partyTypeName',
                          'gistId', 'regDate', 'gistUnit', 'duty',
                          'performance', 'disruptTypeName',
                          'publishDate', 'detailLink'):
                self[field] = ""
        else:
            super(PersonMore, self).__init__(item)
class PersonProfileItem(Item):
    """Scraped profile of a person/company page.

    Fix: the original declared ``education = Field()`` twice; the
    duplicate declaration was redundant (the second simply rebound the
    same attribute) and has been removed.
    """

    company_name = Field()
    UID = Field()
    legal_form = Field()
    also_view = Field()
    education = Field()
    locality = Field()
    industry = Field()
    summary = Field()
    specilities = Field()   # field name kept as-is: callers key on it
    skills = Field()
    interests = Field()
    group = Field()
    honors = Field()
    experience = Field()
    overview_html = Field()
    homepage = Field()
class UserItem(Item):
    """A video-site user/channel profile plus crawl provenance."""

    owner_id = Field()
    show_id = Field()
    user_name = Field()
    intro = Field()
    played = Field()
    fans = Field()
    vcount = Field()      # video count
    spider_id = Field()
    site_id = Field()
    url = Field()
    aka = Field()
class BaseItem(Item):
    """Keeps only the blog post's URL and the raw page content."""

    url = Field()
    html_content = Field()
class Article(Item):
    """A crawled article: headline and link.

    NOTE(review): an earlier ``Article`` Item also exists in this
    source; if both definitions share one module, this one shadows the
    earlier — confirm they originate from separate files.
    """

    title = Field()
    url = Field()
class TagItem(Item):
    """Item used when scraping the tag list."""

    tag = Field()
class SpiderItem(Item):
    """Generic article item.

    Scalar fields concatenate their extracted fragments into one
    string; ``abstract``/``content`` are additionally cleaned via the
    project helper ``removern`` (presumably strips \\r\\n noise — TODO
    confirm against its definition).
    """

    url = Field(output_processor=Join(separator=''))
    date = Field(output_processor=Join(separator=''))
    source = Field(output_processor=Join(separator=''))
    title = Field(output_processor=Join(separator=''))
    abstract = Field(output_processor=Compose(''.join, removern, ''.join))
    content = Field(output_processor=Compose(''.join, removern, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    view_num = Field(output_processor=Join(separator=''))
    brief = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
class CateItem(Item):
    """Item used when scraping the category list."""

    cate = Field()
class ErfMeta(Item):
    """Product metadata: the GTIN barcode identifier only."""

    gtin = Field()
class BricomanMeta(Item):
    """Bricoman product identifiers."""

    code = Field()    # retailer's internal product code
    model = Field()
    ean = Field()     # EAN barcode
item.fields[key] = Field() l.add_value(key, Some_Info[key]) yield l.load_item() else: #感觉这里不能用itemloader的add_xxx方法了,因为要先找到一个页面所有的含有目标item的块,再在每个块里面提取出单个item,itemloader的话是一次性直接全取出,add_xpath不能再细分了;;打算用add_value方法 my_Final_Xpath = Final_Xpath.copy() All_Xpath = my_Final_Xpath['All_Xpath'].copy() del my_Final_Xpath['All_Xpath'] all_xpath = All_Xpath['all_xpath'] del All_Xpath['all_xpath'] for i in response.xpath(all_xpath[0]): item = NettvSpiderItem() l = ItemLoader(item=item, response=response) #把All_Xpath中的数据提取出来 for key in All_Xpath.keys(): item.fields[key] = Field() try: #itemloader在add_xxx方法找不到值的时候,会自动忽略这个字段,可是我不想忽略它,这时候需要将其置为空("") if "".join( map(lambda x: i.xpath(x).extract(), All_Xpath[key])[0]) == '': map(lambda x: l.add_value(key, ""), All_Xpath[key]) else: map( lambda x: l.add_value(key, i.xpath(x).extract()), All_Xpath[key]) except Exception, e: print Exception, ",", e #将除了All_Xpath中的数据提取出来,像豆瓣就特别需要这种情况,一般下面的数据是(多次取得),All_Xpath中才是真正单条的数据 for key in my_Final_Xpath.keys():
class productListItem(scrapy.Item):
    """One row of a chemical-supplier product list.

    Every field keeps only the first extracted value.
    (Class name kept lowercase for caller compatibility.)
    """

    ranking = Field(output_processor=TakeFirst())
    company_id = Field(output_processor=TakeFirst())
    cas = Field(output_processor=TakeFirst())
    cas_company_id = Field(output_processor=TakeFirst())
class ScholarScraperItem(Item):
    """A Google-Scholar-style search result record."""

    origin_url = Field()          # URL the record was scraped from
    meta_source_format = Field()
    # Result entry.
    title = Field()
    title_link = Field()
    aut_pub_year = Field()        # authors / publication year line
    # Citation links.
    cited_by = Field()
    cited_by_link = Field()
    related_link = Field()
    versions = Field()
    versions_link = Field()
    # Web of Science identifiers.
    wos_cit = Field()
    wos_ut = Field()
    # Full-text details.
    ft_link = Field()
    ft_format = Field()
    ft_text = Field()
    cited_cluster = Field()
    start_page = Field()
class exampleItem(scrapy.Item):
    """A chemical record keyed by CAS number.

    (Class name kept lowercase for caller compatibility.)
    """

    cas = Field(output_processor=TakeFirst())
    categories = Field()
    # Each synonym is cleaned by the project helper `strip_dot`.
    synonyms = Field(output_processor=MapCompose(strip_dot))
class ScrapyLearnItem(Item):
    """Listing item grouped by how each field is produced."""

    # Primary fields scraped directly from the page.
    title = Field()
    body = Field()

    # Fields calculated from the primary ones.
    images = Field()
    location = Field()

    # Housekeeping / provenance fields.
    url = Field()
    project = Field()
    spider = Field()
    server = Field()
    date = Field()

    # Image-pipeline related fields.
    image_urls = Field()
    img_thumb_url = Field()
    file_size = Field()
    info = Field()
class Data(Item):
    """Container item holding the collected house records."""

    houses = Field()