Example #1
0
class Photo(Item):
    """Item with a ``house_id`` and a ``url`` field.

    NOTE(review): the names suggest one photo URL per item, tied to a house
    record through ``house_id`` -- confirm against the spider that fills it.
    """

    house_id = Field()
    url = Field()
Example #2
0
class Official(Item):
    """
    Object type for municipal officials. Inherits Scrapy Item functions.

    Fields
    ------
    muniName : name of municipality
    muniType : type of municipality (borough, township, city)
    office : name of office
    district : official's ward, district, or other internal division.
        Documented default is AT-LARGE; this class does not apply it
        itself, so callers are expected to set it (e.g. via setdefault).
    name : official's name
    email : official's email
    phone : official's phone number
    address : official's address
    url : response.url for prior info
    termStart : start date of official's term in office, string in ISO format
    termEnd : end date of official's term in office, string in ISO format
    vacant : position status, boolean value

    Methods
    -------
    setdefault(key, value) : assigns ``value`` to ``key`` only when ``key``
        has no value yet, and returns the value stored under ``key``.
    """

    muniName = Field()
    muniType = Field()
    office = Field()
    district = Field()
    name = Field()
    email = Field()
    phone = Field()
    address = Field()
    url = Field()
    termStart = Field()
    termEnd = Field()
    vacant = Field()

    def setdefault(self, key, value=None):
        """Assign ``value`` to ``key`` unless ``key`` already holds a value.

        Follows the ``dict.setdefault`` contract so this override behaves
        like the mapping method it shadows.

        Parameters
        ----------
        key : field name to populate
        value : value stored when ``key`` is not set yet (default ``None``)

        Returns
        -------
        The value stored under ``key`` after the call: the existing one if
        there was one, otherwise ``value``.
        """
        if key not in self:
            self[key] = value
        # Returning the stored value (instead of None) matches dict.setdefault.
        return self[key]
class CustomItem(Item):
    """Item with a single ``name`` field and a custom string form."""

    name = Field()

    def __str__(self):
        """Return ``"name: <value>"`` for the current ``name`` value.

        Any error raised by the ``self['name']`` lookup (for example when
        the field has not been set) propagates unchanged.
        """
        # Wrap the value in a 1-tuple: with a bare ``%`` operand, a tuple
        # value would be unpacked as several format arguments and either
        # raise TypeError or print only its single element.
        return "name: %s" % (self['name'],)
Example #4
0
class ZhihuAnswerItem(Item):
    """Item for one answer; every field declares an ``output_processor``.

    Most fields use ``Join(separator='')``, i.e. the collected values are
    concatenated with an empty separator.
    NOTE(review): ``Join`` / ``Compose`` are presumably the Scrapy item
    loader processors -- confirm against the file's imports.
    """

    url = Field(output_processor=Join(separator=''))
    answer_id = Field(output_processor=Join(separator=''))
    user_url = Field(output_processor=Join(separator=''))
    question_id = Field(output_processor=Join(separator=''))
    question_url = Field(output_processor=Join(separator=''))
    agree_num = Field(output_processor=Join(separator=''))
    # summary/content: join the parts, split on whitespace, join again.
    # Assuming Compose applies the callables in order, this removes all
    # whitespace from the text.
    summary = Field(output_processor=Compose(''.join, str.split, ''.join))
    content = Field(output_processor=Compose(''.join, str.split, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    comment_num = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
Example #5
0
class DatasetItem(Item):
    """Item with ``url``, ``name`` and ``frequency`` fields.

    NOTE(review): ``frequency`` presumably means how often the dataset is
    updated -- confirm against the spider that fills it.
    """

    url = Field()
    name = Field()
    frequency = Field()
Example #6
0
class EpisodeItem(Item):
    """Item for one video episode: identifiers, metadata and crawl bookkeeping.

    All fields are plain ``Field()`` declarations with no processors or
    defaults; their meaning is taken from the field names only.
    """

    # Identifiers of the episode and of the show / owner it belongs to.
    show_id = Field()
    video_id = Field()
    owner_show_id = Field()
    # Descriptive metadata.
    title = Field()
    tag = Field()
    category = Field()
    played = Field()
    upload_time = Field()
    # Crawl bookkeeping.
    spider_id = Field()
    site_id = Field()
    url = Field()
    thumb_url = Field()
    description = Field()
    stash = Field()
    duration = Field()
    priority = Field()
    format_id = Field()
    audit = Field()

    # NOTE(review): presumably keyword / page / category / subject ids that
    # record which crawl task produced the episode -- confirm with callers.
    kw_id = Field()
    pg_id = Field()
    cat_id = Field()
    subject_id = Field()
Example #7
0
class T66yItem(Item):
    """Item with a ``title`` and a ``url`` field."""

    title = Field()
    url = Field()
Example #8
0
class Article(scrapy.Item):
    """Scrapy item for one article entry.

    NOTE(review): the field names (``title``, ``link``, ``desc``,
    ``pubDate``) look like RSS entry elements -- confirm against the spider.
    """

    title = Field()
    link = Field()
    desc = Field()
    pubDate = Field()
    source = Field()
Example #9
0
class YelpReview(Item):
    """One Yelp review plus data about the reviewed restaurant and the reviewer.

    All fields are string unless specified otherwise.

    Every field except ``restaurant_category`` and ``review_content``
    declares ``output_processor=TakeFirst()``; those two fields declare no
    processor.
    NOTE(review): ``TakeFirst`` is presumably the Scrapy item loader
    processor that keeps a single value -- confirm against the imports.
    """

    # Crawl date.
    crawl_date = Field(output_processor=TakeFirst())

    # Crawled page URL.
    page_url = Field(output_processor=TakeFirst())

    # Yelp biz ID.
    yelp_biz_id = Field(output_processor=TakeFirst())

    # Name of the restaurant.
    restaurant_name = Field(output_processor=TakeFirst())

    # Restaurant address.
    restaurant_address = Field(output_processor=TakeFirst())

    # City
    restaurant_city = Field(output_processor=TakeFirst())

    # State
    restaurant_state = Field(output_processor=TakeFirst())

    # Postal code
    restaurant_postal_code = Field(output_processor=TakeFirst())

    # Phone
    restaurant_phone = Field(output_processor=TakeFirst())

    # Restaurant website
    restaurant_website = Field(output_processor=TakeFirst())

    # Restaurant reviews count.
    restaurant_reviews_count = Field(output_processor=TakeFirst())

    # Restaurant rating.
    restaurant_rating = Field(output_processor=TakeFirst())

    # Restaurant category.  This could be a string or a list of categories.
    restaurant_category = Field()

    # Review ID.
    review_id = Field(output_processor=TakeFirst())

    # Review content.  A list of paragraphs.
    review_content = Field()

    # Review content date.
    review_content_date = Field(output_processor=TakeFirst())

    # Reviewer restaurant rating.
    reviewer_restaurant_rating = Field(output_processor=TakeFirst())

    # Reviewer name.
    reviewer_name = Field(output_processor=TakeFirst())

    # Reviewer URL
    reviewer_url = Field(output_processor=TakeFirst())

    # Reviewer location.
    reviewer_location = Field(output_processor=TakeFirst())

    # Reviewer friends count.
    reviewer_friends_count = Field(output_processor=TakeFirst())

    # Reviewer reviews count.
    reviewer_reviews_count = Field(output_processor=TakeFirst())
Example #10
0
class Project(Item):
    """Item with a ``name`` and a ``url`` field."""

    name = Field()
    url = Field()
Example #11
0
class HackernewsItem(Item):
    """Item with a ``title`` and a ``url`` field."""

    title = Field()
    url = Field()
Example #12
0
class House(Item):
    """Item for one house listing.

    All fields are plain ``Field()`` declarations; their meaning is taken
    from the field names only.
    NOTE(review): ``airbnb_id`` suggests the listings come from Airbnb, and
    ``tp`` is presumably a listing/property type -- confirm with the spider.
    """

    name = Field()
    # Presumably the search query that led to this listing -- TODO confirm.
    query_text = Field()
    # Capacity and room counts.
    guests = Field()
    bedroom = Field()
    bed = Field()
    bath = Field()
    amenities = Field()
    rules = Field()
    reviews_count = Field()
    city = Field()
    tp = Field()
    airbnb_id = Field()
    guest_label = Field()
    # Presumably geographic latitude / longitude -- TODO confirm.
    lat = Field()
    lng = Field()
Example #13
0
class Review(Item):
    """Item for one review, keyed to a house through ``house_id``.

    NOTE(review): ``house_id`` presumably refers to a House record --
    confirm against the pipeline that stores these items.
    """

    house_id = Field()
    username = Field()
    review = Field()
    rating = Field()
    date_review = Field()
Example #14
0
class User(Item):
    """Item with ``house_id``, ``username`` and ``pic_url`` fields.

    NOTE(review): ``pic_url`` is presumably the user's picture URL and
    ``house_id`` a reference to a House record -- confirm with the spider.
    """

    house_id = Field()
    username = Field()
    pic_url = Field()
Example #15
0
class PersonMore(Item):
    """Item for one person record with case and court details.

    When constructed without arguments, every declared field is
    pre-populated: ``cid`` with ``0`` and all other fields with ``""``.
    When constructed from an existing ``item``, the values are taken from
    it and no defaults are applied.
    """

    cid = Field()
    name = Field()
    caseCode = Field()
    age = Field()
    sex = Field()
    #focusNumber = Field()
    cardNum = Field()
    courtName = Field()
    areaName = Field()
    partyTypeName = Field()
    gistId = Field()
    regDate = Field()
    gistUnit = Field()
    duty = Field()
    performance = Field()
    disruptTypeName = Field()
    publishDate = Field()
    detailLink = Field()

    def __init__(self, item=None):
        """Create the item, optionally initialised from ``item``.

        Parameters
        ----------
        item : optional value forwarded unchanged to ``Item.__init__``.
            When it is ``None`` (the default) the item is created empty and
            filled with the default values described on the class.
        """
        # Identity check: ``== None`` would dispatch to the argument's
        # ``__eq__``, which is not what "no argument given" means.
        if item is not None:
            Item.__init__(self, item)
            return

        Item.__init__(self)
        self['cid'] = 0
        # Fields that default to the empty string. ``focusNumber`` is not
        # declared above, so it is left out here as well.
        empty_string_fields = (
            'name',
            'caseCode',
            'age',
            'sex',
            'cardNum',
            'courtName',
            'areaName',
            'partyTypeName',
            'gistId',
            'regDate',
            'gistUnit',
            'duty',
            'performance',
            'disruptTypeName',
            'publishDate',
            'detailLink',
        )
        for field_name in empty_string_fields:
            self[field_name] = ""
Example #16
0
class PersonProfileItem(Item):
    """Item for one profile page.

    All fields are plain ``Field()`` declarations; their meaning is taken
    from the field names only.
    """

    company_name = Field()
    UID = Field()
    legal_form = Field()
    also_view = Field()
    # ``education`` used to be declared a second time further down; the
    # second declaration only rebound the same name, so it was dropped.
    education = Field()
    locality = Field()
    industry = Field()
    summary = Field()
    # NOTE: "specilities" is misspelled, but the name is the item key used
    # by callers, so it is kept as is.
    specilities = Field()
    skills = Field()
    interests = Field()
    group = Field()
    honors = Field()
    experience = Field()
    overview_html = Field()
    homepage = Field()
Example #17
0
class UserItem(Item):
    """Item for one video-site user / uploader.

    All fields are plain ``Field()`` declarations; their meaning is taken
    from the field names and the one inline comment only.
    """

    owner_id = Field()
    show_id = Field()
    user_name = Field()
    intro = Field()
    played = Field()
    fans = Field()
    vcount = Field()  # video count
    spider_id = Field()
    site_id = Field()
    url = Field()
    # Presumably "also known as" (alias) -- TODO confirm.
    aka = Field()
Example #18
0
class BaseItem(Item):
    """
    Keeps only the blog URL and the page content.
    """
    url = Field()
    html_content = Field()
Example #19
0
class Article(Item):
    """Item with a ``title`` and a ``url`` field."""

    title = Field()
    url = Field()
Example #20
0
class TagItem(Item):
    """
    Item used for scraping the tag list.
    """
    tag = Field()
Example #21
0
class SpiderItem(Item):
    """Item for one crawled article; every field declares an ``output_processor``.

    Most fields use ``Join(separator='')``, i.e. the collected values are
    concatenated with an empty separator.
    NOTE(review): ``Join`` / ``Compose`` are presumably the Scrapy item
    loader processors -- confirm against the file's imports.
    """

    url = Field(output_processor=Join(separator=''))
    date = Field(output_processor=Join(separator=''))
    source = Field(output_processor=Join(separator=''))
    title = Field(output_processor=Join(separator=''))
    # abstract/content: join the parts, pass the text through ``removern``,
    # then join the result again. ``removern`` is defined outside this
    # block; presumably it strips "\r"/"\n" -- TODO confirm.
    abstract = Field(output_processor=Compose(''.join, removern, ''.join))
    content = Field(output_processor=Compose(''.join, removern, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    view_num = Field(output_processor=Join(separator=''))
    brief = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
Example #22
0
class CateItem(Item):
    """
    Item used for scraping the category list.
    """
    cate = Field()
Example #23
0
class ErfMeta(Item):
    """Item with a single ``gtin`` field.

    NOTE(review): presumably a Global Trade Item Number -- confirm.
    """

    gtin = Field()
Example #24
0
class BricomanMeta(Item):
    """Item with ``code``, ``model`` and ``ean`` fields.

    NOTE(review): ``ean`` is presumably a European Article Number
    (barcode) -- confirm against the spider that fills it.
    """

    code = Field()
    model = Field()
    ean = Field()
Example #25
0
             item.fields[key] = Field()
             l.add_value(key, Some_Info[key])
     yield l.load_item()
 else:
     #感觉这里不能用itemloader的add_xxx方法了,因为要先找到一个页面所有的含有目标item的块,再在每个块里面提取出单个item,itemloader的话是一次性直接全取出,add_xpath不能再细分了;;打算用add_value方法
     my_Final_Xpath = Final_Xpath.copy()
     All_Xpath = my_Final_Xpath['All_Xpath'].copy()
     del my_Final_Xpath['All_Xpath']
     all_xpath = All_Xpath['all_xpath']
     del All_Xpath['all_xpath']
     for i in response.xpath(all_xpath[0]):
         item = NettvSpiderItem()
         l = ItemLoader(item=item, response=response)
         #把All_Xpath中的数据提取出来
         for key in All_Xpath.keys():
             item.fields[key] = Field()
             try:
                 #itemloader在add_xxx方法找不到值的时候,会自动忽略这个字段,可是我不想忽略它,这时候需要将其置为空("")
                 if "".join(
                         map(lambda x: i.xpath(x).extract(),
                             All_Xpath[key])[0]) == '':
                     map(lambda x: l.add_value(key, ""), All_Xpath[key])
                 else:
                     map(
                         lambda x: l.add_value(key,
                                               i.xpath(x).extract()),
                         All_Xpath[key])
             except Exception, e:
                 print Exception, ",", e
         #将除了All_Xpath中的数据提取出来,像豆瓣就特别需要这种情况,一般下面的数据是(多次取得),All_Xpath中才是真正单条的数据
         for key in my_Final_Xpath.keys():
Example #26
0
class productListItem(scrapy.Item):
    """Scrapy item for one product-list row.

    Every field declares ``output_processor=TakeFirst()``.
    NOTE(review): ``cas`` is presumably a CAS registry number and
    ``TakeFirst`` the Scrapy item loader processor -- confirm.
    """

    ranking = Field(output_processor=TakeFirst())
    company_id = Field(output_processor=TakeFirst())
    cas = Field(output_processor=TakeFirst())
    cas_company_id = Field(output_processor=TakeFirst())
Example #27
0
class ScholarScraperItem(Item):
    """Item for one scholarly search result.

    All fields are plain ``Field()`` declarations; their meaning is taken
    from the field names only.
    """

    # Where the record was scraped from.
    origin_url = Field()
    meta_source_format = Field()
    # Result title and its link.
    title = Field()
    title_link = Field()
    # Presumably the combined "authors / publication / year" line -- TODO confirm.
    aut_pub_year = Field()
    # Citation, related-article and version information with their links.
    cited_by = Field()
    cited_by_link = Field()
    related_link = Field()
    versions = Field()
    versions_link = Field()
    # NOTE(review): "wos" presumably stands for Web of Science -- confirm.
    wos_cit = Field()
    wos_ut = Field()
    # NOTE(review): "ft" presumably stands for full text -- confirm.
    ft_link = Field()
    ft_format = Field()
    ft_text = Field()
    cited_cluster = Field()
    start_page = Field()
Example #28
0
class exampleItem(scrapy.Item):
    """Scrapy item with ``cas``, ``categories`` and ``synonyms`` fields."""

    # Declared with TakeFirst(); presumably keeps a single value -- confirm.
    cas = Field(output_processor=TakeFirst())
    # No processor declared for categories.
    categories = Field()
    # ``strip_dot`` is defined outside this block; MapCompose presumably
    # applies it to each collected synonym -- TODO confirm.
    synonyms = Field(output_processor=MapCompose(strip_dot))
Example #29
0
class ScrapyLearnItem(Item):
    """Item grouping primary, calculated and housekeeping fields.

    All fields are plain ``Field()`` declarations; the grouping below
    follows the original inline comments.
    """

    # Primary fields
    title = Field()
    body = Field()
    # Calculated fields
    images = Field()
    location = Field()
    # HouseKeeping fields
    url = Field()
    project = Field()
    spider = Field()
    server = Field()
    date = Field()
    # Additional fields (the original comment here was just "uu"; its
    # meaning is unclear -- TODO confirm).
    # NOTE(review): ``image_urls`` together with ``images`` above matches
    # the field names Scrapy's images pipeline uses by default -- confirm
    # whether that pipeline is enabled.
    image_urls = Field()
    img_thumb_url = Field()
    file_size = Field()
    info = Field()
Example #30
0
class Data(Item):
    """Item with a single ``houses`` field.

    NOTE(review): presumably holds a collection of house records --
    confirm against the code that fills it.
    """

    houses = Field()