class _ToriDealItem(Item):
    """
    Returned fields:

    * url (str): URL of the scraped web page.
    * time (int): UNIX timestamp of the scraping.
    * seller (str): Seller of the item.
    * name (str): Name of the item.
    * description (list of str): Description of the item.
    * price (str): Price of the item.
    * type (str): Type of the deal.
    * published (str): Publish time of the deal.
    * images (list of dict): Images of the item.
    """

    # Pass-through values; only the first is kept on output.
    url = Field(input_processor=Identity(), output_processor=TakeFirst())
    time = Field(input_processor=Identity(), output_processor=TakeFirst())

    # Free-text fields: drop blanks and join paragraphs before picking one.
    seller = Field(
        input_processor=Compose(drop_empty_elements, paragraph_join),
        output_processor=TakeFirst(),
    )
    name = Field(input_processor=strip_join, output_processor=TakeFirst())
    description = Field(
        input_processor=Compose(drop_empty_elements, paragraph_join),
        output_processor=TakeFirst(),
    )
    price = Field(input_processor=strip_join, output_processor=TakeFirst())
    type = Field(input_processor=strip_join, output_processor=TakeFirst())
    published = Field(input_processor=strip_join, output_processor=TakeFirst())

    # Each raw image value is mapped to a metadata dict; the list is kept.
    images = Field(
        input_processor=MapCompose(_ToriDealSpider._get_image_metadata),
        output_processor=Identity(),
    )
def parse_other(self, response: HtmlResponse):
    """Parse a 51job posting page and yield the loaded item.

    Extracts experience/education/headcount from the ``title`` attribute
    of the ``msg ltype`` paragraph (a ``|``-separated string), company
    facts from the ``com_tag`` block, and splits the job description
    from the job-category tail at the ``职能类别`` marker.
    """
    item = response.meta['item']
    loader = QianchengItemLoader(item, response)
    _extract_info = partial(extract_info, response)

    # Evaluate the XPath once (the original ran the identical query
    # twice); fall back to five "空" placeholders when it matches nothing.
    raw_info = _extract_info("//p[@class='msg ltype']/@title")
    info_text = raw_info[0].split("|") if len(raw_info) != 0 else ["空"] * 5

    loader.add_value("experience", info_text[1])
    # A five-part string carries education; a shorter one omits it and
    # shifts the headcount slot left by one.
    loader.add_value("education",
                     info_text[2] if len(info_text) == 5 else "空")
    loader.add_value("job_number",
                     info_text[3] if len(info_text) == 5 else info_text[2])
    loader.add_xpath("advantage",
                     '//div[@class="jtag"]/div//span/text()',
                     processors=Compose(Join()))

    info = _extract_info("//div[@class='com_tag']/p/@title")
    loader.add_value("company_nature", info[0] if len(info) != 0 else "空")
    loader.add_value("company_size", info[1] if len(info) != 0 else "空")
    loader.add_value("company_industry", info[2] if len(info) != 0 else "空")
    loader.add_xpath("company_address",
                     "//*[text()='联系方式']/parent::*/parent::*//p/text()",
                     processors=Compose(Join(""), self.replace_all_n))

    # Everything before the "职能类别" marker is the description; the
    # marker and what follows is the job-category block.
    info2 = self.replace_all_n("".join(
        _extract_info(
            u"//*[text()='职位信息']/parent::*/parent::*/div//p//text()")))
    loc_div = info2.find(u"职能类别")
    loader.add_value("job_content", info2[:loc_div])
    loader.add_value("job_kind", info2[loc_div:])
    yield loader.load_item()
class FeedEntryItemLoader(BaseItemLoader):
    """Loader for feed entries; wires per-field cleanup pipelines."""

    default_item_class = FeedEntryItem

    # Field specific

    # Plain-text content: drop falsy values, strip, untag, join by line.
    content_text_in = MapCompose(skip_false, str.strip, remove_tags)
    content_text_out = Join("\n")

    # HTML content runs through the full tree-cleaning pipeline before
    # being serialized back to a (possibly truncated) string.
    content_html_in = MapCompose(
        skip_false,
        replace_regex,
        build_tree,
        convert_footnotes,
        pullup_elems,
        replace_elems,
        remove_elems,
        change_attribs,
        change_tags,
        cleanup_html,
        convert_iframes,
        lxml_cleaner,
        flatten_tree,
        skip_empty_tree,
        make_links_absolute,
        make_srcset_absolute,
        serialize_tree,
    )
    content_html_out = Compose(Join(), truncate_text)

    # Use sorted to keep the output stable.
    category_out = Compose(set, sorted)

    # Enclosures pass through untouched as lists.
    enclosure_in = Identity()
    enclosure_out = Identity()
def test_compose(self):
    # First callable picks the head of the list, second upper-cases it.
    pipeline = Compose(lambda values: values[0], str.upper)
    self.assertEqual(pipeline(["hello", "world"]), "HELLO")
    # By default a None value short-circuits the chain.
    pipeline = Compose(str.upper)
    self.assertEqual(pipeline(None), None)
    # With stop_on_none=False the None is passed through and blows up.
    pipeline = Compose(str.upper, stop_on_none=False)
    self.assertRaises(ValueError, pipeline, None)
    # Errors raised inside a composed callable surface as ValueError.
    pipeline = Compose(str.upper, lambda x: x + 1)
    self.assertRaises(ValueError, pipeline, "hello")
def test_compose(self):
    # Chained callables run left to right.
    chain = Compose(lambda vals: vals[0], str.upper)
    self.assertEqual(chain(['hello', 'world']), 'HELLO')
    # None stops the chain by default.
    chain = Compose(str.upper)
    self.assertEqual(chain(None), None)
    # Without the None short-circuit, str.upper(None) raises.
    chain = Compose(str.upper, stop_on_none=False)
    with self.assertRaises(ValueError):
        chain(None)
    # Exceptions from inner callables surface as ValueError.
    chain = Compose(str.upper, lambda x: x + 1)
    with self.assertRaises(ValueError):
        chain('hello')
class Producte(scrapy.Item):
    """Scraped product with price/stock bookkeeping fields."""

    nom = scrapy.Field(
        output_processor=Compose(TakeFirst(), neteja_caracters, trimx))
    editorial = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    preu = scrapy.Field(
        output_processor=Compose(TakeFirst(), remove_tags, neteja_moneda))
    preu_original = scrapy.Field(
        output_processor=Compose(TakeFirst(), remove_tags, neteja_moneda))
    stock = scrapy.Field(output_processor=Compose(TakeFirst(), trimx))
    status_stock = scrapy.Field()
    status_preu = scrapy.Field()
    date_lastseen = scrapy.Field(serializer=str)
    date_created = scrapy.Field(serializer=str)
    date_updated = scrapy.Field(serializer=str)
    botiga = scrapy.Field(serializer=str)
    _id = scrapy.Field(serializer=str)

    def test(self):
        """Return the product name (smoke-test accessor)."""
        return self['nom']

    def init_new(self):
        """Initialise bookkeeping fields for a freshly scraped product.

        Normalises ``preu``/``preu_original`` to floats, defaults
        missing stock to "N/A", and keys the item by its URL.
        """
        self['date_created'] = date.today().isoformat()
        self['date_lastseen'] = date.today().isoformat()
        self['status_stock'] = 'NOU'
        self['status_preu'] = 'IGUAL'
        # Missing or empty price becomes 0 so comparisons stay numeric.
        if "preu" not in self:
            self['preu'] = 0
        elif self['preu'] != '':
            self['preu'] = float(self['preu'])
        else:
            self['preu'] = 0
        if "preu_original" in self:
            # Narrowed from a bare ``except``: only conversion failures
            # should fall back to 0.0 — a bare except also swallowed
            # KeyboardInterrupt/SystemExit.
            try:
                self['preu_original'] = float(self['preu_original'])
            except (TypeError, ValueError):
                self['preu_original'] = float(0)
        else:
            self['preu'] = float(self['preu'])
        if "stock" not in self:
            self['stock'] = "N/A"
        self['_id'] = self['url']

    def iguals(self, producteDB):
        """Return True when price, stock and last-seen date all match."""
        return all([
            self['preu'] == producteDB['preu'],
            self['stock'] == producteDB['stock'],
            producteDB['date_lastseen'] == date.today().isoformat()
        ])
class UserComment(scrapy.Item):
    """A single user review/comment."""

    username = _name
    use = _name
    # Split off the "em " prefix, then keep only the first piece.
    # ``Compose(TakeFirst())`` was a needless wrapper around TakeFirst;
    # TakeFirst alone behaves identically as an output processor.
    date = scrapy.Field(input_processor=MapCompose(lambda d: d.split('em ')),
                        output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    stars = _stars
    recommended = _name
    text = _name
    useful = scrapy.Field(output_processor=format_usefulness)
class ProductLoader(ItemLoader):
    """Loader that normalises scraped product fields."""

    default_output_processor = TakeFirst()

    # Text casing.
    product_name_in = MapCompose(str.title)
    product_brand_in = MapCompose(str.upper)

    # Category is normalised, then rendered as a breadcrumb path.
    product_category_in = Compose(normalize_taxonomy)
    product_category_out = Join(separator=" >> ")

    # Prices pass through the shared numeric filter.
    product_price_in = MapCompose(filter_price)
    product_sale_price_in = MapCompose(filter_price)

    # Image links stay a list after URL generation.
    product_img_links_in = Compose(generate_img_url)
    product_img_links_out = Identity()
class Loader(ItemLoader):
    """Loader for job postings: URL cleanup plus list-valued fields."""

    default_output_processor = TakeFirst()

    # Links: unwrap the proxy and normalise the first URL value.
    link_in = Compose(first, clean_proxied_url, clean_url)
    company_link_in = Compose(first, clean_url)

    # Multi-valued fields are lower-cased, split, and kept as lists.
    employment_types_in = MapCompose(str.lower, split)
    employment_types_out = Identity()
    experience_levels_in = MapCompose(str.lower, split)
    experience_levels_out = Identity()
    company_logo_urls_out = Identity()
    locations_raw_out = Identity()

    # Scalars with parsing helpers.
    posted_at_in = Compose(first, parse_relative_date)
    remote_in = MapCompose(parse_remote)
class ImmoScoutLoader(ItemLoader):
    """Loader mapping ImmoScout's positional text fragments to fields."""

    default_output_processor = TakeFirst()

    # Address block: fragments are positional within the listing text.
    address_in = Compose(first_element)
    zip_code_in = Compose(second_element)
    city_in = Compose(fourth_element)
    canton_in = Compose(fifth_element, remove_comma)

    # Numeric facts arrive with unit suffixes that get stripped.
    rooms_in = Compose(first_element, remove_room)
    area_m2_in = Compose(third_element, remove_m2)
    price_chf_in = Compose(first_element, remove_chf)
    utilities_chf_in = Compose(second_element, remove_chf)

    date_available_in = Compose(second_element)
    floor_in = Compose(second_element)
class _Suomi24PageItem(Item):
    # ``__doc__`` is assigned explicitly so the comment/response field
    # docs from the child items can be appended to the page docs.
    __doc__ = """
    Returned page fields:

    * url (str): URL of the scraped web page.
    * time (int): UNIX timestamp of the scraping.
    * title (str): Title of the thread.
    * content (str): Content of the first message.
    * comments (str): Comments of the thread page.
    * published (str): Publish time of the thread.
    * author (str): Author of the thread.
    * n_comments (int): Number of comments in the thread.
    * views (str): Number of views.
    """ + _Suomi24CommentItem.__doc__ + _Suomi24CommentResponseItem.__doc__

    url = Field(input_processor=Identity(), output_processor=TakeFirst())
    time = Field(input_processor=Identity(), output_processor=TakeFirst())
    title = Field(input_processor=strip_join, output_processor=TakeFirst())
    content = Field(input_processor=paragraph_join,
                    output_processor=TakeFirst())
    # The comments list is preserved as-is.
    comments = Field(input_processor=Identity(), output_processor=Identity())
    published = Field(input_processor=strip_join,
                      output_processor=Compose(strip_elements, TakeFirst()))
    author = Field(input_processor=strip_join, output_processor=TakeFirst())
    n_comments = Field(input_processor=MapCompose(safe_cast_int),
                       output_processor=TakeFirst())
    views = Field(input_processor=strip_join, output_processor=TakeFirst())
class UserRating(scrapy.Item):
    """Aggregate rating row for a single user."""

    name = _name
    stars = _stars
    # Take the first value, extract its numbers, keep the first number.
    ratings = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_numbers, TakeFirst()))
    approval_rate = _name
    comments = scrapy.Field()
class MpdataItem(scrapy.Item):
    """
    Defines the item fields and specifies processors for each field
    """

    # Full name: title-case and strip each fragment, join with spaces.
    name = scrapy.Field(
        output_processor=Join(' '),
        input_processor=MapCompose(str.title, str.strip),
    )
    image = scrapy.Field(output_processor=TakeFirst())
    birthdate = scrapy.Field(output_processor=TakeFirst())
    # Falls back to "-" when the scraped value is too short to be a place.
    birthplace = scrapy.Field(
        output_processor=Compose(lambda x: x[0] if len(x[0]) > 2 else "-"),
        input_processor=TakeFirst(),
    )
    profession = scrapy.Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(str.strip),
    )
    languages = scrapy.Field(
        output_processor=Join(', '),
        input_processor=MapCompose(str.strip),
    )
    party = scrapy.Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(str.strip),
    )
    electoral_district = scrapy.Field(output_processor=TakeFirst())
    first_time_mp = scrapy.Field(
        output_processor=Join(', '),
        input_processor=MapCompose(str.strip),
    )
    email = scrapy.Field(output_processor=TakeFirst())
class LeroyparserItem(scrapy.Item):
    """Leroy Merlin product item."""

    name = scrapy.Field()
    _id = scrapy.Field()
    article = scrapy.Field()
    price = scrapy.Field()
    # The selector yields a list, so keep only its first value.
    url = scrapy.Field(output_processor=TakeFirst())
    # Convert the scraped key/value list into a dict.
    characteristic = scrapy.Field(
        output_processor=Compose(get_charakteristics))
    photos = scrapy.Field()
    # Pick the required keys out of the resulting dict.
    general = scrapy.Field(input_processor=Compose(get_general),
                           output_processor=TakeFirst())
    # Needed to build the photo catalogue directory structure.
    main = scrapy.Field(output_processor=Compose(get_main))
class Street(scrapy.Item):
    """A street record with its demographics payload."""

    name = scrapy.Field()
    stadtteil = scrapy.Field()
    link = scrapy.Field()
    # Keep the first scraped value and run it through the ``jsonify``
    # helper (defined elsewhere in the project).
    demographics = scrapy.Field(
        input_processor=Compose(TakeFirst(), jsonify))
class BookItem(scrapy.Item):
    """Book details page item."""

    # --- Scalars ---
    url = Field()
    title = Field(input_processor=MapCompose(str.strip))
    author = Field(input_processor=MapCompose(str.strip))
    num_ratings = Field(input_processor=MapCompose(str.strip, int))
    num_reviews = Field(input_processor=MapCompose(str.strip, int))
    avg_rating = Field(input_processor=MapCompose(str.strip, float))
    num_pages = Field(
        input_processor=MapCompose(str.strip, num_page_extractor, int))
    language = Field(input_processor=MapCompose(str.strip))
    publish_date = Field(input_processor=extract_publish_dates)
    original_publish_year = Field(
        input_processor=MapCompose(extract_year, int))
    isbn = Field(input_processor=MapCompose(str.strip, isbn_filter))
    isbn13 = Field(input_processor=MapCompose(str.strip, isbn13_filter))
    asin = Field(input_processor=MapCompose(filter_asin))
    series = Field()

    # --- Lists ---
    awards = Field(output_processor=Identity())
    places = Field(output_processor=Identity())
    characters = Field(output_processor=Identity())
    # De-duplicated via set (element order is not preserved).
    genres = Field(output_processor=Compose(set, list))

    # --- Dicts ---
    rating_histogram = Field(input_processor=MapCompose(extract_ratings))
class IggItem(scrapy.Item):
    """Game entry; text fields are concatenated into single strings."""

    # Title: first match, custom filter, then whitespace trim.
    title = scrapy.Field(
        output_processor=Join(''),
        input_processor=Compose(TakeFirst(), filter_title,
                                lambda v: v.strip()))
    developer = scrapy.Field(
        output_processor=Join(''),
        input_processor=MapCompose(lambda v: v.strip()))
    publisher = scrapy.Field(
        output_processor=Join(''),
        input_processor=MapCompose(lambda v: v.strip()))
    release_date = scrapy.Field(
        output_processor=Join(''),
        input_processor=MapCompose(lambda v: v.strip()))
    # Multi-valued fields: trim each entry, drop empties, keep the list.
    genre = scrapy.Field(
        output_processor=Identity(),
        input_processor=MapCompose(lambda v: v.strip(), filter_empty))
    links = scrapy.Field(
        output_processor=Identity(),
        input_processor=MapCompose(lambda v: v.strip(), filter_empty))
class Listing(scrapy.Item):
    """Real-estate listing scraped from a classifieds page."""

    heading = scrapy.Field(
        input_processor=MapCompose(lambda s: s.strip(),
                                   decode_special_chars))
    link = scrapy.Field(output_processor=TakeFirst())
    descr = scrapy.Field(
        input_processor=MapCompose(lambda s: s.strip(),
                                   decode_special_chars))
    size = scrapy.Field(
        input_processor=Compose(extract_size, decode_special_chars))
    rooms = scrapy.Field(
        input_processor=Compose(extract_rooms, decode_special_chars))
    # Price: drop the euro sign, thousands dots, "VB" marker and blanks.
    preis = scrapy.Field(input_processor=MapCompose(
        lambda s: s.replace("€", "").replace(".", "").replace("VB", "").strip(
        ).replace(" ", ""),
        decode_special_chars))
    ausstattung = scrapy.Field(
        input_processor=Compose(Join("; "), decode_special_chars))
class ApodScraperItemLoader(ItemLoader):
    """Loader for Astronomy Picture of the Day entries."""

    default_item_class = ApodScraperItem

    # Date keeps the first stripped value; title fragments are joined.
    date_in = MapCompose(str.strip)
    date_out = TakeFirst()
    title_in = MapCompose(str.strip)
    title_out = Join()

    # Relative image URLs are made absolute.
    image_urls_in = MapCompose(add_base_url)
    credits_in = Compose(extract_credits)
class QuoteLoader(ItemLoader):
    """Loader that round-trips field values through UTF-8 bytes."""

    # Plain functions instead of lambda assignments (PEP 8): encode on
    # the way in, decode on the way out.
    def g(x):
        return bytes(x, 'utf-8')

    def f(x):
        return x.decode('utf-8')

    author_in = MapCompose(g)
    quote_in = MapCompose(g)
    tags_in = MapCompose(g)
    author_out = MapCompose(f)
    quote_out = MapCompose(f)
    tags_out = Compose(MapCompose(f), Join('|'))
class LeruamerlenparserItem(scrapy.Item):
    """Leroy Merlin product parsed into normalised fields."""

    name = scrapy.Field(output_processor=TakeFirst())
    # MapCompose() with no functions is effectively a pass-through.
    photos = scrapy.Field(input_processor=MapCompose())
    characteristics = scrapy.Field(
        input_processor=Compose(process_characteristics),
        output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(process_price),
                         output_processor=TakeFirst())
def test_error_processor_as_argument(self):
    class TestItem(Item):
        name = Field()

    class TestItemLoader(ItemLoader):
        default_item_class = TestItem

    # float() over a list of strings must surface as ValueError.
    il = TestItemLoader()
    with self.assertRaises(ValueError):
        il.add_value('name', [u'marta', u'other'], Compose(float))
def parse_page(self, response):
    """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    # NOTE: the docstring above is a Scrapy spider contract, not prose;
    # its @url/@returns/@scrapes lines are parsed by `scrapy check`.
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(
        s, '//*[contains(@class, "inline-share-tools")]')
    mutate_selector_del_xpath(
        s, '//*[contains(@class, "article-print-url")]')
    mutate_selector_del_xpath(s, '//aside')

    l = NewsLoader(selector=s)

    l.add_xpath('bylines', 'head/meta[@name="cXenseParse:author"]/@content')

    # Section metadata comes out as "news,world". For this, take "News".
    l.add_xpath(
        'section',
        'head/meta[@itemprop="articleSection"]/@content',
        Compose(
            TakeFirst(),
            lambda x: x.split(','),
            TakeFirst(),
            lambda x: x.title(),
        ))

    # Video pages
    l.add_xpath('summary',
                '//p[contains(@class, "vgm-video-description")]//text()')

    # USA Today provide timestamps to millisecond precision, in a format
    # which dateparser can't handle.
    l.add_xpath(
        'firstpubtime',
        '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork
    l.add_xpath(
        'modtime',
        '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)

    return l.load_item()
class Loader(ItemLoader):
    """Loader for job postings (ISO dates, HTML-escaped titles)."""

    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    # Unescape HTML entities in titles and company names.
    title_in = MapCompose(html.unescape)
    company_name_in = MapCompose(html.unescape)

    # Employment types: trim each entry, drop the "remote" pseudo-type,
    # and keep the list.
    employment_types_in = Compose(MapCompose(str.strip), drop_remote)
    employment_types_out = Identity()

    posted_at_in = MapCompose(parse_iso_date)
    remote_in = MapCompose(bool)

    # List-valued outputs.
    company_logo_urls_out = Identity()
    locations_raw_out = Identity()
class CommentLoader(ItemLoader):
    '''Item loader for CommentItem.'''

    # Default processors
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # Custom input processors
    # Basic identity
    comment_id_in = prep_comment_id
    # Metadata
    # BUG FIX: this was ``user_name`` (missing the ``_in`` suffix), so
    # the ItemLoader never applied the processor to the field.
    user_name_in = MapCompose(strip_space_characters)
    reply_to_in = MapCompose(comment_id_from_url)
    image_urls_in = MapCompose(extract_image_url)

    # Custom output processors
    # Metadata
    comment_id_out = Join('_')
    comment_url_out = Join('/')
    body_out = Compose(Join(), strip_space_characters, remove_span_img_tags)
    reply_to_out = Identity()  # To keep as a list
    is_aa_out = Compose(TakeFirst(), bool)
    image_urls_out = Identity()  # To keep as a list
class Nytimes_Dir_Item(scrapy.Item):
    """NYT article item; text fields are joined then cleaned."""

    title = Field(input_processor=Join(),
                  output_processor=MapCompose(tags_and_unicode))
    para = Field(input_processor=Join(),
                 output_processor=MapCompose(tags_and_unicode))
    # Captions are de-duplicated before the cleanup pass.
    captions = Field(input_processor=Compose(elim_dupes),
                     output_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(input_processor=Join(), output_processor=TakeFirst())
    # ISO timestamp converted by the helper.
    pubtime = Field(input_processor=MapCompose(iso_time_to_df))
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
class AuthorItem(scrapy.Item):
    """Author page item."""

    # --- Scalars ---
    url = Field()
    name = Field()
    birth_date = Field(input_processor=MapCompose(safe_parse_date))
    death_date = Field(input_processor=MapCompose(safe_parse_date))
    avg_rating = Field(serializer=float)
    num_ratings = Field(serializer=int)
    num_reviews = Field(serializer=int)

    # --- Lists (de-duplicated via set) ---
    genres = Field(output_processor=Compose(set, list))
    influences = Field(output_processor=Compose(set, list))

    # --- Blobs ---
    about = Field(
        # Take the first match, remove HTML tags, convert to list of
        # lines, remove empty lines, remove the "edit data" prefix.
        input_processor=Compose(TakeFirst(), remove_tags, split_by_newline,
                                filter_empty, lambda lines: lines[1:]),
        output_processor=Join())
class FashionItemLoader(ItemLoader):
    """Loader with de-duplicating, sorted defaults for fashion items."""

    default_input_processor = Compose(normalize)
    # Fallback output: de-duplicate, sort, join with commas.
    default_output_processor = Compose(set, list, sorted, Join(','))

    # Unique, sorted list of lengths.
    length_in = Compose(set, list, sorted)
    brand_out = Join('/')
    # Strip thousands separators from the first price value.
    price_out = Compose(lambda x: x[0].replace(',', ''))
    wish_out = Compose(TakeFirst(), int)
    category_out = Compose(Join('>'))
class RespuestasLegalAppItem(scrapy.Item):
    """Legal Q&A item; fields hold text cleaned by shared processors.

    Commented-out alternative processors were removed (dead code); the
    active processor chains are unchanged.
    """

    pregunta = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())
    descripcion = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(descripcion_clean, solo_primero))
    que_hacer = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(split_dot, remove_line,
                                 remove_costos_abogado_otros,
                                 remove_requisitos_generales))
    donde_acudir = scrapy.Field(input_processor=MapCompose(
        remove_tags, replace_escape_chars, str.strip,
        string_replace_dondeAcudir, str.title))
    tenga_encuenta = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(split_dot, remove_line, remove_vineta))
    normatividad = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(split_dot, remove_line, remove_vineta))
    fecha = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=TakeFirst())
class _Suomi24CommentResponseItem(Item):
    """
    Returned comment response fields:

    * author (str): Author of the comment response.
    * date (str): Publish time of the comment response.
    * quotes (list of str): List of quotes in the comment response.
    * content (str): Contents of the comment response.
    """

    author = Field(input_processor=strip_elements,
                   output_processor=TakeFirst())
    date = Field(input_processor=strip_join,
                 output_processor=Compose(strip_elements, TakeFirst()))
    # Quotes stay a list.
    quotes = Field(input_processor=drop_empty_elements,
                   output_processor=Identity())
    content = Field(input_processor=paragraph_join,
                    output_processor=TakeFirst())