class JobItem(scrapy.Item):
    """Fields scraped for a single job posting.

    Fields declared with ``output_processor=Identity()`` keep every collected
    value (they are lists); all other fields carry no field-level processor.
    """

    # --- General information ---
    url = scrapy.Field()
    job_id = scrapy.Field()
    title = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    sponsor = scrapy.Field()
    salary = scrapy.Field()
    paid = scrapy.Field()

    # --- About the job ---
    _type = scrapy.Field()
    industry = scrapy.Field()
    experience_level = scrapy.Field()
    role = scrapy.Field()
    company_size = scrapy.Field()
    company_type = scrapy.Field()
    description = scrapy.Field()
    link_apply = scrapy.Field()
    joel_test = scrapy.Field(output_processor=Identity())  # list

    # --- Technologies (the field name keeps its original spelling) ---
    tecnologies = scrapy.Field(output_processor=Identity())  # list

    # --- Benefits ---
    benefits = scrapy.Field(output_processor=Identity())  # list
class ConestogacCourseItem(scrapy.Item):
    """Course record whose fields carry their own input/output processors.

    The ``get_*`` and ``remove_garbage`` helpers are defined outside this
    class; their exact return shapes are not visible here.
    """

    # --- Plain fields (no field-level processors) ---
    institution_name = scrapy.Field()
    course_name = scrapy.Field()
    delivery_types = scrapy.Field()
    url = scrapy.Field()
    faculty = scrapy.Field()
    description = scrapy.Field()
    location = scrapy.Field()
    subject = scrapy.Field()
    program = scrapy.Field()

    # Each collected value is cleaned, then a leading "Code:" label is removed.
    course_code = scrapy.Field(
        input_processor=MapCompose(
            remove_garbage,
            lambda text: re.sub(r'^Code:\s*', '', text),
        ),
    )

    # --- Pricing ---
    price = scrapy.Field(
        input_processor=Compose(get_prices),
        output_processor=Compose(lambda value: str(value)),
    )

    # --- Schedule / duration ---
    # The collected values are unpacked positionally into the template, so
    # the first three are (hours per day, days per week, months).
    duration_as_string = scrapy.Field(
        output_processor=Compose(
            lambda parts: '{} hrs/day, {} days/week for {} months'.format(*parts),
        ),
    )
    days = scrapy.Field(
        input_processor=Compose(get_days_tuned),
        output_processor=Compose(lambda names: ' | '.join(names)),
    )
    duration_hours = scrapy.Field(
        input_processor=Compose(get_duration_hours_am_pm),
        output_processor=Compose(lambda value: str(value)),
    )
    duration_days_week = scrapy.Field(
        input_processor=Compose(get_duration_days_week),
        output_processor=Compose(lambda value: str(value)),
    )
    duration_months = scrapy.Field(
        input_processor=Compose(get_duration_months_tune_final),
        output_processor=Identity(),
    )
    total_hours = scrapy.Field(
        input_processor=Compose(get_total_hours),
        output_processor=Identity(),
    )

    # --- Requirements ---
    prerequisite = scrapy.Field(output_processor=Join(', '))
    corequisites = scrapy.Field(output_processor=Join(', '))
    capacity = scrapy.Field(output_processor=Identity())
class ChappellAndMatthewsPropertyLoader(ItemLoader):
    """Item loader with per-field processors for property listings.

    ``Split``, ``Get``, ``TextSearch``, ``Concatenate`` and
    ``ChappellAndMatthewsPriceProcessor`` are project helpers defined
    elsewhere; their exact semantics are not visible in this block.
    """

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # --- Address parts, all taken from the comma-separated first value ---
    area_in = Compose(Split(','), Get(0), Get(0))
    street_name_in = Compose(Split(','), Get(0), Get(1))
    postcode_in = Compose(Split(','), Get(0), Get(-1))

    # --- Price and size ---
    price_per_month_in = ChappellAndMatthewsPriceProcessor()
    number_bedrooms_in = Compose(Split(' '), Get(0), Get(0))

    # --- Free text ---
    description_out = Join()

    # --- Amenities: one label per keyword search that comes back truthy ---
    amenities_in = Concatenate(
        Compose(TextSearch('washing machine'),
                lambda found: ['Washing machine'] if found else []),
        Compose(TextSearch('parking'),
                lambda found: ['Parking'] if found else []),
        Compose(TextSearch('dishwasher'),
                lambda found: ['Dishwasher'] if found else []),
    )
    amenities_out = Identity()

    # 'gas' when the keyword search is truthy, otherwise 'unknown'.
    heating_type_in = Compose(
        TextSearch('gas'),
        lambda has_gas: 'gas' if has_gas else 'unknown',
    )
class CamosunCourseItem(scrapy.Item):
    """Course record whose fields carry their own input/output processors.

    The ``get_*`` helpers are defined outside this class; their exact return
    shapes are not visible here.
    """

    # --- Plain fields (no field-level processors) ---
    institution_name = scrapy.Field()
    course_code = scrapy.Field()
    course_name = scrapy.Field()
    delivery_types = scrapy.Field()
    url = scrapy.Field()
    faculty = scrapy.Field()
    description = scrapy.Field()
    location = scrapy.Field()
    subject = scrapy.Field()
    program = scrapy.Field()
    prerequisite = scrapy.Field()
    capacity = scrapy.Field()

    # Keeps every collected value.
    corequisites = scrapy.Field(output_processor=Identity())

    # --- Pricing ---
    price = scrapy.Field(
        input_processor=Compose(get_prices),
        output_processor=Compose(lambda value: str(value)),
    )

    # --- Schedule / duration ---
    # The collected values are unpacked positionally into the template, so
    # the first three are (hours per day, days per week, months).
    duration_as_string = scrapy.Field(
        output_processor=Compose(
            lambda parts: '{} hrs/day, {} days/week for {} months'.format(*parts),
        ),
    )
    days = scrapy.Field(
        input_processor=Compose(get_days_tuned),
        output_processor=Compose(lambda names: ' | '.join(names)),
    )
    duration_hours = scrapy.Field(
        input_processor=Compose(get_duration_hours_am_pm),
        output_processor=Compose(lambda value: str(value)),
    )
    duration_days_week = scrapy.Field(
        input_processor=Compose(get_duration_days_week),
        output_processor=Compose(lambda value: str(value)),
    )
    duration_months = scrapy.Field(
        input_processor=Compose(get_duration_months_tune),
        output_processor=Identity(),
    )
    total_hours = scrapy.Field(
        input_processor=Compose(get_just_total_hours),
        output_processor=Identity(),
    )
class CtripItem(scrapy.Item):
    """Item for one scraped trip listing.

    ``get_num``, ``convert_to_int`` and ``img_url_prase`` are helpers defined
    outside this class.
    """

    # --- Plain fields ---
    url = scrapy.Field()
    title = scrapy.Field()
    trip_type = scrapy.Field()
    destination = scrapy.Field()
    vendor = scrapy.Field()

    # --- Fields with processors ---
    num = scrapy.Field(input_processor=Compose(get_num))
    price = scrapy.Field(input_processor=MapCompose(convert_to_int))
    # The image URL list is kept whole on output.
    img_urls = scrapy.Field(
        input_processor=Compose(img_url_prase),
        output_processor=Identity(),
    )
    # Multiple values are flattened into one comma-separated string.
    image_path = scrapy.Field(output_processor=Join(','))
    guarantee = scrapy.Field(
        input_processor=Identity(),
        output_processor=Join(','),
    )
class AccommodationUnlimitedPropertyLoader(ItemLoader):
    """Item loader with per-field processors for property listings.

    ``Split``, ``Get``, ``TextSearch``, ``Concatenate``, ``strip`` and
    ``format_price`` are project helpers defined elsewhere; their exact
    semantics are not visible in this block.
    """

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # --- Address parts from the comma-separated first value ---
    area_in = Compose(Split(','), Get(0), Get(1))
    street_name_in = Compose(Split(','), Get(0), Get(0))
    # postcode has no dedicated input processor; the default Identity() applies.

    # --- Size and price ---
    number_bedrooms_in = Compose(
        MapCompose(strip),
        TakeFirst(),
        Split(' '),
        Get(0),
        MapCompose(int),
    )
    price_per_month_in = Compose(TakeFirst(), MapCompose(format_price))

    # --- Free text ---
    description_out = Join()

    # --- Amenities: one label per keyword search that comes back truthy ---
    amenities_in = Concatenate(
        Compose(TextSearch('washing machine'),
                lambda found: ['Washing machine'] if found else []),
        Compose(TextSearch('parking'),
                lambda found: ['Parking'] if found else []),
        Compose(TextSearch('dishwasher'),
                lambda found: ['Dishwasher'] if found else []),
    )
    amenities_out = Identity()

    # 'gas' when the keyword search is truthy, otherwise 'unknown'.
    heating_type_in = Compose(
        TextSearch('gas'),
        lambda has_gas: 'gas' if has_gas else 'unknown',
    )
class CourseLoader(ItemLoader):
    """Item loader that builds ``Course`` items.

    Text fields are stripped and passed through ``normalize_characters``;
    scalar fields emit their first value, list fields keep all values.
    """

    default_item_class = Course
    default_output_processor = TakeFirst()

    # --- Scalar text fields ---
    code_in = MapCompose(str.strip, normalize_characters)
    code_out = TakeFirst()

    title_in = MapCompose(str.strip, normalize_characters)
    title_out = TakeFirst()

    instructor_in = MapCompose(str.strip, normalize_characters)
    instructor_out = TakeFirst()

    term_in = MapCompose(str.strip, normalize_characters)
    term_out = TakeFirst()

    school_in = MapCompose(str.strip, normalize_characters)
    school_out = TakeFirst()

    lang_in = MapCompose(str.strip, normalize_characters)
    lang_out = TakeFirst()

    # --- Scalar fields with an extra conversion step ---
    year_in = MapCompose(str.strip, normalize_characters, string_to_int)
    year_out = TakeFirst()

    _id_in = MapCompose(str.strip, normalize_characters, onclick_url_to_id)
    _id_out = TakeFirst()

    # --- List-valued fields ---
    keywords_in = MapCompose(create_keyword_list)
    keywords_out = Identity()

    occurrences_out = Identity()
class FacebookPostItemLoader(ItemLoader):
    """Item loader that builds ``FacebookPostItem`` objects."""

    default_item_class = FacebookPostItem

    # Fields emit a single value unless overridden below.
    default_output_processor = TakeFirst()

    # Text fragments are joined into one string.
    post_text_out = Join()

    # These fields keep every collected value.
    image_urls_out = Identity()
    comments_out = Identity()
class CpcpowerLoader(ItemLoader):
    """Item loader whose fields default to pass-through processing.

    Helper callables (``strip``, ``filter_empty``, ``validate_year``,
    ``strip_copyright``, ``rchop_parentheses``, ``replace_underscore``,
    ``cpcpower_image``) are defined outside this class.
    """

    # Unless overridden, values are neither transformed nor reduced.
    default_input_processor = Identity()
    default_output_processor = Identity()

    # --- Single-valued fields ---
    title_out = TakeFirst()
    source_out = TakeFirst()
    platform_out = TakeFirst()
    region_out = TakeFirst()
    typeins_out = TakeFirst()
    genre_out = TakeFirst()

    # --- Multi-valued fields flattened to a ';'-separated string ---
    players_out = Join(';')
    group_out = Join(';')

    # --- Fields with cleaning steps ---
    criticScore_in = MapCompose(strip, filter_empty)
    year_out = MapCompose(validate_year)
    publisher_in = MapCompose(
        strip_copyright,
        rchop_parentheses,
        replace_underscore,
        strip,
    )
    publisher_out = TakeFirst()

    # --- Descriptor lists: every entry is whitespace-stripped ---
    # NOTE(review): `unicode` is a Python 2 builtin — confirm the target
    # interpreter or that the module aliases it.
    presentation_out = MapCompose(unicode.strip)
    perspective_out = MapCompose(unicode.strip)
    visual_out = MapCompose(unicode.strip)
    pacing_out = MapCompose(unicode.strip)
    gameplay_out = MapCompose(unicode.strip)

    # --- Images ---
    image_urls_in = MapCompose(cpcpower_image)
class AbsolutePropertyLoader(ItemLoader):
    """Item loader with per-field processors for property listings.

    ``Split``, ``Get``, ``TextSearch``, ``Concatenate`` and
    ``AbsolutePriceProcessor`` are project helpers defined elsewhere; their
    exact semantics are not visible in this block.
    """

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # --- Address ---
    area_in = Identity()
    street_name_in = Compose(Split(','), Get(0), Get(1))
    postcode_in = Compose(Split(','), Get(0), Get(2))

    # --- Price ---
    price_per_month_in = AbsolutePriceProcessor()

    # --- Free text ---
    description_out = Join()

    # --- Amenities: one label per keyword search that comes back truthy ---
    amenities_in = Concatenate(
        Compose(TextSearch('washing machine'),
                lambda found: ['Washing machine'] if found else []),
        Compose(TextSearch('parking'),
                lambda found: ['Parking'] if found else []),
        Compose(TextSearch('dishwasher'),
                lambda found: ['Dishwasher'] if found else []),
    )
    amenities_out = Identity()

    # 'gas' when the keyword search is truthy, otherwise 'unknown'.
    heating_type_in = Compose(
        TextSearch('gas'),
        lambda has_gas: 'gas' if has_gas else 'unknown',
    )

    # 'Yes' as soon as any collected string contains 'Let', otherwise 'No'.
    let_agreed_in = Compose(
        lambda values: 'Yes' if any('Let' in text for text in values) else 'No',
    )
class FamilyCircleLoader(SpiderLoader):
    """Loader declaring the output processors for recipe-style fields.

    ``get_direction``, ``get_servings`` and ``get_ingredients`` are defined
    outside this class and receive the full list of collected values.
    """

    # --- Fields that keep every collected value ---
    name_out = Identity()
    img_url_out = Identity()
    products_out = Identity()

    # --- Fields reduced by a helper ---
    direction_out = Compose(get_direction)
    servings_out = Compose(get_servings)
    ingredients_out = Compose(get_ingredients)
class MobyLoader(ItemLoader):
    """Item loader whose fields default to pass-through processing.

    Helper callables (``strip_dots``, ``filter_empty``, ``in_parentheses``,
    ``moby_image``) are defined outside this class.
    """

    # Unless overridden, values are neither transformed nor reduced.
    default_input_processor = Identity()
    default_output_processor = Identity()

    # --- Single-valued fields ---
    title_out = TakeFirst()
    source_out = TakeFirst()
    platform_out = TakeFirst()
    region_out = TakeFirst()
    date_out = TakeFirst()
    publisher_out = TakeFirst()
    developer_out = TakeFirst()

    # --- Text / multi-valued fields flattened to one string ---
    description_out = Join()
    genre_out = Join(';')
    group_out = Join(';')

    # --- Scores ---
    criticScore_in = MapCompose(strip_dots, filter_empty)
    userScore_in = MapCompose(strip_dots, filter_empty)

    # --- Descriptor lists: every entry is whitespace-stripped ---
    # NOTE(review): `unicode` is a Python 2 builtin — confirm the target
    # interpreter or that the module aliases it.
    presentation_out = MapCompose(unicode.strip)
    perspective_out = MapCompose(unicode.strip)
    visual_out = MapCompose(unicode.strip)
    vehicular_out = MapCompose(unicode.strip)
    # `pacing_out` used to be assigned twice with the same processor; the
    # redundant second assignment was removed.
    pacing_out = MapCompose(unicode.strip)
    setting_out = MapCompose(unicode.strip)
    gameplay_out = MapCompose(unicode.strip)

    # --- Images ---
    image_urls_in = MapCompose(in_parentheses, moby_image)
class MypicLoader(ItemLoader):
    """Item loader for the "mypic" product and price records.

    Field names listed by the original author (the grouping below is
    reconstructed from the ``mypic_prices___`` attribute prefixes — confirm):

    mypic_master
        product_name, product_category, product_format, product_pages
    mypic_prices
        price_options, extra_charges, pocket_price, full_price,
        variant_name, variant_pocket_price, variant_full_price
    """

    # Fallbacks for every field without its own processor.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # Price options are left untouched in both directions.
    mypic_prices___price_options_in = Identity()
    mypic_prices___price_options_out = Identity()

    # Prices: strip markup and whitespace, then clean and convert.
    # NOTE(review): `unicode` is a Python 2 builtin — confirm the target
    # interpreter or that the module aliases it.
    mypic_prices___pocket_price_in = MapCompose(
        remove_tags,
        unicode.strip,
        clean_uni,
        get_dec,
    )
    mypic_prices___full_price_in = MapCompose(
        remove_tags,
        unicode.strip,
        clean_uni,
        get_dec,
    )

    # Extra charges are joined into a single string.
    mypic_prices___extra_charges_out = Join()
class RlTradeLoader(ItemLoader):
    """Item loader for a RlTrade."""

    # Fields emit a single value unless overridden below.
    default_output_processor = TakeFirst()

    # First collected value, upper-cased.
    platform_out = Compose(lambda values: values[0], str.upper)

    # Both sides of the trade keep every collected value.
    have_out = Identity()
    want_out = Identity()
class TerryOlpinPropertyLoader(ItemLoader):
    """Item loader with per-field processors for property listings.

    ``TextSearch``, ``Concatenate``, ``TerryOlpinPriceProcessor``,
    ``get_bedrooms`` and ``get_epc_rating`` are project helpers defined
    elsewhere; their exact semantics are not visible in this block.
    """

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # --- Location, price and size ---
    area_in = Identity()
    price_per_month_in = TerryOlpinPriceProcessor()
    number_bedrooms_in = MapCompose(get_bedrooms)

    # --- Free text ---
    description_out = Join()

    # --- Amenities: one label per keyword search that comes back truthy ---
    amenities_in = Concatenate(
        Compose(TextSearch('washing machine'),
                lambda found: ['Washing machine'] if found else []),
        Compose(TextSearch('parking'),
                lambda found: ['Parking'] if found else []),
        Compose(TextSearch('dishwasher'),
                lambda found: ['Dishwasher'] if found else []),
    )
    amenities_out = Identity()

    # 'gas' when the keyword search is truthy, otherwise 'unknown'.
    heating_type_in = Compose(
        TextSearch('gas'),
        lambda has_gas: 'gas' if has_gas else 'unknown',
    )

    # --- Energy rating ---
    epc_rating_in = MapCompose(get_epc_rating)
class Game(scrapy.Item):
    """Item describing one game listing: basics, reception, other details."""

    # --- Basic information ---
    title = scrapy.Field()
    subtitle = scrapy.Field()
    author = scrapy.Field()
    price = scrapy.Field()
    iap = scrapy.Field()
    age = scrapy.Field()
    desc = scrapy.Field(output_processor=Join())

    # --- Popularity and reception ---
    list_rank = scrapy.Field()  # is the game ranked? if so, category and position
    score = scrapy.Field()  # rating
    nrating = scrapy.Field()  # number of ratings
    stars = scrapy.Field(output_processor=Identity())

    # --- Other details ---
    editor = scrapy.Field()  # game noted by the editor?
    seller = scrapy.Field()
    size = scrapy.Field()
    category = scrapy.Field()
    compat = scrapy.Field()  # hardware compatibility
    # The language is the second element of the list returned by the xpath.
    lang = scrapy.Field(input_processor=Compose(lambda values: values[1]))
    # Age AND copyright in an unorganized list, to be cleaned later.
    age_copy = scrapy.Field(output_processor=Identity())
    # List of supported features.
    support = scrapy.Field(output_processor=Identity())
class AnnonceItem(scrapy.Item):
    """Item for one property advert, with cleaning processors on every field.

    ``remove_tags``, ``remove_quotations`` and ``format_surface`` are defined
    outside this class.
    """

    # --- Single-valued fields ---
    plateforme = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    type_bien = scrapy.Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=TakeFirst(),
    )
    prix = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_quotations, str.strip),
        output_processor=TakeFirst(),
    )
    surface = scrapy.Field(
        input_processor=MapCompose(
            remove_tags,
            format_surface,
            remove_quotations,
            str.strip,
        ),
        output_processor=TakeFirst(),
    )
    ville = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

    # --- List-valued feature groups ---
    les_plus = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_quotations),
        output_processor=Identity(),
    )
    general = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_quotations),
        output_processor=Identity(),
    )
    interieur = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_quotations),
        output_processor=Identity(),
    )
    exterieur = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_quotations),
        output_processor=Identity(),
    )

    # --- Energy diagnostics: MapCompose() with no functions applies none ---
    diag_perf_energie = scrapy.Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
    indice_gaz = scrapy.Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
class JobItem(scrapy.Item):
    """Fields scraped for a single job posting (camelCase field names).

    Fields declared with ``output_processor=Identity()`` keep every collected
    value (they are lists); all other fields carry no field-level processor.
    """

    # --- General information ---
    url = scrapy.Field()
    jobId = scrapy.Field()
    title = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    sponsor = scrapy.Field()
    salary = scrapy.Field()
    paid = scrapy.Field()

    # --- About the job ---
    jobType = scrapy.Field()
    industry = scrapy.Field(output_processor=Identity())
    experienceLevel = scrapy.Field(output_processor=Identity())
    role = scrapy.Field()
    companySize = scrapy.Field()
    companyType = scrapy.Field()
    companyLogo = scrapy.Field()
    description = scrapy.Field()
    aboutCompany = scrapy.Field()
    jobLike = scrapy.Field()
    jobDislike = scrapy.Field()
    jobLove = scrapy.Field()
    linkApply = scrapy.Field()

    # --- List-valued fields ---
    joelTest = scrapy.Field(output_processor=Identity())  # list
    technologies = scrapy.Field(output_processor=Identity())  # list
    benefits = scrapy.Field(output_processor=Identity())  # list
class House(scrapy.Item):
    """Item for one housing listing, including scores and image fields."""

    # --- Identification ---
    # Converted to an integer on output.
    craiglist_postingid = scrapy.Field(
        input_processor=TakeFirst(),
        output_processor=Compose(lambda values: int(values[0])),
    )
    url = scrapy.Field()
    craiglist_postingdate = scrapy.Field(
        output_processor=Compose(lambda values: str(values[0])),
    )
    title = scrapy.Field()

    # --- Location ---
    neighborhood = scrapy.Field()
    address = scrapy.Field()
    latitude = scrapy.Field()
    longitude = scrapy.Field()

    # --- Property details ---
    housing_type = scrapy.Field()
    rent = scrapy.Field()
    bedrooms = scrapy.Field(
        output_processor=Compose(lambda values: str(values[0])),
    )
    bathrooms = scrapy.Field(
        output_processor=Compose(lambda values: str(values[0])),
    )
    sqfeet = scrapy.Field()
    description = scrapy.Field(output_processor=Identity())
    laundry = scrapy.Field()
    parking = scrapy.Field()

    # --- Scores ---
    walkcore = scrapy.Field()
    transitscore = scrapy.Field()
    bikescore = scrapy.Field()

    # --- Images ---
    image_urls = scrapy.Field(output_processor=Identity())
    images = scrapy.Field()
class MovieLoader(ItemLoader):
    """Item loader for movie records.

    Every input value goes through ``replace_chars`` unless a field defines
    its own input processor. ``replace_chars``, ``str_to_int`` and
    ``str_to_float`` are defined outside this class.
    """

    default_input_processor = MapCompose(replace_chars)

    @staticmethod
    def _single_or_all(values):
        """Return the lone value when exactly one was collected, else all.

        The previous ``TakeFirst() if MapCompose(...) else Identity()``
        tested the truthiness of a ``MapCompose`` *object* at class-definition
        time. That is always true, so it always picked ``TakeFirst()`` and
        silently dropped every value after the first. The length check now
        runs on the collected values each time the field is output.
        """
        if len(values) == 1:
            # Same reduction the old code applied to a single value.
            return TakeFirst()(values)
        return values

    # --- Single-valued text fields ---
    description_out = TakeFirst()
    title_out = TakeFirst()
    world_premier_out = TakeFirst()
    rf_premiere_out = TakeFirst()
    trailer_out = TakeFirst()

    # --- Numeric fields ---
    time_in = MapCompose(str_to_int)
    time_out = TakeFirst()
    rating_kp_in = MapCompose(str_to_float)
    rating_kp_out = TakeFirst()
    rating_imdb_in = MapCompose(str_to_float)
    rating_imdb_out = TakeFirst()
    budget_in = MapCompose(str_to_int)
    budget_out = TakeFirst()
    fees_in_usa_in = MapCompose(str_to_int)
    fees_in_usa_out = TakeFirst()
    fees_in_world_in = MapCompose(str_to_int)
    fees_in_world_out = TakeFirst()

    # --- Fields that may hold one or several values ---
    # Looked up through the instance, the staticmethod yields a plain
    # function, which the loader calls with the list of collected values.
    country_out = _single_or_all
    directors_out = _single_or_all
class BookItem(scrapy.Item):
    """Item for one book: scalar metadata, list fields and a rating histogram.

    ``num_page_extractor``, ``extract_publish_dates``, ``extract_year`` and
    ``extract_ratings`` are defined outside this class.
    """

    # --- Scalars: text ---
    url = Field()
    title = Field(input_processor=MapCompose(str.strip))
    author = Field(input_processor=MapCompose(str.strip))
    language = Field(input_processor=MapCompose(str.strip))
    isbn = Field()
    asin = Field()
    series = Field()

    # --- Scalars: numbers (stripped, then converted) ---
    num_ratings = Field(input_processor=MapCompose(str.strip, int))
    num_reviews = Field(input_processor=MapCompose(str.strip, int))
    avg_rating = Field(input_processor=MapCompose(str.strip, float))
    num_pages = Field(
        input_processor=MapCompose(str.strip, num_page_extractor, int),
    )

    # --- Scalars: dates ---
    publish_date = Field(input_processor=extract_publish_dates)
    original_publish_year = Field(
        input_processor=MapCompose(extract_year, int),
    )

    # --- Lists ---
    awards = Field(output_processor=Identity())
    places = Field(output_processor=Identity())
    characters = Field(output_processor=Identity())
    # De-duplicated through a set, so the resulting order is not guaranteed.
    genres = Field(output_processor=Compose(set, list))

    # --- Dicts ---
    rating_histogram = Field(input_processor=MapCompose(extract_ratings))
class BristolPropertyCentrePropertyLoader(ItemLoader):
    """Item loader with per-field processors for property listings.

    ``Split``, ``Get``, ``TextSearch``, ``Concatenate`` and ``format_price``
    are project helpers defined elsewhere; their exact semantics are not
    visible in this block.
    """

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # --- Address ---
    area_in = Compose(Split(','), Get(0), Get(1))
    street_name_in = Compose(Split(','), Get(0), Get(0))
    postcode_in = Identity()

    # --- Size and price ---
    number_bedrooms_in = Compose(
        TakeFirst(),
        Split(' '),
        Get(0),
        MapCompose(int),
    )
    number_bathrooms_in = Compose(
        TakeFirst(),
        Split(' '),
        Get(0),
        MapCompose(int),
    )
    price_per_month_in = Compose(TakeFirst(), MapCompose(format_price))

    # --- Free text ---
    description_out = Join()

    # --- Amenities: one label per keyword search that comes back truthy ---
    amenities_in = Concatenate(
        Compose(TextSearch('washing machine'),
                lambda found: ['Washing machine'] if found else []),
        Compose(TextSearch('parking'),
                lambda found: ['Parking'] if found else []),
        Compose(TextSearch('dishwasher'),
                lambda found: ['Dishwasher'] if found else []),
    )
    amenities_out = Identity()

    # 'gas' when the keyword search is truthy, otherwise 'unknown'.
    heating_type_in = Compose(
        TextSearch('gas'),
        lambda has_gas: 'gas' if has_gas else 'unknown',
    )

    # 'Let' as soon as any collected string contains 'Let Agreed', else 'No'.
    let_agreed_in = Compose(
        lambda values: 'Let' if any('Let Agreed' in text for text in values) else 'No',
    )
class FeedEntryItemLoader(BaseItemLoader):
    """Item loader that builds ``FeedEntryItem`` objects.

    The HTML content runs through a fixed sequence of helper steps, all
    defined outside this class; the order below is the processing order.
    """

    default_item_class = FeedEntryItem

    # --- Plain-text content ---
    content_text_in = MapCompose(skip_false, str.strip, remove_tags)
    content_text_out = Join("\n")

    # --- HTML content ---
    content_html_in = MapCompose(
        skip_false,
        replace_regex,
        build_tree,
        convert_footnotes,
        pullup_elems,
        replace_elems,
        remove_elems,
        change_attribs,
        change_tags,
        cleanup_html,
        convert_iframes,
        lxml_cleaner,
        flatten_tree,
        skip_empty_tree,
        make_links_absolute,
        serialize_tree,
    )
    # Fragments are joined first; the joined text is then truncated.
    content_html_out = Compose(Join(), truncate_text)

    # --- Categories ---
    # De-duplicated, then sorted so the output stays stable between runs.
    category_out = Compose(set, sorted)

    # --- Enclosures pass through unchanged in both directions ---
    enclosure_in = Identity()
    enclosure_out = Identity()
class Loader(ItemLoader):
    """Item loader with list-valued type/level fields and a posting time.

    ``split``, ``first`` and ``parse_relative_time`` are defined outside
    this class.
    """

    # Fields emit a single value unless overridden below.
    default_output_processor = TakeFirst()

    # --- List-valued fields: lower-cased, then split; all values kept ---
    employment_types_in = MapCompose(str.lower, split)
    employment_types_out = Identity()

    experience_levels_in = MapCompose(str.lower, split)
    experience_levels_out = Identity()

    # --- Posting time: ``first`` runs on the values, then the parser ---
    posted_at_in = Compose(first, parse_relative_time)
class RestItemLoader(ItemLoader):
    """Item loader for restaurant records.

    ``unicode_convert``, ``int_convert`` and ``float_convert`` are defined
    outside this class.
    """

    # Every value is converted on input and a single value emitted on output
    # unless a field overrides this below.
    default_input_processor = MapCompose(unicode_convert)
    default_output_processor = TakeFirst()

    # --- Identification ---
    r_id_in = MapCompose(int_convert)
    link_in = Identity()
    city_in = MapCompose(str.capitalize)

    # --- Cost: every non-digit character is dropped before conversion ---
    cost_in = MapCompose(
        lambda text: re.sub('[^0-9]+', '', text),
        int_convert,
    )

    # --- Ratings and counters ---
    # NOTE(review): `unicode` is a Python 2 builtin — confirm the target
    # interpreter or that the module aliases it.
    rating_in = MapCompose(unicode.strip, float_convert)
    rating_votes_in = MapCompose(int_convert)
    reviews_in = MapCompose(int_convert)
    photos_in = MapCompose(int_convert)
    bookmarks_in = MapCompose(int_convert)
    checkins_in = MapCompose(int_convert)

    # --- List-valued fields ---
    cuisines_out = Identity()
    collections_out = Identity()

    # --- Address and coordinates ---
    r_address_in = MapCompose(unicode.strip, unicode_convert)
    r_address_out = Join()
    r_latitude_in = MapCompose(float_convert)
    r_longitude_in = MapCompose(float_convert)
class CinemaLoader(ItemLoader):
    """Item loader that builds ``Cinema`` items."""

    default_item_class = Cinema

    # Values pass through untouched on input; one value is emitted on output
    # unless a field overrides this below.
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # All collected names are kept.
    names_out = Identity()

    # The right-hand `site_in` is looked up before this class attribute
    # exists, so it resolves to the `site_in` callable from the enclosing
    # scope, which the attribute then shadows inside the class.
    site_in = MapCompose(site_in)
class TeamStatItem(scrapy.Item):
    """Item holding per-team statistics; every field emits its first value."""

    # `id` is the scraped field name; it only shadows the builtin inside this
    # class body. The regex captures digit runs between slashes, so `int` is
    # a drop-in replacement for the former `eval`: it executes nothing and
    # also accepts leading zeros, which `eval` rejects on Python 3.
    id = scrapy.Field(
        input_processor=MapCompose(
            lambda x: re.findall(r'/([0-9]+)/', x),
            int,
        ),
        output_processor=TakeFirst(),
    )
    nationality = scrapy.Field(
        input_processor=Identity(),
        output_processor=TakeFirst(),
    )
    region = scrapy.Field(
        input_processor=Identity(),
        output_processor=TakeFirst(),
    )
    # SECURITY(review): `eval` runs on raw scraped text here. The expected
    # input format is not visible in this block, so the call is kept as-is;
    # replace it with `int` (or `ast.literal_eval`) once the format is
    # confirmed.
    num_players = scrapy.Field(
        input_processor=MapCompose(eval),
        output_processor=TakeFirst(),
    )
    # `hits` is the first and `comments` the second number-like token in the
    # text (digits, '.', 'K', optionally prefixed by '<').
    hits = scrapy.Field(
        input_processor=MapCompose(
            lambda x: re.findall(r'<?[0-9.K]+', x)[0],
        ),
        output_processor=TakeFirst(),
    )
    comments = scrapy.Field(
        input_processor=MapCompose(
            lambda x: re.findall(r'<?[0-9.K]+', x)[1],
        ),
        output_processor=TakeFirst(),
    )
    # Resolved against the site root; `urljoin` already returns a string, so
    # the former f-string wrapper was redundant.
    club_page = scrapy.Field(
        input_processor=MapCompose(
            lambda x: urljoin("https://sofifa.com", x),
        ),
        output_processor=TakeFirst(),
    )
class CompanyItem(scrapy.Item):
    """Fields scraped for a single company.

    Fields declared with ``output_processor=Identity()`` keep every collected
    value (they are lists).
    """

    # --- General information ---
    name = scrapy.Field()
    companyType = scrapy.Field()
    companyHomePage = scrapy.Field()

    # --- List-valued fields ---
    openJobs = scrapy.Field(output_processor=Identity())  # list
    benefits = scrapy.Field(output_processor=Identity())  # list
    technologyStack = scrapy.Field(output_processor=Identity())  # list
class ContactLoader(ItemLoader):
    """Item loader that builds ``ContactItem`` objects.

    ``ShowMe`` is a processor defined outside this class.
    """

    # Must be the item *class*: the loader instantiates it when no item is
    # passed in. The previous value was an instance (``ContactItem()``),
    # which raises ``TypeError`` when the loader tries to call it.
    default_item_class = ContactItem

    # Unless overridden, values are neither transformed nor reduced.
    default_input_processor = Identity()
    default_output_processor = Identity()

    # The first non-empty extracted value becomes the e-mail.
    email_in = TakeFirst()
    email_out = ShowMe()
class ReviewerItemLoader(ItemLoader):
    """Item loader for reviewer records.

    Values are passed through ``clean_text`` (defined outside this class) on
    input and a single value is emitted on output, except for the flag
    fields below.
    """

    default_input_processor = MapCompose(clean_text)
    default_output_processor = TakeFirst()

    # --- Flag fields skip `clean_text` and are taken as-is ---
    staff_reviewer_in = Identity()
    verified_buyer_in = Identity()
    verified_reviewer_in = Identity()