class MyspiderItem(scrapy.Item):
    """Douban movie item.

    Each field first runs scrapy's built-in processors (wrapped in
    Compose) and can then apply a custom callable, which yields cleaner
    values.  (Original note recommended pairing these with MapCompose.)
    """
    movie_name = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )
    movie_type = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )
    movie_rate = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )
    movie_year = scrapy.Field(
        # After keeping the first value, RegExp() pulls the year out of it.
        input_processor=Compose(TakeFirst(), RegExp()),
        output_processor=Compose(Join()),
    )
    url = scrapy.Field(output_processor=Compose(Join()))
class ZhihuQuestionItem(scrapy.Item):
    # Zhihu question item: identifiers, counters and comma-joined lists
    # of the answers that belong to this question.
    question_id = scrapy.Field()
    question_url = scrapy.Field()
    question_title = scrapy.Field()
    question_descr = scrapy.Field()
    question_object_id = scrapy.Field()
    # extract_num pulls the integer out of the scraped text.
    answer_num = scrapy.Field(input_processor=MapCompose(extract_num), )
    followers = scrapy.Field(
        input_processor=Compose(return_followers_num, extract_num))
    visitors = scrapy.Field(
        input_processor=Compose(return_visitors_num, extract_num))
    # Multi-valued fields are flattened into comma-separated strings.
    topics = scrapy.Field(input_processor=Join(","))
    answer_id_list = scrapy.Field(input_processor=Join(","))
    answer_url_list = scrapy.Field(input_processor=Join(","))

    def save_to_es(self):
        """Copy this item into a ZhihuQuestionType document and save it
        to Elasticsearch, including a completion-suggest field built
        from the title (weight 10) and topics (weight 7)."""
        question = ZhihuQuestionType()
        question.question_id = self['question_id']
        question.question_url = self['question_url']
        question.question_title = self['question_title']
        question.question_descr = self['question_descr']
        question.question_object_id = self['question_object_id']
        question.answer_num = self['answer_num']
        question.followers = self['followers']
        question.visitors = self['visitors']
        question.topics = self['topics']
        question.answer_id_list = self['answer_id_list']
        question.answer_url_list = self['answer_url_list']
        question.suggest = get_suggest(
            question_es, ZhihuQuestionType._doc_type.index,
            ((question.question_title, 10), (question.topics, 7)))
        question.save()
        return

    def get_insert_sql(self):
        # INSERT statement for the zhihu_question table, returned with
        # its parameter tuple (executed elsewhere, e.g. in a pipeline).
        insert_sql = ''' insert into zhihu_question(question_id, question_url, question_title, question_descr, answer_num, followers, visitors, topics, answer_id_list, answer_url_list, question_object_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) '''
        params = (self['question_id'], self['question_url'],
                  self['question_title'], self['question_descr'],
                  self['answer_num'], self['followers'], self['visitors'],
                  self['topics'], self['answer_id_list'],
                  self['answer_url_list'], self['question_object_id'])
        return insert_sql, params
class HouseRentingLianjiaItem(HouseRentingBaseItem):
    """Lianjia rental listing: every extracted fragment is stripped on
    the way in; on output the fragments are joined and trimmed, and the
    publish time is additionally parsed by publish_time_serializer."""
    publish_time = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Compose(Join(), str.strip, publish_time_serializer))
    price = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Compose(Join(), str.strip))
    detail = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Compose(Join(), str.strip))
class ZcoolInfoLoader(ExtractLoader):
    """Zcool designer-info loader: free-text fields are joined and
    scrubbed of CR/LF, list-like fields pass through unchanged."""

    # Shared scrubber: join fragments, trim, then drop embedded newlines
    # and carriage returns.  One stateless Compose instance is safely
    # reused by several fields.
    _joined_clean = Compose(
        Join(),
        lambda text: text.strip().replace('\n', '').replace('\r', ''))

    designer_out = _joined_clean
    hometown_out = _joined_clean
    introduce_out = _joined_clean
    brief_out = Compose(Join("\n"))
    equipment_out = Identity()
    label_out = Identity()
    personal_link_out = Identity()
class ScraperContentLoader(ItemLoader):
    """Content loader; any field without an explicit processor keeps
    only the first extracted value."""
    default_output_processor = TakeFirst()

    # NOTE(review): `unicode` implies Python 2 — confirm before porting.
    name_in = Compose(TakeFirst(), unicode.strip)
    description_in = Compose(Join(), sanitize_html)
    details_in = Compose(Join(), sanitize_html)
    # Start from an empty dict, then merge all collected dicts into it.
    attributes_out = Compose(DefaultValue(lambda: {}), MergeDicts())
class PerekrestokAllItem(scrapy.Item):
    """One Perekrestok product row (price-monitoring crawl)."""
    date_time = scrapy.Field(output_processor=TakeFirst())
    product_id = scrapy.Field(input_processor=MapCompose(to_int),
                              output_processor=TakeFirst())
    category_id = scrapy.Field(input_processor=MapCompose(to_int),
                               output_processor=TakeFirst())
    category_name = scrapy.Field(output_processor=TakeFirst())
    product_name = scrapy.Field(output_processor=TakeFirst())
    vendor = scrapy.Field(output_processor=TakeFirst())
    vendor_id = scrapy.Field(input_processor=MapCompose(to_int),
                             output_processor=TakeFirst())
    country = scrapy.Field(input_processor=MapCompose(str.strip),
                           output_processor=Join())
    # Compose feeds the WHOLE collected list to max(), while MapCompose
    # converts each element individually — so this yields the largest of
    # the parsed prices.
    regular_price = scrapy.Field(input_processor=MapCompose(to_float),
                                 output_processor=Compose(max))
    sale_price = scrapy.Field(input_processor=MapCompose(to_float),
                              output_processor=Compose(max))
    unit = scrapy.Field(output_processor=TakeFirst())
    availability = scrapy.Field(input_processor=MapCompose(to_bool),
                                output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
    second_level_cat = scrapy.Field(output_processor=TakeFirst())
    first_level_cat = scrapy.Field(output_processor=TakeFirst())
class MoscowmapItem(scrapy.Item):
    """Address record scraped from the Moscow map."""
    _id = scrapy.Field()
    # Trim whitespace, then any leading/trailing commas.
    street = scrapy.Field(
        input_processor=MapCompose(lambda value: value.strip().strip(',')),
        output_processor=TakeFirst())
    house = scrapy.Field(
        input_processor=MapCompose(lambda value: value.strip()),
        output_processor=Compose(parse_house))
    params_house = scrapy.Field(output_processor=Compose(parse_params))
    query = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
class ProxyItemLoader(ItemLoader):
    """Loader for ProxyItem: every raw value is stripped, and only the
    first value survives unless a field overrides its output processor."""
    default_item_class = ProxyItem
    default_input_processor = MapCompose(lambda raw: raw.strip())
    default_output_processor = TakeFirst()

    port_number_out = Compose(TakeFirst(), int)
    country_code_out = Compose(TakeFirst(), country_code_processor)
    # Drop surrounding quotes and whitespace from the city name.
    city_out = Compose(TakeFirst(),
                       lambda city: city.replace('"', '').strip())
    # "123 ms" style values become plain integers.
    response_time_out = Compose(TakeFirst(),
                                lambda raw: raw.replace('ms', '').strip(),
                                int)
    last_check_out = Compose(TakeFirst(), TimeDeltaProcessor())
class taxes_item(scrapy.Item):
    """Property-tax figures; each field keeps the first extracted value
    and coerces it to a number.  (Class name kept lowercase for
    backward compatibility with existing callers.)"""
    property_tax_assessment_year = scrapy.Field(
        output_processor=Compose(TakeFirst(), int))
    property_tax = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_land = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_improvements = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_total = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
class AuthorItemLoader(ItemLoader):
    """Author loader: the bio is cleaned and joined, IDs are parsed,
    and every field keeps a single value on output."""
    author_bio_in = Compose(description_clean, description_join)
    author_bio_out = TakeFirst()
    name_out = TakeFirst()
    gender_out = TakeFirst()
    birth_date_out = TakeFirst()
    birth_place_out = TakeFirst()
    death_date_out = TakeFirst()
    death_place_out = TakeFirst()
    author_id_in = Compose(id_parser)
    author_id_out = TakeFirst()
class BookItemLoader(ItemLoader):
    """Book loader: description is cleaned and joined, IDs are parsed,
    page-count fragments are concatenated; output is single-valued."""
    title_out = TakeFirst()
    number_of_pages_in = Join()
    number_of_pages_out = TakeFirst()
    in_language_out = TakeFirst()
    description_in = Compose(description_clean, description_join)
    description_out = TakeFirst()
    author_book_id_in = Compose(id_parser)
    author_book_id_out = TakeFirst()
    book_id_in = Compose(id_parser)
    book_id_out = TakeFirst()
class AuthorItem(scrapy.Item):
    """Scraped author record with ratings and related-author links."""
    name = Field()
    author_url = Field()
    rating = Field()
    rating_count = Field()
    review_count = Field()
    image_url = Field()
    # BUG FIX: the keyword was misspelled "output_processsor" (triple
    # "s"), so the metadata was stored under a dead key and the
    # set->list dedupe was never applied by the item loader.
    books = Field(output_processor=Compose(set, list))
    related_authors = Field(output_processor=Compose(set, list))
class XueQiuLoader(NewsLoader):
    """Xueqiu article loader: fragments are joined, trimmed, and purged
    of ideographic spaces, newlines and NBSPs (content also drops CRs)."""

    content_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', '')
                         .replace('\r', ''))
    title_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', ''))
class ProductItem(scrapy.Item):
    """Scraped product with price/feature post-processing."""
    _id = scrapy.Field()
    url = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=Compose(process_price))
    description = scrapy.Field(output_processor=TakeFirst())
    features = scrapy.Field(output_processor=Compose(process_features))
    # NOTE(review): Compose() with no callables passes values through
    # unchanged — presumably intentional (Identity() would be clearer).
    images = scrapy.Field(input_processor=Compose())
class ScienceDailyArticleLoader(ArticleLoader):
    """ScienceDaily article loader."""
    content_in = Compose(
        Join('\n\n'),
        # The body copy embeds advertisement <div>s — drop them whole.
        lambda markup: remove_tags_with_content(markup, ('div',)),
        ArticleLoader.default_input_processor,
    )
    date_out = Compose(
        TakeFirst(),
        # Dates arrive like "January 2, 2020".
        lambda raw: datetime.strptime(raw, "%B %d, %Y"),
    )
class LinkLoader(scrapy.loader.ItemLoader):
    """Link loader; every field resolves to its first value."""
    product_in = MapCompose(remove_tags)
    product_out = TakeFirst()
    # NOTE(review): empty Compose() is a plain pass-through — presumably
    # a placeholder; Identity() would state the intent.
    retailer_in = Compose()
    retailer_out = TakeFirst()
    date_in = Compose()
    date_out = TakeFirst()
    link_in = Compose()
    link_out = TakeFirst()
class WebsiteLoader(scrapy.loader.ItemLoader):
    """Website snapshot loader; every field resolves to its first value."""
    # NOTE(review): empty Compose() is a plain pass-through — presumably
    # a placeholder; Identity() would state the intent.
    retailer_in = Compose()
    retailer_out = TakeFirst()
    date_in = Compose()
    date_out = TakeFirst()
    product_in = TakeFirst()
    product_out = TakeFirst()
    html_in = Compose()
    html_out = TakeFirst()
class Committee(scrapy.Item):
    """Campaign-finance committee summary.

    NOTE(review): committee_id/status run Compose(...) over the whole
    collected list, while election_cycle/historical_names use MapCompose
    per element — confirm to_int/clean accept a list where Compose is
    used, or this asymmetry may be unintentional.
    """
    committee_id = scrapy.Field(input_processor=Compose(to_int),
                                output_processor=TakeFirst())
    committee_name = scrapy.Field(input_processor=Compose(clean),
                                  output_processor=TakeFirst())
    election_cycle = scrapy.Field(input_processor=MapCompose(to_int),
                                  output_processor=TakeFirst())
    historical_names = scrapy.Field(input_processor=MapCompose(clean),
                                    serializer=str)
    status = scrapy.Field(input_processor=Compose(clean),
                          output_processor=TakeFirst())
    reporting_period = scrapy.Field(output_processor=TakeFirst())
    current_contributions = scrapy.Field(output_processor=TakeFirst())
    year_contributions = scrapy.Field(output_processor=TakeFirst())
    current_expenditures = scrapy.Field(output_processor=TakeFirst())
    year_expenditures = scrapy.Field(output_processor=TakeFirst())
    ending_cash = scrapy.Field(output_processor=TakeFirst())
class LmItem(scrapy.Item):
    """Leroy-Merlin product item."""
    _id = scrapy.Field()
    name = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=Compose(to_int),
                         output_processor=TakeFirst())
    cur = scrapy.Field(output_processor=TakeFirst())
    # Identity keeps the full list of values for multi-valued fields.
    unic_photo = scrapy.Field(output_processor=Identity())
    pict = scrapy.Field(output_processor=Identity())
    art = scrapy.Field(output_processor=Identity())
    unic_pict = scrapy.Field()
    params_dict = scrapy.Field(input_processor=Identity())
    params = scrapy.Field(input_processor=Compose(clear_desc),
                          output_processor=TakeFirst())
class ReviewerItemLoader(ItemLoader):
    """Reviewer item loader: raw values are cleaned; multi-valued
    attributes are deduplicated into sets on output."""
    default_input_processor = MapCompose(clean_text)
    default_output_processor = TakeFirst()

    reviews_count_in = MapCompose(clean_text, parse_int)
    skin_tone_out = Compose(set)
    skin_type_out = Compose(set)
    skin_concerns_out = Compose(set)
    # BUG FIX: was `eyes = Compose(set)` — without the `_out` suffix the
    # ItemLoader naming convention never picked the processor up, so the
    # `eyes` field silently fell back to TakeFirst().
    eyes_out = Compose(set)
class NewsLoaderV3(BasicLoader):
    """V3 news loader.  title / dateissued / description keep
    BasicLoader's defaults; list fields pass through, text fields are
    joined and cleaned."""

    subject_out = Identity()
    # Join paragraphs, trim, drop double ideographic-space indents and
    # NBSPs, and turn literal "\r\n" sequences into <br> tags.
    text_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace("\u3000\u3000", "")
                         .replace("\xa0", "")
                         .replace(r"\r\n", "<br>"))
    source_out = Compose(Join(), lambda text: text.strip())
    author_out = Identity()
class NewsLoader(BasicLoader):
    """News loader: body text is joined, trimmed, purged of ideographic
    indents and NBSPs, with literal "\\r\\n" turned into <br> tags."""
    text_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace("\u3000\u3000", "")
                         .replace("\xa0", "")
                         .replace(r"\r\n", "<br>"))
    source_out = Compose(Join(), lambda text: text.strip())
class BaseAdLoader(ItemLoader):
    """Shared loader for classified-ad items (AdItem)."""
    default_item_class = AdItem
    default_output_processor = TakeFirst()

    title_out = Compose(TakeFirst(), filters.clean_whitespace,
                        filters.lower_string)
    description_out = Compose(filters.multiline_joiner,
                              filters.lower_string)
    posted_date_out = Compose(TakeFirst(), filters.parse_date)
    # Missing prices are normalized by the filter before int conversion.
    price_out = Compose(TakeFirst(), filters.clean_whitespace,
                        filters.missing_price_filter, int)
    currency_out = Compose(TakeFirst(), filters.clean_whitespace,
                           filters.currency_mapper)
class MemberLoader(ItemLoader):
    """Member profile loader."""
    default_item_class = MemberItem
    default_output_processor = TakeFirst()

    # First value, stripped; six.text_type keeps py2/py3 compatibility.
    # One stateless Compose instance is safely shared by three fields.
    _first_stripped = Compose(TakeFirst(), six.text_type.strip)

    website_out = Identity()
    pgp_fingerprint_out = Join(' ')
    full_name_out = _first_stripped
    bio_out = _first_stripped
    location_out = _first_stripped
    image_urls_in = MapCompose(absolute_url)
    image_urls_out = Identity()
class QiDianLoader(NewsLoader):
    """Qidian loader: joined text with ideographic spaces, newlines and
    NBSPs removed (content also drops CRs and tabs)."""

    content_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', '')
                         .replace('\r', '')
                         .replace('\t', ''))
    chapter_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', ''))
class ReviewItem(scrapy.Item):
    """A single game review with normalized scalar fields."""
    product_id = scrapy.Field()
    recommended = scrapy.Field(
        output_processor=Compose(TakeFirst(), simplify_recommended))
    date = scrapy.Field(
        output_processor=Compose(TakeFirst(), standardize_date))
    hours = scrapy.Field(
        output_processor=Compose(TakeFirst(), str_to_float))
    found_helpful = scrapy.Field(
        output_processor=Compose(TakeFirst(), str_to_int))
    found_funny = scrapy.Field(
        output_processor=Compose(TakeFirst(), str_to_int))
    user_id = scrapy.Field()
    early_access = scrapy.Field()
class ZhihuAnswerItem(Item):
    """Zhihu answer item: scalar fields concatenate their fragments
    without a separator; summary/content additionally strip ALL
    whitespace (join -> split on whitespace -> re-join with nothing)."""
    url = Field(output_processor=Join(separator=''))
    answer_id = Field(output_processor=Join(separator=''))
    user_url = Field(output_processor=Join(separator=''))
    question_id = Field(output_processor=Join(separator=''))
    question_url = Field(output_processor=Join(separator=''))
    agree_num = Field(output_processor=Join(separator=''))
    summary = Field(output_processor=Compose(''.join, str.split, ''.join))
    content = Field(output_processor=Compose(''.join, str.split, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    comment_num = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
class MujiLoader(ItemLoader):
    """Muji product loader."""
    default_output_processor = TakeFirst()

    price_in = Compose(Join(''))
    salePrice_in = Compose(Join(''))
    title_out = Compose(Join(''))
    # NOTE(review): using bare `str` as an output processor stringifies
    # the whole collected LIST (its repr), not each element — confirm
    # this is the intended serialization.
    originalSizeLabel_out = str
    originalCategory_out = str
    detailImages_out = str
    description_out = str
    detailHtml_out = str
class DataLoader(ItemLoader):
    """Data-cleaning loader: strips every raw value and keeps only the
    first one unless a field overrides its output processor."""
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    # Keep the first value, cut off a "label: value" prefix when one is
    # present, then normalize via RegistrationDateProcessor.
    registration_date_out = Compose(
        TakeFirst(),
        lambda raw: raw.split(": ")[1] if len(raw.split(": ")) > 1 else raw,
        RegistrationDateProcessor())
    main_body_out = Join()
    # The timestamp is the SECOND extracted value.
    time_out = Compose(lambda values: values[1], DatetimeProcessor())
class ChinaLawCourtLoader(NewsLoader):
    """China Law Court loader: joined text with ideographic spaces,
    newlines and NBSPs removed (content also drops CRs)."""

    content_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', '')
                         .replace('\r', ''))
    message_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', ''))
    title_out = Compose(
        Join(),
        lambda text: text.strip()
                         .replace('\u3000', '')
                         .replace('\n', '')
                         .replace('\xa0', ''))