Example #1
class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # The input/output processors used here work best together with MapCompose:
    # first call Scrapy's built-in processor functions, then the custom ones,
    # which yields more precise data.
    movie_name = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )

    movie_type = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )

    movie_rate = scrapy.Field(
        input_processor=Compose(TakeFirst()),
        output_processor=Compose(Join()),
    )

    movie_year = scrapy.Field(
        input_processor=Compose(TakeFirst(), RegExp()),
        output_processor=Compose(Join()),
    )

    url = scrapy.Field(output_processor=Compose(Join()))
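A minimal usage sketch for the item above, assuming a spider callback; the XPath selectors and URL handling below are illustrative assumptions, not part of the original example:

from scrapy.loader import ItemLoader

def parse_movie(self, response):
    # Hypothetical callback: only the field names come from MyspiderItem;
    # the selectors are placeholders for whatever page is being scraped.
    loader = ItemLoader(item=MyspiderItem(), response=response)
    loader.add_xpath('movie_name', '//h1/span/text()')
    loader.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
    loader.add_xpath('movie_rate', '//strong[@class="rating_num"]/text()')
    loader.add_xpath('movie_year', '//span[@class="year"]/text()')
    loader.add_value('url', response.url)
    yield loader.load_item()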
Example #2
class ZhihuQuestionItem(scrapy.Item):
    # Zhihu question item
    question_id = scrapy.Field()
    question_url = scrapy.Field()
    question_title = scrapy.Field()
    question_descr = scrapy.Field()
    question_object_id = scrapy.Field()
    answer_num = scrapy.Field(input_processor=MapCompose(extract_num), )
    followers = scrapy.Field(
        input_processor=Compose(return_followers_num, extract_num))
    visitors = scrapy.Field(
        input_processor=Compose(return_visitors_num, extract_num))
    topics = scrapy.Field(input_processor=Join(","))
    answer_id_list = scrapy.Field(input_processor=Join(","))
    answer_url_list = scrapy.Field(input_processor=Join(","))

    def save_to_es(self):
        question = ZhihuQuestionType()
        question.question_id = self['question_id']
        question.question_url = self['question_url']
        question.question_title = self['question_title']
        question.question_descr = self['question_descr']
        question.question_object_id = self['question_object_id']
        question.answer_num = self['answer_num']
        question.followers = self['followers']
        question.visitors = self['visitors']
        question.topics = self['topics']
        question.answer_id_list = self['answer_id_list']
        question.answer_url_list = self['answer_url_list']

        question.suggest = get_suggest(question_es,
                                       ZhihuQuestionType._doc_type.index,
                                       ((question.question_title, 10),
                                        (question.topics, 7)))

        question.save()

        return

    def get_insert_sql(self):
        # SQL statement for inserting a row into the zhihu_question table
        insert_sql = '''
            insert into zhihu_question(question_id, question_url, question_title, question_descr,
            answer_num, followers, visitors, topics, answer_id_list, answer_url_list, question_object_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        '''

        params = (self['question_id'], self['question_url'],
                  self['question_title'], self['question_descr'],
                  self['answer_num'], self['followers'], self['visitors'],
                  self['topics'], self['answer_id_list'],
                  self['answer_url_list'], self['question_object_id'])

        return insert_sql, params
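A rough sketch of how a pipeline might consume the statement returned by get_insert_sql above; the MysqlPipeline class and the self.cursor / self.conn attributes are assumptions (a DB-API compatible connection set up elsewhere), not part of the original example:

class MysqlPipeline:
    # Hypothetical pipeline: cursor/connection are assumed to be created in
    # open_spider() with a DB-API driver such as pymysql.
    def process_item(self, item, spider):
        insert_sql, params = item.get_insert_sql()
        self.cursor.execute(insert_sql, params)
        self.conn.commit()
        return item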
Example #3
class HouseRentingLianjiaItem(HouseRentingBaseItem):
    publish_time = scrapy.Field(input_processor=MapCompose(str.strip),
                                output_processor=Compose(
                                    Join(), str.strip,
                                    publish_time_serializer))
    # publish_time = scrapy.Field()
    price = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=Compose(Join(), str.strip))
    # price = scrapy.Field()
    detail = scrapy.Field(input_processor=MapCompose(str.strip),
                          output_processor=Compose(Join(), str.strip))
Example #4
class ZcoolInfoLoader(ExtractLoader):
    designer_out = Compose(
        Join(), lambda s: s.strip().replace('\n', '').replace('\r', ''))
    hometown_out = Compose(
        Join(), lambda s: s.strip().replace('\n', '').replace('\r', ''))
    introduce_out = Compose(
        Join(), lambda s: s.strip().replace('\n', '').replace('\r', ''))
    brief_out = Compose(Join("\n"))
    equipment_out = Identity()
    label_out = Identity()
    personal_link_out = Identity()
Example #5
class ScraperContentLoader(ItemLoader):

    default_output_processor = TakeFirst()

    name_in = Compose(TakeFirst(), unicode.strip)

    description_in = Compose(Join(), sanitize_html)

    details_in = Compose(Join(), sanitize_html)

    attributes_out = Compose(DefaultValue(lambda: {}), MergeDicts())
Example #6
class PerekrestokAllItem(scrapy.Item):
    date_time = scrapy.Field(
        output_processor=TakeFirst()
    )
    product_id = scrapy.Field(
        input_processor=MapCompose(to_int),
        output_processor=TakeFirst()
    )
    category_id = scrapy.Field(
        input_processor=MapCompose(to_int),
        output_processor=TakeFirst()
    )
    category_name = scrapy.Field(
        output_processor=TakeFirst()
    )
    product_name = scrapy.Field(
        output_processor=TakeFirst()
    )    
    vendor = scrapy.Field(
        output_processor=TakeFirst()
    )
    vendor_id = scrapy.Field(
        input_processor=MapCompose(to_int),
        output_processor=TakeFirst()
    )
    country = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join()
    )
    regular_price = scrapy.Field(
        input_processor=MapCompose(to_float),
        output_processor=Compose(lambda x: max(x))
        # Compose applies the function to the whole list of extracted values,
        # while MapCompose applies the function to each element in the list
        # (see the short snippet after this example).
    )
    sale_price = scrapy.Field(
        input_processor=MapCompose(to_float),
        output_processor=Compose(lambda x: max(x))
    )
    unit = scrapy.Field(
        output_processor=TakeFirst()
    )
    availability = scrapy.Field(
        input_processor=MapCompose(to_bool),
        output_processor=TakeFirst()
    )
    link = scrapy.Field(
        output_processor=TakeFirst()
    )
    second_level_cat = scrapy.Field(
        output_processor=TakeFirst()
    )
    first_level_cat = scrapy.Field(
        output_processor=TakeFirst()
    )
Example #7
class MoscowmapItem(scrapy.Item):
    _id = scrapy.Field()
    street = scrapy.Field(
        input_processor=MapCompose(lambda s: s.strip().strip(',')),
        output_processor=TakeFirst())

    house = scrapy.Field(input_processor=MapCompose(lambda s: s.strip()),
                         output_processor=Compose(parse_house))
    params_house = scrapy.Field(output_processor=Compose(parse_params))
    query = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
Example #8
class ProxyItemLoader(ItemLoader):
    default_item_class = ProxyItem
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = TakeFirst()

    port_number_out = Compose(TakeFirst(), int)
    country_code_out = Compose(TakeFirst(), country_code_processor)
    city_out = Compose(TakeFirst(), lambda x: x.replace('"', '').strip())
    response_time_out = Compose(TakeFirst(),
                                lambda x: x.replace('ms', '').strip(), int)
    last_check_out = Compose(TakeFirst(), TimeDeltaProcessor())
Example #9
class taxes_item(scrapy.Item):
    property_tax_assessment_year = scrapy.Field(
        output_processor=Compose(TakeFirst(), int))
    property_tax = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_land = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_improvements = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
    property_tax_assessment_total = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_number_from_string))
Example #10
class AuthorItemLoader(ItemLoader):
    author_bio_in = Compose(description_clean, description_join)
    author_bio_out = TakeFirst()
    name_out = TakeFirst()
    gender_out = TakeFirst()
    birth_date_out = TakeFirst()
    birth_place_out = TakeFirst()
    death_date_out = TakeFirst()
    death_place_out = TakeFirst()
    author_id_in = Compose(id_parser)
    author_id_out = TakeFirst()
Example #11
class BookItemLoader(ItemLoader):
    title_out = TakeFirst()
    number_of_pages_out = TakeFirst()
    number_of_pages_in = Join()
    in_language_out = TakeFirst()
    description_in = Compose(description_clean, description_join)
    description_out = TakeFirst()
    author_book_id_in = Compose(id_parser)
    book_id_in = Compose(id_parser)
    author_book_id_out = TakeFirst()
    book_id_out = TakeFirst()
Example #12
class AuthorItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = Field()
    author_url = Field()
    rating = Field()
    rating_count = Field()
    review_count = Field()
    image_url = Field()
    books = Field(output_processor=Compose(set, list))
    related_authors = Field(output_processor=Compose(set, list))
Example #13
class XueQiuLoader(NewsLoader):
    content_out = Compose(Join(), lambda s: s.strip(),
                          lambda s: s.replace('\u3000', ''),
                          lambda s: s.replace('\n', ''),
                          lambda s: s.replace('\xa0', ''),
                          lambda s: s.replace('\r', ''))
    # message_out = Compose(Join(), lambda s: s.strip(),lambda s: s.replace('\u3000',''),lambda s:s.replace('\n',''),lambda s:s.replace('\xa0',''))
    title_out = Compose(Join(), lambda s: s.strip(),
                        lambda s: s.replace('\u3000', ''),
                        lambda s: s.replace('\n', ''),
                        lambda s: s.replace('\xa0', ''))
Example #14
class ProductItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    url = scrapy.Field(output_processor=TakeFirst())

    title = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=Compose(process_price))
    description = scrapy.Field(output_processor=TakeFirst())

    features = scrapy.Field(output_processor=Compose(process_features))
    images = scrapy.Field(input_processor=Compose())
Example #15
class ScienceDailyArticleLoader(ArticleLoader):
    content_in = Compose(
        Join('\n\n'),
        lambda x: remove_tags_with_content(x, ('div',)),  # the content contains <div> blocks used for advertisements
        ArticleLoader.default_input_processor,
    )

    date_out = Compose(
        TakeFirst(),
        lambda date_str: datetime.strptime(date_str, "%B %d, %Y"),
    )
Example #16
class LinkLoader(scrapy.loader.ItemLoader):
    product_in = MapCompose(remove_tags)
    product_out = TakeFirst()

    retailer_in = Compose()
    retailer_out = TakeFirst()

    date_in = Compose()
    date_out = TakeFirst()

    link_in = Compose()
    link_out = TakeFirst()
Example #17
class WebsiteLoader(scrapy.loader.ItemLoader):
    retailer_in = Compose()
    retailer_out = TakeFirst()

    date_in = Compose()
    date_out = TakeFirst()

    product_in = TakeFirst()
    product_out = TakeFirst()

    html_in = Compose()
    html_out = TakeFirst()
Example #18
class Committee(scrapy.Item):
    committee_id = scrapy.Field(input_processor=Compose(to_int), output_processor=TakeFirst())
    committee_name = scrapy.Field(input_processor=Compose(clean), output_processor=TakeFirst())
    election_cycle = scrapy.Field(input_processor=MapCompose(to_int), output_processor=TakeFirst())
    historical_names = scrapy.Field(input_processor=MapCompose(clean), serializer=str)
    status = scrapy.Field(input_processor=Compose(clean), output_processor=TakeFirst())
    reporting_period = scrapy.Field(output_processor=TakeFirst())
    current_contributions = scrapy.Field(output_processor=TakeFirst())
    year_contributions = scrapy.Field(output_processor=TakeFirst())
    current_expenditures = scrapy.Field(output_processor=TakeFirst())
    year_expenditures = scrapy.Field(output_processor=TakeFirst())
    ending_cash = scrapy.Field(output_processor=TakeFirst())
Example #19
class LmItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    name = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=Compose(to_int), output_processor=TakeFirst())
    cur = scrapy.Field(output_processor=TakeFirst())
    unic_photo = scrapy.Field(output_processor=Identity())
    pict = scrapy.Field(output_processor=Identity())
    art = scrapy.Field(output_processor=Identity())
    unic_pict = scrapy.Field()
    params_dict = scrapy.Field(input_processor=Identity())
    params = scrapy.Field(input_processor=Compose(clear_desc), output_processor=TakeFirst())
Example #20
class ReviewerItemLoader(ItemLoader):
    """Reviewer item loader."""

    default_input_processor = MapCompose(clean_text)
    default_output_processor = TakeFirst()

    reviews_count_in = MapCompose(clean_text, parse_int)

    skin_tone_out = Compose(set)
    skin_type_out = Compose(set)
    skin_concerns_out = Compose(set)
    eyes_out = Compose(set)
Example #21
class NewsLoaderV3(BasicLoader):

    #title default
    #dateissued default
    subject_out = Identity()
    text_out = Compose(Join(), lambda s: s.strip(),
                       lambda s: s.replace("\u3000\u3000", ""),
                       lambda s: s.replace("\xa0", ""),
                       lambda s: s.replace(r"\r\n", "<br>"))
    #description default
    source_out = Compose(Join(), lambda s: s.strip())
    author_out = Identity()
Example #22
class NewsLoader(BasicLoader):
    '''
    Processor naming convention for this loader:

    itemname_in  : input processor for the field ``itemname``
    itemname_out : output processor for the field ``itemname``
    '''
    text_out = Compose(Join(), lambda s: s.strip(),
                       lambda s: s.replace("\u3000\u3000", ""),
                       lambda s: s.replace("\xa0", ""),
                       lambda s: s.replace(r"\r\n", "<br>"))

    source_out = Compose(Join(), lambda s: s.strip())
Example #23
class BaseAdLoader(ItemLoader):
    default_item_class = AdItem
    default_output_processor = TakeFirst()

    title_out = Compose(TakeFirst(), filters.clean_whitespace,
                        filters.lower_string)
    description_out = Compose(filters.multiline_joiner, filters.lower_string)
    posted_date_out = Compose(TakeFirst(), filters.parse_date)
    price_out = Compose(TakeFirst(), filters.clean_whitespace,
                        filters.missing_price_filter, int)
    currency_out = Compose(TakeFirst(), filters.clean_whitespace,
                           filters.currency_mapper)
Example #24
class MemberLoader(ItemLoader):
    default_item_class = MemberItem
    default_output_processor = TakeFirst()

    website_out = Identity()
    pgp_fingerprint_out = Join(' ')
    full_name_out = Compose(TakeFirst(), six.text_type.strip)
    bio_out = Compose(TakeFirst(), six.text_type.strip)
    location_out = Compose(TakeFirst(), six.text_type.strip)

    image_urls_in = MapCompose(absolute_url)
    image_urls_out = Identity()
Example #25
class QiDianLoader(NewsLoader):
    content_out = Compose(Join(), lambda s: s.strip(),
                          lambda s: s.replace('\u3000', ''),
                          lambda s: s.replace('\n', ''),
                          lambda s: s.replace('\xa0', ''),
                          lambda s: s.replace('\r', ''),
                          lambda s: s.replace('\t', ''))
    # message_out = Compose(Join(), lambda s: s.strip(),lambda s: s.replace('\u3000',''),lambda s:s.replace('\n',''),lambda s:s.replace('\xa0',''))
    chapter_out = Compose(Join(), lambda s: s.strip(),
                          lambda s: s.replace('\u3000', ''),
                          lambda s: s.replace('\n', ''),
                          lambda s: s.replace('\xa0', ''))
Example #26
class ReviewItem(scrapy.Item):
    product_id = scrapy.Field()
    recommended = scrapy.Field(output_processor=Compose(
        TakeFirst(), simplify_recommended), )
    date = scrapy.Field(
        output_processor=Compose(TakeFirst(), standardize_date))
    hours = scrapy.Field(output_processor=Compose(TakeFirst(), str_to_float))
    found_helpful = scrapy.Field(
        output_processor=Compose(TakeFirst(), str_to_int))
    found_funny = scrapy.Field(
        output_processor=Compose(TakeFirst(), str_to_int))
    user_id = scrapy.Field()
    early_access = scrapy.Field()
Example #27
class ZhihuAnswerItem(Item):
    url = Field(output_processor=Join(separator=''))
    answer_id = Field(output_processor=Join(separator=''))
    user_url = Field(output_processor=Join(separator=''))
    question_id = Field(output_processor=Join(separator=''))
    question_url = Field(output_processor=Join(separator=''))
    agree_num = Field(output_processor=Join(separator=''))
    summary = Field(output_processor=Compose(''.join, str.split, ''.join))
    content = Field(output_processor=Compose(''.join, str.split, ''.join))
    md5 = Field(output_processor=Join(separator=''))
    comment_num = Field(output_processor=Join(separator=''))
    collection_name = Field(output_processor=Join(separator=''))
    website = Field(output_processor=Join(separator=''))
Example #28
class MujiLoader(ItemLoader):
    default_output_processor = TakeFirst()
    
    price_in = Compose(Join(''))
    salePrice_in = Compose(Join(''))
    
    title_out = Compose(Join(''))
    originalSizeLabel_out = str

    originalCategory_out = str
    detailImages_out = str
    description_out = str
    detailHtml_out = str
Example #29
class DataLoader(ItemLoader):
    """
    处理数据类
    """
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    registration_date_out = Compose(
        TakeFirst(),
        lambda x: x.split(": ")[1] if len(x.split(": ")) > 1 else x,
        RegistrationDateProcessor()
    )
    main_body_out = Join()
    time_out = Compose(lambda x: x[1], DatetimeProcessor())
Example #30
class ChinaLawCourtLoader(NewsLoader):
    content_out = Compose(Join(), lambda s: s.strip(),
                          lambda s: s.replace('\u3000', ''),
                          lambda s: s.replace('\n', ''),
                          lambda s: s.replace('\xa0', ''),
                          lambda s: s.replace('\r', ''))
    message_out = Compose(Join(), lambda s: s.strip(),
                          lambda s: s.replace('\u3000', ''),
                          lambda s: s.replace('\n', ''),
                          lambda s: s.replace('\xa0', ''))
    title_out = Compose(Join(), lambda s: s.strip(),
                        lambda s: s.replace('\u3000', ''),
                        lambda s: s.replace('\n', ''),
                        lambda s: s.replace('\xa0', ''))