Ejemplo n.º 1
0
class MpdataItem(scrapy.Item):
    """
    Item holding one MP's (member of parliament) scraped profile.

    Each field declares the input/output processors applied by the item
    loader when the field is populated.
    """

    # Title-case and trim each name fragment, then join with spaces.
    name = scrapy.Field(
        input_processor=MapCompose(str.title, str.strip),
        output_processor=Join(' '),
    )
    image = scrapy.Field(output_processor=TakeFirst())
    birthdate = scrapy.Field(output_processor=TakeFirst())
    # Keep the first extracted value only when it is longer than two
    # characters; otherwise emit a "-" placeholder.  The added guard on
    # the collected list avoids an IndexError if the output processor is
    # ever invoked with no collected values.
    birthplace = scrapy.Field(
        input_processor=TakeFirst(),
        output_processor=Compose(
            lambda x: x[0] if x and len(x[0]) > 2 else "-"
        ),
    )
    profession = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    languages = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    party = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    electoral_district = scrapy.Field(output_processor=TakeFirst())
    first_time_mp = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    email = scrapy.Field(output_processor=TakeFirst())
Ejemplo n.º 2
0
class LegalDocumentLoader(ItemLoader):
    """
    Process the scraped data.

    The raw html is formatted so that opening tags are on new lines
    and their content indented.

    This allows line-by-line diff across versions.
    """

    # Unless a field overrides it, keep only the first collected value.
    default_output_processor = TakeFirst()

    # Metadata fields: values pass through unchanged on input and are
    # joined into a single string on output.
    url_in = Identity()
    url_out = Join()

    title_in = Identity()
    title_out = Join()

    provider_in = Identity()
    provider_out = Join()

    last_updated_in = Identity()
    last_updated_out = Join()

    # Document body: prettify first so each tag sits on its own line
    # (enables the line-by-line diff described above).
    text_in = MapCompose(prettify_html)  # break lines
    text_out = Join()
Ejemplo n.º 3
0
class TruliaItemLoader(ItemLoader):
    """Loader for Trulia property listings.

    Raw values are whitespace-stripped on input; by default only the
    first collected value is kept on output.
    """

    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    # Price may be a range (multiple values): keep the first two, join
    # as "<price 1> - <price 2>" and drop thousands separators.
    price_out = Compose(
        lambda v: take_first_two(v), Join(' - '), lambda s: s.replace(',', '')
    )
    description_out = Compose(remove_empty)
    features_out = Compose(remove_empty)
    heating_out = Compose(remove_empty)
    floors_out = Compose(remove_empty)
    city_state_out = Join(', ')
    tags_out = Compose(remove_empty)

    attribute_values_out = Compose(remove_empty)
    attribute_names_out = Compose(remove_empty)

    # Area could be "2,500" or a range "2,500 - 5,000".  The comma is
    # removed but the value stays a string so ranges survive.
    area_out = Compose(
        TakeFirst(), lambda s: s.replace(',', ''), str.strip
    )
    bedrooms_out = Compose(TakeFirst(), int)
    bathrooms_out = Compose(TakeFirst(), float)
    # Bugfix: the three processors below were declared without the
    # "_out" suffix (and year_built was declared twice), so ItemLoader
    # never applied them as output processors.
    deposit_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    year_built_out = Compose(TakeFirst(), int)
    days_on_market_out = Compose(TakeFirst(), int)
Ejemplo n.º 4
0
    def parse_other(self, response: HtmlResponse):
        """Parse a 51job (qiancheng) job-detail page and yield the item.

        Fills in the job's metadata (experience, education, headcount),
        the company information block, and the job description split at
        the "职能类别" (job category) marker.
        """
        item = response.meta['item']
        loader = QianchengItemLoader(item, response)
        _extract_info = partial(extract_info, response)

        # Extract the title attribute once (the original queried the
        # same XPath twice); fall back to five "空" ("empty")
        # placeholders when the node is missing.
        raw_info = _extract_info("//p[@class='msg ltype']/@title")
        info_text = raw_info[0].split("|") if raw_info else ["空"] * 5

        loader.add_value("experience", info_text[1])
        loader.add_value("education",
                         info_text[2] if len(info_text) == 5 else "空")
        loader.add_value("job_number",
                         info_text[3] if len(info_text) == 5 else info_text[2])
        loader.add_xpath("advantage",
                         '//div[@class="jtag"]/div//span/text()',
                         processors=Compose(Join()))

        # Company facts appear as up to three @title values in order:
        # nature, size, industry.
        info = _extract_info("//div[@class='com_tag']/p/@title")

        loader.add_value("company_nature", info[0] if len(info) != 0 else "空")
        loader.add_value("company_size", info[1] if len(info) != 0 else "空")
        loader.add_value("company_industry",
                         info[2] if len(info) != 0 else "空")
        loader.add_xpath("company_address",
                         "//*[text()='联系方式']/parent::*/parent::*//p/text()",
                         processors=Compose(Join(""), self.replace_all_n))

        # Everything before the "职能类别" marker is the description
        # body; the remainder is the category block.
        info2 = self.replace_all_n("".join(
            _extract_info(
                u"//*[text()='职位信息']/parent::*/parent::*/div//p//text()")))
        loc_div = info2.find(u"职能类别")

        loader.add_value("job_content", info2[:loc_div])
        loader.add_value("job_kind", info2[loc_div:])
        yield loader.load_item()
Ejemplo n.º 5
0
class Chapter(Item):
    """One scraped chapter: cleaned title plus reformatted body text."""

    title_content = Field(
        input_processor=MapCompose(remove_tags, get_title_content),
        output_processor=Join(),
    )
    # Body paragraphs are normalised and joined with blank lines.
    content = Field(
        input_processor=MapCompose(
            convert_line_break, remove_tags, reformat_chapter_content
        ),
        output_processor=Join('\n\n'),
    )
Ejemplo n.º 6
0
class NewsLoader(ItemLoader):
    """Loader for news-article items.

    Most fields keep only the first scraped value; title and
    description are ASCII-folded, trimmed and joined into one string.
    """

    url_out = TakeFirst()

    parent_url_out = TakeFirst()

    # Timestamps are run through date_to_string before storage.
    published_at_in = MapCompose(date_to_string)
    published_at_out = TakeFirst()

    author_in = MapCompose(trim_author)
    author_out = TakeFirst()

    # unidecode folds non-ASCII characters to a close ASCII equivalent.
    title_in = MapCompose(unidecode.unidecode, trim)
    title_out = Join()

    description_in = MapCompose(unidecode.unidecode, trim)
    description_out = Join()

    outlet_out = TakeFirst()
    outlet_url_out = TakeFirst()

    type_out = TakeFirst()

    scraped_at_in = MapCompose(date_to_string)
    scraped_at_out = TakeFirst()
    scraped_url_out = TakeFirst()
Ejemplo n.º 7
0
class BookInfo(Item):
    """Summary information about one book."""

    # Name and author fragments are stripped and joined into one string.
    full_name = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    author = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    last_chapter = Field(
        input_processor=MapCompose(get_last_chapter),
        output_processor=Join(),
    )
Ejemplo n.º 8
0
class IggItem(scrapy.Item):
    """Item describing one IGG game entry."""

    # Title: take the first value, filter it, trim, emit one string.
    title = scrapy.Field(
        input_processor=Compose(TakeFirst(), filter_title, lambda v: v.strip()),
        output_processor=Join(''),
    )

    # Simple single-string fields: trim each value, concatenate.
    developer = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''),
    )
    publisher = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''),
    )
    release_date = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''),
    )

    # Multi-valued fields: trim, drop empties, keep the list as-is.
    genre = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity(),
    )
    links = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity(),
    )
Ejemplo n.º 9
0
class FeedEntryItemLoader(BaseItemLoader):
    """Loader producing FeedEntryItem objects."""

    default_item_class = FeedEntryItem

    # Field specific
    # Plain-text content: skip falsy values, trim, strip markup, and
    # join the pieces with newlines.
    content_text_in = MapCompose(skip_false, str.strip, remove_tags)
    content_text_out = Join("\n")

    # HTML content passes through a fixed cleanup pipeline; steps are
    # applied to each value in the order listed below.
    content_html_in = MapCompose(
        skip_false,
        replace_regex,
        build_tree,
        convert_footnotes,
        pullup_elems,
        replace_elems,
        remove_elems,
        change_attribs,
        change_tags,
        cleanup_html,
        convert_iframes,
        lxml_cleaner,
        flatten_tree,
        skip_empty_tree,
        make_links_absolute,
        make_srcset_absolute,
        serialize_tree,
    )
    content_html_out = Compose(Join(), truncate_text)

    # Use sorted to keep the output stable (set de-duplicates first).
    category_out = Compose(set, sorted)

    # Enclosures are passed through untouched.
    enclosure_in = Identity()
    enclosure_out = Identity()
Ejemplo n.º 10
0
class ComputerAdLoader(SecondHandAdLoader):
    """
    SecondHandAdLoader variant adding RAM fields: values pass through
    unchanged on input and are joined into one string on output.
    """
    ram_model_in = Identity()
    ram_model_out = Join()

    ram_size_in = Identity()
    ram_size_out = Join()
Ejemplo n.º 11
0
class ShoesAdLoader(SecondHandAdLoader):
    """
    SecondHandAdLoader variant adding category/size fields: values pass
    through unchanged on input and are joined into one string on output.
    """
    category_in = Identity()
    category_out = Join()

    size_in = Identity()
    size_out = Join()
Ejemplo n.º 12
0
class HhLoader(ItemLoader):
    """Loader for hh.ru vacancy data; items are plain dicts."""

    default_item_class = dict
    title_out = TakeFirst()
    # Each salary fragment runs through clear_salary before joining.
    salary_in = MapCompose(clear_salary)
    salary_out = Join()
    description_out = Join()
    # Author href values run through make_author_link.
    author_href_in = MapCompose(make_author_link)
    author_href_out = TakeFirst()
Ejemplo n.º 13
0
class HhVacancyLoader(ItemLoader):
    """Loader producing HhVacancyItem objects from hh.ru pages."""

    default_item_class = HhVacancyItem
    vacancy_url_out = TakeFirst()
    title_out = TakeFirst()
    salary_out = Join()
    description_out = Join()
    # Company URL values run through the company_url helper.
    company_url_in = MapCompose(company_url)
    company_url_out = TakeFirst()
Ejemplo n.º 14
0
class FashionItemLoader(ItemLoader):
    """Loader for fashion product pages.

    Inputs are normalized; multi-valued outputs are de-duplicated and
    sorted so repeated crawls produce a stable string.
    """

    default_input_processor = Compose(normalize)
    # De-duplicate lengths, then sort for a deterministic order (the
    # intermediate list() in the original was redundant: sorted()
    # accepts any iterable).
    length_in = Compose(set, sorted)

    brand_out = Join('/')
    # First collected price with the thousands separator removed.
    price_out = Compose(lambda x: x[0].replace(',', ''))
    wish_out = Compose(TakeFirst(), int)
    # Compose() around a single Join was redundant.
    category_out = Join('>')
    default_output_processor = Compose(set, sorted, Join(','))
Ejemplo n.º 15
0
class User(Item):
    """Scraped user-profile fields."""

    # Every field strips markup and whitespace, then joins the
    # remaining fragments into a single string.
    full_name = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join(),
    )
    username = Field(
        input_processor=MapCompose(remove_tags, get_username, str.strip),
        output_processor=Join(),
    )
    follower = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join(),
    )
    following = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join(),
    )
Ejemplo n.º 16
0
class Chapter(Item):
    """One scraped chapter: index, title and reformatted body."""

    title_index = Field(
        input_processor=MapCompose(remove_tags, get_title_index),
        output_processor=Join(),
    )
    title_content = Field(
        input_processor=MapCompose(remove_tags, get_title_content),
        output_processor=Join(),
    )
    # Body paragraphs are normalised and joined one per line.
    content = Field(
        input_processor=MapCompose(
            replace_break_element, remove_tags, reformat_chapter_content
        ),
        output_processor=Join('\n'),
    )
Ejemplo n.º 17
0
class ProductItemMeta(scrapy.Item):
    """Product metadata fields."""

    detail_name = scrapy.Field(output_processor=Join())
    brand = scrapy.Field(output_processor=Join())
    # Description fragments are whitespace-cleaned before joining.
    description = scrapy.Field(
        input_processor=MapCompose(remove_whitespace),
        output_processor=Join(),
    )
    price = scrapy.Field()
    size_format = scrapy.Field(serializer=str)
    discount_percent = scrapy.Field(serializer=str)
Ejemplo n.º 18
0
class LermerparserItem(scrapy.Item):
    """One scraped product card."""

    title = scrapy.Field(output_processor=TakeFirst())
    photos = scrapy.Field()
    # Price fragments are glued with a dot, then converted via to_float.
    price = scrapy.Field(
        input_processor=Join('.'),
        output_processor=MapCompose(to_float),
    )
    description = scrapy.Field(
        input_processor=Join(''),
        output_processor=TakeFirst(),
    )
    specifications_keys = scrapy.Field()
    specifications_vals = scrapy.Field(
        input_processor=MapCompose(remove_empty_space),
    )
    link = scrapy.Field()
Ejemplo n.º 19
0
class Juxtapoz_Item(scrapy.Item):
    """Article scraped from Juxtapoz."""

    # Fragments are joined first, then tag/unicode cleanup is applied.
    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst(),
    )
    pubtime = Field(output_processor=TakeFirst())
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
Ejemplo n.º 20
0
class Nytimes_Dir_Item(scrapy.Item):
    """Article scraped from the NYT directory."""

    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    # Duplicate captions are removed before cleanup.
    captions = Field(
        input_processor=Compose(elim_dupes),
        output_processor=MapCompose(tags_and_unicode),
    )
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst(),
    )
    # ISO timestamps are converted by iso_time_to_df.
    pubtime = Field(input_processor=MapCompose(iso_time_to_df))
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
Ejemplo n.º 21
0
class Artag_and_eflux_Item(scrapy.Item):
    """Article scraped from Art Agenda / e-flux."""

    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    # Paragraphs get the site-specific para_clean pass.
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(para_clean),
    )
    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst(),
    )
    # Textual dates are converted by word_time_to_df.
    pubtime = Field(
        input_processor=MapCompose(word_time_to_df),
        output_processor=TakeFirst(),
    )
    tag = Field(input_processor=MapCompose(tags_and_unicode))
    url = Field()
    source = Field(output_processor=TakeFirst())
Ejemplo n.º 22
0
    def parse_item(self, response):
        """Extract one product item from its detail page."""
        loader = ItemLoader(item=ProductItem(), response=response)

        loader.add_css("title", "span._3mRKt::text", Join(), MapCompose(str.strip))
        loader.add_css("sale_price", "div.eP0wn._26-lJ._28iFq::text",
                       Join(), MapCompose(str.strip), re='[,.0-9]+')
        loader.add_css("full_price", "span._2plVT._35rbh::text", re="[,.0-9]+")
        loader.add_css("description", "div._34YUR._1K7NF > span::text",
                       MapCompose(str.strip))
        loader.add_css("brand", "h1._1psEi > a::text")
        loader.add_css("category", "li._1Hb_0:nth-child(4) > a > span::text")

        # Housekeeping: record where the item came from.
        loader.add_value("url", response.url)

        return loader.load_item()
Ejemplo n.º 23
0
class Hyperallergic_Dir_Item(scrapy.Item):
    """Article scraped from the Hyperallergic directory."""

    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode),
    )
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(para_clean),
    )

    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst(),
    )
    # ISO timestamps are converted by iso_time_to_df.
    pubtime = Field(
        input_processor=MapCompose(iso_time_to_df),
        output_processor=TakeFirst(),
    )
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
Ejemplo n.º 24
0
class AmazonReviewItem(scrapy.Item):
    """One Amazon product review."""

    name = scrapy.Field(output_processor=TakeFirst())
    product_title = scrapy.Field(output_processor=TakeFirst())
    # Product URLs are normalised by parse_url before the first is kept.
    product_url = scrapy.Field(
        input_processor=MapCompose(parse_url),
        output_processor=TakeFirst(),
    )
    rating = scrapy.Field(output_processor=TakeFirst())
    review_short = scrapy.Field(output_processor=TakeFirst())
    # The long review is stripped per fragment, then concatenated.
    review_long = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
Ejemplo n.º 25
0
class AuthorLoader(ItemLoader):
    """Loader producing AuthorItem objects."""

    default_item_class = AuthorItem
    name_out = Join()
    site_url_out = TakeFirst()
    # Activity values run through parse_field_of_activity.
    field_of_activity_in = MapCompose(parse_field_of_activity)
    author_description_out = TakeFirst()
    url_out = TakeFirst()
Ejemplo n.º 26
0
    def parse_item(self, response):
        """ This function parses a property page.
        @url https://www.gumtree.com/p/property-to-rent/one-bedroom-property-near-chiswick-park-tube-station./1405437559
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project server spider date
        """
        # NOTE: the @-lines in the docstring above are Scrapy contracts
        # parsed at runtime by `scrapy check` -- do not reword them.
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Title was carried over from the listing page via request meta.
        loader.add_value('title', response.meta['title'],
                         MapCompose(str.strip, str.title))

        # Price: regex keeps digits/commas/dots, then the thousands
        # separator is dropped and the value converted to float.
        loader.add_xpath('price',
                         '//h2[@itemprop="price"]/text()',
                         MapCompose(lambda i: i.replace(',', ''), float),
                         re='[,.0-9]+')

        loader.add_xpath('description', '//p[@itemprop="description"]/text()',
                         MapCompose(str.strip), Join())
        loader.add_xpath('address', '//h4[@itemprop="addressLocality"]/text()',
                         MapCompose(str.strip))
        # Image URLs are made absolute against the page URL.
        loader.add_xpath('image_urls', '//*[@class="carousel-item"]/img/@src',
                         MapCompose(lambda i: urljoin(response.url, i)))

        # Housekeeping fields recording provenance of the scrape.
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())

        return loader.load_item()
Ejemplo n.º 27
0
    def parse_item(self, response):
        """Build an item dynamically from the configured attributes."""
        item = Item()
        loader = ItemLoader(item=item, response=response)

        for attribute in self.config["attributes"]:
            field_name = attribute["name"]
            # Fields are declared on the fly since the schema comes
            # from configuration, not from a static Item subclass.
            item.fields[field_name] = scrapy.Field()

            # Translate configured processor names into loader
            # processors, preserving their configured order.
            processors = []
            for processor_name in attribute.get("processors", []):
                if processor_name == "join":
                    processors.append(Join())
                elif processor_name == "strip":
                    processors.append(MapCompose(str.strip))

            kwargs = {}
            if "regex" in attribute:
                kwargs["re"] = attribute["regex"]

            loader.add_css(field_name, attribute["selector"],
                           *processors, **kwargs)

        # Always record the source URL.
        item.fields["url"] = scrapy.Field()
        loader.add_value("url", response.url)

        return loader.load_item()
Ejemplo n.º 28
0
    def parse_item(self, response):
        """ This function parses a property page.
        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        # NOTE: the @-lines in the docstring above are Scrapy contracts
        # parsed at runtime by `scrapy check` -- do not reword them.
        if not response:
            self.log("RESPONSE IS NONE")
        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip, str.title))
        # Price: regex keeps digits/commas/dots, then the thousands
        # separator is dropped and the value converted to float.
        l.add_xpath('price',
                    './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(str.strip))
        # Image URLs are made absolute against the page URL.
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: urljoin(response.url, i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Ejemplo n.º 29
0
class QuotesParquetItem(scrapy.Item):
    """Quote record destined for parquet output."""

    # Text fragments are concatenated into one string.
    text = scrapy.Field(output_processor=Join())
    author = scrapy.Field()
    tags = scrapy.Field()
Ejemplo n.º 30
0
class WeekliesScraperItem(Item):
    """One article scraped from a weekly magazine issue.

    Fields fall into issue-level metadata (name, number, cover, URL),
    section/article metadata, and the article's text content.
    """

    # NOTE(review): MapCompose() with no functions is an explicit
    # pass-through input processor; kept as written.
    issue_name = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
    issue_number = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    issue_cover_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
    issue_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
    section_name = Field(
        input_processor=MapCompose(remove_tags, str.strip, str.lower),
        output_processor=TakeFirst(),
    )
    article_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst(),
    )
    article_title = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=TakeFirst(),
    )
    article_authors = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join(', '),
    )
    article_intro = Field(
        input_processor=MapCompose(remove_tags, str.strip, remove_empty_lines),
        output_processor=Join(),
    )
    # Body paragraphs are cleaned per value and joined one per line.
    article_content = Field(
        input_processor=MapCompose(remove_tags, str.strip, remove_empty_lines, remove_xml_tags),
        output_processor=Join('\n'),
    )
    article_tags = Field(
        input_processor=MapCompose(str.strip, str.lower),
        output_processor=Join(', '),
    )