Code Example #1
File: torideal.py Project: jmyrberg/finscraper
class _ToriDealItem(Item):
    __doc__ = """
    Returned fields:
        * url (str): URL of the scraped web page.
        * time (int): UNIX timestamp of the scraping.
        * seller (str): Seller of the item.
        * name (str): Name of the item.
        * description (list of str): Description of the item.
        * price (str): Price of the item.
        * type (str): Type of the deal.
        * published (str): Publish time of the deal.
        * images (list of dict): Images of the item.
    """
    url = Field(input_processor=Identity(), output_processor=TakeFirst())
    time = Field(input_processor=Identity(), output_processor=TakeFirst())
    seller = Field(input_processor=Compose(drop_empty_elements,
                                           paragraph_join),
                   output_processor=TakeFirst())
    name = Field(input_processor=strip_join, output_processor=TakeFirst())
    description = Field(input_processor=Compose(drop_empty_elements,
                                                paragraph_join),
                        output_processor=TakeFirst())
    price = Field(input_processor=strip_join, output_processor=TakeFirst())
    type = Field(input_processor=strip_join, output_processor=TakeFirst())
    published = Field(input_processor=strip_join, output_processor=TakeFirst())
    images = Field(input_processor=MapCompose(
        _ToriDealSpider._get_image_metadata),
                   output_processor=Identity())
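A note on the pattern: the processors declared on each Field are picked up automatically by Scrapy's ItemLoader, so the class above only has to describe its fields. A minimal, self-contained sketch of that mechanism (DemoItem and its values are invented for illustration):

from itemloaders.processors import MapCompose, TakeFirst
from scrapy import Field, Item
from scrapy.loader import ItemLoader

class DemoItem(Item):
    # Each value is stripped on input; the first one wins on output.
    name = Field(input_processor=MapCompose(str.strip),
                 output_processor=TakeFirst())

loader = ItemLoader(item=DemoItem())
loader.add_value('name', ['  Sofa  ', 'ignored'])
print(loader.load_item())  # {'name': 'Sofa'}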
Code Example #2
    def parse_other(self, response: HtmlResponse):
        item = response.meta['item']
        loader = QianchengItemLoader(item, response)
        _extract_info = partial(extract_info, response)
        raw_info = _extract_info("//p[@class='msg ltype']/@title")
        info_text = raw_info[0].split("|") if raw_info else ["空"] * 5
        loader.add_value("experience", info_text[1])
        loader.add_value("education",
                         info_text[2] if len(info_text) == 5 else "空")
        loader.add_value("job_number",
                         info_text[3] if len(info_text) == 5 else info_text[2])
        loader.add_xpath("advantage",
                         '//div[@class="jtag"]/div//span/text()',
                         processors=Compose(Join()))

        info = _extract_info("//div[@class='com_tag']/p/@title")

        loader.add_value("company_nature", info[0] if len(info) != 0 else "空")
        loader.add_value("company_size", info[1] if len(info) != 0 else "空")
        loader.add_value("company_industry",
                         info[2] if len(info) != 0 else "空")
        loader.add_xpath("company_address",
                         "//*[text()='联系方式']/parent::*/parent::*//p/text()",
                         processors=Compose(Join(""), self.replace_all_n))

        info2 = self.replace_all_n("".join(
            _extract_info(
                u"//*[text()='职位信息']/parent::*/parent::*/div//p//text()")))
        loc_div = info2.find(u"职能类别")

        loader.add_value("job_content", info2[:loc_div])
        loader.add_value("job_kind", info2[loc_div:])
        yield loader.load_item()
Code Example #3
class FeedEntryItemLoader(BaseItemLoader):
    default_item_class = FeedEntryItem

    # Field specific
    content_text_in = MapCompose(skip_false, str.strip, remove_tags)
    content_text_out = Join("\n")

    content_html_in = MapCompose(
        skip_false,
        replace_regex,
        build_tree,
        convert_footnotes,
        pullup_elems,
        replace_elems,
        remove_elems,
        change_attribs,
        change_tags,
        cleanup_html,
        convert_iframes,
        lxml_cleaner,
        flatten_tree,
        skip_empty_tree,
        make_links_absolute,
        make_srcset_absolute,
        serialize_tree,
    )
    content_html_out = Compose(Join(), truncate_text)

    # Use sorted to keep the output stable.
    category_out = Compose(set, sorted)

    enclosure_in = Identity()
    enclosure_out = Identity()
Code Example #4
    def test_compose(self):
        proc = Compose(lambda v: v[0], str.upper)
        self.assertEqual(proc(["hello", "world"]), "HELLO")
        proc = Compose(str.upper)
        self.assertEqual(proc(None), None)
        proc = Compose(str.upper, stop_on_none=False)
        self.assertRaises(ValueError, proc, None)
        proc = Compose(str.upper, lambda x: x + 1)
        self.assertRaises(ValueError, proc, "hello")
Code Example #5
class Producte(scrapy.Item):
    nom = scrapy.Field(
        output_processor=Compose(TakeFirst(), neteja_caracters, trimx))
    editorial = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    preu = scrapy.Field(
        output_processor=Compose(TakeFirst(), remove_tags, neteja_moneda))
    preu_original = scrapy.Field(
        output_processor=Compose(TakeFirst(), remove_tags, neteja_moneda))
    stock = scrapy.Field(output_processor=Compose(TakeFirst(), trimx))
    status_stock = scrapy.Field()
    status_preu = scrapy.Field()
    date_lastseen = scrapy.Field(serializer=str)
    date_created = scrapy.Field(serializer=str)
    date_updated = scrapy.Field(serializer=str)
    botiga = scrapy.Field(serializer=str)
    _id = scrapy.Field(serializer=str)

    def test(self):
        return self['nom']

    def init_new(self):
        self['date_created'] = date.today().isoformat()
        self['date_lastseen'] = date.today().isoformat()
        self['status_stock'] = 'NOU'
        self['status_preu'] = 'IGUAL'
        if not "preu" in self:
            self['preu'] = 0
        else:
            if self['preu'] != '':
                self['preu'] = float(self['preu'])
            else:
                self['preu'] = 0

        if "preu_original" in self:
            try:
                self['preu_original'] = float(self['preu_original'])
            except (ValueError, TypeError):
                self['preu_original'] = float(0)

        else:
            self['preu'] = float(self['preu'])
        if not "stock" in self:
            self['stock'] = "N/A"
        self['_id'] = self['url']

    def iguals(self, producteDB):
        return all([
            self['preu'] == producteDB['preu'],
            self['stock'] == producteDB['stock'],
            producteDB['date_lastseen'] == date.today().isoformat()
        ])
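The Compose(TakeFirst(), remove_tags, neteja_moneda) chain above takes the first extracted fragment, strips its HTML, and then cleans the currency text. A rough equivalent, with a hypothetical stand-in for neteja_moneda:

from itemloaders.processors import Compose, TakeFirst
from w3lib.html import remove_tags

def neteja_moneda_demo(text):  # hypothetical stand-in for neteja_moneda
    return text.replace('€', '').strip()

preu_out = Compose(TakeFirst(), remove_tags, neteja_moneda_demo)
print(preu_out(['<span>12,50 €</span>']))  # '12,50'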
Code Example #6
class UserComment(scrapy.Item):
    username = _name
    use = _name
    date = scrapy.Field(input_processor=MapCompose(lambda d: d.split('em ')),
                        output_processor=Compose(TakeFirst()))
    title = scrapy.Field(output_processor=Compose(TakeFirst()))
    stars = _stars
    recommended = _name
    text = _name
    useful = scrapy.Field(output_processor=format_usefulness)
Code Example #7
class ProductLoader(ItemLoader):
    default_output_processor = TakeFirst()

    product_name_in = MapCompose(str.title)
    product_brand_in = MapCompose(str.upper)
    product_category_in = Compose(normalize_taxonomy)
    product_category_out = Join(separator=" >> ")
    product_price_in = MapCompose(filter_price)
    product_sale_price_in = MapCompose(filter_price)
    product_img_links_in = Compose(generate_img_url)
    product_img_links_out = Identity()
Code Example #8
File: linkedin.py Project: honzajavorek/junior.guru
class Loader(ItemLoader):
    default_output_processor = TakeFirst()
    link_in = Compose(first, clean_proxied_url, clean_url)
    company_link_in = Compose(first, clean_url)
    employment_types_in = MapCompose(str.lower, split)
    employment_types_out = Identity()
    posted_at_in = Compose(first, parse_relative_date)
    experience_levels_in = MapCompose(str.lower, split)
    experience_levels_out = Identity()
    company_logo_urls_out = Identity()
    remote_in = MapCompose(parse_remote)
    locations_raw_out = Identity()
Code Example #9
class ImmoScoutLoader(ItemLoader):
    default_output_processor = TakeFirst()

    address_in = Compose(first_element)
    zip_code_in = Compose(second_element)
    city_in = Compose(fourth_element)
    canton_in = Compose(fifth_element, remove_comma)
    rooms_in = Compose(first_element, remove_room)
    area_m2_in = Compose(third_element, remove_m2)
    price_chf_in = Compose(first_element, remove_chf)
    date_available_in = Compose(second_element)
    floor_in = Compose(second_element)
    utilities_chf_in = Compose(second_element, remove_chf)
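The positional helpers suggest each field arrives as a list of page fragments in a fixed order. A guess at how two of them might look (first_element and remove_chf are reconstructed here, not the project's actual code):

from itemloaders.processors import Compose

def first_element(values):
    return values[0] if values else None

def remove_chf(text):
    return text.replace('CHF', '').strip() if text else text

price_chf_in = Compose(first_element, remove_chf)
print(price_chf_in(['CHF 1850', 'per month']))  # '1850'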
Code Example #10
File: suomi24page.py Project: jmyrberg/finscraper
class _Suomi24PageItem(Item):
    __doc__ = """
    Returned page fields:
        * url (str): URL of the scraped web page.
        * time (int): UNIX timestamp of the scraping.
        * title (str): Title of the thread.
        * content (str): Content of the first message.
        * comments (str): Comments of the thread page.
        * published (str): Publish time of the thread.
        * author (str): Author of the thread.
        * n_comments (int): Number of comments in the thread.
        * views (str): Number of views.
    """ + _Suomi24CommentItem.__doc__ + _Suomi24CommentResponseItem.__doc__
    url = Field(input_processor=Identity(), output_processor=TakeFirst())
    time = Field(input_processor=Identity(), output_processor=TakeFirst())
    title = Field(input_processor=strip_join, output_processor=TakeFirst())
    content = Field(input_processor=paragraph_join,
                    output_processor=TakeFirst())
    comments = Field(input_processor=Identity(), output_processor=Identity())
    published = Field(input_processor=strip_join,
                      output_processor=Compose(strip_elements, TakeFirst()))
    author = Field(input_processor=strip_join, output_processor=TakeFirst())
    n_comments = Field(input_processor=MapCompose(safe_cast_int),
                       output_processor=TakeFirst())
    views = Field(input_processor=strip_join, output_processor=TakeFirst())
Code Example #11
class UserRating(scrapy.Item):
    name = _name
    stars = _stars
    ratings = scrapy.Field(
        output_processor=Compose(TakeFirst(), get_numbers, TakeFirst()))
    approval_rate = _name
    comments = scrapy.Field()
Code Example #12
class MpdataItem(scrapy.Item):
    """
    Defines the item fields and specifies processors for each field
    """

    name = scrapy.Field(
        input_processor=MapCompose(str.title, str.strip),
        output_processor=Join(' '),
    )
    image = scrapy.Field(output_processor=TakeFirst())
    birthdate = scrapy.Field(output_processor=TakeFirst())
    birthplace = scrapy.Field(
        input_processor=TakeFirst(),
        output_processor=Compose(lambda x: x[0] if len(x[0]) > 2 else "-"),
    )
    profession = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    languages = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    party = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    electoral_district = scrapy.Field(output_processor=TakeFirst())
    first_time_mp = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    email = scrapy.Field(output_processor=TakeFirst())
Code Example #13
class LeroyparserItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    _id = scrapy.Field()
    article = scrapy.Field()
    price = scrapy.Field()
    # A list was scraped, so take the first value.
    url = scrapy.Field(output_processor=TakeFirst())
    # Convert the scraped list into a dict.
    characteristic = scrapy.Field(
        output_processor=Compose(get_charakteristics))
    photos = scrapy.Field()
    # Pick the required keys from the resulting dict.
    general = scrapy.Field(output_processor=TakeFirst(),
                           input_processor=Compose(get_general))
    # This field is needed to build the photo catalog directory structure.
    main = scrapy.Field(output_processor=Compose(get_main))
Code Example #14
class Street(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    stadtteil = scrapy.Field()
    link = scrapy.Field()
    demographics = scrapy.Field(input_processor=Compose(TakeFirst(), jsonify))
Code Example #15
class BookItem(scrapy.Item):
    # Scalars
    url = Field()

    title = Field(input_processor=MapCompose(str.strip))
    author = Field(input_processor=MapCompose(str.strip))

    num_ratings = Field(input_processor=MapCompose(str.strip, int))
    num_reviews = Field(input_processor=MapCompose(str.strip, int))
    avg_rating = Field(input_processor=MapCompose(str.strip, float))
    num_pages = Field(
        input_processor=MapCompose(str.strip, num_page_extractor, int))

    language = Field(input_processor=MapCompose(str.strip))
    publish_date = Field(input_processor=extract_publish_dates)

    original_publish_year = Field(
        input_processor=MapCompose(extract_year, int))

    isbn = Field(input_processor=MapCompose(str.strip, isbn_filter))
    isbn13 = Field(input_processor=MapCompose(str.strip, isbn13_filter))
    asin = Field(input_processor=MapCompose(filter_asin))

    series = Field()

    # Lists
    awards = Field(output_processor=Identity())
    places = Field(output_processor=Identity())
    characters = Field(output_processor=Identity())
    genres = Field(output_processor=Compose(set, list))

    # Dicts
    rating_histogram = Field(input_processor=MapCompose(extract_ratings))
Code Example #16
class IggItem(scrapy.Item):

    title = scrapy.Field(
        input_processor=Compose(TakeFirst(), filter_title, lambda v: v.strip()),
        output_processor=Join(''))

    developer = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))

    publisher = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))

    release_date = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))

    genre = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity())

    links = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity())
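MapCompose drops any value for which a function returns None, so a filter such as filter_empty above can discard blank strings after stripping. A sketch assuming filter_empty simply nulls out falsy values:

from itemloaders.processors import MapCompose

def filter_empty(value):  # assumed behaviour of the helper above
    return value or None

print(MapCompose(str.strip, filter_empty)(['  Action  ', '   ']))
# ['Action'] -- the blank string is dropped rather than kept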
Code Example #17
File: items.py Project: moritzwilksch/ScrapyProjects
class Listing(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    heading = scrapy.Field(
        input_processor=MapCompose(lambda s: s.strip(), decode_special_chars))
    link = scrapy.Field(output_processor=TakeFirst())
    descr = scrapy.Field(
        input_processor=MapCompose(lambda s: s.strip(), decode_special_chars))
    size = scrapy.Field(
        input_processor=Compose(extract_size, decode_special_chars))
    rooms = scrapy.Field(
        input_processor=Compose(extract_rooms, decode_special_chars))
    preis = scrapy.Field(input_processor=MapCompose(
        lambda s: (s.replace("€", "").replace(".", "")
                   .replace("VB", "").strip().replace(" ", "")),
        decode_special_chars))
    ausstattung = scrapy.Field(
        input_processor=Compose(Join("; "), decode_special_chars))
Code Example #18
File: items.py Project: jonbesga/apod-bot
class ApodScraperItemLoader(ItemLoader):
    default_item_class = ApodScraperItem

    date_in = MapCompose(str.strip)
    date_out = TakeFirst()
    title_in = MapCompose(str.strip)
    title_out = Join()
    image_urls_in = MapCompose(add_base_url)
    credits_in = Compose(extract_credits)
Code Example #19
class QuoteLoader(ItemLoader):
    g = lambda x: bytes(x, 'utf-8')
    f = lambda x: x.decode('utf-8')
    author_in = MapCompose(g)
    quote_in = MapCompose(g)
    tags_in = MapCompose(g)
    author_out = MapCompose(f)
    quote_out = MapCompose(f)
    tags_out = Compose(MapCompose(f), Join('|'))
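The loader above round-trips every value through bytes and back. Chaining the two stages outside a loader shows the effect (values invented):

from itemloaders.processors import Compose, Join, MapCompose

encode = MapCompose(lambda x: bytes(x, 'utf-8'))
decode = Compose(MapCompose(lambda x: x.decode('utf-8')), Join('|'))
print(decode(encode(['love', 'life'])))  # 'love|life'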
Code Example #20
class LeruamerlenparserItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field(output_processor=TakeFirst())
    photos = scrapy.Field(input_processor=MapCompose())
    characteristics = scrapy.Field(
        output_processor=TakeFirst(),
        input_processor=Compose(process_characteristics))
    url = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst(),
                         input_processor=MapCompose(process_price))
Code Example #21
    def test_error_processor_as_argument(self):
        class TestItem(Item):
            name = Field()

        class TestItemLoader(ItemLoader):
            default_item_class = TestItem

        il = TestItemLoader()
        self.assertRaises(ValueError, il.add_value, 'name',
                          [u'marta', u'other'], Compose(float))
Code Example #22
    def parse_page(self, response):
        """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(
            s, '//*[contains(@class, "inline-share-tools")]')
        mutate_selector_del_xpath(
            s, '//*[contains(@class, "article-print-url")]')
        mutate_selector_del_xpath(s, '//aside')

        l = NewsLoader(selector=s)

        l.add_xpath('bylines',
                    'head/meta[@name="cXenseParse:author"]/@content')
        # Section metadata comes out as "news,world". For this, take "News".
        l.add_xpath(
            'section', 'head/meta[@itemprop="articleSection"]/@content',
            Compose(
                TakeFirst(),
                lambda x: x.split(','),
                TakeFirst(),
                lambda x: x.title(),
            ))

        # Video pages
        l.add_xpath('summary',
                    '//p[contains(@class, "vgm-video-description")]//text()')

        # USA Today provide timestamps to millisecond precision, in a format
        # which dateparser can't handle.
        l.add_xpath(
            'firstpubtime',
            '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
            MapCompose(self.fix_usatoday_date))  # CreativeWork
        l.add_xpath(
            'modtime',
            '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
            MapCompose(self.fix_usatoday_date))  # CreativeWork

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
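As the section field shows, processors can also be passed positionally to add_xpath/add_value; they run on the extracted data before the field's regular input processor. A self-contained sketch using the plain itemloaders API:

from itemloaders import ItemLoader
from itemloaders.processors import Compose, TakeFirst

loader = ItemLoader()  # the default item class is a plain dict
# 'news,world' -> 'news' -> 'News', applied before the input processor.
loader.add_value('section', ['news,world'],
                 Compose(TakeFirst(), lambda x: x.split(','),
                         TakeFirst(), str.title))
print(loader.load_item())  # {'section': ['News']}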
Code Example #23
class Loader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    title_in = MapCompose(html.unescape)
    company_name_in = MapCompose(html.unescape)
    employment_types_in = Compose(MapCompose(str.strip), drop_remote)
    employment_types_out = Identity()
    posted_at_in = MapCompose(parse_iso_date)
    company_logo_urls_out = Identity()
    remote_in = MapCompose(bool)
    locations_raw_out = Identity()
Code Example #24
class CommentLoader(ItemLoader):
    '''Item loader for CommentItem.'''
    # Default processors
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    # Custom input processors
    # Basic identity
    comment_id_in = prep_comment_id
    # Metadata
    user_name_in = MapCompose(strip_space_characters)
    reply_to_in = MapCompose(comment_id_from_url)
    image_urls_in = MapCompose(extract_image_url)

    # Custom output processors
    # Metadata
    comment_id_out = Join('_')
    comment_url_out = Join('/')
    body_out = Compose(Join(), strip_space_characters, remove_span_img_tags)
    reply_to_out = Identity()  # To keep as a list
    is_aa_out = Compose(TakeFirst(), bool)
    image_urls_out = Identity()  # To keep as a list
Code Example #25
class Nytimes_Dir_Item(scrapy.Item):
    title = Field(input_processor=Join(),
                  output_processor=MapCompose(tags_and_unicode))
    para = Field(input_processor=Join(),
                 output_processor=MapCompose(tags_and_unicode))
    captions = Field(input_processor=Compose(elim_dupes),
                     output_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(input_processor=Join(), output_processor=TakeFirst())
    pubtime = Field(input_processor=MapCompose(iso_time_to_df))
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
Code Example #26
class AuthorItem(scrapy.Item):
    # Scalars
    url = Field()

    name = Field()
    birth_date = Field(input_processor=MapCompose(safe_parse_date))
    death_date = Field(input_processor=MapCompose(safe_parse_date))

    avg_rating = Field(serializer=float)
    num_ratings = Field(serializer=int)
    num_reviews = Field(serializer=int)

    # Lists
    genres = Field(output_processor=Compose(set, list))
    influences = Field(output_processor=Compose(set, list))

    # Blobs
    about = Field(
        # Take the first match, remove HTML tags, convert to a list of
        # lines, remove empty lines, and remove the "edit data" prefix.
        input_processor=Compose(TakeFirst(), remove_tags, split_by_newline,
                                filter_empty, lambda s: s[1:]),
        output_processor=Join())
Code Example #27
class FashionItemLoader(ItemLoader):
    default_input_processor = Compose(normalize)
    length_in = Compose(set, list, sorted)

    brand_out = Join('/')
    price_out = Compose(lambda x: x[0].replace(',', ''))
    wish_out = Compose(TakeFirst(), int)
    category_out = Compose(Join('>'))
    default_output_processor = Compose(set, list, sorted, Join(','))
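The set/sorted/Join chain dedupes multi-valued fields and keeps their order stable, the same trick as category_out in Code Example #3. In isolation:

from itemloaders.processors import Compose, Join

dedupe = Compose(set, sorted, Join(','))
print(dedupe(['shoes', 'bags', 'shoes']))  # 'bags,shoes'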
Code Example #28
class RespuestasLegalAppItem(scrapy.Item):
    pregunta = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())

    descripcion = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(descripcion_clean, solo_primero))

    que_hacer = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        #output_processor=Compose(remove_line, remove_costos_abogado_otros, replace_tab)
        output_processor=Compose(split_dot, remove_line,
                                 remove_costos_abogado_otros,
                                 remove_requisitos_generales))

    donde_acudir = scrapy.Field(input_processor=MapCompose(
        remove_tags, replace_escape_chars, str.strip,
        string_replace_dondeAcudir, str.title),
                                #output_processor=TakeFirst()
                                )

    tenga_encuenta = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        #output_processor=Compose(remove_vineta, split_dot, remove_line)
        output_processor=Compose(split_dot, remove_line, remove_vineta))

    normatividad = scrapy.Field(
        input_processor=MapCompose(remove_tags, replace_escape_chars,
                                   str.strip),
        output_processor=Compose(split_dot, remove_line, remove_vineta))

    fecha = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=TakeFirst())
Code Example #29
File: suomi24page.py Project: jmyrberg/finscraper
class _Suomi24CommentResponseItem(Item):
    """
    Returned comment response fields:
        * author (str): Author of the comment response.
        * date (str): Publish time of the comment response.
        * quotes (list of str): List of quotes in the comment response.
        * content (str): Contents of the comment response.
    """
    author = Field(input_processor=strip_elements,
                   output_processor=TakeFirst())
    date = Field(input_processor=strip_join,
                 output_processor=Compose(strip_elements, TakeFirst()))
    quotes = Field(input_processor=drop_empty_elements,
                   output_processor=Identity())
    content = Field(input_processor=paragraph_join,
                    output_processor=TakeFirst())