コード例 #1
0
class ArticleItemLoader(ItemLoader):
    """Loader for ArticleItem: keep the first extracted value by default."""

    default_item_class = ArticleItem
    default_output_processor = TakeFirst()

    # Title: first value, then the site-specific title cleaner.
    title_out = Compose(TakeFirst(), Net39ArticleTitle())
    # Content: concatenate all fragments, then run the content cleaner.
    content_out = Compose(Join(''), Net39ArticleContent())
コード例 #2
0
class ArticleLoader(XPathItemLoader):
    """
    Used for easier construction of ArticleItem

    Every input value is normalised through is_string (drop non-strings and
    blanks, strip whitespace); most fields then keep only the first value,
    while list-like fields (publishers, tags, time_published) keep all.
    """
    # NOTE: deliberately no `self` — at class-definition time this is a plain
    # function and is only ever used as a MapCompose processor (Python 2).
    def is_string(string):
        # Accept both byte and unicode strings (Python 2).
        if isinstance(string, str) or isinstance(string, unicode):
            if string.strip() != "":
                #log.msg("returning string: "+ unicode(string.strip()))
                return string.strip()
        #log.msg("returning None for string: "+ unicode(string))
        # MapCompose drops None, so invalid values are filtered out here.
        return None

    # Split a ";"- or ","-separated tag string into a list of tags.
    def separate_tags(tags_string):
        return tags_string.replace(";", ",").split(",")

    default_input_processor = MapCompose(is_string)
    default_output_processor = TakeFirst()

    publishers_in = MapCompose(is_string)
    publishers_out = Identity()  # keep all publishers as a list

    title_in = MapCompose(is_string, unicode.title)  # Title-Case the title
    title_out = TakeFirst()

    time_published_in = MapCompose(is_string)
    time_published_out = Identity()

    summary_in = MapCompose(is_string)
    summary_out = TakeFirst()

    tags_in = MapCompose(is_string, separate_tags)
    tags_out = Identity()  # tags stay a list
コード例 #3
0
class CompraLineaItem(Item):
    """A single purchase line ("compra linea") from a procurement record.

    Monetary fields are converted from text via parse_money; the rest keep
    the first extracted value.
    """
    cantidad = Field(output_processor=TakeFirst())        # quantity
    unidad_medida = Field(output_processor=TakeFirst())   # unit of measure
    importe = Field(output_processor=lambda x: parse_money(x[0]))        # unit amount
    importe_total = Field(output_processor=lambda x: parse_money(x[0]))  # line total
    detalle = Field(output_processor=TakeFirst())         # description
    anio = Field(output_processor=TakeFirst())            # year
コード例 #4
0
 def process_item(self, task_id):
     """Re-parse a previously scraped NRC full report into an NrcParsedReport.

     Loads the raw report body from the DB, forces it to ASCII, runs a set of
     compiled (field, regex) patterns over it through an item loader, and
     yields the resulting item before marking the task completed.
     """
     report = self.db.loadScrapedFullReport(task_id)
     if report is None:
         return

     text = report['full_report_body']
     # Force ASCII: every code point above 127 is replaced with chr(127),
     # so the regexes below operate on plain ASCII text.
     text = "".join(chr(min(ord(c),127)) for c in text)
     t = TextResponse (url=report['full_report_url'], body=text.encode('utf-8')) #must have utf-8 here
     l = XPathItemLoader(NrcParsedReport(), response=t)
     l.add_value('reportnum', task_id)

     patterns = self.compile_patterns ()

     # Each pattern is a (field_name, regex) pair: keep the first match,
     # stripped of surrounding whitespace.
     for p in patterns:
         l.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])

     county = l.get_output_value('county')
     pattern = self.get_area_code_pattern(county)
     if pattern:
         l.add_value ('areaid', county)
         # Prefer a block number qualified by the area pattern; fall back
         # to a bare "BLOCK nnn" match.
         l.add_value('blockid', text, TakeFirst(), unicode.strip, re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
         l.add_value('blockid', text, TakeFirst(), unicode.strip, re="BLOCK[\s]+([\d]+)")


     item = l.load_item()

     yield item
     self.item_completed(task_id)
コード例 #5
0
class ZhiHuU_T(Item):
    '''
    Zhihu user topic relationship

    One row linking a user to a topic; every field keeps only the first
    extracted value.
    '''
    crawled_from = Field(output_processor=TakeFirst())  # page the relation was found on
    user_url = Field(output_processor=TakeFirst())
    topic_url = Field(output_processor=TakeFirst())
コード例 #6
0
ファイル: test_contrib_loader.py プロジェクト: zrbruce/scrapy
    def test_get_xpath(self):
        """get_xpath honours processors, the re= keyword and selector lists."""
        loader = TestItemLoader(response=self.response)
        # Plain extraction returns every match as a list.
        self.assertEqual(loader.get_xpath('//p/text()'), [u'paragraph'])
        # A processor argument is applied to the extracted list.
        self.assertEqual(loader.get_xpath('//p/text()', TakeFirst()), u'paragraph')
        # re= filters each value through the regex before processing.
        self.assertEqual(loader.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')

        # A list of expressions concatenates their results in order.
        self.assertEqual(loader.get_xpath(['//p/text()', '//div/text()']),
                         [u'paragraph', 'marta'])
コード例 #7
0
class VkItem(Item):
    """A single VK post: id, author name, tag-stripped text, date and words."""
    id = Field(output_processor=TakeFirst())
    name = Field(output_processor=TakeFirst())
    # Strip HTML tags from each fragment, then keep the first value.
    text = Field(input_processor=MapCompose(remove_tags),
                 output_processor=TakeFirst())
    date = Field(output_processor=TakeFirst())
    words = Field()  # no processors: stored as the raw collected list
コード例 #8
0
    def parse(self, response):
        """
        Default callback used by Scrapy to process download response

        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        :param response:
        :return:
        """
        selector = HtmlXPathSelector(response)

        # iterate over deals
        for deal in selector.select(self.deals_list_xpath):

            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # define processors
            # Fix: the original assigned default_input_processor three times
            # in a row (only the last assignment, Join(), ever took effect)
            # and misspelled "defalut_output_processor", which silently
            # created an unused attribute instead of configuring the output
            # processor.  Keep the input processor that actually applied and
            # really set the output processor.
            loader.default_input_processor = Join()
            loader.default_output_processor = TakeFirst()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
コード例 #9
0
ファイル: items.py プロジェクト: lzh6710/amazon-review-spam
class Product(Item):
    """An Amazon product record built by the review-spam crawler.

    Numeric/text fields are normalised at input time (strip whitespace,
    remove thousands separators, convert to float/int); fields without
    processors keep the raw extracted values.
    """
    id = Field()
    name = Field(input_processor=Compose(TakeFirst(), unicode.strip))
    price = Field(input_processor=Compose(TakeFirst(), unicode.strip,
                                          remove_comma, float))
    cat = Field()
    avgStars = Field(input_processor=Compose(only_elem_or_default, float))
    nReviews = Field(
        input_processor=Compose(only_elem, unicode.strip, remove_comma, int))
    salesRank = Field(
        input_processor=Compose(unicode.strip, remove_comma, int))
    subCatRank = Field(input_processor=Compose(
        only_elem_or_default, unicode.strip, remove_comma, int))
    subCat = Field(
        input_processor=Compose(only_elem_or_default, unicode.strip))
    manufact = Field(
        input_processor=Compose(only_elem_or_default, unicode.strip))
    referrer = Field()

    @property
    def export_filename(self):
        # Base name of the export file for this item type.
        return 'product'

    @property
    def key(self):
        # Deduplication key: the raw stored 'id' value.
        return self._values['id']
コード例 #10
0
    def parse_item(self, response):
        """Build a product item from a detail page: brand, title, description
        fragments, prices, sizes, reviews and images."""
        loader = self.get_product_item_loader_with_default_values(response)
        # Presumably strips a fixed 14-character prefix from the brand link
        # text — TODO confirm against the site markup.
        loader.brand_in = lambda x: x[0][14:] if x else 'no brand'
        loader.brand_out = TakeFirst()
        loader.description_out = JoinExcludingEmptyValues('\n')
        loader.sale_price_out = TakeFirst()

        reviews = self.parse_review(response)
        loader.add_value('reviews', reviews)

        loader.add_value('url', response.meta['url'])
        loader.add_value('product_number', response.meta['product_number'])
        loader.add_xpath('brand', '//a[@class="brandstore"]/text()')
        loader.add_xpath(
            'title', '//div[@id="divCaption"]/h1[@class="captionText"]/text()')
        loader.add_xpath('description', '//div[@id="divPromosPDetail"]')
        loader.add_xpath('description', '//div[@id="divingredientsPDetail"]')
        loader.add_xpath('original_price', '//span[@class="rowMSRP"]/s/text()')
        loader.add_xpath('sale_price', '//div[@id="productprice"]/span/text()')
        loader.add_xpath(
            'sizes',
            '//div[@id="divCaption"]//span[@class="captionSizeText"]/text()')

        # images
        # Fix: add each image item *inside* the loop.  The original called
        # loader.add_value('images', ...) once after the loop, so only the
        # last image survived, and a page with no image blocks raised
        # NameError on the unbound image_loader.
        for sel in response.xpath('//div[@id="divPImage"]'):
            image_loader = ProductImageLoader(response=response, selector=sel)
            image_loader.add_value('thumbnail', response.meta['thumbnail'])
            image_loader.add_xpath('normal_size', 'a/img/@src')
            image_loader.add_xpath('zoomed', 'a/img/@src')
            loader.add_value('images', image_loader.load_item())

        yield loader.load_item()
コード例 #11
0
    def parse_item(self, response):
        """Build a product item from a detail page.

        Merges values forwarded from the listing page (request meta), then
        extracts prices, sizes, colour swatches, the image set encoded in
        meta['images_data'] and the Bazaarvoice review widget.
        """
        loader = self.get_product_item_loader_with_default_values(response)
        loader.original_price_out = TakeFirst()
        loader.sale_price_out = TakeFirst()

        # Values already extracted on the listing page are carried through
        # the request meta and merged into this item.
        values_from_list = response.meta.get('values_from_list', {})
        for key, value in values_from_list.iteritems():
            loader.add_value(key, value)

        loader.add_xpath('product_number', '//div[@id="swatchContent"]/div[@id="productNumber"]/text()', re='#(.*)')
        loader.add_xpath('title', '//div[@id="productNameText"]/span[@class="productName"]/text()')
        loader.add_xpath('description', '//div[@id="tabWindow"]//text()')
        # Two xpaths per price field: the strike-through original price wins
        # when present, otherwise the plain price text is used.
        loader.add_xpath('original_price', '//div[@id="selectionContent"]/span[@id="priceText"]/strike/text()')
        loader.add_xpath('original_price', '//div[@id="selectionContent"]/span[@id="priceText"]/text()')
        loader.add_xpath('sale_price', '//div[@id="selectionContent"]/span[@id="priceText"]/span[@class="salePrice"]/text()')
        loader.add_xpath('sale_price', '//div[@id="selectionContent"]/span[@id="priceText"]/text()')
        loader.add_xpath('sizes', '//div[@id="productContentRight"]/div[@id="swatchContent"]/div[@id="sizeDimensionSwatchContent"]/div[@id="sizeDimension1SwatchContent"]/div[@id="sizeDimension1Swatches"]/button/text()')
        loader.add_xpath('default_color', '//div[@id="selectionContent"]/span[@id="selectionConfirmText"]/text()')

        # Colour swatches: one sub-item per swatch input element.
        for selector in response.xpath('//div[@id="swatchContent"]/div[@id="colorSwatchContent"]/input'):
            color_loader = ProductColorLoader(response=response, selector=selector)
            color_loader.add_xpath('name', '@alt', re='(.*) product image$')
            color_loader.add_xpath('swatch_image', '@src')
            loader.add_value('colors', color_loader.load_item())

        # Images: the main image uses keys T/P01/Z; alternate views use
        # AV1..AV8 with matching _T / _Z thumbnail and zoom keys.
        images_data = response.meta.get('images_data', {})
        if images_data.get('P01'):
            image_loader = ProductImageLoader(response=response)
            image_loader.add_value('thumbnail', images_data.get('T'))
            image_loader.add_value('normal_size', images_data.get('P01'))
            image_loader.add_value('zoomed', images_data.get('Z'))
            loader.add_value('images', image_loader.load_item())
        num = 1
        while num < 9:
            av_num = 'AV%s' % num
            if images_data.get(av_num):
                image_loader = ProductImageLoader(response=response)
                image_loader.add_value('thumbnail', images_data.get('%s_T' % av_num))
                image_loader.add_value('normal_size', images_data.get(av_num))
                image_loader.add_value('zoomed', images_data.get('%s_Z' % av_num))
                loader.add_value('images', image_loader.load_item())
            num += 1

        # Reviews: one sub-item per entry in the Bazaarvoice review list.
        for selector in response.xpath('//div[@id="BVRRContainer"]//ol[contains(@class,"bv-content-list")]/li[contains(@class,"bv-content-item")]'):
            review_loader = ProductReviewLoader(response=response, selector=selector)
            review_loader.body_out = JoinExcludingEmptyValues('\n')
            review_loader.add_xpath('author', 'div[@class="bv-author-profile"]/div[@class="bv-inline-profile"]/div[@class="bv-author-avatar"]/div[@class="bv-author-avatar-nickname"]/div[@class="bv-content-author-name"]/span/h3/text()')
            review_loader.add_xpath('title', 'div/div[@class="bv-content-container"]//h4[@class="bv-content-title"]/text()')
            review_loader.add_xpath('date', 'div/div[@class="bv-content-container"]//div[@class="bv-content-datetime"]/meta[@itemprop="dateCreated"]/@content',
                                  MapCompose(Date('%Y-%m-%d')))
            review_loader.add_xpath('body', 'div/div[@class="bv-content-container"]//div[contains(@class,"bv-content-summary-body-text")]/p/text()')
            review_loader.add_xpath('max_stars', 'div/div[@class="bv-content-container"]//span[contains(@class,"bv-content-rating")]/meta[@itemprop="bestRating"]/@content')
            review_loader.add_xpath('stars', 'div/div[@class="bv-content-container"]//span[contains(@class,"bv-content-rating")]/meta[@itemprop="ratingValue"]/@content')
            loader.add_value('reviews', review_loader.load_item())

        yield loader.load_item()
コード例 #12
0
ファイル: items.py プロジェクト: KeithYue/QA-spider
class LazyTweetAnswer(Item):
    """An answer scraped from LazyTweet, linked to a question by question_id."""
    question_id = Field(input_processor=MapCompose(lambda x: int(x)),
                        output_processor=TakeFirst())
    # HTML entities removed and whitespace stripped, then fragments joined.
    answer_content = Field(input_processor=MapCompose(remove_entities,
                                                      unicode.strip),
                           output_processor=Join())
    answerer = Field(output_processor=TakeFirst())
    answer_id = Field()
コード例 #13
0
class ZhiHuQ(Item):
    """A Zhihu question: joined, entity-free title/content plus scalar metadata."""
    title = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                  output_processor=Join())
    content = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                    output_processor=Join())
    id = Field(output_processor=TakeFirst())
    user = Field(output_processor=TakeFirst())
    num = Field(output_processor=TakeFirst())
コード例 #14
0
class RakutenItem(Item):
    """A Rakuten handbag listing; every field keeps only the first value."""

    handbag_price = Field(output_processor=TakeFirst())
    handbag_url = Field(output_processor=TakeFirst())
    handbag_brand = Field(output_processor=TakeFirst())
    # NOTE(review): TakeFirst() keeps a single URL here; if this field feeds
    # an images pipeline it may be expected to be a list — confirm.
    handbag_image_urls = Field(output_processor=TakeFirst())
コード例 #15
0
ファイル: whsmithcouk.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Parse a WHSmith product page into a Product item.

        Logs a warning and skips the page when no product title is found.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # (removed unused local `name_xpath` left over from an older layout)
        names = hxs.select('//h1[@id="product_title"]/text()').extract()

        if names and len(names) > 0:
            name = names[0].strip()
        else:
            # product not found. Just continue
            self.log('WARNING: Product not found => %s' % response.url)
            return

        # quantity: None means "in stock, amount unknown"; 0 means out of stock.
        quantity = hxs.select('//p[@id="stock_status"]/text()').extract()
        if quantity and "In Stock" in quantity.pop():
            quantity = None
        else:
            quantity = 0

        category = hxs.select(
            '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()

        brand = hxs.select(
            '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
        ).extract()

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                         TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
        loader.add_xpath(
            'price',
            '//div[@class="product_price"]/span[@class="price"]/text()',
            TakeFirst(),
            re="([.0-9]+)")
        if not loader.get_output_value('price'):
            loader.add_value('price', 0)

        if category:
            loader.add_value('category', category[0].strip())

        # SKU: trailing run of 3+ digits at the end of the product name.
        loader.add_value('sku', name, TakeFirst(), re='(\d\d\d+)\s*$')

        if brand:
            loader.add_value('brand', brand[0].strip())

        identifier = hxs.select('//input[@name="ProductID"]/@value').extract()
        if not identifier:
            identifier = hxs.select('//li[@itemprop="id"]/text()').extract()

        # Fix: guard against IndexError when neither identifier source
        # matched (the original unconditionally indexed identifier[0]).
        if identifier:
            loader.add_value('identifier', identifier[0])
        else:
            self.log('WARNING: Identifier not found => %s' % response.url)

        if quantity == 0:
            loader.add_value('stock', 0)

        yield loader.load_item()
コード例 #16
0
class RPostItemsLoader(ItemLoader):
    """Loader for RpostResultsItem.

    Every field is coerced to unicode and stripped of whitespace; race name
    and time join all extracted fragments, while the remaining fields keep
    only the first value, with field-specific post-processing where needed.
    """
    default_item_class = RpostResultsItem

    # Generic pipeline: first value -> unicode -> stripped.
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)

    # Multi-fragment text fields: concatenate before cleaning.
    racename_out = Compose(Join(), unicode, unicode.strip)
    racetime_out = Compose(Join(), unicode, unicode.strip)

    # Scalar fields with extra domain-specific cleanup steps.
    rpOR_out = Compose(TakeFirst(), unicode, unicode.strip, processOR)
    rpTS_out = Compose(TakeFirst(), unicode, unicode.strip, processTS)
    prizemoney_out = Compose(TakeFirst(), unicode, unicode.strip, toascii)
    rphorseurl_out = Compose(TakeFirst(), unicode, unicode.strip)
コード例 #17
0
ファイル: test_contrib_loader.py プロジェクト: zrbruce/scrapy
    def test_get_css(self):
        """get_css honours processors, the re= keyword and selector lists."""
        loader = TestItemLoader(response=self.response)
        # Plain extraction returns every match as a list.
        self.assertEqual(loader.get_css('p::text'), [u'paragraph'])
        # A processor argument is applied to the extracted list.
        self.assertEqual(loader.get_css('p::text', TakeFirst()), u'paragraph')
        # re= filters each value through the regex before processing.
        self.assertEqual(loader.get_css('p::text', TakeFirst(), re='pa'), u'pa')

        # A list of selectors concatenates their results in order.
        self.assertEqual(loader.get_css(['p::text', 'div::text']),
                         [u'paragraph', 'marta'])
        self.assertEqual(loader.get_css(['a::attr(href)', 'img::attr(src)']),
                         [u'http://www.scrapy.org', u'/images/logo.png'])
コード例 #18
0
class HospitalItem(Item):
    """A hospital record; each field keeps only the first extracted value."""
    _hospitalName = Field(output_processor=TakeFirst(), )
    grade = Field(output_processor=TakeFirst(), )
    # Whitespace-stripped at input time before taking the first value.
    feature = Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=TakeFirst(),
    )
    city = Field(output_processor=TakeFirst(), )
    area = Field(output_processor=TakeFirst(), )
コード例 #19
0
class CommentItemLoader(ItemLoader):
    """Loader for CommentItem: strips every input value; by default keeps the
    first value (stripped again) on output."""
    default_item_class = CommentItem
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = Compose(TakeFirst(), lambda x: x.strip())
    default_selector_class = Selector
    # Multi-fragment text fields are space-joined before stripping.
    textpost_out = Compose(Join(" "), lambda x: x.strip())
    # NOTE(review): the trailing strip assumes get_comments_count /
    # get_upvoted return strings — confirm against their definitions.
    comments_out = Compose(TakeFirst(), get_comments_count, lambda x: x.strip())
    upvoted_out = Compose(TakeFirst(), get_upvoted, lambda x: x.strip())
    comment_out = Compose(Join(" "), lambda x: x.strip())
コード例 #20
0
class ZhiHuA(Item):
    """A Zhihu answer: integer id/score, joined entity-free content."""
    id = Field(input_processor=MapCompose(lambda x: int(x)),
               output_processor=TakeFirst())
    qid = Field(output_processor=TakeFirst())  # parent question id
    asr = Field(output_processor=TakeFirst())  # presumably the answerer — confirm
    content = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                    output_processor=Join())
    score = Field(input_processor=MapCompose(lambda x: int(x)),
                  output_processor=TakeFirst())
コード例 #21
0
ファイル: lockhart.py プロジェクト: oceancloud82/scraping
    def parse_brand_list(self, response):
        """Parse a brand listing page, yielding one Product per grid item.

        Items whose brand could not be derived from the URL are re-requested
        via parse_brand so the detail page can resolve it.
        """
        hxs = HtmlXPathSelector(response)

        # products
        product_items = hxs.select('//div[@class="productGrid"]/ul/li/div[@class="item"]')
        category_items = hxs.select('//h1[@class="categoryLandingPageTitle_heading"]/a/text()').extract()
        category = category_items[0] if category_items else ''
        brand_name = get_brand_from_url(response.url)

        def get_full_image_url(url):
            return get_full_url(response, url)

        for product_item in product_items:

            image_url = product_item.select(u'div[@class="prodimg"]/a/img/@src').extract()
            if image_url:
                image_url = get_full_url(response, image_url[0])

            ploadr = ProductLoader(item=Product(), selector=product_item, response=response)

            ploadr.add_xpath('name',
                             'div[@class="prodname"]/a/text()',
                             TakeFirst(), Compose(unicode.strip))
            ploadr.add_xpath('url', 'div[@class="prodname"]/a/@href',
                             TakeFirst(), Compose(unicode.strip), Compose(get_full_image_url))
            ploadr.add_value('category', category)
            ploadr.add_value('image_url', image_url)

            price = ploadr.get_xpath('div[@class="proddetails"]//div[@class="prodnowprice"]/span/text()',
                                     TakeFirst(), Compose(extract_price))
            price_excl_vat = Decimal(price)

            ploadr.add_value('price', price_excl_vat)

            # Orders under 50 (ex VAT) pay a flat 5.00 shipping charge.
            ploadr.add_value('shipping_cost', Decimal('5.00') if price_excl_vat < 50 else Decimal('0.0'))
            ploadr.add_xpath('sku',
                             'div[@class="proddetails"]//div[@class="proditemcode"]/a/span/following-sibling::text()',
                             TakeFirst(), Compose(unicode.strip))

            ploadr.add_value('identifier', ploadr.get_output_value('sku'))
            stock_info = product_item.select(u'div[@class="proddetails"]/div/div/span[contains(@class, "instock")]/@class').extract()
            buy_button = product_item.select(u'div[@class="proddetails"]/div[@class="prodquickbuy"]/a[@class="primaryBtn"]').extract()

            ploadr.add_value('brand', brand_name)

            ploadr.add_value('stock', 1 if stock_info or buy_button else 0)

            item = ploadr.load_item()

            # Fix: the XPath was absolute ("//div[...]"), which searches the
            # whole document even from a per-product selector, so every
            # product picked up the first item code on the page.  Make it
            # relative to this product's node.
            tmp = ''.join(product_item.select(".//div[@class='proditemcode']//text()").extract())
            item['metadata'] = {'product_code': tmp.split(':')[-1].strip()}

            if not ploadr.get_output_value('brand'):
                yield Request(item['url'], meta={'item': item}, callback=self.parse_brand)
            else:
                yield item
コード例 #22
0
ファイル: test_contrib_loader.py プロジェクト: zrbruce/scrapy
    def test_get_value(self):
        """get_value applies processors and re= to literal values;
        add_value/replace_value feed the field's collected values."""
        loader = NameItemLoader()
        # Processors and the re= keyword work on plain (non-selector) input.
        self.assertEqual(u'FOO', loader.get_value([u'foo', u'bar'], TakeFirst(), unicode.upper))
        self.assertEqual([u'foo', u'bar'], loader.get_value([u'name:foo', u'name:bar'], re=u'name:(.*)$'))
        self.assertEqual(u'foo', loader.get_value([u'name:foo', u'name:bar'], TakeFirst(), re=u'name:(.*)$'))

        # add_value stores the processed result; replace_value overwrites it.
        loader.add_value('name', [u'name:foo', u'name:bar'], TakeFirst(), re=u'name:(.*)$')
        self.assertEqual([u'foo'], loader.get_collected_values('name'))
        loader.replace_value('name', u'name:bar', re=u'name:(.*)$')
        self.assertEqual([u'bar'], loader.get_collected_values('name'))
コード例 #23
0
    def parse(self, response):
        """Yield one ProxyHunterItem per six-column row of the proxy table."""
        row_xpath = '//table[@id="tbl_proxy_list"]//tr[count(td)=6]'
        for row in response.xpath(row_xpath):
            loader = ItemLoader(ProxyHunterItem(), selector=row)
            loader.add_value('prot', 'http')
            # Strip markup and whitespace from the first two table cells.
            loader.add_xpath('ip', 'td[1]', TakeFirst(), remove_tags,
                             unicode.strip)
            loader.add_xpath('port', 'td[2]', TakeFirst(), remove_tags,
                             unicode.strip)
            yield loader.load_item()
コード例 #24
0
 def _set_loader(self, response, xs, item):
     """Attach a fresh XPathItemLoader to self, bound either to the given
     selector (listing page) or to the whole response (detail page)."""
     if xs:
         self.from_detail_page = False
         loader = XPathItemLoader(item=item, selector=xs)
     else:
         # No selector: this is a detail page, so continue filling the item
         # carried through the request meta.
         self.from_detail_page = True
         item = response.request.meta['item']
         loader = XPathItemLoader(item=item, response=response)
     loader.default_output_processor = TakeFirst()
     self.loader = loader
コード例 #25
0
ファイル: items.py プロジェクト: KeithYue/QA-spider
class StackOverflowAnswer(Item):
    """A Stack Overflow answer with its vote count and accepted flag."""
    answer_id = Field(input_processor=MapCompose(lambda x: int(x)),
                      output_processor=TakeFirst())
    # Entity-free, stripped fragments joined into a single body string.
    answer_content = Field(input_processor=MapCompose(remove_entities,
                                                      unicode.strip),
                           output_processor=Join())
    answerer = Field(output_processor=TakeFirst())
    marks = Field(input_processor=MapCompose(lambda x: int(x)),
                  output_processor=TakeFirst())
    is_best_answer = Field(output_processor=TakeFirst())
コード例 #26
0
class YelpItem(Item):
    """A Yelp business listing.

    Text fields are unquoted from markup and whitespace-stripped at input
    time; multi-fragment fields (category, address) are joined, the rest
    keep only the first value.

    NOTE(review): `default=` is not a key Scrapy's loaders consume by
    default — presumably read by project code; confirm.
    """
    source = Field(output_processor=TakeFirst(), )
    source_link = Field(output_processor=TakeFirst(), )
    name = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    rating = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    # All categories kept, comma-joined.
    category = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(','),
    )
    reviews = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    price = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    city = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    # Address fragments are space-joined into one line.
    address = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(),
    )
    owner_website = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    phone = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    longitude_latitude = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    last_crawl = Field()  # crawl timestamp, set by the spider/pipeline
コード例 #27
0
ファイル: test_contrib_loader.py プロジェクト: zrbruce/scrapy
    def test_replace_css_multi_fields(self):
        """With a None field name, a dict-returning processor routes each
        value to the field named by the dict key; replace_css overwrites."""
        loader = TestItemLoader(response=self.response)
        loader.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(loader.get_output_value('name'), [u'Marta'])
        loader.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(loader.get_output_value('name'), [u'Paragraph'])

        loader.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(loader.get_output_value('url'), [u'http://www.scrapy.org'])
        loader.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(loader.get_output_value('url'), [u'/images/logo.png'])
コード例 #28
0
        def _convert(data):
            """Convert extracted data according to the conversion type `t`
            and its options `inf`, both captured from the enclosing scope.

            Lists are first collapsed to their first element (except for the
            'join'/'list' types, which consume the whole list).
            """
            if t not in ['join', 'list'] and isinstance(data, list):
                data = TakeFirst()(data)
                if type(data) in [str, unicode]:
                    data = data.strip()
                elif type(data) in [int, float, datetime]:
                    data = str(data)
                else:
                    # Neither text nor a simple scalar: return unconverted.
                    return data

            if t == 'join':
                # Concatenate list elements with a configurable separator.
                sep = inf.get('sep', u' ')
                return Join(sep)(data)
            elif t == 'list':
                # Join, then strip markup and surrounding whitespace.
                sep = inf.get('sep', u' ')
                return remove_tags(Join(sep)(data)).strip()
            elif t == 'text':
                return remove_tags(data).strip()
            elif t == 'clean':
                # Sanitise HTML: drop styles, scripts, javascript, links, meta.
                cleaner = Cleaner(style=True,
                                  scripts=True,
                                  javascript=True,
                                  links=True,
                                  meta=True)
                return cleaner.clean_html(data)
            elif t == 'unesc':
                # Unescape HTML entities.
                return HTMLParser().unescape(data)
            elif t == 'base64':
                # NOTE: base64.decodestring is deprecated (Python 2 API).
                return base64.decodestring(data)
            elif t == 'sub':
                # Regex substitution driven by the 'from'/'to' options.
                frm = inf.get('from')
                to = inf.get('to')
                return re.sub(frm, to, data)
            elif t == 'jpath':
                # JSONPath query against JSON-encoded data.
                qs = inf.get('query')
                return jsonpath.jsonpath(json.loads(data), qs)
            elif t == 'map':
                # Dictionary lookup with a fallback default.
                m = inf.get('map')
                d = inf.get('default')
                return m.get(data, d)
            elif t == 'int':
                # Via float so strings like "3.0" still parse.
                return int(float(data))
            elif t == 'float':
                return float(data)
            elif t == 'date':
                fmt = inf.get('fmt', 'auto')
                tz = inf.get('tz', '+00:00')
                return parse_date(data, fmt, tz)
            elif t == 'cst':
                # Same as 'date' but pinned to China Standard Time (+08:00).
                fmt = inf.get('fmt', 'auto')
                return parse_date(data, fmt, '+08:00')
            else:
                # Unknown conversion type: pass through unchanged.
                return data
コード例 #29
0
class ActiveDoctorItem(Item):
    """A doctor profile with activity counters; all comments are joined."""
    _name = Field(output_processor=TakeFirst(), )
    hospital = Field(output_processor=TakeFirst(), )
    city = Field(output_processor=TakeFirst(), )
    area = Field(output_processor=TakeFirst(), )
    specialty = Field(output_processor=TakeFirst(), )
    title = Field(output_processor=TakeFirst(), )
    count_ReplyInTwoWeeks = Field(output_processor=TakeFirst(), )
    count_ReplyTotal = Field(output_processor=TakeFirst(), )
    count_Calls = Field(output_processor=TakeFirst(), )
    external_id = Field(output_processor=TakeFirst(), )
    comment = Field(output_processor=Join(), )  # all fragments, space-joined
コード例 #30
0
class ReviewLoader(XPathItemLoader):
    """Loader for review items: dates parsed with a fixed format, ratings
    reduced to a scalar, full text joined into one string."""
    # date_format is passed through MapCompose as a keyword to extract_date.
    date_in = MapCompose(unicode, unicode.strip, extract_date, date_format='%d/%m/%Y')
    date_out = TakeFirst()

    rating_in = MapCompose(unicode, extract_rating)
    rating_out = TakeFirst()

    full_text_in = MapCompose(unicode, unicode.strip, remove_entities)
    full_text_out = Join()

    url_in = MapCompose(unicode, unicode.strip)
    url_out = TakeFirst()
コード例 #31
0
ファイル: utils.py プロジェクト: xiangxiaobaog3/project
        def _convert(data):
            """Convert extracted data according to the conversion type `t`
            and its options `inf`, both captured from the enclosing scope.

            Lists are first collapsed to their first element (except for the
            'join'/'list' types, which consume the whole list).
            """
            if t not in ['join', 'list'] and isinstance(data, list):
                data = TakeFirst()(data)
                if type(data) in [str, unicode]:
                    data = data.strip()
                elif type(data) in [int, float, datetime]:
                    data = str(data)
                else:
                    # Neither text nor a simple scalar: return unconverted.
                    return data

            if t=='join':
                # Concatenate list elements with a configurable separator.
                sep = inf.get('sep', u' ')
                return Join(sep)(data)
            elif t=='list':
                # Join, then strip markup and surrounding whitespace.
                sep = inf.get('sep', u' ')
                return remove_tags(Join(sep)(data)).strip()
            elif t=='text':
                return remove_tags(data).strip()
            elif t=='clean':
                # Sanitise HTML: drop styles, scripts, javascript, links, meta.
                cleaner = Cleaner(style=True, scripts=True, javascript=True, links=True, meta=True)
                return cleaner.clean_html(data)
            elif t=='unesc':
                # Unescape HTML entities.
                return HTMLParser().unescape(data)
            elif t=='base64':
                # NOTE: base64.decodestring is deprecated (Python 2 API).
                return base64.decodestring(data)
            elif t=='sub':
                # Regex substitution driven by the 'from'/'to' options.
                frm = inf.get('from')
                to = inf.get('to')
                return re.sub(frm, to, data)
            elif t=='jpath':
                # JSONPath query against JSON-encoded data.
                qs = inf.get('query')
                return jsonpath.jsonpath(json.loads(data), qs)
            elif t=='map':
                # Dictionary lookup with a fallback default.
                m = inf.get('map')
                d = inf.get('default')
                return m.get(data, d)
            elif t=='int':
                # Via float so strings like "3.0" still parse.
                return int(float(data))
            elif t=='float':
                return float(data)
            elif t=='date':
                fmt = inf.get('fmt', 'auto')
                tz = inf.get('tz', '+00:00')
                return parse_date(data, fmt, tz)
            elif t=='cst':
                # Same as 'date' but pinned to China Standard Time (+08:00).
                fmt = inf.get('fmt', 'auto')
                return parse_date(data, fmt, '+08:00')
            else:
                # Unknown conversion type: pass through unchanged.
                return data