Example No. 1
        def _convert(data):
            if t not in ['join', 'list'] and isinstance(data, list):
                data = TakeFirst()(data)
                if type(data) in [str, unicode]:
                    data = data.strip()
                elif type(data) in [int, float, datetime]:
                    data = str(data)
                else:
                    return data

            if t=='join':
                sep = inf.get('sep', u' ')
                return Join(sep)(data)
            elif t=='list':
                sep = inf.get('sep', u' ')
                return remove_tags(Join(sep)(data)).strip()
            elif t=='text':
                return remove_tags(data).strip()
            elif t=='clean':
                cleaner = Cleaner(style=True, scripts=True, javascript=True, links=True, meta=True)
                return cleaner.clean_html(data)
            elif t=='unesc':
                return HTMLParser().unescape(data)
            elif t=='base64':
                return base64.decodestring(data)
            elif t=='sub':
                frm = inf.get('from')
                to = inf.get('to')
                return re.sub(frm, to, data)
            elif t=='jpath':
                qs = inf.get('query')
                return jsonpath.jsonpath(json.loads(data), qs)
            elif t=='map':
                m = inf.get('map')
                d = inf.get('default')
                return m.get(data, d)
            elif t=='int':
                return int(float(data))
            elif t=='float':
                return float(data)
            elif t=='date':
                fmt = inf.get('fmt', 'auto')
                tz = inf.get('tz', '+00:00')
                return parse_date(data, fmt, tz)
            elif t=='cst':
                fmt = inf.get('fmt', 'auto')
                return parse_date(data, fmt, '+08:00')
            else:
                return data
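The snippet above is Python 2-era code (note unicode, HTMLParser().unescape and base64.decodestring) and closes over t (the conversion type) and inf (the per-field config dict) from an enclosing scope that is not shown. A minimal sketch of that enclosing scope, using hypothetical names, might look like this:

        # Hypothetical enclosing scope (an assumption, not the original project's code):
        # each field carries a config dict `inf` whose 'type' key selects the branch
        # taken inside _convert for that field.
        for name, inf in field_config.items():
            t = inf.get('type', 'text')
            item[name] = _convert(extracted_values.get(name))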
Example No. 2
class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader
    default_output_processor = TakeFirst()
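Setting default_output_processor = TakeFirst() makes every field of the loaded item a single value instead of a list. A hedged usage sketch inside a spider callback (ArticleItem and the selectors below are assumptions):

    def parse(self, response):
        loader = ArticleItemLoader(item=ArticleItem(), response=response)
        loader.add_xpath('title', '//h1/text()')  # values are collected as lists
        loader.add_value('url', response.url)
        yield loader.load_item()  # TakeFirst keeps the first non-empty value of each field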
Example No. 3
class UfItemLoader(ItemLoader):
    default_item_class = UfItem
    default_output_processor = TakeFirst()
Example No. 4
class CtripItermLoader(ItemLoader):
    default_output_processor = TakeFirst()
Example No. 5
class LagouRedisItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    tags_out = Identity()
Example No. 6
class AticDataItem(ItemLoader):

    default_output_processor = TakeFirst()
Example No. 7
class StrandbooksscraperItem(scrapy.Item):

    organization = scrapy.Field(output_processor=TakeFirst(),
                                )  #rich text format - no special chars
    title = scrapy.Field(
        input_processor=Compose(TakeFirst(), name_filter),
        output_processor=TakeFirst(),
    )  #rich text format - no special chars
    description = scrapy.Field(output_processor=TakeFirst(), )
    eventWebsite = scrapy.Field(
        output_processor=TakeFirst(),
    )  #full link! Hard-code http://.... if missing!
    street = scrapy.Field(output_processor=TakeFirst(),
                          )  #rich text format - no special chars
    city = scrapy.Field(output_processor=TakeFirst(),
                        )  #rich text format - no special chars
    state = scrapy.Field(output_processor=TakeFirst(),
                         )  #rich text format - no special chars
    zip = scrapy.Field(output_processor=TakeFirst(),
                       )  #numerical format required: xxxxx
    dateFrom = scrapy.Field(
        input_processor=Compose(TakeFirst(), date_converter),
        output_processor=TakeFirst(),
    )  # Only acceptable format is dd/mm/yyyy !! - ex: 19/12/2017
    startTime = scrapy.Field(
        input_processor=Compose(TakeFirst(), time_converter),
        output_processor=TakeFirst(),
    )  # Only acceptable format is hh:mm am/pm !! - ex: 07:45 pm
    In_group_id = scrapy.Field(output_processor=Compose(lambda v: v[0]),
                               )  # should be empty! will code that later
    ticketUrl = scrapy.Field(output_processor=TakeFirst(),
                             )  #full link! Hard-code http://.... if missing!
    eventImage = scrapy.Field(
        output_processor=TakeFirst(),
    )  #full link! Hard-code http://.... if missing! Leave empty if event image is missing!
    dateTo = scrapy.Field(
        input_processor=Compose(TakeFirst(), date_converter),
        output_processor=TakeFirst(),
    )  #(REQUIRED FORMAT: dd/mm/yyyy)
    endTime = scrapy.Field(
        input_processor=Compose(TakeFirst(), time_converter),
        output_processor=TakeFirst(),
    )  #(REQUIRED FORMAT: hh:mm am/pm)
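The item above relies on name_filter, date_converter and time_converter helpers that are not shown. A minimal sketch of the two converters, assuming free-form date/time strings as input (these implementations are assumptions, not the project's actual code):

from dateutil import parser as dateparser

def date_converter(value):
    # Normalise to the required dd/mm/yyyy format, e.g. 19/12/2017.
    return dateparser.parse(value).strftime('%d/%m/%Y')

def time_converter(value):
    # Normalise to the required hh:mm am/pm format, e.g. 07:45 pm.
    return dateparser.parse(value).strftime('%I:%M %p').lower()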
Example No. 8
class EwgScraperProduct(scrapy.Item):
    # Define the fields for Products
    url = scrapy.Field(output_processor=TakeFirst())
    product_id = scrapy.Field(output_processor=TakeFirst())
    product_name = scrapy.Field(output_processor=TakeFirst())
    product_score = scrapy.Field(output_processor=TakeFirst())
    product_type = scrapy.Field(output_processor=TakeFirst())
    data_availability = scrapy.Field(output_processor=TakeFirst())
    overall_hazard_score = scrapy.Field(output_processor=TakeFirst())
    cancer_score = scrapy.Field(output_processor=TakeFirst())
    dev_reprod_tox_score = scrapy.Field(output_processor=TakeFirst())
    allergy_imm_tox_score = scrapy.Field(output_processor=TakeFirst())
    use_restrict_score = scrapy.Field(output_processor=TakeFirst())
    ingredient_list = scrapy.Field(output_processor=Identity())
Example No. 9
class MoreTicketsEventLoader(ItemLoader):
    default_output_processor = TakeFirst()
    id_in = MapCompose(lambda x: re.sub(r'/content/', '', x))
    desc_in = MapCompose(lambda x: re.sub(r'\n', '', x))
    url_in = MapCompose(lambda x: 'https://www.moretickets.com' + x)
Example No. 10
class PiaoNiuEventLoader(ItemLoader):
    default_output_processor = TakeFirst()
    id_in = MapCompose(lambda x: re.sub(r'\D', '', x))
    desc_in = MapCompose(lambda x: re.sub(r'\n', '', x))
    url_in = MapCompose(lambda x: 'https:' + x)
Example No. 11
class Company(Item):
    report_id = Field(output_processor=TakeFirst())
    company_name = Field(output_processor=TakeFirst())
    stock = Field(output_processor=TakeFirst())
    company_participants = Field(output_processor=TakeFirst())
    external_participants = Field(output_processor=TakeFirst())
    published_quarter = Field(output_processor=TakeFirst())
    article_url = Field(output_processor=TakeFirst())
    date_published = Field(output_processor=TakeFirst())
    earning_call_talk = Field(output_processor=TakeFirst())
    question_answers = Field(output_processor=TakeFirst())
    article_title = Field(output_processor=TakeFirst())
    audio_call_url = Field(output_processor=TakeFirst())
Example No. 12
class HrTencentItem(ItemLoader):
    default_output_processor = TakeFirst()
Example No. 13
class ProductoFybeca(scrapy.Item):
    titulo = scrapy.Field()
    imagen = scrapy.Field(input_processor=MapCompose(transformar_url_imagen),
                          output_processor=TakeFirst())
Example No. 14
def price_field():
    return scrapy.Field(input_processor=MapCompose(
        lambda value: value.replace('$', '')
        if type(value) == str else value, DataUtils.remove_html, float),
                        output_processor=TakeFirst())
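A hedged usage sketch: the factory can be reused for every price-like field so that all of them share the same cleaning pipeline (ProductItem and its field names are assumptions):

class ProductItem(scrapy.Item):
    price = price_field()
    original_price = price_field()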
Example No. 15
    def parse(self, response):

        #All data must be extracted using XPATH queries
        #This path should return a list of blocks of HTML code that contain the information about the listings
        items = response.xpath("//article[contains(@class,'property-row')]")
        for item in items:
            l = ItemLoader(item=RentslamItem(), response=response)

            #All data must be extracted using XPATH queries
            image_url = item.xpath('.//img/@src').extract_first()
            url = item.xpath('.//a/@href').extract_first()
            price = item.xpath(
                './/span[contains(@class,"property-row-meta-item-price")]/strong/text()'
            ).extract_first()
            bedrooms = item.xpath(
                './/span[contains(@class,"property-row-meta-item-beds")]/strong/text()'
            ).extract_first()
            size = item.xpath(
                './/span[contains(@class,"property-row-meta-item-area")]/strong/text()'
            ).extract_first()
            address = item.xpath('.//h2/a/text()').extract_first()
            text = item.xpath(
                './/div[@class="property-row-body"]/p/text()').extract_first()
            city = item.xpath('.//div[@class="property-row-location"]/a/text()'
                              ).extract_first()

            #In this example there is no furnishing info, so it can be left empty
            #furnishing = item.xpath('').extract_first()

            #Full url. Only the first image is required
            l.add_value('ImageUrl', image_url)

            #Full url
            l.add_value('Url', url)

            #Price must not include currency symbol, dot or comma. Decimals must be filtered out. Example: € 1.348,77 --> 1348
            l.add_value('Price', price, Join(''), re=r'\d+')

            #Number
            l.add_value('Bedrooms', bedrooms)

            #Size must include only the number. Things like "m2" must be filtered out. Example: 90 m2 --> 90
            l.add_value('Size', size, TakeFirst(), re=r'\d+')
            #The address must contain the street name and the house number (if it is present). It must not contain the city name or the postcode
            l.add_value('Address', address)

            #This is the description of the listing
            l.add_value('Text', text)

            #You can copy the email address from the website here
            l.add_value('ContactEmailAddress', '*****@*****.**')

            #You can copy the phone number from the website here
            l.add_value('ContactPhoneNumber', '085 - 273 67 30')

            #In this example there is no furnishing info, so it can be left empty
            #l.add_value('Furnishing', furnishing)

            #Name of the city. Sometimes it can have a literal value, like "Amsterdam", if the website only contains listings from Amsterdam.
            l.add_value('City', city)

            yield l.load_item()
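A hedged sketch of the item definition the loader above assumes; the actual RentslamItem is not shown in this example:

class RentslamItem(scrapy.Item):
    ImageUrl = scrapy.Field()
    Url = scrapy.Field()
    Price = scrapy.Field()
    Bedrooms = scrapy.Field()
    Size = scrapy.Field()
    Address = scrapy.Field()
    Text = scrapy.Field()
    ContactEmailAddress = scrapy.Field()
    ContactPhoneNumber = scrapy.Field()
    Furnishing = scrapy.Field()
    City = scrapy.Field()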
Example No. 16
class ArticleItemLoader(ItemLoader):
    """
        Custom ItemLoader
    """
    # Specify an output_processor for every field
    default_output_processor = TakeFirst()
Example No. 17
class AuthorLoader(ItemLoader):
    default_output_processor = TakeFirst()
Example No. 18
class FirmwareLoader(ItemLoader):

    @staticmethod
    def find_product(text):
        match = re.search(r"(?:model[:. #]*([\w-][\w.-]+))", " ".join(
            text).replace(u"\xa0", " ").strip(), flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_version(text):
        match = re.search(r"(?:version[:. ]*([\w-][\w.-]+)|ve?r?s?i?o?n?[:. ]*([\d-][\w.-]+))",
                          " ".join(text).replace(u"\xa0", " ").strip(), flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_build(text):
        match = re.search(r"(?:build[:. ]*([\w-][\w.-]+)|bu?i?l?d?[:. ]*([\d-][\w.-]+))",
                          " ".join(text).replace(u"\xa0", " ").strip(), flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_version_period(text):
        match = re.search(r"((?:[0-9])(?:[\w-]*\.[\w-]*)+)",
                          " ".join(text).replace(u"\xa0", " ").strip())
        return next((x for x in match.groups() if x and "192.168." not in x.lower()), None) if match else None

    def find_date(self, text):
        for fmt in self.context.get("date_fmt", []):
            fmt = "(" + re.escape(fmt).replace("\%b", "[a-zA-Z]{3}").replace("\%B", "[a-zA-Z]+").replace(
                "\%m", "\d{1,2}").replace("\%d", "\d{1,2}").replace("\%y", "\d{2}").replace("\%Y", "\d{4}") + ")"
            match = re.search(fmt, "".join(text).strip())
            res = filter(lambda x: x, match.groups()) if match else None

            if res:
                return next(res)
        return None

    def clean(s):
        return "".join(filter(lambda x: x in string.printable, s)).replace("\r", "").replace("\n", "").replace(u"\xa0", " ").strip()

    def fix_url(url, loader_context):
        if not urlparse(url).netloc:
            return urljoin(loader_context.get("response").url, url)
        return url

    def parse_date(date, loader_context):
        for fmt in loader_context.get("date_fmt", []):
            try:
                return datetime.datetime.strptime(date, fmt)
            except ValueError:
                pass
        return None

    def remove_html(s):
        return re.sub(r"<[a-zA-Z0-9\"/=: ]+>", "", s)

    default_output_processor = TakeFirst()

    product_in = MapCompose(clean)
    vendor_in = Identity()

    description_in = MapCompose(remove_html, clean)
    version_in = MapCompose(clean)
    build_in = MapCompose(clean)
    date_in = MapCompose(clean, parse_date)

    mib_in = MapCompose(fix_url)
    sdk_in = MapCompose(fix_url)
    url_in = MapCompose(fix_url)
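The parse_date input processor reads date_fmt from the loader context, so the spider has to pass it in when creating the loader. A hedged sketch (FirmwareItem and the format list are assumptions):

loader = FirmwareLoader(item=FirmwareItem(), response=response,
                        date_fmt=["%m/%d/%Y", "%Y-%m-%d", "%B %d, %Y"])
loader.add_xpath("date", "//td[@class='date']/text()")
item = loader.load_item()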
Example No. 19
class Form(scrapy.Item):
    url = scrapy.Field()
    action = scrapy.Field(output_processor=TakeFirst())
    inputs = scrapy.Field()
Example No. 20
class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()  # TakeFirst takes the first non-empty element
Example No. 21
class InputLoader(ItemLoader):
    default_item_class = Input
    default_output_processor = TakeFirst()
Example No. 22
class JokeItem(scrapy.Item):
    # define the fields for your item here like:
    joke_text = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_whitespace),
        output_processor=TakeFirst()
    )
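remove_tags is available from w3lib.html; remove_whitespace is not shown in this example. A minimal sketch of what it might do (an assumption, not the project's actual helper):

import re

def remove_whitespace(value):
    # Collapse internal runs of whitespace and strip the ends; MapCompose applies this
    # to each extracted string before TakeFirst picks the first non-empty result.
    return re.sub(r'\s+', ' ', value).strip()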
Example No. 23
class HexunFundDetailItemLoader(ItemLoader):
    # Custom ItemLoader
    default_output_processor = TakeFirst()
Example No. 24
class YunqiItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
Example No. 25
    def parse_course(self, response):
        l = ItemLoader(item=ConcordiaCourseItem(), response=response)
        l.default_output_processor = TakeFirst()

        l.add_value('institution_name', 'Concordia University')
        l.add_xpath('course_code',
                    '//div[@class="container"]//div[@class="ccode"]/text()')
        l.add_xpath('course_name', '//section[@id]//h1/text()')
        l.add_value('url', response.url)
        l.add_value('faculty', 'School of Continuing Studies')
        l.add_xpath(
            'description',
            '//div[@class="container"]//span[@class="xlarge-text"]/div[@class="ccode"]/following-sibling::text()[normalize-space()]'
        )
        l.add_value('location', '')
        l.add_value('subject', '')

        # get all blocks of course data
        course_data = response.xpath(
            '//div[@class="course-section xlarge-text"]').getall()

        # get all prices
        prices = [re.search(r'\$([^\s]+)', block) for block in course_data]
        prices = [price.group(1) if price else '0.0' for price in prices]
        l.add_value('price', prices)

        # Get all days
        days_in_blocks = [
            re.findall(r'([\w ]+) +\(', block) for block in course_data
        ]
        l.add_value('days', days_in_blocks)

        l.add_value('program', response.meta['program'])

        # # Get all courses time intervals
        time_in_blocks = [
            re.findall(r'\d{1,2}:\d{1,2}', block) for block in course_data
        ]
        l.add_value('duration_hours', time_in_blocks)

        l.add_value('duration_days_week', l.get_collected_values('days'))

        l.add_xpath('duration_months', '//h3[@class="date burgundy"]/text()')
        l.add_value('duration_as_string', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            l.get_collected_values('duration_months'),
        ])

        hours_site = re.search(r'Duration[^\d]+(\d+)', course_data[0])
        if hours_site:
            hours_site = hours_site.group(1)
        else:
            hours_site = 0
        l.add_value('total_hours', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            hours_site,
        ])

        l.add_value('delivery_types', l.get_collected_values('duration_hours'))

        yield l.load_item()
Example No. 26
class SszeLoaderItem(ItemLoader):
    '''
    Custom item loader: take the first value from each field's value list
    '''
    default_output_processor = TakeFirst()
Example No. 27
class RiLab01Loader(ItemLoader):

    default_output_processor = TakeFirst()
    text_out = Join()
Example No. 28
class LagouJobItemLoader(ItemLoader):
    # Custom ItemLoader
    default_output_processor = TakeFirst()
Example No. 29
class RiLab01CommentLoader(ItemLoader):

    default_output_processor = TakeFirst()
Example No. 30
class NewLoader(ItemLoader):
    """重写item loader,默认取第一个"""
    default_output_processor = TakeFirst()
Example No. 31
class MyLoader(ItemLoader):
    # Custom ItemLoader
    default_output_processor = TakeFirst()