Example #1
    def parse(self, response):

        # Let's instantiate a loader for one of our horse race items.
        itemLoader = ItemLoader(item=HorseRaceItem(), response=response)

        # First, record the housekeeping data for this web spider scrape. It must be
        # collected only once, i.e. in the first asynchronous callback to run (Scrapy
        # uses Twisted!), so store the values in a class attribute where every parsed
        # item can use them.
        if not self.housekeeping:
            self.housekeeping["url"] = response.url
            self.housekeeping["project"] = self.settings.get("BOT_NAME")
            self.housekeeping["spider"] = self.name
            self.housekeeping["server"] = socket.gethostname()
            self.housekeeping["date"] = (datetime.now().replace(
                tzinfo=timezone.utc).strftime("%d %B %Y %H:%M:%S"))
            # self.logger.info(
            #     f"*** The housekeeping attributes are now data filled with: {self.housekeeping} ***")

        # Set housekeeping attributes in the HorseRaceItem
        itemLoader.add_value('url', response.url)
        itemLoader.add_value('project', self.settings.get("BOT_NAME"))
        itemLoader.add_value('spider', self.name)
        itemLoader.add_value('server', socket.gethostname())
        itemLoader.add_value(
            'date',
            datetime.now(timezone.utc).strftime("%d %B %Y %H:%M:%S"))

        # XPaths for race details
        itemLoader.add_xpath(
            'race_time',
            '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/h1/b/text()'
        )
        itemLoader.add_xpath(
            'race_date_and_place',
            '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/h1/text()',
            MapCompose(str.strip), lambda i: i[1])
        itemLoader.add_xpath(
            'race_class',
            '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/p[2]/text()',
            MapCompose(str.strip), lambda i: i[0])
        itemLoader.add_xpath(
            'race_start_time',
            '//div[@class="card-footer__content"]/div/span/span/text()',
            lambda i: i[0])
        itemLoader.add_xpath(
            'race_winning_time',
            '//div[@class="card-footer__content"]/div/span/span[2]/text()',
            lambda i: i[0])

        # XPaths for horse and rider details
        itemLoader.add_xpath(
            'position',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-no-draw"]//span/text()'
        )
        itemLoader.add_xpath('horse_url', '//div[@class="horse"]//img/@src')
        itemLoader.add_xpath(
            'race_or',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div[4]/span[1]/text()'
        )
        itemLoader.add_xpath('horse_colour',
                             '//div[@class="horse"]//img/@title')
        itemLoader.add_xpath(
            'raced_description',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/p/span/text()',
            MapCompose(str.strip))
        itemLoader.add_xpath(
            'horse_name',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div/div/h2/a/text()',
            MapCompose(str.strip, lambda i: i if (len(i) >= 1) else None))
        itemLoader.add_xpath(
            'dst_btn',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--form text-align--center"]/text()',
            MapCompose(lambda i: i.replace('\r', '').replace('\n', ''),
                       str.strip))
        itemLoader.add_xpath(
            'race_ods',
            '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div[2]/text()',
            MapCompose(
                lambda i: i.replace('\r', '').replace('\n', '').replace(
                    ' ', ''), str.strip))
        itemLoader.add_xpath(
            'race_age_weight',
            '//*[@id="tab-full-result"]/div/div/div/div[2]//div/div/div[4]/div/div[3]/text()',
            MapCompose(str.split))

        #return itemLoader.load_item()
        return self.parse_item(itemLoader)
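
A minimal sketch of the spider context this snippet assumes (the class name, start URL, and item import path are hypothetical; the snippet itself only requires that `housekeeping` start out falsy so the guard runs once):

import socket
from datetime import datetime, timezone

import scrapy
from itemloaders.processors import MapCompose  # older Scrapy: scrapy.loader.processors
from scrapy.loader import ItemLoader

from ..items import HorseRaceItem  # hypothetical project import


class HorseRaceSpider(scrapy.Spider):
    name = "horserace"                            # hypothetical
    start_urls = ["https://example.com/results"]  # hypothetical
    housekeeping = {}  # class attribute: empty (falsy) until the first parse() fills it

    # def parse(self, response): ... (as above)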
Example #2
class TestItemLoader(NameItemLoader):
    name_in = MapCompose(lambda v: v.title())
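
A quick usage sketch (the empty dict stands in for whatever item NameItemLoader targets; with no `name_out` declared, the default Identity output processor returns a list):

loader = TestItemLoader(item={})
loader.add_value('name', 'john smith')
loader.get_output_value('name')  # ['John Smith']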
Example #3
class ProgramaSemanaItems(scrapy.Item):

    semana_referencia = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas), output_processor=TakeFirst())
    leitura_semana = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas), output_processor=TakeFirst())
    cantico_inicial = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas), output_processor=TakeFirst())
    
    # Treasures From God's Word ("Tesouros da Palavra de Deus")
    tpd_titulo = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_titulo_de_descricao_com_duracao), output_processor=TakeFirst())
    tpd_href = scrapy.Field(input_processor=MapCompose(remove_tags, add_jw_prefix_to_href), output_processor=TakeFirst())
    tpd_duracao = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_duracao_de_descricao_com_duracao), output_processor=TakeFirst())

    tpd_joias_titulo = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_titulo_de_descricao_com_duracao), output_processor=TakeFirst())
    tpd_joias_duracao = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_duracao_de_descricao_com_duracao), output_processor=TakeFirst())
    tpd_joias_descricao = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_lista_filha_as_string), output_processor=TakeFirst())

    tpd_leitura_duracao = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_duracao_de_descricao_com_duracao_mais_texto_base), output_processor=TakeFirst())
    tpd_leitura_titulo = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_titulo_de_descricao_com_duracao), output_processor=TakeFirst())
    tpd_leitura_texto_base = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas), output_processor=TakeFirst())
    tpd_leitura_href = scrapy.Field(input_processor=MapCompose(remove_tags, add_jw_prefix_to_href), output_processor=TakeFirst())
    tpd_leitura_licao_melhore_titulo = scrapy.Field(input_processor=MapCompose(remove_tags, remover_quebra_linhas, get_titulo_de_descricao_com_duracao), output_processor=TakeFirst())
    tpd_leitura_licao_melhore_href = scrapy.Field(input_processor=MapCompose(remove_tags, add_jw_prefix_to_href), output_processor=TakeFirst())

    # Do your best in the ministry ("Faça seu melhor no ministério")
    fmm_design_1_titulo = scrapy.Field()
    fmm_design_1_duracao = scrapy.Field()
    fmm_design_1_descricao = scrapy.Field()
    fmm_design_1_tipo = scrapy.Field()
    fmm_design_1_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_1_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_1_tipo_apresentacao_licao_melhore_href = scrapy.Field()

    fmm_design_2_titulo = scrapy.Field()
    fmm_design_2_duracao = scrapy.Field()
    fmm_design_2_descricao = scrapy.Field()
    fmm_design_2_tipo = scrapy.Field()
    fmm_design_2_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_2_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_2_tipo_apresentacao_licao_melhore_href = scrapy.Field()

    fmm_design_3_titulo = scrapy.Field()
    fmm_design_3_duracao = scrapy.Field()
    fmm_design_3_descricao = scrapy.Field()
    fmm_design_3_tipo = scrapy.Field()
    fmm_design_3_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_3_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_3_tipo_apresentacao_licao_melhore_href = scrapy.Field()

    fmm_design_4_titulo = scrapy.Field()
    fmm_design_4_duracao = scrapy.Field()
    fmm_design_4_descricao = scrapy.Field()
    fmm_design_4_tipo = scrapy.Field()
    fmm_design_4_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_4_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_4_tipo_apresentacao_licao_melhore_href = scrapy.Field()
    
    # Our Christian Life ("Nossa vida cristã")
    cantico_transicao = scrapy.Field()

    nvc_design_1_titulo = scrapy.Field()
    nvc_design_1_duracao = scrapy.Field()
    nvc_design_1_video_ou_materia_href = scrapy.Field()
    nvc_design_1_descricao = scrapy.Field()

    nvc_design_2_titulo = scrapy.Field()
    nvc_design_2_duracao = scrapy.Field()
    nvc_design_2_video_ou_materia_href = scrapy.Field()
    nvc_design_2_descricao = scrapy.Field()

    nvc_design_3_titulo = scrapy.Field()
    nvc_design_3_duracao = scrapy.Field()
    nvc_design_3_video_ou_materia_href = scrapy.Field()
    nvc_design_3_descricao = scrapy.Field()

    nvc_design_4_titulo = scrapy.Field()
    nvc_design_4_duracao = scrapy.Field()
    nvc_design_4_video_ou_materia_href = scrapy.Field()
    nvc_design_4_descricao = scrapy.Field()
    
Example #4
class RekonItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    awbRekon = Field()
    mpCode = Field()
    periodeRekon = Field()
    tahunRekon = Field()
    awbKendali = Field()
    jenisLayanan = Field()
    tanggalKirim = Field()
    isiKiriman = Field(input_processor=MapCompose(removeSemiColon), output_processor=TakeFirst())
    berat = Field(input_processor=MapCompose(removeGr), output_processor=TakeFirst())
    jenisKiriman = Field()
    beaDasar = Field(input_processor=MapCompose(removePoint), output_processor=TakeFirst())
    nilaiBarang = Field(input_processor=MapCompose(removePoint), output_processor=TakeFirst())
    htnb = Field(input_processor=MapCompose(removePoint), output_processor=TakeFirst())
    pengirim = Field(input_processor=MapCompose(getPengirimPenerima), output_processor=TakeFirst())
    kotaPengirim = Field(input_processor=MapCompose(getKotaPengirimPenerima), output_processor=TakeFirst())
    kodePosPengirim = Field(input_processor=MapCompose(getKodePos), output_processor=TakeFirst())
    penerima = Field(input_processor=MapCompose(getPengirimPenerima), output_processor=TakeFirst())
    kotaPenerima = Field(input_processor=MapCompose(getKotaPengirimPenerima), output_processor=TakeFirst())
    kodePosPenerima = Field(input_processor=MapCompose(getKodePos), output_processor=TakeFirst())
    statusRekon = Field()
    ketRekon = Field()
    beaTotal = Field()
    nopendKantorKirim = Field(input_processor=MapCompose(getNopendKantor), output_processor=TakeFirst())
    kantorKirim = Field(input_processor=MapCompose(getNamaKantor), output_processor=TakeFirst())
    tanggalPosting = Field()
    statusAkhir = Field()
    kantorAkhir = Field(input_processor=MapCompose(getNamaKantor), output_processor=TakeFirst())
    ketStatusAkhir = Field()
    nopendKantorAkhir = Field(input_processor=MapCompose(getNopendKantor), output_processor=TakeFirst())
    tanggalStatusAkhir = Field()
    statusAntar = Field()
    ketStatusAntar = Field()
    penerimaKiriman = Field()
    waktuUpdateStatus = Field()
    #listProsesAntar = Field()    

Example #5
class InvestorTradingItem(scrapy.Item):
    date = scrapy.Field(input_processor=MapCompose(str.strip, ItemParser.p_date))  # data date
    code = scrapy.Field()  # security code
    name = scrapy.Field()  # security name

    foreign_buy = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign investors (excl. foreign dealers): shares bought
    foreign_sell = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign investors (excl. foreign dealers): shares sold
    foreign_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign investors (excl. foreign dealers): net shares

    foreign_dealer_buy = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign dealers: shares bought
    foreign_dealer_sell = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign dealers: shares sold
    foreign_dealer_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # foreign dealers: net shares

    trust_buy = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # investment trusts: shares bought
    trust_sell = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # investment trusts: shares sold
    trust_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # investment trusts: net shares

    dealer_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers: net shares
    native_dealer_buy = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (proprietary): shares bought
    native_dealer_sell = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (proprietary): shares sold
    native_dealer_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (proprietary): net shares
    native_dealer_hedge_buy = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (hedging): shares bought
    native_dealer_hedge_sell = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (hedging): shares sold
    native_dealer_hedge_net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # dealers (hedging): net shares

    net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))  # three major institutional investors: total net shares

    class Meta:
        name = 'twse_investor_trading'
        fields = [
            'code', 'name', 'foreign_buy', 'foreign_sell', 'foreign_net',
            'foreign_dealer_buy', 'foreign_dealer_sell', 'foreign_dealer_net',
            'trust_buy', 'trust_sell', 'trust_net', 'dealer_net',
            'native_dealer_buy', 'native_dealer_sell', 'native_dealer_net',
            'native_dealer_hedge_buy', 'native_dealer_hedge_sell', 'native_dealer_hedge_net', 'net'
        ]
Example #6
class TestItemLoader(ItemLoader):
    name_in = MapCompose(float)
Example #7
def parse(self, response):
    # parse response to get author list
    urls = MapCompose(lambda i: str.replace(i, "users", "u"))(
        response.css('div.wrap > a::attr(href)').getall())
    yield from response.follow_all(urls, callback=self.parse_author)
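
The trick here is that a MapCompose instance is itself a plain callable: passed a list, it applies its functions to every element and returns the processed list. A standalone illustration:

from itemloaders.processors import MapCompose

shorten = MapCompose(lambda i: str.replace(i, "users", "u"))
shorten(["/users/alice", "/users/bob"])  # -> ['/u/alice', '/u/bob']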
Example #8
class City(scrapy.Item):
    country = scrapy.Field()
    city = scrapy.Field(input_processor=MapCompose(get_country),
                        output_processor=TakeFirst())
Example #9
class Recipe(Item):
    ingredients = Field(input_processor=MapCompose(remove_tags, str.strip))
    directions = Field(input_processor=MapCompose(remove_tags, str.strip))
Example #10
class PracujItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    kat_mias = Field(input_processor=MapCompose(remove_n))
    liczba = Field()
Example #11
class MyautoItemLoader(ItemLoader):
    default_item_class = MyautoItem
    default_input_processor = MapCompose(str.strip, str.upper)
    default_output_processor = TakeFirst()

    price_in = RemoveComma()
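
RemoveComma is not one of the stock itemloaders processors, so it is presumably defined in the project. A plausible minimal sketch (hypothetical; an input processor receives the list of extracted values and returns the processed list):

class RemoveComma:
    """Drop thousands separators, e.g. '1,500' -> '1500'."""

    def __call__(self, values):
        return [v.replace(',', '') for v in values]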
Example #12
class TestItemLoader(ItemLoader):
    default_item_class = TestItem
    name_in = MapCompose(float)
Example #13
class BookItem(scrapy.Item):
    # Scalars
    url = Field()

    title = Field(input_processor=MapCompose(str.strip))
    author = Field(input_processor=MapCompose(str.strip))

    num_ratings = Field(input_processor=MapCompose(str.strip, int))
    num_reviews = Field(input_processor=MapCompose(str.strip, int))
    avg_rating = Field(input_processor=MapCompose(str.strip, float))
    num_pages = Field(
        input_processor=MapCompose(str.strip, num_page_extractor, int))

    language = Field(input_processor=MapCompose(str.strip))
    publish_date = Field(input_processor=extract_publish_dates)

    original_publish_year = Field(
        input_processor=MapCompose(extract_year, int))

    isbn = Field(input_processor=MapCompose(str.strip, isbn_filter))
    isbn13 = Field(input_processor=MapCompose(str.strip, isbn13_filter))
    asin = Field(input_processor=MapCompose(filter_asin))

    series = Field()

    # Lists
    awards = Field(output_processor=Identity())
    places = Field(output_processor=Identity())
    characters = Field(output_processor=Identity())
    genres = Field(output_processor=Compose(set, list))

    # Dicts
    rating_histogram = Field(input_processor=MapCompose(extract_ratings))
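
Processors declared in Field metadata like this are picked up automatically when the item is loaded through an ItemLoader; a minimal usage sketch (the selectors are hypothetical):

from scrapy.loader import ItemLoader

loader = ItemLoader(item=BookItem(), response=response)
loader.add_css('title', 'h1#bookTitle::text')    # hypothetical selector
loader.add_css('num_ratings', 'a.votes::text')   # runs str.strip, then int, per the Field above
book = loader.load_item()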
Example #14
class NikeItem(scrapy.Item):
    name = scrapy.Field(output_processor=TakeFirst())
    subtitle = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(int_price),
                         output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
Example #15
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(processor_with_args, key='val')
Example #16
class NewsLoader(ItemLoader):
    default_item_class = NewsItem
    default_output_processor = TakeFirst()

    # fetchtime/modtime/firstpubtime: parse input to datetime.datetime,
    # write output as standard ISO format string
    fetchtime_in = MapCompose(wrapped_parse)  # was: dateutil.parser.parse
    fetchtime_out = Compose(TakeFirst(), lambda x: x.isoformat())
    modtime_in = MapCompose(wrapped_parse)  # was: dateutil.parser.parse
    modtime_out = Compose(TakeFirst(), lambda x: x.isoformat())
    firstpubtime_in = MapCompose(wrapped_parse)  # was: dateutil.parser.parse
    firstpubtime_out = Compose(TakeFirst(), lambda x: x.isoformat())

    clean_fn = MapCompose(
        lambda x: x.strip(),
        lambda x: replace_escape_chars(x, replace_by=' '),
        replace_entities,
    )
    headline_in = clean_fn
    summary_in = clean_fn

    # Finding the body can be tricky. The Join() here allows multiple
    # attempts to be taken, as each (joined to a single string) body-try
    # will be a separate list entry, and the default TakeFirst() output
    # processor will choose the first non-empty one. So our highest-quality
    # extractor can be tried first, followed by a less-likely/lower quality
    # one, with a really broad option as a final fallback if desired.
    bodytext_in = Compose(  # remove_comments / remove_tags could be prepended here
        Join(' '),
        lambda x: replace_escape_chars(x, replace_by=' '),
        replace_entities,
    )
    bodytext_out = TakeFirst()

    rawpagegzipb64_out = Compose(
        TakeFirst(),
        compress,
        b64encode,
        lambda x: str(x, encoding='UTF-8'),
    )

    # Store keywords and bylines as a comma-separated string (the native format
    # of most of the input formats). Export as a list, TakeFirst()ing the first
    # (best) string available.
    # TODO FIXME: This causes problems for bylines with titles in, e.g. the
    #             Daily Mail's "John Smith, Chief Political Reporter". Can
    #             either change the storage format, escape/unescape the comma,
    #             or erase the title in the Mail's input routine.
    keywords_in = Compose(_strip_strl, Join(','))
    bylines_in = Compose(_strip_strl, _remove_fluff, _split_and, Join(','))
    keywords_out = Compose(
        TakeFirst(),
        lambda x: x.split(','),
    )
    bylines_out = Compose(TakeFirst(), lambda x: x.split(','))

    # Post out any notes and comments we've got.
    notes_out = Identity()
    rawcomments_out = Identity()
    comments_out = Identity()

    # TODO: Consider converting these to use a proper RDFa/microdata parser
    #       like rdflib. scrapinghub/extruct looks ideal.
    # TODO: Consider splitting these out into a separate "processors" class,
    #       and/or allowing pre-processing of the selector to remove elements
    #       which we don't want in the output (such as BBC News captions?)
    #       before doing the extraction.

    def add_fromresponse(self, response):
        """Extracts standard data from the response object itself"""
        # TODO: Should we be using the canonicalised value of this from og:url
        #       or whatever to avoid dupes? Not important when taking a feed,
        #       but may be necessary to avoid duplicative crawls.
        self.add_value('url', response.url)
        self.add_value('rawpagegzipb64', response.body)
        self.add_value('fetchtime',
                       str(response.headers['Date'], encoding='utf-8'))
        # TODO: Consider (and check vs actual responses:)
        # self.add_value('modtime',
        #                str(response.headers['Last-Modified'],
        #                    encoding='utf-8'))

    def add_htmlmeta(self):
        """Extracts the content potentially encoded in standard HTML meta tags,
           such as <meta name=author ...> and <meta name=keywords ...>.
           Extensions, such as the schema.org and Open Graph codings, are in
           their own methods."""
        self.add_xpath(
            'bylines', 'head/meta[@name="author" or '
            '@property="author"]/@content')
        # self.add_xpath('bylines', '//a[@rel=author]/text()') # If needed
        # This is Google News specific
        self.add_xpath('keywords', 'head/meta[@name="news_keywords"]/@content')
        self.add_xpath('keywords', 'head/meta[@name="keywords"]/@content')
        self.add_xpath('language', '/html/@lang')

    def add_schemaorg(self, response, jsonld=True, microdata=True, rdfa=True):
        """Indirect to the add_schemaorg methods"""
        self.add_schemaorg_mde(response,
                               jsonld=True,
                               microdata=False,
                               rdfa=False)
        self.add_schemaorg_by_xpath()

    def add_schemaorg_mde(self,
                          response,
                          jsonld=True,
                          microdata=True,
                          rdfa=True):
        mde = RISJMetadataExtractor(
            response,
            jsonld=jsonld,
            microdata=microdata,
            rdfa=rdfa,
        )

        data = mde.extract_newsarticle_schemaorg(jsonld=True)
        self.add_value('firstpubtime', data.get('datePublished'))
        self.add_value('modtime', data.get('dateModified'))
        self.add_value('keywords', data.get('keywords'))
        self.add_value('headline', data.get('headline'))
        try:
            self.add_value('bodytext', data.get('articleBody'))
        except Exception as e:
            logger.warning("Can't extract body from {}: {}".format(
                response, e))
        self.add_value('section', data.get('articleSection'))
        try:
            self.add_value('bylines', data['author']['name'])
        except (ValueError, KeyError, TypeError):
            a = data.get('author')
            if isinstance(a, str):
                a = [a]
            if a:
                self.add_value('bylines', [x for x in a if isinstance(x, str)])
        except Exception as e:
            logger.error("Failed to handle byline extraction from {} for "
                         "{}: {}".format(data, response, e))
        try:
            self.add_value('source', data['publisher']['name'])
        except (ValueError, KeyError):
            self.add_value('source', data.get('publisher'))

    def add_schemaorg_by_xpath(self):
        """Extracts the content encoded by the standards at schema.org,
           which consist of standard structured data added for the benefit of
           the major search engines. There are several ways to encode this;
           microdata uses @itemprop, RDFa Lite uses @property. There are
           subtle differences, but that's the big one. We'll try to handle
           both.

           The full schemas are *very* large, and variably implemented. We use
           only bits of it, mostly from NewsArticle and its parents.
        """
        #self.add_schemaorg_bylines()

        # These xpaths are fairly naive; in particular, they don't rely on
        # the presence of an appropriate 'itemscope' for microdata.

        # == CreativeWork ==
        # TODO: These dateXxxx are allowed to be dates, not times. Should
        #       probably check somewhere if they're not full times and push
        #       them to the bottom of the queue for those sites where
        #       that's true.
        self.add_xpath(
            'firstpubtime', '//*[@itemprop="datePublished" or '
            '@property="datePublished"]/@content')
        # self.add_xpath('firstpubtime',
        #                '//[@itemprop="dateCreated"]/@content]')
        # TODO: Check if needed - less apposite than datePublished
        self.add_xpath(
            'modtime', '//*[@itemprop="dateModified" or '
            '@property="dateModified"]/@content')
        self.add_xpath(
            'keywords', '//*[@itemprop="keywords" or '
            '@property="keywords"]/@content')
        self.add_xpath(
            'headline', '//*[@itemprop="headline" or '
            '@property="headline"]//text()')
        # == Article ==
        self.add_xpath(
            'section', '//*[@itemprop="articleSection" or '
            '@property="articleSection"]/@content')
        # == Article and Review ==
        self.add_xpath(
            'bodytext', '//*[@itemprop="articleBody" or '
            '@property="articleBody" or '
            '@itemprop="reviewBody" or '
            '@property="reviewBody"]//text()')

    def add_schemaorg_bylines(self):
        # This has a high false-positive rate, so is separated out.
        # == CreativeWork ==
        self.add_xpath('bylines',
                       '//*[@itemprop="author"]//*[@itemprop="name"]//text()')

    def add_opengraph(self):
        """Extracts the content encoded by the Open Graph Protocol, a means
           of marking up web objects used by Facebook to produce a rich social
           graph. The schema is at http://ogp.me. Dates are ISO 8601 strings.
        """
        # TODO: Can these be exposed as microdata instead of RDFa?
        self.add_xpath('source',
                       'head/meta[@property="og:site_name"]/@content')
        self.add_xpath('headline', 'head/meta[@property="og:title"]/@content')
        self.add_xpath('summary',
                       'head/meta[@property="og:description"]/@content')
        # There are also: og:type (normally 'article'), og:image
        # (representative image) og:url (canonical URL), og:audio (audio
        # representation), og:determiner (title preceded by 'a'/'an'/...),
        # og:locale and og:locale:alternate (language_TERRITORY tags),
        # and og:video (complementary video URL)

        # These are OG tags for the 'article' subclass
        self.add_xpath(
            'modtime', 'head/meta[@property="article:modified_time"]/@content')
        self.add_xpath(
            'firstpubtime', 'head/meta[@property="article:published_time"]'
            '/@content')
        self.add_xpath('section',
                       'head/meta[@property="article:section"]/@content')
        self.add_xpath('bylines',
                       'head/meta[@property="article:author"]/@content')
        self.add_xpath('keywords',
                       'head/meta[@property="article:tag"]/@content')
        # Also:
        # article:expiration_time - When the article is out of date after.

    def add_dublincore(self):
        """Extracts Dublin Core metadata information from the head"""
        # TODO: arrange to extract properly? Will be better if the namespace
        #       is properly referenced in all the headers, but worse otherwise.
        #       May not be a good idea.
        self.add_xpath(
            'headline', 'head/meta[@name="dc.title" or '
            '@name="DC.title"]/@content')
        self.add_xpath(
            'summary', 'head/meta[@name="dcterms.abstract" or '
            '@name="DCTERMS.abstract"]/@content')
        self.add_xpath(
            'summary', 'head/meta[@name="dc.description" or '
            '@name="DC.description"]/@content')
        self.add_xpath(
            'modtime', 'head/meta[@name="dcterms.modified" or '
            '@name="DCTERMS.modified"]/@content')
        self.add_xpath(
            'firstpubtime', 'head/meta[@name="dcterms.created" or '
            '@name="DCTERMS.created"]/@content')
        self.add_xpath(
            'source', 'head/meta[@name="dc.publisher" or '
            '@name="DC.publisher"]/@content')
        # The assumption creator==bylines is correct for some docs, but not all.
        #self.add_xpath('bylines',
        #               'head/meta[@name="dc.creator" or '
        #                   '@name="DC.creator"]/@content')
        #self.add_xpath('language',
        #               'head/meta[@name="dc.language" or '
        #                   '@name="DC.language"]/@content')

    # TODO: def add_rNews():? Similar to the add_schemaorg work (which was
    #       based on it but features a different implementation).

    # TODO: def add_hNews():?

    def add_scrapymeta(self, response):
        """Extracts the content passed through meta tags from the Request. This
           is normally metadata from the RSS feed which linked to the article,
           or from Google News sitemaps."""

        if 'originalurl' in response.meta:
            self.add_value('originalurl', response.meta['originalurl'])

        if 'newsmeta' in response.meta:
            for k in response.meta.get('newsmeta'):
                self.add_value(k, response.meta['newsmeta'][k])

        if 'RSSFeed' in response.meta:
            d = response.meta['RSSFeed']
            self.add_value('headline', d.get('title'))
            self.add_value('summary', d.get('description'))
            self.add_value('section', d.get('section'))
            self.add_value('firstpubtime', d.get('pubDate'))
            # Extract (some) non-url parts of each sitemap node and pass in meta
            # tag
#            title = selector.xpath('title/text()').extract_first()
#            if title:
#                nm['headline'] = title.strip()
#
#            description = selector.xpath('description/text()').extract_first()
#            if description:
#                nm['summary'] = description.strip()
#
#            section = selector.xpath('category/text()').extract_first()
#            if section:
#                nm['section'] = section.strip()
#
#            pubdate = selector.xpath('pubDate/text()').extract_first()
#            if pubdate:
#                nm['firstpubtime'] = pubdate.strip() # TODO: Maybe should be modtime?

        if 'NewsSitemap' in response.meta:
            d = response.meta['NewsSitemap']
            self.add_value('modtime', d.get('lastmod'))
            if 'news' in d:
                self.add_value('keywords', d['news'].get('keywords'))
                self.add_value('firstpubtime',
                               d['news'].get('publication_date'))
                self.add_value('headline', d['news'].get('title'))
#            if 'lastmod' in d:
#                self.add_value(nm['modtime'] = d['lastmod'].strip()
#            if 'news' in d:
#                for k, v in d['news'].items():
#                    if k == 'keywords':
#                        nm['keywords'] = v.strip()
#                    elif k == 'publication_date':
#                        nm['firstpubtime'] = v.strip()
#                    elif k == 'title':
#                        nm['headline'] = v.strip()

        # Record the number of previous fetches
        if 'refetchcontrol_previous' in response.meta:
            self.add_value('previousfetches',
                           response.meta.get('refetchcontrol_previous'))

    def add_readability(self, response):
        """Extracts content using readability-lxml. This is non-specific,
           but flexible, and a good fallback."""

        # Don't do the readability parsing (which is comparatively expensive)
        # unless it's needed
        if self.get_output_value('headline') and self.get_output_value(
                'bodytext'):
            return

        readified_doc = readability.readability.Document(response.text)

        if not self.get_output_value('headline'):
            logger.debug(
                f'Using readability fallback for headline: {self.get_output_value("url")}'
            )
            # There is a .title() method, but short_title() strips chaff
            self.add_value('headline', readified_doc.short_title())

        if not self.get_output_value('bodytext'):
            logger.debug(
                f'Using readability fallback for bodytext: {self.get_output_value("url")}'
            )
            reparsed = lxml.html.fromstring(readified_doc.summary())

            self.add_value('bodytext', reparsed.xpath('//body//text()'))
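
The multi-attempt bodytext trick described in the comments above is easiest to see in isolation: each add_xpath call is collapsed to a single candidate string by `bodytext_in`, and the TakeFirst() output processor keeps the first candidate that matched anything. A minimal sketch (the XPaths are illustrative):

loader = NewsLoader(response=response)
loader.add_xpath('bodytext', '//article//p//text()')          # preferred extractor
loader.add_xpath('bodytext', '//div[@id="content"]//text()')  # broad fallback
item = loader.load_item()  # 'bodytext' holds the first non-empty attempt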
Example #17
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(processor)
Example #18
class TestItemLoader(ItemLoader):
    name_in = MapCompose(lambda v: v.title(), lambda v: v[:-1])
Example #19
class SecondHandAdLoader(ItemLoader):
    """
    Generic ad loader: cleans and formats the raw data scraped from
    the web services.

    The data of each ad is then stored in an 'item' object, waiting for
    further processing.
    """

    default_output_processor = TakeFirst()

    url_in = MapCompose(format_text)
    url_out = Join()

    vendor_in = MapCompose(format_text)
    vendor_out = Join()

    title_in = MapCompose(format_text)
    title_out = Join()

    price_in = MapCompose(format_text, remove_all_spacing, extract_price_value)
    price_out = TakeFirst()

    condition_in = MapCompose(format_text)
    condition_out = Join()

    location_in = MapCompose(format_text)
    location_out = Join()

    postal_code_in = MapCompose(str, remove_special_characters, format_number,
                                int)
    postal_code_out = TakeFirst()

    first_posted_in = MapCompose(format_text, parse_datetime)
    first_posted_out = Join()

    last_updated_in = MapCompose(format_text, parse_datetime)
    last_updated_out = Join()

    description_in = MapCompose(format_text)
    description_out = Join()

    images_in = MapCompose(format_text)
    images_out = Join(', ')

    brand_in = MapCompose(format_text)
    brand_out = Join()

    model_in = MapCompose(format_text)
    model_out = Join()

    make_in = MapCompose(format_text)
    make_out = Join()

    color_in = MapCompose(format_text)
    color_out = Join()

    price_new_in = MapCompose(format_text)
    price_new_out = Join()

    user_rating_in = MapCompose(format_text)
    user_rating_out = Join()

    value_rating_in = MapCompose(format_text)
    value_rating_out = Join()

    leverage_rating_in = MapCompose(format_text)
    leverage_rating_out = Join()

    def _summarize(self, item: dict) -> str:
        """
        Generate an HTML summary of an item, to display
        in a dashboard.

        Parameters
        ----------
        item: dict.
            The scraped item.

        Returns
        -------
        out: str.
            The corresponding summary.
        """
        rows = [
            ('price', str(item.get('price', '')), '€'),
            ('condition', str(item.get('condition', '')), ''),
            ('value', str(item.get('value_rating', '')), '/ 10'),
            ('leverage', str(item.get('leverage_rating', '')), '/ 10'),
            ('age', str(item.get('age', '')), 'days'),
        ]
        summary = ''.join(
            '{}: {} {}<br />'.format(label, serialize_html_tag('<i>', value), unit)
            for label, value, unit in rows)
        return summary + '{}: {} {}<br />'.format(
            'url',
            serialize_html_tag(
                tag='<a>',
                value=str(self.context.get('domain', 'leboncoin.fr')),
                attributes={'href': item.get('url', '')}), '')

    def load_item(self):
        """
        Complete the raw information with computed data.
        """
        __item = super(SecondHandAdLoader, self).load_item()
        __geolocator = Nominatim(user_agent='adspying')
        __location = __geolocator.geocode(
            (str(__item.get('postal_code', '69000')) + ', ' +
             __item.get('location', 'lyon')),
            exactly_one=True)

        # gps coordinates
        __item['latitude'] = __location.latitude
        __item['longitude'] = __location.longitude

        # timeline
        __item['first_posted'] = __item.get(
            'last_updated',
            datetime.now().isoformat(sep='T', timespec='seconds'))
        __item['age'] = (datetime.now() - datetime.strptime(
            __item.get('first_posted',
                       datetime.now().isoformat(sep='T', timespec='seconds')),
            '%Y-%m-%dT%H:%M:%S')).days
        __item['reposting_count'] = 0
        __item['starting_price'] = __item.get('price', 0)

        # vendor
        __item['vendor'] = urljoin(self.context.get('base_url', ''),
                                   __item.get('vendor', ''))

        # evaluation & sorting depend on the query
        __item['value_rating'] = 5  # neutral value
        __item['leverage_rating'] = 5  # neutral value

        # map marker
        __item['icon'] = self.context.get('icon', 'marker')

        # summary
        __item['summary'] = self._summarize(__item)

        return __item
Example #20
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(lambda v: v.lower())
Example #21
class CustomItemLoader(ItemLoader):
    name_in = MapCompose(lambda v: v.title())
Example #22
class ChildChildItemLoader(ChildItemLoader):
    url_in = MapCompose(lambda v: v.upper())
    summary_in = MapCompose(lambda v: v)
Example #23
def _get_input_processor(self):
    return self.kwargs.pop('in', MapCompose(*self.args))
Example #24
class IdentityDefaultedItemLoader(DefaultedItemLoader):
    name_in = MapCompose()
Example #25
class BaseNoInputReprocessingLoader(ItemLoader):
    title_in = MapCompose(str.upper)
    title_out = TakeFirst()
Example #26
class ChildItemLoader(TestItemLoader):
    name_in = MapCompose(TestItemLoader.name_in, str.swapcase)
Example #27
class DefaultedItemLoader(NameItemLoader):
    default_input_processor = MapCompose(lambda v: v[:-1])
Example #28
class ChildDefaultedItemLoader(DefaultedItemLoader):
    name_in = MapCompose(DefaultedItemLoader.default_input_processor,
                         str.swapcase)
Example #29
class Work01Item(scrapy.Item):
    title = scrapy.Field(output_processor=TakeFirst())
    category = scrapy.Field(input_processor=strip,
                            output_processor=Join(separator=","))
    show_time = scrapy.Field(input_processor=MapCompose(filter_time),
                             output_processor=TakeFirst())
Example #30
class RealEstateRawLoader(ItemLoader):
    default_output_processor = TakeFirst()
    title_in = MapCompose(str.strip, str.capitalize)
    value_in = MapCompose(str.strip, str.lower)
    value_out = Join()
    area_in = MapCompose(str.strip, str.lower)
    area_out = Join()
    address_in = MapCompose(str.strip)
    ward_in = MapCompose(str.strip)
    district_in = MapCompose(str.strip)
    province_in = MapCompose(str.strip)
    type_in = MapCompose(str.strip, str.lower)
    description_in = MapCompose(str.strip)
    description_out = Join('\n')
    sellerName_in = MapCompose(str.strip, str.lower, str.title)
    time_in = MapCompose(str.strip, str.lower)
    image_in = MapCompose(str.strip)
    image_out = Join(' ')