def parse(self, response):
    """Parse a horse-race results page into a HorseRaceItem.

    Records one-off housekeeping data for the crawl (only the first
    callback to run fills ``self.housekeeping`` — scrapy runs on
    twisted, so callbacks are asynchronous), adds the same provenance
    values to the item, extracts race and runner details by XPath, and
    hands the populated loader to ``self.parse_item``.
    """
    itemLoader = ItemLoader(item=HorseRaceItem(), response=response)

    # Provenance values shared by the crawl-level housekeeping dict and
    # every scraped item. BUG FIX: the original used
    # datetime.now().replace(tzinfo=timezone.utc), which stamps *local*
    # wall-clock time as if it were UTC; datetime.now(timezone.utc)
    # yields the true UTC time.
    provenance = {
        'url': response.url,
        'project': self.settings.get("BOT_NAME"),
        'spider': self.name,
        'server': socket.gethostname(),
        'date': datetime.now(timezone.utc).strftime("%d %B %Y %H:%M:%S"),
    }

    # Housekeeping is recorded exactly once per crawl, by whichever
    # asynchronous callback happens to run first.
    if not self.housekeeping:
        self.housekeeping.update(provenance)

    # Set housekeeping attributes in the HorseRaceItem.
    for field, value in provenance.items():
        itemLoader.add_value(field, value)

    # XPaths for race details.
    itemLoader.add_xpath(
        'race_time',
        '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/h1/b/text()'
    )
    itemLoader.add_xpath(
        'race_date_and_place',
        '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/h1/text()',
        MapCompose(str.strip), lambda i: i[1])
    itemLoader.add_xpath(
        'race_class',
        '//div[@class="race-header__content js-race-header__content"]/div/div/div/div/p[2]/text()',
        MapCompose(str.strip), lambda i: i[0])
    itemLoader.add_xpath(
        'race_start_time',
        '//div[@class="card-footer__content"]/div/span/span/text()',
        lambda i: i[0])
    itemLoader.add_xpath(
        'race_winning_time',
        '//div[@class="card-footer__content"]/div/span/span[2]/text()',
        lambda i: i[0])

    # XPaths for horse and rider details.
    itemLoader.add_xpath(
        'position',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-no-draw"]//span/text()'
    )
    itemLoader.add_xpath('horse_url', '//div[@class="horse"]//img/@src')
    itemLoader.add_xpath(
        'race_or',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div[4]/span[1]/text()'
    )
    itemLoader.add_xpath('horse_colour', '//div[@class="horse"]//img/@title')
    itemLoader.add_xpath(
        'raced_description',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/p/span/text()',
        MapCompose(str.strip))
    itemLoader.add_xpath(
        'horse_name',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div/div/h2/a/text()',
        MapCompose(str.strip, lambda i: i if (len(i) >= 1) else None))
    itemLoader.add_xpath(
        'dst_btn',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--form text-align--center"]/text()',
        MapCompose(lambda i: i.replace('\r', '').replace('\n', ''),
                   str.strip))
    itemLoader.add_xpath(
        'race_ods',
        '//div[@class="tabs__content"]//div[@class="card-body"]//div[@class="card-entry "]//div[@class="card-cell card-cell--fill unpadded-left"]/div/div[2]/text()',
        MapCompose(
            lambda i: i.replace('\r', '').replace('\n', '').replace(' ', ''),
            str.strip))
    itemLoader.add_xpath(
        'race_age_weight',
        '//*[@id="tab-full-result"]/div/div/div/div[2]//div/div/div[4]/div/div[3]/text()',
        MapCompose(str.split))

    return self.parse_item(itemLoader)
class TestItemLoader(NameItemLoader):
    # Title-case each 'name' value before it is stored.
    name_in = MapCompose(lambda v: v.title())
class ProgramaSemanaItems(scrapy.Item):
    """One week of a meeting programme.

    Every processor-bearing field strips HTML tags and line breaks from
    each extracted fragment and keeps only the first resulting value.
    """
    # Week reference, weekly reading and opening song.
    semana_referencia = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas),
        output_processor=TakeFirst())
    leitura_semana = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas),
        output_processor=TakeFirst())
    cantico_inicial = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas),
        output_processor=TakeFirst())

    # Section: "Treasures From God's Word" (Tesouros da Palavra de Deus).
    tpd_titulo = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_titulo_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_href = scrapy.Field(
        input_processor=MapCompose(remove_tags, add_jw_prefix_to_href),
        output_processor=TakeFirst())
    tpd_duracao = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_duracao_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_joias_titulo = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_titulo_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_joias_duracao = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_duracao_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_joias_descricao = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_lista_filha_as_string),
        output_processor=TakeFirst())
    tpd_leitura_duracao = scrapy.Field(
        input_processor=MapCompose(
            remove_tags, remover_quebra_linhas,
            get_duracao_de_descricao_com_duracao_mais_texto_base),
        output_processor=TakeFirst())
    tpd_leitura_titulo = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_titulo_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_leitura_texto_base = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas),
        output_processor=TakeFirst())
    tpd_leitura_href = scrapy.Field(
        input_processor=MapCompose(remove_tags, add_jw_prefix_to_href),
        output_processor=TakeFirst())
    tpd_leitura_licao_melhore_titulo = scrapy.Field(
        input_processor=MapCompose(remove_tags, remover_quebra_linhas,
                                   get_titulo_de_descricao_com_duracao),
        output_processor=TakeFirst())
    tpd_leitura_licao_melhore_href = scrapy.Field(
        input_processor=MapCompose(remove_tags, add_jw_prefix_to_href),
        output_processor=TakeFirst())

    # Section: "Apply Yourself to the Field Ministry" (Faça seu melhor
    # no ministério). Four identical assignment slots, raw fields with
    # no processors attached.
    fmm_design_1_titulo = scrapy.Field()
    fmm_design_1_duracao = scrapy.Field()
    fmm_design_1_descricao = scrapy.Field()
    fmm_design_1_tipo = scrapy.Field()
    fmm_design_1_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_1_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_1_tipo_apresentacao_licao_melhore_href = scrapy.Field()
    fmm_design_2_titulo = scrapy.Field()
    fmm_design_2_duracao = scrapy.Field()
    fmm_design_2_descricao = scrapy.Field()
    fmm_design_2_tipo = scrapy.Field()
    fmm_design_2_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_2_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_2_tipo_apresentacao_licao_melhore_href = scrapy.Field()
    fmm_design_3_titulo = scrapy.Field()
    fmm_design_3_duracao = scrapy.Field()
    fmm_design_3_descricao = scrapy.Field()
    fmm_design_3_tipo = scrapy.Field()
    fmm_design_3_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_3_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_3_tipo_apresentacao_licao_melhore_href = scrapy.Field()
    fmm_design_4_titulo = scrapy.Field()
    fmm_design_4_duracao = scrapy.Field()
    fmm_design_4_descricao = scrapy.Field()
    fmm_design_4_tipo = scrapy.Field()
    fmm_design_4_tipo_perguntas_respostas_href = scrapy.Field()
    fmm_design_4_tipo_apresentacao_licao_melhore_titulo = scrapy.Field()
    fmm_design_4_tipo_apresentacao_licao_melhore_href = scrapy.Field()

    # Section: "Living as Christians" (Nossa vida cristã).
    cantico_transicao = scrapy.Field()
    nvc_design_1_titulo = scrapy.Field()
    nvc_design_1_duracao = scrapy.Field()
    nvc_design_1_video_ou_materia_href = scrapy.Field()
    nvc_design_1_descricao = scrapy.Field()
    nvc_design_2_titulo = scrapy.Field()
    nvc_design_2_duracao = scrapy.Field()
    nvc_design_2_video_ou_materia_href = scrapy.Field()
    nvc_design_2_descricao = scrapy.Field()
    nvc_design_3_titulo = scrapy.Field()
    nvc_design_3_duracao = scrapy.Field()
    nvc_design_3_video_ou_materia_href = scrapy.Field()
    nvc_design_3_descricao = scrapy.Field()
    nvc_design_4_titulo = scrapy.Field()
    nvc_design_4_duracao = scrapy.Field()
    nvc_design_4_video_ou_materia_href = scrapy.Field()
    nvc_design_4_descricao = scrapy.Field()
class RekonItem(Item):
    """Reconciliation record for one postal shipment (AWB).

    Fields with processors clean the raw scraped strings (strip
    semicolons, 'gr' weight suffixes, thousands separators; split out
    sender/recipient names, cities, postcodes, office names/codes) and
    collapse each to a single value with TakeFirst().
    """
    awbRekon = Field()
    mpCode = Field()
    periodeRekon = Field()
    tahunRekon = Field()
    awbKendali = Field()
    jenisLayanan = Field()
    tanggalKirim = Field()
    isiKiriman = Field(input_processor=MapCompose(removeSemiColon),
                       output_processor=TakeFirst())
    berat = Field(input_processor=MapCompose(removeGr),
                  output_processor=TakeFirst())
    jenisKiriman = Field()
    beaDasar = Field(input_processor=MapCompose(removePoint),
                     output_processor=TakeFirst())
    nilaiBarang = Field(input_processor=MapCompose(removePoint),
                        output_processor=TakeFirst())
    htnb = Field(input_processor=MapCompose(removePoint),
                 output_processor=TakeFirst())
    pengirim = Field(input_processor=MapCompose(getPengirimPenerima),
                     output_processor=TakeFirst())
    kotaPengirim = Field(input_processor=MapCompose(getKotaPengirimPenerima),
                         output_processor=TakeFirst())
    kodePosPengirim = Field(input_processor=MapCompose(getKodePos),
                            output_processor=TakeFirst())
    penerima = Field(input_processor=MapCompose(getPengirimPenerima),
                     output_processor=TakeFirst())
    kotaPenerima = Field(input_processor=MapCompose(getKotaPengirimPenerima),
                         output_processor=TakeFirst())
    kodePosPenerima = Field(input_processor=MapCompose(getKodePos),
                            output_processor=TakeFirst())
    statusRekon = Field()
    ketRekon = Field()
    beaTotal = Field()
    nopendKantorKirim = Field(input_processor=MapCompose(getNopendKantor),
                              output_processor=TakeFirst())
    # BUG FIX: kantorKirim was declared twice — an earlier bare Field()
    # was silently shadowed by this processor-bearing declaration; the
    # dead duplicate has been removed.
    kantorKirim = Field(input_processor=MapCompose(getNamaKantor),
                        output_processor=TakeFirst())
    tanggalPosting = Field()
    statusAkhir = Field()
    kantorAkhir = Field(input_processor=MapCompose(getNamaKantor),
                        output_processor=TakeFirst())
    ketStatusAkhir = Field()
    nopendKantorAkhir = Field(input_processor=MapCompose(getNopendKantor),
                              output_processor=TakeFirst())
    tanggalStatusAkhir = Field()
    statusAntar = Field()
    ketStatusAntar = Field()
    penerimaKiriman = Field()
    waktuUpdateStatus = Field()
class InvestorTradingItem(scrapy.Item):
    """Daily TWSE institutional-investor trading figures for one
    security. Numeric fields are cleaned by ItemParser.p_num (e.g.
    thousands separators) and coerced to int."""
    date = scrapy.Field(
        input_processor=MapCompose(str.strip, ItemParser.p_date))  # trade date
    code = scrapy.Field()  # security code
    name = scrapy.Field()  # security name
    # Foreign investors incl. mainland (excluding foreign dealers):
    # shares bought / sold / net.
    foreign_buy = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    foreign_sell = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    foreign_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Foreign dealers: shares bought / sold / net.
    foreign_dealer_buy = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    foreign_dealer_sell = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    foreign_dealer_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Investment trusts: shares bought / sold / net.
    trust_buy = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    trust_sell = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    trust_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Dealers: overall net shares.
    dealer_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Dealers, proprietary trading: bought / sold / net.
    native_dealer_buy = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    native_dealer_sell = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    native_dealer_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Dealers, hedging: bought / sold / net.
    native_dealer_hedge_buy = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    native_dealer_hedge_sell = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    native_dealer_hedge_net = scrapy.Field(
        input_processor=MapCompose(ItemParser.p_num, int))
    # Net shares of the three major institutional investors combined.
    net = scrapy.Field(input_processor=MapCompose(ItemParser.p_num, int))

    class Meta:
        # Export metadata.
        # NOTE(review): 'date' is absent from `fields` — confirm whether
        # it is appended elsewhere or intentionally excluded.
        name = 'twse_investor_trading'
        fields = [
            'code', 'name', 'foreign_buy', 'foreign_sell', 'foreign_net',
            'foreign_dealer_buy', 'foreign_dealer_sell', 'foreign_dealer_net',
            'trust_buy', 'trust_sell', 'trust_net', 'dealer_net',
            'native_dealer_buy', 'native_dealer_sell', 'native_dealer_net',
            'native_dealer_hedge_buy', 'native_dealer_hedge_sell',
            'native_dealer_hedge_net', 'net'
        ]
class TestItemLoader(ItemLoader):
    # Coerce each 'name' input value to float.
    name_in = MapCompose(float)
def parse(self, response):
    """Follow every author link on the page, first normalising each
    '/users/...' href to the short '/u/...' form."""
    hrefs = response.css('div.wrap > a::attr(href)').getall()
    author_urls = [href.replace("users", "u") for href in hrefs]
    yield from response.follow_all(author_urls, callback=self.parse_author)
class City(scrapy.Item):
    # Country the city belongs to (raw extracted values, no processors).
    country = scrapy.Field()
    # NOTE(review): the 'city' field runs get_country as its input
    # processor — looks like a copy/paste from the country pipeline;
    # confirm it actually yields the city name.
    city = scrapy.Field(input_processor=MapCompose(get_country),
                        output_processor=TakeFirst())
class Recipe(Item):
    # Both fields strip HTML markup, then surrounding whitespace, from
    # each extracted fragment.
    ingredients = Field(input_processor=MapCompose(remove_tags, str.strip))
    directions = Field(input_processor=MapCompose(remove_tags, str.strip))
class PracujItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Category/city label, with newlines removed by remove_n.
    kat_mias = Field(input_processor=MapCompose(remove_n))
    # Count value (raw, no processors).
    liczba = Field()
class MyautoItemLoader(ItemLoader):
    """Loader for MyautoItem: every field is stripped and upper-cased on
    input and collapsed to its first value on output; 'price' instead
    has commas removed by the RemoveComma processor."""
    default_item_class = MyautoItem
    default_input_processor = MapCompose(str.strip, str.upper)
    default_output_processor = TakeFirst()
    price_in = RemoveComma()
class TestItemLoader(ItemLoader):
    # Loads TestItem; coerces each 'name' input value to float.
    default_item_class = TestItem
    name_in = MapCompose(float)
class BookItem(scrapy.Item):
    """Book record: scalar metadata, list-valued fields (kept as lists
    via Identity/Compose output processors) and a ratings histogram."""
    # Scalars
    url = Field()
    title = Field(input_processor=MapCompose(str.strip))
    author = Field(input_processor=MapCompose(str.strip))
    num_ratings = Field(input_processor=MapCompose(str.strip, int))
    num_reviews = Field(input_processor=MapCompose(str.strip, int))
    avg_rating = Field(input_processor=MapCompose(str.strip, float))
    num_pages = Field(
        input_processor=MapCompose(str.strip, num_page_extractor, int))
    language = Field(input_processor=MapCompose(str.strip))
    publish_date = Field(input_processor=extract_publish_dates)
    original_publish_year = Field(
        input_processor=MapCompose(extract_year, int))
    isbn = Field(input_processor=MapCompose(str.strip, isbn_filter))
    isbn13 = Field(input_processor=MapCompose(str.strip, isbn13_filter))
    asin = Field(input_processor=MapCompose(filter_asin))
    series = Field()
    # Lists
    awards = Field(output_processor=Identity())
    places = Field(output_processor=Identity())
    characters = Field(output_processor=Identity())
    # De-duplicate genres while still exporting a list.
    genres = Field(output_processor=Compose(set, list))
    # Dicts
    rating_histogram = Field(input_processor=MapCompose(extract_ratings))
class NikeItem(scrapy.Item):
    # One product tile: every field collapses to its first extracted
    # value; 'price' is first converted to an int by int_price.
    name = scrapy.Field(output_processor=TakeFirst())
    subtitle = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(int_price),
                         output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
class ChildItemLoader(TestItemLoader):
    # processor_with_args receives key='val' through the loader context.
    url_in = MapCompose(processor_with_args, key='val')
class NewsLoader(ItemLoader):
    """Loader that aggregates news-article metadata and body text from
    several encodings (HTML meta tags, schema.org, Open Graph, Dublin
    Core, RSS/sitemap request meta, plus a readability fallback) into a
    NewsItem. Output processors default to TakeFirst(), so the first
    (highest-quality) non-empty value wins."""
    default_item_class = NewsItem
    default_output_processor = TakeFirst()

    # fetchtime/modtime/firstpubtime: parse input to datetime.datetime,
    # write output as standard ISO format string
    fetchtime_in = MapCompose(wrapped_parse)  # dateutil.parser.parse)
    fetchtime_out = Compose(TakeFirst(), lambda x: x.isoformat())
    modtime_in = MapCompose(wrapped_parse)  # dateutil.parser.parse)
    modtime_out = Compose(TakeFirst(), lambda x: x.isoformat())
    firstpubtime_in = MapCompose(wrapped_parse)  # dateutil.parser.parse)
    firstpubtime_out = Compose(TakeFirst(), lambda x: x.isoformat())

    # Shared cleaner: strip, collapse escape chars to spaces, decode
    # HTML entities.
    clean_fn = MapCompose(
        lambda x: x.strip(),
        lambda x: replace_escape_chars(x, replace_by=' '),
        replace_entities,
    )
    headline_in = clean_fn
    summary_in = clean_fn

    # Finding the body can be tricky. The Join() here allows multiple
    # attempts to be taken, as each (joined to a single string) body-try
    # will be a separate list entry, and the default TakeFirst() output
    # processor will choose the first non-empty one. So our highest-quality
    # extractor can be tried first, followed by a less-likely/lower quality
    # one, with a really broad option as a final fallback if desired.
    bodytext_in = Compose(
        # remove_comments,
        # remove_tags,
        Join(' '),
        lambda x: replace_escape_chars(x, replace_by=' '),
        replace_entities,
    )
    bodytext_out = TakeFirst()

    # Raw page bytes: gzip-compress, base64-encode, decode to str.
    rawpagegzipb64_out = Compose(
        TakeFirst(),
        compress,
        b64encode,
        lambda x: str(x, encoding='UTF-8'),
    )

    # Store keywords and bylines as a comma-separated string (the native format
    # of most of the input formats). Export as a list, TakeFirst()ing the first
    # (best) string available.
    # TODO FIXME: This causes problems for bylines with titles in, e.g. the
    #             Daily Mail's "John Smith, Chief Political Reporter". Can
    #             either change the storage format, escape/unescape the comma,
    #             or erase the title in the Mail's input routine.
    keywords_in = Compose(_strip_strl, Join(','))
    bylines_in = Compose(_strip_strl, _remove_fluff, _split_and, Join(','))
    keywords_out = Compose(
        TakeFirst(),
        lambda x: x.split(','),
    )
    bylines_out = Compose(TakeFirst(), lambda x: x.split(','))

    # Post out any notes and comments we've got.
    notes_out = Identity()
    rawcomments_out = Identity()
    comments_out = Identity()

    # TODO: Consider converting these to use a proper RDFa/microdata parser
    #       like rdflib. scrapinghub/extruct looks ideal.
    # TODO: Consider splitting these out into a separate "processors" class,
    #       and/or allowing pre-processing of the selector to remove elements
    #       which we don't want in the output (such as BBC News captions?)
    #       before doing the extraction.

    def add_fromresponse(self, response):
        """Extracts standard data from the response object itself"""
        # TODO: Should be we using the canonicalised value of this from og:url
        #       or whatever to avoid dupes? Not important when taking a feed,
        #       but may be necessary to avoid duplicative crawls.
        self.add_value('url', response.url)
        self.add_value('rawpagegzipb64', response.body)
        self.add_value('fetchtime',
                       str(response.headers['Date'], encoding='utf-8'))
        # TODO: Consider (and check vs actual responses:)
        # self.add_value('modtime',
        #                str(response.headers['Last-Modified'],
        #                    encoding='utf-8'))

    def add_htmlmeta(self):
        """Extracts the content potentially encoded in standard HTML meta
        tags, such as <meta name=author ...> and <meta name=keywords ...>.

        Extensions, such as the schema.org and Open Graph codings, are in
        their own methods."""
        self.add_xpath(
            'bylines', 'head/meta[@name="author" or '
            '@property="author"]/@content')
        # self.add_xpath('bylines', '//a[@rel=author]/text()')  # If needed
        # This is Google News specific
        self.add_xpath('keywords', 'head/meta[@name="news_keywords"]/@content')
        self.add_xpath('keywords', 'head/meta[@name="keywords"]/@content')
        self.add_xpath('language', '/html/@lang')

    def add_schemaorg(self, response, jsonld=True, microdata=True, rdfa=True):
        """Indirect to the add_schemaorg methods"""
        # NOTE(review): the jsonld/microdata/rdfa parameters are accepted
        # but not forwarded — the call below hard-codes JSON-LD only.
        # Confirm whether that is intentional.
        self.add_schemaorg_mde(response, jsonld=True, microdata=False,
                               rdfa=False)
        self.add_schemaorg_by_xpath()

    def add_schemaorg_mde(self, response, jsonld=True, microdata=True,
                          rdfa=True):
        # Extract a schema.org NewsArticle via RISJMetadataExtractor and
        # feed its fields into the loader.
        mde = RISJMetadataExtractor(
            response,
            jsonld=jsonld,
            microdata=microdata,
            rdfa=rdfa,
        )
        data = mde.extract_newsarticle_schemaorg(jsonld=True)
        self.add_value('firstpubtime', data.get('datePublished'))
        self.add_value('modtime', data.get('dateModified'))
        self.add_value('keywords', data.get('keywords'))
        self.add_value('headline', data.get('headline'))
        try:
            self.add_value('bodytext', data.get('articleBody'))
        except Exception as e:
            logger.warning("Can't extract body from {}: {}".format(
                response, e))
        self.add_value('section', data.get('articleSection'))
        try:
            self.add_value('bylines', data['author']['name'])
        except (ValueError, KeyError, TypeError):
            # 'author' may be a plain string or a list of strings/objects.
            a = data.get('author')
            if isinstance(a, str):
                a = [a]
            if a:
                self.add_value('bylines',
                               [x for x in a if isinstance(x, str)])
        except Exception as e:
            logger.error("Failed to handle byline extraction from {} for "
                         "{}: {}".format(data, response, e))
        try:
            self.add_value('source', data['publisher']['name'])
        except (ValueError, KeyError):
            self.add_value('source', data.get('publisher'))

    def add_schemaorg_by_xpath(self):
        """Extracts the content encoded by the standards at schema.org,
        which consist of standard structured data added for the benefit
        of the major search engines.

        There are several ways to encode this; microdata uses @itemprop,
        RDFa Lite uses @property. There are subtle differences, but that's
        the big one. We'll try to handle both.

        The full schemas are *very* large, and variably implemented. We
        use only bits of it, mostly from NewsArticle and its parents.
        """
        # self.add_schemaorg_bylines()
        # These xpaths are fairly naive; in particular, they don't rely on
        # the presence of an appropriate 'itemscope' for microdata.
        # == CreativeWork ==
        # TODO: These dateXxxx are allowed to be dates, not times. Should
        #       probably check somewhere if they're not full times and push
        #       them to the bottom of the queue for those sites where
        #       that's true.
        self.add_xpath(
            'firstpubtime', '//*[@itemprop="datePublished" or '
            '@property="datePublished"]/@content')
        # self.add_xpath('firstpubtime',
        #                '//[@itemprop="dateCreated"]/@content]')
        # TODO: Check if needed - less apposite than datePublished
        self.add_xpath(
            'modtime', '//*[@itemprop="dateModified" or '
            '@property="dateModified"]/@content')
        self.add_xpath(
            'keywords', '//*[@itemprop="keywords" or '
            '@property="keywords"]/@content')
        self.add_xpath(
            'headline', '//*[@itemprop="headline" or '
            '@property="headline"]//text()')
        # == Article ==
        self.add_xpath(
            'section', '//*[@itemprop="articleSection" or '
            '@property="articleSection"]/@content')
        # == Article and Review ==
        self.add_xpath(
            'bodytext', '//*[@itemprop="articleBody" or '
            '@property="articleBody" or '
            '@itemprop="reviewBody" or '
            '@property="reviewBody"]//text()')

    def add_schemaorg_bylines(self):
        # This has a high false-positive rate, so is separated out.
        # == CreativeWork ==
        self.add_xpath('bylines',
                       '//*[@itemprop="author"]//*[@itemprop="name"]//text()')

    def add_opengraph(self):
        """Extracts the content encoded by the Open Graph Protocol, a means
        of marking up web objects used by Facebook to produce a rich social
        graph. The schema is at http://ogp.me. Dates are ISO 8601 strings.
        """
        # TODO: Can these be exposed as microdata instead of RDFa?
        self.add_xpath('source',
                       'head/meta[@property="og:site_name"]/@content')
        self.add_xpath('headline', 'head/meta[@property="og:title"]/@content')
        self.add_xpath('summary',
                       'head/meta[@property="og:description"]/@content')
        # There are also: og:type (normally 'article'), og:image
        # (representative image) og:url (canonical URL), og:audio (audio
        # representation), og:determiner (title preceeded by 'a'/'an'/...),
        # og:locale and og:locale:alternate (language_TERRITORY tags),
        # and og:video (complementary video URL)
        # These are OG tags for the 'article' subclass
        self.add_xpath(
            'modtime', 'head/meta[@property="article:modified_time"]/@content')
        self.add_xpath(
            'firstpubtime', 'head/meta[@property="article:published_time"]'
            '/@content')
        self.add_xpath('section',
                       'head/meta[@property="article:section"]/@content')
        self.add_xpath('bylines',
                       'head/meta[@property="article:author"]/@content')
        self.add_xpath('keywords',
                       'head/meta[@property="article:tag"]/@content')
        # Also:
        # article:expiration_time - When the article is out of date after.

    def add_dublincore(self):
        """Extracts Dublin Core metadata information from the head"""
        # TODO: arrange to extract properly? Will be better if the namespace
        #       is properly referenced in all the headers, but worse
        #       otherwise. May not be a good idea.
        self.add_xpath(
            'headline', 'head/meta[@name="dc.title" or '
            '@name="DC.title"]/@content')
        self.add_xpath(
            'summary', 'head/meta[@name="dcterms.abstract" or '
            '@name="DCTERMS.abstract"]/@content')
        self.add_xpath(
            'summary', 'head/meta[@name="dc.description" or '
            '@name="DC.description"]/@content')
        self.add_xpath(
            'modtime', 'head/meta[@name="dcterms.modified" or '
            '@name="DCTERMS.modified"]/@content')
        self.add_xpath(
            'firstpubtime', 'head/meta[@name="dcterms.created" or '
            '@name="DCTERMS.created"]/@content')
        self.add_xpath(
            'source', 'head/meta[@name="dc.publisher" or '
            '@name="DC.publisher"]/@content')
        # Correct assumption creator==bylines for some docs, not for all.
        # self.add_xpath('bylines',
        #                'head/meta[@name="dc.creator" or '
        #                '@name="DC.creator"]/@content')
        # self.add_xpath('language',
        #                'head/meta[@name="dc.language" or '
        #                '@name="DC.language"]/@content')

    # TODO: def add_rNews():? Similar to the add_schemaorg work (which was
    #       based on it, but featuring different implementation.
    # TODO: def add_hNews():?

    def add_scrapymeta(self, response):
        """Extracts the content passed through meta tags from the Request.
        This is normally metadata from the RSS feed which linked to the
        article, or from Google News sitemaps."""
        if 'originalurl' in response.meta:
            self.add_value('originalurl', response.meta['originalurl'])
        if 'newsmeta' in response.meta:
            for k in response.meta.get('newsmeta'):
                self.add_value(k, response.meta['newsmeta'][k])
        if 'RSSFeed' in response.meta:
            d = response.meta['RSSFeed']
            self.add_value('headline', d.get('title'))
            self.add_value('summary', d.get('description'))
            self.add_value('section', d.get('section'))
            self.add_value('firstpubtime', d.get('pubDate'))
        # Extract (some) non-url parts of each sitemap node and pass in meta
        # tag
        # title = selector.xpath('title/text()').extract_first()
        # if title:
        #     nm['headline'] = title.strip()
        #
        # description = selector.xpath('description/text()').extract_first()
        # if description:
        #     nm['summary'] = description.strip()
        #
        # section = selector.xpath('category/text()').extract_first()
        # if section:
        #     nm['section'] = section.strip()
        #
        # pubdate = selector.xpath('pubDate/text()').extract_first()
        # if pubdate:
        #     nm['firstpubtime'] = pubdate.strip()  # TODO: Maybe should be modtime?
        if 'NewsSitemap' in response.meta:
            d = response.meta['NewsSitemap']
            self.add_value('modtime', d.get('lastmod'))
            if 'news' in d:
                self.add_value('keywords', d['news'].get('keywords'))
                self.add_value('firstpubtime',
                               d['news'].get('publication_date'))
                self.add_value('headline', d['news'].get('title'))
        # if 'lastmod' in d:
        #     self.add_value(nm['modtime'] = d['lastmod'].strip()
        # if 'news' in d:
        #     for k, v in d['news'].items():
        #         if k == 'keywords':
        #             nm['keywords'] = v.strip()
        #         elif k == 'publication_date':
        #             nm['firstpubtime'] = v.strip()
        #         elif k == 'title':
        #             nm['headline'] = v.strip()
        # Record no of previous fetches
        if 'refetchcontrol_previous' in response.meta:
            self.add_value('previousfetches',
                           response.meta.get('refetchcontrol_previous'))

    def add_readability(self, response):
        """Extracts content using readability-lxml. This is non-specific,
        but flexible, and a good fallback."""
        # Don't do the readability parsing (which is comparatively expensive)
        # unless it's needed
        if self.get_output_value('headline') and self.get_output_value(
                'bodytext'):
            return
        readified_doc = readability.readability.Document(response.text)
        if not self.get_output_value('headline'):
            logger.debug(
                f'Using readability fallback for headline: {self.get_output_value("url")}'
            )
            # There is a .title() method, but short_title() strips chaff
            self.add_value('headline', readified_doc.short_title())
        if not self.get_output_value('bodytext'):
            logger.debug(
                f'Using readability fallback for bodytext: {self.get_output_value("url")}'
            )
            reparsed = lxml.html.fromstring(readified_doc.summary())
            self.add_value('bodytext', reparsed.xpath('//body//text()'))
class ChildItemLoader(TestItemLoader):
    # Override the parent's 'url' input processing with `processor`.
    url_in = MapCompose(processor)
class TestItemLoader(ItemLoader):
    # Title-case each 'name' value, then drop its final character.
    name_in = MapCompose(lambda v: v.title(), lambda v: v[:-1])
class SecondHandAdLoader(ItemLoader):
    """
    Generic ad loader: cleans and formats the raw data scraped from the
    webservices.
    The data of each ad is then stored in an 'item' object, waiting for
    further processing.
    """
    default_output_processor = TakeFirst()

    url_in = MapCompose(format_text)
    url_out = Join()
    vendor_in = MapCompose(format_text)
    vendor_out = Join()
    title_in = MapCompose(format_text)
    title_out = Join()
    price_in = MapCompose(format_text, remove_all_spacing,
                          extract_price_value)
    price_out = TakeFirst()
    condition_in = MapCompose(format_text)
    condition_out = Join()
    location_in = MapCompose(format_text)
    location_out = Join()
    postal_code_in = MapCompose(str, remove_special_characters,
                                format_number, int)
    postal_code_out = TakeFirst()
    first_posted_in = MapCompose(format_text, parse_datetime)
    first_posted_out = Join()
    last_updated_in = MapCompose(format_text, parse_datetime)
    last_updated_out = Join()
    description_in = MapCompose(format_text)
    description_out = Join()
    images_in = MapCompose(format_text)
    images_out = Join(', ')
    brand_in = MapCompose(format_text)
    brand_out = Join()
    model_in = MapCompose(format_text)
    model_out = Join()
    make_in = MapCompose(format_text)
    make_out = Join()
    color_in = MapCompose(format_text)
    color_out = Join()
    price_new_in = MapCompose(format_text)
    price_new_out = Join()
    user_rating_in = MapCompose(format_text)
    user_rating_out = Join()
    value_rating_in = MapCompose(format_text)
    value_rating_out = Join()
    leverage_rating_in = MapCompose(format_text)
    leverage_rating_out = Join()

    def _summarize(self, item: dict) -> str:
        """
        Generate an HTML summary of an item, to display in a dashboard.

        Parameters
        ----------
        item: dict.
            The scraped item.

        Returns
        -------
        out: str.
            The corresponding summary.
        """
        # Each row renders as "label: <i>value</i> suffix<br />".
        rows = [
            ('price',
             serialize_html_tag('<i>', str(item.get('price', ''))), '€'),
            ('condition',
             serialize_html_tag('<i>', str(item.get('condition', ''))), ''),
            ('value',
             serialize_html_tag('<i>', str(item.get('value_rating', ''))),
             '/ 10'),
            ('leverage',
             serialize_html_tag('<i>', str(item.get('leverage_rating', ''))),
             '/ 10'),
            # BUG FIX: the 'age' row previously displayed value_rating
            # instead of the computed age in days.
            ('age',
             serialize_html_tag('<i>', str(item.get('age', ''))), 'days'),
            ('url',
             serialize_html_tag(
                 tag='<a>',
                 value=str(self.context.get('domain', 'leboncoin.fr')),
                 attributes={'href': item.get('url', '')}), ''),
        ]
        return ''.join(
            '{}: {} {}<br />'.format(label, value, suffix)
            for label, value, suffix in rows)

    def load_item(self):
        """
        Complete the raw information with computed data.
        """
        item = super(SecondHandAdLoader, self).load_item()
        geolocator = Nominatim(user_agent='adspying')
        # NOTE(review): geocode may return None on lookup failure, which
        # would raise AttributeError below — TODO confirm intended.
        location = geolocator.geocode(
            (str(item.get('postal_code', '69000')) + ', '
             + item.get('location', 'lyon')),
            exactly_one=True)
        # gps coordinates
        item['latitude'] = location.latitude
        item['longitude'] = location.longitude
        # timeline
        # NOTE(review): first_posted is unconditionally replaced by
        # last_updated (falling back to now) — looks deliberate for
        # repost tracking; confirm.
        item['first_posted'] = item.get(
            'last_updated',
            datetime.now().isoformat(sep='T', timespec='seconds'))
        item['age'] = (datetime.now() - datetime.strptime(
            item.get('first_posted',
                     datetime.now().isoformat(sep='T', timespec='seconds')),
            '%Y-%m-%dT%H:%M:%S')).days
        item['reposting_count'] = 0
        item['starting_price'] = item.get('price', 0)
        # vendor
        item['vendor'] = urljoin(self.context.get('base_url', ''),
                                 item.get('vendor', ''))
        # evaluation & sorting depend on the query
        item['value_rating'] = 5  # neutral value
        item['leverage_rating'] = 5  # neutral value
        # map marker
        item['icon'] = self.context.get('icon', 'marker')
        # summary
        item['summary'] = self._summarize(item)
        return item
class ChildItemLoader(TestItemLoader):
    # Lower-case each 'url' value before storage.
    url_in = MapCompose(lambda v: v.lower())
class CustomItemLoader(ItemLoader):
    # Title-case each 'name' value (e.g. "john smith" -> "John Smith").
    name_in = MapCompose(lambda v: v.title())
class ChildChildItemLoader(ChildItemLoader):
    # Upper-case 'url' values; pass 'summary' values through unchanged.
    url_in = MapCompose(lambda v: v.upper())
    summary_in = MapCompose(lambda v: v)
def _get_input_processor(self):
    """Return the processor passed as the ``in`` keyword, if any;
    otherwise fall back to a MapCompose built from the positional args.

    ``pop`` removes the ``in`` entry from ``self.kwargs``, so the
    explicit processor is consumed on first retrieval.
    """
    fallback = MapCompose(*self.args)
    return self.kwargs.pop('in', fallback)
class IdentityDefaultedItemLoader(DefaultedItemLoader):
    # MapCompose() with no functions passes each value through untouched,
    # overriding the inherited default input processor for 'name'.
    name_in = MapCompose()
class BaseNoInputReprocessingLoader(ItemLoader):
    # Upper-case 'title' inputs; keep only the first value on output.
    title_in = MapCompose(str.upper)
    title_out = TakeFirst()
class ChildItemLoader(TestItemLoader):
    # Chain the parent's 'name' processor, then swap character case.
    name_in = MapCompose(TestItemLoader.name_in, str.swapcase)
class DefaultedItemLoader(NameItemLoader):
    # Default for every field: drop the last character of each value.
    default_input_processor = MapCompose(lambda v: v[:-1])
class ChildDefaultedItemLoader(DefaultedItemLoader):
    # For 'name': apply the inherited default processor, then swapcase.
    name_in = MapCompose(DefaultedItemLoader.default_input_processor,
                         str.swapcase)
class Work01Item(scrapy.Item):
    title = scrapy.Field(output_processor=TakeFirst())
    # NOTE(review): `strip` is a bare name, presumably a project-defined
    # processor (a raw str.strip would receive the whole value list) —
    # confirm its definition.
    category = scrapy.Field(input_processor=strip,
                            output_processor=Join(separator=","))
    show_time = scrapy.Field(input_processor=MapCompose(filter_time),
                             output_processor=TakeFirst())
class RealEstateRawLoader(ItemLoader):
    """Cleans raw real-estate listing fields: strings are stripped (and
    mostly lower-cased) on input; multi-fragment fields are joined;
    everything else collapses to its first value via the default output
    processor."""
    default_output_processor = TakeFirst()
    title_in = MapCompose(str.strip, str.capitalize)
    value_in = MapCompose(str.strip, str.lower)
    value_out = Join()
    area_in = MapCompose(str.strip, str.lower)
    area_out = Join()
    address_in = MapCompose(str.strip)
    ward_in = MapCompose(str.strip)
    district_in = MapCompose(str.strip)
    province_in = MapCompose(str.strip)
    type_in = MapCompose(str.strip, str.lower)
    description_in = MapCompose(str.strip)
    description_out = Join('\n')
    # Seller names are normalised to Title Case.
    sellerName_in = MapCompose(str.strip, str.lower, str.title)
    time_in = MapCompose(str.strip, str.lower)
    image_in = MapCompose(str.strip)
    image_out = Join(' ')