class ThreadLoader(XPathItemLoader):
    """Loader for forum-thread items; every field keeps only its first value."""

    default_output_processor = TakeFirst()

    # The numeric thread id is extracted from the thread URL.
    zeta_id_in = MapCompose(unicode.strip, extract_numbers_url)
    user_in = MapCompose(unicode.strip)
    # Counters arrive as text and are coerced to int.
    replies_in = MapCompose(unicode.strip, to_int)
    views_in = MapCompose(unicode.strip, to_int)
    # Posted date: drop the leading label, then parse the long date format.
    date_posted_in = MapCompose(unicode.strip, strip_start_date, to_datetime_long)
class SearchResultPostLoader(ItemLoader):
    """Loader for SearchResultPost items.

    Values pass through untouched by default; each field keeps its first
    collected value on output.
    """

    default_item_class = SearchResultPost
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    def date_in(self, values):
        # Strip surrounding whitespace from every raw date string.
        for value in values:
            yield value.strip()
class UserLoader(XPathItemLoader):
    """Loader for forum-user profile items."""

    default_output_processor = TakeFirst()

    # Numeric identifiers are pulled out of profile URLs.
    zeta_id_in = MapCompose(unicode.strip, extract_numbers_url)
    member_number_in = MapCompose(unicode.strip, extract_numbers_url)
    post_count_in = MapCompose(unicode.strip, to_int)
    # Signatures may span several fragments; concatenate them.
    signature_in = Join()
    # Dates use the short site format.
    date_birthday_in = MapCompose(unicode.strip, to_datetime_short)
    date_joined_in = MapCompose(unicode.strip, to_datetime_short)
class MilkshakeLoader(ItemLoader):
    """Loader for MilkshakeItem: strips markup and whitespace, keeps the
    first value of every field, and joins description fragments."""

    default_item_class = MilkshakeItem
    default_input_processor = MapCompose(remove_tags, lambda text: text.strip())
    default_output_processor = TakeFirst()

    # Description fragments are concatenated into one string.
    description_out = Join()
class BookLoader(ItemLoader):
    """Loader for Book items."""

    # Maps the format icon path on the page to a readable format name.
    format_map = {
        u"/images/adobe_icon.gif": "PDF",
        u"/images/epubDRM_icon.gif": "EPUB",
        u"/images/mobi_icon.gif": "MobiPocket",
    }

    default_item_class = Book
    default_output_processor = TakeFirst()

    author_out = TakeFirst()  # ignore all but the first author
    # Publish dates arrive as "YYYYMM"; store them as date objects.
    publish_date_in = MapCompose(lambda raw: datetime.strptime(raw, "%Y%m").date())
    # NOTE: an icon path missing from format_map raises KeyError here.
    format_in = MapCompose(lambda icon: BookLoader.format_map[icon])
    format_out = Join(", ")
    category_out = Join(", ")
class Product(scrapy.Item):
    """Scraped product: cleaned/joined text fields, first-value prices."""

    title = Field(
        input_processor=MapCompose(clean),
        output_processor=Join(),
    )
    url = Field(output_processor=TakeFirst())
    # Prices are parsed out of raw text; only the first match is kept.
    current_price = Field(
        input_processor=MapCompose(extract_price),
        output_processor=TakeFirst(),
    )
    regular_price = Field(
        input_processor=MapCompose(extract_price),
        output_processor=TakeFirst(),
    )
    availability = Field(output_processor=TakeFirst())
    category_name = Field(
        input_processor=MapCompose(clean),
        output_processor=Join(),
    )
def test_get_value(self):
    """get_value applies an optional regex first, then the processors."""
    loader = NameItemLoader()
    # Processors run in order: TakeFirst picks u'foo', upper() follows.
    self.assertEqual(
        u'FOO', loader.get_value([u'foo', u'bar'], TakeFirst(), unicode.upper))
    # A bare regex extracts the captured group from every value.
    self.assertEqual(
        [u'foo', u'bar'],
        loader.get_value([u'name:foo', u'name:bar'], re=u'name:(.*)$'))
    # Regex extraction happens before the processors are applied.
    self.assertEqual(
        u'foo',
        loader.get_value([u'name:foo', u'name:bar'], TakeFirst(),
                         re=u'name:(.*)$'))
    # add_value stores the processed result under the field name...
    loader.add_value('name', [u'name:foo', u'name:bar'], TakeFirst(),
                     re=u'name:(.*)$')
    self.assertEqual([u'foo'], loader.get_collected_values('name'))
    # ...and replace_value overwrites it.
    loader.replace_value('name', u'name:bar', re=u'name:(.*)$')
    self.assertEqual([u'bar'], loader.get_collected_values('name'))
class TramiteProyectoItemLoader(XPathItemLoader):
    """Loader for legislative-bill procedure (tramite) items.

    All inputs get whitespace normalisation; most fields additionally run a
    domain-specific normaliser. Outputs keep the first value only.
    """

    default_item_class = TramiteProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    proyecto_camara_origen_in = MapCompose(
        fix_space, unicode.strip, normalize_camara)
    proyecto_camara_origen_expediente_in = MapCompose(
        fix_space, unicode.strip, normalize_codigo_expediente)
    camara_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    # Dates are parsed from Spanish text; empty values are tolerated.
    fecha_in = MapCompose(
        fix_space, unicode.strip, partial(spanish_date, allow_empty=True))
    # Serialise the parsed date as an ISO-8601 string.
    fecha_out = Compose(lambda values: values[0].isoformat())
    index_in = MapCompose(digits_only)
class AppleItem(CrawledItem):
    """Crawled Apple application record."""

    icon_path = Field()
    images_path = Field()
    last_crawl = Field()
    # Price text is stripped of whitespace; first value wins, default 0.
    price = Field(
        default=0,
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    app_id = Field()
    apple_id = Field()
class AppItem(Item):
    """App-store listing item: HTML entities removed, text stripped, and
    multi-fragment fields joined into a single string."""

    app_id = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=TakeFirst(),
    )
    app_type = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    title = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    description = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    score = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    author = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    icon_url = Field(output_processor=TakeFirst())
    similarity = Field()
    more_from_devs = Field()
class GalleryItemLoader(ItemLoader):
    """Loader for gallery items; text fields are title-cased on input."""

    default_output_processor = TakeFirst()

    title_in = MapCompose(unicode.title)
    title_out = Join()
    # BUG FIX: this was declared as `poster = MapCompose(...)` — without the
    # `_in` suffix ItemLoader never applied the processor to the field.
    poster_in = MapCompose(unicode.title)
    # NOTE(review): title-casing a URL mangles case-sensitive paths —
    # presumably intentional for this site, but worth confirming.
    url_in = MapCompose(unicode.title)
class WeiboComItemLoader(ItemLoader):
    """Loader for weibo.com post items."""

    default_item_class = WeiboComItem
    default_input_processor = MapCompose(lambda text: text.strip())
    default_output_processor = TakeFirst()

    description_out = Join()
    # Post body: collapse blank runs, then strip HTML markup.
    content_in = MapCompose(fo.removeBlankStr, fo.filterHtml)
    # Counters are parsed out of their label text.
    comments_in = MapCompose(fo.getNum)
    repost_in = MapCompose(fo.getNum)
def _convert(data):
    # Convert a raw extracted value according to the conversion type `t` and
    # options dict `inf`, both taken from the enclosing scope.
    # Unless the conversion is 'join' or 'list', a list of values is first
    # collapsed to its first element.
    if t not in ['join', 'list'] and isinstance(data, list):
        data = TakeFirst()(data)
    # Normalise scalars: strip strings, stringify numbers/datetimes.
    # NOTE(review): any other type (including the list kept for
    # 'join'/'list') returns here unconverted — confirm that is intended.
    if type(data) in [str, unicode]:
        data = data.strip()
    elif type(data) in [int, float, datetime]:
        data = str(data)
    else:
        return data
    if t == 'join':
        # Join with a configurable separator (default: single space).
        sep = inf.get('sep', u' ')
        return Join(sep)(data)
    elif t == 'list':
        # Join, then strip HTML tags and whitespace.
        sep = inf.get('sep', u' ')
        return remove_tags(Join(sep)(data)).strip()
    elif t == 'text':
        return remove_tags(data).strip()
    elif t == 'clean':
        # Sanitise HTML: drop styles, scripts, links and meta tags.
        cleaner = Cleaner(style=True, scripts=True, javascript=True,
                          links=True, meta=True)
        return cleaner.clean_html(data)
    elif t == 'unesc':
        # Unescape HTML entities.
        return HTMLParser().unescape(data)
    elif t == 'base64':
        return base64.decodestring(data)
    elif t == 'sub':
        # Regex substitution using 'from'/'to' options.
        frm = inf.get('from')
        to = inf.get('to')
        return re.sub(frm, to, data)
    elif t == 'jpath':
        # JSONPath query against the value parsed as JSON.
        qs = inf.get('query')
        return jsonpath.jsonpath(json.loads(data), qs)
    elif t == 'map':
        # Table lookup with an optional default.
        m = inf.get('map')
        d = inf.get('default')
        return m.get(data, d)
    elif t == 'int':
        # Via float so strings like "3.0" still convert.
        return int(float(data))
    elif t == 'float':
        return float(data)
    elif t == 'date':
        # Parse with optional format and timezone offset (default UTC).
        fmt = inf.get('fmt', 'auto')
        tz = inf.get('tz', '+00:00')
        return parse_date(data, fmt, tz)
    elif t == 'cst':
        # China Standard Time shortcut (+08:00).
        fmt = inf.get('fmt', 'auto')
        return parse_date(data, fmt, '+08:00')
    else:
        # Unknown conversion type: pass the value through unchanged.
        return data
class ReviewLoader(XPathItemLoader):
    """Loader for product-review items.

    Every input is first coerced to unicode; most fields then strip
    whitespace and apply a field-specific extractor.
    """

    # The date_format keyword is passed through the loader context.
    date_in = MapCompose(unicode, unicode.strip, extract_date,
                         date_format='%d/%m/%Y')
    date_out = TakeFirst()

    rating_in = MapCompose(unicode, extract_rating)
    rating_out = TakeFirst()

    # Review text may arrive in fragments; join them after cleaning.
    full_text_in = MapCompose(unicode, unicode.strip, remove_entities)
    full_text_out = Join()

    url_in = MapCompose(unicode, unicode.strip)
    url_out = TakeFirst()

    product_url_in = MapCompose(unicode, unicode.strip)
    product_url_out = TakeFirst()

    # SKUs are normalised to lowercase.
    sku_in = MapCompose(unicode, unicode.strip, unicode.lower)
    sku_out = TakeFirst()
class Cogis_spillItemLoader(ItemLoader):
    """Loader for COGIS spill records.

    Each field collapses to its first collected value; text, links and
    dates each get a dedicated extractor.
    """

    default_input_processor = Compose(TakeFirst(), extract_text)
    default_output_processor = TakeFirst()

    doc_href_in = Compose(TakeFirst(), extract_link)
    date_in = Compose(TakeFirst(), extract_date)
    # County columns are already plain values; no extraction needed.
    county_code_in = TakeFirst()
    county_name_in = TakeFirst()
class StreamItem(Item):
    """Scrapy Item definition for a streamitem.

    All scalar fields keep only their first collected value;
    redirect_urls stays a list.
    """

    url = Field(output_processor=TakeFirst())
    body = Field(output_processor=TakeFirst())
    source_url = Field(output_processor=TakeFirst())
    redirect_urls = Field()
    http_status = Field(output_processor=TakeFirst())
    content_type = Field(output_processor=TakeFirst())
    response_size = Field(output_processor=TakeFirst())
    metadata = Field(output_processor=TakeFirst())
class DoctorItem(Item):
    """Doctor profile scraped from a hospital listing."""

    _name = Field(output_processor=TakeFirst())
    hospital = Field(output_processor=TakeFirst())
    specialty = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    # NOTE: the misspelled field name is kept — it is part of the item's
    # public schema and renaming it would break downstream consumers.
    acadamicDegree = Field(output_processor=TakeFirst())
    shortDesc = Field(
        input_processor=MapCompose(lambda text: text.strip()),
        output_processor=TakeFirst(),
    )
    clinicTime = Field(output_processor=TakeFirst())
class ResultsItemLoader(ItemLoader):
    """Loader for horse-racing result items.

    The default output pipeline takes the first collected value, coerces it
    to unicode and strips it. Most fields additionally remove stray unicode
    characters; money fields are then normalised to a money string.
    """

    default_item_class = ResultsItem
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)

    # Shared pipelines (class-namespace helpers reused below; processors
    # are stateless, so sharing instances is safe).
    _clean = Compose(default_output_processor, removeunichars)
    _money = Compose(default_output_processor, removeunichars, tidytomoney)

    pm1_out = _money
    pm2_out = _money
    pm3_out = _money
    pm4_out = _money
    pm5_out = _money
    prizemoney_out = _money
    # NOTE(review): an output-style pipeline used as an *input* processor —
    # it collapses all collected values to the first. Presumably
    # intentional; confirm before changing.
    prizemoney_in = _money

    racename_out = _clean
    gear_out = _clean
    OR_out = _clean
    TS_out = _clean
    RPR_out = _clean
    damsire_out = Compose(default_output_processor, removeunichars,
                          cleandamsire)
    jockeyname_out = _clean
    trainername_out = _clean
    sire_out = _clean
    dam_out = _clean
    horsename_out = _clean

    # BUG FIX: these six were declared without the `_out` suffix
    # (e.g. `L1racedate = Compose(...)`), so ItemLoader never applied them
    # to the L*racedate fields; their L*comment_out siblings show the
    # intended naming.
    L1racedate_out = _clean
    L2racedate_out = _clean
    L3racedate_out = _clean
    L4racedate_out = _clean
    L5racedate_out = _clean
    L6racedate_out = _clean

    L1comment_out = _clean
    L2comment_out = _clean
    L3comment_out = _clean
    L4comment_out = _clean
    L5comment_out = _clean
    L6comment_out = _clean

    # Odds are converted to decimal form.
    currentodds_out = Compose(default_output_processor, decimalizeodds)
    # Pass the first collected value through unchanged.
    horse_out = Compose(TakeFirst(), Identity())
    horse_in = Compose(TakeFirst(), Identity())
def _get_processors(self, procs_str):
    """Build the processor pipeline for a field.

    The pipeline always starts with TakeFirst and string_strip. Extra
    processor names come from `procs_str` as a comma-separated list and
    are resolved against the `processors` module; unknown names are
    logged at ERROR level and skipped.

    NOTE(review): when `procs_str` is falsy this returns a *list*, while
    the populated path returns a *tuple* — preserved for compatibility,
    but callers should not rely on the concrete type.
    """
    procs = [TakeFirst(), processors.string_strip]
    if not procs_str:
        return procs
    # FIX: removed the redundant list() wrapper — str.split already
    # returns a list.
    for name in procs_str.split(','):
        name = name.strip()
        if hasattr(processors, name):
            procs.append(getattr(processors, name))
        else:
            self.log("Processor '%s' is not defined!" % name, log.ERROR)
    return tuple(procs)
class FirmaProyectoItemLoader(XPathItemLoader):
    """Loader for bill-signatory (firma) items.

    Inputs get whitespace normalisation plus field-specific normalisers;
    outputs keep the first value only.
    """

    default_item_class = FirmaProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    proyecto_camara_origen_in = MapCompose(
        fix_space, unicode.strip, normalize_camara)
    proyecto_camara_origen_expediente_in = MapCompose(
        fix_space, unicode.strip, normalize_codigo_expediente)
    # Personal names get capitalisation fixes instead of a plain strip.
    firmante_nombre_in = MapCompose(fix_space, format_personal_name)
    firmante_apellido_in = MapCompose(fix_space, format_personal_name)
    # District/bloc names may legitimately be empty.
    firmante_distrito_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_distrito_name, allow_empty=True))
    firmante_special_in = MapCompose(
        fix_space, unicode.strip, normalize_firmante_special)
    firmante_poder_in = MapCompose(fix_space, unicode.strip, normalize_poder)
    firmante_bloque_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_bloque_name, allow_empty=True))
class ArticleLoader(XPathItemLoader):
    # Loader that builds an ArticleItem from a response using the XPath
    # rules supplied in `search`, with optional duplicate checking.

    # Related loader classes, overridable in subclasses.
    category_loader = CategoryLoader
    tagline_loader = TaglineLoader

    url_out = Join()
    body_text_out = Compose(Join(), TextTool(normalize=True))
    # ParseDate patches the Russian month name so strptime-style parsing
    # works on this site's dates.
    date_of_out = Compose(TakeFirst(), ParseDate(patch_table={u"мая": "May"}))
    headline_out = Compose(Join(), TextTool(normalize=True))
    #image_urls_out = FullUrl()

    def __init__(self, search, response, check_exists=False):
        # search: mapping of item field -> xpath expression.
        # check_exists: when True, skip articles already stored.
        self.search = search
        self.response = response
        self.check_exists = check_exists
        super(ArticleLoader, self).__init__(item=ArticleItem(),
                                            response=self.response)

    def load_item(self):
        # Populate fields from the search rules, then build the item.
        # Returns None when the article already exists (if check_exists)
        # or when no headline was extracted.
        self._configure_main_rules()
        if self.check_exists and ArticleChecker(
                url=self.response.url,
                headline=self.get_output_value('headline')).exists():
            return
        result = super(ArticleLoader, self).load_item()
        return result if result['headline'] else None

    def _configure_main_rules(self):
        # Always record the response URL, then apply every field rule;
        # fields unknown to the item raise KeyError and are skipped.
        self.add_value('url', self.response.url)
        for field, xpath in self.search.iteritems():
            try:
                self.add_xpath(field, xpath)
            except KeyError:
                continue

    def _load_category_item(self):
        # Load the category sub-item; missing or empty category rules are
        # silently skipped.
        try:
            category_loader = self.category_loader(self.search, self.response)
            category = category_loader.load_item()
            if not (category and category['name']):
                raise ValueError()
        except (KeyError, ValueError):
            pass  # no category rule for this search — skip silently
        else:
            self._setup_text_processing(category)
            self.item["categories"].append(category)

    def _setup_text_processing(self, category):
        # Configure the body-text processor to cut the category name out
        # of the article body (mutates the shared class-level processor).
        self.body_text_out.cut_substring = category['name']
class EventLoader(ItemLoader):
    """Loader for sports-event items."""

    # Applied when a field does not declare its own processors.
    default_input_processor = Strip()
    default_output_processor = TakeFirst()

    teams_in = MapCompose(unicode.strip, unicode.title)
    teams_out = Identity()  # keep the full list, skip the default

    # Normalise the raw date string via a parse/format round-trip.
    dateTime_in = Compose(take_first, parse_str2date, parse_date2str)
    # dateTime_out = MapCompose(parse_date2str)

    markets_in = MapCompose(strip_mkt_name, strip_odds, convert_odds,
                            format_runners)
    markets_out = Identity()  # keep the full list, skip the default
class YahooQuestion(Item):
    """Yahoo! Answers question scraped from a question page."""

    question_id = Field(output_processor=Join())
    question_url = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    asker = Field(output_processor=TakeFirst())
    asking_date = Field(output_processor=Join())
    # Answer count is parsed out of its label text.
    number_of_answers = Field(
        input_processor=MapCompose(remove_entities, get_answers_number),
        output_processor=TakeFirst(),
    )
    number_of_interesting_marks = Field(
        output_processor=Compose(Join(), get_number_from_string),
    )
    status = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    import_date = Field(output_processor=Join())
    question_user = Field(output_processor=TakeFirst())
    question_title = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    question_content = Field(
        input_processor=MapCompose(remove_entities, unicode.strip),
        output_processor=Join(),
    )
    # Drop the "Home" breadcrumb from the category trail.
    category = Field(input_processor=MapCompose(filter_home))
def parse_product(self, response):
    # Parse a single product page into a Product item.
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name_xpath = '//div[@class="productpagetitlewrap"]/h1/text()'
    # NOTE(review): .pop() raises IndexError if the title is missing —
    # assumes every product page has this <h1>.
    name = hxs.select(name_xpath).extract().pop().strip()
    # Stock detection: an "Add to Cart" button means in stock (quantity
    # left unknown -> None); otherwise treat the product as out of stock.
    quantity = hxs.select(
        '//form/input[@class="AddToCartButton"]/@value').extract()
    if quantity and "Add to Cart" in quantity.pop():
        quantity = None
    else:
        quantity = 0
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    # Resolve the zoom image src against the page base URL.
    loader.add_xpath(
        'image_url',
        '//div[@id="imageWrapperRpt"]/a[@id="zoom1"]/img[@id="zoom2"]/@src',
        Compose(lambda v: urljoin(base_url, v[0])))
    # Extract just the numeric part of the displayed price.
    loader.add_xpath('price',
                     '//div[@class="productpagerightpriceswrap"]/b/text()',
                     TakeFirst(), re="([.0-9]+)")
    loader.add_xpath('category', '//meta[@name="description"]/@content')
    # SKU: first run of three or more digits in the product name.
    loader.add_value('sku', name, re='(\d\d\d+)')
    loader.add_xpath(
        'identifier',
        '//form//input[@type="hidden" and @name="ProductID"]/@value',
        TakeFirst())
    if quantity == 0:
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse(self, response):
    # Extract PDF search results from a results page and optionally
    # download the linked files.
    item = PdfItem()
    loader = ItemLoader(response=response)
    # Results are marked with a literal "[PDF]" label; the link follows it.
    pdf_path = '//*[contains(text(), "[PDF]")]'
    pdf_url_path = '%s//following-sibling::*' % pdf_path
    item['url'] = loader.get_xpath('%s' % pdf_url_path)
    item['title'] = loader.get_xpath('%s/text()' % pdf_url_path, TakeFirst())
    # The snippet lives two levels up in the result block.
    summary_path = '%s//parent::*//parent::*/*[@class="s"]/*' % pdf_url_path
    description_path = '%s/*[@class="st"]/*' % summary_path
    item['description'] = loader.get_xpath(
        '%s/text()|%s/*/text()' % (description_path, description_path))
    similar_path = ('%s/*[contains(@class, "f")]'
                    '//a[contains(@href, "q=related:")]' % summary_path)
    # Crawling of "related" results and pagination is currently disabled.
    # similar_url = loader.get_xpath('%s/@href' % similar_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, similar_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    #
    # next_path = '//*[@class="pn"]'
    # next_url = loader.get_xpath('%s/@href' % next_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, next_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    pdf_url = item['url']
    print item
    if pdf_url:
        # Save under DOWNLOAD_DIR/<search term>/<original filename>.
        pdf_filename = os.path.basename(pdf_url)
        pdf_filepath = '%s/%s/%s' % (DOWNLOAD_DIR, SEARCH_TERM, pdf_filename)
        if self.download_files:
            self.download_file(pdf_url, pdf_filepath, response.url)
    yield item
class ActaVotacionItemLoader(XPathItemLoader):
    """Loader for voting-record (acta de votacion) items.

    Inputs get whitespace normalisation plus field-specific normalisers;
    outputs keep the first value only.
    """

    default_item_class = ActaVotacionItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    camara_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    tipo_in = MapCompose(fix_space, unicode.strip, normalize_votacion_tipo)
    resultado_in = MapCompose(
        fix_space, unicode.strip, normalize_votacion_resultado)
    # Session dates are Spanish text; empty values are tolerated.
    reunion_fecha_in = MapCompose(
        fix_space, unicode.strip, partial(spanish_date, allow_empty=True))
    # Serialise the parsed date as an ISO-8601 string.
    reunion_fecha_out = Compose(lambda values: values[0].isoformat())
    sesion_tipo_in = MapCompose(
        fix_space, unicode.strip, normalize_sesion_tipo)
    # Numeric fields keep digits only.
    sesion_numero_in = MapCompose(digits_only)
    reunion_numero_in = MapCompose(digits_only)
    year_inicio_in = MapCompose(digits_only)
    year_fin_in = MapCompose(digits_only)
class LegisladorItemLoader(XPathItemLoader):
    """Loader for legislator items.

    Personal names get capitalisation fixes; chamber, district and bloc
    names are normalised; mandate dates are parsed from Spanish text and
    serialised as ISO-8601 strings.
    """

    default_item_class = LegisladorItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    apellido_in = MapCompose(fix_space, format_personal_name)
    nombre_in = MapCompose(fix_space, format_personal_name)
    camara_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    distrito_nombre_in = MapCompose(
        fix_space, unicode.strip, normalize_distrito_name)
    bloque_nombre_in = MapCompose(
        fix_space, unicode.strip, normalize_bloque_name)
    mandato_inicio_in = MapCompose(fix_space, unicode.strip, spanish_date)
    mandato_inicio_out = Compose(lambda values: values[0].isoformat())
    mandato_fin_in = MapCompose(fix_space, unicode.strip, spanish_date)
    mandato_fin_out = Compose(lambda values: values[0].isoformat())
def parse(self, response):
    # Scan a VK group wall page for posts containing spam words, yield
    # matching posts (and their replies) as items, then request the next
    # page of posts.
    sel = response.xpath('.//*[@class="post_info"]')
    if not sel:
        self.log('posts are not find')
        return
    # Group id is taken from the followers link; assumes it is present.
    self.group_id = response.xpath(
        './/div[@id="group_followers"]/a/@href').re('group.=(\d+?)$')[0]
    for s in sel:
        wall_text = s.xpath('div[@class="wall_text"]')
        text = wall_text.xpath(
            'div/div[@class="wall_post_text"]').extract()
        spam_words = get_spam_words_from_msg(text,
                                             self.spam_words_from_file)
        if spam_words:
            l = ItemLoader(item=VkItem(), selector=s, response=response)
            date = s.xpath(
                'div[@class="replies"]/div/small/a[1]/span/text()'
            ).extract()
            date = l.get_value(date, MapCompose(normalize_date),
                               TakeFirst())
            # Posts are newest-first: once we pass the cutoff date, stop
            # the whole crawl of this wall.
            if is_date_less_last_date(date, self.days_count_to_parse):
                return
            l.add_value('id',
                        wall_text.xpath('div/a/@data-from-id').extract())
            l.add_value('name', wall_text.xpath('div/a/text()').extract())
            l.add_value('text', text)
            l.add_value('date', date)
            l.add_value('words', spam_words)
            yield l.load_item()
            #ban => Request()
        # Replies may be collapsed behind an expander link; fetch them via
        # a separate request, otherwise parse the inline reply tables.
        replies_hidden = s.xpath('.//a[@class="wr_header"]/@onclick')
        if replies_hidden:
            url = get_url_hided_replies(replies_hidden[0].extract(),
                                        self.main_page)
            yield Request(url=url, callback=self.get_hided_items)
        else:
            replies = s.xpath('.//div[@class="reply_table"]').extract()
            for reply in replies:
                # Re-wrap each reply fragment as its own HtmlResponse so
                # the shared reply parser can run on it.
                raw_html = ''.join(reply.splitlines()).encode('utf-8')
                html_response = HtmlResponse(url=response.url,
                                             body=raw_html)
                for i in self.get_replies_items(html_response):
                    yield i.load_item()
    # Paginate: POST for the next batch of wall posts.
    yield Request(url=self.get_next_msgs_url(), method='POST',
                  callback=self.parse,
                  body=self.get_post_body_for_next_msgs())
def parse_question(self, response):
    """Extract one question page into an ArabiaQuestionItem."""
    loader = ItemLoader(item=ArabiaQuestionItem(), response=response)
    # Every field keeps only its first extracted value.
    loader.default_output_processor = TakeFirst()
    loader.add_xpath('id', '//*[@id="question_id"]/@value', MapCompose(int))
    loader.add_xpath('asker_username', '//*[@class="question_meta"]/a/text()')
    loader.add_xpath('answerer_username',
                     '//*[@class="inblock username"]/text()')
    loader.add_xpath('title', '//*[@class="question_title"]/h2/text()')
    loader.add_xpath('date', '//*[@class="question_date"]/text()')
    # The question body may span several child nodes; join with newlines.
    loader.add_xpath('content', '//*[@id="question_answer"]/*', Join('\n'))
    loader.add_value('url', response.url)
    loader.add_value('item', 'question')
    yield loader.load_item()
class AppInfoItemLoader(ItemLoader):
    """Loader for AppInfoItem records."""

    default_item_class = AppInfoItem
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    # List-valued fields keep all collected values.
    screenshots_out = Identity()
    tags_out = Identity()
    permissions_out = Identity()
    # Multi-fragment text fields are joined with explicit separators.
    intro_out = Join('<br>')
    permissions_str_out = Join(';')
    # The instance value is passed through untouched on input.
    instance_in = Identity()