Example #1
0
class ParkLoader(ItemLoader):
    """Item loader that populates ``Park`` items.

    Fields collapse to a single value through ``TakeFirst`` unless they are
    listed below as multi-valued.
    """

    default_item_class = Park

    # Applied to every field that does not declare its own ``*_out``.
    default_output_processor = TakeFirst()

    # Multi-valued fields: the collected list is passed through unchanged.
    features_out = Identity()
    description_out = Identity()
    images_out = Identity()
Example #2
0
class ArticleLoader(XPathItemLoader):
    """
    Used for easier construction of ArticleItem.

    The helper functions below are plain functions evaluated in the class
    body; they are only meant to be used as processor steps.
    """
    def is_string(string):
        """Return ``string`` stripped of surrounding whitespace.

        Non-string values and blank strings yield None, which MapCompose
        drops from the processed values.
        """
        if isinstance(string, basestring):
            stripped = string.strip()
            if stripped:
                return stripped
        return None

    def title_case(string):
        """Title-case ``string``.

        Dispatches on the value itself so that both ``str`` and ``unicode``
        work; the unbound ``unicode.title`` raises TypeError for ``str``,
        which ``is_string`` is allowed to return.
        """
        return string.title()

    def separate_tags(tags_string):
        """Split a ``,`` / ``;`` separated string into individual tags.

        Each tag is stripped and empty entries are discarded, so input such
        as ``"a; b,,c"`` yields ``["a", "b", "c"]``.
        """
        candidates = tags_string.replace(";", ",").split(",")
        return [tag.strip() for tag in candidates if tag.strip()]

    default_input_processor = MapCompose(is_string)
    default_output_processor = TakeFirst()

    publishers_in = MapCompose(is_string)
    publishers_out = Identity()

    title_in = MapCompose(is_string, title_case)
    title_out = TakeFirst()

    time_published_in = MapCompose(is_string)
    time_published_out = Identity()

    summary_in = MapCompose(is_string)
    summary_out = TakeFirst()

    tags_in = MapCompose(is_string, separate_tags)
    tags_out = Identity()
Example #3
0
class DefaultItemLoader(ItemLoader):
    """Loader whose defaults suit plain text fields.

    Input values are converted with ``unicode`` and stripped; output is the
    full list of collected values.
    """

    default_output_processor = Identity()
    # Good for string input.
    default_input_processor = MapCompose(unicode, unicode.strip)

    # postinfo bypasses the text conversion entirely.
    postinfo_in = Identity()
    postinfo_out = Identity()

    # Image URLs are passed through ``canonicalize`` instead.
    image_urls_in = MapCompose(canonicalize)
    image_urls_out = Identity()
Example #4
0
class ActivityItemLoader(XPathItemLoader):
    """Loader for activity items.

    Fields collapse to their first value by default; list-like fields are
    declared with ``Identity`` so the whole list is kept.
    """
    default_output_processor = TakeFirst()

    name_out = TakeFirst()
    categories_out = Identity()
    address_in = MapCompose(unicode.strip)
    address_out = JoinAddress()
    description_in = MapCompose(replace_nbrs)
    description_out = Join('\n')
    time_needed_out = TakeFirst()
    price_out = TakeFirst()
    image_urls_out = Identity()
    # Scrapy only picks up processors named ``<field>_in`` / ``<field>_out``;
    # the previous bare ``images`` attribute was never used as a processor.
    images_out = Identity()
Example #5
0
class EventLoader(ItemLoader):
    """Loader for event items (teams, date/time and markets)."""

    # Fallbacks, used by any field that does not specify its own processor.
    default_output_processor = TakeFirst()
    default_input_processor = Strip()

    # No dateTime_out is declared, so the default output processor applies.
    dateTime_in = Compose(take_first, parse_str2date, parse_date2str)

    teams_in = MapCompose(unicode.strip, unicode.title)
    teams_out = Identity()  # don't apply default

    markets_in = MapCompose(
        strip_mkt_name,
        strip_odds,
        convert_odds,
        format_runners,
    )
    markets_out = Identity()  # don't apply default
Example #6
0
class CategoryItemLoader(ItemLoader):
    """Loader for category items.

    The helper functions are plain functions evaluated in the class body and
    used as MapCompose steps. Each returns None when nothing can be
    extracted, which makes MapCompose drop the value.
    """

    def catIDfromURL(url):
        """Return the numeric category ID at the end of ``url``, or None.

        The ``CatId=`` key is matched in any of the letter cases the pattern
        allows, and the digits are captured directly so the prefix never
        reaches ``int``.
        """
        # TODO: move to a utility file, because this is also used in
        # td_spider.py
        match = re.search(r'[Cc]at[Ii]d=([0-9]+)$', url)
        if match:
            return int(match.group(1))
        return None

    def catLevelfromURL(url):
        """Return 1 for a top-level category URL, 2 for a second-level one.

        Returns None when the URL carries no ``category_tlc`` /
        ``category_slc`` marker.
        """
        # TODO: move to a utility file, because this is also used in
        # td_spider.py
        match = re.search(r'category_([st])lc', url)
        if match:
            # 't' = top level (1), 's' = second level (2)
            return {'t': 1, 's': 2}[match.group(1)]
        return None

    def linkToMfgID(link):
        """Return the manufacturer ID found in ``link``'s onclick attribute.

        ``link`` must offer ``xpath(...).extract()``. Returns None when there
        is no onclick attribute or it holds no ``MfrId=<digits>"`` fragment.
        """
        onclick = link.xpath("@onclick").extract()
        if onclick:
            match = re.search(r'[Mm]fr[Ii]d=([0-9]+)"', onclick[0])
            if match:
                return int(match.group(1))
        return None

    default_input_processor = Identity()
    default_output_processor = Join()

    categoryName_in = TakeFirst()
    categoryName_out = Join()

    tdCategoryID_in = MapCompose(catIDfromURL)
    tdCategoryID_out = TakeFirst()

    tdCategoryParent_in = Identity()
    tdCategoryParent_out = Identity()

    tdCategoryLevel_in = MapCompose(catLevelfromURL)
    tdCategoryLevel_out = TakeFirst()

    manufacturers_in = MapCompose(linkToMfgID)
    manufacturers_out = Identity()
Example #7
0
class MfgItemLoader(ItemLoader):
    """Loader for manufacturer items (name and ID).

    The helper functions are plain functions evaluated in the class body and
    used as MapCompose steps.
    """

    def cleanString(string):
        """Drop every character outside the whitelist in the pattern."""
        cleanQuery = re.compile('[A-Za-z0-9 .",\'!-]')
        return ''.join(re.findall(cleanQuery, string))

    def linkToMfgID(link):
        """Return the manufacturer ID from ``link``'s onclick attribute.

        ``link`` must offer ``xpath(...).extract()``. The ID is returned as
        a UTF-8 encoded string; None is returned when there is no onclick
        attribute or it holds no ``MfrId=<digits>"`` fragment. The digits are
        captured directly so the key is stripped whatever its letter case.
        """
        onclick = link.xpath("@onclick").extract()
        if onclick:
            match = re.search(r'[Mm]fr[Ii]d=([0-9]+)"', onclick[0])
            if match:
                return match.group(1).encode('utf-8')
        return None

    def parsemfgName(link):
        """Return ``link`` UTF-8 encoded and stripped of outer whitespace."""
        return link.encode('utf-8').strip()

    default_input_processor = Identity()
    default_output_processor = Join()

    mfgName_in = MapCompose(parsemfgName, cleanString)
    mfgName_out = Join()

    mfgID_in = MapCompose(linkToMfgID)
    mfgID_out = Join()
Example #8
0
class PriceItemLoader(ItemLoader):
    """Loader for price items.

    The helper functions are plain functions evaluated in the class body and
    used as MapCompose steps.
    """

    def cleanString(string):
        """Drop every character outside the whitelist in the pattern."""
        cleanQuery = re.compile('[A-Za-z0-9 .",\'!-]')
        return ''.join(re.findall(cleanQuery, string))

    def parseSalePrice(priceIn):
        """Return ``priceIn`` without spaces or dollar signs."""
        return priceIn.strip().replace(" ", "").replace("$", "")

    def parseRebateAmount(rebateIn):
        """Return ``rebateIn`` without line breaks, spaces or dollar signs."""
        priceRebate = rebateIn.strip()
        for unwanted in ("\n", "\r", " ", "$"):
            priceRebate = priceRebate.replace(unwanted, "")
        return priceRebate

    def parseFinalPrice(priceIn):
        """Return only the digits and dots of ``priceIn``.

        Returns None when ``priceIn`` contains neither, so MapCompose drops
        the value instead of the helper failing on an unbound local.
        """
        price = re.findall(r'[0-9.]', priceIn)
        if price:
            return ''.join(price)
        return None

    default_input_processor = Identity()
    default_output_processor = Join()

    salePrice_in = MapCompose(parseSalePrice)
    salePrice_out = Join()

    finalPrice_in = MapCompose(parseFinalPrice)
    finalPrice_out = Join()
Example #9
0
class DaywatchLoader(ItemLoader):
    """Base ItemLoader for the project.

    Subclass it to add custom handling for particular fields of the project
    item model. By default input passes through unchanged and each field
    keeps its first value.
    """

    default_output_processor = TakeFirst()
    default_input_processor = Identity()
Example #10
0
class PostLoader(ItemLoader):
    """Loader for ``Post`` items with custom input handling per field."""

    default_item_class = Post
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    def ctime_in(self, values):
        """Yield each timestamp converted to UTC, formatted by TIME_FORMAT.

        Input strings look like ``YYYY-MM-DD HH:MM`` and are read as
        America/Anguilla local time (UTC -4).
        """
        source_tz = pytz.timezone('America/Anguilla')  # UTC -4
        for s in values:
            naive = datetime.strptime(s.strip(), '%Y-%m-%d %H:%M')
            # pytz zones must be attached with localize(); passing them to
            # datetime.replace(tzinfo=...) picks the zone's historical LMT
            # offset and skews the result by several minutes.
            yield (source_tz.localize(naive)
                   .astimezone(pytz.utc).strftime(TIME_FORMAT))

    def cover_uri_in(self, values):
        """Yield a cover URL for each value that carries one.

        Values starting with ``http://`` pass through. Otherwise the value
        is split on ``~`` and, when it starts with ``init`` and has at least
        four parts, a URL is built from parts 1 and 2. Anything else is
        dropped.
        """
        for s in values:
            if s.startswith('http://'):
                yield s
            else:
                parts = s.split('~')
                # title may contain ~
                if len(parts) >= 4 and parts[0] == 'init':
                    yield 'http://%s/%s' % (parts[1], parts[2])

    def rating_in(self, values):
        """Yield ``parse_rating`` applied to each value."""
        for s in values:
            yield parse_rating(s)
Example #11
0
class SpeakerLoader(ItemLoader):
    """Loader for ``Speaker`` items.

    By default each value has tags removed, markup unquoted and whitespace
    stripped, and the values of a field are joined into one string.
    """

    default_item_class = Speaker

    default_output_processor = Join()
    default_input_processor = MapCompose(
        remove_tags,
        unquote_markup,
        unicode.strip,
    )

    # The joined name gets an extra clean-up pass.
    name_out = Compose(Join(), _cleanup_name)
    # Image URLs stay a list.
    image_urls_out = Identity()
Example #12
0
class AppInfoItemLoader(ItemLoader):
    """Loader for ``AppInfoItem``.

    Values are stripped on input and each field keeps its first value unless
    overridden below.
    """

    default_item_class = AppInfoItem

    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    # Fields that keep the whole list of values.
    screenshots_out = Identity()
    tags_out = Identity()
    permissions_out = Identity()

    # Fields joined into a single string with a custom separator.
    intro_out = Join('<br>')
    permissions_str_out = Join(';')

    # instance skips the default stripping step.
    instance_in = Identity()
Example #13
0
class SearchResultPostLoader(ItemLoader):
    """Loader for ``SearchResultPost`` items."""

    default_item_class = SearchResultPost
    default_output_processor = TakeFirst()
    default_input_processor = Identity()

    def date_in(self, values):
        """Yield every date value with surrounding whitespace removed."""
        for raw_date in values:
            trimmed = raw_date.strip()
            yield trimmed
Example #14
0
class PostLoader(ItemLoader):
    """Loader for ``Post`` items."""

    default_item_class = Post
    default_output_processor = TakeFirst()
    default_input_processor = Identity()

    def ctime_in(self, values):
        """Yield every value with its first five characters cut off."""
        prefix_length = 5
        for raw_ctime in values:
            yield raw_ctime[prefix_length:]
Example #15
0
class DealLoaderBase(XPathItemLoader):
    """Base loader for deal items.

    Values are stripped on input and joined on output unless a field
    overrides that below.
    """

    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = Join()

    # Text fields.
    title_in = MapCompose(unicode.strip)
    description1_in = MapCompose(unicode.strip)
    description1_out = Join('\n')
    description2_in = MapCompose(unicode.strip)
    description2_out = Join('\n')
    validity_in = MapCompose(unicode.strip, sanitize_validity)

    # Countdown fields, passed through strip_alpha.
    days_in = MapCompose(strip_alpha)
    hours_in = MapCompose(strip_alpha)
    minutes_in = MapCompose(strip_alpha)

    # Pricing and counter fields, passed through strip_alpha.
    initial_price_in = MapCompose(strip_alpha)
    sell_price_in = MapCompose(strip_alpha)
    discount_in = MapCompose(strip_alpha)
    saving_in = MapCompose(strip_alpha)
    nbr_buyers_in = MapCompose(strip_alpha)

    # List-valued fields keep every collected value.
    cities_in = MapCompose(unicode.strip)
    cities_out = Identity()
    image_urls_out = Identity()
Example #16
0
class HistDataItem(scrapy.Item):
    """Item whose fields declare their own loader processors.

    Every field keeps its first value on output. Only ``url`` transforms its
    input, by URL-unquoting it; the other fields take input unchanged.
    """

    url = scrapy.Field(input_processor=MapCompose(urllib2.unquote),
                       output_processor=TakeFirst())

    tk = scrapy.Field(input_processor=Identity(),
                      output_processor=TakeFirst())

    date = scrapy.Field(input_processor=Identity(),
                        output_processor=TakeFirst())

    datemonth = scrapy.Field(input_processor=Identity(),
                             output_processor=TakeFirst())

    platform = scrapy.Field(input_processor=Identity(),
                            output_processor=TakeFirst())

    timeframe = scrapy.Field(input_processor=Identity(),
                             output_processor=TakeFirst())

    fxpair = scrapy.Field(input_processor=Identity(),
                          output_processor=TakeFirst())
Example #17
0
class ResultsItemLoader(ItemLoader):
    """Loader for ``ResultsItem``.

    By default a field keeps its first value, converted with ``unicode`` and
    stripped. Most fields add ``removeunichars`` on top of that, and money
    fields add ``tidytomoney`` as well.
    """
    default_item_class = ResultsItem
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)

    # Prize money fields.
    pm1_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm2_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm3_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm4_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm5_out = Compose(default_output_processor, removeunichars, tidytomoney)
    prizemoney_out = Compose(default_output_processor, removeunichars,
                             tidytomoney)
    prizemoney_in = Compose(default_output_processor, removeunichars,
                            tidytomoney)

    # Plain text fields.
    racename_out = Compose(default_output_processor, removeunichars)
    gear_out = Compose(default_output_processor, removeunichars)
    OR_out = Compose(default_output_processor, removeunichars)
    TS_out = Compose(default_output_processor, removeunichars)
    RPR_out = Compose(default_output_processor, removeunichars)
    damsire_out = Compose(default_output_processor, removeunichars,
                          cleandamsire)
    jockeyname_out = Compose(default_output_processor, removeunichars)
    trainername_out = Compose(default_output_processor, removeunichars)
    sire_out = Compose(default_output_processor, removeunichars)
    dam_out = Compose(default_output_processor, removeunichars)
    horsename_out = Compose(default_output_processor, removeunichars)

    # Race date fields. Scrapy only picks up processors named
    # ``<field>_out``; these were previously declared without the suffix and
    # therefore never applied.
    L1racedate_out = Compose(default_output_processor, removeunichars)
    L2racedate_out = Compose(default_output_processor, removeunichars)
    L3racedate_out = Compose(default_output_processor, removeunichars)
    L4racedate_out = Compose(default_output_processor, removeunichars)
    L5racedate_out = Compose(default_output_processor, removeunichars)
    L6racedate_out = Compose(default_output_processor, removeunichars)

    # Comment fields.
    L1comment_out = Compose(default_output_processor, removeunichars)
    L2comment_out = Compose(default_output_processor, removeunichars)
    L3comment_out = Compose(default_output_processor, removeunichars)
    L4comment_out = Compose(default_output_processor, removeunichars)
    L5comment_out = Compose(default_output_processor, removeunichars)
    L6comment_out = Compose(default_output_processor, removeunichars)

    currentodds_out = Compose(default_output_processor, decimalizeodds)

    # horse skips the unicode/strip conversion in both directions.
    horse_out = Compose(TakeFirst(), Identity())
    horse_in = Compose(TakeFirst(), Identity())
Example #18
0
class ProyectoItemLoader(XPathItemLoader):
    """Loader for ``ProyectoItem``.

    Every input value goes through ``fix_space`` and ``unicode.strip``; most
    fields append a field-specific normalizer. Each field keeps its first
    value unless overridden.
    """

    default_item_class = ProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    tipo_in = MapCompose(
        fix_space, unicode.strip, normalize_tipo_proyecto)
    origen_in = MapCompose(
        fix_space, unicode.strip, normalize_proyecto_origen)
    ley_numero_in = MapCompose(
        fix_space, unicode.strip, digits_only)
    mensaje_codigo_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_mensaje, allow_empty=True))
    reproduccion_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))

    # Originating chamber: both values are required.
    camara_origen_in = MapCompose(
        fix_space, unicode.strip, normalize_camara)
    camara_origen_expediente_in = MapCompose(
        fix_space, unicode.strip, normalize_codigo_expediente)

    # Reviewing chamber: both values may be empty.
    camara_revisora_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_camara, allow_empty=True))
    camara_revisora_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))

    # Publication details; the date is emitted in ISO format.
    publicacion_en_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_publicacion_en, allow_empty=True))
    publicacion_fecha_in = MapCompose(
        fix_space, unicode.strip, spanish_date)
    publicacion_fecha_out = Compose(lambda v: v[0].isoformat())

    # Committee lists keep every collected value.
    comisiones_diputados_out = Identity()
    comisiones_senadores_out = Identity()
Example #19
0
class TimetableLoader(XPathItemLoader):
    """Loader for flight timetable rows.

    Values are stripped on input and joined on output unless a field
    overrides that below.
    """

    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = Join()

    # Flight number, type and status.
    flight_in = Compose(flight_handler)
    flight_type_in = Identity()
    flight_type_out = Compose(return_first)
    flight_status_in = Compose(flight_status_handler)
    flight_status_out = Compose(return_first)

    # Airline.
    airline_in = Compose(airline_handler)
    airline_out = Compose(title)

    # Place names go through ``title`` on output.
    city_of_departure_out = Compose(title)
    city_of_arrival_out = Compose(title)
    airport_of_departure_out = Compose(title)
    airport_of_arrival_out = Compose(title)
Example #20
0
class ItemItemLoader(ItemLoader):
    """Loader that extracts the item number from a product URL."""

    default_input_processor = Identity()
    default_output_processor = Join()

    def itemIDfromURL(url):
        """Return the value of the first ``Item=`` parameter in ``url``.

        Plain function evaluated in the class body and used as a MapCompose
        step. Returns None when ``url`` has no ``Item=`` parameter, so
        MapCompose drops the value instead of the helper raising IndexError.
        """
        match = re.search(r'Item=([a-zA-Z0-9]*)', url)
        if match:
            return match.group(1)
        return None

    itemNo_in = MapCompose(itemIDfromURL)
    itemNo_out = Join()
Example #21
0
class ArrayField(PredefinedField):
    """Field holding a list of values.

    ``defaults`` supplies the settings that always win over those derived
    from the wrapped field or item class.
    """
    defaults = {
        'output_processor': Identity(),
        'default_value': []
    }

    def __init__(self, field_or_item, **kwargs):
        """Build the field settings for an array of ``field_or_item``.

        :param field_or_item: a class. A ``scrapy.Item`` subclass makes each
            input value go through ``dict``; a ``PredefinedField`` subclass
            contributes a copy of its own ``defaults``; any other class
            contributes nothing.
        :param kwargs: forwarded unchanged to the parent constructor.
        """
        if issubclass(field_or_item, scrapy.Item):
            defaults = {
                'input_processor': MapCompose(dict)
            }
        elif issubclass(field_or_item, PredefinedField):
            defaults = field_or_item.defaults.copy()
        else:
            defaults = {}

        defaults.update(self.defaults)

        # The class-level list would otherwise be shared by every ArrayField
        # instance, so a mutation through one field would leak into all
        # others. Give each instance its own copy.
        default_value = defaults.get('default_value')
        if isinstance(default_value, list):
            defaults['default_value'] = list(default_value)

        super(ArrayField, self).__init__(defaults, **kwargs)
Example #22
0
class IrrAdvertisementLoader(ItemLoader):
    """
    Input and output processors for irr.ru advertisement data.

    String values are stripped on input by default, non-string values pass
    through untouched, and each field keeps its first value.
    """
    default_input_processor = MapCompose(
        lambda txt: txt if not isinstance(txt, (unicode, str))
        else txt.strip())
    default_output_processor = TakeFirst()

    # Digits-only fields.
    foreign_id_in = MapCompose(only_digits)
    views_in = MapCompose(only_digits)
    mileage_in = MapCompose(only_digits)
    volume_in = MapCompose(only_digits)
    release_year_in = MapCompose(only_digits)
    horsepower_in = MapCompose(only_digits)

    # Letters-only fields.
    seller_in = MapCompose(only_letters)
    mileage_units_in = MapCompose(only_letters)
    volume_units_in = MapCompose(only_letters)

    # Fields with a dedicated parser.
    price_in = MapCompose(only_price)
    published_in = MapCompose(datetime_interpretation)

    # Strings shorter than 10 characters are discarded; everything else is
    # kept, and the output stays a list.
    photos_in = MapCompose(
        lambda x: x if not (isinstance(x, (unicode, str)) and len(x) < 10)
        else None)
    photos_out = Identity()
Example #23
0
 class LalaItemLoader(TestItemLoader):
     """TestItemLoader variant whose default output processor is Identity."""
     default_output_processor = Identity()
Example #24
0
 class IdentityDefaultedItemLoader(DefaultedItemLoader):
     """DefaultedItemLoader variant that sets ``name``'s input to Identity."""
     name_in = Identity()
Example #25
0
 def test_identity(self):
     """Identity must return its input values unchanged."""
     values = [None, '', 'hello', 'world']
     identity = Identity()
     result = identity(values)
     self.assertEqual(result, [None, '', 'hello', 'world'])
Example #26
0
class RootItemLoader(XPathItemLoader):
    """Base loader for ``projectItems``; values pass through unchanged."""
    default_item_class = projectItems
    default_input_processor = Identity()
    # Previously misspelled ``default_ouput_processor``, a name Scrapy never
    # reads.
    default_output_processor = Identity()
Example #27
0
class ItemItemLoader(ItemLoader):
    """Loader for product items (name, numbers, category, specifications).

    The helper functions are plain functions evaluated in the class body and
    used as MapCompose steps. Because class scope is not visible from inside
    them, they cannot call one another by bare name.
    """

    def cleanString(string):
        """Drop every character outside the whitelist in the pattern."""
        cleanQuery = re.compile('[A-Za-z0-9 .",\'!-]')
        return ''.join(re.findall(cleanQuery, string))

    def parseItemNo(itemin):
        """Return ``itemin`` without line breaks, pipes or NBSP characters.

        The non-breaking space is written as a unicode literal; in a Python 2
        byte string ``"\\u00a0"`` is a literal backslash sequence and never
        matches a real non-breaking space.
        """
        strItemNumber = itemin.strip().replace("\n", "").replace(
            "|", "").replace("\r", "").replace(u"\u00a0", "").strip()
        return strItemNumber

    # Model numbers are cleaned exactly like item numbers.
    parseModelNo = parseItemNo

    def catIDfromURL(url):
        """Return the first category ID found in ``url`` as a string.

        Returns None when ``url`` has no ``CatId=<digits>`` parameter. The
        digits are captured directly so the key is stripped whatever its
        letter case.
        """
        match = re.search(r'[Cc]at[Ii]d=([0-9]+)', url)
        if match:
            return match.group(1)
        return None

    def parseSpecification(specificationKV):
        """Return ``cleanKV(specificationKV)``.

        NOTE(review): ``cleanKV`` is not defined in this class, so the name
        is looked up at module scope when the function runs. Confirm the
        module provides it; otherwise this raises NameError on first use.
        """
        # kv because it comes in as key/value dict
        return cleanKV(specificationKV)

    default_input_processor = Identity()
    default_output_processor = Join()

    productName_in = MapCompose(unicode.title, cleanString)
    productName_out = Join()

    itemNo_in = MapCompose(parseItemNo)
    itemNo_out = Join()

    modelNo_in = MapCompose(parseModelNo)
    modelNo_out = Join()

    tdCategoryID_in = MapCompose(catIDfromURL)
    tdCategoryID_out = Identity()

    specifications_in = MapCompose(parseSpecification)
    specifications_out = Identity()
Example #28
0
 def __init__(cls, name, bases, dct):
   """Attach the Images processors to the class being created.

   Customizing __init__ (rather than __new__) because the class object
   ``cls`` is already available here. ``name``, ``bases`` and ``dct`` are
   forwarded unchanged to the parent metaclass.
   """
   # Input: each value goes through filter_js_func_call.
   cls.Images_in = MapCompose(filter_js_func_call)
   # Output: the collected list is kept as-is.
   cls.Images_out = Identity()
   super(CarItemLoaderMeta, cls).__init__(name, bases, dct)
Example #29
0
class DealLoader(DealLoaderBase):
    """Deal loader that additionally handles supplier phone numbers."""

    supplier_phones_in = MapCompose(unicode.strip, sanitize_phones)
    # Previously misspelled ``supplier_phones_our``, a name Scrapy never
    # reads, so the base class default output processor was applied instead.
    supplier_phones_out = Identity()
Example #30
0
class ArtLoader(ItemLoader):
    """Loader for ``Art`` items.

    Input passes through unchanged and each field keeps its first value.
    """

    default_item_class = Art
    default_output_processor = TakeFirst()
    default_input_processor = Identity()