# Checks strip_html() against a complete recipe-card HTML fragment: nested
# <div>s, microdata <span>s, a definition list, an HTML comment, an <img>, a
# link and a hidden <p>. The test passes when strip_html(html_marked) equals
# html_stripped, i.e. only the text content of the fragment remains.
# NOTE(review): this definition is collapsed onto a few physical lines, so the
# whitespace inside the two triple-quoted fixtures may not match the upstream
# file -- confirm against the original before trusting the exact expectation.
def test_strip_html_complex(self): html_marked = """<div class="shortcode recipe-box" itemscope="" itemtype="http://data-vocabulary.org/Recipe"> <div class="recipe-header" title="recipe"> <div class="recipe-box-label">Recipe</div> <h2 class="recipe-title" id="recipe-form-266104"><span itemprop="name">Pesto Pasta Salad</span></h2> <dl class="recipe-data"> <dt>Prep Time:</dt><dd> <time itemprop="prepTime" datetime="PT1H">1 Hour</time></dd> <dt>Cook Time:</dt><dd> <time itemprop="cookTime" datetime="PT12M">12 Minutes</time></dd> <dt>Difficulty:</dt><dd> Easy</dd> <dt>Servings:</dt><dd> <span itemprop="yield">4</span></dd> </dl> </div><!--/recipe-header--> <div class="shortcode-box"> <img width="213" src="http://tastykitchen.com/recipes/files/2013/04/pestopasta-420x279.jpg" class="photo" itemprop="photo"> <a class="print-recipe-card" href="http://tastykitchen.com/recipes/salads/pesto-pasta-salad-2/?print=1/#size3x5" id="pl_266104">Print Recipe</a> </div> <h4 class="recipe-sub-head">Ingredients</h4> <ul><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">8 ounces, weight</span><span itemprop="name"> Short Fusilli Or Rotini (corkscrew) Pasta</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1 head</span><span itemprop="name"> (large) Romaine Lettuce, Sliced Into 1-inch Pieces</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/3 cup</span><span itemprop="name"> Prepared Pesto</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 cup</span><span itemprop="name"> Shredded Parmesan Cheese</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1 
cup</span><span itemprop="name"> Grape Tomatoes, Halved</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 cup</span><span itemprop="name"> Black Or Kalamata Olives, Halved</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">4 ounces, weight</span><span itemprop="name"> Mozzarella Cheese, Cut Into Cubes</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 cup</span><span itemprop="name"> Mayonnaise</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 cup</span><span itemprop="name"> Sour Cream</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/4 cup</span><span itemprop="name"> Milk, More For Thinning</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 teaspoon</span><span itemprop="name"> Salt</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">1/2 teaspoon</span><span itemprop="name"> Pepper</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">4 Tablespoons</span><span itemprop="name"> Pine Nuts (optional)</span></span></li><li><span itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient"><span itemprop="amount">8 </span><span itemprop="name"> Extra Parmesan, For Sprinkling</span></span></li></ul> <h4 class="recipe-sub-head">Preparation Instructions</h4> <div itemprop="instructions"> <p>Cook the pasta in 
salted water until done, then drain and rinse in cold water. Allow pasta to dry slightly, then toss in a bowl with 4 tablespoons pesto. (Add more if you want the pasta to be more coated.) Add Parmesan and toss. Cover and refrigerate pasta until cold.</p> <p>Make the dressing by whisking together the mayonnaise, sour cream, and milk with the rest of the pesto. Add salt and pepper, then taste and adjust seasonings as needed. The dressing needs to be somewhat thin and pourable in order to coat the lettuce and pasta later. Set the dressing aside.</p> <p>If you're using pine nuts, toast them over medium-low heat in a small skillet. Set them aside. </p> <p>To assemble the salads, make a bed of lettuce in a large bowl, then add a generous layer of pesto-coated pasta. Add tomatoes, olives, and chunks of cheese. Spoon a good amount of dressing all over the top; it should be thin enough to seep down into the salad, not so thick it will stay on top of everything.</p> <p>Sprinkle salads with a little extra Parmesan and serve!</p> </div> <p style="display:none;">Posted by <span itemprop="author">Ree</span> on <span itemprop="published" datetime="2013-04-15">April 15 2013</span></p> </div> """ html_stripped = """\n \n Recipe\n Pesto Pasta Salad\n \n Prep Time: 1 Hour\n Cook Time: 12 Minutes\n Difficulty: Easy\n Servings: 4\n \n \n \n \n Print Recipe\n \n Ingredients\n 8 ounces, weight Short Fusilli Or Rotini (corkscrew) Pasta1 head (large) Romaine Lettuce, Sliced Into 1-inch Pieces1/3 cup Prepared Pesto1/2 cup Shredded Parmesan Cheese1 cup Grape Tomatoes, Halved1/2 cup Black Or Kalamata Olives, Halved4 ounces, weight Mozzarella Cheese, Cut Into Cubes1/2 cup Mayonnaise1/2 cup Sour Cream1/4 cup Milk, More For Thinning1/2 teaspoon Salt1/2 teaspoon Pepper4 Tablespoons Pine Nuts (optional)8 Extra Parmesan, For Sprinkling Preparation Instructions\n \n Cook the pasta in salted water until done, then drain and rinse in cold water. 
Allow pasta to dry slightly, then toss in a bowl with 4 tablespoons pesto. (Add more if you want the pasta to be more coated.) Add Parmesan and toss. Cover and refrigerate pasta until cold.\nMake the dressing by whisking together the mayonnaise, sour cream, and milk with the rest of the pesto. Add salt and pepper, then taste and adjust seasonings as needed. The dressing needs to be somewhat thin and pourable in order to coat the lettuce and pasta later. Set the dressing aside.\nIf you're using pine nuts, toast them over medium-low heat in a small skillet. Set them aside. \nTo assemble the salads, make a bed of lettuce in a large bowl, then add a generous layer of pesto-coated pasta. Add tomatoes, olives, and chunks of cheese. Spoon a good amount of dressing all over the top; it should be thin enough to seep down into the salad, not so thick it will stay on top of everything.\nSprinkle salads with a little extra Parmesan and serve!\n \n Posted by Ree on April 15 2013\n \n """ self.assertEqual(html_stripped, strip_html(html_marked))
def process_item(self, item, spider):
    """Validate a scraped item and flatten every field into a clean string.

    Args:
        item: mapping-like scraped item. Field values are normally lists
            of strings; values that are already strings are accepted too.
        spider: the spider that produced ``item`` (not used here).

    Returns:
        The same ``item`` object, with every field replaced by a string
        that has been passed through ``strip_html`` and stripped of
        leading/trailing whitespace.

    Raises:
        DropItem: if 'source', 'name', 'url' or 'ingredients' is missing
            or empty.
    """
    for required in ('source', 'name', 'url', 'ingredients'):
        if not item.get(required, False):
            raise DropItem("Missing '%s' in %s" % (required, item))

    # items() exists on both Python 2 and 3 (iteritems() is Python-2-only);
    # list() snapshots the pairs because the values are reassigned below.
    for k, v in list(item.items()):
        if isinstance(v, (list, tuple)):
            # Ingredients keep one entry per line. Every other field is
            # expected to hold 0 or 1 elements, so joining with nothing
            # simply converts the sequence into a string.
            separator = "\n" if k == 'ingredients' else ""
            v = separator.join(v)
        # A value that is already a string is deliberately NOT joined:
        # "\n".join("abc") would put a newline between every character.

        # Strip all HTML tags. The tags could be a source of code
        # injection, and it's generally not safe to keep them. We may
        # consider storing a whitelisted subset in special properties
        # for the sake of presentation.
        item[k] = strip_html(v).strip()

    return item
def test_strip_html(self):
    """strip_html() drops the tags but keeps the text they enclose."""
    markup = '<strong>foo</strong> <script>bar baz</script>'
    expected_text = 'foo bar baz'
    self.assertEqual(expected_text, strip_html(markup))
def parse_item(self, response):
    """Build one ``RecipeItem`` for every match of ``base_path`` on the page.

    Args:
        response: the downloaded page. ``response.url`` is stored as the
            item URL and is also used to derive the image base URL.

    Returns:
        A list of ``RecipeItem`` objects (empty when nothing matches).
    """
    hxs = HtmlXPathSelector(response)

    base_path = """//div[@class='hrecipe']"""
    recipes_scopes = hxs.select(base_path)

    # NOTE(review): the paths starting with '//' are document-absolute in
    # XPath even when selected from a recipe scope -- presumably intended,
    # since they target the page <h1> and '#printSidebar'; confirm.
    name_path = '//h1[@class="fn"][1]/text() | //h1/text()'
    description_path = './/p[@class="subhead summary"]/text()'
    image_path = './/img[@class="photo"]/@src'

    # a lot of the content is repeated inside the '#printSidebar' div
    # so specify that to avoid doubling up content
    prepTime_path = '//div[@id="printSidebar"]/div[@id="prep"]/p[1]/text()'
    cookTime_path = '//div[@id="printSidebar"]/div[@id="prep"]/p[2]/text()'
    recipeYield_path = '//div[@id="printSidebar"]/div[@id="serving"]/p/text()'
    recipeYieldAmount_path = '//span[@class="yield"]/text()'
    ingredients_path = '//div[@id="printSidebar"]/div[@id="ingredients"]//li'

    # construct base url for image by removing recipe title from url;
    # it only depends on the response, so compute it once
    base_img_url = '/'.join(response.url.split('/')[:-1])

    recipes = []
    for r_scope in recipes_scopes:
        item = RecipeItem()
        item['source'] = self.source
        item['name'] = "".join(r_scope.select(name_path).extract())
        item['url'] = response.url

        img_name = "".join(r_scope.select(image_path).extract()).strip()
        if img_name:
            item['image'] = '/'.join([base_img_url, img_name])

        item['description'] = r_scope.select(description_path).extract()

        # remove extra tabs and newlines from Prep Time and Cook Time
        prepSentence = " ".join(r_scope.select(prepTime_path).extract()).strip()
        if prepSentence:
            prepSentence = self.remove_whitespace(prepSentence)
            # also remove preceding 'Prep '. partition() yields '' when the
            # sentence has no space, where split(' ', 1)[1] would raise
            # IndexError and abort the whole page.
            item['prepTime'] = prepSentence.partition(' ')[2]

        cookSentence = " ".join(r_scope.select(cookTime_path).extract()).strip()
        if cookSentence:
            cookSentence = self.remove_whitespace(cookSentence)
            # drop the leading label word, same as for the prep time
            item['cookTime'] = cookSentence.partition(' ')[2]

        # the number of servings is a bit tricky
        # if there's a span with class 'yield' it contains the number of servings
        # otherwise number of servings is given in the <p> element
        yield_amounts = r_scope.select(recipeYieldAmount_path).extract()
        yieldAmount = yield_amounts[0].strip() if yield_amounts else ""

        yieldList = r_scope.select(recipeYield_path).extract()
        if yieldList:
            yieldString = yieldList[0].strip()
            # only set the field when the <p> text exists, so yieldString
            # can never be unbound or left over from a previous recipe
            item['recipeYield'] = ('%s %s' % (yieldString, yieldAmount)).strip()

        # strip the markup of every <li>, then clean extra tabs and newlines
        item['ingredients'] = [
            self.remove_whitespace(strip_html(i_scope.extract()))
            for i_scope in r_scope.select(ingredients_path)
        ]

        recipes.append(item)

    return recipes
def parse_item(self, response):
    """Load one recipe item for every match of ``base_path`` on the page.

    Each recipe is collected through a ``RecipeItemLoader`` wrapping a
    fresh ``RecipeItem``.

    Args:
        response: the downloaded page. ``response.url`` is stored as the
            item URL and is also used to derive the image base URL.

    Returns:
        A list with the result of ``load_item()`` for every recipe found
        (empty when nothing matches).
    """
    hxs = HtmlXPathSelector(response)

    base_path = """//div[@class='hrecipe']"""
    recipes_scopes = hxs.select(base_path)

    # NOTE(review): the paths starting with '//' are document-absolute in
    # XPath even when selected from a recipe scope -- presumably intended,
    # since they target the page <h1> and '#printSidebar'; confirm.
    name_path = '//h1[@class="fn"][1]/text() | //h1/text()'
    description_path = './/p[@class="subhead summary"]/text()'
    image_path = './/img[@class="photo"]/@src'

    # a lot of the content is repeated inside the '#printSidebar' div
    # so specify that to avoid doubling up content
    prepTime_path = '//div[@id="printSidebar"]/div[@id="prep"]/p[1]/text()'
    cookTime_path = '//div[@id="printSidebar"]/div[@id="prep"]/p[2]/text()'
    recipeYield_path = '//div[@id="printSidebar"]/div[@id="serving"]/p/text()'
    recipeYieldAmount_path = '//span[@class="yield"]/text()'
    ingredients_path = '//div[@id="printSidebar"]/div[@id="ingredients"]//li'

    # construct base url for image by removing recipe title from url;
    # it only depends on the response, so compute it once
    base_img_url = '/'.join(response.url.split('/')[:-1])

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())
        il.add_value('source', self.source)
        il.add_value('name', "".join(r_scope.select(name_path).extract()))
        il.add_value('url', response.url)

        img_name = "".join(r_scope.select(image_path).extract()).strip()
        if img_name:
            il.add_value('image', '/'.join([base_img_url, img_name]))

        il.add_value('description', r_scope.select(description_path).extract())

        # remove extra tabs and newlines from Prep Time and Cook Time
        prepSentence = " ".join(r_scope.select(prepTime_path).extract()).strip()
        if prepSentence:
            prepSentence = self.remove_whitespace(prepSentence)
            # also remove preceding 'Prep '. partition() yields '' when the
            # sentence has no space, where split(' ', 1)[1] would raise
            # IndexError and abort the whole page.
            il.add_value('prepTime', prepSentence.partition(' ')[2])

        cookSentence = " ".join(r_scope.select(cookTime_path).extract()).strip()
        if cookSentence:
            cookSentence = self.remove_whitespace(cookSentence)
            # drop the leading label word, same as for the prep time
            il.add_value('cookTime', cookSentence.partition(' ')[2])

        # the number of servings is a bit tricky
        # if there's a span with class 'yield' it contains the number of servings
        # otherwise number of servings is given in the <p> element
        yield_amounts = r_scope.select(recipeYieldAmount_path).extract()
        yieldAmount = yield_amounts[0].strip() if yield_amounts else ""

        yieldList = r_scope.select(recipeYield_path).extract()
        if yieldList:
            yieldString = yieldList[0].strip()
            # only add the value when the <p> text exists, so yieldString
            # can never be unbound or left over from a previous recipe
            il.add_value('recipeYield',
                         ('%s %s' % (yieldString, yieldAmount)).strip())

        # strip the markup of every <li>, then clean extra tabs and newlines
        ingredients = [
            self.remove_whitespace(strip_html(i_scope.extract()))
            for i_scope in r_scope.select(ingredients_path)
        ]
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())

    return recipes