Example #1
class KetoSizeMe(spiders.CrawlSpider):
    name = 'keto-size-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketosizeme.com']
    start_urls = ['https://ketosizeme.com/category/ketogenic-diet-recipes/']

    rules = [

        # Extract links for finding additional pages within recipe index,
        # e.g. https://ketosizeme.com/category/ketogenic-diet-recipes/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'https://ketosizeme.com/category/ketogenic-diet-recipes/page/\d+/'
            )),

        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://ketosizeme.com/.+/$', restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
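
Every example in this section assumes the same scaffolding, which the snippets omit: Scrapy's spiders and linkextractors modules plus three project-specific helpers (CallbackHandler, persist.ContentSaver, and _get_download_root). Below is a minimal sketch of that scaffolding, flattened into one file; the Scrapy imports are real, but the helper bodies are assumptions standing in for the project's actual implementations.

import os

from scrapy import linkextractors, spiders  # real Scrapy modules


def _get_download_root():
    # Assumption: the real project probably reads this from settings or an
    # environment variable; a local directory is used here as a fallback.
    return os.environ.get('DOWNLOAD_ROOT', 'downloads')


class ContentSaver:
    # Stand-in for persist.ContentSaver: writes each raw page body to disk.

    def __init__(self, root):
        self._root = root
        os.makedirs(root, exist_ok=True)

    def save(self, response):
        filename = response.url.rstrip('/').rsplit('/', 1)[-1] or 'index'
        with open(os.path.join(self._root, filename + '.html'), 'wb') as f:
            f.write(response.body)


class CallbackHandler:
    # Stand-in for the project's CallbackHandler: persists every response
    # routed to it by a Rule's callback.

    def __init__(self, content_saver):
        self._content_saver = content_saver

    def process_callback(self, response):
        self._content_saver.save(response)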
Example #2
class KetogasmSpider(spiders.CrawlSpider):
    name = 'ketogasm'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketogasm.com']
    _url_format = ('https://ketogasm.com/recipe-index/?'
                   'fwp_recipes_filters=recipe&'
                   'fwp_paged=%d')
    start_urls = [
        (_url_format % 1),
        (_url_format % 2),
        (_url_format % 3),
        (_url_format % 4),
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://ketogasm.com/.*/$',
            restrict_xpaths='//div[@id="recipes-grid"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
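
The four _url_format % n entries spell out pages 1 through 4 by hand; Example #12 below builds its start_urls the same way with a list comprehension. An equivalent, more compact form here would be:

    start_urls = [_url_format % page for page in range(1, 5)]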
Example #3
class RuledMeSpider(spiders.CrawlSpider):
    name = 'ruled-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ruled.me']
    start_urls = ['https://www.ruled.me/keto-recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://www.ruled.me/keto-recipes/breakfast/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/$',
                restrict_xpaths='//div[@class="r-list"]')),

        # Extract links for finding additional pages within food category pages,
        # e.g. https://www.ruled.me/keto-recipes/dinner/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(allow=(
                r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/page/\d+/'))),

        # Extract links for the actual recipes,
        # e.g. https://www.ruled.me/easy-keto-cordon-bleu/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.ruled.me/\w+(\-\w+)*/$',
            restrict_xpaths='//div[@id="content"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #4
class QueenBs(spiders.CrawlSpider):
    name = 'queen-bs'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['queenbsincredibleedibles.com']
    start_urls = ['http://queenbsincredibleedibles.com/category/keto/page/1/']

    rules = [
        # Extract links for finding additional keto recipe pages,
        # e.g. http://queenbsincredibleedibles.com/category/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'http://queenbsincredibleedibles.com/category/keto/page/\d+/')
        ),

        # Extract links for recipes,
        # e.g. http://queenbsincredibleedibles.com/creamy-coconut-kale-sausage-soup/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'http://queenbsincredibleedibles.com/.*/$',
            deny=r'(category\/)|(ive-fallen-in-love-with-keto)'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #5
class Ketovale(spiders.CrawlSpider):
    name = 'ketovale'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovale.com']
    start_urls = ['https://www.ketovale.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.ketovale.com/category/recipes/page/3/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ketovale.com/category/recipes/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.ketovale.com/recipe/.*/$',
            restrict_xpaths='//h2[@class="entry-title"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #6
class KetovangelistKitchen(spiders.CrawlSpider):
    name = 'ketovangelist-kitchen'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovangelistkitchen.com']
    # Organize start URLs in descending order of category strength (e.g. muffins
    # should be categorized as "snack", not "eggs"); see the sketch after this
    # example for an equivalent generated form.
    start_urls = [
        'http://www.ketovangelistkitchen.com/indexes/recipes/appetizers/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/desserts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beverages/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sides/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/snack/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/soup/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sauces-dressings/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/casseroles/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fat-bombs/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/dairy-free/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/kid-friendly/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/baked-goods/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beef/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chicken-turkey/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chocolate/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fish/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/pork/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/vegetables/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/nuts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/eggs/',
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            restrict_xpaths='//div[@class="entry-content"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
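
Since the ordering comment in this spider says the sequence of start URLs is meaningful, the list could be generated from one ordered tuple of category slugs, keeping the priority visible in a single place and the URL format in another. A sketch, using the same slugs as above:

_CATEGORY_SLUGS = (
    # Strongest categories first, per the ordering comment above.
    'appetizers', 'desserts', 'beverages', 'sides', 'snack', 'soup',
    'sauces-dressings', 'casseroles', 'fat-bombs', 'dairy-free',
    'kid-friendly', 'baked-goods', 'beef', 'chicken-turkey', 'chocolate',
    'fish', 'pork', 'vegetables', 'nuts', 'eggs',
)
start_urls = [
    'http://www.ketovangelistkitchen.com/indexes/recipes/%s/' % slug
    for slug in _CATEGORY_SLUGS
]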
Example #7
class SugarFreeMom(spiders.CrawlSpider):
    name = 'sugar-free-mom'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['sugarfreemom.com']
    start_urls = ['https://www.sugarfreemom.com/recipes/category/diet/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.sugarfreemom.com/recipes/category/diet/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(allow=(
                r'sugarfreemom.com/recipes/category/diet/keto/page/\d+/'))),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'sugarfreemom.com/recipes/[^\/]+/$',
            restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #8
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://www.greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'https://(.+\.)greekgoesketo.com/category/recipes/page/\d+/')
        ),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(restrict_css='main article'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #9
class KetoConnectSpider(spiders.CrawlSpider):
    name = 'ketoconnect'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketoconnect.net']
    start_urls = [
        'https://www.ketoconnect.net/main-dishes/',
        'https://www.ketoconnect.net/side-dishes/',
        'https://www.ketoconnect.net/breakfasts/',
        'https://www.ketoconnect.net/snacks/',
        'https://www.ketoconnect.net/desserts/',
        'https://www.ketoconnect.net/beverages/'
    ]

    rules = [
        # Extract links for the actual recipes
        # e.g. https://www.ketoconnect.net/recipe/spicy-cilantro-dressing/
        spiders.Rule(linkextractors.LinkExtractor(restrict_xpaths='//article'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #10
class HeyKetoMamaSpider(spiders.CrawlSpider):
    name = 'hey-keto-mama'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['heyketomama.com']
    start_urls = ['https://www.heyketomama.com/category/recipes/page/1/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.heyketomama.com/category/recipes/page/6/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.heyketomama.com/category/recipes/page/\d+/'
            )),
        # Extract links for recipes,
        # e.g. https://www.heyketomama.com/ten-minute-keto-nachos/
        spiders.Rule(linkextractors.LinkExtractor(
            restrict_xpaths='//div[@class="entry-content"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #11
class WholesomeYum(spiders.CrawlSpider):
    name = 'wholesome-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['wholesomeyum.com']
    start_urls = ['https://www.wholesomeyum.com/tag/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.wholesomeyum.com/tag/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'wholesomeyum.com/tag/keto/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(allow=[
            r'wholesomeyum.com/[^\/]+/$', r'wholesomeyum.com/recipes/[^\/]+/$'
        ],
                                                  restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #12
class DietDoctorSpider(spiders.CrawlSpider):
    name = 'diet-doctor'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['dietdoctor.com']

    # TODO(mtlynch): Make this more flexible. It's currently limited to the
    # first 39 pages, but it should figure out which pages are present. I've
    # added Rules for the Previous/Next links, but they don't seem to work.
    _url_prefix = ('https://www.dietdoctor.com/low-carb/recipes'
                   '?s=&st=recipe&lowcarb%5B%5D=keto&sp=')
    start_urls = [_url_prefix + str(i) for i in range(1, 40)]

    rules = [
        # Extract links for recipes,
        # e.g. /recipes/green-onion-no-chile-chicken-enchiladas
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.dietdoctor.com/recipes/'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
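
The TODO in this spider asks for pagination that discovers result pages on its own rather than hardcoding range(1, 40), and notes that Rules on the Previous/Next links didn't work. One alternative to try is a Rule whose allow pattern matches the paginated listing URL itself; this is an untested sketch, with the sp= page parameter taken from _url_prefix above.

    # Hypothetical pagination Rule: follow any link back into the paginated
    # keto recipe listing so the spider walks the pages it actually finds.
    spiders.Rule(
        linkextractors.LinkExtractor(
            allow=r'dietdoctor\.com/low-carb/recipes\?.*sp=\d+')),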
Example #13
class LowCarbYum(spiders.CrawlSpider):
    name = 'low-carb-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['lowcarbyum.com']
    start_urls = ['https://lowcarbyum.com/recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://lowcarbyum.com/category/desserts/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://lowcarbyum.com/category/',
                deny=r'https://lowcarbyum.com/category/((reviews)|(articles))')
        ),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://lowcarbyum.com/.+/$',
            restrict_xpaths='//header[@class="entry-header"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #14
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://greekgoesketo.com/category/recipes/page/\d+/')
        ),
        # Extract links for recipes,
        # e.g. https://greekgoesketo.com/<year>/<month>/<day>/<recipe-name>/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://greekgoesketo.com/\d{4}/\d{2}/\d{2}/.+/',
            restrict_xpaths='//div[@class="content-block"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #15
class YourFriendsJ(spiders.CrawlSpider):
    name = 'your-friends-j'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['yourfriendsj.com']
    start_urls = ['http://yourfriendsj.com/recipe-library/']

    rules = [

        # Extract links for finding additional recipe pages,
        # e.g. http://yourfriendsj.com/recipe-library/?paged=2
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'yourfriendsj.com/recipe-library/\?paged=\d+')),
        # Extract links for recipes,
        # e.g. http://yourfriendsj.com/recipes/easy-guacamole-recipe/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'http://yourfriendsj.com/recipes/[^\/]*/$',
            restrict_xpaths='//article'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #16
class SkinnyTaste(spiders.CrawlSpider):
    name = 'skinny-taste'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['skinnytaste.com']
    start_urls = ['https://www.skinnytaste.com/recipes/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.skinnytaste.com/recipes/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'skinnytaste.com/recipes/keto/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=[
                r'skinnytaste.com/[^\/]+/$',
            ],
            restrict_xpaths='//div[@class="archives"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #17
def parse_recipe(self, response):
    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))
    callback_handler.process_callback(response)
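
Unlike the CrawlSpider examples above, which build their CallbackHandler once as a class attribute, this callback constructs a new handler (and ContentSaver) on every response. A sketch of hoisting it out, mirroring the pattern the other spiders use; the enclosing RecipeSpider class is hypothetical:

class RecipeSpider(spiders.CrawlSpider):  # hypothetical enclosing spider
    # Build the handler once, as the examples above do, rather than once
    # per parsed response.
    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    def parse_recipe(self, response):
        self.callback_handler.process_callback(response)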