Ejemplo n.º 1
0
        def get_json(url):
            """ Find and load "standardized" json document containing recipe

            Downloads ``url`` and inspects every
            ``<script type="application/ld+json">`` element.  Three layouts
            are recognised: a top-level list, an object with an ``@graph``
            list, and a single object whose ``@type`` is ``Recipe``.

            Parameters:
                url: address of the recipe page.

            Returns:
                The first ``Recipe`` object that has a ``recipeIngredient``
                key, with its ``publisher`` entry replaced by a publisher
                name, or ``None`` when no such object is found.
            """

            def resolve_publisher(recipe, siblings=None):
                """ Best-effort publisher name for ``recipe``.

                Order of preference: the recipe's own publisher name, the
                name of an ``Organization`` element among ``siblings`` (when
                given), and finally ``url2publisher(url)``.
                """
                publisher = ''
                try:
                    publisher = json_clean_value(
                        json_clean_value(recipe, 'publisher', {}), 'name', '')
                    if publisher == '' and siblings is not None:
                        organization = json_find_array_element(
                            siblings, '@type', 'Organization')
                        if organization is not None:
                            publisher = json_clean_value(
                                organization, 'name', '')
                except Exception:
                    # Deliberately best-effort: the publisher entry may have
                    # an unexpected shape; fall back to the URL below.
                    publisher = ''
                if not publisher:
                    publisher = url2publisher(url)
                return publisher

            user_agent = {'User-agent': 'Mozilla/5.0'}
            page = requests.get(url, headers=user_agent)

            # Cheap pre-check so pages without JSON-LD are not parsed at all.
            if not re.search(
                    r'<script[^>]*type=.?application/ld\+json.?[^>]*>',
                    page.text):
                return None

            print_debug("Found an occurance of 'application/ld+json'")
            soup = BeautifulSoup(page.text, 'html5lib')
            scripts = soup.findAll('script',
                                   attrs={'type': 'application/ld+json'})
            for script in scripts:
                # Drop anything (comments, CDATA markers) in front of the
                # first '{' or '['.
                json_stripped = re.sub(r'^[^{\[]*', '', script.text)
                try:
                    raw_json = json.loads(json_stripped)
                except ValueError:
                    # Empty or malformed block: try the next script element.
                    continue

                siblings = None
                if isinstance(raw_json, list):
                    siblings = raw_json
                elif (isinstance(raw_json, dict)
                      and isinstance(raw_json.get('@graph'), list)):
                    siblings = raw_json['@graph']

                if siblings is not None:
                    candidate = json_find_array_element(
                        siblings, '@type', 'Recipe')
                elif (isinstance(raw_json, dict)
                      and raw_json.get('@type') == 'Recipe'):
                    candidate = raw_json
                else:
                    candidate = None

                if isinstance(candidate, dict) and 'recipeIngredient' in candidate:
                    candidate['publisher'] = resolve_publisher(
                        candidate, siblings)
                    # Stop here so a later script block cannot overwrite the
                    # recipe that was just found.
                    return candidate

            return None
Ejemplo n.º 2
0
            def output_group(json_obj,
                             group_key,
                             item_key,
                             item_prefix,
                             item_wrap=False,
                             format='rst',
                             base_level=2):
                """ Return a string containing the formatted groups/lists.

                Parameters:
                    json_obj: document holding the list of groups.
                    group_key: key in ``json_obj`` under which the groups live.
                    item_key: key in each group under which its items live.
                    item_prefix: bullet text placed before each item; '#'
                        requests numbering ("1. ", "2. ", ...).
                    item_wrap: when true, wrap each item at 75 columns with a
                        hanging indent as wide as the prefix.
                    format: passed through to ``output_header``.
                    base_level: header level of the enclosing section; group
                        titles are emitted one level deeper.

                Returns:
                    The rendered groups as a single string.
                """

                parts = []
                # Read from the json_obj argument (not an enclosing-scope
                # variable) so the function renders whatever it is handed.
                groups = json_clean_value(json_obj, group_key)
                for group_index, group in enumerate(groups):
                    group_title = json_clean_value(group, 'title')

                    if group_title != '':
                        if group_index > 0:
                            parts.append('\n')
                        parts.append(output_header(group_title,
                                                   format=format,
                                                   level=(base_level + 1)))

                    for item_count, item in enumerate(
                            json_clean_value(group, item_key), 1):
                        if item_prefix == '#':
                            prefix = str(item_count) + '. '
                        else:
                            prefix = item_prefix.strip() + ' '
                        if item_wrap:
                            # str() keeps non-string items from crashing
                            # textwrap, matching the unwrapped branch.
                            item_lines = textwrap.wrap(
                                str(item),
                                width=75,
                                initial_indent=prefix,
                                subsequent_indent=' ' * len(prefix))
                            parts.extend(line + '\n' for line in item_lines)
                        else:
                            parts.append(
                                prefix.strip() + ' ' + str(item) + '\n')

                return ''.join(parts)
Ejemplo n.º 3
0
        def get_json(args, url):
            """ Find and load "standardized" json document containing recipe

            Looks for a script line mentioning ``root.__INITIAL_STATE__.store``,
            trims it down to the outermost ``{...}``, neutralises the two
            ``regExp`` entries (their values are not valid JSON) and parses
            the rest.

            Parameters:
                args: parsed command-line options (not used here).
                url: address of the recipe page.

            Returns:
                The ``content`` entry of the parsed state (``{}`` when the
                entry is absent), or ``None`` when no matching script line
                exists.  If several lines match, the last one wins.
            """
            return_value = None
            marker = r'root\.__INITIAL_STATE__\.store'

            # Fetch the page once; the same GET used to be issued twice with
            # the first response thrown away.
            page = BeautifulSoup(requests.get(url).text, 'html5lib')
            for script in page.findAll('script'):
                if not re.search(marker, script.text):
                    continue
                for line in script.text.splitlines():
                    if not re.search(marker, line):
                        continue
                    # Keep only the text between the first '{' and last '}'.
                    raw_json_text = re.sub('[^}]*$', '', line)
                    raw_json_text = re.sub('^[^{]*', '', raw_json_text)
                    raw_json_text = re.sub('"email":{"regExp":.*,"password"', '"email":{"regExp":"","password"', raw_json_text)
                    raw_json_text = re.sub('"password":{"regExp":.*,"messages"', '"password":{"regExp":""},"messages"', raw_json_text)
                    raw_json = json.loads(raw_json_text)
                    return_value = json_clean_value(raw_json, 'content', {})
            return return_value
Ejemplo n.º 4
0
        def recipe_json2doc(args, recipe_json, format='rst', base_level=1):
            """ Build a reStructuredText or Markdown document from recipe JSON

            Parameters:
                args: parsed command-line options (not used here; kept for
                    interface compatibility).
                recipe_json: recipe document.  The keys read are 'title',
                    'yield', 'preptime', 'totaltime', 'url', 'author',
                    'description', 'ingredient_groups', 'direction_groups'
                    and 'notes'.
                format: 'rst' or 'md'.
                base_level: not used here; kept for interface compatibility.

            Returns:
                The rendered document as one string.
            """
            def format2text(format):
                """ Formats output ext to human readable format name """

                if format == 'json':
                    format_text = 'JSON'
                elif format == 'md':
                    format_text = 'Markdown'
                elif format == 'rst':
                    format_text = 'reStructuredText'
                else:
                    format_text = "Unknown format [%s]" % (format)
                    print_warning("Unknown format [%s]" % (format))
                return format_text

            def output_header(header_text, format='rst', level=1):
                """ Return a string containing the formatted header.

                Markdown headers get ``level + 1`` '#' characters; rst headers
                are underlined with '=', '-' or '^' for levels 1, 2 and 3.
                """

                out_string = ''
                if format == 'md':
                    out_string += '#' * (level + 1)
                    out_string += ' '
                out_string += header_text + '\n'
                if format == 'rst':
                    level_chars = ['=', '-', '^']
                    # Clamp so a level outside 1..3 reuses the nearest
                    # underline character instead of raising IndexError.
                    level_char = level_chars[
                        min(max(level, 1), len(level_chars)) - 1]
                    out_string += re.sub('.', level_char, header_text) + '\n'
                out_string += '\n'

                return out_string

            def output_group(json_obj,
                             group_key,
                             item_key,
                             item_prefix,
                             item_wrap=False,
                             format='rst',
                             base_level=2):
                """ Return a string containing the formatted groups/lists.

                ``item_prefix`` is the bullet text; '#' requests numbering.
                With ``item_wrap`` items are wrapped at 75 columns with a
                hanging indent as wide as the prefix.
                """

                parts = []
                # Read from the json_obj argument rather than the enclosing
                # recipe_json so the helper renders what it is handed.
                groups = json_clean_value(json_obj, group_key)
                for group_index, group in enumerate(groups):
                    group_title = json_clean_value(group, 'title')

                    if group_title != '':
                        if group_index > 0:
                            parts.append('\n')
                        parts.append(output_header(group_title,
                                                   format=format,
                                                   level=(base_level + 1)))

                    for item_count, item in enumerate(
                            json_clean_value(group, item_key), 1):
                        if item_prefix == '#':
                            prefix = str(item_count) + '. '
                        else:
                            prefix = item_prefix.strip() + ' '
                        if item_wrap:
                            item_lines = textwrap.wrap(
                                str(item),
                                width=75,
                                initial_indent=prefix,
                                subsequent_indent=' ' * len(prefix))
                            parts.extend(line + '\n' for line in item_lines)
                        else:
                            parts.append(
                                prefix.strip() + ' ' + str(item) + '\n')

                return ''.join(parts)

            format_prefix = '-'
            if format == 'md':
                format_prefix = '*'

            print_debug("Building " + format2text(format) +
                        " from recipe JSON...")
            print_debug(recipe_json)

            output = output_header(json_clean_value(recipe_json, 'title'),
                                   format)

            recipe_yield = json_clean_value(recipe_json, 'yield')
            preptime = json_clean_value(recipe_json, 'preptime')
            totaltime = json_clean_value(recipe_json, 'totaltime')

            # One-row table: "| Prep: .. | Total: .. | Yield: .. |"
            info = "| "
            if preptime != '':
                info += 'Prep: ' + str(preptime) + ' | '
            if totaltime != '':
                info += 'Total: ' + str(totaltime) + ' | '
            if recipe_yield != '':
                info += 'Yield: ' + str(recipe_yield) + ' | '
            info = info.strip()

            if info != '|':
                divider_line = re.sub('[^|]', '-', info)
                if format == 'rst':
                    divider_line = re.sub('[|]', '+', divider_line)
                output += divider_line + '\n' + info + '\n' + divider_line + '\n\n'

            # TODO: make this work with markdown and missing URL
            url = json_clean_value(recipe_json, 'url')
            author = json_clean_value(recipe_json, 'author')
            if url is None or url == '':
                if author is not None and author != '':
                    output += 'Source: ' + author + '\n\n'
            else:
                if author is None or author == '':
                    author = url2domain(url)
                if format == 'md':
                    output += 'Source: [' + author + '](' + url + ')\n\n'
                elif format == 'rst':
                    output += 'Source: `' + author + ' <' + url + '>`__\n\n'
                else:
                    output += 'Source: ' + author + '\n\n'

            description = textwrap.wrap(json_clean_value(
                recipe_json, 'description'),
                                        width=75)
            for line in description:
                output += line + '\n'

            output += '\n'
            output += output_header('Ingredients', format=format, level=2)
            output += output_group(recipe_json,
                                   'ingredient_groups',
                                   'ingredients',
                                   format_prefix,
                                   format=format,
                                   base_level=2)

            output += '\n'
            output += output_header('Directions', format=format, level=2)
            output += output_group(recipe_json,
                                   'direction_groups',
                                   'directions',
                                   '#',
                                   item_wrap=True,
                                   format=format,
                                   base_level=2)

            notes = json_clean_value(recipe_json, 'notes')
            if isinstance(notes, str):
                # A bare string would otherwise be iterated per character.
                notes = [notes]
            cleaned_notes = [
                re.sub(r'\*\*\*', '', str(note)) for note in (notes or [])
            ]
            # Drop blank notes so an empty note does not produce a lone
            # "Notes" header.
            cleaned_notes = [note for note in cleaned_notes if note.strip()]

            if cleaned_notes:
                output += '\n'
                output += output_header('Notes', format=format, level=2)

                note_prefix = format_prefix.strip() + ' '
                for note in cleaned_notes:
                    if len(cleaned_notes) > 1:
                        # Several notes: render them as a bulleted list.
                        wrapped = textwrap.wrap(
                            note,
                            width=75,
                            initial_indent=note_prefix,
                            subsequent_indent=' ' * len(note_prefix))
                    else:
                        wrapped = textwrap.wrap(note, width=75)
                    for line in wrapped:
                        output += line + '\n'
                    output += '\n'

            return output
Ejemplo n.º 5
0
    def recipe_output_file(args, recipe_json, format=""):
        """ Output recipe_json document in the desired format

        Parameters:
            args: parsed command-line options; ``args.outfile`` and
                ``args.save_to_file`` are read.
            recipe_json: recipe document to render.
            format: 'json', 'md' or 'rst'.  When empty, the extension of
                ``args.outfile`` is used.

        Returns:
            The value returned by the file's ``write`` call when the output
            is saved to a file, otherwise 0.

        Raises:
            ValueError: when the format is not one of the supported ones.
        """
        def recipe_json2doc(args, recipe_json, format='rst', base_level=1):
            """ Build a reStructuredText or Markdown document from recipe JSON

            ``args`` and ``base_level`` are not used here; they are kept for
            interface compatibility.  Returns the rendered document as one
            string.
            """
            def format2text(format):
                """ Formats output ext to human readable format name """

                if format == 'json':
                    format_text = 'JSON'
                elif format == 'md':
                    format_text = 'Markdown'
                elif format == 'rst':
                    format_text = 'reStructuredText'
                else:
                    format_text = "Unknown format [%s]" % (format)
                    print_warning("Unknown format [%s]" % (format))
                return format_text

            def output_header(header_text, format='rst', level=1):
                """ Return a string containing the formatted header.

                Markdown headers get ``level + 1`` '#' characters; rst headers
                are underlined with '=', '-' or '^' for levels 1, 2 and 3.
                """

                out_string = ''
                if format == 'md':
                    out_string += '#' * (level + 1)
                    out_string += ' '
                out_string += header_text + '\n'
                if format == 'rst':
                    level_chars = ['=', '-', '^']
                    # Clamp so a level outside 1..3 reuses the nearest
                    # underline character instead of raising IndexError.
                    level_char = level_chars[
                        min(max(level, 1), len(level_chars)) - 1]
                    out_string += re.sub('.', level_char, header_text) + '\n'
                out_string += '\n'

                return out_string

            def output_group(json_obj,
                             group_key,
                             item_key,
                             item_prefix,
                             item_wrap=False,
                             format='rst',
                             base_level=2):
                """ Return a string containing the formatted groups/lists.

                ``item_prefix`` is the bullet text; '#' requests numbering.
                With ``item_wrap`` items are wrapped at 75 columns with a
                hanging indent as wide as the prefix.
                """

                parts = []
                # Read from the json_obj argument rather than the enclosing
                # recipe_json so the helper renders what it is handed.
                groups = json_clean_value(json_obj, group_key)
                for group_index, group in enumerate(groups):
                    group_title = json_clean_value(group, 'title')

                    if group_title != '':
                        if group_index > 0:
                            parts.append('\n')
                        parts.append(output_header(group_title,
                                                   format=format,
                                                   level=(base_level + 1)))

                    for item_count, item in enumerate(
                            json_clean_value(group, item_key), 1):
                        if item_prefix == '#':
                            prefix = str(item_count) + '. '
                        else:
                            prefix = item_prefix.strip() + ' '
                        if item_wrap:
                            item_lines = textwrap.wrap(
                                str(item),
                                width=75,
                                initial_indent=prefix,
                                subsequent_indent=' ' * len(prefix))
                            parts.extend(line + '\n' for line in item_lines)
                        else:
                            parts.append(
                                prefix.strip() + ' ' + str(item) + '\n')

                return ''.join(parts)

            format_prefix = '-'
            if format == 'md':
                format_prefix = '*'

            print_debug("Building " + format2text(format) +
                        " from recipe JSON...")
            print_debug(recipe_json)

            output = output_header(json_clean_value(recipe_json, 'title'),
                                   format)

            recipe_yield = json_clean_value(recipe_json, 'yield')
            preptime = json_clean_value(recipe_json, 'preptime')
            totaltime = json_clean_value(recipe_json, 'totaltime')

            # One-row table: "| Prep: .. | Total: .. | Yield: .. |"
            info = "| "
            if preptime != '':
                info += 'Prep: ' + str(preptime) + ' | '
            if totaltime != '':
                info += 'Total: ' + str(totaltime) + ' | '
            if recipe_yield != '':
                info += 'Yield: ' + str(recipe_yield) + ' | '
            info = info.strip()

            if info != '|':
                divider_line = re.sub('[^|]', '-', info)
                if format == 'rst':
                    divider_line = re.sub('[|]', '+', divider_line)
                output += divider_line + '\n' + info + '\n' + divider_line + '\n\n'

            # TODO: make this work with markdown and missing URL
            url = json_clean_value(recipe_json, 'url')
            author = json_clean_value(recipe_json, 'author')
            if url is None or url == '':
                if author is not None and author != '':
                    output += 'Source: ' + author + '\n\n'
            else:
                if author is None or author == '':
                    author = url2domain(url)
                if format == 'md':
                    output += 'Source: [' + author + '](' + url + ')\n\n'
                elif format == 'rst':
                    output += 'Source: `' + author + ' <' + url + '>`__\n\n'
                else:
                    output += 'Source: ' + author + '\n\n'

            description = textwrap.wrap(json_clean_value(
                recipe_json, 'description'),
                                        width=75)
            for line in description:
                output += line + '\n'

            output += '\n'
            output += output_header('Ingredients', format=format, level=2)
            output += output_group(recipe_json,
                                   'ingredient_groups',
                                   'ingredients',
                                   format_prefix,
                                   format=format,
                                   base_level=2)

            output += '\n'
            output += output_header('Directions', format=format, level=2)
            output += output_group(recipe_json,
                                   'direction_groups',
                                   'directions',
                                   '#',
                                   item_wrap=True,
                                   format=format,
                                   base_level=2)

            notes = json_clean_value(recipe_json, 'notes')
            if isinstance(notes, str):
                # A bare string would otherwise be iterated per character.
                notes = [notes]
            cleaned_notes = [
                re.sub(r'\*\*\*', '', str(note)) for note in (notes or [])
            ]
            # Drop blank notes so an empty note does not produce a lone
            # "Notes" header.
            cleaned_notes = [note for note in cleaned_notes if note.strip()]

            if cleaned_notes:
                output += '\n'
                output += output_header('Notes', format=format, level=2)

                note_prefix = format_prefix.strip() + ' '
                for note in cleaned_notes:
                    if len(cleaned_notes) > 1:
                        # Several notes: render them as a bulleted list.
                        wrapped = textwrap.wrap(
                            note,
                            width=75,
                            initial_indent=note_prefix,
                            subsequent_indent=' ' * len(note_prefix))
                    else:
                        wrapped = textwrap.wrap(note, width=75)
                    for line in wrapped:
                        output += line + '\n'
                    output += '\n'

            return output

        def output_filename(filename, ext=""):
            """ Ensures filename has proper extension. """

            if ext == "" or filename.endswith("." + ext):
                return filename
            return filename + "." + ext

        ret_value = 0

        title = json_clean_value(recipe_json, "title")
        if format == '' and args.outfile:
            # Derive the format from the output file's extension.
            format = os.path.splitext(args.outfile)[1].lstrip('.')

        if format == 'json':
            output = json.dumps(recipe_json, indent=4)
        elif format in ('md', 'rst'):
            output = recipe_json2doc(args, recipe_json, format=format)
        else:
            print_error("Unknown format [%s]" % (format))
            # Raising a plain string is itself a TypeError; raise a real
            # exception carrying the same message.
            raise ValueError("ERROR: Unknown format [%s]" % (format))

        if output is None or output.strip() == "":
            print_error("Problem output is empty")
        elif args.save_to_file:
            if args.outfile is None or args.outfile == "":
                # Fall back to a fixed stem when the title has no word
                # characters, so the file is not named just ".<ext>".
                stem = re.sub(r'\W+', '', str(title)) or 'recipe'
                savefile = output_filename(stem, format)
            else:
                savefile = output_filename(args.outfile, format)
            print_info("Writing output to %s..." % savefile)
            with open(savefile, "w") as text_file:
                ret_value = text_file.write(output)
        else:
            print(output)

        return ret_value
Ejemplo n.º 6
0
            def found_paywall(source_json):
                """ Report whether ``source_json`` contains a 'paywall' entry.

                Returns:
                    True when ``json_find_key`` yields at least one value for
                    the key 'paywall', otherwise False.
                """
                paywall_json = list(json_find_key(source_json, 'paywall'))

                # NOTE(review): the previous condition was
                # `paywall_json or paywall_json[0] == 'TRUE' or ...`, which is
                # true for any non-empty list and raised IndexError for an
                # empty one; the value checks were never reached.  That
                # outcome is kept (any hit counts) - confirm whether the
                # 'TRUE' / status == "READY" checks were meant to narrow it.
                # The result is now actually returned; before, the function
                # always returned None.
                return bool(paywall_json)
Ejemplo n.º 7
0
    def ci2json(args, url):
        """ Loads Cook's Illustrated (and affiliated) URL and checks for
            authentication and then builds Recipe JSON
        """

        def get_json(args, url):
            """ Get JSON from page """

            import pickle

            def find_script(source_html):
                """ Parse the JSON held in the page's ``__NEXT_DATA__`` script.

                Returns None when ``source_html`` is None; otherwise the
                decoded JSON of the first ``<script id="__NEXT_DATA__">``.
                """
                if source_html is not None:
                    document = html.fromstring(source_html)
                    next_data = document.xpath('//script[@id="__NEXT_DATA__"]')[0]
                    return json.loads(next_data.text)
                return None

            def found_paywall(source_json):
                """ Report whether ``source_json`` contains a 'paywall' entry.

                Returns:
                    True when ``json_find_key`` yields at least one value for
                    the key 'paywall', otherwise False.
                """
                paywall_json = list(json_find_key(source_json, 'paywall'))

                # NOTE(review): the previous condition was
                # `paywall_json or paywall_json[0] == 'TRUE' or ...`, which is
                # true for any non-empty list and raised IndexError for an
                # empty one; the value checks were never reached.  That
                # outcome is kept (any hit counts) - confirm whether the
                # 'TRUE' / status == "READY" checks were meant to narrow it.
                # The result is now actually returned; before, the function
                # always returned None.
                return bool(paywall_json)

            def cookie_filename(url):
                """ Build the cookie file name ('.<domain>.cookies') for ``url``. """
                domain = url2domain(url)
                return '.' + domain + '.cookies'

            def save_cookies(requests_cookiejar, url):
                """ Pickle the cookie jar to disk.

                The file goes to ``~/.config/recipe-dl`` (created when
                ``~/.config`` exists); when ``~/.config`` is missing, the
                current directory is used instead.

                Parameters:
                    requests_cookiejar: cookie jar object to pickle.
                    url: page address; selects the cookie file name.

                Raises:
                    OSError: when the directory cannot be created or the
                        file cannot be written.
                """
                name = cookie_filename(url)
                home = os.path.expanduser('~')
                path = os.path.join(home, '.config', 'recipe-dl')
                if not os.path.isdir(path):
                    if os.path.isdir(os.path.join(home, '.config')):
                        os.makedirs(path, exist_ok=True)
                    else:
                        # No ~/.config: fall back to the current directory.
                        path = '.'

                # Join exactly once.  The fallback used to prepend "./" to
                # the already complete ~/.config path, giving a file name
                # that could not be opened.
                filename = os.path.join(path, name)

                print_debug ('Saving cookies to ' + path )
                with open(filename, 'wb') as f:
                    pickle.dump(requests_cookiejar, f)

            def load_cookies(url):
                """ Load a pickled cookie jar for ``url``.

                Search order: current directory, ``~/.config/recipe-dl``,
                then the directory containing this script.  Returns the
                unpickled object, or None when no cookie file exists.
                """
                print_debug ('Loading cookies...')

                candidate = cookie_filename(url)
                if not os.path.isfile(candidate):
                    print_debug ("Unable to find " + candidate + ' locally.')
                    config_dir = os.path.expanduser('~') + "/.config/recipe-dl"
                    print_debug ("Searching " + config_dir)
                    config_candidate = config_dir + '/' + candidate
                    if os.path.isfile(config_candidate):
                        candidate = config_candidate
                    else:
                        print_debug ("Not found. Using script location.")
                        script_dir = os.path.dirname(os.path.abspath(__file__))
                        candidate = script_dir + "/" + candidate

                if not os.path.isfile(candidate):
                    print_debug ("Unable to find " + candidate)
                    return None

                print_debug("found.")
                # NOTE: pickle.load executes whatever the file encodes; only
                # cookie files written by this tool should be placed here.
                with open(candidate, 'rb') as jar_file:
                    return pickle.load(jar_file)

            def get_credentials():
                """ Retrieve Credentals """

                def input_credential(prompt):
                    """ Prompt and input credentals """
                    credential = ''
                    while credential == '':
                        print_to_console(prompt)
                        credential = input()
                    return credential

                credential_json = {}
                credential_json['user'] = input_credential("Enter email address:")
                credential_json['pass'] = input_credential("Enter password:"******""" Load page using existing cookies """

                print_debug ("Getting page using cookies...")

                recipe_page = None

                if cookies is None:
                    #load cookies and do a request
                    cookies = load_cookies(url)

                if not cookies is None:
                    print_debug ('cookies = ' + str(requests.utils.dict_from_cookiejar(cookies)))
                    recipe_page = requests.get(url, cookies=cookies).text

                return recipe_page

            def get_page_using_session(args, url):
                """ Sign in with prompted credentials and fetch ``url``.

                Loads the site's sign-in form, posts it back with the
                credentials filled in, saves the session cookies and returns
                the text of the requested page.
                """

                print_debug ("Getting page using sessions...")

                credentials = get_credentials()

                session = requests.session()

                site = url2domain(url)
                signin_url = "https://" + site +"/sign_in?next=%2F"

                form_tree = html.fromstring(session.get(signin_url).text)
                form_action = form_tree.xpath('//form[@class="appForm"]/@action')[0]

                # Start from the form's own inputs, then override the fields
                # that carry the login data.
                form_data = {
                    field.name: field.value
                    for field in form_tree.xpath('//form[@class="appForm"]//input')
                }
                form_data['utf8'] = '&#x2713;'
                form_data['user[email]'] = credentials['user']
                form_data['user[password]'] = credentials['pass']

                # Perform login
                session.post("https://" + site + form_action + "?next=%2F",
                             data=form_data,
                             headers={'referer': signin_url})

                save_cookies(session.cookies, url)

                # Grab page
                return session.get(url, headers={'referer': url}).text

            raw_json = json.loads('{ "paywall": true }')
            if not args.authorize_ci:
                # Getting file using cookies
                raw_html = get_page_using_cookie(args, url)
                if not raw_html is None:
                    raw_json = find_script(raw_html)

            if args.authorize_ci or raw_html is None or found_paywall(raw_json):
                'Getting page using full authentication'
                raw_html = get_page_using_session(args, url)
                raw_json = find_script(raw_html)

            raw_json = raw_json['props']['initialState']['content']['documents']
            raw_json = raw_json[list(json.loads(json.dumps(raw_json)))[0]]

            return raw_json

        print_debug("Using Cook's Illustrated scraper...")
        recipe_json={}
        recipe_json['url'] = url

        source_json = get_json(args, url)

        if not source_json is None:
            print_debug(str(source_json))
            recipe_json['title'] = json_clean_value(source_json, 'title')
            recipe_json['description'] = strip_tags(json_clean_value(source_json['metaData']['fields'], 'description'), strip_newline = True)
            recipe_json['yield'] = json_clean_value(source_json, 'yields')

            # Parse Times
            time_note = json_clean_value(source_json, 'recipeTimeNote')
            if time_note == '':
                time_note = 'TBD'
            recipe_json['preptime'] = ''
            recipe_json['cooktime'] = ''
            recipe_json['totaltime'] = time_note

            author = json_clean_value(source_json['metaData']['fields'], 'source')
            if author == '':
                author = url2publisher(url)
            recipe_json['author'] = author

            # Ingredients
            recipe_json['ingredient_groups'] = []
            ingredient_groups = json_clean_value(source_json, "ingredientGroups")
            for group in ingredient_groups:
                group_json = json.loads('{"title":"","ingredients":[]}')
                if len(ingredient_groups) > 1:
                    group_json['title'] = json_clean_value(group['fields'], 'title')
                ingredients = json_clean_value(group['fields'], "recipeIngredientItems")
                for ingredient in ingredients:
                    qty  = json_clean_value(ingredient['fields'], "qty")
                    unit  = json_clean_value(ingredient['fields'], "preText")
                    item  = json_clean_value(json_clean_value(ingredient['fields'], "ingredient", json.loads('{"fields": ""}'))['fields'], 'title')
                    modifier  = json_clean_value(ingredient['fields'], "postText")
                    group_json['ingredients'].append(strip_tags("%s %s %s%s" % (qty, unit, item, modifier), strip_newline = True))
                recipe_json['ingredient_groups'].append(group_json)

            # Directions
            recipe_json['direction_groups'] = []
            group_json = json.loads('{"group":"","directions":[]}')
            steps = json_clean_value(source_json, "instructions")
            for step in steps:
                group_json['directions'].append(strip_tags(json_clean_value(step['fields'], "content"), strip_newline = True))
            recipe_json['direction_groups'].append(group_json)

            recipe_json['notes'] = []
            recipe_json['notes'].append(strip_tags(json_clean_value(source_json, 'headnote'), strip_newline = True))

        else:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
Ejemplo n.º 8
0
    def generic2json(args, url):
        """ Loads generic URL and builds recipe JSON

        Looks for a schema.org "Recipe" object inside the page's
        application/ld+json script blocks and maps it onto the recipe JSON
        layout (url, title, description, yield, times, author,
        ingredient_groups, direction_groups).  When no usable Recipe object is
        found, the work is handed to recipe_scraper2json().

        args -- passed through unchanged to recipe_scraper2json()
        url  -- address of the page to load
        """

        def get_json(url):
            """ Find and load "standardized" json document containing recipe

            Returns the first ld+json Recipe object that has a
            'recipeIngredient' key, with its 'publisher' entry replaced by a
            publisher name, or None when the page holds no such object.
            """
            user_agent = {'User-agent': 'Mozilla/5.0'}
            page = requests.get(url, headers = user_agent)

            if not re.search(r'<script[^>]*type=.?application/ld\+json.?[^>]*>', page.text):
                return None

            print_debug("Found an occurance of 'application/ld+json'")
            soup = BeautifulSoup(page.text, 'html5lib')
            scripts = soup.findAll('script', attrs = {'type':'application/ld+json'})
            for script in scripts:
                # Drop anything (comments, CDATA markers) ahead of the first '{' or '['.
                json_stripped = re.sub(r'^[^\{\[]*', '', script.text)
                try:
                    raw_json = json.loads(json_stripped)
                except ValueError:
                    # One malformed block should not hide a valid one later on.
                    continue

                # 'siblings' is the array the Recipe sits in, when there is one;
                # it is also where a separate Organization entry may be found.
                siblings = None
                if isinstance(raw_json, list):
                    siblings = raw_json
                elif isinstance(raw_json, dict) and isinstance(raw_json.get('@graph'), list):
                    siblings = raw_json['@graph']

                if siblings is not None:
                    candidate = json_find_array_element(siblings, '@type', 'Recipe')
                elif isinstance(raw_json, dict) and raw_json.get('@type') == 'Recipe':
                    candidate = raw_json
                else:
                    candidate = None

                if not isinstance(candidate, dict) or 'recipeIngredient' not in candidate:
                    continue

                # Publisher name: the recipe's own publisher first, then a
                # sibling Organization entry, finally a name derived from the URL.
                try:
                    publisher = json_clean_value(json_clean_value(candidate, 'publisher', {}), 'name', '')
                    if publisher == '' and siblings is not None:
                        publisher = json_clean_value(json_find_array_element(siblings, '@type', 'Organization'), 'name', '')
                except Exception:
                    # Best effort only: an oddly shaped publisher must not lose the recipe.
                    publisher = ''
                if not publisher:
                    publisher = url2publisher(url)
                candidate['publisher'] = publisher

                # Stop at the first usable recipe so later blocks cannot overwrite it.
                return candidate

            return None

        print_debug("Using generic scraper...")
        source_json = get_json(url)

        if source_json is None:
            print_info("No application+ld json attempting to use recipe-scrapers...")
            return recipe_scraper2json(args, url)

        print_debug(json.dumps(source_json))

        recipe_json = {'url': url}
        recipe_json['title'] = json_clean_value(source_json, 'headline', json_clean_value(source_json, 'name'))
        recipe_json['description'] = json_clean_value(source_json, 'description')
        yields = source_json.get('recipeYield')
        if isinstance(yields, list) and yields:
            recipe_json['yield'] = max(yields)
        else:
            recipe_json['yield'] = json_clean_value(source_json, 'recipeYield')

        # Parse Times
        minutes_total = iso8601.to_minutes(json_clean_value(source_json, 'totalTime'))
        minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'cookTime'))
        minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'prepTime'))
        # Derive whichever of prep/total is missing from the other two.
        if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
            minutes_prep = minutes_total - minutes_cook
        if minutes_total == 0 and (minutes_prep > 0 or minutes_cook > 0):
            minutes_total = minutes_prep + minutes_cook
        recipe_json['preptime'] = minutes2time(minutes_prep, '')
        recipe_json['cooktime'] = minutes2time(minutes_cook, '')
        recipe_json['totaltime'] = minutes2time(minutes_total)

        # Parse Author
        publisher = json_clean_value(source_json, 'publisher')
        author = json_clean_value(source_json, 'author')
        # The author may be a plain string, an object with a 'name', or a
        # list of either; reduce it to a single name string.
        if isinstance(author, list):
            author = author[0] if author else ''
        if isinstance(author, dict):
            author = author.get('name', '')
        if not isinstance(author, str):
            author = ''
        if publisher != "":
            if author == "" or publisher == author:
                author = publisher
            elif publisher not in author:
                author = publisher + ' (' + author + ')'
        recipe_json['author'] = author

        # Ingredients
        ingredients = list(json_find_key(source_json, "recipeIngredient"))[0]
        recipe_json['ingredient_groups'] = [{
            "title": "",
            "ingredients": [strip_tags(ingredient) for ingredient in ingredients],
        }]

        # Directions
        found_sections = list(json_find_key(source_json, 'recipeInstructions'))
        instructions_section = found_sections[0] if found_sections else []
        try:
            instructions = list(json_find_key(source_json, 'itemListElement'))[0]
        except IndexError:
            instructions = instructions_section
        print_debug(str(instructions))

        out_instruction = []
        if isinstance(instructions, list):
            for instruction in instructions:
                try:
                    out_instruction.append(strip_tags(instruction['text']))
                except Exception:
                    # Not an object with a 'text' entry: use the step verbatim.
                    out_instruction.append(strip_tags(str(instruction)))
        else:
            out_instruction.append(strip_tags(str(instructions)))

        recipe_json['direction_groups'] = [{"group": "", "directions": out_instruction}]

        return recipe_json
Ejemplo n.º 9
0
    def epicurious2json(args, url):
        """ Loads Epicurious URL and builds recipe JSON

        Reads the recipe from the 'root.__INITIAL_STATE__.store' assignment
        embedded in one of the page's script tags and maps it onto the recipe
        JSON layout (url, title, description, yield, times, author,
        ingredient_groups, direction_groups).

        args -- not used here; kept for a signature parallel to the other scrapers
        url  -- address of the page to load

        Raises UrlError when the page holds no such embedded state.
        """

        def get_json(args, url):
            """ Find and load "standardized" json document containing recipe

            Returns the 'content' entry of the embedded state, or None when no
            script line assigns 'root.__INITIAL_STATE__.store'.
            """
            return_value = None
            marker = re.compile(r'root\.__INITIAL_STATE__\.store')

            page = BeautifulSoup(requests.get(url).text, 'html5lib')
            for script in page.findAll('script'):
                for line in script.text.splitlines():
                    if not marker.search(line):
                        continue
                    # Keep only the outermost '{ ... }' of the assignment.
                    raw_json_text = re.sub(r'[^}]*$', '', line)
                    raw_json_text = re.sub(r'^[^{]*', '', raw_json_text)
                    # Blank out the embedded validation regexes before handing
                    # the text to the JSON parser.
                    raw_json_text = re.sub('"email":{"regExp":.*,"password"', '"email":{"regExp":"","password"', raw_json_text)
                    raw_json_text = re.sub('"password":{"regExp":.*,"messages"', '"password":{"regExp":""},"messages"', raw_json_text)
                    raw_json = json.loads(raw_json_text)
                    return_value = json_clean_value(raw_json, 'content', {})
            return return_value

        print_debug("Using Epicurious scraper...")
        recipe_json = {'url': url}

        source_json = get_json(args, url)
        if source_json is None:
            raise UrlError(url, 'URL not supported.')

        recipe_json['title'] = json_clean_value(source_json, 'hed')
        recipe_json['description'] = strip_tags(json_clean_value(source_json, 'dek'))
        recipe_json['yield'] = json_clean_value(json_clean_value(source_json, 'servingSizeInfo', {}), 'servingSizeDescription')

        # Parse Times
        minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'formattedPrepTime'))
        minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'formattedCookTime'))
        # No separate total is read from the source; it is the sum of the two.
        minutes_total = minutes_prep + minutes_cook
        recipe_json['preptime'] = minutes2time(minutes_prep, '')
        recipe_json['cooktime'] = minutes2time(minutes_cook, '')
        recipe_json['totaltime'] = minutes2time(minutes_total)

        # Parse Author
        publisher = "Epicurious"
        author = json_clean_value(source_json, 'author', '')
        # The author may be a plain string, an object with a 'name', or a
        # list of either; reduce it to a single name string.
        if isinstance(author, list):
            author = author[0] if author else ''
        if isinstance(author, dict):
            author = author.get('name', '')
        if not isinstance(author, str):
            author = ''
        if author == "" or publisher == author:
            author = publisher
        elif publisher not in author:
            author = publisher + ' (' + author + ')'
        recipe_json['author'] = author

        # Ingredients
        recipe_json['ingredient_groups'] = []
        ingredient_groups = json_clean_value(source_json, "ingredientGroups")
        for group in ingredient_groups:
            group_json = {"title": "", "ingredients": []}
            # A heading is only kept when there is more than one group.
            if len(ingredient_groups) > 1:
                group_json['title'] = json_clean_value(group, "hed")
            for ingredient in json_clean_value(group, "ingredients"):
                group_json['ingredients'].append(strip_tags(json_clean_value(ingredient, "description")))
            recipe_json['ingredient_groups'].append(group_json)

        # Directions
        recipe_json['direction_groups'] = []
        direction_groups = json_clean_value(source_json, "preparationGroups")
        for group in direction_groups:
            group_json = {"group": "", "directions": []}
            if len(direction_groups) > 1:
                group_json['group'] = strip_tags(json_clean_value(group, "hed"))
            for step in json_clean_value(group, "steps"):
                group_json['directions'].append(strip_tags(json_clean_value(step, "description")))
            recipe_json['direction_groups'].append(group_json)

        return recipe_json