Python fix_bad_unicode Examples

Programming Language: Python

Namespace/Package Name: fix_unicode

Method/Function: fix_bad_unicode

Examples at hotexamples.com: 5

Python fix_bad_unicode - 5 examples found. These are the top-rated real-world Python examples of fix_unicode.fix_bad_unicode, extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: catalog.py Project: samv/disk-catalogue

 def get_filename(self, fobj, relname):
     """Fetch the Filename row for ``relname`` on ``fobj``, creating it if absent.

     ``relname`` is decoded as UTF-8; if that fails it is decoded as
     Latin-1 and passed through ``fix_bad_unicode`` (a message is printed
     in that case).  When the decoded text does not re-encode to exactly
     ``relname``, the original value is stored as ``filename_raw``;
     otherwise ``filename_raw`` is None.

     A newly created Filename is added to ``self.sesh`` before being
     returned.
     """
     try:
         decoded_name = relname.decode("utf8")
     except UnicodeDecodeError:
         decoded_name = fix_bad_unicode(relname.decode("latin1"))
         report = u"Tidied {rn} to '{new}'".format(
             rn=repr(relname),
             new=decoded_name,
         )
         print(report)

     # Keep the raw name only when the decoded text cannot reproduce it.
     if decoded_name.encode("utf8") != relname:
         raw_name = relname
     else:
         raw_name = None

     try:
         return self.sesh.query(Filename).filter(
             Filename.volume_id == fobj.volume_id,
             Filename.inode_num == fobj.inode_num,
             Filename.filename == decoded_name,
         ).one()
         #TODO: check other names & remove if necc.
         #FIXME: multiple links to same file differing only by unicode!
     except NoResultFound:
         created = Filename(
             volume_id=fobj.volume_id,
             inode_num=fobj.inode_num,
             filename=decoded_name,
             filename_raw=raw_name,
         )
         self.sesh.add(created)
         return created

Example #2

Show file

File: edx2course_axis.py Project: wellesleycollege/edx2bigquery

        def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
                 parent_url_name=None, is_split_flag=False):
            '''
            Recursively traverse course tree.

            For every element that has (or can derive) a url_name, one Axel record is
            appended to caxis and index[0] is incremented.  Children are then walked,
            except below html, problem, discussion, customtag, poll_question,
            combinedopenended and metadata elements.

            x        = current etree element
            seq_num  = sequence of current element in its parent, starting from 1
            path     = list of url_name's to current element, following edX's hierarchy conventions
                       (None, the default, is treated as an empty list)
            seq_type = problemset, sequential, or videosequence
            parent_start = start date of parent of current etree element
            parent   = parent module
            chapter  = the last chapter module_id seen while walking through the tree
            parent_url_name = url_name of parent
            is_split_flag   = boolean indicating if this subtree is within a split_test

            Returns None.
            '''
            # A fresh list per call instead of a mutable default argument.
            if path is None:
                path = []

            url_name = x.get('url_name', x.get('url_name_orig', ''))
            if not url_name:
                dn = x.get('display_name')
                if dn is not None:
                    # 2012 convention for converting display_name to url_name
                    url_name = dn.strip().replace(' ', '_')
                    url_name = url_name.replace(':', '_')
                    url_name = url_name.replace('.', '_')
                    url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

            data = None
            start = None
            # module_id is only computed for elements with a url_name; pre-set it so the
            # chapter bookkeeping below cannot hit an unbound local.
            module_id = None

            if not FORCE_NO_HIDE:
                hide = policy.get_metadata(x, 'hide_from_toc')
                if hide is not None and hide != "false":
                    logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
                    return

            # special: for video, let data = youtube ID(s)
            if x.tag == 'video':
                data = x.get('youtube', '')
                if data:
                    # old ytid format - extract just the 1.0 part of this
                    # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
                    ytid = data.replace(' ', '').split(',')
                    ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
                    # print "   ytid: %s -> %s" % (x.get('youtube',''), ytid)
                    if ytid:
                        # NOTE(review): ytid is a list here, so the %s below renders the
                        # list's repr rather than a bare id - confirm consumers expect that.
                        data = ytid
                if not data:
                    data = x.get('youtube_id_1_0', '')
                if data:
                    data = '{"ytid": "%s"}' % data

            if x.tag == "split_test":
                data = {}
                for tc in ['group_id_to_child', 'user_partition_id']:
                    data[tc] = x.get(tc, None)

            if x.tag == 'problem' and x.get('weight'):
                try:
                    data = '{"weight": %f}' % float(x.get('weight'))
                except (TypeError, ValueError):
                    # Non-numeric weight: log it and leave data unchanged.
                    logit("    Error converting weight %s" % x.get('weight'))

            if x.tag == 'html':
                iframe = x.find('.//iframe')
                if iframe is not None:
                    logit("   found iframe in html %s" % url_name)
                    src = iframe.get('src', '')
                    if 'https://www.youtube.com/embed/' in src:
                        m = re.search('embed/([^"/?]+)', src)
                        if m:
                            data = '{"ytid": "%s"}' % m.group(1)
                            logit("    data=%s" % data)

            if url_name:              # url_name is mandatory if we are to do anything with this element
                # url_name = url_name.replace(':','_')
                dn = x.get('display_name', url_name)
                try:
                    #dn = dn.decode('utf-8')
                    dn = unicode(dn)
                    dn = fix_bad_unicode(dn)
                except Exception:
                    logit('unicode error, type(dn)=%s'  % type(dn))
                    raise
                pdn = policy.get_metadata(x, 'display_name')      # policy display_name - if given, let that override default
                if pdn is not None:
                    dn = pdn

                #start = date_parse(x.get('start', policy.get_metadata(x, 'start', '')))
                start = date_parse(policy.get_metadata(x, 'start', '', parent=True))

                # An element never starts before its parent.
                if parent_start is not None and start < parent_start:
                    if VERBOSE_WARNINGS:
                        logit("    Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
                    start = parent_start
                #print "start for %s = %s" % (x, start)

                # drop bad due date strings
                if date_parse(x.get('due', None), retbad=True) == 'Bad':
                    x.set('due', '')

                due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
                if x.tag == "problem":
                    logit("    setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

                gformat = x.get('format', policy.get_metadata(x, 'format', ''))
                if url_name == 'hw0':
                    logit( "gformat for hw0 = %s" % gformat)

                graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

                # compute path
                # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
                if x.tag == 'chapter':
                    path = [url_name]
                elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
                    seq_type = x.tag
                    path = [path[0], url_name]
                else:
                    path = path[:] + [str(seq_num)]      # note arrays are passed by reference, so copy, don't modify

                # compute module_id
                if x.tag == 'html':
                    module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))  # module_id which appears in tracking log
                else:
                    module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

                # debugging
                # print "     module %s gformat=%s" % (module_id, gformat)

                # done with getting all info for this axis element; save it
                path_str = '/' + '/'.join(path)
                ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                          parent_url_name,
                          is_split_flag)
                caxis.append(ae)
                index[0] += 1
            else:
                if VERBOSE_WARNINGS:
                    if x.tag in ['transcript', 'wiki', 'metadata']:
                        pass
                    else:
                        logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

            # chapter?  A chapter without a url_name has no module_id of its own, so its
            # children keep the chapter inherited from above.
            if x.tag == 'chapter' and module_id is not None:
                the_chapter = module_id
            else:
                the_chapter = chapter

            # done processing this element, now process all its children
            if x.tag not in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']:
                inherit_seq_num = (x.tag == 'vertical' and not url_name)    # if <vertical> with no url_name then keep seq_num for children
                if not inherit_seq_num:
                    seq_num = 1
                for y in x:
                    # skip XML comments and tags that never become axis elements
                    if (not str(y).startswith('<!--')) and (y.tag not in ['discussion', 'source']):
                        walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                             parent_url_name=url_name,
                             is_split_flag=((x.tag == "split_test") or is_split_flag),
                        )
                        if not inherit_seq_num:
                            seq_num += 1

Example #3

Show file

File: edx2course_axis.py Project: musixhine/edx2bigquery

        def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
                 parent_url_name=None, split_url_name=None):
            '''
            Recursively traverse course tree.

            For every element that has (or can derive) a url_name, one Axel record is
            appended to caxis and index[0] is incremented.  Children are then walked,
            except below html, problem, discussion, customtag, poll_question,
            combinedopenended and metadata elements.

            x        = current etree element
            seq_num  = sequence of current element in its parent, starting from 1
            path     = list of url_name's to current element, following edX's hierarchy conventions
                       (None, the default, is treated as an empty list)
            seq_type = problemset, sequential, or videosequence
            parent_start = start date of parent of current etree element
            parent   = parent module
            chapter  = the last chapter module_id seen while walking through the tree
            parent_url_name = url_name of parent
            split_url_name   = url_name of split_test element if this subtree is in a split_test, otherwise None

            Returns None.
            '''
            # A fresh list per call instead of a mutable default argument.
            if path is None:
                path = []

            url_name = x.get('url_name', x.get('url_name_orig', ''))
            if not url_name:
                dn = x.get('display_name')
                if dn is not None:
                    # 2012 convention for converting display_name to url_name
                    url_name = dn.strip().replace(' ', '_')
                    url_name = url_name.replace(':', '_')
                    url_name = url_name.replace('.', '_')
                    url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

            data = None
            start = None
            # module_id is only computed for elements with a url_name; pre-set it so the
            # chapter bookkeeping below cannot hit an unbound local.
            module_id = None

            if not FORCE_NO_HIDE:
                hide = policy.get_metadata(x, 'hide_from_toc')
                if hide is not None and hide != "false":
                    logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
                    return

            # special: for video, let data = youtube ID(s)
            if x.tag == 'video':
                data = x.get('youtube', '')
                if data:
                    # old ytid format - extract just the 1.0 part of this
                    # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
                    ytid = data.replace(' ', '').split(',')
                    ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
                    # print "   ytid: %s -> %s" % (x.get('youtube',''), ytid)
                    if ytid:
                        # NOTE(review): ytid is a list here, so the %s below renders the
                        # list's repr rather than a bare id - confirm consumers expect that.
                        data = ytid
                if not data:
                    data = x.get('youtube_id_1_0', '')
                if data:
                    data = '{"ytid": "%s"}' % data

            if x.tag == "split_test":
                data = {}
                for tc in ['group_id_to_child', 'user_partition_id']:
                    data[tc] = x.get(tc, None)

            if x.tag == 'problem' and x.get('weight'):
                try:
                    # Kept as a dict (not a string) so the problem block below can extend it.
                    data = {"weight": "%f" % float(x.get('weight'))}
                except (TypeError, ValueError):
                    # Non-numeric weight: log it and leave data unchanged.
                    logit("    Error converting weight %s" % x.get('weight'))

            ### Had a hard time making my code work within the try/except for weight. Happy to improve
            ### Also note, weight is typically missing in problems. So I find it weird that we throw an exception.
            if x.tag == 'problem':
                # Initialize data if no weight
                if not data:
                    data = {}

                # meta will store all problem related metadata, then be used to update data
                meta = {}
                # Items is meant to help debug - an ordered list of encountered problem types with url names
                # Likely should not be pulled to Big Query
                meta['items'] = []
                # Known Problem Types
                known_problem_types = ['multiplechoiceresponse','numericalresponse','choiceresponse',
                                       'optionresponse','stringresponse','formularesponse',
                                       'customresponse','fieldset']

                # Loop through all child nodes in a problem. If encountering a known problem type, add metadata.
                for a in x:
                    if a.tag in known_problem_types:
                        meta['items'].append({'itype':a.tag,'url_name':a.get('url_name')})

                ### Check for accompanying image
                images = x.findall('.//img')
                # meta['has_image'] = False

                if images:
                    meta['has_image'] = True #Note, one can use a.get('src'), but needs to account for multiple images
                    # print meta['img'],len(images)

                ### Search for all solution tags in a problem
                solutions = x.findall('.//solution')
                # meta['has_solution'] = False

                for sol in solutions:
                    # Serialize each solution's children on their own.  The previous
                    # solution's markup must not be used as the join separator, or the
                    # length test below would count text that is not in this solution.
                    text = ''.join(html.tostring(e, pretty_print=False) for e in sol)
                    # This check is applied per solution. Note, many MITx problems have multiple solution tags.
                    # In 8.05x, common to put image in one solution tag, and the text in a second.
                    # One solution with > 65 char, or one solution with an image, is enough.
                    if len(text) > 65 or 'img src' in text:
                        meta['has_solution'] = True
                        break

                ### If meta is empty, log all tags for debugging later.
                # NOTE(review): meta always holds the 'items' key at this point, so this
                # branch cannot run as written - confirm the intended condition.
                if len(meta)==0:
                    logit('item type not found - here is the list of tags:['+','.join(a.tag if a else ' ' for a in x)+']')
                    # print 'problem type not found - here is the list of tags:['+','.join(a.tag for a in x)+']'

                ### Add easily accessible metadata for problems
                # num_items: number of items
                # itype: problem type - note, mixed is used when items are not of same type
                if len(meta['items']) > 0:
                    # Number of Items
                    meta['num_items'] = len(meta['items'])

                    # Problem Type
                    if all(meta['items'][0]['itype'] == item['itype'] for item in meta['items']):
                        meta['itype'] = meta['items'][0]['itype']
                        # print meta['items'][0]['itype']
                    else:
                        meta['itype'] = 'mixed'

                # Update data field
                ### ! For now, removing the items field.
                del meta["items"]

                data.update(meta)
                data = json.dumps(data)

            if x.tag == 'html':
                iframe = x.find('.//iframe')
                if iframe is not None:
                    logit("   found iframe in html %s" % url_name)
                    src = iframe.get('src', '')
                    if 'https://www.youtube.com/embed/' in src:
                        m = re.search('embed/([^"/?]+)', src)
                        if m:
                            data = '{"ytid": "%s"}' % m.group(1)
                            logit("    data=%s" % data)

            if url_name:              # url_name is mandatory if we are to do anything with this element
                # url_name = url_name.replace(':','_')
                dn = x.get('display_name', url_name)
                try:
                    #dn = dn.decode('utf-8')
                    dn = unicode(dn)
                    dn = fix_bad_unicode(dn)
                except Exception:
                    logit('unicode error, type(dn)=%s'  % type(dn))
                    raise
                pdn = policy.get_metadata(x, 'display_name')      # policy display_name - if given, let that override default
                if pdn is not None:
                    dn = pdn

                #start = date_parse(x.get('start', policy.get_metadata(x, 'start', '')))
                start = date_parse(policy.get_metadata(x, 'start', '', parent=True))

                # An element never starts before its parent.
                if parent_start is not None and start < parent_start:
                    if VERBOSE_WARNINGS:
                        logit("    Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
                    start = parent_start
                #print "start for %s = %s" % (x, start)

                # drop bad due date strings
                if date_parse(x.get('due', None), retbad=True) == 'Bad':
                    x.set('due', '')

                due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
                if x.tag == "problem":
                    logit("    setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

                gformat = x.get('format', policy.get_metadata(x, 'format', ''))
                if url_name == 'hw0':
                    logit( "gformat for hw0 = %s" % gformat)

                graded = x.get('graded', policy.get_metadata(x, 'graded', ''))
                # Exact type check kept on purpose: anything that is not a plain
                # unicode/str object is converted with str().
                if not (type(graded) in [unicode, str]):
                    graded = str(graded)

                # compute path
                # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
                if x.tag == 'chapter':
                    path = [url_name]
                elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
                    seq_type = x.tag
                    path = [path[0], url_name]
                else:
                    path = path[:] + [str(seq_num)]      # note arrays are passed by reference, so copy, don't modify

                # compute module_id
                if x.tag == 'html':
                    module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))  # module_id which appears in tracking log
                else:
                    module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

                # debugging
                # print "     module %s gformat=%s" % (module_id, gformat)

                # done with getting all info for this axis element; save it
                path_str = '/' + '/'.join(path)
                ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                          parent_url_name,
                          split_url_name is not None,
                          split_url_name)
                caxis.append(ae)
                index[0] += 1
            else:
                if VERBOSE_WARNINGS:
                    if x.tag in ['transcript', 'wiki', 'metadata']:
                        pass
                    else:
                        logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

            # chapter?  A chapter without a url_name has no module_id of its own, so its
            # children keep the chapter inherited from above.
            if x.tag == 'chapter' and module_id is not None:
                the_chapter = module_id
            else:
                the_chapter = chapter

            # done processing this element, now process all its children
            if x.tag not in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']:
                inherit_seq_num = (x.tag == 'vertical' and not url_name)    # if <vertical> with no url_name then keep seq_num for children
                if not inherit_seq_num:
                    seq_num = 1

                # Children of a split_test get its url_name, unless an enclosing
                # split_test already supplied one.
                if not split_url_name and x.tag == "split_test":
                    split_url_name = url_name

                for y in x:
                    # skip XML comments and tags that never become axis elements
                    if (not str(y).startswith('<!--')) and (y.tag not in ['discussion', 'source']):
                        walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                             parent_url_name=url_name,
                             split_url_name=split_url_name,
                        )
                        if not inherit_seq_num:
                            seq_num += 1

Example #4

Show file

File: edx2course_axis.py Project: mitodl/edx2course_axis

def walk(
        element, course, cid, org, policy, index, caxis, seq_num=1, paths=None,
        seq_type=None, parent_start=None, parent=None, chapter=None):
    """
    Recursively traverse course tree.

    For every element that has (or can derive) a url_name, one Axel record
    is appended to ``caxis`` and ``index[0]`` is incremented.  Children are
    then walked, except below html, problem, discussion, customtag,
    poll_question, combinedopenended and metadata elements.

    element  = current etree element
    course   = course identifier used when building module ids
    cid      = course id stored on each Axel record
    org      = organization identifier used when building module ids
    policy   = object whose get_metadata(element, key, ...) supplies metadata
    index    = one-element list holding the running axis index (mutated)
    caxis    = list that collects the Axel records (mutated)
    seq_num  = sequence of current element in its parent, starting from 1
    paths     = list of url_name's to current element, following edX's hierarchy conventions
    seq_type = problemset, sequential, or videosequence
    parent_start = start date of parent of current etree element
    parent   = parent module
    chapter  = the last chapter module_id seen while walking through the tree

    Returns None.
    """

    # Fixes dangerous-default-value.
    if paths is None:
        paths = []

    url_name = element.get('url_name', element.get('url_name_orig', ''))
    if not url_name:
        display_name = element.get('display_name')
        if display_name is not None:
            # 2012 convention for converting display_name to url_name
            url_name = display_name.strip().replace(' ', '_')
            url_name = url_name.replace(':', '_')
            url_name = url_name.replace('.', '_')
            url_name = url_name.replace(
                '(', '_').replace(')', '_').replace('__', '_')

    data = None
    start = None
    # module_id is only computed for elements with a url_name; pre-set it so
    # the chapter bookkeeping below cannot hit an unbound local.
    module_id = None

    if not FORCE_NO_HIDE:
        hide = policy.get_metadata(element, 'hide_from_toc')
        if hide is not None and hide != "false":
            # Three arguments are supplied, so the last placeholder is {2}.
            msg = (
                '[edx2course_axis] Skipping {0} ({1}), it has '
                'hide_from_toc={2}'
            )
            log.debug(
                msg.format(
                    element.tag, element.get('display_name', '<noname>'), hide)
            )
            return

    # special: for video, let data = youtube ID(s)
    if element.tag == 'video':
        data = element.get('youtube', '')
        if data:
            # old ytid format - extract just the 1.0 part of this
            # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
            ytid = data.replace(' ', '').split(',')
            ytid = [
                z[1] for z in [
                    y.split(':') for y in ytid] if z[0] == '1.0']
            if ytid:
                # NOTE(review): ytid is a list here, so the %s below renders
                # the list's repr rather than a bare id - confirm consumers
                # expect that.
                data = ytid
        if not data:
            data = element.get('youtube_id_1_0', '')
        if data:
            data = '{"ytid": "%s"}' % data

    if element.tag == 'problem' and element.get('weight'):
        try:
            data = '{"weight": %f}' % float(element.get('weight'))
        except (TypeError, ValueError) as err:
            log.error("Error converting weight {0}: {1}".format(
                element.get('weight'), err,
            ))

    if element.tag == 'html':
        iframe = element.find('.//iframe')
        if iframe is not None:
            log.debug("found iframe in html {0}".format(url_name))
            src = iframe.get('src', '')
            if 'https://www.youtube.com/embed/' in src:
                match = re.search('embed/([^"/?]+)', src)
                if match:
                    data = '{"ytid": "%s"}' % match.group(1)
                    log.debug("data={0}".format(data))

    # url_name is mandatory if we are to do anything with this element
    if url_name:
        # url_name = url_name.replace(':','_')
        display_name = element.get('display_name', url_name)
        try:
            display_name = unicode(display_name)
            display_name = fix_bad_unicode(display_name)
        except Exception:
            log.error(
                'unicode error, type(display_name)={0}'.format(
                    type(display_name)))
            # Bare raise keeps the original traceback.
            raise
        # policy display_name - if given, let that override default
        pdn = policy.get_metadata(element, 'display_name')
        if pdn is not None:
            display_name = pdn

        start = date_parse(
            policy.get_metadata(element, 'start', '', parent=True))

        # An element never starts before its parent.
        if parent_start is not None and start < parent_start:
            if VERBOSE_WARNINGS:
                msg = (
                    "Warning: start of {0} element {1} happens before start "
                    "{2} of parent: using parent start"
                )
                log.warning(msg.format(start, element.tag, parent_start))
            start = parent_start

        # drop bad due date strings
        if date_parse(element.get('due', None), retbad=True) == 'Bad':
            element.set('due', '')

        due = date_parse(
            policy.get_metadata(element, 'due', '', parent=True))
        if element.tag == "problem":
            log.debug(
                "setting problem due date: for {0} due={1}".format(
                    url_name, due))

        gformat = element.get(
            'format', policy.get_metadata(element, 'format', ''))
        if url_name == 'hw0':
            log.debug("gformat for hw0 = {0}".format(gformat))

        # compute path
        # The hierarchy goes: `course > chapter > (problemset |
        # sequential | videosequence)`
        sequence_tags = set([
            'problemset', 'sequential', 'videosequence', 'proctor', 'randomize'
        ])
        if element.tag == 'chapter':
            paths = [url_name]
        elif element.tag in sequence_tags:
            seq_type = element.tag
            paths = [paths[0], url_name]
        else:
            # note arrays are passed by reference, so copy, don't
            # modify
            paths = paths[:] + [str(seq_num)]

        # compute module_id
        if element.tag == 'html':
            # module_id which appears in tracking log
            module_id = '{0}/{1}/{2}/{3}'.format(
                org, course, seq_type, '/'.join(paths[1:3]))
        else:
            module_id = '{0}/{1}/{2}/{3}'.format(
                org, course, element.tag, url_name)

        # done with getting all info for this axis element; save it
        path_str = '/' + '/'.join(paths)
        axel = Axel(
            cid, index[0], url_name, element.tag, gformat, start, due,
            display_name, path_str, module_id, data, chapter,
        )
        caxis.append(axel)
        index[0] += 1
    else:
        if VERBOSE_WARNINGS:
            if element.tag in ['transcript', 'wiki', 'metadata']:
                pass
            else:
                msg = (
                    "Missing url_name for element {0} "
                    "(attrib={1}, parent_tag={2})"
                )
                log.warning(
                    msg.format(
                        element, element.attrib,
                        (parent.tag if parent is not None else ''))
                )

    # chapter?  A chapter without a url_name has no module_id of its own, so
    # its children keep the chapter inherited from above.
    if element.tag == 'chapter' and module_id is not None:
        the_chapter = module_id
    else:
        the_chapter = chapter

    # done processing this element, now process all its children
    leaf_tags = set([
        'html', 'problem', 'discussion', 'customtag', 'poll_question',
        'combinedopenended', 'metadata',
    ])
    if element.tag not in leaf_tags:
        # if <vertical> with no url_name then keep seq_num for children
        inherit_seq_num = (element.tag == 'vertical' and not url_name)
        if not inherit_seq_num:
            seq_num = 1
        for child in element:
            # skip XML comments and tags that never become axis elements
            if (not str(child).startswith('<!--')) \
                    and (child.tag not in ['discussion', 'source']):
                walk(
                    child,
                    course,
                    cid,
                    org,
                    policy,
                    index,
                    caxis,
                    seq_num,
                    paths,
                    seq_type,
                    parent_start=start,
                    parent=element,
                    chapter=the_chapter)
                if not inherit_seq_num:
                    seq_num += 1

Example #5

Show file

        def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
                 parent_url_name=None, split_url_name=None):
            '''
            Recursively traverse course tree, appending one Axel record to caxis for
            every element that has (or can be given) a url_name.

            x        = current etree element
            seq_num  = sequence of current element in its parent, starting from 1
            path     = list of url_name's to current element, following edX's hierarchy conventions
                       (None, the default, is treated as an empty list)
            seq_type = problemset, sequential, or videosequence
            parent_start = start date of parent of current etree element
            parent   = parent module
            chapter  = the last chapter module_id seen while walking through the tree
            parent_url_name = url_name of parent
            split_url_name   = url_name of split_test element if this subtree is in a split_test, otherwise None

            Returns nothing.  Results are appended to caxis and index[0] is incremented
            for each record.  Besides its arguments, this function reads policy, org,
            course, cid, caxis, index, logit, date_parse, fix_bad_unicode, Axel,
            FORCE_NO_HIDE and VERBOSE_WARNINGS from the enclosing scope.

            Elements whose policy metadata has hide_from_toc set to anything other than
            "false" are skipped together with their whole subtree (unless FORCE_NO_HIDE).
            '''
            if path is None:            # avoid a shared mutable default argument
                path = []

            url_name = x.get('url_name',x.get('url_name_orig',''))
            if not url_name:
                dn = x.get('display_name')
                if dn is not None:
                    url_name = dn.strip().replace(' ','_')     # 2012 convention for converting display_name to url_name
                    url_name = url_name.replace(':','_')
                    url_name = url_name.replace('.','_')
                    url_name = url_name.replace('(','_').replace(')','_').replace('__','_')
            
            data = None
            start = None
            module_id = None            # only computed when the element has a url_name

            if not FORCE_NO_HIDE:
                hide = policy.get_metadata(x, 'hide_from_toc')
                if hide is not None and hide != "false":
                    logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
                    return

            if x.tag=='video':          # special: for video, let data = youtube ID(s)
                data = x.get('youtube','')
                if data:
                    # old ytid format - extract just the 1.0 part of this 
                    # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
                    ytid = data.replace(' ','').split(',')
                    ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0]=='1.0']
                    # print "   ytid: %s -> %s" % (x.get('youtube',''), ytid)
                    if ytid:
                        # NOTE(review): data becomes a list here, so the "%s" formatting
                        # below embeds the list's repr rather than the bare id -- confirm
                        # that downstream consumers expect this before changing it.
                        data = ytid
                if not data:
                    data = x.get('youtube_id_1_0', '')
                if data:
                    data = '{"ytid": "%s"}' % data

            if x.tag=="split_test":
                data = {}
                to_copy = ['group_id_to_child', 'user_partition_id']
                for tc in to_copy:
                    data[tc] = x.get(tc, None)

            if x.tag=='problem' and x.get('weight'):
                try:
                    # Changed from string to dict. In next code block.
                    data = {"weight": "%f" % float(x.get('weight'))}
                except (TypeError, ValueError):
                    # non-numeric weight: log it and carry on without a weight
                    logit("    Error converting weight %s" % x.get('weight'))

            ### Had a hard time making my code work within the try/except for weight. Happy to improve
            ### Also note, weight is typically missing in problems. So I find it weird that we throw an exception.
            if x.tag=='problem':
                # Initialize data if no weight
                if not data:
                    data = {}

                # meta will store all problem related metadata, then be used to update data
                meta = {}
                # Items is meant to help debug - an ordered list of encountered problem types with url names
                # Likely should not be pulled to Big Query 
                meta['items'] = []
                # Known Problem Types
                known_problem_types = ['multiplechoiceresponse','numericalresponse','choiceresponse',
                                       'optionresponse','stringresponse','formularesponse',
                                       'customresponse','fieldset']

                # Loop through all child nodes in a problem. If encountering a known problem type, add metadata.
                for a in x:
                    if a.tag in known_problem_types:
                        meta['items'].append({'itype':a.tag,'url_name':a.get('url_name')})

                ### Check for accompanying image
                images = x.findall('.//img')
                # meta['has_image'] = False
                
                if images:
                    meta['has_image'] = True #Note, one can use a.get('src'), but needs to account for multiple images
                    # print meta['img'],len(images)

                ### Search for all solution tags in a problem
                solutions = x.findall('.//solution')
                # meta['has_solution'] = False

                for sol in solutions:
                    # Serialize this solution's children on their own.  Joining with the
                    # previous solution's text as the separator would inflate the length
                    # and leak an 'img src' match from one solution into the next.
                    text = ''.join(html.tostring(e, pretty_print=False) for e in sol)
                    # This if statment checks each solution. Note, many MITx problems have multiple solution tags.
                    # In 8.05x, common to put image in one solution tag, and the text in a second. So we are checking each tag.
                    # If we find one solution with > 65 char, or one solution with an image, we set meta['solution'] = True
                    if len(text) > 65 or 'img src' in text:
                        meta['has_solution'] = True

                ### If meta is empty, log all tags for debugging later. 
                # NOTE(review): meta always contains the 'items' key at this point, so this
                # branch cannot currently fire; left as-is pending a decision on the intent.
                if len(meta)==0:
                    logit('item type not found - here is the list of tags:['+','.join(a.tag if a else ' ' for a in x)+']')
                    # print 'problem type not found - here is the list of tags:['+','.join(a.tag for a in x)+']'

                ### Add easily accessible metadata for problems
                # num_items: number of items
                # itype: problem type - note, mixed is used when items are not of same type
                if len(meta['items']) > 0:
                    # Number of Items
                    meta['num_items'] = len(meta['items'])

                    # Problem Type
                    if all(meta['items'][0]['itype'] == item['itype'] for item in meta['items']):
                        meta['itype'] = meta['items'][0]['itype']
                        # print meta['items'][0]['itype']
                    else:
                        meta['itype'] = 'mixed'

                # Update data field
                ### ! For now, removing the items field. 
                del meta["items"]               

                data.update(meta)
                data = json.dumps(data)

            if x.tag=='html':
                iframe = x.find('.//iframe')
                if iframe is not None:
                    logit("   found iframe in html %s" % url_name)
                    src = iframe.get('src','')
                    if 'https://www.youtube.com/embed/' in src:
                        m = re.search('embed/([^"/?]+)', src)
                        if m:
                            data = '{"ytid": "%s"}' % m.group(1)
                            logit("    data=%s" % data)
                
            if url_name:              # url_name is mandatory if we are to do anything with this element
                # url_name = url_name.replace(':','_')
                dn = x.get('display_name', url_name)
                try:
                    #dn = dn.decode('utf-8')
                    dn = unicode(dn)
                    dn = fix_bad_unicode(dn)
                except Exception:
                    # log the offending type for diagnosis, then let the error propagate
                    logit('unicode error, type(dn)=%s'  % type(dn))
                    raise
                pdn = policy.get_metadata(x, 'display_name')      # policy display_name - if given, let that override default
                if pdn is not None:
                    dn = pdn

                #start = date_parse(x.get('start', policy.get_metadata(x, 'start', '')))
                start = date_parse(policy.get_metadata(x, 'start', '', parent=True))
                
                if parent_start is not None and start < parent_start:
                    if VERBOSE_WARNINGS:
                        logit("    Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
                    start = parent_start
                #print "start for %s = %s" % (x, start)
                
                # drop bad due date strings
                if date_parse(x.get('due',None), retbad=True)=='Bad':
                    x.set('due', '')

                due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
                if x.tag=="problem":
                    logit("    setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

                gformat = x.get('format', policy.get_metadata(x, 'format', ''))
                if url_name=='hw0':
                    logit( "gformat for hw0 = %s" % gformat)

                graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

                # compute path
                # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
                if x.tag=='chapter':
                    path = [url_name]
                elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
                    seq_type = x.tag
                    path = [path[0], url_name]
                else:
                    path = path[:] + [str(seq_num)]      # note arrays are passed by reference, so copy, don't modify
                    
                # compute module_id
                if x.tag=='html':
                    module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))  # module_id which appears in tracking log
                else:
                    module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)
                
                # debugging
                # print "     module %s gformat=%s" % (module_id, gformat)

                # done with getting all info for this axis element; save it
                path_str = '/' + '/'.join(path)
                ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                          parent_url_name,
                          split_url_name is not None,
                          split_url_name)
                caxis.append(ae)
                index[0] += 1
            else:
                if VERBOSE_WARNINGS:
                    if x.tag in ['transcript', 'wiki', 'metadata']:
                        pass
                    else:
                        logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

            # chapter?
            # A chapter without a url_name has no module_id; keep the inherited chapter
            # in that case instead of failing on an unassigned local.
            if x.tag=='chapter' and module_id is not None:
                the_chapter = module_id
            else:
                the_chapter = chapter

            # done processing this element, now process all its children
            if (not x.tag in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']):
                inherit_seq_num = (x.tag=='vertical' and not url_name)    # if <vertical> with no url_name then keep seq_num for children
                if not inherit_seq_num:
                    seq_num = 1
                for y in x:
                    if (not str(y).startswith('<!--')) and (not y.tag in ['discussion', 'source']):
                        # first split_test on the way down names the split for its whole subtree
                        if not split_url_name and x.tag=="split_test":
                            split_url_name = url_name
                                
                        walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                             parent_url_name=url_name,
                             split_url_name=split_url_name,
                        )
                        if not inherit_seq_num:
                            seq_num += 1