コード例 #1
0
 def xhtml2xbundle(self):
     '''
     Build an edX XBundle from the XHTML that PlasTeX produced.

     The XHTML held in self.xhtml is parsed with lxml, every callable in
     self.fix_filters is applied to the parsed tree in order, and the
     first <course> element found (if any) is installed as the bundle's
     course.  The parsed tree is stored on self.xml and the bundle on
     self.xb; the bundle is also returned.
     '''
     tree = etree.fromstring(self.xhtml)
     self.xml = tree

     # each fix filter receives the parsed tree; its return value is not used
     for apply_fix in self.fix_filters:
         apply_fix(tree)

     # when merging, 'course' is passed to XBundle as a no_overwrite entry
     if self.do_merge:
         protected = ['course']
     else:
         protected = []

     bundle = xbundle.XBundle(
         force_studio_format=True,
         keep_urls=True,
         no_overwrite=protected,
     )
     bundle.KeepTogetherTags = ['sequential', 'vertical', 'conditional']

     course_elem = tree.find('.//course')
     if course_elem is not None:
         bundle.set_course(course_elem)

     self.xb = bundle
     return bundle
コード例 #2
0
def make_axis(dir):
    '''
    Build the course axis for every course run found under a course directory.

    dir = directory holding the edX course XML.  If it contains a `roots`
          subdirectory, every roots/*.xml file becomes one course run;
          otherwise the single course.xml is paired with each policy file
          found under policies/.

    return dict of {course_id : { policy, bundle, axis (as list of Axel elements),
                                  grading_policy, log_msg }}

    Note: log_msg is a single list shared by all course runs handled in this
    call, so every entry of the returned dict refers to the same log.
    '''

    courses = []
    log_msg = []

    def logit(msg, nolog=False):
        # Always echo to stdout; also record in log_msg unless nolog is set.
        if not nolog:
            log_msg.append(msg)
        # single-argument parenthesised form prints the same on Python 2 and 3
        print(msg)

    dir = path(dir)

    if os.path.exists(dir / 'roots'):   # if roots directory exists, use that for different course versions
        # get roots
        roots = glob.glob(dir / 'roots/*.xml')
        courses = [ CourseInfo(fn, '', dir) for fn in roots ]

    else:   # single course.xml file - use different policy files in policy directory, though

        fn = dir / 'course.xml'

        # get semesters
        policies = glob.glob(dir/'policies/*.json')
        assetsfn = dir / 'policies/assets.json'
        # assets.json is not a policy file: drop it, using the same string key
        # for both the membership test and the removal
        assets_key = str(assetsfn)
        if assets_key in policies:
            policies.remove(assets_key)
        if not policies:
            policies = glob.glob(dir/'policies/*/policy.json')
        if not policies:
            logit("Error: no policy files found!")

        courses = [ CourseInfo(fn, pfn) for pfn in policies ]


    logit("%d course runs found: %s" % (len(courses), [c.url_name for c in courses]))

    ret = {}

    # construct axis for each policy
    for cinfo in courses:
        policy = cinfo.policy
        semester = policy.semester
        org = cinfo.org
        course = cinfo.course
        cid = '%s/%s/%s' % (org, course, semester)
        logit('course_id=%s' %  cid)

        cfn = dir / ('course/%s.xml' % semester)

        # generate XBundle for course
        xml = etree.parse(cfn).getroot()
        xb = xbundle.XBundle(keep_urls=True, skip_hidden=True, keep_studio_urls=True)
        xb.policy = policy.policy
        cxml = xb.import_xml_removing_descriptor(dir, xml)

        # append metadata
        metadata = etree.Element('metadata')
        cxml.append(metadata)
        policy_xml = etree.Element('policy')
        metadata.append(policy_xml)
        policy_xml.text = json.dumps(policy.policy)
        grading_policy_xml = etree.Element('grading_policy')
        metadata.append(grading_policy_xml)
        grading_policy_xml.text = json.dumps(policy.grading_policy)

        # serialised before walk() runs, so walk's clearing of bad 'due'
        # attributes is not reflected in the bundle text
        bundle = etree.tostring(cxml, pretty_print=True)
        index = [1]     # one-element list so the nested walk() can increment the counter
        caxis = []

        def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
                 parent_url_name=None, split_url_name=None):
            '''
            Recursively traverse course tree, appending one Axel to caxis for
            every element that has (or can be given) a url_name.

            x        = current etree element
            seq_num  = sequence of current element in its parent, starting from 1
            path     = list of url_name's to current element, following edX's hierarchy conventions
                       (None means an empty path)
            seq_type = problemset, sequential, or videosequence
            parent_start = start date of parent of current etree element
            parent   = parent module
            chapter  = the last chapter module_id seen while walking through the tree
            parent_url_name = url_name of parent
            split_url_name   = url_name of split_test element if this subtree is in a split_test, otherwise None
            '''
            if path is None:    # avoid a shared mutable default argument
                path = []

            url_name = x.get('url_name',x.get('url_name_orig',''))
            if not url_name:
                dn = x.get('display_name')
                if dn is not None:
                    url_name = dn.strip().replace(' ','_')     # 2012 convention for converting display_name to url_name
                    url_name = url_name.replace(':','_')
                    url_name = url_name.replace('.','_')
                    url_name = url_name.replace('(','_').replace(')','_').replace('__','_')

            data = None
            start = None
            # stays None when the element has no url_name, so the chapter
            # bookkeeping below never reads an unassigned name
            module_id = None

            if not FORCE_NO_HIDE:
                hide = policy.get_metadata(x, 'hide_from_toc')
                if hide is not None and not hide=="false":
                    logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
                    return

            if x.tag=='video':  # special: for video, let data = youtube ID(s)
                data = x.get('youtube','')
                if data:
                    # old ytid format - extract just the 1.0 part of this
                    # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
                    ytid = data.replace(' ','').split(',')
                    ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0]=='1.0']
                    if ytid:
                        data = ytid
                if not data:
                    data = x.get('youtube_id_1_0', '')
                if data:
                    # NOTE(review): when the 1.0 id was extracted above, data is a
                    # list here, so %s renders the list's repr inside the quotes;
                    # left unchanged - confirm what downstream consumers expect.
                    data = '{"ytid": "%s"}' % data

            if x.tag=="split_test":
                # kept as a dict (not JSON-encoded, unlike the problem case below)
                data = {}
                to_copy = ['group_id_to_child', 'user_partition_id']
                for tc in to_copy:
                    data[tc] = x.get(tc, None)

            weight = x.get('weight')
            if x.tag=='problem' and weight:
                try:
                    data = {"weight": "%f" % float(weight)}
                except (TypeError, ValueError):
                    # a non-numeric weight is logged and otherwise ignored
                    logit("    Error converting weight %s" % weight)

            if x.tag=='problem':
                # Initialize data if no weight
                if not data:
                    data = {}

                # meta will store all problem related metadata, then be used to update data
                meta = {}
                # Items is meant to help debug - an ordered list of encountered problem types with url names
                # Likely should not be pulled to Big Query
                meta['items'] = []
                # Known Problem Types
                known_problem_types = ['multiplechoiceresponse','numericalresponse','choiceresponse',
                                       'optionresponse','stringresponse','formularesponse',
                                       'customresponse','fieldset']

                # Loop through the direct children of the problem. If encountering a known problem type, add metadata.
                for a in x:
                    if a.tag in known_problem_types:
                        meta['items'].append({'itype':a.tag,'url_name':a.get('url_name')})

                ### Check for accompanying image (has_image is only set when one is found)
                if x.findall('.//img'):
                    meta['has_image'] = True #Note, one can use a.get('src'), but needs to account for multiple images

                ### Search for all solution tags in a problem
                # Many MITx problems have multiple solution tags (e.g. an image in one and
                # the text in another), so each tag is checked on its own: one solution with
                # more than 65 characters of markup, or one containing an image, is enough.
                for sol in x.findall('.//solution'):
                    # join with '' so earlier solutions' markup is never reused as a separator
                    text = ''.join(html.tostring(e, pretty_print=False) for e in sol)
                    if len(text) > 65 or 'img src' in text:
                        meta['has_solution'] = True
                        break

                ### If meta is empty, log all tags for debugging later.
                # NOTE(review): meta always contains the 'items' key at this point, so
                # this condition is never true; kept unchanged pending a decision on
                # what "empty" should mean here.
                if len(meta)==0:
                    logit('item type not found - here is the list of tags:['+','.join(a.tag if a else ' ' for a in x)+']')

                ### Add easily accessible metadata for problems
                # num_items: number of items
                # itype: problem type - note, mixed is used when items are not of same type
                if len(meta['items']) > 0:
                    # Number of Items
                    meta['num_items'] = len(meta['items'])

                    # Problem Type
                    if all(meta['items'][0]['itype'] == item['itype'] for item in meta['items']):
                        meta['itype'] = meta['items'][0]['itype']
                    else:
                        meta['itype'] = 'mixed'

                # Update data field
                ### ! For now, removing the items field.
                del meta["items"]

                data.update(meta)
                data = json.dumps(data)

            if x.tag=='html':
                iframe = x.find('.//iframe')
                if iframe is not None:
                    logit("   found iframe in html %s" % url_name)
                    src = iframe.get('src','')
                    if 'https://www.youtube.com/embed/' in src:
                        m = re.search('embed/([^"/?]+)', src)
                        if m:
                            data = '{"ytid": "%s"}' % m.group(1)
                            logit("    data=%s" % data)

            if url_name:              # url_name is mandatory if we are to do anything with this element
                dn = x.get('display_name', url_name)
                try:
                    dn = unicode(dn)
                    dn = fix_bad_unicode(dn)
                except Exception:
                    # log the offending type, then let the error propagate
                    logit('unicode error, type(dn)=%s'  % type(dn))
                    raise
                pdn = policy.get_metadata(x, 'display_name')      # policy display_name - if given, let that override default
                if pdn is not None:
                    dn = pdn

                start = date_parse(policy.get_metadata(x, 'start', '', parent=True))

                # an element is never reported as starting before its parent
                if parent_start is not None and start < parent_start:
                    if VERBOSE_WARNINGS:
                        logit("    Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
                    start = parent_start

                # drop bad due date strings
                if date_parse(x.get('due',None), retbad=True)=='Bad':
                    x.set('due', '')

                due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
                if x.tag=="problem":
                    logit("    setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

                gformat = x.get('format', policy.get_metadata(x, 'format', ''))
                if url_name=='hw0':
                    logit( "gformat for hw0 = %s" % gformat)

                graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

                # compute path
                # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
                if x.tag=='chapter':
                    path = [url_name]
                elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
                    seq_type = x.tag
                    path = [path[0], url_name]
                else:
                    path = path[:] + [str(seq_num)]      # note arrays are passed by reference, so copy, don't modify

                # compute module_id
                if x.tag=='html':
                    module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))  # module_id which appears in tracking log
                else:
                    module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

                # done with getting all info for this axis element; save it
                path_str = '/' + '/'.join(path)
                ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                          parent_url_name,
                          split_url_name is not None,
                          split_url_name)
                caxis.append(ae)
                index[0] += 1
            else:
                if VERBOSE_WARNINGS:
                    if x.tag in ['transcript', 'wiki', 'metadata']:
                        pass
                    else:
                        logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

            # chapter?  children of a chapter inherit its module_id (None if
            # the chapter had no url_name); everything else passes along the
            # chapter it was given
            if x.tag=='chapter':
                the_chapter = module_id
            else:
                the_chapter = chapter

            # done processing this element, now process all its children
            if (not x.tag in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']):
                inherit_seq_num = (x.tag=='vertical' and not url_name)    # if <vertical> with no url_name then keep seq_num for children
                if not inherit_seq_num:
                    seq_num = 1

                # the outermost split_test names the split for its whole subtree
                if not split_url_name and x.tag=="split_test":
                    split_url_name = url_name

                for y in x:
                    # skip XML comments and tags that never appear in the axis
                    if (not str(y).startswith('<!--')) and (not y.tag in ['discussion', 'source']):
                        walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                             parent_url_name=url_name,
                             split_url_name=split_url_name,
                        )
                        if not inherit_seq_num:
                            seq_num += 1

        walk(cxml)
        ret[cid] = dict(policy=policy.policy,
                        bundle=bundle,
                        axis=caxis,
                        grading_policy=policy.grading_policy,
                        log_msg=log_msg,
                        )

    return ret