def xhtml2xbundle(self):
    '''
    Convert XHTML output of PlasTeX to an edX xbundle file.

    Uses lxml to parse self.xhtml, applies each configured fix filter
    in-place to the parsed tree, then wraps the tree's <course> element
    (if any) in an xbundle.XBundle.

    Side effects: stores the parsed tree on self.xml and the bundle on
    self.xb.

    Returns:
        xbundle.XBundle: the constructed bundle (also saved as self.xb).
    '''
    xml = etree.fromstring(self.xhtml)
    self.xml = xml
    # Run the in-place XML fix-up filters.  Renamed the loop variable
    # from `filter` to avoid shadowing the builtin filter().
    for fix_filter in self.fix_filters:
        fix_filter(xml)
    # When merging into an existing course, protect the <course> element
    # from being overwritten by the import.
    no_overwrite = ['course'] if self.do_merge else []
    xb = xbundle.XBundle(force_studio_format=True, keep_urls=True, no_overwrite=no_overwrite)
    xb.KeepTogetherTags = ['sequential', 'vertical', 'conditional']
    course = xml.find('.//course')
    if course is not None:
        xb.set_course(course)
    self.xb = xb
    return xb
def make_axis(dir):
    '''
    Build a course axis for each course run found under `dir`.

    return dict of
      {course_id : { policy, xbundle, axis (as list of Axel elements) }}

    NOTE(review): the returned per-course dict actually uses the key
    'bundle' (the pretty-printed XML string), not 'xbundle' — confirm
    against callers.
    '''
    courses = []
    log_msg = []

    def logit(msg, nolog=False):
        # Append msg to the shared log_msg list (unless nolog) and
        # always echo it to stdout.
        if not nolog:
            log_msg.append(msg)
        print msg

    dir = path(dir)

    if os.path.exists(dir / 'roots'):
        # if roots directory exists, use that for different course versions
        # get roots
        roots = glob.glob(dir / 'roots/*.xml')
        courses = [ CourseInfo(fn, '', dir) for fn in roots ]
    else:
        # single course.xml file - use differnt policy files in policy directory, though
        fn = dir / 'course.xml'
        # get semesters
        policies = glob.glob(dir/'policies/*.json')
        # assets.json is not a course-run policy file; drop it if present
        assetsfn = dir / 'policies/assets.json'
        if str(assetsfn) in policies:
            policies.remove(assetsfn)
        if not policies:
            # newer layout: policies/<run>/policy.json
            policies = glob.glob(dir/'policies/*/policy.json')
        if not policies:
            logit("Error: no policy files found!")
        courses = [ CourseInfo(fn, pfn) for pfn in policies ]

    logit("%d course runs found: %s" % (len(courses), [c.url_name for c in courses]))

    ret = {}

    # construct axis for each policy
    for cinfo in courses:
        policy = cinfo.policy
        semester = policy.semester
        org = cinfo.org
        course = cinfo.course
        # edX-style course id: org/course/run
        cid = '%s/%s/%s' % (org, course, semester)
        logit('course_id=%s' % cid)

        cfn = dir / ('course/%s.xml' % semester)

        # generate XBundle for course
        xml = etree.parse(cfn).getroot()
        xb = xbundle.XBundle(keep_urls=True, skip_hidden=True, keep_studio_urls=True)
        xb.policy = policy.policy
        cxml = xb.import_xml_removing_descriptor(dir, xml)

        # append metadata: embed the course policy and grading policy as
        # JSON text inside a <metadata> element of the bundle
        metadata = etree.Element('metadata')
        cxml.append(metadata)
        policy_xml = etree.Element('policy')
        metadata.append(policy_xml)
        policy_xml.text = json.dumps(policy.policy)
        grading_policy_xml = etree.Element('grading_policy')
        metadata.append(grading_policy_xml)
        grading_policy_xml.text = json.dumps(policy.grading_policy)

        bundle = etree.tostring(cxml, pretty_print=True)
        #print bundle[:500]

        # One-element list so the nested walk() closure can mutate the
        # running axis index (Python 2 has no `nonlocal`).
        index = [1]
        caxis = []

        def walk(x, seq_num=1, path=[], seq_type=None, parent_start=None, parent=None, chapter=None, parent_url_name=None,
                 split_url_name=None):
            '''
            Recursively traverse course tree.

            x = current etree element
            seq_num = sequence of current element in its parent, starting from 1
            path = list of url_name's to current element, following edX's hierarchy conventions
            seq_type = problemset, sequential, or videosequence
            parent_start = start date of parent of current etree element
            parent = parent module
            chapter = the last chapter module_id seen while walking through the tree
            parent_url_name = url_name of parent
            split_url_name = url_name of split_test element if this subtree is in a split_test, otherwise None

            NOTE(review): the default path=[] is a mutable default; it is
            never mutated in place here (path is always rebound), so it is
            a smell rather than a live bug.
            '''
            url_name = x.get('url_name', x.get('url_name_orig', ''))
            if not url_name:
                # fall back to deriving url_name from display_name
                dn = x.get('display_name')
                if dn is not None:
                    url_name = dn.strip().replace(' ', '_')  # 2012 convention for converting display_name to url_name
                    url_name = url_name.replace(':', '_')
                    url_name = url_name.replace('.', '_')
                    url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

            data = None
            start = None

            if not FORCE_NO_HIDE:
                # skip subtrees hidden from the table of contents
                hide = policy.get_metadata(x, 'hide_from_toc')
                if hide is not None and not hide == "false":
                    logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name', '<noname>'), hide))
                    return

            if x.tag == 'video':
                # special: for video, let data = youtube ID(s)
                data = x.get('youtube', '')
                if data:
                    # old ytid format - extract just the 1.0 part of this
                    # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
                    ytid = data.replace(' ', '').split(',')
                    ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
                    # print " ytid: %s -> %s" % (x.get('youtube',''), ytid)
                    if ytid:
                        data = ytid
                if not data:
                    data = x.get('youtube_id_1_0', '')
                if data:
                    data = '{"ytid": "%s"}' % data

            if x.tag == "split_test":
                # record A/B test configuration attributes
                data = {}
                to_copy = ['group_id_to_child', 'user_partition_id']
                for tc in to_copy:
                    data[tc] = x.get(tc, None)

            if x.tag == 'problem' and x.get('weight') is not None and x.get('weight'):
                try:
                    # Changed from string to dict. In next code block.
                    data = {"weight": "%f" % float(x.get('weight'))}
                except Exception as err:
                    logit(" Error converting weight %s" % x.get('weight'))

            ### Had a hard time making my code work within the try/except for weight. Happy to improve
            ### Also note, weight is typically missing in problems. So I find it weird that we throw an exception.
            if x.tag == 'problem':
                # Initialize data if no weight
                if not data:
                    data = {}

                # meta will store all problem related metadata, then be used to update data
                meta = {}

                # Items is meant to help debug - an ordered list of encountered problem types with url names
                # Likely should not be pulled to Big Query
                meta['items'] = []

                # Known Problem Types
                known_problem_types = ['multiplechoiceresponse', 'numericalresponse', 'choiceresponse',
                                       'optionresponse', 'stringresponse', 'formularesponse',
                                       'customresponse', 'fieldset']

                # Loop through all child nodes in a problem. If encountering a known problem type, add metadata.
                for a in x:
                    if a.tag in known_problem_types:
                        meta['items'].append({'itype': a.tag, 'url_name': a.get('url_name')})

                ### Check for accompanying image
                images = x.findall('.//img')
                # meta['has_image'] = False
                if images and len(images) > 0:
                    meta['has_image'] = True
                    # Note, one can use a.get('src'), but needs to account for multiple images
                    # print meta['img'],len(images)

                ### Search for all solution tags in a problem
                solutions = x.findall('.//solution')
                # meta['has_solution'] = False
                if solutions and len(solutions) > 0:
                    text = ''
                    for sol in solutions:
                        # NOTE(review): this rebinds text each iteration,
                        # using the previous text as the join separator —
                        # it looks like it was meant to accumulate all
                        # solutions' HTML; confirm intended behavior.
                        text = text.join(html.tostring(e, pretty_print=False) for e in sol)
                    # This if statment checks each solution. Note, many MITx problems have multiple solution tags.
                    # In 8.05x, common to put image in one solution tag, and the text in a second. So we are checking each tag.
                    # If we find one solution with > 65 char, or one solution with an image, we set meta['solution'] = True
                    if len(text) > 65 or 'img src' in text:
                        meta['has_solution'] = True

                ### If meta is empty, log all tags for debugging later.
                # NOTE(review): meta always contains 'items' at this point,
                # so this branch appears unreachable as written.
                if len(meta) == 0:
                    logit('item type not found - here is the list of tags:[' + ','.join(a.tag if a else ' ' for a in x) + ']')
                    # print 'problem type not found - here is the list of tags:['+','.join(a.tag for a in x)+']'

                ### Add easily accessible metadata for problems
                # num_items: number of items
                # itype: problem type - note, mixed is used when items are not of same type
                if len(meta['items']) > 0:
                    # Number of Items
                    meta['num_items'] = len(meta['items'])

                    # Problem Type
                    if all(meta['items'][0]['itype'] == item['itype'] for item in meta['items']):
                        meta['itype'] = meta['items'][0]['itype']
                        # print meta['items'][0]['itype']
                    else:
                        meta['itype'] = 'mixed'

                # Update data field
                ### ! For now, removing the items field.
                del meta["items"]
                data.update(meta)
                data = json.dumps(data)

            if x.tag == 'html':
                # an <html> block embedding a youtube iframe is treated
                # like a video: extract the youtube id into data
                iframe = x.find('.//iframe')
                if iframe is not None:
                    logit(" found iframe in html %s" % url_name)
                    src = iframe.get('src', '')
                    if 'https://www.youtube.com/embed/' in src:
                        m = re.search('embed/([^"/?]+)', src)
                        if m:
                            data = '{"ytid": "%s"}' % m.group(1)
                            logit(" data=%s" % data)

            if url_name:
                # url_name is mandatory if we are to do anything with this element
                # url_name = url_name.replace(':','_')
                dn = x.get('display_name', url_name)
                try:
                    # dn = dn.decode('utf-8')
                    dn = unicode(dn)
                    dn = fix_bad_unicode(dn)
                except Exception as err:
                    logit('unicode error, type(dn)=%s' % type(dn))
                    raise

                pdn = policy.get_metadata(x, 'display_name')
                # policy display_name - if given, let that override default
                if pdn is not None:
                    dn = pdn

                # start = date_parse(x.get('start', policy.get_metadata(x, 'start', '')))
                start = date_parse(policy.get_metadata(x, 'start', '', parent=True))

                # a child cannot start before its parent does
                if parent_start is not None and start < parent_start:
                    if VERBOSE_WARNINGS:
                        logit(" Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
                    start = parent_start
                # print "start for %s = %s" % (x, start)

                # drop bad due date strings
                if date_parse(x.get('due', None), retbad=True) == 'Bad':
                    x.set('due', '')

                due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
                if x.tag == "problem":
                    logit(" setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

                gformat = x.get('format', policy.get_metadata(x, 'format', ''))
                if url_name == 'hw0':
                    logit("gformat for hw0 = %s" % gformat)

                graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

                # compute path
                # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
                if x.tag == 'chapter':
                    path = [url_name]
                elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
                    seq_type = x.tag
                    path = [path[0], url_name]
                else:
                    path = path[:] + [str(seq_num)]  # note arrays are passed by reference, so copy, don't modify

                # compute module_id
                if x.tag == 'html':
                    module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))  # module_id which appears in tracking log
                else:
                    module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

                # debugging
                # print " module %s gformat=%s" % (module_id, gformat)

                # done with getting all info for this axis element; save it
                path_str = '/' + '/'.join(path)
                ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded, parent_url_name, not split_url_name == None, split_url_name)
                caxis.append(ae)
                index[0] += 1
            else:
                if VERBOSE_WARNINGS:
                    if x.tag in ['transcript', 'wiki', 'metadata']:
                        pass
                    else:
                        logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

            # chapter?
            # NOTE(review): if a <chapter> somehow has no url_name,
            # module_id is unbound here and this raises NameError —
            # presumably chapters always carry a url_name; confirm.
            if x.tag == 'chapter':
                the_chapter = module_id
            else:
                the_chapter = chapter

            # done processing this element, now process all its children
            if (not x.tag in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']):
                inherit_seq_num = (x.tag == 'vertical' and not url_name)  # if <vertical> with no url_name then keep seq_num for children
                if not inherit_seq_num:
                    seq_num = 1
                for y in x:
                    # skip XML comments and discussion/source children
                    if (not str(y).startswith('<!--')) and (not y.tag in ['discussion', 'source']):
                        # once inside a split_test, propagate its url_name
                        # to the whole subtree
                        if not split_url_name and x.tag == "split_test":
                            split_url_name = url_name

                        walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter, parent_url_name=url_name,
                             split_url_name=split_url_name,
                             )
                        if not inherit_seq_num:
                            seq_num += 1

        walk(cxml)
        ret[cid] = dict(policy=policy.policy, bundle=bundle, axis=caxis, grading_policy=policy.grading_policy, log_msg=log_msg,
                        )

    return ret