def get_filename(self, fobj, relname):
    """Return the Filename row for ``relname`` on the inode ``fobj``.

    ``relname`` is the raw (byte-string) filesystem-relative name.  It is
    decoded as UTF-8; when that fails it is decoded as latin-1 and passed
    through ``fix_bad_unicode``, and the substitution is printed.

    An existing row matching ``fobj.volume_id``, ``fobj.inode_num`` and the
    decoded name is returned when the session finds exactly one.  Otherwise a
    new ``Filename`` is built, added to ``self.sesh`` and returned.  The raw
    bytes are stored in ``filename_raw`` only when re-encoding the decoded
    name as UTF-8 does not reproduce ``relname``.
    """
    try:
        decoded_name = relname.decode("utf8")
    except UnicodeDecodeError:
        decoded_name = fix_bad_unicode(relname.decode("latin1"))
        # Call form of print: same output as the statement form on Python 2.
        print(u"Tidied {rn} to '{new}'".format(
            rn=repr(relname),
            new=decoded_name,
        ))

    # Keep the original bytes only if the text form cannot round-trip.
    if decoded_name.encode("utf8") != relname:
        raw_name = relname
    else:
        raw_name = None

    lookup = self.sesh.query(Filename).filter(
        Filename.volume_id == fobj.volume_id,
        Filename.inode_num == fobj.inode_num,
        Filename.filename == decoded_name,
    )
    try:
        # TODO: check other names & remove if necc.
        # FIXME: multiple links to same file differing only by unicode!
        return lookup.one()
    except NoResultFound:
        pass

    created = Filename(
        volume_id=fobj.volume_id,
        inode_num=fobj.inode_num,
        filename=decoded_name,
        filename_raw=raw_name,
    )
    self.sesh.add(created)
    return created
def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
         parent_url_name=None, is_split_flag=False):
    '''
    Recursively traverse course tree, appending one Axel record per named element to caxis.

    x               = current etree element
    seq_num         = sequence of current element in its parent, starting from 1
    path            = list of url_name's to current element, following edX's hierarchy conventions
                      (None is treated as an empty list; the list passed in is never modified)
    seq_type        = problemset, sequential, or videosequence
    parent_start    = start date of parent of current etree element
    parent          = parent module
    chapter         = the last chapter module_id seen while walking through the tree
    parent_url_name = url_name of parent
    is_split_flag   = boolean indicating if this subtree is within a split_test

    Returns None.  Uses these names from the surrounding scope: policy, org, course, cid,
    index, caxis, Axel, logit, date_parse, fix_bad_unicode, FORCE_NO_HIDE, VERBOSE_WARNINGS.
    '''
    if path is None:
        path = []

    # Stays None when the element has no url_name, so the chapter bookkeeping
    # below never touches an unbound name.
    module_id = None

    url_name = x.get('url_name', x.get('url_name_orig', ''))
    if not url_name:
        dn = x.get('display_name')
        if dn is not None:
            # 2012 convention for converting display_name to url_name
            url_name = dn.strip().replace(' ', '_')
            url_name = url_name.replace(':', '_')
            url_name = url_name.replace('.', '_')
            url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

    data = None
    start = None

    if not FORCE_NO_HIDE:
        hide = policy.get_metadata(x, 'hide_from_toc')
        if hide is not None and hide != "false":
            logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
            return

    if x.tag == 'video':
        # special: for video, let data = youtube ID(s)
        data = x.get('youtube', '')
        if data:
            # old ytid format - extract just the 1.0 part of this
            # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
            ytid = data.replace(' ', '').split(',')
            ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
            if ytid:
                # NOTE(review): data becomes a list here, so the string built
                # below embeds the list's repr - confirm consumers expect that.
                data = ytid
        if not data:
            data = x.get('youtube_id_1_0', '')
        if data:
            data = '{"ytid": "%s"}' % data

    if x.tag == "split_test":
        data = {}
        for tc in ['group_id_to_child', 'user_partition_id']:
            data[tc] = x.get(tc, None)

    weight = x.get('weight')
    if x.tag == 'problem' and weight:
        try:
            data = '{"weight": %f}' % float(weight)
        except (TypeError, ValueError):
            # best effort: an unparsable weight is logged and otherwise ignored
            logit(" Error converting weight %s" % weight)

    if x.tag == 'html':
        iframe = x.find('.//iframe')
        if iframe is not None:
            logit(" found iframe in html %s" % url_name)
            src = iframe.get('src', '')
            if 'https://www.youtube.com/embed/' in src:
                m = re.search('embed/([^"/?]+)', src)
                if m:
                    data = '{"ytid": "%s"}' % m.group(1)
                    logit(" data=%s" % data)

    if url_name:
        # url_name is mandatory if we are to do anything with this element
        dn = x.get('display_name', url_name)
        try:
            dn = unicode(dn)
            dn = fix_bad_unicode(dn)
        except Exception:
            logit('unicode error, type(dn)=%s' % type(dn))
            raise
        # policy display_name - if given, let that override default
        pdn = policy.get_metadata(x, 'display_name')
        if pdn is not None:
            dn = pdn

        start = date_parse(policy.get_metadata(x, 'start', '', parent=True))
        if parent_start is not None and start < parent_start:
            if VERBOSE_WARNINGS:
                logit(" Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
            start = parent_start

        # drop bad due date strings
        if date_parse(x.get('due', None), retbad=True) == 'Bad':
            x.set('due', '')

        due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
        if x.tag == "problem":
            logit(" setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

        gformat = x.get('format', policy.get_metadata(x, 'format', ''))
        if url_name == 'hw0':
            logit("gformat for hw0 = %s" % gformat)

        graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

        # compute path
        # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
        if x.tag == 'chapter':
            path = [url_name]
        elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
            seq_type = x.tag
            # path[:1] equals [path[0]] for a non-empty path and does not
            # raise IndexError when the path is empty.
            path = path[:1] + [url_name]
        else:
            # build a new list so the caller's path is left untouched
            path = path[:] + [str(seq_num)]

        # compute module_id
        if x.tag == 'html':
            # module_id which appears in tracking log
            module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))
        else:
            module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

        # done with getting all info for this axis element; save it
        path_str = '/' + '/'.join(path)
        ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded, parent_url_name, is_split_flag)
        caxis.append(ae)
        index[0] += 1
    else:
        if VERBOSE_WARNINGS and x.tag not in ['transcript', 'wiki', 'metadata']:
            logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

    # chapter?  An unnamed chapter has no module_id; keep the inherited one.
    if x.tag == 'chapter' and module_id is not None:
        the_chapter = module_id
    else:
        the_chapter = chapter

    # done processing this element, now process all its children
    if x.tag not in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']:
        # if <vertical> with no url_name then keep seq_num for children
        inherit_seq_num = (x.tag == 'vertical' and not url_name)
        if not inherit_seq_num:
            seq_num = 1
        for y in x:
            # skip XML comments and tags that never appear on the axis
            if (not str(y).startswith('<!--')) and (y.tag not in ['discussion', 'source']):
                walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                     parent_url_name=url_name,
                     is_split_flag=((x.tag == "split_test") or is_split_flag),
                     )
                if not inherit_seq_num:
                    seq_num += 1
def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
         parent_url_name=None, split_url_name=None):
    '''
    Recursively traverse course tree, appending one Axel record per named element to caxis.

    x               = current etree element
    seq_num         = sequence of current element in its parent, starting from 1
    path            = list of url_name's to current element, following edX's hierarchy conventions
                      (None is treated as an empty list; the list passed in is never modified)
    seq_type        = problemset, sequential, or videosequence
    parent_start    = start date of parent of current etree element
    parent          = parent module
    chapter         = the last chapter module_id seen while walking through the tree
    parent_url_name = url_name of parent
    split_url_name  = url_name of split_test element if this subtree is in a split_test, otherwise None

    Returns None.  Uses these names from the surrounding scope: policy, org, course, cid,
    index, caxis, Axel, logit, date_parse, fix_bad_unicode, html, json, FORCE_NO_HIDE,
    VERBOSE_WARNINGS.
    '''
    if path is None:
        path = []

    # Stays None when the element has no url_name, so the chapter bookkeeping
    # below never touches an unbound name.
    module_id = None

    url_name = x.get('url_name', x.get('url_name_orig', ''))
    if not url_name:
        dn = x.get('display_name')
        if dn is not None:
            # 2012 convention for converting display_name to url_name
            url_name = dn.strip().replace(' ', '_')
            url_name = url_name.replace(':', '_')
            url_name = url_name.replace('.', '_')
            url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

    data = None
    start = None

    if not FORCE_NO_HIDE:
        hide = policy.get_metadata(x, 'hide_from_toc')
        if hide is not None and hide != "false":
            logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
            return

    if x.tag == 'video':
        # special: for video, let data = youtube ID(s)
        data = x.get('youtube', '')
        if data:
            # old ytid format - extract just the 1.0 part of this
            # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
            ytid = data.replace(' ', '').split(',')
            ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
            if ytid:
                # NOTE(review): data becomes a list here, so the string built
                # below embeds the list's repr - confirm consumers expect that.
                data = ytid
        if not data:
            data = x.get('youtube_id_1_0', '')
        if data:
            data = '{"ytid": "%s"}' % data

    if x.tag == "split_test":
        data = {}
        for tc in ['group_id_to_child', 'user_partition_id']:
            data[tc] = x.get(tc, None)

    weight = x.get('weight')
    if x.tag == 'problem' and weight:
        try:
            # kept as a dict; the problem block below adds to it and serialises it
            data = {"weight": "%f" % float(weight)}
        except (TypeError, ValueError):
            # best effort: an unparsable weight is logged and otherwise ignored
            logit(" Error converting weight %s" % weight)

    if x.tag == 'problem':
        # Initialize data if no weight
        if not data:
            data = {}

        # meta collects problem related metadata and is merged into data below
        meta = {}

        # Ordered list of the known response types found directly under the
        # problem; used for the summary fields and dropped before saving.
        known_problem_types = ['multiplechoiceresponse', 'numericalresponse', 'choiceresponse',
                               'optionresponse', 'stringresponse', 'formularesponse',
                               'customresponse', 'fieldset']
        items = [{'itype': a.tag, 'url_name': a.get('url_name')}
                 for a in x if a.tag in known_problem_types]

        # Check for accompanying image
        if x.findall('.//img'):
            meta['has_image'] = True

        # Each <solution> is examined on its own: one with more than 65
        # characters of serialised children, or one containing an image,
        # marks the problem as having a solution.
        for sol in x.findall('.//solution'):
            text = ''.join(html.tostring(e, pretty_print=False) for e in sol)
            if len(text) > 65 or 'img src' in text:
                meta['has_solution'] = True
                break

        if not items:
            # No known response type: log the child tags for debugging later.
            # Comment nodes carry a callable tag, so show them as a blank.
            child_tags = [' ' if callable(a.tag) else a.tag for a in x]
            logit('item type not found - here is the list of tags:[' + ','.join(child_tags) + ']')
        else:
            # num_items: number of items
            # itype: problem type - 'mixed' when the items are not all of the same type
            meta['num_items'] = len(items)
            first_type = items[0]['itype']
            if all(item['itype'] == first_type for item in items):
                meta['itype'] = first_type
            else:
                meta['itype'] = 'mixed'

        data.update(meta)
        data = json.dumps(data)

    if x.tag == 'html':
        iframe = x.find('.//iframe')
        if iframe is not None:
            logit(" found iframe in html %s" % url_name)
            src = iframe.get('src', '')
            if 'https://www.youtube.com/embed/' in src:
                m = re.search('embed/([^"/?]+)', src)
                if m:
                    data = '{"ytid": "%s"}' % m.group(1)
                    logit(" data=%s" % data)

    if url_name:
        # url_name is mandatory if we are to do anything with this element
        dn = x.get('display_name', url_name)
        try:
            dn = unicode(dn)
            dn = fix_bad_unicode(dn)
        except Exception:
            logit('unicode error, type(dn)=%s' % type(dn))
            raise
        # policy display_name - if given, let that override default
        pdn = policy.get_metadata(x, 'display_name')
        if pdn is not None:
            dn = pdn

        start = date_parse(policy.get_metadata(x, 'start', '', parent=True))
        if parent_start is not None and start < parent_start:
            if VERBOSE_WARNINGS:
                logit(" Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
            start = parent_start

        # drop bad due date strings
        if date_parse(x.get('due', None), retbad=True) == 'Bad':
            x.set('due', '')

        due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
        if x.tag == "problem":
            logit(" setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

        gformat = x.get('format', policy.get_metadata(x, 'format', ''))
        if url_name == 'hw0':
            logit("gformat for hw0 = %s" % gformat)

        graded = x.get('graded', policy.get_metadata(x, 'graded', ''))
        if not isinstance(graded, (unicode, str)):
            graded = str(graded)

        # compute path
        # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
        if x.tag == 'chapter':
            path = [url_name]
        elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
            seq_type = x.tag
            # path[:1] equals [path[0]] for a non-empty path and does not
            # raise IndexError when the path is empty.
            path = path[:1] + [url_name]
        else:
            # build a new list so the caller's path is left untouched
            path = path[:] + [str(seq_num)]

        # compute module_id
        if x.tag == 'html':
            # module_id which appears in tracking log
            module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))
        else:
            module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

        # done with getting all info for this axis element; save it
        path_str = '/' + '/'.join(path)
        ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                  parent_url_name, split_url_name is not None, split_url_name)
        caxis.append(ae)
        index[0] += 1
    else:
        if VERBOSE_WARNINGS and x.tag not in ['transcript', 'wiki', 'metadata']:
            logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

    # chapter?  An unnamed chapter has no module_id; keep the inherited one.
    if x.tag == 'chapter' and module_id is not None:
        the_chapter = module_id
    else:
        the_chapter = chapter

    # done processing this element, now process all its children
    if x.tag not in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']:
        # if <vertical> with no url_name then keep seq_num for children
        inherit_seq_num = (x.tag == 'vertical' and not url_name)
        if not inherit_seq_num:
            seq_num = 1
        # the outermost split_test names the split for its whole subtree
        if not split_url_name and x.tag == "split_test":
            split_url_name = url_name
        for y in x:
            # skip XML comments and tags that never appear on the axis
            if (not str(y).startswith('<!--')) and (y.tag not in ['discussion', 'source']):
                walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                     parent_url_name=url_name,
                     split_url_name=split_url_name,
                     )
                if not inherit_seq_num:
                    seq_num += 1
def walk(
        element, course, cid, org, policy, index, caxis, seq_num=1, paths=None,
        seq_type=None, parent_start=None, parent=None, chapter=None):
    """
    Recursively traverse course tree, appending one Axel per named element.

    element      = current etree element
    course       = course identifier used when building module_id
    cid          = value stored as the first field of every Axel
    org          = organisation identifier used when building module_id
    policy       = object whose get_metadata(element, key, ...) is consulted
    index        = one-element list; index[0] is the running axis index and
                   is incremented for every Axel appended
    caxis        = list receiving the Axel records
    seq_num      = sequence of current element in its parent, starting from 1
    paths        = list of url_name's to current element, following edX's
                   hierarchy conventions (the list passed in is not modified)
    seq_type     = problemset, sequential, or videosequence
    parent_start = start date of parent of current etree element
    parent       = parent module
    chapter      = the last chapter module_id seen while walking the tree

    Returns None.  Uses these names from the surrounding scope: Axel, log,
    date_parse, fix_bad_unicode, FORCE_NO_HIDE, VERBOSE_WARNINGS.
    """
    # Fixes dangerous-default-value.
    if paths is None:
        paths = []

    # Stays None when the element has no url_name, so the chapter
    # bookkeeping below never touches an unbound name.
    module_id = None

    url_name = element.get('url_name', element.get('url_name_orig', ''))
    if not url_name:
        display_name = element.get('display_name')
        if display_name is not None:
            # 2012 convention for converting display_name to url_name
            url_name = display_name.strip().replace(' ', '_')
            url_name = url_name.replace(':', '_')
            url_name = url_name.replace('.', '_')
            url_name = url_name.replace(
                '(', '_').replace(')', '_').replace('__', '_')

    data = None
    start = None

    if not FORCE_NO_HIDE:
        hide = policy.get_metadata(element, 'hide_from_toc')
        if hide is not None and hide != "false":
            # The last placeholder must be {2}: only three values are passed.
            msg = (
                '[edx2course_axis] Skipping {0} ({1}), it has '
                'hide_from_toc={2}'
            )
            log.debug(
                msg.format(
                    element.tag,
                    element.get('display_name', '<noname>'),
                    hide)
            )
            return

    # special: for video, let data = youtube ID(s)
    if element.tag == 'video':
        data = element.get('youtube', '')
        if data:
            # old ytid format - extract just the 1.0 part of this
            # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
            ytid = data.replace(' ', '').split(',')
            ytid = [
                z[1] for z in [y.split(':') for y in ytid]
                if z[0] == '1.0']
            if ytid:
                # NOTE(review): data becomes a list here, so the string
                # built below embeds the list's repr - confirm consumers
                # expect that.
                data = ytid
        if not data:
            data = element.get('youtube_id_1_0', '')
        if data:
            data = '{"ytid": "%s"}' % data

    weight = element.get('weight')
    if element.tag == 'problem' and weight:
        try:
            data = '{"weight": %f}' % float(weight)
        except (TypeError, ValueError) as err:
            log.error("Error converting weight {0}: {1}".format(
                weight, err,
            ))

    if element.tag == 'html':
        iframe = element.find('.//iframe')
        if iframe is not None:
            log.debug("found iframe in html {0}".format(url_name))
            src = iframe.get('src', '')
            if 'https://www.youtube.com/embed/' in src:
                match = re.search('embed/([^"/?]+)', src)
                if match:
                    data = '{"ytid": "%s"}' % match.group(1)
                    log.debug("data={0}".format(data))

    # url_name is mandatory if we are to do anything with this element
    if url_name:
        display_name = element.get('display_name', url_name)
        try:
            display_name = unicode(display_name)
            display_name = fix_bad_unicode(display_name)
        except Exception:
            log.error(
                'unicode error, type(display_name)={0}'.format(
                    type(display_name)))
            # bare raise keeps the original traceback
            raise

        # policy display_name - if given, let that override default
        pdn = policy.get_metadata(element, 'display_name')
        if pdn is not None:
            display_name = pdn

        start = date_parse(
            policy.get_metadata(element, 'start', '', parent=True))
        if parent_start is not None and start < parent_start:
            if VERBOSE_WARNINGS:
                msg = (
                    "Warning: start of {0} element {1} happens before start "
                    "{2} of parent: using parent start"
                )
                log.warning(msg.format(start, element.tag, parent_start))
            start = parent_start

        # drop bad due date strings
        if date_parse(element.get('due', None), retbad=True) == 'Bad':
            element.set('due', '')

        due = date_parse(
            policy.get_metadata(element, 'due', '', parent=True))
        if element.tag == "problem":
            log.debug(
                "setting problem due date: for {0} due={1}".format(
                    url_name, due))

        gformat = element.get(
            'format', policy.get_metadata(element, 'format', ''))
        if url_name == 'hw0':
            log.debug("gformat for hw0 = {0}".format(gformat))

        # compute path
        # The hierarchy goes: `course > chapter > (problemset |
        # sequential | videosequence)`
        sequence_tags = set([
            'problemset', 'sequential', 'videosequence',
            'proctor', 'randomize'
        ])
        if element.tag == 'chapter':
            paths = [url_name]
        elif element.tag in sequence_tags:
            seq_type = element.tag
            # paths[:1] equals [paths[0]] for a non-empty list and does not
            # raise IndexError when the list is empty.
            paths = paths[:1] + [url_name]
        else:
            # build a new list so the caller's list is left untouched
            paths = paths[:] + [str(seq_num)]

        # compute module_id
        if element.tag == 'html':
            # module_id which appears in tracking log
            module_id = '{0}/{1}/{2}/{3}'.format(
                org, course, seq_type, '/'.join(paths[1:3]))
        else:
            module_id = '{0}/{1}/{2}/{3}'.format(
                org, course, element.tag, url_name)

        # done with getting all info for this axis element; save it
        path_str = '/' + '/'.join(paths)
        axel = Axel(
            cid,
            index[0],
            url_name,
            element.tag,
            gformat,
            start,
            due,
            display_name,
            path_str,
            module_id,
            data,
            chapter,
        )
        caxis.append(axel)
        index[0] += 1
    else:
        if VERBOSE_WARNINGS \
                and element.tag not in ['transcript', 'wiki', 'metadata']:
            msg = (
                "Missing url_name for element {0} "
                "(attrib={1}, parent_tag={2})"
            )
            log.warning(
                msg.format(
                    element,
                    element.attrib,
                    (parent.tag if parent is not None else ''))
            )

    # chapter?  An unnamed chapter has no module_id; keep the inherited one.
    if element.tag == 'chapter' and module_id is not None:
        the_chapter = module_id
    else:
        the_chapter = chapter

    # done processing this element, now process all its children
    leaf_tags = set([
        'html', 'problem', 'discussion', 'customtag',
        'poll_question', 'combinedopenended', 'metadata',
    ])
    if element.tag not in leaf_tags:
        # if <vertical> with no url_name then keep seq_num for children
        inherit_seq_num = (element.tag == 'vertical' and not url_name)
        if not inherit_seq_num:
            seq_num = 1
        for child in element:
            # skip XML comments and tags that never appear on the axis
            if (not str(child).startswith('<!--')) \
                    and (child.tag not in ['discussion', 'source']):
                walk(
                    child, course, cid, org, policy, index, caxis, seq_num,
                    paths, seq_type, parent_start=start, parent=element,
                    chapter=the_chapter)
                if not inherit_seq_num:
                    seq_num += 1
def walk(x, seq_num=1, path=None, seq_type=None, parent_start=None, parent=None, chapter=None,
         parent_url_name=None, split_url_name=None):
    '''
    Recursively traverse course tree, appending one Axel record per named element to caxis.

    x               = current etree element
    seq_num         = sequence of current element in its parent, starting from 1
    path            = list of url_name's to current element, following edX's hierarchy conventions
                      (None is treated as an empty list; the list passed in is never modified)
    seq_type        = problemset, sequential, or videosequence
    parent_start    = start date of parent of current etree element
    parent          = parent module
    chapter         = the last chapter module_id seen while walking through the tree
    parent_url_name = url_name of parent
    split_url_name  = url_name of split_test element if this subtree is in a split_test, otherwise None

    Returns None.  Uses these names from the surrounding scope: policy, org, course, cid,
    index, caxis, Axel, logit, date_parse, fix_bad_unicode, html, json, FORCE_NO_HIDE,
    VERBOSE_WARNINGS.
    '''
    if path is None:
        path = []

    # Stays None when the element has no url_name, so the chapter bookkeeping
    # below never touches an unbound name.
    module_id = None

    url_name = x.get('url_name', x.get('url_name_orig', ''))
    if not url_name:
        dn = x.get('display_name')
        if dn is not None:
            # 2012 convention for converting display_name to url_name
            url_name = dn.strip().replace(' ', '_')
            url_name = url_name.replace(':', '_')
            url_name = url_name.replace('.', '_')
            url_name = url_name.replace('(', '_').replace(')', '_').replace('__', '_')

    data = None
    start = None

    if not FORCE_NO_HIDE:
        hide = policy.get_metadata(x, 'hide_from_toc')
        if hide is not None and hide != "false":
            logit('[edx2course_axis] Skipping %s (%s), it has hide_from_toc=%s' % (x.tag, x.get('display_name','<noname>'), hide))
            return

    if x.tag == 'video':
        # special: for video, let data = youtube ID(s)
        data = x.get('youtube', '')
        if data:
            # old ytid format - extract just the 1.0 part of this
            # 0.75:JdL1Vo0Hru0,1.0:lbaG3uiQ6IY,1.25:Lrj0G8RWHKw,1.50:54fs3-WxqLs
            ytid = data.replace(' ', '').split(',')
            ytid = [z[1] for z in [y.split(':') for y in ytid] if z[0] == '1.0']
            if ytid:
                # NOTE(review): data becomes a list here, so the string built
                # below embeds the list's repr - confirm consumers expect that.
                data = ytid
        if not data:
            data = x.get('youtube_id_1_0', '')
        if data:
            data = '{"ytid": "%s"}' % data

    if x.tag == "split_test":
        data = {}
        for tc in ['group_id_to_child', 'user_partition_id']:
            data[tc] = x.get(tc, None)

    weight = x.get('weight')
    if x.tag == 'problem' and weight:
        try:
            # kept as a dict; the problem block below adds to it and serialises it
            data = {"weight": "%f" % float(weight)}
        except (TypeError, ValueError):
            # best effort: an unparsable weight is logged and otherwise ignored
            logit(" Error converting weight %s" % weight)

    if x.tag == 'problem':
        # Initialize data if no weight
        if not data:
            data = {}

        # meta collects problem related metadata and is merged into data below
        meta = {}

        # Ordered list of the known response types found directly under the
        # problem; used for the summary fields and dropped before saving.
        known_problem_types = ['multiplechoiceresponse', 'numericalresponse', 'choiceresponse',
                               'optionresponse', 'stringresponse', 'formularesponse',
                               'customresponse', 'fieldset']
        items = [{'itype': a.tag, 'url_name': a.get('url_name')}
                 for a in x if a.tag in known_problem_types]

        # Check for accompanying image
        if x.findall('.//img'):
            meta['has_image'] = True

        # Each <solution> is examined on its own: one with more than 65
        # characters of serialised children, or one containing an image,
        # marks the problem as having a solution.
        for sol in x.findall('.//solution'):
            text = ''.join(html.tostring(e, pretty_print=False) for e in sol)
            if len(text) > 65 or 'img src' in text:
                meta['has_solution'] = True
                break

        if not items:
            # No known response type: log the child tags for debugging later.
            # Comment nodes carry a callable tag, so show them as a blank.
            child_tags = [' ' if callable(a.tag) else a.tag for a in x]
            logit('item type not found - here is the list of tags:[' + ','.join(child_tags) + ']')
        else:
            # num_items: number of items
            # itype: problem type - 'mixed' when the items are not all of the same type
            meta['num_items'] = len(items)
            first_type = items[0]['itype']
            if all(item['itype'] == first_type for item in items):
                meta['itype'] = first_type
            else:
                meta['itype'] = 'mixed'

        data.update(meta)
        data = json.dumps(data)

    if x.tag == 'html':
        iframe = x.find('.//iframe')
        if iframe is not None:
            logit(" found iframe in html %s" % url_name)
            src = iframe.get('src', '')
            if 'https://www.youtube.com/embed/' in src:
                m = re.search('embed/([^"/?]+)', src)
                if m:
                    data = '{"ytid": "%s"}' % m.group(1)
                    logit(" data=%s" % data)

    if url_name:
        # url_name is mandatory if we are to do anything with this element
        dn = x.get('display_name', url_name)
        try:
            dn = unicode(dn)
            dn = fix_bad_unicode(dn)
        except Exception:
            logit('unicode error, type(dn)=%s' % type(dn))
            raise
        # policy display_name - if given, let that override default
        pdn = policy.get_metadata(x, 'display_name')
        if pdn is not None:
            dn = pdn

        start = date_parse(policy.get_metadata(x, 'start', '', parent=True))
        if parent_start is not None and start < parent_start:
            if VERBOSE_WARNINGS:
                logit(" Warning: start of %s element %s happens before start %s of parent: using parent start" % (start, x.tag, parent_start), nolog=True)
            start = parent_start

        # drop bad due date strings
        if date_parse(x.get('due', None), retbad=True) == 'Bad':
            x.set('due', '')

        due = date_parse(policy.get_metadata(x, 'due', '', parent=True))
        if x.tag == "problem":
            logit(" setting problem due date: for %s due=%s" % (url_name, due), nolog=True)

        gformat = x.get('format', policy.get_metadata(x, 'format', ''))
        if url_name == 'hw0':
            logit("gformat for hw0 = %s" % gformat)

        graded = x.get('graded', policy.get_metadata(x, 'graded', ''))

        # compute path
        # The hierarchy goes: `course > chapter > (problemset | sequential | videosequence)`
        if x.tag == 'chapter':
            path = [url_name]
        elif x.tag in ['problemset', 'sequential', 'videosequence', 'proctor', 'randomize']:
            seq_type = x.tag
            # path[:1] equals [path[0]] for a non-empty path and does not
            # raise IndexError when the path is empty.
            path = path[:1] + [url_name]
        else:
            # build a new list so the caller's path is left untouched
            path = path[:] + [str(seq_num)]

        # compute module_id
        if x.tag == 'html':
            # module_id which appears in tracking log
            module_id = '%s/%s/%s/%s' % (org, course, seq_type, '/'.join(path[1:3]))
        else:
            module_id = '%s/%s/%s/%s' % (org, course, x.tag, url_name)

        # done with getting all info for this axis element; save it
        path_str = '/' + '/'.join(path)
        ae = Axel(cid, index[0], url_name, x.tag, gformat, start, due, dn, path_str, module_id, data, chapter, graded,
                  parent_url_name, split_url_name is not None, split_url_name)
        caxis.append(ae)
        index[0] += 1
    else:
        if VERBOSE_WARNINGS and x.tag not in ['transcript', 'wiki', 'metadata']:
            logit("Missing url_name for element %s (attrib=%s, parent_tag=%s)" % (x, x.attrib, (parent.tag if parent is not None else '')))

    # chapter?  An unnamed chapter has no module_id; keep the inherited one.
    if x.tag == 'chapter' and module_id is not None:
        the_chapter = module_id
    else:
        the_chapter = chapter

    # done processing this element, now process all its children
    if x.tag not in ['html', 'problem', 'discussion', 'customtag', 'poll_question', 'combinedopenended', 'metadata']:
        # if <vertical> with no url_name then keep seq_num for children
        inherit_seq_num = (x.tag == 'vertical' and not url_name)
        if not inherit_seq_num:
            seq_num = 1
        # the outermost split_test names the split for its whole subtree
        if not split_url_name and x.tag == "split_test":
            split_url_name = url_name
        for y in x:
            # skip XML comments and tags that never appear on the axis
            if (not str(y).startswith('<!--')) and (y.tag not in ['discussion', 'source']):
                walk(y, seq_num, path, seq_type, parent_start=start, parent=x, chapter=the_chapter,
                     parent_url_name=url_name,
                     split_url_name=split_url_name,
                     )
                if not inherit_seq_num:
                    seq_num += 1