def test_add_descriptors(self):
    """
    Round-trip a keep_urls bundle through export and import.

    The bundle loaded from input_data.URL_NAME_ORIG_IN_CHAPTER1 must
    serialize back to the same XML, and the bundle re-imported from the
    exported directory must match expected_data.URL_NAME_ORIG.
    """
    # Note url_name_orig in chapter.
    source_xml = input_data.URL_NAME_ORIG_IN_CHAPTER1
    exported = XBundle(keep_urls=True)
    exported.load(file_from_string(source_xml))

    # str(bundle) doesn't change input xml, but export_to_directory will.
    self.assertEqual(clean_xml(source_xml), clean_xml(str(exported)))

    original_cwd = os.getcwd()
    scratch_dir = mkdtemp()
    try:
        # No path is passed to export/import below, so work from inside
        # the scratch directory (presumably they default to the cwd).
        os.chdir(scratch_dir)
        exported.export_to_directory()

        reimported = XBundle(keep_urls=True)
        reimported.import_from_directory()

        self.assertEqual(
            clean_xml(expected_data.URL_NAME_ORIG),
            clean_xml(str(reimported)),
        )
    finally:
        # Always restore the working directory before removing the tree.
        os.chdir(original_cwd)
        rmtree(scratch_dir)
def test_import_export(self):  # pylint: disable=no-self-use
    """
    Import mitx.01, export it again, and diff the result against the
    known-good exported tree.
    """
    bundle = XBundle()
    bundle.import_from_directory(os.path.join("input_testdata", "mitx.01"))

    scratch = mkdtemp()
    try:
        bundle.export_to_directory(scratch)

        reference_dir = os.path.join("input_testdata", "mitx.01.exported")
        reference_copy = os.path.join(scratch, 'mitx.01.exported')
        exported_dir = os.path.join(scratch, "mitx.01")

        # Transform xml files to remove spaces. This allows for cross tests
        # to pass across platforms with slightly different xml serializers
        # (see: travis). We copy the files for easy cleanup.
        copytree(reference_dir, reference_copy)
        _normalize_xml(scratch)

        # check_call raises if diff reports any difference.
        check_call(["diff", "-r", reference_copy, exported_dir])
    finally:
        rmtree(scratch)
def test_save(self):
    """
    Exercise XBundle.save with its default filename, an explicit
    filename, and an already-open file handle.
    """
    source_xml = "<xbundle><metadata /><course /></xbundle>"
    bundle = XBundle()
    bundle.load(file_from_string(source_xml))
    self.assertEqual(clean_xml(str(bundle)), clean_xml(source_xml))

    def assert_saved_matches(saved_path):
        """Check that the file at saved_path holds the source XML."""
        with open(saved_path) as saved:
            self.assertEqual(clean_xml(saved.read()), clean_xml(source_xml))

    starting_dir = os.getcwd()
    scratch = mkdtemp()
    try:
        # save() without a path writes relative to the cwd.
        os.chdir(scratch)

        bundle.save()
        assert_saved_matches(os.path.join(scratch, "xbundle.xml"))

        bundle.save(filename="other.xml")
        assert_saved_matches(os.path.join(scratch, "other.xml"))

        third_path = os.path.join(scratch, "third.xml")
        with open(third_path, "w") as sink:
            bundle.save(file_handle=sink)
        assert_saved_matches(third_path)
    finally:
        os.chdir(starting_dir)
        rmtree(scratch)
def test_nested_leaves(self):
    """
    Test that nested leaves are not imported.

    For each leaf tag, a course containing that tag nested inside itself
    is imported into a fresh repo; exactly one resource of that type is
    expected afterwards.
    """
    template = (
        ' <course org="DevOps" course="0.001" url_name="2015_Summer"'
        ' semester="2015_Summer">'
        ' <chapter> <sequential> <vertical>'
        ' <{tag}><{tag}></{tag}></{tag}>'
        ' </vertical> </sequential> </chapter>'
        ' </course> '
    )
    for tag in ("html", "problem", "discussion", "video"):
        repo = create_repo("{tag}_repo".format(tag=tag), "...", self.user.id)
        course_xml = etree.fromstring(template.format(tag=tag))

        bundle = XBundle(
            keep_urls=True,
            keep_studio_urls=True,
            preserve_url_name=True,
        )
        bundle.set_course(course_xml)
        import_course(bundle, repo.id, self.user.id, "")

        matching = LearningResource.objects.filter(
            learning_resource_type__name=tag
        )
        self.assertEqual(matching.count(), 1)
def test_export_and_keep_urls(self):
    """
    Check url_name handling after an export/import round trip with
    keep_urls and force_studio_format both enabled.
    """
    # Note url_name_orig in chapter.
    source_xml = input_data.URL_NAME_ORIG_IN_CHAPTER2
    exported = XBundle(keep_urls=True, force_studio_format=True)
    exported.load(file_from_string(source_xml))

    # str(bundle) doesn't change input xml, but export_to_directory will.
    self.assertEqual(clean_xml(source_xml), clean_xml(str(exported)))

    original_cwd = os.getcwd()
    scratch_dir = mkdtemp()
    try:
        # Export and import are called without a path, so run them from
        # inside the scratch directory.
        os.chdir(scratch_dir)
        exported.export_to_directory()

        reimported = XBundle(keep_urls=True, force_studio_format=True)
        reimported.import_from_directory()

        self.assertEqual(
            clean_xml(expected_data.KEEP_URLS_FORCE_STUDIO_FORMAT),
            clean_xml(str(reimported)),
        )
    finally:
        os.chdir(original_cwd)
        rmtree(scratch_dir)
def test_fix_old_descriptor_name(self):
    """
    fix_old_descriptor_name should turn a `name` attribute into
    `display_name` on the element it is given.
    """
    bundle = XBundle()
    legacy_element = etree.XML('<sequential name="abc" />')

    # The element is modified in place; the return value is not used.
    bundle.fix_old_descriptor_name(legacy_element)

    self.assertEqual(
        clean_xml('<sequential display_name="abc" />'),
        clean_xml(etree.tostring(legacy_element)),
    )
def test_xml_header(self):
    """
    Test removal of the xml header.

    The serialized bundle must not start with "<?xml", and the XML must
    still round-trip to the same content as the input.
    """
    source_xml = input_data.EMPTY_XBUNDLE
    bundle = XBundle()
    bundle.load(file_from_string(source_xml))

    self.assertFalse(str(bundle).startswith("<?xml"))
    self.assertEqual(clean_xml(source_xml), clean_xml(str(bundle)))
def test_import_skip_hidden(self):
    """
    Import mitx.01 with skip_hidden=True and compare the result with
    expected_data.SKIP_HIDDEN.
    """
    bundle = XBundle(skip_hidden=True)
    course_dir = os.path.join('input_testdata', 'mitx.01')
    bundle.import_from_directory(course_dir)

    self.assertEqual(
        clean_xml(str(bundle)),
        clean_xml(expected_data.SKIP_HIDDEN),
    )
def test_fix_old_course_section(self):
    """
    Importing the "sections" test course should yield
    expected_data.MISSING_SECTION.
    """
    sections_dir = os.path.join("input_testdata", "sections")
    bundle = XBundle()
    bundle.import_from_directory(sections_dir)

    # Section element should be removed.
    self.assertEqual(
        clean_xml(expected_data.MISSING_SECTION),
        clean_xml(str(bundle)),
    )
def test_fix_old_course_section(self):
    """
    Importing the "sections" test course should yield
    expected_data.MISSING_SECTION.
    """
    sections_dir = os.path.join("input_testdata", "sections")
    bundle = XBundle()
    bundle.import_from_directory(sections_dir)

    # Section element should be removed.
    self.assertEqual(
        clean_xml(expected_data.MISSING_SECTION),
        clean_xml(str(bundle)),
    )
def import_course_from_path(path, repo_id, user_id):
    """
    Import course from an OLX directory.

    Args:
        path (unicode): path to extracted OLX tree
        repo_id (int): passed through to import_course
            (presumably the pk of the target repository)
        user_id (int): pk of Django user doing the import
    Returns:
        Whatever import_course returns for the loaded bundle.
    """
    olx_bundle = XBundle()
    olx_bundle.import_from_directory(path)
    imported = import_course(olx_bundle, repo_id, user_id)
    return imported
def test_import_url_name(self):
    """
    Test that we import url_name as url_name_orig.

    mitx.01 is imported with keep_urls and keep_studio_urls enabled and
    compared against expected_data.KEEP_URLS.
    """
    course_dir = os.path.join('input_testdata', 'mitx.01')
    bundle = XBundle(keep_urls=True, keep_studio_urls=True)
    bundle.import_from_directory(course_dir)

    serialized = str(bundle)
    self.assertEqual(
        clean_xml(expected_data.KEEP_URLS),
        clean_xml(serialized),
    )
def test_preserve_url_name(self):
    """
    Test that preserve_url_name imports as url_name and not
    url_name_orig.
    """
    bundle = XBundle(
        keep_urls=True,
        keep_studio_urls=True,
        preserve_url_name=True,
    )
    bundle.import_from_directory('input_testdata/mitx.01')

    serialized = str(bundle)
    self.assertEqual(
        clean_xml(expected_data.PRESERVE_URL_NAME),
        clean_xml(serialized),
    )
def test_preserve_url_name(self):
    """
    Test that preserve_url_name imports as url_name and not
    url_name_orig.
    """
    bundle = XBundle(
        keep_urls=True,
        keep_studio_urls=True,
        preserve_url_name=True,
    )
    bundle.import_from_directory('input_testdata/mitx.01')

    serialized = str(bundle)
    self.assertEqual(
        clean_xml(expected_data.PRESERVE_URL_NAME),
        clean_xml(serialized),
    )
def documents_from_olx(olx_path):  # pylint: disable=too-many-locals
    """
    Extract text from OLX directory

    Two kinds of documents are collected, in this order: one per
    <vertical> element of the imported course, then one per file under
    olx_path whose lower-cased extension is in VALID_TEXT_FILE_TYPES.

    Args:
        olx_path (str): The path to the directory with the OLX data

    Returns:
        list of tuple: A list of (content, metadata). File entries carry
            the raw bytes of the file; vertical entries carry whatever
            get_text_from_element returns.
    """
    bundle = XBundle()
    bundle.import_from_directory(olx_path)

    # Vertical keys are numbered from 1 in document order.
    documents = [
        (
            get_text_from_element(vertical),
            {
                "key": f"vertical_{position}",
                "content_type": CONTENT_TYPE_VERTICAL,
                "title": vertical.attrib.get("display_name") or "",
                "mime_type": "application/xml",
            },
        )
        for position, vertical in enumerate(
            bundle.course.findall(".//vertical"), start=1
        )
    ]

    # A single counter is shared across the whole walk so file keys stay
    # unique even when the same filename appears in several directories.
    counter = _infinite_counter()
    for dirpath, _, filenames in os.walk(olx_path):
        for filename in filenames:
            suffix = os.path.splitext(filename)[1].lower()
            if suffix not in VALID_TEXT_FILE_TYPES:
                continue

            with open(os.path.join(dirpath, filename), "rb") as handle:
                payload = handle.read()

            # None when the extension has no entry in types_map.
            mimetype = mimetypes.types_map.get(suffix)
            metadata = {
                "key": f"document_{next(counter)}_{filename}",
                "content_type": CONTENT_TYPE_FILE,
                "mime_type": mimetype,
            }
            documents.append((payload, metadata))

    return documents
def import_course_from_path(path, repo_id, user_id):
    """
    Import course from an OLX directory.

    The bundle is loaded with keep_urls, keep_studio_urls and
    preserve_url_name enabled, and the "static" subdirectory of `path`
    is handed to import_course alongside it.

    Args:
        path (unicode): Path to extracted OLX tree
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
    Returns:
        course (learningresources.Course)
    """
    olx_bundle = XBundle(
        keep_urls=True,
        keep_studio_urls=True,
        preserve_url_name=True,
    )
    olx_bundle.import_from_directory(path)
    static_dir = join(path, 'static')
    return import_course(olx_bundle, repo_id, user_id, static_dir)
def test_unicode_in_html(self):
    """
    Test that unicode doesn't cause problems in overview file.

    The same overview content is added once as text and once as UTF-8
    encoded bytes; both must serialize to expected_data.ESCAPED_UNICODE.
    """
    def bundle_with_overview(content):
        """Import mitx.01 into a fresh bundle and attach the overview."""
        fresh = XBundle()
        fresh.import_from_directory(os.path.join("input_testdata", "mitx.01"))
        fresh.add_about_file("overview.html", content)
        return fresh

    overview = "\u2e18 interrobang \u203d"

    from_text = bundle_with_overview(overview)
    expected = expected_data.ESCAPED_UNICODE
    self.assertEqual(clean_xml(str(from_text)), clean_xml(expected))

    # Reimport to start from a clean slate. This time use bytes.
    from_bytes = bundle_with_overview(overview.encode('utf-8'))
    self.assertEqual(clean_xml(str(from_bytes)), clean_xml(expected))
def import_course_from_path(path, repo_id, user_id):
    """
    Import course from an OLX directory.

    The bundle is loaded with keep_urls, keep_studio_urls and
    preserve_url_name enabled; the import itself runs inside
    transaction.atomic().

    Args:
        path (unicode): Path to extracted OLX tree
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
    Returns:
        course (learningresources.Course)
    """
    olx_bundle = XBundle(
        keep_urls=True,
        keep_studio_urls=True,
        preserve_url_name=True,
    )
    olx_bundle.import_from_directory(path)
    static_path = join(path, 'static')

    # Only the import_course call is wrapped; parsing the OLX tree above
    # happens outside the atomic block.
    with transaction.atomic():
        imported_course = import_course(
            olx_bundle, repo_id, user_id, static_path
        )
    return imported_course
def test_parent_preview_link(self):
    """
    Test that if url_name is blank we import the parent's url_name
    when viewing the preview link.
    """
    raw_xml = (
        ' <course org="DevOps" course="0.001" url_name="2015_Summer"'
        ' semester="2015_Summer">'
        ' <chapter> <sequential> <vertical>'
        ' <html></html>'
        ' </vertical> </sequential> </chapter>'
        ' </course> '
    )
    repo = create_repo("html_repo", "...", self.user.id)
    course_element = etree.fromstring(raw_xml)

    bundle = XBundle(
        keep_urls=True,
        keep_studio_urls=True,
        preserve_url_name=True,
    )
    bundle.set_course(course_element)
    import_course(bundle, repo.id, self.user.id, "")

    html_resources = LearningResource.objects.filter(
        learning_resource_type__name="html"
    )
    self.assertEqual(html_resources.count(), 1)
    resource = html_resources.first()

    actual_url = get_preview_url(resource)
    # The <html> element has no url_name, so the path segment is expected
    # to be the course's url_name.
    expected_url = (
        "{base}courses/{org}/{course}/{run}/jump_to_id/{url_path}".format(
            base=settings.LORE_PREVIEW_BASE_URL,
            org=resource.course.org,
            course=resource.course.course_number,
            run=resource.course.run,
            url_path="2015_Summer",
        )
    )
    self.assertEqual(actual_url, expected_url)
def test_is_not_random_urlname(self):
    """
    Test behavior of is_not_random_urlname, with and without
    keep_studio_urls.
    """
    # Randomness test used in method
    hash_like = 'z5bc076ad06e4ede9d0561948c03be2f'
    letters_only = 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
    blank = ''

    # Function always returns True if self.keep_studio_urls is True.
    studio_bundle = XBundle(keep_studio_urls=True)
    for candidate in (hash_like, letters_only, blank):
        self.assertTrue(studio_bundle.is_not_random_urlname(candidate))

    # With default settings only the hash-like name is rejected.
    default_bundle = XBundle()
    self.assertFalse(default_bundle.is_not_random_urlname(hash_like))
    for candidate in (letters_only, blank):
        self.assertTrue(default_bundle.is_not_random_urlname(candidate))
def test_import_export(self):  # pylint: disable=no-self-use
    """
    Import mitx.01, export it again, and diff the result against the
    known-good exported tree.
    """
    bundle = XBundle()
    bundle.import_from_directory(os.path.join("input_testdata", "mitx.01"))

    scratch = mkdtemp()
    try:
        bundle.export_to_directory(scratch)

        reference_dir = os.path.join("input_testdata", "mitx.01.exported")
        reference_copy = os.path.join(scratch, 'mitx.01.exported')
        exported_dir = os.path.join(scratch, "mitx.01")

        # Transform xml files to remove spaces. This allows for cross tests
        # to pass across platforms with slightly different xml serializers
        # (see: travis). We copy the files for easy cleanup.
        copytree(reference_dir, reference_copy)
        _normalize_xml(scratch)

        # check_call raises if diff reports any difference.
        check_call(["diff", "-r", reference_copy, exported_dir])
    finally:
        rmtree(scratch)
def test_import_large(self):
    """
    Test import of a course slightly larger than mitx.01, then check
    that an xml_only export writes nothing but .xml files.
    """
    bundle = XBundle()
    course_dir = os.path.join('input_testdata', 'content-devops-0001')
    bundle.import_from_directory(course_dir)

    reference_path = os.path.join(
        'input_testdata', 'content-devops-0001.out.xml'
    )
    with open(reference_path) as reference:
        self.assertEqual(clean_xml(reference.read()), clean_xml(str(bundle)))

    scratch = mkdtemp()
    try:
        bundle.export_to_directory(scratch, xml_only=True, newfmt=True)
        export_root = os.path.join(scratch, "0.001")
        for _, _, filenames in os.walk(export_root):
            for name in filenames:
                # We set xml_only=True so there shouldn't be anything else.
                self.assertTrue(name.endswith(".xml"))
    finally:
        rmtree(scratch)
def test_import_large(self):
    """
    Test import of a course slightly larger than mitx.01, then check
    that an xml_only export writes nothing but .xml files.
    """
    bundle = XBundle()
    course_dir = os.path.join('input_testdata', 'content-devops-0001')
    bundle.import_from_directory(course_dir)

    reference_path = os.path.join(
        'input_testdata', 'content-devops-0001.out.xml'
    )
    with open(reference_path) as reference:
        self.assertEqual(clean_xml(reference.read()), clean_xml(str(bundle)))

    scratch = mkdtemp()
    try:
        bundle.export_to_directory(scratch, xml_only=True, newfmt=True)
        export_root = os.path.join(scratch, "0.001")
        for _, _, filenames in os.walk(export_root):
            for name in filenames:
                # We set xml_only=True so there shouldn't be anything else.
                self.assertTrue(name.endswith(".xml"))
    finally:
        rmtree(scratch)
def export(self):
    """
    Convert the parsed OCW course into an edX xbundle and write it out.

    Builds a <course> element from self.meta, fills it via
    self.do_chapters, wraps it in an XBundle together with self.policies
    and the about files, then writes the result. The output format is
    chosen from the suffix of the output name (self.output_fn, or
    "<cid>_xbundle.xml" when that is unset):

    * ``.xml``            -- single xbundle file; static files go to "."
    * ``.tar.gz``/``.tgz`` -- course directory packed with ``tar``
    * anything else       -- treated as an output directory, created if
      it does not exist

    Side effects: resets self.processed_files, self.files_to_copy,
    self.processed_pdf_files and self.element_counts, writes the
    metadata to stderr and prints element counts to stdout.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits non-zero while
            building a tarball (previously the failure was ignored and
            "Done" was still reported).
    """
    # Function-scope import: only the tarball branch needs it.
    import subprocess

    meta = self.meta
    sys.stderr.write("metadata = %s\n" % meta)

    fn = self.dir / 'contents/Syllabus/index.htm'
    sxml = self.parse_broken_html(fn=fn)

    edxxml = etree.Element('course')
    edxxml.set('dirname', os.path.basename(os.getcwd()))
    edxxml.set('semester', self.DefaultSemester)
    for k, v in meta.items():
        edxxml.set(k, v)

    # track which content files have been ingested, to avoid duplication
    self.processed_files = [fn]
    # dict of files (key=OCW source, val=edX static dest) to copy to "/static"
    self.files_to_copy = {}
    self.processed_pdf_files = []
    self.element_counts = defaultdict(int)

    self.do_chapters(sxml, edxxml)
    # Read after do_chapters, as in the original statement order.
    policies = self.policies

    # grab course image via index.htm
    self.get_course_image()

    # make xbundle
    xb = XBundle(force_studio_format=True)
    xb.DefaultOrg = self.DefaultOrg
    xb.set_course(edxxml)
    xb.add_policies(policies)
    self.add_about_files(xb)

    def c(x):
        """Count descendants of the course element with tag x."""
        return len(xb.course.findall(".//%s" % x))

    elist = ["chapter", "sequential", "vertical", "problem", "html", "video"]
    xbundle_counts = {x: c(x) for x in elist}

    self.element_counts['n_static_files'] = len(self.files_to_copy)
    self.element_counts['n_ocw_files_processed'] = len(self.processed_files)

    # save it
    outfn = self.output_fn or ('%s_xbundle.xml' % self.cid)
    if outfn.endswith(".xml"):
        xb.save(outfn)
        self.copy_static_files(".")
    elif outfn.endswith((".tar.gz", ".tgz")):
        tempd = tempfile.mkdtemp(prefix="tmp_ocw2xbundle")
        try:
            cdir = path(tempd) / "course"
            os.mkdir(cdir)
            self.copy_static_files(cdir)
            xb.export_to_directory(cdir, dir_include_course_id=False)

            # os.path.join keeps an absolute outfn intact, and passing an
            # argument list with cwd= avoids building a shell command
            # from paths that may contain spaces or quotes.
            archive = os.path.join(os.path.abspath(os.curdir), outfn)
            cmd = ["tar", "czf", archive, "course"]
            print(" ".join(cmd))
            subprocess.check_call(cmd, cwd=tempd)
        finally:
            # Remove the staging directory even if the export failed.
            shutil.rmtree(tempd)
    else:
        if not os.path.exists(outfn):
            print("Making directory for output: %s" % outfn)
            os.mkdir(outfn)
        self.copy_static_files(outfn)
        xb.export_to_directory(outfn, dir_include_course_id=False)

    # Single-argument parenthesized print behaves the same on Python 2
    # and Python 3.
    print("OCW element counts: %s" % json.dumps(self.element_counts, indent=4))
    print("edX XML element counts: %s" % json.dumps(xbundle_counts, indent=4))
    print("Done, wrote to %s" % outfn)
def test_set_course(self):
    """
    Test functionality of set_course: rejected inputs, semester
    derived from url_name, and the defaults applied to a valid course.
    """
    source_xml = input_data.EMPTY_COURSE
    bundle = XBundle(keep_urls=True)
    bundle.load(file_from_string(source_xml))

    # No org or semester is specified in XML above.
    self.assertEqual(bundle.course.get("org"), None)
    self.assertEqual(bundle.course.get("semester"), None)
    self.assertEqual(bundle.semester, "")

    # Note lack of org attribute and url_name for course element.
    course_str = input_data.NO_COURSE

    # A root element other than <course> is rejected.
    with self.assertRaises(Exception) as wrong_root:
        bundle.set_course(etree.XML("<x>" + course_str + "</x>"))
    self.assertIn(
        "set_course should be called with a <course> element",
        wrong_root.exception.args,
    )

    # So is a <course> with nothing to derive a semester from.
    with self.assertRaises(Exception) as no_semester:
        bundle.set_course(etree.XML("<course />"))
    self.assertIn("No semester found.", no_semester.exception.args)

    bundle.set_course(etree.XML("<course url_name='x' />"))
    self.assertEqual(bundle.semester, "x")

    bundle.set_course(etree.XML(course_str))

    # MITx is not present in data, it is automatically set.
    self.assertEqual(bundle.course.get("org"), "MITx")
    self.assertEqual(bundle.course.get("semester"), "2013_Spring")
    self.assertEqual(bundle.semester, "2013_Spring")

    serialized = str(bundle)
    self.assertEqual(
        clean_xml(serialized),
        clean_xml(expected_data.SET_COURSE),
    )
def test_export_import(self):
    """
    Test export then import: a bundle built in memory must survive a
    round trip through export_to_directory and import_from_directory.
    """
    original = XBundle()
    course_xml = input_data.COURSE
    policies_xml = input_data.POLICIES
    original.set_course(etree.XML(course_xml))
    original.add_policies(etree.XML(policies_xml))
    original.add_about_file("overview.html", "hello overview")

    # Snapshot taken before exporting.
    before_export = str(original)

    scratch = mkdtemp()
    try:
        original.export_to_directory(scratch)

        # Test round- trip.
        reloaded = XBundle()
        reloaded.import_from_directory(os.path.join(scratch, 'mitx.01'))
        after_import = str(reloaded)

        self.assertEqual(clean_xml(before_export), clean_xml(after_import))
    finally:
        rmtree(scratch)
def export(self):
    """
    Convert the parsed OCW course into an edX xbundle and write it out.

    Builds a <course> element from self.meta, fills it via
    self.do_chapters, wraps it in an XBundle together with self.policies
    and the about files, then writes the result. The output format is
    chosen from the suffix of the output name (self.output_fn, or
    "<cid>_xbundle.xml" when that is unset):

    * ``.xml``            -- single xbundle file; static files go to "."
    * ``.tar.gz``/``.tgz`` -- course directory packed with ``tar``
    * anything else       -- treated as an output directory, created if
      it does not exist

    Side effects: resets self.processed_files, self.files_to_copy,
    self.processed_pdf_files and self.element_counts, writes the
    metadata to stderr and prints element counts to stdout.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits non-zero while
            building a tarball (previously the failure was ignored and
            "Done" was still reported).
    """
    # Function-scope import: only the tarball branch needs it.
    import subprocess

    meta = self.meta
    sys.stderr.write("metadata = %s\n" % meta)

    fn = self.dir / 'contents/Syllabus/index.htm'
    sxml = self.parse_broken_html(fn=fn)

    edxxml = etree.Element('course')
    edxxml.set('dirname', os.path.basename(os.getcwd()))
    edxxml.set('semester', self.DefaultSemester)
    for k, v in meta.items():
        edxxml.set(k, v)

    # track which content files have been ingested, to avoid duplication
    self.processed_files = [fn]
    # dict of files (key=OCW source, val=edX static dest) to copy to "/static"
    self.files_to_copy = {}
    self.processed_pdf_files = []
    self.element_counts = defaultdict(int)

    self.do_chapters(sxml, edxxml)
    # Read after do_chapters, as in the original statement order.
    policies = self.policies

    # grab course image via index.htm
    self.get_course_image()

    # make xbundle
    xb = XBundle(force_studio_format=True)
    xb.DefaultOrg = self.DefaultOrg
    xb.set_course(edxxml)
    xb.add_policies(policies)
    self.add_about_files(xb)

    def c(x):
        """Count descendants of the course element with tag x."""
        return len(xb.course.findall(".//%s" % x))

    elist = [
        "chapter", "sequential", "vertical", "problem", "html", "video"
    ]
    xbundle_counts = {x: c(x) for x in elist}

    self.element_counts['n_static_files'] = len(self.files_to_copy)
    self.element_counts['n_ocw_files_processed'] = len(
        self.processed_files)

    # save it
    outfn = self.output_fn or ('%s_xbundle.xml' % self.cid)
    if outfn.endswith(".xml"):
        xb.save(outfn)
        self.copy_static_files(".")
    elif outfn.endswith((".tar.gz", ".tgz")):
        tempd = tempfile.mkdtemp(prefix="tmp_ocw2xbundle")
        try:
            cdir = path(tempd) / "course"
            os.mkdir(cdir)
            self.copy_static_files(cdir)
            xb.export_to_directory(cdir, dir_include_course_id=False)

            # os.path.join keeps an absolute outfn intact, and passing an
            # argument list with cwd= avoids building a shell command
            # from paths that may contain spaces or quotes.
            archive = os.path.join(os.path.abspath(os.curdir), outfn)
            cmd = ["tar", "czf", archive, "course"]
            print(" ".join(cmd))
            subprocess.check_call(cmd, cwd=tempd)
        finally:
            # Remove the staging directory even if the export failed.
            shutil.rmtree(tempd)
    else:
        if not os.path.exists(outfn):
            print("Making directory for output: %s" % outfn)
            os.mkdir(outfn)
        self.copy_static_files(outfn)
        xb.export_to_directory(outfn, dir_include_course_id=False)

    # Single-argument parenthesized print behaves the same on Python 2
    # and Python 3.
    print("OCW element counts: %s" % json.dumps(self.element_counts,
                                                 indent=4))
    print("edX XML element counts: %s" % json.dumps(xbundle_counts,
                                                     indent=4))
    print("Done, wrote to %s" % outfn)
def test_unicode_in_html(self):
    """
    Test that unicode doesn't cause problems in overview file.

    The same overview content is added once as text and once as UTF-8
    encoded bytes; both must serialize to expected_data.ESCAPED_UNICODE.
    """
    def bundle_with_overview(content):
        """Import mitx.01 into a fresh bundle and attach the overview."""
        fresh = XBundle()
        fresh.import_from_directory(os.path.join("input_testdata", "mitx.01"))
        fresh.add_about_file("overview.html", content)
        return fresh

    overview = "\u2e18 interrobang \u203d"

    from_text = bundle_with_overview(overview)
    expected = expected_data.ESCAPED_UNICODE
    self.assertEqual(clean_xml(str(from_text)), clean_xml(expected))

    # Reimport to start from a clean slate. This time use bytes.
    from_bytes = bundle_with_overview(overview.encode('utf-8'))
    self.assertEqual(clean_xml(str(from_bytes)), clean_xml(expected))
def test_set_course(self):
    """
    Test functionality of set_course: rejected inputs, semester
    derived from url_name, and the defaults applied to a valid course.
    """
    source_xml = input_data.EMPTY_COURSE
    bundle = XBundle(keep_urls=True)
    bundle.load(file_from_string(source_xml))

    # No org or semester is specified in XML above.
    self.assertEqual(bundle.course.get("org"), None)
    self.assertEqual(bundle.course.get("semester"), None)
    self.assertEqual(bundle.semester, "")

    # Note lack of org attribute and url_name for course element.
    course_str = input_data.NO_COURSE

    # A root element other than <course> is rejected.
    with self.assertRaises(Exception) as wrong_root:
        bundle.set_course(etree.XML("<x>" + course_str + "</x>"))
    self.assertIn(
        "set_course should be called with a <course> element",
        wrong_root.exception.args,
    )

    # So is a <course> with nothing to derive a semester from.
    with self.assertRaises(Exception) as no_semester:
        bundle.set_course(etree.XML("<course />"))
    self.assertIn("No semester found.", no_semester.exception.args)

    bundle.set_course(etree.XML("<course url_name='x' />"))
    self.assertEqual(bundle.semester, "x")

    bundle.set_course(etree.XML(course_str))

    # MITx is not present in data, it is automatically set.
    self.assertEqual(bundle.course.get("org"), "MITx")
    self.assertEqual(bundle.course.get("semester"), "2013_Spring")
    self.assertEqual(bundle.semester, "2013_Spring")

    serialized = str(bundle)
    self.assertEqual(
        clean_xml(serialized),
        clean_xml(expected_data.SET_COURSE),
    )