def handle(self, *args, **options):
    """Run Populate ocw courses"""
    if options["delete"]:
        self.stdout.write("Deleting all existing OCW courses")
        for course in Course.objects.filter(platform="ocw"):
            course.delete()
            delete_course(course)
    else:
        task = get_ocw_data.delay(
            force_overwrite=options["force_overwrite"],
            upload_to_s3=options["upload_to_s3"],
            ignore_flag=True,
        )
        self.stdout.write(
            "Started task {task} to get ocw course data w/force_overwrite={overwrite}, upload_to_s3={s3}".format(
                task=task,
                overwrite=options["force_overwrite"],
                s3=options["upload_to_s3"],
            )
        )
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            "Population of ocw data finished, took {} seconds".format(total_seconds)
        )
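# A minimal usage sketch for the command above, invoked programmatically with
# Django's standard call_command. The command name "populate_ocw_courses" is an
# assumption from the docstring, not confirmed by this snippet; substitute the
# actual module name under management/commands/.
from django.core.management import call_command

call_command("populate_ocw_courses", force_overwrite=True, upload_to_s3=False)  # name is hypothetical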
def test_delete_course(mocker):
    """
    Tests that delete_course calls the delete tasks for the course and its content files
    """
    patched_delete_task = mocker.patch("search.task_helpers.delete_document")
    course = CourseFactory.create()
    course_es_id = gen_course_id(course.platform, course.course_id)
    content_files = [ContentFileFactory.create(run=run) for run in course.runs.all()]

    delete_course(course)
    patched_delete_task.delay.assert_any_call(course_es_id, COURSE_TYPE)
    for content_file in content_files:
        patched_delete_task.delay.assert_any_call(
            gen_content_file_id(content_file.key), COURSE_TYPE, routing=course_es_id
        )
def delete_resources(apps, schema_editor):
    """
    Delete all courses, runs, and bootcamps. They will need to be imported again.
    """
    Course = apps.get_model("course_catalog", "Course")
    CourseRun = apps.get_model("course_catalog", "CourseRun")
    Bootcamp = apps.get_model("course_catalog", "Bootcamp")

    CourseRun.objects.all().delete()
    for course in Course.objects.all():
        course.delete()
        delete_course(course)
    for bootcamp in Bootcamp.objects.all():
        bootcamp.delete()
        delete_bootcamp(bootcamp)
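# A sketch of how delete_resources would typically be wired into the migration,
# using Django's standard migrations.RunPython operation. The dependency tuple
# below is a placeholder; the real migration names are not shown in this snippet.
from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [("course_catalog", "0001_initial")]  # placeholder dependency

    operations = [
        # noop as the reverse function makes the migration reversible without
        # restoring the deleted rows (they are re-imported by the populate tasks)
        migrations.RunPython(delete_resources, migrations.RunPython.noop)
    ]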
def handle(self, *args, **options):
    """Run Populate edx courses"""
    if options["delete"]:
        self.stdout.write(
            "Deleting all existing MITx courses from database and ElasticSearch"
        )
        for course in Course.objects.filter(platform=PlatformType.mitx.value):
            course.delete()
            delete_course(course)
    else:
        task = get_mitx_data.delay()
        self.stdout.write(f"Started task {task} to get edx course data")
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            "Population of edx data finished, took {} seconds".format(total_seconds)
        )
def sync_ocw_course(
    *, course_prefix, raw_data_bucket, force_overwrite, upload_to_s3, blacklist
):
    """
    Sync an OCW course run

    Args:
        course_prefix (str): The course prefix
        raw_data_bucket (boto3.resource): The S3 bucket containing the OCW information
        force_overwrite (bool): A boolean value to force the incoming course data to overwrite existing data
        upload_to_s3 (bool): If True, upload course media to S3
        blacklist (list of str): list of course ids that should not be published

    Returns:
        str: The UID, or None if the run_id is not found, or if it was found but not synced
    """
    loaded_raw_jsons_for_course = []
    last_modified_dates = []
    uid = None
    is_published = True
    log.info("Syncing: %s ...", course_prefix)
    # Collect last modified timestamps for all course files of the course
    for obj in raw_data_bucket.objects.filter(Prefix=course_prefix):
        # the "1.json" metadata file contains a course's uid
        if obj.key == course_prefix + "0/1.json":
            try:
                first_json = safe_load_json(get_s3_object_and_read(obj), obj.key)
                uid = first_json.get("_uid")
                last_published_to_production = format_date(
                    first_json.get("last_published_to_production", None)
                )
                last_unpublishing_date = format_date(
                    first_json.get("last_unpublishing_date", None)
                )
                if last_published_to_production is None or (
                    last_unpublishing_date
                    and (last_unpublishing_date > last_published_to_production)
                ):
                    is_published = False
            except:  # pylint: disable=bare-except
                log.exception("Error encountered reading 1.json for %s", course_prefix)
        # accessing last_modified from s3 object summary is fast (does not download file contents)
        last_modified_dates.append(obj.last_modified)

    if not uid:
        # skip if we're unable to fetch course's uid
        log.info("Skipping %s, no course_id", course_prefix)
        return None

    # get the latest modified timestamp of any file in the course
    last_modified = max(last_modified_dates)

    # if course run synced before, check if modified since then
    courserun_instance = LearningResourceRun.objects.filter(
        platform=PlatformType.ocw.value, run_id=uid
    ).first()

    # Make sure that the data we are syncing is newer than what we already have
    if (
        courserun_instance
        and last_modified <= courserun_instance.last_modified
        and not force_overwrite
    ):
        log.info("Already synced. No changes found for %s", course_prefix)
        return None

    # fetch JSON contents for each course file in memory (slow)
    log.info("Loading JSON for %s...", course_prefix)
    for obj in sorted(
        raw_data_bucket.objects.filter(Prefix=course_prefix),
        key=lambda x: int(x.key.split("/")[-1].split(".")[0]),
    ):
        loaded_raw_jsons_for_course.append(
            safe_load_json(get_s3_object_and_read(obj), obj.key)
        )

    log.info("Parsing for %s...", course_prefix)
    # pass course contents into parser
    parser = OCWParser(loaded_jsons=loaded_raw_jsons_for_course)
    course_json = parser.get_master_json()
    course_json["uid"] = uid
    course_json["course_id"] = "{}.{}".format(
        course_json.get("department_number"), course_json.get("master_course_number")
    )
    if course_json["course_id"] in blacklist:
        is_published = False

    if upload_to_s3 and is_published:
        try:
            parser.setup_s3_uploading(
                settings.OCW_LEARNING_COURSE_BUCKET_NAME,
                settings.OCW_LEARNING_COURSE_ACCESS_KEY,
                settings.OCW_LEARNING_COURSE_SECRET_ACCESS_KEY,
                # course_prefix now has trailing slash so [-2] below is the last
                # actual element and [-1] is an empty string
                course_prefix.split("/")[-2],
            )
            if settings.OCW_UPLOAD_IMAGE_ONLY:
                parser.upload_course_image()
            else:
                parser.upload_all_media_to_s3(upload_master_json=True)
        except:  # pylint: disable=bare-except
            log.exception("Error encountered uploading OCW files for %s", course_prefix)
            raise

    log.info("Digesting %s...", course_prefix)
    try:
        course, run = digest_ocw_course(
            course_json, last_modified, is_published, course_prefix
        )
    except TypeError:
        log.info("Course and run not returned, skipping")
        return None

    if upload_to_s3 and is_published:
        load_content_files(run, transform_content_files(course_json))

    # a course stays published if any of its other runs is still published
    course.published = is_published or (
        Course.objects.get(id=course.id).runs.filter(published=True).exists()
    )
    course.save()
    if course.published:
        upsert_course(course.id)
    else:
        delete_course(course)
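# A hedged usage sketch for sync_ocw_course. The bucket name and course prefix
# are purely illustrative; the only thing this snippet confirms is that
# raw_data_bucket is a boto3 S3 bucket resource and course_prefix ends in "/".
import boto3

raw_data_bucket = boto3.resource("s3").Bucket("ocw-raw-data")  # hypothetical bucket
uid = sync_ocw_course(
    course_prefix="PROD/9/9.00/Fall_2019/9-00-fall-2019/",  # illustrative prefix
    raw_data_bucket=raw_data_bucket,
    force_overwrite=False,
    upload_to_s3=True,
    blacklist=[],
)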
def load_course(course_data, blacklist, duplicates):
    """Load the course into the database"""
    # pylint: disable=too-many-branches,too-many-locals
    course_id = course_data.pop("course_id")
    runs_data = course_data.pop("runs", [])
    topics_data = course_data.pop("topics", [])
    offered_bys_data = course_data.pop("offered_by", [])

    # blacklisted courses are saved but never published
    if course_id in blacklist:
        course_data["published"] = False

    # a duplicates record maps a canonical course_id to the ids that duplicate it
    duplicates_record = next(
        (
            record
            for record in duplicates
            if course_id in record["duplicate_course_ids"]
        ),
        None,
    )

    if duplicates_record:
        course = Course.objects.filter(
            course_id=duplicates_record["course_id"]
        ).first()
        if not course:
            course_data["course_id"] = duplicates_record["course_id"]
            course = Course.objects.create(**course_data)
            created = True
        else:
            created = False

        if course_id != duplicates_record["course_id"]:
            duplicate_course = Course.objects.filter(course_id=course_id).first()
            if duplicate_course:
                duplicate_course.published = False
                duplicate_course.save()
                search_task_helpers.delete_course(duplicate_course)
    else:
        platform = course_data.get("platform")
        course, created = Course.objects.update_or_create(
            platform=platform, course_id=course_id, defaults=course_data
        )

    run_ids_to_update_or_create = [run["run_id"] for run in runs_data]

    for course_run_data in runs_data:
        load_run(course, course_run_data)

    # for an existing deduplicated course, only take the incoming course data
    # if it carries the most relevant run
    if duplicates_record and not created:
        most_relevant_run = get_most_relevant_run(course.runs.all())
        if most_relevant_run.run_id in run_ids_to_update_or_create:
            for attr, val in course_data.items():
                setattr(course, attr, val)
            course.save()

    load_topics(course, topics_data)
    load_offered_bys(course, offered_bys_data)

    # keep the search index in sync with the published state
    if not created and not course.published:
        search_task_helpers.delete_course(course)
    elif course.published:
        search_task_helpers.upsert_course(course.id)

    return course
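# Illustrative shape of the "duplicates" input as implied by the lookups above
# (keys "course_id" and "duplicate_course_ids"); the ids and course fields are
# made up, only the structure is taken from the code.
duplicates = [
    {
        "course_id": "MITx+6.002x",
        "duplicate_course_ids": ["MITx+6.002x", "MITx+6.002x_OLD"],
    }
]
course = load_course(
    {"course_id": "MITx+6.002x_OLD", "platform": "mitx", "title": "Circuits"},
    blacklist=[],
    duplicates=duplicates,
)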