Example #1
def handle(self, *args, **options):
    """Run the populate OCW courses command"""
    if options["delete"]:
        self.stdout.write("Deleting all existing OCW courses")
        for course in Course.objects.filter(platform="ocw"):
            course.delete()
            # remove the course document from the search index as well
            delete_course(course)
    else:
        task = get_ocw_data.delay(
            force_overwrite=options["force_overwrite"],
            upload_to_s3=options["upload_to_s3"],
            ignore_flag=True,
        )
        self.stdout.write(
            "Started task {task} to get ocw course data w/force_overwrite={overwrite}, upload_to_s3={s3}".format(
                task=task,
                overwrite=options["force_overwrite"],
                s3=options["upload_to_s3"],
            )
        )
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            "Population of ocw data finished, took {} seconds".format(total_seconds)
        )
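The delete, force_overwrite, and upload_to_s3 options read by this handler would be declared on the same management command. A minimal sketch of that declaration, assuming the standard Django BaseCommand/argparse API (only the option names come from the handler above; the help text is illustrative):

def add_arguments(self, parser):
    # Only the option names are taken from the handler; everything else is assumed.
    parser.add_argument(
        "--delete",
        action="store_true",
        help="Delete all existing OCW courses instead of populating them",
    )
    parser.add_argument(
        "--force-overwrite",
        action="store_true",
        help="Force incoming course data to overwrite existing data",
    )
    parser.add_argument(
        "--upload-to-s3",
        action="store_true",
        help="Upload course media to S3",
    )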
Example #2
def test_delete_course(mocker):
    """
    Tests that delete_course calls the delete tasks for the course and its content files
    """
    patched_delete_task = mocker.patch("search.task_helpers.delete_document")
    course = CourseFactory.create()
    course_es_id = gen_course_id(course.platform, course.course_id)
    content_files = [ContentFileFactory.create(run=run) for run in course.runs.all()]

    delete_course(course)
    patched_delete_task.delay.assert_any_call(course_es_id, COURSE_TYPE)
    for content_file in content_files:
        patched_delete_task.delay.assert_any_call(
            gen_content_file_id(content_file.key), COURSE_TYPE, routing=course_es_id
        )
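Taken together, the assertions above pin down roughly what delete_course must do: issue one delete_document task for the course document, and one per content file routed to the course. A hypothetical reconstruction, inferred only from this test (the content_files related name is an assumption; the real helper may differ):

# Hypothetical reconstruction based solely on the test's assertions.
def delete_course(course):
    """Issue delete tasks for a course document and its content file documents"""
    course_es_id = gen_course_id(course.platform, course.course_id)
    delete_document.delay(course_es_id, COURSE_TYPE)
    for run in course.runs.all():
        # "content_files" is an assumed related name for ContentFile.run
        for content_file in run.content_files.all():
            delete_document.delay(
                gen_content_file_id(content_file.key),
                COURSE_TYPE,
                routing=course_es_id,
            )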
Example #3
def delete_resources(apps, schema_editor):
    """
    Delete all courses, runs, and bootcamps.  They will need to be imported again.
    """
    Course = apps.get_model("course_catalog", "Course")
    CourseRun = apps.get_model("course_catalog", "CourseRun")
    Bootcamp = apps.get_model("course_catalog", "Bootcamp")

    CourseRun.objects.all().delete()
    for course in Course.objects.all():
        course.delete()
        delete_course(course)
    for bootcamp in Bootcamp.objects.all():
        bootcamp.delete()
        delete_bootcamp(bootcamp)
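The (apps, schema_editor) signature marks this as a data-migration function, so it would be registered with migrations.RunPython. A sketch of the surrounding migration, assuming RunPython.noop as the reverse step since deleted resources cannot be restored (the dependency name is a placeholder):

from django.db import migrations


class Migration(migrations.Migration):

    # placeholder dependency; the real one names the prior migration
    dependencies = [("course_catalog", "0001_initial")]

    operations = [
        migrations.RunPython(delete_resources, migrations.RunPython.noop)
    ]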
Example #4
def handle(self, *args, **options):
    """Run the populate edX courses command"""
    if options["delete"]:
        self.stdout.write(
            "Deleting all existing MITx courses from database and ElasticSearch"
        )
        for course in Course.objects.filter(platform=PlatformType.mitx.value):
            course.delete()
            delete_course(course)
    else:
        task = get_mitx_data.delay()
        self.stdout.write(f"Started task {task} to get edx course data")
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            "Population of edx data finished, took {} seconds".format(total_seconds)
        )
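The .delay()/task.get() pair here is the usual Celery pattern of firing a task and blocking until it finishes. A minimal sketch of the task side, assuming a Celery application object named app (only the task name get_mitx_data comes from the example; the app name and body are placeholders):

from celery import Celery

app = Celery("course_catalog")  # assumed app name


@app.task
def get_mitx_data():
    """Fetch and load edX course data (placeholder body)"""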
Example #5
def sync_ocw_course(
    *, course_prefix, raw_data_bucket, force_overwrite, upload_to_s3, blacklist
):
    """
    Sync an OCW course run

    Args:
        course_prefix (str): The course prefix
        raw_data_bucket (boto3.resource): The S3 bucket containing the OCW information
        force_overwrite (bool): A boolean value to force the incoming course data to overwrite existing data
        upload_to_s3 (bool): If True, upload course media to S3
        blacklist (list of str): list of course ids that should not be published

    Returns:
        str:
            The run UID, or None if the UID could not be found, or if it was found but the course did not need to be synced
    """
    loaded_raw_jsons_for_course = []
    last_modified_dates = []
    uid = None
    is_published = True
    log.info("Syncing: %s ...", course_prefix)

    # Collect last modified timestamps for all course files of the course
    for obj in raw_data_bucket.objects.filter(Prefix=course_prefix):
        # the "1.json" metadata file contains a course's uid
        if obj.key == course_prefix + "0/1.json":
            try:
                first_json = safe_load_json(get_s3_object_and_read(obj), obj.key)
                uid = first_json.get("_uid")
                last_published_to_production = format_date(
                    first_json.get("last_published_to_production", None)
                )
                last_unpublishing_date = format_date(
                    first_json.get("last_unpublishing_date", None)
                )
                if last_published_to_production is None or (
                    last_unpublishing_date
                    and last_unpublishing_date > last_published_to_production
                ):
                    is_published = False
            except:  # pylint: disable=bare-except
                log.exception("Error encountered reading 1.json for %s",
                              course_prefix)
        # accessing last_modified from s3 object summary is fast (does not download file contents)
        last_modified_dates.append(obj.last_modified)
    if not uid:
        # skip if we're unable to fetch course's uid
        log.info("Skipping %s, no course_id", course_prefix)
        return None
    # get the latest modified timestamp of any file in the course
    last_modified = max(last_modified_dates)

    # if course run synced before, check if modified since then
    courserun_instance = LearningResourceRun.objects.filter(
        platform=PlatformType.ocw.value, run_id=uid).first()

    # Make sure that the data we are syncing is newer than what we already have
    if (courserun_instance
            and last_modified <= courserun_instance.last_modified
            and not force_overwrite):
        log.info("Already synced. No changes found for %s", course_prefix)
        return None

    # fetch JSON contents for each course file in memory (slow)
    log.info("Loading JSON for %s...", course_prefix)
    for obj in sorted(
            raw_data_bucket.objects.filter(Prefix=course_prefix),
            key=lambda x: int(x.key.split("/")[-1].split(".")[0]),
    ):
        loaded_raw_jsons_for_course.append(
            safe_load_json(get_s3_object_and_read(obj), obj.key))

    log.info("Parsing for %s...", course_prefix)
    # pass course contents into parser
    parser = OCWParser(loaded_jsons=loaded_raw_jsons_for_course)
    course_json = parser.get_master_json()
    course_json["uid"] = uid
    course_json["course_id"] = "{}.{}".format(
        course_json.get("department_number"),
        course_json.get("master_course_number"))
    if course_json["course_id"] in blacklist:
        is_published = False

    if upload_to_s3 and is_published:
        try:
            parser.setup_s3_uploading(
                settings.OCW_LEARNING_COURSE_BUCKET_NAME,
                settings.OCW_LEARNING_COURSE_ACCESS_KEY,
                settings.OCW_LEARNING_COURSE_SECRET_ACCESS_KEY,
                # course_prefix now has trailing slash so [-2] below is the last
                # actual element and [-1] is an empty string
                course_prefix.split("/")[-2],
            )
            if settings.OCW_UPLOAD_IMAGE_ONLY:
                parser.upload_course_image()
            else:
                parser.upload_all_media_to_s3(upload_master_json=True)
        except:  # pylint: disable=bare-except
            log.exception("Error encountered uploading OCW files for %s", course_prefix)
            raise

    log.info("Digesting %s...", course_prefix)
    try:
        course, run = digest_ocw_course(course_json, last_modified,
                                        is_published, course_prefix)
    except TypeError:
        log.info("Course and run not returned, skipping")
        return None

    if upload_to_s3 and is_published:
        load_content_files(run, transform_content_files(course_json))

    course.published = is_published or (
        Course.objects.get(id=course.id).runs.filter(published=True).exists()
    )
    course.save()
    if course.published:
        upsert_course(course.id)
    else:
        delete_course(course)
    return uid
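For reference, a sketch of a call site for sync_ocw_course. Per the docstring, raw_data_bucket is a boto3 bucket resource; the bucket name, course prefix (with the trailing slash the code expects), and flag values are all illustrative:

import boto3

# Illustrative values only; the bucket name and prefix are made up.
raw_data_bucket = boto3.resource("s3").Bucket("ocw-raw-data")
run_uid = sync_ocw_course(
    course_prefix="PROD/18-01-single-variable-calculus-fall-2019/",
    raw_data_bucket=raw_data_bucket,
    force_overwrite=False,
    upload_to_s3=True,
    blacklist=[],
)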
Example #6
def load_course(course_data, blacklist, duplicates):
    """Load the course into the database"""
    # pylint: disable=too-many-branches,too-many-locals

    course_id = course_data.pop("course_id")
    runs_data = course_data.pop("runs", [])
    topics_data = course_data.pop("topics", [])
    offered_bys_data = course_data.pop("offered_by", [])

    if course_id in blacklist:
        course_data["published"] = False

    duplicates_record = next(
        (record for record in duplicates
         if course_id in record["duplicate_course_ids"]),
        None,
    )

    if duplicates_record:
        course = Course.objects.filter(
            course_id=duplicates_record["course_id"]).first()
        if not course:
            course_data["course_id"] = duplicates_record["course_id"]
            course = Course.objects.create(**course_data)
            created = True
        else:
            created = False

        if course_id != duplicates_record["course_id"]:
            duplicate_course = Course.objects.filter(
                course_id=course_id).first()
            if duplicate_course:
                duplicate_course.published = False
                duplicate_course.save()
                search_task_helpers.delete_course(duplicate_course)
    else:
        platform = course_data.get("platform")
        course, created = Course.objects.update_or_create(
            platform=platform, course_id=course_id, defaults=course_data
        )

    run_ids_to_update_or_create = [run["run_id"] for run in runs_data]

    for course_run_data in runs_data:
        load_run(course, course_run_data)

    if duplicates_record and not created:
        most_relevant_run = get_most_relevant_run(course.runs.all())

        if most_relevant_run.run_id in run_ids_to_update_or_create:
            for attr, val in course_data.items():
                setattr(course, attr, val)
            course.save()

    load_topics(course, topics_data)
    load_offered_bys(course, offered_bys_data)

    if not created and not course.published:
        search_task_helpers.delete_course(course)
    elif course.published:
        search_task_helpers.upsert_course(course.id)

    return course
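The expected shapes of course_data and duplicates can be read off the pops and lookups above. An illustrative call, with all values made up:

# The keys below come from load_course itself; the values are invented.
course_data = {
    "course_id": "MITx+6.00x_OLD",
    "platform": "mitx",
    "title": "Introduction to Computer Science",  # assumed Course model field
    "published": True,
    "runs": [],
    "topics": [],
    "offered_by": [],
}
duplicates = [
    {
        "course_id": "MITx+6.00x",
        "duplicate_course_ids": ["MITx+6.00x", "MITx+6.00x_OLD"],
    }
]
course = load_course(course_data, blacklist=[], duplicates=duplicates)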