def test_join_when_invalid_grade_negative_only_course_of_student(
        spark_session):
    """Check the join when a student's only grade is negative.

    Mario's single grade is -10; the expected joined result contains only
    the rows for Juan and Paula.
    """
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'Business'),
    ]
    course_rows = [(1, 4, 'CS'), (2, 2, 'Biology'), (3, 4, 'Business')]
    # Student 1 has exactly one grade, and it is negative.
    grade_rows = [(1, 1, -10), (2, 2, 95), (3, 3, 84)]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    courses = spark_session.createDataFrame(
        course_rows, ['id', 'credits', 'course_major'])
    grades = spark_session.createDataFrame(
        grade_rows, ['student_id', 'course_id', 'grade'])

    expected_columns = [
        'student_id', 'name', 'course_id', 'credits', 'grade',
        'student_major'
    ]
    expected = spark_session.createDataFrame(
        [
            (3, 'Juan', 3, 4, 84, 'Business'),
            (2, 'Paula', 2, 2, 95, 'Biology'),
        ],
        expected_columns)

    finder = TopStudentsFinder(students, courses, grades)
    actual = finder.join_student_data()

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def test_join_when_more_than_one_course_per_student(spark_session):
    """Check the join when one student has grades in several courses.

    Mario has three grades (courses 1, 4 and 5); the expected result holds
    one joined row per grade.
    """
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'Business'),
    ]
    course_rows = [
        (1, 4, 'CS'),
        (2, 2, 'Biology'),
        (3, 4, 'Business'),
        (4, 6, 'CS'),
        (5, 8, 'CS'),
    ]
    grade_rows = [
        (1, 1, 90),
        (2, 2, 95),
        (3, 3, 84),
        (1, 4, 65),
        (1, 5, 87),
    ]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    courses = spark_session.createDataFrame(
        course_rows, ['id', 'credits', 'course_major'])
    grades = spark_session.createDataFrame(
        grade_rows, ['student_id', 'course_id', 'grade'])

    expected_columns = [
        'student_id', 'name', 'course_id', 'credits', 'grade',
        'student_major'
    ]
    expected = spark_session.createDataFrame(
        [
            (3, 'Juan', 3, 4, 84, 'Business'),
            (1, 'Mario', 4, 6, 65, 'CS'),
            (1, 'Mario', 5, 8, 87, 'CS'),
            (1, 'Mario', 1, 4, 90, 'CS'),
            (2, 'Paula', 2, 2, 95, 'Biology'),
        ],
        expected_columns)

    finder = TopStudentsFinder(students, courses, grades)
    actual = finder.join_student_data()

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def main():
    """Load the input data and print the top students.

    Obtains a local Spark session named 'database', loads the students,
    courses and grades DataFrames through ``load_data``, and shows the
    result of ``TopStudentsFinder.get_top_n_students(2)``.
    """
    session = (
        SparkSession.builder
        .appName('database')
        .master('local')
        .getOrCreate()
    )

    students, courses, grades = load_data(session)
    finder = TopStudentsFinder(students, courses, grades)

    # The requested top size is 2.
    top_students = finder.get_top_n_students(2)
    top_students.show()
def test_join_when_students_null(spark_session):
    """Expect ``join_student_data`` to return None without students."""
    course_rows = [(1, 4, 'CS'), (2, 2, 'Biology'), (3, 4, 'Business')]
    grade_rows = [(1, 1, 90), (2, 2, 95), (3, 3, 84)]

    courses = spark_session.createDataFrame(
        course_rows, ['id', 'credits', 'course_major'])
    grades = spark_session.createDataFrame(
        grade_rows, ['student_id', 'course_id', 'grade'])

    # The students DataFrame is deliberately missing.
    finder = TopStudentsFinder(None, courses, grades)
    joined = finder.join_student_data()

    assert joined is None
def test_join_when_grades_null(spark_session):
    """Expect ``join_student_data`` to return None without grades."""
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'Business'),
    ]
    course_rows = [(1, 4, 'CS'), (2, 2, 'Biology'), (3, 4, 'Business')]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    courses = spark_session.createDataFrame(
        course_rows, ['id', 'credits', 'course_major'])

    # The grades DataFrame is deliberately missing.
    finder = TopStudentsFinder(students, courses, None)
    joined = finder.join_student_data()

    assert joined is None
def test_get_top_when_top_is_negative(spark_session):
    """Expect ``get_top_n_students_internal`` to return None for n = -1."""
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'CS'),
        (4, 'Marcela', 'CS'),
        (5, 'Carlos', 'Biology'),
        (6, 'Ivan', 'CS'),
        (7, 'Pablo', 'Biology'),
        (8, 'Karla', 'Biology'),
        (9, 'Pedro', 'Biology'),
    ]
    score_rows = [
        (1, 'CS', 76.0),
        (2, 'Biology', 88.0),
        (3, 'CS', 91.0),
        (4, 'CS', 95.0),
        (5, 'Biology', 87.0),
        (6, 'CS', 84.0),
        (7, 'Biology', 92.0),
        (8, 'Biology', 93.0),
        (9, 'Biology', 83.0),
    ]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    grouped = spark_session.createDataFrame(
        score_rows, ['student_id', 'student_major', 'score'])

    # A negative top size is the invalid input under test.
    result = TopStudentsFinder.get_top_n_students_internal(
        students, grouped, -1)

    assert result is None
def test_get_top_happy_path(spark_session):
    """Check the top-2 selection over nine students in two majors.

    The expected result lists the two highest CS scores followed by the
    two highest Biology scores.
    """
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'CS'),
        (4, 'Marcela', 'CS'),
        (5, 'Carlos', 'Biology'),
        (6, 'Ivan', 'CS'),
        (7, 'Pablo', 'Biology'),
        (8, 'Karla', 'Biology'),
        (9, 'Pedro', 'Biology'),
    ]
    score_rows = [
        (1, 'CS', 76.0),
        (2, 'Biology', 88.0),
        (3, 'CS', 91.0),
        (4, 'CS', 95.0),
        (5, 'Biology', 87.0),
        (6, 'CS', 84.0),
        (7, 'Biology', 92.0),
        (8, 'Biology', 93.0),
        (9, 'Biology', 83.0),
    ]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    grouped = spark_session.createDataFrame(
        score_rows, ['student_id', 'student_major', 'score'])

    expected = spark_session.createDataFrame(
        [
            (4, 'Marcela', 'CS', 95.0),
            (3, 'Juan', 'CS', 91.0),
            (8, 'Karla', 'Biology', 93.0),
            (7, 'Pablo', 'Biology', 92.0),
        ],
        ['student_id', 'name', 'major', 'score'])

    actual = TopStudentsFinder.get_top_n_students_internal(
        students, grouped, 2)

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def test_group_only_one_course_per_student_course_id_null(spark_session):
    """Check grouping when one joined row has a null ``course_id``.

    Mario's only row has ``course_id`` set to None; the expected grouped
    result contains scores for Paula and Juan only.
    """
    joined_columns = [
        'student_id', 'name', 'course_id', 'credits', 'grade',
        'student_major'
    ]
    joined_rows = [
        (3, 'Juan', 3, 4, 84, 'Business'),
        (1, 'Mario', None, 6, 65, 'CS'),  # null course id
        (2, 'Paula', 2, 2, 95, 'Biology'),
    ]
    joined = spark_session.createDataFrame(joined_rows, joined_columns)

    expected = spark_session.createDataFrame(
        [(2, 'Biology', 95.0), (3, 'Business', 84.0)],
        ['student_id', 'student_major', 'score'])

    actual = TopStudentsFinder.group_student_data(joined)

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def test_join_when_no_students(spark_session):
    """Expect an empty join result when the students DataFrame has no rows.

    An empty list gives Spark nothing to infer column types from, so the
    students schema is spelled out explicitly.
    """
    student_data = []
    course_data = [(1, 4, 'CS'), (2, 2, 'Biology'), (3, 4, 'Business'),
                   (4, 6, 'CS'), (5, 8, 'CS')]
    grade_data = [(1, 1, 90), (2, 2, 95), (3, 3, 84), (1, 4, 65),
                  (1, 5, 87)]

    # 'name' is a string column, matching the student fixtures used by the
    # other tests (e.g. 'Mario', 'Paula'); it was previously declared as
    # IntegerType by mistake.
    student_schema = StructType([
        StructField('id', IntegerType(), True),
        StructField('name', StringType(), True),
        StructField('student_major', StringType(), True),
    ])

    students_df = spark_session.createDataFrame(student_data,
                                                schema=student_schema)
    courses_df = spark_session.createDataFrame(
        course_data, ['id', 'credits', 'course_major'])
    grades_df = spark_session.createDataFrame(
        grade_data, ['student_id', 'course_id', 'grade'])

    top_students_finder = TopStudentsFinder(students_df, courses_df,
                                            grades_df)
    actual_df = top_students_finder.join_student_data()

    assert actual_df.rdd.isEmpty()
def test_get_top_when_major_doesnt_have_data(spark_session):
    """Check the top-2 selection when one major has no grouped scores.

    Only CS students appear in the grouped scores, so the expected result
    contains CS rows only.
    """
    student_rows = [
        (1, 'Mario', 'CS'),
        (2, 'Paula', 'Biology'),
        (3, 'Juan', 'CS'),
        (4, 'Carlos', 'Biology'),
    ]
    # No Biology student has a score.
    score_rows = [(1, 'CS', 76.0), (3, 'CS', 91.0)]

    students = spark_session.createDataFrame(
        student_rows, ['id', 'name', 'student_major'])
    grouped = spark_session.createDataFrame(
        score_rows, ['student_id', 'student_major', 'score'])

    expected = spark_session.createDataFrame(
        [(3, 'Juan', 'CS', 91.0), (1, 'Mario', 'CS', 76.0)],
        ['student_id', 'name', 'major', 'score'])

    actual = TopStudentsFinder.get_top_n_students_internal(
        students, grouped, 2)

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def test_group_multiple_courses_multiple_students_grade_negative(
        spark_session):
    """Check grouping with several courses per student and one negative grade.

    Mario has one row with grade -1 alongside two valid rows; the expected
    result carries one score per student.
    """
    joined_columns = [
        'student_id', 'name', 'course_id', 'credits', 'grade',
        'student_major'
    ]
    joined_rows = [
        (3, 'Juan', 3, 4, 84, 'Business'),
        (3, 'Juan', 6, 2, 90, 'Business'),
        (3, 'Juan', 7, 8, 40, 'Business'),
        (1, 'Mario', 4, 6, 65, 'CS'),
        (1, 'Mario', 5, 8, -1, 'CS'),  # negative grade
        (1, 'Mario', 1, 4, 90, 'CS'),
        (2, 'Paula', 2, 2, 95, 'Biology'),
        (2, 'Paula', 2, 4, 50, 'Biology'),
        (2, 'Paula', 2, 4, 75, 'Biology'),
    ]
    joined = spark_session.createDataFrame(joined_rows, joined_columns)

    expected = spark_session.createDataFrame(
        [(2, 'Biology', 69.0), (3, 'Business', 59.71), (1, 'CS', 75.0)],
        ['student_id', 'student_major', 'score'])

    actual = TopStudentsFinder.group_student_data(joined)

    # Row order matters: the collected lists are compared element-wise.
    expected_rows = expected.collect()
    actual_rows = actual.collect()
    assert expected_rows == actual_rows
def test_group_when_student_data_is_null():
    """Expect ``group_student_data`` to return None when given None."""
    grouped = TopStudentsFinder.group_student_data(None)

    assert grouped is None