Ejemplos de TextCleaner en Python, ejemplos de CanvasHacks.Processors.cleaners.TextCleaner en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: quizzes.py Proyecto: AdamSwenson/CanvasHacks

    def __init__( self, activity, course=None ):
        """Store the activity and course and create the text helpers.

        :param activity: The activity this object manages data for
        :param course: Optional course object; defaults to None
        """
        self.course, self.activity = course, activity
        self.question_columns = [ ]

        # Cleaner which strips html and other messy stuff
        # out of student work
        self.text_cleaner = TextCleaner()

        # Analyzer object created alongside the cleaner
        self.analyzer = WordCount()

Ejemplo n.º 2

0

Mostrar archivo

    def __init__(self, activity, course=None):
        """Store the activity and course and create the text helpers.

        :param activity: The activity this object manages data for
        :param course: Optional course object; defaults to None
        """
        self.course, self.activity = course, activity

        # Assignments only arrive with a 'body' attribute. Treating that
        # single column as the list of question columns lines this up
        # with things that use questions (i.e., quizzes)
        body_column = AssignmentSubmissionRepository.body_column_name
        self.body_column_name = body_column
        self.question_columns = [body_column]

        # Cleaner which strips html and other messy stuff
        # out of student work
        self.text_cleaner = TextCleaner()

        # Analyzer object created alongside the cleaner
        self.analyzer = WordCount()

Ejemplo n.º 3

0

Mostrar archivo

Archivo: discussions.py Proyecto: AdamSwenson/CanvasHacks

    def __init__(self, activity, course):
        """Store the activity, its topic id and the course; start with no data.

        :param activity: The activity being managed; must expose ``topic_id``
        :param course: The course object the activity belongs to
        """
        self.activity, self.topic_id, self.course = (
            activity, activity.topic_id, course)

        # Parsed data is kept as a list of dictionaries:
        # [{'student_id', 'student_name', 'text'}]
        self.data = []

        # Cleaner which strips html and other messy stuff
        # out of student work
        self.text_cleaner = TextCleaner()

        # Analyzer object created alongside the cleaner
        self.analyzer = WordCount()

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_TextCleaner.py Proyecto: AdamSwenson/CanvasHacks

class TestTextCleaner( TestingBase ):
    """Checks TextCleaner.clean against every entry in TEST_DATA."""

    def setUp( self ) -> None:
        """Run the test configuration, then create the cleaner under test."""
        self.config_for_test()
        self.obj = TextCleaner()

    def test_clean( self ):
        """Each entry's 'input' must clean to that entry's 'expect' value."""
        for case in TEST_DATA:
            expected = case[ 'expect' ]
            actual = self.obj.clean( case[ 'input' ] )
            self.assertEqual( expected, actual )

Ejemplo n.º 5

0

Mostrar archivo

Archivo: wordcount.py Proyecto: AdamSwenson/CanvasHacks

    def __init__(self,
                 threshold_dicts=None,
                 required_count=None,
                 pct_of_score=1,
                 count_stopwords=True):
        """
        Set up word-count thresholds and the helpers used to apply them.

        The threshold dicts define the thresholds via wordcounts
        and percentage of total. Should be a list like:
        [
            { count : int,
            pct_credit : float
            },
            {count : 1000,
            pct_credit: 54.3
            }
        ]
        :param threshold_dicts: List of threshold dictionaries as described
            above. Omitting it (or passing None) gives this instance its
            own new empty list.
        :param required_count: If not None and threshold_dicts is non-empty,
            it is passed to make_pct_required_count_thresholds
        :param pct_of_score: Accepted for callers; not used within this
            constructor
        :param count_stopwords: Whether stopwords should be included in the wordcount
        """
        # A literal [] default is created once at definition time and would
        # be shared (and potentially mutated) across every instance that
        # relies on the default, so build a new list per instance instead.
        if threshold_dicts is None:
            threshold_dicts = []

        self.required_count = required_count
        self.threshold_dicts = threshold_dicts
        self.count_stopwords = count_stopwords

        # Will remove html and most encoding artifacts.
        # Whatever we're calling this on, should've already
        # removed that stuff, but just to be safe
        self.cleaner = TextCleaner()

        # The object which will handle the actual processing and computation
        self.analyzer = WordCount(count_stopwords=count_stopwords)

        if self.required_count is not None and len(self.threshold_dicts) > 0:
            self.make_pct_required_count_thresholds(self.required_count)

        self.prepare_dicts()

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_TextCleaner.py Proyecto: AdamSwenson/CanvasHacks

 def setUp( self ) -> None:
     """Run the test configuration, then create the cleaner under test."""
     self.config_for_test()
     cleaner = TextCleaner()
     self.obj = cleaner

Ejemplo n.º 7

0

Mostrar archivo

class AssignmentRepository(IContentRepository, StoredDataFileMixin,
                           StudentWorkMixin, SelectableMixin,
                           FrameStorageMixin):
    """Manages the data for a non-quiz type unit.

    Once ``process`` has run, ``self.data`` holds the student work frame
    with already-graded rows removed, the body column cleaned, and
    ``student_id`` set as the index.
    """

    def __init__(self, activity, course=None):
        """
        :param activity: The activity whose work is managed; its ``id`` is
            used to look up the assignment on demand
        :param course: Course object used to fetch the assignment;
            defaults to None
        """
        self.course = course
        self.activity = activity

        # An assignment will only come in with a 'body' attribute
        # to line this up with things that use questions (i.e., quizzes)
        self.body_column_name = AssignmentSubmissionRepository.body_column_name
        self.question_columns = [self.body_column_name]

        # The cleaner class that will be called to
        # remove html and other messy stuff from student
        # work
        self.text_cleaner = TextCleaner()

        self.analyzer = WordCount()

    def process(self, student_work_frame):
        """Store the frame of student work and run the cleanup on it.

        :param student_work_frame: DataFrame of student work; the cleanup
            reads its ``grade`` and ``student_id`` columns and every column
            named in ``self.question_columns``
        """
        self.data = student_work_frame
        self._cleanup_data()

    def _cleanup_data(self):
        """This is abstracted out so it can be
        called independently for use with test data.

        Drops rows whose grade is 'complete', cleans every question
        column, and sets ``student_id`` as the index of ``self.data``.
        """
        prev_len = len(self.data)
        # todo copy dataframe or run on original data?
        # A deep copy is taken so the later column writes do not touch
        # the frame that was passed in.
        self.data = self.data[self.data.grade != 'complete'].copy(deep=True)
        print("Removed {} rows which have already been graded".format(
            prev_len - len(self.data)))

        # Remove html and other artifacts from student answers.
        # NOTE(review): an earlier comment here said not to enable this
        # until CAN-59 had been fully tested; the cleaning is active.
        #
        # The cleaner is mapped over each column on its own. A row-wise
        # DataFrame.apply(axis=1) returns a DataFrame rather than a Series
        # when no rows are left (e.g., everything was already graded),
        # which cannot be assigned back to a single column.
        for c in self.question_columns:
            self.data[c] = self.data[c].map(self.text_cleaner.clean)

        # We set to student id to make look ups easier
        self.data.set_index('student_id', inplace=True)

    def get_student_work(self, student_id):
        """Returns the entry of ``self.data`` for the given student.

        :param student_id: Id of the student to look up
        :raises KeyError: If the student is not found by either lookup
        """
        try:
            return self.data.loc[student_id]
        except (ValueError, KeyError):
            # The student id may not be set as the index, depending
            # on the source of the data
            return self.data.set_index('student_id').loc[student_id]

    def get_formatted_work_by(self, student_id):
        """Returns all review entries by the student, formatted for
        sending out for review or display"""
        work = self.get_student_work(student_id)
        # narrow down to just the relevant columns
        rs = [{
            'prompt': column_name,
            'response': work[column_name]
        } for column_name in self.question_columns]
        r = make_prompt_and_response(rs)
        return self._check_empty(r)

    def make_question_selection_buttons(self):
        """Given a repository containing a dataframe and a
        list of names in question_names, this will allow to select
        which questions are used for things.

        :return: List of the buttons created, one per question name.
            (Previously the list was built and then discarded.)
        """
        # NOTE(review): question_names is not defined in this class;
        # presumably supplied by one of the mixins -- confirm.
        return [
            make_selection_button(q, q, self.get_selections, self.select,
                                  self.deselect, '100%')
            for q in self.question_names
        ]

    @property
    def points_per_question(self):
        """Points possible on the assignment divided evenly across
        the question columns"""
        return self.assignment.points_possible / len(self.question_columns)

    @property
    def assignment(self):
        """Returns the canvasapi.unit.Assignment object associated
        with this repository.
        Automatically initializes it if not set
        """
        # Fetched once via the course and cached on the instance
        if not hasattr(self, '_assignment'):
            self._assignment = self.course.get_assignment(self.activity.id)
        return self._assignment

    @property
    def submitters(self):
        """returns a list of student objects for whom work has been submitted"""
        # NOTE(review): student_ids is not defined in this class;
        # presumably supplied by one of the mixins -- confirm.
        return [Student(s) for s in self.student_ids]

    @property
    def submitter_ids(self):
        """Returns a list of canvas ids of students who have submitted the unit"""
        # reset_index makes student_id available as a column whether or
        # not it is currently the index; the set removes duplicates
        return list(set(self.data.reset_index().student_id.tolist()))

    def word_counts(self):
        """
        Returns a dataframe with columns student_id, and all
        question columns. Question columns contain the result of
        running the analyzer on the student's text
        :return:
        """
        d = []
        for i, row in self.data.iterrows():
            # The index value is reported as the student id
            s = {'student_id': i}
            for c in self.question_columns:
                s[c] = self.analyzer.analyze(row[c])
            d.append(s)
        return pd.DataFrame(d)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: discussions.py Proyecto: AdamSwenson/CanvasHacks

class DiscussionRepository(IContentRepository, StudentWorkMixin):
    """Manages the data for one discussion unit"""

    def __init__(self, activity, course):
        """
        :param activity: The discussion activity; must expose ``topic_id``
        :param course: Course object used to load the topic, the
            assignment and its submissions
        """
        self.activity = activity
        self.topic_id = activity.topic_id
        self.course = course

        # List of dictionaries from parsed data:
        # [{'student_id', 'student_name', 'text'}]
        self.data = []

        # The cleaner class that will be called to
        # remove html and other messy stuff from student
        # work
        self.text_cleaner = TextCleaner()

        self.analyzer = WordCount()

    @property
    def course_id(self):
        """Id of the course this repository belongs to"""
        return self.course.id

    def download(self):
        """Loads the submissions for the topic and parses the posts
        they contain into ``self.data``"""
        self._get_submissions(self.topic_id)
        self._parse_posts_from_submissions()
        print("Loaded {} posts".format(len(self.data)))

    def _get_submissions(self, topic_id):
        """Retrieves all the information we'll need for grading
        Not using self.topic_id to allow method to be called
        independently for testing

        Sets ``self.assignment_id``, ``self.assignment`` and
        ``self.submissions`` (a dict keyed by the submission's user_id).
        """
        topic = self.course.get_discussion_topic(topic_id)
        # Graded discussions will be tied to an unit, so
        # we need the id
        self.assignment_id = topic.assignment_id
        print("Assignment {} is associated with topic {}".format(
            self.assignment_id, topic_id))
        # Load the unit object
        self.assignment = self.course.get_assignment(self.assignment_id)
        # Load all submissions for the unit
        self.submissions = {
            s.user_id: s
            for s in self.assignment.get_submissions()
        }
        print("Loaded {} submissions for the unit".format(
            len(self.submissions)))

    def _parse_posts_from_submissions(self):
        """The submission objects downloaded for the unit will
        have the post information stored in a list called discussion_entries.
        This takes all of those and appends the user id, name and cleaned
        text of each entry to ``self.data``"""
        for submission in self.submissions.values():
            # NOTE(review): a submission from a student who has not posted
            # presumably lacks the discussion_entries attribute altogether;
            # that case is treated as "no posts" rather than raising
            # AttributeError -- confirm against the API.
            entries = getattr(submission, 'discussion_entries', None) or []
            for entry in entries:
                # Remove html and other artifacts from student answers.
                # NOTE(review): an earlier comment here said not to enable
                # this until CAN-59 had been fully tested; it is active.
                content = self.text_cleaner.clean(entry['message'])

                self.data.append({
                    'student_id': entry['user_id'],
                    'student_name': entry['user_name'],
                    'text': content
                })

    def get_student_posts(self, student_id):
        """Returns a list of all posts by student for the topic"""
        return [p['text'] for p in self.data if p['student_id'] == student_id]

    def get_formatted_work_by(self, student_id):
        """Returns all posts by the student, formatted for
        sending out for review or display"""
        posts = self.get_student_posts(student_id)
        # Posts are joined with a visual divider; a student with no
        # posts yields an empty string
        posts = "\n        -------        \n".join(posts)
        return posts

    def upload_student_grade(self, student_id, pct_credit):
        """Uploads the student's credit for the assignment tied to
        this discussion via upload_credit.

        :param student_id: Id of the student being graded
        :param pct_credit: Credit to upload for the student
        """
        # A previous approach (formatting pct_credit as "N%" and calling
        # submission.edit(posted_grade=...) on the stored submission) is
        # reported not to have worked, so the helper is used instead.
        upload_credit(self.course_id, self.assignment_id, student_id,
                      pct_credit)

    def display_for_grading(self):
        """Returns student submissions in format expected for
        ipython display.

        Returns a list of dictionaries, one per discussion post for
        the topic. Format:
            {'student_id', 'student_name', 'text'}
        """
        return self.data

    @property
    def submitter_ids(self):
        """Returns a list of canvas ids of students who have submitted the unit"""
        return list({s['student_id'] for s in self.data})

    # NOTE(review): post_counts below relies on self.student_ids, which is
    # not defined in this class (presumably it is supplied by
    # StudentWorkMixin -- confirm). A local version was disabled:
    #
    # @property
    # def student_ids( self ):
    #     uids = list( set( [ k['student_id'] for k in self.data ] ) )
    #     uids.sort()
    #     return uids

    @property
    def post_counts(self):
        """Returns list of tuples
        ( student id, # of posts )
        in the order of ``self.student_ids``
        """
        from collections import Counter

        # Tally once instead of rescanning every post for each student.
        # Ids without any post get a count of 0.
        tally = Counter(s['student_id'] for s in self.data)
        return [(sid, tally[sid]) for sid in self.student_ids]

    def filter_by_count(self, min_post_count):
        """
        Returns a copy of data without students who have not reached
        the minimum count
        :param min_post_count: Minimum number of posts a student must
            have for their posts to be kept
        :return: New list containing the kept post dictionaries
        """
        # A set gives constant-time membership tests in the filter below
        students_to_keep = {
            sid for sid, cnt in self.post_counts if cnt >= min_post_count
        }
        return [s for s in self.data if s['student_id'] in students_to_keep]

Ejemplo n.º 9

0

Mostrar archivo

Archivo: quizzes.py Proyecto: AdamSwenson/CanvasHacks

class QuizRepository( IContentRepository, QuizDataMixin, StoredDataFileMixin, StudentWorkMixin, SelectableMixin, FrameStorageMixin ):
    """Manages the data for a quiz type unit"""

    def __init__( self, activity, course=None ):
        """
        :param activity: The quiz activity; its ``quiz_id`` is used to
            look up the quiz on demand
        :param course: Course object used to fetch the quiz;
            defaults to None
        """
        self.course = course
        self.activity = activity
        self.question_columns = [ ]

        # The cleaner class that will be called to
        # remove html and other messy stuff from student
        # work
        self.text_cleaner = TextCleaner()

        self.analyzer = WordCount()

    def process( self, work_frame, submissions ):
        """Builds ``self.data`` from the work frame and the submissions,
        then cleans it up and records the question columns.

        :param work_frame: Frame of student work passed to process_work
        :param submissions: Submission records, either a DataFrame or
            something pd.DataFrame can be built from; must provide user_id
        """
        self.submissions = submissions
        if isinstance( submissions, pd.DataFrame ):
            submissions_frame = submissions
        else:
            submissions_frame = pd.DataFrame( submissions )

        # student_id is always (re)derived from user_id. An earlier guarded
        # version, which kept an existing student_id column when loading
        # from file, is disabled.
        submissions_frame[ 'student_id' ] = submissions_frame.user_id
        self.data = process_work( work_frame, submissions_frame )
        remove_non_final_attempts( self.data )
        # finish setting up the dataframe
        # NOTE(review): _cleanup_data loops over self.question_columns,
        # which is presumably only filled in by set_question_columns below.
        # On a first call it is still the empty list from __init__, so no
        # column gets cleaned in this pass -- confirm the intended order.
        self._cleanup_data()
        # Store the text column names
        self.set_question_columns( self.data )

    def _cleanup_data( self ):
        """
        Runs cleanup operations specific to this kind of data
        """
        # the name will be set as index from sorting
        # so we set to student id to make look ups easier
        self.data.set_index( 'student_id', inplace=True )

        # Remove html and other artifacts from student answers.
        # NOTE(review): an earlier comment here said not to enable this
        # until CAN-59 had been fully tested; the loop is active.
        #
        # The cleaner is mapped over each column on its own. A row-wise
        # DataFrame.apply(axis=1) returns a DataFrame rather than a Series
        # when the frame has no rows, which cannot be assigned back to a
        # single column.
        for c in self.question_columns:
            self.data[ c ] = self.data[ c ].map( self.text_cleaner.clean )

    def get_student_work( self, student_id ):
        """Returns the entry of ``self.data`` for the given student.

        :param student_id: Id of the student to look up
        :raises KeyError: If the student is not found by either lookup
        """
        try:
            return self.data.loc[ student_id ]
        except (ValueError, KeyError):
            # The student id may not be set as the index, depending
            # on the source of the data
            return self.data.set_index( 'student_id' ).loc[ student_id ]

    def get_formatted_work_by( self, student_id ):
        """Returns all review entries by the student, formatted for
        sending out for review or display"""
        work = self.get_student_work( student_id )
        # narrow down to just the relevant columns
        # NOTE(review): each entry of question_columns is unpacked here as
        # a (col_id, column_name) pair, whereas _cleanup_data and
        # word_counts use each entry directly as a column key -- confirm
        # the element format produced by set_question_columns.
        rs = [ { 'prompt': column_name, 'response': work[ column_name ] }
               for _col_id, column_name in self.question_columns ]
        r = make_prompt_and_response( rs )
        return self._check_empty( r )

    def make_question_selection_buttons( self ):
        """Given a repository containing a dataframe and a
        list of names in question_names, this will allow to select
        which questions are used for things.

        :return: List of the buttons created, one per question name.
            (Previously the list was built and then discarded.)
        """
        # NOTE(review): question_names is not defined in this class;
        # presumably supplied by one of the mixins -- confirm.
        return [
            make_selection_button( q, q, self.get_selections, self.select, self.deselect, '100%' )
            for q in self.question_names
        ]

    @property
    def points_per_question( self ):
        """Points possible on the quiz divided evenly across its questions"""
        return self.quiz.points_possible / self.quiz.question_count

    @property
    def quiz( self ):
        """Returns the canvasapi.quiz.Quiz object associated
        with this repository.
        Automatically initializes it if not set
        """
        # Fetched once via the course and cached on the instance
        if not hasattr( self, '_quiz' ):
            self._quiz = self.course.get_quiz( self.activity.quiz_id )
        return self._quiz

    @property
    def submitters( self ):
        """returns a list of student objects for whom work has been submitted"""
        # NOTE(review): student_ids is not defined in this class;
        # presumably supplied by one of the mixins -- confirm.
        return [ Student( s ) for s in self.student_ids ]

    @property
    def submitter_ids( self ):
        """Returns a list of canvas ids of students who have submitted the unit"""
        # reset_index makes student_id available as a column whether or
        # not it is currently the index; the set removes duplicates
        return list( set( self.data.reset_index().student_id.tolist() ) )

    def word_counts( self ):
        """
        Returns a dataframe with columns student_id, and all
        question columns. Question columns contain the result of
        running the analyzer on the student's text
        :return:
        """
        d = [ ]
        for i, row in self.data.iterrows():
            # The index value is reported as the student id
            s = { 'student_id': i }
            for c in self.question_columns:
                s[ c ] = self.analyzer.analyze( row[ c ] )
            d.append( s )
        return pd.DataFrame( d )