Python read_tsv Examples, edx.analytics.tasks.util.tsv.read_tsv Python Examples

Example #1

0

Show file

    def read_user_registrations(self):
        """
        Read history of user registrations.

        The 'registrations' input is parsed as a two-column TSV (a date and
        a per-date value), the values are accumulated in chronological
        order, and the running total is aligned to the requested window.

        Returns:
            Pandas DataFrame indexed by date with a single column
            representing the number of users who have accounts at
            the end of that day.  The index runs from ``self.start_date``
            through ``self.date``.
        """
        with self.input()['registrations'].open('r') as registrations_file:
            # The column name here will be converted into a row name later when
            # the data is transposed.
            registration_changes = read_tsv(
                registrations_file, ['date', self.ROW_LABELS['registrations']])
            registration_changes.date = pandas.to_datetime(
                registration_changes.date)
            registration_changes.set_index(['date'], inplace=True)

            # Nothing here guarantees the rows arrive in date order.  A running
            # total is only meaningful chronologically, and
            # reindex(method='ffill') requires a monotonic index, so sort
            # before accumulating.
            registration_changes = registration_changes.sort_index()
            cumulative_registrations = registration_changes.cumsum()

            # Restrict the index to only the date range requested
            date_range = pandas.date_range(self.start_date, self.date)
            # Forward fill gaps because those dates have no change in registrations
            cumulative_registrations = cumulative_registrations.reindex(
                date_range, method='ffill')

        return cumulative_registrations

Example #2

0

Show file

File: enrollments.py Project: e/edx-analytics-pipeline

    def read_course_date_count_tsv(self, input_file):
        """Load a (course_id, date, count) TSV into a pandas DataFrame with parsed dates."""
        # No encoding is assumed, so course_id is read as a plain string.
        frame = read_tsv(input_file, ['course_id', 'date', 'count'])
        frame['date'] = pandas.to_datetime(frame['date'])
        return frame

Example #3

0

Show file

    def read_course_date_count_tsv(self, input_file):
        """Parse a TSV with fixed columns course_id, date and count into a pandas DataFrame."""
        column_names = ['course_id', 'date', 'count']

        # course_id is left as a plain string; no encoding is assumed.
        table = read_tsv(input_file, column_names)

        # Turn the raw date column into pandas datetimes before returning.
        table['date'] = pandas.to_datetime(table['date'])
        return table

Example #4

0

Show file

File: total_enrollments.py Project: EDUlib/edx-analytics-pipeline

    def read_course_blacklist(self):
        """
        Load the course_ids listed in the optional blacklist input.

        The blacklist file is expected to hold a single course ID per line.

        Returns:
            A set of course_ids that should not be included in aggregates;
            an empty set when no blacklist input was specified.
        """
        # Guard clause: without a blacklist input there is nothing to exclude.
        if not self.input().get('blacklist'):
            return set()

        with self.input()['blacklist'].open('r') as blacklist_file:
            blacklisted = read_tsv(blacklist_file, ['course_id'])
        return set(blacklisted['course_id'])

Example #5

0

Show file

    def read_course_blacklist(self):
        """
        Return the set of blacklisted course_ids.

        Input format: one course ID per line in the 'blacklist' input file.

        Returns:
            A set of course_ids that should not be included in aggregates.
            When no blacklist input was specified the set is empty.
        """
        if not self.input().get('blacklist'):
            # No blacklist configured.
            return set()

        with self.input()['blacklist'].open('r') as blacklist_file:
            rows = read_tsv(blacklist_file, ['course_id'])
        course_ids = set(rows['course_id'])
        return course_ids

Example #6

0

Show file

File: total_enrollments.py Project: EDUlib/edx-analytics-pipeline

    def read_date_count_tsv(self, input_file):
        """
        Read TSV containing dates and corresponding counts into a pandas Series.

        The result is indexed by date and spans every day between the earliest
        and latest dates found in the input; days without a row hold NaN.

        NANs are not filled in here, as more than one filling strategy is
        used with such files.

        Args:
            input_file: file object handed to ``read_tsv`` and parsed as two
                columns, 'date' and 'count'.

        Returns:
            The 'count' column as a pandas Series indexed by date.  When the
            input has no rows, the empty Series is returned unchanged.
        """
        names = ['date', 'count']

        data = read_tsv(input_file, names)
        data.date = pandas.to_datetime(data.date)
        data = data.set_index('date')

        # With no rows there is no earliest/latest date to span.
        if data.empty:
            return data['count']

        # Ensure a continuous date range.  Index.min()/max() skip NaT entries,
        # whereas the builtin min()/max() compare element by element and can
        # yield NaT depending on row order.
        date_range = pandas.date_range(data.index.min(), data.index.max())
        data = data.reindex(date_range)

        # Return as a Series
        return data['count']

Example #7

0

Show file

    def read_date_count_tsv(self, input_file):
        """
        Read TSV containing dates and corresponding counts into a pandas Series.

        Every day between the first and last date present in the input gets
        an entry in the result; days with no input row are NaN.

        NANs are not filled in here, as more than one filling strategy is
        used with such files.

        Args:
            input_file: file object passed to ``read_tsv``; its columns are
                read as 'date' and 'count'.

        Returns:
            A pandas Series of counts indexed by date.  An input with no rows
            yields the empty Series rather than an error.
        """
        names = ['date', 'count']

        data = read_tsv(input_file, names)
        data.date = pandas.to_datetime(data.date)
        data = data.set_index('date')

        # An empty table has no date bounds, so skip the reindexing step.
        if data.empty:
            return data['count']

        # Ensure a continuous date range.  Use the index's own min()/max():
        # they ignore NaT, while the builtins can return NaT when one happens
        # to come first in the data.
        first_day = data.index.min()
        last_day = data.index.max()
        data = data.reindex(pandas.date_range(first_day, last_day))

        # Return as a Series
        return data['count']

Example #8

0

Show file

File: enrollments.py Project: e/edx-analytics-pipeline

    def read_statuses(self):
        """
        Load the optional course status input as a pandas DataFrame.

        Returns:
            Pandas dataframe with one row per course_id and
            a column for the status. The status should
            be either "past", "current" or "new".  The index
            for the DataFrame is the course_id.

            Returns None if no statuses input was specified.
        """
        # Guard clause: nothing to read when no statuses input is configured.
        if not self.input().get('statuses'):
            return None

        with self.input()['statuses'].open('r') as status_file:
            statuses = read_tsv(status_file, ['course_id', 'status'])
        return statuses.set_index('course_id')

Example #9

0

Show file

    def read_statuses(self):
        """
        Read course statuses, keyed by course_id, into a pandas DataFrame.

        Returns:
            None when no statuses input was specified.  Otherwise a pandas
            dataframe indexed by course_id with one row per course and a
            'status' column; the status should be either "past", "current"
            or "new".
        """
        if not self.input().get('statuses'):
            return None

        columns = ['course_id', 'status']
        with self.input()['statuses'].open('r') as status_file:
            table = read_tsv(status_file, columns)

        # Index by course so callers can look statuses up per course_id.
        indexed = table.set_index('course_id')
        return indexed

Example #10

0

Show file

File: total_enrollments.py Project: EDUlib/edx-analytics-pipeline

    def read_user_registrations(self):
        """
        Read history of user registrations.

        Parses the 'registrations' input as a two-column TSV (a date and a
        per-date value), accumulates the values in date order, and aligns
        the running total to the requested reporting window.

        Returns:
            Pandas DataFrame indexed by date with a single column
            representing the number of users who have accounts at
            the end of that day.  The index runs from ``self.start_date``
            through ``self.date``.
        """
        with self.input()['registrations'].open('r') as registrations_file:
            # The column name here will be converted into a row name later when
            # the data is transposed.
            registration_changes = read_tsv(registrations_file, ['date', self.ROW_LABELS['registrations']])
            registration_changes.date = pandas.to_datetime(registration_changes.date)
            registration_changes.set_index(['date'], inplace=True)

            # The input order is not guaranteed by anything visible here.
            # Sorting keeps the running total chronological and gives
            # reindex(method='ffill') the monotonic index it requires.
            registration_changes = registration_changes.sort_index()
            cumulative_registrations = registration_changes.cumsum()

            # Restrict the index to only the date range requested
            date_range = pandas.date_range(self.start_date, self.date)
            # Forward fill gaps because those dates have no change in registrations
            cumulative_registrations = cumulative_registrations.reindex(date_range, method='ffill')

        return cumulative_registrations