def read_user_registrations(self):
    """
    Read history of user registrations.

    The 'registrations' input is parsed by read_tsv into a date column and a
    column of per-date registration changes. The changes are accumulated in
    chronological order into a running total, which is then aligned to every
    day between ``self.start_date`` and ``self.date``.

    Returns:
        Pandas DataFrame indexed by date with a single column representing
        the number of users who have accounts at the end of that day. Days
        in the requested range that precede the first recorded change have
        nothing to carry forward and are left as NaN.
    """
    with self.input()['registrations'].open('r') as registrations_file:
        # The column name here will be converted in to a row name later when
        # the data is transposed.
        column_names = ['date', self.ROW_LABELS['registrations']]
        registration_changes = read_tsv(registrations_file, column_names)

        registration_changes['date'] = pandas.to_datetime(
            registration_changes['date'])

        # Order the rows chronologically before accumulating: a running total
        # is only meaningful in date order, and reindex(method='ffill') below
        # requires a monotonic index.
        registration_changes = (
            registration_changes.set_index(['date']).sort_index())
        cumulative_registrations = registration_changes.cumsum()

        # Restrict the index to only the date range requested
        date_range = pandas.date_range(self.start_date, self.date)

        # Forward fill gaps because those dates have no change in registrations
        return cumulative_registrations.reindex(date_range, method='ffill')
def read_course_date_count_tsv(self, input_file):
    """
    Load a TSV with fixed course_id / date / count columns into a DataFrame.

    Args:
        input_file: handed unchanged to read_tsv.

    Returns:
        The object produced by read_tsv, with its 'date' column converted by
        pandas.to_datetime.
    """
    # No encoding is assumed, so course_id comes through as a plain string.
    frame = read_tsv(input_file, ['course_id', 'date', 'count'])
    frame['date'] = pandas.to_datetime(frame['date'])
    return frame
def read_course_blacklist(self):
    """
    Collect the course_ids listed in the optional blacklist input.

    The blacklist file is expected to hold a single course ID per line.

    Returns:
        A set of course_ids that should not be included in aggregates; an
        empty set when no blacklist input was specified.
    """
    # Guard clause: without a blacklist input there is nothing to exclude.
    if not self.input().get('blacklist'):
        return set()

    with self.input()['blacklist'].open('r') as blacklist_file:
        excluded = read_tsv(blacklist_file, ['course_id'])
        return set(excluded['course_id'])
def read_date_count_tsv(self, input_file):
    """
    Read TSV containing dates and corresponding counts into a pandas Series.

    The result is indexed by a continuous daily date range spanning the
    earliest to the latest date found in the file. Dates absent from the file
    show up as NaN. NaNs are not filled in here, as more than one filling
    strategy is used with such files.

    Args:
        input_file: handed unchanged to read_tsv; presumably an open
            file-like object with a date column and a count column.

    Returns:
        The 'count' column as a pandas Series indexed by date. Input with no
        rows yields an empty Series.
    """
    names = ['date', 'count']
    data = read_tsv(input_file, names)
    data['date'] = pandas.to_datetime(data['date'])
    data = data.set_index('date')

    # With no rows there is no earliest or latest date to build a range
    # from, so hand back the (empty) column instead of failing.
    if data.empty:
        return data['count']

    # Ensure a continuous date range. Index.min()/max() are used rather than
    # the built-ins so the bounds are computed by pandas, which skips NaT
    # instead of giving an order-dependent answer.
    date_range = pandas.date_range(data.index.min(), data.index.max())
    data = data.reindex(date_range)

    # Return as a Series
    return data['count']
def read_statuses(self):
    """
    Load the optional course status input.

    Returns:
        Pandas dataframe with one row per course_id and a column for the
        status, indexed by course_id. The status should be either "past",
        "current" or "new". Returns None if no statuses input was specified.
    """
    # Guard clause: the statuses input is optional.
    if not self.input().get('statuses'):
        return None

    with self.input()['statuses'].open('r') as status_file:
        statuses = read_tsv(status_file, ['course_id', 'status'])
        return statuses.set_index('course_id')
def read_user_registrations(self):
    """
    Read history of user registrations.

    The 'registrations' input is parsed by read_tsv into a date column and a
    column of per-date registration changes. The changes are accumulated in
    chronological order into a running total, which is then aligned to every
    day between ``self.start_date`` and ``self.date``.

    Returns:
        Pandas DataFrame indexed by date with a single column representing
        the number of users who have accounts at the end of that day. Days
        in the requested range that precede the first recorded change have
        nothing to carry forward and are left as NaN.
    """
    with self.input()['registrations'].open('r') as registrations_file:
        # The column name here will be converted in to a row name later when
        # the data is transposed.
        column_names = ['date', self.ROW_LABELS['registrations']]
        registration_changes = read_tsv(registrations_file, column_names)

        registration_changes['date'] = pandas.to_datetime(
            registration_changes['date'])

        # Order the rows chronologically before accumulating: a running total
        # is only meaningful in date order, and reindex(method='ffill') below
        # requires a monotonic index.
        registration_changes = (
            registration_changes.set_index(['date']).sort_index())
        cumulative_registrations = registration_changes.cumsum()

        # Restrict the index to only the date range requested
        date_range = pandas.date_range(self.start_date, self.date)

        # Forward fill gaps because those dates have no change in registrations
        return cumulative_registrations.reindex(date_range, method='ffill')