Python write_to_tsv_fileの例、nessie.lib.util.write_to_tsv_file Pythonの例

コード例 #1

0

ファイルを表示

ファイル: create_edl_schema.py プロジェクト: pfarestveit/nessie

def _process_demographics_feeds(app_arg, chunk):
    """Build a TSV tempfile of demographics feeds for one chunk of students.

    Args:
        app_arg: application object; supplies ``app_context()`` and ``logger``.
        chunk: mapping of SID to an iterable of row mappings. Each row is read
            by the keys ``gender``, ``visa_type``, ``visa_status``,
            ``citizenship_country``, ``ethnic_group`` and ``ethnicity``.

    Returns:
        The open ``TemporaryFile`` that ``[sid, JSON feed]`` entries were
        written to via ``write_to_tsv_file``. The caller owns the file and is
        responsible for closing it. If building fails, the file is closed
        here before the exception propagates.
    """
    with app_arg.app_context():
        app_arg.logger.debug(f'{current_thread().name} will process demographics feeds chunk ({len(chunk)} records)')
        feeds = TemporaryFile()
        try:
            for sid, rows in chunk.items():
                gender = None
                visa = None
                nationalities = set()
                ethnic_map = {}
                for r in rows:
                    # TODO: Prefer gender identity once available (NS-1073)
                    # The last row seen wins for gender; likewise the last row
                    # carrying a visa type wins for visa.
                    gender = r['gender']
                    if r['visa_type']:
                        visa = {'status': r['visa_status'], 'type': r['visa_type']}
                    if r['citizenship_country']:
                        nationalities.add(r['citizenship_country'])
                    if r['ethnic_group']:
                        ethnic_map.setdefault(r['ethnic_group'], set()).add(r['ethnicity'])
                feed = {
                    # .get() so that a missing or unmapped gender code yields
                    # None instead of aborting the whole chunk with KeyError.
                    'gender': GENDER_CODE_MAP.get(gender),
                    'ethnicities': _simplified_ethnicities(ethnic_map),
                    'nationalities': sorted(nationalities),
                    'underrepresented': not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                    'visa': visa,
                }
                write_to_tsv_file(feeds, [sid, json.dumps(feed)])
        except Exception:
            # Nobody else holds a reference yet, so release the tempfile here.
            feeds.close()
            raise
        app_arg.logger.debug(f'{current_thread().name} wrote all feeds, returning TSV tempfile')
        return feeds

コード例 #2

0

ファイルを表示

ファイル: student_demographics.py プロジェクト: ets-berkeley-edu/nessie

def add_demographics_rows(sid, feed, feed_files, feed_counts):
    """Write one student's demographics to the ethnicities, demographics and visas TSV files.

    When the ``FEATURE_FLAG_EDL_DEMOGRAPHICS`` config flag is set, ``feed`` is
    used as-is and its ``ethnicities`` are passed through
    ``filter_ethnicities``; otherwise ``feed`` is parsed with
    ``parse_sis_demographics_api`` and ``filtered_ethnicities`` is popped from
    the parsed result.

    Args:
        sid: student id written as the first column of every row.
        feed: demographics feed (already parsed when the EDL flag is on).
        feed_files: mapping with ``ethnicities``, ``demographics`` and
            ``visas`` file targets.
        feed_counts: mapping with the same keys; each entry is incremented by
            the return value of ``write_to_tsv_file``.

    Returns:
        The parsed demographics (possibly falsy, in which case nothing is written).
    """
    edl_enabled = app.config['FEATURE_FLAG_EDL_DEMOGRAPHICS']
    if edl_enabled:
        parsed = feed
    else:
        parsed = parse_sis_demographics_api(feed)

    # Nothing to write for an empty/failed parse; hand it straight back.
    if not parsed:
        return parsed

    if edl_enabled:
        ethnicities = filter_ethnicities(parsed.get('ethnicities', []))
    else:
        ethnicities = parsed.pop('filtered_ethnicities', [])
    for ethnicity in ethnicities:
        feed_counts['ethnicities'] += write_to_tsv_file(feed_files['ethnicities'], [sid, ethnicity])

    feed_counts['demographics'] += write_to_tsv_file(
        feed_files['demographics'],
        [sid, parsed.get('gender'), parsed.get('underrepresented', False)],
    )

    visa_info = parsed.get('visa')
    if visa_info:
        feed_counts['visas'] += write_to_tsv_file(
            feed_files['visas'],
            [sid, visa_info.get('status'), visa_info.get('type')],
        )
    return parsed

コード例 #3

0

ファイルを表示

ファイル: create_edl_schema.py プロジェクト: pfarestveit/nessie

 def generate_academic_plans_feeds(self):
     """Build one academic-plans feed per SID and upload the TSV to staging.

     Rows are fetched from ``student_academic_plan_index`` in the internal
     schema, grouped by ``sid``, turned into a feed by
     ``self.generate_academic_plans_feed`` and written to a temporary file,
     which is uploaded as ``student_academic_plans`` before it is closed.
     """
     app.logger.info('Staging academic plans feeds...')
     plan_rows = redshift.fetch(f'SELECT * FROM {self.internal_schema}.student_academic_plan_index ORDER by sid')
     with TemporaryFile() as tsv_file:
         # groupby only merges adjacent rows; the query orders by sid.
         for sid, group in groupby(plan_rows, key=itemgetter('sid')):
             student_feed = self.generate_academic_plans_feed(list(group))
             write_to_tsv_file(tsv_file, [sid, json.dumps(student_feed)])
         self._upload_file_to_staging('student_academic_plans', tsv_file)

コード例 #4

0

ファイルを表示

    def build_target_feeds(self, app_arg, source_file):
        """Build a TSV tempfile of merged profile feeds from pickled feed components.

        Args:
            app_arg: application object; supplies ``app_context()`` and ``logger``.
            source_file: passed to ``self.get_pickled_feeds``, which yields
                ``(sid, feed_components, index)`` tuples.

        Returns:
            The open ``TemporaryFile`` the feeds were written to. The caller
            owns it and is responsible for closing it. If building fails, the
            file is closed here before the exception propagates.
        """
        with app_arg.app_context():
            app_arg.logger.debug(
                f'{current_thread().name} will process profile feeds chunk')
            target_file = TemporaryFile()
            index = None

            try:
                for sid, feed_components, index in self.get_pickled_feeds(
                        source_file):
                    # We may see results from multiple academic careers. We prefer a UGRD career if present; otherwise we look
                    # for a non-Law career with the most recent entering term.
                    plans = feed_components.get('plans', [])
                    career_code = None
                    career_admit_term = ''
                    for plan_row in plans:
                        if plan_row['academic_career_cd'] == 'UGRD':
                            career_code = 'UGRD'
                            break
                        elif plan_row['academic_career_cd'] in {
                                'UCBX', 'GRAD'
                        } and plan_row['current_admit_term'] > career_admit_term:
                            career_code = plan_row['academic_career_cd']
                            career_admit_term = plan_row['current_admit_term']

                    feed = {
                        'identifiers': [
                            {
                                'id': sid,
                                'type': 'student-id',
                            },
                        ],
                    }

                    # Each helper receives the feed dict plus one component.
                    self._merge_profile(feed, feed_components.get('profile'))
                    self._merge_holds(feed, feed_components.get('holds'))
                    self._merge_academic_status(
                        feed, feed_components.get('profile_terms'), career_code)
                    self._merge_plans(feed, plans, career_code)
                    self._merge_degrees(feed, feed_components.get('degrees'))

                    write_to_tsv_file(target_file, [sid, json.dumps(feed)])
            except Exception:
                # Nobody else holds a reference yet, so release the tempfile here.
                target_file.close()
                raise

            # index stays None only when get_pickled_feeds yielded nothing.
            if index is None:
                app_arg.logger.warning(
                    f'{current_thread().name} wrote no profile feeds, returning empty tempfile'
                )
            else:
                app_arg.logger.debug(
                    f'{current_thread().name} wrote {index + 1} profile feeds, returning TSV tempfile'
                )
            return target_file

コード例 #5

0

ファイルを表示

ファイル: create_edl_schema.py プロジェクト: johncrossman/nessie

    def generate_demographics_feeds(self):
        """Build one demographics feed per SID and upload the TSV to staging.

        Demographics rows are fetched page by page through
        ``get_demographics`` (10000 rows per call) until a short page is
        returned, combined into a single DataFrame and grouped by ``sid``.
        Each student's feed is written to a temporary file, which is uploaded
        as ``student_demographics`` before it is closed.

        When a student has several distinct genders or visas only the first
        is used; both situations are logged as warnings.
        """
        app.logger.info('Staging demographics feeds...')
        df_chunks = []
        limit = 10000
        offset = 0
        while True:
            rows = get_demographics(limit=limit, offset=offset)
            df_chunks.append(pd.DataFrame(rows))
            # A short page means there is nothing further to fetch.
            if len(rows) < limit:
                break
            offset += limit
        df = pd.concat(df_chunks, ignore_index=True)

        with TemporaryFile() as feeds:
            sids_with_multiple_visas = []
            for sid, student in df.groupby('sid'):
                # TODO: Prefer gender identity once available (NS-1073)
                gender = student['gender'].drop_duplicates().dropna()
                if gender.count() > 1:
                    app.logger.warning(
                        f'Found more than one gender for SID {sid}; selecting only the first.'
                    )
                # All gender values may be null; avoid IndexError on .iat[0].
                gender_code = None if gender.empty else gender.iat[0]
                ethnic_map = student.groupby(
                    ['ethnic_group'])['ethnicity'].agg(set).to_dict()
                ethnicities = self.simplified_ethnicities(ethnic_map)
                nationalities = student['citizenship_country'].dropna().unique(
                ).tolist()
                # 'records' is the full orient name; the short alias 'r' is
                # rejected by pandas >= 2.0.
                visa = student[['visa_type', 'visa_status'
                                ]].drop_duplicates().to_dict('records')
                if len(visa) > 1:
                    sids_with_multiple_visas.append(sid)
                feed = {
                    # .get() so an unmapped gender code yields None rather
                    # than aborting the whole run with KeyError.
                    'gender': GENDER_CODE_MAP.get(gender_code),
                    'ethnicities': ethnicities,
                    'nationalities': nationalities,
                    'underrepresented':
                    not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                    'visa': visa[0],
                }
                write_to_tsv_file(feeds, [sid, json.dumps(feed)])
            if sids_with_multiple_visas:
                # SIDs may not be strings (group keys come from the DataFrame),
                # so convert before joining.
                sid_list = ', '.join(str(s) for s in sids_with_multiple_visas)
                app.logger.warning(
                    f'SIDs with two or more visas: {sid_list}'
                )
            self._upload_file_to_staging('student_demographics', feeds)

コード例 #6

0

ファイルを表示

    def build_target_feeds(self, app_arg, source_file):
        """Build a TSV tempfile of demographics feeds from pickled source rows.

        Args:
            app_arg: application object; supplies ``app_context()`` and ``logger``.
            source_file: passed to ``self.get_pickled_feeds``, which yields
                ``(sid, rows, index)`` tuples. Each row is read by the keys
                ``gender``, ``visa_type``, ``visa_status``,
                ``citizenship_country``, ``ethnic_group`` and ``ethnicity``.

        Returns:
            The open ``TemporaryFile`` the feeds were written to. The caller
            owns it and is responsible for closing it. If building fails, the
            file is closed here before the exception propagates.
        """
        with app_arg.app_context():
            app_arg.logger.debug(
                f'{current_thread().name} will process demographics feeds chunk'
            )
            target_file = TemporaryFile()
            index = None

            try:
                for sid, rows, index in self.get_pickled_feeds(source_file):
                    gender = None
                    visa = None
                    nationalities = set()
                    ethnic_map = {}
                    for r in rows:
                        # The last row seen wins for gender; likewise the last
                        # row carrying a visa type wins for visa.
                        gender = r['gender']
                        if r['visa_type']:
                            visa = {
                                'status': r['visa_status'],
                                'type': r['visa_type']
                            }
                        if r['citizenship_country']:
                            nationalities.add(r['citizenship_country'])
                        if r['ethnic_group']:
                            ethnic_map.setdefault(r['ethnic_group'],
                                                  set()).add(r['ethnicity'])
                    feed = {
                        'gender':
                        GENDER_CODE_MAP.get(gender, None),
                        'ethnicities':
                        self._simplified_ethnicities(ethnic_map),
                        'nationalities':
                        sorted(nationalities),
                        'underrepresented':
                        not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                        'visa':
                        visa,
                    }
                    write_to_tsv_file(target_file, [sid, json.dumps(feed)])
            except Exception:
                # Nobody else holds a reference yet, so release the tempfile here.
                target_file.close()
                raise

            # index stays None only when get_pickled_feeds yielded nothing.
            if index is None:
                app_arg.logger.warning(
                    f'{current_thread().name} wrote no demographics feeds, returning empty tempfile'
                )
            else:
                app_arg.logger.debug(
                    f'{current_thread().name} wrote {index + 1} demographics feeds, returning TSV tempfile'
                )
            return target_file

コード例 #7

0

ファイルを表示

ファイル: create_edl_schema.py プロジェクト: pfarestveit/nessie

 def generate_degree_progress_feeds(self):
     """Build one degree-progress feed per SID and upload the TSV to staging.

     Rows are fetched from ``student_degree_progress_index`` in the internal
     schema and grouped by ``sid``. Each feed carries the ``report_date`` of
     the student's first row (formatted ``YYYY-MM-DD``) and a mapping of
     requirement to its description and status. The temporary file is
     uploaded under the table name before it is closed.
     """
     app.logger.info('Staging degree progress feeds...')
     table = 'student_degree_progress'
     progress_rows = redshift.fetch(f'SELECT * FROM {self.internal_schema}.{table}_index ORDER by sid')
     with TemporaryFile() as tsv_file:
         # groupby only merges adjacent rows; the query orders by sid.
         for sid, group in groupby(progress_rows, key=itemgetter('sid')):
             student_rows = list(group)
             formatted_date = student_rows[0].get('report_date').strftime('%Y-%m-%d')
             requirements = {}
             for row in student_rows:
                 # A repeated requirement key keeps the values of its last row.
                 requirements[row.get('requirement')] = {
                     'name': row.get('requirement_desc'),
                     'status': row.get('status'),
                 }
             student_feed = {'reportDate': formatted_date, 'requirements': requirements}
             write_to_tsv_file(tsv_file, [sid, json.dumps(student_feed)])
         self._upload_file_to_staging(table, tsv_file)

コード例 #8

0

ファイルを表示

ファイル: student_demographics.py プロジェクト: dkawase/nessie

def add_demographics_rows(sid, feed, feed_files, feed_counts):
    """Parse a demographics feed and write its rows to the TSV files.

    Args:
        sid: student id written as the first column of every row.
        feed: raw feed handed to ``parse_sis_demographics_api``.
        feed_files: mapping with ``ethnicities``, ``demographics`` and
            ``visas`` file targets.
        feed_counts: mapping with the same keys; each entry is incremented by
            the return value of ``write_to_tsv_file``.

    Returns:
        The parsed demographics with ``filtered_ethnicities`` popped out, or
        the falsy parse result unchanged (in which case nothing is written).
    """
    parsed = parse_sis_demographics_api(feed)
    # Nothing to write for an empty/failed parse; hand it straight back.
    if not parsed:
        return parsed

    for ethnicity in parsed.pop('filtered_ethnicities', []):
        feed_counts['ethnicities'] += write_to_tsv_file(feed_files['ethnicities'], [sid, ethnicity])

    feed_counts['demographics'] += write_to_tsv_file(
        feed_files['demographics'],
        [sid, parsed.get('gender'), parsed.get('underrepresented', False)],
    )

    visa_info = parsed.get('visa')
    if visa_info:
        feed_counts['visas'] += write_to_tsv_file(
            feed_files['visas'],
            [sid, visa_info.get('status'), visa_info.get('type')],
        )
    return parsed

コード例 #9

0

ファイルを表示

    def build_target_feeds(self, app_arg, source_file):
        """Build a TSV tempfile of last-registration feeds from pickled source rows.

        For every ``(sid, rows, index)`` yielded by ``self.get_pickled_feeds``
        the last registration is looked up with ``self._find_last_registration``;
        students without one are skipped and nothing is written for them.

        Args:
            app_arg: application object; supplies ``app_context()`` and ``logger``.
            source_file: passed to ``self.get_pickled_feeds``.

        Returns:
            The open ``TemporaryFile`` the feeds were written to. The caller
            owns it and is responsible for closing it. If building fails, the
            file is closed here before the exception propagates.
        """
        with app_arg.app_context():
            app_arg.logger.debug(
                f'{current_thread().name} will process registration feeds chunk'
            )
            target_file = TemporaryFile()
            index = None

            try:
                for sid, rows, index in self.get_pickled_feeds(source_file):
                    last_registration = self._find_last_registration(rows)
                    if last_registration:
                        feed = self._generate_feed(last_registration)
                        write_to_tsv_file(target_file, [sid, json.dumps(feed)])
            except Exception:
                # Nobody else holds a reference yet, so release the tempfile here.
                target_file.close()
                raise

            # index stays None only when get_pickled_feeds yielded nothing.
            # NOTE(review): the count logged below is derived from the last
            # index, so it includes students skipped for lacking a registration.
            if index is None:
                app_arg.logger.warning(
                    f'{current_thread().name} wrote no registration feeds, returning empty tempfile'
                )
            else:
                app_arg.logger.debug(
                    f'{current_thread().name} wrote {index + 1} registration feeds, returning TSV tempfile'
                )
            return target_file

コード例 #10

0

ファイルを表示

    def generate_student_profile_feed(self, feed_elements, advisors,
                                      feed_files, feed_counts):
        """Merge one student's feed elements into a profile and write the TSV rows.

        Writes the merged profile to ``student_profiles``. When a SIS profile
        is available it also writes to ``student_profile_index``,
        ``student_majors``, ``student_holds``, ``intended_majors`` and
        ``minors``. Demographics rows are written through
        ``add_demographics_rows`` when a demographics feed is present.

        Args:
            feed_elements: mapping read by the keys ``sid``, ``ldap_uid``,
                ``demographics_feed`` (JSON text), ``first_name``,
                ``last_name``, ``canvas_user_id`` and ``canvas_user_name``;
                also handed to ``parse_merged_sis_profile``.
            advisors: iterable of mappings with the ``advisor_*``, ``program``
                and ``plan`` keys used below.
            feed_files: mapping of feed name to file target.
            feed_counts: mapping of feed name to a counter; each entry is
                incremented by the return value of ``write_to_tsv_file``.

        Returns:
            The merged profile dict, or ``None`` when the student has no
            ``ldap_uid`` (in which case nothing is written).
        """
        sid = feed_elements['sid']
        uid = feed_elements['ldap_uid']
        if not uid:
            return
        sis_profile = parse_merged_sis_profile(feed_elements)
        demographics = feed_elements.get('demographics_feed') and json.loads(
            feed_elements.get('demographics_feed'))
        if demographics:
            demographics = add_demographics_rows(sid, demographics, feed_files,
                                                 feed_counts)

        advisor_feed = [
            {
                'uid': a['advisor_uid'],
                'sid': a['advisor_sid'],
                'firstName': a['advisor_first_name'],
                'lastName': a['advisor_last_name'],
                'email': (a['advisor_campus_email'] or a['advisor_email']),
                'role': a['advisor_role'],
                'title': a['advisor_title'],
                'program': a['program'],
                'plan': a['plan'],
            }
            for a in advisors
        ]

        first_name = feed_elements.get('first_name')
        last_name = feed_elements.get('last_name')
        merged_profile = {
            'sid': sid,
            'uid': uid,
            'firstName': first_name,
            'lastName': last_name,
            # Either name may be None (the index row below already allows for
            # that); substitute '' so str.join does not raise TypeError.
            'name': ' '.join([first_name or '', last_name or '']),
            'canvasUserId': feed_elements.get('canvas_user_id'),
            'canvasUserName': feed_elements.get('canvas_user_name'),
            'sisProfile': sis_profile,
            'demographics': demographics,
            'advisors': advisor_feed,
        }
        feed_counts['student_profiles'] += write_to_tsv_file(
            feed_files['student_profiles'],
            [sid, json.dumps(merged_profile)])

        if sis_profile:
            # `or {}` / `or []` rather than a .get() default: a key that is
            # present with a None value would otherwise raise on the chained
            # .get() or on iteration.
            level = str((sis_profile.get('level') or {}).get('code') or '')
            gpa = str(sis_profile.get('cumulativeGPA') or '')
            units = str(sis_profile.get('cumulativeUnits') or '')
            transfer = str(sis_profile.get('transfer') or False)
            expected_grad_term = str(
                (sis_profile.get('expectedGraduationTerm') or {}).get('id')
                or '')
            terms_in_attendance = str(
                sis_profile.get('termsInAttendance') or '')

            feed_counts['student_profile_index'] += write_to_tsv_file(
                feed_files['student_profile_index'],
                [
                    sid, uid, first_name or '', last_name or '', level, gpa,
                    units, transfer, expected_grad_term, terms_in_attendance
                ],
            )

            # Only plans with status 'Active' are written as majors/minors.
            for plan in sis_profile.get('plans') or []:
                if plan.get('status') == 'Active':
                    feed_counts['student_majors'] += write_to_tsv_file(
                        feed_files['student_majors'],
                        [
                            sid,
                            plan.get('program', None),
                            plan.get('description', None)
                        ],
                    )
            for hold in sis_profile.get('holds') or []:
                feed_counts['student_holds'] += write_to_tsv_file(
                    feed_files['student_holds'], [sid, json.dumps(hold)])
            for intended_major in sis_profile.get('intendedMajors') or []:
                feed_counts['intended_majors'] += write_to_tsv_file(
                    feed_files['intended_majors'],
                    [sid, intended_major.get('description', None)])
            for plan in sis_profile.get('plansMinor') or []:
                if plan.get('status') == 'Active':
                    feed_counts['minors'] += write_to_tsv_file(
                        feed_files['minors'],
                        [sid, plan.get('description', None)])

        return merged_profile