def load_licensed_repositories_to_bq(date: datetime.datetime) -> bigquery.table.Table:
    """Load licensed repositories to BigQuery for a given day"""
    df = DataLake().staging.get_repositories(date)
    return DataLake().big_query.load_dataframe(
        df=df,
        table_id=BigQueryLicensedRepository.table_id,
        schema=BigQueryLicensedRepository.schema)
def staging_repository_df():
    """Sample licensed repository row for the staging layer"""
    return pd.DataFrame([
        {
            DataLake().staging.schemas.repositories.name: "epam/OSCI",
            DataLake().staging.schemas.repositories.language: "Python",
            DataLake().staging.schemas.repositories.license: "gpl-3.0",
            DataLake().staging.schemas.repositories.downloaded_at: "2021-01-01"
        },
    ])
def __init__(self, sheet_name: str, from_date: datetime, to_date: datetime, top_size: int):
    self.writer, self.buffer = DataLake().public.get_excel_writer()
    self.workbook: Workbook = self.writer.book
    self.worksheet: Worksheet = self.workbook.add_worksheet(sheet_name)
    self.from_date = from_date
    self.to_date = to_date
    self.top_size = top_size
    self.superscript_format = self.get_format(self.superscript_format_rule)
def get_daily_active_repositories(date: datetime.datetime) -> pd.DataFrame:
    df = DataLake().staging.get_union_daily_raw_push_events_commits(date=date)
    result_df = df[[LandingSchemas.push_commits.repo_name]].drop_duplicates()
    result_df = result_df[
        result_df.apply(lambda row: not Blacklist().is_blocked_repo_by_account(
            repository_name=row[LandingSchemas.push_commits.repo_name]
        ), axis=1)
    ]
    DataLake().landing.save_repositories(df=result_df, date=date)
    return result_df
def filter_out_unlicensed(date: datetime):
    """Read raw push event commits, filter out unlicensed ones,
    and save the rest adjunct with license and language

    :param date: push events on this day
    """
    log.debug(f'Filter out unlicensed push events commits for date {date:%Y-%m-%d}')
    log.debug(f'Read licensed repos for date {date:%Y-%m-%d}')
    licensed_repos_df = Repositories(date=date).read()
    for company, df in DataLake().staging.get_daily_raw_push_events_commits(date):
        log.debug(f'Filter out unlicensed push events commits for date {date:%Y-%m-%d} for {company}')
        filtered_df = filter_and_adjunct_push_event_commit(
            df,
            licensed_repos_df,
            [DataLake().staging.schemas.repositories.license],
            [
                DataLake().staging.schemas.repositories.name,
                DataLake().staging.schemas.repositories.language,
                DataLake().staging.schemas.repositories.license
            ],
            DataLake().staging.schemas.push_commits.required,
            right_index=DataLake().staging.schemas.repositories.name,
            left_index=DataLake().staging.schemas.push_commits.repo_name)
        if not filtered_df.empty:
            DataLake().staging.save_push_events_commits(
                push_event_commits=filtered_df,
                company_name=company,
                date=date)
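# A minimal sketch, assuming filter_and_adjunct_push_event_commit inner-joins
# the raw commits with the licensed repositories on repository name and keeps
# the required commit columns plus the adjunct repository columns. The name
# suffix and body are illustrative, not the project's actual implementation.
def filter_and_adjunct_push_event_commit_sketch(pec_df: pd.DataFrame,
                                                repos_df: pd.DataFrame,
                                                filter_fields: list,
                                                adjunct_fields: list,
                                                required_fields: list,
                                                right_index: str,
                                                left_index: str) -> pd.DataFrame:
    # Join commits to repositories by repository name
    merged = pec_df.merge(repos_df[adjunct_fields], how='inner',
                          left_on=left_index, right_on=right_index)
    # Drop commits whose repository lacks a value in the filter fields (e.g. license)
    merged = merged.dropna(subset=filter_fields)
    return merged[required_fields + adjunct_fields]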
def load_repositories(date: datetime) -> pd.DataFrame:
    log.debug(f'Load repositories information for {date:%Y-%m-%d}')
    repositories = Repositories(date=date)
    df = pd.DataFrame(data=[], columns=Repositories.schema.required)
    repositories_names = DataLake().landing.get_repositories(date=date)
    if not repositories_names.empty:
        df = _load_repositories(repos_names=repositories_names[
            DataLake().landing.schemas.repositories_names.name])
    repositories.save(df)
    return df
def _execute(self, day: datetime, company: str):
    df = ReposRankingMTD(date=day, company=company).read()
    out_df = filter_projects(
        df=df,
        projects_filter_list=DataLake().staging.load_projects_filter(),
        commits_amount_field=DataLake().public.schemas.repo_commits_ranking.commits,
        repo_name_field=DataLake().public.schemas.repo_commits_ranking.repo)
    DataLake().public.save_report(report_df=out_df,
                                  report_name='projects_activity_MTD',
                                  date=day,
                                  company=company)
def load_push_events_to_bq(date: datetime.datetime, hour: int) -> Dict[str, Any]:
    date = date.replace(hour=hour)
    df = DataLake().staging.get_push_events_commits(
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    job_results = DataLake().big_query.load_dataframe(
        df=df,
        table_id=BigQueryPushEventsCommitsColumns.table_id,
        schema=BigQueryPushEventsCommitsColumns.schema)
    return {
        'num_rows': job_results.num_rows,
        'num_columns': len(job_results.schema),
        'table_id': BigQueryPushEventsCommitsColumns.table_id
    }
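# Hypothetical driver, assuming an orchestrator pushes one day of DTD
# push-event commits to BigQuery hour by hour; the function name is an
# illustrative assumption.
def load_day_to_bq(day: datetime.datetime):
    for hour in range(24):
        stats = load_push_events_to_bq(date=day, hour=hour)
        log.info(f"Loaded {stats['num_rows']} rows into {stats['table_id']}")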
def process_github_daily_push_events(day: datetime.datetime):
    push_events_commits = DataLake().landing.get_daily_push_events_commits(date=day)
    if push_events_commits is not None and not push_events_commits.empty:
        companies_events = process_push_commits(
            push_events_commits,
            email_field=DataLake().landing.schemas.push_commits.author_email,
            company_field=DataLake().staging.schemas.push_commits.company,
            datetime_field=DataLake().landing.schemas.push_commits.event_created_at)
        for company, commits in companies_events:
            log.debug(f'Save company {company}')
            DataLake().staging.save_raw_push_events_commits(
                push_event_commits=commits,
                date=day,
                company_name=company)
def generate_email_body(date: datetime, company=Config().default_company):
    report = OSCIChangeRanking(date=date)
    company_contributors_ranking_schema = DataLake().public.schemas.company_contributors_ranking
    change_ranking = report.read().reset_index()
    change_ranking = change_ranking.rename(
        columns={'index': company_contributors_ranking_schema.position})
    change_ranking[company_contributors_ranking_schema.position] += 1
    change_ranking = __cast_columns_to_int(
        df=change_ranking,
        columns=[
            report.schema.total,
            report.schema.active,
            company_contributors_ranking_schema.position,
            report.schema.total_change,
            report.schema.active_change,
            report.schema.position_change,
        ])
    shift_up = __add_arrows_prefix(
        df=__get_shift_up(
            change_ranking=change_ranking,
            change_position_field=report.schema.position_change),
        column=report.schema.position_change)
    shift_down = __add_arrows_prefix(
        df=__get_shift_down(
            change_ranking=change_ranking,
            change_position_field=report.schema.position_change),
        column=report.schema.position_change)
    company_position = __add_arrows_prefix(
        df=__get_company_neighbors(
            df=change_ranking,
            company=company,
            company_field=report.schema.company,
            rank_field=company_contributors_ranking_schema.position),
        column=report.schema.position_change)
    DataLake().public.save_email(
        email_body=EmailBodyTemplate().render(
            date=date,
            compared_date=get_previous_date(date),
            shift_up=shift_up,
            shift_down=shift_down,
            company=company,
            company_position=company_position,
            solutionshub_osci_change_ranking=OSCIChangeRankingExcel(to_date=date).url,
            osci_reports_urls={
                name: report_cls(date=date).url
                for name, report_cls in OSCI_REPORTS_URLS.items()
            }),
        date=date)
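# Illustrative sketches (not the project's actual private helpers) of what the
# functions used above are assumed to do: pick the biggest climbers by position
# change and render the change with an arrow prefix.
def _get_shift_up_sketch(change_ranking: pd.DataFrame, change_position_field: str) -> pd.DataFrame:
    # Keep the companies with the largest positive position change
    return change_ranking.nlargest(5, change_position_field)

def _add_arrows_prefix_sketch(df: pd.DataFrame, column: str) -> pd.DataFrame:
    # Prefix positive changes with an up arrow, negative ones with a down arrow
    df[column] = df[column].apply(
        lambda v: f'▲{v}' if v > 0 else (f'▼{abs(v)}' if v < 0 else str(v)))
    return df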
def load_companies_contrib_repos_to_bq(date: datetime.datetime) -> bigquery.table.Table:
    """Load companies contributors repositories to BigQuery for a given day"""
    df = CompaniesContributorsRepository(date).read().rename(
        columns=BigQueryCompaniesContributorsRepositoriesCommitsColumns.mapping)
    return DataLake().big_query.load_dataframe(
        df=df,
        table_id=BigQueryCompaniesContributorsRepositoriesCommitsColumns.table_id,
        schema=BigQueryCompaniesContributorsRepositoriesCommitsColumns.schema)
class Repositories:
    schema = DataLake().staging.schemas.repositories

    def __init__(self, date: datetime.datetime):
        self.date = date

    @property
    def path(self) -> str:
        """Return the full path to repositories"""
        return DataLake().staging.get_repositories_path(self.date)

    @property
    def spark_path(self) -> str:
        """Return the full Spark path to repositories"""
        return DataLake().staging.get_repositories_spark_path(self.date)

    def save(self, df: pd.DataFrame):
        """Save pandas DataFrame as file

        :param df: repositories DataFrame to persist
        """
        return DataLake().staging.save_repositories(df, self.date)

    def read(self) -> pd.DataFrame:
        """Read repositories to pandas DataFrame from file"""
        return DataLake().staging.get_repositories(self.date)
class CompaniesContributorsRepository:
    schema = DataLake().public.schemas.company_contributors_repository_commits

    def __init__(self, date: datetime.datetime):
        self.date = date

    @property
    def path(self) -> str:
        """Return the full path to company contributors repository commits"""
        return DataLake().public.get_companies_contributors_repository_commits_path(self.date)

    @property
    def spark_path(self) -> str:
        """Return the full Spark path to company contributors repository commits"""
        return DataLake().public.get_companies_contributors_repository_commits_spark_path(self.date)

    def save(self, df: pd.DataFrame):
        """Save pandas DataFrame as file

        :param df: commits DataFrame to persist
        """
        return DataLake().public.save_companies_contributors_repository_commits(df, self.date)

    def read(self) -> pd.DataFrame:
        """Read company contributors repository commits to pandas DataFrame from file"""
        return DataLake().public.get_companies_contributors_repository_commits(self.date)
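# A hypothetical round trip with the date-partitioned accessors above
# (the sample date is illustrative):
repos = Repositories(date=datetime.datetime(2021, 1, 1))
repos.save(df=staging_repository_df())  # persist under repos.path for that day
same_day_repos = repos.read()           # read the same daily partition back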
def get_github_daily_push_events(day: datetime.datetime):
    with GithubArchiveRest() as rest:
        for hour in range(24):
            day = day.replace(hour=hour)
            log.info(f'Crawl events for {day}')
            push_events_commits = get_hour_push_events_commits(day=day, rest=rest)
            DataLake().landing.save_push_events_commits(
                push_event_commits=push_events_commits,
                date=day)
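# Hypothetical daily pipeline order, assuming the crawl and the company split
# run back to back for the same day:
day = datetime.datetime(2021, 1, 1)
get_github_daily_push_events(day=day)      # landing: crawl 24 hourly archives
process_github_daily_push_events(day=day)  # staging: split commits by company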
def raw_push_events_commit_df():
    """Sample raw push event commit matching a licensed repository (author data masked)"""
    return pd.DataFrame([
        {
            DataLake().staging.schemas.push_commits.event_id: "1111111",
            DataLake().staging.schemas.push_commits.event_created_at: "2021-01-01 00:15:22+00:00",
            DataLake().staging.schemas.push_commits.actor_login: "******",
            DataLake().staging.schemas.push_commits.repo_name: "epam/OSCI",
            DataLake().staging.schemas.push_commits.org_name: "EPAM",
            DataLake().staging.schemas.push_commits.sha: "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
            DataLake().staging.schemas.push_commits.author_name: "User Name",
            DataLake().staging.schemas.push_commits.author_email: "*****@*****.**",
            DataLake().staging.schemas.push_commits.company: "EPAM"
        },
    ])
def load_osci_general_reports_to_bq(date: datetime.datetime):
    report = OSCIGeneralRanking(date=date)
    table = BigQueryOSCIGeneralRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    report_df = report.read()
    report_df = report_df[PublicSchemas.osci_general_report.required]
    report_df[table.Columns.position] += 1
    report_df = report_df.rename(columns=table.mapping)
    report_df[table.Columns.date] = date
    return DataLake().big_query.load_dataframe(df=report_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
def load_company_repositories_events_commits(date: datetime, company: str):
    events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    schema = DataLake().staging.schemas.push_commits
    if events.empty:
        log.warning(f'No {company} events at {date}')
        return
    with GithubRest(token=Config().github_token) as rest:
        company_commits = get_company_repositories_events_commits(
            repositories_names=events[schema.repo_name].unique(),
            date=date,
            company=company,
            rest=rest)
        company_commits_df = pd.DataFrame(company_commits)
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=company_commits_df,
            company_name=company,
            date=date)
def get_contributors_repositories_change(date: datetime, company: str):
    ranking = ContributorsReposYTD(date=date, company=company)
    ranking_df = ranking.read()
    compared_ranking = ContributorsReposYTD(date=get_previous_date(date), company=company)
    compared_ranking_df = compared_ranking.read()

    new_contributors = NewContributors(date=date, company=company)
    new_contributors_df = pd.DataFrame(
        data=set(ranking_df[ranking.schema.author]) - set(compared_ranking_df[ranking.schema.author]),
        columns=[DataLake().public.schemas.new_contributors.author])
    new_contributors.save(df=new_contributors_df)

    new_repos = NewRepos(date=date, company=company)
    new_repos_df = pd.DataFrame(
        data=set(ranking_df[ranking.schema.repo]) - set(compared_ranking_df[ranking.schema.repo]),
        columns=[DataLake().public.schemas.new_repos.repo])
    new_repos.save(df=new_repos_df)
def no_match_license_raw_push_event_commit_df():
    """Sample raw push event commit with no matching licensed repository (author data masked)"""
    return pd.DataFrame([{
        DataLake().staging.schemas.push_commits.event_id: "222222",
        DataLake().staging.schemas.push_commits.event_created_at: "2021-01-01 00:15:22+00:00",
        DataLake().staging.schemas.push_commits.actor_login: "******",
        DataLake().staging.schemas.push_commits.repo_name: "test/TEST",
        DataLake().staging.schemas.push_commits.org_name: None,
        DataLake().staging.schemas.push_commits.sha: "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        DataLake().staging.schemas.push_commits.author_name: "User Name",
        DataLake().staging.schemas.push_commits.author_email: "*****@*****.**",
        DataLake().staging.schemas.push_commits.company: "EPAM"
    }])
def load_osci_daily_ranking_to_bq(date: datetime.datetime):
    """Load Daily Change ranking to Big Query"""
    report = OSCIChangeRankingDTD(date=date)
    table = BigQueryOSCIDailyChangeRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    report_df = report.read()
    report_df = report_df.reset_index().rename(
        columns={'index': table.Columns.position})
    report_df = report_df[PublicSchemas.osci_ranking_schema.required]
    report_df[table.Columns.position] += 1
    report_df = report_df.rename(columns=table.mapping)
    report_df[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=report_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
def load_osci_ranking_to_bq(date: datetime.datetime, date_period: str = DatePeriodType.YTD):
    if date_period not in (DatePeriodType.MTD, DatePeriodType.YTD):
        raise ValueError(f'Unsupported date period {date_period}')
    report = OSCIRankingFactory().get_cls(date_period=date_period)(date=date)
    table = date_period_to_table_map[date_period]
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    report_df = report.read()
    report_df = report_df[PublicSchemas.company_contributors_ranking.required]
    report_df = report_df.reset_index().rename(
        columns={'index': table.Columns.position})
    report_df[table.Columns.position] += 1
    report_df = report_df.rename(columns=table.mapping)
    report_df[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=report_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
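# Hypothetical call sites: this loader accepts only MTD and YTD periods,
# while the daily (DTD) ranking goes through load_osci_daily_ranking_to_bq.
load_osci_ranking_to_bq(date=datetime.datetime(2021, 1, 1), date_period=DatePeriodType.MTD)
load_osci_ranking_to_bq(date=datetime.datetime(2021, 1, 1))  # defaults to YTD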
def url(self) -> str:
    return DataLake().public.get_report_url(report_name=self.name, date=self.date)
def transfer_monthly_change_ranking(date: datetime) -> dict:
    web_ranking = generate_web_osci_change_ranking(date)
    DataLake().web.save_monthly_osci_ranking(ranking=web_ranking, date=date)
    return web_ranking
def read(self) -> pd.DataFrame:
    return DataLake().public.get_report(report_name=self.name, date=self.date)
def path(self) -> str:
    return DataLake().public.get_report_path(report_name=self.name, date=self.date)
def save(self, df: pd.DataFrame):
    DataLake().public.save_report(report_df=df, report_name=self.name, date=self.date)
def adjunct_columns():
    """Repository columns adjunct to push event commits during license filtering"""
    return [
        DataLake().staging.schemas.repositories.name,
        DataLake().staging.schemas.repositories.language,
        DataLake().staging.schemas.repositories.license
    ]
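# A minimal test sketch, assuming the frames above are pytest fixtures; the
# test name and the expected behaviour (the unmatched "test/TEST" commit is
# filtered out) are illustrative assumptions, not the project's actual tests.
def test_filter_and_adjunct_push_event_commit(raw_push_events_commit_df,
                                              no_match_license_raw_push_event_commit_df,
                                              staging_repository_df,
                                              adjunct_columns):
    pec_df = pd.concat([raw_push_events_commit_df,
                        no_match_license_raw_push_event_commit_df])
    filtered_df = filter_and_adjunct_push_event_commit(
        pec_df,
        staging_repository_df,
        [DataLake().staging.schemas.repositories.license],
        adjunct_columns,
        DataLake().staging.schemas.push_commits.required,
        right_index=DataLake().staging.schemas.repositories.name,
        left_index=DataLake().staging.schemas.push_commits.repo_name)
    # Only the commit against the licensed "epam/OSCI" repository should remain
    assert len(filtered_df) == 1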