def load_company_repositories_events_commits(date: datetime, company: str):
    """Collect push-event commits for a company's repositories on one day.

    Reads staged DTD push events for `company`, and if any exist, fetches
    the per-repository commits via the GitHub REST API and saves them back
    to the staging area as a private push-event-commits dataset.
    """
    events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    schema = DataLake().staging.schemas.push_commits
    if events.empty:
        log.warning(f'No {company} events at {date}')
        return
    with GithubRest(token=Config().github_token) as rest:
        repo_names = events[schema.repo_name].unique()
        commits = get_company_repositories_events_commits(
            repositories_names=repo_names,
            date=date,
            company=company,
            rest=rest)
        commits_frame = pd.DataFrame(commits)
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=commits_frame,
            company_name=company,
            date=date)
def load_osci_general_reports_to_bq(date: datetime.datetime):
    """Load the OSCI general ranking report for `date` into BigQuery.

    Fix: the date column is stored as `date.date()` (a DATE value), matching
    every other BigQuery loader in this module (daily, MTD/YTD, contributors)
    — the original assigned the full datetime, inconsistent with the
    sibling loaders and the DATE-typed column they target.
    """
    report = OSCIGeneralRanking(date=date)
    table = BigQueryOSCIGeneralRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    report_df = report.read()
    report_df = report_df[PublicSchemas.osci_general_report.required]
    # Positions are 0-based in the stored report; publish them 1-based.
    report_df[table.Columns.position] += 1
    report_df = report_df.rename(columns=table.mapping)
    # DATE (not DATETIME) value, consistent with the other loaders.
    report_df[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=report_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
def load_companies_contrib_repos_to_bq(
        date: datetime.datetime) -> bigquery.table.Table:
    """Load companies contributors repositories to BigQuery for a given day"""
    # Short alias: the table-descriptor class name is very long.
    table = BigQueryCompaniesContributorsRepositoriesCommitsColumns
    df = CompaniesContributorsRepository(date).read()
    df = df.rename(columns=table.mapping)
    df[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=df,
                                               table_id=table.table_id,
                                               schema=table.schema)
def get_contributors_repositories_change(date: datetime, company: str):
    """Compute contributors/repos that are new vs the previous period.

    Compares the company's YTD contributors-repos ranking at `date` with the
    one at the previous date, then saves the set differences as the
    NewContributors and NewRepos reports.
    """
    ranking = ContributorsReposYTD(date=date, company=company)
    current_df = ranking.read()
    previous_df = ContributorsReposYTD(date=get_previous_date(date),
                                       company=company).read()

    # Authors present now but absent from the previous period.
    added_authors = (set(current_df[ranking.schema.author])
                     - set(previous_df[ranking.schema.author]))
    new_contributors = NewContributors(date=date, company=company)
    new_contributors.save(df=pd.DataFrame(
        data=added_authors,
        columns=[DataLake().public.schemas.new_contributors.author]))

    # Repositories present now but absent from the previous period.
    added_repos = (set(current_df[ranking.schema.repo])
                   - set(previous_df[ranking.schema.repo]))
    new_repos = NewRepos(date=date, company=company)
    new_repos.save(df=pd.DataFrame(
        data=added_repos,
        columns=[DataLake().public.schemas.new_repos.repo]))
def abnormal_staging_repository_df():
    """Fixture: staging repositories frame with one real and one bogus repo."""
    repos = DataLake().staging.schemas.repositories
    rows = [
        ("epam/OSCI", "Python", "gpl-3.0", "2021-01-01"),
        ("not_exist/REPOSITORY", "Python", "gpl-3.0", "2021-01-01"),
    ]
    return pd.DataFrame([
        {
            repos.name: name,
            repos.language: language,
            repos.license: license_,
            repos.downloaded_at: downloaded_at,
        }
        for name, language, license_, downloaded_at in rows
    ])
def no_match_license_raw_push_event_commit_df():
    """Fixture: one raw push-event commit whose repo has no license match."""
    commits = DataLake().staging.schemas.push_commits
    row = {
        commits.event_id: "222222",
        commits.event_created_at: "2021-01-01 00:15:22+00:00",
        commits.actor_login: "******",
        commits.repo_name: "test/TEST",
        commits.org_name: None,
        commits.sha: "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        commits.author_name: "User Name",
        commits.author_email: "*****@*****.**",
        commits.company: "EPAM",
    }
    return pd.DataFrame([row])
def load_osci_daily_ranking_to_bq(date: datetime.datetime):
    """Load Daily Change ranking to Big Query"""
    report = OSCIChangeRankingDTD(date=date)
    table = BigQueryOSCIDailyChangeRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    # Turn the index into the position column, then keep required columns.
    ranking = report.read().reset_index()
    ranking = ranking.rename(columns={'index': table.Columns.position})
    ranking = ranking[PublicSchemas.osci_ranking_schema.required]
    # Positions are published 1-based.
    ranking[table.Columns.position] += 1
    ranking = ranking.rename(columns=table.mapping)
    ranking[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=ranking,
                                               table_id=table.table_id,
                                               schema=table.schema)
def load_osci_ranking_to_bq(date: datetime.datetime,
                            date_period: str = DatePeriodType.YTD):
    """Load the MTD or YTD OSCI contributors ranking to Big Query.

    :raises ValueError: if `date_period` is not MTD or YTD.
    """
    if date_period not in (DatePeriodType.MTD, DatePeriodType.YTD):
        raise ValueError(f'Unsupported {date_period}')
    report = OSCIRankingFactory().get_cls(date_period=date_period)(date=date)
    table = date_period_to_table_map[date_period]
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')
    # Keep required columns first, then expose the index as the position.
    ranking = report.read()
    ranking = ranking[PublicSchemas.company_contributors_ranking.required]
    ranking = ranking.reset_index()
    ranking = ranking.rename(columns={'index': table.Columns.position})
    # Positions are published 1-based.
    ranking[table.Columns.position] += 1
    ranking = ranking.rename(columns=table.mapping)
    ranking[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=ranking,
                                               table_id=table.table_id,
                                               schema=table.schema)
def transfer_monthly_change_ranking(date: datetime) -> dict:
    """Build the web OSCI change ranking for `date`, persist it, and return it."""
    ranking = generate_web_osci_change_ranking(date)
    DataLake().web.save_monthly_osci_ranking(ranking=ranking, date=date)
    return ranking
def save(self, df: pd.DataFrame):
    """Persist this report's DataFrame to the public data-lake area."""
    public = DataLake().public
    public.save_report(report_df=df, report_name=self.name, date=self.date)
def read(self) -> pd.DataFrame:
    """Load this report from the public data-lake area as a DataFrame."""
    public = DataLake().public
    return public.get_report(report_name=self.name, date=self.date)
def url(self) -> str:
    """Public URL of this report for its date."""
    public = DataLake().public
    return public.get_report_url(report_name=self.name, date=self.date)
def path(self) -> str:
    """Storage path of this report for its date."""
    public = DataLake().public
    return public.get_report_path(report_name=self.name, date=self.date)
def spark_path(self) -> str:
    """Return the full path to company contributors repository commits."""
    public = DataLake().public
    return public.get_companies_contributors_repository_commits_spark_path(
        self.date)
def staging_push_event_commit_df():
    """Fixture: one fully-enriched staged push-event commit (epam/OSCI)."""
    commits = DataLake().staging.schemas.push_commits
    row = {
        commits.event_id: "1111111",
        commits.event_created_at: "2021-01-01 00:15:22+00:00",
        commits.actor_login: "******",
        commits.repo_name: "epam/OSCI",
        commits.org_name: 'EPAM',
        commits.sha: "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        commits.author_name: "User Name",
        commits.author_email: "*****@*****.**",
        commits.company: "EPAM",
        commits.language: "Python",
        commits.license: "gpl-3.0",
    }
    return pd.DataFrame([row])
def save(self, df: pd.DataFrame):
    """Save pandas DataFrame as file

    :param df: repositories frame to persist to the staging area
    """
    staging = DataLake().staging
    return staging.save_repositories(df, self.date)
def path(self) -> str:
    """Path of the Excel OSCI change report in the public area."""
    public = DataLake().public
    return public.get_osci_change_excel_report_path(
        base_report_name=self.base_name,
        report_dir_name=self.dir_name,
        date=self.to_date)
def left_index():
    """Join key on the commits side: the repository-name column."""
    commits = DataLake().staging.schemas.push_commits
    return commits.repo_name
def read(self) -> pd.DataFrame:
    """Read company contributors repository commits to pandas DataFrame from file."""
    public = DataLake().public
    return public.get_companies_contributors_repository_commits(self.date)
def right_index():
    """Join key on the repositories side: the repository-name column."""
    repos = DataLake().staging.schemas.repositories
    return repos.name
def required_columns():
    """Required columns of the staged push-commits schema."""
    commits = DataLake().staging.schemas.push_commits
    return commits.required
def __init__(self, date_period_type: str = DatePeriodType.YTD):
    """Bind data-lake handles and resolve the report class for the period.

    :param date_period_type: reporting period (defaults to year-to-date)
    """
    self.data_lake = DataLake()
    self.commits_schema = self.data_lake.staging.schemas.push_commits
    self.date_period_type = date_period_type
    factory = self.REPORT_FACTORY()
    self.report_cls: Type[Report] = factory.get_cls(
        date_period=self.date_period_type)
def read_all(self):
    """Read this report for the last day of each month for the company."""
    public = DataLake().public
    return public.get_reports_for_last_days_of_month(report_name=self.name,
                                                     date=self.date,
                                                     company=self.company)
def filter_columns():
    """Columns to filter on after the join: repository license only."""
    repos = DataLake().staging.schemas.repositories
    return [repos.license]
def read(self) -> pd.DataFrame:
    """Read repositories to pandas DataFrame from file."""
    staging = DataLake().staging
    return staging.get_repositories(self.date)
def adjunct_columns():
    """Repository columns carried along by the join: name, language, license."""
    repos = DataLake().staging.schemas.repositories
    return [repos.name, repos.language, repos.license]
def save(self, df: pd.DataFrame):
    """Save pandas DataFrame as file

    :param df: companies contributors repository commits frame
    """
    public = DataLake().public
    return public.save_companies_contributors_repository_commits(df, self.date)
def save(self, df: pd.DataFrame):
    """Serialize the DataFrame and write the bytes to this report's path."""
    payload = self._write(df)
    DataLake().public.write_bytes_to_file(path=self.path, buffer=payload)
def unfiltered_raw_push_events_commit_df():
    """Fixture: two raw push-event commits — one from a repo that should be
    filtered out, one from epam/OSCI."""
    commits = DataLake().staging.schemas.push_commits

    def _row(event_id, repo_name, org_name, sha):
        # Shared fields are constant across both rows; key order matters
        # because it fixes the DataFrame column order.
        return {
            commits.event_id: event_id,
            commits.event_created_at: "2021-01-01 00:15:22+00:00",
            commits.actor_login: "******",
            commits.repo_name: repo_name,
            commits.org_name: org_name,
            commits.sha: sha,
            commits.author_name: "User Name",
            commits.author_email: "*****@*****.**",
            commits.company: "EPAM",
        }

    return pd.DataFrame([
        _row("222222", "unfiltered/REPOSITORY", "EPAM",
             "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a"),
        _row("1111111", "epam/OSCI", 'EPAM',
             "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a"),
    ])
def spark_path(self) -> str:
    """Return the full path to repositories."""
    staging = DataLake().staging
    return staging.get_repositories_spark_path(self.date)