Exemple #1
0
def load_company_repositories_events_commits(date: datetime, company: str):
    """Collect and save the commits behind a company's push events for one day.

    Reads the staged push-event commits of ``company`` for ``date`` (the same
    day is used as both range bounds, with the DTD period type), passes the
    distinct repository names found there to
    ``get_company_repositories_events_commits`` together with a ``GithubRest``
    client, and saves the resulting rows with
    ``save_private_push_events_commits``.

    :param date: day to process
    :param company: company name used to select the push events
    :return: ``None``; when there are no events only a warning is logged
    """
    push_events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD,
    )
    commits_schema = DataLake().staging.schemas.push_commits

    # Guard clause: without events there is nothing to fetch or save.
    if push_events.empty:
        log.warning(f'No {company} events at {date}')
        return

    # Everything below stays inside the client's context, including the save.
    with GithubRest(token=Config().github_token) as rest:
        repo_names = push_events[commits_schema.repo_name].unique()
        commits = get_company_repositories_events_commits(
            repositories_names=repo_names,
            date=date,
            company=company,
            rest=rest,
        )
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=pd.DataFrame(commits),
            company_name=company,
            date=date,
        )
Exemple #2
0
def load_osci_general_reports_to_bq(date: datetime.datetime):
    """Load the OSCI general ranking report for ``date`` into BigQuery.

    The report is reduced to the required columns of
    ``PublicSchemas.osci_general_report``, its position column is increased
    by one, the columns are renamed through the table mapping and a date
    column is added before the frame is handed to ``load_dataframe``.

    :param date: report date; also written into the table's date column
    :return: whatever ``DataLake().big_query.load_dataframe`` returns
    """
    report = OSCIGeneralRanking(date=date)
    table = BigQueryOSCIGeneralRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    ranking = report.read()[PublicSchemas.osci_general_report.required]
    # Positions are shifted by one (presumably 0-based -> 1-based; confirm).
    ranking[table.Columns.position] += 1
    ranking = ranking.rename(columns=table.mapping)
    # NOTE(review): the full datetime is stored here, while the sibling
    # loaders store ``date.date()`` - confirm against the table schema.
    ranking[table.Columns.date] = date

    big_query = DataLake().big_query
    return big_query.load_dataframe(
        df=ranking,
        table_id=table.table_id,
        schema=table.schema,
    )
def load_companies_contrib_repos_to_bq(
        date: datetime.datetime) -> bigquery.table.Table:
    """Push the companies/contributors/repositories commits of a day to BigQuery.

    Reads ``CompaniesContributorsRepository(date)``, renames its columns with
    the BigQuery table mapping, stamps every row with ``date.date()`` and
    loads the frame into the table.

    :param date: day whose report is loaded
    :return: result of ``DataLake().big_query.load_dataframe``
    """
    # Short alias for the table description; purely a naming convenience.
    table = BigQueryCompaniesContributorsRepositoriesCommitsColumns

    commits_df = CompaniesContributorsRepository(date).read()
    commits_df = commits_df.rename(columns=table.mapping)
    commits_df[table.Columns.date] = date.date()

    return DataLake().big_query.load_dataframe(
        df=commits_df,
        table_id=table.table_id,
        schema=table.schema,
    )
def _first_seen_difference(current, previous) -> list:
    """Return the unique values of ``current`` missing from ``previous``, in first-seen order."""
    known = set(previous)
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so the
    # result no longer depends on set iteration order.
    return [value for value in dict.fromkeys(current) if value not in known]


def get_contributors_repositories_change(date: datetime, company: str):
    """Save the contributors and repositories that are new since the previous date.

    Compares the ``ContributorsReposYTD`` report of ``date`` with the one of
    ``get_previous_date(date)`` and saves two reports: ``NewContributors``
    (authors present now but not before) and ``NewRepos`` (repositories
    present now but not before). Rows are written in the order the values
    first appear in the current report, so repeated runs give the same output.

    :param date: date of the current report
    :param company: company the reports belong to
    :return: ``None``
    """
    ranking = ContributorsReposYTD(date=date, company=company)
    ranking_df = ranking.read()
    compared_ranking = ContributorsReposYTD(date=get_previous_date(date),
                                            company=company)
    compared_ranking_df = compared_ranking.read()

    author_column = ranking.schema.author
    new_contributors = NewContributors(date=date, company=company)
    new_contributors_df = pd.DataFrame(
        data=_first_seen_difference(ranking_df[author_column],
                                    compared_ranking_df[author_column]),
        columns=[DataLake().public.schemas.new_contributors.author])
    new_contributors.save(df=new_contributors_df)

    repo_column = ranking.schema.repo
    new_repos = NewRepos(date=date, company=company)
    new_repos_df = pd.DataFrame(
        data=_first_seen_difference(ranking_df[repo_column],
                                    compared_ranking_df[repo_column]),
        columns=[DataLake().public.schemas.new_repos.repo])
    new_repos.save(df=new_repos_df)
Exemple #5
0
def abnormal_staging_repository_df():
    """Build a two-row staging repositories DataFrame.

    The rows are ``epam/OSCI`` and ``not_exist/REPOSITORY``; both share the
    same language, license and ``downloaded_at`` values.

    :return: ``pd.DataFrame`` keyed by the staging repositories schema columns
    """
    # Resolve the schema once instead of rebuilding the chain for every key.
    schema = DataLake().staging.schemas.repositories
    return pd.DataFrame([
        {
            schema.name: "epam/OSCI",
            schema.language: "Python",
            schema.license: "gpl-3.0",
            schema.downloaded_at: "2021-01-01",
        },
        {
            schema.name: "not_exist/REPOSITORY",
            schema.language: "Python",
            schema.license: "gpl-3.0",
            schema.downloaded_at: "2021-01-01",
        },
    ])
Exemple #6
0
def no_match_license_raw_push_event_commit_df():
    """Build a one-row push-commit DataFrame for repository ``test/TEST``.

    The row has ``org_name`` set to ``None`` and carries no language or
    license column.

    :return: ``pd.DataFrame`` keyed by the staging push-commits schema columns
    """
    # Resolve the schema once instead of rebuilding the chain for every key.
    schema = DataLake().staging.schemas.push_commits
    return pd.DataFrame([{
        schema.event_id: "222222",
        schema.event_created_at: "2021-01-01 00:15:22+00:00",
        schema.actor_login: "******",
        schema.repo_name: "test/TEST",
        schema.org_name: None,
        schema.sha: "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        schema.author_name: "User Name",
        schema.author_email: "*****@*****.**",
        schema.company: "EPAM",
    }])
Exemple #7
0
def load_osci_daily_ranking_to_bq(date: datetime.datetime):
    """Load the daily OSCI change ranking of ``date`` into BigQuery.

    The report index is turned into the position column, the frame is cut
    down to the required columns of ``PublicSchemas.osci_ranking_schema``,
    positions are increased by one, columns are renamed through the table
    mapping and ``date.date()`` is written into the date column.

    :param date: report date
    :return: result of ``DataLake().big_query.load_dataframe``
    """
    report = OSCIChangeRankingDTD(date=date)
    table = BigQueryOSCIDailyChangeRankingReport

    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    # reset_index() exposes the index as an 'index' column, which then
    # becomes the position column.
    indexed = report.read().reset_index()
    ranking = indexed.rename(columns={'index': table.Columns.position})
    ranking = ranking[PublicSchemas.osci_ranking_schema.required]
    ranking[table.Columns.position] += 1
    ranking = ranking.rename(columns=table.mapping)
    ranking[table.Columns.date] = date.date()

    big_query = DataLake().big_query
    return big_query.load_dataframe(
        df=ranking,
        table_id=table.table_id,
        schema=table.schema,
    )
Exemple #8
0
def load_osci_ranking_to_bq(date: datetime.datetime,
                            date_period: str = DatePeriodType.YTD):
    """Load the MTD or YTD OSCI ranking of ``date`` into its BigQuery table.

    :param date: report date; ``date.date()`` is stored in the date column
    :param date_period: ``DatePeriodType.MTD`` or ``DatePeriodType.YTD``
    :raises ValueError: for any other ``date_period``
    :return: result of ``DataLake().big_query.load_dataframe``
    """
    supported_periods = (DatePeriodType.MTD, DatePeriodType.YTD)
    if date_period not in supported_periods:
        raise ValueError(f'Unsupported {date_period}')

    report_cls = OSCIRankingFactory().get_cls(date_period=date_period)
    report = report_cls(date=date)
    table = date_period_to_table_map[date_period]

    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    ranking = report.read()[PublicSchemas.company_contributors_ranking.required]
    # The index becomes the position column, then positions are shifted by one.
    ranking = ranking.reset_index()
    ranking = ranking.rename(columns={'index': table.Columns.position})
    ranking[table.Columns.position] += 1
    ranking = ranking.rename(columns=table.mapping)
    ranking[table.Columns.date] = date.date()

    big_query = DataLake().big_query
    return big_query.load_dataframe(
        df=ranking,
        table_id=table.table_id,
        schema=table.schema,
    )
def transfer_monthly_change_ranking(date: datetime) -> dict:
    """Generate the web OSCI change ranking for ``date``, save it and return it.

    :param date: date the ranking is generated and saved for
    :return: the ranking produced by ``generate_web_osci_change_ranking``
    """
    ranking = generate_web_osci_change_ranking(date)
    web_area = DataLake().web
    web_area.save_monthly_osci_ranking(ranking=ranking, date=date)
    return ranking
Exemple #10
0
 def save(self, df: pd.DataFrame):
     """Save ``df`` as this report (``self.name``, ``self.date``) in the public area.

     :param df: report contents to save
     """
     public_area = DataLake().public
     public_area.save_report(
         report_df=df,
         report_name=self.name,
         date=self.date,
     )
Exemple #11
0
 def read(self) -> pd.DataFrame:
     """Return the public-area report stored under ``self.name`` for ``self.date``."""
     public_area = DataLake().public
     report_df = public_area.get_report(report_name=self.name, date=self.date)
     return report_df
Exemple #12
0
 def url(self) -> str:
     """Return the public-area URL of this report for ``self.name`` and ``self.date``."""
     public_area = DataLake().public
     report_url = public_area.get_report_url(report_name=self.name,
                                             date=self.date)
     return report_url
Exemple #13
0
 def path(self) -> str:
     """Return the public-area path of this report for ``self.name`` and ``self.date``."""
     public_area = DataLake().public
     report_path = public_area.get_report_path(report_name=self.name,
                                               date=self.date)
     return report_path
Exemple #14
0
 def spark_path(self) -> str:
     """
     Return the Spark path of the companies contributors repository commits
     for ``self.date``, as provided by the public area of the data lake
     """
     public_area = DataLake().public
     return public_area.get_companies_contributors_repository_commits_spark_path(
         self.date)
Exemple #15
0
def staging_push_event_commit_df():
    """Build a one-row staging push-commit DataFrame for ``epam/OSCI``.

    Besides the event, author and company fields, the row carries
    ``language`` ("Python") and ``license`` ("gpl-3.0") columns.

    :return: ``pd.DataFrame`` keyed by the staging push-commits schema columns
    """
    # Resolve the schema once instead of rebuilding the chain for every key.
    schema = DataLake().staging.schemas.push_commits
    return pd.DataFrame([{
        schema.event_id: "1111111",
        schema.event_created_at: "2021-01-01 00:15:22+00:00",
        schema.actor_login: "******",
        schema.repo_name: "epam/OSCI",
        schema.org_name: 'EPAM',
        schema.sha: "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        schema.author_name: "User Name",
        schema.author_email: "*****@*****.**",
        schema.company: "EPAM",
        schema.language: "Python",
        schema.license: "gpl-3.0",
    }])
Exemple #16
0
 def save(self, df: pd.DataFrame):
     """
     Save the repositories DataFrame to the staging area for ``self.date``
     :param df: repositories to save
     :return: whatever ``save_repositories`` returns
     """
     staging_area = DataLake().staging
     outcome = staging_area.save_repositories(df, self.date)
     return outcome
Exemple #17
0
 def path(self) -> str:
     """Return the OSCI change Excel report path for this report's base name, directory and end date."""
     public_area = DataLake().public
     return public_area.get_osci_change_excel_report_path(
         base_report_name=self.base_name,
         report_dir_name=self.dir_name,
         date=self.to_date,
     )
Exemple #18
0
def left_index():
    """Return the ``repo_name`` column of the staging push-commits schema."""
    commits_schema = DataLake().staging.schemas.push_commits
    return commits_schema.repo_name
Exemple #19
0
 def read(self) -> pd.DataFrame:
     """
     Return the companies contributors repository commits stored in the
     public area for ``self.date``
     """
     public_area = DataLake().public
     commits_df = public_area.get_companies_contributors_repository_commits(
         self.date)
     return commits_df
Exemple #20
0
def right_index():
    """Return the ``name`` column of the staging repositories schema."""
    repositories_schema = DataLake().staging.schemas.repositories
    return repositories_schema.name
Exemple #21
0
def required_columns():
    """Return the ``required`` attribute of the staging push-commits schema."""
    commits_schema = DataLake().staging.schemas.push_commits
    return commits_schema.required
Exemple #22
0
 def __init__(self, date_period_type: str = DatePeriodType.YTD):
     """Set up the data lake handle, commits schema and report class.

     :param date_period_type: period type passed to the report factory;
         defaults to ``DatePeriodType.YTD``
     """
     lake = DataLake()
     self.data_lake = lake
     self.commits_schema = lake.staging.schemas.push_commits
     self.date_period_type = date_period_type
     # The class-level factory picks the report class for the chosen period.
     factory = self.REPORT_FACTORY()
     self.report_cls: Type[Report] = factory.get_cls(
         date_period=self.date_period_type)
Exemple #23
0
 def read_all(self):
     """Return the result of ``get_reports_for_last_days_of_month`` for this report's name, date and company."""
     public_area = DataLake().public
     return public_area.get_reports_for_last_days_of_month(
         report_name=self.name,
         date=self.date,
         company=self.company,
     )
Exemple #24
0
def filter_columns():
    """Return a one-element list holding the ``license`` column of the staging repositories schema."""
    repositories_schema = DataLake().staging.schemas.repositories
    return [repositories_schema.license]
Exemple #25
0
 def read(self) -> pd.DataFrame:
     """
     Return the repositories stored in the staging area for ``self.date``
     """
     staging_area = DataLake().staging
     repositories_df = staging_area.get_repositories(self.date)
     return repositories_df
Exemple #26
0
def adjunct_columns():
    """Return the ``name``, ``language`` and ``license`` columns of the staging repositories schema.

    :return: list of the three column identifiers, in that order
    """
    # Resolve the schema once instead of rebuilding the chain for every entry.
    schema = DataLake().staging.schemas.repositories
    return [schema.name, schema.language, schema.license]
Exemple #27
0
 def save(self, df: pd.DataFrame):
     """
     Save the companies contributors repository commits DataFrame to the
     public area for ``self.date``
     :param df: commits to save
     :return: whatever the data lake save call returns
     """
     public_area = DataLake().public
     outcome = public_area.save_companies_contributors_repository_commits(
         df, self.date)
     return outcome
Exemple #28
0
 def save(self, df: pd.DataFrame):
     """Write the output of ``self._write(df)`` to ``self.path`` through the public area.

     :param df: data passed to ``self._write``
     """
     public_area = DataLake().public
     # Path is read before the buffer is built, as in the one-call form.
     target_path = self.path
     payload = self._write(df)
     public_area.write_bytes_to_file(path=target_path, buffer=payload)
Exemple #29
0
def unfiltered_raw_push_events_commit_df():
    """Build a two-row push-commit DataFrame.

    The first row belongs to ``unfiltered/REPOSITORY`` and the second to
    ``epam/OSCI``; neither row carries language or license columns.

    :return: ``pd.DataFrame`` keyed by the staging push-commits schema columns
    """
    # Resolve the schema once instead of rebuilding the chain for every key.
    schema = DataLake().staging.schemas.push_commits
    return pd.DataFrame([{
        schema.event_id: "222222",
        schema.event_created_at: "2021-01-01 00:15:22+00:00",
        schema.actor_login: "******",
        schema.repo_name: "unfiltered/REPOSITORY",
        schema.org_name: "EPAM",
        schema.sha: "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        schema.author_name: "User Name",
        schema.author_email: "*****@*****.**",
        schema.company: "EPAM",
    }, {
        schema.event_id: "1111111",
        schema.event_created_at: "2021-01-01 00:15:22+00:00",
        schema.actor_login: "******",
        schema.repo_name: "epam/OSCI",
        schema.org_name: 'EPAM',
        schema.sha: "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        schema.author_name: "User Name",
        schema.author_email: "*****@*****.**",
        schema.company: "EPAM",
    }])
Exemple #30
0
 def spark_path(self) -> str:
     """
     Return the Spark path of the repositories for ``self.date``, as
     provided by the staging area of the data lake
     """
     staging_area = DataLake().staging
     repositories_path = staging_area.get_repositories_spark_path(self.date)
     return repositories_path