Example #1
    def aggregate_brand_stats(self):
        self.brand_loss = self.load_brand_loss()
        self.brand_risk = self.compute_brand_risk()
        brand_groups = self.bigtable.groupby('brand')
        brand_df = brand_groups.agg(
            brandid=pd.NamedAgg(column='brandid', aggfunc='first'),
            mill_count=pd.NamedAgg(column='umlid', aggfunc='count'),
            # Count certified mills without a KeyError when a brand has none.
            rspo_mill_count=pd.NamedAgg(column='cert', aggfunc=lambda x: (x == 'RSPO Certified').sum()),
            unique_parent_co=pd.NamedAgg(column='prnt_comp', aggfunc=lambda x: len(x.unique())),
            unique_group_name=pd.NamedAgg(column='group_name', aggfunc=lambda x: len(x.unique()))
        )
        brand_df['nonrspo_mill_count'] = brand_df['mill_count']-brand_df['rspo_mill_count']
        brand_cols = ['brandid',
                      'brand',
                      'country',
                      'rspo_member_since',
                      'external_link',
                      'description_attribution',
                      'description']
        unique_brands_df = self.brands[brand_cols].drop_duplicates()
        brand_df1 = brand_df.merge(unique_brands_df, on='brandid', how='left')
        brand_df2 = self.brand_loss.merge(self.brand_risk, on='brandid', how='left')
        self.brand_df = brand_df1.merge(brand_df2, on='brandid', how='left')

        self.brand_df = self.brand_df.sort_values(by='mill_count', ascending=False)
        logger.info("Aggregated brands data shape: %s" % str(self.brand_df.shape))
def write_json(mills, path):
    try:
        with open(path, 'w') as f:
            f.write(json.dumps(mills))
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
Example #3
def webhook():
    """
    接收git提交参数,默认提交处理
    :return: 
    """
    # try:
    ret = request.data.decode('utf-8')
    # print(ret, type(ret))
    data = json.loads(ret)
    # git提交类型
    event = data['object_kind']
    print("event:", event)
    branch = data['ref']
    print(branch)
    branch_name = branch if branch.find('/') < 0 else branch[branch.rfind('/') + 1:]
    print('branch_name:', branch_name)
    git_url = data['project']['url']
    pro_name = "CI__" \
               + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
               + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name

    if event == 'push':
        if branch_name.startswith('release-'):
            create_build(data)
        elif branch_name.startswith('develop') and (request.args.get('dev') == '1' or request.args.get('dev') == 'true'):
            create_build(data)
        else:
            logger.error('Unhandled branch type: [%s]!' % branch_name)
    elif event == 'tag_push':
        create_build(data)
    else:
        logger.error("未处理类型: [%s]!" % event)
    logger.info('已经完成对[%s]项目的创建!' % pro_name + '\n' + '分支/标签: [%s]' % branch_name)
    return 'Hello Webhook!'
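webhook relies on a Flask request context, a module-level logger, and URL routing that the snippet does not show. A minimal sketch of that wiring, assuming Flask; the route path, port, and logging setup are placeholders, not taken from the source:

import logging

from flask import Flask, request  # request is used inside webhook() above

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

app = Flask(__name__)

# Route path and port are hypothetical; the snippet only shows the handler body.
app.add_url_rule('/webhook', view_func=webhook, methods=['POST'])

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)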
Example #4
def build_uml_boundaries_data(output_file_path, input_file_path, radius, res):

    if os.path.exists(output_file_path):
        logger.info("Reading UML boundaries data from local geojson file.")
        uml_gdf = gpd.read_file(output_file_path)
    else:
        logger.info("Started reading mills data from json.")
        uml_df = pd.read_json(input_file_path, orient='index')

        # Convert to GeoDataFrame
        uml_gdf = gpd.GeoDataFrame(uml_df[['umlid', 'latitude', 'longitude']],
                                   geometry=gpd.points_from_xy(
                                       uml_df.longitude, uml_df.latitude))

        # Set CRS initially to epsg:4326 (lat/lon in degrees)
        uml_gdf.set_crs(epsg=4326, inplace=True)

        # Project to EPSG:3395 (a projected CRS with units in meters) so the
        # buffer radius is in meters, then convert back to EPSG:4326.
        uml_gdf.to_crs('epsg:3395', inplace=True)
        uml_gdf['geometry'] = uml_gdf.buffer(radius, resolution=res)
        uml_gdf.to_crs('epsg:4326', inplace=True)

        # Write geodataframe out to geojson
        write_geojson(uml_gdf, output_file_path)

    return uml_gdf
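A usage sketch: because the buffer is applied after projecting to EPSG:3395, radius is in meters, and res is passed through to GeoPandas' buffer(resolution=...), which sets how many segments approximate a quarter circle. The radius and resolution values below are placeholders; the paths follow the umls.json/boundaries.geojson names mentioned later in the main block.

# Hypothetical parameter values, for illustration only.
uml_gdf = build_uml_boundaries_data(
    output_file_path='output/boundaries.geojson',
    input_file_path='output/umls.json',
    radius=50_000,   # meters, since buffering happens in EPSG:3395
    res=16,          # segments per quarter circle in GeoPandas buffer()
)
print(uml_gdf[['umlid', 'geometry']].head())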
Example #5
def build_uml_data(output_path, mills_api_url, request_params):
    res = {}
    if os.path.exists(output_path):
        logger.info("Reading UML mills from local file.")
        try:
            with open(output_path, 'r') as f:
                res = json.load(f)
        except Exception as e:
            logger.error("Failed to read UML file: %s" % e)
    else:
        try:
            mills_dict = {}
            # Request mills from opendata.arcgis.com
            req = requests.get(mills_api_url, params=request_params)
            res_json = json.loads(req.text)

            # Bail out if the response has no mills data; the outer except
            # block logs the failure and an empty frame is returned.
            if 'features' not in res_json or len(res_json['features']) == 0:
                logger.error('Missing mills data')
                raise ValueError('Missing mills data')

            # Extract mills properties from response JSON
            mills = res_json['features']
            mills_dict = {
                x["properties"]["objectid"]: x["properties"]
                for x in mills
            }

            column_mapper = {'Group_Name': 'group_name', 'id': 'umlid'}
            for k, v in mills_dict.items():
                if v['country'] in request_params['country']:
                    for old, new in column_mapper.items():
                        v[new] = v.pop(old)
                    res[k] = v

            write_json(res, output_path)

        except Exception as e:
            logger.error("Failed to read UML mills from API: %s" % e)

    return pd.DataFrame.from_dict(res, orient='index')
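A usage sketch for build_uml_data. The endpoint URL below is a placeholder (the snippet only says the mills come from opendata.arcgis.com), and request_params must include a 'country' entry because the function filters on it.

# Placeholder URL and parameters; the real GeoJSON endpoint is not shown in the snippet.
MILLS_API_URL = 'https://opendata.arcgis.com/datasets/<dataset-id>_0.geojson'
REQUEST_PARAMS = {'country': 'Indonesia'}

uml_df = build_uml_data(output_path='output/umls.json',
                        mills_api_url=MILLS_API_URL,
                        request_params=REQUEST_PARAMS)
print(uml_df.shape)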
Example #6
def create_build(data, project_type='default'):
    """
    构建默认提交类型
    :param data: 
    :param project_type: 
    :return: 
    """
    git_url = data['project']['url']
    branch = data['ref']
    branch_name = branch if branch.find(
        '/') < 0 else branch[branch.rfind('/') + 1:]
    pro_name = "CI__" \
               + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
               + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name

    if project_type == 'jdk7':
        build_name = 'JDK_7u79'
        config = get_simple_maven_config(url=git_url,
                                         branch=branch,
                                         jdk_version=build_name)
    elif project_type == 'jdk8':
        build_name = 'JDK_8u112'
        config = get_simple_maven_config(url=git_url,
                                         branch=branch,
                                         jdk_version=build_name)
    elif project_type == 'npm':
        config = get_simple_npm_config(url=git_url, branch=branch)
    elif not project_type or project_type == 'default':
        config = get_simple_default_config(url=git_url, branch=branch)
    else:
        logger.error('Invalid project type: [%s]' % project_type)
        raise Exception("Project Type Error: [%s]" % project_type)
    s = 0
    while s < 20:
        s += 1
        try:
            if not server.get_job_name(pro_name):
                server.create_job(pro_name, config)
                logger.info('New project; creating job [%s]!' % pro_name)
            else:
                logger.info('Job [%s] already exists; starting the build!' % pro_name)
            break
        except jenkins.JenkinsException as e:
            continue
    while s < 30:
        s += 1
        try:
            log.info("项目 [%s] 构建中..." % pro_name)
            server.build_job(pro_name)
            log.info("项目 [%s] 构建完成!" % pro_name)
            break
        except Exception as e:
            log.error("项目 [%s] 构建失败!" + pro_name + '\n' +
                      traceback.format_exc())
            continue
    logger.info('Build info for project [%s]: ' % pro_name + 'branch/tag: [%s], ' % branch_name +
                'build retries: [%s]' % (s - 2))
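create_build (and the webhook handlers above) assume module-level logger and server objects that the snippets never define. A plausible setup sketch using the python-jenkins client is below; the Jenkins host and credentials are placeholders.

import logging

import jenkins  # python-jenkins client

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Placeholder Jenkins host and credentials; substitute your own instance.
server = jenkins.Jenkins('http://jenkins.example.com:8080',
                         username='ci-bot',
                         password='api-token')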
Example #7
def webhook_jdk_npm(j_n):
    """
    接收git提交参数,处理以分支和url是jdk和npm类型
    :param j_n: 
    :return: 
    """
    # 获取JDK版本
    # try:
    # git提交信息
    ret = request.data.decode('utf-8')
    data = json.loads(ret)

    # Type of git event
    event = data['object_kind']
    branch = data['ref']
    branch_name = branch if branch.find('/') < 0 else branch[branch.rfind('/') + 1:]
    git_url = data['project']['url']
    pro_name = "CI__" \
               + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
               + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name
    if event == 'push':
        if branch_name.startswith('release-'):
            create_build(data, j_n)
        elif branch_name.startswith('develop') and (request.args.get('dev') == '1' or request.args.get('dev') == 'true'):
            create_build(data, j_n)
        else:
            logger.info('Unhandled branch type: [%s]!' % branch_name)
            raise Exception('Unhandled branch type: [%s]!' % branch_name)

    elif event == 'tag_push':
        if j_n.startswith('jdk') or j_n.startswith('npm'):
            create_build(data, j_n)
    else:
        logger.info("未处理类型: [%s]!" % event)
        raise Exception("未处理类型: [%s]!" % event)
    # logger.info('JSON_jdk_npm: [%s]' % ret)
    logger.info('已经完成对[%s]项目的构建!' % pro_name + '\n' + '分支/标签:[%s]' % branch_name)
    return 'Hello Webhook!'
def write_geojson(gdf, path):
    try:
        gdf.to_file(path, driver='GeoJSON')
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
def write_df(df, path, index=False):
    try:
        df.to_csv(path, index=index)
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
Example #10
                   'risk_score_current',
                   'risk_score_future']]


    def write_uniquebrands(self):
        self.brand_df.to_csv(self.out_brands, index=False)

    def write_brand_mill_matches(self):
        self.brands[['brandid', 'umlid']].to_csv(self.out_matches, index=False)


if __name__ == '__main__':
    try:
        os.mkdir(OUTPUT_DIR)
    except FileExistsError:
        logger.info('Output directory already exists.')

    uml_df = load_uml_data()
    logger.info("UML data shape: %s" % str(uml_df.shape))

    brand_df = load_brand_data()
    logger.info("Brand data shape: %s" % str(brand_df.shape))

    ##
    # Input: umls.json, output: boundaries.geojson
    # This code should read the output/umls.json file, use EE to
    # calculate the polygon shapes (and probably water/land/intersection areas
    # when available), then write the output to geojson. Return geopandas df
    # with UML ID, lat/lon, and polygon shapes.
    uml_boundaries_geodf = load_uml_boundaries_data()
    logger.info("UML boundaries data shape: %s" % str(uml_boundaries_geodf.shape))
Example #11
def build_brand_data(input_path, input_brand_path, input_new_matches_path,
                     output_path):
    res = None
    if os.path.exists(output_path):
        res = pd.read_csv(output_path)
        logger.info("Reading brand data from local CSV file.")
    else:
        logger.info("Started parsing brand data from TSV.")
        df = pd.read_csv(input_path, sep='\t')

        # Drop rows with a null Country, then keep only mills in Indonesia
        df = df[df['Country'].notnull()]
        df = df[df['Country'] == 'indonesia']

        # Keep wanted columns
        df = df[[
            'idx', 'UMLID', 'Consumer Company', 'Mill Name', 'Mill Company',
            'Parent Company', 'Province', 'District', 'RSPO'
        ]]

        # Rename columns
        mapper = {
            'idx': 'idx',
            'UMLID': 'umlid',
            'Consumer Company': 'brand',
            'Mill Name': 'mill_name',
            'Mill Company': 'group_name',
            'Parent Company': 'prnt_comp',
            'Province': 'state',
            'District': 'sub_state',
            'RSPO': 'rspo_model'
        }
        df = df.rename(columns=mapper)
        df.reset_index(drop=True, inplace=True)

        # Create df1 where each row has a company and mill idx
        df1 = df[df['brand'].notnull()].loc[:, ['idx', 'brand']]

        # Create df2 where each row has a uml and mill idx, mill info
        df2 = df[df['umlid'].notnull()]

        # Merge and filter unique id/company tuples
        dfm = df1.merge(df2, on='idx', how='left')
        dfm = dfm[(dfm['brand_x'].notnull()) & (dfm['umlid'].notnull())]

        # Clean up merged dataset
        dfm.reset_index(drop=True, inplace=True)
        dfm.drop_duplicates(subset=['brand_x', 'umlid'], inplace=True)
        dfm.drop(columns=['brand_y', 'idx'], inplace=True)
        dfm.rename(columns={'brand_x': 'brand'}, inplace=True)

        # Bring in new match dataset
        dfnew = pd.read_csv(input_new_matches_path)

        # Keep wanted columns
        dfnew = dfnew[[
            'UMLID', 'Consumer Company', 'Mill Name', 'Mill Company',
            'Parent Company', 'Province', 'District', 'RSPO'
        ]]

        # Rename columns
        del mapper['idx']
        dfnew = dfnew.rename(columns=mapper)
        dfnew.reset_index(drop=True, inplace=True)
        dfnew.drop_duplicates(subset=['brand', 'umlid'], inplace=True)

        # Concatenate datasets
        dfm = pd.concat([dfm, dfnew])
        dfm.drop_duplicates(subset=['brand', 'umlid'], inplace=True)

        # Rename brands
        brand_mapper = {
            'ferrero': 'Ferrero',
            'kellog': 'Kellogg Company',
            'pepsico': 'PepsiCo',
            'frieslandcampina': 'Royal FrieslandCampina N.V.',
            'johnson and johnson': 'Johnson & Johnson',
            'general mills': 'General Mills, Inc',
            'hershey': 'The Hershey Company',
            'loreal': "L'Oreal",
            'procter and gamble': 'The Procter & Gamble Company',
            'colgate palmolive': 'Colgate-Palmolive Company',
            'nestle': 'Nestlé',
            'mars': 'Mars, Incorporated',
            'unilever': 'Unilever'
        }

        dfm['brand'] = dfm['brand'].replace(brand_mapper)

        # Merge brand info.
        df3 = pd.read_csv(input_brand_path)
        df3.rename(columns={'name': 'brand', 'id': 'brandid'}, inplace=True)
        dft = df3.merge(dfm, on='brand', how='right')
        res = dft
        write_df(res, output_path, index=False)

    return res
Example #12
def build_risk_data(input_file_path,
                    output_file_path,
                    id_col,
                    years=[2018, 2019]):
    risk_df = None
    if os.path.exists(output_file_path):
        risk_df = pd.read_csv(output_file_path)
        logger.info("Reading risk data from local csv file.")
        pass
    else:
        logger.info("Started reading loss data from csv.")
        loss_df = pd.read_csv(input_file_path)

        # Create a new column that is the z-score for the sqrt tree loss proportion.
        loss_df['past_risk_z'] = get_z(loss_df,
                                       'treeloss_sum_proportion_of_forest')

        # Create a new column that is the risk (1-5) associated with z-score
        # of past tree loss
        loss_df['risk_score_past'] = get_risk_from_z(loss_df, 'past_risk_z')

        # Create a new column that is the mean treeloss for specified years.
        mean_col = 'mean_loss_'
        for year in years:
            mean_col += str(year)

        col_list = ['treeloss_' + str(year) for year in years]
        loss_df[mean_col] = loss_df.loc[:, col_list].mean(axis=1)

        # Create a new column that is the square root of mean treeloss as a proportion of forest.
        mean_prop_sqrt_col = mean_col + '_proportion_sqrt'
        loss_df[mean_prop_sqrt_col] = np.sqrt(loss_df[mean_col] /
                                              loss_df['forest_area'])

        # Create a new column that is the z-score for the mean treeloss as a
        # proportion of forest.
        current_z_col = mean_prop_sqrt_col + "_z"
        loss_df[current_z_col] = get_z(loss_df, mean_prop_sqrt_col)

        # Convert z-score to risk (1-5)
        loss_df['risk_score_current'] = get_risk_from_z(loss_df, current_z_col)

        # Create a new column that is the z-score for the remaining
        # tree cover proportion.
        loss_df['remaining_forest_z'] = get_z(
            loss_df, 'remaining_proportion_of_forest')

        # Create a new column that is 0.5*remaining proportion of forest z-score,
        # and 0.5*z-score for the mean current treeloss proportion of forest.
        loss_df['future_risk_z'] = 0.5*loss_df['remaining_forest_z'] + \
                                          0.5*loss_df[current_z_col]

        # Create a new column that is the risk (1-5) associated with z-score
        # of past tree loss
        loss_df['risk_score_future'] = get_risk_from_z(loss_df,
                                                       'future_risk_z')

        # risk_df includes UMLid and risk_score columns only
        risk_df = loss_df.loc[:, [
            id_col, 'risk_score_current', 'risk_score_past',
            'risk_score_future'
        ]]

        # Write out risk_df to CSV
        write_df(risk_df, output_file_path, index=False)

    return risk_df
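build_risk_data leans on two helpers, get_z and get_risk_from_z, that are not shown in these snippets. A minimal sketch of what they plausibly do, assuming a standard z-score and a fixed bucketing of z into a 1-5 risk score; the bin edges here are illustrative guesses, not the project's actual thresholds.

import numpy as np
import pandas as pd

def get_z(df, col):
    # Standard z-score of a column: (x - mean) / std.
    return (df[col] - df[col].mean()) / df[col].std()

def get_risk_from_z(df, z_col):
    # Bucket the z-score into risk levels 1-5; bin edges are illustrative only.
    bins = [-np.inf, -1.0, -0.5, 0.5, 1.0, np.inf]
    return pd.cut(df[z_col], bins=bins, labels=[1, 2, 3, 4, 5]).astype(int)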
Example #13
def build_loss_data(input_file_path,
                    output_file_path,
                    GFC_DATASET_NAME,
                    id_col,
                    credentials,
                    area_factor=1):
    loss_data = None
    if os.path.exists(output_file_path):
        loss_data = pd.read_csv(output_file_path)
        logger.info("Reading loss data from {}.".format(output_file_path))
    else:
        # Earth Engine Initialization
        ee.Initialize(credentials)

        logger.info("Computing loss and area for geometries from {}.".format(
            input_file_path))
        logger.info("Loading GFC data.")
        # Load the Global Forest Change dataset as a GEE image
        gfc_img = ee.Image(GFC_DATASET_NAME)

        # Open geojson file and convert data to Earth Engine Feature Collection.
        with open(input_file_path) as f:
            data = json.load(f)
        geoms = ee.FeatureCollection(data['features'])

        # Compute cumulative tree cover loss per geometry across **all**
        # lossyears
        # NOTE: The resulting sum is a decimal number because a weighted
        # reduction is performed:
        # https://developers.google.com/earth-engine/guides/reducers_weighting.
        # The sum is a weighted aggregation of the bitmap property "loss,"
        # which is either 0 or 1.  We then convert to an area using the
        # area_factor parameter.
        logger.info("Computing tree cover loss sum.")
        lossdict = reduce_sum(gfc_img, 'loss', geoms)

        # Store area info in a dataframe.
        column_names = [id_col, "treeloss_sum"]
        rows = []

        for area in lossdict:
            rows.append([
                area['properties'][id_col],
                area_factor * area['properties']['sum']
            ])

        loss_data = pd.DataFrame(columns=column_names, data=rows)

        # Compute land area within each geometric boundary and add a column to data
        # frame.  Compute histogram of datamask layer per mill area.
        logger.info("Computing areas of land and forest.")
        datamask_bins = (1, 2, 1)  # 1 bin of [1,2)
        landTypedict = reduce_hist(gfc_img, 'datamask', geoms, datamask_bins)
        logger.info("Land finished.")
        # Extract land area for each mill and add to dataframe.
        land_areas = []
        for area in landTypedict:
            land_areas.append(area_factor *
                              area["properties"]['histogram'][0][1])

        loss_data['land_area'] = land_areas

        # Compute forested area for each area and add a column to dataframe.
        # Compute the area where treecover2000 is greater than or equal to 30%.
        treecover_bins = (30, 101, 1)  # 1 bin of [30,101)
        treecoverdict = reduce_hist(gfc_img, 'treecover2000', geoms,
                                    treecover_bins)

        # Extract the area for each area boundary and add to dataframe.
        treecover2000_area = []
        for area in treecoverdict:
            treecover2000_area.append(area_factor *
                                      area["properties"]['histogram'][0][1])

        loss_data['forest_area'] = treecover2000_area

        # Compute cumulative tree cover loss area per area per year
        # Add a column to the data frame for each year.
        logger.info("Computing yearly tree cover loss.")
        lossyears = list(range(1, 20))

        lossyear_bins = (1, 20, 19)  # 19 bins of 1 each from 1-19
        lossyeardict = reduce_hist(gfc_img, 'lossyear', geoms, lossyear_bins)

        for i, year in enumerate(lossyears):
            col_name = "treeloss_20" + str(year).zfill(2)
            loss = []
            for area in lossyeardict:
                loss.append(area_factor *
                            area['properties']['histogram'][i][1])

            loss_data[col_name] = loss

        logger.info("Yearly tree cover loss computation complete.")

        # Compute the total tree cover loss for each mill as a proportion of
        # land area and add to dataframe.
        loss_data['treeloss_sum_proportion_of_land'] = (
            loss_data['treeloss_sum'] / loss_data['land_area'])

        # Compute the total tree cover loss for each mill as a proportion of
        # forest in 2000 and add to dataframe.
        loss_data['treeloss_sum_proportion_of_forest'] = (
            loss_data['treeloss_sum'] / loss_data['forest_area'])

        # Compute the proportion of forest area that is remaining
        # (1 - proportion of forest lost).
        loss_data['remaining_proportion_of_forest'] = (
            1 - loss_data['treeloss_sum_proportion_of_forest'])

        logger.info("Writing tree cover loss data to file.")
        write_df(loss_data, output_file_path, index=False)

    return loss_data
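reduce_sum and reduce_hist are also not part of the snippet. Based on how their results are consumed above (a list of features whose properties carry 'sum' or 'histogram'), one plausible Earth Engine implementation is sketched below; the 30 m scale and the reduceRegions-based approach are assumptions, not the source's confirmed code.

import ee

def reduce_sum(img, band, geoms, scale=30):
    # Weighted sum of a 0/1 band over each feature's geometry.
    fc = img.select(band).reduceRegions(collection=geoms,
                                        reducer=ee.Reducer.sum(),
                                        scale=scale)
    return fc.getInfo()['features']

def reduce_hist(img, band, geoms, bins, scale=30):
    # Fixed histogram of band values over each feature's geometry;
    # bins is (min, max, steps), matching the call sites above.
    min_val, max_val, steps = bins
    reducer = ee.Reducer.fixedHistogram(min_val, max_val, steps)
    fc = img.select(band).reduceRegions(collection=geoms,
                                        reducer=reducer,
                                        scale=scale)
    return fc.getInfo()['features']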