Ejemplo n.º 1
0
def region_correlation(region, r, food_data, prod_data, output_dir=None):
    """Print and export strong price/production correlations for one region.

    For every product present in the region's production data, the linked
    price products are looked up, both series are averaged per year and
    country, aligned, and correlated. Each product name is printed as it is
    processed. When the correlation is strong (``|rho| > 0.5``), significant
    (``pvalue < 0.05``) and not exactly ``1.0``, a LaTeX-style table row is
    printed and the aligned ``value_change`` / ``price_change`` columns are
    written to a CSV file.

    Args:
        region: Selection passed as the second argument of
            ``get_data_selection`` (presumably a list of countries --
            confirm against that helper).
        r: Region label; used in the printed row and in the CSV file name.
        food_data: Price data frame.
        prod_data: Production data frame.
        output_dir: Directory the CSV files are written to. Defaults to the
            project's ``region_corr_improved`` directory.

    Returns:
        None. Results are printed and written to disk.
    """
    if output_dir is None:
        output_dir = '/home/student/Documents/Projecten/davFoodPrices/machinelearning/question3/toPlotOnWebsite/region_corr_improved'

    region_pricedata = get_data_selection(food_data, region)
    region_proddata = get_data_selection(prod_data, region)
    overlap_countries = overlap_in_countries(region_pricedata, region_proddata)
    region_price = get_data_selection(region_pricedata, overlap_countries)
    region_production = get_data_selection(region_proddata, overlap_countries)

    reg_production, reg_price = get_overlapping_data(region_production, region_price)

    for product in reg_production._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        productprice = get_data_selection(reg_price, None, None, linked)
        productprod = get_data_selection(reg_production, None, None, [product])
        productprod, productprice = get_overlapping_data(productprod, productprice)

        productprice_averaged = year_country_average(productprice, 'price_change')
        productprod_averaged = year_country_average(productprod, 'value_change')

        newfood, newprod = align_years(productprice_averaged, productprod_averaged)
        corr = correlation(newprod, newfood)
        print(product)
        if not corr:
            continue

        rho, pvalue = corr
        # rho == 1.0 is excluded on purpose by the original filter
        # (presumably to drop degenerate series -- confirm).
        if not (abs(rho) > 0.5 and pvalue < 0.05 and rho != 1.0):
            continue

        print(r, '&', product, '&', round(rho, 2), '&', pvalue, '\\\\')
        print('\\hline')
        df = pd.concat(
            [newprod['value_change'].reset_index(),
             newfood['price_change'].reset_index()],
            axis=1,
        ).drop('index', axis=1)
        # Write to an explicit path instead of os.chdir(): changing the
        # working directory is process-wide and was not restored when
        # to_csv raised.
        filename = r.replace(' ', '') + '_' + product.replace(' ', '') + 'correlation.csv'
        df.to_csv(os.path.join(output_dir, filename))
Ejemplo n.º 2
0
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-market Spearman correlations of two products.

    For every market both products share, the ``price_change`` series over
    the overlapping years are correlated with ``spearmanr``. Only markets
    with more than two overlapping years are considered, and only
    coefficients with ``p_value <= 0.05`` that are not NaN are kept.

    Args:
        df: Data frame holding the price data of both products.
        prod1: First product.
        prod2: Second product.

    Returns:
        A tuple ``(mean_rho, countries, n_countries)`` where ``countries``
        is the country overlap of the two products. When the products share
        no country the result is ``(np.nan, [], 0)``. ``mean_rho`` is the
        ``np.mean`` of the kept coefficients, which is NaN when none
        qualified.
    """
    first_df = get_data_selection(df, products=[prod1])
    second_df = get_data_selection(df, products=[prod2])
    shared_markets = overlap_in_markets(first_df, second_df)
    shared_countries = overlap_in_countries(first_df, second_df)

    if len(shared_countries) == 0:
        return (np.nan, [], 0)

    significant_rhos = []
    for market in shared_markets:
        first_market = get_data_selection(first_df, markets=[market])
        second_market = get_data_selection(second_df, markets=[market])
        shared_years = overlap_in_years(first_market, second_market)
        if len(shared_years) <= 2:
            # Too few common years in this market to correlate.
            continue

        first_years = get_data_selection(first_market, years=shared_years)
        second_years = get_data_selection(second_market, years=shared_years)
        rho, p_value = spearmanr(first_years['price_change'],
                                 second_years['price_change'])
        if p_value <= 0.05 and not isnan(rho):
            significant_rhos.append(rho)

    return (np.mean(significant_rhos), shared_countries, len(shared_countries))
Ejemplo n.º 3
0
def get_overlapping_data(df1, df2):
    """Restrict two frames to the countries and years they have in common.

    ``df1`` is additionally filtered on its own products, ``df2`` on the
    ``getLinkedProduct`` counterpart of each of those products.

    Returns:
        ``(df2_selection, df1_selection)``.
        NOTE(review): the order is reversed relative to the arguments --
        confirm this is what callers expect.
    """
    own_products = df1._product.unique().tolist()
    counterpart_products = list(map(getLinkedProduct, own_products))
    shared_years = overlap_in_years(df1, df2)
    shared_countries = overlap_in_countries(df1, df2)

    df2_selection = get_data_selection(
        df2, shared_countries, shared_years, counterpart_products)
    df1_selection = get_data_selection(
        df1, shared_countries, shared_years, own_products)
    return df2_selection, df1_selection
Ejemplo n.º 4
0
def get_overlapping_data(proddf, fooddf):
    """Restrict production and price data to what both frames cover.

    Only price products for which ``getLinkedProduct`` returns a truthy
    value are kept; the linked production products are the flattened
    concatenation of those lookups. Both frames are then limited to the
    overlapping countries and years.

    Args:
        proddf: Production data frame.
        fooddf: Price data frame.

    Returns:
        ``(production_selection, price_selection)``.
    """
    products = []
    linked_products = []
    for product in fooddf._product.unique().tolist():
        # One lookup per product; the original evaluated it up to three times.
        linked = getLinkedProduct(product)
        if not linked:
            continue
        products.append(product)
        linked_products.extend(linked)

    year_overlap = overlap_in_years(fooddf, proddf)
    country_overlap = overlap_in_countries(fooddf, proddf)
    newdf1 = get_data_selection(proddf, country_overlap, year_overlap, linked_products)
    newdf2 = get_data_selection(fooddf, country_overlap, year_overlap, products)
    return newdf1, newdf2
def all_products_region_correlation(region, reg_name, food_data, prod_data):
    """Print the price/production correlation over all products of a region.

    Both data sets are narrowed to ``region`` and then to the countries they
    share, aligned with ``align_products_and_years`` and correlated. The
    sizes of the aligned frames are always printed; the coefficient, its
    p-value and ``reg_name`` are printed only when ``correlation`` returns a
    truthy result.

    Returns:
        None.
    """
    price_rows = get_data_selection(food_data, region)
    production_rows = get_data_selection(prod_data, region)

    shared_countries = overlap_in_countries(price_rows, production_rows)
    price_rows = get_data_selection(price_rows, shared_countries)
    production_rows = get_data_selection(production_rows, shared_countries)

    aligned_price, aligned_production = align_products_and_years(
        price_rows, production_rows)
    print(len(aligned_price), len(aligned_production))

    result = correlation(aligned_production, aligned_price)
    if not result:
        return
    rho, pvalue = result
    print(rho, pvalue)
    print(reg_name)
Ejemplo n.º 6
0
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-country Spearman correlations of two products.

    For every country both products share, the ``price_change`` series over
    the overlapping years are correlated with ``spearmanr``. Only countries
    with more than four overlapping years are considered, and only
    coefficients with ``p_value <= 0.05`` that are not NaN are kept.

    Returns:
        ``np.mean`` of the kept coefficients (NaN when none qualified).
    """
    first_df = get_data_selection(df, products=[prod1])
    second_df = get_data_selection(df, products=[prod2])

    significant_rhos = []
    for country in overlap_in_countries(first_df, second_df):
        first_country = get_data_selection(first_df, countries=[country])
        second_country = get_data_selection(second_df, countries=[country])
        shared_years = overlap_in_years(first_country, second_country)
        if len(shared_years) <= 4:
            # Too few common years in this country to correlate.
            continue

        first_years = get_data_selection(first_country, years=shared_years)
        second_years = get_data_selection(second_country, years=shared_years)
        rho, p_value = spearmanr(first_years['price_change'],
                                 second_years['price_change'])
        if p_value <= 0.05 and not isnan(rho):
            significant_rhos.append(rho)

    return np.mean(significant_rhos)
Ejemplo n.º 7
0
def align_years(fooddf, proddf):
    """Keep only the (country, year) rows present in both frames.

    For every country and year the two frames have in common, the matching
    rows of each frame are kept only when both frames have at least one row
    for that combination. Rows are emitted grouped by country, then year,
    and keep their original index labels.

    Args:
        fooddf: Price data frame with ``year`` and ``country`` columns.
        proddf: Production data frame with ``year`` and ``country`` columns.

    Returns:
        ``(newfood, newprod)``. Each is an empty frame with the source
        frame's columns when nothing matches.
    """
    years = overlap_in_years(fooddf, proddf)
    countries = overlap_in_countries(fooddf, proddf)

    # Collect the slices and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the whole frame on every call.
    food_parts = []
    prod_parts = []
    for country in countries:
        # The country filter does not depend on the year, so apply it once.
        country_food = fooddf.loc[fooddf['country'] == country]
        country_prod = proddf.loc[proddf['country'] == country]
        for year in years:
            food_entry = country_food.loc[country_food['year'] == year]
            prod_entry = country_prod.loc[country_prod['year'] == year]
            if food_entry.empty or prod_entry.empty:
                continue
            food_parts.append(food_entry)
            prod_parts.append(prod_entry)

    if food_parts:
        newfood = pd.concat(food_parts)
    else:
        newfood = pd.DataFrame(columns=fooddf.columns.values)
    if prod_parts:
        newprod = pd.concat(prod_parts)
    else:
        newprod = pd.DataFrame(columns=proddf.columns.values)
    return newfood, newprod
def _stack_frames(frames, columns):
    """Stack *frames* row-wise, keeping *columns* as the leading columns."""
    if not frames:
        return pd.DataFrame(columns=columns)
    stacked = pd.concat(frames)
    leading = list(columns)
    known = set(leading)
    extras = [col for col in stacked.columns if col not in known]
    return stacked.reindex(columns=leading + extras)


def align_products_and_years(fooddf, proddf):
    """Align price and production data product by product.

    For every production product that has linked price products
    (``getPriceLinkedProduct`` returns a truthy value), both frames are
    narrowed to that product, then to the countries and years they share,
    and finally aligned with ``align_years``. The per-product results are
    stacked into one price frame and one production frame.

    Args:
        fooddf: Price data frame.
        proddf: Production data frame.

    Returns:
        ``(newfooddf, newproddf)``: the stacked price and production frames.
        Each is an empty frame with the source frame's columns when no
        product yields data.
    """
    food_parts = []
    prod_parts = []
    for product in proddf._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        if not linked:
            continue

        productprice = get_data_selection(fooddf, None, None, linked)
        productprod = get_data_selection(proddf, None, None, [product])

        countries = overlap_in_countries(productprice, productprod)
        productprice = get_data_selection(productprice, countries)
        productprod = get_data_selection(productprod, countries)

        overlapyears = overlap_in_years(productprice, productprod)
        productprice = get_data_selection(productprice, None, overlapyears)
        productprod = get_data_selection(productprod, None, overlapyears)

        newfood, newprod = align_years(productprice, productprod)
        # A different number of distinct products on each side: collapse the
        # price rows with year_country_average before comparing sizes.
        if len(newfood._product.unique()) != len(newprod._product.unique()):
            newfood = year_country_average(newfood, 'price_change')
        # Still unequal in length: fall back to the generic overlap filter.
        if len(newfood) != len(newprod):
            newprod, newfood = get_overlapping_data(newprod, newfood)

        # Collect and concatenate once at the end: DataFrame.append was
        # removed in pandas 2.0. Empty results add no rows, so skip them.
        if not newfood.empty:
            food_parts.append(newfood)
        if not newprod.empty:
            prod_parts.append(newprod)

    newfooddf = _stack_frames(food_parts, fooddf.columns.values)
    newproddf = _stack_frames(prod_parts, proddf.columns.values)
    return newfooddf, newproddf