def get_overlapping_data(df1, df2):
    """Cut both frames down to the years and countries they have in common.

    Returns a 2-tuple whose order is the reverse of the arguments:
    the first element is the selection taken from ``df2`` (restricted to the
    linked counterparts of ``df1``'s products), the second is the selection
    taken from ``df1`` (restricted to its own products).

    NOTE(review): the actual row filtering is done by ``get_data_selection``
    and the product mapping by ``getLinkedProduct``; both live outside this
    block.
    """
    own_products = df1._product.unique().tolist()
    counterpart_products = list(map(getLinkedProduct, own_products))

    shared_years = overlap_in_years(df1, df2)
    shared_countries = overlap_in_countries(df1, df2)

    from_df2 = get_data_selection(
        df2, shared_countries, shared_years, counterpart_products)
    from_df1 = get_data_selection(
        df1, shared_countries, shared_years, own_products)
    return from_df2, from_df1
def get_overlapping_data(proddf, fooddf):
    """Restrict production and price data to their common products/years/countries.

    Only price products for which ``getLinkedProduct`` returns something
    truthy are kept; the linked production products of all of them are
    flattened into a single list.

    Args:
        proddf: production frame (needs a ``_product`` column for selection).
        fooddf: price frame with a ``_product`` column.

    Returns:
        ``(production_selection, price_selection)`` -- same order as the
        arguments.
    """
    products = []
    linked_products = []
    # Single pass: look each product up once instead of up to three times.
    for product in fooddf._product.unique().tolist():
        linked = getLinkedProduct(product)
        if linked:
            products.append(product)
            linked_products.extend(linked)

    year_overlap = overlap_in_years(fooddf, proddf)
    country_overlap = overlap_in_countries(fooddf, proddf)

    newdf1 = get_data_selection(
        proddf, country_overlap, year_overlap, linked_products)
    newdf2 = get_data_selection(
        fooddf, country_overlap, year_overlap, products)
    return newdf1, newdf2
def product_correlation(food_df):
    """Print the Spearman correlation of ``price`` for neighbouring products.

    Products are taken in the order ``food_df._product.unique()`` yields them
    and each one is compared with its predecessor.  A pair is only evaluated
    when both selections have the same number of rows.
    """
    names = food_df._product.unique().tolist()
    for previous, current in zip(names, names[1:]):
        current_rows = get_data_selection(food_df, None, None, [current])
        previous_rows = get_data_selection(food_df, None, None, [previous])
        if len(current_rows) != len(previous_rows):
            continue
        # NOTE(review): source indentation was lost; both prints are assumed
        # to belong to the equal-length branch.
        rho, pval = spearmanr(current_rows['price'], previous_rows['price'])
        print(rho, pval)
        print(current, previous)
def month_price_correlation(df):
    """Print product/year pairs whose price change rises with the month.

    For every product and every year present in ``df`` the Spearman
    correlation between ``month`` and ``price_change`` is computed; a line
    ``product year rho`` is printed when ``p <= 0.05`` and ``rho > 0.5``.

    Args:
        df: frame with ``year``, ``_product``, ``month`` and ``price_change``
            columns.
    """
    years = df.year.unique().tolist()
    for prod in df._product.unique().tolist():
        product_data = get_data_selection(df, None, None, [prod])
        for year in years:
            # Select from this product's rows.  Selecting from ``df`` (as the
            # code used to) ignores the product filter and yields the same
            # correlation for every product.
            year_data = get_data_selection(product_data, None, [year], None)
            # A rank correlation needs at least two observations; a product
            # may have no data at all for a given year.
            if len(year_data) < 2:
                continue
            corr, pval = spearmanr(year_data['month'],
                                   year_data['price_change'])
            if pval <= 0.05 and corr > 0.5:
                print(prod, year, corr)
def plot_all_data(region, food_data, prod_data):
    """Scatter production change against price change for one region.

    Both data sets are narrowed to ``region``, aligned with
    ``align_products_and_years`` and then, per production product, the
    ``value_change`` column is plotted against the ``price_change`` column of
    the price products returned by ``getPriceLinkedProduct``.
    """
    region_prices = get_data_selection(food_data, region)
    region_production = get_data_selection(prod_data, region)
    newfood, newprod = align_products_and_years(region_prices,
                                                region_production)

    for product in newprod._product.unique().tolist():
        production_rows = newprod.loc[newprod['_product'] == product]
        price_products = getPriceLinkedProduct(product)
        price_rows = newfood.loc[newfood['_product'].isin(price_products)]
        plt.scatter(production_rows['value_change'],
                    price_rows['price_change'])
    # NOTE(review): source indentation was lost; the figure is assumed to be
    # shown once, after all products have been added -- confirm.
    plt.show()
def list_significant_correlations(
        food_data, prod_data, *,
        linked_products_path='/home/student/Documents/Projecten/davFoodPrices/fooddatasets/linked_products.csv',
        top_n=10):
    """Return the (country, price product, production product) triples that correlate.

    The candidate triples come from ``findBestProducts``; a triple is kept
    when ``compute_product_correlation`` returns a truthy result for it.

    Args:
        food_data: price frame.
        prod_data: production frame.
        linked_products_path: ``;``-delimited CSV handed to
            ``findBestProducts``.  Defaults to the location that used to be
            hard-coded, so existing callers are unaffected.
        top_n: first argument forwarded to ``findBestProducts`` (was the
            literal ``10``).

    Returns:
        list of ``(country, priceProduct, productionProduct)`` tuples.
    """
    linked_products = pd.read_csv(linked_products_path, delimiter=';')
    best_products = findBestProducts(top_n, linked_products,
                                     prod_data, food_data)

    sign_correlations = []
    for _, country, priceProduct, productionProduct in best_products:
        f_country_productdata = get_data_selection(
            food_data, [country], None, [priceProduct])
        p_country_productdata = get_data_selection(
            prod_data, [country], None, [productionProduct])
        corr = compute_product_correlation(
            country, priceProduct,
            f_country_productdata, p_country_productdata)
        if corr:
            sign_correlations.append(
                (country, priceProduct, productionProduct))
    return sign_correlations
def all_products_region_correlation(region, reg_name, food_data, prod_data):
    """Print the production/price correlation over all products of a region.

    Prices and production are narrowed to ``region``, then to the countries
    both have, aligned with ``align_products_and_years`` and passed to
    ``correlation``.  The aligned row counts are always printed; the
    correlation result and ``reg_name`` only when ``correlation`` returns
    something truthy.
    """
    price_rows = get_data_selection(food_data, region)
    production_rows = get_data_selection(prod_data, region)
    shared_countries = overlap_in_countries(price_rows, production_rows)

    newfood, newprod = align_products_and_years(
        get_data_selection(price_rows, shared_countries),
        get_data_selection(production_rows, shared_countries),
    )
    print(len(newfood), len(newprod))

    corr = correlation(newprod, newfood)
    if not corr:
        return
    rho, pvalue = corr
    print(rho, pvalue)
    # NOTE(review): source indentation was lost; the region name is assumed
    # to be printed only together with a result.
    print(reg_name)
def list_significant_correlations(food_data, prod_data, best_products):
    """Collect correlating product pairs and dump their series to CSV.

    For each ``(_, country, priceProduct, productionProduct)`` entry of
    ``best_products`` the price and production rows are narrowed to that
    country/product and to the years both have.  When
    ``percentage_prod_price_correlation`` returns a truthy result the triple
    is recorded and a CSV named ``<country><priceProduct><productionProduct>.csv``
    is written to the current working directory.

    Args:
        food_data: price frame with a ``price_change`` column.
        prod_data: production frame with a ``value_change`` column.
        best_products: iterable of 4-tuples as described above.

    Returns:
        list of ``(country, priceProduct, productionProduct)`` tuples.
    """
    sign_correlations = []
    for _, country, priceProduct, productionProduct in best_products:
        price_rows = get_data_selection(
            food_data, [country], None, [priceProduct])
        production_rows = get_data_selection(
            prod_data, [country], None, [productionProduct])

        overlap_years = overlap_in_years(price_rows, production_rows)
        price_rows = get_data_selection(
            price_rows, None, overlap_years, None)
        production_rows = get_data_selection(
            production_rows, None, overlap_years, None)

        corr = percentage_prod_price_correlation(
            production_rows, price_rows, plot=True)
        if not corr:
            continue

        sign_correlations.append((country, priceProduct, productionProduct))
        # Put the two series side by side.  A plain concat stacks them into a
        # single unlabeled column, and the source frames carry unrelated
        # indexes, so those are reset to pair rows by position.
        # NOTE(review): assumes both selections are in the same year order.
        save_df = pd.concat(
            [production_rows['value_change'].reset_index(drop=True),
             price_rows['price_change'].reset_index(drop=True)],
            axis=1)
        save_df.to_csv(country + priceProduct + productionProduct + '.csv')
    return sign_correlations
def region_correlation(
        region, r, food_data, prod_data,
        output_dir='/home/student/Documents/Projecten/davFoodPrices/machinelearning/question3/toPlotOnWebsite/region_corr_improved'):
    """Report per-product production/price correlations for one region.

    For every production product of the region the linked price products are
    looked up, both sides are averaged with ``year_country_average``, aligned
    with ``align_years`` and correlated.  Each product name is printed; when
    ``|rho| > 0.5``, ``p < 0.05`` and ``rho != 1.0`` a LaTeX table row is
    printed as well and the paired series are written to
    ``<output_dir>/<r>_<product>correlation.csv`` (spaces stripped from both
    names).

    Args:
        region: country selection forwarded to ``get_data_selection``.
        r: region label used in the printed row and the file name.
        food_data: price frame.
        prod_data: production frame.
        output_dir: directory for the CSV files; defaults to the location
            that used to be hard-coded.
    """
    region_pricedata = get_data_selection(food_data, region)
    region_proddata = get_data_selection(prod_data, region)
    overlap_countries = overlap_in_countries(region_pricedata,
                                             region_proddata)
    region_price = get_data_selection(region_pricedata, overlap_countries)
    region_production = get_data_selection(region_proddata,
                                           overlap_countries)
    reg_production, reg_price = get_overlapping_data(region_production,
                                                     region_price)

    for product in reg_production._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        productprice = get_data_selection(reg_price, None, None, linked)
        productprod = get_data_selection(reg_production, None, None,
                                         [product])
        productprod, productprice = get_overlapping_data(productprod,
                                                         productprice)
        productprice_averaged = year_country_average(productprice,
                                                     'price_change')
        productprod_averaged = year_country_average(productprod,
                                                    'value_change')
        newfood, newprod = align_years(productprice_averaged,
                                       productprod_averaged)
        corr = correlation(newprod, newfood)
        print(product)
        if not corr:
            continue

        rho, pvalue = corr
        if abs(rho) > 0.5 and pvalue < 0.05 and rho != 1.0:
            print(r, '&', product, '&', round(rho, 2), '&', pvalue, '\\\\')
            print('\\hline')
            df = pd.concat(
                [newprod['value_change'].reset_index(drop=True),
                 newfood['price_change'].reset_index(drop=True)],
                axis=1)
            filename = (r.replace(' ', '') + '_'
                        + product.replace(' ', '') + 'correlation.csv')
            # Write via an explicit path rather than chdir-ing there and
            # back: a failing write then cannot leave the process in the
            # wrong working directory.
            df.to_csv(os.path.join(output_dir, filename))
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-market Spearman correlations of two products.

    For every market both products share, and only when they share more than
    two years there, the ``price_change`` series are correlated.  A
    coefficient counts when ``p <= 0.05`` and it is not NaN.

    Args:
        df: price frame with a ``price_change`` column.
        prod1, prod2: product names.

    Returns:
        ``(mean_rho, countries, n_countries)``.  ``mean_rho`` is NaN when no
        market produced a significant coefficient; ``(nan, [], 0)`` is
        returned when the products share no country.
    """
    prod1_df = get_data_selection(df, products=[prod1])
    prod2_df = get_data_selection(df, products=[prod2])
    markets = overlap_in_markets(prod1_df, prod2_df)
    countries = overlap_in_countries(prod1_df, prod2_df)
    if len(countries) == 0:
        return (np.nan, [], 0)

    spearmans = []
    for market in markets:
        market_prod1_df = get_data_selection(prod1_df, markets=[market])
        market_prod2_df = get_data_selection(prod2_df, markets=[market])
        years = overlap_in_years(market_prod1_df, market_prod2_df)
        if len(years) <= 2:
            continue
        year_prod1_df = get_data_selection(market_prod1_df, years=years)
        year_prod2_df = get_data_selection(market_prod2_df, years=years)
        spearman, p_value = spearmanr(year_prod1_df['price_change'],
                                      year_prod2_df['price_change'])
        if p_value <= 0.05 and not isnan(spearman):
            spearmans.append(spearman)

    # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit.
    mean_rho = np.mean(spearmans) if spearmans else np.nan
    return (mean_rho, countries, len(countries))
def year_average(fooddf, column):
    """Collapse ``fooddf`` to one row per year, averaging ``column``.

    For each year the first selected row is used as a template and its
    ``column`` value is replaced by the mean of ``column`` over that year's
    selection.  Years whose selection is empty are skipped.

    Args:
        fooddf: frame with a ``year`` column.
        column: name of the column to average.

    Returns:
        DataFrame with the columns of ``fooddf`` and one row per year.
    """
    averaged_rows = []
    for year in fooddf.year.unique().tolist():
        year_data = get_data_selection(fooddf, None, [year], None)
        if year_data.empty:
            continue
        # Copy so the template row can be modified without touching (or
        # warning about) the frame it was taken from.
        row = year_data.iloc[0].copy()
        row[column] = year_data[column].mean()
        averaged_rows.append(row)
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x
    # and re-copied the whole frame on every iteration.
    return pd.DataFrame(averaged_rows, columns=fooddf.columns.values)
def plot_product_year_country(fooddf, proddf, country, years):
    """Plot production change vs. price change for one country and year set.

    Both frames are narrowed to ``country``/``years`` and aligned.  For each
    production product with linked price products the averaged
    ``price_change`` is scattered against ``value_change``; products without
    a link print ``fail``.  Afterwards ``kmeans`` is run on the aligned
    frames with 3 as its third argument.
    """
    fooddf = get_data_selection(fooddf, [country], years, None)
    proddf = get_data_selection(proddf, [country], years, None)
    fooddf, proddf = align_products_and_years(fooddf, proddf)

    products = proddf._product.unique().tolist()
    print(products)
    for prod in products:
        linked = getPriceLinkedProduct(prod)
        if not linked:
            print('fail')
            continue
        production_rows = proddf.loc[proddf['_product'] == prod]
        price_rows = year_country_average(
            get_data_selection(fooddf, None, None, linked), 'price_change')
        print(len(price_rows), len(production_rows))
        plt.scatter(production_rows['value_change'],
                    price_rows['price_change'])
    # NOTE(review): source indentation was lost; the scatter figure is
    # assumed to be shown once after the loop -- confirm.
    plt.show()

    kmeans(fooddf, proddf, 3)
    plt.show()
def align_X_Y_data(food_data, prod_data):
    """Pair each price row with the matching production value.

    For every row of ``food_data`` the production rows for the same country,
    year and linked product are selected.  Rows without a match are skipped.

    Args:
        food_data: price frame with ``country``, ``_product``, ``year`` and
            ``price`` columns.
        prod_data: production frame with a ``value`` column.

    Returns:
        ``numpy`` array with one ``[value, price]`` row per matched price
        row, or ``False`` (after printing ``multiple selected``) as soon as
        one price row matches more than one production row.
    """
    X_data = []
    Y_data = []
    for _, row in food_data.iterrows():
        product = getLinkedProduct(row['_product'])
        prod_data_row = get_data_selection(
            prod_data, [row['country']], [row['year']], product)
        if prod_data_row.empty:
            continue
        if prod_data_row.shape[0] > 1:
            # Ambiguous match: callers receive False rather than an array.
            print('multiple selected')
            return False
        X_data.append(prod_data_row['value'].values.item())
        Y_data.append(row['price'])
    return np.array([X_data, Y_data]).T
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-country Spearman correlations of two products.

    For every country both products share, and only when they share more
    than four years there, the ``price_change`` series are correlated.  A
    coefficient counts when ``p <= 0.05`` and it is not NaN.

    Args:
        df: price frame with a ``price_change`` column.
        prod1, prod2: product names.

    Returns:
        Mean of the qualifying coefficients, or NaN when there are none.
    """
    prod1_df = get_data_selection(df, products=[prod1])
    prod2_df = get_data_selection(df, products=[prod2])

    spearmans = []
    for country in overlap_in_countries(prod1_df, prod2_df):
        country_prod1_df = get_data_selection(prod1_df, countries=[country])
        country_prod2_df = get_data_selection(prod2_df, countries=[country])
        years = overlap_in_years(country_prod1_df, country_prod2_df)
        if len(years) <= 4:
            continue
        year_prod1_df = get_data_selection(country_prod1_df, years=years)
        year_prod2_df = get_data_selection(country_prod2_df, years=years)
        spearman, p_value = spearmanr(year_prod1_df['price_change'],
                                      year_prod2_df['price_change'])
        if p_value <= 0.05 and not isnan(spearman):
            spearmans.append(spearman)

    # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit.
    return np.mean(spearmans) if spearmans else np.nan
def align_products_and_years(fooddf, proddf):
    """Align price and production data product by product.

    For every production product that has linked price products
    (``getPriceLinkedProduct``), both sides are narrowed to their common
    countries and years and passed through ``align_years``.  If the two
    results then differ in number of distinct products, the price side is
    averaged with ``year_country_average``; if they still differ in length
    they are passed through ``get_overlapping_data``.

    Args:
        fooddf: price frame.
        proddf: production frame.

    Returns:
        ``(aligned_food, aligned_prod)`` -- the per-product results
        concatenated, each starting from the column layout of its input.
    """
    # Seed frames keep the input column layout even when nothing aligns.
    food_parts = [pd.DataFrame(columns=fooddf.columns.values)]
    prod_parts = [pd.DataFrame(columns=proddf.columns.values)]

    for product in proddf._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        if not linked:
            continue

        productprice = get_data_selection(fooddf, None, None, linked)
        productprod = get_data_selection(proddf, None, None, [product])

        countries = overlap_in_countries(productprice, productprod)
        productprice = get_data_selection(productprice, countries)
        productprod = get_data_selection(productprod, countries)

        overlapyears = overlap_in_years(productprice, productprod)
        productprice = get_data_selection(productprice, None, overlapyears)
        productprod = get_data_selection(productprod, None, overlapyears)

        newfood, newprod = align_years(productprice, productprod)
        # Several price products can map onto one production product;
        # average them so both sides describe the same number of products.
        if len(newfood._product.unique()) != len(newprod._product.unique()):
            newfood = year_country_average(newfood, 'price_change')
        # NOTE(review): source indentation was lost; this check is assumed
        # to run independently of the one above.
        if len(newfood) != len(newprod):
            newprod, newfood = get_overlapping_data(newprod, newfood)

        food_parts.append(newfood)
        prod_parts.append(newprod)

    # One concat at the end: DataFrame.append no longer exists in pandas 2.x
    # and re-copied the accumulated frame on every iteration.
    return pd.concat(food_parts), pd.concat(prod_parts)
def get_overlapping_data(df1, df2):
    """Cut both frames down to the years and countries they have in common.

    The result order is the reverse of the arguments: first the selection
    taken from ``df2`` (restricted to the linked counterparts of ``df1``'s
    products), then the selection taken from ``df1`` (restricted to its own
    products).
    """
    own_products = df1._product.unique().tolist()
    counterpart_products = list(map(getLinkedProduct, own_products))

    shared_years = overlap_in_years(df1, df2)
    shared_countries = overlap_in_countries(df1, df2)

    from_df2 = get_data_selection(
        df2, shared_countries, shared_years, counterpart_products)
    from_df1 = get_data_selection(
        df1, shared_countries, shared_years, own_products)
    return from_df2, from_df1


def product_correlation(food_df):
    """Print the Spearman correlation of ``price`` for neighbouring products.

    Each product (in ``unique()`` order) is compared with its predecessor,
    but only when both selections have the same number of rows.
    """
    names = food_df._product.unique().tolist()
    for previous, current in zip(names, names[1:]):
        current_rows = get_data_selection(food_df, None, None, [current])
        previous_rows = get_data_selection(food_df, None, None, [previous])
        if len(current_rows) != len(previous_rows):
            continue
        # NOTE(review): source indentation was lost; both prints are assumed
        # to belong to the equal-length branch.
        rho, pval = spearmanr(current_rows['price'], previous_rows['price'])
        print(rho, pval)
        print(current, previous)


if __name__ == '__main__':
    food_df = pd.read_csv(
        '/home/student/Documents/Projecten/davFoodPrices/fooddatasets/onlycountry_year_average_data.csv')
    prod_df = load_production_data()
    # NOTE(review): get_overlapping_data returns (selection of its 2nd
    # argument, selection of its 1st), so these two names look swapped.
    # Neither is used below -- confirm before relying on them.
    food_rice, prod_rice = get_overlapping_data(food_df, prod_df)
    india_only = get_data_selection(food_df, ['India'], None, None)
    product_correlation(india_only)
# Significant (country, price product, production product) combinations.
sign_cors = [
    ('Senegal', 'Sorghum', 'Sorghum'),
    ('Burkina Faso', 'Maize', 'Maize'),
    ('Tajikistan', 'Cabbage', 'Cabbages and other brassicas'),
    ('Tajikistan', 'Carrots', 'Carrots and turnips'),
    ('Tajikistan', 'Maize', 'Maize'),
    ('Tajikistan', 'Potatoes', 'Potatoes'),
    ('Tajikistan', 'Wheat', 'Wheat'),
    ('Guatemala', 'Maize (white)', 'Maize'),
    ('Guatemala', 'Maize (yellow)', 'Maize'),
    ('Mali', 'Maize', 'Maize'),
    ('Kenya', 'Beans (dry)', 'Beans, dry'),
    ('Kenya', 'Maize (white)', 'Maize'),
    ('Kenya', 'Sorghum', 'Sorghum'),
    ('Peru', 'Potatoes', 'Potatoes'),
    ('Tajikistan', 'Onions', 'Onions, dry'),
    ('Zambia', 'Maize (white)', 'Maize'),
    ('Indonesia', 'Chili (green)', 'Chillies and peppers, green'),
    ('Peru', 'Maize (local)', 'Maize'),
]

sign_countries = [x[0] for x in sign_cors]
sign_priceProd = [x[1] for x in sign_cors]
sign_prodProd = [x[2] for x in sign_cors]

sign_years = overlap_in_years(food_data, prod_data)
relevant_food = get_data_selection(
    food_data, sign_countries, sign_years, sign_priceProd)
relevant_prod = get_data_selection(
    prod_data, sign_countries, sign_years, sign_prodProd)

# Sort the significant countries into their regions.  A country is added
# once per entry in sign_cors, so it can occur several times in a cluster.
europe, middle_east, asia, africa = regions()
europe_cluster = []
middle_east_cluster = []
asia_cluster = []
africa_cluster = []
for country in sign_countries:
    if country in europe:
        europe_cluster.append(country)
    elif country in asia:
        asia_cluster.append(country)
    # The Middle East and Africa clusters were created but never filled;
    # their branches come last so the europe/asia outcome is unchanged.
    elif country in middle_east:
        middle_east_cluster.append(country)
    elif country in africa:
        africa_cluster.append(country)
def get_rice_related_products(df):
    """Return the selection of ``df`` for products whose name mentions rice.

    The match is a case-insensitive substring test on the values of the
    ``_product`` column.  The product names are handed to
    ``get_data_selection`` exactly as they appear in ``df``.

    Args:
        df: frame with a string-valued ``_product`` column.

    Returns:
        Whatever ``get_data_selection`` returns for those products.
    """
    # Compare in lower case but keep the original spelling: rebuilding the
    # name with lower().capitalize() turns e.g. 'Rice (imported, Indian)'
    # into 'Rice (imported, indian)', which no longer equals the value in
    # the frame.
    rice = [p for p in df._product.unique() if 'rice' in p.lower()]
    return get_data_selection(df, None, None, rice)