コード例 #1
0
def get_overlapping_data(df1, df2):
    """Select the rows of both frames that fall in their shared years/countries.

    The overlap is whatever ``overlap_in_years`` and ``overlap_in_countries``
    report for the two frames.

    Mind the return order: the first element is the selection taken from
    ``df2`` (filtered on the products linked to those in ``df1``), the second
    is the selection taken from ``df1`` (filtered on its own products).

    NOTE(review): ``get_data_selection`` is assumed to take
    ``(df, countries, years, products)`` positionally -- confirm against its
    definition.
    """
    own_products = df1._product.unique().tolist()
    counterpart_products = list(map(getLinkedProduct, own_products))
    shared_years = overlap_in_years(df1, df2)
    shared_countries = overlap_in_countries(df1, df2)
    from_df2 = get_data_selection(
        df2, shared_countries, shared_years, counterpart_products)
    from_df1 = get_data_selection(
        df1, shared_countries, shared_years, own_products)
    return from_df2, from_df1
コード例 #2
0
def get_overlapping_data(proddf, fooddf):
    """Restrict a production frame and a price frame to their common ground.

    Only price products for which ``getLinkedProduct`` returns something
    truthy are kept; the production side is filtered on the flattened list of
    those linked products. Both frames are additionally limited to the years
    and countries reported by ``overlap_in_years`` / ``overlap_in_countries``.

    Args:
        proddf: production data with a ``_product`` column.
        fooddf: price data with a ``_product`` column.

    Returns:
        ``(production_selection, price_selection)``.

    NOTE(review): assumes ``getLinkedProduct`` is a pure lookup returning an
    iterable of production product names (or a falsy value) -- confirm.
    """
    # Look every product up exactly once; the original queried the mapping
    # three times per product and filtered the same condition twice.
    product_links = [
        (p, getLinkedProduct(p)) for p in fooddf._product.unique().tolist()
    ]
    product_links = [(p, linked) for p, linked in product_links if linked]
    products = [p for p, _ in product_links]
    linked_products = [item for _, linked in product_links for item in linked]

    year_overlap = overlap_in_years(fooddf, proddf)
    country_overlap = overlap_in_countries(fooddf, proddf)
    newdf1 = get_data_selection(proddf, country_overlap, year_overlap, linked_products)
    newdf2 = get_data_selection(fooddf, country_overlap, year_overlap, products)
    return newdf1, newdf2
コード例 #3
0
def product_correlation(food_df):
    """Print the Spearman correlation of prices for neighbouring products.

    Products are taken in the order ``unique()`` yields them and each one is
    compared with its predecessor. A pair is skipped when the two selections
    do not have the same number of rows. Nothing is returned.
    """
    names = food_df._product.unique().tolist()
    for earlier, later in zip(names, names[1:]):
        later_rows = get_data_selection(food_df, None, None, [later])
        earlier_rows = get_data_selection(food_df, None, None, [earlier])
        if len(later_rows) != len(earlier_rows):
            continue

        rho, pval = spearmanr(later_rows['price'], earlier_rows['price'])
        print(rho, pval)
        print(later, earlier)
コード例 #4
0
def month_price_correlation(df):
    """Print product/year pairs whose price change increases with the month.

    For every product and every year present in ``df`` the Spearman
    correlation between ``month`` and ``price_change`` is computed; pairs with
    ``pval <= 0.05`` and ``corr > 0.5`` are printed as ``prod year corr``.

    Args:
        df: frame with ``year``, ``_product``, ``month`` and ``price_change``
            columns.
    """
    years = df.year.unique().tolist()
    for prod in df._product.unique().tolist():
        product_data = get_data_selection(df, None, None, [prod])
        for year in years:
            # Select from this product's rows. The original selected from the
            # full frame, leaving product_data unused, so every product
            # reported the same all-products correlation.
            year_data = get_data_selection(product_data, None, [year], None)
            # A rank correlation needs at least two observations.
            if len(year_data) < 2:
                continue
            corr, pval = spearmanr(year_data['month'], year_data['price_change'])
            if pval <= 0.05 and corr > 0.5:
                print(prod, year, corr)
コード例 #5
0
ファイル: region_kmeans.py プロジェクト: SW0BBR/davFoodPrices
def plot_all_data(region, food_data, prod_data):
    """Scatter production change against price change for one region.

    Both frames are narrowed with ``get_data_selection(frame, region)`` and
    aligned with ``align_products_and_years``. For each production product one
    scatter series is drawn: its ``value_change`` values against the
    ``price_change`` values of the price products linked to it. The figure is
    shown once all products have been added.
    """
    price_rows = get_data_selection(food_data, region)
    production_rows = get_data_selection(prod_data, region)
    aligned_food, aligned_prod = align_products_and_years(price_rows,
                                                          production_rows)
    for name in aligned_prod._product.unique().tolist():
        production_mask = aligned_prod['_product'] == name
        price_mask = aligned_food['_product'].isin(getPriceLinkedProduct(name))
        plt.scatter(aligned_prod.loc[production_mask]['value_change'],
                    aligned_food.loc[price_mask]['price_change'])
    plt.show()
コード例 #6
0
def list_significant_correlations(
        food_data, prod_data,
        linked_products_path='/home/student/Documents/Projecten/davFoodPrices/fooddatasets/linked_products.csv'):
    """Collect the (country, price product, production product) hits.

    The candidate combinations come from ``findBestProducts(10, ...)``. A
    combination is kept when ``compute_product_correlation`` returns a truthy
    result for it.

    Args:
        food_data: price data frame.
        prod_data: production data frame.
        linked_products_path: location of the ``;``-delimited linked-products
            CSV. Defaults to the path that used to be hard-coded, so existing
            callers are unaffected.

    Returns:
        List of ``(country, priceProduct, productionProduct)`` tuples.
    """
    linked_products = pd.read_csv(linked_products_path, delimiter=';')
    best_products = findBestProducts(10, linked_products, prod_data, food_data)
    sign_correlations = []
    for _, country, priceProduct, productionProduct in best_products:
        f_country_productdata = get_data_selection(
            food_data, [country], None, [priceProduct])
        p_country_productdata = get_data_selection(
            prod_data, [country], None, [productionProduct])
        corr = compute_product_correlation(
            country, priceProduct, f_country_productdata, p_country_productdata)
        if corr:
            sign_correlations.append((country, priceProduct, productionProduct))
    return sign_correlations
コード例 #7
0
def all_products_region_correlation(region, reg_name, food_data, prod_data):
    """Print the overall production/price correlation for one region.

    Both frames are narrowed to ``region``, then to the countries they share,
    and finally aligned with ``align_products_and_years``. The sizes of the
    aligned frames are always printed; the correlation pair and ``reg_name``
    are printed only when ``correlation`` returns a truthy result.
    """
    price_in_region = get_data_selection(food_data, region)
    production_in_region = get_data_selection(prod_data, region)

    shared = overlap_in_countries(price_in_region, production_in_region)
    aligned_food, aligned_prod = align_products_and_years(
        get_data_selection(price_in_region, shared),
        get_data_selection(production_in_region, shared),
    )
    print(len(aligned_food), len(aligned_prod))

    result = correlation(aligned_prod, aligned_food)
    if not result:
        return
    rho, pvalue = result
    print(rho, pvalue)
    print(reg_name)
コード例 #8
0
def list_significant_correlations(food_data, prod_data, best_products):
    """Return the product combinations that correlate, saving their data.

    Each entry of ``best_products`` is a 4-tuple whose last three items are
    the country, the price product and the production product. Both frames
    are narrowed to that combination and to their shared years, then handed to
    ``percentage_prod_price_correlation`` (with plotting enabled). On a truthy
    result the combination is recorded and the change columns are written to
    ``<country><priceProduct><productionProduct>.csv`` in the working
    directory.
    """
    significant = []
    for _rank, country, price_name, production_name in best_products:
        price_rows = get_data_selection(
            food_data, [country], None, [price_name])
        production_rows = get_data_selection(
            prod_data, [country], None, [production_name])

        shared_years = overlap_in_years(price_rows, production_rows)
        price_rows = get_data_selection(price_rows, None, shared_years, None)
        production_rows = get_data_selection(
            production_rows, None, shared_years, None)

        if not percentage_prod_price_correlation(
                production_rows, price_rows, plot=True):
            continue

        significant.append((country, price_name, production_name))
        # NOTE(review): without axis=1 the two series are stacked end to end
        # in a single column rather than written side by side -- confirm this
        # is what the consumer of the CSV expects.
        stacked = pd.concat(
            [production_rows['value_change'], price_rows['price_change']])
        stacked.to_csv(country + price_name + production_name + '.csv')

    return significant
コード例 #9
0
def region_correlation(
        region, r, food_data, prod_data,
        output_dir='/home/student/Documents/Projecten/davFoodPrices/machinelearning/question3/toPlotOnWebsite/region_corr_improved'):
    """Report strongly correlated production/price products for one region.

    For every production product in the region, the linked price products are
    averaged per year and country, aligned on years and correlated with the
    production changes. Each product name is printed; when the correlation is
    strong (``|rho| > 0.5``, ``pvalue < 0.05``, ``rho != 1.0``) a LaTeX-style
    table row is printed and the paired change columns are written to a CSV.

    Args:
        region: selection passed to ``get_data_selection`` as its second
            argument.
        r: region label used in the printed row and in the CSV file name.
        food_data: price data frame.
        prod_data: production data frame.
        output_dir: directory that receives the CSV files. Defaults to the
            location that used to be hard-coded.
    """
    region_pricedata = get_data_selection(food_data, region)
    region_proddata = get_data_selection(prod_data, region)
    overlap_countries = overlap_in_countries(region_pricedata, region_proddata)
    region_price = get_data_selection(region_pricedata, overlap_countries)
    region_production = get_data_selection(region_proddata, overlap_countries)

    reg_production, reg_price = get_overlapping_data(region_production, region_price)

    for product in reg_production._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        productprice = get_data_selection(reg_price, None, None, linked)

        productprod = get_data_selection(reg_production, None, None, [product])
        productprod, productprice = get_overlapping_data(productprod, productprice)

        productprice_averaged = year_country_average(productprice, 'price_change')
        productprod_averaged = year_country_average(productprod, 'value_change')

        newfood, newprod = align_years(productprice_averaged, productprod_averaged)
        corr = correlation(newprod, newfood)
        print(product)
        if not corr:
            continue

        rho, pvalue = corr
        if abs(rho) > 0.5 and pvalue < 0.05 and rho != 1.0:
            print(r, '&', product, '&', round(rho, 2), '&', pvalue, '\\\\')
            print('\\hline')
            df = pd.concat(
                [newprod['value_change'].reset_index(),
                 newfood['price_change'].reset_index()],
                axis=1).drop('index', axis=1)
            filename = r.replace(' ', '') + '_' + product.replace(' ', '') + 'correlation.csv'
            # Write through an explicit path instead of os.chdir(): changing
            # the working directory is process-wide and was left changed
            # whenever to_csv raised.
            df.to_csv(os.path.join(output_dir, filename))
コード例 #10
0
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-market price correlations of two products.

    For every market both products share, the ``price_change`` columns are
    correlated (Spearman) over the overlapping years, provided more than two
    years overlap. Only coefficients with ``p_value <= 0.05`` that are not NaN
    contribute to the mean.

    Returns:
        ``(mean_spearman, countries, len(countries))`` where ``countries`` is
        the country overlap of the two products. ``mean_spearman`` is NaN when
        no market produced a significant coefficient. When the products share
        no country the result is ``(nan, [], 0)``.
    """
    prod1_df = get_data_selection(df, products=[prod1])
    prod2_df = get_data_selection(df, products=[prod2])
    markets = overlap_in_markets(prod1_df, prod2_df)
    countries = overlap_in_countries(prod1_df, prod2_df)
    if len(countries) == 0:
        return (np.nan, [], 0)

    spearmans = []
    for market in markets:
        market_prod1_df = get_data_selection(prod1_df, markets=[market])
        market_prod2_df = get_data_selection(prod2_df, markets=[market])
        years = overlap_in_years(market_prod1_df, market_prod2_df)
        if len(years) <= 2:
            continue
        year_prod1_df = get_data_selection(market_prod1_df, years=years)
        year_prod2_df = get_data_selection(market_prod2_df, years=years)
        spearman, p_value = spearmanr(year_prod1_df['price_change'],
                                      year_prod2_df['price_change'])
        if p_value <= 0.05 and not isnan(spearman):
            spearmans.append(spearman)

    # np.mean([]) also yields NaN but emits RuntimeWarnings; make the
    # "nothing significant" case explicit instead.
    mean_spearman = np.mean(spearmans) if spearmans else np.nan
    return (mean_spearman, countries, len(countries))
コード例 #11
0
def year_average(fooddf, column):
    """Collapse ``fooddf`` to one row per year holding the mean of ``column``.

    For each year the first matching row is used as a template and its
    ``column`` value is replaced by the mean of ``column`` over all rows of
    that year; the other fields keep the values of that first row.

    Args:
        fooddf: frame with a ``year`` column and the column to average.
        column: name of the column to average.

    Returns:
        A new frame with the columns of ``fooddf`` and one row per year that
        produced a non-empty selection.
    """
    rows = []
    for year in fooddf.year.unique().tolist():
        year_data = get_data_selection(fooddf, None, [year], None)
        if year_data.empty:
            continue
        # Copy so the assignment below can never write back into year_data.
        row = year_data.iloc[0].copy()
        row[column] = year_data[column].mean()
        rows.append(row)
    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    return pd.DataFrame(rows, columns=fooddf.columns.values)
コード例 #12
0
ファイル: region_kmeans.py プロジェクト: SW0BBR/davFoodPrices
def plot_product_year_country(fooddf, proddf, country, years):
    """Plot production change against price change for one country.

    Both frames are narrowed to ``country`` and ``years`` and aligned with
    ``align_products_and_years``. The production product names are printed,
    then for each one with linked price products a scatter series is drawn of
    its ``value_change`` against the year/country-averaged ``price_change`` of
    the linked products; products without a link print ``fail``. After the
    scatter figure is shown, ``kmeans`` is run on the aligned frames with 3 as
    its third argument and ``plt.show()`` is called again.
    """
    country_food = get_data_selection(fooddf, [country], years, None)
    country_prod = get_data_selection(proddf, [country], years, None)
    aligned_food, aligned_prod = align_products_and_years(country_food,
                                                          country_prod)
    names = aligned_prod._product.unique().tolist()
    print(names)
    for name in names:
        price_names = getPriceLinkedProduct(name)
        if price_names:
            production_rows = aligned_prod.loc[aligned_prod['_product'] == name]
            price_rows = year_country_average(
                get_data_selection(aligned_food, None, None, price_names),
                'price_change')
            print(len(price_rows), len(production_rows))
            plt.scatter(production_rows['value_change'],
                        price_rows['price_change'])
        else:
            print('fail')
    plt.show()
    kmeans(aligned_food, aligned_prod, 3)
    plt.show()
コード例 #13
0
def align_X_Y_data(food_data, prod_data):
    """Pair each price row with the matching production value.

    For every row of ``food_data`` the production row with the same country
    and year and the product returned by ``getLinkedProduct`` is looked up.
    Rows without a match are skipped.

    Returns:
        A 2-column array of ``[production value, price]`` pairs, or ``False``
        (after printing ``multiple selected``) as soon as a lookup matches
        more than one production row.
    """
    X_data = []
    Y_data = []
    for _, row in food_data.iterrows():
        country = row['country']
        product = getLinkedProduct(row['_product'])
        year = row['year']
        prod_data_row = get_data_selection(prod_data, [country], [year], product)
        if prod_data_row.empty:
            continue
        if prod_data_row.shape[0] > 1:
            # An ambiguous match would make the pairing meaningless.
            print('multiple selected')
            return False

        X_data.append(prod_data_row['value'].values.item())
        Y_data.append(row['price'])
    return np.array([X_data, Y_data]).T
コード例 #14
0
def calc_product_correlation(df, prod1, prod2):
    """Average the significant per-country price correlations of two products.

    For every country both products share, the ``price_change`` columns are
    correlated (Spearman) over the overlapping years, provided more than four
    years overlap. Only coefficients with ``p_value <= 0.05`` that are not NaN
    contribute to the mean.

    Returns:
        The mean of the retained coefficients, or NaN when there are none.
    """
    prod1_df = get_data_selection(df, products=[prod1])
    prod2_df = get_data_selection(df, products=[prod2])

    spearmans = []
    for country in overlap_in_countries(prod1_df, prod2_df):
        country_prod1_df = get_data_selection(prod1_df, countries=[country])
        country_prod2_df = get_data_selection(prod2_df, countries=[country])
        years = overlap_in_years(country_prod1_df, country_prod2_df)
        if len(years) <= 4:
            continue
        year_prod1_df = get_data_selection(country_prod1_df, years=years)
        year_prod2_df = get_data_selection(country_prod2_df, years=years)
        spearman, p_value = spearmanr(year_prod1_df['price_change'],
                                      year_prod2_df['price_change'])
        if p_value <= 0.05 and not isnan(spearman):
            spearmans.append(spearman)

    # np.mean([]) also yields NaN but emits RuntimeWarnings; make the
    # "nothing significant" case explicit instead.
    return np.mean(spearmans) if spearmans else np.nan
コード例 #15
0
def align_products_and_years(fooddf, proddf):
    """Build row-aligned price and production frames, product by product.

    For every production product that has linked price products, both sides
    are narrowed to the countries and years they share and passed through
    ``align_years``. If the two sides then hold a different number of distinct
    products, the price side is averaged per year and country; if the row
    counts still differ, both are reduced with ``get_overlapping_data``. The
    per-product results are concatenated.

    Args:
        fooddf: price data with a ``_product`` column.
        proddf: production data with a ``_product`` column.

    Returns:
        ``(price_frame, production_frame)``.

    NOTE(review): assumes ``getPriceLinkedProduct`` is a pure lookup, so
    calling it once per product is equivalent to the former two calls.
    """
    # Seed each list with an empty frame carrying the input columns so the
    # result keeps those columns even when no product qualifies, exactly as
    # the former repeated DataFrame.append calls did.
    food_parts = [pd.DataFrame(columns=fooddf.columns.values)]
    prod_parts = [pd.DataFrame(columns=proddf.columns.values)]

    for product in proddf._product.unique().tolist():
        linked = getPriceLinkedProduct(product)
        if not linked:
            continue

        productprice = get_data_selection(fooddf, None, None, linked)
        productprod = get_data_selection(proddf, None, None, [product])

        countries = overlap_in_countries(productprice, productprod)
        productprice = get_data_selection(productprice, countries)
        productprod = get_data_selection(productprod, countries)

        overlapyears = overlap_in_years(productprice, productprod)
        productprice = get_data_selection(productprice, None, overlapyears)
        productprod = get_data_selection(productprod, None, overlapyears)

        newfood, newprod = align_years(productprice, productprod)
        # Several price products can map onto one production product.
        if len(newfood._product.unique()) != len(newprod._product.unique()):
            newfood = year_country_average(newfood, 'price_change')
        if len(newfood) != len(newprod):
            newprod, newfood = get_overlapping_data(newprod, newfood)

        food_parts.append(newfood)
        prod_parts.append(newprod)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(food_parts), pd.concat(prod_parts)
コード例 #16
0
def get_overlapping_data(df1, df2):
    """Cut both frames down to the years and countries they share.

    The shared years and countries are those reported by ``overlap_in_years``
    and ``overlap_in_countries``.

    Returns:
        A pair whose FIRST item is the selection from ``df2`` (restricted to
        the products linked to those of ``df1``) and whose SECOND item is the
        selection from ``df1`` (restricted to its own products).
    """
    df1_products = df1._product.unique().tolist()
    df2_products = list(map(getLinkedProduct, df1_products))
    common_years = overlap_in_years(df1, df2)
    common_countries = overlap_in_countries(df1, df2)
    selection_of_df2 = get_data_selection(
        df2, common_countries, common_years, df2_products)
    selection_of_df1 = get_data_selection(
        df1, common_countries, common_years, df1_products)
    return selection_of_df2, selection_of_df1

def product_correlation(food_df):
    """Print Spearman's rho and p-value for each pair of adjacent products.

    Adjacent means consecutive in the order given by ``unique()``. Pairs whose
    selections have different lengths are silently skipped. Returns nothing.
    """
    ordered = food_df._product.unique().tolist()
    for previous, current in zip(ordered, ordered[1:]):
        current_rows = get_data_selection(food_df, None, None, [current])
        previous_rows = get_data_selection(food_df, None, None, [previous])
        if len(current_rows) != len(previous_rows):
            continue

        rho, pval = spearmanr(current_rows['price'], previous_rows['price'])
        print(rho, pval)
        print(current, previous)




if __name__ == '__main__':
    # Script entry point: load both data sets and print the correlations
    # between adjacent products for India.
    food_df = pd.read_csv('/home/student/Documents/Projecten/davFoodPrices/fooddatasets/onlycountry_year_average_data.csv')
    prod_df = load_production_data()
    # get_overlapping_data(df1, df2) returns (selection of df2, selection of
    # df1), so the production-side frame comes first. The original unpacked
    # the two names the other way round.
    prod_rice, food_rice = get_overlapping_data(food_df, prod_df)
    product_correlation(get_data_selection(food_df, ['India'], None, None))
コード例 #17
0
    # NOTE(review): fragment of a larger function whose header is not part of
    # this excerpt; food_data and prod_data are defined outside it.
    # Each entry is (country, price-data product, production-data product).
    sign_cors = [('Senegal', 'Sorghum', 'Sorghum'), ('Burkina Faso', 'Maize', 'Maize'),\
     ('Tajikistan', 'Cabbage', 'Cabbages and other brassicas'), ('Tajikistan', 'Carrots',\
      'Carrots and turnips'), ('Tajikistan', 'Maize', 'Maize'), ('Tajikistan', 'Potatoes',\
       'Potatoes'), ('Tajikistan', 'Wheat', 'Wheat'), ('Guatemala', 'Maize (white)', 'Maize'), \
       ('Guatemala', 'Maize (yellow)', 'Maize'), ('Mali', 'Maize', 'Maize'), \
       ('Kenya', 'Beans (dry)', 'Beans, dry'), ('Kenya', 'Maize (white)', 'Maize'), \
       ('Kenya', 'Sorghum', 'Sorghum'), ('Peru', 'Potatoes', 'Potatoes'),\
        ('Tajikistan', 'Onions', 'Onions, dry'), ('Zambia', 'Maize (white)', 'Maize'),\
         ('Indonesia', 'Chili (green)', 'Chillies and peppers, green'), \
         ('Peru', 'Maize (local)', 'Maize')]

    # Split the triples into parallel lists (duplicates are kept).
    sign_countries = [x[0] for x in sign_cors]
    sign_priceProd = [x[1] for x in sign_cors]
    sign_prodProd = [x[2] for x in sign_cors]
    sign_years = overlap_in_years(food_data, prod_data)
    # Narrow both frames to the listed countries, the shared years and the
    # listed products of the respective data set.
    relevant_food = get_data_selection(food_data, sign_countries, sign_years, sign_priceProd)
    relevant_prod = get_data_selection(prod_data, sign_countries, sign_years, sign_prodProd)
    # print(get_data_selection(relevant_prod, ['Burkina Faso'], [2003], ['Maize']))
    # food_data = food_data.loc[(food_data['country'] == 'Burkina Faso') & ( food_data['_product'] == 'Maize')]
    # prod_data = food_data.loc[(prod_data['country'] == 'Burkina Faso') & ( prod_data['_product'] == 'Maize')]
    # regions() supplies four collections that are tested for country
    # membership below.
    europe, middle_east, asia, africa = regions()
    europe_cluster = []
    middle_east_cluster = []
    asia_cluster = []
    africa_cluster = []
    # Bucket the country of every triple by region. Only the europe and asia
    # branches are visible in this excerpt.
    for i in sign_cors:
        country = i[0]
        if country in europe:
            europe_cluster.append(country)
        elif country in asia:
            asia_cluster.append(country)
コード例 #18
0
def get_rice_related_products(df):
    """Return the rows of ``df`` whose product name contains "rice".

    The match is case-insensitive and the product names are passed to
    ``get_data_selection`` in their original spelling. The previous
    lower()/capitalize() round trip rewrote names such as "Rice (Basmati)" to
    "Rice (basmati)", which then failed to match any row.

    Args:
        df: frame with a ``_product`` column.
    """
    rice = [
        p for p in df._product.unique()
        # Skip non-string entries such as NaN instead of crashing on .lower().
        if isinstance(p, str) and 'rice' in p.lower()
    ]
    return get_data_selection(df, None, None, rice)