Esempio n. 1
0
def plotResult(ds, save=True):
    """Draw a box plot of the ``diff`` column of one dataset's result CSV.

    Reads ``<results dir>/<ds>.csv`` (``;``-separated), drops the rows whose
    ``bf_type`` is ``'BBLIP'`` and plots ``diff`` against ``p`` with one box
    per ``bf_type``; outliers are not drawn.

    :param ds: dataset name, used as the CSV base name and inside the PNG
        file name.
    :param save: when true (the default) the figure is written to
        ``sns_bp_<ds>.png`` in the results directory.  Previously this flag
        was accepted but ignored and the file was always written.
    :return: None
    """
    results_dir = getbase_dir('results')
    df = pd.read_csv(results_dir + ds + '.csv', sep=';')
    df = df[df.bf_type != 'BBLIP']

    fig, ax = plt.subplots(figsize=(10, 6))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" state of pyplot.
    sns.boxplot(data=df, x='p', y='diff', hue='bf_type', showfliers=False,
                ax=ax)
    ax.set_title("")
    ax.set_ylabel("Diferença em %")
    # plt.show()
    if save:
        fig.savefig(results_dir + 'sns_bp_' + ds + '.png')
def readResultsSPF02():
    """Load every ``msplit_<name>.csv`` result file into one data frame.

    Each file is read from the directory returned by
    ``getbase_dir(['results', 'sbf_02_data'])`` (``;``-separated), tagged
    with a ``ds`` column and stripped of the rows whose ``bf_type`` is
    ``'BBLIP'``.  The per-file frames are concatenated once at the end, in
    the order listed below, keeping each file's original row index.

    :return: the concatenated ``pandas.DataFrame``.
    """
    data_dir = getbase_dir(['results', 'sbf_02_data'])

    # (file stem, value written to the ``ds`` column).  The first file is
    # named "bikes" but has always been labelled "bike"; that label is kept
    # so existing consumers of the ``ds`` column see the same values.
    sources = [
        ('bikes', 'bike'),
        ('beer', 'beer'),
        ('books1', 'books1'),
        ('eletronics', 'eletronics'),
        ('movies1', 'movies1'),
        ('music', 'music'),
        ('restaurants1', 'restaurants1'),
    ]

    frames = []
    for stem, label in sources:
        frame = pd.read_csv(data_dir + 'msplit_' + stem + '.csv', sep=';')
        frame['ds'] = label
        frames.append(frame[frame.bf_type != 'BBLIP'])

    # A single concat avoids re-copying the accumulated frame on every file.
    return pd.concat(frames)
def processMultisplit(datadir, basename, e1_fields, e2_fields, bflen):
    """Run ``parallel_compare_multisplit`` for a range of split counts.

    The data set is encrypted once with ``encrypt_data(..., set_p=0.5)``;
    the comparison is then dispatched to a pool of 4 worker processes, one
    task per split count ``s`` in ``1 .. max_split - 1``.  The per-task
    frames are stacked and written to ``msplit_<datadir>.csv``
    (``;``-separated) in the results directory.

    :param datadir: dataset directory name; also used in the output file
        name.
    :param basename: forwarded to ``encrypt_data``.
    :param e1_fields: forwarded to ``encrypt_data``.
    :param e2_fields: forwarded to ``encrypt_data``.
    :param bflen: Bloom filter capacity, used both for the probe filter that
        determines ``max_split`` and for ``encrypt_data``.
    :return: None
    :raises IndexError: if the split range is empty (no task produced a
        result), as before.
    """
    # The context manager guarantees the worker processes are torn down even
    # if encryption or one of the tasks raises; the pool used to be leaked.
    with mp.Pool(processes=4) as pool:
        # at most 8 bits per split
        b = BloomFilter(cap=bflen)
        max_split = round(np.log2(b.bit_size)) - 2

        ed = encrypt_data(datadir,
                          basename,
                          e1_fields,
                          e2_fields,
                          bflen,
                          set_p=0.5)

        pending = [
            pool.apply_async(parallel_compare_multisplit, args=(ed, s))
            for s in np.arange(1, max_split, 1)
        ]
        # Collect inside the ``with`` block: leaving it terminates the pool.
        output = [task.get() for task in pending]

    df = output[0]
    if len(output) > 1:
        # DataFrame.append was removed in pandas 2.0; one concat yields the
        # same stacked, re-indexed frame.
        df = pd.concat(output, ignore_index=True)

    # df['diff'] = abs(df.full - df.sbf_sim) * 100
    # ax = df.boxplot('diff',by='p',rot=30)
    df.to_csv(getbase_dir('results') + "msplit_" + datadir + '.csv', sep=';')
    print('Done ' + datadir + "!")
def plot_all_ds_considering_split_number(df,dash_styltes):
    """Plot the mean similarity error against the number of splits.

    One line is drawn per ``bf_type`` (colour) and ``similarity`` (line
    style) from the ``splits`` and ``mean_erro`` columns.  The x axis is
    log-scaled with major ticks at powers of two printed as plain numbers.
    The figure is shown and then saved as
    ``zz_all_ds_considering_split_number.png`` (300 dpi) in the directory
    returned by ``getbase_dir(['results','sbf_02b'])``.

    The original note on this function reads "Equation 01 of section 2".

    :param df: data frame with the columns ``splits``, ``mean_erro``,
        ``bf_type`` and ``similarity``.
    :param dash_styltes: dash patterns forwarded to seaborn's ``dashes``
        argument.  The (misspelled) parameter name is kept for
        compatibility; the body previously read an unrelated name
        ``dash_styles`` and ignored this argument.
    :return: None
    """
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))

    sns.lineplot(data=df, x='splits', y='mean_erro', hue='bf_type',
                 style='similarity', dashes=dash_styltes, ax=ax)
    ax.set_xscale('log')
    # ax.set_yscale('log')
    # ``numdecs`` is no longer passed: matplotlib deprecated it as having no
    # effect and newer releases reject it.
    ax.xaxis.set_major_locator(
        ticker.LogLocator(base=2.0, subs=(1.0, ), numticks=None))
    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())

    # ax.axhline(0.01, ls='--')

    ax.set_title("Similarity Error")
    # plt.ylabel("Error (\u03B5)")
    ax.set_ylabel("Error")
    ax.set_xlabel("Number of splits")

    plt.show()
    fig.savefig(getbase_dir(['results','sbf_02b']) + "zz_all_ds_considering_split_number.png", dpi=300)
def exponential_regression2var(func_exp,x_data, y_data, xg, yg , eq_label=r'$f(x) = {:.2f} * ln( {:.2f} * x) + {:.2f}$'):
    """Fit ``func_exp`` to ``(x_data, y_data)`` and plot the fit on the current figure.

    The fit uses ``scipy.optimize.curve_fit`` with the starting point
    ``(-1, 0.01, 0)``, so ``func_exp`` must accept three parameters after
    ``x``.  The points ``(xg, yg)`` are drawn as markers, the fitted curve
    is drawn over ``x_data``, and the RMSE between ``yg`` and the fitted
    values at ``xg`` is appended to the legend entry.  The figure is shown
    and saved as ``new_estimated_sbf_erro.png`` (300 dpi).

    :param func_exp: model ``f(x, p0, p1, p2)``.
    :param x_data: x values used for fitting and for drawing the curve.
    :param y_data: y values used for fitting.
    :param xg: x values of the plotted points, also used for the RMSE.
    :param yg: y values of the plotted points, also used for the RMSE.
    :param eq_label: legend template formatted with the fitted parameters.
    :return: the fitted parameters as returned by ``curve_fit``.
    """
    figure = plt.gcf()

    initial_guess = (-1, 0.01, 0)
    popt, _covariance = scipy.optimize.curve_fit(
        func_exp, x_data, y_data, p0=initial_guess)
    print(popt)

    # Observed points.
    plt.plot(xg, yg, 'x', color='xkcd:maroon', label="data")

    # Goodness of fit, measured on the plotted points.
    fitted_at_points = func_exp(xg, *popt)
    rmse = np.sqrt(mean_squared_error(yg, fitted_at_points))
    legend_text = eq_label.format(*popt) + ", rmse = {:.3f}".format(rmse)

    # Fitted curve.
    plt.plot(x_data, func_exp(x_data, *popt), color='xkcd:teal',
             label=legend_text)

    plt.legend()
    plt.title("Estimated Error in SBF")
    plt.xlabel("$x=\\frac{s}{l}$")
    plt.ylabel('Error')
    plt.show()

    target = getbase_dir(['results', 'sbf_02b']) + "new_estimated_sbf_erro.png"
    figure.savefig(target, dpi=300)

    return popt
def plot_all_ds_considering_percent(df,dash_styltes):
    """Plot ``mean_dist_of_real`` against ``x`` with one line per ``ds``.

    The x axis gets major ticks every 0.1 and minor ticks every 0.05.  The
    figure is shown and then saved as ``zz_all_ds_error_bit_percent.png``
    (300 dpi) in the directory returned by
    ``getbase_dir(['results','sbf_02b'])``.

    :param df: data frame with the columns ``x``, ``mean_dist_of_real`` and
        ``ds``.
    :param dash_styltes: accepted but not used by this function.
    :return: None
    """
    sns.set_style("whitegrid")
    figure, axis = plt.subplots(figsize=(10, 6))

    sns.lineplot(data=df,
                 x='x',
                 y='mean_dist_of_real',
                 hue='ds',
                 dashes=[(2, 2)],
                 ax=axis)

    x_axis = axis.xaxis
    x_axis.set_major_locator(ticker.MultipleLocator(0.1))
    x_axis.set_minor_locator(ticker.MultipleLocator(0.05))

    axis.set_title("SBF Split Error")
    axis.set_ylabel("Mean Error (\u03B5)")
    axis.set_xlabel("Split length in % of original filter")

    plt.show()

    out_dir = getbase_dir(['results','sbf_02b'])
    figure.savefig(out_dir + "zz_all_ds_error_bit_percent.png", dpi=300)
Esempio n. 7
0
def compileContract(file, lib=None, ldlib=None, file_path="Contracts"):
    """Compile a contract file through ``compile_standard``.

    The directory is resolved with ``getbase_dir(file_path)``, the compiler
    input is built by ``get_input_json`` and the solc version is pinned to
    ``v0.5.4`` before compiling.

    :param file: contract file name, forwarded to ``get_input_json``.
    :param lib: forwarded to ``get_input_json``.
    :param ldlib: forwarded to ``get_input_json``.
    :param file_path: directory key passed to ``getbase_dir``.
    :return: whatever ``compile_standard`` returns.
    """
    contracts_dir = getbase_dir(file_path)
    standard_input = get_input_json(contracts_dir, file, lib, ldlib)

    set_solc_version('v0.5.4')
    compiled = compile_standard(standard_input, allow_paths=contracts_dir)
    return compiled
def plot_sbfError(df):
    """Box-plot the ``diff`` column against ``p``, grouped by ``bf_type``.

    Outliers are not drawn.  The figure is shown and then saved as
    ``sbf_error_all_ds.png`` in the results directory.

    :param df: data frame with the columns ``p``, ``diff`` and ``bf_type``.
    :return: None
    """
    sns.set(style="whitegrid")
    figure, axis = plt.subplots(figsize=(10, 6))

    sns.boxplot(data=df,
                x='p',
                y='diff',
                hue='bf_type',
                showfliers=False,
                ax=axis)
    axis.set_title("SBF Error")
    axis.set_ylabel("Error in %")

    plt.show()

    target = getbase_dir('results') + "sbf_error_all_ds.png"
    figure.savefig(target)
Esempio n. 9
0
def encrypt_data2(datadir,
                  basename,
                  e1_fields,
                  bflen,
                  fp=0.01,
                  ngrams=2,
                  lpower=256,
                  enc='utf-8',
                  set_p=None):
    """Encode the selected columns of a CSV file with ``encryptData``.

    The file ``basename`` is read from the directory returned by
    ``getbase_dir(['Datasets', datadir])``.  The first row is treated as a
    header and skipped.  For every other row the columns listed in
    ``e1_fields`` are concatenated (in that order) and passed to
    ``encryptData``; the result is paired with the row's first column.

    Rows that raise ``IndexError`` (for example because they have fewer
    columns than ``e1_fields`` requires) are printed together with
    ``e1_fields`` and skipped.

    :param datadir: dataset directory name below ``Datasets``.
    :param basename: CSV file name inside that directory.
    :param e1_fields: column indexes whose values are concatenated.
    :param bflen: forwarded to ``encryptData`` as its second argument.
    :param fp: forwarded to ``encryptData`` as ``fp``.
    :param ngrams: forwarded to ``encryptData`` as ``n``.
    :param lpower: forwarded to ``encryptData`` as ``bpower``.
    :param enc: text encoding of the file; undecodable bytes are replaced.
    :param set_p: when not ``None``, forwarded to ``encryptData`` as ``p``;
        otherwise ``p`` is not passed at all.
    :return: list of ``[first column, encryptData(...)]`` pairs.
    """
    base_dir = getbase_dir(['Datasets', datadir])  # + os.sep

    # Only forward ``p`` when the caller supplied one, so a single call site
    # covers both cases.
    optional_kwargs = {} if set_p is None else {'p': set_p}

    rows = []
    with open(base_dir + basename, encoding=enc, errors='replace') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header row, if any
        for row in csv_reader:
            try:
                dbf1 = row[e1_fields[0]]
                for i in e1_fields[1:]:
                    dbf1 = dbf1 + row[i]

                erow = [
                    row[0],
                    encryptData(dbf1,
                                bflen,
                                n=ngrams,
                                fp=fp,
                                bpower=lpower,
                                **optional_kwargs)
                ]
            except IndexError:
                # Best effort: report the offending row and carry on.
                print(row)
                print(e1_fields)
            else:
                rows.append(erow)
    return rows
def plot_erroInSBFParts(pdf):
    """Notched box plot of the ``temp`` column per ``ds``, grouped by ``part``.

    Outliers are not drawn.  The figure is shown and then saved as
    ``sbf_parts_error_.png`` in the results directory.

    :param pdf: data frame with the columns ``ds``, ``temp`` and ``part``.
    :return: None
    """
    sns.set(style="whitegrid")
    figure, axis = plt.subplots(figsize=(10, 6))

    box_options = dict(x='ds', y='temp', hue='part',
                       showfliers=False, notch=True)
    sns.boxplot(data=pdf, ax=axis, **box_options)

    axis.set_title("SBF Error in splits")
    axis.set_ylabel("Error in %")

    plt.show()

    target = getbase_dir('results') + "sbf_parts_error_.png"
    figure.savefig(target)
def process(datadir, basename, e1_fields, e2_fields, bflen):
    """Run ``parallel_compare`` for ``set_p`` = 0.1 ... 0.9 and plot the error.

    One task per ``set_p`` value is dispatched to a pool of 8 worker
    processes.  The resulting frames are stacked, a ``diff`` column holding
    ``abs(full - sbf_sim) * 100`` is added, a box plot of ``diff`` grouped
    by ``p`` and ``bf_type`` is saved as ``<datadir>.png`` and the data is
    written to ``<datadir>.csv`` (``;``-separated), both in the results
    directory.

    :param datadir: forwarded to ``parallel_compare``; also the base name of
        the output files.
    :param basename: forwarded to ``parallel_compare``.
    :param e1_fields: forwarded to ``parallel_compare``.
    :param e2_fields: forwarded to ``parallel_compare``.
    :param bflen: forwarded to ``parallel_compare``.
    :return: None
    """
    # The context manager guarantees the worker processes are torn down even
    # if one of the tasks raises; the pool used to be leaked.
    with mp.Pool(processes=8) as pool:
        pending = [
            pool.apply_async(parallel_compare,
                             args=(datadir, basename, e1_fields, e2_fields,
                                   bflen),
                             kwds={'set_p': p})
            for p in np.arange(0.1, 1.0, 0.1)
        ]
        # Collect inside the ``with`` block: leaving it terminates the pool.
        output = [task.get() for task in pending]

    # DataFrame.append was removed in pandas 2.0; one concat yields the same
    # stacked, re-indexed frame.
    df = pd.concat(output, ignore_index=True)

    df['diff'] = abs(df.full - df.sbf_sim) * 100

    # ax = df.boxplot('diff',by='p',rot=30)
    ax = df.boxplot('diff', by=['p', 'bf_type'], rot=90, figsize=(18, 10))
    fig = ax.get_figure()
    plt.title("")
    plt.xlabel("")
    plt.ylabel("Diferença em %")
    results_dir = getbase_dir('results')
    fig.savefig(results_dir + datadir + '.png')
    df.to_csv(results_dir + datadir + '.csv', sep=';')
    print('Done ' + datadir + "!")
def plot_error_epsilon_distribution(z,marcas=[1, 3, 5, 6],fs=(12, 6)):
    """Plot the distribution of the error for a selection of split sizes.

    A column ``nd = (full - psim_mean) * 100`` is added to ``z`` (the frame
    passed in is modified).  The distinct values of ``z.x`` at the positions
    ``marcas`` are selected, and for each of them a histogram of ``nd`` with
    a fitted Laplace density is drawn in its own panel.  Panels are laid out
    in two rows, filled left to right and top to bottom.  The figure is
    shown and saved as ``zz_p_error_episilon.png`` (300 dpi).

    The grid now has ``ceil(n / 2)`` columns and is always indexed as a 2-D
    array, so an odd number of selections (or only one or two) no longer
    fails with an indexing error; a panel left without data is hidden.  For
    an even number of selections (four or more) the layout is unchanged.

    :param z: data frame with the columns ``full``, ``psim_mean`` and ``x``.
    :param marcas: positions into ``z.x.unique()`` selecting the split sizes
        to plot.  The default list is never modified.
    :param fs: figure size passed to ``plt.subplots``.
    :return: None
    """
    z['nd'] = (z.full - z.psim_mean)*100

    labels = list(z.x.unique()[marcas])
    n_cols = max(1, (len(labels) + 1) // 2)
    # squeeze=False keeps ``axes`` two-dimensional even for a single column.
    fig, axes = plt.subplots(2, n_cols, figsize=fs,
                             constrained_layout=True, squeeze=False)
    fig.suptitle("\u03B5-Error Distribution",y=1.05)

    # ``axes.flat`` walks the grid row by row, which is exactly the order in
    # which the panels are filled.
    panels = list(axes.flat)
    for position, (eixo, split) in enumerate(zip(panels, labels)):
        eixo.set_title('Split of {:.2%}'.format(split))
        # NOTE(review): seaborn deprecated distplot in 0.11; kept because
        # replacing it would change the rendered output.
        sns.distplot(z[z.x == split].nd, fit=st.laplace,
                     label='length={}'.format(position), kde=False,
                     ax=eixo)

    for ax1 in panels:
        ax1.set(xlabel='error')
    for unused in panels[len(labels):]:
        unused.set_visible(False)

    plt.show()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "zz_p_error_episilon.png", dpi=300)
def plot_episilon_approximation(a,b):
    """Plot the fitted error expression next to a simplified approximation.

    Two curves are evaluated at 50 points in ``[0.00001, 0.25]``:

    * ``-ln(a * ln(b * x))`` with the constants set below, and
    * ``ln(1 / ln(1 / x)) + 3.7``.

    The figure is shown and saved as ``episilon_estimation.png`` (400 dpi)
    in the directory returned by ``getbase_dir(['results', 'sbf_02b'])``.

    :param a: see the note below; currently overridden.
    :param b: see the note below; currently overridden.
    :return: None
    """
    # NOTE(review): the arguments are overwritten with these constants, so
    # whatever the caller passes is ignored.  Left as is because changing it
    # would alter the plot for existing callers; confirm whether the
    # override is intentional.
    a=-0.042876301194393125
    b=3.2574724870013103
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(5, 4))

    fe = lambda x, a , b: -1 * np.log(a * np.log(b * x))  # full expression
    as1 = lambda x: ( np.log(1/np.log(1/x)) ) + 3.7

    p = np.linspace(0.00001, .25, num=50)

    fe_label = r'$ln(\frac{1}{a * ln(b*x)})$'
    # NOTE(review): this legend text says c=2 while ``as1`` adds 3.7;
    # the string is kept unchanged, confirm which value is intended.
    as1_label = r'$ln(\frac{1}{ln(\frac{1}{x})})+ c ,  c=2$'

    data = []
    for xp in p:
        data.append((xp, fe(xp, a, b), fe_label))
        data.append((xp, as1(xp), as1_label))
    labels = ['x', 'y', 'function']

    r = pd.DataFrame.from_records(data, columns=labels)
    sns.lineplot(data=r,x='x',y='y',hue='function', ax=ax)

    # Raw string: "\ " is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.  The
    # resulting text is unchanged.
    plt.title(r"$\epsilon\ estimation$")
    plt.xlabel("splits size$(\\frac{s}{l})$")
    plt.ylabel(r'$\epsilon$')

    plt.show()
    # Save before closing so the figure is still fully managed when written.
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "episilon_estimation.png", dpi=400)
    plt.close()
def plot_summaryByDataset(rdf):
    """Scatter-plot ``median_error`` against a jittered ``p`` for every dataset.

    Points are coloured by ``bf_type`` and styled by ``dataset``; the legend
    is placed below the axes.  The figure is shown and saved as
    ``erro_in_all_ds_.png`` in the results directory.

    NOTE(review): the statements after the ``savefig`` call read ``df``,
    which is never assigned before its first use inside this function, so
    as written they raise ``UnboundLocalError``.  They look like a fragment
    of a different routine that ended up in this body - confirm and split.

    :param rdf: data frame; the plot reads the columns ``p``,
        ``median_error``, ``bf_type`` and ``dataset``.  ``rdf.p`` is
        overwritten in place.
    """

    # jitter x
    def f(g):
        # Draw one normal sample centred on the value with std-dev 0.03.
        return np.random.normal(g, 0.03)

    def g(x):
        # NOTE(review): passes the function ``g`` itself as the mean instead
        # of ``x``; this helper is only referenced from commented-out code.
        return abs(np.random.normal(g, 0.03))

    # NOTE(review): ``rdf.np`` reads an attribute/column called ``np`` on the
    # frame; presumably ``rdf.p`` was intended - verify against the data.
    rdf.p = rdf.np.apply(f)
    # rdf['median_error'] = rdf['median_error'].apply(g)
    # cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    ax = sns.scatterplot(x="p",
                         y="median_error",
                         hue="bf_type",
                         alpha=0.8,
                         x_jitter=True,
                         s=150,
                         style='dataset',
                         palette="Set2",
                         data=rdf)

    # Rebuild the legend from the plotted handles so it can be moved below
    # the axes and laid out in three columns.
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles,
                    labels,
                    loc='upper center',
                    bbox_to_anchor=(0.9, -0.1),
                    ncol=3)
    # ax.legend(frameon=True, loc='lower center', ncol=4)
    # text = ax.text(-0.2,1.05, "Aribitrary text", transform=ax.transAxes)
    import matplotlib.ticker as ticker
    # Major x ticks every 0.1.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.1))
    ax.set_title("SBF Error")
    ax.set_ylabel("Median error in %")
    ax.grid('on')
    plt.tight_layout()
    plt.show()
    fig.savefig(getbase_dir('results') + 'erro_in_all_ds_.png')
    # ``b`` is not used by any statement visible below.
    b = BloomFilter(cap=96)

    # answer key (gold standard); ``bases`` is not used below either
    bases = ['bikes','beer', 'books1', 'eletronics', 'movies1', 'music', 'restaurants1']




    # Drop rows whose id columns hold the literal header names.
    df = df[df.id_a != 'ltable._id']
    df = df[df.id_b != 'rtable._id']
    df = df.round(2)

    # For each dataset: select its 'BBF' rows and print the path of its
    # labelled-data file.  Only the values from the last iteration survive
    # the loop.
    for datadir in df.ds.unique():
        dsg = df[(df.ds == datadir) & (df.bf_type == 'BBF')]
        base_dir = getbase_dir(['Datasets', datadir ])  # + os.sep
        gab_files = base_dir + 'labeled_data.csv'
        print(gab_files)

# NOTE(review): ``dsg`` and ``gab_files`` are only assigned inside the
# function body above in the visible code; at module level they must come
# from somewhere else (e.g. an interactive session) - confirm.

# Convert both id columns to numeric values so they can be compared with the
# ids read from the labelled-data file.
dsg.id_a = pd.to_numeric(dsg.id_a)
dsg.id_b = pd.to_numeric(dsg.id_b)

# Labelled pairs; the first 5 lines of the file are skipped.
gs = pd.read_csv(gab_files,skiprows=5)

# ``r0`` and ``r1`` are created here but not filled by the visible code.
r0 = []
r1 = []
for index, row in dsg.iterrows():
    aid = row.id_a
    bid = row.id_b
    # Exactly one labelled row for this (left id, right id) pair with gold == 1.
    if len(gs[(gs['ltable._id'] == aid) & (gs['rtable._id'] == bid) & (gs.gold == 1)]) == 1:
        s1 = row.sbf_sim
######################################################################################################

# Absolute gap between the ``full`` and ``psim_median`` columns.
df['py'] = abs(df.full - df.psim_median)
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
# One box of that gap per value of ``splits``.
sns.boxplot(data=df, x='splits', y='py')
#ax.set_xscale('log')
#ax.set_yscale('log')
#plt.axvline(7, 0.05 ,3,color='red')
#ax.set(xscale="log")
# Major y ticks every 0.1.
ax.yaxis.set_major_locator(ticker.MultipleLocator(0.1))
plt.title("SBF Split Error in ")
plt.ylabel("Error")
plt.show()
fig.savefig(getbase_dir('results') + "zz_erro_incease_split.png")

# Second figure: ``median_dist_of_real`` against ``bits`` with one line per
# ``ds``, using only the first 50000 rows of ``df``.
# ``dash_styles`` is not defined in the visible code - presumably a
# module-level value defined elsewhere; confirm.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df.head(50000),
             x='bits',
             y='median_dist_of_real',
             hue='ds',
             style='ds',
             dashes=dash_styles)
ax.set_xscale('log')
# ax.set_xticklabels(rotation=30)
# Print minor tick labels as plain numbers.
ax.xaxis.set_minor_formatter(ticker.ScalarFormatter())
#ax.get_xaxis().get_major_formatter().set_scientific(False)
#ax.get_xaxis().get_major_formatter().set_useOffset(False)
# Place x ticks at the powers of two 2**0 ... 2**9.
plt.xticks(2**np.arange(10, dtype=np.uint64))