Exemple #1
0
def compore_distribution(field, feds, randoms, youngs):
    """Print one LaTeX table row comparing three samples, then plot their PDFs.

    The row contains, in order: ``field``; for each of ``feds``, ``randoms``
    and ``youngs`` the entries 2 and 3 of ``statis_util.comm_stat`` (entry 3
    is printed as sigma; entry 2 is presumably the mean -- confirm against
    ``statis_util``); and for each of the pairs (randoms, feds),
    (youngs, feds), (youngs, randoms) the KS statistic (entry 2 of
    ``statis_util.ks_test``) followed by ``pvalue(...)`` of its p-value
    (entry 3).

    Args:
        field: Row label; also used as the plot label and as the stem of the
            saved figure, ``field + '.pdf'``.
        feds: Sample plotted with the legend label 'ED'.
        randoms: Sample plotted with the legend label 'Random'.
        youngs: Sample plotted with the legend label 'Younger'.
    """
    ed_stat, rd_stat, yg_stat = [
        statis_util.comm_stat(sample) for sample in (feds, randoms, youngs)
    ]
    ed_rd = statis_util.ks_test(randoms, feds)
    ed_yg = statis_util.ks_test(youngs, feds)
    yg_rd = statis_util.ks_test(youngs, randoms)

    # A single-argument print(...) produces the same output under Python 2
    # and Python 3.  The backslash in "\\sigma" is escaped explicitly; the
    # emitted text is unchanged.
    row_format = ('%s & %.2f($\\sigma$=%.2f) & %.2f($\\sigma$=%.2f)'
                  ' & %.2f($\\sigma$=%.2f) & %.2f%s & %.2f%s & %.2f%s \\\\')
    print(row_format % (
        field,
        ed_stat[2], ed_stat[3],
        rd_stat[2], rd_stat[3],
        yg_stat[2], yg_stat[3],
        ed_rd[2], pvalue(ed_rd[3]),
        ed_yg[2], pvalue(ed_yg[3]),
        yg_rd[2], pvalue(yg_rd[3]),
    ))

    plot.plot_pdf_mul_data(
        [feds, randoms, youngs],
        field,
        ['--g', '--b', '--r'],
        ['s', 'o', '^'],
        ['ED', 'Random', 'Younger'],
        linear_bins=True,
        central=True,
        fit=False,
        fitranges=None,
        savefile=field + '.pdf',
    )
Exemple #2
0
def profile_feature_stat():
    """Print LaTeX summary/KS-test rows and plot PDFs for profile counters.

    For each of the fields 'friends_count', 'followers_count' and
    'statuses_count', the values are read from the 'scom' collection of the
    'fed', 'random' and 'young' databases via ``io.get_values_one_field``
    (with an empty filter) and incremented by one.  The function then prints:

    * one row per sample with the first four entries of
      ``statis_util.comm_stat``;
    * a ``\\hline`` separator;
    * one row per sample pair with the four entries of
      ``statis_util.ks_test`` (n_1, n_2, KS value, p-value);

    and finally plots the three samples with fitted ranges, saving the
    figure as ``<field>.pdf``.
    """
    # 'favourites_count' is not part of the analysed fields.
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']

    # Empty query: no restriction on the fetched documents.  Named so that it
    # does not shadow the builtin ``filter``.
    no_filter = {}
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                 [(700, 10000), (800, 10000000), (800, 1000000)],
                 [(800, 100000), (20000, 10000000), (10000, 10000000)]]

    for field, name, ranges in zip(fields, names, fitranges):
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('===================== ' + field)

        # NOTE(review): the +1 shift presumably keeps zero counts usable with
        # the non-linear bins requested below -- confirm.
        feds, randoms, youngs = [
            np.array(io.get_values_one_field(db, 'scom', field, no_filter)) + 1
            for db in ('fed', 'random', 'young')
        ]

        # The row prefixes are intentionally irregular ('ED & ' vs 'Random &')
        # to reproduce the original output exactly.
        for prefix, sample in (('ED & ', feds),
                               ('Random &', randoms),
                               ('Younger &', youngs)):
            comm = statis_util.comm_stat(sample)
            print(prefix + ' & '.join(str(comm[k]) for k in range(4)) + '\\\\')
        print('\\hline')

        for pair, first, second in (('Random, ED', randoms, feds),
                                    ('Younger, ED', youngs, feds),
                                    ('Younger, Random', youngs, randoms)):
            z = statis_util.ks_test(first, second)
            print('ks-test(' + pair + '): & $n_1$: ' + str(z[0])
                  + ' & $n_2$: ' + str(z[1])
                  + ' & ks-value: ' + str(z[2])
                  + ' & p-value: ' + str(z[3]) + '\\\\')

        plot.plot_pdf_mul_data([feds, randoms, youngs],
                               name, ['g', 'b', 'r'], ['s', 'o', '^'],
                               ['ED', 'Random', 'Younger'],
                               linear_bins=False,
                               central=False,
                               fit=True,
                               fitranges=ranges,
                               savefile=field + '.pdf')
Exemple #3
0
def profile_feature_stat():
    """Report and plot the distributions of three profile counters.

    Iterates over 'friends_count', 'followers_count' and 'statuses_count'.
    For each field it loads the values from the 'scom' collection of the
    'fed', 'random' and 'young' databases (``io.get_values_one_field`` with an
    empty filter), adds one to every value, and prints LaTeX table rows:
    the first four entries of ``statis_util.comm_stat`` per sample, a
    ``\\hline``, and the four entries of ``statis_util.ks_test`` (n_1, n_2,
    KS value, p-value) per sample pair.  The three samples are then plotted
    with ``plot.plot_pdf_mul_data`` and saved as ``<field>.pdf``.
    """
    # 'favourites_count' is not part of the analysed fields.
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']

    # Renamed from ``filter`` so the builtin is not shadowed.
    empty_query = {}
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                 [(700, 10000), (800, 10000000), (800, 1000000)],
                 [(800, 100000), (20000, 10000000), (10000, 10000000)]]

    def load(db_name, field_name):
        # Values shifted by one; presumably so zero counts survive the
        # non-linear binning used for the plot -- TODO confirm.
        values = io.get_values_one_field(db_name, 'scom', field_name,
                                         empty_query)
        return np.array(values) + 1

    for field, name, ranges in zip(fields, names, fitranges):
        # print(...) with a single argument is valid on Python 2 and 3 and
        # emits the same text as the former print statement.
        print('===================== ' + field)
        feds = load('fed', field)
        randoms = load('random', field)
        youngs = load('young', field)

        # Prefix spacing ('ED & ' vs 'Random &') mirrors the original output.
        summary_rows = [('ED & ', feds),
                        ('Random &', randoms),
                        ('Younger &', youngs)]
        for prefix, sample in summary_rows:
            comm = statis_util.comm_stat(sample)
            cells = [str(comm[k]) for k in range(4)]
            print(prefix + ' & '.join(cells) + '\\\\')
        print('\\hline')

        ks_rows = [('Random, ED', randoms, feds),
                   ('Younger, ED', youngs, feds),
                   ('Younger, Random', youngs, randoms)]
        for pair, first, second in ks_rows:
            z = statis_util.ks_test(first, second)
            print('ks-test(' + pair + '): & $n_1$: ' + str(z[0])
                  + ' & $n_2$: ' + str(z[1])
                  + ' & ks-value: ' + str(z[2])
                  + ' & p-value: ' + str(z[3]) + '\\\\')

        plot.plot_pdf_mul_data([feds, randoms, youngs], name,
                               ['g', 'b', 'r'], ['s', 'o', '^'],
                               ['ED', 'Random', 'Younger'],
                               linear_bins=False, central=False, fit=True,
                               fitranges=ranges, savefile=field + '.pdf')
def compore_distribution(field, feds, randoms, youngs=None):
    """Print a LaTeX row comparing two samples and save their density plot.

    The printed row holds ``field``, entries 2 and 3 of
    ``statis_util.comm_stat`` for ``feds`` and for ``randoms`` (entry 3 is
    printed as sigma; entry 2 is presumably the mean -- confirm against
    ``statis_util``), and the KS statistic of
    ``statis_util.ks_test(randoms, feds)`` followed by ``pvalue(...)`` of its
    p-value.  Both samples are then drawn on the current matplotlib figure,
    which is saved to ``'data/' + field + '.pdf'`` and cleared.

    Args:
        field: Row label and stem of the saved figure's file name.
        feds: Sample drawn with the label 'Positive'.
        randoms: Sample drawn with the label 'Negative'.
        youngs: Unused; kept so existing three-sample callers keep working.
    """
    ed_stat = statis_util.comm_stat(feds)
    rd_stat = statis_util.comm_stat(randoms)
    ed_rd = statis_util.ks_test(randoms, feds)

    # Single-argument print(...) gives identical output on Python 2 and 3;
    # "\\sigma" spells out the backslash without changing the emitted text.
    row_format = '%s & %.2f($\\sigma$=%.2f) & %.2f($\\sigma$=%.2f) & %.2f%s \\\\'
    print(row_format % (
        field,
        ed_stat[2], ed_stat[3],
        rd_stat[2], rd_stat[3],
        ed_rd[2], pvalue(ed_rd[3]),
    ))

    # NOTE(review): seaborn has deprecated distplot; it is kept here so the
    # rendered figure does not change.
    for sample, label in ((feds, 'Positive'), (randoms, 'Negative')):
        sns.distplot(sample, hist=False, label=label)
    plt.xlabel('value')
    plt.ylabel('PDF')
    plt.savefig('data/' + field + '.pdf')
    # Clear the figure so the next call starts from an empty canvas.
    plt.clf()