コード例 #1
0
def sample_tables(A, B, proportion, lid='id', rid='id', lstopwords=None, rstopwords=None):
    """Down-sample tables A and B and return both samples in original row order.

    Both inputs are copied before any work is done, so the caller's tables
    are never modified. Each copy gets a fresh 0..n-1 index and a temporary
    ``_pos`` column recording the row's original position; the samples
    returned by ``downsample_dk`` are sorted on that column, which is then
    removed again. (An input column that is itself named ``_pos`` is
    overwritten in the copy and will be absent from the result.)

    Args:
        A: Left table (used as a pandas DataFrame).
        B: Right table (used as a pandas DataFrame).
        proportion: Fraction of B's rows to request; the requested size is
            ``floor(len(B) * proportion)``, capped at ``len(B)``.
        lid: Name of the key column in A, forwarded to ``downsample_dk``.
        rid: Name of the key column in B, forwarded to ``downsample_dk``.
        lstopwords: Stop words for the left table, forwarded to
            ``downsample_dk``. ``None`` (the default) means an empty list.
        rstopwords: Stop words for the right table, forwarded to
            ``downsample_dk``. ``None`` (the default) means an empty list.

    Returns:
        A ``(sampled_table_a, sampled_table_b)`` tuple, each sorted by the
        row's original position and without the helper ``_pos`` column.
    """
    # Fresh lists per call: a shared mutable default would leak state between
    # calls if downsample_dk ever mutated the list it is given.
    lstopwords = [] if lstopwords is None else lstopwords
    rstopwords = [] if rstopwords is None else rstopwords

    num_tuples_b = int(math.floor(len(B) * proportion))
    print(len(A), len(B), len(A), num_tuples_b)
    # A proportion above 1 cannot request more rows than B holds.
    num_tuples_b = min(num_tuples_b, len(B))

    A1 = copy.deepcopy(A)
    B1 = copy.deepcopy(B)
    A1.reset_index(inplace=True, drop=True)
    B1.reset_index(inplace=True, drop=True)
    # Remember each row's original position so the samples can be put back
    # in input order afterwards. Each table is sized by its OWN length:
    # using len(A1) for B1 raises ValueError whenever the lengths differ.
    A1['_pos'] = list(range(len(A1)))
    B1['_pos'] = list(range(len(B1)))

    # NOTE(review): assumes downsample_dk returns two DataFrames that still
    # carry the '_pos' column -- confirm against its implementation.
    sampled_table_a, sampled_table_b = downsample_dk(
        A1, B1, lid, rid,
        size=num_tuples_b,
        y=1,
        lstopwords=lstopwords,
        rstopwords=rstopwords,
        compute=True)

    sampled_table_a = sampled_table_a.sort_values('_pos')
    sampled_table_b = sampled_table_b.sort_values('_pos')
    print(len(sampled_table_a), len(sampled_table_b))

    # The position column was only scaffolding for the sort; strip it.
    sampled_table_a = sampled_table_a.drop('_pos', axis=1)
    sampled_table_b = sampled_table_b.drop('_pos', axis=1)
    return sampled_table_a, sampled_table_b
コード例 #2
0
# Benchmark script: load the citeseer and dblp tables, build a lazy
# down-sampling computation with downsample_dk, run it, and report the
# change in used memory (as reported by psutil) and the elapsed wall time.
bytes_per_gb = 1e9

print("Mem. usage before reading:{0} (GB)".format(
    psutil.virtual_memory().used / bytes_per_gb))
A = pd.read_csv('../datasets/citeseer.csv')
B = pd.read_csv('../datasets/dblp.csv')
print("Mem. usage after reading:{0} (GB)".format(
    psutil.virtual_memory().used / bytes_per_gb))

# NOTE(review): `stopwords` must already be defined before this point; the
# only nearby candidate is a commented-out assignment to a differently
# spelled name (`stopWords = list(get_stop_words())`) -- confirm its source.

# Baseline taken after the tables are loaded, so the reported difference
# covers the down-sampling step only.
memUsageBefore = psutil.virtual_memory().used / bytes_per_gb
timeBefore = time.time()

# compute=False: the call's result C is not final output; the work is
# triggered by the explicit C.compute(...) call below.
downsample_options = dict(
    lstopwords=stopwords,
    rstopwords=stopwords,
    compute=False,
    nlchunks=1,
    nrchunks=4,
)
C = downsample_dk(A, B, 'id', 'id', 100000, 1, **downsample_options)

# NOTE(review): newer Dask releases replaced the `get=` keyword with
# `scheduler=`; confirm against the Dask version this project targets.
_ = C.compute(get=threaded.get)

timeAfter = time.time()
memUsageAfter = psutil.virtual_memory().used / bytes_per_gb

mem_delta = memUsageAfter - memUsageBefore
elapsed = timeAfter - timeBefore

summary_template = (
    'Mem.usage (after reading): {0} (GB), Mem.usage (after downsampling): {1} (GB), diff: {2} (GB)'
)
print(summary_template.format(memUsageBefore, memUsageAfter, mem_delta))
print('Time. diff: {0} (secs)'.format(elapsed))