def sample_tables(A, B, proportion, lid='id', rid='id', lstopwords=None, rstopwords=None):
    """Down-sample tables A and B via ``downsample_dk``, keeping original row order.

    Both inputs are deep-copied first, so the caller's tables are not modified
    by the re-indexing or by the temporary ``_pos`` column added here.

    Args:
        A: Left table (used as a pandas DataFrame).
        B: Right table (used as a pandas DataFrame).
        proportion: Fraction of ``len(B)`` requested as the sample size; the
            resulting size is capped at ``len(B)``.
        lid: Key column name of A, forwarded to ``downsample_dk``.
        rid: Key column name of B, forwarded to ``downsample_dk``.
        lstopwords: Stop words for A, forwarded to ``downsample_dk``.
            ``None`` (the default) means an empty list.
        rstopwords: Stop words for B, forwarded to ``downsample_dk``.
            ``None`` (the default) means an empty list.

    Returns:
        Tuple ``(sampled_table_a, sampled_table_b)``: the two tables returned
        by ``downsample_dk``, sorted by the row position they had in the
        re-indexed inputs, with the temporary ``_pos`` column removed.
    """
    # Fresh lists per call: a shared mutable default could leak state between
    # calls if downsample_dk ever mutates the list it receives.
    lstopwords = [] if lstopwords is None else lstopwords
    rstopwords = [] if rstopwords is None else rstopwords

    num_tuples_b = int(math.floor(len(B) * proportion))
    print(len(A), len(B), len(A), num_tuples_b)
    # A proportion above 1 cannot request more rows than B contains.
    num_tuples_b = min(num_tuples_b, len(B))

    A1 = copy.deepcopy(A)
    B1 = copy.deepcopy(B)
    A1.reset_index(inplace=True, drop=True)
    B1.reset_index(inplace=True, drop=True)

    # Record each row's position so the original order can be restored after
    # sampling. Each table is numbered by its OWN length; sizing B's column by
    # len(A1) fails with a length mismatch whenever the tables differ in size.
    A1['_pos'] = list(range(len(A1)))
    B1['_pos'] = list(range(len(B1)))

    sampled_table_a, sampled_table_b = downsample_dk(
        A1, B1, lid, rid,
        size=num_tuples_b, y=1,
        lstopwords=lstopwords, rstopwords=rstopwords,
        compute=True)

    # sort_values returns new frames, so the in-place drops below do not touch
    # the objects handed back by downsample_dk.
    sampled_table_a = sampled_table_a.sort_values('_pos')
    sampled_table_b = sampled_table_b.sort_values('_pos')
    print(len(sampled_table_a), len(sampled_table_b))

    sampled_table_a.drop('_pos', axis=1, inplace=True)
    sampled_table_b.drop('_pos', axis=1, inplace=True)
    return sampled_table_a, sampled_table_b
def _used_memory_gb():
    """Return psutil's system-wide ``used`` memory figure divided by 1e9."""
    return psutil.virtual_memory().used / 1e9


# --- Load both input tables, reporting used memory around the reads. ---
print("Mem. usage before reading:{0} (GB)".format(_used_memory_gb()))
A = pd.read_csv('../datasets/citeseer.csv')
B = pd.read_csv('../datasets/dblp.csv')
print("Mem. usage after reading:{0} (GB)".format(_used_memory_gb()))

# stopWords = list(get_stop_words())
# NOTE(review): `stopwords` used below is not defined in this view; presumably
# it is imported or defined earlier in the file -- confirm.

# --- Build the down-sampling computation lazily, then time its execution. ---
memUsageBefore = _used_memory_gb()
timeBefore = time.time()

C = downsample_dk(
    A, B, 'id', 'id', 100000, 1,
    lstopwords=stopwords,
    rstopwords=stopwords,
    compute=False,
    nlchunks=1,
    nrchunks=4,
)
# NOTE(review): `get=` looks like the older dask scheduler-selection keyword;
# confirm it is still accepted by the dask version in use.
_ = C.compute(get=threaded.get)

timeAfter = time.time()
memUsageAfter = _used_memory_gb()

# --- Report memory growth and elapsed wall-clock time. ---
print(
    'Mem.usage (after reading): {0} (GB), Mem.usage (after downsampling): {1} (GB), diff: {2} (GB)'
    .format(memUsageBefore, memUsageAfter, memUsageAfter - memUsageBefore))
print('Time. diff: {0} (secs)'.format(timeAfter - timeBefore))