コード例 #1
0
from Generators.gen_csv import generate_data
from dask import delayed,compute
from dask.distributed import Client
from collections import defaultdict
import dask.dataframe as dd
import pprint
import sys
import math
import dask

# Join benchmark driver: build two CSV inputs, merge them on 'col-1' with
# dask, and print the merged rows.
if len(sys.argv) < 6:
    print("USAGE ./union.py <file 1 size> <file 1 columns> <file 2 size> <file 2 columns> <scheduler url>")
    # sys.exit is always available; the bare exit() helper is injected by
    # the site module and is meant for interactive sessions.
    sys.exit(1)

# (size, columns) pairs, in the order given by the usage string.
file1 = (int(sys.argv[1]), int(sys.argv[2]))
file2 = (int(sys.argv[3]), int(sys.argv[4]))
sched_IP = sys.argv[5]
client = Client(sched_IP)
# NOTE(review): presumably generate_data writes "file1.csv" / "file2.csv",
# which are read back below - confirm against Generators.gen_csv.
datafile1 = generate_data("file1", file1[0], file1[1], 0)
datafile2 = generate_data("file2", file2[0], file2[1], 0)

dataframe1 = dd.read_csv("file1.csv")
dataframe2 = dd.read_csv("file2.csv")
dataframe3 = dataframe1.merge(dataframe2, on=['col-1'])
# compute() returns the materialized result and leaves dataframe3 lazy, so
# the return value has to be kept and printed instead of dataframe3 itself.
result = dataframe3.compute()
pprint.pprint(result)
コード例 #2
0
from Generators.gen_csv import generate_data
from dask import delayed, compute
from dask.distributed import Client
from collections import defaultdict
import dask.dataframe as dd
import pprint
import sys
import dask

# Scan benchmark setup: validate arguments, connect to the scheduler and
# generate the input data.
if len(sys.argv) < 4:
    print("USAGE ./scan.py <file 1 size> <file 1 columns> <scheduler url>")
    # sys.exit is always available; the bare exit() helper is injected by
    # the site module and is meant for interactive sessions.
    sys.exit(1)

# (size, columns) pair, in the order given by the usage string.
file1 = (int(sys.argv[1]), int(sys.argv[2]))
sched_IP = sys.argv[3]
client = Client(sched_IP)

# NOTE(review): presumably generate_data writes "file1.csv", which the scan
# step opens later - confirm against Generators.gen_csv.
datafile1 = generate_data("file1", file1[0], file1[1], 0)


def scan_data(file):
    """Return every line found in ``file`` as one flat list of strings.

    Args:
        file: Either a single string holding the whole text, or an iterable
            of string chunks (an open text file, a list of lines, ...).
            A chunk may itself contain several newline-separated lines.

    Returns:
        list: The lines in their original order, without line terminators.
    """
    # A bare string counts as one chunk; iterating it directly would yield
    # single characters instead of lines.
    chunks = [file] if isinstance(file, str) else file
    output = []
    for chunk in chunks:
        # Visit each chunk itself (not a fixed index), and let splitlines()
        # drop the terminator so no empty trailing entries are produced.
        output.extend(chunk.splitlines())
    return output


# Read the lines up front and close the file right away: an open file
# handle cannot be pickled, so it is not usable as a task argument once the
# work is shipped to a distributed worker. A plain list of strings is.
with open("file1.csv") as csv_file:
    lines = csv_file.readlines()

task = delayed(scan_data)(lines)
# compute() returns a tuple with one entry per argument, so this prints a
# one-element tuple holding the scan result.
output = compute(task)
print(output)