def main():
    args, eval_args = cli()

    for backbone in args.backbones:
        run_eval_coco(args.output, backbone, eval_args)

    sc = pysparkling.Context()
    stats = (sc
             .textFile(args.output + '*.stats.json')
             .map(json.loads)
             .map(lambda d: (d['checkpoint'], d))
             .collectAsMap())
    LOG.debug('all data: %s', stats)

    # pretty printing
    for backbone, data in sorted(stats.items(),
                                 key=lambda b_d: b_d[1]['stats'][0]):
        print('| {backbone: <15} '
              '| __{AP:.1f}__ '
              '| {APM: <8.1f} '
              '| {APL: <8.1f} '
              '| {t: <15.0f} '
              '| {tdec: <12.0f} |'
              ''.format(
                  backbone=backbone,
                  AP=100.0 * data['stats'][0],
                  APM=100.0 * data['stats'][3],
                  APL=100.0 * data['stats'][4],
                  t=1000.0 * data['total_time'] / data['n_images'],
                  tdec=1000.0 * data['decoder_time'] / data['n_images'],
              ))
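# Note on the indices above (an assumption about the upstream COCO keypoint
# eval convention, which reports 10 stats without a "small" category):
# stats[0] is AP, stats[3] is AP for medium objects (APM) and stats[4] is AP
# for large objects (APL). AP values are scaled to percent and the times to
# milliseconds per image before printing.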
def read_log(self, path):
    sc = pysparkling.Context()

    # modify individual file names and comma-separated filenames
    files = path.split(',')
    files = ','.join(
        [
            '{}.epoch???.evalcoco-edge{}-samples{}-decoder{}.txt'
            ''.format(f[:-4], self.edge, self.samples, self.decoder)
            for f in files
        ] + [
            '{}.epoch???.evalcoco-edge{}-samples{}.txt'
            ''.format(f[:-4], self.edge, self.samples)
            for f in files
        ] + ([
            '{}.epoch???.evalcoco-edge{}.txt'
            ''.format(f[:-4], self.edge)
            for f in files
        ] if not self.samples else [])
    )

    def epoch_from_filename(filename):
        i = filename.find('epoch')
        return int(filename[i + 5:i + 8])

    return (sc
            .wholeTextFiles(files)
            .map(lambda k_c: (
                epoch_from_filename(k_c[0]),
                [float(l) for l in k_c[1].splitlines()],
            ))
            .filter(lambda k_c: len(k_c[1]) == 10)
            .sortByKey()
            .collect())
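# Note on the patterns above: 'epoch???' is a glob wildcard matching the
# zero-padded three-digit epoch number in the file names, which
# epoch_from_filename() then parses back out of each matched path.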
def read_log(self, path):
    sc = pysparkling.Context()

    # modify individual file names and comma-separated filenames
    files = path.split(',')
    files = ','.join([
        '{}.epoch???.evalcoco-edge{}{}.stats.json'
        ''.format(f[:-4], self.edge, self.modifiers)
        for f in files
    ])

    def epoch_from_filename(filename):
        i = filename.find('epoch')
        return int(filename[i + 5:i + 8])

    return (sc
            .wholeTextFiles(files)
            .map(lambda k_c: (
                epoch_from_filename(k_c[0]),
                json.loads(k_c[1]),
            ))
            .filter(lambda k_c: k_c[0] >= self.first_epoch
                    and len(k_c[1]['stats']) == 10)
            .sortByKey()
            .collect())
def test_lock1(self):
    """Should not be able to create a new RDD inside a map operation."""
    sc = pysparkling.Context()
    self.assertRaises(
        pysparkling.exceptions.ContextIsLockedException,
        lambda: (sc
                 .parallelize(range(5))
                 .map(lambda _: sc.parallelize([1]))
                 .collect()),
    )
def read_log(self, path):
    sc = pysparkling.Context()

    # modify individual file names and comma-separated filenames
    files = path.split(',')
    files = ','.join(
        ['{}.epoch???{}'.format(f[:-4], self.file_suffix) for f in files])

    def epoch_from_filename(filename):
        i = filename.find('epoch')
        return int(filename[i + 5:i + 8])

    def migrate(data):
        # earlier versions did not contain 'dataset'
        if 'dataset' not in data and len(data['stats']) == 10:
            data['dataset'] = 'cocokp'
        if 'dataset' not in data and len(data['stats']) == 12:
            data['dataset'] = 'cocodet'

        # earlier versions did not contain 'text_labels'
        if 'text_labels' not in data and len(data['stats']) == 10:
            data['text_labels'] = metric.Coco.text_labels_keypoints
        if 'text_labels' not in data and len(data['stats']) == 12:
            data['text_labels'] = metric.Coco.text_labels_bbox

        return data

    return (sc
            .wholeTextFiles(files)
            .map(lambda k_c: (
                epoch_from_filename(k_c[0]),
                json.loads(k_c[1]),
            ))
            .filter(lambda k_c: k_c[0] >= self.first_epoch and k_c[1]['stats'])
            .mapValues(migrate)
            .sortByKey()
            .collect())
def read_log(self, path):
    sc = pysparkling.Context()
    return (sc
            .textFile(path)
            .filter(lambda line: line.startswith(('{', 'json:'))
                    and line.endswith('}'))
            .map(lambda line: json.loads(line.strip('json:')))
            .filter(lambda data: fractional_epoch(data, default=np.inf)
                    >= self.first_epoch)
            .groupBy(lambda data: data.get('type'))
            .collectAsMap())
def _run_process(self, n, to_kv, format_):
    c = pysparkling.Context()
    stream_c = pysparkling.streaming.StreamingContext(c, 1.0)

    counts = []
    sensor_sums = defaultdict(float)
    sensor_squares = defaultdict(float)
    sensor_counts = defaultdict(int)

    if format_ not in ('bello', 'struct'):
        t = stream_c.socketTextStream('localhost', self.port)
    else:
        length = {'bello': 5, 'struct': 8}[format_]
        t = stream_c.socketBinaryStream('localhost', self.port, length)
    t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))

    if to_kv is not None:
        def update(rdd):
            for k, v in rdd.collect():
                sensor_sums[k] += sum(v)
                sensor_squares[k] += sum(vv**2 for vv in v)
                sensor_counts[k] += len(v)

        t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))

    self.client(n, format_=format_)
    stream_c.start()
    stream_c.awaitTermination(timeout=5.0)

    return (counts, sensor_sums, sensor_squares, sensor_counts)
def test_union(self):
    sc = pysparkling.Context()
    rdd1 = sc.parallelize(['Hello'])
    rdd2 = sc.parallelize(['World'])
    union = sc.union([rdd1, rdd2]).collect()
    print(union)
    self.assertEqual(union, ['Hello', 'World'])
def main():
    sc = pysparkling.Context()
    data = sc.textFile(DATA_FILE).map(json.loads).cache()

    train_data_score, val_data_score = train_val_split_score(data)
    plot_training_data(train_data_score, val_data_score, entryname='score')

    train_data, val_data = train_val_split_keypointscores(data)

    model = InstanceScorer()
    train_dataset = torch.utils.data.TensorDataset(*train_data)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=256, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(*val_data)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=256, shuffle=False)

    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
    for epoch_i in range(100):
        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss = val_epoch(model, val_loader)
        print(epoch_i, train_loss, val_loss)

    with torch.no_grad():
        post_train_data = (model(train_data[0]), train_data[1])
        post_val_data = (model(val_data[0]), val_data[1])
    plot_training_data(post_train_data, post_val_data,
                       entryname='optimized score')

    torch.save(model, 'instance_scorer.pkl')
def write(rows, path, new_scenes, new_frames):
    """Write scenes with categories."""
    output_path = path.replace('output_pre', 'output')
    pysp_tracks = (rows
                   .filter(lambda r: r.frame in new_frames)
                   .map(trajnetplusplustools.writers.trajnet))
    pysp_scenes = (pysparkling.Context()
                   .parallelize(new_scenes)
                   .map(trajnetplusplustools.writers.trajnet))
    pysp_scenes.union(pysp_tracks).saveAsTextFile(output_path)
def main():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()

    sc = pysparkling.Context()
    sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
    rdd = sc.textFile(tempFile.name + '.gz')
    rdd.collect()
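# A hedged sanity check that could follow the round trip above (not part of
# the original): saveAsTextFile compresses because the target path ends in
# '.gz', textFile transparently decompresses, and every element should come
# back as a string.
#
#     assert sc.textFile(tempFile.name + '.gz').count() == 1000000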
def read_log(path):
    sc = pysparkling.Context()
    return (sc
            .textFile(path)
            .filter(lambda line: line.startswith(('{', 'json:'))
                    and line.endswith('}'))
            # str.strip('json:') removes characters from the set
            # {'j', 's', 'o', 'n', ':'} at both ends; safe here because
            # JSON objects start with '{' and end with '}'
            .map(lambda line: json.loads(line.strip('json:')))
            .groupBy(lambda data: data.get('type'))
            .collectAsMap())
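# Hedged usage sketch for read_log (the file name and log lines below are
# made up for illustration): the reader accepts plain JSON lines as well as
# lines prefixed with 'json:', and groups the parsed entries by 'type'.
#
#     example train.log contents:
#         json:{"type": "train", "epoch": 3, "loss": 0.21}
#         {"type": "val-epoch", "epoch": 3, "loss": 0.25}
#
#     entries = read_log('train.log')
#     train_entries = entries.get('train', [])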
def stats(self):
    sc = pysparkling.Context()
    stats = (sc
             .wholeTextFiles(self.output_folder + '*.stats.json')
             .mapValues(json.loads)
             .map(lambda d: (d[0]
                             .replace('.stats.json', '')
                             .replace(self.output_folder, ''), d[1]))
             .collectAsMap())
    LOG.debug('all data: %s', stats)
    return stats
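# Hedged illustration with a made-up file name: if self.output_folder is
# 'outputs/' and it contains 'outputs/resnet50.stats.json', stats() returns
# {'resnet50': <parsed JSON dict>}.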
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--obs_len', type=int, default=9,
                        help='Length of observation')
    parser.add_argument('--pred_len', type=int, default=12,
                        help='Length of prediction')
    parser.add_argument('--train_fraction', default=0.6, type=float,
                        help='Training set fraction')
    parser.add_argument('--val_fraction', default=0.2, type=float,
                        help='Validation set fraction')
    parser.add_argument('--fps', default=2.5, type=float,
                        help='Frames per second')
    parser.add_argument('--order_frames', action='store_true',
                        help='For CFF')
    parser.add_argument('--chunk_stride', type=int, default=2,
                        help='Sampling stride')
    parser.add_argument('--min_length', default=0.0, type=float,
                        help='Min length of primary trajectory')
    parser.add_argument('--goal_file', default=None,
                        help='Pkl file for goals (REQUIRED for ORCA Filter Process)')

    # for trajectory categorizing and filtering
    categorizers = parser.add_argument_group('categorizers')
    categorizers.add_argument('--static_threshold', type=float, default=1.0,
                              help='Type I static threshold')
    categorizers.add_argument('--linear_threshold', type=float, default=0.5,
                              help='Type II linear threshold (0.3 for Synthetic)')
    categorizers.add_argument('--inter_dist_thresh', type=float, default=5,
                              help='Type IIId distance threshold for cone')
    categorizers.add_argument('--inter_pos_range', type=float, default=15,
                              help='Type IIId angle threshold for cone (degrees)')
    categorizers.add_argument('--grp_dist_thresh', type=float, default=0.8,
                              help='Type IIIc distance threshold for group')
    categorizers.add_argument('--grp_std_thresh', type=float, default=0.2,
                              help='Type IIIc std deviation for group')
    categorizers.add_argument('--acceptance', nargs='+', type=float,
                              default=[0.1, 1, 1, 1],
                              help='Acceptance ratio of trajectory types (I, II, III, IV)')
    args = parser.parse_args()

    sc = pysparkling.Context()

    # Example Conversions
    #
    # real datasets
    write(biwi(sc, 'data/raw/biwi/seq_hotel/obsmat.txt'),
          'output_pre/{split}/biwi_hotel.ndjson', args)
    categorize(sc, 'output_pre/{split}/biwi_hotel.ndjson', args)
    write(crowds(sc, 'data/raw/crowds/crowds_zara01.vsp'),
          'output_pre/{split}/crowds_zara01.ndjson', args)
    categorize(sc, 'output_pre/{split}/crowds_zara01.ndjson', args)
    write(crowds(sc, 'data/raw/crowds/crowds_zara03.vsp'),
          'output_pre/{split}/crowds_zara03.ndjson', args)
    categorize(sc, 'output_pre/{split}/crowds_zara03.ndjson', args)
    write(crowds(sc, 'data/raw/crowds/students001.vsp'),
          'output_pre/{split}/crowds_students001.ndjson', args)
    categorize(sc, 'output_pre/{split}/crowds_students001.ndjson', args)
    write(crowds(sc, 'data/raw/crowds/students003.vsp'),
          'output_pre/{split}/crowds_students003.ndjson', args)
    categorize(sc, 'output_pre/{split}/crowds_students003.ndjson', args)
def test_save_gz(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
    (ssc
     .textFileStream('LICENS*')
     .count()
     .saveAsTextFiles('tests/textout/', suffix='.gz'))
def runtime(self, n=10, processes=1):
    start = time.time()

    with futures.ProcessPoolExecutor(processes) as pool:
        sc = pysparkling.Context(pool=pool,
                                 serializer=cloudpickle.dumps,
                                 deserializer=pickle.loads)
        rdd = sc.parallelize(range(n), 10)
        rdd.map(lambda _: time.sleep(0.1)).collect()

    return time.time() - start
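# Minimal usage sketch (the Benchmark class name is hypothetical): with
# n=10 elements spread over 10 partitions of 0.1 s sleeps each, adding
# processes should shrink the wall-clock time roughly proportionally.
#
#     t1 = Benchmark().runtime(n=10, processes=1)  # ~1 s plus overhead
#     t4 = Benchmark().runtime(n=10, processes=4)  # ~0.3 s plus overhead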
def test_lock2(self):
    """Should not be able to create RDDs containing RDDs."""
    sc = pysparkling.Context()

    def parallelize_in_parallelize():
        o = sc.parallelize(sc.parallelize(range(x)) for x in range(5))
        print(o.map(lambda x: x.collect()).collect())

    self.assertRaises(pysparkling.exceptions.ContextIsLockedException,
                      parallelize_in_parallelize)
def test_connect(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.textFileStream('LICENS*', process_all=True)
     .count()
     .foreachRDD(lambda rdd: result.append(rdd.collect()[0])))

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 22)
def test_mapValues(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
     .mapValues(sorted)
     .foreachRDD(lambda rdd: result.append(rdd.collect())))

    ssc.start()
    ssc.awaitTermination(timeout=0.15)
    self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
def test_read_chunks(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.fileBinaryStream('LICENS*', recordLength=40, process_all=True)
     .count()
     .foreachRDD(lambda rdd: result.append(rdd.collect()[0])))

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 28)
def main():
    sc = pysparkling.Context()

    # Example Conversions
    #
    # real datasets
    write(biwi(sc, 'data/raw/biwi/seq_hotel/obsmat.txt'),
          'output_pre/{split}/biwi_hotel.ndjson')
    categorize(sc, 'output_pre/{split}/biwi_hotel.ndjson')
    write(orca_crowdnav(sc, 'data/raw/saleh_dataset/orca_crowdNav.txt'),
          'output_pre/{split}/orca_crowdNav.ndjson')
    categorize(sc, 'output_pre/{split}/orca_crowdNav.ndjson')
def test_cache_empty_partition():
    m = Manip()

    c = pysparkling.Context()
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug)
    rdd = rdd.filter(lambda e: e > 6).cache()
    print(rdd.collect())
    print(rdd.collect())

    print(f'count of map executions: {m.count}')
    assert m.count == 10
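# Why the assertion expects exactly 10 executions: .cache() stores the
# filtered partitions after the first collect(), so the map over the 10
# input elements runs once; the second collect() is served from the cache.
# Without .cache(), the count would be 20.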
def main():
    args, eval_args = cli()

    if args.iccv2019_ablation:
        assert len(args.backbones) == 1
        multi_eval_args = [
            eval_args,
            eval_args + ['--connection-method=blend'],
            eval_args + ['--connection-method=blend', '--long-edge=961',
                         '--multi-scale', '--no-multi-scale-hflip'],
            eval_args + ['--connection-method=blend', '--long-edge=961',
                         '--multi-scale'],
        ]
        names = [
            'singlescale-max',
            'singlescale',
            'multiscale-nohflip',
            'multiscale',
        ]
        for eval_args_i, name_i in zip(multi_eval_args, names):
            run_eval_coco(args.output, args.backbones[0], eval_args_i,
                          output_name=name_i)
    else:
        for backbone in args.backbones:
            run_eval_coco(args.output, backbone, eval_args)

    sc = pysparkling.Context()
    stats = (sc
             .wholeTextFiles(args.output + '*.stats.json')
             .mapValues(json.loads)
             .map(lambda d: (d[0]
                             .replace('.stats.json', '')
                             .replace(args.output, ''), d[1]))
             .collectAsMap())
    LOG.debug('all data: %s', stats)

    # pretty printing
    for backbone, data in sorted(stats.items(),
                                 key=lambda b_d: b_d[1]['stats'][0]):
        print('| {backbone: <25} '
              '| __{AP:.1f}__ '
              '| {APM: <8.1f} '
              '| {APL: <8.1f} '
              '| {t: <15.0f} '
              '| {tdec: <12.0f} |'
              ''.format(
                  backbone=backbone,
                  AP=100.0 * data['stats'][0],
                  APM=100.0 * data['stats'][3],
                  APL=100.0 * data['stats'][4],
                  t=1000.0 * data['total_time'] / data['n_images'],
                  tdec=1000.0 * data['decoder_time'] / data['n_images'],
              ))
def test_count(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.queueStream([range(20), ['a', 'b'], ['c']])
     .count()
     .foreachRDD(lambda rdd: result.append(rdd.collect()[0])))

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 23)
def test_main(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    counter = Counter()
    (ssc.socketBinaryStream('127.0.0.1', 8125, length='<I')
     .foreachRDD(lambda rdd: counter.update(rdd.collect())))
    self.client()

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(counter[b'hellohello'], 1)
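# Note on the framing: length='<I' is a struct format string, so
# socketBinaryStream reads a 4-byte little-endian unsigned-int length prefix
# before each message. A matching client sketch (hypothetical, not the
# self.client() used above):
#
#     import socket
#     import struct
#
#     def send_length_prefixed(payload=b'hellohello'):
#         with socket.create_connection(('127.0.0.1', 8125)) as s:
#             s.sendall(struct.pack('<I', len(payload)) + payload)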
def test_groupByKey(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)],
                      [('a', 2), ('b', 3)]])
     .groupByKey()
     .mapPartitions(sorted)
     .mapValues(sorted)
     .foreachRDD(lambda rdd: result.append(rdd.collect())))

    ssc.start()
    ssc.awaitTermination(timeout=0.25)
    self.assertEqual(result,
                     [[('a', [2, 5]), ('b', [8])],
                      [('a', [2]), ('b', [3])]])
def test_connect(self):
    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    counter = Counter()
    (ssc.socketTextStream('127.0.0.1', 8123)
     .foreachRDD(lambda rdd: counter.update(''.join(rdd.collect()))
                 if rdd.collect() else None))
    self.client()

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(counter['a'], 20)
def test_retry(self):
    class EverySecondCallFails:
        def __init__(self):
            self.attempt = 0

        def __call__(self, value):
            self.attempt += 1
            if self.attempt % 2 == 1:
                raise Exception
            return value

    data = list(range(6))
    rdd = pysparkling.Context().parallelize(data, 3)
    result = rdd.mapPartitions(EverySecondCallFails()).collect()
    self.assertEqual(result, data)
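# Why test_retry passes: pysparkling re-runs a partition whose computation
# raised. With 6 elements in 3 partitions, EverySecondCallFails raises on
# attempts 1, 3 and 5 and succeeds on attempts 2, 4 and 6, so each partition
# fails once, succeeds on its retry, and all elements come back.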
def run(self, n=2000, to_kv=None, format_='hello'):
    c = pysparkling.Context()
    stream_c = pysparkling.streaming.StreamingContext(c, 1.0)

    counts = []
    sensor_sums = defaultdict(float)
    sensor_squares = defaultdict(float)
    sensor_counts = defaultdict(int)

    if format_ not in ('bello', 'struct'):
        t = stream_c.socketTextStream('localhost', self.port)
    else:
        length = {'bello': 5, 'struct': 8}[format_]
        t = stream_c.socketBinaryStream('localhost', self.port, length)
    t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))

    if to_kv is not None:
        def update(rdd):
            for k, v in rdd.collect():
                sensor_sums[k] += sum(v)
                sensor_squares[k] += sum(vv**2 for vv in v)
                sensor_counts[k] += len(v)

        t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))

    self.client(n, format_=format_)
    stream_c.start()
    stream_c.awaitTermination(timeout=5.0)

    result = max(counts) if counts else 0
    sensor_expectations = {  # expectation of X and X^2
        k: (sensor_sums[k] / v, sensor_squares[k] / v)
        for k, v in sensor_counts.items()
    }
    sensors = {  # mean and standard deviation
        k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0]**2))
        for k, ex_ex2 in sensor_expectations.items()
    }
    print('run: n = {}, counts = {}, result = {}'
          ''.format(n, counts, result))
    print('sensors = {}'.format(sensors))

    time.sleep(self.pause)
    self.port += 1
    return result
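# The per-sensor summary above relies on the one-pass identity
# Var[X] = E[X^2] - (E[X])^2, accumulated incrementally from the stream, so
# std = sqrt(E[X^2] - (E[X])^2). Note that this form can be numerically
# unstable when the mean is large relative to the spread.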
def main():
    sc = pysparkling.Context()

    sc.parallelize(
        linear(0, 0) +
        linear_static(1, 1000, perpendicular_distance=0.2) +
        linear_static(2, 2000, perpendicular_distance=0.5) +
        linear_random(3, 3000, perpendicular_distance=0.2, random_radius=0.2) +
        linear_random(4, 4000, perpendicular_distance=1.0, random_radius=0.5) +
        opposing(5, 5000, perpendicular_distance=0.2) +
        opposing(6, 6000, perpendicular_distance=1.5) +
        sf_opposing(7, 7000, perpendicular_distance=-0.3) +
        sf_opposing(8, 8000, perpendicular_distance=0.3)
    ).map(trajnettools.writers.trajnet).saveAsTextFile('data/testscenes.ndjson')

    (sc
     .parallelize(range(1000))
     .flatMap(lambda i: sf_opposing(
         i, i * 1000,
         perpendicular_distance=((i % 2) - 0.5) * 2.0 * 0.3))
     .map(trajnettools.writers.trajnet)
     .saveAsTextFile('data/socialforce_overtrain.ndjson'))