def test_stream_attrs(types_json_path):
    # Valid path
    with gpsdio.open(types_json_path) as src:
        assert src.name == types_json_path

    # Stream with no name
    with gpsdio.open(StringIO(), driver='NewlineJSON', compression=False) as src:
        assert src.name == "<unknown name>"

    # I/O mode
    for m in ('r', 'w', 'a'):
        with gpsdio.open(StringIO(), mode=m, driver='NewlineJSON',
                         compression=False) as src:
            assert src.mode == m

    # Schema structure
    with gpsdio.open(types_json_path) as src:
        # Stream is a driver that should also have the schema attached
        assert src._stream.schema == gpsdio.schema.build_schema()

    # Stop method, which is really just included because there's a start(),
    # so there should be a stop()
    with gpsdio.drivers.NewlineJSONDriver() as drv:
        drv.start(types_json_path)
        assert not drv.closed
        assert not drv.f.closed
        drv.stop()
        assert drv.closed
        assert drv.f.closed
def test_read_both(types_nmea_path, types_nmea_gz_path):
    with gpsdio.open(types_nmea_path) as dsrc, \
            gpsdio.open(types_nmea_gz_path) as csrc:
        for idx, (d, c) in enumerate(zip(dsrc, csrc)):
            assert d == c
            assert 'mmsi' in d and 'type' in d
            assert 'mmsi' in c and 'type' in c
        assert idx > 0
def load(ctx, outfile, input_driver_opts, output_driver, output_driver_opts,
         output_compression, output_compression_opts):

    """
    Load newline JSON msgs from stdin to a file.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting load')

    with gpsdio.open('-',
                     driver='NewlineJSON',
                     compression=False,
                     do=input_driver_opts,
                     **ctx.obj['idefine']) as src:
        with gpsdio.open(outfile, 'w',
                         driver=output_driver,
                         compression=output_compression,
                         co=output_compression_opts,
                         do=output_driver_opts,
                         **ctx.obj['odefine']) as dst:
            for msg in src:
                dst.write(msg)
def test_sort():
    cleanup()
    try:
        with gpsdio.open("test.csv", "w") as f:
            for i in range(0, 1000):
                f.writerow({'timestamp': randdate(),
                            'lat': 180 * random.random() - 90.0,
                            'lon': 360 * random.random() - 180.0,
                            'foo': 4711})

        with open("test.csv") as f:
            for row in csv.DictReader(f):
                assert 'timestamp' in row
                assert 'lat' in row
                assert 'lon' in row
                assert 'extra' in row
                extra = json.loads(row['extra'])
                assert 'foo' in extra

        with gpsdio.open('test.csv') as f:
            for row in f:
                print(row)
                assert isinstance(row['timestamp'], datetime.datetime)
                assert isinstance(row['lat'], float)
                assert 'foo' in row
    finally:
        cleanup()
def test_read_from_write_stream(types_msg_gz_path, tmpdir):
    pth = str(tmpdir.mkdir('test').join('rw-io.json'))
    with gpsdio.open(types_msg_gz_path) as src, \
            gpsdio.open(pth, 'w', driver='NewlineJSON') as dst:
        for msg in src:
            dst.write(msg)
        with pytest.raises(TypeError):
            next(dst)
def test_cat(types_msg_gz_path, msg_almost_equal):
    result = subprocess.check_output(
        ['gpsdio', 'cat', types_msg_gz_path]).decode('utf-8')
    with gpsdio.open(types_msg_gz_path) as expected:
        with gpsdio.open(six.moves.StringIO(result),
                         driver='NewlineJSON', compression=False) as actual:
            for e, a in zip(expected, actual):
                print(e)
                print(a)
                msg_almost_equal(e, a)
def test_cat_geojson_non_posit(types_json_path, tmpdir):
    pth = str(tmpdir.mkdir('test').join('test_cat_geojson_non_posit.json'))
    with gpsdio.open(types_json_path) as src, gpsdio.open(pth, 'w') as dst:
        # Write a single type 1 and a single type 5 - one posit and one non-posit
        messages = {msg['type']: msg for msg in src}
        dst.write(messages[1])
        dst.write(messages[5])

    result = subprocess.check_output(
        ['gpsdio', 'cat', '--geojson', pth]).decode('utf-8')
    geojson = [json.loads(l) for l in result.splitlines()]

    # Only the posit message should survive --geojson
    assert len(geojson) == 1
    assert geojson[0]['properties']['type'] == 1
def test_sort():
    with unittestfiles():
        with gpsdio.open('unittest.out.msg.xz', "w") as f:
            f.writerows([
                {"mmsi": "123", "name": "Rainbow warrior", "speed": 1.0},
                {"name": "France", "speed": 1.1},
                {"mmsi": "456", "name": "Rainbow warrior II", "speed": 2.0}
            ])
        with gpsdio.open('unittest.out.msg.xz') as f:
            rows = list(f)
            assert len(rows) == 3
            assert rows[0]["name"] == "Rainbow warrior"
def test_validate_msg(types_json_path):
    # Not enough fields
    with gpsdio.open(types_json_path) as src:
        with pytest.raises(gpsdio.errors.SchemaError):
            src.validate_msg({'type': 1})

    # Too many fields
    with gpsdio.open(types_json_path) as src:
        msg = {k: v.get('default', None)
               for k, v in six.iteritems(src.schema[1])}
        msg['other'] = None
        with pytest.raises(gpsdio.errors.SchemaError):
            src.validate_msg(msg)
def test_bad_message():
    message = {'mmsi': 123456789, 'type': 1}
    stream = StringIO(json.dumps(message))
    with pytest.raises(gpsdio.errors.SchemaError):
        with gpsdio.open(stream, driver='NewlineJSON', compression=False) as src:
            next(src)
def _processor(args):

    """
    Create an empty numpy array, open a file, and iterate over all the
    messages with latitude and longitude.  Every time a point intersects
    an array element the element gets a +1.

    Parameters
    ----------
    args : dict
        ctx_obj : dict
            The `ctx.obj` from the parent Click context if it is a
            dictionary, or an empty dict.
        filepath : str
            Path to the file to process.
        meta : dict
            Metadata for a `rasterio` raster.  There are issues pickling
            `affine.Affine()` so `meta['transform']` are the affine elements
            as a tuple and a local instance of `affine.Affine()` is
            constructed before processing.
        field : str or None
            If summing a field rather than computing density this is the
            field name.

    Returns
    -------
    np.array
    """

    ctx_obj = args['ctx_obj']
    filepath = args['filepath']
    meta = args['meta']
    field = args['field']

    filter_expr = "isinstance(msg.get('lat'), (int, float)) and " \
                  "isinstance(msg.get('lon'), (int, float))"
    if field is not None:
        filter_expr += " and '%s' in msg" % field

    log.debug("Starting %s" % filepath)
    log.debug("Filter expr: %s" % filter_expr)

    data = np.zeros((meta['height'], meta['width']), dtype=meta['dtype'])
    width = meta['width']
    height = meta['height']

    # Rebuild the transform locally - see the docstring for why it's a tuple
    aff = affine.Affine(*meta['transform'])

    with gpsdio.open(filepath,
                     driver=ctx_obj.get('i_drv'),
                     compression=ctx_obj.get('i_cmp')) as src:
        # Use the expression built above so the optional field check applies
        for msg in gpsdio.filter(src, filter_expr):
            col, row = (msg['lon'], msg['lat']) * ~aff
            if 0 <= row < height and 0 <= col < width:
                if field is not None:
                    val = msg[field]
                else:
                    val = 1
                # Affine math yields fractional indices; cast for indexing
                data[int(row)][int(col)] += val

    return data
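# --- A minimal, hedged sketch (not part of the plugin) of the transform
# round-trip described in the docstring above: the parent process ships the
# six affine coefficients as a plain tuple, and the worker rebuilds and
# inverts the transform to map lon/lat onto array indices.  The coefficient
# values here are made up for illustration.

from affine import Affine

transform_tuple = (0.1, 0.0, -180.0, 0.0, -0.1, 90.0)  # a, b, c, d, e, f
aff = Affine(*transform_tuple)

# The inverse transform maps world coordinates to fractional array indices
lon, lat = -122.4, 37.8
col, row = (lon, lat) * ~aff
print(int(row), int(col))  # row/col of the cell containing the point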
def test_cat_geojson(types_msg_gz_path):
    result = subprocess.check_output(
        ['gpsdio', 'cat', '--geojson', types_msg_gz_path]).decode('utf-8')
    actual = (json.loads(l) for l in result.splitlines())
    with gpsdio.open(types_msg_gz_path) as expected:
        for e, a in zip(gpsdio.ops.geojson(expected), actual):
            assert a['type'] == 'Feature'
            assert a['geometry']['type'] == 'Point'
            assert e == a
def test_no_validate_messages():
    message = {'field': 'val'}
    stream = StringIO(json.dumps(message))
    with gpsdio.open(stream, driver='NewlineJSON',
                     compression=False, _check=False) as src:
        msgs = list(src)
        assert len(msgs) == 1
        assert msgs[0] == message
def test_load(types_json_path, types_msg_gz_path, tmpdir, compare_msg):
    pth = str(tmpdir.mkdir('test').join('test_load'))
    with open(types_json_path) as f:
        stdin_input = f.read()

    result = CliRunner().invoke(gpsdio.cli.main.main_group, [
        'load',
        '--o-drv', 'NewlineJSON',
        '--o-cmp', 'GZIP',
        pth
    ], input=stdin_input)
    assert result.exit_code == 0

    with gpsdio.open(types_msg_gz_path) as expected, \
            gpsdio.open(pth, driver='NewlineJSON', compression='GZIP') as actual:
        for e, a in zip(expected, actual):
            assert compare_msg(e, a)
def test_io_on_closed_stream(tmpdir):
    # Make sure the tempfile actually appears on disk
    pth = str(tmpdir.mkdir('test').join('test_io_on_closed_file'))
    with open(pth, 'w') as f:
        f.write('')

    # Have to check both read and write modes to trigger all the exceptions
    for mode in ('r', 'w'):
        with gpsdio.open(pth, mode=mode, driver='NewlineJSON') as src:
            src.close()
            assert src.closed
            with pytest.raises(Exception):
                next(src)
            with pytest.raises(Exception):
                src.write(None)
def insp(ctx, infile, interpreter, input_driver,
         input_compression, input_driver_opts, input_compression_opts):
    # A good idea borrowed from Fiona and rasterio
    # https://github.com/Toblerity/Fiona
    # https://github.com/Mapbox/rasterio

    """
    Open a dataset in an interactive inspector.

    IPython will be used if it can be imported unless otherwise specified.

    Analogous to doing:

    \b
        >>> import gpsdio
        >>> with gpsdio.open(infile) as stream:
        ...     # Operations
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting insp')

    header = os.linesep.join((
        "gpsdio {gversion} Interactive Inspector Session (Python {pyversion})"
        .format(gversion=gpsdio.__version__,
                pyversion='.'.join(map(str, sys.version_info[:3]))),
        'Try "help(src)" or "next(src)".'))

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:
        scope = {'src': src, 'gpsdio': gpsdio}
        if not interpreter:
            code.interact(header, local=scope)
        elif interpreter == 'ipython':
            import IPython
            IPython.InteractiveShell.banner1 = header
            IPython.start_ipython(argv=[], user_ns=scope)
        else:
            raise click.BadParameter(
                "Unrecognized interpreter: {}".format(interpreter))
def test_filter(types_msg_gz_path, tmpdir, runner):
    pth = str(tmpdir.mkdir('test').join('test_filter'))
    result = runner.invoke(gpsdio.cli.main.main_group, [
        'etl',
        '--o-drv', 'MsgPack',
        '--o-cmp', 'BZ2',
        '--filter', "lat and lon",
        '--sort', 'lat',
        types_msg_gz_path,
        pth
    ])
    assert result.exit_code == 0

    prev = None
    with gpsdio.open(pth, driver='MsgPack', compression='BZ2') as actual:
        for msg in actual:
            if prev is not None:
                assert msg['lat'] >= prev['lat']
            # Track the previous message so consecutive pairs are compared
            prev = msg
def test_sort_time(types_msg_gz_path, tmpdir, runner):
    # Process everything and sort on timestamp
    pth = str(tmpdir.mkdir('test').join('test_sort_time'))
    result = runner.invoke(gpsdio.cli.main.main_group, [
        'etl',
        '--o-drv', 'MsgPack',
        '--o-cmp', 'BZ2',
        '--sort', 'timestamp',
        types_msg_gz_path,
        pth
    ])
    print(result.output)
    assert result.exit_code == 0

    prev = None
    with gpsdio.open(pth, driver='MsgPack', compression='BZ2') as actual:
        for msg in actual:
            if prev is not None:
                assert msg['timestamp'] >= prev['timestamp']
            # Track the previous message so consecutive pairs are compared
            prev = msg
def cat(ctx, infile, input_driver, geojson, input_compression,
        input_driver_opts, input_compression_opts, output_driver_opts):

    """
    Print messages to stdout as newline JSON.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting cat')

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        base_driver = gpsdio.base.BaseDriver(schema=src.schema)

        if geojson:
            outlib = nlj
            kwargs = output_driver_opts
            kwargs.update(json_lib=kwargs.get('json_lib', ujson))
        else:
            outlib = gpsdio
            kwargs = {
                'driver': 'NewlineJSON',
                'compression': False,
                'do': output_driver_opts
            }
            kwargs.update(**ctx.obj['odefine'])

        out = click.get_text_stream('stdout')
        with outlib.open(out, 'w', **kwargs) as dst:
            for msg in src:
                if geojson:
                    if 'lat' in msg and 'lon' in msg:
                        # Dump datetimes to string
                        msg = gpsdio.ops.msg2geojson(base_driver.dump(msg))
                    else:
                        continue
                dst.write(msg)
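# --- Hedged sketch (not gpsdio code) of the GeoJSON shape `cat --geojson`
# emits for a posit message, inferred from the tests in this repo: a Point
# Feature whose non-coordinate fields become properties.
# `gpsdio.ops.msg2geojson()` is the authoritative implementation and may
# differ in details.

msg = {'type': 1, 'mmsi': 123456789, 'lat': 37.8, 'lon': -122.4}

feature = {
    'type': 'Feature',
    'geometry': {'type': 'Point', 'coordinates': [msg['lon'], msg['lat']]},
    'properties': {k: v for k, v in msg.items() if k not in ('lat', 'lon')},
}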
def __init__(self, filename, __bare__=False, **kw):
    self.filename = filename

    for key, value in kw.items():
        setattr(self, key, value)

    if __bare__:
        return

    self.root = quad_tree_node.QuadtreeNode(self)

    print("Loading data...")
    with utils.msgpack_open(self.root.source_filename, "w") as outf:
        with gpsdio.open(filename) as f:
            for row in f:
                out_row = {}
                for key, value in row.items():
                    if isinstance(value, datetime.datetime):
                        # Convert datetimes to epoch milliseconds
                        value = float(value.strftime("%s")) * 1000.0
                    if not isinstance(value, (float, int, bool)):
                        continue
                    out_row[key] = value
                outf.write(out_row)
                self.root.count += 1
def test_read(types_nmea_path):
    with gpsdio.open(types_nmea_path) as src:
        validate_stream(src)
import csv
import datetime
import math
import sys

import gpsdio

names = set()
classifications = {}
with open(sys.argv[2]) as f:
    for row in csv.DictReader(f, skipinitialspace=True):
        mmsi = row['mmsi'] = int(row['mmsi'])
        if mmsi not in classifications:
            classifications[mmsi] = {}
        t = row['start_hour_ms'] = datetime.datetime.utcfromtimestamp(
            float(row['start_hour_ms']) / 1000.0)
        if t not in classifications[mmsi]:
            classifications[mmsi][t] = set()
        classifications[mmsi][t].add(row['classification'])
        names.add(row['classification'])

with gpsdio.open(sys.argv[3], "w") as outf:
    with gpsdio.open(sys.argv[1]) as f:
        c = 0
        for row in f:
            mmsiclass = classifications[int(row['mmsi'])]
            keys = [key for key in mmsiclass.keys()
                    if key <= row['timestamp']
                    and key + datetime.timedelta(hours=1) >= row['timestamp']]
            if keys:
                key = keys[0]
                clss = mmsiclass[key]
                total = len(clss)
                if total > 0:
                    fishing = len([cls for cls in clss
                                   if cls and cls != 'Not fishing'])
                    row['classification'] = float(fishing) / float(total)
def test_default_mode_is_read(types_msg_path):
    with gpsdio.open(types_msg_path) as stream:
        assert stream.mode == 'r'
"measure_new_score_21600", "measure_new_score_3600", "measure_new_score_43200", "measure_new_score_86400", "measure_pos_10800", "measure_pos_1800", "measure_pos_21600", "measure_pos_3600", "measure_pos_43200", "measure_pos_86400", "measure_speed", "measure_speed_diff", "measure_speedavg_10800", "measure_speedavg_1800", "measure_speedavg_21600", "measure_speedavg_3600", "measure_speedavg_43200", "measure_speedavg_86400", "measure_speedstddev_10800", "measure_speedstddev_1800", "measure_speedstddev_21600", "measure_speedstddev_3600", "measure_speedstddev_43200", "measure_speedstddev_86400", "speed", "heading", "course", "measure_course", "timestamp", "timestamp_diff" ] length = 0 with gpsdio.open(sys.argv[1], skip_failures=True) as f: for row in f: length += 1 x = numpy.zeros(length, dtype=[(name, "f8") for name in cols + ['mmsi']]) segids = {} segid_counter = 0 with gpsdio.open(sys.argv[1], skip_failures=True) as f: for rownum, row in enumerate(f): for col in cols: val = row.get(col, numpy.Infinity) if isinstance(val, datetime.datetime): val = float(val.strftime("%s")) x[col][rownum] = val
import sys

import gpsdio
import numpy

import vessel_scoring.models
import vessel_scoring.utils

models = vessel_scoring.models.load_models()

model = sys.argv[1]
input = sys.argv[2]
output = sys.argv[3]

if input.endswith(".msg"):
    with gpsdio.open(output, "w") as fout:
        with gpsdio.open(input) as fin:
            # predict_messages() is assumed to yield scored messages, so
            # write them all rather than passing the iterator to write()
            fout.writerows(models[model].predict_messages(fin))
else:
    data = numpy.load(input)['x']
    datalen = len(data)
    data = vessel_scoring.utils.numpy_to_messages(data)
    data = models[model].predict_messages(data)
    data = vessel_scoring.utils.messages_to_numpy(data, datalen)
    numpy.savez(output, x=data)
def info(ctx, infile, indent, meta_member, sort_field, with_mmsi_hist,
         with_type_hist, with_field_hist, with_all, input_driver,
         input_driver_opts, input_compression, input_compression_opts):

    """
    Print metadata about a datasource as JSON.

    Can optionally print a single item as a string.

    One caveat of this tool is that JSON does not support integer keys,
    which means that the keys of items like `type_histogram` and
    `mmsi_histogram` have been converted to strings when in reality they
    should be integers.  Tools reading the JSON output will need to account
    for this when parsing.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting info')

    if meta_member == 'mmsi_histogram':
        with_mmsi_hist = True
    if meta_member == 'type_histogram':
        with_type_hist = True
    if meta_member == 'field_histogram':
        with_field_hist = True

    xmin = ymin = xmax = ymax = None
    ts_min = ts_max = None
    mmsi_hist = {}
    msg_typehist = {}
    field_hist = {}
    is_sorted = True
    prev_ts = None

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        # Count explicitly so an empty file reports 0 rather than 1
        msg_count = 0
        for msg in src:
            msg_count += 1

            x = msg.get('lon')
            y = msg.get('lat')
            mmsi = msg.get('mmsi')
            msg_type = msg.get('type')
            sort_val = msg.get(sort_field)

            for key in msg.keys():
                field_hist.setdefault(key, 0)
                field_hist[key] += 1

            if sort_val is not None:

                # Adjust min and max timestamp
                if ts_min is None or sort_val < ts_min:
                    ts_min = sort_val
                if ts_max is None or sort_val > ts_max:
                    ts_max = sort_val

                # Figure out if the data is sorted by time
                if prev_ts is None:
                    prev_ts = sort_val
                elif (sort_val and prev_ts) and sort_val < prev_ts:
                    is_sorted = False

            if x is not None and y is not None:

                # Adjust bounding box
                if xmin is None or x < xmin:
                    xmin = x
                if ymin is None or y < ymin:
                    ymin = y
                if xmax is None or x > xmax:
                    xmax = x
                if ymax is None or y > ymax:
                    ymax = y

            # Type histogram
            msg_typehist.setdefault(msg_type, 0)
            msg_typehist[msg_type] += 1

            # MMSI histogram
            mmsi_hist.setdefault(mmsi, 0)
            mmsi_hist[mmsi] += 1

    stats = {
        'bounds': (xmin, ymin, xmax, ymax),
        'count': msg_count,
        'min_timestamp': gpsdio.validate.datetime2str(ts_min),
        'max_timestamp': gpsdio.validate.datetime2str(ts_max),
        'sorted': is_sorted,
        'num_unique_mmsi': len(set(mmsi_hist.keys())),
        'num_unique_type': len(set(msg_typehist.keys())),
        'num_unique_field': len(set(field_hist.keys()))
    }

    if with_all or with_mmsi_hist:
        stats['mmsi_histogram'] = OrderedDict(
            ((k, mmsi_hist[k]) for k in sorted(mmsi_hist.keys())))
    if with_all or with_type_hist:
        stats['type_histogram'] = OrderedDict(
            ((k, msg_typehist[k]) for k in sorted(msg_typehist.keys())))
    if with_all or with_field_hist:
        stats['field_histogram'] = OrderedDict(
            ((k, field_hist[k]) for k in sorted(field_hist.keys())))

    stats = OrderedDict((k, stats[k]) for k in sorted(stats.keys()))

    if meta_member:
        if isinstance(stats[meta_member], (tuple, list)):
            click.echo(" ".join((map(str, stats[meta_member]))))
        elif isinstance(stats[meta_member], (dict, bool)):
            click.echo(json.dumps(stats[meta_member], indent=indent))
        else:
            click.echo(stats[meta_member])
    else:
        click.echo(json.dumps(stats, indent=indent))
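# --- Consumer-side sketch for the integer-key caveat in the docstring
# above: JSON forces histogram keys to strings, so a tool parsing
# `gpsdio info` output can restore them like this.  The sample JSON below
# is hypothetical.

import json

info_output = '{"type_histogram": {"1": 10, "5": 2}}'
stats = json.loads(info_output)
type_hist = {int(k): v for k, v in stats['type_histogram'].items()}
assert type_hist == {1: 10, 5: 2}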
def etl(ctx, infile, outfile, filter_expr, sort_field, input_driver,
        input_driver_opts, input_compression, input_compression_opts,
        output_driver, output_driver_opts, output_compression,
        output_compression_opts):

    """
    Format conversion, filtering, and sorting.

    Messages are filtered before sorting to limit the amount of data kept
    in memory.

    Filtering expressions take the form of Python boolean expressions and
    provide access to fields and the entire message via a custom scope.
    Each field name can be referenced directly and the entire message is
    available via a `msg` variable.  It is important to remember that
    `gpsdio` converts `timestamps` to `datetime.datetime()` objects
    internally.

    Since fields differ by message type, any expression that raises a
    `NameError` when evaluated is considered a failure.

    Any Python expression that evaluates as `True` or `False` can be used,
    so expressions can be combined into a single filter using `and` or
    split into multiple filters by using one instance of `--filter` for
    each side of the `and`.

    Only process messages containing a timestamp:

    \b
        $ gpsdio ${INFILE} ${OUTFILE} \\
            --filter "'timestamp' in msg"

    Only process messages from May 2010 for a specific MMSI:

    \b
        $ gpsdio ${INFILE} ${OUTFILE} \\
            --filter "timestamp.month == 5 and timestamp.year == 2010" \\
            --filter "mmsi == 123456789"

    Filter and sort:

    \b
        $ gpsdio ${INFILE} ${OUTFILE} \\
            --filter "timestamp.year == 2010" \\
            --sort timestamp
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting etl')

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:
        with gpsdio.open(outfile, 'w',
                         driver=output_driver,
                         compression=output_compression,
                         do=output_driver_opts,
                         co=output_compression_opts,
                         **ctx.obj['odefine']) as dst:

            iterator = gpsdio.ops.filter(filter_expr, src) if filter_expr else src
            for msg in gpsdio.ops.sort(iterator, sort_field) if sort_field else iterator:
                dst.write(msg)
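# --- Minimal sketch of the evaluation model the docstring above describes
# (an illustration, not `gpsdio.ops.filter()` itself): field names are
# exposed directly, the whole message is available as `msg`, and a
# NameError from a missing field counts as a failed filter.

def _passes(expr, msg):
    try:
        return bool(eval(expr, {'msg': msg}, dict(msg)))
    except NameError:
        # Field not present for this message type
        return False

assert _passes("mmsi == 123456789", {'mmsi': 123456789, 'type': 1})
assert not _passes("speed > 5", {'mmsi': 123456789})  # no 'speed' field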
def test_no_detect_compression(types_msg_path):
    with gpsdio.open(types_msg_path, compression=False) as actual, \
            gpsdio.open(types_msg_path) as expected:
        for e_line, a_line in zip(expected, actual):
            assert e_line == a_line
def gpsdio_filtersplit(ctx, infile, outfile, split, buckets, filter_expr,
                       timeresolution, filter_env_expr, change_exprs):
    """
    Filter by expression and split by field.

    Filter expressions are passed to `eval()` and must be valid Python
    code.  Column names are available as variables and types are
    maintained.

    Examples:

        Speed range

            --filter '0.5 < speed < 10.0'

        Specific MMSI's

            --filter "mmsi in ['123456789', '987654321']"

        Bounding box

            --filter "20 <= lon <= 30 and -10 <= lat <= 30"

        Splitting into 4 buckets of MMSI's for parallelization of further
        processing:

            --split "mmsi" --buckets 4

        Using external functions to evaluate rows

            --filter "fishing_score(lat, lon, speed, course) > 0.5"
            --filter-env scoring_functions.py

        Using a change expression to split by a lat/lon grid:

            --change "latgrid=round(row.get('lat',0)/10)*10"
            --change "longrid=round(row.get('lon',0)/10)*10"
            --split "latgrid,longrid"
    """

    change_exprs = {key: value
                    for key, value in (item.split("=") for item in change_exprs)}

    # Apply the default before splitting the comma-separated field list
    if split is None:
        split = "mmsi"
    split = split.split(",")

    filter_env = {}
    if filter_env_expr:
        exec(filter_env_expr, filter_env, filter_env)

    def getKey(row, key):
        value = row[key]
        if isinstance(value, datetime.datetime):
            return value.strftime(timeresolution)
        else:
            return value

    bucketinfo = {}
    splitkey = ''

    with gpsdio.open(infile,
                     driver=ctx.obj['i_drv'], do=ctx.obj['i_drv_opts'],
                     compression=ctx.obj['i_cmp'], co=ctx.obj['i_cmp_opts']) as f:
        for row in f:
            env_vars = dict(filter_env)
            env_vars.update(row)
            env_vars['row'] = env_vars

            for key, expr in change_exprs.items():
                env_vars[key] = row[key] = eval(expr, env_vars)

            if filter_expr is not None:
                if not eval(filter_expr, env_vars):
                    continue

            try:
                splitkey = ','.join('%s=%s' % (key, getKey(row, key))
                                    for key in split)
            except KeyError:
                pass
            else:
                if buckets is not None:
                    bucket = str(int(hashlib.sha224(
                        splitkey.encode('utf-8')).hexdigest(), 16) % buckets)
                    bucketinfo[splitkey] = bucket
                    splitkey = "bucket=%s" % (bucket,)

            # Append mode lets different split values interleave; note this
            # reopens the output file for every message
            with gpsdio.open(outfile % {'split': splitkey}, "a",
                             driver=ctx.obj['o_drv'], do=ctx.obj['o_drv_opts'],
                             compression=ctx.obj['o_cmp'],
                             co=ctx.obj['o_cmp_opts']) as out:
                out.writerow(row)

    if buckets is not None:
        with gpsdio.open(outfile % {'split': 'bucketlist'}, "a",
                         driver=ctx.obj['o_drv'], do=ctx.obj['o_drv_opts'],
                         compression=ctx.obj['o_cmp'],
                         co=ctx.obj['o_cmp_opts']) as out:
            for splitkey, bucket in bucketinfo.items():
                out.writerow({'type': -1, 'splitkey': splitkey, 'bucket': bucket})
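# --- Sketch (not part of the plugin) of the deterministic bucket assignment
# behind --split/--buckets: hashing the split key means the same key lands
# in the same bucket on every run, which is what makes re-processing and
# parallel workers safe.

import hashlib

def bucket_for(splitkey, buckets):
    # sha224 needs bytes under Python 3, hence the encode
    return int(hashlib.sha224(splitkey.encode('utf-8')).hexdigest(), 16) % buckets

assert bucket_for("mmsi=123456789", 4) == bucket_for("mmsi=123456789", 4)
assert 0 <= bucket_for("mmsi=123456789", 4) < 4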
def test_write_bad_msg(tmpdir):
    pth = str(tmpdir.mkdir('test').join('test_write_bad_msg'))
    with gpsdio.open(pth, 'w', driver='NewlineJSON') as dst:
        with pytest.raises(Exception):
            # A module object is not serializable and should be rejected
            dst.write({'field': six})
def gpsdio_sort(ctx, infile, outfile, cols='timestamp'):
    """
    Sort messages by column in large files.

    Sorts messages in an arbitrarily large file according to an arbitrary
    set of columns, by default 'timestamp'.  The unix `sort` command is
    used, so large tempfiles may be written.
    """

    # Make sure unix sort is available
    for p in os.environ['PATH'].split(os.pathsep):
        if os.access(os.path.join(p, 'sort'), os.X_OK):
            break
    else:
        raise click.ClickException("Unix sort is not on path.")

    # Figure out if we can get the driver and compression out of the
    # parent click context
    if isinstance(ctx.obj, dict):
        i_drv = ctx.obj.get('i_drv')
        i_cmp = ctx.obj.get('i_cmp')
        o_drv = ctx.obj.get('o_drv')
        o_cmp = ctx.obj.get('o_cmp')
    else:
        i_drv = None
        i_cmp = None
        o_drv = None
        o_cmp = None

    tempfile1 = outfile + ".tmp1"
    tempfile2 = outfile + ".tmp2"

    cols = cols.split(",")

    def mangle(item):
        # Zero-pad numbers and normalize datetimes so a byte-wise sort
        # orders them correctly
        if isinstance(item, int):
            return "%020d" % item
        elif isinstance(item, float):
            return "%040.20f" % item
        elif isinstance(item, datetime.datetime):
            return item.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        else:
            return str(item)

    def getKey(row):
        return ' : '.join(mangle(row.get(col, '')) for col in cols)

    def format_row(row):
        # Escape 0x01 and newline bytes so each msgpack record occupies
        # exactly one line in the tempfile
        return msgpack.dumps(
            gpsdio.schema.export_msg(row)
        ).replace(b'\x01', b'\x01\x01').replace(b'\n', b'\x01\x02')

    def load_row(row):
        # Decode escape pairs in a single left-to-right pass; chained
        # replace() calls can mis-decode a raw 0x02 byte that happens to
        # follow an escaped 0x01
        out = bytearray()
        i = 0
        while i < len(row):
            if row[i] == 0x01:
                out.append(0x0A if row[i + 1] == 0x02 else 0x01)
                i += 2
            else:
                out.append(row[i])
                i += 1
        return gpsdio.schema.import_msg(msgpack.loads(bytes(out)))

    with gpsdio.open(infile, driver=i_drv, compression=i_cmp) as i:
        with open(tempfile1, "wb") as t:
            for line in i:
                key = getKey(line)
                t.write(key.encode("utf-8") + b" * " + format_row(line) + b"\n")

    # Collate using the C locale to sort by byte value.  See
    # http://unix.stackexchange.com/questions/31886/how-do-get-unix-sort-to-sort-in-same-order-as-java-by-unicode-value/31922#31922
    # for info on why this works for utf-8 text too.
    env = dict(os.environ)
    env['LC_COLLATE'] = 'C'
    subprocess.call(["sort", tempfile1, "-o", tempfile2], env=env)

    with open(tempfile2, "rb") as t:
        with gpsdio.open(outfile, "w", driver=o_drv, compression=o_cmp) as o:
            for line in t:
                o.writerow(load_row(line.split(b" * ", 1)[1][:-1]))

    os.unlink(tempfile1)
    os.unlink(tempfile2)
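# --- Quick standalone check (not from the plugin) of why mangle() above
# zero-pads numbers: with LC_COLLATE=C, `sort` compares raw bytes, so
# numeric order has to be encoded lexicographically.  Note the padding
# scheme only holds for non-negative values, which is fine for the default
# 'timestamp' key.

assert ("%020d" % 9) < ("%020d" % 10)            # padded: numeric order
assert ("%040.20f" % 2.5) < ("%040.20f" % 10.0)
assert "10" < "9"                                # unpadded: wrong order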
def test_read_compressed(types_nmea_gz_path):
    with gpsdio.open(types_nmea_gz_path) as src:
        idx = 0
        for idx, msg in enumerate(src):
            assert 'mmsi' in msg and 'type' in msg
        assert idx > 0
def test_wrong_mode(types_json_path):
    with pytest.raises(ValueError):
        with gpsdio.open(types_json_path, mode='bad mode') as src:
            pass
import csv
import datetime
import math
import sys

import gpsdio

names = set()
classifications = {}
with open("classification-hourlyResultsAll.txt") as f:
    for row in csv.DictReader(f, skipinitialspace=True):
        mmsi = row['mmsi'] = int(row['mmsi'])
        if mmsi not in classifications:
            classifications[mmsi] = {}
        t = row['start_hour_ms'] = datetime.datetime.utcfromtimestamp(
            float(row['start_hour_ms']) / 1000.0)
        if t not in classifications[mmsi]:
            classifications[mmsi][t] = set()
        classifications[mmsi][t].add(row['classification'])
        names.add(row['classification'])

with gpsdio.open("classified.msg", "w") as outf:
    with gpsdio.open("tracks.msg") as f:
        c = 0
        for row in f:
            mmsiclass = classifications[int(row['mmsi'])]
            keys = [key for key in mmsiclass.keys()
                    if key <= row['timestamp']
                    and key + datetime.timedelta(hours=1) >= row['timestamp']]
            if keys:
                key = keys[0]
                clss = mmsiclass[key]
                total = len(clss)
                if total > 0:
                    fishing = len([cls for cls in clss
                                   if cls and cls != 'Not fishing'])
                    row['classification'] = float(fishing) / float(total)
            for key in row:
                if isinstance(row[key], float) and (math.isnan(row[key])
                                                    or math.isinf(row[key])):