Example #1
0
def test_stream_attrs(types_json_path):
    """Check ``name``, ``mode``, the attached schema and the driver ``stop()``."""

    # A stream opened from a path reports that path as its name
    with gpsdio.open(types_json_path) as stream:
        assert stream.name == types_json_path

    # A file-like object carries no name, so a placeholder is reported
    with gpsdio.open(
            StringIO(), driver='NewlineJSON', compression=False) as stream:
        assert stream.name == "<unknown name>"

    # The mode handed to open() is reflected back on the stream
    for requested_mode in ('r', 'w', 'a'):
        with gpsdio.open(
                StringIO(),
                mode=requested_mode,
                driver='NewlineJSON',
                compression=False) as stream:
            assert stream.mode == requested_mode

    # The underlying driver object should expose the full schema
    with gpsdio.open(types_json_path) as stream:
        assert stream._stream.schema == gpsdio.schema.build_schema()

    # start() has a matching stop(); both the driver and its file handle
    # must report closed afterwards
    with gpsdio.drivers.NewlineJSONDriver() as driver:
        driver.start(types_json_path)
        assert not driver.closed
        assert not driver.f.closed
        driver.stop()
        assert driver.closed
        assert driver.f.closed
Example #2
0
def test_read_both(types_nmea_path, types_nmea_gz_path):
    """Read the plain and the gzipped NMEA fixtures in lockstep.

    Every pair of messages must be equal and carry ``mmsi`` and ``type``,
    and more than one pair must have been read.
    """
    with gpsdio.open(types_nmea_path) as dsrc, \
            gpsdio.open(types_nmea_gz_path) as csrc:
        # Pre-bind so that empty streams fail the final assertion instead of
        # raising NameError on an unbound loop variable.
        idx = 0
        for idx, (d, c) in enumerate(zip(dsrc, csrc)):
            assert d == c
            assert 'mmsi' in d and 'type' in d
            assert 'mmsi' in c and 'type' in c
        assert idx > 0
Example #3
0
File: load.py Project: WuArj/gpsdio
def load(ctx, outfile, input_driver_opts,
         output_driver, output_driver_opts, output_compression, output_compression_opts):

    """
    Load newline JSON msgs from stdin to a file.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting load')

    # '-' is opened as uncompressed newline JSON; the output side is fully
    # controlled by the driver/compression options passed in.
    with gpsdio.open('-',
                     driver='NewlineJSON',
                     compression=False,
                     do=input_driver_opts,
                     **ctx.obj['idefine']) as src, \
            gpsdio.open(outfile, 'w',
                        driver=output_driver,
                        compression=output_compression,
                        co=output_compression_opts,
                        do=output_driver_opts,
                        **ctx.obj['odefine']) as dst:

        # Copy every message straight through
        for message in src:
            dst.write(message)
Example #4
0
def test_sort():
    """Round-trip 1000 random rows through ``test.csv``.

    Checks the raw CSV layout (unknown fields are packed as JSON into an
    ``extra`` column) and that reading back through ``gpsdio`` restores
    typed values and the extra field.
    """
    cleanup()
    try:
        with gpsdio.open("test.csv", "w") as f:
            for _ in range(1000):
                f.writerow({
                    'timestamp': randdate(),
                    'lat': 180 * random.random() - 90.0,
                    'lon': 360 * random.random() - 180.0,
                    'foo': 4711,
                })

        # Inspect the file with the plain csv module to check the on-disk form
        with open("test.csv") as f:
            for row in csv.DictReader(f):
                assert 'timestamp' in row
                assert 'lat' in row
                assert 'lon' in row
                assert 'extra' in row
                extra = json.loads(row['extra'])
                assert 'foo' in extra

        with gpsdio.open('test.csv') as f:
            for row in f:
                # Function-call form works on both Python 2 and Python 3
                print(row)
                assert isinstance(row['timestamp'], datetime.datetime)
                assert isinstance(row['lat'], float)
                assert 'foo' in row

    finally:
        cleanup()
Example #5
0
def test_read_from_write_stream(types_msg_gz_path, tmpdir):
    """Iterating a stream that was opened for writing raises ``TypeError``."""
    out_path = str(tmpdir.mkdir('test').join('rw-io.json'))
    with gpsdio.open(types_msg_gz_path) as src:
        with gpsdio.open(out_path, 'w', driver='NewlineJSON') as dst:
            # Populate the output first so the stream has really been used
            for message in src:
                dst.write(message)
            with pytest.raises(TypeError):
                next(dst)
Example #6
0
def test_cat(types_msg_gz_path, msg_almost_equal):
    """Compare the output of ``gpsdio cat`` with a direct read of the file."""
    raw_output = subprocess.check_output(['gpsdio', 'cat', types_msg_gz_path])
    result = raw_output.decode('utf-8')

    with gpsdio.open(types_msg_gz_path) as expected, \
            gpsdio.open(six.moves.StringIO(result),
                        driver='NewlineJSON',
                        compression=False) as actual:
        for want, got in zip(expected, actual):
            # Printed so a failing comparison shows both messages
            print(want)
            print(got)
            msg_almost_equal(want, got)
Example #7
0
def test_cat_geojson_non_posit(types_json_path, tmpdir):
    """``gpsdio cat --geojson`` must emit only the positional message.

    A file holding one type 1 and one type 5 message is written, and exactly
    one GeoJSON feature (the type 1 message) is expected back.
    """
    pth = str(tmpdir.mkdir('test').join('test_cat_geojson_non_posit.json'))
    with gpsdio.open(types_json_path) as src, gpsdio.open(pth, 'w') as dst:
        # Write a single type 1 and a single type 5 - one posit and one non-posit
        messages = {msg['type']: msg for msg in src}
        dst.write(messages[1])
        dst.write(messages[5])
    result = subprocess.check_output(['gpsdio', 'cat', '--geojson',
                                      pth]).decode('utf-8')
    geojson = [json.loads(l) for l in result.splitlines()]
    # Compare by value: `is` on integers only works by accident of
    # CPython's small-int caching.
    assert len(geojson) == 1
    assert geojson[0]['properties']['type'] == 1
Example #8
0
def test_sort():
    """Write three rows to an xz-compressed msgpack file and read them back."""
    records = [
        {"mmsi": "123", "name": "Rainbow warrior", "speed": 1.0},
        {"name": "France", "speed": 1.1},
        {"mmsi": "456", "name": "Rainbow warrior II", "speed": 2.0},
    ]
    with unittestfiles():
        with gpsdio.open('unittest.out.msg.xz', "w") as out_stream:
            out_stream.writerows(records)

        with gpsdio.open('unittest.out.msg.xz') as in_stream:
            rows = list(in_stream)

        # All rows survive and the first one is still first
        assert len(rows) == 3
        assert rows[0]["name"] == "Rainbow warrior"
Example #9
0
def test_validate_msg(types_json_path):
    """``validate_msg`` rejects messages with missing or with extra fields."""

    # Too few fields: only the type is present
    with gpsdio.open(types_json_path) as src:
        with pytest.raises(gpsdio.errors.SchemaError):
            src.validate_msg({'type': 1})

    # Too many fields: every schema field for type 1 plus one unknown key
    with gpsdio.open(types_json_path) as src:
        msg = dict(
            (name, definition.get('default', None))
            for name, definition in six.iteritems(src.schema[1]))
        msg['other'] = None
        with pytest.raises(gpsdio.errors.SchemaError):
            src.validate_msg(msg)
Example #10
0
def test_bad_message():
    """Reading a message that lacks required fields raises ``SchemaError``."""
    payload = json.dumps({'mmsi': 123456789, 'type': 1})
    stream = StringIO(payload)
    with pytest.raises(gpsdio.errors.SchemaError):
        with gpsdio.open(
                stream, driver='NewlineJSON', compression=False) as src:
            next(src)
Example #11
0
def _processor(args):

    """
    Create an empty numpy array, open a file, and iterate over all the messages
    with latitude and longitude.  Every time a point intersects an array element
    the element gets a +1 (or the value of `field` when one is given).

    Parameters
    ----------
    args : dict
        ctx_obj : dict
            The `ctx.obj` from the parent Click context if it is a dictionary,
            or an empty dict.
        filepath : str
            Path to the file to process.
        meta : dict
            Metadata for a `rasterio` raster.  There are issues pickling
            `affine.Affine()` so `meta['transform']` are the affine elements
            as a tuple and a local instance of `affine.Affine()` is
            constructed before processing.
        field : str or None
            If summing a field rather than computing density this is the field name.

    Returns
    -------
    np.array
    """

    ctx_obj = args['ctx_obj']
    filepath = args['filepath']
    meta = args['meta']
    field = args['field']

    filter_expr = "isinstance(msg.get('lat'), (int, float)) and " \
                  "isinstance(msg.get('lon'), (int, float))"

    # When summing a field, only keep messages that actually carry it;
    # otherwise `msg[field]` below would raise KeyError.
    if field is not None:
        filter_expr += " and '%s' in msg" % field

    log.debug("Starting %s" % filepath)
    log.debug("Filter expr: %s" % filter_expr)

    data = np.zeros((meta['height'], meta['width']), dtype=meta['dtype'])
    width = meta['width']
    height = meta['height']
    # NOTE(review): the docstring says meta['transform'] is a tuple from which
    # an affine.Affine() is built, but no such construction happens here and
    # `~aff` below needs an invertible transform object - confirm with caller.
    aff = meta['transform']
    with gpsdio.open(filepath, driver=ctx_obj.get('i_drv'),
                     compression=ctx_obj.get('i_cmp')) as src:
        # Use the expression built (and logged) above so the field-presence
        # clause is actually applied.
        for msg in gpsdio.filter(src, filter_expr):
            col, row = (msg['lon'], msg['lat']) * ~aff
            if 0 <= row < height and 0 <= col < width:
                val = msg[field] if field is not None else 1
                # Truncate to integer cell indices; both values are known to
                # be non-negative here, and numpy rejects float indices.
                data[int(row), int(col)] += val

    return data
Example #12
0
def test_cat_geojson(types_msg_gz_path):
    # Run `gpsdio cat --geojson` and check that each output line is a
    # GeoJSON point feature.
    result = subprocess.check_output(
        ['gpsdio', 'cat', '--geojson', types_msg_gz_path]).decode('utf-8')
    # One JSON document per output line, parsed lazily
    actual = (json.loads(l) for l in result.splitlines())
    with gpsdio.open(types_msg_gz_path) as expected:
        for e, a in zip(gpsdio.ops.geojson(expected), actual):
            assert a['type'] == 'Feature'
            assert a['geometry']['type'] == 'Point'
            # NOTE(review): this compares `a` with itself and can never fail;
            # `e` is otherwise unused.  Presumably `e == a` was intended, but
            # `e` may still hold values (e.g. datetimes) that the CLI output
            # serialised - confirm before tightening the assertion.
            assert a == a
Example #13
0
def test_no_validate_messages():
    """With ``_check=False`` an arbitrary message is passed through unchanged."""
    message = {'field': 'val'}
    stream = StringIO(json.dumps(message))
    with gpsdio.open(stream,
                     driver='NewlineJSON',
                     compression=False,
                     _check=False) as src:
        msgs = list(src)
        # Value comparison; `is 1` only works through small-int caching
        assert len(msgs) == 1
        assert msgs[0] == message
Example #14
0
def test_load(types_json_path, types_msg_gz_path, tmpdir, compare_msg):
    """Pipe newline JSON into ``gpsdio load`` and verify the gzipped output."""

    pth = str(tmpdir.mkdir('test').join('test_load'))
    with open(types_json_path) as f:
        stdin_input = f.read()

    result = CliRunner().invoke(gpsdio.cli.main.main_group, [
        'load',
        '--o-drv', 'NewlineJSON',
        '--o-cmp', 'GZIP',
        pth

    ], input=stdin_input)

    # Compare the exit status by value rather than by identity
    assert result.exit_code == 0

    with gpsdio.open(types_msg_gz_path) as expected, \
            gpsdio.open(pth, driver='NewlineJSON', compression='GZIP') as actual:
        for e, a in zip(expected, actual):
            assert compare_msg(e, a)
Example #15
0
def test_io_on_closed_stream(tmpdir):
    """Reading from or writing to a closed stream must raise."""

    # Create the file up front so it exists on disk for read mode
    target = str(tmpdir.mkdir('test').join('test_io_on_closed_file'))
    with open(target, 'w') as handle:
        handle.write('')

    # Both modes are exercised in order to trigger all the exceptions
    for open_mode in ('r', 'w'):
        with gpsdio.open(target, mode=open_mode, driver='NewlineJSON') as src:
            src.close()
        assert src.closed

        with pytest.raises(Exception):
            next(src)

        with pytest.raises(Exception):
            src.write(None)
Example #16
0
File: insp.py Project: WuArj/gpsdio
def insp(ctx, infile, interpreter, input_driver, input_compression,
         input_driver_opts, input_compression_opts):

    # The inspector idea is borrowed from Fiona and rasterio
    # https://github.com/Toblerity/Fiona
    # https://github.com/Mapbox/rasterio
    """
    Open a dataset in an interactive inspector.

    IPython will be used if it can be imported unless otherwise specified.

    Analogous to doing:

        \b
        >>> import gpsdio
        >>> with gpsdio.open(infile) as stream:
        ...     # Operations
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting insp')

    # Two-line banner shown when the interactive session starts
    title = "gpsdio {gversion} Interactive Inspector Session (Python {pyversion})".format(
        gversion=gpsdio.__version__,
        pyversion='.'.join(map(str, sys.version_info[:3])))
    hint = 'Try "help(src)" or "next(src)".'
    header = os.linesep.join((title, hint))

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        # Names made available inside the interactive session
        scope = dict(src=src, gpsdio=gpsdio)

        if interpreter == 'ipython':
            import IPython
            IPython.InteractiveShell.banner1 = header
            IPython.start_ipython(argv=[], user_ns=scope)
        elif not interpreter:
            # No interpreter requested: fall back to the stdlib console
            code.interact(header, local=scope)
        else:
            raise click.BadParameter(
                "Unrecognized interpreter: {}".format(interpreter))
Example #17
0
def test_filter(types_msg_gz_path, tmpdir, runner):
    """``gpsdio etl --filter ... --sort lat`` must produce output ordered by lat."""

    pth = str(tmpdir.mkdir('test').join('test_filter'))
    result = runner.invoke(gpsdio.cli.main.main_group, [
        'etl', '--o-drv', 'MsgPack', '--o-cmp', 'BZ2', '--filter',
        "lat and lon", '--sort', 'lat', types_msg_gz_path, pth
    ])

    # Compare the exit status by value rather than by identity
    assert result.exit_code == 0

    prev = None
    with gpsdio.open(pth, driver='MsgPack', compression='BZ2') as actual:
        for msg in actual:
            if prev is not None:
                assert msg['lat'] >= prev['lat']
            # Advance the reference so each message is checked against its
            # immediate predecessor, not only against the first message.
            prev = msg
Example #18
0
def test_sort_time(types_msg_gz_path, tmpdir, runner):
    """``gpsdio etl --sort timestamp`` must produce output ordered by timestamp."""

    # Process everything and sort on timestamp
    pth = str(tmpdir.mkdir('test').join('test_sort_time'))
    result = runner.invoke(gpsdio.cli.main.main_group, [
        'etl', '--o-drv', 'MsgPack', '--o-cmp', 'BZ2', '--sort', 'timestamp',
        types_msg_gz_path, pth
    ])

    print(result.output)
    # Compare the exit status by value rather than by identity
    assert result.exit_code == 0

    prev = None
    with gpsdio.open(pth, driver='MsgPack', compression='BZ2') as actual:
        for msg in actual:
            if prev is not None:
                assert msg['timestamp'] >= prev['timestamp']
            # Advance the reference so each message is checked against its
            # immediate predecessor, not only against the first message.
            prev = msg
Example #19
0
File: cat.py Project: WuArj/gpsdio
def cat(ctx, infile, input_driver, geojson, input_compression,
        input_driver_opts, input_compression_opts, output_driver_opts):
    """
    Print messages to stdout as newline JSON.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting cat')

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        # Used to dump messages to their serialisable form for GeoJSON output
        base_driver = gpsdio.base.BaseDriver(schema=src.schema)

        if not geojson:
            # Plain output goes through gpsdio's own NewlineJSON driver
            outlib = gpsdio
            kwargs = dict(driver='NewlineJSON',
                          compression=False,
                          do=output_driver_opts)
            kwargs.update(**ctx.obj['odefine'])
        else:
            # GeoJSON output is written with `nlj`, defaulting to `ujson`
            # unless the caller already chose a JSON library
            outlib = nlj
            kwargs = output_driver_opts
            kwargs.setdefault('json_lib', ujson)

        out = click.get_text_stream('stdout')
        with outlib.open(out, 'w', **kwargs) as dst:
            for msg in src:
                if not geojson:
                    dst.write(msg)
                    continue

                # Messages without a position cannot become a feature
                if 'lat' not in msg or 'lon' not in msg:
                    continue

                # Dump datetimes to string before converting
                dst.write(gpsdio.ops.msg2geojson(base_driver.dump(msg)))
Example #20
0
    def __init__(self, filename, __bare__ = False, **kw):
        """Set up the tree and, unless ``__bare__`` is true, load ``filename``.

        Every extra keyword argument is stored as an attribute on the
        instance.  When loading, each row's numeric/boolean values (with
        datetimes converted to a millisecond number) are written to the root
        node's source file and the root's ``count`` is incremented per row.
        """
        self.filename = filename
        # .items() behaves the same here on Python 2 and Python 3
        for key, value in kw.items():
            setattr(self, key, value)

        if __bare__:
            return

        self.root = quad_tree_node.QuadtreeNode(self)

        # Function-call form works on both Python 2 and Python 3
        print("Loading data...")

        with utils.msgpack_open(self.root.source_filename, "w") as outf:
            with gpsdio.open(filename) as f:
                for row in f:
                    out_row = {}
                    for key, value in row.items():
                        if isinstance(value, datetime.datetime):
                            # NOTE(review): "%s" is a platform-specific
                            # strftime extension and interprets the value in
                            # local time - confirm this is intended.
                            value = float(value.strftime("%s")) * 1000.0
                        # Only plain numeric/boolean values are kept
                        if not isinstance(value, (float, int, bool)):
                            continue
                        out_row[key] = value
                    outf.write(out_row)
                    self.root.count += 1
Example #21
0
def test_read(types_nmea_path):
    """Open the NMEA fixture and run the shared stream validation on it."""
    with gpsdio.open(types_nmea_path) as nmea_stream:
        validate_stream(nmea_stream)
Example #22
0
import sys
import math

# Every distinct classification label seen in the input
names = set()
# mmsi -> {start of hour (datetime) -> set of classification labels}
classifications = {}
with open(sys.argv[2]) as f:
    for row in csv.DictReader(f, skipinitialspace=True):
        mmsi = row['mmsi'] = int(row['mmsi'])
        # The column holds milliseconds since the epoch
        start_seconds = float(row['start_hour_ms']) / 1000.0
        t = row['start_hour_ms'] = datetime.datetime.utcfromtimestamp(
            start_seconds)
        classifications.setdefault(mmsi, {}).setdefault(t, set()).add(
            row['classification'])
        names.add(row['classification'])

with gpsdio.open(sys.argv[3], "w") as outf:
    with gpsdio.open(sys.argv[1]) as f:
        c = 0
        for row in f:
            mmsiclass = classifications[int(row['mmsi'])]
            keys = [
                key for key in mmsiclass.iterkeys()
                if key <= row['timestamp'] and key +
                datetime.timedelta(hours=1) >= row['timestamp']
            ]
            if keys:
                key = keys[0]
                clss = mmsiclass[key]
                total = len(clss)
                if total > 0:
                    fishing = len(
Example #23
0
def test_default_mode_is_read(types_msg_path):
    """A stream opened without an explicit mode reports ``'r'``."""
    with gpsdio.open(types_msg_path) as opened:
        assert opened.mode == 'r'
Example #24
0
    "measure_new_score_21600", "measure_new_score_3600",
    "measure_new_score_43200", "measure_new_score_86400", "measure_pos_10800",
    "measure_pos_1800", "measure_pos_21600", "measure_pos_3600",
    "measure_pos_43200", "measure_pos_86400", "measure_speed",
    "measure_speed_diff", "measure_speedavg_10800", "measure_speedavg_1800",
    "measure_speedavg_21600", "measure_speedavg_3600",
    "measure_speedavg_43200", "measure_speedavg_86400",
    "measure_speedstddev_10800", "measure_speedstddev_1800",
    "measure_speedstddev_21600", "measure_speedstddev_3600",
    "measure_speedstddev_43200", "measure_speedstddev_86400", "speed",
    "heading", "course", "measure_course", "timestamp", "timestamp_diff"
]

# First pass: count the rows so the array can be allocated up front
with gpsdio.open(sys.argv[1], skip_failures=True) as f:
    length = sum(1 for _ in f)

# One float64 field per column in `cols`, plus 'mmsi'
x = numpy.zeros(length, dtype=[(name, "f8") for name in cols + ['mmsi']])

segids = {}
segid_counter = 0

# Second pass: fill the array column by column
with gpsdio.open(sys.argv[1], skip_failures=True) as f:
    for rownum, row in enumerate(f):
        for col in cols:
            # Missing values are stored as +inf.  `numpy.inf` is used because
            # the `numpy.Infinity` alias was removed in NumPy 2.0.
            val = row.get(col, numpy.inf)
            if isinstance(val, datetime.datetime):
                # NOTE(review): "%s" is a platform-specific strftime
                # extension that works in local time - confirm intended.
                val = float(val.strftime("%s"))
            x[col][rownum] = val
Example #25
0
import vessel_scoring.models
import vessel_scoring.utils
import gpsdio
import numpy
import sys

models = vessel_scoring.models.load_models()

# Command line: <model name> <input path> <output path>
model = sys.argv[1]
input = sys.argv[2]
output = sys.argv[3]

if not input.endswith(".msg"):
    # Anything else is loaded with numpy: convert the 'x' array to messages,
    # predict, convert back and save.
    data = numpy.load(input)['x']
    datalen = len(data)
    data = vessel_scoring.utils.messages_to_numpy(
        models[model].predict_messages(
            vessel_scoring.utils.numpy_to_messages(data)),
        datalen)
    numpy.savez(output, x=data)
else:
    # Message files are streamed through gpsdio
    with gpsdio.open(output, "w") as fout, gpsdio.open(input) as fin:
        # NOTE(review): the whole result of predict_messages() is handed to a
        # single write() call - confirm write() accepts an iterable of
        # messages rather than one message.
        fout.write(models[model].predict_messages(fin))
Example #26
0
File: info.py Project: WuArj/gpsdio
def info(ctx, infile, indent, meta_member, sort_field, with_mmsi_hist,
         with_type_hist, with_field_hist, with_all, input_driver,
         input_driver_opts, input_compression, input_compression_opts):
    """
    Print metadata about a datasource as JSON.

    Can optionally print a single item as a string.

    One caveat of this tool is that JSON does not support integer keys, which
    means that the keys of items like `type_histogram` and `mmsi_histogram`
    have been converted to a string when in reality they should be integers.
    Tools reading the JSON output will need to account for this when parsing.
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting info')

    # Asking for a histogram member implies computing that histogram
    if meta_member == 'mmsi_histogram':
        with_mmsi_hist = True
    if meta_member == 'type_histogram':
        with_type_hist = True
    if meta_member == 'field_histogram':
        with_field_hist = True

    xmin = ymin = xmax = ymax = None
    ts_min = ts_max = None
    mmsi_hist = {}
    msg_typehist = {}
    field_hist = {}
    is_sorted = True
    prev_ts = None
    # Counted explicitly so an empty input reports 0 rather than 1
    count = 0

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        for msg in src:
            count += 1

            x = msg.get('lon')
            y = msg.get('lat')
            mmsi = msg.get('mmsi')
            msg_type = msg.get('type')
            sort_val = msg.get(sort_field)

            # Field histogram
            for key in msg:
                field_hist[key] = field_hist.get(key, 0) + 1

            if sort_val is not None:

                # Adjust min and max of the sort field
                if ts_min is None or sort_val < ts_min:
                    ts_min = sort_val
                if ts_max is None or sort_val > ts_max:
                    ts_max = sort_val

                # The data is sorted only if every value is >= the one
                # directly before it, so the reference must advance each time.
                if prev_ts is not None and sort_val < prev_ts:
                    is_sorted = False
                prev_ts = sort_val

            if x is not None and y is not None:

                # Adjust bounding box
                if xmin is None or x < xmin:
                    xmin = x
                if ymin is None or y < ymin:
                    ymin = y
                if xmax is None or x > xmax:
                    xmax = x
                if ymax is None or y > ymax:
                    ymax = y

            # Type histogram
            msg_typehist[msg_type] = msg_typehist.get(msg_type, 0) + 1

            # MMSI histogram
            mmsi_hist[mmsi] = mmsi_hist.get(mmsi, 0) + 1

    stats = {
        'bounds': (xmin, ymin, xmax, ymax),
        'count': count,
        'min_timestamp': gpsdio.validate.datetime2str(ts_min),
        'max_timestamp': gpsdio.validate.datetime2str(ts_max),
        'sorted': is_sorted,
        'num_unique_mmsi': len(mmsi_hist),
        'num_unique_type': len(msg_typehist),
        'num_unique_field': len(field_hist)
    }

    # Histograms are emitted with their keys in sorted order
    if with_all or with_mmsi_hist:
        stats['mmsi_histogram'] = OrderedDict(
            ((k, mmsi_hist[k]) for k in sorted(mmsi_hist)))
    if with_all or with_type_hist:
        stats['type_histogram'] = OrderedDict(
            ((k, msg_typehist[k]) for k in sorted(msg_typehist)))
    if with_all or with_field_hist:
        stats['field_histogram'] = OrderedDict(
            ((k, field_hist[k]) for k in sorted(field_hist)))

    stats = OrderedDict((k, stats[k]) for k in sorted(stats))

    if meta_member:
        # A single member is printed in a form suited to its type
        member = stats[meta_member]
        if isinstance(member, (tuple, list)):
            click.echo(" ".join((map(str, member))))
        elif isinstance(member, (dict, bool)):
            click.echo(json.dumps(member, indent=indent))
        else:
            click.echo(member)
    else:
        click.echo(json.dumps(stats, indent=indent))
Example #27
0
File: etl.py Project: WuArj/gpsdio
def etl(ctx, infile, outfile, filter_expr, sort_field, input_driver,
        input_driver_opts, input_compression, input_compression_opts,
        output_driver, output_driver_opts, output_compression,
        output_compression_opts):
    """
    Format conversion, filtering, and sorting.

    Messages are filtered before sorting to limit the amount of data kept in
    memory.

    Filtering expressions take the form of Python boolean expressions and provide
    access to fields and the entire message via a custom scope.  Each field name
    can be referenced directly and the entire message is available via a `msg`
    variable.  It is important to remember that `gpsdio` converts `timestamps`
    to `datetime.datetime()` objects internally.

    Since fields differ by message type any expression that raises a `NameError`
    when evaluated is considered a failure.

    Any Python expression that evaluates as `True` or `False` can be used, so
    expressions can be combined into a single filter using `and` or split into
    multiple by using one instance of `--filter` for each side of the `and`.

    Only process messages containing a timestamp:

    \b
        $ gpsdio etl ${INFILE} ${OUTFILE} \\
            --filter "'timestamp' in msg"

    Only process messages from May 2010 for a specific MMSI:

    \b
        $ gpsdio etl ${INFILE} ${OUTFILE} \\
            --filter "timestamp.month == 5 and timestamp.year == 2010" \\
            --filter "mmsi == 123456789"

    Filter and sort:

    \b
        $ gpsdio etl ${INFILE} ${OUTFILE} \\
            --filter "timestamp.year == 2010" \\
            --sort timestamp
    """

    logger.setLevel(ctx.obj['verbosity'])
    logger.debug('Starting etl')

    with gpsdio.open(infile,
                     driver=input_driver,
                     compression=input_compression,
                     do=input_driver_opts,
                     co=input_compression_opts,
                     **ctx.obj['idefine']) as src:

        with gpsdio.open(outfile,
                         'w',
                         driver=output_driver,
                         compression=output_compression,
                         do=output_driver_opts,
                         co=output_compression_opts,
                         **ctx.obj['odefine']) as dst:

            # Filter first so that sorting sees as few messages as possible
            messages = src
            if filter_expr:
                messages = gpsdio.ops.filter(filter_expr, messages)
            if sort_field:
                messages = gpsdio.ops.sort(messages, sort_field)

            for msg in messages:
                dst.write(msg)
Example #28
0
def test_no_detect_compression(types_msg_path):
    """``compression=False`` yields the same messages as the default open."""

    with gpsdio.open(types_msg_path, compression=False) as actual, \
            gpsdio.open(types_msg_path) as expected:
        for want, got in zip(expected, actual):
            assert want == got
Example #29
0
def gpsdio_filtersplit(ctx, infile, outfile, split, buckets, filter_expr, timeresolution, filter_env_expr, change_exprs):

    """
    Filter by expression and split by field. Filter expressions are passed to
    `eval()` and must be valid Python code. Column names are available
    as variables and types are maintained.

    Examples:

        Speed range
        --filter '0.5 < speed < 10.0'

        Specific MMSI's
        --filter "mmsi in ['123456789', '987654321']"

        Bounding box
        --filter "20 <= lon <= 30 and -10 <= lat <= 30"

        Splitting into 4 buckets of mmsi:s for parallellization of further processing:
        --split "mmsi" --buckets 4

        Using external functions to evaluate rows
        --filter "fishing_score(lat, lon, speed, course) > 0.5" --filter-env scoring_functions.py

        Using a change expression to split by a lat/lon grid:

        --change "latgrid=round(row.get('lat',0)/10)*10" --change "longrid=round(row.get('lon',0)/10)*10" --split "latgrid,longrid"

    """

    # Split on the first "=" only, so the expression itself may contain "="
    # (e.g. a comparison or a keyword argument).
    change_exprs = dict(item.split("=", 1) for item in change_exprs)

    # Apply the default before splitting; calling .split() on None would raise
    split = ["mmsi"] if split is None else split.split(",")

    filter_env = {}
    if filter_env_expr:
        # SECURITY: this executes caller-supplied code, as do the eval()
        # calls below.  Only use with trusted input.
        # The call form is accepted by both Python 2.7 and Python 3.
        exec(filter_env_expr, filter_env, filter_env)

    def getKey(row, key):
        """Return row[key], with datetimes formatted using `timeresolution`."""
        value = row[key]
        if isinstance(value, datetime.datetime):
            return value.strftime(timeresolution)
        return value

    # splitkey -> bucket number (as a string); only filled when bucketing
    bucketinfo = {}

    splitkey = ''
    with gpsdio.open(infile,
                     driver=ctx.obj['i_drv'], do=ctx.obj['i_drv_opts'],
                     compression=ctx.obj['i_cmp'], co=ctx.obj['i_cmp_opts']) as src:
        for row in src:
            # Evaluation scope: helper environment, then the row's fields,
            # plus `row` referring to the scope itself.
            env_vars = dict(filter_env)
            env_vars.update(row)
            env_vars['row'] = env_vars

            for key, expr in change_exprs.items():
                env_vars[key] = row[key] = eval(expr, env_vars)

            if filter_expr is not None:
                if not eval(filter_expr, env_vars):
                    continue

            try:
                splitkey = ','.join('%s=%s' % (key, getKey(row, key)) for key in split)
            except KeyError:
                # NOTE(review): a row missing a split field keeps the previous
                # row's splitkey and is written to that output - confirm this
                # is the intended best-effort behaviour.
                pass
            else:
                if buckets is not None:
                    # hashlib needs bytes on Python 3; byte strings pass
                    # through unchanged
                    digest_input = splitkey if isinstance(splitkey, bytes) \
                        else splitkey.encode('utf-8')
                    bucket = str(int(hashlib.sha224(digest_input).hexdigest(), 16) % buckets)
                    bucketinfo[splitkey] = bucket
                    splitkey = "bucket=%s" % (bucket,)

            # Append to the output file that belongs to this split key
            with gpsdio.open(outfile % {'split': splitkey}, "a",
                             driver=ctx.obj['o_drv'], do=ctx.obj['o_drv_opts'],
                             compression=ctx.obj['o_cmp'], co=ctx.obj['o_cmp_opts']) as dst:
                dst.writerow(row)

    if buckets is not None:
        # Record which split key went into which bucket
        with gpsdio.open(outfile % {'split': 'bucketlist'}, "a",
                         driver=ctx.obj['o_drv'], do=ctx.obj['o_drv_opts'],
                         compression=ctx.obj['o_cmp'], co=ctx.obj['o_cmp_opts']) as dst:
            for splitkey, bucket in bucketinfo.items():
                dst.writerow({'type': -1, 'splitkey': splitkey, 'bucket': bucket})
Example #30
0
def test_write_bad_msg(tmpdir):
    """Writing a message whose value is a module object must raise."""
    out_path = str(tmpdir.mkdir('test').join('test_write_bad_msg'))
    with gpsdio.open(out_path, 'w', driver='NewlineJSON') as dst:
        bad_message = {'field': six}
        with pytest.raises(Exception):
            dst.write(bad_message)
Example #31
0
def gpsdio_sort(ctx, infile, outfile, cols='timestamp'):

    """
    Sort messages by column in large files.

    Sorts messages in an arbitrarily large file according to an arbitrary set
    of columns, by default 'timestamp'.

    The unix `sort` command is used so large tempfiles may be written.
    """

    # Make sure unix sort is available
    for p in os.environ['PATH'].split(os.pathsep):
        if os.access(os.path.join(p, 'sort'), os.X_OK):
            break
    else:
        raise click.ClickException("Unix sort is not on path.")

    # Figure out if we can get the driver and compression out of the
    # parent click context
    if isinstance(ctx.obj, dict):
        i_drv = ctx.obj.get('i_drv')
        i_cmp = ctx.obj.get('i_cmp')
        o_drv = ctx.obj.get('o_drv')
        o_cmp = ctx.obj.get('o_cmp')
    else:
        i_drv = None
        i_cmp = None
        o_drv = None
        o_cmp = None

    tempfile1 = outfile + ".tmp1"
    tempfile2 = outfile + ".tmp2"

    cols = cols.split(",")

    # NOTE(review): the helpers below rely on Python 2 string semantics
    # (`unicode`, str-based msgpack payloads) - confirm before running on
    # Python 3.

    def mangle(item):
        """Render a value as text whose character order matches value order."""
        if isinstance(item, int):
            return "%020d" % item
        elif isinstance(item, float):
            return "%040.20f" % item
        elif isinstance(item, datetime.datetime):
            return item.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        else:
            return unicode(item).encode("utf-8")

    def getKey(row):
        """Build the sort key for a row from the requested columns."""
        return ' : '.join(mangle(row.get(col, '')) for col in cols)

    def format_row(row):
        """Serialize a row onto a single line by escaping \\1 and newline."""
        return msgpack.dumps(
            gpsdio.schema.export_msg(row)
        ).replace('\1', '\1\1').replace('\n', '\1\2')

    def load_row(row):
        """Inverse of format_row()."""
        return gpsdio.schema.import_msg(
            msgpack.loads(
                row.replace('\1\2', '\n').replace('\1\1', '\1')))

    try:
        # One line per message: "<key> * <escaped payload>"
        with gpsdio.open(infile, driver=i_drv, compression=i_cmp) as i:
            with open(tempfile1, "w") as t:
                for line in i:
                    key = getKey(line)
                    t.write(key + " * " + format_row(line) + '\n')

        # Collate using C locale to sort by character value
        # See http://unix.stackexchange.com/questions/31886/how-do-get-unix-sort-to-sort-in-same-order-as-java-by-unicode-value/31922#31922
        # for info on why this works for utf-8 text too
        env = dict(os.environ)
        env['LC_COLLATE'] = 'C'

        # Raise if sort fails, instead of going on to read a missing or
        # partial output file
        subprocess.check_call(["sort", tempfile1, "-o", tempfile2], env=env)

        with open(tempfile2) as t:
            with gpsdio.open(outfile, "w", driver=o_drv, compression=o_cmp) as o:
                for line in t:
                    # Drop the key and the trailing newline
                    o.writerow(load_row(line.split(" * ", 1)[1][:-1]))
    finally:
        # Always remove the temp files, including when a step above failed
        for path in (tempfile1, tempfile2):
            if os.path.exists(path):
                os.unlink(path)
Example #32
0
def test_read_compressed(types_nmea_gz_path):
    """Every message from the gzipped NMEA fixture has ``mmsi`` and ``type``."""
    with gpsdio.open(types_nmea_gz_path) as src:
        last_index = 0
        for last_index, message in enumerate(src):
            assert 'mmsi' in message and 'type' in message
        # More than one message must have been read
        assert last_index > 0
Example #33
0
def test_wrong_mode(types_json_path):
    """Opening with an unrecognised mode raises ``ValueError``."""
    with pytest.raises(ValueError):
        # The stream object itself is never used; opening must fail
        with gpsdio.open(types_json_path, mode='bad mode'):
            pass
import sys
import math

# Every distinct classification label seen in the input
names = set()
# mmsi -> {start of hour (datetime) -> set of classification labels}
classifications = {}
with open("classification-hourlyResultsAll.txt") as f:
    for row in csv.DictReader(f, skipinitialspace=True):
        mmsi = row['mmsi'] = int(row['mmsi'])
        # The column holds milliseconds since the epoch
        start_seconds = float(row['start_hour_ms']) / 1000.0
        t = row['start_hour_ms'] = datetime.datetime.utcfromtimestamp(
            start_seconds)
        classifications.setdefault(mmsi, {}).setdefault(t, set()).add(
            row['classification'])
        names.add(row['classification'])


with gpsdio.open("classified.msg", "w") as outf:
    with gpsdio.open("tracks.msg") as f:
        c = 0
        for row in f:
            mmsiclass = classifications[int(row['mmsi'])]
            keys = [key for key in mmsiclass.iterkeys()
                   if key <= row['timestamp'] and key + datetime.timedelta(hours=1) >= row['timestamp']]   
            if keys:
                key = keys[0]
                clss = mmsiclass[key]
                total = len(clss)
                if total > 0:
                    fishing = len([cls for cls in clss if cls and cls != 'Not fishing'])
                    row['classification'] = float(fishing) / float(total)
            for key in row:
                if isinstance(row[key], float) and (math.isnan(row[key]) or math.isinf(row[key])):