def as_timestamp(d):
    """Attempt to convert the passed parameter to a unix timestamp.

    Note that for strings this uses python-dateutil, which is flexible,
    but SLOW.  So do not use this method if you are going to be parsing
    a billion dates.  Use the high performance methods in timestamp.py

    return: float seconds since epoch (unix timestamp), or None if d is None
    raises: ValueError for unsupported input types
    """
    if d is None:
        return None
    elif isinstance(d, datetime):
        # Naive datetimes are interpreted as UTC; aware ones are used as-is.
        return timestampFromDatetime(
            pytz.UTC.localize(d) if d.tzinfo is None else d)
    elif isinstance(d, date):
        # NB: this branch must come after the datetime check since datetime
        # is a subclass of date.  A bare date maps to midnight UTC.
        return timestampFromDatetime(
            pytz.UTC.localize(datetime.combine(d, datetime.min.time())))
    elif isinstance(d, (float, int, Timestamp)):
        return float(d)
    elif isinstance(d, basestring):
        # Flexible but slow parse; recurse to apply the datetime handling.
        return as_timestamp(dateutil_parse(d))
    else:
        raise ValueError(
            'Unsupported data type. Unable to convert value "%s" to a timestamp' % d)
def _date_to_sql_timestamp(date):
    """Render *date* as a legacy-SQL timestamp expression.

    Strings are handed straight to SQL's TIMESTAMP(); datetimes are
    converted to unix seconds; anything else is assumed to already be
    a unix timestamp.
    """
    if isinstance(date, basestring):
        # Date literal: let SQL parse it.
        return 'TIMESTAMP({})'.format(date)

    if isinstance(date, datetime):
        seconds = timestampFromDatetime(date)
    else:
        # assume that date is already a timestamp
        seconds = date
    return 'SEC_TO_TIMESTAMP({})'.format(int(seconds))
def _date_to_sql_timestamp(date, use_legacy_sql=True):
    """Render *date* as a SQL timestamp expression.

    Strings are handed straight to SQL's TIMESTAMP(); datetimes are
    converted to unix seconds; anything else is assumed to already be a
    unix timestamp.  The seconds-to-timestamp function name depends on
    whether legacy or standard SQL is targeted.
    """
    if isinstance(date, six.string_types):
        # Date literal: let SQL parse it.
        return 'TIMESTAMP({})'.format(date)

    if isinstance(date, datetime):
        seconds = timestampFromDatetime(date)
    else:
        # assume that date is already a timestamp
        seconds = date

    if use_legacy_sql:
        ts_fn = 'SEC_TO_TIMESTAMP'
    else:
        ts_fn = 'TIMESTAMP_SECONDS'
    return '{}({})'.format(ts_fn, int(seconds))
def segment_source(self):
    """Build a source of prior segment state for the pipeline.

    Reads segment state from the day *before* the start of the date
    range.  Returns an empty ``beam.Create`` when there is no start
    date, or when the segment data for that day does not exist (404);
    any other HttpError is re-raised.
    """
    if self.date_range[0] is None:
        # No start date: nothing to seed the segmenter with.
        return beam.Create([])

    # Segment state comes from the day before the range starts.
    dt = datetimeFromTimestamp(self.date_range[0])
    ts = timestampFromDatetime(dt - timedelta(days=1))

    try:
        source = GCPSource(gcp_path=self.options.segments,
                           first_date_ts=ts,
                           last_date_ts=ts)
    except HttpError as exn:
        # logging.warn is a deprecated alias for logging.warning; use
        # lazy %-args so the message is only formatted if emitted.
        logging.warning("Segment source not found: %s %s",
                        self.options.segments, dt)
        if exn.status_code == 404:
            # Missing data just means there is no prior state; start fresh.
            return beam.Create([])
        else:
            raise
    return source
from datetime import datetime

import pytz

import apache_beam as beam
from apache_beam import typehints

from pipe_tools.timestamp import timestampFromDatetime
from pipe_tools.coders import JSONDict


# Default timestamp of the first generated message: 2017-01-01T00:00:00Z.
DEFAULT_START_TS = timestampFromDatetime(
    datetime(2017, 1, 1, 0, 0, 0, tzinfo=pytz.UTC))
HOUR_IN_SECONDS = 60 * 60


class MessageGenerator():
    """Deterministic test-message generator.

    Yields ``count`` JSONDict messages for a single mmsi, starting at
    ``start_ts`` and spaced ``increment`` seconds apart.
    """

    def __init__(self, start_ts=DEFAULT_START_TS, increment=HOUR_IN_SECONDS,
                 count=72):
        self.start_ts = start_ts    # timestamp of the first message
        self.increment = increment  # seconds between consecutive messages
        self.count = count          # total number of messages to yield

    def __iter__(self):
        return self.messages()

    def messages(self):
        """Yield JSONDict messages with increasing timestamps."""
        ts = self.start_ts
        # range, not xrange: xrange is Python-2-only and raises NameError
        # on Python 3; range iterates equivalently in a for loop on both.
        for idx in range(self.count):
            yield JSONDict(mmsi=1, timestamp=ts, idx=idx)
            ts += self.increment
class TestTransforms():
    """End-to-end tests for the Segment and Normalize beam transforms."""

    # Fixed reference timestamp used throughout: 2017-01-01T00:00:00Z.
    ts = timestampFromDatetime(datetime(2017, 1, 1, 0, 0, 0, tzinfo=pytz.UTC))

    @staticmethod
    def _seg_id(ssvid, ts):
        # Build the expected segment id: "<ssvid>-<datetime2str(ts)>".
        ts = datetimeFromTimestamp(ts)
        return '{}-{}'.format(ssvid, datetime2str(ts))

    @staticmethod
    def groupby_fn(msg):
        # Key each record by its ssvid so it can feed a GroupByKey.
        return (msg['ssvid'], msg)

    def _run_segment(self, messages_in, segments_in, temp_dir):
        """Run the Segment transform over messages_in with prior state segments_in.

        Writes both tagged outputs to sharded text files under temp_dir,
        reads them back as newline-delimited JSON, and returns the pair
        (messages, segments) as plain dicts.  Messages are sorted by
        (ssvid, timestamp); also asserts every input message came back out.
        """
        messages_file = pp.join(temp_dir, '_run_segment', 'messages')
        segments_file = pp.join(temp_dir, '_run_segment', 'segments')
        with _TestPipeline() as p:
            messages = (
                p
                | 'CreateMessages' >> beam.Create(messages_in)
                | 'AddKeyMessages' >> beam.Map(self.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey()
            )
            segments = (
                p
                | 'CreateSegments' >> beam.Create(segments_in)
                | 'AddKeySegments' >> beam.Map(self.groupby_fn)
                | "SegmentsGroupByKey" >> beam.GroupByKey()
            )
            segmented = (
                messages
                | "Segment" >> Segment(segments)
            )
            # Segment produces multiple tagged outputs; pull out the two
            # streams we care about.
            messages = segmented['messages']
            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            messages | "WriteMessages" >> beam.io.WriteToText(
                messages_file, coder=JSONDictCoder())
            segments | "WriteSegments" >> beam.io.WriteToText(
                segments_file, coder=JSONDictCoder())
            p.run()

        # Read the sharded output files back as newline-delimited JSON.
        with open_shards('%s*' % messages_file) as output:
            messages = sorted(list(nlj.load(output)),
                              key=lambda m: (m['ssvid'], m['timestamp']))
        with open_shards('%s*' % segments_file) as output:
            segments = list(nlj.load(output))

        # Every input message must appear in the output (possibly annotated).
        assert list_contains(messages, messages_in)
        return messages, segments

    def test_segment_empty(self, temp_dir):
        # No messages, no prior segments: must run cleanly.
        self._run_segment([], [], temp_dir=temp_dir)

    def test_segment_single(self, temp_dir):
        messages_in = [{'ssvid': 1, 'timestamp': self.ts}]
        segments_in = []
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)

    def test_segment_segments_in(self, temp_dir):
        # A message that continues a segment supplied via segments_in should
        # be assigned that existing seg_id.
        prev_ts = self.ts - 1
        messages_in = [{'ssvid': "1", 'timestamp': self.ts}]
        # NOTE(review): _seg_id is called with int 1 while ssvid is the
        # string "1" — presumably equivalent after formatting; confirm.
        segments_in = [{'ssvid': "1", 'timestamp': prev_ts,
                        'seg_id': self._seg_id(1, prev_ts),
                        'origin_ts': prev_ts, 'timestamp_last': self.ts,
                        'noise': False, 'last_pos_lat': 0,
                        'last_pos_lon': 0, 'message_count': 1}]
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        assert messages_out[0]['seg_id'] == segments_in[0]['seg_id']

    def test_segment_out_in(self, temp_dir):
        # Run once, feed the resulting segments back in with newer messages:
        # seg_ids must be retained and message counts accumulate.
        prev_ts = self.ts - 1
        messages_in = [{'ssvid': "1", 'timestamp': self.ts-1},
                       {'ssvid': "2", 'timestamp': self.ts-1}]
        segments_in = []
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        messages_in = [{'ssvid': "1", 'timestamp': self.ts},
                       {'ssvid': "2", 'timestamp': self.ts}]
        segments_in = segments_out
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        assert len(segments_out) == 2
        assert all(seg['message_count'] == 2 for seg in segments_out)
        assert all(seg['seg_id'] == self._seg_id(seg['ssvid'], prev_ts)
                   for seg in segments_out)

    @pytest.mark.parametrize("message, expected", [
        ({}, {}),
        ({'shipname': 'f/v boaty Mc Boatface'}, {'n_shipname': 'BOATYMCBOATFACE'}),
        ({'shipname': 'Bouy 42%'}, {'n_shipname': 'BOUY'}),
        ({'callsign': '@@123'}, {'n_callsign': '123'}),
        ({'imo': 8814275}, {'n_imo': 8814275}),
    ])
    def test_normalize(self, message, expected):
        # NormalizeDoFn adds normalized n_* fields derived from the raw ones.
        normalize = NormalizeDoFn()
        assert list_contains(list(normalize.process(message)), [expected])

    def test_normalize_invalid_imo(self):
        # An IMO of 0 is invalid, so no n_imo field should be emitted.
        normalize = NormalizeDoFn()
        assert all ('n_imo' not in m
                    for m in list(normalize.process({'imo': 0000000})))

    def test_noise_segment(self, temp_dir):
        # NOTE(review): the second message appears to be flagged as noise
        # (large positional jump in ~1 minute) and a later message rejoins
        # the first segment — confirm against Segmenter's noise rules.
        messages_in = [
            {"timestamp": as_timestamp("2017-07-20T05:59:35.000000Z"),
             "ssvid": "338013000", "lon": -161.3321333333,
             "lat": -9.52616, "speed": 11.1},
            {"timestamp": as_timestamp("2017-07-20T06:00:38.000000Z"),
             "ssvid": "338013000", "lon": -161.6153106689,
             "lat": -9.6753702164, "speed": 11.3999996185},
            {"timestamp": as_timestamp("2017-07-20T06:01:00.000000Z"),
             "ssvid": "338013000"}
        ]
        segments_in = []
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = {(seg['seg_id'], seg['message_count'], seg['noise'])
                     for seg in segments_out}
        assert seg_stats == {('338013000-2017-07-20T05:59:35.000000Z', 2, False),
                             ('338013000-2017-07-20T06:00:38.000000Z', 1, True)}

        # Feed the segments back with one more (position-less) message: it
        # should join the non-noise segment; the noise segment is dropped.
        messages_in = [{"timestamp": as_timestamp("2017-07-20T06:02:00.000000Z"),
                        "ssvid": "338013000"}
                       ]
        segments_in = segments_out
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = {(seg['seg_id'], seg['message_count'], seg['noise'])
                     for seg in segments_out}
        assert seg_stats == {('338013000-2017-07-20T05:59:35.000000Z', 3, False)}

    def test_expected_segments(self, temp_dir):
        # Two messages 11 days apart should land in two separate segments.
        messages_in = [
            {"timestamp": as_timestamp("2017-11-15T11:14:32.000000Z"),
             "ssvid": 257666800, "lon": 5.3108466667,
             "lat": 60.40065, "speed": 6.5},
            {"timestamp": as_timestamp("2017-11-26T11:20:16.000000Z"),
             "ssvid": 257666800, "lon": 5.32334,
             "lat": 60.396235, "speed": 3.2000000477},
        ]
        segments_in = []
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = [(seg['seg_id'], seg['message_count'], seg['noise'])
                     for seg in segments_out]
        expected = [('257666800-2017-11-15T11:14:32.000000Z', 1, False),
                    ('257666800-2017-11-26T11:20:16.000000Z', 1, False)]
        assert seg_stats == expected

    def test_message_type(self, temp_dir):
        # NOTE(review): positional types (AIS.1 / AIS.18) appear to be
        # segmented separately, and the AIS.5 static message attaches its
        # shipname to one segment — confirm against Segmenter behavior.
        messages_in = [
            {"timestamp": as_timestamp("2018-01-01 00:00"), "ssvid": "123456789",
             "type": "AIS.1", "lon": 0.0, "lat": 0.0},
            {"timestamp": as_timestamp("2018-01-01 01:00"), "ssvid": "123456789",
             "type": "AIS.18", "lon": 0.0, "lat": 2.0},
            {"timestamp": as_timestamp("2018-01-01 02:00"), "ssvid": "123456789",
             "type": "AIS.1", "lon": 0.0, "lat": 0.5},
            {"timestamp": as_timestamp("2018-01-01 03:00"), "ssvid": "123456789",
             "type": "AIS.18", "lon": 0.0, "lat": 1.5},
            {"timestamp": as_timestamp("2018-01-01 04:00"), "ssvid": "123456789",
             "type": "AIS.5", "shipname": "Boaty"},
        ]
        segments_in = []
        messages_out, segments_out = self._run_segment(
            messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = [(seg['seg_id'], seg['message_count'],
                      seg['shipname_most_common'])
                     for seg in segments_out]
        expected = [('123456789-2018-01-01T00:00:00.000000Z', 3, 'Boaty'),
                    ('123456789-2018-01-01T01:00:00.000000Z', 2, None)]
        assert seg_stats == expected