def charter_mapping():
    """Describe how the fields of a charter item are converted.

    Every field keeps its own name; only the value goes through a converter.

    Returns:
        Dict[str, Tuple[str, Optional[Callable]]]: field name mapped to a
            `(target name, converter)` pair; the converter is `None` for fields
            that have no conversion attached.

    """

    def to_imo(raw):
        return may_apply(raw, float, int, str)

    def to_integer(raw):
        return may_apply(raw, float, int)

    def to_status(raw):
        if not raw:
            return None
        return STATUS_MAPPING.get(raw.lower(), raw)

    def to_lay_can(raw):
        return to_isoformat(raw, dayfirst=False, yearfirst=True)

    def to_zones(raw):
        # several zones may be listed in one value, separated by dashes
        if not raw:
            return None
        return may_strip(raw).split('-')

    def to_unit(raw):
        if not raw:
            return None
        return UNIT_MAPPING.get(raw.lower(), raw)

    def to_reported_date(raw):
        return parse_date(raw).strftime('%d %b %Y')

    # order matters: it is the key order of the resulting dict
    converters = (
        ('vessel_name', may_strip),
        ('vessel_imo', to_imo),
        ('vessel_length', to_integer),
        ('vessel_dwt', to_integer),
        ('charterer', may_strip),
        ('status', to_status),
        ('lay_can_start', to_lay_can),
        ('lay_can_end', to_lay_can),
        ('rate_value', may_strip),
        ('rate_raw_value', may_strip),
        ('departure_zone', may_strip),
        ('arrival_zone', to_zones),
        ('cargo_product', may_strip),
        ('cargo_movement', None),
        ('cargo_volume', None),
        ('cargo_unit', to_unit),
        ('provider_name', None),
        ('reported_date', to_reported_date),
    )
    return {field: (field, converter) for field, converter in converters}
def grades_mapping():
    """Describe how the fields of a grades (cargo movement) item are converted.

    Every field keeps its own name; only the value goes through a converter.

    Returns:
        Dict[str, Tuple[str, Optional[Callable]]]: field name mapped to a
            `(target name, converter)` pair; the converter is `None` for fields
            that have no conversion attached.

    """

    def to_timestamp(raw):
        return to_isoformat(raw, dayfirst=False, yearfirst=True)

    def to_imo(raw):
        return may_apply(raw, float, int, str)

    def to_integer(raw):
        return may_apply(raw, float, int)

    def to_unit(raw):
        if not raw:
            return None
        return UNIT_MAPPING.get(raw.lower(), raw)

    # order matters: it is the key order of the resulting dict
    converters = (
        ('port_name', may_strip),
        ('berthed', to_timestamp),
        ('eta', to_timestamp),
        ('departure', to_timestamp),
        ('arrival', to_timestamp),
        ('vessel_name', may_strip),
        ('vessel_imo', to_imo),
        ('vessel_length', to_integer),
        ('vessel_dwt', to_integer),
        ('cargo_product', may_strip),
        ('cargo_movement', may_strip),
        ('cargo_volume', may_strip),
        ('cargo_unit', to_unit),
        ('provider_name', None),
        ('reported_date', to_timestamp),
        ('cargo_seller', may_strip),
        ('cargo_buyer', may_strip),
    )
    return {field: (field, converter) for field, converter in converters}
def build_cargo(item):
    """Build a normalized cargo out of a raw item.

    The cargo related keys are popped from `item`, so the item is modified in
    place. The resulting volume is the handled volume plus the leftover volume
    (each defaulting to 0 when the key is absent), rendered as a string.

    Args:
        item (Dict[str, Any]): must contain `cargo_product`

    Returns:
        Dict[str, Any]: cargo with `product`, `volume` and `volume_unit` keys

    """
    product = item.pop('cargo_product')

    handled, leftover = [
        may_apply(item.pop(key, 0), int)
        for key in ('cargo_volume_handled', 'cargo_volume_leftover')
    ]

    cargo = {'product': product}
    cargo['volume'] = str(handled + leftover)
    cargo['volume_unit'] = Unit.tons
    return cargo
def _normalize_numeric(raw_data):
    """Turn the leading number of a raw string into an integer.

    Only the first whitespace-separated token is considered, and thousands
    separators (commas) are dropped before the conversion.

    Examples:
        >>> _normalize_numeric('32,431.00 総ト')
        32431
        >>> _normalize_numeric('189.99 LOA')
        189

    """
    leading_token = raw_data.split()[0]
    without_separators = leading_token.replace(',', '')
    return may_apply(without_separators, float, int)
def normalize_numeric(raw_data):
    """Convert numeric data such as vessel length or dwt to an integer.

    The literal string '0' is treated as "no value".

    Args:
        raw_data (str):

    Returns:
        Optional[int]:

    """
    if raw_data == '0':
        return None

    return may_apply(raw_data, float, int)
def key_map():
    """Map raw charter headers to normalized keys and their converters.

    Returns:
        Dict[str, Tuple[str, Optional[Callable]]]: raw header mapped to a
            `(target name, converter)` pair; the converter is `None` when no
            conversion is attached.

    """

    def to_dwt(raw):
        return parser.may_apply(raw, int)

    # both the web and the api responses are supported, hence two vessel headers
    vessel_name = ('vessel_name', None)

    return {
        'Voyage From': ('departure_zone', parser.may_strip),
        'Voyage To': ('arrival_zone', parser.may_strip),
        'Tonnes': ('dwt', to_dwt),
        # rate amount and rate unit are left uncast: they are only raw strings
        # used later on to parse the real rate value
        'Rate Amount': ('rate', None),
        'Rate Unit': ('unit', None),
        'Lay Date': ('lay_can_start', parse_lay_day),
        'Charterer Name': ('charterer', parse_charterer),
        'Vessel Name': vessel_name,
        'Vessel': vessel_name,
    }
def charters_mapping():
    """Map raw spot charter headers to normalized keys and their converters.

    Headers that are deliberately dropped are mapped to `ignore_key(...)` with
    the reason they are not used.

    Returns:
        Dict[str, Any]: raw header mapped either to a `(target name, converter)`
            pair or to the result of `ignore_key`.

    """
    laycan = normalize_laycan_date

    def to_imo(raw):
        if not raw:
            return None
        return may_apply(raw, float, int, str)

    def is_load(raw):
        return raw.lower() == 'load'

    def to_arrival_zone(raw):
        if not raw:
            return None
        return [ZONE_MAPPING.get(raw) or raw.upper()]

    def to_departure_zone(raw):
        return ZONE_MAPPING.get(raw, raw)

    def last_party(raw):
        # keep what comes after the last slash
        if not raw:
            return None
        return raw.split('/')[-1]

    def split_vessel(raw):
        # keep both sides of the first slash, without the separator itself
        before, _, after = raw.partition('/')
        return [may_strip(before), may_strip(after)]

    return {
        'Arrived': ('lay_can_start', laycan),
        'AGENT': ('shipping_agent', None),
        'Berthed': ('lay_can_start_alt', laycan),
        'BL DD': ('lay_can_start_alt', laycan),
        'CHARTERER': ('charterer', normalize_charterer),
        'COUNTRY OF DEST': ignore_key('not specific enough; we already have "NEXT PORT"'),
        'ETA': ('lay_can_start', laycan),
        'ETB': ('lay_can_start_alt', laycan),
        'ETS': ('lay_can_end', laycan),
        'GRADE DETAIL': ('product', None),
        'GRADE GROUP': ignore_key('cargo is being extracted by grades spider'),
        'IMO NR': ('vessel_imo', to_imo),
        'LOAD POSITION': ignore_key('irrelevant'),
        'LOAD/DISCH': ('is_export', is_load),
        'NEXT PORT': ('arrival_zone', to_arrival_zone),
        'PORT': ('current_zone', None),
        'PRE. PORT': ('previous_zone', None),
        'provider_name': ('provider_name', None),
        'QTT IN MT': ('volume', None),
        'region_name': ('departure_zone', to_departure_zone),
        'reported_date': ('reported_date', normalize_reported_date),
        'Sailed': ('lay_can_end', laycan),
        'SHIPPERS/RECEIVERS': ('buyer_seller', last_party),
        'STATUS': ignore_key('irrelevant'),
        'TERMINAL': ignore_key('not required for spot charters yet'),
        'VESSEL': ('vessel_name_and_charter_status', split_vessel),
    }
def portcall_mapping():
    """Map raw port call headers to normalized keys and their converters.

    Headers that are deliberately dropped are mapped to `ignore_key(...)` with
    the reason they are not used.

    Returns:
        Dict[str, Any]: raw header mapped either to a `(target name, converter)`
            pair or to the result of `ignore_key`.

    """

    def to_imo(raw):
        return may_apply(raw, int)

    return {
        'Agent': ('shipping_agent', None),
        'Date, time': ('matching_date', normalize_matching_date),
        'event_type': ('event_type', None),
        'Flag': ignore_key('vessel flag; ignored because they are not in ISO3166 format'),
        'IMO No.': ('vessel_imo', to_imo),
        'No. ': ignore_key('table row serial number; irrelevant'),
        'Port Company': ignore_key('port company; stevedore ?'),
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'Quay No.': ('berth', None),
        'reported_date': ('reported_date', None),
        'Ship': ('vessel_name', None),
        'unknown': ignore_key('contains a date; unsure; to clarify with product owner'),
    }
def parse(self, response):
    """Find the PDF report link on the overview page and request the report.

    The reported date is read from the PDF url and stored on the instance as
    `self.reported_date` before the request is emitted.

    Args:
        response (scrapy.Response):

    Yields:
        Request: request for the PDF report, handled by `parse_pdf_report`

    """
    hrefs = response.xpath("//a[@id='link-download-mapa']/@href").extract()
    pdf_url = 'http://www.portodoitaqui.ma.gov.br' + hrefs[0]

    # the reported date sits in the url, between '- ' and ' clientes',
    # as three space-separated tokens: day, month and year
    date_tokens = pdf_url.split('- ')[1].split(' clientes')[0].split(' ')
    day, month, year = date_tokens[0], date_tokens[1], date_tokens[2]
    self.reported_date = may_apply(f'{day}/{month}/{year}', to_isoformat)

    yield Request(url=pdf_url, callback=self.parse_pdf_report)
def normalize_date(raw_date, reported_date):
    """Normalize raw date to an ISO8601 compatible timestamp.

    Raw dates come as `DD/MM` without a year, so the year is taken from the
    reported date and adjusted when the two dates sit on either side of a new
    year (December date reported in January, or January date reported in
    December).

    Args:
        raw_date (str): raw date string, day and month separated by a slash
        reported_date (dt.datetime): used to infer the missing year

    Returns:
        Optional[str]: ISO8601 formatted timestamp, or `None` when the raw date
            is empty or has no month part

    Examples:
        >>> normalize_date('11/4', dt.date(year=2020, month=4, day=16))
        '2020-04-11T00:00:00'
        >>> normalize_date('', dt.date(year=2020, month=4, day=16))

    """
    if not raw_date:
        return None

    split_date = raw_date.split('/')
    # without a slash there is no month to read: nothing can be inferred
    if len(split_date) < 2:
        return None

    day, month, year = split_date[0], split_date[1], reported_date.year

    # handle year rollover scenarios
    if int(month) == 12 and reported_date.month == 1:
        year -= 1
    elif int(month) == 1 and reported_date.month == 12:
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)
def _parse_node(url, node):
    """Parse a single vessel XML structure.

    Args:
        url (str): original request url (not read by this function)
        node (xml.ElementTree): partial of the initial XML response

    Returns:
        Dict[str, Any]: structured information of the vessel, with nested
            `vessel` and `position` dicts (the original documentation names
            this a `kp_scrapers.models.items.VesselPosition`)

    """
    ais_type = extract(node, 'source')

    raw_pos_updated_at = extract(node, 'dt_pos_utc')
    pos_updated_at = None
    if raw_pos_updated_at:
        pos_updated_at = dateutil.parser.parse(raw_pos_updated_at).isoformat()

    raw_static_updated_at = extract(node, 'dt_static_utc')
    static_updated_at = None
    if raw_static_updated_at:
        static_updated_at = dateutil.parser.parse(raw_static_updated_at).isoformat()

    # read the raw value outside of the `try` so that it is always bound when
    # the handler below inspects it
    raw_eta = extract(node, 'eta')
    eta = None
    if raw_eta:
        try:
            eta = parse_eta_fmt(dt.datetime.utcnow().year, raw_eta).isoformat()
        except (TypeError, ValueError, AttributeError) as e:
            if raw_eta not in BAD_ETAS:
                # the values in BAD_ETAS are probably the None equivalent of the
                # ais emitter, so since this case is known, we only log the others.
                # some of them are not supported and others are simply badly formatted
                logger.debug('unable to parse eta: {} ({})'.format(e, raw_eta))

    imo = extract(node, 'imo')

    item = {
        'vessel': {
            'name': may_strip(extract(node, 'vessel_name')),
            # an imo of '0' stands for a missing imo
            'imo': None if imo == '0' else imo,
            'mmsi': extract(node, 'mmsi'),
            'vessel_type': extract(node, 'vessel_type_code'),
            'call_sign': extract(node, 'callsign'),
        },
        'position': {
            'lat': may_apply(extract(node, 'latitude'), float),
            'lon': may_apply(extract(node, 'longitude'), float),
            'speed': may_apply(extract(node, 'sog'), float),
            'course': may_apply(extract(node, 'cog'), float),
            'ais_type': ais_type,
            'received_time': pos_updated_at,
            'heading': safe_heading(extract(node, 'heading')),
            'nav_state': may_apply(extract(node, 'nav_status_code'), int),
            # current draught values proved to be outright wrong or late on the
            # platform, messing up with a lot of our features. It needs more
            # investigation but at this moment we need to stop it, although still
            # receive data from EE to continue assessing its quality
            'draught_raw': may_apply(extract(node, 'draught'), float),
        },
        'reported_date': static_updated_at,
        'provider_name': PROVIDER_ID,
        'ais_type': ais_type,
        'message_type': extract(node, 'message_type'),
        'next_destination_eta': eta,
        'next_destination_ais_type': ais_type,
        'next_destination_destination': may_strip(extract(node, 'destination')),
    }

    return item
def normalize_date(raw_date, reported_date, event):
    """Normalize raw date to an ISO8601 compatible timestamp.

    Only the leading run of digits and slashes of the raw date is used; what
    remains is expected in one of these forms:
        - `DD/`: day only (e.g. '18/PM')
        - `DD/MM/YY`: the year part is ignored, see below (e.g. '12/02/20 PM')
        - `DD/MM`: day and month (e.g. '15/02AM')
        - `DD/hhmm`: day and time (e.g. '26/0902')
        - `DD/` followed by 7 digits: the first two digits are the month

    Raw dates as given by the source do not provide month and/or year, so this info needs
    to be inferred from the reported date, together with the associated vessel movement type.
    The year always comes from the reported date.

    A naive approach using month at scraping time may lead to inaccurate dates,
    especially during the rollover period for each month/year where dates from the previous
    and current month can appear together in the same table.

    To solve this, when the month is missing we compare if date is in the past or future:
        - For 'arrived', 'etb', 'berthed', 'new etb' vessel movements,
          reject if computed date is in the future.
          Then, return computed date adjusted 1 month/year prior.
        - For other vessel movements, reject if computed date is in the past.
          Then, return the computed date adjusted 1 month/year into the future.

    Args:
        raw_date (str): raw date string
        reported_date (dt.datetime): used to infer missing month/year in the dates
        event (str): used to determine if we should increment month/year when inferring

    Returns:
        Optional[str]: ISO8601 formatted timestamp, or `None` when the raw date is
            empty or in none of the supported forms

    Examples:
        >>> normalize_date('25/02 2230', dt.date(year=2020, month=2, day=26), 'etb')
        '2020-02-25T00:00:00'
        >>> normalize_date('26/0902', dt.date(year=2020, month=2, day=26), 'arrived')
        '2020-02-26T00:00:00'
        >>> normalize_date('12/02/20 PM', dt.date(year=2020, month=2, day=18), 'arrived')
        '2020-02-12T00:00:00'
        >>> normalize_date('25/02(TBC)', dt.date(year=2020, month=2, day=18), 'eta')
        '2020-02-25T00:00:00'
        >>> normalize_date('26/AM (SUSPENDED)', dt.date(year=2018, month=11, day=7), 'etb')
        '2018-10-26T00:00:00'
        >>> normalize_date('30/PM', dt.date(year=2019, month=1, day=2), 'arrived')
        '2018-12-30T00:00:00'
        >>> normalize_date('25/123', dt.date(year=2020, month=2, day=18), 'eta')

    """
    if not raw_date:
        return None

    # remove the alphabetic part of the date
    raw_date = re.split(r'([^0-9\/]+)', raw_date)[0]
    split_date = raw_date.split('/')

    # True when the month is not in the raw date and comes from the reported date
    regularize_month = False

    # raw_date in the form '18/PM' > remove alphabetical > '18/' (DD/)
    if len(split_date) > 1 and split_date[1] == '':
        day, month = split_date[0], reported_date.month
        regularize_month = True
    # raw_date in the form '12/02/20PM' > ... > '12/02/20' (DD/MM/YY)
    elif len(split_date) == 3:
        day, month = split_date[0], split_date[1]
    elif len(split_date) == 2:
        tail_length = len(split_date[1])
        # raw_date in the form '15/02AM' > ... > '15/02' (DD/MM)
        if tail_length == 2:
            day, month = split_date[0], split_date[1]
        # raw_date in the form '26/0902' (DD/hhmm)
        elif tail_length == 4:
            day, month = split_date[0], reported_date.month
            regularize_month = True
        # 7 characters after the slash: the first two are the month
        elif tail_length == 7:
            day, month = split_date[0], split_date[1][0:2]
        else:
            # unsupported form: nothing reliable can be read out of it
            return None
    else:
        return None

    # a missing day (e.g. '/PM') cannot be turned into a date
    if not day:
        return None

    year = reported_date.year

    # regularize month for the case where we get month infos from the reported_date
    if regularize_month:
        is_past_event = event in ('arrived', 'etb', 'berthed', 'new etb')
        if is_past_event and reported_date.day < int(day):
            # previous month, wrapping January back to December
            month = (reported_date.month - 2) % 12 + 1
        elif not is_past_event and reported_date.day > int(day):
            # next month, wrapping December over to January
            month = reported_date.month % 12 + 1

    # handle year rollover scenarios
    if int(month) == 12 and reported_date.month == 1:
        year -= 1
    elif int(month) == 1 and reported_date.month == 12:
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)
def normalize_date(raw_date, reported_date, event):
    """Normalize raw date to an ISO8601 compatible timestamp.

    Raw dates can come in two formats:
        - 1st format: DD/MM, where DD is day of month and MM is the month
        - 2nd format: DDhhmm, where DD is day of month and hhmm is the time in 24hr format

    Some examples:
        - 1st format: '01/07'
        - 2nd format: '010830'

    Raw dates as given by the source do not provide month and/or year, so this info needs
    to be inferred from the reported date, together with the associated vessel movement type.

    A naive approach using month at scraping time may lead to inaccurate dates,
    especially during the rollover period for each month/year where dates from the previous
    and current month can appear together in the same table.

    To solve this for the 2nd format, we compare the day with the reported day:
        - For "eta" and "etb" vessel movements, reject if computed date is in the past.
          Then, return the computed date adjusted 1 month/year into the future.
        - For other vessel movements, reject if computed date is in the future.
          Then, return computed date adjusted 1 month/year prior.

    TODO use time information in raw date string

    Examples:
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=29), 'eta')
        '2018-07-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=29), 'berthed')
        '2018-06-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=1), 'etb')
        '2018-06-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=1), 'berthed')
        '2018-05-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=12, day=30), 'eta')
        '2019-01-03T00:00:00'
        >>> normalize_date('302200', dt.date(year=2019, month=1, day=2), 'berthed')
        '2018-12-30T00:00:00'
        >>> normalize_date('02/07', dt.date(year=2018, month=7, day=5), 'eta')
        '2018-07-02T00:00:00'
        >>> normalize_date('02/01', dt.date(year=2018, month=12, day=30), 'eta')
        '2019-01-02T00:00:00'
        >>> normalize_date('30/12', dt.date(year=2019, month=1, day=2), 'berthed')
        '2018-12-30T00:00:00'
        >>> normalize_date('', dt.date(year=2019, month=1, day=2), 'berthed')

    Args:
        raw_date (str): raw date string
        reported_date (dt.datetime): used to infer missing month/year in the dates
        event (str): used to determine if we should increment month/year when inferring

    Returns:
        Optional[str]: ISO8601 formatted timestamp, or `None` when the raw date is empty

    """
    if not raw_date:
        return None

    year = reported_date.year

    # normalize 1st format
    if '/' in raw_date:
        # split on the separator rather than on fixed positions, so that a
        # single-digit day ('2/07') is read correctly too
        day, _, month = raw_date.partition('/')

    # normalize 2nd format
    else:
        day, month = raw_date[:2], reported_date.month

        if event in ('eta', 'etb'):
            # forward-looking movement whose day is already behind us: next month
            if reported_date.day > int(day):
                month = (reported_date + relativedelta(months=1)).month
        elif reported_date.day < int(day):
            # past movement whose day is still ahead of us: previous month
            month = (reported_date - relativedelta(months=1)).month

    # handle year rollover scenarios
    if int(month) == 12 and reported_date.month == 1:
        year -= 1
    elif int(month) == 1 and reported_date.month == 12:
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)