Python may_applyの例、kp_scrapers.lib.parser.may_apply Pythonの例

コード例 #1

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def charter_mapping():
    """Build the field mapping used to normalize raw charter rows.

    Returns:
        dict: raw field name -> ``(normalized field name, converter)``.
            The converter is either a callable taking the raw value, or ``None``
            (presumably meaning "keep the value untouched" -- confirm against the
            code that consumes this mapping).

    """

    def as_imo(raw):
        # go through float first so values such as '9123456.0' end up as '9123456'
        return may_apply(raw, float, int, str)

    def as_integer(raw):
        return may_apply(raw, float, int)

    def as_status(raw):
        if not raw:
            return None
        return STATUS_MAPPING.get(raw.lower(), raw)

    def as_iso_date(raw):
        return to_isoformat(raw, dayfirst=False, yearfirst=True)

    def as_zone_list(raw):
        if not raw:
            return None
        return may_strip(raw).split('-')

    def as_unit(raw):
        if not raw:
            return None
        return UNIT_MAPPING.get(raw.lower(), raw)

    def as_reported_date(raw):
        return parse_date(raw).strftime('%d %b %Y')

    return {
        # vessel
        'vessel_name': ('vessel_name', may_strip),
        'vessel_imo': ('vessel_imo', as_imo),
        'vessel_length': ('vessel_length', as_integer),
        'vessel_dwt': ('vessel_dwt', as_integer),
        # charter
        'charterer': ('charterer', may_strip),
        'status': ('status', as_status),
        'lay_can_start': ('lay_can_start', as_iso_date),
        'lay_can_end': ('lay_can_end', as_iso_date),
        'rate_value': ('rate_value', may_strip),
        'rate_raw_value': ('rate_raw_value', may_strip),
        # route
        'departure_zone': ('departure_zone', may_strip),
        'arrival_zone': ('arrival_zone', as_zone_list),
        # cargo
        'cargo_product': ('cargo_product', may_strip),
        'cargo_movement': ('cargo_movement', None),
        'cargo_volume': ('cargo_volume', None),
        'cargo_unit': ('cargo_unit', as_unit),
        # meta
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', as_reported_date),
    }

コード例 #2

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def grades_mapping():
    """Build the field mapping used to normalize raw grades (cargo movement) rows.

    Returns:
        dict: raw field name -> ``(normalized field name, converter)``.
            The converter is either a callable taking the raw value, or ``None``
            (presumably meaning "keep the value untouched" -- confirm against the
            code that consumes this mapping).

    """

    def as_iso_date(raw):
        return to_isoformat(raw, dayfirst=False, yearfirst=True)

    def as_imo(raw):
        # go through float first so values such as '9123456.0' end up as '9123456'
        return may_apply(raw, float, int, str)

    def as_integer(raw):
        return may_apply(raw, float, int)

    def as_unit(raw):
        if not raw:
            return None
        return UNIT_MAPPING.get(raw.lower(), raw)

    return {
        # port call
        'port_name': ('port_name', may_strip),
        'berthed': ('berthed', as_iso_date),
        'eta': ('eta', as_iso_date),
        'departure': ('departure', as_iso_date),
        'arrival': ('arrival', as_iso_date),
        # vessel
        'vessel_name': ('vessel_name', may_strip),
        'vessel_imo': ('vessel_imo', as_imo),
        'vessel_length': ('vessel_length', as_integer),
        'vessel_dwt': ('vessel_dwt', as_integer),
        # cargo
        'cargo_product': ('cargo_product', may_strip),
        'cargo_movement': ('cargo_movement', may_strip),
        'cargo_volume': ('cargo_volume', may_strip),
        'cargo_unit': ('cargo_unit', as_unit),
        # meta
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', as_iso_date),
        # players
        'cargo_seller': ('cargo_seller', may_strip),
        'cargo_buyer': ('cargo_buyer', may_strip),
    }

コード例 #3

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def build_cargo(item):
    """Build a normalized cargo dict out of a raw item.

    The cargo related keys ('cargo_product', 'cargo_volume_handled' and
    'cargo_volume_leftover') are popped, i.e. removed, from `item`.

    Args:
        item (dict): raw item; must contain 'cargo_product', the two volume keys
            are optional and default to 0

    Returns:
        dict: cargo with 'product', 'volume' (handled + leftover, as a string)
            and 'volume_unit' (always tons)

    """
    cargo_product = item.pop('cargo_product')
    # NOTE(review): the sum below assumes `may_apply` hands back an int for both
    # volumes, including for the default of 0 -- confirm against `may_apply`
    handled, leftover = (
        may_apply(item.pop(volume_key, 0), int)
        for volume_key in ('cargo_volume_handled', 'cargo_volume_leftover')
    )
    total_volume = handled + leftover

    return {
        'product': cargo_product,
        'volume': str(total_volume),
        'volume_unit': Unit.tons,
    }

コード例 #4

0

ファイルを表示

def _normalize_numeric(raw_data):
    """Normalize a numeric field that may carry a trailing unit or label.

    Only the first whitespace-separated token is considered; thousands
    separators (',') are removed and the number is truncated to an integer.

    Args:
        raw_data (Optional[str]): raw value, e.g. '32,431.00 GT'

    Returns:
        Optional[int]: integer value, or None when there is nothing to parse
            (None, empty or whitespace-only input)

    Examples:
        >>> _normalize_numeric('32,431.00 GT')
        32431
        >>> _normalize_numeric('189.99 LOA')
        189
        >>> _normalize_numeric('   ') is None
        True

    """
    # `str.split()` yields no token at all for empty/blank strings, so indexing
    # the first token blindly would raise IndexError
    tokens = raw_data.split() if raw_data else []
    if not tokens:
        return None

    return may_apply(tokens[0].replace(',', ''), float, int)

コード例 #5

0

ファイルを表示

def normalize_numeric(raw_data):
    """Convert raw numeric data (e.g. vessel length, dwt) to an integer.

    The literal string '0' is treated as "no value".

    Args:
        raw_data (str): raw numeric string

    Returns:
        Optional[int]: None for '0', otherwise whatever the float -> int
            conversion chain applied through `may_apply` returns

    """
    if raw_data == '0':
        return None

    return may_apply(raw_data, float, int)

コード例 #6

0

ファイルを表示

ファイル: common.py プロジェクト: theHausdorffMetric/test

def key_map():
    """Build the mapping from raw provider keys to normalized keys.

    Returns:
        dict: raw key -> ``(normalized key, converter)``; the converter is a
            callable taking the raw value, or ``None`` when no cast is wanted.

    """

    def to_dwt(raw):
        return parser.may_apply(raw, int)

    strip = parser.may_strip
    mapping = {}

    # route
    mapping['Voyage From'] = ('departure_zone', strip)
    mapping['Voyage To'] = ('arrival_zone', strip)
    mapping['Tonnes'] = ('dwt', to_dwt)

    # rate amount/unit are deliberately left uncast: they are only raw strings
    # from which the real rate value gets parsed
    mapping['Rate Amount'] = ('rate', None)
    mapping['Rate Unit'] = ('unit', None)

    mapping['Lay Date'] = ('lay_can_start', parse_lay_day)
    mapping['Charterer Name'] = ('charterer', parse_charterer)

    # the vessel name key differs between the web and the api response
    mapping['Vessel Name'] = ('vessel_name', None)
    mapping['Vessel'] = ('vessel_name', None)

    return mapping

コード例 #7

0

ファイルを表示

ファイル: normalize_charters.py プロジェクト: theHausdorffMetric/test

def charters_mapping():
    """Build the mapping from raw report columns to normalized charter fields.

    Returns:
        dict: raw column name -> either ``(normalized key, converter)``, where the
            converter is a callable or ``None``, or the result of `ignore_key` for
            columns that are deliberately discarded.

    """

    def to_imo(raw):
        if not raw:
            return None
        return may_apply(raw, float, int, str)

    def is_load(raw):
        return raw.lower() == 'load'

    def to_arrival_zones(raw):
        if not raw:
            return None
        return [ZONE_MAPPING.get(raw) or raw.upper()]

    def to_departure_zone(raw):
        return ZONE_MAPPING.get(raw, raw)

    def last_player(raw):
        # only the part after the last '/' of 'SHIPPERS/RECEIVERS' is kept
        if not raw:
            return None
        return raw.split('/')[-1]

    def split_vessel(raw):
        # '<left>/<right>': keep both sides, drop the separator itself
        left, _separator, right = raw.partition('/')
        return [may_strip(left), may_strip(right)]

    return {
        'Arrived': ('lay_can_start', normalize_laycan_date),
        'AGENT': ('shipping_agent', None),
        'Berthed': ('lay_can_start_alt', normalize_laycan_date),
        'BL DD': ('lay_can_start_alt', normalize_laycan_date),
        'CHARTERER': ('charterer', normalize_charterer),
        'COUNTRY OF DEST': ignore_key('not specific enough; we already have "NEXT PORT"'),
        'ETA': ('lay_can_start', normalize_laycan_date),
        'ETB': ('lay_can_start_alt', normalize_laycan_date),
        'ETS': ('lay_can_end', normalize_laycan_date),
        'GRADE DETAIL': ('product', None),
        'GRADE GROUP': ignore_key('cargo is being extracted by grades spider'),
        'IMO NR': ('vessel_imo', to_imo),
        'LOAD POSITION': ignore_key('irrelevant'),
        'LOAD/DISCH': ('is_export', is_load),
        'NEXT PORT': ('arrival_zone', to_arrival_zones),
        'PORT': ('current_zone', None),
        'PRE. PORT': ('previous_zone', None),
        'provider_name': ('provider_name', None),
        'QTT IN MT': ('volume', None),
        'region_name': ('departure_zone', to_departure_zone),
        'reported_date': ('reported_date', normalize_reported_date),
        'Sailed': ('lay_can_end', normalize_laycan_date),
        'SHIPPERS/RECEIVERS': ('buyer_seller', last_player),
        'STATUS': ignore_key('irrelevant'),
        'TERMINAL': ignore_key('not required for spot charters yet'),
        'VESSEL': ('vessel_name_and_charter_status', split_vessel),
    }

コード例 #8

0

ファイルを表示

def portcall_mapping():
    """Build the mapping from raw table columns to normalized port call fields.

    Returns:
        dict: raw column name -> either ``(normalized key, converter)``, where the
            converter is a callable or ``None``, or the result of `ignore_key` for
            columns that are deliberately discarded.

    """

    def to_imo(raw):
        return may_apply(raw, int)

    mapping = {}
    mapping['Agent'] = ('shipping_agent', None)
    mapping['Date, time'] = ('matching_date', normalize_matching_date)
    mapping['event_type'] = ('event_type', None)
    mapping['Flag'] = ignore_key(
        'vessel flag; ignored because they are not in ISO3166 format'
    )
    mapping['IMO No.'] = ('vessel_imo', to_imo)
    # the trailing space in the key below is part of the raw column name
    mapping['No. '] = ignore_key('table row serial number; irrelevant')
    mapping['Port Company'] = ignore_key('port company; stevedore ?')
    mapping['port_name'] = ('port_name', None)
    mapping['provider_name'] = ('provider_name', None)
    mapping['Quay No.'] = ('berth', None)
    mapping['reported_date'] = ('reported_date', None)
    mapping['Ship'] = ('vessel_name', None)
    mapping['unknown'] = ignore_key(
        'contains a date; unsure; to clarify with product owner'
    )
    return mapping

コード例 #9

0

ファイルを表示

    def parse(self, response):
        """Find the PDF report link on the overview page and request the report.

        The reported date is read from the PDF url itself and stored on the spider
        as `self.reported_date` before the request is emitted.

        Args:
            response (scrapy.Response):

        Yields:
            Request: request for the PDF report, handled by `parse_pdf_report`

        """
        hrefs = response.xpath("//a[@id='link-download-mapa']/@href").extract()
        pdf_url = 'http://www.portodoitaqui.ma.gov.br' + hrefs[0]

        # the date sits in the url between '- ' and ' clientes', as three
        # space-separated tokens (day, month, year)
        date_tokens = pdf_url.split('- ')[1].split(' clientes')[0].split(' ')
        day = date_tokens[0]
        month = date_tokens[1]
        year = date_tokens[2]
        self.reported_date = may_apply(f'{day}/{month}/{year}', to_isoformat)

        yield Request(url=pdf_url, callback=self.parse_pdf_report)

コード例 #10

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def normalize_date(raw_date, reported_date):
    """Turn a raw 'DD/MM' date into an ISO8601 compatible timestamp.

    The raw date carries no year, so it is taken from the reported date and
    shifted by one when the two dates sit on opposite sides of a new year.

    Args:
        raw_date (str): raw date string, day and month separated by '/'
        reported_date (dt.datetime): used to infer the missing year

    Returns:
        str: ISO8601 formatted timestamp

    Examples:
        >>> normalize_date('11/4', dt.date(year=2020, month=4, day=16))
        '2020-04-11T00:00:00'
    """
    parts = raw_date.split('/')
    day = parts[0]
    month = parts[1]
    year = reported_date.year

    month_number = int(month)
    if month_number == 12 and reported_date.month == 1:
        # december date reported in january: it belongs to the previous year
        year -= 1
    elif month_number == 1 and reported_date.month == 12:
        # january date reported in december: it belongs to the next year
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)

コード例 #11

0

ファイルを表示

ファイル: parser.py プロジェクト: theHausdorffMetric/test

def _parse_node(url, node):
    """Parse a single vessel XML structure.

    Args:
        url(str): original request url (not used by this function)
        node(xml.ElementTree): partial of the initial XML response

    Returns:
        dict: structured information of the vessel (static data, position and
            next destination), presumably matching
            `kp_scrapers.models.items.VesselPosition` -- confirm against the model
    """
    ais_type = extract(node, 'source')

    raw_pos_updated_at = extract(node, 'dt_pos_utc')
    pos_updated_at = None
    if raw_pos_updated_at:
        pos_updated_at = dateutil.parser.parse(raw_pos_updated_at).isoformat()

    static_updated_at = None
    raw_static_updated_at = extract(node, 'dt_static_utc')
    if raw_static_updated_at:
        static_updated_at = dateutil.parser.parse(raw_static_updated_at).isoformat()

    # extract the raw eta outside of the `try` block: the exception handler below
    # reads `raw_eta`, so it must always be bound by the time the handler runs
    raw_eta = extract(node, 'eta')
    eta = None
    if raw_eta:
        try:
            eta = parse_eta_fmt(dt.datetime.utcnow().year, raw_eta).isoformat()
        except (TypeError, ValueError, AttributeError) as e:
            if raw_eta not in BAD_ETAS:
                # values listed in BAD_ETAS are probably the None equivalent of the
                # ais emitter, so since this case is known, we only log the others.
                # some of them are not supported and others are simply badly formatted
                logger.debug('unable to parse eta: {} ({})'.format(e, raw_eta))

    imo = extract(node, 'imo')
    item = {
        'vessel': {
            'name': may_strip(extract(node, 'vessel_name')),
            # an imo of '0' stands for "unknown"
            'imo': None if imo == '0' else imo,
            'mmsi': extract(node, 'mmsi'),
            'vessel_type': extract(node, 'vessel_type_code'),
            'call_sign': extract(node, 'callsign'),
        },
        'position': {
            'lat': may_apply(extract(node, 'latitude'), float),
            'lon': may_apply(extract(node, 'longitude'), float),
            'speed': may_apply(extract(node, 'sog'), float),
            'course': may_apply(extract(node, 'cog'), float),
            'ais_type': ais_type,
            'received_time': pos_updated_at,
            'heading': safe_heading(extract(node, 'heading')),
            'nav_state': may_apply(extract(node, 'nav_status_code'), int),
            # current draught values proved to be outright wrong or late on the
            # platform, messing up with a lot of our features. It needs more
            # investigation but at this moment we need to stop it, although still
            # receive data from EE to continue assessing its quality
            'draught_raw': may_apply(extract(node, 'draught'), float),
        },
        'reported_date': static_updated_at,
        'provider_name': PROVIDER_ID,
        'ais_type': ais_type,
        'message_type': extract(node, 'message_type'),
        'next_destination_eta': eta,
        'next_destination_ais_type': ais_type,
        'next_destination_destination': may_strip(extract(node, 'destination')),
    }

    return item

コード例 #12

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def normalize_date(raw_date, reported_date, event):
    """Normalize raw date to an ISO8601 compatible timestamp.

    Only the leading run of digits and slashes of the raw date is used; whatever
    follows (time of day, AM/PM, remarks) is dropped. The supported shapes of that
    leading part are:
        - 'DD/'       : day only, month is inferred
        - 'DD/hhmm'   : day only, month is inferred
        - 'DD/MM'     : day and month
        - 'DD/MM/YY'  : day and month (the two-digit year is not used, the year is
                        always derived from the reported date)

    Raw dates as given by the source do not provide month and/or year, so this info needs to be
    inferred from the reported date, together with the associated vessel movement type.

    A naive approach using month at scraping time may lead to inaccurate dates, especially
    during the rollover period for each month/year where dates from the previous and current
    month can appear together in the same table.

    To solve this, when the month is inferred we compare the day with the reported day:
        - For 'arrived', 'etb', 'berthed', 'new etb' vessel movements,
          a day later than the reported day is moved to the previous month.
        - For other vessel movements, a day earlier than the reported day is moved
          to the next month.
    The inferred month wraps around the year (January <-> December), and the year is
    adjusted accordingly.

    Args:
        raw_date (str): raw date string
        reported_date (dt.datetime): used to infer missing month/year in the dates
        event (str): used to determine if we should increment month/year when inferring

    Returns:
        Optional[str]: ISO8601 formatted timestamp, None if the raw date has an
            unsupported shape

    Examples:
        >>> normalize_date('25/02 2230', dt.date(year=2020, month=2, day=26), 'etb')
        '2020-02-25T00:00:00'
        >>> normalize_date('26/0902', dt.date(year=2020, month=2, day=26), 'arrived')
        '2020-02-26T00:00:00'
        >>> normalize_date('12/02/20 PM', dt.date(year=2020, month=2, day=18), 'arrived')
        '2020-02-12T00:00:00'
        >>> normalize_date('25/02(TBC)', dt.date(year=2020, month=2, day=18), 'eta')
        '2020-02-25T00:00:00'
        >>> normalize_date('26/AM (SUSPENDED)', dt.date(year=2018, month=11, day=7), 'etb')
        '2018-10-26T00:00:00'
        >>> normalize_date('30/PM', dt.date(year=2019, month=1, day=2), 'berthed')
        '2018-12-30T00:00:00'
        >>> normalize_date('TBC', dt.date(year=2020, month=2, day=18), 'eta') is None
        True

    """
    past_events = ('arrived', 'etb', 'berthed', 'new etb')

    # remove the alphabetic part of the date
    raw_date = re.split(r'([^0-9\/]+)', raw_date)[0]
    split_date = raw_date.split('/')

    # True when the month is not given by the source and comes from reported_date
    regularize_month = False

    # if raw_date in the form : '18/PM' > remove alphabetical > raw_date = '18/' (DD/)
    if len(split_date) > 1 and split_date[1] == '':
        day, month = split_date[0], reported_date.month
        regularize_month = True

    # if raw_date in the form for ex: '12/02/20PM' > ... > raw_date = '12/02/20' (DD/MM/YY)
    elif len(split_date) == 3:
        day, month = split_date[0], split_date[1]

    # if raw_date in the form for ex: '15/02AM' > ... > raw_date = '15/02' (DD/MM)
    elif len(split_date) == 2 and len(split_date[1]) == 2:
        day, month = split_date[0], split_date[1]

    # if raw_date in the form for ex: '26/0902' (DD/hhmm)
    elif len(split_date) == 2 and len(split_date[1]) == 4:
        day, month = split_date[0], reported_date.month
        regularize_month = True

    # seven characters after the slash: only the first two are used, as the month
    elif len(split_date) == 2 and len(split_date[1]) == 7:
        day, month = split_date[0], split_date[1][0:2]

    # anything else is an unsupported shape; bail out instead of failing later on
    # unbound day/month values
    else:
        return None

    year = reported_date.year

    # regularize month for the case where we get month infos from the reported_date
    if regularize_month:
        if event in past_events and reported_date.day < int(day):
            # previous month, wrapping January back to December
            month = (reported_date.month - 2) % 12 + 1
        if event not in past_events and reported_date.day > int(day):
            # next month, wrapping December forward to January
            month = reported_date.month % 12 + 1

    # handle year rollover scenarios
    if int(month) == 12 and reported_date.month == 1:
        year -= 1
    elif int(month) == 1 and reported_date.month == 12:
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)

コード例 #13

0

ファイルを表示

ファイル: normalize.py プロジェクト: theHausdorffMetric/test

def normalize_date(raw_date, reported_date, event):
    """Normalize raw date to an ISO8601 compatible timestamp.

    Raw dates can come in two formats:
        - 1st format: DD/MM, where DD is day of month and MM is the month
        - 2nd format: DDhhmm, where DD is day of month and hhmm is the time in 24hr format

    Some examples:
        - 1st format: '01/07'
        - 2nd format: '010830'

    Neither format provides the year, and the 2nd one does not provide the month
    either, so the missing parts are inferred from the reported date together with
    the vessel movement type.

    Simply using the month of the reported date would be wrong around the end of a
    month/year, where dates of two different months can appear in the same table.
    For the 2nd format the day is therefore compared with the reported day:
        - For "eta" and "etb" vessel movements, a day earlier than the reported day
          is moved to the next month.
        - For other vessel movements, a day later than the reported day is moved
          to the previous month.
    The year is then adjusted when the resulting month and the reported month sit on
    opposite sides of a new year.

    TODO use time information in raw date string

    Examples:
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=29), 'eta')
        '2018-07-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=29), 'berthed')
        '2018-06-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=1), 'etb')
        '2018-06-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=6, day=1), 'berthed')
        '2018-05-03T00:00:00'
        >>> normalize_date('032200', dt.date(year=2018, month=12, day=30), 'eta')
        '2019-01-03T00:00:00'
        >>> normalize_date('302200', dt.date(year=2019, month=1, day=2), 'berthed')
        '2018-12-30T00:00:00'
        >>> normalize_date('02/07', dt.date(year=2018, month=7, day=5), 'eta')
        '2018-07-02T00:00:00'
        >>> normalize_date('02/01', dt.date(year=2018, month=12, day=30), 'eta')
        '2019-01-02T00:00:00'
        >>> normalize_date('30/12', dt.date(year=2019, month=1, day=2), 'berthed')
        '2018-12-30T00:00:00'

    Args:
        raw_date (str): raw date string
        reported_date (dt.datetime): used to infer missing month/year in the dates
        event (str): used to determine if we should increment month/year when inferring

    Returns:
        str: ISO8601 formatted timestamp

    """
    if '/' in raw_date:
        # 1st format: the month is given, right after the separator
        day = raw_date[:2]
        month = raw_date[3:]
    else:
        # 2nd format: only the day is given, start from the reported month
        day = raw_date[:2]
        month = reported_date.month
        is_estimate = event in ('eta', 'etb')
        if is_estimate and reported_date.day > int(day):
            month = (reported_date + relativedelta(months=1)).month
        elif not is_estimate and reported_date.day < int(day):
            month = (reported_date - relativedelta(months=1)).month

    year = reported_date.year

    # handle year rollover scenarios
    month_number = int(month)
    if month_number == 12 and reported_date.month == 1:
        year -= 1
    elif month_number == 1 and reported_date.month == 12:
        year += 1

    return may_apply(f'{day}/{month}/{year}', to_isoformat)