Example #1
    def __init__(self, table_name='Scheduled Calls', *args, **kwargs):
        """Init BilbaoWeb spider.

        There are five tables of vessel movements given by the source.
            - Vessel Arrivals
            - Vessel Departures
            - Vessel Operating
            - Scheduled Calls
            - Inactive Stay

        Each spider job needs to choose one table to scrape.

        """
        super().__init__(*args, **kwargs)
        self.table_name = may_strip(table_name)
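Since each job must scrape exactly one table, the table name is expected to arrive as a spider argument; a minimal usage sketch (the spider's registered/class name below is an assumption, not taken from the snippet):

# Scrapy forwards `-a` command-line arguments to the spider's __init__ as keyword arguments:
#   scrapy crawl <spider_name> -a table_name='Vessel Departures'
# Equivalent direct construction (class name assumed purely for illustration):
spider = BilbaoWebSpider(table_name='Vessel Departures')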
Example #2
    def parse_mail(self, mail):
        """Parse each email that was matched with the spider filter arguments.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:
        """
        for attachment in mail.attachments():
            for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
                start_processing = False
                for idx, raw_row in enumerate(sheet.get_rows()):
                    row = format_cells_in_row(raw_row, sheet.book.datemode)
                    # detect the header row and memoise it
                    if any(may_strip(x.lower()) in HEADER_STR for x in (row[0], row[1], row[3])):
                        header = row
                        start_processing = True
                        continue

                    if start_processing:
                        raw_item = {
                            may_strip(head.lower()): may_strip(row[col_idx].replace('\n', '|').lower())
                            for col_idx, head in enumerate(header)
                        }

                        # occasionally the header row is missing the name of an important column
                        if '' in raw_item:
                            raw_item['eta_holder'] = raw_item.pop('', None)

                        raw_item.update(
                            provider_name=self.provider,
                            reported_date=mail.envelope['subject'],
                            port_name=may_strip(sheet.name.lower()),
                        )
                        yield from normalize.process_item(raw_item)
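The format_cells_in_row helper is not shown above; a minimal sketch of what such a function typically does with xlrd cells and the workbook's datemode (an assumption, not the project's implementation):

import xlrd


def format_cells_in_row(row, datemode):
    """Hypothetical helper: render every xlrd cell as a string, converting Excel date cells."""
    cells = []
    for cell in row:
        if cell.ctype == xlrd.XL_CELL_DATE:
            cells.append(xlrd.xldate.xldate_as_datetime(cell.value, datemode).isoformat())
        else:
            cells.append(str(cell.value))
    return cells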
Example #3
def field_mapping():
    return {
        '0': ('vessel_status', None),
        '1': ignore_key('size'),
        '2': ('lay_can', None),
        '3': ('departure_zone', lambda x: ZONE_MAPPING.get(x.lower(), x)),
        '4': ('arrival_zone', lambda x: [ZONE_MAPPING.get(z.lower(), z) for z in x.split('-')]),
        '5': ('rate_value', lambda x: re.sub(TO_REMOVE, '', x).strip()),
        '6': ('charterer', lambda x: None if may_strip(x) == 'CNR' else x),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
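These field_mapping tables pair each raw key with a target key and an optional transform. A minimal sketch of how such a table might be consumed (the map_keys name is an assumption, and ignore_key is assumed to mark its target key as None):

def map_keys(raw_item, mapping):
    """Hypothetical applier: rename raw keys and run each value through its transform."""
    item = {}
    for raw_key, raw_value in raw_item.items():
        if raw_key not in mapping:
            continue
        target_key, transform = mapping[raw_key]
        # keys marked with ignore_key are assumed to resolve to a None target and are skipped
        if target_key is None:
            continue
        item[target_key] = transform(raw_value) if transform else raw_value
    return item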
Example #4
def normalize_item_dates(item):
    """Cleanup item dates.

    Args:
        item (dict):

    Returns:
        item (dict):

    Examples:
    >>> normalize_item_dates({'lay_can_start': '12-14', \
         'reported_date': '05.03.2020'})
    {'lay_can_start': '2020-03-14T00:00:00', 'reported_date': '05 Mar 2020'}
    >>> normalize_item_dates({'lay_can_start': 'TBA', 'reported_date': '05.03.2020'})
    {'lay_can_start': None, 'reported_date': '05 Mar 2020'}
    """
    try:
        reported_date = datetime.datetime.strptime(item['reported_date'], '%d.%m.%Y')
        item['reported_date'] = reported_date.strftime('%d %b %Y')
    except ValueError:
        item['reported_date'] = None
        item['lay_can_start'] = None
        return item

    if not item['lay_can_start'] or item['lay_can_start'] in STRING_BLACKLIST:
        item['lay_can_start'] = None
    else:
        if '-' in item['lay_can_start']:
            lay_can = may_strip(item['lay_can_start'].split('-')[-1])
        else:
            lay_can = may_strip(item['lay_can_start'])
        year_month = f'{reported_date.year}-{reported_date.month}-'
        item['lay_can_start'] = to_isoformat(year_month + lay_can, dayfirst=False)

    return item
Example #5
def split_product(raw_product, raw_volume_unit, raw_attachment_name):
    """split and yield multiple products

    Args:
        raw_product (str):
        raw_volume_unit (str):
        raw_attachment_name (str):

    Examples:
        >>> split_product('Jet A1 + Gasoil', '30000 + 20000', 'yanbu')
        ([('Jet A1', '30000'), ('Gasoil', '20000')], 'tons')
        >>> split_product('Jet A1 + Gasoil', '30000', 'yanbu')
        ([('Jet A1', '15000.0'), ('Gasoil', '15000.0')], 'tons')
        >>> split_product('Jet A1', '30000', 'yanbu')
        ([('Jet A1', '30000')], 'tons')
        >>> split_product('Butane / Propane', 'TBA', 'yanbu')
        ([('Butane', None), ('Propane', None)], None)


    Returns:
        Tuple[List[Tuple[str, Optional[str]]], Optional[str]]: product/volume pairs and unit
    """
    product_list = [
        may_strip(prod) for prod in re.split(r'[\\\+\&\/\;]', raw_product)
    ]

    volume_unit_list, units = get_vol_unit(raw_volume_unit,
                                           raw_attachment_name)

    if len(product_list) == len(volume_unit_list):
        return list(zip(product_list, volume_unit_list)), units

    if len(product_list) > 1 and len(volume_unit_list) == 1:
        final_list = []
        for item_product in product_list:
            # source may contain typo errors
            try:
                vol_append = str(
                    float(volume_unit_list[0]) / len(product_list))
            except Exception:
                vol_append = None
            final_list.append((item_product, vol_append))
        return final_list, units

    if product_list and not volume_unit_list:
        return list(zip_longest(product_list, volume_unit_list)), None

    return None, None
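get_vol_unit is defined elsewhere; behaviour consistent with the doctests above could look like this (a hedged sketch, not the source's helper):

def get_vol_unit(raw_volume_unit, raw_attachment_name):
    """Hypothetical helper: split the raw volume cell and infer the unit.

    The real unit rules (per attachment name) are not shown in this snippet;
    metric tons is assumed here purely to match the doctests.
    """
    if not raw_volume_unit or may_strip(raw_volume_unit).upper() == 'TBA':
        return [], None

    volumes = [may_strip(vol) for vol in raw_volume_unit.split('+')]
    return volumes, 'tons'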
Example #6
    def parse(self, response):
        """Parse and extract raw items from html tables.

        Each entry in the port activity table has a link on the vessel name, with the vessel IMO
        in the link itself. We append vessel imo to each row we extract, since it is not technically
        part of the table cells.

        Vessel imo appears as part of the html query string, e.g.:
        ".../phpcodes/navire_a.php?ship=9297905"
                                        ^^^^^^^
                                          imo

        Args:
            response (scrapy.HtmlResponse):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't need to be called repeatedly
        reported_date = (dt.datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0).isoformat())

        logger.info(f"JJJJJJJJJJJJJJ {self.provider}")

        # each index corresponds to the vessel movement type in the table
        # 0: attendus
        # 1: a quai
        # 2: en rade
        for table_idx in range(3):
            table = response.xpath(
                f'//div[contains(@class, "et_pb_tab_{table_idx}")]//table')
            header = [
                may_strip(head)
                for head in table.xpath('.//th/text()').extract()
            ]

            for row in table.xpath('./tbody//tr'):
                raw_item = row_to_dict(row, header)
                # contextualise raw item with meta info
                raw_item.update(
                    port_name=self.name,
                    provider_name=self.provider,
                    reported_date=reported_date,
                    vessel_imo=row.xpath('./td//@href').extract_first().split(
                        'ship=')[1],
                )
                yield normalize.process_item(raw_item)
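The split('ship=') call above depends on the query-string layout described in the docstring; an equivalent, slightly more defensive extraction with the standard library would be (a sketch, not what the spider currently does):

from urllib.parse import parse_qs, urlparse

href = '/phpcodes/navire_a.php?ship=9297905'
vessel_imo = parse_qs(urlparse(href).query).get('ship', [None])[0]  # -> '9297905'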
Example #7
    def parse_response(self, response):
        """Parse response from Alaska website.

        Args:
            response (scrapy.Response):

        Yields:
            Dict[str, str]:
        """
        # denotes if record should be processed when iterating over each table row
        PROCESS_FLAG = False

        tables = response.xpath('(//table[@id="ContentPlaceHolder1_Table1"])')
        for raw_row in tables.xpath('//tr'):
            row = [
                may_strip(td.xpath('.//text()').extract_first()) for td in raw_row.xpath('.//td')
            ]

            # According to the source, the number of columns in the table is 8
            if len(row) < 8:
                continue

            # To indicate the start of the first data row in the table
            if row[0] == 'Date':
                PROCESS_FLAG = True
                continue

            # To indicate the end of the table
            elif row[0] == 'Average':
                PROCESS_FLAG = False
                continue

            if not PROCESS_FLAG:
                continue

            yield {
                # 7 denotes the position of the volume in the table's row.
                'volume': int(row[7].replace(',', '')),
                'country': 'Alaska',
                'country_type': 'region',
                'start_date': parser.parse_input_date(row[0]),  # is inclusive
                'end_date': parser.parse_input_date(row[0], days=1),  # is exclusive
                'unit': Unit.barrel,
                'provider_name': self.provider,
                'reported_date': self.reported_date,
                'balance': BalanceType.ending_stocks,
                'product': 'Crude Oil',
            }
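parse_input_date comes from a project module that is not shown; a hypothetical stand-in illustrating the inclusive-start/exclusive-end convention noted in the comments:

from datetime import timedelta

from dateutil import parser as dateparser


def parse_input_date(raw_date, days=0):
    # hypothetical: parse the table's date cell and shift it forward by `days`
    # (days=1 is used above to build the exclusive end date)
    return (dateparser.parse(raw_date) + timedelta(days=days)).isoformat()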
Example #8
def field_mapping():
    return {
        '0': ('vessel', lambda x: {'name': x} if 'TBN' not in x.split() else None),
        '1': ('cargo_volume', None),
        '2': ('cargo_product', None),
        '3': ('departure_zone', None),
        '4': ('arrival_zone', lambda x: x.split('-') if x else None),
        '5': ('laycan', None),
        '6': ('rate_value', None),
        '7': ('charterer', lambda x: may_strip(x.replace('-', ''))),
        '8': ('status', lambda x: STATUS_MAPPING.get(x, None)),
        'provider': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #9
def split_row(row):
    """Try to split the row.
    CHANGELOS GABRIEL FOR UNKNWON PARTY. 60,000MT FUEL OIL. LAYCAN EX U.S GULF 03-05 DECEMBER.
    FREIGHT UNKNOWN NORDIC GENEVA FOR CLEARLAKE. 60,000MT FUEL OIL. LAYCAN EX TANJUNG PELEPAS 29-31
    DECEMBER. FREIGHT OWN PROGRAM TC

    Args: row (str):

    Returns: Tuple(List[str], List[str]): cells and headers

    """
    match = re.match(INFORMATION_REGEX, may_strip(row))
    if match:
        return list(match.groups()), HEADERS

    return None, None
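INFORMATION_REGEX and HEADERS are imported from elsewhere; purely to illustrate the kind of pattern that would split the sample rows quoted in the docstring (an assumption, not the source's actual regex or headers):

import re

# hypothetical pattern/header pair
SAMPLE_PATTERN = r'(.+?) FOR (.+?)\. ([\d,]+)MT (.+?)\. LAYCAN EX (.+?) (\d{2}-\d{2} \w+)\. FREIGHT (.+)'
SAMPLE_HEADERS = ['vessel', 'charterer', 'volume', 'product', 'zone', 'lay_can', 'rate_value']

row = (
    'CHANGELOS GABRIEL FOR UNKNWON PARTY. 60,000MT FUEL OIL. '
    'LAYCAN EX U.S GULF 03-05 DECEMBER. FREIGHT UNKNOWN'
)
list(re.match(SAMPLE_PATTERN, row).groups())
# -> ['CHANGELOS GABRIEL', 'UNKNWON PARTY', '60,000', 'FUEL OIL', 'U.S GULF', '03-05 DECEMBER', 'UNKNOWN']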
Example #10
def grades_mapping():
    return {
        'SHIPS NAME': ('vessel', lambda x: {'name': may_strip(x.replace('*', ''))}),
        'PRODUCTS': ('cargo_product', None),
        'QUANTITY': ('cargo_volume_movement', None),
        'TERMINAL': ('installation', None),
        'PORT': ('port_name', None),
        'ARRIVAL': ('arrival', normalize_date),
        'BERTHING': ('berthed', normalize_date),
        'ETD': ('departure', normalize_date),
        'COMMENTS': ignore_key('irrelevant'),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #11
def extract_headers(table, rm_headers=None):
    """Extract headers of specified table.

    NOTE could be made generic

    Args:
        table (scrapy.Selector):
        rm_headers (List[str]): list of header names to remove before returning

    Returns:
        List[str]:

    """
    rm_headers = rm_headers or []
    headers = table.xpath('.//tr[@class="omg"]//font/text()').extract()
    # remove unnecessary headers
    return [may_strip(head) for head in headers if head not in rm_headers]
Example #12
def _clean_string(raw):
    """Clean strings and transform empty strings into NoneType.

    Examples:
        >>> _clean_string('  ')
        >>> _clean_string('')
        >>> _clean_string(' DOW CHEMICAL TEXAS OPERATIONS')
        'DOW CHEMICAL TEXAS OPERATIONS'
        >>> _clean_string(None)

    """
    if not raw:
        return None

    cleaned = may_strip(raw)
    return cleaned if cleaned else None
Example #13
def field_mapping():
    return {
        'vessel': ('vessel', lambda x: {'name': may_strip(x)}),
        'size': ('cargo_volume', None),
        'cargo': ('cargo_product', None),
        'layday': ('lay_can', None),
        'load': ('departure_zone', None),
        'discharge': ('arrival_zone', lambda x: normalize_voyage(x)),
        'freight': ('rate_value', None),
        'charterer': ('charterer', None),
        'status': ('status', lambda x: STATUS_MAPPING.get(x, None)),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #14
def field_mapping():
    return {
        'eta': ('eta', None),
        'etb': ('etb', None),
        'discharge': ('discharge', normalize_quantity),
        'installation': ('installation', None),
        'load': ('load', normalize_quantity),
        'port_name': ('port_name', normalize_port_name),
        'product': (
            'product',
            lambda x: [may_strip(product) for product in x.split('/') if product not in INVALID_CARGOES],
        ),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', lambda x: to_isoformat(x, dayfirst=True)),
        'vessel': ('vessel', lambda x: {'name': normalize_vessel_name(x)}),
    }
Example #15
def normalize_string(raw_value):
    """Remove unnecessary strings

    Args:
        raw_value (str):

    Examples:
        >>> normalize_string(None)
        >>> normalize_string('TBC')
        >>> normalize_string('DHT LEOPARD')
        'DHT LEOPARD'
    """
    if (raw_value and str(raw_value) in BLACKLIST) or raw_value == '':
        return None

    return may_strip(raw_value)
Example #16
def field_mapping():
    return {
        'SR': (ignore_key('irrelevant')),
        'Date': ('berthed', None),
        'Jetty': ('berth', lambda x: try_apply(x, str).replace('.0', '')),
        'Operation': ('cargo_movement', lambda x: re.sub(r'[\W]', '', x.lower())),
        'Product': ('cargo_product', None),
        'Vessel Name': ('vessel', lambda x: {'name': may_strip(x)}),
        # TODO include buyer and seller
        'Customer / Terminal': (ignore_key('buyer and seller, cannot be used currently')),
        'Latest Value b/w CT inform & Berth Available': (ignore_key('irrelevant')),
        'Parcel size M3': ('cargo_volume', lambda x: try_apply(x, str).replace(',', '')),
        'port_name': ('port_name', None),
        'installation': ('installation', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #17
def _remove_elements(lst, rm_elements=None):
    """Remove useless elements of a list

    NOTE could be made generic

    Args:
        lst (List[str]):
        rm_elements (List[str]): list of elements to remove before returning

    Returns:
        List[str]:

    """
    # remove unnecessary elements
    stripped_lst = [element for element in lst if element not in (rm_elements or [])]
    return [may_strip(element) for element in stripped_lst if element != '\n']
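A quick usage example of the corrected behaviour (illustrative values; may_strip is assumed to trim surrounding whitespace):

_remove_elements(['FOO ', '\n', 'BAR', 'N/A'], rm_elements=['N/A'])
# -> ['FOO', 'BAR']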
Example #18
    def parse_pdf(self, p_name, body, tab_opt, mail):
        """Parse PDF reports.
        Args:
            attachment (Attachment): mail attachment object
        Yields
            Dict[str, str]:
        """
        prev_row = None
        reported_date = self.extract_reported_date(mail.envelope['subject'])

        for idx, row in enumerate(self.extract_pdf_io(body, **tab_opt)):
            # remove rows with no vessels
            if not row[1]:
                continue

            # memoise row for ffill
            prev_row = row if row[1] else prev_row

            # remove filler rows
            if any(sub in row[1]
                   for sub in ('EMPTY', 'TGL', 'TERMINAL', 'TPG')):
                continue

            if 'VESSEL' in row[1]:
                header = row
                continue

            if not row[1]:
                row = [
                    prev_r if not row[prev_idx] else row[prev_idx]
                    for prev_idx, prev_r in enumerate(prev_row)
                ]

            # extract data row
            raw_item = {
                head: may_strip(row[head_idx])
                for head_idx, head in enumerate(header)
            }
            # contextualise raw item with meta info
            raw_item.update(
                reported_date=reported_date,
                port_name='Aratu',  # report only contains data for Aratu port
                provider_name=self.provider,
            )

            yield process_item(raw_item)
Example #19
def map_berth_to_installation(raw_berth: str) -> Optional[str]:
    """Clean, normalize, and map a raw berth name to a known installation.

    Examples:
        >>> map_berth_to_installation('Valero 8')
        'Valero Pembroke'
        >>> map_berth_to_installation('Dragon No1')
        'Dragon'
        >>> map_berth_to_installation('Milford Dock')

    """
    for known_berth in BERTH_TO_INSTALLATION_MAPPING:
        if known_berth in may_strip(raw_berth):
            return BERTH_TO_INSTALLATION_MAPPING[known_berth]

    logger.debug('Unknown berth: %s', raw_berth)
    return None
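BERTH_TO_INSTALLATION_MAPPING is defined elsewhere; entries consistent with the doctests above would look something like this (illustrative only):

# hypothetical entries; the real mapping lives in the spider's constants
BERTH_TO_INSTALLATION_MAPPING = {
    'Valero': 'Valero Pembroke',
    'Dragon': 'Dragon',
}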
Example #20
def portcall_mapping() -> Dict[str, Tuple[str, Optional[Callable]]]:
    return {
        'Date': ('eta', lambda x: to_isoformat(x, dayfirst=False, yearfirst=True)),
        'Ship': ('vessel_name', None),
        'GT': ('vessel_gt', lambda x: validate_weight(x)),
        'DWT': ('vessel_dwt', lambda x: validate_weight(x)),
        'Move Type': ('event', lambda x: may_strip(x.lower())),
        'Remarks': ignore_key('handwritten notes from port authority'),
        'From': ignore_key('irrelevant'),
        'To': ('installation', map_berth_to_installation),
        'ACTION_STATUS_ID': ignore_key('internal ID used by port'),
        'Tug': ignore_key('irrelevant'),
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #21
def sanitize_date(raw_date):
    """Sanitise raw date as ISO8601 timestamp.

    Raw date is usually on 2 lines.
    The first one being a nicely formatted %y-%m-%d.
    The second one being a messy combination of hours, AM, PM, ...

    The date could be invalid, in this case, we want to keep calm and keep processing.

    Examples:
        >>> sanitize_date('2019-05-11')
        '2019-05-11T00:00:00'
        >>> sanitize_date('2019-05-0907:20')
        '2019-05-09T07:20:00'
        >>> sanitize_date('2019-05-19      19:00')
        '2019-05-19T19:00:00'
        >>> sanitize_date('2019-05-11 AM:-')
        '2019-05-11T00:00:00'
        >>> sanitize_date('2019-05-05 09  :00')
        '2019-05-05T09:00:00'
        >>> sanitize_date('2019-05-10 :')
        '2019-05-10T00:00:00'
        >>> sanitize_date('2019-06-01 sh:')
        '2019-06-01T00:00:00'
        >>> sanitize_date('2019-06-01 -06:-00')
        '2019-06-01T06:00:00'
        >>> sanitize_date('')

    """
    # sanity check
    if not raw_date:
        return None

    # remove excessive whitespace first
    raw_date = may_strip(raw_date)
    try:
        date, hour, minute = re.match(DATE_PARSING_PATTERN, raw_date).groups()
        hour = try_apply(hour, int) if try_apply(hour, int) else '00'
        minute = try_apply(minute, int) if try_apply(minute, int) else '00'
        return to_isoformat(f'{date} {hour}:{minute}', dayfirst=False)

    # keep calm so that we can proceed processing
    except AttributeError:
        logger.error('Date might be invalid, please double check: %s', raw_date)
        return None
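DATE_PARSING_PATTERN is imported from elsewhere; a pattern along these lines would satisfy the doctests above (an assumption, not the module's actual regex):

import re

# hypothetical pattern: an ISO-like date, then any noise, then optional hour and minute digits
DATE_PARSING_PATTERN = r'(\d{4}-\d{2}-\d{2})\D*(\d{1,2})?\D*(\d{2})?'

re.match(DATE_PARSING_PATTERN, '2019-05-0907:20').groups()  # -> ('2019-05-09', '07', '20')
re.match(DATE_PARSING_PATTERN, '2019-05-11 AM:-').groups()  # -> ('2019-05-11', None, None)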
Example #22
def field_mapping():
    return {
        'VESSEL': ('vessel', lambda x: {'name': may_strip(x)}),
        'ETA': ('eta', normalize_date),
        'ETB': ('berthed', normalize_date),
        'ETS': ('departure', normalize_date),
        'GRADE': ('cargo_product', lambda x: None if 'TBI' in x else x),
        'QTTY': ('cargo_quantity', lambda x: try_apply(x.replace('.', '', 1), int)),
        'TTL  QTTY': ignore_key('redundant'),
        'LD PORT': ('load_cargo_movement', None),
        'DISC PORT': ('dis_cargo_movement', None),
        'SH / REC': ignore_key('redundant'),
        'RMK': ignore_key('redundant'),
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
Example #23
    def _process_tarragona_rows(self, table):
        """Process rows for tarragona port.

        Decode all rows as unicode strings since tabula outputs byte strings by default.
        Extract matching date for each table section based on the section's description.
        Yield only rows that contain table data, skipping table section description.

        Matching date in table header can have the following format:
            - "Buques atracados el día 18 de marzo del 2018 (Información actualizada a las 8:30 horas)"  # noqa
            - "Buques atracados el día18 de marzo del 2018 (Información actualizada a las 8:30 horas)"  # noqa

        raw table row order:
            - table name (discard):        ['PUE', 'RTO DE', 'TARRAGO', 'NA']
            - matching_date (extract):     ['Buques atracados el d\xc3\xada', '29 de marzo del 2018 (', 'Informaci\xc3\xb3n actualizada a la', 's 8:00 horas)']  # noqa
            - header (keep):               ['BUQUE', 'MUELLE', 'CONSIGNATARIO/', 'TONS']
            - 2nd header (discard):        ['', '', 'ESTIBADOR', '']
            - data row (keep):             ['CANNETO M', 'FONDEADO ZONA II', 'IB\xc3\x89RICA MAR\xc3\x8dTIMA', '-']  # noqa
            - data row (keep):             ['OTTOMANA', 'PANTAL\xc3\x81 REPSOL', 'MARITIME/REPSOL', 'P. PETROL\xc3\x8dFEROS/23.300-D']  # noqa
            - subsequent data rows (keep): ...
            - ...

        Args:
            table (List[List[str]]): list of table rows from pdf

        Yields:
            List[str]:

        """
        for idx, row in enumerate(table):
            # tabula stores string data as bytes by default; decode to unicode strings
            row = [cell.decode('utf-8') if isinstance(cell, bytes) else cell for cell in row]

            # try deciphering matching_date in the second row
            if idx == 1:
                matching_date = parse_raw_date(
                    may_strip(''.join(row).split('atracados el día')[1].split(
                        '(Info')[0]))
                logger.debug('Found matching date: {}'.format(matching_date))

            if idx >= 2:
                if not ('ESTIBADOR' in ' '.join(row)
                        or 'PUERTO DE' in ''.join(row)):
                    # third row contains headers
                    row.append('matching_date' if idx == 2 else matching_date)
                    yield row
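parse_raw_date is not shown; for headers like '29 de marzo del 2018', a minimal sketch could be (hypothetical, not the source's helper):

from datetime import datetime

SPANISH_MONTHS = {
    'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
    'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
}


def parse_raw_date(raw_date):
    # hypothetical parser for long Spanish dates such as '29 de marzo del 2018'
    day, _, month, _, year = may_strip(raw_date).lower().split()
    return datetime(int(year), SPANISH_MONTHS[month], int(day)).isoformat()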
Example #24
def normalize_currency_rate(raw_currency, raw_rate):
    """Combine currency and rate column together

    Args:
        raw_currency (str):
        raw_rate (str):

    Returns:
        str:

    Examples:
        >>> normalize_currency_rate('WS', '167.5')
        'WS 167.5'
        >>> normalize_currency_rate('RNR', None)
        'RNR'
    """
    return may_strip(f'{raw_currency} {raw_rate}') if raw_currency and raw_rate else raw_currency
Example #25
def split_row(row):
    """Try to split the row.

    Args:
        row (str):

    Returns:
        Tuple(List[str], List[str]): cells and headers

    """
    for desc, value in PATTERN_HEADER_MAPPING.items():
        pattern, headers = value

        match = re.match(pattern, may_strip(row))
        if match:
            return list(match.groups()), headers

    return None, None
Example #26
    def parse_mail(self, mail):
        """Parse each email matched by the spider's filter arguments.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        self.reported_date = parse_date(mail.envelope['date']).strftime('%d %b %Y')

        body = self.select_body_html(mail)
        for row_html in body.xpath('//p'):
            # rows are delimited by <p> tags, but some rows share a single <p> tag; those are
            # usually separated by two consecutive <br> tags, so we split on `\r\n\r\n` (a single
            # <br> can appear inside a row). Not an elegant solution, but the format is too
            # inconsistent to detect rows more reliably.
            rows = ''.join(row_html.xpath('.//text()').extract()).split('\r\n\r\n')
            for raw_row in rows:
                row = self.split_row(may_strip(raw_row))
                if row:
                    raw_item = {str(idx): cell for idx, cell in enumerate(row)}
                    raw_item.update(self.meta_field)

                    yield normalize.process_item(raw_item)
Example #27
def normalize_charterer(raw_charterer):
    """Remove unnecessary strings

    Examples:
        >>> normalize_charterer('SHELL')
        'SHELL'
        >>> normalize_charterer('SHELL-REPLACED')
        'SHELL'

    Args:
        raw_charterer (str):

    Returns:
        str:

    """
    charter = raw_charterer.partition('-')[0]
    return may_strip(charter)
Example #28
def normalize_cargoes(item):
    """Normalize cargoes.

    Args:
        item (Dict[str, str]):

    Yields:
        Dict[str, str]:
    """
    cargoes = item['product_volume'].split(';')
    for cargo in cargoes:
        # TODO confirm with analysts on volume unit, until then we cannot use volume figure
        # product, _, volume
        product, _, _ = cargo.partition(',')
        yield {
            'product': may_strip(product),
            'movement': 'load' if item['is_load'] else 'discharge',
        }
Example #29
def normalize_cargoes(item):
    """Normalize cargoes.

    Args:
        item (Dict[str, str]):

    Yields:
        Dict[str, str]:

    """
    # filter out irrelevant cargoes
    if not is_relevant_cargo(item['cargo_product'], item['cargo_movement']):
        return

    # multi cargoes, ignore volume to avoid confusion
    elif '+' in item['cargo_product']:
        for product in [may_strip(prod) for prod in item['cargo_product'].split('+')]:
            yield {'product': product}

    # single cargo, assign volume and movement
    else:
        yield {
            'product': item['cargo_product'],
            'movement': MOVEMENT_MAPPING.get(item['cargo_movement']),
            'volume': item['cargo_volume'],
            'volume_unit': Unit.tons,
        }
Example #30
def field_mapping(**kwargs):
    return {
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
        'berth': ('berth', None),
        'vessel': ('vessel', lambda x: {'name': normalize_vessel_name(x)}),
        'arrived': ('arrival', lambda x: normalize_date(may_strip(x), **kwargs, event='arrived')),
        'arrived eta': (
            'arrived eta',
            lambda x: normalize_date(may_strip(x), **kwargs, event='arrived'),
        ),
        'eta': ('eta', lambda x: normalize_date(may_strip(x), **kwargs, event='eta')),
        'etb': ('etb', lambda x: normalize_date(may_strip(x), **kwargs, event='etb')),
        'etc': ('etc', lambda x: normalize_date(may_strip(x), **kwargs, event='etc')),
        'ets': ('departure', lambda x: normalize_date(may_strip(x), **kwargs, event='ets')),
        'berthed': ('berthed', lambda x: normalize_date(may_strip(x), **kwargs, event='berthed')),
        'new etb': ('new etb', lambda x: normalize_date(may_strip(x), **kwargs, event='new etb')),
        'ops': ('cargoes', lambda x: list(normalize_cargo(x))),
    }