Example #1
0
    def parse_sheet_rows(self, sheet, post_process=lambda x: x):
        """Parse and process rows of a spreadsheet.

        Args:
            sheet (xlrd.Sheet):
            post_process (Callable[Any, List[str]]):

        Yields:
            Dict[str, str]:
        """
        header = None
        for row_idx, raw_row in enumerate(sheet.get_rows()):
            # convert xldate cells to ISO-8601 strings, strip everything else
            cells = []
            for cell in raw_row:
                if is_xldate(cell):
                    cells.append(xldate_to_datetime(cell.value, sheet.book.datemode).isoformat())
                else:
                    cells.append(may_strip(cell.value))

            # headers will always be the first row of any sheet from this source
            if row_idx == 0:
                header = cells
                continue

            # post-process row according to caller specifications
            processed = post_process(cells)
            if processed:
                yield {head: processed[pos] for pos, head in enumerate(header)}
Example #2
0
    def parse_attachment(
        self, attachment_doc, reported_date, sheet_list, get_header, pn_row, pn, doc_name
    ):
        """Parse relevant sheets of a spreadsheet attachment and yield normalized items.

        Args:
            attachment_doc (bytes): raw spreadsheet contents
            reported_date (str): reported date to contextualise each item with
            sheet_list (List[str]): substrings of sheet names that should be processed
            get_header (List[int]): row indexes of the two header rows
            pn_row (int): row index holding the port name (used when `pn` is falsy)
            pn (str): explicit port name; takes precedence over `pn_row`
            doc_name (str): attachment file name

        Yields:
            Dict[str, str]:
        """
        for sheet in xlrd.open_workbook(file_contents=attachment_doc, on_demand=True).sheets():
            # only parse relevant sheets
            if any(sub in sheet.name.lower() for sub in ['sheet1', 'new']) or not any(
                sub in sheet.name.lower() for sub in sheet_list
            ):
                continue

            first_row, second_row, headers = None, None, None
            port_name = pn if pn else pn_row

            for idx, raw_row in enumerate(sheet.get_rows()):
                row = []
                # to handle is xldate exception
                for cell in raw_row:
                    if is_xldate(cell):
                        try:
                            cell = xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        except Exception:
                            cell = str(cell.value)

                    else:
                        cell = str(cell.value)

                    row.append(cell)

                # get port name
                if isinstance(port_name, int) and idx == pn_row:
                    port_name = row[1]

                # format headers because it is split into 2, make it adaptable
                # across sheets instead of hardcoding
                if idx in get_header:
                    if idx == get_header[0]:
                        first_row = row
                        continue

                    if idx == get_header[1]:
                        second_row = row
                        continue

                # combine the two header halves exactly once (previously this
                # re-ran combine_rows on every single data row)
                if headers is None and first_row and second_row:
                    headers = self.combine_rows(first_row, second_row)

                # only process relevant rows after header row
                if idx > get_header[1]:
                    # sanity check: skip data rows if header rows were missing/empty
                    if headers is None:
                        continue

                    raw_item = {
                        may_strip(head): row[col_idx] for col_idx, head in enumerate(headers)
                    }
                    # contextualise raw item with meta info
                    raw_item.update(
                        reported_date=reported_date,
                        provider_name=self.provider,
                        port_name=port_name,
                        file_name=doc_name,
                    )
                    yield normalize.process_item(raw_item)
Example #3
0
    def parse_mail(self, mail):
        """The method will be called for every mail the search_term matched.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        for attachment in mail.attachments():
            for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
                # only process the sheet we want
                if not any(sub in sheet.name.lower() for sub in ('export', 'import')):
                    continue

                sheet_name = sheet.name.lower()

                for idx, raw_row in enumerate(sheet.get_rows()):
                    # Include handling of xlrd.xldate.XLDateAmbiguous cases
                    row = []
                    for cell in raw_row:
                        if is_xldate(cell):
                            try:
                                cell = xldate_to_datetime(
                                    cell.value, sheet.book.datemode
                                ).isoformat()
                            except Exception:
                                cell = str(cell.value)

                        else:
                            cell = str(cell.value)

                        row.append(cell)

                    # first row holds the reported date in its second cell
                    if idx == 0:
                        reported_date = to_isoformat(row[1], dayfirst=True)
                        continue

                    # second row will always contain header; extract it
                    if idx == 1:
                        header = row
                        continue

                    # extract data row
                    raw_item = {head: row[head_idx] for head_idx, head in enumerate(header)}
                    raw_item.update(
                        reported_date=reported_date,
                        provider_name=self.provider,
                        sheet_name=sheet_name,
                    )

                    # removed leftover debug `print(raw_item)` statement
                    if DataTypes.SpotCharter in self.produces:
                        yield normalize_charters.process_item(raw_item)
                    else:
                        yield normalize_grades.process_item(raw_item)
Example #4
0
def convert_xldate(raw_date_float):
    """Convert an xldate cell into an ISO-8601 string depending on value of immediate cell.

    Args:
        raw_date_float (float):

    Returns:
        str: ISO-8601 string if no errors, else empty string
    """
    # the docstring promised an empty string on failure, but errors were
    # previously propagated; honour the documented contract
    try:
        return xldate_to_datetime(raw_date_float, sheet_datemode=0).isoformat()
    except Exception:
        return ''
Example #5
0
    def parse_mail(self, mail):
        """Extract mail found with specified email filters in spider arguments.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:
        """
        for attachment in mail.attachments():
            # memoise reported date so it won't need to be called repeatedly later
            date_match = re.match(r'.*DBE\s(\d+)', attachment.name)
            if not date_match:
                raise ValueError(f'Unknown reported date format: {attachment.name}')
            reported_date = date_match.group(1)

            # each xlsx file by this provider will only have one sheet
            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_index(0)

            # extract raw data from sheet
            for row_idx, raw_row in enumerate(sheet.get_rows()):
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        row.append(
                            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        )
                    else:
                        row.append(str(cell.value))

                # initialise headers
                if row_idx == HEADER_ROW_IDX:
                    header = row
                    continue

                # skip vessels with () inside, i.e., vessel has not been identified by provider
                if '(' in row[VESSEL_COLUMN_INDEX]:
                    continue

                # FIXME timecharter type
                if 'timecharter' in row[COMMODITY_COLUMN_INDEX].lower():
                    continue

                raw_item = {head: row[pos] for pos, head in enumerate(header)}
                # contextualise raw item with metadata
                raw_item.update(
                    provider_name='MRI', reported_date=reported_date, spider_name=self.name
                )

                if DataTypes.SpotCharter in self.produces:
                    yield normalize_charters.process_item(raw_item)
                # FIXME supposed to be `DataTypes.PortCall` here, but we don't want
                # data-dispatcher to consume data from these spiders and the ETL to create PCs
                else:
                    yield normalize_grades.process_item(raw_item)
Example #6
0
    def parse_mail(self, mail):
        """This method will be called for every mail the search_term matched.

        Each vessel movement has an associated uuid that is linked to a cargo's uuid,
        allowing for easy retrieval of a vessel's cargo movement.

        However, each vessel may contain multiple cargo movements with the same uuid,
        therefore we store the products as a list value against the uuid key.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:
        """
        # memoise reported date so it does not need to be called repeatedly later
        self.reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            # sanity check in case file is not a spreadsheet
            if not attachment.is_spreadsheet:
                continue

            sheet = xlrd.open_workbook(file_contents=attachment.body,
                                       on_demand=True).sheet_by_index(0)

            header = None

            # NOTE: the previous version enumerated both loops but never used
            # the indices; the unused enumerations have been removed
            for row in sheet.get_rows():
                row = [
                    xldate_to_datetime(cell.value,
                                       sheet.book.datemode).isoformat()
                    if is_xldate(cell) else cell.value
                    for cell in row
                ]

                if 'approach' in row[3]:
                    # sheet has 2 tables with different headers,
                    # first table has header split into 2
                    header = row
                    header[2] = 'Vessel'
                    header[5] = 'Cargo'
                    continue

                if 'Status' in row[1]:
                    header = row
                    continue

                # only yield data rows whose shape matches the current header
                if header and len(header) == len(row):
                    raw_item = {h: row[col_idx] for col_idx, h in enumerate(header)}
                    raw_item.update(provider_name=self.provider,
                                    reported_date=self.reported_date)
                    yield normalize.process_item(raw_item)
Example #7
0
    def parse_file(self, response):
        """Extract raw items from an excel port report.

        Args:
            response (scrapy.Response):

        Yields:
            Dict[str, str]:
        """
        sheet = xlrd.open_workbook(file_contents=response.body,
                                   on_demand=True).sheet_by_index(0)

        headers = None
        do_process = False
        for raw_row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value,
                                   sheet.book.datemode).isoformat()
                if is_xldate(cell) else may_strip(str(cell.value))
                for cell in raw_row
            ][:MAX_ROW_LENGTH]

            # completely empty rows carry no data and would break the `row[0]`
            # lookups below (the waiters lookup was previously unguarded)
            if not row:
                continue

            # toggle processing on/off around unwanted report sections so the
            # remarks field can be processed for cargo information
            if any(sub in row[0] for sub in RESUME_PROCESSING):
                do_process = True

            if any(sub in row[0] for sub in PAUSE_PROCESSING):
                do_process = False

            if not do_process:
                continue

            # vessels expected table
            if HEADER_SIGN in row:
                headers = row
                continue

            if headers and row[ETA_COL_IDX]:
                raw_item = {
                    headers[cell_idx]: cell
                    for cell_idx, cell in enumerate(row)
                }
                raw_item.update(self.meta_field)

                yield normalize.process_item(raw_item)

            # waiters info in text
            waiters = self.parse_waiters(row[0])
            if waiters:
                raw_item = {
                    str(cell_idx): cell
                    for cell_idx, cell in enumerate(waiters)
                }
                raw_item.update(self.meta_field)

                yield normalize.process_item(raw_item)
Example #8
0
    def _extract_reported_date(sheet, *coordinates):
        """Extract reported date listed inside sheet, as an ISO-8601 string.

        Args:
            sheet (xlrd.sheet):
            coordinates (int): (row, col) of cell where reported date is

        Returns:
            str: ISO-8601 formatted timestamp

        """
        raw_value = sheet.cell(*coordinates).value
        return xldate_to_datetime(raw_value, sheet.book.datemode).isoformat()
Example #9
0
    def parse_mail(self, mail):
        """This method will be called for every mail the search_term matched.

        Each vessel movement has an associated uuid that is linked to a cargo's uuid,
        allowing for easy retrieval of a vessel's cargo movement.

        However, each vessel may contain multiple cargo movements with the same uuid,
        therefore we store the products as a list value against the uuid key.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:
        """
        # memoise reported date so it does not need to be called repeatedly later
        self.reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            # sanity check in case file is not a spreadsheet
            if not attachment.is_spreadsheet:
                continue

            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_index(0)

            header = None
            for row_idx, raw_row in enumerate(sheet.get_rows()):
                # report separates date and timestamp into 2 cells; converting
                # the bare timestamp cells as xldates would raise, so columns
                # 2, 5 and 9 are kept as raw values
                row = []
                for col_idx, cell in enumerate(raw_row):
                    if is_xldate(cell) and col_idx not in (2, 5, 9):
                        row.append(
                            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        )
                    else:
                        row.append(cell.value)

                # first row always contains the headers
                if row_idx == 0:
                    header = row
                    continue

                if not header:
                    self.logger.warning('No headers are found.')
                    continue

                raw_item = {h: row[pos] for pos, h in enumerate(header)}
                raw_item.update(provider_name='Interadria',
                                reported_date=self.reported_date)

                yield normalize.process_item(raw_item)
Example #10
0
    def parse_mail(self, mail):
        """Extract data from each mail matched by the query spider argument.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't need to be called repeatedly later
        reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_index(0)

            # store state of the table, in order to get relevant rows to extract
            for row_idx, raw_row in enumerate(sheet.get_rows()):
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        row.append(
                            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        )
                    else:
                        row.append(cell.value)

                # ignore unnecessary rows
                if row_idx < 3:
                    continue

                # locate port row
                if row_idx == 3:
                    port = row[0].lower().replace('port: ', '')
                    continue

                # locate header row
                if row_idx == 4:
                    header = row
                    continue

                raw_item = {head: row[pos] for pos, head in enumerate(header)}
                # contextualise raw item with meta info
                raw_item.update(
                    reported_date=reported_date,
                    provider_name=self.provider,
                    port_name=port,
                    attachment_name=attachment.name,
                )
                yield from normalize.process_item(raw_item)
Example #11
0
    def parse_attachment(self, attachment_doc, reported_date):
        """Parse the single relevant sheet of an attachment and yield raw items.

        For spot charters we don't look at cargo information (already saved in
        the corresponding cargo movement); when several rows share the same
        charterer, vessel, arrival date and port, only one item is built.

        Args:
            attachment_doc (bytes): raw spreadsheet contents
            reported_date (str):

        Yields:
            Dict[str, str]:
        """
        # each xlsx file by this provider will only have one sheet
        sheet = xlrd.open_workbook(file_contents=attachment_doc, on_demand=True).sheet_by_name(
            'Market Data Report'
        )
        # set of dedupe keys already emitted (was a dict used as a set)
        spot_charters_seen = set()
        for row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else str(cell.value)
                for cell in row
            ]

            # remove empty rows before and after the main data table
            # denoted by a falsy value
            if not row[EMPTY_ROW_INDICATOR_INDEX]:
                continue

            # initialise headers
            if HEADER_PATTERN in row:
                header = row
                continue

            # extract data row
            raw_item = {head: row[head_idx] for head_idx, head in enumerate(header)}
            raw_item.update(reported_date=reported_date, provider_name=self.provider)
            if DataTypes.SpotCharter in self.produces:
                # build dedupe key once (was duplicated verbatim); a tuple key
                # also avoids collisions that plain string concatenation allows
                dedupe_key = (
                    raw_item['Charterer Name'],
                    raw_item['Vessel'],
                    raw_item['Arrival'].split(' ')[0],
                    raw_item['Port Name'],
                )
                if dedupe_key in spot_charters_seen:
                    continue
                spot_charters_seen.add(dedupe_key)
                yield normalize_charters.process_item(raw_item)
            elif DataTypes.Cargo in self.produces:
                yield normalize_grades.process_item(raw_item)
Example #12
0
    def parse_row(self, line):
        """Build an IOItem from a single row, or None if the row predates start_date."""
        row_datetime = xldate_to_datetime(self.rowl_at(line, 'date'),
                                          self.book.datemode)

        # discard rows older than the configured start date
        if self.start_date is not None and row_datetime < self.start_date:
            return None

        item = IOItem()
        item['unit'] = self.unit_str
        item['date'] = create_str_from_time(row_datetime)
        item['level_o'] = str_to_float(self.rowl_at(line, 'level_o'))
        item['input_o'] = self.rowl_at(line, 'input_o')
        item['output_o'] = self.rowl_at(line, 'output_o')
        item['src_file'] = self.url
        return item
Example #13
0
def normalize_date(raw_date):
    """Dealing with date extracted from excel, could be a date text or a number.

    Examples:
        >>> normalize_date(43373.0)
        '2018-09-30T00:00:00'
        >>> normalize_date('29.09.18')
        '2018-09-29T00:00:00'

    Args:
        raw_date (float | int | str):

    Returns:
        str: ISO-8601 formatted date

    """
    # excel libraries may yield date cells as ints as well as floats;
    # both go through the xldate conversion path
    if isinstance(raw_date, (int, float)):
        return xldate_to_datetime(raw_date, sheet_datemode=0).isoformat()
    return to_isoformat(raw_date, dayfirst=True)
Example #14
0
    def parse_mail(self, mail):
        """Extract data from each mail matched by the query spider argument.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't be recomputed for every yielded row
        # (previously `to_isoformat` ran once per data row, inside the loop)
        reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            # each xls file only has one sheet
            sheet = xlrd.open_workbook(file_contents=attachment.body,
                                       on_demand=True).sheet_by_index(0)

            # store state of the table, in order to get relevant rows to extract
            is_relevant = False
            for row in sheet.get_rows():
                row = [
                    xldate_to_datetime(cell.value,
                                       sheet.book.datemode).isoformat()
                    if is_xldate(cell) else cell.value for cell in row
                ]

                # discard irrelevant rows until we see the start pattern
                if any(sub in row
                       for sub in ('Voyage Reference', 'Vessel Name')):
                    is_relevant = True
                    header = row
                    continue

                # sanity check using first element in case of empty row
                if is_relevant and row[0]:
                    raw_item = {
                        head: row[idx]
                        for idx, head in enumerate(header)
                    }
                    # contextualise raw item with some meta info
                    raw_item.update(
                        reported_date=reported_date,
                        provider_name=self.provider,
                    )
                    yield from normalize.process_item(raw_item)
Example #15
0
    def parse_mail(self, mail):
        """Extract data from each mail matched by the query spider argument.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't need to be called repeatedly later
        reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            # only one sheet in the excel file is relevant
            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_index(RELEVANT_SHEET_INDEX)

            for raw_row in sheet.get_rows():
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        row.append(
                            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        )
                    else:
                        row.append(cell.value)

                # initialise headers and standardise as headers may have variations
                if HEADER_PATTERN in str(row):
                    header = [cell for cell in row if cell]
                    continue

                # remove empty, useless rows
                if not row[RELEVANT_ROW_INDICATOR]:
                    continue

                raw_item = {head: row[pos] for pos, head in enumerate(header)}
                # contextualise raw item with metadata
                raw_item.update(provider_name=self.provider,
                                reported_date=reported_date)
                yield normalize.process_item(raw_item)
Example #16
0
    def parse_mail(self, mail):
        """Extract mail found with specified email filters in spider arguments.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        for attachment in mail.attachments():
            # each xlsx file by this provider will only have one sheet
            sheet = xlrd.open_workbook(file_contents=attachment.body,
                                       on_demand=True).sheet_by_index(0)

            # explicit sentinel replaces the fragile `'header' in locals()` check,
            # and resetting it per attachment prevents a stale header from a
            # previous attachment leaking into the next one
            header = None

            for row in sheet.get_rows():
                row = [
                    xldate_to_datetime(cell.value,
                                       sheet.book.datemode).isoformat()
                    if is_xldate(cell) else cell.value for cell in row
                ]

                # remove empty filler rows before and after the main data table
                if not row[EMPTY_ROW_INDICATOR_INDEX]:
                    continue

                # initialise headers
                if HEADER_PATTERN in row:
                    header = row
                    continue

                # sanity check, just in case we somehow miss the headers
                # due to changes in table structures
                if header is not None:
                    raw_item = {
                        head: row[idx]
                        for idx, head in enumerate(header) if head
                    }
                    # contextualise raw item with metadata
                    raw_item.update(provider_name='Affinity',
                                    spider_name=self.name)
                    yield normalize.process_item(raw_item)
Example #17
0
    def parse_sheet(self, sheet):
        """Extract raw table data from specified sheet.

        Args:
            sheet (xlrd.Sheet):

        Yields:
            Dict[str, str]:

        """
        header = None
        for row_idx, raw_row in enumerate(sheet.get_rows()):
            # first row will always be the header
            if row_idx == 0:
                header = [cell.value for cell in raw_row]
                continue

            values = []
            for cell in raw_row:
                if is_xldate(cell):
                    values.append(
                        xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                    )
                else:
                    values.append(cell.value)

            yield {head: values[pos] for pos, head in enumerate(header)}
Example #18
0
    def parse_mail(self, mail):
        """Extract data from each mail matched by the query spider argument.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't need to be called repeatedly later
        reported_date = to_isoformat(mail.envelope['date'])

        for attachment in mail.attachments():
            # sheet 0 contains historical info, but we want monthly info in sheet 1
            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_index(1)

            # store state of the table, in order to get relevant rows to extract
            for row_idx, raw_row in enumerate(sheet.get_rows()):
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        row.append(
                            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                        )
                    else:
                        row.append(cell.value)

                # initialise headers
                if row_idx == 0:
                    header = row
                    continue

                raw_item = {head: row[pos] for pos, head in enumerate(header)}
                # contextualise raw item with meta info
                raw_item.update(reported_date=reported_date,
                                provider_name=self.provider)
                yield normalize.process_item(raw_item)
Example #19
0
    def parse_attachment(self, attachment_doc, reported_date):
        """Parse every non-blacklisted sheet of an attachment and yield raw items.

        Args:
            attachment_doc (bytes): raw spreadsheet contents
            reported_date (str):

        Yields:
            Dict[str, str]:
        """
        workbook = xlrd.open_workbook(file_contents=attachment_doc,
                                      on_demand=True)

        for sheet in workbook.sheets():
            # guard clause flattens the previous whole-body nesting
            if sheet.name.lower() in BLACKLIST_SHEETS:
                continue

            # assign variable to detect which row to start processing
            start_processing = False
            raw_port_name = None

            # store state of the table, in order to get relevant rows to extract
            for raw_row in sheet.get_rows():
                # Include handling of xlrd.xldate.XLDateAmbiguous cases
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        try:
                            cell = xldate_to_datetime(
                                cell.value,
                                sheet.book.datemode).isoformat()
                        except Exception:
                            cell = str(cell.value)

                    else:
                        cell = str(cell.value)

                    row.append(cell)

                # completely empty rows would break the `row[0]` lookups below
                if not row:
                    continue

                # detect portname as some tabs/attachments have the port name above
                # the data row
                if 'Port' in row[0]:
                    raw_port_name = row[0]
                    continue

                # detect relevant row
                if row[0] in START_PROCESSING_WORD:
                    start_processing = True
                    header = row
                    continue

                # detect irrelevant row
                if row[0] == STOP_PROCESSING_WORD:
                    start_processing = False
                    continue

                if not start_processing:
                    continue

                # remove unnecessary rows
                if row[0] == '' or 'Without' in row[0] or ':' in row[0]:
                    continue

                raw_item = {
                    head: row[idx]
                    for idx, head in enumerate(header)
                }
                # contextualise raw item with meta info
                raw_item.update(
                    reported_date=reported_date,
                    provider_name=self.provider,
                    raw_port_name=raw_port_name,
                )

                if DataTypes.SpotCharter in self.produces:
                    yield normalize_charters.process_item(raw_item)

                if DataTypes.Cargo in self.produces:
                    yield from normalize_grades.process_item(raw_item)
Example #20
0
    def parse_workbook_content(self, response):
        """Parse workbook content.

        Excel file contains 3 tables: berthed table, arrival table and eta table in that order.
        Both arrival and berthed tables use the same headers.

        Table state is kept on `self` (arrival_table/berthed_table/eta_table flags
        plus their headers), toggled by marker strings found in specific columns.

        Args:
            response (scrapy.HtmlResponse):

        Yields:
            dict[str, str]:

        """
        sheet = parser.get_xlsx_sheet(response)

        for idx, row in enumerate(sheet.get_rows()):
            # convert xldate cells to ISO-8601; stringify everything else
            row = [
                xldate_to_datetime(cell.value,
                                   sheet.book.datemode).isoformat()
                if is_xldate(cell) else str(cell.value) for cell in row
            ]

            # flag to indicate extraction of arrival table
            if not self.arrival_table and 'No' in row[0]:
                self.arrival_table = True
                self.berthed_table = False
                self.eta_table = False
                self.arrival_header = row
                continue

            # flag to indicate extraction of berthed table
            if not self.berthed_table and 'NAME OF SHIP' in row[2]:
                self.arrival_table = False
                self.berthed_table = True
                self.eta_table = False
                self.berthed_header = row
                continue

            # flag to indicate extraction of eta table
            if not self.eta_table and 'EXPECTED VESSEL' in row[2]:
                self.arrival_table = False
                self.berthed_table = False
                self.eta_table = True
                # eta table does not have headers; they are derived from arrival table
                self.eta_header = self.arrival_header
                continue

            # third table row contains `reported_date`
            # NOTE(review): assumes row idx 2 is reached before any data row;
            # otherwise `reported_date` below would be unbound — confirm layout
            if idx == 2:
                reported_date = parser._extract_reported_date(row[0])
                continue

            # extract berthed rows, berthed rows have a value for the 8th cell
            if self.berthed_table and row[1] and 'THIS MAY ALTER' not in row[1]:
                # vessels that are 'ships to follow' are processed in the other tables
                if 'AT BERTH' not in row[1]:
                    continue
                raw_item = parser._map_row_to_dict(
                    row,
                    self.berthed_header,
                    event='berthed',
                    port_name=self.provider,
                    provider_name=self.provider,
                    reported_date=reported_date,
                )
                yield normalize.process_item(raw_item)

            # extract eta rows, eta rows have a value for the 4th cell
            if self.eta_table and row[3]:
                raw_item = parser._map_row_to_dict(
                    row,
                    self.eta_header,
                    event='eta',
                    port_name=self.provider,
                    provider_name=self.provider,
                    reported_date=reported_date,
                )
                yield normalize.process_item(raw_item)

            # extract arrival rows, arrival rows have a value for the 4th cell
            if self.arrival_table and row[3]:
                raw_item = parser._map_row_to_dict(
                    row,
                    self.arrival_header,
                    event='arrival',
                    port_name=self.provider,
                    provider_name=self.provider,
                    reported_date=reported_date,
                )
                yield normalize.process_item(raw_item)
Example #21
0
    def parse_mail(self, mail):
        """Extract data from each mail matched by the query spider argument.

        Args:
            mail (Mail):

        Yields:
            Dict[str, str]:

        """
        # memoise reported_date so it won't need to be called repeatedly later
        reported_date = self.extract_reported_date(mail.envelope['subject'])

        for attachment in mail.attachments():
            if not attachment.is_spreadsheet:
                continue

            for sheet in xlrd.open_workbook(file_contents=attachment.body,
                                            on_demand=True).sheets():
                if sheet.name.lower() not in RELEVANT_NAMES:
                    continue

                # static sheet info (start row, vessel column, headers) is
                # invariant per sheet; look it up once instead of on every row,
                # and skip sheets missing from the mapping instead of crashing
                _static_info = parser.HEADER_SHEET_MAPPING.get(sheet.name.lower())
                if _static_info is None:
                    continue
                header = _static_info[2]

                # store state of the table, in order to get relevant rows to extract
                for idx, raw_row in enumerate(sheet.get_rows()):
                    row = []
                    # to handle is xldate exception
                    for cell in raw_row:
                        if is_xldate(cell):
                            try:
                                cell = xldate_to_datetime(
                                    cell.value,
                                    sheet.book.datemode).isoformat()
                            except Exception:
                                cell = str(cell.value)

                        else:
                            cell = str(cell.value)

                        row.append(cell)

                    # ignore unnecessary rows before header row
                    if idx < _static_info[0]:
                        continue

                    # ignore rows where vessel column is empty
                    # can be put in normalize but this file has a lot of empty rows
                    # at the end of the report. Putting it here would cut the noise
                    if not row[_static_info[1]]:
                        continue

                    raw_item = {
                        head: row[col_idx]
                        for col_idx, head in enumerate(header)
                    }
                    # contextualise raw item with meta info
                    raw_item.update(reported_date=reported_date,
                                    provider_name=self.provider)
                    yield from normalize.process_item(raw_item)