def parse_sheet_rows(self, sheet, post_process=lambda x: x):
    """Parse and process rows of a spreadsheet.

    Args:
        sheet (xlrd.Sheet):
        post_process (Callable[[List[str]], List[str]]):

    Yields:
        Dict[str, str]:

    """
    header = None
    for idx, row in enumerate(sheet.get_rows()):
        row = [
            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
            if is_xldate(cell)
            else may_strip(cell.value)
            for cell in row
        ]

        # headers will always be the first row of any sheet from this source
        if idx == 0:
            header = row
            continue

        # post-process row according to caller specifications
        row = post_process(row)
        if row:
            yield {head: row[head_idx] for head_idx, head in enumerate(header)}

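# NOTE: `is_xldate` and `xldate_to_datetime` are shared helpers used throughout
# these parsers but not shown here. A minimal sketch of what they might look
# like on top of xlrd's public API (an assumption, not the canonical
# implementation):
import xlrd


def is_xldate(cell):
    # Excel stores dates as serial floats; xlrd flags such cells with XL_CELL_DATE
    return cell.ctype == xlrd.XL_CELL_DATE


def xldate_to_datetime(value, datemode):
    # convert an Excel serial date to a naive datetime, honouring the
    # workbook's 1900/1904 date system
    return xlrd.xldate.xldate_as_datetime(value, datemode)
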
def parse_attachment(
    self, attachment_doc, reported_date, sheet_list, get_header, pn_row, pn, doc_name
):
    for sheet in xlrd.open_workbook(file_contents=attachment_doc, on_demand=True).sheets():
        # only parse relevant sheets
        if any(sub in sheet.name.lower() for sub in ['sheet1', 'new']) or not any(
            sub in sheet.name.lower() for sub in sheet_list
        ):
            continue

        first_row, second_row = None, None
        port_name = pn if pn else pn_row
        for idx, raw_row in enumerate(sheet.get_rows()):
            # handle xldate conversion exceptions cell by cell
            row = []
            for cell in raw_row:
                if is_xldate(cell):
                    try:
                        cell = xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                    except Exception:
                        cell = str(cell.value)
                else:
                    cell = str(cell.value)
                row.append(cell)

            # get port name
            if isinstance(port_name, int) and idx == pn_row:
                port_name = row[1]

            # headers are split across 2 rows; combine them so the parser adapts
            # across sheets instead of hardcoding column names
            if idx in get_header:
                if idx == get_header[0]:
                    first_row = row
                    continue
                if idx == get_header[1]:
                    second_row = row
                    continue

            if first_row and second_row:
                headers = self.combine_rows(first_row, second_row)

                # only process relevant rows after the header rows
                if idx > get_header[1]:
                    raw_item = {
                        may_strip(head): row[col_idx] for col_idx, head in enumerate(headers)
                    }
                    # contextualise raw item with meta info
                    raw_item.update(
                        reported_date=reported_date,
                        provider_name=self.provider,
                        port_name=port_name,
                        file_name=doc_name,
                    )
                    yield normalize.process_item(raw_item)

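# NOTE: `combine_rows` is a method on the spider class, not shown here. A
# minimal sketch, assuming it merges a header split across two rows by joining
# the non-empty fragments of each column:
def combine_rows(self, first_row, second_row):
    # join the top and bottom fragments of each header cell, skipping blanks
    return [
        may_strip(' '.join(fragment for fragment in (top, bottom) if fragment))
        for top, bottom in zip(first_row, second_row)
    ]
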
def parse_mail(self, mail):
    """The method will be called for every mail the search_term matched.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    for attachment in mail.attachments():
        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            # only process the sheets we want
            if not any(sub in sheet.name.lower() for sub in ('export', 'import')):
                continue

            sheet_name = sheet.name.lower()
            for idx, raw_row in enumerate(sheet.get_rows()):
                # handle xlrd.xldate.XLDateAmbiguous cases cell by cell
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        try:
                            cell = xldate_to_datetime(
                                cell.value, sheet.book.datemode
                            ).isoformat()
                        except Exception:
                            cell = str(cell.value)
                    else:
                        cell = str(cell.value)
                    row.append(cell)

                if idx == 0:
                    reported_date = to_isoformat(row[1], dayfirst=True)
                    continue

                # second row will always contain the header; extract it
                if idx == 1:
                    header = row
                    continue

                # extract data row
                raw_item = {head: row[head_idx] for head_idx, head in enumerate(header)}
                raw_item.update(
                    reported_date=reported_date,
                    provider_name=self.provider,
                    sheet_name=sheet_name,
                )
                if DataTypes.SpotCharter in self.produces:
                    yield normalize_charters.process_item(raw_item)
                else:
                    yield normalize_grades.process_item(raw_item)

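# NOTE: `to_isoformat` is another shared helper. A minimal sketch on top of
# python-dateutil (an assumption, not the canonical implementation):
from dateutil.parser import parse as parse_date


def to_isoformat(raw_date, **parse_kwargs):
    # parse a fuzzy date string (e.g. '02/01/2019' with dayfirst=True)
    # and serialise it as an ISO-8601 string
    return parse_date(raw_date, **parse_kwargs).isoformat()
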
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    for attachment in mail.attachments():
        # extract reported date from the attachment name once, so it won't need
        # to be computed repeatedly later
        _match = re.match(r'.*DBE\s(\d+)', attachment.name)
        if not _match:
            raise ValueError(f'Unknown reported date format: {attachment.name}')
        reported_date = _match.group(1)

        # each xlsx file by this provider will only have one sheet
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        # extract raw data from sheet
        for idx, row in enumerate(sheet.get_rows()):
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else str(cell.value)
                for cell in row
            ]

            # initialise headers
            if idx == HEADER_ROW_IDX:
                header = row
                continue

            # skip vessels with () inside, i.e. vessel has not been identified by provider
            if '(' in row[VESSEL_COLUMN_INDEX]:
                continue

            # FIXME timecharter type
            if 'timecharter' in row[COMMODITY_COLUMN_INDEX].lower():
                continue

            raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
            # contextualise raw item with metadata
            raw_item.update(
                provider_name='MRI', reported_date=reported_date, spider_name=self.name
            )
            if DataTypes.SpotCharter in self.produces:
                yield normalize_charters.process_item(raw_item)
            # FIXME supposed to be `DataTypes.PortCall` here, but we don't want
            # data-dispatcher to consume data from these spiders and the ETL to create PCs
            else:
                yield normalize_grades.process_item(raw_item)

def parse_mail(self, mail):
    """This method will be called for every mail the search_term matched.

    Each vessel movement has an associated uuid that is linked to a cargo's uuid,
    allowing for easy retrieval of a vessel's cargo movement. However, each vessel
    may contain multiple cargo movements with the same uuid, therefore we store
    the products as a list value against the uuid key.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported date so it does not need to be computed repeatedly later
    self.reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        # sanity check in case file is not a spreadsheet
        if not attachment.is_spreadsheet:
            continue

        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        header = None
        for row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            if 'approach' in row[3]:
                # sheet has 2 tables with different headers;
                # the first table has its header split across 2 rows
                header = row
                header[2] = 'Vessel'
                header[5] = 'Cargo'
                continue

            if 'Status' in row[1]:
                header = row
                continue

            if header and len(header) == len(row):
                raw_item = {h: row[col_idx] for col_idx, h in enumerate(header)}
                raw_item.update(provider_name=self.provider, reported_date=self.reported_date)
                yield normalize.process_item(raw_item)

def parse_file(self, response):
    sheet = xlrd.open_workbook(file_contents=response.body, on_demand=True).sheet_by_index(0)

    headers = None
    do_process = False
    for raw_row in sheet.get_rows():
        row = [
            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
            if is_xldate(cell)
            else may_strip(str(cell.value))
            for cell in raw_row
        ][:MAX_ROW_LENGTH]

        # filter out unwanted portions so the remarks field (which holds cargo
        # information) is easier to process; this holds true as long as the
        # field names don't change
        if row and any(sub in row[0] for sub in RESUME_PROCESSING):
            do_process = True
        if row and any(sub in row[0] for sub in PAUSE_PROCESSING):
            do_process = False

        if not do_process:
            continue

        # vessels expected table
        if HEADER_SIGN in row:
            headers = row
            continue

        if headers and row[ETA_COL_IDX]:
            raw_item = {headers[cell_idx]: cell for cell_idx, cell in enumerate(row)}
            raw_item.update(self.meta_field)
            yield normalize.process_item(raw_item)

        # waiters info in text
        waiters = self.parse_waiters(row[0])
        if waiters:
            raw_item = {str(cell_idx): cell for cell_idx, cell in enumerate(waiters)}
            raw_item.update(self.meta_field)
            yield normalize.process_item(raw_item)

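# NOTE: hypothetical shapes for the module-level constants used above; the
# actual values depend on the source report and are assumptions here:
MAX_ROW_LENGTH = 12                        # cut off trailing junk columns
HEADER_SIGN = 'Vessel'                     # cell value marking the header row
ETA_COL_IDX = 3                            # data rows always carry an ETA here
RESUME_PROCESSING = ('VESSELS EXPECTED',)  # markers that start a relevant section
PAUSE_PROCESSING = ('REMARKS',)            # markers that end a relevant section
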
def parse_mail(self, mail):
    """This method will be called for every mail the search_term matched.

    Each vessel movement has an associated uuid that is linked to a cargo's uuid,
    allowing for easy retrieval of a vessel's cargo movement. However, each vessel
    may contain multiple cargo movements with the same uuid, therefore we store
    the products as a list value against the uuid key.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported date so it does not need to be computed repeatedly later
    self.reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        # sanity check in case file is not a spreadsheet
        if not attachment.is_spreadsheet:
            continue

        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        header = None
        for idx, row in enumerate(sheet.get_rows()):
            # the report separates date and timestamp into 2 cells, which would
            # cause the conversion to try to turn bare timestamps into dates and
            # throw an error (affected indices: 2, 5, 9)
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell) and cell_idx not in (2, 5, 9)
                else cell.value
                for cell_idx, cell in enumerate(row)
            ]

            if idx == 0:
                header = row
                continue

            if header:
                raw_item = {h: row[col_idx] for col_idx, h in enumerate(header)}
                raw_item.update(provider_name='Interadria', reported_date=self.reported_date)
                yield normalize.process_item(raw_item)
            else:
                self.logger.warning('No headers were found.')

def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported_date so it won't need to be computed repeatedly later
    reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        # track row position in order to locate the port and header rows
        # before extracting data rows
        for idx, row in enumerate(sheet.get_rows()):
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            # ignore unnecessary rows
            if idx < 3:
                continue

            # locate port row
            if idx == 3:
                port = row[0].lower().replace('port: ', '')
                continue

            # locate header row
            if idx == 4:
                header = row
                continue

            raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
            # contextualise raw item with meta info
            raw_item.update(
                reported_date=reported_date,
                provider_name=self.provider,
                port_name=port,
                attachment_name=attachment.name,
            )
            yield from normalize.process_item(raw_item)

def parse_attachment(self, attachment_doc, reported_date):
    # each xlsx file by this provider will only have one sheet
    sheet = xlrd.open_workbook(file_contents=attachment_doc, on_demand=True).sheet_by_name(
        'Market Data Report'
    )

    # for this spot charter we don't look at cargo information (already saved
    # in the corresponding cargo movement);
    # when several rows for a vessel share the same charterer, port and arrival
    # values, we only build one item
    spot_charters_seen = set()
    for row in sheet.get_rows():
        row = [
            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
            if is_xldate(cell)
            else str(cell.value)
            for cell in row
        ]

        # remove empty rows before and after the main data table,
        # denoted by a falsy value
        if not row[EMPTY_ROW_INDICATOR_INDEX]:
            continue

        # initialise headers
        if HEADER_PATTERN in row:
            header = row
            continue

        # extract data row
        raw_item = {head: row[head_idx] for head_idx, head in enumerate(header)}
        raw_item.update(reported_date=reported_date, provider_name=self.provider)
        if DataTypes.SpotCharter in self.produces:
            dedup_key = (
                raw_item['Charterer Name'],
                raw_item['Vessel'],
                raw_item['Arrival'].split(' ')[0],
                raw_item['Port Name'],
            )
            if dedup_key in spot_charters_seen:
                continue

            spot_charters_seen.add(dedup_key)
            yield normalize_charters.process_item(raw_item)
        elif DataTypes.Cargo in self.produces:
            yield normalize_grades.process_item(raw_item)

def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported_date so it won't need to be computed repeatedly later
    reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        # only one sheet in the excel file is relevant
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(RELEVANT_SHEET_INDEX)

        for row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            # initialise headers and standardise them, as headers may have variations
            if HEADER_PATTERN in str(row):
                header = [cell for cell in row if cell]
                continue

            # remove empty, useless rows
            if not row[RELEVANT_ROW_INDICATOR]:
                continue

            raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
            # contextualise raw item with metadata
            raw_item.update(provider_name=self.provider, reported_date=reported_date)
            yield normalize.process_item(raw_item)

def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    for attachment in mail.attachments():
        # each xls file only has one sheet
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        # store state of the table, in order to get relevant rows to extract
        is_relevant = False
        for row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            # discard irrelevant rows until we see the start pattern
            if any(sub in row for sub in ('Voyage Reference', 'Vessel Name')):
                is_relevant = True
                header = row
                continue

            # sanity check using the first element, in case of empty rows
            if is_relevant and row[0]:
                raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
                # contextualise raw item with some meta info
                raw_item.update(
                    reported_date=to_isoformat(mail.envelope['date']),
                    provider_name=self.provider,
                )
                yield from normalize.process_item(raw_item)

def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    for attachment in mail.attachments():
        # each xlsx file by this provider will only have one sheet
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        header = None
        for row in sheet.get_rows():
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            # remove empty filler rows before and after the main data table
            if not row[EMPTY_ROW_INDICATOR_INDEX]:
                continue

            # initialise headers
            if HEADER_PATTERN in row:
                header = row
                continue

            # sanity check, just in case we somehow miss the headers
            # due to changes in table structures
            if header:
                raw_item = {head: row[col_idx] for col_idx, head in enumerate(header) if head}
                # contextualise raw item with metadata
                raw_item.update(provider_name='Affinity', spider_name=self.name)
                yield normalize.process_item(raw_item)

def parse_sheet(self, sheet):
    """Extract raw table data from specified sheet.

    Args:
        sheet (xlrd.Sheet):

    Yields:
        Dict[str, str]:

    """
    header = None
    for idx, row in enumerate(sheet.get_rows()):
        # first row will always be the header
        if idx == 0:
            header = [cell.value for cell in row]
            continue

        row = [
            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
            if is_xldate(cell)
            else cell.value
            for cell in row
        ]
        yield {head: row[head_idx] for head_idx, head in enumerate(header)}

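# NOTE: usage sketch for `parse_sheet`, assuming a spider instance and a local
# workbook path (both hypothetical):
def dump_first_sheet(spider, path='report.xls'):
    # stream every raw item parsed from the first sheet of a local workbook
    book = xlrd.open_workbook(path, on_demand=True)
    for raw_item in spider.parse_sheet(book.sheet_by_index(0)):
        print(raw_item)
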
def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported_date so it won't need to be computed repeatedly later
    reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        # sheet 0 contains historical info, but we want the monthly info in sheet 1
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(1)

        for idx, row in enumerate(sheet.get_rows()):
            row = [
                xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
                if is_xldate(cell)
                else cell.value
                for cell in row
            ]

            # initialise headers
            if idx == 0:
                header = row
                continue

            raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
            # contextualise raw item with meta info
            raw_item.update(reported_date=reported_date, provider_name=self.provider)
            yield normalize.process_item(raw_item)

def parse_attachment(self, attachment_doc, reported_date):
    workbook = xlrd.open_workbook(file_contents=attachment_doc, on_demand=True)
    for sheet in workbook.sheets():
        if sheet.name.lower() not in BLACKLIST_SHEETS:
            # flag to detect which row to start processing from
            start_processing = False
            raw_port_name = None

            # store state of the table, in order to get relevant rows to extract
            for raw_row in sheet.get_rows():
                # handle xlrd.xldate.XLDateAmbiguous cases cell by cell
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        try:
                            cell = xldate_to_datetime(
                                cell.value, sheet.book.datemode
                            ).isoformat()
                        except Exception:
                            cell = str(cell.value)
                    else:
                        cell = str(cell.value)
                    row.append(cell)

                # detect the port name, as some tabs/attachments have it
                # above the data rows
                if 'Port' in row[0]:
                    raw_port_name = row[0]
                    continue

                # detect relevant row
                if row[0] in START_PROCESSING_WORD:
                    start_processing = True
                    header = row
                    continue

                # detect irrelevant row
                if row[0] == STOP_PROCESSING_WORD:
                    start_processing = False
                    continue

                if start_processing:
                    # remove unnecessary rows
                    if row[0] == '' or 'Without' in row[0] or ':' in row[0]:
                        continue

                    raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
                    # contextualise raw item with meta info
                    raw_item.update(
                        reported_date=reported_date,
                        provider_name=self.provider,
                        raw_port_name=raw_port_name,
                    )
                    if DataTypes.SpotCharter in self.produces:
                        yield normalize_charters.process_item(raw_item)
                    if DataTypes.Cargo in self.produces:
                        yield from normalize_grades.process_item(raw_item)

def parse_workbook_content(self, response):
    """Parse workbook content.

    Excel file contains 3 tables: a berthed table, an arrival table and an eta
    table, in that order. Both the arrival and berthed tables use the same headers.

    Args:
        response (scrapy.HtmlResponse):

    Yields:
        Dict[str, str]:

    """
    sheet = parser.get_xlsx_sheet(response)
    for idx, row in enumerate(sheet.get_rows()):
        row = [
            xldate_to_datetime(cell.value, sheet.book.datemode).isoformat()
            if is_xldate(cell)
            else str(cell.value)
            for cell in row
        ]

        # flag to indicate extraction of arrival table
        if not self.arrival_table and 'No' in row[0]:
            self.arrival_table = True
            self.berthed_table = False
            self.eta_table = False
            self.arrival_header = row
            continue

        # flag to indicate extraction of berthed table
        if not self.berthed_table and 'NAME OF SHIP' in row[2]:
            self.arrival_table = False
            self.berthed_table = True
            self.eta_table = False
            self.berthed_header = row
            continue

        # flag to indicate extraction of eta table
        if not self.eta_table and 'EXPECTED VESSEL' in row[2]:
            self.arrival_table = False
            self.berthed_table = False
            self.eta_table = True
            # eta table does not have headers; they are derived from the arrival table
            self.eta_header = self.arrival_header
            continue

        # third sheet row contains `reported_date`
        if idx == 2:
            reported_date = parser._extract_reported_date(row[0])
            continue

        # extract berthed rows; berthed rows have a value for the 2nd cell
        if self.berthed_table and row[1] and 'THIS MAY ALTER' not in row[1]:
            # vessels that are 'ships to follow' are processed in the other tables
            if 'AT BERTH' not in row[1]:
                continue

            raw_item = parser._map_row_to_dict(
                row,
                self.berthed_header,
                event='berthed',
                port_name=self.provider,
                provider_name=self.provider,
                reported_date=reported_date,
            )
            yield normalize.process_item(raw_item)

        # extract eta rows; eta rows have a value for the 4th cell
        if self.eta_table and row[3]:
            raw_item = parser._map_row_to_dict(
                row,
                self.eta_header,
                event='eta',
                port_name=self.provider,
                provider_name=self.provider,
                reported_date=reported_date,
            )
            yield normalize.process_item(raw_item)

        # extract arrival rows; arrival rows have a value for the 4th cell
        if self.arrival_table and row[3]:
            raw_item = parser._map_row_to_dict(
                row,
                self.arrival_header,
                event='arrival',
                port_name=self.provider,
                provider_name=self.provider,
                reported_date=reported_date,
            )
            yield normalize.process_item(raw_item)

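# NOTE: `parser._map_row_to_dict` is a project helper not shown here. A minimal
# sketch, assuming it zips a row against a header and merges in the extra
# keyword metadata:
def _map_row_to_dict(row, header, **extra):
    # map each cell onto its header, then attach contextual metadata
    item = {head: row[col_idx] for col_idx, head in enumerate(header)}
    item.update(extra)
    return item
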
def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    # memoise reported_date so it won't need to be computed repeatedly later
    reported_date = self.extract_reported_date(mail.envelope['subject'])

    for attachment in mail.attachments():
        if not attachment.is_spreadsheet:
            continue

        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            if sheet.name.lower() not in RELEVANT_NAMES:
                continue

            # retrieve static information for this sheet:
            # (header row index, vessel column index, header list)
            _static_info = parser.HEADER_SHEET_MAPPING.get(sheet.name.lower())
            header = _static_info[2]

            # store state of the table, in order to get relevant rows to extract
            for idx, raw_row in enumerate(sheet.get_rows()):
                # handle xlrd.xldate.XLDateAmbiguous cases cell by cell
                row = []
                for cell in raw_row:
                    if is_xldate(cell):
                        try:
                            cell = xldate_to_datetime(
                                cell.value, sheet.book.datemode
                            ).isoformat()
                        except Exception:
                            cell = str(cell.value)
                    else:
                        cell = str(cell.value)
                    row.append(cell)

                # ignore unnecessary rows before the header row
                if idx < _static_info[0]:
                    continue

                # ignore rows where the vessel column is empty;
                # this could be done in normalize, but the file has a lot of
                # empty rows at the end of the report, so filtering here cuts the noise
                if not row[_static_info[1]]:
                    continue

                raw_item = {head: row[col_idx] for col_idx, head in enumerate(header)}
                # contextualise raw item with meta info
                raw_item.update(reported_date=reported_date, provider_name=self.provider)
                yield from normalize.process_item(raw_item)

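# NOTE: a sketch of the shape `HEADER_SHEET_MAPPING` appears to have, given how
# `_static_info` is indexed above; the sheet names, indices and headers below
# are hypothetical:
HEADER_SHEET_MAPPING = {
    # sheet name: (header row index, vessel column index, header list)
    'import': (2, 0, ['Vessel', 'ETA', 'Berth', 'Cargo', 'Quantity']),
    'export': (3, 1, ['Berth', 'Vessel', 'ETB', 'Cargo', 'Quantity']),
}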