def parse(self, url, resp_text):
    sel = Selector(resp_text)
    directories = []
    files = []
    for anchor_sel in sel.css('a'):
        link = anchor_sel.css('::attr(href)').extract_first()
        # Skip anchors without an href as well as '.'/'..' entries.
        if not link or link.startswith('.'):
            continue
        link_url = f'{url}{link}'
        if link.endswith('/'):
            directories.append(link_url)
        else:
            # The text node following the anchor holds the file's
            # last-modified timestamp and size (Apache-style listing).
            fingerprint = anchor_sel.xpath(
                './following-sibling::text()[1]').extract_first()
            match = re.match(
                r'\s*(\d+-\w+-\d+ \d+:\d+)\s+(\d+)', fingerprint)
            last_modified = dateutil.parser.parse(
                match.group(1)).replace(tzinfo=tzutc())
            file_size = int(match.group(2))
            parser_cls = get_parser(link)
            if parser_cls and not parser_cls(url=link_url).should_skip():
                files.append({
                    'url': link_url,
                    'parser': parser_cls.__name__,
                    'last_modified': last_modified,
                    'file_size': file_size,
                })
    self.logger.debug(
        "Found %d directories and %d files at %s",
        len(directories), len(files), url)
    yield from files
    # Recurse into subdirectories.
    for dir_url in directories:
        yield from self.poll_url(dir_url)
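# A minimal, hedged sketch of the fingerprint extraction above, run against
# a sample line in the Apache-style directory listing format served by
# https://opendata.dwd.de (the sample text is an illustrative assumption,
# not taken from the original module):
import re

import dateutil.parser
from dateutil.tz import tzutc

sample = '                 17-Jun-2020 11:48       12345\n'
match = re.match(r'\s*(\d+-\w+-\d+ \d+:\d+)\s+(\d+)', sample)
last_modified = dateutil.parser.parse(match.group(1)).replace(tzinfo=tzutc())
file_size = int(match.group(2))
assert last_modified.isoformat() == '2020-06-17T11:48:00+00:00'
assert file_size == 12345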
def poll(enqueue=False):
    updated_files = DWDPoller().poll()
    if enqueue:
        from brightsky.worker import huey, process
        # Release locks that have been held for over 30 minutes.
        if (expired_locks := huey.expire_locks(1800)):
            logger.warning(
                'Removed expired locks: %s', ', '.join(expired_locks))
        pending_urls = [
            t.args[0] for t in huey.pending() if t.name == 'process']
        enqueued = 0
        for updated_file in updated_files:
            url = updated_file['url']
            if url in pending_urls:
                logger.debug('Skipping "%s": already queued', url)
                continue
            elif huey.is_locked(url):
                logger.debug('Skipping "%s": already running', url)
                continue
            logger.debug('Enqueueing "%s"', url)
            parser_cls = get_parser(os.path.basename(url))
            process(url, priority=parser_cls.PRIORITY)
            enqueued += 1
        logger.info(
            'Enqueued %d updated files for processing. Queue size: %d',
            enqueued, enqueued + len(pending_urls))
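# Hedged sketch of the worker side that poll() relies on: a huey task named
# 'process' taking the file URL as its first positional argument, guarded by
# a per-URL lock so poll() can skip files that are currently being parsed.
# expire_locks() and is_locked() are custom extensions in brightsky.worker;
# everything below is an assumed shape, not the original implementation.
from huey import RedisHuey

huey = RedisHuey()  # stands in for brightsky.worker.huey

@huey.task()
def process(url):
    # Hold a lock keyed on the URL for the duration of the parse. (The
    # original call site also passes a call-time priority, omitted here.)
    with huey.lock_task(url):
        ...  # download, parse, and export the file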
def test_get_parser():
    synop_with_timestamp = (
        'Z__C_EDZW_20200617114802_bda01,synop_bufr_GER_999999_999999__MW_617'
        '.json.bz2')
    synop_latest = (
        'Z__C_EDZW_latest_bda01,synop_bufr_GER_999999_999999__MW_XXX'
        '.json.bz2')
    expected = {
        '10minutenwerte_extrema_wind_00427_akt.zip': (
            WindGustsObservationsParser),
        'stundenwerte_FF_00011_akt.zip': WindObservationsParser,
        'stundenwerte_FF_00090_akt.zip': WindObservationsParser,
        'stundenwerte_N_01766_akt.zip': CloudCoverObservationsParser,
        'stundenwerte_P0_00096_akt.zip': PressureObservationsParser,
        'stundenwerte_RR_00102_akt.zip': PrecipitationObservationsParser,
        'stundenwerte_SD_00125_akt.zip': SunshineObservationsParser,
        'stundenwerte_TD_01766.zip': DewPointObservationsParser,
        'stundenwerte_TU_00161_akt.zip': TemperatureObservationsParser,
        'stundenwerte_VV_00161_akt.zip': VisibilityObservationsParser,
        'MOSMIX_S_LATEST_240.kmz': MOSMIXParser,
        'K611_-BEOB.csv': CurrentObservationsParser,
        synop_with_timestamp: SYNOPParser,
        synop_latest: None,
    }
    for filename, expected_parser in expected.items():
        assert get_parser(filename) is expected_parser
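# A hedged sketch of how get_parser could dispatch on file names, inferred
# purely from the expectations in the test above; the real mapping lives in
# brightsky, and the pattern table here is an illustrative assumption.
import re

_PARSER_PATTERNS = {
    r'10minutenwerte_extrema_wind_': WindGustsObservationsParser,
    r'stundenwerte_FF_': WindObservationsParser,
    r'stundenwerte_TU_': TemperatureObservationsParser,
    # ... remaining observation products elided ...
    r'MOSMIX_S_LATEST_240\.kmz$': MOSMIXParser,
    r'K[A-Z0-9]+_-BEOB\.csv$': CurrentObservationsParser,
    # Timestamped SYNOP exports get a parser; the rolling 'latest' file
    # matches no pattern, yielding the None expected for synop_latest.
    r'Z__C_EDZW_\d+.*synop.*\.json\.bz2$': SYNOPParser,
}

def get_parser_sketch(filename):
    for pattern, parser_cls in _PARSER_PATTERNS.items():
        if re.match(pattern, filename):
            return parser_cls
    return None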
def parse(path=None, url=None, export=False):
    if not path and not url:
        raise ValueError('Please provide either path or url')
    parser_cls = get_parser(os.path.basename(path or url))
    parser = parser_cls(path=path, url=url)
    if url:
        parser.download()
        # Only downloaded files get a fingerprint; local files are
        # parsed without recording their provenance.
        fingerprint = {
            'url': url,
            **dwd_fingerprint(parser.path),
        }
    else:
        fingerprint = None
    records = list(parser.parse())
    parser.cleanup()
    if export:
        exporter = parser.exporter()
        exporter.export(records, fingerprint=fingerprint)
    return records
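# Hedged sketch of the dwd_fingerprint helper used above, assuming it
# derives the same fields from the downloaded file that the poller reads
# from the directory listing (last-modified time and file size); the real
# helper lives in brightsky and may differ.
import datetime
import os

def dwd_fingerprint_sketch(path):
    stat = os.stat(path)
    return {
        'last_modified': datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc),
        'file_size': stat.st_size,
    }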