Example no. 1
0
 def parse(self, url, resp_text):
     """Parse a DWD directory listing page.

     Collects file entries (with parser, last-modified time, and size)
     and subdirectory URLs from the HTML listing, yields the file dicts,
     then recursively polls each subdirectory.

     :param url: Base URL of the directory listing (joined directly with
         each href, so it should end with '/').
     :param resp_text: HTML body of the directory listing page.
     """
     sel = Selector(resp_text)
     directories = []
     files = []
     for anchor_sel in sel.css('a'):
         link = anchor_sel.css('::attr(href)').extract_first()
         # extract_first() returns None for anchors without an href;
         # also skip relative navigation links such as './' and '../'.
         if not link or link.startswith('.'):
             continue
         link_url = f'{url}{link}'
         if link.endswith('/'):
             directories.append(link_url)
         else:
             # The text node right after the anchor carries the
             # last-modified timestamp and file size, e.g.
             # "17-Jun-2020 11:48            12345".
             fingerprint = anchor_sel.xpath(
                 './following-sibling::text()[1]').extract_first()
             match = re.match(r'\s*(\d+-\w+-\d+ \d+:\d+)\s+(\d+)',
                              fingerprint or '')
             if not match:
                 # Unexpected listing format: skip the entry instead of
                 # crashing on match.group() below.
                 continue
             last_modified = dateutil.parser.parse(
                 match.group(1)).replace(tzinfo=tzutc())
             file_size = int(match.group(2))
             parser_cls = get_parser(link)
             if parser_cls and not parser_cls(url=link_url).should_skip():
                 files.append({
                     'url': link_url,
                     'parser': parser_cls.__name__,
                     'last_modified': last_modified,
                     'file_size': file_size,
                 })
     self.logger.debug("Found %d directories and %d files at %s",
                       len(directories), len(files), url)
     yield from files
     for dir_url in directories:
         yield from self.poll_url(dir_url)
Example no. 2
0
def poll(enqueue=False):
    """Poll the DWD server for updated files.

    When ``enqueue`` is true, additionally enqueue a ``process`` task for
    every updated file that is neither already queued nor currently
    running, after clearing any expired worker locks.
    """
    updated_files = DWDPoller().poll()
    if not enqueue:
        return
    from brightsky.worker import huey, process
    expired_locks = huey.expire_locks(1800)
    if expired_locks:
        logger.warning('Removed expired locks: %s',
                       ', '.join(expired_locks))
    pending_urls = [
        task.args[0] for task in huey.pending() if task.name == 'process'
    ]
    enqueued = 0
    for entry in updated_files:
        url = entry['url']
        if url in pending_urls:
            logger.debug('Skipping "%s": already queued', url)
        elif huey.is_locked(url):
            logger.debug('Skipping "%s": already running', url)
        else:
            logger.debug('Enqueueing "%s"', url)
            parser_cls = get_parser(os.path.basename(url))
            process(url, priority=parser_cls.PRIORITY)
            enqueued += 1
    logger.info('Enqueued %d updated files for processing. Queue size: %d',
                enqueued, enqueued + len(pending_urls))
Example no. 3
0
def test_get_parser():
    """get_parser() maps DWD file names to the matching parser class."""
    synop_with_timestamp = (
        'Z__C_EDZW_20200617114802_bda01,synop_bufr_GER_999999_999999__MW_617'
        '.json.bz2')
    synop_latest = (
        'Z__C_EDZW_latest_bda01,synop_bufr_GER_999999_999999__MW_XXX.json.bz2')
    cases = [
        ('10minutenwerte_extrema_wind_00427_akt.zip',
         WindGustsObservationsParser),
        ('stundenwerte_FF_00011_akt.zip', WindObservationsParser),
        ('stundenwerte_FF_00090_akt.zip', WindObservationsParser),
        ('stundenwerte_N_01766_akt.zip', CloudCoverObservationsParser),
        ('stundenwerte_P0_00096_akt.zip', PressureObservationsParser),
        ('stundenwerte_RR_00102_akt.zip', PrecipitationObservationsParser),
        ('stundenwerte_SD_00125_akt.zip', SunshineObservationsParser),
        ('stundenwerte_TD_01766.zip', DewPointObservationsParser),
        ('stundenwerte_TU_00161_akt.zip', TemperatureObservationsParser),
        ('stundenwerte_VV_00161_akt.zip', VisibilityObservationsParser),
        ('MOSMIX_S_LATEST_240.kmz', MOSMIXParser),
        ('K611_-BEOB.csv', CurrentObservationsParser),
        (synop_with_timestamp, SYNOPParser),
        (synop_latest, None),
    ]
    for filename, expected_parser in cases:
        assert get_parser(filename) is expected_parser
Example no. 4
0
def parse(path=None, url=None, export=False):
    """Parse a DWD file from a local path or a URL.

    :param path: Local path to the file to parse.
    :param url: URL to download and parse (used when no path is given).
    :param export: If True, export the parsed records through the
        parser's exporter, together with the file's fingerprint (the
        fingerprint is only available for URL-based parsing).
    :returns: List of parsed records.
    :raises ValueError: If neither path nor url is given, or if no
        parser is known for the file name.
    """
    if not path and not url:
        raise ValueError('Please provide either path or url')
    filename = os.path.basename(path or url)
    parser_cls = get_parser(filename)
    if not parser_cls:
        # Fail with a clear message instead of the opaque TypeError that
        # calling None would produce.
        raise ValueError(f'No parser available for {filename}')
    parser = parser_cls(path=path, url=url)
    if url:
        parser.download()
        fingerprint = {
            'url': url,
            **dwd_fingerprint(parser.path),
        }
    else:
        fingerprint = None
    try:
        records = list(parser.parse())
    finally:
        # Always remove downloaded/temporary files, even if parsing fails.
        parser.cleanup()
    if export:
        exporter = parser.exporter()
        exporter.export(records, fingerprint=fingerprint)
    return records