Code example #1
def add_to_database(**kwargs):
    objs = kwargs['ti'].xcom_pull(key='object_location',
                                  task_ids='generate_object_list')
    logging.info(f'Processing object list from {objs}')
    with open(objs, 'r') as f:
        wl = read_object_list(f)

    # Date-based filtering is currently disabled; the full object list is used.
    # execution_date = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    # previous_run = kwargs['prev_execution_date'].strftime('%Y-%m-%dT%H-%M')
    # filtered = list(filter_objects(all_objects=wl, start_date=previous_run,
    #                                end_date=execution_date))
    filtered = list(wl)

    station_dao, series_dao, mes_dao = setup_daos()
    records = 0

    for obj in filtered:
        for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                            object_name=obj['Name']):
            station, measurement, _ = split_record(record)
            add_to_db(station_dao=station_dao,
                      series_dao=series_dao,
                      mes_dao=mes_dao,
                      station=station,
                      measurement=measurement)
            records += 1

    logging.info(f'Number of records added to DB: {records}')
    print_db_stats(station_dao, series_dao, mes_dao)
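
This callable is wired into an Airflow DAG as a PythonOperator task downstream of generate_object_list, which pushes the object_location XCom it pulls. A minimal wiring sketch, assuming Airflow 1.x (where provide_context=True injects ti, execution_date, and prev_execution_date into kwargs); the dag_id, schedule, and the generate_object_list stub below are hypothetical:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def generate_object_list(**kwargs):
    # Stub for the upstream task: the real implementation builds the object
    # list and pushes its location for add_to_database to pull.
    kwargs['ti'].xcom_push(key='object_location', value='/tmp/object_list.txt')


with DAG(dag_id='objects_to_db',           # hypothetical dag_id
         start_date=datetime(2019, 1, 1),  # hypothetical start date
         schedule_interval=timedelta(days=1)) as dag:

    generate = PythonOperator(task_id='generate_object_list',
                              python_callable=generate_object_list,
                              provide_context=True)

    store = PythonOperator(task_id='add_to_database',
                           python_callable=add_to_database,
                           provide_context=True)

    generate >> store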
Code example #2
def update_last(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')

    flist = list_directory(target_dir)
    logging.info(f'Files detected: {len(flist)}')

    previous_run = kwargs['prev_execution_date']
    next_run = kwargs['next_execution_date']
    filtered_list = filter_file_list(
        flist=flist, previous_run=previous_run, next_run=next_run)
    logging.info(f'Previous run was @{previous_run}, next will be @{next_run}. '
                 f'File list reduced to: {len(filtered_list)}')

    station_dao, series_dao, mes_dao = setup_daos()
    m = 0

    for fname in filtered_list:
        logging.info(f'Analyzing {fname}')

        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                m += 1
                add_to_db(station_dao, series_dao, mes_dao, station=station,
                          measurement=measurement)

    logging.info(f'Number of measurements added to DB: {m}')
    print_db_stats(station_dao, series_dao, mes_dao)
    return True
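
Example #2 leans on filter_file_list to narrow the directory listing down to files touched between the previous and the next run. The real helper is not shown in these examples; a minimal sketch, assuming it filters on file modification time with naive local timestamps:

import os
from datetime import datetime
from typing import List


def filter_file_list(flist: List[str], previous_run: datetime,
                     next_run: datetime) -> List[str]:
    # Hypothetical reconstruction: keep files whose modification time
    # falls within [previous_run, next_run).
    def mtime(path: str) -> datetime:
        return datetime.fromtimestamp(os.path.getmtime(path))

    return [f for f in flist if previous_run <= mtime(f) < next_run]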
Code example #3
def store_objects_in_db(**kwargs):
    objs = kwargs['ti'].xcom_pull(
        key='object_location', task_ids='generate_object_list')
    logging.info('Processing object list from %s', objs)
    with open(objs, 'r') as f:
        wl = read_object_list(f)

    execution_date = kwargs['execution_date']
    previous_run = kwargs['prev_execution_date']

    if kwargs['filter_objects']:
        logging.info('Filtering objects...')
        filtered = list(filter_objects(
            all_objects=wl, start_date=previous_run, end_date=execution_date))
        logging.info('Filtered objects. Number of objects from [%s, %s]: %d',
                     previous_run, execution_date, len(filtered))
    else:
        filtered = list(wl)
        logging.info('Number of non-filtered objects %d', len(filtered))

    station_dao, series_dao, mes_dao = setup_daos()
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        processor_objects = {
            executor.submit(local_process_file, obj['Name']): obj['Name']
            for obj in filtered
        }

        for future in concurrent.futures.as_completed(processor_objects):
            object_name = processor_objects[future]
            try:
                rr = future.result()
            except Exception as exc:
                logging.warning(
                    '%r generated an exception: %s', object_name, exc)
            else:
                logging.info('Processing %s', object_name)
                for it in rr:
                    add_to_db(station_dao=station_dao, series_dao=series_dao,
                              mes_dao=mes_dao, station=it[0], measurement=it[1])

    print_db_stats(station_dao, series_dao, mes_dao)
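
Examples #3 and #5 push the per-object work into local_process_file so that parsing can run on the thread pool while the DB writes stay on the main thread. Its implementation is not shown; judging from example #1, a plausible sketch (get_jsons_from_object, split_record, and FETCHES_BUCKET come from the surrounding project):

def local_process_file(object_name):
    # Hypothetical reconstruction: parse every JSON record in the object
    # and collect (station, measurement) pairs, leaving the DB writes to
    # the caller so worker threads only do parsing and I/O.
    results = []
    for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                        object_name=object_name):
        station, measurement, _ = split_record(record)
        results.append((station, measurement))
    return results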
Code example #4
def go_through(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')

    flist = glob.glob(os.path.join(target_dir, '*'))
    logging.info(f'Files detected: {len(flist)}')

    station_dao, series_dao, mes_dao = setup_daos()

    for fname in flist:
        logging.info(f'Processing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                add_to_db(station_dao,
                          series_dao,
                          mes_dao,
                          station=station,
                          measurement=measurement)

    print_db_stats(station_dao, series_dao, mes_dao)
    return True
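
Both directory-walking variants (examples #2 and #4) derive a per-run subdirectory with get_prefix. The helper is not shown; a plausible sketch, assuming the prefix is simply the execution date rendered as a path fragment:

def get_prefix(**kwargs):
    # Hypothetical: turn the execution date that Airflow injects into
    # kwargs into a date-based subdirectory name, e.g. '2019-01-01'.
    return kwargs['execution_date'].strftime('%Y-%m-%d')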
Code example #5
def transform_objects(**kwargs):
    pfl = setup_objectlist(**kwargs)
    pfl.load()
    objects_count = len(pfl.get_list())
    logging.info(f'Loaded {objects_count} objects.')

    station_dao, series_dao, mes_dao = setup_daos()

    def process(x):
        return local_process_file(x['Name'])

    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        for obj, results in zip(pfl.get_list(),
                                executor.map(process, pfl.get_list())):
            logging.info(f"Processing { obj['Name'] } ({ obj['Size']})")
            # we are linerizing it here anyways?
            for it in results:
                add_to_db(station_dao,
                          series_dao,
                          mes_dao,
                          station=it[0],
                          measurement=it[1])

    print_db_stats(station_dao, series_dao, mes_dao)
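
Note the design contrast with example #3: there, concurrent.futures.as_completed consumes whichever object finishes first and a per-future try/except isolates failures to a single object, whereas executor.map here yields results in submission order and lets the first worker exception propagate out of the loop, aborting the whole task.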