from datetime import datetime

import pandas as pd

import tools


def get_catalog_urls(json_catalog, agency_lookup=None):
    """Return a list of URL dicts, one per dataset in the JSON catalog."""

    # Avoid a mutable default argument; load the lookup only when not supplied
    if agency_lookup is None:
        agency_lookup = tools.load_agency_lookup()

    catalog_urls = []

    for index, dataset in enumerate(json_catalog):
        dataset_url_dict = get_dataset_url_dict(dataset, agency_lookup, index)
        catalog_urls.append(dataset_url_dict)

    return catalog_urls
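# Each element of the returned list is a dict built by get_dataset_url_dict()
# below; the values here are illustrative only:
#
#     {'id': 'abc-123',
#      'title': 'Some dataset',
#      'agency': 'Department of Example',
#      'url': ['http://example.gov/data.csv']}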
def get_dataset_url_dict(dataset, agency_lookup=None, index=0):
    """Extract the id, title, agency, and URL list for a single dataset."""

    if agency_lookup is None:
        agency_lookup = tools.load_agency_lookup()

    dataset_id = dataset.get('identifier',
                             '(Missing_identifier_' + str(index) + ')')
    dataset_title = dataset.get('title', '(Missing_title_' + str(index) + ')')

    dataset_urls = get_dataset_urls(dataset)

    # Take only the 1st element of the bureauCode list; the default must also
    # be a list so that [0] yields 'Other' when the key is missing
    dataset_bureau_code = dataset.get('bureauCode', ['Other'])[0]
    dataset_agency = agency_lookup.get(dataset_bureau_code, 'Other')

    # FIXME: use tools.get_key_list(), because don't have to deal with hierarchy fluctuations
    dataset_url_dict = {
        'id': dataset_id,
        'title': dataset_title,
        'agency': dataset_agency,
        'url': dataset_urls,
    }

    return dataset_url_dict
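# get_dataset_urls() is defined elsewhere in this module. For reference, a
# minimal sketch of what it could look like, assuming each dataset carries a
# 'distribution' list whose entries may hold 'downloadURL' / 'accessURL' keys
# (an assumption for illustration, not the module's actual implementation):
#
#     def get_dataset_urls(dataset):
#         urls = []
#         for distribution in dataset.get('distribution') or []:
#             for key in ('downloadURL', 'accessURL'):
#                 if distribution.get(key):
#                     urls.append(distribution[key])
#         return urls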
def build_catalog_urls_list(file_list):
    """Populate the global url_df and catalog_date_urls from catalog files."""

    agency_lookup = tools.load_agency_lookup()

    global url_df
    url_df = pd.DataFrame(
        columns=['date', 'id', 'agency', 'url', 'url_index', 'url_status'])
    row_index = 0

    global catalog_date_urls
    catalog_date_urls = {}

    for file_name in file_list:

        json_catalog = tools.load_file_json(file_name)

        catalog_urls = get_catalog_urls(json_catalog, agency_lookup)

        file_date_str = tools.parse_date(file_name)
        file_date = datetime.strptime(file_date_str, '%Y-%m-%d')

        catalog_date_urls[file_date_str] = catalog_urls  # Record the file just processed

        # Add one row per URL, keyed by dataset id and the URL's position
        # within that dataset
        for dataset_urls in catalog_urls:
            for url_index, dataset_url in enumerate(dataset_urls['url']):
                url_df.loc[row_index] = [
                    file_date,
                    dataset_urls['id'],
                    dataset_urls['agency'],
                    dataset_url,
                    url_index,
                    '',  # 'url_status' left blank here
                ]
                row_index += 1
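# A minimal usage sketch. The file names and their 'catalog_<YYYY-MM-DD>.json'
# naming pattern are assumptions for illustration; tools.parse_date() is
# expected to recover the '%Y-%m-%d' date string from each name.
if __name__ == '__main__':
    file_list = ['catalog_2016-01-01.json',  # hypothetical input files
                 'catalog_2016-02-01.json']
    build_catalog_urls_list(file_list)

    print(url_df.head())              # one row per (date, dataset, url)
    print(sorted(catalog_date_urls))  # snapshot dates that were processed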