Example #1
                        elif country_name in existing_entities_list:
                            newentity = Entity.objects.get(name=country_name)
                        else:
                            newentity = Entity(name=country_name, validated=False)
                            newentity.save()
                            logger.info("Inserting a country %s." % newentity.name.encode('utf8'))
                        country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, variableId) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
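        # A minimal sketch, not part of the original importer, of how such a tuple list
        # is typically flushed in one round trip; the same executemany() call appears
        # verbatim in the FAOSTAT example further down (it assumes Django's `connection`
        # has been imported from django.db).
        def flush_data_values(tuples):
            if tuples:
                with connection.cursor() as c:
                    c.executemany(insert_string, tuples)
                tuples.clear()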
        for category in wdi_categories_list:
            newdataset = Dataset(name='World Development Indicators - ' + category,
                                 description='This is a dataset imported by the automated fetcher',
                                 namespace=DATASET_NAMESPACE)  # note: this newer importer no longer uses DatasetSubcategory; the dataset is linked to a tag via DatasetTag below
            newdataset.save()
            dataset_tag = DatasetTag(dataset_id=newdataset, tag_id=parent_tag)
            dataset_tag.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." % newdataset.name.encode('utf8'))
            row_number = 0
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        if cell.value:
                            try:
                                last_available_year = int(cell.value)
                            except ValueError:
                                pass
                    if row_number == 2 and column_number == 1:
                        varunit = cell.value

                    if row_number == 3 and column_number == 1:
                            # inserting a subcategory and dataset
                            if dataset_to_category[
                                    varname] not in existing_subcategories_list:
                                the_subcategory = DatasetSubcategory(
                                    name=dataset_to_category[varname],
                                    categoryId=the_category)
                                the_subcategory.save()
                                newdataset = Dataset(
                                    name='Clio-Infra - %s' %
                                    the_subcategory.name,
                                    description=
                                    'This is a dataset imported by the automated fetcher',
                                    namespace='clioinfra',
                                    categoryId=the_category,
                                    subcategoryId=the_subcategory)
                                newdataset.save()
                                new_datasets_list.append(newdataset)
                                existing_subcategories_list.add(
                                    dataset_to_category[varname])
                            else:
                                the_subcategory = DatasetSubcategory.objects.get(
                                    name=dataset_to_category[varname],
                                    categoryId=the_category)
                                newdataset = Dataset.objects.get(
                                    name='Clio-Infra - %s' %
                                    the_subcategory.name,
                                    namespace='clioinfra')
 z = zipfile.ZipFile(file)
 for each in z.namelist():
     if '.csv' in each:
         csv_filename = ghdx_downloads_save_location + each
 z.extractall(ghdx_downloads_save_location)
 with open(csv_filename, 'r', encoding='utf8') as f:
     print('Processing: %s' % file)
     reader = csv.DictReader(f)
     for row in reader:
         row_number += 1
         if row['sex_name'] in sex_names and row['age_name'] in age_names and row[
             'metric_name'] in metric_names and row['measure_name'] in measure_names and row['cause_name'] == 'All causes':
             if row['rei_name'] not in existing_subcategories_list:
                 the_subcategory = DatasetSubcategory(name=row['rei_name'], fk_dst_cat_id=the_category)
                 the_subcategory.save()
                 newdataset = Dataset(name=row['rei_name'],
                                      description='This is a dataset imported by the automated fetcher',
                                      namespace='gbd_risk', fk_dst_cat_id=the_category,
                                      fk_dst_subcat_id=the_subcategory)
                 newdataset.save()
                 dataset_name_to_object[row['rei_name']] = newdataset
                 new_datasets_list.append(newdataset)
                 newsource = Source(name=row['rei_name'],
                                    description=json.dumps(source_description),
                                    datasetId=newdataset.pk)
                 newsource.save()
                 source_name_to_object[row['rei_name']] = newsource
                 existing_subcategories = DatasetSubcategory.objects.filter(
                     fk_dst_cat_id=the_category.pk).values(
                     'name')
                 existing_subcategories_list = {item['name'] for item in existing_subcategories}
             else:
                 if row['rei_name'] not in dataset_name_to_object:
Example #4
                        elif country_name in existing_entities_list:
                            newentity = Entity.objects.get(name=country_name)
                        else:
                            newentity = Entity(name=country_name, validated=False)
                            newentity.save()
                            logger.info("Inserting a country %s." % newentity.name.encode('utf8'))
                        country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, variableId) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
        for category in povstats_categories_list:
            newdataset = Dataset(name='World Bank Poverty and Equity database - ' + category,
                                 description='This is a dataset imported by the automated fetcher',
                                 namespace='povstats', categoryId=the_category,
                                 subcategoryId=DatasetSubcategory.objects.get(name=category, categoryId=the_category))
            newdataset.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." % newdataset.name.encode('utf8'))
            row_number = 0
            columns_to_years = {}
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        column_number += 1
                        if cell.value:
                            try:
                                last_available_year = int(cell.value)
Example #5
            existing_subcategories = DatasetSubcategory.objects.filter(
                categoryId=the_category.pk).values('name')
            existing_subcategories_list = {
                item['name']
                for item in existing_subcategories
            }
        else:
            the_subcategory = DatasetSubcategory.objects.get(
                name=section, categoryId=the_category)

        if section not in dataset_name_to_object:
            newdataset = Dataset(
                name=section,
                description=
                'This is a dataset imported by the automated fetcher',
                namespace='who_wash',
                categoryId=the_category,
                subcategoryId=the_subcategory)
            newdataset.save()
            dataset_name_to_object[section] = newdataset
        else:
            newdataset = Dataset.objects.get(name=section,
                                             categoryId=the_category)

        source_name = "WHO UNICEF - {}".format(section)
        if source_name not in source_name_to_object:
            newsource = Source(name=source_name,
                               description=json.dumps(source_description),
                               datasetId=newdataset.pk)
            newsource.save()
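The excerpt above refreshes the in-memory set of subcategory names by re-querying the whole table (the DatasetSubcategory.objects.filter(...).values('name') block at the top). A cheaper alternative, used by the FAOSTAT importer further down, is to extend the set in place; a one-line sketch using the names from this excerpt:

    # add the freshly created subcategory to the cached set instead of re-querying the table
    existing_subcategories_list.add(section)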
Example #6
def process_csv_file_insert(filename_to_process: str, original_filename: str):
    print('Processing: %s' % original_filename)

    global unique_data_tracker
    global datasets_list

    current_file_vars_countries = set()  # keeps track of variables+countries we saw in the current file
    current_file_var_codes = set()
    current_file_var_names = set()
    previous_row = tuple()

    # inserting a subcategory
    if file_to_category_dict[
            original_filename] not in existing_subcategories_list:
        the_subcategory = DatasetSubcategory(
            name=file_to_category_dict[original_filename],
            fk_dst_cat_id=the_category)
        the_subcategory.save()
        existing_subcategories_list.add(
            file_to_category_dict[original_filename])
    else:
        the_subcategory = DatasetSubcategory.objects.get(
            name=file_to_category_dict[original_filename])

    insert_string = 'INSERT into data_values (value, year, fk_ent_id, fk_var_id) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
    data_values_tuple_list = []

    # inserting a dataset
    newdataset = Dataset(
        name='%s: %s' % (file_to_category_dict[original_filename],
                         file_dataset_names[original_filename]),
        description='This is a dataset imported by the automated fetcher',
        namespace='faostat',
        fk_dst_cat_id=the_category,
        fk_dst_subcat_id=the_subcategory)
    newdataset.save()
    datasets_list.append(newdataset)

    # reading source information from a csv file in metadata_dir
    metadata_file_path = os.path.join(
        metadata_dir,
        os.path.splitext(original_filename)[0] + ".csv")
    data_published_by = 'Food and Agriculture Organization of the United Nations (FAO)'
    data_publishers_source = ''
    additional_information = ''
    variable_description = ''
    if os.path.isfile(metadata_file_path):
        with open(metadata_file_path, encoding='latin-1') as metadatacsv:
            metadatareader = csv.DictReader(metadatacsv)
            metadatacolumns = tuple(metadatareader.fieldnames)
            for row in metadatareader:
                if row['Subsection Code'] == '1.1':
                    data_published_by = row['Metadata']
                if row['Subsection Code'] == '3.1':
                    variable_description = row['Metadata']
                if row['Subsection Code'] == '3.4':
                    additional_information = row['Metadata']
                if row['Subsection Code'] == '20.1':
                    data_publishers_source = row['Metadata']

    # inserting a dataset source
    newsource = Source(
        name=file_dataset_names[original_filename],
        description=source_template %
        (file_dataset_names[original_filename], data_published_by,
         data_publishers_source, additional_information),
        datasetId=newdataset.pk)
    newsource.save()

    existing_fao_variables = Variable.objects.filter(
        fk_dst_id__in=Dataset.objects.filter(namespace='faostat'))
    existing_fao_variables_dict = {}
    for each in existing_fao_variables:
        existing_fao_variables_dict[each.name] = each

    with open(filename_to_process, encoding='latin-1') as currentfile:
        currentreader = csv.DictReader(currentfile)
        filecolumns = tuple(currentreader.fieldnames)
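        # column_types (defined elsewhere in the script, not shown here) is presumably a
        # sequence of header-name tuples; judging from the row[...] lookups below, the first
        # two entries look roughly like ('Area', 'Item', 'Item Code', ..., 'Year', 'Unit',
        # 'Value') and ('Country', 'Item', 'ItemCode', 'Element', 'ElementCode', ..., 'Value').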

        # these column types are very similar
        if filecolumns == column_types[0] or filecolumns == column_types[1] \
           or filecolumns == column_types[2] or filecolumns == column_types[3] \
           or filecolumns == column_types[4]:

            for row in currentreader:
                if filecolumns == column_types[0]:
                    countryname = row['Area']
                    variablename = row['Item']
                    variablecode = row['Item Code']
                if filecolumns == column_types[1]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['ItemCode'],
                                                row['ElementCode'])
                if filecolumns == column_types[2]:
                    countryname = row['Area']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['Item Code'],
                                                row['Element Code'])
                if filecolumns == column_types[3]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['Item Code'],
                                                row['Element Code'])
                if filecolumns == column_types[4]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Indicator'],
                                                row['Source'])
                    variablecode = '%s - %s' % (row['Indicator Code'],
                                                row['Source Code'])

                if original_filename == 'Emissions_Agriculture_Energy_E_All_Data_(Norm).zip':
                    variablename += ' - %s' % row['Unit']

                if original_filename == 'Production_LivestockPrimary_E_All_Data_(Normalized).zip':
                    variablename += ' - %s' % row['Unit']

                if original_filename == 'Trade_LiveAnimals_E_All_Data_(Normalized).zip':
                    variablename += ' - %s' % row['Unit']

                # avoiding duplicate rows
                if original_filename == 'Inputs_Pesticides_Use_E_All_Data_(Normalized).zip':
                    if row['Item Code'] not in current_file_var_codes and row[
                            'Item'] not in current_file_var_names:
                        current_file_var_codes.add(row['Item Code'])
                        current_file_var_names.add(row['Item'])
                    elif row['Item Code'] in current_file_var_codes and row[
                            'Item'] in current_file_var_names:
                        pass
                    else:
                        continue

                # avoiding duplicate rows
                if original_filename == 'FoodBalanceSheets_E_All_Data_(Normalized).csv':
                    # compare the row's values (tuple(row) alone would only compare column names)
                    if tuple(row.values()) == previous_row:
                        continue
                    previous_row = tuple(row.values())

                try:
                    year = int(row['Year'])
                    value = float(row['Value'])
                except ValueError:
                    year = False
                    value = False

                variablename = file_dataset_names[
                    original_filename] + ': ' + variablename

                current_file_vars_countries.add(
                    tuple([countryname, variablecode]))

                process_one_row(year, value, countryname, variablecode,
                                variablename, existing_fao_variables_dict,
                                row['Unit'], newsource, newdataset,
                                variable_description, data_values_tuple_list)

            unique_data_tracker.update(current_file_vars_countries)

        # these are the files that require several iterations over all rows
        if filecolumns == column_types[5] or filecolumns == column_types[6] or filecolumns == column_types[7]:
            if filecolumns == column_types[5]:
                iterations = [{
                    'country_field': 'Donor Country',
                    'varname_format': '%s - Donors'
                }, {
                    'country_field': 'Recipient Country',
                    'varname_format': '%s - Recipients'
                }]
            if filecolumns == column_types[6]:
                iterations = [{
                    'country_field': 'Reporter Countries',
                    'varname_format': '%s - %s - Reporters'
                }, {
                    'country_field': 'Partner Countries',
                    'varname_format': '%s - %s - Partners'
                }]
            if filecolumns == column_types[7]:
                iterations = [{
                    'country_field': 'Donor',
                    'varname_format': '%s - %s - Donors'
                }, {
                    'country_field': 'Recipient Country',
                    'varname_format': '%s - %s - Recipients'
                }]
            for oneiteration in iterations:
                file_stream_holder = {}  # we will break down these files into smaller files
                dict_writer_holder = {}
                separate_files_names = {}  # we will keep the filenames in this dict
                unique_vars = []
                # first we collect all variable names
                currentfile.seek(0)
                row_counter = 0
                for row in currentreader:
                    if row['Year'] == 'Year':
                        continue
                    row_counter += 1
                    if row_counter % 300 == 0:
                        time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                    if filecolumns == column_types[5]:
                        variablename = oneiteration['varname_format'] % row[
                            'Item']
                    if filecolumns == column_types[6]:
                        variablename = oneiteration['varname_format'] % (
                            row['Item'], row['Element'])
                    if filecolumns == column_types[7]:
                        variablename = oneiteration['varname_format'] % (
                            row['Item'], row['Purpose'])
                    if variablename not in unique_vars:
                        unique_vars.append(variablename)
                # then we break the dataset into files named after the variable names
                for varname in unique_vars:
                    separate_files_names[varname.replace('/', '+') +
                                         '.csv'] = varname
                    file_stream_holder[varname] = open(os.path.join(
                        '/tmp',
                        varname.replace('/', '+') + '.csv'),
                                                       'w+',
                                                       encoding='latin-1')
                    dict_writer_holder[varname] = csv.DictWriter(
                        file_stream_holder[varname],
                        fieldnames=[
                            'Country', 'Variable', 'Varcode', 'Year', 'Unit',
                            'Value'
                        ])
                    dict_writer_holder[varname].writeheader()
                # go back to the beginning of the file
                currentfile.seek(0)
                row_counter = 0
                for row in currentreader:
                    if row['Year'] == 'Year':
                        continue
                    row_counter += 1
                    if row_counter % 300 == 0:
                        time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                    if filecolumns == column_types[5]:
                        variablename = oneiteration['varname_format'] % row[
                            'Item']
                        variablecode = row['Item Code']
                        dict_writer_holder[variablename].writerow({
                            'Country':
                            row[oneiteration['country_field']],
                            'Variable':
                            variablename,
                            'Varcode':
                            variablecode,
                            'Unit':
                            row['Unit'],
                            'Year':
                            row['Year'],
                            'Value':
                            row['Value']
                        })
                    if filecolumns == column_types[6]:
                        variablename = oneiteration['varname_format'] % (
                            row['Item'], row['Element'])
                        variablecode = '%s - %s' % (row['Item Code'],
                                                    row['Element Code'])
                        dict_writer_holder[variablename].writerow({
                            'Country':
                            row[oneiteration['country_field']],
                            'Variable':
                            variablename,
                            'Varcode':
                            variablecode,
                            'Unit':
                            row['Unit'],
                            'Year':
                            row['Year'],
                            'Value':
                            row['Value']
                        })
                    if filecolumns == column_types[7]:
                        variablename = oneiteration['varname_format'] % (
                            row['Item'], row['Purpose'])
                        variablecode = '%s - %s' % (row['Item Code'],
                                                    row['Purpose Code'])
                        dict_writer_holder[variablename].writerow({
                            'Country':
                            row[oneiteration['country_field']],
                            'Variable':
                            variablename,
                            'Varcode':
                            variablecode,
                            'Unit':
                            row['Unit'],
                            'Year':
                            row['Year'],
                            'Value':
                            row['Value']
                        })
                    if row_counter % 100000 == 0:
                        for fileholder, actual_file in file_stream_holder.items():
                            actual_file.flush()
                            os.fsync(actual_file.fileno())
                for fileholder, actual_file in file_stream_holder.items():
                    actual_file.close()

                # now parsing and importing each file individually

                for each_separate_file, file_variable_name in separate_files_names.items():
                    unique_records_holder = {}
                    with open('/tmp/%s' % each_separate_file,
                              encoding='latin-1') as separate_file:
                        separate_file_reader = csv.DictReader(separate_file)
                        row_counter = 0
                        for row in separate_file_reader:
                            row_counter += 1
                            if row_counter % 300 == 0:
                                time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                            countryname = row['Country']
                            variablecode = row['Varcode']
                            variableunit = row['Unit']
                            year = row['Year']
                            value = row['Value']

                            try:
                                year = int(year)
                                value = float(value)
                            except ValueError:
                                year = False
                                value = False
                            if year is not False and value is not False:
                                unique_record = tuple([countryname, year])
                                if unique_record not in unique_records_holder:
                                    unique_records_holder[
                                        unique_record] = value
                                else:
                                    unique_records_holder[
                                        unique_record] += value
                    for key, value in unique_records_holder.items():
                        variablename = file_dataset_names[
                            original_filename] + ': ' + file_variable_name
                        process_one_row(
                            list(key)[1], str(value),
                            list(key)[0], variablecode, variablename,
                            existing_fao_variables_dict, variableunit,
                            newsource, newdataset, variable_description,
                            data_values_tuple_list)

                    os.remove('/tmp/%s' % each_separate_file)

        if len(data_values_tuple_list):  # insert any leftover data_values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
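For comparison, the leftover flush above could also go through the ORM rather than raw SQL. A sketch only: DataValue is an assumed model name for the data_values table (it does not appear in these excerpts), and its field names simply mirror the column names in insert_string.

    # Hypothetical ORM equivalent of the executemany() flush; bulk_create batches the
    # INSERTs so each statement stays a manageable size.
    DataValue.objects.bulk_create(
        [DataValue(value=v, year=y, fk_ent_id=e, fk_var_id=var)
         for v, y, e, var in data_values_tuple_list],
        batch_size=10000)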
 z = zipfile.ZipFile(file)
 for each in z.namelist():
     if '.csv' in each:
         csv_filename = ghdx_downloads_save_location + each
 z.extractall(ghdx_downloads_save_location)
 with open(csv_filename, 'r', encoding='utf8') as f:
     print('Processing: %s' % file)
     reader = csv.DictReader(f)
     for row in reader:
         row_number += 1
         if row['sex_name'] in sex_names and row['age_name'] in age_names and row['metric_name'] in metric_names and row['measure_name'] in measure_names:
             if row['cause_name'] not in existing_subcategories_list:
                 the_subcategory = DatasetSubcategory(name=row['cause_name'], categoryId=the_category)
                 the_subcategory.save()
                 newdataset = Dataset(name=row['cause_name'],
                                      description='This is a dataset imported by the automated fetcher',
                                      namespace='gbd_prevalence_by_gender', categoryId=the_category,
                                      subcategoryId=the_subcategory)
                 newdataset.save()
                 dataset_name_to_object[row['cause_name']] = newdataset
                 new_datasets_list.append(newdataset)
                 newsource = Source(name=row['cause_name'],
                                    description=json.dumps(source_description),
                                    datasetId=newdataset.pk)
                 newsource.save()
                 source_name_to_object[row['cause_name']] = newsource
                 existing_subcategories = DatasetSubcategory.objects.filter(categoryId=the_category.pk).values(
                     'name')
                 existing_subcategories_list = {item['name'] for item in existing_subcategories}
             else:
                 if row['cause_name'] not in dataset_name_to_object:
                     newdataset = Dataset.objects.get(name=row['cause_name'], categoryId=the_category)
                metadata_string += mstring[
                                   mstring.index('Data Availability '):mstring.index('\nCalendar ')] + '\n'

        source_description['additionalInfo'] = metadata_string if metadata_string else None
        with open(file, 'r', encoding='utf8') as f:

            reader = csv.DictReader(f)
            for row in reader:
                row_number += 1
                subcategory_name = row['Indicator Description'][:250]
                if subcategory_name not in existing_subcategories_list:
                    the_subcategory = DatasetSubcategory(name=subcategory_name, fk_dst_cat_id=the_category)
                    the_subcategory.save()
                    newdataset = Dataset(name=subcategory_name,
                                         description='This is a dataset imported by the automated fetcher',
                                         namespace='un_sdg', fk_dst_cat_id=the_category,
                                         fk_dst_subcat_id=the_subcategory)
                    newdataset.save()
                    dataset_name_to_object[subcategory_name] = newdataset
                    new_datasets_list.append(newdataset)
                    newsource = Source(name=subcategory_name,
                                       description=json.dumps(source_description),
                                       datasetId=newdataset.pk)
                    newsource.save()
                    source_name_to_object[subcategory_name] = newsource
                    existing_subcategories = DatasetSubcategory.objects.filter(fk_dst_cat_id=the_category.pk).values(
                        'name')
                    existing_subcategories_list = {item['name'] for item in existing_subcategories}
                else:
                    if subcategory_name not in dataset_name_to_object:
                        newdataset = Dataset.objects.get(name=subcategory_name, fk_dst_cat_id=the_category)
                    'DIOC_FIELD_STUDY', 'DIOC_LFS', 'DIOC_SECTOR',
                    'DIOC_SEX_AGE', 'MIG', 'REF_TOTALOFFICIAL',
                    'REF_TOTALRECPTS', 'TABLE3A', 'EDU_ENRL_MOBILE',
                    'EDU_GRAD_MOBILE', 'IO_GHG_2015'
            ]:

                if metadata_dict[file_name][
                        'category'] not in existing_subcategories_list:
                    the_subcategory = DatasetSubcategory(
                        name=metadata_dict[file_name]['category'],
                        fk_dst_cat_id=the_category)
                    the_subcategory.save()
                    newdataset = Dataset(
                        name=metadata_dict[file_name]['category'],
                        description=
                        'This is a dataset imported by the automated fetcher',
                        namespace='oecd_stat',
                        fk_dst_cat_id=the_category,
                        fk_dst_subcat_id=the_subcategory)
                    newdataset.save()
                    dataset_name_to_object[metadata_dict[file_name]
                                           ['category']] = newdataset
                    new_datasets_list.append(newdataset)

                    existing_subcategories = DatasetSubcategory.objects.filter(
                        fk_dst_cat_id=the_category.pk).values('name')
                    existing_subcategories_list = {
                        item['name']
                        for item in existing_subcategories
                    }
                else:
Example #10
                        elif country_name in existing_entities_list:
                            newentity = Entity.objects.get(name=country_name)
                        else:
                            newentity = Entity(name=country_name, validated=False)
                            newentity.save()
                            logger.info("Inserting a country %s." % newentity.name.encode('utf8'))
                        country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, fk_var_id) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
        for category in hnpstats_categories_list:
            newdataset = Dataset(name='World Bank Health Nutrition and Population Statistics - ' + category,
                                 description='This is a dataset imported by the automated fetcher',
                                 namespace='hnpstats', fk_dst_cat_id=the_category,
                                 fk_dst_subcat_id=DatasetSubcategory.objects.get(name=category, fk_dst_cat_id=the_category))
            newdataset.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." % newdataset.name.encode('utf8'))
            row_number = 0
            columns_to_years = {}
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        column_number += 1
                        if cell.value:
                            try:
                                last_available_year = int(cell.value)
Example #11
                newentity = Entity.objects.get(name=country_tool_names_dict[
                    unidecode.unidecode(country_name.lower())].owid_name)
            elif country_name in existing_entities_list:
                newentity = Entity.objects.get(name=country_name)
            else:
                newentity = Entity(name=country_name, validated=False)
                newentity.save()
                logger.info("Inserting a country %s." %
                            newentity.name.encode('utf8'))
            country_name_entity_ref[key] = newentity

        for key, category in abbr_category_names.items():
            newdataset = Dataset(
                name='QoG - ' + category,
                description=
                'This is a dataset imported by the automated fetcher',
                namespace='qog',
                fk_dst_cat_id=the_category,
                fk_dst_subcat_id=categories_ref_models[category])
            newdataset.save()
            logger.info("Inserting a dataset %s." %
                        newdataset.name.encode('utf8'))
            datasets_ref_models[category] = newdataset

        saved_sources = {}  # variables coming from one source don't all fall into one dataset,
        # so we need to save the source info for each dataset where the source's variables are present

        for varcode, vardata in qog_vars.items():
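            # the slice below keeps everything up to and including the first underscore,
            # so a hypothetical variable code such as 'wdi_gdp' yields the source prefix 'wdi_'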
            source_name = varcode[:varcode.index('_') + 1]
            if source_name in saved_sources:
Example #12
                                                   validated=False)
                                newentity.save()
                                logger.info("Inserting a country %s." %
                                            newentity.name.encode('utf8'))
                            country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, fk_var_id) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
        for category in climatech_categories_list:
            newdataset = Dataset(
                name='World Bank Climate Change Data - ' + category,
                description=
                'This is a dataset imported by the automated fetcher',
                namespace='climatech',
                fk_dst_cat_id=the_category,
                fk_dst_subcat_id=DatasetSubcategory.objects.get(
                    name=category, fk_dst_cat_id=the_category))
            newdataset.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." %
                        newdataset.name.encode('utf8'))
            row_number = 0
            columns_to_years = {}
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        column_number += 1
Example #13
                if metadata_dict[file_name]['category'] not in existing_subcategories_list:
                    the_subcategory = DatasetSubcategory(name=metadata_dict[file_name]['category'],
                                                         categoryId=the_category)
                    the_subcategory.save()

                    existing_subcategories = DatasetSubcategory.objects.filter(categoryId=the_category.pk).values(
                        'name')
                    existing_subcategories_list = {item['name'] for item in existing_subcategories}
                else:
                    the_subcategory = DatasetSubcategory.objects.get(name=metadata_dict[file_name]['category'],
                                                                     categoryId=the_category)

                long_dataset_name = "{} - {}".format(file_category, filename)
                if long_dataset_name not in dataset_name_to_object:
                    newdataset = Dataset(name=long_dataset_name,
                                         description='This is a dataset imported by the automated fetcher',
                                         namespace='oecd_stat', categoryId=the_category,
                                         subcategoryId=the_subcategory)
                    newdataset.save()
                    dataset_name_to_object[long_dataset_name] = newdataset
                    new_datasets_list.append(newdataset)
                else:
                    newdataset = Dataset.objects.get(name=long_dataset_name, categoryId=the_category)

                source_name = "OECD - {} - {}".format(metadata_dict[file_name]['category'], filename)
                source_description['additionalInfo'] = metadata_dict[file_name]['meta_text']
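                # oecd_dataset_base_link is presumably a URL template containing a '{}'
                # placeholder for the dataset code; its value is not shown in this excerpt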
                source_description['link'] = oecd_dataset_base_link.format(file_name)
                if source_name not in source_name_to_object:
                    newsource = Source(name=source_name,
                                       description=json.dumps(source_description),
                                       datasetId=newdataset.pk)
                    newsource.save()
Example #14
                        elif country_name in existing_entities_list:
                            newentity = Entity.objects.get(name=country_name)
                        else:
                            newentity = Entity(name=country_name, validated=False)
                            newentity.save()
                            logger.info("Inserting a country %s." % newentity.name.encode('utf8'))
                        country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, variableId) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
        for category in bbsc_categories_list:
            newdataset = Dataset(name='World Bank Data on Statistical Capacity - ' + category,
                                 description='This is a dataset imported by the automated fetcher',
                                 namespace='bbsc', categoryId=the_category,
                                 subcategoryId=DatasetSubcategory.objects.get(name=category, categoryId=the_category))
            newdataset.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." % newdataset.name.encode('utf8'))
            row_number = 0
            columns_to_years = {}
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        column_number += 1
                        if cell.value:
                            try:
                                last_available_year = int(cell.value)
Example #15
                        if row_number == 11:
                            if cell.value:
                                variant = cell.value
                                timespan = variant[variant.index(', ') + 2:]
                        if row_number == 16:
                            if cell.value:
                                main_var_name = cell.value
                        if row_number == 17:
                            if column_number > 6:
                                if cell.value:
                                    var_to_add_dict[column_number] = '%s: %s - %s' % (variant, main_var_name, cell.value)

                        if row_number == 18:
                            if not dataset_saved:
                                newdataset = Dataset(name='UN WPP - %s' % dataset_name,
                                                     description='This is a dataset imported by the automated fetcher',
                                                     namespace='unwpp', fk_dst_cat_id=the_category,
                                                     fk_dst_subcat_id=the_subcategory)
                                newdataset.save()
                                dataset_saved = True
                                source_description['additionalInfo'] = dataset_info['description']
                                newsource = Source(name='United Nations – Population Division (2017 Revision)',
                                                   description=json.dumps(source_description),
                                                   datasetId=newdataset.pk)
                                newsource.save()

                            if not variables_saved:
                                for columnnum, varname in var_to_add_dict.items():
                                    if '(' not in varname:
                                        unit_of_measure = ''
                                    else:
                                        unit_of_measure = varname[varname.index('('):varname.index(')') + 1].replace('(', '').replace(')','')
 print('Processing: %s' % file)
 reader = csv.DictReader(f)
 for row in reader:
     row_number += 1
     if (row['sex_name'] in sex_names and row['age_name'] in age_names
             and row['metric_name'] in metric_names
             and row['measure_name'] in measure_names):
         if row['cause_name'] not in existing_subcategories_list:
             the_subcategory = DatasetSubcategory(
                 name=row['cause_name'], categoryId=the_category)
             the_subcategory.save()
             newdataset = Dataset(
                 name=row['cause_name'],
                 description=
                 'This is a dataset imported by the automated fetcher',
                 namespace='gbd_mental_health',
                 categoryId=the_category,
                 subcategoryId=the_subcategory)
             newdataset.save()
             dataset_name_to_object[row['cause_name']] = newdataset
             new_datasets_list.append(newdataset)
             newsource = Source(
                 name=row['cause_name'],
                 description=json.dumps(source_description),
                 datasetId=newdataset.pk)
             newsource.save()
             source_name_to_object[row['cause_name']] = newsource
             existing_subcategories = DatasetSubcategory.objects.filter(
                 categoryId=the_category.pk).values('name')
             existing_subcategories_list = {
Example #17
        the_subcategory.save()

        existing_subcategories = DatasetSubcategory.objects.filter(
            categoryId=the_category.pk).values('name')
        existing_subcategories_list = {
            item['name']
            for item in existing_subcategories
        }
    else:
        the_subcategory = DatasetSubcategory.objects.get(
            name=subcategory_name, categoryId=the_category)

    if subcategory_name not in dataset_name_to_object:
        newdataset = Dataset(
            name=subcategory_name,
            description='This is a dataset imported by the automated fetcher',
            namespace='unaids',
            categoryId=the_category,
            subcategoryId=the_subcategory)
        newdataset.save()
        dataset_name_to_object[subcategory_name] = newdataset
        new_datasets_list.append(newdataset)
    else:
        newdataset = Dataset.objects.get(name=subcategory_name,
                                         categoryId=the_category)

    source_name = 'UNAIDS'
    if source_name not in source_name_to_object:
        newsource = Source(name=source_name,
                           description=json.dumps(source_description),
                           datasetId=newdataset.pk)
        newsource.save()
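The same pattern recurs across these importers: an in-memory dict (dataset_name_to_object, source_name_to_object) shields the database from repeated get-or-create round trips. A generic helper could factor it out; this is only a sketch with hypothetical naming, built on Django's standard get_or_create():

    def cached_get_or_create(model, cache, key, defaults=None, **lookup):
        # check the in-memory cache first, then fall back to the database
        if key not in cache:
            obj, _created = model.objects.get_or_create(defaults=defaults or {}, **lookup)
            cache[key] = obj
        return cache[key]

    # illustrative call, reusing the names from the UNAIDS excerpt above:
    # newdataset = cached_get_or_create(
    #     Dataset, dataset_name_to_object, subcategory_name,
    #     defaults={'description': 'This is a dataset imported by the automated fetcher',
    #               'namespace': 'unaids', 'subcategoryId': the_subcategory},
    #     name=subcategory_name, categoryId=the_category)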
Example #18
                        elif country_name in existing_entities_list:
                            newentity = Entity.objects.get(name=country_name)
                        else:
                            newentity = Entity(name=country_name, validated=False)
                            newentity.save()
                            logger.info("Inserting a country %s." % newentity.name.encode('utf8'))
                        country_name_entity_ref[country_code] = newentity

            column_number = 0

        insert_string = 'INSERT into data_values (value, year, entityId, fk_var_id) VALUES (%s, %s, %s, %s)'  # this is used for constructing the query for mass inserting to the data_values table
        data_values_tuple_list = []
        datasets_list = []
        for category in findex_categories_list:
            newdataset = Dataset(name='World Bank Global Findex - ' + category,
                                 description='This is a dataset imported by the automated fetcher',
                                 namespace='findex', fk_dst_cat_id=the_category,
                                 fk_dst_subcat_id=DatasetSubcategory.objects.get(name=category, fk_dst_cat_id=the_category))
            newdataset.save()
            datasets_list.append(newdataset)
            logger.info("Inserting a dataset %s." % newdataset.name.encode('utf8'))
            row_number = 0
            columns_to_years = {}
            for row in data_ws.rows:
                row_number += 1
                data_values = []
                for cell in row:
                    if row_number == 1:
                        column_number += 1
                        if cell.value:
                            try:
                                last_available_year = int(cell.value)
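Each of the World Bank excerpts above breaks off while the header row is being read, so the year-mapping step is never shown. Purely for orientation, and not the original continuation, a sketch of how header cells that parse as years could be mapped into the columns_to_years dict these scripts initialise:

    # illustrative only: remember which spreadsheet column holds which year
    for cell in row:
        column_number += 1
        if cell.value:
            try:
                columns_to_years[column_number] = int(cell.value)
            except ValueError:
                pass  # non-year header cells (country name, country code, ...) are skipped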