column_number = 0

                    data_values_tuple_list = []
                    for country, data_value in data_values_dict.items():
                        for year, value in data_value.items():
                            data_values_tuple_list.append(
                                (value, year, country, newvariable.pk))

                    with connection.cursor() as c:
                        c.executemany(insert_string, data_values_tuple_list)

                    newimport = ImportHistory(
                        import_type='clioinfra',
                        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                        import_notes='Importing file %s' % one_file,
                        import_state=json.dumps({
                            'file_hash': file_checksum(file),
                            'file_name': one_file
                        }))
                    newimport.save()

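    # write a CSV for every dataset touched by this import: newly created datasets pass None
    # as the old name, while previously existing datasets pass their current name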
    for eachdataset in new_datasets_list:
        write_dataset_csv(eachdataset.pk, eachdataset.name, None,
                          'clioinfra_fetcher', '')
    for eachdataset in old_datasets_list:
        write_dataset_csv(eachdataset.pk, eachdataset.name, eachdataset.name,
                          'clioinfra_fetcher', '')
Example #2
                            c.executemany(insert_string, data_values_tuple_list)
                        data_values_tuple_list = []

                if len(data_values_tuple_list):  # insert any leftover data_values
                    with connection.cursor() as c:
                        c.executemany(insert_string, data_values_tuple_list)

        newimport = ImportHistory(
            import_type='unwpp',
            import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
            import_notes='Importing file %s' % file_to_parse,
            import_state=json.dumps({
                'file_hash': file_checksum(os.path.join(wpp_downloads_save_location, file_to_parse)),
                'file_name': file_to_parse
            }))
        newimport.save()
        write_dataset_csv(newdataset.pk, newdataset.name, None, 'unwpp_fetcher', '')
    else:

        if imported_before_hash == file_checksum(os.path.join(wpp_downloads_save_location, file_to_parse)):
            sys.exit('No updates available.')

        country_name_entity_ref = process_entities(country_names_dict)

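        # collect the existing category names so the UN WPP category is only created when it is missing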
        existing_categories = DatasetCategory.objects.values('name')
        existing_categories_list = {item['name'] for item in existing_categories}

        if un_wpp_category_name_in_db not in existing_categories_list:
            the_category = DatasetCategory(name=un_wpp_category_name_in_db, fetcher_autocreated=True)
            the_category.save()

        else:
Example #3
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../..'))
import grapher_admin.wsgi
from grapher_admin.views import write_dataset_csv
from grapher_admin.models import Dataset, Variable
from django.conf import settings

# use this script to make the initial csv and metadata export of all datasets to the repo

all_datasets = Dataset.objects.all()

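# attribute each dataset's CSV commit to whoever last updated one of its variables,
# falling back to the repo's default identity when that variable has no uploader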
for each in all_datasets:
    last_updated_by = Variable.objects.filter(datasetId=each).order_by('-updated_at')
    if last_updated_by:
        committer = last_updated_by.first()
        if not committer.uploaded_by:
            committer_name = settings.DATASETS_REPO_USERNAME
            committer_email = settings.DATASETS_REPO_EMAIL
        else:
            committer_name = committer.uploaded_by.get_full_name()
            committer_email = committer.uploaded_by.email
        write_dataset_csv(each.pk, each.name, None, committer_name, committer_email)
Example #4
                column_number = 0
                if row_number % 10 == 0:
                    time.sleep(0.001)  # sleep 1 ms every 10 rows so the import does not keep the CPU busy the whole time

        if len(data_values_tuple_list):  # insert any leftover data_values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            logger.info("Dumping data values...")

        newimport = ImportHistory(import_type='povstats', import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                  import_notes='Initial import of POVSTATS datasets',
                                  import_state=json.dumps({'file_hash': file_checksum(povstats_downloads_save_location + 'povstats.zip')}))
        newimport.save()
        for dataset in datasets_list:
            write_dataset_csv(dataset.pk, dataset.name, None, 'povstats_fetcher', '')
        logger.info("Import complete.")

    else:
        last_import = import_history.last()
        deleted_indicators = {}  # tracks which variables' data values have already been deleted before writing new values

        if json.loads(last_import.import_state)['file_hash'] == file_checksum(povstats_downloads_save_location + 'povstats.zip'):
            logger.info('No updates available.')
            sys.exit('No updates available.')

        logger.info('New data is available.')
        available_variables = Variable.objects.filter(datasetId__in=Dataset.objects.filter(namespace='povstats'))
        available_variables_list = []

        for each in available_variables.values('code'):
Example #5
                                data_values_tuple_list.append((
                                    str(float(row[str(i)])), i,
                                    c_name_entity_ref[row['Country or Area Name']].pk,
                                    variable_name_to_object[variable_name].pk))
                            except:  # skip rows with missing or non-numeric values
                                pass

                if len(data_values_tuple_list) > 3000:  # insert when the length of the list goes over 3000

                    with connection.cursor() as c:
                        c.executemany(insert_string, data_values_tuple_list)
                    data_values_tuple_list = []

                if row_number % 100 == 0:
                    time.sleep(0.001)  # sleep 1 ms every 100 rows so the import does not keep the CPU busy the whole time

        if len(data_values_tuple_list):  # insert any leftover data_values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            data_values_tuple_list = []

for dataset in existing_datasets_list:
    write_dataset_csv(dataset.pk, dataset.name, dataset.name, 'un_sdg_fetcher', '')
for dataset in new_datasets_list:
    write_dataset_csv(dataset.pk, dataset.name, None, 'un_sdg_fetcher', '')

newimport = ImportHistory(
    import_type='un_sdg',
    import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
    import_notes='A un_sdg import was performed',
    import_state='There are a total of %s un_sdg variables after the import'
                 % Variable.objects.filter(fk_dst_id__namespace='un_sdg').count())
newimport.save()

print("--- %s seconds ---" % (time.time() - start_time))
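
Most of the importers above share the same batched-insert idiom: rows are buffered in data_values_tuple_list and flushed through cursor.executemany() once the buffer grows past a threshold, with one final flush for whatever is left over. A minimal sketch of that idiom follows; the insert statement, column names and the parsed_rows iterable are illustrative placeholders, not the actual grapher schema.

from django.db import connection

# placeholder statement and columns for illustration only
insert_string = 'INSERT INTO data_values (value, year, entityId, variableId) VALUES (%s, %s, %s, %s)'
data_values_tuple_list = []
parsed_rows = []  # placeholder: in the real fetchers these tuples come from the downloaded source files

for value, year, entity_id, variable_id in parsed_rows:
    data_values_tuple_list.append((value, year, entity_id, variable_id))
    if len(data_values_tuple_list) > 3000:  # flush in batches so memory use stays bounded
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)
        data_values_tuple_list = []

if data_values_tuple_list:  # insert any leftover data_values
    with connection.cursor() as c:
        c.executemany(insert_string, data_values_tuple_list)
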
Example #6
                    )  # sleep 1 ms every 10 rows so the import does not keep the CPU busy the whole time

        if len(data_values_tuple_list):  # insert any leftover data_values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            logger.info("Dumping data values...")

        newimport = ImportHistory(
            import_type='climatech',
            import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
            import_notes='Initial import of climatech datasets',
            import_state=json.dumps(
                {'file_hash': file_checksum(excel_filename)}))
        newimport.save()
        for dataset in datasets_list:
            write_dataset_csv(dataset.pk, dataset.name, None,
                              'climatech_fetcher', '')
        logger.info("Import complete.")

    else:
        last_import = import_history.last()
        deleted_indicators = {}  # tracks which variables' data values have already been deleted before writing new values

        if json.loads(last_import.import_state)['file_hash'] == file_checksum(
                excel_filename):
            logger.info('No updates available.')
            sys.exit('No updates available.')

        logger.info('New data is available.')
        available_variables = Variable.objects.filter(
            fk_dst_id__in=Dataset.objects.filter(namespace='climatech'))
Example #7
            for oneimport in import_history:
                if json.loads(oneimport.import_state)['file_name'] == os.path.basename(eachfile):
                    file_imported_before = True
                    imported_before_hash = json.loads(oneimport.import_state)['file_hash']
            if not file_imported_before:
                process_csv_file_insert(eachfile, os.path.basename(eachfile))
                newimport = ImportHistory(
                    import_type='faostat',
                    import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                    import_notes='Importing file %s' % os.path.basename(eachfile),
                    import_state=json.dumps({
                        'file_hash': file_checksum(eachfile),
                        'file_name': os.path.basename(eachfile)
                    }))
                newimport.save()
            else:
                if imported_before_hash == file_checksum(eachfile):
                    print('No updates available for file %s.' %
                          os.path.basename(eachfile))

    for eachdataset in datasets_list:
        write_dataset_csv(eachdataset.pk, eachdataset.name, None,
                          'faostat_fetcher', '')

print("Script execution time: %s" % (datetime.now() - start_time))
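
Each fetcher also records what it imported in an ImportHistory row whose import_state JSON stores the file name and hash; on the next run the stored hash is compared with file_checksum() to decide whether the file changed at all. Below is a minimal sketch of that bookkeeping, assuming ImportHistory can be imported from grapher_admin.models (mirroring the imports in Example #3) and taking the file_checksum helper as an argument rather than guessing its module.

import json

from django.utils import timezone
from grapher_admin.models import ImportHistory  # assumed location, mirroring Example #3's imports


def previously_imported_hash(import_history, file_name):
    """Return the hash recorded for file_name by an earlier import, or None if it was never imported."""
    for oneimport in import_history:
        state = json.loads(oneimport.import_state)
        if state.get('file_name') == file_name:
            return state.get('file_hash')
    return None


def record_import(import_type, file_path, file_name, file_checksum):
    """Save an ImportHistory row in the same shape the examples above use."""
    ImportHistory(
        import_type=import_type,
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Importing file %s' % file_name,
        import_state=json.dumps({
            'file_hash': file_checksum(file_path),
            'file_name': file_name
        })).save()
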
Example #8
                                    with connection.cursor() as c:
                                        c.executemany(insert_string, data_values_tuple_list)
                                    data_values_tuple_list = []

                            if row_number % 100 == 0:
                                time.sleep(0.001)  # sleep 1 ms every 100 rows so the import does not keep the CPU busy the whole time

                    if len(data_values_tuple_list):  # insert any leftover data_values
                        with connection.cursor() as c:
                            c.executemany(insert_string, data_values_tuple_list)
                        data_values_tuple_list = []
                    print(
                        '################################################################################################')

                newimport = ImportHistory(import_type='ilostat',
                                          import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                          import_notes='Importing file %s' % one_file,
                                          import_state=json.dumps(
                                              {'file_hash': file_checksum(file),
                                               'file_name': one_file
                                               }))
                newimport.save()

                os.remove(file.replace('.gz', ''))

    for onedataset in new_datasets_list:
        write_dataset_csv(onedataset.pk, onedataset.name, None, 'ilostat_fetcher', '')
    for onedataset in old_datasets_list:
        write_dataset_csv(onedataset.pk, onedataset.name, onedataset.name, 'ilostat_fetcher', '')
Example #9
                column_number = 0
                if row_number % 10 == 0:
                    time.sleep(0.001)  # sleep 1 ms every 10 rows so the import does not keep the CPU busy the whole time

        if len(data_values_tuple_list):  # insert any leftover data_values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            logger.info("Dumping data values...")

        newimport = ImportHistory(import_type='findex', import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                  import_notes='Initial import of Findex datasets',
                                  import_state=json.dumps({'file_hash': file_checksum(findex_downloads_save_location + 'findex.zip')}))
        newimport.save()
        for dataset in datasets_list:
            write_dataset_csv(dataset.pk, dataset.name, None, 'findex_fetcher', '')
        logger.info("Import complete.")

    else:
        last_import = import_history.last()
        deleted_indicators = {}  # tracks which variables' data values have already been deleted before writing new values

        if json.loads(last_import.import_state)['file_hash'] == file_checksum(findex_downloads_save_location + 'findex.zip'):
            logger.info('No updates available.')
            sys.exit('No updates available.')

        logger.info('New data is available.')
        available_variables = Variable.objects.filter(fk_dst_id__in=Dataset.objects.filter(namespace='findex'))
        available_variables_list = []

        for each in available_variables.values('code'):