if row_number == 17:
    if column_number > 6:
        if cell.value:
            var_to_add_dict[column_number] = '%s: %s - %s' % (variant, main_var_name, cell.value)
if row_number == 18:
    if not dataset_saved:
        newdataset = Dataset(
            name='UN WPP - %s' % dataset_name,
            description='This is a dataset imported by the automated fetcher',
            namespace='unwpp',
            fk_dst_cat_id=the_category,
            fk_dst_subcat_id=the_subcategory)
        newdataset.save()
        dataset_saved = True
        source_description['additionalInfo'] = dataset_info['description']
        newsource = Source(
            name='United Nations – Population Division (2017 Revision)',
            description=json.dumps(source_description),
            datasetId=newdataset.pk)
        newsource.save()
    if not variables_saved:
        for columnnum, varname in var_to_add_dict.items():
            if '(' not in varname:
                unit_of_measure = ''
            else:
                unit_of_measure = varname[varname.index('('):varname.index(')') + 1].replace('(', '').replace(')', '')
            s_unit = short_unit_extract(unit_of_measure)
            newvariable = Variable(
                name=varname,
                unit=unit_of_measure,
                short_unit=s_unit,
                description='',
                code=None,
        name=subcategory_name,
        description='This is a dataset imported by the automated fetcher',
        namespace='unaids',
        categoryId=the_category,
        subcategoryId=the_subcategory)
    newdataset.save()
    dataset_name_to_object[subcategory_name] = newdataset
    new_datasets_list.append(newdataset)
else:
    newdataset = Dataset.objects.get(name=subcategory_name, categoryId=the_category)

source_name = 'UNAIDS'
if source_name not in source_name_to_object:
    newsource = Source(
        name=source_name,
        description=json.dumps(source_description),
        datasetId=newdataset.pk)
    newsource.save()
    source_name_to_object[source_name] = newsource
else:
    newsource = Source.objects.get(name=source_name, datasetId=newdataset.pk)
    newsource.description = json.dumps(source_description)
    newsource.save()
    source_name_to_object[source_name] = newsource

for eachfile in glob.glob(unaids_downloads + '/*.csv'):
    print("Processing: {}".format(eachfile))
    with open(eachfile, mode='rt', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
for row in reader:
    row_number += 1
    if row['sex_name'] in sex_names and row['age_name'] in age_names \
            and row['metric_name'] in metric_names and row['measure_name'] in measure_names \
            and row['cause_name'] == 'All causes':
        if row['rei_name'] not in existing_subcategories_list:
            the_subcategory = DatasetSubcategory(name=row['rei_name'], fk_dst_cat_id=the_category)
            the_subcategory.save()
            newdataset = Dataset(
                name=row['rei_name'],
                description='This is a dataset imported by the automated fetcher',
                namespace='gbd_risk',
                fk_dst_cat_id=the_category,
                fk_dst_subcat_id=the_subcategory)
            newdataset.save()
            dataset_name_to_object[row['rei_name']] = newdataset
            new_datasets_list.append(newdataset)
            newsource = Source(
                name=row['rei_name'],
                description=json.dumps(source_description),
                datasetId=newdataset.pk)
            newsource.save()
            source_name_to_object[row['rei_name']] = newsource
            existing_subcategories = DatasetSubcategory.objects.filter(fk_dst_cat_id=the_category.pk).values('name')
            existing_subcategories_list = {item['name'] for item in existing_subcategories}
        else:
            if row['rei_name'] not in dataset_name_to_object:
                newdataset = Dataset.objects.get(name=row['rei_name'], fk_dst_cat_id=the_category)
                dataset_name_to_object[row['rei_name']] = newdataset
                existing_datasets_list.append(newdataset)
                newsource = Source.objects.get(name=row['rei_name'], datasetId=newdataset.pk)
                newsource.description = json.dumps(source_description)
                newsource.save()
if indicator_code in category_vars[category]:
    if not global_cat[indicator_code]['saved']:
        source_description['additionalInfo'] = "Definitions and characteristics of countries and other territories: " + "https://ourworldindata.org" + reverse("servewdicountryinfo") + "\n"
        source_description['additionalInfo'] += "Limitations and exceptions:\n" + global_cat[indicator_code]['limitations'] + "\n" if global_cat[indicator_code]['limitations'] else ''
        source_description['additionalInfo'] += "Notes from original source:\n" + global_cat[indicator_code]['sourcenotes'] + "\n" if global_cat[indicator_code]['sourcenotes'] else ''
        source_description['additionalInfo'] += "General comments:\n" + global_cat[indicator_code]['comments'] + "\n" if global_cat[indicator_code]['comments'] else ''
        source_description['additionalInfo'] += "Statistical concept and methodology:\n" + global_cat[indicator_code]['concept'] + "\n" if global_cat[indicator_code]['concept'] else ''
        source_description['additionalInfo'] += "Related source links:\n" + global_cat[indicator_code]['sourcelinks'] + "\n" if global_cat[indicator_code]['sourcelinks'] else ''
        source_description['additionalInfo'] += "Other web links:\n" + global_cat[indicator_code]['weblinks'] + "\n" if global_cat[indicator_code]['weblinks'] else ''
        source_description['dataPublisherSource'] = global_cat[indicator_code]['source']
        if 'iea.org' in json.dumps(source_description).lower() or 'iea stat' in json.dumps(source_description).lower() or 'iea 2014' in json.dumps(source_description).lower():
            source_description['dataPublishedBy'] = 'International Energy Agency (IEA) via The World Bank'
        else:
            source_description['dataPublishedBy'] = 'World Bank – World Development Indicators'
        newsource = Source(
            name='World Bank – WDI: ' + global_cat[indicator_code]['name'],
            description=json.dumps(source_description),
            datasetId=newdataset)
        newsource.save()
        logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
        s_unit = extract_short_unit(global_cat[indicator_code]['unitofmeasure'])
        newvariable = Variable(
            name=global_cat[indicator_code]['name'],
            unit=global_cat[indicator_code]['unitofmeasure'] if global_cat[indicator_code]['unitofmeasure'] else '',
            short_unit=s_unit,
            description=global_cat[indicator_code]['description'],
            code=indicator_code,
            timespan='1960-' + str(last_available_year),
            datasetId=newdataset,
            sourceId=newsource)  # rewrite removed VariableType
        newvariable.save()
        logger.info("Inserting a variable %s." % newvariable.name.encode('utf8'))
        global_cat[indicator_code]['variable_object'] = newvariable
        global_cat[indicator_code]['saved'] = True
    else:
        newvariable = global_cat[indicator_code]['variable_object']
    for i in range(0, len(data_values)):
        data_values_tuple_list.append((data_values[i]['value'], data_values[i]['year'], country_name_entity_ref[country_code].pk, newvariable.pk))
    if len(data_values_tuple_list) > 3000:  # insert when the length of the list goes over 3000
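# The body of the over-3000 flush above is cut off in these fragments. A minimal
# sketch of what it presumably does, reusing the insert_string / executemany()
# pattern the FAOSTAT importer further below uses for its own leftover flush
# (flush_data_values is a hypothetical name, not from the source):
from django.db import connection

insert_string = 'INSERT into data_values (value, year, fk_ent_id, fk_var_id) VALUES (%s, %s, %s, %s)'

def flush_data_values(data_values_tuple_list):
    # bulk-insert the queued (value, year, entity id, variable id) tuples
    with connection.cursor() as c:
        c.executemany(insert_string, data_values_tuple_list)
    del data_values_tuple_list[:]  # empty the batch in place so callers keep their reference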
if cell.value or cell.value == 0:
    data_values.append({'value': cell.value, 'year': columns_to_years[column_number]})
if column_number > 4 and column_number == last_available_column:
    if len(data_values):
        if indicator_code in category_vars[category]:
            if not global_cat[indicator_code]['saved']:
                source_description['additionalInfo'] = "Definitions and characteristics of countries and other territories: " + "https://ourworldindata.org" + reverse("servepovstatscountryinfo") + "\n"
                source_description['additionalInfo'] += "Limitations and exceptions:\n" + global_cat[indicator_code]['limitations'] + "\n" if global_cat[indicator_code]['limitations'] else ''
                source_description['additionalInfo'] += "Notes from original source:\n" + global_cat[indicator_code]['sourcenotes'] + "\n" if global_cat[indicator_code]['sourcenotes'] else ''
                source_description['additionalInfo'] += "General comments:\n" + global_cat[indicator_code]['comments'] + "\n" if global_cat[indicator_code]['comments'] else ''
                source_description['additionalInfo'] += "Statistical concept and methodology:\n" + global_cat[indicator_code]['concept'] + "\n" if global_cat[indicator_code]['concept'] else ''
                source_description['additionalInfo'] += "Related source links:\n" + global_cat[indicator_code]['sourcelinks'] + "\n" if global_cat[indicator_code]['sourcelinks'] else ''
                source_description['additionalInfo'] += "Other web links:\n" + global_cat[indicator_code]['weblinks'] + "\n" if global_cat[indicator_code]['weblinks'] else ''
                source_description['dataPublisherSource'] = global_cat[indicator_code]['source']
                newsource = Source(
                    name='World Bank Poverty and Equity database: ' + global_cat[indicator_code]['name'],
                    description=json.dumps(source_description),
                    datasetId=newdataset.pk)
                newsource.save()
                logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
                s_unit = short_unit_extract(global_cat[indicator_code]['unitofmeasure'])
                newvariable = Variable(
                    name=global_cat[indicator_code]['name'],
                    unit=global_cat[indicator_code]['unitofmeasure'] if global_cat[indicator_code]['unitofmeasure'] else '',
                    short_unit=s_unit,
                    description=global_cat[indicator_code]['description'],
                    code=indicator_code,
                    timespan='',
                    datasetId=newdataset,
                    variableTypeId=VariableType.objects.get(pk=4),
                    sourceId=newsource)
                newvariable.save()
                logger.info("Inserting a variable %s." % newvariable.name.encode('utf8'))
                global_cat[indicator_code]['variable_object'] = newvariable
                global_cat[indicator_code]['saved'] = True
            else:
                newvariable = global_cat[indicator_code]['variable_object']
            for i in range(0, len(data_values)):
                data_values_tuple_list.append((data_values[i]['value'], data_values[i]['year'], country_name_entity_ref[country_code].pk, newvariable.pk))
            if len(data_values_tuple_list) > 3000:  # insert when the length of the list goes over 3000
if row_number == 18:
    if not dataset_saved:
        newdataset = Dataset(
            name='UN WPP - %s' % dataset_name,
            description='This is a dataset imported by the automated fetcher',
            namespace='unwpp',
            fk_dst_cat_id=the_category,
            fk_dst_subcat_id=the_subcategory)
        newdataset.save()
        dataset_saved = True
        newsource = Source(
            name='UN WPP - %s' % dataset_name,
            description=source_template % (dataset_info['description']),
            datasetId=newdataset.pk)
        newsource.save()
    if not variables_saved:
        for columnnum, varname in var_to_add_dict.items():
            if '(' not in varname:
                unit_of_measure = ''
            else:
                unit_of_measure = varname[varname.index('('):varname.index(')') + 1].replace('(', '').replace(')', '')
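# short_unit_extract() is called by several fragments in this section but its
# body is not shown here. A minimal sketch, assuming it only needs to pass
# through symbols such as '%' and already-short unit names (the length
# threshold and the common_short_units list are assumptions, not from the
# source):
def short_unit_extract(unit: str):
    common_short_units = ['$', '£', '€', '%']
    short_unit = None
    if unit:
        if len(unit) <= 5:
            short_unit = unit
        elif any(each in unit for each in common_short_units):
            # keep just the symbol when the unit is a longer phrase like "% of GDP"
            short_unit = next(each for each in common_short_units if each in unit)
    return short_unit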
    new_datasets_list.append(newdataset)
    existing_subcategories_list.add(dataset_to_category[varname])
else:
    the_subcategory = DatasetSubcategory.objects.get(
        name=dataset_to_category[varname], fk_dst_cat_id=the_category)
    newdataset = Dataset.objects.get(
        name='Clio-Infra - %s' % the_subcategory.name, namespace='clioinfra')
newsource = Source(
    name=varname,
    description=source_template % (
        newdataset.name,
        filename_to_pagelink[one_file],
        filename_to_pagelink[one_file],
    ),
    datasetId=newdataset.pk)
newsource.save()
newvariable = Variable(
    name=varname,
    unit=varunit if varunit else '',
    short_unit=short_unit_extract(varunit),
    description='',
    code=filename_to_pagelink[one_file][filename_to_pagelink[one_file].rfind('/') + 1:],
    timespan='',
    fk_dst_id=newdataset,
        for item in existing_subcategories
    }
else:
    if subcategory_name not in dataset_name_to_object:
        newdataset = Dataset.objects.get(name=subcategory_name, categoryId=the_category)
        dataset_name_to_object[subcategory_name] = newdataset
        existing_datasets_list.append(newdataset)
if row_number == 1:
    source_name = row['Indicator']
    if source_name.lower() not in source_name_to_object:
        newsource = Source(
            name=source_name,
            description=json.dumps(source_description),
            datasetId=dataset_name_to_object[subcategory_name].pk)
        newsource.save()
        source_name_to_object[source_name.lower()] = newsource
    else:
        newsource = Source.objects.get(
            name=source_name,
            datasetId__in=[x.pk for x in Dataset.objects.filter(namespace='who_gho')])
        newsource.description = json.dumps(source_description)
        newsource.save()
if cell.value or cell.value == 0:
    data_values.append({'value': cell.value, 'year': columns_to_years[column_number]})
if column_number > 4 and column_number == last_available_column:
    if len(data_values):
        if indicator_code in category_vars[category]:
            if not global_cat[indicator_code]['saved']:
                source_description['additionalInfo'] = "Definitions and characteristics of countries and other territories: " + "https://ourworldindata.org" + reverse("servehnpstatscountryinfo") + "\n"
                source_description['additionalInfo'] += "Limitations and exceptions:\n" + global_cat[indicator_code]['limitations'] + "\n" if global_cat[indicator_code]['limitations'] else ''
                source_description['additionalInfo'] += "Notes from original source:\n" + global_cat[indicator_code]['sourcenotes'] + "\n" if global_cat[indicator_code]['sourcenotes'] else ''
                source_description['additionalInfo'] += "General comments:\n" + global_cat[indicator_code]['comments'] + "\n" if global_cat[indicator_code]['comments'] else ''
                source_description['additionalInfo'] += "Statistical concept and methodology:\n" + global_cat[indicator_code]['concept'] + "\n" if global_cat[indicator_code]['concept'] else ''
                source_description['additionalInfo'] += "Related source links:\n" + global_cat[indicator_code]['sourcelinks'] + "\n" if global_cat[indicator_code]['sourcelinks'] else ''
                source_description['additionalInfo'] += "Other web links:\n" + global_cat[indicator_code]['weblinks'] + "\n" if global_cat[indicator_code]['weblinks'] else ''
                source_description['dataPublisherSource'] = global_cat[indicator_code]['source']
                newsource = Source(
                    name='World Bank Health Nutrition and Population Statistics: ' + global_cat[indicator_code]['name'],
                    description=json.dumps(source_description),
                    datasetId=newdataset.pk)
                newsource.save()
                logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
                s_unit = short_unit_extract(global_cat[indicator_code]['unitofmeasure'])
                newvariable = Variable(
                    name=global_cat[indicator_code]['name'],
                    unit=global_cat[indicator_code]['unitofmeasure'] if global_cat[indicator_code]['unitofmeasure'] else '',
                    short_unit=s_unit,
                    description=global_cat[indicator_code]['description'],
                    code=indicator_code,
                    timespan='1960-' + str(last_available_year),
                    fk_dst_id=newdataset,
                    fk_var_type_id=VariableType.objects.get(pk=4),
                    sourceId=newsource)
                newvariable.save()
                logger.info("Inserting a variable %s." % newvariable.name.encode('utf8'))
                global_cat[indicator_code]['variable_object'] = newvariable
                global_cat[indicator_code]['saved'] = True
            else:
                newvariable = global_cat[indicator_code]['variable_object']
            for i in range(0, len(data_values)):
                data_values_tuple_list.append((data_values[i]['value'], data_values[i]['year'], country_name_entity_ref[country_code].pk, newvariable.pk))
            if len(data_values_tuple_list) > 3000:  # insert when the length of the list goes over 3000
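# The World Bank fragments above index columns_to_years[column_number] and stop
# at last_available_column, but the code that builds those is outside this
# section. A plausible sketch, assuming openpyxl-style cells and year headers
# starting after column 4 (as the column_number > 4 checks suggest; header_row
# is a hypothetical name):
columns_to_years = {}
last_available_column = None
for column_number, cell in enumerate(header_row, start=1):
    try:
        year = int(cell.value)  # non-year headers (None, text labels) fall through
    except (TypeError, ValueError):
        continue
    columns_to_years[column_number] = year
    last_available_column = column_number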
for varcode, vardata in qog_vars.items():
    source_name = varcode[:varcode.index('_') + 1]
    if source_name in saved_sources:
        if vardata['category'] not in saved_sources[source_name]:
            source_description['additionalInfo'] = qog_sources[source_name]['description']
            source_description['link'] = "http://qog.pol.gu.se/data"
            source_description['link'] += ", " + qog_sources[source_name]['url'] if qog_sources[source_name]['url'] else ""
            source_description['dataPublisherSource'] = qog_sources[source_name]['name']
            newsource = Source(
                name='%s via the Quality of Government dataset' % (qog_sources[source_name]['name']),
                description=json.dumps(source_description),
                datasetId=datasets_ref_models[vardata['category']].pk)
            # in the metadata file, some of the sources have the same name, but are treated as different sources
            # so if we see a source with the same name in the same category, we switch to using the original dataset name
            try:
                with transaction.atomic():
                    newsource.save()
            except django.db.utils.IntegrityError:
                newsource.name = '%s via the Quality of Government dataset' % (qog_sources[source_name]['original_dataset'])
                newsource.save()
            logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
            saved_sources[source_name].update({vardata['category']: newsource})
            newdataset.name.encode('utf8'))
datasets_ref_models[category] = newdataset

saved_sources = {}
# variables coming from one source don't all fall into one dataset
# so we need to save the source info for each dataset where the source's variables are present
for varcode, vardata in qog_vars.items():
    source_name = varcode[:varcode.index('_') + 1]
    if source_name in saved_sources:
        if vardata['category'] not in saved_sources[source_name]:
            newsource = Source(
                name='%s via the Quality of Government dataset' % (qog_sources[source_name]['name']),
                description=source_template % (
                    qog_sources[source_name]['description'],
                    qog_sources[source_name]['name'],
                    qog_sources[source_name]['original_dataset'],
                    qog_sources[source_name]['url'],
                    qog_sources[source_name]['url']),
                datasetId=datasets_ref_models[vardata['category']].pk)
            # in the metadata file, some of the sources have the same name, but are treated as different sources
            # so if we see a source with the same name in the same category, we switch to using the original dataset name
            try:
                with transaction.atomic():
                    newsource.save()
            except django.db.utils.IntegrityError:
                newsource.name = '%s via the Quality of Government dataset' % (qog_sources[source_name]['original_dataset'])
                newsource.save()
            logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
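# Why newsource.save() above is wrapped in transaction.atomic(): in Django, a
# database error raised mid-transaction can (notably on PostgreSQL) poison the
# enclosing transaction until it is rolled back. Running the risky save() in
# its own atomic block confines the rollback to a savepoint, so the
# rename-and-retry in the except branch still works. The same pattern in
# isolation (save_with_fallback is a hypothetical name):
import django.db.utils
from django.db import transaction

def save_with_fallback(source, fallback_name):
    try:
        with transaction.atomic():
            source.save()
    except django.db.utils.IntegrityError:
        # duplicate name under the unique constraint: retry with the fallback name
        source.name = fallback_name
        source.save()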
        })
except ValueError:
    pass
if column_number > 4 and column_number == last_available_column:
    if len(data_values):
        if indicator_code in category_vars[category]:
            if not global_cat[indicator_code]['saved']:
                source_description['additionalInfo'] = None
                source_description['dataPublisherSource'] = global_cat[indicator_code]['source']
                newsource = Source(
                    name='World Bank Climate Change Data: ' + global_cat[indicator_code]['name'],
                    description=json.dumps(source_description),
                    datasetId=newdataset.pk)
                newsource.save()
                logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
                if global_cat[indicator_code]['unitofmeasure']:
                    if len(global_cat[indicator_code]['unitofmeasure']) < 40:
                        unit_of_measure = global_cat[indicator_code]['unitofmeasure']
                    else:
if cell.value or cell.value == 0:
    data_values.append({'value': cell.value, 'year': columns_to_years[column_number]})
if column_number > 4 and column_number == last_available_column:
    if len(data_values):
        if indicator_code in category_vars[category]:
            if not global_cat[indicator_code]['saved']:
                source_description['additionalInfo'] = "Definitions and characteristics of countries and other territories: " + "https://ourworldindata.org" + reverse("servebbsccountryinfo") + "\n"
                source_description['additionalInfo'] += "Limitations and exceptions:\n" + global_cat[indicator_code]['limitations'] + "\n" if global_cat[indicator_code]['limitations'] else ''
                source_description['additionalInfo'] += "Notes from original source:\n" + global_cat[indicator_code]['sourcenotes'] + "\n" if global_cat[indicator_code]['sourcenotes'] else ''
                source_description['additionalInfo'] += "General comments:\n" + global_cat[indicator_code]['comments'] + "\n" if global_cat[indicator_code]['comments'] else ''
                source_description['additionalInfo'] += "Statistical concept and methodology:\n" + global_cat[indicator_code]['concept'] + "\n" if global_cat[indicator_code]['concept'] else ''
                source_description['additionalInfo'] += "Related source links:\n" + global_cat[indicator_code]['sourcelinks'] + "\n" if global_cat[indicator_code]['sourcelinks'] else ''
                source_description['additionalInfo'] += "Other web links:\n" + global_cat[indicator_code]['weblinks'] + "\n" if global_cat[indicator_code]['weblinks'] else ''
                source_description['dataPublisherSource'] = global_cat[indicator_code]['source']
                newsource = Source(
                    name='World Bank Data on Statistical Capacity: ' + global_cat[indicator_code]['name'],
                    description=json.dumps(source_description),
                    datasetId=newdataset.pk)
                newsource.save()
                logger.info("Inserting a source %s." % newsource.name.encode('utf8'))
                s_unit = short_unit_extract(global_cat[indicator_code]['unitofmeasure'])
                newvariable = Variable(
                    name=global_cat[indicator_code]['name'],
                    unit=global_cat[indicator_code]['unitofmeasure'] if global_cat[indicator_code]['unitofmeasure'] else '',
                    short_unit=s_unit,
                    description=global_cat[indicator_code]['description'],
                    code=indicator_code,
                    timespan='',
                    datasetId=newdataset,
                    variableTypeId=VariableType.objects.get(pk=4),
                    sourceId=newsource)
                newvariable.save()
                logger.info("Inserting a variable %s." % newvariable.name.encode('utf8'))
                global_cat[indicator_code]['variable_object'] = newvariable
                global_cat[indicator_code]['saved'] = True
            else:
                newvariable = global_cat[indicator_code]['variable_object']
            for i in range(0, len(data_values)):
                data_values_tuple_list.append((data_values[i]['value'], data_values[i]['year'], country_name_entity_ref[country_code].pk, newvariable.pk))
            if len(data_values_tuple_list) > 3000:  # insert when the length of the list goes over 3000
def process_csv_file_insert(filename_to_process: str, original_filename: str):
    print('Processing: %s' % original_filename)

    global unique_data_tracker
    global datasets_list

    current_file_vars_countries = set()  # keeps track of variables+countries we saw in the current file
    current_file_var_codes = set()
    current_file_var_names = set()
    previous_row = tuple()

    # inserting a subcategory
    if file_to_category_dict[original_filename] not in existing_subcategories_list:
        the_subcategory = DatasetSubcategory(
            name=file_to_category_dict[original_filename],
            fk_dst_cat_id=the_category)
        the_subcategory.save()
        existing_subcategories_list.add(file_to_category_dict[original_filename])
    else:
        the_subcategory = DatasetSubcategory.objects.get(
            name=file_to_category_dict[original_filename])

    # this is used for constructing the query for mass inserting to the data_values table
    insert_string = 'INSERT into data_values (value, year, fk_ent_id, fk_var_id) VALUES (%s, %s, %s, %s)'
    data_values_tuple_list = []

    # inserting a dataset
    newdataset = Dataset(
        name='%s: %s' % (file_to_category_dict[original_filename],
                         file_dataset_names[original_filename]),
        description='This is a dataset imported by the automated fetcher',
        namespace='faostat',
        fk_dst_cat_id=the_category,
        fk_dst_subcat_id=the_subcategory)
    newdataset.save()
    datasets_list.append(newdataset)

    # reading source information from a csv file in metadata_dir
    metadata_file_path = os.path.join(
        metadata_dir, os.path.splitext(original_filename)[0] + ".csv")
    data_published_by = 'Food and Agriculture Organization of the United Nations (FAO)'
    data_publishers_source = ''
    additional_information = ''
    variable_description = ''
    if os.path.isfile(metadata_file_path):
        with open(metadata_file_path, encoding='latin-1') as metadatacsv:
            metadatareader = csv.DictReader(metadatacsv)
            metadatacolumns = tuple(metadatareader.fieldnames)
            for row in metadatareader:
                if row['Subsection Code'] == '1.1':
                    data_published_by = row['Metadata']
                if row['Subsection Code'] == '3.1':
                    variable_description = row['Metadata']
                if row['Subsection Code'] == '3.4':
                    additional_information = row['Metadata']
                if row['Subsection Code'] == '20.1':
                    data_publishers_source = row['Metadata']

    # inserting a dataset source
    newsource = Source(
        name=file_dataset_names[original_filename],
        description=source_template % (
            file_dataset_names[original_filename],
            data_published_by,
            data_publishers_source,
            additional_information),
        datasetId=newdataset.pk)
    newsource.save()

    existing_fao_variables = Variable.objects.filter(
        fk_dst_id__in=Dataset.objects.filter(namespace='faostat'))
    existing_fao_variables_dict = {}
    for each in existing_fao_variables:
        existing_fao_variables_dict[each.name] = each

    with open(filename_to_process, encoding='latin-1') as currentfile:
        currentreader = csv.DictReader(currentfile)
        filecolumns = tuple(currentreader.fieldnames)

        # these column types are very similar
        if filecolumns == column_types[0] or filecolumns == column_types[1] \
                or filecolumns == column_types[2] or filecolumns == column_types[3] \
                or filecolumns == column_types[4]:
            for row in currentreader:
                if filecolumns == column_types[0]:
                    countryname = row['Area']
                    variablename = row['Item']
                    variablecode = row['Item Code']
                if filecolumns == column_types[1]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['ItemCode'], row['ElementCode'])
                if filecolumns == column_types[2]:
                    countryname = row['Area']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['Item Code'], row['Element Code'])
                if filecolumns == column_types[3]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Item'], row['Element'])
                    variablecode = '%s - %s' % (row['Item Code'], row['Element Code'])
                if filecolumns == column_types[4]:
                    countryname = row['Country']
                    variablename = '%s - %s' % (row['Indicator'], row['Source'])
                    variablecode = '%s - %s' % (row['Indicator Code'], row['Source Code'])
                if original_filename == 'Emissions_Agriculture_Energy_E_All_Data_(Norm).zip':
                    variablename += ' - %s' % row['Unit']
                if original_filename == 'Production_LivestockPrimary_E_All_Data_(Normalized).zip':
                    variablename += ' - %s' % row['Unit']
                if original_filename == 'Trade_LiveAnimals_E_All_Data_(Normalized).zip':
                    variablename += ' - %s' % row['Unit']
                # avoiding duplicate rows
                if original_filename == 'Inputs_Pesticides_Use_E_All_Data_(Normalized).zip':
                    if row['Item Code'] not in current_file_var_codes and row['Item'] not in current_file_var_names:
                        current_file_var_codes.add(row['Item Code'])
                        current_file_var_names.add(row['Item'])
                    elif row['Item Code'] in current_file_var_codes and row['Item'] in current_file_var_names:
                        pass
                    else:
                        continue
                # avoiding duplicate rows
                if original_filename == 'FoodBalanceSheets_E_All_Data_(Normalized).csv':
                    if tuple(row) == previous_row:
                        previous_row = tuple(row)
                        continue
                    else:
                        previous_row = tuple(row)
                try:
                    year = int(row['Year'])
                    value = float(row['Value'])
                except ValueError:
                    year = False
                    value = False
                variablename = file_dataset_names[original_filename] + ': ' + variablename
                current_file_vars_countries.add(tuple([countryname, variablecode]))
                process_one_row(year, value, countryname, variablecode, variablename,
                                existing_fao_variables_dict, row['Unit'], newsource,
                                newdataset, variable_description, data_values_tuple_list)
            unique_data_tracker.update(current_file_vars_countries)

        # these are the files that require several iterations over all rows
        if filecolumns == column_types[5] or filecolumns == column_types[6] or filecolumns == column_types[7]:
            if filecolumns == column_types[5]:
                iterations = [
                    {'country_field': 'Donor Country', 'varname_format': '%s - Donors'},
                    {'country_field': 'Recipient Country', 'varname_format': '%s - Recipients'}]
            if filecolumns == column_types[6]:
                iterations = [
                    {'country_field': 'Reporter Countries', 'varname_format': '%s - %s - Reporters'},
                    {'country_field': 'Partner Countries', 'varname_format': '%s - %s - Partners'}]
            if filecolumns == column_types[7]:
                iterations = [
                    {'country_field': 'Donor', 'varname_format': '%s - %s - Donors'},
                    {'country_field': 'Recipient Country', 'varname_format': '%s - %s - Recipients'}]
            for oneiteration in iterations:
                file_stream_holder = {}  # we will break down these files into smaller files
                dict_writer_holder = {}
                separate_files_names = {}  # we will keep the filenames in this dict
                unique_vars = []
                # first we collect all variable names
                currentfile.seek(0)
                row_counter = 0
                for row in currentreader:
                    if row['Year'] == 'Year':
                        continue
                    row_counter += 1
                    if row_counter % 300 == 0:
                        time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                    if filecolumns == column_types[5]:
                        variablename = oneiteration['varname_format'] % row['Item']
                    if filecolumns == column_types[6]:
                        variablename = oneiteration['varname_format'] % (row['Item'], row['Element'])
                    if filecolumns == column_types[7]:
                        variablename = oneiteration['varname_format'] % (row['Item'], row['Purpose'])
                    if variablename not in unique_vars:
                        unique_vars.append(variablename)
                # then we break the dataset into files named after the variable names
                for varname in unique_vars:
                    separate_files_names[varname.replace('/', '+') + '.csv'] = varname
                    file_stream_holder[varname] = open(
                        os.path.join('/tmp', varname.replace('/', '+') + '.csv'),
                        'w+', encoding='latin-1')
                    dict_writer_holder[varname] = csv.DictWriter(
                        file_stream_holder[varname],
                        fieldnames=['Country', 'Variable', 'Varcode', 'Year', 'Unit', 'Value'])
                    dict_writer_holder[varname].writeheader()
                # go back to the beginning of the file
                currentfile.seek(0)
                row_counter = 0
                for row in currentreader:
                    if row['Year'] == 'Year':
                        continue
                    row_counter += 1
                    if row_counter % 300 == 0:
                        time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                    if filecolumns == column_types[5]:
                        variablename = oneiteration['varname_format'] % row['Item']
                        variablecode = row['Item Code']
                        dict_writer_holder[variablename].writerow({
                            'Country': row[oneiteration['country_field']],
                            'Variable': variablename,
                            'Varcode': variablecode,
                            'Unit': row['Unit'],
                            'Year': row['Year'],
                            'Value': row['Value']
                        })
                    if filecolumns == column_types[6]:
                        variablename = oneiteration['varname_format'] % (row['Item'], row['Element'])
                        variablecode = '%s - %s' % (row['Item Code'], row['Element Code'])
                        dict_writer_holder[variablename].writerow({
                            'Country': row[oneiteration['country_field']],
                            'Variable': variablename,
                            'Varcode': variablecode,
                            'Unit': row['Unit'],
                            'Year': row['Year'],
                            'Value': row['Value']
                        })
                    if filecolumns == column_types[7]:
                        variablename = oneiteration['varname_format'] % (row['Item'], row['Purpose'])
                        variablecode = '%s - %s' % (row['Item Code'], row['Purpose Code'])
                        dict_writer_holder[variablename].writerow({
                            'Country': row[oneiteration['country_field']],
                            'Variable': variablename,
                            'Varcode': variablecode,
                            'Unit': row['Unit'],
                            'Year': row['Year'],
                            'Value': row['Value']
                        })
                    if row_counter % 100000 == 0:
                        for fileholder, actual_file in file_stream_holder.items():
                            actual_file.flush()
                            os.fsync(actual_file.fileno())
                for fileholder, actual_file in file_stream_holder.items():
                    actual_file.close()
                # now parsing and importing each file individually
                for each_separate_file, file_variable_name in separate_files_names.items():
                    unique_records_holder = {}
                    with open('/tmp/%s' % each_separate_file, encoding='latin-1') as separate_file:
                        separate_file_reader = csv.DictReader(separate_file)
                        row_counter = 0
                        for row in separate_file_reader:
                            row_counter += 1
                            if row_counter % 300 == 0:
                                time.sleep(0.001)  # this is done in order to not keep the CPU busy all the time
                            countryname = row['Country']
                            variablecode = row['Varcode']
                            variableunit = row['Unit']
                            year = row['Year']
                            value = row['Value']
                            try:
                                year = int(year)
                                value = float(value)
                            except ValueError:
                                year = False
                                value = False
                            if year is not False and value is not False:
                                unique_record = tuple([countryname, year])
                                if unique_record not in unique_records_holder:
                                    unique_records_holder[unique_record] = value
                                else:
                                    unique_records_holder[unique_record] += value
                    for key, value in unique_records_holder.items():
                        variablename = file_dataset_names[original_filename] + ': ' + file_variable_name
                        process_one_row(list(key)[1], str(value), list(key)[0], variablecode,
                                        variablename, existing_fao_variables_dict, variableunit,
                                        newsource, newdataset, variable_description,
                                        data_values_tuple_list)
                    os.remove('/tmp/%s' % each_separate_file)

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)
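# process_one_row() is called throughout the FAOSTAT importer above but defined
# elsewhere. A rough sketch of what it presumably does, inferred from its
# argument list and from the variable-caching / tuple-queueing pattern the
# other importers in this section use (country_name_entity_ref and the field
# names are assumptions carried over from those fragments):
def process_one_row(year, value, countryname, variablecode, variablename,
                    existing_fao_variables_dict, unit, source, dataset,
                    var_desc, data_values_tuple_list):
    if year is False or value is False:
        return  # callers encode unparseable Year/Value fields as False
    # reuse a previously saved variable, or create and cache a new one
    if variablename not in existing_fao_variables_dict:
        newvariable = Variable(
            name=variablename, unit=unit if unit else '',
            short_unit=short_unit_extract(unit), description=var_desc,
            code=variablecode, timespan='', fk_dst_id=dataset, sourceId=source)
        newvariable.save()
        existing_fao_variables_dict[variablename] = newvariable
    # queue the value for the next bulk executemany() flush
    data_values_tuple_list.append(
        (value, year, country_name_entity_ref[countryname].pk,
         existing_fao_variables_dict[variablename].pk))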
if sourcedata['classif1_code']:
    varcode += ' ' + sourcedata['classif1_code']
if sourcedata['classif2_code']:
    varcode += ' ' + sourcedata['classif2_code']
if sourcedata['sex_code']:
    varcode += ' ' + sourcedata['sex_code']
varcode_for_reference = varcode
# source_description['additionalInfo'] = '\n'.join(list(sourcedata['note_indicator'])) + '\n'.join(list(sourcedata['note_source']))
if not source_description['additionalInfo']:
    source_description['additionalInfo'] = None
newsource = Source(
    name='%s %s: %s' % ('ILOSTAT', sourcedata['category'], variable_name),
    description=json.dumps(source_description),
    datasetId=Dataset.objects.get(name=file_name_to_category[row['indicator']], namespace='ilostat').pk)
newsource.save()
if '(' in the_indicator_label and ')' in the_indicator_label:
    varunit = the_indicator_label[the_indicator_label.index('('):-1].replace('(', '').replace(')', '')
newvariable = Variable(
    name=variable_name,
    unit=varunit if varunit else '',
    short_unit=short_unit_extract(varunit),
    description='See concepts and methods provided by ILOSTAT at http://www.ilo.org/ilostat/faces/ilostat-home/metadata',
    code=varcode_for_reference,
    timespan='',
    datasetId=Dataset.objects.get(name=file_name_to_category[row['indicator']], namespace='ilostat'),
    variableTypeId=VariableType.objects.get(pk=4),
    sourceId=newsource)