def test_csv(self):
    """Write a small table to ``test.csv`` through ``CsvWriter.write``.

    The table has four rows: the third row repeats the first, and the last
    row holds ``int`` cells where the others hold ``str`` cells. Nothing is
    asserted about the written file.
    """
    header = ['a', 'b', 'c']
    rows = [
        header,
        ['1', '3', '2'],
        list(header),  # separate list object with the same content
        [1, 2, 3],
    ]
    CsvWriter.write('test.csv', rows)
def main(argv):
    """Export the village reservation table to ``Sampling.csv``.

    Calls ``SamplingVillageIds.prepare_reservation()`` and then writes one
    row per entry of ``SamplingVillageIds.village_id_to_reservation``: the
    village id followed by the first two items of its value, under the
    header ``villageid, gender, caste``.

    Args:
        argv: Unused.
    """
    SamplingVillageIds.prepare_reservation()

    header = ['villageid', 'gender', 'caste']
    reservations = SamplingVillageIds.village_id_to_reservation
    rows = [header] + [
        [village_id, reservation[0], reservation[1]]
        for village_id, reservation in reservations.items()
    ]

    CsvWriter.write(
        'C:/Data_PoloFr/scrap-python-indian-gov/src/scripts_final_merge/csv_files_merged/Sampling.csv',
        rows)
def main(argv):
    """Normalise the header row of every corrected survey CSV, in place.

    For each survey prefix / suffix combination the matching file under
    ``csv_files_corrected`` is read, every cell of its first row is
    lower-cased and has ``'gps-'`` replaced by ``'gps'``, and the file is
    rewritten at the same path through ``CsvWriter.write``. Data rows are
    passed through untouched.

    Args:
        argv: Unused.

    Raises:
        Exception: If one of the expected files does not exist.
    """
    file_prefixes = [
        'Gram_Sevak_Survey_', 'Group_Survey_', 'Notable_Survey_',
        'Sarpanch_Survey_', 'Upa_Sarpanch_Survey_'
    ]
    file_suffixes = ['1', '2', '2_bis', '3', '4']

    for file_prefix in file_prefixes:
        for file_suffix in file_suffixes:
            file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/src/scripts_final_merge/csv_files_corrected/{file_prefix}{file_suffix}.csv'
            if not os.path.isfile(file_path):
                raise Exception(f'{file_path} is not valid ')

            # newline='' hands newline handling to the csv module, so line
            # breaks embedded in quoted cells survive the round trip.
            with open(file_path, 'r', encoding='utf-8',
                      newline='') as original:
                result_lines = list(csv.reader(original, delimiter=','))

            if result_lines:
                # Only the header row is normalised.
                result_lines[0] = [
                    col.lower().replace('gps-', 'gps')
                    for col in result_lines[0]
                ]

            # The input file is closed by now, so overwriting it is safe.
            CsvWriter.write(file_path, result_lines)
def _locate_survey_columns(header, district_column_name,
                           villageid_column_name, villagename_column_name):
    """Return the (district, villageid, villagename, instanceid) positions in *header*."""
    return (
        Helper.find_column_position(header, district_column_name),
        Helper.find_column_position(header, villageid_column_name),
        Helper.find_column_position(header, villagename_column_name),
        Helper.find_column_position(header, 'instanceid'),
    )


def main(argv):
    """Back-fill missing village ids in the Sarpanch survey exports.

    Phase 1 builds an ``instanceid -> {district, villagename, villageid}``
    lookup from several files that already carry village ids; the first
    occurrence of an instance id wins.

    Phase 2 walks ``csv_files/Sarpanch_Survey_<suffix>.csv``. Rows that
    already have a village id are kept as they are. For a row without one,
    the lookup is consulted: when the instance id is known its village id is
    written into the row (a note is printed when district / village name
    disagree with the lookup); when it is unknown a message is printed and
    ``SamplingVillageIds.find_best_match`` is called. The rows are then
    written to the sibling ``csv_files_corrected`` directory.

    Args:
        argv: Unused.

    Raises:
        Exception: If a survey file is missing, or wrapping any error raised
            while processing one survey file (the original error is chained
            as ``__cause__``).
    """
    district_column_name = 'q6'
    villageid_column_name = 'villageid'
    villagename_column_name = 'q1'

    known_by_instanceid = {}
    files_with_ids = [
        'C:/Data_PoloFr/scrap-python-indian-gov/src/scripts_final_merge/with_village_id/Sarpanch_Group_merged.csv',
        'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/ahmednagar/Sarpanch Survey_WIDE.csv',
        'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/ahmednagar/Sarpanch Survey_WIDE (1).csv',
        'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Sarpanch_Survey_Merged_20210824.csv'
    ]
    for file_with_ids in files_with_ids:
        # newline='' lets the csv module do its own newline handling.
        with open(file_with_ids, 'r', encoding='utf-8',
                  newline='') as original:
            reader = csv.reader(original, delimiter=',')
            header = next(reader, None)
            if header is None:
                continue  # empty file: nothing to learn from it
            (district_column_pos, villageid_column_pos,
             villagename_column_pos,
             instanceid_column_pos) = _locate_survey_columns(
                 header, district_column_name, villageid_column_name,
                 villagename_column_name)

            for line in reader:
                instanceid = line[instanceid_column_pos]
                # Rows without an instance id or a village id cannot help.
                if not instanceid or not line[villageid_column_pos]:
                    continue
                # Duplicates are ignored: the first occurrence wins.
                if instanceid in known_by_instanceid:
                    continue
                known_by_instanceid[instanceid] = {
                    # split('.0') drops the trailing '.0' of values that
                    # were exported as floats (e.g. '2.0' -> '2').
                    'district': line[district_column_pos].split('.0')[0],
                    'villagename': line[villagename_column_pos],
                    'villageid': line[villageid_column_pos].split('.0')[0]
                }

    SamplingVillageIds.prepare()

    for file_suffix in ['1', '2', '2_bis', '3', '4']:
        file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/src/scripts_final_merge/csv_files/Sarpanch_Survey_{file_suffix}.csv'
        if not os.path.isfile(file_path):
            raise Exception(f'{file_path} is not valid ')
        try:
            result_lines = []
            with open(file_path, 'r', encoding='utf-8',
                      newline='') as original:
                reader = csv.reader(original, delimiter=',')
                district_column_pos = None
                villageid_column_pos = None
                villagename_column_pos = None
                instanceid_column_pos = None
                for idx, line in enumerate(reader):
                    # Every row, header included, is copied to the output.
                    result_lines.append(line)
                    if idx == 0:
                        (district_column_pos, villageid_column_pos,
                         villagename_column_pos,
                         instanceid_column_pos) = _locate_survey_columns(
                             line, district_column_name,
                             villageid_column_name, villagename_column_name)
                        continue

                    if line[villageid_column_pos]:
                        continue  # already has a village id

                    instanceid = line[instanceid_column_pos]
                    district = line[district_column_pos]
                    villagename = line[villagename_column_pos]
                    result = known_by_instanceid.get(instanceid)
                    if result is None:
                        print(
                            f'Could not find a village id at line {idx + 1} in Sarpanch_Survey_{file_suffix}.csv for {instanceid} {district} {villagename}'
                        )
                        SamplingVillageIds.find_best_match(
                            villagename, district)
                        continue

                    expected_result = {
                        'district': district,
                        'villagename': villagename,
                        'villageid': result['villageid']
                    }
                    if result != expected_result:
                        print(
                            f'Found a village id for {instanceid} in Sarpanch_Survey_{file_suffix}.csv but {json.dumps(result)} vs {json.dumps(expected_result)}'
                        )
                    # `line` is the same list object that was appended to
                    # result_lines, so the output row is updated in place.
                    line[villageid_column_pos] = result['villageid']
                    # NOTE(review): the column right after `villageid` is
                    # overwritten as well - presumably a second village-id
                    # column; confirm against the survey layout.
                    line[villageid_column_pos + 1] = result['villageid']

            CsvWriter.write(
                file_path.replace('csv_files', 'csv_files_corrected'),
                result_lines)
        except Exception as exp:
            # Chain the original error so its traceback is not lost.
            raise Exception(
                f'Failed for Sarpanch_Survey_{file_suffix}.csv : {exp}'
            ) from exp
def main(argv):
    """Match surveyed Maharashtra villages against the 2020 results file.

    Reference panchayats from ``results_MAHARASHTRA_2020.csv`` are grouped
    by district (PUNE / SOLAPUR only) and block. Each row of
    ``villages/sarpanch.csv`` is then compared, by Hamming distance on the
    cleaned upper-case name, with the reference panchayats of its block.
    The closest one is taken; when its distance exceeds 10 the four closest
    candidates are printed and the user picks one (1-4) or "no match" (5).
    The result rows ``[village_id, village_name] + matched reference row``
    are written to ``villages/merge_sarpanch.csv``.

    Returns silently when either input file is missing.

    Args:
        argv: Unused.

    Raises:
        Exception: If a survey row carries an unknown block code.
        KeyError: If the results file holds no panchayat for the block of a
            survey row.
    """
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_MAHARASHTRA_2020.csv'
    if not os.path.isfile(file_path):
        return

    all_villages = {'PUNE': {}, 'SOLAPUR': {}}
    # newline='' lets the csv module do its own newline handling.
    with open(file_path, 'r', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        next(reader, None)  # skip the header row
        for line in reader:
            district = line[3].strip().upper()
            if district not in all_villages:
                continue
            block_name = line[5].strip().upper()
            all_villages[district].setdefault(block_name, []).append({
                'name': line[7].strip().upper(),
                'id': line[9].strip().upper(),
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/sarpanch.csv'
    if not os.path.isfile(file_path):
        return

    # Survey block code -> block name as spelled in the results file.
    block_names = {
        '1': 'MADHA',
        '2': 'AKKALKOT',
        '3': 'SOUTH SOLAPUR',
        '4': 'PANDHARPUR',
        '5': 'MOHOL',
        '6': 'BHOR',
        '7': 'BARAMATI',
        '8': 'DAUND',
        '9': 'MULSHI',
        '10': 'KHED',
    }
    # Survey district code -> (district name, blocks belonging to it).
    districts = {
        '1': ('SOLAPUR',
              ['MADHA', 'AKKALKOT', 'SOUTH SOLAPUR', 'PANDHARPUR', 'MOHOL']),
        '2': ('PUNE', ['BHOR', 'BARAMATI', 'DAUND', 'MULSHI', 'KHED']),
    }
    # Applied in this exact order to the upper-cased village name.
    name_fixes = (
        ('GRAMPANCHAYAT', ''),
        ('GRAMPANCHAYT', ''),
        (', AKKALKOT', ''),
        ('(BHOINJE)', ''),
        ('SAPATNE(BHO)', 'SAPATNE (BHOSE)'),
        ('GRAMPANCHYAT', ''),
        ('GRAMPANACHAYAT', ''),
        ('GRAM PANCHAYT', ''),
        ('GRAMAPANCHAYAT', ''),
    )

    result_lines = []
    with open(file_path, 'r', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        next(reader, None)  # skip the header row
        for line in reader:
            village_id = line[0]
            village_name = line[5].upper()
            for old, new in name_fixes:
                village_name = village_name.replace(old, new)
            village_name = village_name.strip()

            block_name = block_names.get(line[7])
            if block_name is None:
                raise Exception(f'No block_name found for {line}')

            district = districts.get(line[6])
            if district is None:
                print(f'No district found for {line}')
                continue
            district_name, district_blocks = district
            if block_name not in district_blocks:
                print(f'District and block mistmatch for {line}')
                continue

            candidates = all_villages[district_name].get(block_name)
            if not candidates:
                # Same exception type as the bare dict lookup would raise,
                # but with enough context to find the offending row.
                raise KeyError(
                    f'No reference villages for {district_name} / {block_name} ({line})'
                )

            cmp_results = [{
                'score': textdistance.hamming(village_name, village['name']),
                'match': village['name'],
                'id': village['id'],
                'line': village['line']
            } for village in candidates]
            cmp_results.sort(key=lambda v: v['score'])
            best = cmp_results[0]
            print(f'{district_name} - {block_name} - {village_name} vs {best["match"]} = {best["score"]}')

            matched_line = best['line']
            if best['score'] > 10:
                shortlist = cmp_results[0:4]
                for cmp_result in shortlist:
                    print('{:>2} {}'.format(cmp_result['score'], cmp_result['match']))
                print()
                selected_row = read_user_input() - 1
                if 0 <= selected_row < len(shortlist):
                    matched_line = shortlist[selected_row]['line']
                elif selected_row == 4:
                    matched_line = []  # user declared "no match"
                # Any other answer keeps the best match. This covers
                # negative indexes and indexes past a short candidate list,
                # which previously picked from the end or raised IndexError.

            result_lines.append([village_id, village_name] + matched_line)

    new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch.csv'
    CsvWriter.write(new_file_path, result_lines)
def main(argv):
    """Match surveyed Haryana villages against the 2020 results file.

    Reference panchayats from ``results_HARYANA_2020.csv`` are grouped by
    district and block (with ``' (PART)'`` removed from block names). Each
    row of ``villages/Haryana_new_incomplete.csv`` is compared, by Hamming
    distance on the upper-cased name, with the reference panchayats of its
    block. The closest one is taken; when its distance exceeds 10 the four
    closest candidates are printed and the user picks one (1-4) or "no
    match" (5). An answer is remembered per district / block / village name
    so the same question is not asked twice. The result rows
    ``[village_id, village_name] + matched reference row`` are written to
    ``villages/merge_sarpanch_haryana.csv``.

    Returns silently when either input file is missing.

    Args:
        argv: Unused.

    Raises:
        Exception: If a survey row names a district or block that does not
            occur in the results file.
    """
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_HARYANA_2020.csv'
    if not os.path.isfile(file_path):
        return

    all_villages = {}
    chosen_matches = {}
    # newline='' lets the csv module do its own newline handling.
    with open(file_path, 'r', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        next(reader, None)  # skip the header row
        for line in reader:
            district = line[3].strip().upper()
            block_name = line[5].strip().upper().replace(' (PART)', '')
            panchayat_name = line[7].strip().upper()
            panchayat_id = line[9].strip().upper()
            district_blocks = all_villages.setdefault(district, {})
            district_blocks.setdefault(block_name, []).append({
                'name': panchayat_name,
                'id': panchayat_id,
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/Haryana_new_incomplete.csv'
    if not os.path.isfile(file_path):
        return

    # Applied in this exact order, before upper-casing, to bring the survey's
    # block spelling in line with the results file.
    block_fixes = (
        (' 1', '-I'),
        (' 2', '-II'),
        ('Bhattu', 'Bhattu Kalan'),
        ('Ballabhgarh', 'Ballabgarh'),
        ('Nissing', 'Nissing At Chirao'),
        ('Meham', 'Maham'),
        ('Lakhan', 'Lakhan Majra'),
        ('GHARAUNDA (PART)', 'GHARAUNDA'),
        ('Block Saha', 'Saha'),
        ('Block Naraingarh', 'Naraingarh'),
        ('Block Shahzadpur', 'Shahzadpur'),
        ('Block Barara', 'Barara'),
    )

    result_lines = []
    with open(file_path, 'r', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        next(reader, None)  # skip the header row
        for line in reader:
            village_id = line[0]
            village_name = line[1].upper().strip()
            district_name = line[12].upper().strip()

            block_name = line[13]
            for old, new in block_fixes:
                block_name = block_name.replace(old, new)
            # A remaining 'BLOCK ' prefix is replaced by '<DISTRICT>-'.
            block_name = block_name.upper().strip().replace(
                'BLOCK ', f'{district_name}-')

            district_blocks = all_villages.get(district_name)
            if district_blocks is None:
                raise Exception(f'Invalid district {district_name} for {line}')
            candidates = district_blocks.get(block_name)
            if candidates is None:
                raise Exception(f'Invalid {block_name} for {line}')

            cmp_results = [{
                'score': textdistance.hamming(village_name, village['name']),
                'match': village['name'],
                'id': village['id'],
                'line': village['line']
            } for village in candidates]
            cmp_results.sort(key=lambda v: v['score'])
            best = cmp_results[0]
            print(
                f'{district_name} - {block_name} - {village_name} vs {best["match"]} = {best["score"]}'
            )

            matched_line = best['line']
            if best['score'] > 10:
                shortlist = cmp_results[0:4]
                memo_key = f'{district_name} - {block_name} - {village_name}'
                selected_row = chosen_matches.get(memo_key)
                if selected_row is None:
                    for cmp_result in shortlist:
                        print('{:>2} {}'.format(cmp_result['score'],
                                                cmp_result['match']))
                    print()
                    selected_row = read_user_input() - 1
                    chosen_matches[memo_key] = selected_row
                if 0 <= selected_row < len(shortlist):
                    matched_line = shortlist[selected_row]['line']
                elif selected_row == 4:
                    matched_line = []  # user declared "no match"
                # Any other answer keeps the best match. This covers
                # negative indexes and indexes past a short candidate list,
                # which previously picked from the end or raised IndexError.

            result_lines.append([village_id, village_name] + matched_line)

    new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch_haryana.csv'
    CsvWriter.write(new_file_path, result_lines)
def _scan_pune_survey(spec, village_set, village_id_to_names,
                      village_id_to_gan_sevac_sex):
    """Register the Pune rows of the survey export described by *spec*."""
    file_path = spec['file_path']
    if not os.path.isfile(file_path):
        raise Exception(f'Failed to find {file_path}')

    label = spec['label']
    survey_key = spec['survey_key']
    district_col = spec['district_col']
    name_col = spec['name_col']
    sex_col = spec.get('sex_col')

    pune_count = 0
    # newline='' lets the csv module do its own newline handling.
    with open(file_path, 'r', encoding='utf-8', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        for row_number, line in enumerate(reader):
            if row_number == 0:
                # Echo the header cells found at the hard-coded positions so
                # the column assumptions can be checked by eye.
                print(f"{spec['district_intro']} =?= {line[district_col]}")
                print(f'For {label} survey, villageid =?= {line[16]}')
                print(f"For {label} survey, village name is {spec['name_question']} =?= {line[name_col]}")
                if sex_col is not None:
                    print(f'For {label} survey, sex is q15 =?= {line[sex_col]}')
                continue
            if line[district_col] != spec['pune_code']:
                continue
            pune_count += 1
            set_village(village_set, line[16], survey_key)
            set_village_name(village_id_to_names, line[16], survey_key,
                             line[name_col])
            if sex_col is not None:
                set_gran_sevac_gender(village_id_to_gan_sevac_sex, line[16],
                                      line[sex_col])

    print(f"Found {pune_count} in {spec['found_label']} survey for Pune district")
    print('\n\n')


def _answered_each_survey_once(village_id, surveys, village_id_to_names):
    """Return True when slots 0-3 of *surveys* are all exactly 1."""
    slot_labels = ['sarpanch', 'upa-sarpanch', 'gram-sevak', 'group']
    for slot, survey_label in enumerate(slot_labels):
        if surveys[slot] == 0:
            return False
        if surveys[slot] != 1:
            print(f'Weird more than one survey for {survey_label} {village_id} {village_id_to_names[village_id]}')
            return False
    return True


def main(argv):
    """Build ``result_pune.csv`` for villages covered by every survey.

    Reads the Pune sampling file to fill the reservation lookup, then scans
    the Sarpanch, Upa-Sarpanch, Notable, Gram-Sevak and Group survey exports
    and registers every row whose district cell equals that file's Pune code
    through ``set_village`` / ``set_village_name`` (plus
    ``set_gran_sevac_gender`` for the Gram-Sevak file).

    A village is kept when slots 0-3 of its ``village_set`` entry are all
    exactly 1 and slot 4 is at least 4. Kept villages that also have a
    reservation entry become one output row each: village id, the two
    reservation values, the Gram-Sevak sex and the first recorded name.

    Args:
        argv: Unused.

    Raises:
        Exception: If one of the input files does not exist.
    """
    village_set = {}
    village_id_to_names = {}
    village_id_to_gan_sevac_sex = {}
    village_id_to_reservation = {}

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/sampling/Sampling_PUNE.csv'
    if not os.path.isfile(file_path):
        raise Exception(f'Failed to find {file_path}')
    with open(file_path, 'r', encoding='utf-8', newline='') as original:
        reader = csv.reader(original, delimiter=',')
        next(reader, None)  # skip the header row
        for line in reader:
            set_reservation_for_pune(village_id_to_reservation, line[1],
                                     line[0])

    # One entry per survey export, processed in this order.
    # NOTE(review): the Pune district code is '2.0' in some exports and '2'
    # in others - presumably an artefact of how each file was produced;
    # confirm before adding a new file.
    survey_specs = [
        {
            'file_path': 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Sarpanch_Survey_Merged_20210824.csv',
            'label': 'Sarpanch',
            'district_intro': 'For Sarpanch survey, district is q6',
            'name_question': 'q1',
            'found_label': 'sarpanch',
            'survey_key': 'sarpanch',
            'district_col': 22,
            'name_col': 21,
            'pune_code': '2.0',
        },
        {
            'file_path': 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Upa_Sarpanch_Survey_Merged_20210824.csv',
            'label': 'Upa_Sarpanch',
            'district_intro': 'For Upa_Sarpanch, district is q6',
            'name_question': 'q1',
            'found_label': 'Upa_Sarpanch',
            'survey_key': 'upa-sarpanch',
            'district_col': 22,
            'name_col': 21,
            'pune_code': '2',
        },
        {
            'file_path': 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Notable_Survey_20201026.csv',
            'label': 'Notable',
            'district_intro': 'For Notable, assuming district is q6',
            'name_question': 'q1',
            'found_label': 'Notable',
            'survey_key': 'notable',
            'district_col': 21,
            'name_col': 20,
            'pune_code': '2',
        },
        {
            'file_path': 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Gram_Sevak_Survey_Merged_20210904.csv',
            'label': 'Gram_Sevak',
            'district_intro': 'For Gram_Sevak, assuming district is q6',
            'name_question': 'q1',
            'found_label': 'Gram_Sevak',
            'survey_key': 'gram-sevak',
            'district_col': 22,
            'name_col': 21,
            'pune_code': '2',
            'sex_col': 50,
        },
        {
            'file_path': 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/Group_Survey_Merged_20210824.csv',
            'label': 'Group',
            'district_intro': 'For Group, assuming district is q7',
            'name_question': 'q5',
            'found_label': 'Group',
            'survey_key': 'group',
            'district_col': 26,
            'name_col': 25,
            'pune_code': '2.0',
        },
    ]
    for spec in survey_specs:
        _scan_pune_survey(spec, village_set, village_id_to_names,
                          village_id_to_gan_sevac_sex)

    print('village ids')
    print(village_set.keys())
    print('village ids end')

    new_csv = [[
        'village_id', 'reservation_sex', 'reservation_caste',
        'gram_sevak_sex', 'village_name'
    ]]
    villages_with_all = []
    for village_id, surveys in village_set.items():
        if not _answered_each_survey_once(village_id, surveys,
                                          village_id_to_names):
            continue
        # NOTE(review): slot 4 is presumably the notable-survey count (at
        # least four required); confirm against set_village.
        if surveys[4] < 4:
            continue
        villages_with_all.append(village_id)

        reservation = village_id_to_reservation.get(village_id)
        if reservation is None:
            continue
        new_csv.append([
            village_id, reservation[0], reservation[1],
            village_id_to_gan_sevac_sex[village_id],
            village_id_to_names[village_id][0]
        ])

    print('villages with all surveys')
    print(villages_with_all)
    print('villages with all surveys end')

    new_file_path = 'C:/Data_PoloFr/scrap-python-indian-gov/csv_files/results/result_pune.csv'
    CsvWriter.write(new_file_path, new_csv)