parser = argparse.ArgumentParser(
    description='augments the data from dbpedia with data from wikipedia'
                ' directly, like population/elevation/climate')
parser.add_argument('input_file', help='dbpedia dump')
parser.add_argument('output_file', help='file where to dump the augmented data')
parser.add_argument('--max-cities', '-m', type=int)
parser.add_argument('--min-pop', default=1e6, type=int,
                    help='minimum population to keep the city (if there are'
                         ' multiple population fields, we keep the maximum)')
args = parser.parse_args()
configure_logging()

with open(args.input_file, 'rb') as f:
    dump_in = pickle.load(f)

if ask_before_overwrite(args.output_file):
    dump_out = open(args.output_file, 'wb')
else:
    sys.exit()

timer = Timer(len(dump_in))
new_data = {}
nb_no_climate = 0
nb_coords_from_wiki = 0
nb_coords_from_dbpedia = 0
for i, (city, infos) in enumerate(dump_in.items()):
    timer.update(i)
    if args.max_cities is not None and i + 1 > args.max_cities:
        break
    logger.debug(city)
    # parsing population
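# Every script in this dump guards its output with an `ask_before_overwrite`
# helper defined elsewhere in the repo. Judging only from how it is called
# (it returns a truthy value when writing is OK), a minimal sketch could look
# like the following -- the body is an assumption, not the repo's actual
# implementation:
import os


def ask_before_overwrite(path):
    # nothing to ask if the file does not exist yet
    if not os.path.exists(path):
        return True
    answer = input('{} already exists, overwrite? [y/N] '.format(path))
    return answer.strip().lower().startswith('y')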
        # TODO: we sum rain days and snow days -- does that make sense?
        # what about days with rain AND snow?
        rainDays += stats['snowDays']
        stats['precipitationDays'] = rainDays
    return month_stats


if __name__ == '__main__':
    # arg 1: file to open
    with open(sys.argv[1], 'rb') as f:
        city_data = pickle.load(f)
    # arg 2: output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]
        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
                if regex.match(k):
                    # print(' removing', k, 'from', city)
                    # print(' using', regex.pattern)
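# The `Timer` progress helper (constructed as `Timer(total)` or
# `Timer(total, step)` and driven with `timer.update(i)`) is also defined
# outside this excerpt. A hypothetical minimal version consistent with that
# usage, assuming it just logs an ETA every `step` iterations:
import logging
import time

logger = logging.getLogger(__name__)


class Timer(object):
    def __init__(self, total, step=1000):
        self.total = total
        self.step = step
        self.start = time.time()

    def update(self, i):
        # log progress and a rough time-remaining estimate every `step` items
        if i and i % self.step == 0:
            elapsed = time.time() - self.start
            eta = elapsed / i * (self.total - i)
            logger.info('%d/%d done, ~%.0fs left', i, self.total, eta)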
parser.add_argument('--logging-level', default='info',
                    choices=['debug', 'info', 'warning', 'error', 'critical'])
args = parser.parse_args()
configure_logging(args.logging_level.upper())

# validation of the passed arguments
if args.append and args.update_only:
    raise Exception('cannot use --append and --update-only at the same time')

with open(args.input_file, 'rb') as f:
    dump_in = pickle.load(f)

if not (args.append or args.force
        or ask_before_overwrite(args.output_file)):
    sys.exit()

if args.skip_wiki:
    logger.info('skipping wikipedia')
    for c in dump_in:
        c.month_stats = {'avgHigh': [0] * 12, 'precipitation': [0] * 12}
        c.wiki_source = ''
    with open(args.output_file, 'wb') as dump_out:
        pickle.dump(dump_in, dump_out)
    sys.exit()

if args.append or args.update_only:
    logger.info('updating in %s', args.output_file)
    with open(args.output_file, 'rb') as f:
        new_data = {'{}/{}/{}'.format(x.name, x.region, x.country): x
                    for x in pickle.load(f)}
    # normalize the unicode minus sign to a plain ASCII hyphen
    return x.replace('−', '-')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fetch the data from dbpedia')
    parser.add_argument('output_file', help='file where to dump the data')
    parser.add_argument('--min-pop', default=1e6, type=int,
                        help='minimum population to keep the city (if there'
                             ' are multiple population fields, we keep the'
                             ' maximum)')
    parser.add_argument('--max-cities', '-m', type=int)
    args = parser.parse_args()
    configure_logging()

    output = args.output_file
    if not ask_before_overwrite(output):
        sys.exit()

    sparql = SPARQLWrapper('http://dbpedia.org/sparql')
    sparql.setReturnFormat(JSON)
    # nb_cities = int(query(sparql,
    #     """
    #     PREFIX dbo: <http://dbpedia.org/ontology/>
    #     PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    #     PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    #     SELECT (count(?city) as ?count)
    #     WHERE {
    #         ?city a dbo:City.
    #         ?city geo:lat ?lat.
    #         ?city geo:long ?long}
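# The commented-out count above goes through a `query(sparql, ...)` wrapper
# that the excerpt never shows. With SPARQLWrapper configured for JSON
# results (as done above with setReturnFormat(JSON)), a sketch of such a
# wrapper could look like this; returning the value of the single binding of
# the first row is an assumption based on the `int(query(...))` call site:
def query(sparql, q):
    sparql.setQuery(q)
    results = sparql.query().convert()  # a nested dict in JSON mode
    first_row = results['results']['bindings'][0]
    (binding,) = first_row.values()  # expect exactly one variable, e.g. ?count
    return binding['value']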
parser.add_argument('--admin1codes-file', required=True,
                    help='admin1CodesASCII.txt from geonames.org')
parser.add_argument('--country-infos-file', required=True,
                    help='countryInfo.txt from geonames.org')
parser.add_argument('--force', action='store_true',
                    help="don't ask before overwriting the output file")
parser.add_argument('--max-cities', '-m', type=int)
parser.add_argument('--too-close', type=float, default=25.,
                    help='a city will be ignored if there is a bigger city'
                         ' closer than this radius')
args = parser.parse_args()
configure_logging()

output = args.output_file
if not (args.force or ask_before_overwrite(output)):
    sys.exit()

fields = ['country_region', 'name', 'asciiname', 'geonameid']
regions = defaultdict(dict)
with open(args.admin1codes_file) as f:
    reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
    for row in reader:
        country, region = row['country_region'].split('.')
        if region in regions[country]:
            raise Exception('A region is present twice in the file')
        regions[country][region] = row['name']
# pprint(regions)

countries = {}
with open(args.country_infos_file) as f:
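# The excerpt cuts off before the `--too-close` filtering itself, which needs
# a great-circle distance between two cities. A standard haversine helper
# would fit here; the name `distance_km` and the assumption that the 25.0
# default radius is in kilometres are mine, not the repo's:
from math import asin, cos, radians, sin, sqrt

EARTH_RADIUS_KM = 6371.0


def distance_km(lat1, lon1, lat2, lon2):
    # haversine: a = sin^2(dlat/2) + cos(lat1)*cos(lat2)*sin^2(dlon/2),
    # distance = 2R * asin(sqrt(a))
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = (sin((lat2 - lat1) / 2) ** 2
         + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2)
    return 2 * EARTH_RADIUS_KM * asin(sqrt(a))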