    parser = argparse.ArgumentParser(
        description='augments the data from dbpedia with data from wikipedia'
                    ' directly, like population/elevation/climate')
    parser.add_argument('input_file', help='dbpedia dump')
    parser.add_argument('output_file',
                        help='file where to dump the augmented data')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--min-pop', default=1e6, help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)', type=int)
    args = parser.parse_args()

    configure_logging()

    with open(args.input_file, 'rb') as f:
        dump_in = pickle.load(f)

    if ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'wb')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i+1 > args.max_cities:
            break
        logger.debug(city)
        # parsing population
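        # Hedged sketch, not part of the original snippet (which is truncated
        # here): per the --min-pop help text, when several population fields
        # are present the maximum is kept and smaller cities are skipped.
        # How the fields are detected below is an assumption.
        populations = []
        for key, value in infos.items():
            if 'population' in key.lower():
                try:
                    populations.append(int(float(value)))
                except (TypeError, ValueError):
                    logger.debug('unparseable population %r for %s', value, city)
        if not populations or max(populations) < args.min_pop:
            continue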
                # TODO: summing rain and snow days may double-count days that
                # have both rain and snow
                rainDays += stats['snowDays']
            stats['precipitationDays'] = rainDays

    return month_stats


if __name__ == '__main__':

    # arg 1 : file to open
    with open(sys.argv[1], 'rb') as f:
        city_data = pickle.load(f)
    # arg 2 : output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]

        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
                if regex.match(k):
                    # print('  removing', k, 'from', city)
                    # print('   using', regex.pattern)
Example #4
    parser.add_argument('--logging-level',
                        choices=['debug', 'info', 'warning', 'error',
                                 'critical'],
                        default='info')
    args = parser.parse_args()

    configure_logging(args.logging_level.upper())

    # validation of the passed arguments
    if args.append and args.update_only:
        parser.error('cannot use --append and --update-only at the same time')

    with open(args.input_file, 'rb') as f:
        dump_in = pickle.load(f)

    if not (args.append or args.force or ask_before_overwrite(args.output_file)):
        sys.exit()

    if args.skip_wiki:
        logger.info('skipping wikipedia')
        for c in dump_in:
            c.month_stats = {'avgHigh': [0] * 12, 'precipitation': [0] * 12}
            c.wiki_source = ''
        with open(args.output_file, 'wb') as dump_out:
            pickle.dump(dump_in, dump_out)
        sys.exit()

    if args.append or args.update_only:
        logger.info('updating in %s', args.output_file)
        with open(args.output_file, 'rb') as f:
            new_data = {'{}/{}/{}'.format(x.name, x.region, x.country): x
                        for x in pickle.load(f)}
Example #5
    # normalize the Unicode minus sign to an ASCII hyphen
    return x.replace('−', '-')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fetch the data from dbpedia')
    parser.add_argument('output_file', help='file where to dump the data')
    parser.add_argument('--min-pop', default=1e6, help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)', type=int)
    parser.add_argument('--max-cities', '-m', type=int)
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not ask_before_overwrite(output):
        sys.exit()

    sparql = SPARQLWrapper('http://dbpedia.org/sparql')
    sparql.setReturnFormat(JSON)
    # nb_cities = int(query(sparql,
    #     """
    #     PREFIX dbo: <http://dbpedia.org/ontology/>
    #     PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    #     PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

    #     SELECT (count(?city) as ?count)
    #     WHERE {
    #         ?city a dbo:City.
    #         ?city geo:lat ?lat.
    #         ?city geo:long ?long}
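    # Hedged sketch, not part of the original example (which is truncated
    # above): the commented-out count query could be run with SPARQLWrapper
    # directly, roughly as below; the original's query() helper is not shown
    # here, so this is only an assumption about what it wraps.
    count_query = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

        SELECT (count(?city) as ?count)
        WHERE {
            ?city a dbo:City.
            ?city geo:lat ?lat.
            ?city geo:long ?long.
        }
    """
    sparql.setQuery(count_query)
    results = sparql.query().convert()
    nb_cities = int(results['results']['bindings'][0]['count']['value'])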
Example #6
    parser.add_argument('--force', action='store_true', help='don\'t ask'
                        ' before overwriting the output file')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--too-close', type=float, default=25.,
                        help='a city will be ignored if there is a bigger'
                        ' city closer than this radius')
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not (args.force or ask_before_overwrite(output)):
        sys.exit()

    fields = ['country_region', 'name', 'asciiname', 'geonameid']
    regions = defaultdict(dict)
    with open(args.admin1codes_file) as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('region {}.{} appears twice in {}'.format(
                    country, region, args.admin1codes_file))
            regions[country][region] = row['name']
    # pprint(regions)
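    # (Hedged note, values illustrative: `regions` ends up keyed by country
    # code and then admin1 code, e.g. regions['US']['CA'] == 'California'.)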

    countries = {}
    with open(args.country_infos_file) as f:
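
# Hedged sketch, not part of the example above: the --too-close help text says
# a city is dropped when a bigger city lies within the given radius. A plain
# great-circle (haversine) check along those lines could look like this; the
# function name and the kilometre unit are assumptions, not the original's.
import math

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two (lat, lon) points, in kilometres."""
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * math.asin(math.sqrt(a))

# A city could then be skipped when any bigger city is within args.too_close
# of it, for example:
#     if any(haversine_km(lat, lon, b_lat, b_lon) < args.too_close
#            for b_lat, b_lon in bigger_city_coords):
#         continue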
Example #7
    parser.add_argument('--admin1codes-file', help='admin1CodesASCII.txt'
                        ' from geonames.org', required=True)
    parser.add_argument('--country-infos-file', help='countryInfo.txt'
                        ' from geonames.org', required=True)
    parser.add_argument('--force', action='store_true', help='don\'t ask'
                        ' before overwriting the output file')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--too-close', type=float, default=25., help='a city'
                       ' will be ignored if there is a bigger city closer than'
                       ' this radius')
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not (args.force or ask_before_overwrite(output)):
        sys.exit()

    fields = ['country_region', 'name', 'asciiname', 'geonameid']
    regions = defaultdict(dict)
    with open(args.admin1codes_file) as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('region {}.{} appears twice in {}'.format(
                    country, region, args.admin1codes_file))
            regions[country][region] = row['name']
    # pprint(regions)

    countries = {}
    with open(args.country_infos_file) as f: