if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='augments the data from dbpedia with data from wikipedia'
                    ' directly, like population/elevation/climate')
    parser.add_argument('input_file', help='dbpedia dump')
    parser.add_argument('output_file',
                        help='file where to dump the augmented data')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--min-pop',
                        default=1e6,
                        help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)',
                        type=int)
    args = parser.parse_args()

    configure_logging()

    dump_in = pickle.load(open(args.input_file))

    if ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'w')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i + 1 > args.max_cities:
            break
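
The snippets rely on an ask_before_overwrite helper that is imported from the
project's utilities and never shown here. A minimal sketch of the behaviour
the calls above seem to expect (only the name and signature come from the
snippets; the body is an assumption):

import os

def ask_before_overwrite(path):
    """Return True when it is safe to write to `path`, prompting the user
    if the file already exists (assumed implementation)."""
    if not os.path.exists(path):
        return True
    answer = input('%s already exists, overwrite? [y/N] ' % path)
    return answer.strip().lower() == 'y'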
Example #2
                    session.query(MonthlyStat).delete()
                    session.query(City).delete()
                    session.commit()
            else:
                print('Did nothing.')
                sys.exit()
        else:
            # Here we are simply appending the new cities. Make sure that
            # this won't conflict with the existing ones in some way...
            if not are_you_sure(
                    'The database is not empty, there are already {} cities,'
                    ' do you still wish to proceed with loading the data? If'
                    ' you want to clear the existing cities, use the'
                    ' --clear-cities flag'
                    .format(nb_cities)):
                print('Did nothing.')
                sys.exit()

    logger.info('loading the data')
    with open(args.input_file) as f:
        data = pickle.load(f)

    if args.max_cities is not None:
        data = data[:args.max_cities]

    with session_scope() as session:
        logger.info('filling the database')
        fill_cities(data, session)
        logger.info('adding the priority index')
        add_priority_index(session, args.fast_priority_index)
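
session_scope is not defined in these snippets; given how it is used, it is
presumably the standard SQLAlchemy transactional context-manager recipe,
sketched below (`Session` stands for an assumed sessionmaker bound to the
project's engine):

from contextlib import contextmanager

@contextmanager
def session_scope():
    """Provide a transactional scope around a series of operations
    (standard SQLAlchemy recipe, assumed to match the project's helper)."""
    session = Session()  # Session = sessionmaker(bind=engine), assumed
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()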
Example #3
        if 'precipitationDays' not in stats and 'rainDays' in stats:
            rainDays = stats['rainDays']
            if 'snowDays' in stats:
                print(
                    'summing rain and snow days; does this make sense? what'
                    ' about days with both rain AND snow?')
                rainDays += stats['snowDays']
            stats['precipitationDays'] = rainDays

    return month_stats


if __name__ == '__main__':

    # arg 1 : file to open
    city_data = pickle.load(open(sys.argv[1]))
    # arg 2 : output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]

        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
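
The example is cut off inside the key-filtering loop. A plausible completion,
assuming IGNORE is a list of compiled regular expressions describing the keys
to drop (both the patterns and the loop body below are guesses, not the
project's actual code):

import re

IGNORE = [re.compile(p) for p in (r'^wikiPageID$', r'^abstract')]  # hypothetical

def drop_ignored_keys(data):
    """Remove every key matching one of the IGNORE regexes."""
    for k in list(data.keys()):  # list() so we can delete while iterating
        if any(regex.search(k) for regex in IGNORE):
            del data[k]
    return data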
Example #4
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='augments the data from dbpedia with data from wikipedia'
                    ' directly, like population/elevation/climate')
    parser.add_argument('input_file', help='dbpedia dump')
    parser.add_argument('output_file',
                        help='file where to dump the augmented data')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--min-pop', default=1e6, help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)', type=int)
    args = parser.parse_args()

    configure_logging()

    dump_in = pickle.load(open(args.input_file))

    if ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'w')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i+1 > args.max_cities:
            break
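
The Timer class used for progress reporting is another project utility that
is not shown. The calls Timer(total), Timer(total, 100) and timer.update(i) /
timer.update() suggest an interface along these lines (implementation
assumed):

import time

class Timer(object):
    """Minimal progress reporter matching the calls in the snippets."""
    def __init__(self, total, report_every=1000):
        self.total = total
        self.report_every = report_every
        self.start = time.time()
        self.count = 0

    def update(self, i=None):
        self.count = self.count + 1 if i is None else i + 1
        if self.count % self.report_every == 0:
            elapsed = time.time() - self.start
            rate = self.count / elapsed if elapsed else 0.
            print('%d/%d (%.1f items/s)' % (self.count, self.total, rate))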
Example #5
                        help='We assume we already have cities in the output'
                        ' and we will only (re-)augment those, skipping all'
                        ' the others')
    parser.add_argument(
        '--logging-level',
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        default='info')
    args = parser.parse_args()

    configure_logging(args.logging_level.upper())

    # validation of the passed arguments
    if args.append and args.update_only:
        raise Exception('cannot use --append and --update-only at the'
                        ' same time')

    with open(args.input_file) as f:
        dump_in = pickle.load(f)

    if not (args.append or args.force
            or ask_before_overwrite(args.output_file)):
        sys.exit()

    if args.skip_wiki:
        logger.info('skipping wikipedia')
        for c in dump_in:
            c.month_stats = {'avgHigh': [0] * 12, 'precipitation': [0] * 12}
            c.wiki_source = ''
        with open(args.output_file, 'w') as dump_out:
            pickle.dump(dump_in, dump_out)
        sys.exit()

    if args.append or args.update_only:
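
Aside: the manual check above that --append and --update-only are not
combined could also be delegated to argparse itself with a mutually
exclusive group, e.g. (sketch, not the project's code):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--append', action='store_true')
group.add_argument('--update-only', action='store_true')
# argparse now rejects `--append --update-only` on its own
args = parser.parse_args()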
Example #6
    for city in cities_dict.keys():
        # get the properties of the city
        results = sparql_query(
            sparql, """
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

            SELECT ?p ?o
            WHERE {{
                <{}> ?p ?o.
                FILTER(
                    regex(?p, "population", "i") ||
                    regex(?p, "elevation", "i"))
            }}
            """.format(city))

        for c in results:
            att = c['p']['value']
            val = c['o']['value']
            # some negative values are weird
            val = clean_minuses(val)
            cities_dict[city][att].append(val)

        timer.update()

    # pprint(cities_dict)

    with open(output, 'w') as f:
        pickle.dump(dict(cities_dict), f)
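
sparql_query is a project helper. Given the shape of the results consumed
above (c['p']['value']), it presumably wraps SPARQLWrapper and returns the
JSON bindings; a sketch under that assumption:

from SPARQLWrapper import SPARQLWrapper, JSON

def sparql_query(sparql, query):
    """Run `query` on a configured SPARQLWrapper endpoint and return the
    list of JSON result bindings (assumed helper, not the project's code)."""
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()['results']['bindings']

# typical setup for the DBpedia queries above:
# sparql = SPARQLWrapper('http://dbpedia.org/sparql')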
Example #7
    for city in cities_dict.keys():
        # get the properties of the city
        results = sparql_query(
            sparql, """
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

            SELECT ?p ?o
            WHERE {{
                <{}> ?p ?o.
                FILTER(
                    regex(?p, "population", "i") ||
                    regex(?p, "elevation", "i"))
            }}
            """.format(city))

        for c in results:
            att = c['p']['value']
            val = c['o']['value']
            # some negative values are weird
            val = clean_minuses(val)
            cities_dict[city][att].append(val)

        timer.update()

    # pprint(cities_dict)

    with open(output, 'w') as f:
        pickle.dump(dict(cities_dict), f)
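
Note the dict(cities_dict) conversion in the final dump: a defaultdict whose
factory is a lambda cannot be pickled, so the outer mapping has to be turned
back into a plain dict first. The construction implied by the appends above
is something like (assumed, with a made-up city URI):

from collections import defaultdict

# every city maps to a defaultdict(list), so attribute values can be
# appended without initialising the key first
cities_dict = defaultdict(lambda: defaultdict(list))
cities_dict['http://dbpedia.org/resource/Example']['population'].append('123')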
Example #8
                    logger.info('deleting cities')
                    session.query(MonthlyStat).delete()
                    session.query(City).delete()
                    session.commit()
            else:
                print('Did nothing.')
                sys.exit()
        else:
            # Here we are simply appending the new cities. Make sure that
            # this won't conflict with the existing ones in some way...
            if not are_you_sure(
                    'The database is not empty, there are already {} cities,'
                    ' do you still wish to proceed with loading the data? If'
                    ' you want to clear the existing cities, use the'
                    ' --clear-cities flag'.format(nb_cities)):
                print('Did nothing.')
                sys.exit()

    logger.info('loading the data')
    with open(args.input_file) as f:
        data = pickle.load(f)

    if args.max_cities is not None:
        data = data[:args.max_cities]

    with session_scope() as session:
        logger.info('filling the database')
        fill_cities(data, session)
        logger.info('adding the priority index')
        add_priority_index(session, args.fast_priority_index)
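
MonthlyStat rows are deleted before City rows, which hints at a foreign key
from the stats to their city. A minimal sketch of SQLAlchemy models
consistent with that ordering (class names from the snippet, all columns
assumed):

from sqlalchemy import Column, ForeignKey, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class City(Base):
    __tablename__ = 'city'
    id = Column(Integer, primary_key=True)
    name = Column(String)

class MonthlyStat(Base):
    __tablename__ = 'monthly_stat'
    id = Column(Integer, primary_key=True)
    # deleting MonthlyStat first avoids violating this constraint
    city_id = Column(Integer, ForeignKey('city.id'))
    month = Column(Integer)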
Example #9
        type=float,
        default=25.,
        help='a city will be ignored if there is a bigger city'
        ' within this radius')
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not (args.force or ask_before_overwrite(output)):
        sys.exit()

    fields = ['country_region', 'name', 'asciiname', 'geonameid']
    regions = defaultdict(dict)
    with open(args.admin1codes_file) as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('A region is present twice in the file')
            regions[country][region] = row['name']
    # pprint(regions)

    countries = {}
    with open(args.country_infos_file) as f:
        reader = csv.reader((line for line in f if not line.startswith('#')),
                            delimiter='\t')
        for row in reader:
            countries[row[0]] = row[4]
    # pprint(countries)
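
For reference, the admin1codes file is the tab-separated admin1CodesASCII.txt
from geonames, with one 'COUNTRY.REGION<TAB>name...' line per region. A
self-contained run of the parsing loop on a made-up row:

import csv
import io
from collections import defaultdict

sample = 'CH.GE\tGeneva\tGeneva\t0000000\n'  # hypothetical row
fields = ['country_region', 'name', 'asciiname', 'geonameid']
regions = defaultdict(dict)
for row in csv.DictReader(io.StringIO(sample), delimiter='\t',
                          fieldnames=fields):
    country, region = row['country_region'].split('.')
    regions[country][region] = row['name']
print(regions['CH'])  # {'GE': 'Geneva'}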
Example #10
                        ' before overwriting the output file')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--too-close', type=float, default=25.,
                        help='a city will be ignored if there is a bigger'
                        ' city within this radius')
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not (args.force or ask_before_overwrite(output)):
        sys.exit()

    fields = ['country_region', 'name', 'asciiname', 'geonameid']
    regions = defaultdict(dict)
    with open(args.admin1codes_file) as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('A region is present twice in the file')
            regions[country][region] = row['name']
    # pprint(regions)

    countries = {}
    with open(args.country_infos_file) as f:
        reader = csv.reader((line for line in f if not line.startswith('#')),
                            delimiter='\t')
        for row in reader:
            countries[row[0]] = row[4]
    # pprint(countries)
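
The --too-close radius implies a great-circle distance test between
candidate cities. The filter itself is outside these snippets; a sketch of
the distance computation it presumably relies on (haversine, Earth radius
6371 km):

from math import asin, cos, radians, sin, sqrt

def distance_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points in kilometres (haversine)."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = (sin((lat2 - lat1) / 2) ** 2
         + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * asin(sqrt(a))

# a smaller city would then be dropped when
# distance_km(...) < args.too_close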
Example #11
            stats['precipitation'] = prec
        if 'precipitationDays' not in stats and 'rainDays' in stats:
            rainDays = stats['rainDays']
            if 'snowDays' in stats:
                print('summing rain and snow days; does this make sense?'
                      ' what about days with both rain AND snow?')
                rainDays += stats['snowDays']
            stats['precipitationDays'] = rainDays

    return month_stats


if __name__ == '__main__':

    # arg 1 : file to open
    city_data = pickle.load(open(sys.argv[1]))
    # arg 2 : output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]

        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
Example #12
        ' and we will only (re-)augment those, skipping all the'
        ' others')
    parser.add_argument(
        '--logging-level',
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        default='info')
    args = parser.parse_args()

    configure_logging(args.logging_level.upper())

    # validation of the passed arguments
    if args.append and args.update_only:
        raise Exception('cannot use --append and --update-only at the'
                        ' same time')

    with open(args.input_file) as f:
        dump_in = pickle.load(f)

    if not (args.append or args.force
            or ask_before_overwrite(args.output_file)):
        sys.exit()

    if args.skip_wiki:
        logger.info('skipping wikipedia')
        for c in dump_in:
            c.month_stats = {'avgHigh': [0] * 12, 'precipitation': [0] * 12}
            c.wiki_source = ''
        with open(args.output_file, 'w') as dump_out:
            pickle.dump(dump_in, dump_out)
        sys.exit()
Example #13
import pickle
import pprint
import sys
from meteomap.utils import open

if __name__ == '__main__':
    some_file = sys.argv[1]
    pprint.pprint(pickle.load(open(some_file)))
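
Example #13 imports its open from meteomap.utils rather than the builtin, so
the other snippets most likely do the same. The wrapper is not shown anywhere
in these examples; the sketch below is only a guess at what such a helper
often does (transparent gzip support), not the project's actual code:

import builtins
import gzip

def open(path, mode='r'):
    """Open `path`, transparently using gzip for .gz files (assumed
    behaviour, not the real meteomap.utils.open)."""
    if path.endswith('.gz'):
        return gzip.open(path, mode)
    return builtins.open(path, mode)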