Beispiel #1
0
def fill_cities(data, session):
    """Insert the parsed cities and their monthly statistics into the DB.

    :param data: iterable of parsed city objects exposing name, region,
        country, wiki_source, pop, country_rank, region_rank, coords and
        month_stats (mapping stat code -> per-month values).
    :param session: SQLAlchemy session bound to the target database.
    """
    # map stat code -> Stat primary key, for the MonthlyStat foreign keys
    stats_dict = dict(session.query(Stat.code, Stat.id))
    timer = Timer(len(data))
    for city in data:
        # BUG FIX: srid=4326 used to be passed to str.format() (where it was
        # silently ignored) instead of WKTElement, so the stored geometry had
        # no SRID.  Pass it to WKTElement where it belongs.
        # NOTE(review): assumes city.coords is (lat, lon) since WKT POINT is
        # "lon lat" -- confirm against the producer of coords.
        wkt = 'POINT({:.8f} {:.8f})'.format(city.coords[1], city.coords[0])
        geom = WKTElement(wkt, srid=4326)
        city_db = City(
            location=geom,
            name=city.name,
            region=city.region,
            country=city.country,
            source=city.wiki_source,
            population=city.pop,
            country_rank=city.country_rank,
            region_rank=city.region_rank)
        session.add(city_db)
        # commit the cities so city_db.id is populated for the
        # MonthlyStat rows created below
        session.commit()

        for code, month_stats in city.month_stats.items():
            # skip stat codes the database does not know about
            if code not in stats_dict:
                continue
            for month_idx, value in enumerate(month_stats):
                ms = MonthlyStat(
                    month=month_idx,
                    city_id=city_db.id,
                    stat_id=stats_dict[code],
                    value=value)
                session.add(ms)
        timer.update()
    # commit the monthly stats of the last city
    session.commit()
Beispiel #2
0
def fill_cities(data, session):
    """Insert the parsed cities and their monthly statistics into the DB.

    :param data: iterable of parsed city objects exposing name, region,
        country, wiki_source, pop, country_rank, region_rank, coords and
        month_stats (mapping stat code -> per-month values).
    :param session: SQLAlchemy session bound to the target database.
    """
    # map stat code -> Stat primary key, for the MonthlyStat foreign keys
    stats_dict = dict(session.query(Stat.code, Stat.id))
    timer = Timer(len(data))
    for city in data:
        # BUG FIX: srid=4326 used to be a str.format() kwarg (silently
        # ignored) instead of a WKTElement kwarg, so the stored geometry had
        # no SRID.  Pass it to WKTElement where it belongs.
        # NOTE(review): assumes city.coords is (lat, lon) since WKT POINT is
        # "lon lat" -- confirm against the producer of coords.
        wkt = 'POINT({:.8f} {:.8f})'.format(city.coords[1], city.coords[0])
        geom = WKTElement(wkt, srid=4326)
        city_db = City(location=geom,
                       name=city.name,
                       region=city.region,
                       country=city.country,
                       source=city.wiki_source,
                       population=city.pop,
                       country_rank=city.country_rank,
                       region_rank=city.region_rank)
        session.add(city_db)
        # commit the cities so city_db.id is populated for the
        # MonthlyStat rows created below
        session.commit()

        for code, month_stats in city.month_stats.items():
            # skip stat codes the database does not know about
            if code not in stats_dict:
                continue
            for month_idx, value in enumerate(month_stats):
                ms = MonthlyStat(month=month_idx,
                                 city_id=city_db.id,
                                 stat_id=stats_dict[code],
                                 value=value)
                session.add(ms)
        timer.update()
    # commit the monthly stats of the last city
    session.commit()
Beispiel #3
0
def add_priority_index(session, fast_mode=False):
    """Assign City.priority_index, deciding the order in which the cities
    should be selected.

    Cities are first ordered by region rank, country rank and (descending)
    number of monthly stats.  Unless fast_mode is set, a greedy
    farthest-point ordering is then applied on top of that sort so that
    successive cities are geographically spread apart.

    :param session: SQLAlchemy session used to query and commit.
    :param fast_mode: if True, keep the plain SQL ordering as the priority
        index and skip the O(n^2) distance-based reordering.
    """
    # one tuple per city: (City, ST_Y, ST_X) -- consumed below as
    # (city, lat, lon)
    cities = session.query(City,
                          func.ST_Y(cast(City.location, Geometry())),
                          func.ST_X(cast(City.location, Geometry()))) \
        .join(MonthlyStat) \
        .order_by(City.region_rank, City.country_rank,
                  desc(func.count(MonthlyStat.id))) \
        .group_by(City.id) \
        .yield_per(1000).all()

    if fast_mode:
        logger.info('doing the fast version of priority index')
        # keep the SQL ordering as-is
        for i,city in enumerate(cities):
            city[0].priority_index = i
        session.commit()
        return

    # NOTE(review): defined but never used below -- superseded by the
    # vectorized distance matrix; kept as-is.
    def distance_fn(tuple1, tuple2):
        _,lat1,lon1 = tuple1
        _,lat2,lon2 = tuple2
        return lat_lon_fast_distance(lat1, lon1, lat2, lon2)

    # positions (into `cities`) already chosen, seeded with the best-ranked
    indices = [0]
    # positions still waiting to be placed
    indices_left = list(range(1,len(cities)))

    # pre-calculate the distances between all the cities
    logger.info('pre-calculating the distances between all cities')
    lats = numpy.array([c[1] for c in cities])
    lons = numpy.array([c[2] for c in cities])
    # full n x n pairwise distance matrix via numpy broadcasting
    distances = lat_lon_fast_distance(lats.reshape(-1,1),
                                      lons.reshape(-1,1),
                                      lats.reshape(1,-1),
                                      lons.reshape(1,-1))

    # heap entry for a candidate city; ordered by max_dist so the worst
    # candidate (the one closest to an already-chosen city) sits on top
    class CityComp(object):
        idx = None
        # distance from this candidate to its nearest already-chosen city
        max_dist = None
        # position of this candidate within indices_left
        max_dist_idx = None

        def __init__(self, max_dist, max_dist_idx):
            self.max_dist = max_dist
            self.max_dist_idx = max_dist_idx

        def __lt__(self, other):
            # heapq is a min-heap: smallest nearest-neighbor distance on top
            return self.max_dist < other.max_dist

    # each city is compared to all the previous ones (maximum)
    timer = Timer(len(indices_left))
    # percent of closest cities to choose from
    perc_closest_cities = 0.1
    # same but max
    max_closest_cities = 200
    while len(indices_left) > 0:
        # let's find the next city amongst the next candidates
        # this will be our (heap) list of good candidates, i.e. the ones
        # farthest from all the others
        good_candidates = []
        # NOTE(review): nb_keep may be a float; the >=/< comparisons below
        # still behave as intended
        nb_keep = min(perc_closest_cities * len(indices_left),
                      max_closest_cities)
        nb_keep = max(1, nb_keep)  # at least 1!
        logger.debug('will keep the farthest %i', nb_keep)
        # max_dist = 0.
        # max_dist_idx = 0
        logger.debug('---------looking for the next one----------')
        for no_candidate, i_left in enumerate(indices_left):
            # logger.debug('candidate %i, idx %i', no_candidate, i_left)
            # find how close is the nearest neighbor for this city
            # we are looking for the city with the fartest nearest neighbor
            dist_nearest_neighbor = 1e9
            # get the distance of our candidate to the closest (already chosen)
            # city
            too_close = False
            for i_chosen in indices:
                cur_dist = distances[i_chosen, i_left]
                # if we already have enough candidates, and if the current is
                # worse than all others, let's skip it
                if len(good_candidates) >= nb_keep \
                        and cur_dist <= good_candidates[0].max_dist:
                    too_close = True
                    # logger.debug('too close @%f', cur_dist)
                    break
                dist_nearest_neighbor = min(dist_nearest_neighbor, cur_dist)
            # we don't compare the distance of this candidate with all cities
            # if it's closer to (already chosen) city than our best candidate
            # so far
            if too_close:
                continue
            # dist_nearest_neighbor = numpy.min(distances[indices][:,i_left])
            # logger.debug('candidate %i has a city at %f', no_candidate,
                         # dist_nearest_neighbor)

            # if dist_nearest_neighbor > best_candidate.max_dist:
                # logger.debug('(new max)')
            new_candidate = CityComp(dist_nearest_neighbor, no_candidate)
            # logger.debug('trying to add new candidate with dist %f',
                         # new_candidate.max_dist)
            # if we don't have enough anyway
            if len(good_candidates) < nb_keep:
                heapq.heappush(good_candidates, new_candidate)
            else:
                # if we have enough, just keep the n best
                rejected_cand = heapq.heappushpop(good_candidates,
                                                  new_candidate)
                # logger.debug('removed candidate %i with dist %f',
                            # rejected_cand.max_dist_idx,
                            # rejected_cand.max_dist)

        # take the smallest index in our good candidates. this corresponds to
        # the best (according to our first ORDER BY) amongst the "far enough"
        # candidates
        best_candidate = min(good_candidates, key=lambda x: x.max_dist_idx)
        logger.debug('keeping %s with pop %i',
                     cities[indices_left[best_candidate.max_dist_idx]][0].name,
                     cities[indices_left[best_candidate.max_dist_idx]][0].population,)
        # input('press to continue')
        indices.append(indices_left.pop(best_candidate.max_dist_idx))
        logger.debug('done, best candidate was %i with distance %f',
                     best_candidate.max_dist_idx, best_candidate.max_dist)
        logger.debug('done, chosen: %i, remaining: %i', len(indices),
                     len(indices_left))
        timer.update()

    assert len(indices) == len(cities)
    # persist the computed ordering
    for priority_index, i in enumerate(indices):
        cities[i][0].priority_index = priority_index
    session.commit()
                        help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)',
                        type=int)
    args = parser.parse_args()

    configure_logging()

    dump_in = pickle.load(open(args.input_file))

    if True or ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'w')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i + 1 > args.max_cities:
            break
        logger.debug(city)
        # parsing population
        pop = parse_population(infos)
        if pop < args.min_pop:
            continue

        wikiurl = urlparse('http://' + infos['source'])
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--min-pop', default=1e6, help='minimum population to'
                        ' keep the city (if there are multiple population'
                        ' fields, we keep the maximum)', type=int)
    args = parser.parse_args()

    configure_logging()

    dump_in = pickle.load(open(args.input_file))

    if True or ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'w')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i+1 > args.max_cities:
            break
        logger.debug(city)
        # parsing population
        pop = parse_population(infos)
        if pop < args.min_pop:
            continue

        wikiurl = urlparse('http://' + infos['source'])
    return month_stats


if __name__ == '__main__':

    # arg 1 : file to open
    city_data = pickle.load(open(sys.argv[1]))
    # arg 2 : output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]

        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
                if regex.match(k):
                    # print('  removing', k, 'from', city)
                    # print('   using', regex.pattern)
                    data.pop(k)
                    break
                    # break
                # else:
                #     print('  ', k, 'not match', regex.pattern)
Beispiel #7
0
        HAVING(MAX(?pop) > %i)
        """ % args.min_pop,
                     limit=args.max_cities))
    print('got', len(cities))

    def f():
        return defaultdict(list)

    cities_dict = defaultdict(f)

    for c in cities:
        city = c['city']['value']
        for k in c.keys():
            cities_dict[city][k] = c[k]['value']

    timer = Timer(len(cities))

    for city in cities_dict.keys():
        # get the properties of the city
        results = sparql_query(
            sparql, """
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

            SELECT ?p ?o
            WHERE {{
                <{}> ?p ?o.
                FILTER(
                    regex(?p, "population", "i") ||
                    regex(?p, "elevation", "i"))
Beispiel #8
0
 def print_if(n):
     """Progress-print policy: Timer's default below 100, then every 100th."""
     if n >= 100:
         return n % 100 == 0
     return Timer.default_print_if(n)
Beispiel #9
0
        if n < 100:
            return Timer.default_print_if(n)
        else:
            return n % 100 == 0

    def keep_city(city):
        """Return True when the city passes the optional CLI filters."""
        return ((args.country is None or city.country == args.country)
                and (args.region is None or city.region == args.region)
                and (args.city is None or city.name == args.city))

    nb_cities = sum(1 for x in dump_in if keep_city(x))
    if args.max_cities is not None and args.max_cities < nb_cities:
        nb_cities = args.max_cities
    timer = Timer(nb_cities, print_if=print_if)
    nb_no_wiki = 0
    nb_no_climate = 0
    nb_already_there = 0
    nb_coords_from_wiki = 0
    nb_coords_from_geonames = 0
    nb_done = 0
    for city in dump_in:
        if args.max_cities is not None and nb_done >= args.max_cities:
            break
        if not keep_city(city):
            continue
        timer.update()
        logger.debug(city)

        city_id = '{}/{}/{}'.format(city.name, city.region, city.country)
Beispiel #10
0
def add_priority_index(session, fast_mode=False):
    """Assign City.priority_index, deciding the order in which the cities
    should be selected.

    Cities are first ordered by region rank, country rank and (descending)
    number of monthly stats.  Unless fast_mode is set, a greedy
    farthest-point ordering is then applied on top of that sort so that
    successive cities are geographically spread apart.

    :param session: SQLAlchemy session used to query and commit.
    :param fast_mode: if True, keep the plain SQL ordering as the priority
        index and skip the O(n^2) distance-based reordering.
    """
    # one tuple per city: (City, ST_Y, ST_X) -- consumed below as
    # (city, lat, lon)
    cities = session.query(City,
                          func.ST_Y(cast(City.location, Geometry())),
                          func.ST_X(cast(City.location, Geometry()))) \
        .join(MonthlyStat) \
        .order_by(City.region_rank, City.country_rank,
                  desc(func.count(MonthlyStat.id))) \
        .group_by(City.id) \
        .yield_per(1000).all()

    if fast_mode:
        logger.info('doing the fast version of priority index')
        # keep the SQL ordering as-is
        for i, city in enumerate(cities):
            city[0].priority_index = i
        session.commit()
        return

    # NOTE(review): defined but never used below -- superseded by the
    # vectorized distance matrix; kept as-is.
    def distance_fn(tuple1, tuple2):
        _, lat1, lon1 = tuple1
        _, lat2, lon2 = tuple2
        return lat_lon_fast_distance(lat1, lon1, lat2, lon2)

    # positions (into `cities`) already chosen, seeded with the best-ranked
    indices = [0]
    # positions still waiting to be placed
    indices_left = list(range(1, len(cities)))

    # pre-calculate the distances between all the cities
    logger.info('pre-calculating the distances between all cities')
    lats = numpy.array([c[1] for c in cities])
    lons = numpy.array([c[2] for c in cities])
    # full n x n pairwise distance matrix via numpy broadcasting
    distances = lat_lon_fast_distance(lats.reshape(-1, 1), lons.reshape(-1, 1),
                                      lats.reshape(1, -1), lons.reshape(1, -1))

    # heap entry for a candidate city; ordered by max_dist so the worst
    # candidate (the one closest to an already-chosen city) sits on top
    class CityComp(object):
        idx = None
        # distance from this candidate to its nearest already-chosen city
        max_dist = None
        # position of this candidate within indices_left
        max_dist_idx = None

        def __init__(self, max_dist, max_dist_idx):
            self.max_dist = max_dist
            self.max_dist_idx = max_dist_idx

        def __lt__(self, other):
            # heapq is a min-heap: smallest nearest-neighbor distance on top
            return self.max_dist < other.max_dist

    # each city is compared to all the previous ones (maximum)
    timer = Timer(len(indices_left))
    # percent of closest cities to choose from
    perc_closest_cities = 0.1
    # same but max
    max_closest_cities = 200
    while len(indices_left) > 0:
        # let's find the next city amongst the next candidates
        # this will be our (heap) list of good candidates, i.e. the ones
        # farthest from all the others
        good_candidates = []
        # NOTE(review): nb_keep may be a float; the >=/< comparisons below
        # still behave as intended
        nb_keep = min(perc_closest_cities * len(indices_left),
                      max_closest_cities)
        nb_keep = max(1, nb_keep)  # at least 1!
        logger.debug('will keep the farthest %i', nb_keep)
        # max_dist = 0.
        # max_dist_idx = 0
        logger.debug('---------looking for the next one----------')
        for no_candidate, i_left in enumerate(indices_left):
            # logger.debug('candidate %i, idx %i', no_candidate, i_left)
            # find how close is the nearest neighbor for this city
            # we are looking for the city with the fartest nearest neighbor
            dist_nearest_neighbor = 1e9
            # get the distance of our candidate to the closest (already chosen)
            # city
            too_close = False
            for i_chosen in indices:
                cur_dist = distances[i_chosen, i_left]
                # if we already have enough candidates, and if the current is
                # worse than all others, let's skip it
                if len(good_candidates) >= nb_keep \
                        and cur_dist <= good_candidates[0].max_dist:
                    too_close = True
                    # logger.debug('too close @%f', cur_dist)
                    break
                dist_nearest_neighbor = min(dist_nearest_neighbor, cur_dist)
            # we don't compare the distance of this candidate with all cities
            # if it's closer to (already chosen) city than our best candidate
            # so far
            if too_close:
                continue
            # dist_nearest_neighbor = numpy.min(distances[indices][:,i_left])
            # logger.debug('candidate %i has a city at %f', no_candidate,
            # dist_nearest_neighbor)

            # if dist_nearest_neighbor > best_candidate.max_dist:
            # logger.debug('(new max)')
            new_candidate = CityComp(dist_nearest_neighbor, no_candidate)
            # logger.debug('trying to add new candidate with dist %f',
            # new_candidate.max_dist)
            # if we don't have enough anyway
            if len(good_candidates) < nb_keep:
                heapq.heappush(good_candidates, new_candidate)
            else:
                # if we have enough, just keep the n best
                rejected_cand = heapq.heappushpop(good_candidates,
                                                  new_candidate)
                # logger.debug('removed candidate %i with dist %f',
                # rejected_cand.max_dist_idx,
                # rejected_cand.max_dist)

        # take the smallest index in our good candidates. this corresponds to
        # the best (according to our first ORDER BY) amongst the "far enough"
        # candidates
        best_candidate = min(good_candidates, key=lambda x: x.max_dist_idx)
        logger.debug(
            'keeping %s with pop %i',
            cities[indices_left[best_candidate.max_dist_idx]][0].name,
            cities[indices_left[best_candidate.max_dist_idx]][0].population,
        )
        # input('press to continue')
        indices.append(indices_left.pop(best_candidate.max_dist_idx))
        logger.debug('done, best candidate was %i with distance %f',
                     best_candidate.max_dist_idx, best_candidate.max_dist)
        logger.debug('done, chosen: %i, remaining: %i', len(indices),
                     len(indices_left))
        timer.update()

    assert len(indices) == len(cities)
    # persist the computed ordering
    for priority_index, i in enumerate(indices):
        cities[i][0].priority_index = priority_index
    session.commit()
Beispiel #11
0
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('A region is present twice in the file')
            regions[country][region] = row['name']
    # pprint(regions)

    countries = {}
    with open(args.country_infos_file) as f:
        reader = csv.reader((line for line in f if not line.startswith('#')),
                            delimiter='\t')
        for row in reader:
            countries[row[0]] = row[4]
    # pprint(countries)

    timer = Timer()

    fields = [
        'geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
        'longitude', 'feature class', 'feature code', 'country code', 'cc2',
        'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code',
        'population', 'elevation', 'dem', 'timezone', 'modification date'
    ]

    logger.info('reading the data')
    cities = defaultdict(lambda: defaultdict(dict))
    nb_cities_kept = 0
    with open(args.input_file) as f:
        reader = csv.DictReader(f,
                                delimiter='\t',
                                fieldnames=fields,
Beispiel #12
0
 def print_if(n):
     """Progress-print policy: Timer's default below 100, then every 100th."""
     if n >= 100:
         return n % 100 == 0
     return Timer.default_print_if(n)
Beispiel #13
0
        if n < 100:
            return Timer.default_print_if(n)
        else:
            return n % 100 == 0

    def keep_city(city):
        """Return True when the city passes the optional CLI filters."""
        return ((args.country is None or city.country == args.country)
                and (args.region is None or city.region == args.region)
                and (args.city is None or city.name == args.city))

    nb_cities = sum(1 for x in dump_in if keep_city(x))
    if args.max_cities is not None and args.max_cities < nb_cities:
        nb_cities = args.max_cities
    timer = Timer(nb_cities, print_if=print_if)
    nb_no_wiki = 0
    nb_no_climate = 0
    nb_already_there = 0
    nb_coords_from_wiki = 0
    nb_coords_from_geonames = 0
    nb_done = 0
    for city in dump_in:
        if args.max_cities is not None and nb_done >= args.max_cities:
            break
        if not keep_city(city):
            continue
        timer.update()
        logger.debug(city)

        city_id = '{}/{}/{}'.format(city.name, city.region, city.country)