Ejemplo n.º 1
0
def createTable():
    """Create the ``topcities`` table (keyed on ``geonameid``) if it is missing."""
    columns = (
        "geonameid STRING PRIMARY KEY, name STRING, asciiname STRING, "
        "latitude STRING, longitude STRING, country_code STRING, admin1_code STRING, "
        "population BIGINT, timezone STRING, region STRING, country_name STRING"
    )
    # No space between the table name and the column list, same as before.
    ddl = "CREATE TABLE IF NOT EXISTS topcities" + "(" + columns + ")"
    tgp.db().execute(ddl)
Ejemplo n.º 2
0
def processCities():
    """Load ``cities15000.txt`` from ``dataPath()`` into the ``geonames`` table.

    Every non-blank line is split on tabs, each field is stripped, and the
    values are bound to a 19-placeholder INSERT. Rows rejected with
    ``sqlite3.IntegrityError`` are counted as duplicates, rows rejected with
    ``sqlite3.OperationalError`` are counted as critical; both kinds are
    logged and skipped. The work is committed once at the end and the three
    counters are logged.

    Any other exception (for example a line with the wrong number of fields)
    propagates to the caller without committing.
    """
    db = tgp.db()
    cur = db.cursor()
    i = 0
    dupe = 0
    crit = 0
    try:
        # The GeoNames dumps are UTF-8; relying on the locale default
        # mis-decodes city names on platforms whose default is not UTF-8.
        with open(dataPath() + 'cities15000.txt', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line is not a record; skip it rather than
                    # failing on a binding-count mismatch.
                    continue
                row = [field.strip() for field in line.split('\t')]
                try:
                    cur.execute(
                        "INSERT INTO geonames VALUES "
                        "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        row)
                except sqlite3.IntegrityError as e:
                    log.debug(e)
                    log.debug(row)
                    dupe = dupe + 1
                except sqlite3.OperationalError as e:
                    log.critical(e)
                    log.critical(row)
                    crit = crit + 1
                else:
                    i = i + 1
    finally:
        # Release the cursor even when an unexpected error escapes the loop.
        cur.close()

    db.commit()
    log.info('cities - {:d} duplicate records'.format(dupe))
    log.info('cities - {:d} critial records'.format(crit))
    log.info('cities - Added {:d} records'.format(i))
Ejemplo n.º 3
0
def processAdminCode():
    """Load ``admin1CodesASCII.txt`` from ``dataPath()`` into ``admin1codes``.

    Only first-level admin codes are handled. For every non-blank line the
    first tab-separated field is split on ``.`` and its first two pieces,
    together with the second field, are inserted as one row.

    Rows rejected with ``sqlite3.IntegrityError`` are counted as duplicates,
    rows rejected with ``sqlite3.OperationalError`` as critical; both kinds
    are logged and skipped. The work is committed once at the end and the
    three counters are logged.

    Any other exception (for example ``IndexError`` when the first field
    contains no ``.`` or the line has a single field) propagates to the
    caller without committing.
    """
    db = tgp.db()
    cur = db.cursor()
    i = 0
    dupe = 0
    crit = 0
    try:
        # Despite the file name, the name column carries non-ASCII text and
        # the GeoNames dumps are UTF-8, so do not rely on the locale default.
        with open(dataPath() + 'admin1CodesASCII.txt', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line is not a record; skip it instead of
                    # crashing on the missing code pieces.
                    continue
                row = [field.strip() for field in line.split('\t')]
                parts = row[0].split('.', 2)
                try:
                    cur.execute("INSERT INTO admin1codes VALUES "
                                "(?, ?, ?)", (parts[0], parts[1], row[1]))
                except sqlite3.IntegrityError as e:
                    log.debug(e)
                    log.debug(row)
                    dupe = dupe + 1
                except sqlite3.OperationalError as e:
                    log.critical(e)
                    log.critical(row)
                    crit = crit + 1
                else:
                    i = i + 1
    finally:
        # Release the cursor even when an unexpected error escapes the loop.
        cur.close()

    db.commit()
    log.info('admin1 - {:d} duplicate records'.format(dupe))
    log.info('admin1 - {:d} critial records'.format(crit))
    log.info('admin1 - Added {:d} records'.format(i))
Ejemplo n.º 4
0
def insertRows(data):
    """Insert every item of ``data`` into the ``abbreviations`` table.

    Each item is indexed by the keys ``abbr``, ``desc``, ``desc_extra``,
    ``offset`` and ``offset_sec``. Integrity errors are logged at debug
    level and counted as duplicates; operational errors are logged at
    critical level and counted separately. The insert is committed once at
    the end and the three counters are logged.
    """
    conn = tgp.db()
    cursor = conn.cursor()
    statement = "INSERT INTO abbreviations (abbr, desc, desc_extra, offset, offset_sec) VALUES (?, ?, ?, ?, ?)"
    columns = ('abbr', 'desc', 'desc_extra', 'offset', 'offset_sec')
    added = duplicates = failures = 0

    for entry in data:
        try:
            cursor.execute(statement, tuple(entry[column] for column in columns))
        except sqlite3.IntegrityError as err:
            log.debug(err)
            log.debug(entry)
            duplicates += 1
        except sqlite3.OperationalError as err:
            log.critical(err)
            log.critical(entry)
            failures += 1
        else:
            added += 1

    cursor.close()
    conn.commit()

    summary = (
        ('{:d} duplicate records', duplicates),
        ('{:d} critial records', failures),
        ('Added {:d} records', added),
    )
    for template, count in summary:
        log.info(template.format(count))
Ejemplo n.º 5
0
def insertRows(data):
    """Insert one ``timezones`` row per (area, location) pair in ``data['zones']``.

    ``data['zones']`` is walked as a mapping of mappings; the first element
    of each innermost value supplies the area (``getArea()``) and the GMT
    offset in seconds (``getGMTOffset()``). The zone name is the area alone
    when the location key is ``None``, otherwise ``area/location``.

    Integrity errors are logged at info level and counted as duplicates;
    operational errors are logged at critical level and counted separately.
    The insert is committed once at the end and the three counters are
    logged.
    """
    conn = tgp.db()
    cursor = conn.cursor()
    statement = "INSERT INTO timezones (timezone, offset, offset_sec) VALUES (?, ?, ?)"
    added = duplicates = failures = 0

    for _area, locations in data['zones'].items():
        for location, records in locations.items():
            first = records[0]
            zone = first.getArea() if location is None else first.getArea() + '/' + location
            try:
                cursor.execute(
                    statement,
                    (zone, util.secToOffset(first.getGMTOffset()), first.getGMTOffset()),
                )
            except sqlite3.IntegrityError as err:
                log.info(err)
                log.info(first)
                duplicates += 1
            except sqlite3.OperationalError as err:
                log.critical(err)
                log.critical(first)
                failures += 1
            else:
                added += 1

    cursor.close()
    conn.commit()

    summary = (
        ('{:d} duplicate records', duplicates),
        ('{:d} critial records', failures),
        ('Added {:d} records', added),
    )
    for template, count in summary:
        log.info(template.format(count))
Ejemplo n.º 6
0
def processCountryInfo():
    """Load ``countryInfo.txt`` from ``dataPath()`` into the ``countryinfo`` table.

    Comment lines (starting with ``#``) and blank lines are skipped. Every
    other line is split on tabs *without* stripping the line first, so empty
    trailing columns are kept; each field is then stripped and the values
    are bound to a 19-placeholder INSERT.

    Rows rejected with ``sqlite3.IntegrityError`` are counted as duplicates,
    rows rejected with ``sqlite3.OperationalError`` as critical; both kinds
    are logged and skipped. The work is committed once at the end and the
    three counters are logged.

    Any other exception (for example a line with the wrong number of fields)
    propagates to the caller without committing.
    """
    db = tgp.db()
    cur = db.cursor()
    i = 0
    dupe = 0
    crit = 0
    try:
        # The GeoNames dumps are UTF-8; relying on the locale default
        # mis-decodes country names on platforms whose default is not UTF-8.
        with open(dataPath() + 'countryInfo.txt', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#') or not line.strip():
                    continue

                # Split the raw line: stripping it first would drop empty
                # trailing columns and change the field count.
                row = [field.strip() for field in line.split('\t')]
                try:
                    cur.execute(
                        "INSERT INTO countryinfo VALUES "
                        "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        row)
                except sqlite3.IntegrityError as e:
                    log.debug(e)
                    log.debug(row)
                    dupe = dupe + 1
                except sqlite3.OperationalError as e:
                    log.critical(e)
                    log.critical(row)
                    crit = crit + 1
                else:
                    i = i + 1
    finally:
        # Release the cursor even when an unexpected error escapes the loop.
        cur.close()

    db.commit()
    log.info('country - {:d} duplicate records'.format(dupe))
    log.info('country - {:d} critial records'.format(crit))
    log.info('country - Added {:d} records'.format(i))
Ejemplo n.º 7
0
def createTables():
    """Run the geonames, admin-code and country-info DDL statements, in that order."""
    ddl_sources = (
        (getGeonamesSql, ('geonames',)),
        (getAdminCodesSql, ()),
        (getCountryInfoSql, ()),
    )
    for make_sql, args in ddl_sources:
        # A fresh tgp.db() call per statement, as in the original sequence.
        tgp.db().execute(make_sql(*args))
Ejemplo n.º 8
0
def createTable():
    """Create the ``abbreviations`` table, keyed on (abbr, offset), if it is missing."""
    ddl = "".join((
        "CREATE TABLE IF NOT EXISTS abbreviations",
        "(abbr STRING, desc STRING, desc_extra STRING, offset STRING, ",
        "offset_sec INTEGER, PRIMARY KEY(abbr, offset))",
    ))
    tgp.db().execute(ddl)
Ejemplo n.º 9
0
def createTable():
    """Create the ``timezones`` table, keyed on ``timezone``, if it is missing."""
    ddl = (
        "CREATE TABLE IF NOT EXISTS timezones"
        "(timezone STRING PRIMARY KEY, offset STRING, offset_sec INTEGER)"
    )
    conn = tgp.db()
    conn.execute(ddl)
Ejemplo n.º 10
0
def build():
    """Pick up to ``MAX_CITY`` cities per timezone and write an overview file.

    Side effects: creates ``dataPath()`` if needed, deletes every ``*.txt``
    file directly under it, rebinds the module-level ``db`` to ``tgp.db()``,
    passes every selected row to ``_record`` and writes
    ``city_overview.tab`` under ``dataPath()``.

    Selection runs in four passes over the ``geonames`` table:

    1. every capital (feature code ``PPLC``) of each timezone;
    2. the most populous city of each timezone, unless already selected;
    3. for zones whose largest city is bigger than the capital, the cities
       larger than the capital;
    4. cities below a reference population (capital or largest city) but
       above a percentage of it; when fewer than three qualify, cities with
       more than 100000 inhabitants are used instead.

    Passes 3 and 4 exclude feature code ``PPLX`` and are limited to the
    slots still free in the zone.
    """
    os.makedirs(dataPath(), exist_ok=True)
    global db
    db = tgp.db()

    cur = db.cursor()
    for stale in glob.glob(u'{}*.txt'.format(dataPath())):
        os.unlink(stale)

    cur.execute("select distinct timezone from geonames")
    alltz = cur.fetchall()

    MAX_CITY = 30

    cities = {}  # timezone -> {geoname id: city name}
    pops = {}    # timezone -> {'total': ..., 'pplc': ..., 'max': ...}

    SQL_SELECT = "select g.*, a.name, c.country from geonames g " \
                 "left join admin1codes a on g.country_code = a.country_code and g.admin1_code = a.code " \
                 "left join countryinfo c on g.country_code = c.iso "

    # All values are bound as parameters instead of being formatted into the
    # SQL text, so a quote inside a value cannot break the statement.
    capital_sql = SQL_SELECT + (
        " where feature_code in ('PPLC') and timezone = ? "
        "order by feature_code desc, population desc")
    largest_sql = SQL_SELECT + (
        " where timezone = ? order by population desc limit 1")
    above_sql = SQL_SELECT + (
        " where g.timezone = ? and g.population > ? and g.feature_code != 'PPLX' "
        "order by g.population desc limit ?")
    below_sql = SQL_SELECT + (
        "where g.timezone = ? and g.population < ? and g.population > ? * ? "
        "and g.feature_code != 'PPLX' order by g.population desc limit ?")
    fallback_sql = SQL_SELECT + (
        "where g.timezone = ? and g.population > ? "
        "and g.feature_code != 'PPLX' order by g.population desc limit ?")

    def remember(zone, rec):
        # Hand the row to _record (presumably writes the per-zone output;
        # confirm against its definition) and only then note the id as
        # selected, matching the original call order.
        zone_cities = cities.setdefault(zone, {})
        _record(rec[gn.ID], zone_cities, rec)
        zone_cities[rec[gn.ID]] = rec[gn.NAME]

    def slots_left(zone):
        # SQLite treats a negative LIMIT as "no limit", so never go below 0.
        return max(0, MAX_CITY - len(cities[zone]))

    # set population baselines using capital(s)
    # ignore the MAX checks here, b/c they shouldn't happen
    log.info("Load capitals (PPLC) for each zone if they exist")
    for tz in alltz:
        cur_tz = tz[0]
        for pplc in cur.execute(capital_sql, (cur_tz,)):
            # gather population nums
            population = pplc[gn.POPUL]
            stats = pops.get(cur_tz)
            if stats is None:
                pops[cur_tz] = {
                    'total': population,
                    'pplc': population,
                    'max': population,
                }
            else:
                stats['total'] += population
                stats['pplc'] = max(stats['pplc'], population)
                stats['max'] = max(stats['max'], population)

            remember(cur_tz, pplc)

    # set population baselines using max pop
    # ignore the MAX checks here, b/c they shouldn't happen
    log.info(
        "Load the city with the largest population for each zone (skip if it's the PPLC)"
    )
    for tz in alltz:
        cur_tz = tz[0]
        for largest in cur.execute(largest_sql, (cur_tz,)):
            # create the rec or if the pplc was the max, we have it, keep going
            if cur_tz not in cities:
                cities[cur_tz] = {}
            elif largest[gn.ID] in cities[cur_tz]:
                continue

            # gather population nums
            population = largest[gn.POPUL]
            stats = pops.get(cur_tz)
            if stats is None:
                # No capital was seen for this zone, hence 'pplc' = 0.
                pops[cur_tz] = {
                    'total': population,
                    'pplc': 0,
                    'max': population,
                }
            else:
                stats['max'] = max(stats['max'], population)
                stats['total'] += population

            remember(cur_tz, largest)

    # fill in above the PPLC as necessary (max pop > pplc pop)
    log.info("Filling in all cities larger than the PPLC")
    for tz in pops:
        if pops[tz]['pplc'] == 0 or pops[tz]['max'] <= pops[tz]['pplc']:
            continue

        params = (tz, pops[tz]['pplc'], slots_left(tz))
        for rec in cur.execute(above_sql, params):
            remember(tz, rec)

    # fill in below the PPLC (< pplc pop, *some* percentage of pop)
    log.info("Fill in cities smaller than the PPLC up to {}".format(MAX_CITY))
    for tz in pops:
        if pops[tz]['pplc'] >= pops[tz]['max']:
            pop = pops[tz]['pplc']
            pct = 0.25
        elif pops[tz]['pplc'] != 0:  # we had a max
            pop = pops[tz]['max']
            pct = 0.5
        else:
            pop = pops[tz]['max']
            pct = 0.35

        log.debug("{} : max = {} pplc = {} | pop = {}  pct = {}".format(
            tz, pops[tz]['max'], pops[tz]['pplc'], pop, pct))
        city_left = slots_left(tz)

        recs = cur.execute(
            below_sql, (tz, pop, pop, pct, city_left)).fetchall()

        # often the max pop cities are way larger than other cities we want to include
        if len(recs) < 3:
            low_pop = 100000
            recs = cur.execute(fallback_sql, (tz, low_pop, city_left))

        for rec in recs:
            remember(tz, rec)

    log.info("Generating overview(s)")
    tot = 0
    # The context manager closes the file even if a write or log call fails.
    with open(dataPath() + 'city_overview.tab', 'w') as out:
        for tz in cities:
            count = len(cities[tz])
            tot += count
            s = "\t".join((tz, str(count)))
            log.debug(s)
            out.write(s + "\n")
        s = "\t\t\tTotal = {}".format(tot)
        log.debug(s)
        out.write(s + "\n")